Diffstat (limited to 'usr/src/uts/common/fs/zfs/dsl_scrub.c')
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_scrub.c  157
1 file changed, 118 insertions(+), 39 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c
index d511bb841a..48c900cbc3 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c
@@ -53,6 +53,7 @@ static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
extern int zfs_txg_timeout;
@@ -78,6 +79,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dp->dp_scrub_min_txg = 0;
dp->dp_scrub_max_txg = tx->tx_txg;
+ dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
if (*funcp == SCRUB_FUNC_CLEAN) {
vdev_t *rvd = dp->dp_spa->spa_root_vdev;
@@ -101,6 +103,14 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
POOL_SCRUB_EVERYTHING, B_FALSE);
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (dp->dp_scrub_min_txg > TXG_INITIAL)
+ dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
+
dp->dp_spa->spa_scrub_started = B_TRUE;
}
@@ -119,8 +129,8 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+ bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
dp->dp_scrub_restart = B_FALSE;
- dp->dp_scrub_ditto = B_FALSE;
dp->dp_spa->spa_scrub_errors = 0;
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -136,9 +146,17 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
&dp->dp_scrub_max_txg, tx));
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
&dp->dp_scrub_bookmark, tx));
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+ &dp->dp_scrub_ddt_bookmark, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+ &dp->dp_scrub_ddt_class_max, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&dp->dp_spa->spa_scrub_errors, tx));
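[Editorial note: the zap_add() calls above persist each bookmark as an array of uint64_t whose element count is derived from the structure size rather than hard-coded. A minimal standalone sketch of why that derivation is safe, using a hypothetical stand-in for ddt_bookmark_t (field names taken from the dprintf later in this change; their widths are an assumption):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for ddt_bookmark_t. */
typedef struct ddb_sketch {
	uint64_t ddb_class;
	uint64_t ddb_type;
	uint64_t ddb_checksum;
	uint64_t ddb_cursor;
} ddb_sketch_t;

int
main(void)
{
	/*
	 * The ZAP entry is written as (sizeof (bookmark) / sizeof (uint64_t))
	 * integers of width sizeof (uint64_t); that round-trips only if the
	 * structure packs into whole 64-bit words with no trailing padding.
	 */
	static_assert(sizeof (ddb_sketch_t) % sizeof (uint64_t) == 0,
	    "bookmark must be a whole number of uint64_t words");
	(void) printf("numints = %zu\n",
	    sizeof (ddb_sketch_t) / sizeof (uint64_t));
	return (0);
}]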
@@ -186,6 +204,7 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dp->dp_scrub_queue_obj, tx));
dp->dp_scrub_queue_obj = 0;
bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+ bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_QUEUE, tx));
@@ -200,6 +219,11 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, tx));
+ (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
+ (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
+
spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
"complete=%u", *completep);
@@ -296,7 +320,7 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
}
static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
{
int elapsed_ticks;
int mintime;
@@ -308,7 +332,7 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
- if (zb->zb_level != 0)
+ if (zb != NULL && zb->zb_level != 0)
return (B_FALSE);
mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
@@ -316,11 +340,23 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time;
if (elapsed_ticks > hz * zfs_txg_timeout ||
(elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
- dprintf("pausing at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ dp->dp_scrub_bookmark = *zb;
+ }
+ if (ddb) {
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+ ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
+ }
dp->dp_scrub_pausing = B_TRUE;
- dp->dp_scrub_bookmark = *zb;
return (B_TRUE);
}
return (B_FALSE);
@@ -423,7 +459,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
if (bp->blk_birth <= dp->dp_scrub_min_txg)
return;
- if (scrub_pause(dp, zb))
+ if (scrub_pause(dp, zb, NULL))
return;
if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
@@ -525,7 +561,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
}
}
- (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+ /*
+ * If dsl_pool_scrub_ddt() has already scrubbed this block,
+ * don't scrub it again.
+ */
+ if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
+ (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
}
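[Editorial note: a brief sketch of what the new pruning check in the hunk above is meant to express. The helper below is illustrative only; the real lookup is ddt_class_contains() in ddt.c, and scrub_topdown_owns_block() is not a function this change adds:

/*
 * The top-down phase issues scrub i/o for a block only if the DDT phase
 * did not already cover it: non-dedup blocks have no DDT entry at all,
 * and dedup blocks are skipped when their class falls at or below
 * dp_scrub_ddt_class_max.
 */
static boolean_t
scrub_topdown_owns_block(boolean_t is_dedup, enum ddt_class class,
    enum ddt_class class_max)
{
	if (!is_dedup)
		return (B_TRUE);
	return (class > class_max);
}]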
@@ -542,7 +584,6 @@ scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
}
-
}
static void
@@ -564,8 +605,8 @@ dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
return;
if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
- 0, 0, 0);
+ SET_BOOKMARK(&dp->dp_scrub_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
ds->ds_object, tx) != 0) {
return;
@@ -771,34 +812,65 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
return (0);
}
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Then the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can. There are two cases to consider.
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference count changes; if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ *
+ * The code does not actually use the refcnt directly, but rather uses the
+ * dde's replication class (enum ddt_class), which serves the same purpose.
+ */
static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
- enum ddt_class class)
+dsl_pool_scrub_ddt(dsl_pool_t *dp)
{
- ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
+ ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
ddt_entry_t dde;
- blkptr_t blk;
- zbookmark_t zb = { 0 };
- uint64_t walk = 0;
int error;
- if (!ddt_object_exists(ddt, type, class))
- return;
-
- while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
- int p = DDT_PHYS_DITTO;
- ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
- scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+ while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
+ if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
+ return;
+ dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
+ if (scrub_pause(dp, NULL, ddb))
+ return;
}
ASSERT(error == ENOENT);
+ ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
}
-static void
-dsl_pool_scrub_ditto(dsl_pool_t *dp)
+void
+dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
+ const ddt_entry_t *dde)
{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- for (enum ddt_type type = 0; type < DDT_TYPES; type++)
- dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
+ const ddt_key_t *ddk = &dde->dde_key;
+ const ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t blk;
+ zbookmark_t zb = { 0 };
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &blk);
+ scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+ }
}
void
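[Editorial note: the block comment in the hunk above says ddt_sync_entry() notifies the scrub code when a block's reference count crosses 1, which is presumably why dsl_pool_scrub_ddt_entry() is exported here rather than kept static. A hedged sketch of what that caller might look like; scrub_in_progress, oclass, and nclass are illustrative locals, and the real condition in ddt.c may be phrased differently:

	/*
	 * Illustrative only: when an entry moves to a more-replicated class
	 * (e.g. UNIQUE -> DUPLICATE, i.e. its refcount just crossed from 1
	 * to 2) while a scrub is active, the top-down phase would now skip
	 * it, so hand it to the scrub code immediately.
	 */
	if (scrub_in_progress && nclass < oclass)
		dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);]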
@@ -840,9 +912,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
spa->spa_scrub_active = B_TRUE;
- if (!dp->dp_scrub_ditto) {
- dsl_pool_scrub_ditto(dp);
- dp->dp_scrub_ditto = B_TRUE;
+ if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
+ dsl_pool_scrub_ddt(dp);
+ if (dp->dp_scrub_pausing)
+ goto out;
}
if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
@@ -895,12 +968,18 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
return;
out:
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
&dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+ &dp->dp_scrub_ddt_bookmark, tx));
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+ &dp->dp_scrub_ddt_class_max, tx));
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&spa->spa_scrub_errors, tx));