diff options
author | George Wilson <George.Wilson@Sun.COM> | 2010-06-08 12:32:02 -0700 |
---|---|---|
committer | George Wilson <George.Wilson@Sun.COM> | 2010-06-08 12:32:02 -0700 |
commit | 44ecc5327ab4ce0750dcca2a17e05566bf2812e2 (patch) | |
tree | 71ecdd4ae792bc8a4902c6a68d536f6179edbbc4 /usr/src/uts/common/fs | |
parent | f2e5cf43165f583e4a435785d96ecfefa15539d1 (diff) | |
download | illumos-gate-44ecc5327ab4ce0750dcca2a17e05566bf2812e2.tar.gz |
6494473 ZFS needs a way to slow down resilvering
6743992 scrub/resilver causes systemic slowdown
6936821 scrub/resilver io should not be suspended
6956464 otoro: head panic in zfs:dnode_hold_impl; during system disk zinject testing
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r-- | usr/src/uts/common/fs/zfs/dnode.c | 3 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_pool.c | 2 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_scan.c | 36 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa_impl.h | 2 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 2 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/txg.c | 2 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 12 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 21 |
8 files changed, 55 insertions, 25 deletions
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index c16902d216..2b44cd2c96 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -603,8 +603,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && - spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) && - !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER))); + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 2cd21a102b..6ded1fd87e 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -42,7 +42,7 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ +int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 23c37c7ccf..e6393fc1b7 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -56,6 +56,11 @@ static scan_cb_t dsl_scan_remove_cb; static dsl_syncfunc_t dsl_scan_cancel_sync; static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); +int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +int zfs_scan_idle = 50; /* idle window in clock ticks */ + int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free 
per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ @@ -601,8 +606,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, * done before setting xlateall (similar to dsl_read()) */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t @@ -650,6 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) { dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { @@ -660,7 +666,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -683,7 +689,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -696,7 +702,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -719,7 +725,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + 
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -1446,7 +1452,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_setup_sync(scn, &func, tx); } - if (!dsl_scan_active(scn) || spa_sync_pass(dp->dp_spa) > 1) return; @@ -1489,7 +1494,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_phys.scn_state != DSS_SCANNING) return; - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " @@ -1644,8 +1648,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) @@ -1658,10 +1663,12 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SCRUB; zio_priority = ZIO_PRIORITY_SCRUB; needs_io = B_TRUE; + scan_delay = zfs_scrub_delay; } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { zio_flags |= ZIO_FLAG_RESILVER; zio_priority = ZIO_PRIORITY_RESILVER; needs_io = B_FALSE; + scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ @@ -1699,14 +1706,23 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, } if (needs_io && !zfs_no_scrub_io) { + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); + /* + * If we're seeing recent (zfs_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. 
+ */ + if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) + delay(scan_delay); + zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, zio_priority, zio_flags, zb)); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index e2e1851ecc..db49e9cf7c 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -146,9 +146,9 @@ struct spa { uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ + uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ - uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 0400c1702e..97d8ec74d2 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -147,7 +147,7 @@ enum zio_flag { ZIO_FLAG_SELF_HEAL = 1 << 2, ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCRUB_THREAD = 1 << 5, + ZIO_FLAG_SCAN_THREAD = 1 << 5, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index f478ad0c67..9b308ca4e7 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -37,7 +37,7 @@ static void txg_sync_thread(dsl_pool_t *dp); static void txg_quiesce_thread(dsl_pool_t *dp); -int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ /* * Prepare the txg subsystem. 
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index a61f29b8e7..5bf6eebcd7 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -207,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -244,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -2541,7 +2535,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; @@ -2597,7 +2591,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD) || + (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's @@ -2616,7 +2610,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 88d80af4e9..258a9a3318 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2247,6 +2247,26 @@ zio_vdev_io_start(zio_t *zio) return (vdev_mirror_ops.vdev_op_io_start(zio)); } + /* + * We keep track of time-sensitive I/Os so that the scan 
thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of spa_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = spa->spa_last_io; + uint64_t new = ddi_get_lbolt64(); + if (old != new) + (void) atomic_cas_64(&spa->spa_last_io, old, new); + } + align = 1ULL << vd->vdev_top->vdev_ashift; if (P2PHASE(zio->io_size, align) != 0) { @@ -2744,6 +2764,7 @@ zio_done(zio_t *zio) if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && + !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) |