| author | Tom Caputi <tcaputi@datto.com> | 2019-03-15 17:14:31 -0400 |
|---|---|---|
| committer | Toomas Soome <tsoome@me.com> | 2019-05-13 23:49:15 +0300 |
| commit | 12a8814c13fbb1d6d58616cf090ea5815dc107f9 (patch) | |
| tree | 3f1b36f6702e76bf3b0636d6c3d9a8943d06470c /usr/src/uts/common/fs/zfs | |
| parent | a3874b8b1fe5103fc1f961609557c0587435fec0 (diff) | |
| download | illumos-gate-12a8814c13fbb1d6d58616cf090ea5815dc107f9.tar.gz | |
10566 Multiple DVA Scrubbing Fix
Portions contributed by: Toomas Soome <tsoome@me.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts/common/fs/zfs')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_scan.c | 199 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 2 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 2 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 4 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h | 3 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 1 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 4 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 22 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_disk.c | 10 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_file.c | 3 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_mirror.c | 365 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 33 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zio_inject.c | 42 |
13 files changed, 546 insertions, 144 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 00bd1498a2..ca82195178 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -249,24 +249,43 @@ typedef enum { */ typedef struct scan_io { /* fields from blkptr_t */ - uint64_t sio_offset; uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; - uint32_t sio_asize; + uint32_t sio_nr_dvas; /* fields from zio_t */ - int sio_flags; + uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { - avl_node_t sio_addr_node; /* link into issueing queue */ + avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; + + /* + * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, + * depending on how many were in the original bp. Only the + * first DVA is really used for sorting and issuing purposes. + * The other DVAs (if provided) simply exist so that the zio + * layer can find additional copies to repair from in the + * event of an error. This array must go at the end of the + * struct to allow this for the variable number of elements. + */ + dva_t sio_dva[0]; } scan_io_t; +#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) +#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) +#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) +#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) +#define SIO_GET_END_OFFSET(sio) \ + (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) +#define SIO_GET_MUSED(sio) \ + (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) + struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ @@ -275,6 +294,7 @@ struct dsl_scan_io_queue { range_tree_t *q_exts_by_addr; avl_tree_t q_exts_by_size; avl_tree_t q_sios_by_addr; + uint64_t q_sio_memused; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); -static kmem_cache_t *sio_cache; +static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; + +/* sio->sio_nr_dvas must be set so we know which cache to free from */ +static void +sio_free(scan_io_t *sio) +{ + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); +} + +/* It is up to the caller to set sio->sio_nr_dvas for freeing */ +static scan_io_t * +sio_alloc(unsigned short nr_dvas) +{ + ASSERT3U(nr_dvas, >, 0); + ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); + + return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); +} void scan_init(void) @@ -328,14 +368,22 @@ scan_init(void) */ fill_weight = zfs_scan_fill_weight; - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + char name[36]; + + (void) sprintf(name, "sio_cache_%d", i); + sio_cache[i] = kmem_cache_create(name, + (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), + 0, NULL, NULL, NULL, NULL, NULL, 0); + } } void scan_fini(void) { - kmem_cache_destroy(sio_cache); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + kmem_cache_destroy(sio_cache[i]); + } } static inline boolean_t @@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp) } static inline 
void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +sio2bp(const scan_io_t *sio, blkptr_t *bp) { bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; + + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; + sio->sio_nr_dvas = BP_GET_NDVAS(bp); + + /* + * Copy the DVAs to the sio. We need all copies of the block so + * that the self healing code can use the alternate copies if the + * first is corrupted. We want the DVA at index dva_i to be first + * in the sio since this is the primary one that we want to issue. + */ + for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { + sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; + } } int @@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ + /* # extents in exts_by_size = # in exts_by_addr */ mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); + sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) break; } - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; + sio2bp(sio, &bp); + bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); @@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { - scan_io_t srch_sio, *sio, *next_sio; + scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; @@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - srch_sio.sio_offset = rs->rs_start; + srch_sio = sio_alloc(1); + srch_sio->sio_nr_dvas = 1; + SIO_SET_OFFSET(srch_sio, rs->rs_start); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. 
*/ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + while (sio != NULL && + SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); - bytes_issued += sio->sio_asize; + bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; @@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && sio->sio_offset < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); + SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { @@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg) first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); - seg_end = last_sio->sio_offset + last_sio->sio_asize; + seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) - seg_start = first_sio->sio_offset; + seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the @@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { + /* + * Update the spa's stats on how many bytes we have issued. + * Sequential scrubs create a zio for each DVA of the bp. Each + * of these will include all DVAs for repair purposes, but the + * zio code will only try the first one unless there is an issue. + * Therefore, we should only count the first DVA for these IOs. 
+ */ + if (scn->scn_is_sorted) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); + DVA_GET_ASIZE(&bp->blk_dva[0])); + } else { + spa_t *spa = scn->scn_dp->dp_spa; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } } /* @@ -3426,7 +3501,7 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); + queue->q_sio_memused += SIO_GET_MUSED(sio); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); } /* @@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); + scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); scan_io_queue_insert_impl(queue, sio); } @@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y) * based on LBA-order (from lowest to highest). */ static int -io_addr_compare(const void *x, const void *y) +sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); + return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. 
*/ @@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; + q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); - avl_create(&q->q_sios_by_addr, io_addr_compare, + avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); @@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_free(sio, sizeof (*sio)); + SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); + bytes_dequeued += SIO_GET_ASIZE(sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + sio_free(sio); } + ASSERT0(queue->q_sio_memused); atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); @@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; + scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; @@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) return; } - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; + srch_sio = sio_alloc(BP_GET_NDVAS(bp)); + bp2sio(bp, srch_sio, dva_i); + start = SIO_GET_OFFSET(srch_sio); + size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: @@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) * be done with issuing the zio's it gathered and will * signal us. */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio != NULL) { - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); @@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); + sio2bp(sio, &tmpbp); count_block(scn, dp->dp_blkstats, &tmpbp); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); } mutex_exit(q_lock); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 1c004f87f3..d0b9f6960f 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. 
*/ - if (metaslab_lba_weighting_enabled) { + if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 7a44ac86b0..0a5cec2644 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -2036,6 +2036,7 @@ spa_init(int mode) dmu_init(); zil_init(); vdev_cache_stat_init(); + vdev_mirror_stat_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -2052,6 +2053,7 @@ spa_fini(void) spa_evict_all(); vdev_cache_stat_fini(); + vdev_mirror_stat_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 4ff552447e..53b9e4ef5d 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); +/* vdev mirror */ +extern void vdev_mirror_stat_init(void); +extern void vdev_mirror_stat_fini(void); + /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 0c0bc874c1..e21989641b 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); +extern int vdev_queue_length(vdev_t *vd); +extern uint64_t vdev_queue_last_offset(vdev_t *vd); + extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 4e1b09c27d..a91927dbb6 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -225,6 +225,7 @@ struct vdev { vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? 
*/ + boolean_t vdev_nonrot; /* true if solid state */ int vdev_open_error; /* error on last open */ kthread_t *vdev_open_thread; /* thread opening children */ uint64_t vdev_crtxg; /* txg when top-level was added */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 824d1d8bb7..70916c45b7 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -318,13 +318,15 @@ typedef struct zinject_record { uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; - uint32_t zi_pad; + uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +#define ZI_NO_DVA (-1) + typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index c7dca83777..4971e9e79e 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -1478,19 +1478,27 @@ vdev_open_children(vdev_t *vd) * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { +retry_sync: for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); - return; + } else { + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + if (tq == NULL) + goto retry_sync; + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, + vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); + + taskq_destroy(tq); } - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], - TQ_SLEEP) != TASKQID_INVALID); + vd->vdev_nonrot = B_TRUE; - taskq_destroy(tq); + for (int c = 0; c < children; c++) + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 93462ee2ba..3f137c5d59 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -606,6 +606,16 @@ skip_open: */ vd->vdev_nowritecache = B_FALSE; + /* Inform the ZIO pipeline that we are non-rotational */ + vd->vdev_nonrot = B_FALSE; + if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + "device-solid-state")) { + if (ldi_prop_get_int(dvd->vd_lh, + LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + "device-solid-state", B_FALSE) != 0) + vd->vdev_nonrot = B_TRUE; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 3aaebe8505..806716200a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, vattr_t vattr; int error; + /* Rotational optimizations only make sense on block devices */ + vd->vdev_nonrot = B_TRUE; + /* * We must have a pathname, and it must be absolute. 
*/ diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index f489bb1967..f654bf9afb 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -38,6 +38,65 @@ #include <sys/fs/zfs.h> /* + * Vdev mirror kstats + */ +static kstat_t *mirror_ksp = NULL; + +typedef struct mirror_stats { + kstat_named_t vdev_mirror_stat_rotating_linear; + kstat_named_t vdev_mirror_stat_rotating_offset; + kstat_named_t vdev_mirror_stat_rotating_seek; + kstat_named_t vdev_mirror_stat_non_rotating_linear; + kstat_named_t vdev_mirror_stat_non_rotating_seek; + + kstat_named_t vdev_mirror_stat_preferred_found; + kstat_named_t vdev_mirror_stat_preferred_not_found; +} mirror_stats_t; + +static mirror_stats_t mirror_stats = { + /* New I/O follows directly the last I/O */ + { "rotating_linear", KSTAT_DATA_UINT64 }, + /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */ + { "rotating_offset", KSTAT_DATA_UINT64 }, + /* New I/O requires random seek */ + { "rotating_seek", KSTAT_DATA_UINT64 }, + /* New I/O follows directly the last I/O (nonrot) */ + { "non_rotating_linear", KSTAT_DATA_UINT64 }, + /* New I/O requires random seek (nonrot) */ + { "non_rotating_seek", KSTAT_DATA_UINT64 }, + /* Preferred child vdev found */ + { "preferred_found", KSTAT_DATA_UINT64 }, + /* Preferred child vdev not found or equal load */ + { "preferred_not_found", KSTAT_DATA_UINT64 }, + +}; + +#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64) +#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val) +#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1) + +void +vdev_mirror_stat_init(void) +{ + mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats", + "misc", KSTAT_TYPE_NAMED, + sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (mirror_ksp != NULL) { + mirror_ksp->ks_data = &mirror_stats; + kstat_install(mirror_ksp); + } +} + +void +vdev_mirror_stat_fini(void) +{ + if (mirror_ksp != NULL) { + kstat_delete(mirror_ksp); + mirror_ksp = NULL; + } +} + +/* * Virtual device vector for mirroring. */ @@ -45,48 +104,182 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; int mm_children; int mm_resilvering; - int mm_preferred; int mm_root; - mirror_child_t mm_child[1]; + mirror_child_t mm_child[]; } mirror_map_t; int vdev_mirror_shift = 21; +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results + * as it will direct more reads to the non-rotating vdevs which are more likely + * to have a higher performance. + */ + +/* Rotating media load calculation configuration. */ +static int zfs_vdev_mirror_rotating_inc = 0; +static int zfs_vdev_mirror_rotating_seek_inc = 5; +static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024; + +/* Non-rotating media load calculation configuration. 
*/ +static int zfs_vdev_mirror_non_rotating_inc = 0; +static int zfs_vdev_mirror_non_rotating_seek_inc = 1; + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof (int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_resilvering = resilvering; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return (mm); +} + static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { - vdev_mirror_map_free, - zio_vsd_default_cksum_report + .vsd_free = vdev_mirror_map_free, + .vsd_cksum_report = zio_vsd_default_cksum_report }; +static int +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) +{ + uint64_t last_offset; + int64_t offset_diff; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Fix zio_offset for leaf vdevs */ + if (vd->vdev_ops->vdev_op_leaf) + zio_offset += VDEV_LABEL_START_SIZE; + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + last_offset = vdev_queue_last_offset(vd); + + if (vd->vdev_nonrot) { + /* Non-rotating media. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear); + return (load + zfs_vdev_mirror_non_rotating_inc); + } + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O's can be aggregated into fewer operations on + * the device, thus avoiding unnecessary per-command overhead + * and boosting performance. + */ + MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek); + return (load + zfs_vdev_mirror_non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_linear); + return (load + zfs_vdev_mirror_rotating_inc); + } + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O issued to this vdev as they should incur less + * of a seek increment. + */ + offset_diff = (int64_t)(last_offset - zio_offset); + if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_offset); + return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); + } + + /* Apply the full seek increment to all other I/O's. */ + MIRROR_BUMP(vdev_mirror_stat_rotating_seek); + return (load + zfs_vdev_mirror_rotating_seek_inc); +} + static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) +vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; - int c, d; + int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; + dsl_scan_t *scn = NULL; dva_t dva_copy[SPA_DVAS_PER_BP]; - c = BP_GET_NDVAS(zio->io_bp); + if (spa->spa_dsl_pool != NULL) { + scn = spa->spa_dsl_pool->dp_scan; + } + /* + * The sequential scrub code sorts and issues all DVAs + * of a bp separately. 
Each of these IOs includes all + * original DVA copies so that repairs can be performed + * in the event of an error, but we only actually want + * to check the first DVA since the others will be + * checked by their respective sorted IOs. Only if we + * hit an error will we try all DVAs upon retrying. + * + * Note: This check is safe even if the user switches + * from a legacy scrub to a sequential one in the middle + * of processing, since scn_is_sorted isn't updated until + * all outstanding IOs from the previous scrub pass + * complete. + */ + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + !(zio->io_flags & ZIO_FLAG_IO_RETRY) && + scn != NULL && + scn->scn_is_sorted && + dsl_scan_scrubbing(spa->spa_dsl_pool)) { + c = 1; + } else { + c = BP_GET_NDVAS(zio->io_bp); + } /* * If we do not trust the pool config, some DVAs might be @@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio) } } - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_resilvering = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. - */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - int replacing; - - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering @@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio) * automatically removed from the pool after the user replaces * the device that originally failed. */ - replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - /* - * If a spa load is in progress, then spa_dsl_pool may be - * uninitialized. But we shouldn't be resilvering during a spa - * load anyway. - */ - if (replacing && - (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) && - dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) { - mm->mm_resilvering = B_TRUE; - } else { - mm->mm_resilvering = B_FALSE; - } - - mm->mm_preferred = mm->mm_resilvering ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; - mm->mm_root = B_FALSE; - + boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) && + spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && + dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); + mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, + B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; @@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio) } mutex_exit(&zio->io_lock); } + abd_free(zio->io_abd); mc->mc_error = zio->io_error; @@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio) } /* + * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. 
If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int p) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int preferred; + int c; + + preferred = mm->mm_preferred[p]; + for (p--; p >= 0; p--) { + c = mm->mm_preferred[p]; + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, p)); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. + */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * prefering vdevs based on determined load. + * * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. */ @@ -284,43 +490,64 @@ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; uint64_t txg = zio->io_txg; - int i, c; + int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_readable() returning B_FALSE), don't even try. - */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (!vdev_readable(mc->mc_vd)) { + + if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) - return (c); - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_found); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_not_found); + return (vdev_mirror_preferred_child_randomize(zio)); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ - for (c = 0; c < mm->mm_children; c++) + for (c = 0; c < mm->mm_children; c++) { if (!mm->mm_child[c].mc_tried) return (c); + } /* * Every child failed. There's no place left to look. 
@@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_alloc(zio); + mm = vdev_mirror_map_init(zio); if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 0643c05f57..a89e06ebbf 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd) avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + + vq->vq_last_offset = 0; } void @@ -701,7 +703,7 @@ again: */ tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; - search.io_offset = vq->vq_last_offset + 1; + search.io_offset = vq->vq_last_offset - 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) @@ -729,7 +731,7 @@ again: } vdev_queue_pending_add(vq, zio); - vq->vq_last_offset = zio->io_offset; + vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } @@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) */ tree = vdev_queue_class_tree(vq, zio->io_priority); if (avl_find(tree, zio, NULL) == zio) { + spa_t *spa = zio->io_spa; + zio_priority_t oldpri = zio->io_priority; + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + + mutex_enter(&spa->spa_iokstat_lock); + ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0); + spa->spa_queue_stats[oldpri].spa_queued--; + spa->spa_queue_stats[zio->io_priority].spa_queued++; + mutex_exit(&spa->spa_iokstat_lock); } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } + +/* + * As these two methods are only used for load calculations we're not + * concerned if we get an incorrect value on 32bit platforms due to lack of + * vq_lock mutex use here, instead we prefer to keep it lock free for + * performance. + */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_last_offset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_last_offset); +} diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 26f59af996..71b859bc3d 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -102,7 +102,7 @@ static int inject_next_id = 1; * Returns true if the given record matches the I/O in progress. */ static boolean_t -zio_match_handler(zbookmark_phys_t *zb, uint64_t type, +zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* @@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - error == record->zi_error) + (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + error == record->zi_error) { return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); + } return (B_FALSE); } @@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) rw_exit(&inject_lock); } + +/* + * If this is a physical I/O for a vdev child determine which DVA it is + * for. We iterate backwards through the DVAs matching on the offset so + * that we end up with ZI_NO_DVA (-1) if we don't find a match. 
+ */ +static int +zio_match_dva(zio_t *zio) +{ + int i = ZI_NO_DVA; + + if (zio->io_bp != NULL && zio->io_vd != NULL && + zio->io_child_type == ZIO_CHILD_VDEV) { + for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { + dva_t *dva = &zio->io_bp->blk_dva[i]; + uint64_t off = DVA_GET_OFFSET(dva); + vdev_t *vd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(dva)); + + /* Compensate for vdev label added to leaves */ + if (zio->io_vd->vdev_ops->vdev_op_leaf) + off += VDEV_LABEL_START_SIZE; + + if (zio->io_vd == vd && zio->io_offset == off) + break; + } + } + + return (i); +} + + /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. @@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error) handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; - /* If this handler matches, return EIO */ + /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { + zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } |
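The dsl_scan.c hunks above replace the fixed single-DVA fields of `scan_io_t` with a trailing `dva_t` array and allocate each sio from one of `SPA_DVAS_PER_BP` size-specific kmem caches; `SIO_GET_MUSED()` then charges `q_sio_memused` for exactly the bytes each sio occupies. The following is a minimal user-space sketch of that sizing arithmetic, not the kernel code: `malloc`/`calloc` stands in for `kmem_cache_alloc`, and the struct layout is cut down to the fields needed to show the math.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SPA_DVAS_PER_BP 3               /* ZFS block pointers carry up to three DVAs */

typedef struct { uint64_t dva_word[2]; } dva_t;

/* Cut-down analogue of scan_io_t: fixed header plus a variable DVA tail. */
typedef struct scan_io {
	uint64_t sio_blk_prop;
	uint64_t sio_phys_birth;
	uint64_t sio_birth;
	uint32_t sio_nr_dvas;
	uint32_t sio_flags;
	dva_t    sio_dva[];             /* 1..SPA_DVAS_PER_BP entries */
} scan_io_t;

/* Same arithmetic as SIO_GET_MUSED(): header size plus the DVAs actually stored. */
static size_t
sio_size(unsigned nr_dvas)
{
	return (sizeof (scan_io_t) + nr_dvas * sizeof (dva_t));
}

/* Stand-in for sio_alloc(); the kernel instead picks sio_cache[nr_dvas - 1]. */
static scan_io_t *
sio_alloc(unsigned nr_dvas)
{
	scan_io_t *sio = calloc(1, sio_size(nr_dvas));
	sio->sio_nr_dvas = nr_dvas;
	return (sio);
}

int
main(void)
{
	for (unsigned n = 1; n <= SPA_DVAS_PER_BP; n++) {
		scan_io_t *sio = sio_alloc(n);
		/* This is the amount q_sio_memused is charged per queued sio. */
		printf("sio with %u DVA(s): %zu bytes\n",
		    sio->sio_nr_dvas, sio_size(n));
		free(sio);
	}
	return (0);
}
```

Because every queued sio now records its own DVA count, `dsl_scan_should_clear()` can sum `q_sio_memused` directly instead of assuming a fixed `sizeof (scan_io_t)` per entry.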
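The `count_block()` change above charges only the first DVA against `spa_scan_pass_issued` when the scrub is sorted, because each sorted-scrub zio carries every copy for self-healing but only issues the first one. A small self-contained sketch of that accounting decision, with hypothetical names (`scan_pass_issued_delta`, `blkptr_sizes_t`) standing in for the real blkptr accessors:

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for walking DVA_GET_ASIZE() over a bp's DVAs. */
typedef struct {
	int      ndvas;
	uint64_t asize[3];
} blkptr_sizes_t;

/*
 * Sorted scrubs queue one zio per DVA (each carrying all copies), so only
 * the first copy is counted per zio; legacy scrubs issue a single zio that
 * reads every DVA, so all copies are counted.
 */
static uint64_t
scan_pass_issued_delta(const blkptr_sizes_t *bp, int is_sorted)
{
	if (is_sorted)
		return (bp->asize[0]);

	uint64_t total = 0;
	for (int i = 0; i < bp->ndvas; i++)
		total += bp->asize[i];
	return (total);
}

int
main(void)
{
	blkptr_sizes_t bp = { .ndvas = 3, .asize = { 4096, 4096, 4096 } };
	printf("sorted: %llu bytes, legacy: %llu bytes\n",
	    (unsigned long long)scan_pass_issued_delta(&bp, 1),
	    (unsigned long long)scan_pass_issued_delta(&bp, 0));
	return (0);
}
```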
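`vdev_mirror_load()` in the vdev_mirror.c hunk above biases read placement by each child's pending queue depth plus a device-type-dependent seek penalty, and the child (or children) with the lowest load become the preferred set. Below is a user-space sketch of that calculation under the default tunables; it is an analogue, not the kernel function, so the queue length and last issued offset are passed in rather than read from the vdev queue, and the root-vdev and label-offset special cases are omitted.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Default tunables from vdev_mirror.c. */
static int     rotating_inc          = 0;
static int     rotating_seek_inc     = 5;
static int64_t rotating_seek_offset  = 1 * 1024 * 1024;
static int     non_rotating_inc      = 0;
static int     non_rotating_seek_inc = 1;

/* Simplified analogue of vdev_mirror_load(): lower load wins. */
static int
mirror_load(int nonrot, int queue_length, uint64_t last_offset, uint64_t zio_offset)
{
	int load = queue_length;            /* vdev_queue_length() */

	if (nonrot) {
		if (last_offset == zio_offset)
			return (load + non_rotating_inc);
		return (load + non_rotating_seek_inc);
	}

	if (last_offset == zio_offset)
		return (load + rotating_inc);           /* sequential: no penalty */

	int64_t diff = (int64_t)(last_offset - zio_offset);
	if (llabs(diff) < rotating_seek_offset)
		return (load + rotating_seek_inc / 2);  /* short seek: half penalty */

	return (load + rotating_seek_inc);              /* long seek: full penalty */
}

int
main(void)
{
	/* Two mirror children: an idle HDD far from the I/O vs. a lightly loaded SSD. */
	int hdd = mirror_load(0, 0, 0, 128ULL << 20);
	int ssd = mirror_load(1, 2, 0, 128ULL << 20);
	printf("hdd load = %d, ssd load = %d -> read from %s\n",
	    hdd, ssd, hdd <= ssd ? "hdd" : "ssd");
	return (0);
}
```

This is also why the new `vdev_queue_length()` and `vdev_queue_last_offset()` accessors exist, and why `vq_last_offset` now records the end of the last I/O rather than its start: the load heuristic wants to know whether the candidate read continues exactly where the device left off.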
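Finally, the zio_inject.c hunk lets a fault-injection record target specific copies of a block: `zi_dvas` is a bitmask of DVA indexes, `zio_match_dva()` recovers which DVA a child I/O corresponds to by matching vdev and offset, and unmatched I/Os fall back to `ZI_NO_DVA`. A small sketch of the bitmask test added to `zio_match_handler()`, with a hypothetical helper name and an explicit `ZI_NO_DVA` guard added for clarity:

```c
#include <stdint.h>
#include <stdio.h>

#define ZI_NO_DVA (-1)

/*
 * zi_dvas is a bitmask of DVA indexes the handler should fire on;
 * 0 means "match any DVA", which preserves the old behaviour.
 */
static int
dva_matches(uint64_t zi_dvas, int dva)
{
	if (zi_dvas == 0)
		return (1);
	if (dva == ZI_NO_DVA)
		return (0);
	return ((zi_dvas & (1ULL << dva)) != 0);
}

int
main(void)
{
	uint64_t only_second_copy = 1ULL << 1;   /* inject only on DVA 1 */

	for (int dva = 0; dva < 3; dva++)
		printf("dva %d: %s\n", dva,
		    dva_matches(only_second_copy, dva) ? "inject" : "pass");
	printf("unmatched child I/O: %s\n",
	    dva_matches(only_second_copy, ZI_NO_DVA) ? "inject" : "pass");
	return (0);
}
```

Together with the mirror changes, this makes it possible to corrupt one specific copy in a test and verify that a sorted scrub still repairs it from the remaining DVAs.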
