author     Tom Caputi <tcaputi@datto.com>    2019-03-15 17:14:31 -0400
committer  Toomas Soome <tsoome@me.com>      2019-05-13 23:49:15 +0300
commit     12a8814c13fbb1d6d58616cf090ea5815dc107f9 (patch)
tree       3f1b36f6702e76bf3b0636d6c3d9a8943d06470c /usr/src/uts/common/fs/zfs
parent     a3874b8b1fe5103fc1f961609557c0587435fec0 (diff)
10566 Multiple DVA Scrubbing Fix
Portions contributed by: Toomas Soome <tsoome@me.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts/common/fs/zfs')
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_scan.c        199
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c          2
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c          2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h           4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h          3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h     1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h     4
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c             22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c        10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c         3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c     365
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c       33
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_inject.c       42
13 files changed, 546 insertions, 144 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 00bd1498a2..ca82195178 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -249,24 +249,43 @@ typedef enum {
*/
typedef struct scan_io {
/* fields from blkptr_t */
- uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
- uint32_t sio_asize;
+ uint32_t sio_nr_dvas;
/* fields from zio_t */
- int sio_flags;
+ uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
- avl_node_t sio_addr_node; /* link into issueing queue */
+ avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow for the variable number of elements.
+ */
+ dva_t sio_dva[0];
} scan_io_t;
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
@@ -275,6 +294,7 @@ struct dsl_scan_io_queue {
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
void
scan_init(void)
@@ -328,14 +368,22 @@ scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) sprintf(name, "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
}
void
scan_fini(void)
{
- kmem_cache_destroy(sio_cache);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
}
static inline boolean_t
@@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
}
static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
}
int
@@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
+ /* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
+ sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
break;
}
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
- scan_io_t srch_sio, *sio, *next_sio;
+ scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
@@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- srch_sio.sio_offset = rs->rs_start;
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+ while (sio != NULL &&
+ SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
- bytes_issued += sio->sio_asize;
+ bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
@@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
+ SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
@@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg)
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
- seg_start = first_sio->sio_offset;
+ seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
@@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
}
/*
@@ -3426,7 +3501,7 @@ static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
@@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
@@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y)
* based on LBA-order (from lowest to highest).
*/
static int
-io_addr_compare(const void *x, const void *y)
+sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
+ return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
+ q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
- avl_create(&q->q_sios_by_addr, io_addr_compare,
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
return (q);
@@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_free(sio, sizeof (*sio));
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
}
+ ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
@@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
+ scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
@@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
return;
}
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
@@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
* be done with issuing the zio's it gathered and will
* signal us.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio != NULL) {
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
+ sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
mutex_exit(q_lock);
}
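
The scan_io_t changes above replace a single fixed-size kmem cache with one
cache per possible DVA count, so each queued block carries all of its DVAs
for self-healing while only paying for the copies it actually has. Below is
a minimal userland sketch of that size-class pattern, assuming malloc()
stands in for kmem_cache_alloc() and using simplified stand-ins for dva_t
and SPA_DVAS_PER_BP; sio_size() is a hypothetical helper mirroring
SIO_GET_MUSED().

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define	SPA_DVAS_PER_BP	3

typedef struct { unsigned long long dva_word[2]; } dva_t;

typedef struct scan_io {
	unsigned int sio_nr_dvas;	/* selects the size class */
	dva_t sio_dva[];		/* flexible array; must be last */
} scan_io_t;

/* Per-size-class allocation size, mirroring SIO_GET_MUSED() above */
static size_t
sio_size(unsigned int nr_dvas)
{
	return (sizeof (scan_io_t) + nr_dvas * sizeof (dva_t));
}

static scan_io_t *
sio_alloc(unsigned int nr_dvas)
{
	assert(nr_dvas > 0 && nr_dvas <= SPA_DVAS_PER_BP);

	scan_io_t *sio = calloc(1, sio_size(nr_dvas));
	assert(sio != NULL);
	sio->sio_nr_dvas = nr_dvas;	/* the free path depends on this */
	return (sio);
}

int
main(void)
{
	/* A triple-copy block needs room for all three DVAs. */
	scan_io_t *sio = sio_alloc(3);

	printf("allocated %zu bytes for %u DVAs\n",
	    sio_size(sio->sio_nr_dvas), sio->sio_nr_dvas);
	free(sio);
	return (0);
}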
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 1c004f87f3..d0b9f6960f 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
- if (metaslab_lba_weighting_enabled) {
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}
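
The metaslab change above only matters for rotating media: the LBA weighting
boost models the higher bandwidth of outer disk tracks, which has no analogue
on solid-state devices, so it is now skipped when vdev_nonrot is set. A
worked example of that boost with hypothetical numbers (100 metaslabs, equal
free space per metaslab):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t space = 1ULL << 30;	/* hypothetical free bytes per slab */
	uint64_t ms_count = 100;	/* hypothetical metaslabs on the vdev */

	/* Metaslab 0 (outer edge) scores ~2x its free space, the last ~1x. */
	for (uint64_t ms_id = 0; ms_id < ms_count; ms_id += 33) {
		uint64_t weight = space;

		weight = 2 * weight - (ms_id * weight) / ms_count;
		printf("ms %2llu: weight = %.2fx space\n",
		    (unsigned long long)ms_id, (double)weight / (double)space);
	}
	return (0);
}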
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 7a44ac86b0..0a5cec2644 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -2036,6 +2036,7 @@ spa_init(int mode)
dmu_init();
zil_init();
vdev_cache_stat_init();
+ vdev_mirror_stat_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -2052,6 +2053,7 @@ spa_fini(void)
spa_evict_all();
vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 4ff552447e..53b9e4ef5d 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
+/* vdev mirror */
+extern void vdev_mirror_stat_init(void);
+extern void vdev_mirror_stat_fini(void);
+
/* Initialization and termination */
extern void spa_init(int flags);
extern void spa_fini(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 0c0bc874c1..e21989641b 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 4e1b09c27d..a91927dbb6 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -225,6 +225,7 @@ struct vdev {
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 824d1d8bb7..70916c45b7 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -318,13 +318,15 @@ typedef struct zinject_record {
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
- uint32_t zi_pad;
+ uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
#define ZINJECT_FLUSH_ARC 0x2
#define ZINJECT_UNLOAD_SPA 0x4
+#define ZI_NO_DVA (-1)
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index c7dca83777..4971e9e79e 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1478,19 +1478,27 @@ vdev_open_children(vdev_t *vd)
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
+retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
- return;
+ } else {
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ if (tq == NULL)
+ goto retry_sync;
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+
+ taskq_destroy(tq);
}
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- for (int c = 0; c < children; c++)
- VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
- TQ_SLEEP) != TASKQID_INVALID);
+ vd->vdev_nonrot = B_TRUE;
- taskq_destroy(tq);
+ for (int c = 0; c < children; c++)
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}
/*
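
The aggregation at the end of vdev_open_children() above means a top-level
vdev advertises itself as non-rotational only when every child does, so a
mixed mirror of SSD and HDD is treated as rotating. A tiny sketch of that
AND-reduction, with a hypothetical vdev_stub_t standing in for vdev_t:

#include <stdio.h>

typedef struct { int nonrot; } vdev_stub_t;	/* stand-in for vdev_t */

int
main(void)
{
	/* hypothetical children: SSD, HDD, SSD */
	vdev_stub_t child[] = { { 1 }, { 0 }, { 1 } };
	int nonrot = 1;

	for (int c = 0; c < 3; c++)
		nonrot &= child[c].nonrot;

	/* 0: one rotating child makes the whole top-level rotating */
	printf("top-level vdev_nonrot = %d\n", nonrot);
	return (0);
}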
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 93462ee2ba..3f137c5d59 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -606,6 +606,16 @@ skip_open:
*/
vd->vdev_nowritecache = B_FALSE;
+ /* Inform the ZIO pipeline that we are non-rotational */
+ vd->vdev_nonrot = B_FALSE;
+ if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state")) {
+ if (ldi_prop_get_int(dvd->vd_lh,
+ LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state", B_FALSE) != 0)
+ vd->vdev_nonrot = B_TRUE;
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index 3aaebe8505..806716200a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vattr_t vattr;
int error;
+ /* Rotational optimizations only make sense on block devices */
+ vd->vdev_nonrot = B_TRUE;
+
/*
* We must have a pathname, and it must be absolute.
*/
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index f489bb1967..f654bf9afb 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -38,6 +38,65 @@
#include <sys/fs/zfs.h>
/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
* Virtual device vector for mirroring.
*/
@@ -45,48 +104,182 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
int mm_children;
int mm_resilvering;
- int mm_preferred;
int mm_root;
- mirror_child_t mm_child[1];
+ mirror_child_t mm_child[];
} mirror_map_t;
int vdev_mirror_shift = 21;
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which are
+ * likely to perform better.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
- vdev_mirror_map_free,
- zio_vsd_default_cksum_report
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
};
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering (i.e.
+ * vdev_resilver_txg != 0): in testing, overall performance was
+ * slightly worse when resilvering with that special case than
+ * without it.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
- int c, d;
+ int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = NULL;
dva_t dva_copy[SPA_DVAS_PER_BP];
- c = BP_GET_NDVAS(zio->io_bp);
+ if (spa->spa_dsl_pool != NULL) {
+ scn = spa->spa_dsl_pool->dp_scan;
+ }
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ scn != NULL &&
+ scn->scn_is_sorted &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool)) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
/*
* If we do not trust the pool config, some DVAs might be
@@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio)
}
}
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_resilvering = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
- int replacing;
-
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
/*
* If we are resilvering, then we should handle scrub reads
* differently; we shouldn't issue them to the resilvering
@@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio)
* automatically removed from the pool after the user replaces
* the device that originally failed.
*/
- replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- /*
- * If a spa load is in progress, then spa_dsl_pool may be
- * uninitialized. But we shouldn't be resilvering during a spa
- * load anyway.
- */
- if (replacing &&
- (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
- dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
- mm->mm_resilvering = B_TRUE;
- } else {
- mm->mm_resilvering = B_FALSE;
- }
-
- mm->mm_preferred = mm->mm_resilvering ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
@@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio)
}
mutex_exit(&zio->io_lock);
}
+
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
@@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio)
}
/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+ * which could lead to wear leveling issues on SSDs, we
+ * use the I/O offset as a pseudo-random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * preferring vdevs based on determined load.
+ *
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
*/
@@ -284,43 +490,64 @@ static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
uint64_t txg = zio->io_txg;
- int i, c;
+ int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_readable() returning B_FALSE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (!vdev_readable(mc->mc_vd)) {
+
+ if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
- return (c);
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
}
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
- for (c = 0; c < mm->mm_children; c++)
+ for (c = 0; c < mm->mm_children; c++) {
if (!mm->mm_child[c].mc_tried)
return (c);
+ }
/*
* Every child failed. There's no place left to look.
@@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio)
mirror_child_t *mc;
int c, children;
- mm = vdev_mirror_map_alloc(zio);
+ mm = vdev_mirror_map_init(zio);
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
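
The selection logic above replaces the old fixed preferred child with a
load-derived choice: every readable child gets a score from
vdev_mirror_load(), ties at the lowest score collect into mm_preferred[],
and the I/O offset breaks ties deterministically so reads spread across
equally loaded devices. A userland sketch of that lowest-load selection
using hypothetical queue-depth loads (pick_child() is an illustrative
helper, not part of the patch):

#include <stdio.h>
#include <limits.h>

#define	VDEV_MIRROR_SHIFT	21

static int
pick_child(const int *load, int children, unsigned long long io_offset)
{
	int preferred[8];	/* assumes children <= 8 for the sketch */
	int preferred_cnt = 0;
	int lowest = INT_MAX;

	/* Collect every child tied at the lowest load. */
	for (int c = 0; c < children; c++) {
		if (load[c] > lowest)
			continue;
		if (load[c] < lowest) {
			lowest = load[c];
			preferred_cnt = 0;	/* new best; drop old ties */
		}
		preferred[preferred_cnt++] = c;
	}

	if (preferred_cnt == 1)
		return (preferred[0]);

	/* Offset-based tie-break, as in preferred_child_randomize() above. */
	return (preferred[(io_offset >> VDEV_MIRROR_SHIFT) % preferred_cnt]);
}

int
main(void)
{
	int load[3] = { 1, 7, 1 };	/* two idle SSDs, one busy HDD */

	for (unsigned long long off = 0; off < 4; off++)
		printf("offset %3llu MiB -> child %d\n", off * 2,
		    pick_child(load, 3, off << VDEV_MIRROR_SHIFT));
	return (0);
}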
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 0643c05f57..a89e06ebbf 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+
+ vq->vq_last_offset = 0;
}
void
@@ -701,7 +703,7 @@ again:
*/
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
- search.io_offset = vq->vq_last_offset + 1;
+ search.io_offset = vq->vq_last_offset - 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
@@ -729,7 +731,7 @@ again:
}
vdev_queue_pending_add(vq, zio);
- vq->vq_last_offset = zio->io_offset;
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
return (zio);
}
@@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
+ spa_t *spa = zio->io_spa;
+ zio_priority_t oldpri = zio->io_priority;
+
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0);
+ spa->spa_queue_stats[oldpri].spa_queued--;
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ mutex_exit(&spa->spa_iokstat_lock);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
+
+/*
+ * As these two methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the
+ * lack of vq_lock mutex use here; instead we prefer to keep it lock-free
+ * for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index 26f59af996..71b859bc3d 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -102,7 +102,7 @@ static int inject_next_id = 1;
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
@@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
return (record->zi_freq == 0 ||
spa_get_random(100) < record->zi_freq);
+ }
return (B_FALSE);
}
@@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
rw_exit(&inject_lock);
}
+
+/*
+ * If this is a physical I/O for a vdev child, determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
@@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
- /* If this handler matches, return EIO */
+ /* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
+ zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}
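
zi_dvas in the zinject record is a bitmask of DVA indices the handler should
match: zero preserves the old match-any behavior, and ZI_NO_DVA (-1) marks
I/Os that zio_match_dva() could not attribute to a DVA. A sketch of that
mask test, factored into a hypothetical dva_matches() helper that rejects
the ZI_NO_DVA case before shifting:

#include <stdio.h>
#include <stdint.h>

#define	ZI_NO_DVA	(-1)

static int
dva_matches(uint64_t zi_dvas, int dva)
{
	if (zi_dvas == 0)		/* no mask: match any DVA */
		return (1);
	if (dva == ZI_NO_DVA)		/* not attributable to a DVA */
		return (0);
	return ((zi_dvas & (1ULL << dva)) != 0);
}

int
main(void)
{
	uint64_t mask = 1ULL << 1;	/* inject only on the second DVA */

	for (int dva = ZI_NO_DVA; dva < 3; dva++)
		printf("dva %2d -> %s\n", dva,
		    dva_matches(mask, dva) ? "match" : "skip");
	return (0);
}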