author     Tom Caputi <tcaputi@datto.com>    2019-03-15 17:14:31 -0400
committer  Toomas Soome <tsoome@me.com>      2019-05-13 23:49:15 +0300
commit     12a8814c13fbb1d6d58616cf090ea5815dc107f9 (patch)
tree       3f1b36f6702e76bf3b0636d6c3d9a8943d06470c /usr/src/uts/common/fs/zfs
parent     a3874b8b1fe5103fc1f961609557c0587435fec0 (diff)
10566 Multiple DVA Scrubbing Fix
Portions contributed by: Toomas Soome <tsoome@me.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts/common/fs/zfs')
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_scan.c        199
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c          2
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c          2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h           4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h          3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h     1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h     4
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c             22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c        10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c         3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c     365
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c       33
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_inject.c       42
13 files changed, 546 insertions, 144 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 00bd1498a2..ca82195178 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -249,24 +249,43 @@ typedef enum {
*/
typedef struct scan_io {
/* fields from blkptr_t */
- uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
- uint32_t sio_asize;
+ uint32_t sio_nr_dvas;
/* fields from zio_t */
- int sio_flags;
+ uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
- avl_node_t sio_addr_node; /* link into issueing queue */
+ avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow for the variable number of elements.
+ */
+ dva_t sio_dva[0];
} scan_io_t;
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
@@ -275,6 +294,7 @@ struct dsl_scan_io_queue {
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
void
scan_init(void)
@@ -328,14 +368,22 @@ scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) sprintf(name, "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
}
void
scan_fini(void)
{
- kmem_cache_destroy(sio_cache);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
}
static inline boolean_t
@@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
}
static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
}
int
@@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
+ /* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
+ sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
break;
}
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
- scan_io_t srch_sio, *sio, *next_sio;
+ scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
@@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- srch_sio.sio_offset = rs->rs_start;
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+ while (sio != NULL &&
+ SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
- bytes_issued += sio->sio_asize;
+ bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
@@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
+ SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
@@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg)
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
- seg_start = first_sio->sio_offset;
+ seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
@@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
}
/*
@@ -3426,7 +3501,7 @@ static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
@@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
@@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y)
* based on LBA-order (from lowest to highest).
*/
static int
-io_addr_compare(const void *x, const void *y)
+sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
+ return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
+ q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
- avl_create(&q->q_sios_by_addr, io_addr_compare,
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
return (q);
@@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_free(sio, sizeof (*sio));
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
}
+ ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
@@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
+ scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
@@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
return;
}
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
@@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
* be done with issuing the zio's it gathered and will
* signal us.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio != NULL) {
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
+ sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
mutex_exit(q_lock);
}
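
The scan_io_t changes above replace a single fixed-size kmem cache with one
cache per possible DVA count, so each queued block carries all of its DVAs
for self-healing while only paying for the copies it actually has. Below is
a minimal userland sketch of that size-class pattern, assuming malloc()
stands in for kmem_cache_alloc() and using simplified stand-ins for dva_t
and SPA_DVAS_PER_BP; sio_size() is a hypothetical helper mirroring
SIO_GET_MUSED().

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define	SPA_DVAS_PER_BP	3

typedef struct { unsigned long long dva_word[2]; } dva_t;

typedef struct scan_io {
	unsigned int sio_nr_dvas;	/* selects the size class */
	dva_t sio_dva[];		/* flexible array; must be last */
} scan_io_t;

/* Per-size-class allocation size, mirroring SIO_GET_MUSED() above */
static size_t
sio_size(unsigned int nr_dvas)
{
	return (sizeof (scan_io_t) + nr_dvas * sizeof (dva_t));
}

static scan_io_t *
sio_alloc(unsigned int nr_dvas)
{
	assert(nr_dvas > 0 && nr_dvas <= SPA_DVAS_PER_BP);

	scan_io_t *sio = calloc(1, sio_size(nr_dvas));
	assert(sio != NULL);
	sio->sio_nr_dvas = nr_dvas;	/* the free path depends on this */
	return (sio);
}

int
main(void)
{
	/* A triple-copy block needs room for all three DVAs. */
	scan_io_t *sio = sio_alloc(3);

	printf("allocated %zu bytes for %u DVAs\n",
	    sio_size(sio->sio_nr_dvas), sio->sio_nr_dvas);
	free(sio);
	return (0);
}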
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 1c004f87f3..d0b9f6960f 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
- if (metaslab_lba_weighting_enabled) {
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}
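
The metaslab change above only matters for rotating media: the LBA weighting
boost models the higher bandwidth of outer disk tracks, which has no analogue
on solid-state devices, so it is now skipped when vdev_nonrot is set. A
worked example of that boost with hypothetical numbers (100 metaslabs, equal
free space per metaslab):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t space = 1ULL << 30;	/* hypothetical free bytes per slab */
	uint64_t ms_count = 100;	/* hypothetical metaslabs on the vdev */

	/* Metaslab 0 (outer edge) scores ~2x its free space, the last ~1x. */
	for (uint64_t ms_id = 0; ms_id < ms_count; ms_id += 33) {
		uint64_t weight = space;

		weight = 2 * weight - (ms_id * weight) / ms_count;
		printf("ms %2llu: weight = %.2fx space\n",
		    (unsigned long long)ms_id, (double)weight / (double)space);
	}
	return (0);
}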
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 7a44ac86b0..0a5cec2644 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -2036,6 +2036,7 @@ spa_init(int mode)
dmu_init();
zil_init();
vdev_cache_stat_init();
+ vdev_mirror_stat_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -2052,6 +2053,7 @@ spa_fini(void)
spa_evict_all();
vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 4ff552447e..53b9e4ef5d 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
+/* vdev mirror */
+extern void vdev_mirror_stat_init(void);
+extern void vdev_mirror_stat_fini(void);
+
/* Initialization and termination */
extern void spa_init(int flags);
extern void spa_fini(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 0c0bc874c1..e21989641b 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 4e1b09c27d..a91927dbb6 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -225,6 +225,7 @@ struct vdev {
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 824d1d8bb7..70916c45b7 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -318,13 +318,15 @@ typedef struct zinject_record {
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
- uint32_t zi_pad;
+ uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
#define ZINJECT_FLUSH_ARC 0x2
#define ZINJECT_UNLOAD_SPA 0x4
+#define ZI_NO_DVA (-1)
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index c7dca83777..4971e9e79e 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1478,19 +1478,27 @@ vdev_open_children(vdev_t *vd)
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
+retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
- return;
+ } else {
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ if (tq == NULL)
+ goto retry_sync;
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+
+ taskq_destroy(tq);
}
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- for (int c = 0; c < children; c++)
- VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
- TQ_SLEEP) != TASKQID_INVALID);
+ vd->vdev_nonrot = B_TRUE;
- taskq_destroy(tq);
+ for (int c = 0; c < children; c++)
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}
/*
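
The aggregation at the end of vdev_open_children() above means a top-level
vdev advertises itself as non-rotational only when every child does, so a
mixed mirror of SSD and HDD is treated as rotating. A tiny sketch of that
AND-reduction, with a hypothetical vdev_stub_t standing in for vdev_t:

#include <stdio.h>

typedef struct { int nonrot; } vdev_stub_t;	/* stand-in for vdev_t */

int
main(void)
{
	/* hypothetical children: SSD, HDD, SSD */
	vdev_stub_t child[] = { { 1 }, { 0 }, { 1 } };
	int nonrot = 1;

	for (int c = 0; c < 3; c++)
		nonrot &= child[c].nonrot;

	/* 0: one rotating child makes the whole top-level rotating */
	printf("top-level vdev_nonrot = %d\n", nonrot);
	return (0);
}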
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 93462ee2ba..3f137c5d59 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -606,6 +606,16 @@ skip_open:
*/
vd->vdev_nowritecache = B_FALSE;
+ /* Inform the ZIO pipeline that we are non-rotational */
+ vd->vdev_nonrot = B_FALSE;
+ if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state")) {
+ if (ldi_prop_get_int(dvd->vd_lh,
+ LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state", B_FALSE) != 0)
+ vd->vdev_nonrot = B_TRUE;
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index 3aaebe8505..806716200a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vattr_t vattr;
int error;
+ /* Rotational optimizations only make sense on block devices */
+ vd->vdev_nonrot = B_TRUE;
+
/*
* We must have a pathname, and it must be absolute.
*/
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index f489bb1967..f654bf9afb 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -38,6 +38,65 @@
#include <sys/fs/zfs.h>
/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
* Virtual device vector for mirroring.
*/
@@ -45,48 +104,182 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
int mm_children;
int mm_resilvering;
- int mm_preferred;
int mm_root;
- mirror_child_t mm_child[1];
+ mirror_child_t mm_child[];
} mirror_map_t;
int vdev_mirror_shift = 21;
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which are
+ * likely to perform better.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
- vdev_mirror_map_free,
- zio_vsd_default_cksum_report
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
};
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering (i.e.
+ * vdev_resilver_txg != 0): in testing, overall performance was
+ * slightly worse when resilvering with that special case than
+ * without it.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
- int c, d;
+ int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = NULL;
dva_t dva_copy[SPA_DVAS_PER_BP];
- c = BP_GET_NDVAS(zio->io_bp);
+ if (spa->spa_dsl_pool != NULL) {
+ scn = spa->spa_dsl_pool->dp_scan;
+ }
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ scn != NULL &&
+ scn->scn_is_sorted &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool)) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
/*
* If we do not trust the pool config, some DVAs might be
@@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio)
}
}
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_resilvering = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
- int replacing;
-
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
/*
* If we are resilvering, then we should handle scrub reads
* differently; we shouldn't issue them to the resilvering
@@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio)
* automatically removed from the pool after the user replaces
* the device that originally failed.
*/
- replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- /*
- * If a spa load is in progress, then spa_dsl_pool may be
- * uninitialized. But we shouldn't be resilvering during a spa
- * load anyway.
- */
- if (replacing &&
- (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
- dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
- mm->mm_resilvering = B_TRUE;
- } else {
- mm->mm_resilvering = B_FALSE;
- }
-
- mm->mm_preferred = mm->mm_resilvering ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
@@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio)
}
mutex_exit(&zio->io_lock);
}
+
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
@@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio)
}
/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+ * which could lead to wear leveling issues on SSDs, we
+ * use the I/O offset as a pseudo-random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * preferring vdevs based on determined load.
+ *
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
*/
@@ -284,43 +490,64 @@ static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
uint64_t txg = zio->io_txg;
- int i, c;
+ int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_readable() returning B_FALSE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (!vdev_readable(mc->mc_vd)) {
+
+ if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
- return (c);
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
}
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
- for (c = 0; c < mm->mm_children; c++)
+ for (c = 0; c < mm->mm_children; c++) {
if (!mm->mm_child[c].mc_tried)
return (c);
+ }
/*
* Every child failed. There's no place left to look.
@@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio)
mirror_child_t *mc;
int c, children;
- mm = vdev_mirror_map_alloc(zio);
+ mm = vdev_mirror_map_init(zio);
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
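
The selection logic above replaces the old fixed preferred child with a
load-derived choice: every readable child gets a score from
vdev_mirror_load(), ties at the lowest score collect into mm_preferred[],
and the I/O offset breaks ties deterministically so reads spread across
equally loaded devices. A userland sketch of that lowest-load selection
using hypothetical queue-depth loads (pick_child() is an illustrative
helper, not part of the patch):

#include <stdio.h>
#include <limits.h>

#define	VDEV_MIRROR_SHIFT	21

static int
pick_child(const int *load, int children, unsigned long long io_offset)
{
	int preferred[8];	/* assumes children <= 8 for the sketch */
	int preferred_cnt = 0;
	int lowest = INT_MAX;

	/* Collect every child tied at the lowest load. */
	for (int c = 0; c < children; c++) {
		if (load[c] > lowest)
			continue;
		if (load[c] < lowest) {
			lowest = load[c];
			preferred_cnt = 0;	/* new best; drop old ties */
		}
		preferred[preferred_cnt++] = c;
	}

	if (preferred_cnt == 1)
		return (preferred[0]);

	/* Offset-based tie-break, as in preferred_child_randomize() above. */
	return (preferred[(io_offset >> VDEV_MIRROR_SHIFT) % preferred_cnt]);
}

int
main(void)
{
	int load[3] = { 1, 7, 1 };	/* two idle SSDs, one busy HDD */

	for (unsigned long long off = 0; off < 4; off++)
		printf("offset %3llu MiB -> child %d\n", off * 2,
		    pick_child(load, 3, off << VDEV_MIRROR_SHIFT));
	return (0);
}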
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 0643c05f57..a89e06ebbf 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+
+ vq->vq_last_offset = 0;
}
void
@@ -701,7 +703,7 @@ again:
*/
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
- search.io_offset = vq->vq_last_offset + 1;
+ search.io_offset = vq->vq_last_offset - 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
@@ -729,7 +731,7 @@ again:
}
vdev_queue_pending_add(vq, zio);
- vq->vq_last_offset = zio->io_offset;
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
return (zio);
}
@@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
+ spa_t *spa = zio->io_spa;
+ zio_priority_t oldpri = zio->io_priority;
+
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0);
+ spa->spa_queue_stats[oldpri].spa_queued--;
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ mutex_exit(&spa->spa_iokstat_lock);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
+
+/*
+ * As these two methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the
+ * lack of vq_lock mutex use here; instead we prefer to keep it lock-free
+ * for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index 26f59af996..71b859bc3d 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -102,7 +102,7 @@ static int inject_next_id = 1;
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
@@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
return (record->zi_freq == 0 ||
spa_get_random(100) < record->zi_freq);
+ }
return (B_FALSE);
}
@@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
rw_exit(&inject_lock);
}
+
+/*
+ * If this is a physical I/O for a vdev child, determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
@@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
- /* If this handler matches, return EIO */
+ /* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
+ zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}
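
zi_dvas in the zinject record is a bitmask of DVA indices the handler should
match: zero preserves the old match-any behavior, and ZI_NO_DVA (-1) marks
I/Os that zio_match_dva() could not attribute to a DVA. A sketch of that
mask test, factored into a hypothetical dva_matches() helper that rejects
the ZI_NO_DVA case before shifting:

#include <stdio.h>
#include <stdint.h>

#define	ZI_NO_DVA	(-1)

static int
dva_matches(uint64_t zi_dvas, int dva)
{
	if (zi_dvas == 0)		/* no mask: match any DVA */
		return (1);
	if (dva == ZI_NO_DVA)		/* not attributable to a DVA */
		return (0);
	return ((zi_dvas & (1ULL << dva)) != 0);
}

int
main(void)
{
	uint64_t mask = 1ULL << 1;	/* inject only on the second DVA */

	for (int dva = ZI_NO_DVA; dva < 3; dva++)
		printf("dva %2d -> %s\n", dva,
		    dva_matches(mask, dva) ? "match" : "skip");
	return (0);
}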