summary | refs | log | tree | commit | diff
path: root/usr/src/uts/common/fs
diff options
context:
space:
mode:
author bonwick <none@none> 2007-11-27 22:58:05 -0800
committer bonwick <none@none> 2007-11-27 22:58:05 -0800
commit e05725b117836db173257fae43fb0746eb857fb5 (patch)
tree dbdd58653bf6cebb69156f3361a6e1d72643b100 /usr/src/uts/common/fs
parent b9bc7f7832704fda46b4d6b04f3f7be1227dc644 (diff)
download illumos-gate-onnv_79.tar.gz
6354519 stack overflow in zfs due to zio pipeline [tag: onnv_79]
6533726 single-threaded checksum & parity calculations limit write bandwidth
6547248 ztest detects a future leak when there is none
6604198 zfs only using single cpu for compression (part II)
--HG--
rename : usr/src/uts/common/fs/zfs/rprwlock.c => deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c
rename : usr/src/uts/common/fs/zfs/sys/rprwlock.h => deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r-- usr/src/uts/common/fs/zfs/metaslab.c 7
-rw-r--r-- usr/src/uts/common/fs/zfs/rprwlock.c 118
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_misc.c 116
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/rprwlock.h 61
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa_impl.h 11
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev.h 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev_impl.h 4
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zio.h 19
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zio_impl.h 144
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev.c 15
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_cache.c 9
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_disk.c 54
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_file.c 35
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_mirror.c 22
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_missing.c 8
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_queue.c 37
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz.c 25
-rw-r--r-- usr/src/uts/common/fs/zfs/zio.c 436
18 files changed, 430 insertions, 694 deletions
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 589dc7e3de..9365dbdb14 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -35,6 +35,7 @@
#include <sys/zio.h>
uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* ==========================================================================
@@ -728,6 +729,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(!DVA_IS_VALID(&dva[d]));
/*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+ if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
+ return (ENOSPC);
+
+ /*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_allocated because
* nothing actually breaks if we miss a few updates -- we just won't
diff --git a/usr/src/uts/common/fs/zfs/rprwlock.c b/usr/src/uts/common/fs/zfs/rprwlock.c
deleted file mode 100644
index 49ae505209..0000000000
--- a/usr/src/uts/common/fs/zfs/rprwlock.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-#include <sys/rprwlock.h>
-
-void
-rprw_init(rprwlock_t *rwl)
-{
- mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL);
- rwl->rw_writer = NULL;
- cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL);
- refcount_create(&rwl->rw_count);
-}
-
-void
-rprw_destroy(rprwlock_t *rwl)
-{
- mutex_destroy(&rwl->rw_lock);
- ASSERT(rwl->rw_writer == NULL);
- cv_destroy(&rwl->rw_cv);
- refcount_destroy(&rwl->rw_count);
-}
-
-void
-rprw_enter_read(rprwlock_t *rwl, void *tag)
-{
- mutex_enter(&rwl->rw_lock);
-
- if (rwl->rw_writer != curthread) {
- while (rwl->rw_writer != NULL)
- cv_wait(&rwl->rw_cv, &rwl->rw_lock);
- }
-
- (void) refcount_add(&rwl->rw_count, tag);
-
- mutex_exit(&rwl->rw_lock);
-}
-
-void
-rprw_enter_write(rprwlock_t *rwl, void *tag)
-{
- mutex_enter(&rwl->rw_lock);
-
- if (rwl->rw_writer != curthread) {
- while (!refcount_is_zero(&rwl->rw_count))
- cv_wait(&rwl->rw_cv, &rwl->rw_lock);
- rwl->rw_writer = curthread;
- }
-
- (void) refcount_add(&rwl->rw_count, tag);
-
- mutex_exit(&rwl->rw_lock);
-}
-
-void
-rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag)
-{
- if (rw == RW_READER)
- rprw_enter_read(rwl, tag);
- else
- rprw_enter_write(rwl, tag);
-}
-
-void
-rprw_exit(rprwlock_t *rwl, void *tag)
-{
- mutex_enter(&rwl->rw_lock);
-
- ASSERT(!refcount_is_zero(&rwl->rw_count));
- ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer);
- if (refcount_remove(&rwl->rw_count, tag) == 0) {
- cv_broadcast(&rwl->rw_cv);
- rwl->rw_writer = NULL; /* OK in either case */
- }
-
- mutex_exit(&rwl->rw_lock);
-}
-
-boolean_t
-rprw_held(rprwlock_t *rwl, krw_t rw)
-{
- boolean_t held;
-
- mutex_enter(&rwl->rw_lock);
- if (rw == RW_WRITER)
- held = (rwl->rw_writer == curthread);
- else
- held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count);
- mutex_exit(&rwl->rw_lock);
-
- return (held);
-}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 6aefb025fc..6b1c28140a 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -144,16 +144,9 @@
* zero. Must be called with spa_namespace_lock
* held.
*
- * The spa_config_lock is manipulated using the following functions:
- *
- * spa_config_enter() Acquire the config lock as RW_READER or
- * RW_WRITER. At least one reference on the spa_t
- * must exist.
- *
- * spa_config_exit() Release the config lock.
- *
- * spa_config_held() Returns true if the config lock is currently
- * held in the given state.
+ * The spa_config_lock is a form of rwlock. It must be held as RW_READER
+ * to perform I/O to the pool, and as RW_WRITER to change the vdev config.
+ * The spa_config_lock is manipulated with spa_config_{enter,exit,held}().
*
* The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
@@ -202,6 +195,80 @@ int zfs_recover = 0;
/*
* ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_config_lock_t *scl)
+{
+ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ scl->scl_writer = NULL;
+ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&scl->scl_count);
+}
+
+static void
+spa_config_lock_destroy(spa_config_lock_t *scl)
+{
+ mutex_destroy(&scl->scl_lock);
+ ASSERT(scl->scl_writer == NULL);
+ cv_destroy(&scl->scl_cv);
+ refcount_destroy(&scl->scl_count);
+}
+
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ if (rw == RW_READER) {
+ while (scl->scl_writer != NULL && scl->scl_writer != curthread)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ while (!refcount_is_zero(&scl->scl_count) &&
+ scl->scl_writer != curthread)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+
+ (void) refcount_add(&scl->scl_count, tag);
+
+ mutex_exit(&scl->scl_lock);
+}
+
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
+ cv_broadcast(&scl->scl_cv);
+ ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ if (rw == RW_READER)
+ return (!refcount_is_zero(&scl->scl_count));
+ else
+ return (scl->scl_writer == curthread);
+}
+
+/*
+ * ==========================================================================
* SPA namespace functions
* ==========================================================================
*/
@@ -275,7 +342,7 @@ spa_add(const char *name, const char *altroot)
spa->spa_final_txg = UINT64_MAX;
refcount_create(&spa->spa_refcount);
- rprw_init(&spa->spa_config_lock);
+ spa_config_lock_init(&spa->spa_config_lock);
avl_add(&spa_namespace_avl, spa);
@@ -324,7 +391,7 @@ spa_remove(spa_t *spa)
refcount_destroy(&spa->spa_refcount);
- rprw_destroy(&spa->spa_config_lock);
+ spa_config_lock_destroy(&spa->spa_config_lock);
rw_destroy(&spa->spa_traverse_lock);
@@ -639,29 +706,6 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
/*
* ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-void
-spa_config_enter(spa_t *spa, krw_t rw, void *tag)
-{
- rprw_enter(&spa->spa_config_lock, rw, tag);
-}
-
-void
-spa_config_exit(spa_t *spa, void *tag)
-{
- rprw_exit(&spa->spa_config_lock, tag);
-}
-
-boolean_t
-spa_config_held(spa_t *spa, krw_t rw)
-{
- return (rprw_held(&spa->spa_config_lock, rw));
-}
-
-/*
- * ==========================================================================
* SPA vdev locking
* ==========================================================================
*/
@@ -1003,7 +1047,7 @@ spa_name(spa_t *spa)
* config lock, both of which are required to do a rename.
*/
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
- spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+ spa_config_held(spa, RW_READER));
return (spa->spa_name);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/rprwlock.h b/usr/src/uts/common/fs/zfs/sys/rprwlock.h
deleted file mode 100644
index ba23799c9d..0000000000
--- a/usr/src/uts/common/fs/zfs/sys/rprwlock.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_RPRWLOCK_H
-#define _SYS_RPRWLOCK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/inttypes.h>
-#include <sys/list.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct rprwlock {
- kmutex_t rw_lock;
- kthread_t *rw_writer;
- kcondvar_t rw_cv;
- refcount_t rw_count;
-} rprwlock_t;
-
-void rprw_init(rprwlock_t *rwl);
-void rprw_destroy(rprwlock_t *rwl);
-void rprw_enter_read(rprwlock_t *rwl, void *tag);
-void rprw_enter_write(rprwlock_t *rwl, void *tag);
-void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag);
-void rprw_exit(rprwlock_t *rwl, void *tag);
-boolean_t rprw_held(rprwlock_t *rwl, krw_t rw);
-#define RPRW_READ_HELD(x) rprw_held(x, RW_READER)
-#define RPRW_WRITE_HELD(x) rprw_held(x, RW_WRITER)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_RPRWLOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index eb2b6d6289..069255b4c0 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -37,7 +37,6 @@
#include <sys/zfs_context.h>
#include <sys/avl.h>
#include <sys/refcount.h>
-#include <sys/rprwlock.h>
#include <sys/bplist.h>
#ifdef __cplusplus
@@ -68,6 +67,14 @@ struct spa_aux_vdev {
uint_t sav_npending; /* # pending devices */
};
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ kthread_t *scl_writer;
+ uint16_t scl_write_wanted;
+ kcondvar_t scl_cv;
+ refcount_t scl_count;
+} spa_config_lock_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -157,7 +164,7 @@ struct spa {
* In order for the MDB module to function correctly, the other
* fields must remain in the same location.
*/
- rprwlock_t spa_config_lock; /* configuration changes */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
refcount_t spa_refcount; /* number of opens */
};
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index b1ec648056..2ec3de6513 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -83,9 +83,6 @@ extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-extern void vdev_io_start(zio_t *zio);
-extern void vdev_io_done(zio_t *zio);
-
extern int vdev_fault(spa_t *spa, uint64_t guid);
extern int vdev_degrade(spa_t *spa, uint64_t guid);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 2eebbba566..7d823bab10 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -62,8 +62,8 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef int vdev_probe_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void vdev_io_start_func_t(zio_t *zio);
-typedef void vdev_io_done_func_t(zio_t *zio);
+typedef int vdev_io_start_func_t(zio_t *zio);
+typedef int vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef struct vdev_ops {
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 4591274518..e673edbac2 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -153,6 +153,7 @@ enum zio_compress {
(ZIO_FLAG_CANFAIL | \
ZIO_FLAG_FAILFAST | \
ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_CACHE | \
ZIO_FLAG_DONT_RETRY | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_SPECULATIVE | \
@@ -164,9 +165,11 @@ enum zio_compress {
#define ZIO_FLAG_VDEV_INHERIT \
(ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_DONT_CACHE | \
ZIO_FLAG_PHYSICAL)
+#define ZIO_PIPELINE_CONTINUE 0x100
+#define ZIO_PIPELINE_STOP 0x101
+
/*
* We'll take the unused errno 'EBADE' (from the Convergent graveyard)
* to indicate checksum errors.
@@ -262,7 +265,6 @@ struct zio {
uint32_t io_numerrors;
uint32_t io_pipeline;
uint32_t io_orig_pipeline;
- uint32_t io_async_stages;
uint64_t io_children_notready;
uint64_t io_children_notdone;
void *io_waiter;
@@ -319,21 +321,18 @@ extern void zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+
+extern int zio_wait_for_children_ready(zio_t *zio);
+extern int zio_wait_for_children_done(zio_t *zio);
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-extern void zio_next_stage(zio_t *zio);
-extern void zio_next_stage_async(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
-extern void zio_wait_children_done(zio_t *zio);
/*
* Delegate I/O to a child vdev.
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index a5a0bb54e8..60a1c8b38e 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -38,16 +38,15 @@ extern "C" {
/*
* I/O Groups: pipeline stage definitions.
*/
-
typedef enum zio_stage {
ZIO_STAGE_OPEN = 0, /* RWFCI */
- ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+ ZIO_STAGE_WAIT_FOR_CHILDREN_READY, /* RWFCI */
+ ZIO_STAGE_READ_INIT, /* R---- */
+ ZIO_STAGE_ISSUE_ASYNC, /* -W--- */
ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
- ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
-
ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
@@ -61,13 +60,11 @@ typedef enum zio_stage {
ZIO_STAGE_READY, /* RWFCI */
- ZIO_STAGE_READ_INIT, /* R---- */
-
ZIO_STAGE_VDEV_IO_START, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
- ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+ ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, /* RWFCI */
ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
@@ -77,30 +74,22 @@ typedef enum zio_stage {
ZIO_STAGE_DONE /* RWFCI */
} zio_stage_t;
-/*
- * The stages for which there's some performance value in going async.
- * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
- */
-#define ZIO_ASYNC_PIPELINE_STAGES \
- ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_READ_DECOMPRESS))
+#define ZIO_INTERLOCK_STAGES \
+ ((1U << ZIO_STAGE_WAIT_FOR_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
+ (1U << ZIO_STAGE_DONE))
-#define ZIO_VDEV_IO_PIPELINE \
+#define ZIO_VDEV_IO_STAGES \
((1U << ZIO_STAGE_VDEV_IO_START) | \
(1U << ZIO_STAGE_VDEV_IO_DONE) | \
(1U << ZIO_STAGE_VDEV_IO_ASSESS))
#define ZIO_READ_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY))
#define ZIO_READ_GANG_PIPELINE \
ZIO_READ_PHYS_PIPELINE
@@ -109,97 +98,66 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_READ_INIT) | \
ZIO_READ_PHYS_PIPELINE
-#define ZIO_WRITE_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ (1U << ZIO_STAGE_ISSUE_ASYNC) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE))
-#define ZIO_WRITE_COMMON_PIPELINE \
- ZIO_WRITE_PHYS_PIPELINE
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ZIO_WRITE_COMMON_STAGES
#define ZIO_WRITE_PIPELINE \
- ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
- ZIO_WRITE_COMMON_PIPELINE)
+ (ZIO_WRITE_COMMON_STAGES | \
+ (1U << ZIO_STAGE_WRITE_COMPRESS))
-#define ZIO_GANG_STAGES \
+#define ZIO_GANG_REWRITE_STAGES \
((1U << ZIO_STAGE_GET_GANG_HEADER) | \
(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READ_GANG_MEMBERS))
-
-#define ZIO_REWRITE_PIPELINE \
- ((1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_WRITE_ALLOCATE_PIPELINE \
- ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE))
#define ZIO_GANG_FREE_STAGES \
((1U << ZIO_STAGE_GET_GANG_HEADER) | \
(1U << ZIO_STAGE_FREE_GANG_MEMBERS))
-#define ZIO_FREE_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+#define ZIO_GANG_CLAIM_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE(bp) \
+ (ZIO_WRITE_COMMON_STAGES | \
+ (BP_IS_GANG(bp) ? ZIO_GANG_REWRITE_STAGES : 0))
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ (1U << ZIO_STAGE_DVA_ALLOCATE))
+
+#define ZIO_FREE_PIPELINE(bp) \
+ (ZIO_INTERLOCK_STAGES | \
(1U << ZIO_STAGE_DVA_FREE) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (BP_IS_GANG(bp) ? ZIO_GANG_FREE_STAGES : 0))
-#define ZIO_CLAIM_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+#define ZIO_CLAIM_PIPELINE(bp) \
+ (ZIO_INTERLOCK_STAGES | \
(1U << ZIO_STAGE_DVA_CLAIM) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (BP_IS_GANG(bp) ? ZIO_GANG_CLAIM_STAGES : 0))
#define ZIO_IOCTL_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES)
+
#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ ZIO_INTERLOCK_STAGES
-#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
(1U << ZIO_STAGE_ASSESS) | \
+ (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \
(1U << ZIO_STAGE_DONE))
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
- ZIO_VDEV_IO_PIPELINE)
-
#define ZIO_ERROR_PIPELINE_MASK \
- ZIO_WAIT_FOR_CHILDREN_PIPELINE
+ ZIO_INTERLOCK_STAGES
typedef struct zio_transform zio_transform_t;
struct zio_transform {
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 2a2dc1d625..2b4c663a1a 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -136,6 +136,9 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd = spa->spa_root_vdev;
+ ASSERT(spa_config_held(spa, RW_READER) ||
+ curthread == spa->spa_scrub_thread);
+
if (vdev < rvd->vdev_children)
return (rvd->vdev_child[vdev]);
@@ -1459,18 +1462,6 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
return (vd->vdev_ops->vdev_op_asize(vd, psize));
}
-void
-vdev_io_start(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_start(zio);
-}
-
-void
-vdev_io_done(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_done(zio);
-}
-
const char *
vdev_description(vdev_t *vd)
{
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index ce9508d2fb..5f475f9b47 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -231,7 +231,7 @@ vdev_cache_fill(zio_t *zio)
zio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = zio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
}
@@ -286,15 +286,10 @@ vdev_cache_read(zio_t *zio)
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
+ zio_execute(zio);
return (0);
}
- if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
- mutex_exit(&vc->vc_lock);
- return (EINVAL);
- }
-
ve = vdev_cache_allocate(zio);
if (ve == NULL) {
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index e4e13f2aac..933ed3e2bf 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -386,7 +386,7 @@ vdev_disk_io_intr(buf_t *bp)
kmem_free(vdb, sizeof (vdev_disk_buf_t));
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
}
static void
@@ -396,10 +396,10 @@ vdev_disk_ioctl_done(void *zio_arg, int error)
zio->io_error = error;
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
}
-static void
+static int
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -414,8 +414,7 @@ vdev_disk_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
switch (zio->io_cmd) {
@@ -444,8 +443,10 @@ vdev_disk_io_start(zio_t *zio)
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
- return;
- } else if (error == ENOTSUP || error == ENOTTY) {
+ return (ZIO_PIPELINE_STOP);
+ }
+
+ if (error == ENOTSUP || error == ENOTTY) {
/*
* If we get ENOTSUP or ENOTTY, we know that
* no future attempts will ever succeed.
@@ -463,15 +464,26 @@ vdev_disk_io_start(zio_t *zio)
zio->io_error = ENOTSUP;
}
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
+ return (ZIO_PIPELINE_STOP);
if ((zio = vdev_queue_io(zio)) == NULL)
- return;
+ return (ZIO_PIPELINE_STOP);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ else
+ error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
+ }
flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
flags |= B_BUSY | B_NOCACHE;
@@ -491,26 +503,14 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
- /* XXPOLICY */
- if (zio->io_type == ZIO_TYPE_WRITE)
- error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
- else
- error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
- error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
- if (error) {
- zio->io_error = error;
- bioerror(bp, error);
- bp->b_resid = bp->b_bcount;
- bp->b_iodone(bp);
- return;
- }
-
error = ldi_strategy(dvd->vd_lh, bp);
/* ldi_strategy() will return non-zero only on programming errors */
ASSERT(error == 0);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
vdev_disk_io_done(zio_t *zio)
{
vdev_queue_io_done(zio);
@@ -544,7 +544,7 @@ vdev_disk_io_done(zio_t *zio)
}
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
vdev_ops_t vdev_disk_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index ee30845cb7..51abd9612b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -215,7 +215,7 @@ vdev_file_probe(vdev_t *vd)
return (error);
}
-static void
+static int
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -229,8 +229,7 @@ vdev_file_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
switch (zio->io_cmd) {
@@ -244,8 +243,7 @@ vdev_file_io_start(zio_t *zio)
zio->io_error = ENOTSUP;
}
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -254,11 +252,11 @@ vdev_file_io_start(zio_t *zio)
*/
#ifndef _KERNEL
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
+ return (ZIO_PIPELINE_STOP);
#endif
if ((zio = vdev_queue_io(zio)) == NULL)
- return;
+ return (ZIO_PIPELINE_STOP);
/* XXPOLICY */
if (zio->io_type == ZIO_TYPE_WRITE)
@@ -268,8 +266,8 @@ vdev_file_io_start(zio_t *zio)
error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
if (error) {
zio->io_error = error;
- zio_next_stage_async(zio);
- return;
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
}
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
@@ -280,26 +278,25 @@ vdev_file_io_start(zio_t *zio)
if (resid != 0 && zio->io_error == 0)
zio->io_error = ENOSPC;
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
vdev_file_io_done(zio_t *zio)
{
+ vdev_t *vd = zio->io_vd;
if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+ zio->io_error = zio_handle_device_injection(vd, EIO);
/*
* If an error has been encountered then attempt to probe the device
* to determine if it's still accessible.
*/
- if (zio->io_error == EIO) {
- vdev_t *vd = zio->io_vd;
-
- if (vdev_probe(vd) != 0)
- vd->vdev_is_failing = B_TRUE;
- }
+ if (zio->io_error == EIO && vdev_probe(vd) != 0)
+ vd->vdev_is_failing = B_TRUE;
vdev_queue_io_done(zio);
@@ -308,7 +305,7 @@ vdev_file_io_done(zio_t *zio)
vdev_cache_write(zio);
#endif
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
vdev_ops_t vdev_file_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 45d326ae69..4d6c499c10 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -253,7 +253,7 @@ vdev_mirror_child_select(zio_t *zio)
return (-1);
}
-static void
+static int
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -279,8 +279,7 @@ vdev_mirror_io_start(zio_t *zio)
ZIO_FLAG_CANFAIL,
vdev_mirror_scrub_done, mc));
}
- zio_wait_children_done(zio);
- return;
+ return (zio_wait_for_children_done(zio));
}
/*
* For normal reads just pick one child.
@@ -316,10 +315,10 @@ vdev_mirror_io_start(zio_t *zio)
c++;
}
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
-static void
+static int
vdev_mirror_io_done(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
@@ -362,8 +361,7 @@ vdev_mirror_io_done(zio_t *zio)
if (good_copies != 0)
zio->io_error = 0;
vdev_mirror_map_free(zio);
- zio_next_stage(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -383,8 +381,7 @@ vdev_mirror_io_done(zio_t *zio)
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
vdev_mirror_child_done, mc));
- zio_wait_children_done(zio);
- return;
+ return (zio_wait_for_children_done(zio));
}
/* XXPOLICY */
@@ -441,12 +438,13 @@ vdev_mirror_io_done(zio_t *zio)
}
zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
vdev_mirror_map_free(zio);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index 3aa831c46d..49727ef996 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -62,18 +62,18 @@ vdev_missing_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
+static int
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = ENOTSUP;
- zio_next_stage_async(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/* ARGSUSED */
-static void
+static int
vdev_missing_io_done(zio_t *zio)
{
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 7e99c1fd5b..0f921e088a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -162,7 +162,7 @@ vdev_queue_agg_io_done(zio_t *aio)
aio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = aio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
ASSERT3U(offset, ==, aio->io_size);
@@ -172,11 +172,8 @@ vdev_queue_agg_io_done(zio_t *aio)
#define IS_ADJACENT(io, nio) \
((io)->io_offset + (io)->io_size == (nio)->io_offset)
-typedef void zio_issue_func_t(zio_t *);
-
static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
- zio_issue_func_t **funcp)
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
zio_t *fio, *lio, *aio, *dio;
avl_tree_t *tree;
@@ -184,8 +181,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
ASSERT(MUTEX_HELD(&vq->vq_lock));
- *funcp = NULL;
-
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
@@ -245,7 +240,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
avl_add(&vq->vq_pending_tree, aio);
- *funcp = zio_nowait;
return (aio);
}
@@ -254,8 +248,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
avl_add(&vq->vq_pending_tree, fio);
- *funcp = zio_next_stage;
-
return (fio);
}
@@ -264,7 +256,6 @@ vdev_queue_io(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- zio_issue_func_t *func;
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
@@ -285,15 +276,19 @@ vdev_queue_io(zio_t *zio)
vdev_queue_io_add(vq, zio);
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
mutex_exit(&vq->vq_lock);
- if (nio == NULL || func != zio_nowait)
- return (nio);
+ if (nio == NULL)
+ return (NULL);
+
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ return (NULL);
+ }
- func(nio);
- return (NULL);
+ return (nio);
}
void
@@ -301,7 +296,6 @@ vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- zio_issue_func_t *func;
int i;
mutex_enter(&vq->vq_lock);
@@ -309,13 +303,16 @@ vdev_queue_io_done(zio_t *zio)
avl_remove(&vq->vq_pending_tree, zio);
for (i = 0; i < zfs_vdev_ramp_rate; i++) {
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
if (nio == NULL)
break;
mutex_exit(&vq->vq_lock);
- if (func == zio_next_stage)
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ } else {
zio_vdev_io_reissue(nio);
- func(nio);
+ zio_execute(nio);
+ }
mutex_enter(&vq->vq_lock);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 73a3ae2565..74b035868c 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -639,7 +639,7 @@ vdev_raidz_repair_done(zio_t *zio)
vdev_raidz_map_free(zio->io_private);
}
-static void
+static int
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -672,8 +672,8 @@ vdev_raidz_io_start(zio_t *zio)
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
vdev_raidz_child_done, rc));
}
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -714,7 +714,7 @@ vdev_raidz_io_start(zio_t *zio)
}
}
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -783,7 +783,7 @@ static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
-static void
+static int
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -840,8 +840,8 @@ vdev_raidz_io_done(zio_t *zio)
zio->io_error = 0;
vdev_raidz_map_free(zio);
- zio_next_stage(zio);
- return;
+
+ return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1022,8 +1022,8 @@ vdev_raidz_io_done(zio_t *zio)
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
dprintf("rereading\n");
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -1205,12 +1205,13 @@ done:
}
zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
vdev_raidz_map_free(zio);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 112aaa6f25..4aa21a6501 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -61,9 +61,6 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;
@@ -170,8 +167,6 @@ zio_init(void)
align, NULL, NULL, NULL, NULL, data_alloc_arena,
KMC_NODEBUG);
- dprintf("creating cache for size %5lx align %5lx\n",
- size, align);
}
}
@@ -356,9 +351,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_bp = bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
- BP_GET_LEVEL(bp) != 0)
- zio->io_flags |= ZIO_FLAG_METADATA;
}
zio->io_done = done;
zio->io_private = private;
@@ -366,10 +358,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_priority = priority;
zio->io_stage = stage;
zio->io_pipeline = pipeline;
- zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
zio->io_timestamp = lbolt64;
- if (pio != NULL)
- zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
zio_push_transform(zio, data, size, size);
@@ -395,7 +384,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio == NULL) {
if (type != ZIO_TYPE_NULL &&
!(flags & ZIO_FLAG_CONFIG_HELD)) {
- spa_config_enter(zio->io_spa, RW_READER, zio);
+ spa_config_enter(spa, RW_READER, zio);
zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
}
zio->io_root = zio;
@@ -409,7 +398,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
!(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
!(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- spa_config_enter(zio->io_spa, RW_READER, pio);
+ spa_config_enter(spa, RW_READER, pio);
}
if (stage < ZIO_STAGE_READY)
pio->io_children_notready++;
@@ -524,9 +513,6 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
zio->io_compress = compress;
zio->io_ndvas = ncopies;
- if (compress != ZIO_COMPRESS_OFF)
- zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
if (bp->blk_birth != txg) {
/* XXX the bp usually (always?) gets re-zeroed later */
BP_ZERO(bp);
@@ -551,7 +537,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
zio->io_bookmark = *zb;
zio->io_checksum = checksum;
@@ -612,7 +598,7 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
zio->io_bp = &zio->io_bp_copy;
@@ -641,7 +627,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
- ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
zio->io_bp = &zio->io_bp_copy;
@@ -820,7 +806,7 @@ zio_wait(zio_t *zio)
zio->io_waiter = curthread;
- zio_next_stage_async(zio);
+ zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_stalled != ZIO_STAGE_DONE)
@@ -838,7 +824,23 @@ zio_wait(zio_t *zio)
void
zio_nowait(zio_t *zio)
{
- zio_next_stage_async(zio);
+ zio_execute(zio);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+}
+
+static int
+zio_issue_async(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+
+ return (ZIO_PIPELINE_STOP);
}
/*
@@ -846,18 +848,20 @@ zio_nowait(zio_t *zio)
* I/O pipeline interlocks: parent/child dependency scoreboarding
* ==========================================================================
*/
-static void
+static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
+ int rv = ZIO_PIPELINE_CONTINUE;
+
mutex_enter(&zio->io_lock);
- if (*countp == 0) {
- ASSERT(zio->io_stalled == 0);
- mutex_exit(&zio->io_lock);
- zio_next_stage(zio);
- } else {
+ ASSERT(zio->io_stalled == 0);
+ if (*countp != 0) {
zio->io_stalled = stage;
- mutex_exit(&zio->io_lock);
+ rv = ZIO_PIPELINE_STOP;
}
+ mutex_exit(&zio->io_lock);
+
+ return (rv);
}
static void
@@ -872,48 +876,54 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
if (--*countp == 0 && pio->io_stalled == stage) {
pio->io_stalled = 0;
mutex_exit(&pio->io_lock);
- zio_next_stage_async(pio);
+ zio_execute(pio);
} else {
mutex_exit(&pio->io_lock);
}
}
-static void
-zio_wait_children_ready(zio_t *zio)
+int
+zio_wait_for_children_ready(zio_t *zio)
{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &zio->io_children_notready);
+ return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+ &zio->io_children_notready));
}
-void
-zio_wait_children_done(zio_t *zio)
+int
+zio_wait_for_children_done(zio_t *zio)
{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &zio->io_children_notdone);
+ return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+ &zio->io_children_notdone));
}
-static void
+static int
zio_read_init(zio_t *zio)
{
- if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
- uint64_t csize = BP_GET_PSIZE(zio->io_bp);
+ blkptr_t *bp = zio->io_bp;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(csize);
zio_push_transform(zio, cbuf, csize, csize);
zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
}
- if (BP_IS_GANG(zio->io_bp)) {
+ if (BP_IS_GANG(bp)) {
uint64_t gsize = SPA_GANGBLOCKSIZE;
void *gbuf = zio_buf_alloc(gsize);
zio_push_transform(zio, gbuf, gsize, gsize);
zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
}
- zio_next_stage(zio);
+
+ if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_ready(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -922,16 +932,16 @@ zio_ready(zio_t *zio)
zio->io_ready(zio);
if (pio != NULL)
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
&pio->io_children_notready);
if (zio->io_bp)
zio->io_bp_copy = *zio->io_bp;
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_vdev_retry_io(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -967,7 +977,7 @@ zio_vdev_retry_io(zio_t *zio)
if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
- ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
+ ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
mutex_exit(&pio->io_lock);
}
@@ -977,7 +987,8 @@ zio_vdev_retry_io(zio_t *zio)
*/
zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
zio->io_error = 0;
- zio_next_stage_async(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
int
@@ -1029,7 +1040,7 @@ zio_vdev_resume_io(spa_t *spa)
zio->io_stage = ZIO_STAGE_READY;
}
- (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
+ (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
zio, TQ_SLEEP);
}
mutex_exit(&spa->spa_zio_lock);
@@ -1049,7 +1060,7 @@ zio_vdev_resume_io(spa_t *spa)
return (0);
}
-static void
+static int
zio_vdev_suspend_io(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1069,9 +1080,11 @@ zio_vdev_suspend_io(zio_t *zio)
cv_broadcast(&spa->spa_zio_cv);
#endif
mutex_exit(&spa->spa_zio_lock);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
zio_assess(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1138,10 +1151,9 @@ zio_assess(zio_t *zio)
* property.
*/
if (zio_write_retry && zio->io_error != ENOSPC &&
- IO_IS_ALLOCATING(zio)) {
- zio_vdev_retry_io(zio);
- return;
- }
+ IO_IS_ALLOCATING(zio))
+ return (zio_vdev_retry_io(zio));
+
ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
/*
@@ -1175,22 +1187,20 @@ zio_assess(zio_t *zio)
"uncorrectable I/O failure and the "
"failure mode property for this pool "
"is set to panic.", spa_name(spa));
- } else {
- cmn_err(CE_WARN, "Pool '%s' has encountered "
- "an uncorrectable I/O error. Manual "
- "intervention is required.",
- spa_name(spa));
- zio_vdev_suspend_io(zio);
}
- return;
+ cmn_err(CE_WARN, "Pool '%s' has encountered "
+ "an uncorrectable I/O error. "
+ "Manual intervention is required.", spa_name(spa));
+ return (zio_vdev_suspend_io(zio));
}
}
ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
ASSERT(zio->io_children_notready == 0);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_done(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -1221,7 +1231,7 @@ zio_done(zio_t *zio)
pio->io_child = next;
mutex_exit(&pio->io_lock);
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
&pio->io_children_notdone);
}
@@ -1243,6 +1253,8 @@ zio_done(zio_t *zio)
cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
}
+
+ return (ZIO_PIPELINE_STOP);
}
/*
@@ -1250,7 +1262,7 @@ zio_done(zio_t *zio)
* Compression support
* ==========================================================================
*/
-static void
+static int
zio_write_compress(zio_t *zio)
{
int compress = zio->io_compress;
@@ -1300,7 +1312,7 @@ zio_write_compress(zio_t *zio)
ASSERT(csize != 0);
BP_SET_LSIZE(bp, lsize);
BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
} else {
if (bp->blk_birth == zio->io_txg)
BP_ZERO(bp);
@@ -1316,10 +1328,10 @@ zio_write_compress(zio_t *zio)
}
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_read_decompress(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1338,7 +1350,7 @@ zio_read_decompress(zio_t *zio)
zio_buf_free(data, bufsize);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1347,19 +1359,6 @@ zio_read_decompress(zio_t *zio)
* ==========================================================================
*/
static void
-zio_gang_pipeline(zio_t *zio)
-{
- /*
- * By default, the pipeline assumes that we're dealing with a gang
- * block. If we're not, strip out any gang-specific stages.
- */
- if (!BP_IS_GANG(zio->io_bp))
- zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
- zio_next_stage(zio);
-}
-
-static void
zio_gang_byteswap(zio_t *zio)
{
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
@@ -1368,7 +1367,7 @@ zio_gang_byteswap(zio_t *zio)
byteswap_uint64_array(zio->io_data, zio->io_size);
}
-static void
+static int
zio_get_gang_header(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1384,10 +1383,10 @@ zio_get_gang_header(zio_t *zio)
zio->io_flags & ZIO_FLAG_GANG_INHERIT,
ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
-static void
+static int
zio_read_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1410,16 +1409,17 @@ zio_read_gang_members(zio_t *zio)
ASSERT(!BP_IS_HOLE(gbp));
zio_nowait(zio_read(zio, zio->io_spa, gbp,
- (char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- &zio->io_bookmark));
+ (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
}
zio_buf_free(gbh, gbufsize);
- zio_wait_children_done(zio);
+
+ return (zio_wait_for_children_done(zio));
}
-static void
+static int
zio_rewrite_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1446,15 +1446,16 @@ zio_rewrite_gang_members(zio_t *zio)
zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
}
zio_push_transform(zio, gbh, gsize, gbufsize);
- zio_wait_children_ready(zio);
+
+ return (zio_wait_for_children_ready(zio));
}
-static void
+static int
zio_free_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1476,10 +1477,11 @@ zio_free_gang_members(zio_t *zio)
}
zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_claim_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1500,7 +1502,8 @@ zio_claim_gang_members(zio_t *zio)
}
zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
@@ -1549,8 +1552,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
B_FALSE);
- if (error)
- return (error);
+ if (error) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
for (d = 0; d < gbh_ndvas; d++)
DVA_SET_GANG(&dva[d], 1);
@@ -1560,10 +1565,6 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
gbh = zio_buf_alloc(gsize);
bzero(gbh, gsize);
- /* We need to test multi-level gang blocks */
- if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
- maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
for (loff = 0, i = 0; loff != zio->io_size;
loff += lsize, resid -= lsize, gbps_left--, i++) {
blkptr_t *gbp = &gbh->zg_blkptr[i];
@@ -1579,8 +1580,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
break;
ASSERT3U(error, ==, ENOSPC);
/* XXX - free up previous allocations? */
- if (maxalloc == SPA_MINBLOCKSIZE)
- return (error);
+ if (maxalloc == SPA_MINBLOCKSIZE) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
}
@@ -1614,14 +1617,14 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
zio_push_transform(zio, gbh, gsize, gsize);
+
/*
- * As much as we'd like this to be zio_wait_children_ready(),
+ * As much as we'd like this to be 'ready' instead of 'done',
* updating our ASIZE doesn't happen until the io_done callback,
* so we have to wait for that to finish in order for our BP
* to be stable.
*/
- zio_wait_children_done(zio);
- return (0);
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -1629,7 +1632,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
* Allocate and free blocks
* ==========================================================================
*/
-static void
+static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1642,14 +1645,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_ndvas, >, 0);
ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
- /* For testing, make some blocks above a certain size be gang blocks */
- if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
- error = zio_write_allocate_gang_members(zio, mc);
- if (error)
- zio->io_error = error;
- return;
- }
-
/*
* For testing purposes, we force I/Os to retry. We don't allow
* retries beyond the first pass since those I/Os are non-allocating
@@ -1668,17 +1663,15 @@ zio_dva_allocate(zio_t *zio)
if (error == 0) {
bp->blk_birth = zio->io_txg;
} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
- error = zio_write_allocate_gang_members(zio, mc);
- if (error == 0)
- return;
- zio->io_error = error;
+ return (zio_write_allocate_gang_members(zio, mc));
} else {
zio->io_error = error;
}
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_dva_free(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1687,15 +1680,15 @@ zio_dva_free(zio_t *zio)
BP_ZERO(bp);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_dva_claim(zio_t *zio)
{
zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1704,7 +1697,7 @@ zio_dva_claim(zio_t *zio)
* ==========================================================================
*/
-static void
+static int
zio_vdev_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1719,24 +1712,21 @@ zio_vdev_io_start(zio_t *zio)
* at that time.
*/
if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
- zio->io_type == ZIO_TYPE_WRITE) {
- zio_vdev_suspend_io(zio);
- return;
- }
+ zio->io_type == ZIO_TYPE_WRITE)
+ return (zio_vdev_suspend_io(zio));
- if (vd == NULL) {
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_start(zio);
- return;
- }
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP
+ */
+ if (vd == NULL)
+ return (vdev_mirror_ops.vdev_op_io_start(zio));
align = 1ULL << tvd->vdev_ashift;
if (zio->io_retries == 0 && vd == tvd)
zio->io_flags |= ZIO_FLAG_FAILFAST;
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
- vd->vdev_children == 0) {
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
zio->io_flags |= ZIO_FLAG_PHYSICAL;
zio->io_offset += VDEV_LABEL_START_SIZE;
}
@@ -1760,19 +1750,16 @@ zio_vdev_io_start(zio_t *zio)
P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
- vdev_io_start(zio);
-
- /* zio_next_stage_async() gets called from io completion interrupt */
+ return (vd->vdev_ops->vdev_op_io_start(zio));
}
-static void
+static int
zio_vdev_io_done(zio_t *zio)
{
if (zio->io_vd == NULL)
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_done(zio);
- else
- vdev_io_done(zio);
+ return (vdev_mirror_ops.vdev_op_io_done(zio));
+
+ return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
}
/* XXPOLICY */
@@ -1795,7 +1782,7 @@ zio_should_retry(zio_t *zio)
return (B_TRUE);
}
-static void
+static int
zio_vdev_io_assess(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1833,15 +1820,10 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
- dprintf("retry #%d for %s to %s offset %llx\n",
- zio->io_retries, zio_type_name[zio->io_type],
- vdev_description(vd), zio->io_offset);
-
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
void
@@ -1876,7 +1858,7 @@ zio_vdev_io_bypass(zio_t *zio)
* Generate and verify checksums
* ==========================================================================
*/
-static void
+static int
zio_checksum_generate(zio_t *zio)
{
int checksum = zio->io_checksum;
@@ -1889,10 +1871,10 @@ zio_checksum_generate(zio_t *zio)
zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_gang_checksum_generate(zio_t *zio)
{
zio_cksum_t zc;
@@ -1905,10 +1887,10 @@ zio_gang_checksum_generate(zio_t *zio)
zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_checksum_verify(zio_t *zio)
{
if (zio->io_bp != NULL) {
@@ -1918,7 +1900,7 @@ zio_checksum_verify(zio_t *zio)
zio->io_spa, zio->io_vd, zio, 0, 0);
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1949,20 +1931,15 @@ zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
* Define the pipeline
* ==========================================================================
*/
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
- panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
+typedef int zio_pipe_stage_t(zio_t *zio);
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
- zio_badop,
- zio_wait_children_ready,
+ NULL,
+ zio_wait_for_children_ready,
+ zio_read_init,
+ zio_issue_async,
zio_write_compress,
zio_checksum_generate,
- zio_gang_pipeline,
zio_get_gang_header,
zio_rewrite_gang_members,
zio_free_gang_members,
@@ -1972,116 +1949,63 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
zio_dva_claim,
zio_gang_checksum_generate,
zio_ready,
- zio_read_init,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
- zio_wait_children_done,
+ zio_wait_for_children_done,
zio_checksum_verify,
zio_read_gang_members,
zio_read_decompress,
zio_assess,
zio_done,
- zio_badop
+ NULL
};
/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread. In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
*/
void
-zio_next_stage(zio_t *zio)
+zio_execute(zio_t *zio)
{
- uint32_t pipeline = zio->io_pipeline;
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ uint32_t pipeline = zio->io_pipeline;
+ int rv;
- ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ /*
+ * If an error occurred outside the vdev stack,
+ * just execute the interlock stages to clean up.
+ */
+ if (zio->io_error &&
+ ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * See the comment in zio_next_stage_async() about per-CPU taskqs.
- */
- if (((1U << zio->io_stage) & zio->io_async_stages) &&
- (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
- !(zio->io_flags & ZIO_FLAG_METADATA)) {
- taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
-void
-zio_next_stage_async(zio_t *zio)
-{
- taskq_t *tq;
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
+ rv = zio_pipeline[zio->io_stage](zio);
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
+ if (rv == ZIO_PIPELINE_STOP)
+ return;
- /*
- * For performance, we'll probably want two sets of task queues:
- * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
- * part is for read performance: since we have to make a pass over
- * the data to checksum it anyway, we want to do this on the same CPU
- * that issued the read, because (assuming CPU scheduling affinity)
- * that thread is probably still there. Getting this optimization
- * right avoids performance-hostile cache-to-cache transfers.
- *
- * Note that having two sets of task queues is also necessary for
- * correctness: if all of the issue threads get bogged down waiting
- * for dependent reads (e.g. metaslab freelist) to complete, then
- * there won't be any threads available to service I/O completion
- * interrupts.
- */
- if ((1U << zio->io_stage) & zio->io_async_stages) {
- if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
- tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- else
- tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
+ ASSERT(rv == ZIO_PIPELINE_CONTINUE);
}
}
-void
-zio_resubmit_stage_async(void *arg)
-{
- zio_t *zio = (zio_t *)(uintptr_t)arg;
-
- zio_next_stage_async(zio);
-}
-
static boolean_t
zio_io_should_fail(uint16_t range)
{