diff options
author | bonwick <none@none> | 2007-11-27 22:58:05 -0800 |
---|---|---|
committer | bonwick <none@none> | 2007-11-27 22:58:05 -0800 |
commit | e05725b117836db173257fae43fb0746eb857fb5 (patch) | |
tree | dbdd58653bf6cebb69156f3361a6e1d72643b100 /usr/src/uts/common/fs/zfs | |
parent | b9bc7f7832704fda46b4d6b04f3f7be1227dc644 (diff) | |
download | illumos-gate-onnv_79.tar.gz |
6354519 stack overflow in zfs due to zio pipeline (tag: onnv_79)
6533726 single-threaded checksum & parity calculations limit write bandwidth
6547248 ztest detects a future leak when there is none
6604198 zfs only using single cpu for compression (part II)
--HG--
rename : usr/src/uts/common/fs/zfs/rprwlock.c => deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c
rename : usr/src/uts/common/fs/zfs/sys/rprwlock.h => deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h
Diffstat (limited to 'usr/src/uts/common/fs/zfs')
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 7 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/rprwlock.c | 118 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 116 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/rprwlock.h | 61 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa_impl.h | 11 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h | 3 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 19 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio_impl.h | 144 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 15 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_cache.c | 9 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_disk.c | 54 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_file.c | 35 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_mirror.c | 22 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_missing.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 37 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_raidz.c | 25 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 436 |
18 files changed, 430 insertions, 694 deletions
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 589dc7e3de..9365dbdb14 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -35,6 +35,7 @@ #include <sys/zio.h> uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * ========================================================================== @@ -728,6 +729,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(!DVA_IS_VALID(&dva[d])); /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + return (ENOSPC); + + /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_allocated because * nothing actually breaks if we miss a few updates -- we just won't diff --git a/usr/src/uts/common/fs/zfs/rprwlock.c b/usr/src/uts/common/fs/zfs/rprwlock.c deleted file mode 100644 index 49ae505209..0000000000 --- a/usr/src/uts/common/fs/zfs/rprwlock.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/refcount.h> -#include <sys/rprwlock.h> - -void -rprw_init(rprwlock_t *rwl) -{ - mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL); - rwl->rw_writer = NULL; - cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL); - refcount_create(&rwl->rw_count); -} - -void -rprw_destroy(rprwlock_t *rwl) -{ - mutex_destroy(&rwl->rw_lock); - ASSERT(rwl->rw_writer == NULL); - cv_destroy(&rwl->rw_cv); - refcount_destroy(&rwl->rw_count); -} - -void -rprw_enter_read(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - if (rwl->rw_writer != curthread) { - while (rwl->rw_writer != NULL) - cv_wait(&rwl->rw_cv, &rwl->rw_lock); - } - - (void) refcount_add(&rwl->rw_count, tag); - - mutex_exit(&rwl->rw_lock); -} - -void -rprw_enter_write(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - if (rwl->rw_writer != curthread) { - while (!refcount_is_zero(&rwl->rw_count)) - cv_wait(&rwl->rw_cv, &rwl->rw_lock); - rwl->rw_writer = curthread; - } - - (void) refcount_add(&rwl->rw_count, tag); - - mutex_exit(&rwl->rw_lock); -} - -void -rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rprw_enter_read(rwl, tag); - else - rprw_enter_write(rwl, tag); -} - -void -rprw_exit(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - ASSERT(!refcount_is_zero(&rwl->rw_count)); - ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer); - if (refcount_remove(&rwl->rw_count, tag) == 0) { - cv_broadcast(&rwl->rw_cv); - rwl->rw_writer = NULL; /* OK in either case */ - } - - mutex_exit(&rwl->rw_lock); -} - -boolean_t 
-rprw_held(rprwlock_t *rwl, krw_t rw) -{ - boolean_t held; - - mutex_enter(&rwl->rw_lock); - if (rw == RW_WRITER) - held = (rwl->rw_writer == curthread); - else - held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count); - mutex_exit(&rwl->rw_lock); - - return (held); -} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 6aefb025fc..6b1c28140a 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -144,16 +144,9 @@ * zero. Must be called with spa_namespace_lock * held. * - * The spa_config_lock is manipulated using the following functions: - * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. - * - * spa_config_exit() Release the config lock. - * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. + * The spa_config_lock is a form of rwlock. It must be held as RW_READER + * to perform I/O to the pool, and as RW_WRITER to change the vdev config. + * The spa_config_lock is manipulated with spa_config_{enter,exit,held}(). * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). 
* @@ -202,6 +195,80 @@ int zfs_recover = 0; /* * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_config_lock_t *scl) +{ + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + scl->scl_writer = NULL; + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&scl->scl_count); +} + +static void +spa_config_lock_destroy(spa_config_lock_t *scl) +{ + mutex_destroy(&scl->scl_lock); + ASSERT(scl->scl_writer == NULL); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); +} + +void +spa_config_enter(spa_t *spa, krw_t rw, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + if (rw == RW_READER) { + while (scl->scl_writer != NULL && scl->scl_writer != curthread) + cv_wait(&scl->scl_cv, &scl->scl_lock); + } else { + while (!refcount_is_zero(&scl->scl_count) && + scl->scl_writer != curthread) + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_writer = curthread; + } + + (void) refcount_add(&scl->scl_count, tag); + + mutex_exit(&scl->scl_lock); +} + +void +spa_config_exit(spa_t *spa, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + ASSERT(!refcount_is_zero(&scl->scl_count)); + + if (refcount_remove(&scl->scl_count, tag) == 0) { + cv_broadcast(&scl->scl_cv); + ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + } + + mutex_exit(&scl->scl_lock); +} + +boolean_t +spa_config_held(spa_t *spa, krw_t rw) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + if (rw == RW_READER) + return (!refcount_is_zero(&scl->scl_count)); + else + return (scl->scl_writer == curthread); +} + +/* + * ========================================================================== * SPA namespace functions * 
========================================================================== */ @@ -275,7 +342,7 @@ spa_add(const char *name, const char *altroot) spa->spa_final_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); - rprw_init(&spa->spa_config_lock); + spa_config_lock_init(&spa->spa_config_lock); avl_add(&spa_namespace_avl, spa); @@ -324,7 +391,7 @@ spa_remove(spa_t *spa) refcount_destroy(&spa->spa_refcount); - rprw_destroy(&spa->spa_config_lock); + spa_config_lock_destroy(&spa->spa_config_lock); rw_destroy(&spa->spa_traverse_lock); @@ -639,29 +706,6 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) /* * ========================================================================== - * SPA config locking - * ========================================================================== - */ -void -spa_config_enter(spa_t *spa, krw_t rw, void *tag) -{ - rprw_enter(&spa->spa_config_lock, rw, tag); -} - -void -spa_config_exit(spa_t *spa, void *tag) -{ - rprw_exit(&spa->spa_config_lock, tag); -} - -boolean_t -spa_config_held(spa_t *spa, krw_t rw) -{ - return (rprw_held(&spa->spa_config_lock, rw)); -} - -/* - * ========================================================================== * SPA vdev locking * ========================================================================== */ @@ -1003,7 +1047,7 @@ spa_name(spa_t *spa) * config lock, both of which are required to do a rename. */ ASSERT(MUTEX_HELD(&spa_namespace_lock) || - spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); + spa_config_held(spa, RW_READER)); return (spa->spa_name); } diff --git a/usr/src/uts/common/fs/zfs/sys/rprwlock.h b/usr/src/uts/common/fs/zfs/sys/rprwlock.h deleted file mode 100644 index ba23799c9d..0000000000 --- a/usr/src/uts/common/fs/zfs/sys/rprwlock.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_RPRWLOCK_H -#define _SYS_RPRWLOCK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/inttypes.h> -#include <sys/list.h> -#include <sys/zfs_context.h> -#include <sys/refcount.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct rprwlock { - kmutex_t rw_lock; - kthread_t *rw_writer; - kcondvar_t rw_cv; - refcount_t rw_count; -} rprwlock_t; - -void rprw_init(rprwlock_t *rwl); -void rprw_destroy(rprwlock_t *rwl); -void rprw_enter_read(rprwlock_t *rwl, void *tag); -void rprw_enter_write(rprwlock_t *rwl, void *tag); -void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag); -void rprw_exit(rprwlock_t *rwl, void *tag); -boolean_t rprw_held(rprwlock_t *rwl, krw_t rw); -#define RPRW_READ_HELD(x) rprw_held(x, RW_READER) -#define RPRW_WRITE_HELD(x) rprw_held(x, RW_WRITER) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RPRWLOCK_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index eb2b6d6289..069255b4c0 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -37,7 +37,6 @@ #include <sys/zfs_context.h> #include <sys/avl.h> #include <sys/refcount.h> -#include 
<sys/rprwlock.h> #include <sys/bplist.h> #ifdef __cplusplus @@ -68,6 +67,14 @@ struct spa_aux_vdev { uint_t sav_npending; /* # pending devices */ }; +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + uint16_t scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -157,7 +164,7 @@ struct spa { * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ - rprwlock_t spa_config_lock; /* configuration changes */ + spa_config_lock_t spa_config_lock; /* configuration changes */ refcount_t spa_refcount; /* number of opens */ }; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index b1ec648056..2ec3de6513 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -83,9 +83,6 @@ extern void vdev_space_update(vdev_t *vd, int64_t space_delta, extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern void vdev_io_start(zio_t *zio); -extern void vdev_io_done(zio_t *zio); - extern int vdev_fault(spa_t *spa, uint64_t guid); extern int vdev_degrade(spa_t *spa, uint64_t guid); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 2eebbba566..7d823bab10 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -62,8 +62,8 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef int vdev_probe_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); -typedef void vdev_io_done_func_t(zio_t *zio); +typedef int vdev_io_start_func_t(zio_t *zio); +typedef int vdev_io_done_func_t(zio_t *zio); typedef void 
vdev_state_change_func_t(vdev_t *vd, int, int); typedef struct vdev_ops { diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 4591274518..e673edbac2 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -153,6 +153,7 @@ enum zio_compress { (ZIO_FLAG_CANFAIL | \ ZIO_FLAG_FAILFAST | \ ZIO_FLAG_CONFIG_HELD | \ + ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_DONT_RETRY | \ ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_SPECULATIVE | \ @@ -164,9 +165,11 @@ enum zio_compress { #define ZIO_FLAG_VDEV_INHERIT \ (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_PHYSICAL) +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + /* * We'll take the unused errno 'EBADE' (from the Convergent graveyard) * to indicate checksum errors. @@ -262,7 +265,6 @@ struct zio { uint32_t io_numerrors; uint32_t io_pipeline; uint32_t io_orig_pipeline; - uint32_t io_async_stages; uint64_t io_children_notready; uint64_t io_children_notdone; void *io_waiter; @@ -319,21 +321,18 @@ extern void zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); + +extern int zio_wait_for_children_ready(zio_t *zio); +extern int zio_wait_for_children_done(zio_t *zio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -extern void zio_next_stage(zio_t *zio); -extern void zio_next_stage_async(zio_t *zio); extern void zio_resubmit_stage_async(void *); -extern void zio_wait_children_done(zio_t *zio); /* * Delegate I/O to a child vdev. 
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index a5a0bb54e8..60a1c8b38e 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -38,16 +38,15 @@ extern "C" { /* * I/O Groups: pipeline stage definitions. */ - typedef enum zio_stage { ZIO_STAGE_OPEN = 0, /* RWFCI */ - ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ + ZIO_STAGE_WAIT_FOR_CHILDREN_READY, /* RWFCI */ + ZIO_STAGE_READ_INIT, /* R---- */ + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ - ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ @@ -61,13 +60,11 @@ typedef enum zio_stage { ZIO_STAGE_READY, /* RWFCI */ - ZIO_STAGE_READ_INIT, /* R---- */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ - ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ + ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, /* RWFCI */ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ @@ -77,30 +74,22 @@ typedef enum zio_stage { ZIO_STAGE_DONE /* RWFCI */ } zio_stage_t; -/* - * The stages for which there's some performance value in going async. - * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. 
- */ -#define ZIO_ASYNC_PIPELINE_STAGES \ - ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_READ_DECOMPRESS)) +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_WAIT_FOR_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ + (1U << ZIO_STAGE_DONE)) -#define ZIO_VDEV_IO_PIPELINE \ +#define ZIO_VDEV_IO_STAGES \ ((1U << ZIO_STAGE_VDEV_IO_START) | \ (1U << ZIO_STAGE_VDEV_IO_DONE) | \ (1U << ZIO_STAGE_VDEV_IO_ASSESS)) #define ZIO_READ_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) #define ZIO_READ_GANG_PIPELINE \ ZIO_READ_PHYS_PIPELINE @@ -109,97 +98,66 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READ_INIT) | \ ZIO_READ_PHYS_PIPELINE -#define ZIO_WRITE_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) -#define ZIO_WRITE_COMMON_PIPELINE \ - ZIO_WRITE_PHYS_PIPELINE +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES #define ZIO_WRITE_PIPELINE \ - ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ - ZIO_WRITE_COMMON_PIPELINE) + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_COMPRESS)) -#define ZIO_GANG_STAGES \ +#define ZIO_GANG_REWRITE_STAGES \ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ (1U 
<< ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READ_GANG_MEMBERS)) - -#define ZIO_REWRITE_PIPELINE \ - ((1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - ZIO_WRITE_COMMON_PIPELINE) - -#define ZIO_WRITE_ALLOCATE_PIPELINE \ - ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ - ZIO_WRITE_COMMON_PIPELINE) + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE)) #define ZIO_GANG_FREE_STAGES \ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) -#define ZIO_FREE_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ +#define ZIO_GANG_CLAIM_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS)) + +#define ZIO_REWRITE_PIPELINE(bp) \ + (ZIO_WRITE_COMMON_STAGES | \ + (BP_IS_GANG(bp) ? ZIO_GANG_REWRITE_STAGES : 0)) + +#define ZIO_WRITE_ALLOCATE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) + +#define ZIO_FREE_PIPELINE(bp) \ + (ZIO_INTERLOCK_STAGES | \ (1U << ZIO_STAGE_DVA_FREE) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (BP_IS_GANG(bp) ? 
ZIO_GANG_FREE_STAGES : 0)) -#define ZIO_CLAIM_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ +#define ZIO_CLAIM_PIPELINE(bp) \ + (ZIO_INTERLOCK_STAGES | \ (1U << ZIO_STAGE_DVA_CLAIM) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (BP_IS_GANG(bp) ? ZIO_GANG_CLAIM_STAGES : 0)) #define ZIO_IOCTL_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES) + #define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + ZIO_INTERLOCK_STAGES -#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ (1U << ZIO_STAGE_ASSESS) | \ + (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \ (1U << ZIO_STAGE_DONE)) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ - ZIO_VDEV_IO_PIPELINE) - #define ZIO_ERROR_PIPELINE_MASK \ - ZIO_WAIT_FOR_CHILDREN_PIPELINE + ZIO_INTERLOCK_STAGES typedef struct zio_transform zio_transform_t; struct zio_transform { diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 2a2dc1d625..2b4c663a1a 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -136,6 +136,9 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; + ASSERT(spa_config_held(spa, RW_READER) || + curthread == spa->spa_scrub_thread); + if (vdev < rvd->vdev_children) 
return (rvd->vdev_child[vdev]); @@ -1459,18 +1462,6 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) return (vd->vdev_ops->vdev_op_asize(vd, psize)); } -void -vdev_io_start(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_start(zio); -} - -void -vdev_io_done(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_done(zio); -} - const char * vdev_description(vdev_t *vd) { diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index ce9508d2fb..5f475f9b47 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -231,7 +231,7 @@ vdev_cache_fill(zio_t *zio) zio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = zio->io_error; - zio_next_stage(dio); + zio_execute(dio); } } @@ -286,15 +286,10 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_next_stage(zio); + zio_execute(zio); return (0); } - if (!(zio->io_flags & ZIO_FLAG_METADATA)) { - mutex_exit(&vc->vc_lock); - return (EINVAL); - } - ve = vdev_cache_allocate(zio); if (ve == NULL) { diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index e4e13f2aac..933ed3e2bf 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -386,7 +386,7 @@ vdev_disk_io_intr(buf_t *bp) kmem_free(vdb, sizeof (vdev_disk_buf_t)); - zio_next_stage_async(zio); + zio_interrupt(zio); } static void @@ -396,10 +396,10 @@ vdev_disk_ioctl_done(void *zio_arg, int error) zio->io_error = error; - zio_next_stage_async(zio); + zio_interrupt(zio); } -static void +static int vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -414,8 +414,7 @@ vdev_disk_io_start(zio_t *zio) /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -444,8 +443,10 @@ vdev_disk_io_start(zio_t *zio) * and will call 
vdev_disk_ioctl_done() * upon completion. */ - return; - } else if (error == ENOTSUP || error == ENOTTY) { + return (ZIO_PIPELINE_STOP); + } + + if (error == ENOTSUP || error == ENOTTY) { /* * If we get ENOTSUP or ENOTTY, we know that * no future attempts will ever succeed. @@ -463,15 +464,26 @@ vdev_disk_io_start(zio_t *zio) zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; + return (ZIO_PIPELINE_STOP); if ((zio = vdev_queue_io(zio)) == NULL) - return; + return (ZIO_PIPELINE_STOP); + + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; + + if (error) { + zio->io_error = error; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); flags |= B_BUSY | B_NOCACHE; @@ -491,26 +503,14 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; - /* XXPOLICY */ - if (zio->io_type == ZIO_TYPE_WRITE) - error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; - else - error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; - error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? 
ENXIO : error; - if (error) { - zio->io_error = error; - bioerror(bp, error); - bp->b_resid = bp->b_bcount; - bp->b_iodone(bp); - return; - } - error = ldi_strategy(dvd->vd_lh, bp); /* ldi_strategy() will return non-zero only on programming errors */ ASSERT(error == 0); + + return (ZIO_PIPELINE_STOP); } -static void +static int vdev_disk_io_done(zio_t *zio) { vdev_queue_io_done(zio); @@ -544,7 +544,7 @@ vdev_disk_io_done(zio_t *zio) } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } vdev_ops_t vdev_disk_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index ee30845cb7..51abd9612b 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -215,7 +215,7 @@ vdev_file_probe(vdev_t *vd) return (error); } -static void +static int vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -229,8 +229,7 @@ vdev_file_io_start(zio_t *zio) /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -244,8 +243,7 @@ vdev_file_io_start(zio_t *zio) zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } /* @@ -254,11 +252,11 @@ vdev_file_io_start(zio_t *zio) */ #ifndef _KERNEL if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; + return (ZIO_PIPELINE_STOP); #endif if ((zio = vdev_queue_io(zio)) == NULL) - return; + return (ZIO_PIPELINE_STOP); /* XXPOLICY */ if (zio->io_type == ZIO_TYPE_WRITE) @@ -268,8 +266,8 @@ vdev_file_io_start(zio_t *zio) error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; - zio_next_stage_async(zio); - return; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
@@ -280,26 +278,25 @@ vdev_file_io_start(zio_t *zio) if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_next_stage_async(zio); + zio_interrupt(zio); + + return (ZIO_PIPELINE_STOP); } -static void +static int vdev_file_io_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio->io_error = zio_handle_device_injection(vd, EIO); /* * If an error has been encountered then attempt to probe the device * to determine if it's still accessible. */ - if (zio->io_error == EIO) { - vdev_t *vd = zio->io_vd; - - if (vdev_probe(vd) != 0) - vd->vdev_is_failing = B_TRUE; - } + if (zio->io_error == EIO && vdev_probe(vd) != 0) + vd->vdev_is_failing = B_TRUE; vdev_queue_io_done(zio); @@ -308,7 +305,7 @@ vdev_file_io_done(zio_t *zio) vdev_cache_write(zio); #endif - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } vdev_ops_t vdev_file_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 45d326ae69..4d6c499c10 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -253,7 +253,7 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static void +static int vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; @@ -279,8 +279,7 @@ vdev_mirror_io_start(zio_t *zio) ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done, mc)); } - zio_wait_children_done(zio); - return; + return (zio_wait_for_children_done(zio)); } /* * For normal reads just pick one child. 
@@ -316,10 +315,10 @@ vdev_mirror_io_start(zio_t *zio) c++; } - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } -static void +static int vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; @@ -362,8 +361,7 @@ vdev_mirror_io_done(zio_t *zio) if (good_copies != 0) zio->io_error = 0; vdev_mirror_map_free(zio); - zio_next_stage(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -383,8 +381,7 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); - zio_wait_children_done(zio); - return; + return (zio_wait_for_children_done(zio)); } /* XXPOLICY */ @@ -441,12 +438,13 @@ vdev_mirror_io_done(zio_t *zio) } zio_nowait(rio); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } vdev_mirror_map_free(zio); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c index 3aa831c46d..49727ef996 100644 --- a/usr/src/uts/common/fs/zfs/vdev_missing.c +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -62,18 +62,18 @@ vdev_missing_close(vdev_t *vd) } /* ARGSUSED */ -static void +static int vdev_missing_io_start(zio_t *zio) { zio->io_error = ENOTSUP; - zio_next_stage_async(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */ -static void +static int vdev_missing_io_done(zio_t *zio) { - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */ diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 7e99c1fd5b..0f921e088a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -162,7 +162,7 @@ vdev_queue_agg_io_done(zio_t *aio) aio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = aio->io_error; - 
zio_next_stage(dio); + zio_execute(dio); } ASSERT3U(offset, ==, aio->io_size); @@ -172,11 +172,8 @@ vdev_queue_agg_io_done(zio_t *aio) #define IS_ADJACENT(io, nio) \ ((io)->io_offset + (io)->io_size == (nio)->io_offset) -typedef void zio_issue_func_t(zio_t *); - static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, - zio_issue_func_t **funcp) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio; avl_tree_t *tree; @@ -184,8 +181,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, ASSERT(MUTEX_HELD(&vq->vq_lock)); - *funcp = NULL; - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); @@ -245,7 +240,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, avl_add(&vq->vq_pending_tree, aio); - *funcp = zio_nowait; return (aio); } @@ -254,8 +248,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, avl_add(&vq->vq_pending_tree, fio); - *funcp = zio_next_stage; - return (fio); } @@ -264,7 +256,6 @@ vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); @@ -285,15 +276,19 @@ vdev_queue_io(zio_t *zio) vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); mutex_exit(&vq->vq_lock); - if (nio == NULL || func != zio_nowait) - return (nio); + if (nio == NULL) + return (NULL); + + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } - func(nio); - return (NULL); + return (nio); } void @@ -301,7 +296,6 @@ vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; int i; mutex_enter(&vq->vq_lock); @@ -309,13 +303,16 @@ vdev_queue_io_done(zio_t *zio) avl_remove(&vq->vq_pending_tree, zio); for (i 
= 0; i < zfs_vdev_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) break; mutex_exit(&vq->vq_lock); - if (func == zio_next_stage) + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { zio_vdev_io_reissue(nio); - func(nio); + zio_execute(nio); + } mutex_enter(&vq->vq_lock); } diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 73a3ae2565..74b035868c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -639,7 +639,7 @@ vdev_raidz_repair_done(zio_t *zio) vdev_raidz_map_free(zio->io_private); } -static void +static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -672,8 +672,8 @@ vdev_raidz_io_start(zio_t *zio) zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_raidz_child_done, rc)); } - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -714,7 +714,7 @@ vdev_raidz_io_start(zio_t *zio) } } - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } /* @@ -783,7 +783,7 @@ static uint64_t raidz_corrected_p; static uint64_t raidz_corrected_q; static uint64_t raidz_corrected_pq; -static void +static int vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -840,8 +840,8 @@ vdev_raidz_io_done(zio_t *zio) zio->io_error = 0; vdev_raidz_map_free(zio); - zio_next_stage(zio); - return; + + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -1022,8 +1022,8 @@ vdev_raidz_io_done(zio_t *zio) vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); dprintf("rereading\n"); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } /* @@ -1205,12 +1205,13 @@ done: } zio_nowait(rio); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } 
vdev_raidz_map_free(zio); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 112aaa6f25..4aa21a6501 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -61,9 +61,6 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { char *zio_type_name[ZIO_TYPES] = { "null", "read", "write", "free", "claim", "ioctl" }; -/* At or above this size, force gang blocking - for testing */ -uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; - /* Force an allocation failure when non-zero */ uint16_t zio_zil_fail_shift = 0; uint16_t zio_io_fail_shift = 0; @@ -170,8 +167,6 @@ zio_init(void) align, NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NODEBUG); - dprintf("creating cache for size %5lx align %5lx\n", - size, align); } } @@ -356,9 +351,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_bp = bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; - if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata || - BP_GET_LEVEL(bp) != 0) - zio->io_flags |= ZIO_FLAG_METADATA; } zio->io_done = done; zio->io_private = private; @@ -366,10 +358,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_priority = priority; zio->io_stage = stage; zio->io_pipeline = pipeline; - zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; zio->io_timestamp = lbolt64; - if (pio != NULL) - zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); zio_push_transform(zio, data, size, size); @@ -395,7 +384,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, if (pio == NULL) { if (type != ZIO_TYPE_NULL && !(flags & ZIO_FLAG_CONFIG_HELD)) { - spa_config_enter(zio->io_spa, RW_READER, zio); + spa_config_enter(spa, RW_READER, zio); zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; } zio->io_root = zio; @@ -409,7 +398,7 @@ zio_create(zio_t *pio, spa_t *spa, 
uint64_t txg, blkptr_t *bp, !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - spa_config_enter(zio->io_spa, RW_READER, pio); + spa_config_enter(spa, RW_READER, pio); } if (stage < ZIO_STAGE_READY) pio->io_children_notready++; @@ -524,9 +513,6 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, zio->io_compress = compress; zio->io_ndvas = ncopies; - if (compress != ZIO_COMPRESS_OFF) - zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; - if (bp->blk_birth != txg) { /* XXX the bp usually (always?) gets re-zeroed later */ BP_ZERO(bp); @@ -551,7 +537,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); zio->io_bookmark = *zb; zio->io_checksum = checksum; @@ -612,7 +598,7 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); + ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); zio->io_bp = &zio->io_bp_copy; @@ -641,7 +627,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, - ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); zio->io_bp = &zio->io_bp_copy; @@ -820,7 +806,7 @@ zio_wait(zio_t *zio) zio->io_waiter = curthread; - zio_next_stage_async(zio); + zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_stalled != ZIO_STAGE_DONE) @@ -838,7 +824,23 @@ zio_wait(zio_t *zio) void zio_nowait(zio_t *zio) { - zio_next_stage_async(zio); + zio_execute(zio); +} + +void +zio_interrupt(zio_t *zio) +{ + (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], + 
(task_func_t *)zio_execute, zio, TQ_SLEEP); +} + +static int +zio_issue_async(zio_t *zio) +{ + (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], + (task_func_t *)zio_execute, zio, TQ_SLEEP); + + return (ZIO_PIPELINE_STOP); } /* @@ -846,18 +848,20 @@ zio_nowait(zio_t *zio) * I/O pipeline interlocks: parent/child dependency scoreboarding * ========================================================================== */ -static void +static int zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) { + int rv = ZIO_PIPELINE_CONTINUE; + mutex_enter(&zio->io_lock); - if (*countp == 0) { - ASSERT(zio->io_stalled == 0); - mutex_exit(&zio->io_lock); - zio_next_stage(zio); - } else { + ASSERT(zio->io_stalled == 0); + if (*countp != 0) { zio->io_stalled = stage; - mutex_exit(&zio->io_lock); + rv = ZIO_PIPELINE_STOP; } + mutex_exit(&zio->io_lock); + + return (rv); } static void @@ -872,48 +876,54 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) if (--*countp == 0 && pio->io_stalled == stage) { pio->io_stalled = 0; mutex_exit(&pio->io_lock); - zio_next_stage_async(pio); + zio_execute(pio); } else { mutex_exit(&pio->io_lock); } } -static void -zio_wait_children_ready(zio_t *zio) +int +zio_wait_for_children_ready(zio_t *zio) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &zio->io_children_notready); + return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, + &zio->io_children_notready)); } -void -zio_wait_children_done(zio_t *zio) +int +zio_wait_for_children_done(zio_t *zio) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &zio->io_children_notdone); + return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, + &zio->io_children_notdone)); } -static void +static int zio_read_init(zio_t *zio) { - if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(zio->io_bp); + blkptr_t *bp = zio->io_bp; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + 
uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); zio_push_transform(zio, cbuf, csize, csize); zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; } - if (BP_IS_GANG(zio->io_bp)) { + if (BP_IS_GANG(bp)) { uint64_t gsize = SPA_GANGBLOCKSIZE; void *gbuf = zio_buf_alloc(gsize); zio_push_transform(zio, gbuf, gsize, gsize); zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; } - zio_next_stage(zio); + + if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_ready(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -922,16 +932,16 @@ zio_ready(zio_t *zio) zio->io_ready(zio); if (pio != NULL) - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, &pio->io_children_notready); if (zio->io_bp) zio->io_bp_copy = *zio->io_bp; - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_vdev_retry_io(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -967,7 +977,7 @@ zio_vdev_retry_io(zio_t *zio) if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) pio->io_flags |= ZIO_FLAG_WRITE_RETRY; - ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE); + ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); mutex_exit(&pio->io_lock); } @@ -977,7 +987,8 @@ zio_vdev_retry_io(zio_t *zio) */ zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; zio->io_error = 0; - zio_next_stage_async(zio); + + return (ZIO_PIPELINE_CONTINUE); } int @@ -1029,7 +1040,7 @@ zio_vdev_resume_io(spa_t *spa) zio->io_stage = ZIO_STAGE_READY; } - (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async, + (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute, zio, TQ_SLEEP); } mutex_exit(&spa->spa_zio_lock); @@ -1049,7 +1060,7 @@ zio_vdev_resume_io(spa_t *spa) return (0); } -static void +static int zio_vdev_suspend_io(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1069,9 +1080,11 @@ 
zio_vdev_suspend_io(zio_t *zio) cv_broadcast(&spa->spa_zio_cv); #endif mutex_exit(&spa->spa_zio_lock); + + return (ZIO_PIPELINE_STOP); } -static void +static int zio_assess(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1138,10 +1151,9 @@ zio_assess(zio_t *zio) * property. */ if (zio_write_retry && zio->io_error != ENOSPC && - IO_IS_ALLOCATING(zio)) { - zio_vdev_retry_io(zio); - return; - } + IO_IS_ALLOCATING(zio)) + return (zio_vdev_retry_io(zio)); + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); /* @@ -1175,22 +1187,20 @@ zio_assess(zio_t *zio) "uncorrectable I/O failure and the " "failure mode property for this pool " "is set to panic.", spa_name(spa)); - } else { - cmn_err(CE_WARN, "Pool '%s' has encountered " - "an uncorrectable I/O error. Manual " - "intervention is required.", - spa_name(spa)); - zio_vdev_suspend_io(zio); } - return; + cmn_err(CE_WARN, "Pool '%s' has encountered " + "an uncorrectable I/O error. " + "Manual intervention is required.", spa_name(spa)); + return (zio_vdev_suspend_io(zio)); } } ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); ASSERT(zio->io_children_notready == 0); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_done(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -1221,7 +1231,7 @@ zio_done(zio_t *zio) pio->io_child = next; mutex_exit(&pio->io_lock); - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, &pio->io_children_notdone); } @@ -1243,6 +1253,8 @@ zio_done(zio_t *zio) cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } + + return (ZIO_PIPELINE_STOP); } /* @@ -1250,7 +1262,7 @@ zio_done(zio_t *zio) * Compression support * ========================================================================== */ -static void +static int zio_write_compress(zio_t *zio) { int compress = zio->io_compress; @@ -1300,7 +1312,7 @@ zio_write_compress(zio_t *zio) ASSERT(csize != 0); BP_SET_LSIZE(bp, lsize); BP_SET_COMPRESS(bp, compress); - 
zio->io_pipeline = ZIO_REWRITE_PIPELINE; + zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp); } else { if (bp->blk_birth == zio->io_txg) BP_ZERO(bp); @@ -1316,10 +1328,10 @@ zio_write_compress(zio_t *zio) } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_read_decompress(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1338,7 +1350,7 @@ zio_read_decompress(zio_t *zio) zio_buf_free(data, bufsize); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1347,19 +1359,6 @@ zio_read_decompress(zio_t *zio) * ========================================================================== */ static void -zio_gang_pipeline(zio_t *zio) -{ - /* - * By default, the pipeline assumes that we're dealing with a gang - * block. If we're not, strip out any gang-specific stages. - */ - if (!BP_IS_GANG(zio->io_bp)) - zio->io_pipeline &= ~ZIO_GANG_STAGES; - - zio_next_stage(zio); -} - -static void zio_gang_byteswap(zio_t *zio) { ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); @@ -1368,7 +1367,7 @@ zio_gang_byteswap(zio_t *zio) byteswap_uint64_array(zio->io_data, zio->io_size); } -static void +static int zio_get_gang_header(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1384,10 +1383,10 @@ zio_get_gang_header(zio_t *zio) zio->io_flags & ZIO_FLAG_GANG_INHERIT, ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } -static void +static int zio_read_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1410,16 +1409,17 @@ zio_read_gang_members(zio_t *zio) ASSERT(!BP_IS_HOLE(gbp)); zio_nowait(zio_read(zio, zio->io_spa, gbp, - (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - &zio->io_bookmark)); + (char *)zio->io_data + loff, lsize, + NULL, NULL, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); } zio_buf_free(gbh, gbufsize); - zio_wait_children_done(zio); + + return (zio_wait_for_children_done(zio)); } -static 
void +static int zio_rewrite_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1446,15 +1446,16 @@ zio_rewrite_gang_members(zio_t *zio) zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags, - &zio->io_bookmark)); + NULL, NULL, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); } zio_push_transform(zio, gbh, gsize, gbufsize); - zio_wait_children_ready(zio); + + return (zio_wait_for_children_ready(zio)); } -static void +static int zio_free_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1476,10 +1477,11 @@ zio_free_gang_members(zio_t *zio) } zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_claim_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1500,7 +1502,8 @@ zio_claim_gang_members(zio_t *zio) } zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void @@ -1549,8 +1552,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error) - return (error); + if (error) { + zio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } for (d = 0; d < gbh_ndvas; d++) DVA_SET_GANG(&dva[d], 1); @@ -1560,10 +1565,6 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) gbh = zio_buf_alloc(gsize); bzero(gbh, gsize); - /* We need to test multi-level gang blocks */ - if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) - maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, resid -= lsize, gbps_left--, i++) { blkptr_t *gbp = &gbh->zg_blkptr[i]; @@ -1579,8 +1580,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) break; ASSERT3U(error, ==, ENOSPC); /* XXX - free up previous allocations? 
*/ - if (maxalloc == SPA_MINBLOCKSIZE) - return (error); + if (maxalloc == SPA_MINBLOCKSIZE) { + zio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); } @@ -1614,14 +1617,14 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; zio_push_transform(zio, gbh, gsize, gsize); + /* - * As much as we'd like this to be zio_wait_children_ready(), + * As much as we'd like this to be 'ready' instead of 'done', * updating our ASIZE doesn't happen until the io_done callback, * so we have to wait for that to finish in order for our BP * to be stable. */ - zio_wait_children_done(zio); - return (0); + return (zio_wait_for_children_done(zio)); } /* @@ -1629,7 +1632,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) * Allocate and free blocks * ========================================================================== */ -static void +static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1642,14 +1645,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_ndvas, >, 0); ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); - /* For testing, make some blocks above a certain size be gang blocks */ - if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { - error = zio_write_allocate_gang_members(zio, mc); - if (error) - zio->io_error = error; - return; - } - /* * For testing purposes, we force I/Os to retry. 
We don't allow * retries beyond the first pass since those I/Os are non-allocating @@ -1668,17 +1663,15 @@ zio_dva_allocate(zio_t *zio) if (error == 0) { bp->blk_birth = zio->io_txg; } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { - error = zio_write_allocate_gang_members(zio, mc); - if (error == 0) - return; - zio->io_error = error; + return (zio_write_allocate_gang_members(zio, mc)); } else { zio->io_error = error; } - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_free(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1687,15 +1680,15 @@ zio_dva_free(zio_t *zio) BP_ZERO(bp); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_claim(zio_t *zio) { zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1704,7 +1697,7 @@ zio_dva_claim(zio_t *zio) * ========================================================================== */ -static void +static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1719,24 +1712,21 @@ zio_vdev_io_start(zio_t *zio) * at that time. 
*/ if (spa_state(spa) == POOL_STATE_IO_FAILURE && - zio->io_type == ZIO_TYPE_WRITE) { - zio_vdev_suspend_io(zio); - return; - } + zio->io_type == ZIO_TYPE_WRITE) + return (zio_vdev_suspend_io(zio)); - if (vd == NULL) { - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_start(zio); - return; - } + /* + * The mirror_ops handle multiple DVAs in a single BP + */ + if (vd == NULL) + return (vdev_mirror_ops.vdev_op_io_start(zio)); align = 1ULL << tvd->vdev_ashift; if (zio->io_retries == 0 && vd == tvd) zio->io_flags |= ZIO_FLAG_FAILFAST; - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - vd->vdev_children == 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { zio->io_flags |= ZIO_FLAG_PHYSICAL; zio->io_offset += VDEV_LABEL_START_SIZE; } @@ -1760,19 +1750,16 @@ zio_vdev_io_start(zio_t *zio) P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); - vdev_io_start(zio); - - /* zio_next_stage_async() gets called from io completion interrupt */ + return (vd->vdev_ops->vdev_op_io_start(zio)); } -static void +static int zio_vdev_io_done(zio_t *zio) { if (zio->io_vd == NULL) - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_done(zio); - else - vdev_io_done(zio); + return (vdev_mirror_ops.vdev_op_io_done(zio)); + + return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); } /* XXPOLICY */ @@ -1795,7 +1782,7 @@ zio_should_retry(zio_t *zio) return (B_TRUE); } -static void +static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1833,15 +1820,10 @@ zio_vdev_io_assess(zio_t *zio) zio->io_flags |= ZIO_FLAG_DONT_CACHE; zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; - dprintf("retry #%d for %s to %s offset %llx\n", - zio->io_retries, zio_type_name[zio->io_type], - vdev_description(vd), zio->io_offset); - - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - zio_next_stage(zio); + return 
(ZIO_PIPELINE_CONTINUE); } void @@ -1876,7 +1858,7 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static void +static int zio_checksum_generate(zio_t *zio) { int checksum = zio->io_checksum; @@ -1889,10 +1871,10 @@ zio_checksum_generate(zio_t *zio) zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_gang_checksum_generate(zio_t *zio) { zio_cksum_t zc; @@ -1905,10 +1887,10 @@ zio_gang_checksum_generate(zio_t *zio) zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_checksum_verify(zio_t *zio) { if (zio->io_bp != NULL) { @@ -1918,7 +1900,7 @@ zio_checksum_verify(zio_t *zio) zio->io_spa, zio->io_vd, zio, 0, 0); } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1949,20 +1931,15 @@ zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) * Define the pipeline * ========================================================================== */ -typedef void zio_pipe_stage_t(zio_t *zio); - -static void -zio_badop(zio_t *zio) -{ - panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); -} +typedef int zio_pipe_stage_t(zio_t *zio); zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { - zio_badop, - zio_wait_children_ready, + NULL, + zio_wait_for_children_ready, + zio_read_init, + zio_issue_async, zio_write_compress, zio_checksum_generate, - zio_gang_pipeline, zio_get_gang_header, zio_rewrite_gang_members, zio_free_gang_members, @@ -1972,116 +1949,63 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_dva_claim, zio_gang_checksum_generate, zio_ready, - zio_read_init, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, - zio_wait_children_done, + zio_wait_for_children_done, zio_checksum_verify, zio_read_gang_members, 
zio_read_decompress, zio_assess, zio_done, - zio_badop + NULL }; /* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait(). + * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. */ void -zio_next_stage(zio_t *zio) +zio_execute(zio_t *zio) { - uint32_t pipeline = zio->io_pipeline; + while (zio->io_stage < ZIO_STAGE_DONE) { + uint32_t pipeline = zio->io_pipeline; + int rv; - ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(!MUTEX_HELD(&zio->io_lock)); - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + /* + * If an error occurred outside the vdev stack, + * just execute the interlock stages to clean up. + */ + if (zio->io_error && + ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) pipeline &= ZIO_ERROR_PIPELINE_MASK; - } - - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; - - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); - - /* - * See the comment in zio_next_stage_async() about per-CPU taskqs. 
- */ - if (((1U << zio->io_stage) & zio->io_async_stages) && - (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && - !(zio->io_flags & ZIO_FLAG_METADATA)) { - taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} -void -zio_next_stage_async(zio_t *zio) -{ - taskq_t *tq; - uint32_t pipeline = zio->io_pipeline; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) - pipeline &= ZIO_ERROR_PIPELINE_MASK; - } + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; + rv = zio_pipeline[zio->io_stage](zio); - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); + if (rv == ZIO_PIPELINE_STOP) + return; - /* - * For performance, we'll probably want two sets of task queues: - * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU - * part is for read performance: since we have to make a pass over - * the data to checksum it anyway, we want to do this on the same CPU - * that issued the read, because (assuming CPU scheduling affinity) - * that thread is probably still there. Getting this optimization - * right avoids performance-hostile cache-to-cache transfers. - * - * Note that having two sets of task queues is also necessary for - * correctness: if all of the issue threads get bogged down waiting - * for dependent reads (e.g. metaslab freelist) to complete, then - * there won't be any threads available to service I/O completion - * interrupts. 
- */ - if ((1U << zio->io_stage) & zio->io_async_stages) { - if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) - tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - else - tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); + ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } -void -zio_resubmit_stage_async(void *arg) -{ - zio_t *zio = (zio_t *)(uintptr_t)arg; - - zio_next_stage_async(zio); -} - static boolean_t zio_io_should_fail(uint16_t range) { |