-rw-r--r--	deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c (renamed from usr/src/uts/common/fs/zfs/rprwlock.c)	0
-rw-r--r--	deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h (renamed from usr/src/uts/common/fs/zfs/sys/rprwlock.h)	0
-rw-r--r--	usr/src/cmd/zdb/zdb.c	9
-rw-r--r--	usr/src/cmd/ztest/ztest.c	894
-rw-r--r--	usr/src/lib/libzpool/common/llib-lzpool	2
-rw-r--r--	usr/src/uts/common/Makefile.files	1
-rw-r--r--	usr/src/uts/common/fs/zfs/metaslab.c	7
-rw-r--r--	usr/src/uts/common/fs/zfs/spa_misc.c	116
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/spa_impl.h	11
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/vdev.h	3
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/vdev_impl.h	4
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zio.h	19
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zio_impl.h	144
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev.c	15
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_cache.c	9
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_disk.c	54
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_file.c	35
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_mirror.c	22
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_missing.c	8
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_queue.c	37
-rw-r--r--	usr/src/uts/common/fs/zfs/vdev_raidz.c	25
-rw-r--r--	usr/src/uts/common/fs/zfs/zio.c	436
22 files changed, 865 insertions(+), 986 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/rprwlock.c b/deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c
index 49ae505209..49ae505209 100644
--- a/usr/src/uts/common/fs/zfs/rprwlock.c
+++ b/deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c
diff --git a/usr/src/uts/common/fs/zfs/sys/rprwlock.h b/deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h
index ba23799c9d..ba23799c9d 100644
--- a/usr/src/uts/common/fs/zfs/sys/rprwlock.h
+++ b/deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 426e275080..260e1cb391 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -501,10 +501,8 @@ dump_metaslabs(spa_t *spa)
for (c = 0; c < rvd->vdev_children; c++) {
vd = rvd->vdev_child[c];
- spa_config_enter(spa, RW_READER, FTAG);
(void) printf("\n vdev %llu = %s\n\n",
(u_longlong_t)vd->vdev_id, vdev_description(vd));
- spa_config_exit(spa, FTAG);
if (dump_opt['d'] <= 5) {
(void) printf("\t%10s %10s %5s\n",
@@ -522,7 +520,6 @@ static void
dump_dtl(vdev_t *vd, int indent)
{
avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
- spa_t *spa = vd->vdev_spa;
space_seg_t *ss;
vdev_t *pvd;
int c;
@@ -530,9 +527,7 @@ dump_dtl(vdev_t *vd, int indent)
if (indent == 0)
(void) printf("\nDirty time logs:\n\n");
- spa_config_enter(spa, RW_READER, FTAG);
(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
- spa_config_exit(spa, FTAG);
for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
/*
@@ -1730,6 +1725,8 @@ dump_zpool(spa_t *spa)
dsl_pool_t *dp = spa_get_dsl(spa);
int rc = 0;
+ spa_config_enter(spa, RW_READER, FTAG);
+
if (dump_opt['u'])
dump_uberblock(&spa->spa_uberblock);
@@ -1751,6 +1748,8 @@ dump_zpool(spa_t *spa)
if (dump_opt['s'])
show_pool_stats(spa);
+ spa_config_exit(spa, FTAG);
+
if (rc != 0)
exit(rc);
}
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 54fdec794e..2d9b11f20d 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -127,8 +127,18 @@ static uint64_t zopt_time = 300; /* 5 minutes */
static int zopt_maxfaults;
static uint16_t zopt_write_fail_shift = 5;
+typedef struct ztest_block_tag {
+ uint64_t bt_objset;
+ uint64_t bt_object;
+ uint64_t bt_offset;
+ uint64_t bt_txg;
+ uint64_t bt_thread;
+ uint64_t bt_seq;
+} ztest_block_tag_t;
+
typedef struct ztest_args {
- char *za_pool;
+ char za_pool[MAXNAMELEN];
+ spa_t *za_spa;
objset_t *za_os;
zilog_t *za_zilog;
thread_t za_thread;
@@ -141,6 +151,13 @@ typedef struct ztest_args {
hrtime_t za_stop;
hrtime_t za_kill;
traverse_handle_t *za_th;
+ /*
+ * Thread-local variables can go here to aid debugging.
+ */
+ ztest_block_tag_t za_rbt;
+ ztest_block_tag_t za_wbt;
+ dmu_object_info_t za_doi;
+ dmu_buf_t *za_dbuf;
} ztest_args_t;
typedef void ztest_func_t(ztest_args_t *);
@@ -167,6 +184,7 @@ ztest_func_t ztest_spa_rename;
typedef struct ztest_info {
ztest_func_t *zi_func; /* test function */
+ uint64_t zi_iters; /* iterations per execution */
uint64_t *zi_interval; /* execute every <interval> seconds */
uint64_t zi_calls; /* per-pass count */
uint64_t zi_call_time; /* per-pass time */
@@ -180,22 +198,22 @@ uint64_t zopt_sometimes = 10; /* every 10 seconds */
uint64_t zopt_rarely = 60; /* every 60 seconds */
ztest_info_t ztest_info[] = {
- { ztest_dmu_read_write, &zopt_always },
- { ztest_dmu_write_parallel, &zopt_always },
- { ztest_dmu_object_alloc_free, &zopt_always },
- { ztest_zap, &zopt_always },
- { ztest_zap_parallel, &zopt_always },
- { ztest_traverse, &zopt_often },
- { ztest_dsl_prop_get_set, &zopt_sometimes },
- { ztest_dmu_objset_create_destroy, &zopt_sometimes },
- { ztest_dmu_snapshot_create_destroy, &zopt_rarely },
- { ztest_spa_create_destroy, &zopt_sometimes },
- { ztest_fault_inject, &zopt_sometimes },
- { ztest_spa_rename, &zopt_rarely },
- { ztest_vdev_attach_detach, &zopt_rarely },
- { ztest_vdev_LUN_growth, &zopt_rarely },
- { ztest_vdev_add_remove, &zopt_vdevtime },
- { ztest_scrub, &zopt_vdevtime },
+ { ztest_dmu_read_write, 1, &zopt_always },
+ { ztest_dmu_write_parallel, 30, &zopt_always },
+ { ztest_dmu_object_alloc_free, 1, &zopt_always },
+ { ztest_zap, 30, &zopt_always },
+ { ztest_zap_parallel, 100, &zopt_always },
+ { ztest_traverse, 1, &zopt_often },
+ { ztest_dsl_prop_get_set, 1, &zopt_sometimes },
+ { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes },
+ { ztest_dmu_snapshot_create_destroy, 1, &zopt_rarely },
+ { ztest_spa_create_destroy, 1, &zopt_sometimes },
+ { ztest_fault_inject, 1, &zopt_sometimes },
+ { ztest_spa_rename, 1, &zopt_rarely },
+ { ztest_vdev_attach_detach, 1, &zopt_rarely },
+ { ztest_vdev_LUN_growth, 1, &zopt_rarely },
+ { ztest_vdev_add_remove, 1, &zopt_vdevtime },
+ { ztest_scrub, 1, &zopt_vdevtime },
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -214,21 +232,11 @@ typedef struct ztest_shared {
hrtime_t zs_stop_time;
uint64_t zs_alloc;
uint64_t zs_space;
- uint64_t zs_txg;
ztest_info_t zs_info[ZTEST_FUNCS];
mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS];
uint64_t zs_seq[ZTEST_SYNC_LOCKS];
} ztest_shared_t;
-typedef struct ztest_block_tag {
- uint64_t bt_objset;
- uint64_t bt_object;
- uint64_t bt_offset;
- uint64_t bt_txg;
- uint64_t bt_thread;
- uint64_t bt_seq;
-} ztest_block_tag_t;
-
static char ztest_dev_template[] = "%s/%s.%llua";
static ztest_shared_t *ztest_shared;
@@ -237,7 +245,7 @@ static int ztest_dump_core = 1;
static boolean_t ztest_exiting = B_FALSE;
-extern uint64_t zio_gang_bang;
+extern uint64_t metaslab_gang_bang;
extern uint16_t zio_zil_fail_shift;
extern uint16_t zio_io_fail_shift;
@@ -359,7 +367,7 @@ usage(boolean_t requested)
FILE *fp = requested ? stdout : stderr;
nicenum(zopt_vdev_size, nice_vdev_size);
- nicenum(zio_gang_bang, nice_gang_bang);
+ nicenum(metaslab_gang_bang, nice_gang_bang);
(void) fprintf(fp, "Usage: %s\n"
"\t[-v vdevs (default: %llu)]\n"
@@ -432,7 +440,7 @@ process_options(int argc, char **argv)
uint64_t value;
/* By default, test gang blocks for blocks 32K and greater */
- zio_gang_bang = 32 << 10;
+ metaslab_gang_bang = 32 << 10;
/* Default value, fail every 32nd allocation */
zio_zil_fail_shift = 5;
@@ -484,7 +492,7 @@ process_options(int argc, char **argv)
zopt_threads = MAX(1, value);
break;
case 'g':
- zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
+ metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
break;
case 'i':
zopt_init = value;
@@ -835,7 +843,7 @@ ztest_spa_create_destroy(ztest_args_t *za)
void
ztest_vdev_add_remove(ztest_args_t *za)
{
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
nvlist_t *nvroot;
int error;
@@ -906,7 +914,7 @@ vdev_lookup_by_path(vdev_t *vd, const char *path)
void
ztest_vdev_attach_detach(ztest_args_t *za)
{
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *pvd;
nvlist_t *root, *file;
@@ -1056,7 +1064,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
void
ztest_vdev_LUN_growth(ztest_args_t *za)
{
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
char dev_name[MAXPATHLEN];
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
uint64_t vdev;
@@ -1106,7 +1114,7 @@ ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
*/
VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
- DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
+ DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
@@ -1115,12 +1123,12 @@ ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
}
-/* ARGSUSED */
static int
ztest_destroy_cb(char *name, void *arg)
{
+ ztest_args_t *za = arg;
objset_t *os;
- dmu_object_info_t doi;
+ dmu_object_info_t *doi = &za->za_doi;
int error;
/*
@@ -1129,12 +1137,12 @@ ztest_destroy_cb(char *name, void *arg)
error = dmu_objset_open(name, DMU_OST_OTHER,
DS_MODE_STANDARD | DS_MODE_READONLY, &os);
ASSERT3U(error, ==, 0);
- error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+ error = dmu_object_info(os, ZTEST_DIROBJ, doi);
if (error != ENOENT) {
/* We could have crashed in the middle of destroying it */
ASSERT3U(error, ==, 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
- ASSERT3S(doi.doi_physical_blks, >=, 0);
+ ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
+ ASSERT3S(doi->doi_physical_blks, >=, 0);
}
dmu_objset_close(os);
@@ -1215,7 +1223,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
* create lying around from a previous run. If so, destroy it
* and all of its snapshots.
*/
- (void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+ (void) dmu_objset_find(name, ztest_destroy_cb, za,
DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
/*
@@ -1428,7 +1436,7 @@ ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
void
ztest_traverse(ztest_args_t *za)
{
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
traverse_handle_t *th = za->za_th;
int rc, advance;
uint64_t cbstart, cblimit;
@@ -1500,7 +1508,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
dmu_tx_t *tx;
uint64_t batchobj, object, batchsize, endoff, temp;
int b, c, error, bonuslen;
- dmu_object_info_t doi;
+ dmu_object_info_t *doi = &za->za_doi;
char osname[MAXNAMELEN];
dmu_objset_name(os, osname);
@@ -1545,13 +1553,14 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
* We expect the nth byte of the bonus buffer to be n.
*/
VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
+ za->za_dbuf = db;
- dmu_object_info_from_db(db, &doi);
- ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
- ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
- ASSERT3S(doi.doi_physical_blks, >=, 0);
+ dmu_object_info_from_db(db, doi);
+ ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
+ ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
+ ASSERT3S(doi->doi_physical_blks, >=, 0);
- bonuslen = doi.doi_bonus_size;
+ bonuslen = doi->doi_bonus_size;
for (c = 0; c < bonuslen; c++) {
if (((uint8_t *)db->db_data)[c] !=
@@ -1565,6 +1574,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
}
dmu_buf_rele(db, FTAG);
+ za->za_dbuf = NULL;
/*
* We expect the word at endoff to be our object number.
@@ -1669,7 +1679,8 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
/*
* Write to both the bonus buffer and the regular data.
*/
- VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
+ VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
+ za->za_dbuf = db;
ASSERT3U(bonuslen, <=, db->db_size);
dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
@@ -1685,6 +1696,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
dmu_buf_rele(db, FTAG);
+ za->za_dbuf = NULL;
/*
* Write to a large offset to increase indirection.
@@ -1939,244 +1951,229 @@ ztest_dmu_read_write(ztest_args_t *za)
}
void
-ztest_dmu_check_future_leak(objset_t *os, uint64_t txg)
+ztest_dmu_check_future_leak(ztest_args_t *za)
{
+ objset_t *os = za->za_os;
dmu_buf_t *db;
- ztest_block_tag_t rbt;
-
- if (zopt_verbose >= 3) {
- char osname[MAXNAMELEN];
- dmu_objset_name(os, osname);
- (void) printf("checking %s for future leaks in txg %lld...\n",
- osname, (u_longlong_t)txg);
- }
+ ztest_block_tag_t *bt;
+ dmu_object_info_t *doi = &za->za_doi;
/*
* Make sure that, if there is a write record in the bonus buffer
* of the ZTEST_DIROBJ, that the txg for this record is <= the
* last synced txg of the pool.
*/
-
- VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
- ASSERT3U(db->db_size, >=, sizeof (rbt));
- bcopy(db->db_data, &rbt, sizeof (rbt));
- if (rbt.bt_objset != 0) {
- ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
- ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
- ASSERT3U(rbt.bt_offset, ==, -1ULL);
- if (rbt.bt_txg > txg) {
- fatal(0,
- "future leak: got %llx, last synced txg is %llx",
- rbt.bt_txg, txg);
- }
+ VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
+ za->za_dbuf = db;
+ VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
+ ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
+ ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
+ ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
+ bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
+ if (bt->bt_objset != 0) {
+ ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+ ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
+ ASSERT3U(bt->bt_offset, ==, -1ULL);
+ ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
}
dmu_buf_rele(db, FTAG);
+ za->za_dbuf = NULL;
}
void
ztest_dmu_write_parallel(ztest_args_t *za)
{
objset_t *os = za->za_os;
- dmu_tx_t *tx;
+ ztest_block_tag_t *rbt = &za->za_rbt;
+ ztest_block_tag_t *wbt = &za->za_wbt;
+ const size_t btsize = sizeof (ztest_block_tag_t);
dmu_buf_t *db;
- int i, b, error, do_free, bs;
- uint64_t off, txg_how, txg;
+ int b, error;
+ int bs = ZTEST_DIROBJ_BLOCKSIZE;
+ int do_free = 0;
+ uint64_t off, txg_how;
mutex_t *lp;
char osname[MAXNAMELEN];
char iobuf[SPA_MAXBLOCKSIZE];
- ztest_block_tag_t rbt, wbt;
+ blkptr_t blk = { 0 };
+ uint64_t blkoff;
+ zbookmark_t zb;
+ dmu_tx_t *tx = dmu_tx_create(os);
dmu_objset_name(os, osname);
- bs = ZTEST_DIROBJ_BLOCKSIZE;
/*
* Have multiple threads write to large offsets in ZTEST_DIROBJ
* to verify that having multiple threads writing to the same object
* in parallel doesn't cause any trouble.
- * Also do parallel writes to the bonus buffer on occasion.
*/
- for (i = 0; i < 50; i++) {
+ if (ztest_random(4) == 0) {
+ /*
+ * Do the bonus buffer instead of a regular block.
+ * We need a lock to serialize resize vs. others,
+ * so we hash on the objset ID.
+ */
+ b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
+ off = -1ULL;
+ dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
+ } else {
b = ztest_random(ZTEST_SYNC_LOCKS);
- lp = &ztest_shared->zs_sync_lock[b];
-
- do_free = (ztest_random(4) == 0);
-
- off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
-
+ off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
if (ztest_random(4) == 0) {
- /*
- * Do the bonus buffer instead of a regular block.
- */
- do_free = 0;
- off = -1ULL;
- }
-
- tx = dmu_tx_create(os);
-
- if (off == -1ULL)
- dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
- else if (do_free)
+ do_free = 1;
dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
- else
+ } else {
dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
+ }
+ }
- txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
- error = dmu_tx_assign(tx, txg_how);
- if (error) {
- if (error == ERESTART) {
- ASSERT(txg_how == TXG_NOWAIT);
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- continue;
- }
- dmu_tx_abort(tx);
+ txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
+ error = dmu_tx_assign(tx, txg_how);
+ if (error) {
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ } else {
ztest_record_enospc("dmu write parallel");
- return;
}
- txg = dmu_tx_get_txg(tx);
+ dmu_tx_abort(tx);
+ return;
+ }
- if (do_free) {
- (void) mutex_lock(lp);
- VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
- bs, tx));
- (void) mutex_unlock(lp);
- dmu_tx_commit(tx);
- continue;
+ lp = &ztest_shared->zs_sync_lock[b];
+ (void) mutex_lock(lp);
+
+ wbt->bt_objset = dmu_objset_id(os);
+ wbt->bt_object = ZTEST_DIROBJ;
+ wbt->bt_offset = off;
+ wbt->bt_txg = dmu_tx_get_txg(tx);
+ wbt->bt_thread = za->za_instance;
+ wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */
+
+ if (off == -1ULL) {
+ dmu_object_info_t *doi = &za->za_doi;
+ char *dboff;
+
+ VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
+ za->za_dbuf = db;
+ dmu_object_info_from_db(db, doi);
+ ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
+ ASSERT3U(doi->doi_bonus_size, >=, btsize);
+ ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
+ dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
+ bcopy(dboff, rbt, btsize);
+ if (rbt->bt_objset != 0) {
+ ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
+ ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
+ ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
+ ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
}
-
- wbt.bt_objset = dmu_objset_id(os);
- wbt.bt_object = ZTEST_DIROBJ;
- wbt.bt_offset = off;
- wbt.bt_txg = txg;
- wbt.bt_thread = za->za_instance;
-
- if (off == -1ULL) {
- dmu_object_info_t doi;
- char *off;
-
- wbt.bt_seq = 0;
- VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
- FTAG, &db));
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt));
- off = (char *)db->db_data +
- doi.doi_bonus_size - sizeof (wbt);
- bcopy(off, &rbt, sizeof (wbt));
- if (rbt.bt_objset != 0) {
- ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
- ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
- ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
- ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
- }
- if (ztest_random(10) == 0) {
- int newsize = (ztest_random(
- db->db_size / sizeof (wbt)) + 1) *
- sizeof (wbt);
-
- ASSERT3U(newsize, >=, sizeof (wbt));
- ASSERT3U(newsize, <=, db->db_size);
- error = dmu_set_bonus(db, newsize, tx);
- ASSERT3U(error, ==, 0);
- off = (char *)db->db_data + newsize -
- sizeof (wbt);
- }
- dmu_buf_will_dirty(db, tx);
- bcopy(&wbt, off, db->db_size);
- dmu_buf_rele(db, FTAG);
- dmu_tx_commit(tx);
- continue;
+ if (ztest_random(10) == 0) {
+ int newsize = (ztest_random(db->db_size /
+ btsize) + 1) * btsize;
+
+ ASSERT3U(newsize, >=, btsize);
+ ASSERT3U(newsize, <=, db->db_size);
+ VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
+ dboff = (char *)db->db_data + newsize - btsize;
}
+ dmu_buf_will_dirty(db, tx);
+ bcopy(wbt, dboff, btsize);
+ dmu_buf_rele(db, FTAG);
+ za->za_dbuf = NULL;
+ } else if (do_free) {
+ VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
+ } else {
+ dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
+ }
+
+ (void) mutex_unlock(lp);
+
+ if (ztest_random(1000) == 0)
+ (void) poll(NULL, 0, 1); /* open dn_notxholds window */
+
+ dmu_tx_commit(tx);
+
+ if (ztest_random(10000) == 0)
+ txg_wait_synced(dmu_objset_pool(os), wbt->bt_txg);
- (void) mutex_lock(lp);
+ if (off == -1 || do_free)
+ return;
- wbt.bt_seq = ztest_shared->zs_seq[b]++;
+ if (ztest_random(2) != 0)
+ return;
- dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
+ /*
+ * dmu_sync() the block we just wrote.
+ */
+ (void) mutex_lock(lp);
+ blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
+ error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
+ za->za_dbuf = db;
+ if (error) {
+ dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
+ osname, ZTEST_DIROBJ, blkoff, error);
(void) mutex_unlock(lp);
+ return;
+ }
+ blkoff = off - blkoff;
+ error = dmu_sync(NULL, db, &blk, wbt->bt_txg, NULL, NULL);
+ dmu_buf_rele(db, FTAG);
+ za->za_dbuf = NULL;
- if (ztest_random(100) == 0)
- (void) poll(NULL, 0, 1); /* open dn_notxholds window */
+ (void) mutex_unlock(lp);
- dmu_tx_commit(tx);
+ if (error) {
+ dprintf("dmu_sync(%s, %d, %llx) = %d\n",
+ osname, ZTEST_DIROBJ, off, error);
+ return;
+ }
- if (ztest_random(1000) == 0)
- txg_wait_synced(dmu_objset_pool(os), txg);
-
- if (ztest_random(2) == 0) {
- blkptr_t blk = { 0 };
- uint64_t blkoff;
- zbookmark_t zb;
-
- (void) mutex_lock(lp);
- blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
- error = dmu_buf_hold(os,
- ZTEST_DIROBJ, blkoff, FTAG, &db);
- if (error) {
- dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
- osname, ZTEST_DIROBJ, blkoff, error);
- (void) mutex_unlock(lp);
- continue;
- }
- blkoff = off - blkoff;
- error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
- dmu_buf_rele(db, FTAG);
- (void) mutex_unlock(lp);
- if (error) {
- dprintf("dmu_sync(%s, %d, %llx) = %d\n",
- osname, ZTEST_DIROBJ, off, error);
- continue;
- }
+ if (blk.blk_birth == 0) /* concurrent free */
+ return;
- if (blk.blk_birth == 0) { /* concurrent free */
- continue;
- }
- txg_suspend(dmu_objset_pool(os));
+ txg_suspend(dmu_objset_pool(os));
- ASSERT(blk.blk_fill == 1);
- ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
- ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
- ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
+ ASSERT(blk.blk_fill == 1);
+ ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
+ ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
+ ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
- /*
- * Read the block that dmu_sync() returned to
- * make sure its contents match what we wrote.
- * We do this while still txg_suspend()ed to ensure
- * that the block can't be reused before we read it.
- */
- zb.zb_objset = dmu_objset_id(os);
- zb.zb_object = ZTEST_DIROBJ;
- zb.zb_level = 0;
- zb.zb_blkid = off / bs;
- error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
- &blk, iobuf, bs, NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
- ASSERT(error == 0);
+ /*
+ * Read the block that dmu_sync() returned to make sure its contents
+ * match what we wrote. We do this while still txg_suspend()ed
+ * to ensure that the block can't be reused before we read it.
+ */
+ zb.zb_objset = dmu_objset_id(os);
+ zb.zb_object = ZTEST_DIROBJ;
+ zb.zb_level = 0;
+ zb.zb_blkid = off / bs;
+ error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
+ NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
+ ASSERT3U(error, ==, 0);
- txg_resume(dmu_objset_pool(os));
+ txg_resume(dmu_objset_pool(os));
- bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
+ bcopy(&iobuf[blkoff], rbt, btsize);
- if (rbt.bt_objset == 0) /* concurrent free */
- continue;
+ if (rbt->bt_objset == 0) /* concurrent free */
+ return;
- ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
- ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
- ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
+ ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
+ ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
+ ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
- /*
- * The semantic of dmu_sync() is that we always
- * push the most recent version of the data,
- * so in the face of concurrent updates we may
- * see a newer version of the block. That's OK.
- */
- ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
- if (rbt.bt_thread == wbt.bt_thread)
- ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
- else
- ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
- }
- }
+ /*
+ * The semantic of dmu_sync() is that we always push the most recent
+ * version of the data, so in the face of concurrent updates we may
+ * see a newer version of the block. That's OK.
+ */
+ ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
+ if (rbt->bt_thread == wbt->bt_thread)
+ ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
+ else
+ ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
}
/*
@@ -2195,7 +2192,6 @@ ztest_zap(ztest_args_t *za)
uint64_t value[ZTEST_ZAP_MAX_INTS];
uint64_t zl_ints, zl_intsize, prop;
int i, ints;
- int iters = 100;
dmu_tx_t *tx;
char propname[100], txgname[100];
int error;
@@ -2259,122 +2255,113 @@ ztest_zap(ztest_args_t *za)
ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
- while (--iters >= 0) {
- prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
- (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
- (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
- bzero(value, sizeof (value));
- last_txg = 0;
+ prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+ (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+ (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+ bzero(value, sizeof (value));
+ last_txg = 0;
- /*
- * If these zap entries already exist, validate their contents.
- */
- error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
- if (error == 0) {
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, 1);
-
- error = zap_lookup(os, object, txgname, zl_intsize,
- zl_ints, &last_txg);
-
- ASSERT3U(error, ==, 0);
+ /*
+ * If these zap entries already exist, validate their contents.
+ */
+ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+ if (error == 0) {
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, 1);
- error = zap_length(os, object, propname, &zl_intsize,
- &zl_ints);
+ VERIFY(zap_lookup(os, object, txgname, zl_intsize,
+ zl_ints, &last_txg) == 0);
- ASSERT3U(error, ==, 0);
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, ints);
+ VERIFY(zap_length(os, object, propname, &zl_intsize,
+ &zl_ints) == 0);
- error = zap_lookup(os, object, propname, zl_intsize,
- zl_ints, value);
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, ints);
- ASSERT3U(error, ==, 0);
+ VERIFY(zap_lookup(os, object, propname, zl_intsize,
+ zl_ints, value) == 0);
- for (i = 0; i < ints; i++) {
- ASSERT3U(value[i], ==, last_txg + object + i);
- }
- } else {
- ASSERT3U(error, ==, ENOENT);
+ for (i = 0; i < ints; i++) {
+ ASSERT3U(value[i], ==, last_txg + object + i);
}
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
- /*
- * Atomically update two entries in our zap object.
- * The first is named txg_%llu, and contains the txg
- * in which the property was last updated. The second
- * is named prop_%llu, and the nth element of its value
- * should be txg + object + n.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap entry");
- dmu_tx_abort(tx);
- return;
- }
- txg = dmu_tx_get_txg(tx);
+ /*
+ * Atomically update two entries in our zap object.
+ * The first is named txg_%llu, and contains the txg
+ * in which the property was last updated. The second
+ * is named prop_%llu, and the nth element of its value
+ * should be txg + object + n.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ ztest_record_enospc("create zap entry");
+ dmu_tx_abort(tx);
+ return;
+ }
+ txg = dmu_tx_get_txg(tx);
- if (last_txg > txg)
- fatal(0, "zap future leak: old %llu new %llu",
- last_txg, txg);
+ if (last_txg > txg)
+ fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
- for (i = 0; i < ints; i++)
- value[i] = txg + object + i;
+ for (i = 0; i < ints; i++)
+ value[i] = txg + object + i;
- error = zap_update(os, object, txgname, sizeof (uint64_t),
- 1, &txg, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
+ error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
+ if (error)
+ fatal(0, "zap_update('%s', %llu, '%s') = %d",
+ osname, object, txgname, error);
- error = zap_update(os, object, propname, sizeof (uint64_t),
- ints, value, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, propname, error);
+ error = zap_update(os, object, propname, sizeof (uint64_t),
+ ints, value, tx);
+ if (error)
+ fatal(0, "zap_update('%s', %llu, '%s') = %d",
+ osname, object, propname, error);
- dmu_tx_commit(tx);
+ dmu_tx_commit(tx);
- /*
- * Remove a random pair of entries.
- */
- prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
- (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
- (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+ /*
+ * Remove a random pair of entries.
+ */
+ prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+ (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+ (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
- error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
- if (error == ENOENT)
- continue;
+ if (error == ENOENT)
+ return;
- ASSERT3U(error, ==, 0);
+ ASSERT3U(error, ==, 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("remove zap entry");
- dmu_tx_abort(tx);
- return;
- }
- error = zap_remove(os, object, txgname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ ztest_record_enospc("remove zap entry");
+ dmu_tx_abort(tx);
+ return;
+ }
+ error = zap_remove(os, object, txgname, tx);
+ if (error)
+ fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+ osname, object, txgname, error);
- error = zap_remove(os, object, propname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, propname, error);
+ error = zap_remove(os, object, propname, tx);
+ if (error)
+ fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+ osname, object, propname, error);
- dmu_tx_commit(tx);
- }
+ dmu_tx_commit(tx);
/*
* Once in a while, destroy the object.
*/
- if (ztest_random(100) != 0)
+ if (ztest_random(1000) != 0)
return;
tx = dmu_tx_create(os);
@@ -2401,111 +2388,107 @@ ztest_zap_parallel(ztest_args_t *za)
{
objset_t *os = za->za_os;
uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
- int iters = 100;
dmu_tx_t *tx;
int i, namelen, error;
char name[20], string_value[20];
void *data;
- while (--iters >= 0) {
- /*
- * Generate a random name of the form 'xxx.....' where each
- * x is a random printable character and the dots are dots.
- * There are 94 such characters, and the name length goes from
- * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
- */
- namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+ /*
+ * Generate a random name of the form 'xxx.....' where each
+ * x is a random printable character and the dots are dots.
+ * There are 94 such characters, and the name length goes from
+ * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ */
+ namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
- for (i = 0; i < 3; i++)
- name[i] = '!' + ztest_random('~' - '!' + 1);
- for (; i < namelen - 1; i++)
- name[i] = '.';
- name[i] = '\0';
+ for (i = 0; i < 3; i++)
+ name[i] = '!' + ztest_random('~' - '!' + 1);
+ for (; i < namelen - 1; i++)
+ name[i] = '.';
+ name[i] = '\0';
- if (ztest_random(2) == 0)
- object = ZTEST_MICROZAP_OBJ;
- else
- object = ZTEST_FATZAP_OBJ;
+ if (ztest_random(2) == 0)
+ object = ZTEST_MICROZAP_OBJ;
+ else
+ object = ZTEST_FATZAP_OBJ;
- if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
- wsize = sizeof (txg);
- wc = 1;
- data = &txg;
- } else {
- wsize = 1;
- wc = namelen;
- data = string_value;
- }
+ if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+ wsize = sizeof (txg);
+ wc = 1;
+ data = &txg;
+ } else {
+ wsize = 1;
+ wc = namelen;
+ data = string_value;
+ }
- count = -1ULL;
- VERIFY(zap_count(os, object, &count) == 0);
- ASSERT(count != -1ULL);
+ count = -1ULL;
+ VERIFY(zap_count(os, object, &count) == 0);
+ ASSERT(count != -1ULL);
- /*
- * Select an operation: length, lookup, add, update, remove.
- */
- i = ztest_random(5);
-
- if (i >= 2) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("zap parallel");
- dmu_tx_abort(tx);
- return;
- }
- txg = dmu_tx_get_txg(tx);
- bcopy(name, string_value, namelen);
- } else {
- tx = NULL;
- txg = 0;
- bzero(string_value, namelen);
+ /*
+ * Select an operation: length, lookup, add, update, remove.
+ */
+ i = ztest_random(5);
+
+ if (i >= 2) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ ztest_record_enospc("zap parallel");
+ dmu_tx_abort(tx);
+ return;
}
+ txg = dmu_tx_get_txg(tx);
+ bcopy(name, string_value, namelen);
+ } else {
+ tx = NULL;
+ txg = 0;
+ bzero(string_value, namelen);
+ }
- switch (i) {
+ switch (i) {
- case 0:
- error = zap_length(os, object, name, &zl_wsize, &zl_wc);
- if (error == 0) {
- ASSERT3U(wsize, ==, zl_wsize);
- ASSERT3U(wc, ==, zl_wc);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
-
- case 1:
- error = zap_lookup(os, object, name, wsize, wc, data);
- if (error == 0) {
- if (data == string_value &&
- bcmp(name, data, namelen) != 0)
- fatal(0, "name '%s' != val '%s' len %d",
- name, data, namelen);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
+ case 0:
+ error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+ if (error == 0) {
+ ASSERT3U(wsize, ==, zl_wsize);
+ ASSERT3U(wc, ==, zl_wc);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
- case 2:
- error = zap_add(os, object, name, wsize, wc, data, tx);
- ASSERT(error == 0 || error == EEXIST);
- break;
+ case 1:
+ error = zap_lookup(os, object, name, wsize, wc, data);
+ if (error == 0) {
+ if (data == string_value &&
+ bcmp(name, data, namelen) != 0)
+ fatal(0, "name '%s' != val '%s' len %d",
+ name, data, namelen);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
- case 3:
- VERIFY(zap_update(os, object, name, wsize, wc,
- data, tx) == 0);
- break;
+ case 2:
+ error = zap_add(os, object, name, wsize, wc, data, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ break;
- case 4:
- error = zap_remove(os, object, name, tx);
- ASSERT(error == 0 || error == ENOENT);
- break;
- }
+ case 3:
+ VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+ break;
- if (tx != NULL)
- dmu_tx_commit(tx);
+ case 4:
+ error = zap_remove(os, object, name, tx);
+ ASSERT(error == 0 || error == ENOENT);
+ break;
}
+
+ if (tx != NULL)
+ dmu_tx_commit(tx);
}
void
@@ -2590,7 +2573,7 @@ ztest_fault_inject(ztest_args_t *za)
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
vdev_t *vd0;
@@ -2689,7 +2672,7 @@ ztest_fault_inject(ztest_args_t *za)
void
ztest_scrub(ztest_args_t *za)
{
- spa_t *spa = dmu_objset_spa(za->za_os);
+ spa_t *spa = za->za_spa;
mutex_enter(&spa_namespace_lock);
(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
@@ -2739,7 +2722,7 @@ ztest_spa_rename(ztest_args_t *za)
if (error != 0)
fatal(0, "spa_open('%s') = %d", newname, error);
- ASSERT(spa == dmu_objset_spa(za->za_os));
+ ASSERT(spa == za->za_spa);
spa_close(spa, FTAG);
/*
@@ -2757,7 +2740,7 @@ ztest_spa_rename(ztest_args_t *za)
if (error != 0)
fatal(0, "spa_open('%s') = %d", oldname, error);
- ASSERT(spa == dmu_objset_spa(za->za_os));
+ ASSERT(spa == za->za_spa);
spa_close(spa, FTAG);
umem_free(newname, strlen(newname) + 1);
@@ -3038,29 +3021,15 @@ ztest_thread(void *arg)
ztest_shared_t *zs = ztest_shared;
hrtime_t now, functime;
ztest_info_t *zi;
- int f;
+ int f, i;
while ((now = gethrtime()) < za->za_stop) {
/*
* See if it's time to force a crash.
*/
if (now > za->za_kill) {
- dmu_tx_t *tx;
- uint64_t txg;
-
- mutex_enter(&spa_namespace_lock);
- tx = dmu_tx_create(za->za_os);
- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
- zs->zs_txg = txg;
- if (zopt_verbose >= 3)
- (void) printf(
- "killing process after txg %lld\n",
- (u_longlong_t)txg);
- txg_wait_synced(dmu_objset_pool(za->za_os), txg);
- zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
- zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
+ zs->zs_alloc = spa_get_alloc(za->za_spa);
+ zs->zs_space = spa_get_space(za->za_spa);
(void) kill(getpid(), SIGKILL);
}
@@ -3085,9 +3054,8 @@ ztest_thread(void *arg)
ZTEST_DIRSIZE;
za->za_diroff_shared = (1ULL << 63);
- ztest_dmu_write_parallel(za);
-
- zi->zi_func(za);
+ for (i = 0; i < zi->zi_iters; i++)
+ zi->zi_func(za);
functime = gethrtime() - now;
@@ -3234,6 +3202,17 @@ ztest_run(char *pool)
for (t = 0; t < zopt_threads; t++) {
d = t % zopt_datasets;
+
+ (void) strcpy(za[t].za_pool, pool);
+ za[t].za_os = za[d].za_os;
+ za[t].za_spa = spa;
+ za[t].za_zilog = za[d].za_zilog;
+ za[t].za_instance = t;
+ za[t].za_random = ztest_random(-1ULL);
+ za[t].za_start = za[0].za_start;
+ za[t].za_stop = za[0].za_stop;
+ za[t].za_kill = za[0].za_kill;
+
if (t < zopt_datasets) {
ztest_replay_t zr;
int test_future = FALSE;
@@ -3243,13 +3222,11 @@ ztest_run(char *pool)
ztest_create_cb, NULL);
if (error == EEXIST) {
test_future = TRUE;
+ } else if (error == ENOSPC) {
+ zs->zs_enospc_count++;
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
+ break;
} else if (error != 0) {
- if (error == ENOSPC) {
- zs->zs_enospc_count++;
- (void) rw_unlock(
- &ztest_shared->zs_name_lock);
- break;
- }
fatal(0, "dmu_objset_create(%s) = %d",
name, error);
}
@@ -3259,22 +3236,13 @@ ztest_run(char *pool)
fatal(0, "dmu_objset_open('%s') = %d",
name, error);
(void) rw_unlock(&ztest_shared->zs_name_lock);
- if (test_future && ztest_shared->zs_txg > 0)
- ztest_dmu_check_future_leak(za[d].za_os,
- ztest_shared->zs_txg);
+ if (test_future)
+ ztest_dmu_check_future_leak(&za[t]);
zr.zr_os = za[d].za_os;
zil_replay(zr.zr_os, &zr, &zr.zr_assign,
ztest_replay_vector);
za[d].za_zilog = zil_open(za[d].za_os, NULL);
}
- za[t].za_pool = spa_strdup(pool);
- za[t].za_os = za[d].za_os;
- za[t].za_zilog = za[d].za_zilog;
- za[t].za_instance = t;
- za[t].za_random = ztest_random(-1ULL);
- za[t].za_start = za[0].za_start;
- za[t].za_stop = za[0].za_stop;
- za[t].za_kill = za[0].za_kill;
error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
&za[t].za_thread);
@@ -3282,7 +3250,6 @@ ztest_run(char *pool)
fatal(0, "can't create thread %d: error %d",
t, error);
}
- ztest_shared->zs_txg = 0;
while (--t >= 0) {
error = thr_join(za[t].za_thread, NULL, NULL);
@@ -3294,11 +3261,8 @@ ztest_run(char *pool)
zil_close(za[t].za_zilog);
dmu_objset_close(za[t].za_os);
}
- spa_strfree(za[t].za_pool);
}
- umem_free(za, zopt_threads * sizeof (ztest_args_t));
-
if (zopt_verbose >= 3)
show_pool_stats(spa);
@@ -3308,15 +3272,15 @@ ztest_run(char *pool)
zs->zs_space = spa_get_space(spa);
/*
- * Did we have out-of-space errors? If so, destroy a random objset.
+ * If we had out-of-space errors, destroy a random objset.
*/
if (zs->zs_enospc_count != 0) {
(void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool,
- (int)ztest_random(zopt_datasets));
+ d = (int)ztest_random(zopt_datasets);
+ (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
if (zopt_verbose >= 3)
(void) printf("Destroying %s to free up space\n", name);
- (void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+ (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
@@ -3330,8 +3294,6 @@ ztest_run(char *pool)
for (t = 1; t < 50; t++)
dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
- spa_close(spa, FTAG);
-
/* Shutdown the suspend monitor thread */
zio_io_fail_shift = 0;
ztest_exiting = B_TRUE;
@@ -3342,6 +3304,10 @@ ztest_run(char *pool)
if (error)
fatal(0, "thr_join(%d) = %d", tid, error);
+ umem_free(za, zopt_threads * sizeof (ztest_args_t));
+
+ spa_close(spa, FTAG);
+
kernel_fini();
}
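The parallel-write changes above all revolve around one invariant: every writer stamps a ztest_block_tag_t, and any reader that later finds a nonzero tag can check it for self-consistency and for future leaks. A minimal sketch of that check, with the last-synced-txg bound taken from the comment in ztest_dmu_check_future_leak (illustrative only, not code from this patch):

static void
ztest_check_tag(const ztest_block_tag_t *bt, uint64_t objset_id,
    uint64_t last_synced_txg)
{
	if (bt->bt_objset == 0)
		return;		/* block was never written, or was freed */
	ASSERT3U(bt->bt_objset, ==, objset_id);
	ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
	ASSERT3U(bt->bt_txg, <=, last_synced_txg);	/* no future leak */
}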
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index e3c895380f..9863eea759 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -48,6 +48,6 @@
#include <sys/bplist.h>
#include <sys/zfs_znode.h>
-extern uint64_t zio_gang_bang;
+extern uint64_t metaslab_gang_bang;
extern uint16_t zio_zil_fail_shift;
extern uint16_t zio_io_fail_shift;
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 5c495fffd5..3f7b242b70 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1077,7 +1077,6 @@ ZFS_COMMON_OBJS += \
lzjb.o \
metaslab.o \
refcount.o \
- rprwlock.o \
sha256.o \
spa.o \
spa_config.o \
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 589dc7e3de..9365dbdb14 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -35,6 +35,7 @@
#include <sys/zio.h>
uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* ==========================================================================
@@ -728,6 +729,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(!DVA_IS_VALID(&dva[d]));
/*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+ if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
+ return (ENOSPC);
+
+ /*
* Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_allocated because
* nothing actually breaks if we miss a few updates -- we just won't
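The injected ENOSPC above is what exercises the gang-block machinery: the zio layer treats a failed full-size allocation as a cue to build the block out of smaller pieces. A rough sketch of that fallback shape, with hypothetical helper names and approximate signatures (the real retry path lives in zio.c and is not shown in this patch):

static int
alloc_or_gang(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, uint64_t txg)
{
	/* Hypothetical wrapper; metaslab_alloc() signature approximate. */
	int error = metaslab_alloc(spa, mc, psize, bp, 1, txg, NULL, 0);

	if (error == ENOSPC && psize > SPA_MINBLOCKSIZE) {
		/*
		 * Allocate a gang header and split the payload into
		 * smaller member blocks instead of failing outright.
		 */
		return (write_allocate_gang_members(spa, bp)); /* hypothetical */
	}
	return (error);
}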
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 6aefb025fc..6b1c28140a 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -144,16 +144,9 @@
* zero. Must be called with spa_namespace_lock
* held.
*
- * The spa_config_lock is manipulated using the following functions:
- *
- * spa_config_enter() Acquire the config lock as RW_READER or
- * RW_WRITER. At least one reference on the spa_t
- * must exist.
- *
- * spa_config_exit() Release the config lock.
- *
- * spa_config_held() Returns true if the config lock is currently
- * held in the given state.
+ * The spa_config_lock is a form of rwlock. It must be held as RW_READER
+ * to perform I/O to the pool, and as RW_WRITER to change the vdev config.
+ * The spa_config_lock is manipulated with spa_config_{enter,exit,held}().
*
* The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
@@ -202,6 +195,80 @@ int zfs_recover = 0;
/*
* ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_config_lock_t *scl)
+{
+ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ scl->scl_writer = NULL;
+ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&scl->scl_count);
+}
+
+static void
+spa_config_lock_destroy(spa_config_lock_t *scl)
+{
+ mutex_destroy(&scl->scl_lock);
+ ASSERT(scl->scl_writer == NULL);
+ cv_destroy(&scl->scl_cv);
+ refcount_destroy(&scl->scl_count);
+}
+
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ if (rw == RW_READER) {
+ while (scl->scl_writer != NULL && scl->scl_writer != curthread)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ while (!refcount_is_zero(&scl->scl_count) &&
+ scl->scl_writer != curthread)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+
+ (void) refcount_add(&scl->scl_count, tag);
+
+ mutex_exit(&scl->scl_lock);
+}
+
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
+ cv_broadcast(&scl->scl_cv);
+ ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ if (rw == RW_READER)
+ return (!refcount_is_zero(&scl->scl_count));
+ else
+ return (scl->scl_writer == curthread);
+}
+
+/*
+ * ==========================================================================
* SPA namespace functions
* ==========================================================================
*/
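Two properties of this lock are worth noting. First, a thread that holds it as writer can re-enter as reader, since the reader path only waits while scl_writer is some other thread. Second, spa_config_held(spa, RW_READER) is true whenever the lock is held at all, which is why the spa_name() assertion below can drop its separate RW_WRITER check. A typical reader-side use, sketched with only functions that appear in this patch:

	/* Sketch: hold the config lock as reader across a vdev-tree walk. */
	spa_config_enter(spa, RW_READER, FTAG);
	vd = vdev_lookup_top(spa, vdev_id);
	if (vd != NULL)
		(void) printf("vdev %llu = %s\n",
		    (u_longlong_t)vd->vdev_id, vdev_description(vd));
	spa_config_exit(spa, FTAG);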
@@ -275,7 +342,7 @@ spa_add(const char *name, const char *altroot)
spa->spa_final_txg = UINT64_MAX;
refcount_create(&spa->spa_refcount);
- rprw_init(&spa->spa_config_lock);
+ spa_config_lock_init(&spa->spa_config_lock);
avl_add(&spa_namespace_avl, spa);
@@ -324,7 +391,7 @@ spa_remove(spa_t *spa)
refcount_destroy(&spa->spa_refcount);
- rprw_destroy(&spa->spa_config_lock);
+ spa_config_lock_destroy(&spa->spa_config_lock);
rw_destroy(&spa->spa_traverse_lock);
@@ -639,29 +706,6 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
/*
* ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-void
-spa_config_enter(spa_t *spa, krw_t rw, void *tag)
-{
- rprw_enter(&spa->spa_config_lock, rw, tag);
-}
-
-void
-spa_config_exit(spa_t *spa, void *tag)
-{
- rprw_exit(&spa->spa_config_lock, tag);
-}
-
-boolean_t
-spa_config_held(spa_t *spa, krw_t rw)
-{
- return (rprw_held(&spa->spa_config_lock, rw));
-}
-
-/*
- * ==========================================================================
* SPA vdev locking
* ==========================================================================
*/
@@ -1003,7 +1047,7 @@ spa_name(spa_t *spa)
* config lock, both of which are required to do a rename.
*/
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
- spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+ spa_config_held(spa, RW_READER));
return (spa->spa_name);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index eb2b6d6289..069255b4c0 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -37,7 +37,6 @@
#include <sys/zfs_context.h>
#include <sys/avl.h>
#include <sys/refcount.h>
-#include <sys/rprwlock.h>
#include <sys/bplist.h>
#ifdef __cplusplus
@@ -68,6 +67,14 @@ struct spa_aux_vdev {
uint_t sav_npending; /* # pending devices */
};
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ kthread_t *scl_writer;
+ uint16_t scl_write_wanted;
+ kcondvar_t scl_cv;
+ refcount_t scl_count;
+} spa_config_lock_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -157,7 +164,7 @@ struct spa {
* In order for the MDB module to function correctly, the other
* fields must remain in the same location.
*/
- rprwlock_t spa_config_lock; /* configuration changes */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
refcount_t spa_refcount; /* number of opens */
};
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index b1ec648056..2ec3de6513 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -83,9 +83,6 @@ extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-extern void vdev_io_start(zio_t *zio);
-extern void vdev_io_done(zio_t *zio);
-
extern int vdev_fault(spa_t *spa, uint64_t guid);
extern int vdev_degrade(spa_t *spa, uint64_t guid);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 2eebbba566..7d823bab10 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -62,8 +62,8 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef int vdev_probe_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void vdev_io_start_func_t(zio_t *zio);
-typedef void vdev_io_done_func_t(zio_t *zio);
+typedef int vdev_io_start_func_t(zio_t *zio);
+typedef int vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef struct vdev_ops {
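With vdev_io_start_func_t and vdev_io_done_func_t now returning int, each vdev method tells the pipeline whether to keep advancing; the return values are the ZIO_PIPELINE_{CONTINUE,STOP} codes defined in zio.h below. A minimal conforming start method might look like this (a sketch in the spirit of vdev_missing, not code from the patch):

static int
vdev_unreachable_io_start(zio_t *zio)
{
	/*
	 * A device that can never do I/O: record the error and let
	 * the pipeline continue to the assess/done stages.
	 */
	zio->io_error = ENOTSUP;
	return (ZIO_PIPELINE_CONTINUE);
}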
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 4591274518..e673edbac2 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -153,6 +153,7 @@ enum zio_compress {
(ZIO_FLAG_CANFAIL | \
ZIO_FLAG_FAILFAST | \
ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_CACHE | \
ZIO_FLAG_DONT_RETRY | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_SPECULATIVE | \
@@ -164,9 +165,11 @@ enum zio_compress {
#define ZIO_FLAG_VDEV_INHERIT \
(ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_DONT_CACHE | \
ZIO_FLAG_PHYSICAL)
+#define ZIO_PIPELINE_CONTINUE 0x100
+#define ZIO_PIPELINE_STOP 0x101
+
/*
* We'll take the unused errno 'EBADE' (from the Convergent graveyard)
* to indicate checksum errors.
@@ -262,7 +265,6 @@ struct zio {
uint32_t io_numerrors;
uint32_t io_pipeline;
uint32_t io_orig_pipeline;
- uint32_t io_async_stages;
uint64_t io_children_notready;
uint64_t io_children_notdone;
void *io_waiter;
@@ -319,21 +321,18 @@ extern void zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+
+extern int zio_wait_for_children_ready(zio_t *zio);
+extern int zio_wait_for_children_done(zio_t *zio);
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-extern void zio_next_stage(zio_t *zio);
-extern void zio_next_stage_async(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
-extern void zio_wait_children_done(zio_t *zio);
/*
* Delegate I/O to a child vdev.
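zio_execute() and zio_interrupt() replace the old zio_next_stage()/zio_next_stage_async() pair: a stage that hands the I/O to hardware returns ZIO_PIPELINE_STOP, and the completion routine re-enters the pipeline with zio_interrupt(), exactly as vdev_disk_io_intr() does below. The asynchronous hand-off pattern, sketched with hypothetical driver hooks:

static void
my_driver_done(void *arg, int error)	/* hypothetical completion hook */
{
	zio_t *zio = arg;

	zio->io_error = error;
	zio_interrupt(zio);	/* resume the pipeline from interrupt context */
}

static int
my_io_start(zio_t *zio)
{
	if (my_driver_submit(zio, my_driver_done) != 0) {	/* hypothetical */
		zio->io_error = EIO;
		return (ZIO_PIPELINE_CONTINUE);	/* fail synchronously */
	}
	return (ZIO_PIPELINE_STOP);	/* completion will re-enter */
}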
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index a5a0bb54e8..60a1c8b38e 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -38,16 +38,15 @@ extern "C" {
/*
* I/O Groups: pipeline stage definitions.
*/
-
typedef enum zio_stage {
ZIO_STAGE_OPEN = 0, /* RWFCI */
- ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+ ZIO_STAGE_WAIT_FOR_CHILDREN_READY, /* RWFCI */
+ ZIO_STAGE_READ_INIT, /* R---- */
+ ZIO_STAGE_ISSUE_ASYNC, /* -W--- */
ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
- ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
-
ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
@@ -61,13 +60,11 @@ typedef enum zio_stage {
ZIO_STAGE_READY, /* RWFCI */
- ZIO_STAGE_READ_INIT, /* R---- */
-
ZIO_STAGE_VDEV_IO_START, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
- ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+ ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, /* RWFCI */
ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
@@ -77,30 +74,22 @@ typedef enum zio_stage {
ZIO_STAGE_DONE /* RWFCI */
} zio_stage_t;
-/*
- * The stages for which there's some performance value in going async.
- * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
- */
-#define ZIO_ASYNC_PIPELINE_STAGES \
- ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_READ_DECOMPRESS))
+#define ZIO_INTERLOCK_STAGES \
+ ((1U << ZIO_STAGE_WAIT_FOR_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
+ (1U << ZIO_STAGE_DONE))
-#define ZIO_VDEV_IO_PIPELINE \
+#define ZIO_VDEV_IO_STAGES \
((1U << ZIO_STAGE_VDEV_IO_START) | \
(1U << ZIO_STAGE_VDEV_IO_DONE) | \
(1U << ZIO_STAGE_VDEV_IO_ASSESS))
#define ZIO_READ_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY))
#define ZIO_READ_GANG_PIPELINE \
ZIO_READ_PHYS_PIPELINE
@@ -109,97 +98,66 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_READ_INIT) | \
ZIO_READ_PHYS_PIPELINE
-#define ZIO_WRITE_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ (1U << ZIO_STAGE_ISSUE_ASYNC) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE))
-#define ZIO_WRITE_COMMON_PIPELINE \
- ZIO_WRITE_PHYS_PIPELINE
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ZIO_WRITE_COMMON_STAGES
#define ZIO_WRITE_PIPELINE \
- ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
- ZIO_WRITE_COMMON_PIPELINE)
+ (ZIO_WRITE_COMMON_STAGES | \
+ (1U << ZIO_STAGE_WRITE_COMPRESS))
-#define ZIO_GANG_STAGES \
+#define ZIO_GANG_REWRITE_STAGES \
((1U << ZIO_STAGE_GET_GANG_HEADER) | \
(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READ_GANG_MEMBERS))
-
-#define ZIO_REWRITE_PIPELINE \
- ((1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_WRITE_ALLOCATE_PIPELINE \
- ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE))
#define ZIO_GANG_FREE_STAGES \
((1U << ZIO_STAGE_GET_GANG_HEADER) | \
(1U << ZIO_STAGE_FREE_GANG_MEMBERS))
-#define ZIO_FREE_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+#define ZIO_GANG_CLAIM_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE(bp) \
+ (ZIO_WRITE_COMMON_STAGES | \
+ (BP_IS_GANG(bp) ? ZIO_GANG_REWRITE_STAGES : 0))
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ (1U << ZIO_STAGE_DVA_ALLOCATE))
+
+#define ZIO_FREE_PIPELINE(bp) \
+ (ZIO_INTERLOCK_STAGES | \
(1U << ZIO_STAGE_DVA_FREE) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (BP_IS_GANG(bp) ? ZIO_GANG_FREE_STAGES : 0))
-#define ZIO_CLAIM_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+#define ZIO_CLAIM_PIPELINE(bp) \
+ (ZIO_INTERLOCK_STAGES | \
(1U << ZIO_STAGE_DVA_CLAIM) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (BP_IS_GANG(bp) ? ZIO_GANG_CLAIM_STAGES : 0))
#define ZIO_IOCTL_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES)
+
#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_ASSESS) | \
- (1U << ZIO_STAGE_DONE))
+ ZIO_INTERLOCK_STAGES
-#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
(1U << ZIO_STAGE_ASSESS) | \
+ (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \
(1U << ZIO_STAGE_DONE))
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
- ZIO_VDEV_IO_PIPELINE)
-
#define ZIO_ERROR_PIPELINE_MASK \
- ZIO_WAIT_FOR_CHILDREN_PIPELINE
+ ZIO_INTERLOCK_STAGES
typedef struct zio_transform zio_transform_t;
struct zio_transform {
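Because a pipeline is just a bitmask of stages, the new bp-parameterized macros can drop whole stage groups at zio creation time instead of branching through the old ZIO_STAGE_GANG_PIPELINE at run time. Stage membership then reduces to a single bit test, for example (sketch only):

static boolean_t
zio_stage_in_pipeline(uint32_t pipeline, zio_stage_t stage)
{
	return ((pipeline & (1U << stage)) != 0);
}

	/*
	 * e.g., ZIO_FREE_PIPELINE(bp) includes ZIO_STAGE_FREE_GANG_MEMBERS
	 * only when BP_IS_GANG(bp), so non-gang frees never test for it.
	 */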
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 2a2dc1d625..2b4c663a1a 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -136,6 +136,9 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd = spa->spa_root_vdev;
+ ASSERT(spa_config_held(spa, RW_READER) ||
+ curthread == spa->spa_scrub_thread);
+
if (vdev < rvd->vdev_children)
return (rvd->vdev_child[vdev]);
@@ -1459,18 +1462,6 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
return (vd->vdev_ops->vdev_op_asize(vd, psize));
}
-void
-vdev_io_start(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_start(zio);
-}
-
-void
-vdev_io_done(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_done(zio);
-}
-
const char *
vdev_description(vdev_t *vd)
{
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index ce9508d2fb..5f475f9b47 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -231,7 +231,7 @@ vdev_cache_fill(zio_t *zio)
zio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = zio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
}
@@ -286,15 +286,10 @@ vdev_cache_read(zio_t *zio)
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
+ zio_execute(zio);
return (0);
}
- if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
- mutex_exit(&vc->vc_lock);
- return (EINVAL);
- }
-
ve = vdev_cache_allocate(zio);
if (ve == NULL) {
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index e4e13f2aac..933ed3e2bf 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -386,7 +386,7 @@ vdev_disk_io_intr(buf_t *bp)
kmem_free(vdb, sizeof (vdev_disk_buf_t));
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
}
static void
@@ -396,10 +396,10 @@ vdev_disk_ioctl_done(void *zio_arg, int error)
zio->io_error = error;
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
}
-static void
+static int
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -414,8 +414,7 @@ vdev_disk_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
switch (zio->io_cmd) {
@@ -444,8 +443,10 @@ vdev_disk_io_start(zio_t *zio)
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
- return;
- } else if (error == ENOTSUP || error == ENOTTY) {
+ return (ZIO_PIPELINE_STOP);
+ }
+
+ if (error == ENOTSUP || error == ENOTTY) {
/*
* If we get ENOTSUP or ENOTTY, we know that
* no future attempts will ever succeed.
@@ -463,15 +464,26 @@ vdev_disk_io_start(zio_t *zio)
zio->io_error = ENOTSUP;
}
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
+ return (ZIO_PIPELINE_STOP);
if ((zio = vdev_queue_io(zio)) == NULL)
- return;
+ return (ZIO_PIPELINE_STOP);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ else
+ error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
+ }
flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
flags |= B_BUSY | B_NOCACHE;
@@ -491,26 +503,14 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
- /* XXPOLICY */
- if (zio->io_type == ZIO_TYPE_WRITE)
- error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
- else
- error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
- error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
- if (error) {
- zio->io_error = error;
- bioerror(bp, error);
- bp->b_resid = bp->b_bcount;
- bp->b_iodone(bp);
- return;
- }
-
error = ldi_strategy(dvd->vd_lh, bp);
/* ldi_strategy() will return non-zero only on programming errors */
ASSERT(error == 0);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
vdev_disk_io_done(zio_t *zio)
{
vdev_queue_io_done(zio);
@@ -544,7 +544,7 @@ vdev_disk_io_done(zio_t *zio)
}
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
vdev_ops_t vdev_disk_ops = {
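
The io_start/io_done conversion above is representative of every vdev type touched by this patch: the ops no longer push the zio along with zio_next_stage{,_async}() themselves, they return a verdict to the caller's pipeline loop. ZIO_PIPELINE_CONTINUE means the loop should proceed to the next stage; ZIO_PIPELINE_STOP means the I/O was handed off (queued, delegated to the cache, or issued to hardware) and the pipeline resumes elsewhere, typically via zio_interrupt(). A toy model of that contract follows; the struct, the helper names, and the constant encodings are invented here, since the real ZIO_PIPELINE_* definitions live in sys/zio.h and are not shown in this excerpt:

#include <stdio.h>

#define PIPELINE_CONTINUE 0 /* invented encodings; the real   */
#define PIPELINE_STOP     1 /* ZIO_PIPELINE_* are in sys/zio.h */

typedef struct toy_zio {
    int io_error;
    int io_async; /* does this device complete via interrupt? */
} toy_zio_t;

/*
 * Shape of the new vdev_op_io_start contract: either finish the work
 * synchronously and let the pipeline continue, or hand the I/O off
 * and stop; the completion path re-enters the pipeline later.
 */
static int
toy_io_start(toy_zio_t *zio)
{
    if (zio->io_async)
        return (PIPELINE_STOP); /* resumed by the completion path */
    zio->io_error = 0;          /* synchronous completion */
    return (PIPELINE_CONTINUE);
}

int
main(void)
{
    toy_zio_t sync_io = { 0, 0 }, async_io = { 0, 1 };

    printf("sync:  %s\n", toy_io_start(&sync_io) == PIPELINE_STOP ?
        "stop" : "continue");
    printf("async: %s\n", toy_io_start(&async_io) == PIPELINE_STOP ?
        "stop" : "continue");
    return (0);
}
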
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index ee30845cb7..51abd9612b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -215,7 +215,7 @@ vdev_file_probe(vdev_t *vd)
return (error);
}
-static void
+static int
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -229,8 +229,7 @@ vdev_file_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
switch (zio->io_cmd) {
@@ -244,8 +243,7 @@ vdev_file_io_start(zio_t *zio)
zio->io_error = ENOTSUP;
}
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -254,11 +252,11 @@ vdev_file_io_start(zio_t *zio)
*/
#ifndef _KERNEL
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
+ return (ZIO_PIPELINE_STOP);
#endif
if ((zio = vdev_queue_io(zio)) == NULL)
- return;
+ return (ZIO_PIPELINE_STOP);
/* XXPOLICY */
if (zio->io_type == ZIO_TYPE_WRITE)
@@ -268,8 +266,8 @@ vdev_file_io_start(zio_t *zio)
error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
if (error) {
zio->io_error = error;
- zio_next_stage_async(zio);
- return;
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
}
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
@@ -280,26 +278,25 @@ vdev_file_io_start(zio_t *zio)
if (resid != 0 && zio->io_error == 0)
zio->io_error = ENOSPC;
- zio_next_stage_async(zio);
+ zio_interrupt(zio);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
vdev_file_io_done(zio_t *zio)
{
+ vdev_t *vd = zio->io_vd;
if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+ zio->io_error = zio_handle_device_injection(vd, EIO);
/*
* If an error has been encountered then attempt to probe the device
* to determine if it's still accessible.
*/
- if (zio->io_error == EIO) {
- vdev_t *vd = zio->io_vd;
-
- if (vdev_probe(vd) != 0)
- vd->vdev_is_failing = B_TRUE;
- }
+ if (zio->io_error == EIO && vdev_probe(vd) != 0)
+ vd->vdev_is_failing = B_TRUE;
vdev_queue_io_done(zio);
@@ -308,7 +305,7 @@ vdev_file_io_done(zio_t *zio)
vdev_cache_write(zio);
#endif
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
vdev_ops_t vdev_file_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 45d326ae69..4d6c499c10 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -253,7 +253,7 @@ vdev_mirror_child_select(zio_t *zio)
return (-1);
}
-static void
+static int
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -279,8 +279,7 @@ vdev_mirror_io_start(zio_t *zio)
ZIO_FLAG_CANFAIL,
vdev_mirror_scrub_done, mc));
}
- zio_wait_children_done(zio);
- return;
+ return (zio_wait_for_children_done(zio));
}
/*
* For normal reads just pick one child.
@@ -316,10 +315,10 @@ vdev_mirror_io_start(zio_t *zio)
c++;
}
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
-static void
+static int
vdev_mirror_io_done(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
@@ -362,8 +361,7 @@ vdev_mirror_io_done(zio_t *zio)
if (good_copies != 0)
zio->io_error = 0;
vdev_mirror_map_free(zio);
- zio_next_stage(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -383,8 +381,7 @@ vdev_mirror_io_done(zio_t *zio)
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
vdev_mirror_child_done, mc));
- zio_wait_children_done(zio);
- return;
+ return (zio_wait_for_children_done(zio));
}
/* XXPOLICY */
@@ -441,12 +438,13 @@ vdev_mirror_io_done(zio_t *zio)
}
zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
vdev_mirror_map_free(zio);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index 3aa831c46d..49727ef996 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -62,18 +62,18 @@ vdev_missing_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
+static int
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = ENOTSUP;
- zio_next_stage_async(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/* ARGSUSED */
-static void
+static int
vdev_missing_io_done(zio_t *zio)
{
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 7e99c1fd5b..0f921e088a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -162,7 +162,7 @@ vdev_queue_agg_io_done(zio_t *aio)
aio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = aio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
ASSERT3U(offset, ==, aio->io_size);
@@ -172,11 +172,8 @@ vdev_queue_agg_io_done(zio_t *aio)
#define IS_ADJACENT(io, nio) \
((io)->io_offset + (io)->io_size == (nio)->io_offset)
-typedef void zio_issue_func_t(zio_t *);
-
static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
- zio_issue_func_t **funcp)
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
zio_t *fio, *lio, *aio, *dio;
avl_tree_t *tree;
@@ -184,8 +181,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
ASSERT(MUTEX_HELD(&vq->vq_lock));
- *funcp = NULL;
-
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
@@ -245,7 +240,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
avl_add(&vq->vq_pending_tree, aio);
- *funcp = zio_nowait;
return (aio);
}
@@ -254,8 +248,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
avl_add(&vq->vq_pending_tree, fio);
- *funcp = zio_next_stage;
-
return (fio);
}
@@ -264,7 +256,6 @@ vdev_queue_io(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- zio_issue_func_t *func;
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
@@ -285,15 +276,19 @@ vdev_queue_io(zio_t *zio)
vdev_queue_io_add(vq, zio);
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
mutex_exit(&vq->vq_lock);
- if (nio == NULL || func != zio_nowait)
- return (nio);
+ if (nio == NULL)
+ return (NULL);
+
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ return (NULL);
+ }
- func(nio);
- return (NULL);
+ return (nio);
}
void
@@ -301,7 +296,6 @@ vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- zio_issue_func_t *func;
int i;
mutex_enter(&vq->vq_lock);
@@ -309,13 +303,16 @@ vdev_queue_io_done(zio_t *zio)
avl_remove(&vq->vq_pending_tree, zio);
for (i = 0; i < zfs_vdev_ramp_rate; i++) {
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
if (nio == NULL)
break;
mutex_exit(&vq->vq_lock);
- if (func == zio_next_stage)
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ } else {
zio_vdev_io_reissue(nio);
- func(nio);
+ zio_execute(nio);
+ }
mutex_enter(&vq->vq_lock);
}
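
With the zio_issue_func_t out-parameter gone, callers now distinguish a queue-built aggregate from a caller-owned I/O by its completion callback: an aggregate is a brand-new zio that must be launched with zio_nowait(), while the caller's own zio is either handed back (from vdev_queue_io()) or re-issued with zio_execute() (from vdev_queue_io_done()). A stripped-down model of that dispatch decision, with all names invented:

#include <stdio.h>

typedef struct toy_io toy_io_t;
typedef void (*toy_done_fn_t)(toy_io_t *);

struct toy_io {
    toy_done_fn_t io_done;
};

static void toy_agg_done(toy_io_t *io) { (void) io; }
static void toy_user_done(toy_io_t *io) { (void) io; }

/*
 * Mirrors the tail of vdev_queue_io_done(): the completion callback
 * identifies who owns the next I/O to issue.
 */
static void
toy_issue(toy_io_t *io)
{
    if (io->io_done == toy_agg_done)
        printf("aggregate: launch with zio_nowait()\n");
    else
        printf("caller's zio: reissue via zio_execute()\n");
}

int
main(void)
{
    toy_io_t agg = { toy_agg_done };
    toy_io_t mine = { toy_user_done };

    toy_issue(&agg);
    toy_issue(&mine);
    return (0);
}
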
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 73a3ae2565..74b035868c 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -639,7 +639,7 @@ vdev_raidz_repair_done(zio_t *zio)
vdev_raidz_map_free(zio->io_private);
}
-static void
+static int
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -672,8 +672,8 @@ vdev_raidz_io_start(zio_t *zio)
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
vdev_raidz_child_done, rc));
}
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -714,7 +714,7 @@ vdev_raidz_io_start(zio_t *zio)
}
}
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -783,7 +783,7 @@ static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
-static void
+static int
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -840,8 +840,8 @@ vdev_raidz_io_done(zio_t *zio)
zio->io_error = 0;
vdev_raidz_map_free(zio);
- zio_next_stage(zio);
- return;
+
+ return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1022,8 +1022,8 @@ vdev_raidz_io_done(zio_t *zio)
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
dprintf("rereading\n");
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -1205,12 +1205,13 @@ done:
}
zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
+
+ return (zio_wait_for_children_done(zio));
}
vdev_raidz_map_free(zio);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 112aaa6f25..4aa21a6501 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -61,9 +61,6 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;
@@ -170,8 +167,6 @@ zio_init(void)
align, NULL, NULL, NULL, NULL, data_alloc_arena,
KMC_NODEBUG);
- dprintf("creating cache for size %5lx align %5lx\n",
- size, align);
}
}
@@ -356,9 +351,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_bp = bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
- BP_GET_LEVEL(bp) != 0)
- zio->io_flags |= ZIO_FLAG_METADATA;
}
zio->io_done = done;
zio->io_private = private;
@@ -366,10 +358,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_priority = priority;
zio->io_stage = stage;
zio->io_pipeline = pipeline;
- zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
zio->io_timestamp = lbolt64;
- if (pio != NULL)
- zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
zio_push_transform(zio, data, size, size);
@@ -395,7 +384,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio == NULL) {
if (type != ZIO_TYPE_NULL &&
!(flags & ZIO_FLAG_CONFIG_HELD)) {
- spa_config_enter(zio->io_spa, RW_READER, zio);
+ spa_config_enter(spa, RW_READER, zio);
zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
}
zio->io_root = zio;
@@ -409,7 +398,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
!(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
!(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- spa_config_enter(zio->io_spa, RW_READER, pio);
+ spa_config_enter(spa, RW_READER, pio);
}
if (stage < ZIO_STAGE_READY)
pio->io_children_notready++;
@@ -524,9 +513,6 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
zio->io_compress = compress;
zio->io_ndvas = ncopies;
- if (compress != ZIO_COMPRESS_OFF)
- zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
if (bp->blk_birth != txg) {
/* XXX the bp usually (always?) gets re-zeroed later */
BP_ZERO(bp);
@@ -551,7 +537,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
zio->io_bookmark = *zb;
zio->io_checksum = checksum;
@@ -612,7 +598,7 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
zio->io_bp = &zio->io_bp_copy;
@@ -641,7 +627,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
- ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
zio->io_bp = &zio->io_bp_copy;
@@ -820,7 +806,7 @@ zio_wait(zio_t *zio)
zio->io_waiter = curthread;
- zio_next_stage_async(zio);
+ zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_stalled != ZIO_STAGE_DONE)
@@ -838,7 +824,23 @@ zio_wait(zio_t *zio)
void
zio_nowait(zio_t *zio)
{
- zio_next_stage_async(zio);
+ zio_execute(zio);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+}
+
+static int
+zio_issue_async(zio_t *zio)
+{
+ (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
+ (task_func_t *)zio_execute, zio, TQ_SLEEP);
+
+ return (ZIO_PIPELINE_STOP);
}
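
zio_interrupt() and zio_issue_async() are thin wrappers that differ only in which per-I/O-type taskq re-enters zio_execute(): completions arriving in interrupt context go to the intr taskqs, while zio_issue_async() sheds issue-side work onto an issue taskq and stops the pipeline. A minimal stand-in that runs the dispatched function inline, purely to show the control flow (a real taskq runs it on a worker thread; every name here is invented):

#include <stdio.h>

typedef void (*task_func)(void *);

/* Inline stand-in for taskq_dispatch(); illustrative only. */
static void
toy_taskq_dispatch(const char *tq, task_func fn, void *arg)
{
    printf("dispatched to %s\n", tq);
    fn(arg);
}

static void
toy_zio_execute(void *zio)
{
    printf("zio_execute(%p): pipeline resumes here\n", zio);
}

int
main(void)
{
    int zio = 0;

    /* zio_interrupt(): device completion -> intr taskq */
    toy_taskq_dispatch("intr taskq", toy_zio_execute, &zio);

    /* zio_issue_async(): shed the issue work, then return STOP */
    toy_taskq_dispatch("issue taskq", toy_zio_execute, &zio);
    return (0);
}

Keeping separate issue and intr taskqs preserves the property the deleted comment in zio_next_stage_async() argued for: issue threads blocked on dependent reads can never starve completion processing.
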
/*
@@ -846,18 +848,20 @@ zio_nowait(zio_t *zio)
* I/O pipeline interlocks: parent/child dependency scoreboarding
* ==========================================================================
*/
-static void
+static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
+ int rv = ZIO_PIPELINE_CONTINUE;
+
mutex_enter(&zio->io_lock);
- if (*countp == 0) {
- ASSERT(zio->io_stalled == 0);
- mutex_exit(&zio->io_lock);
- zio_next_stage(zio);
- } else {
+ ASSERT(zio->io_stalled == 0);
+ if (*countp != 0) {
zio->io_stalled = stage;
- mutex_exit(&zio->io_lock);
+ rv = ZIO_PIPELINE_STOP;
}
+ mutex_exit(&zio->io_lock);
+
+ return (rv);
}
static void
@@ -872,48 +876,54 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
if (--*countp == 0 && pio->io_stalled == stage) {
pio->io_stalled = 0;
mutex_exit(&pio->io_lock);
- zio_next_stage_async(pio);
+ zio_execute(pio);
} else {
mutex_exit(&pio->io_lock);
}
}
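
Together these two functions form the parent/child scoreboard named in the section comment: zio_wait_for_children() parks the parent by recording the stage in io_stalled and returning ZIO_PIPELINE_STOP, and the last child through zio_notify_parent() clears io_stalled and restarts the parent with zio_execute(). A single-threaded sketch of that handshake (mutexes elided, constants and names invented):

#include <stdio.h>

#define PIPELINE_CONTINUE 0
#define PIPELINE_STOP     1

typedef struct toy_zio {
    int io_stalled;           /* stage we are parked in, or 0 */
    int io_children_notdone;  /* outstanding child count */
} toy_zio_t;

static int
toy_wait_for_children(toy_zio_t *zio, int stage)
{
    if (zio->io_children_notdone != 0) {
        zio->io_stalled = stage; /* park until the last child */
        return (PIPELINE_STOP);
    }
    return (PIPELINE_CONTINUE);
}

static void
toy_notify_parent(toy_zio_t *pio, int stage)
{
    /* The last child to finish restarts the parent's pipeline. */
    if (--pio->io_children_notdone == 0 && pio->io_stalled == stage) {
        pio->io_stalled = 0;
        printf("zio_execute(parent)\n");
    }
}

int
main(void)
{
    toy_zio_t parent = { 0, 2 };
    int stage = 18; /* arbitrary stage number for illustration */

    if (toy_wait_for_children(&parent, stage) == PIPELINE_STOP)
        printf("parent parked at stage %d\n", stage);
    toy_notify_parent(&parent, stage); /* first child done */
    toy_notify_parent(&parent, stage); /* second child: restart */
    return (0);
}
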
-static void
-zio_wait_children_ready(zio_t *zio)
+int
+zio_wait_for_children_ready(zio_t *zio)
{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &zio->io_children_notready);
+ return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+ &zio->io_children_notready));
}
-void
-zio_wait_children_done(zio_t *zio)
+int
+zio_wait_for_children_done(zio_t *zio)
{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &zio->io_children_notdone);
+ return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+ &zio->io_children_notdone));
}
-static void
+static int
zio_read_init(zio_t *zio)
{
- if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
- uint64_t csize = BP_GET_PSIZE(zio->io_bp);
+ blkptr_t *bp = zio->io_bp;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(csize);
zio_push_transform(zio, cbuf, csize, csize);
zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
}
- if (BP_IS_GANG(zio->io_bp)) {
+ if (BP_IS_GANG(bp)) {
uint64_t gsize = SPA_GANGBLOCKSIZE;
void *gbuf = zio_buf_alloc(gsize);
zio_push_transform(zio, gbuf, gsize, gsize);
zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
}
- zio_next_stage(zio);
+
+ if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_ready(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -922,16 +932,16 @@ zio_ready(zio_t *zio)
zio->io_ready(zio);
if (pio != NULL)
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
&pio->io_children_notready);
if (zio->io_bp)
zio->io_bp_copy = *zio->io_bp;
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_vdev_retry_io(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -967,7 +977,7 @@ zio_vdev_retry_io(zio_t *zio)
if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
- ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
+ ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
mutex_exit(&pio->io_lock);
}
@@ -977,7 +987,8 @@ zio_vdev_retry_io(zio_t *zio)
*/
zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
zio->io_error = 0;
- zio_next_stage_async(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
int
@@ -1029,7 +1040,7 @@ zio_vdev_resume_io(spa_t *spa)
zio->io_stage = ZIO_STAGE_READY;
}
- (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
+ (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
zio, TQ_SLEEP);
}
mutex_exit(&spa->spa_zio_lock);
@@ -1049,7 +1060,7 @@ zio_vdev_resume_io(spa_t *spa)
return (0);
}
-static void
+static int
zio_vdev_suspend_io(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1069,9 +1080,11 @@ zio_vdev_suspend_io(zio_t *zio)
cv_broadcast(&spa->spa_zio_cv);
#endif
mutex_exit(&spa->spa_zio_lock);
+
+ return (ZIO_PIPELINE_STOP);
}
-static void
+static int
zio_assess(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1138,10 +1151,9 @@ zio_assess(zio_t *zio)
* property.
*/
if (zio_write_retry && zio->io_error != ENOSPC &&
- IO_IS_ALLOCATING(zio)) {
- zio_vdev_retry_io(zio);
- return;
- }
+ IO_IS_ALLOCATING(zio))
+ return (zio_vdev_retry_io(zio));
+
ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
/*
@@ -1175,22 +1187,20 @@ zio_assess(zio_t *zio)
"uncorrectable I/O failure and the "
"failure mode property for this pool "
"is set to panic.", spa_name(spa));
- } else {
- cmn_err(CE_WARN, "Pool '%s' has encountered "
- "an uncorrectable I/O error. Manual "
- "intervention is required.",
- spa_name(spa));
- zio_vdev_suspend_io(zio);
}
- return;
+ cmn_err(CE_WARN, "Pool '%s' has encountered "
+ "an uncorrectable I/O error. "
+ "Manual intervention is required.", spa_name(spa));
+ return (zio_vdev_suspend_io(zio));
}
}
ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
ASSERT(zio->io_children_notready == 0);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_done(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -1221,7 +1231,7 @@ zio_done(zio_t *zio)
pio->io_child = next;
mutex_exit(&pio->io_lock);
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
&pio->io_children_notdone);
}
@@ -1243,6 +1253,8 @@ zio_done(zio_t *zio)
cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
}
+
+ return (ZIO_PIPELINE_STOP);
}
/*
@@ -1250,7 +1262,7 @@ zio_done(zio_t *zio)
* Compression support
* ==========================================================================
*/
-static void
+static int
zio_write_compress(zio_t *zio)
{
int compress = zio->io_compress;
@@ -1300,7 +1312,7 @@ zio_write_compress(zio_t *zio)
ASSERT(csize != 0);
BP_SET_LSIZE(bp, lsize);
BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
} else {
if (bp->blk_birth == zio->io_txg)
BP_ZERO(bp);
@@ -1316,10 +1328,10 @@ zio_write_compress(zio_t *zio)
}
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_read_decompress(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1338,7 +1350,7 @@ zio_read_decompress(zio_t *zio)
zio_buf_free(data, bufsize);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1347,19 +1359,6 @@ zio_read_decompress(zio_t *zio)
* ==========================================================================
*/
static void
-zio_gang_pipeline(zio_t *zio)
-{
- /*
- * By default, the pipeline assumes that we're dealing with a gang
- * block. If we're not, strip out any gang-specific stages.
- */
- if (!BP_IS_GANG(zio->io_bp))
- zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
- zio_next_stage(zio);
-}
-
-static void
zio_gang_byteswap(zio_t *zio)
{
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
@@ -1368,7 +1367,7 @@ zio_gang_byteswap(zio_t *zio)
byteswap_uint64_array(zio->io_data, zio->io_size);
}
-static void
+static int
zio_get_gang_header(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1384,10 +1383,10 @@ zio_get_gang_header(zio_t *zio)
zio->io_flags & ZIO_FLAG_GANG_INHERIT,
ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
- zio_wait_children_done(zio);
+ return (zio_wait_for_children_done(zio));
}
-static void
+static int
zio_read_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1410,16 +1409,17 @@ zio_read_gang_members(zio_t *zio)
ASSERT(!BP_IS_HOLE(gbp));
zio_nowait(zio_read(zio, zio->io_spa, gbp,
- (char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- &zio->io_bookmark));
+ (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
}
zio_buf_free(gbh, gbufsize);
- zio_wait_children_done(zio);
+
+ return (zio_wait_for_children_done(zio));
}
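
zio_read_gang_members() walks the gang header and issues one child read per member, each landing at its accumulated logical offset within the parent's buffer, then parks on the scoreboard via zio_wait_for_children_done(). The offset bookkeeping in isolation, with the member sizes and count invented for the example:

#include <stdio.h>

#define NBLKPTRS 4

struct toy_gbp { unsigned int lsize; }; /* stand-in for a gang blkptr */

int
main(void)
{
    struct toy_gbp gbh[NBLKPTRS] = { {4096}, {4096}, {2048}, {0} };
    unsigned int loff = 0;
    int i;

    /*
     * Walk the gang header and issue one child read per member,
     * each at its logical offset in the parent's data buffer.
     */
    for (i = 0; i < NBLKPTRS && gbh[i].lsize != 0; i++) {
        printf("child read: offset %u size %u\n", loff, gbh[i].lsize);
        loff += gbh[i].lsize;
    }
    printf("total %u bytes\n", loff);
    return (0);
}
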
-static void
+static int
zio_rewrite_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1446,15 +1446,16 @@ zio_rewrite_gang_members(zio_t *zio)
zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
+ NULL, NULL, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
}
zio_push_transform(zio, gbh, gsize, gbufsize);
- zio_wait_children_ready(zio);
+
+ return (zio_wait_for_children_ready(zio));
}
-static void
+static int
zio_free_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1476,10 +1477,11 @@ zio_free_gang_members(zio_t *zio)
}
zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_claim_gang_members(zio_t *zio)
{
zio_gbh_phys_t *gbh;
@@ -1500,7 +1502,8 @@ zio_claim_gang_members(zio_t *zio)
}
zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
static void
@@ -1549,8 +1552,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
B_FALSE);
- if (error)
- return (error);
+ if (error) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
for (d = 0; d < gbh_ndvas; d++)
DVA_SET_GANG(&dva[d], 1);
@@ -1560,10 +1565,6 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
gbh = zio_buf_alloc(gsize);
bzero(gbh, gsize);
- /* We need to test multi-level gang blocks */
- if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
- maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
for (loff = 0, i = 0; loff != zio->io_size;
loff += lsize, resid -= lsize, gbps_left--, i++) {
blkptr_t *gbp = &gbh->zg_blkptr[i];
@@ -1579,8 +1580,10 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
break;
ASSERT3U(error, ==, ENOSPC);
/* XXX - free up previous allocations? */
- if (maxalloc == SPA_MINBLOCKSIZE)
- return (error);
+ if (maxalloc == SPA_MINBLOCKSIZE) {
+ zio->io_error = error;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
}
@@ -1614,14 +1617,14 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
zio_push_transform(zio, gbh, gsize, gsize);
+
/*
- * As much as we'd like this to be zio_wait_children_ready(),
+ * As much as we'd like this to be 'ready' instead of 'done',
* updating our ASIZE doesn't happen until the io_done callback,
* so we have to wait for that to finish in order for our BP
* to be stable.
*/
- zio_wait_children_done(zio);
- return (0);
+ return (zio_wait_for_children_done(zio));
}
/*
@@ -1629,7 +1632,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
* Allocate and free blocks
* ==========================================================================
*/
-static void
+static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1642,14 +1645,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_ndvas, >, 0);
ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
- /* For testing, make some blocks above a certain size be gang blocks */
- if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
- error = zio_write_allocate_gang_members(zio, mc);
- if (error)
- zio->io_error = error;
- return;
- }
-
/*
* For testing purposes, we force I/Os to retry. We don't allow
* retries beyond the first pass since those I/Os are non-allocating
@@ -1668,17 +1663,15 @@ zio_dva_allocate(zio_t *zio)
if (error == 0) {
bp->blk_birth = zio->io_txg;
} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
- error = zio_write_allocate_gang_members(zio, mc);
- if (error == 0)
- return;
- zio->io_error = error;
+ return (zio_write_allocate_gang_members(zio, mc));
} else {
zio->io_error = error;
}
- zio_next_stage(zio);
+
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_dva_free(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1687,15 +1680,15 @@ zio_dva_free(zio_t *zio)
BP_ZERO(bp);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_dva_claim(zio_t *zio)
{
zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1704,7 +1697,7 @@ zio_dva_claim(zio_t *zio)
* ==========================================================================
*/
-static void
+static int
zio_vdev_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1719,24 +1712,21 @@ zio_vdev_io_start(zio_t *zio)
* at that time.
*/
if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
- zio->io_type == ZIO_TYPE_WRITE) {
- zio_vdev_suspend_io(zio);
- return;
- }
+ zio->io_type == ZIO_TYPE_WRITE)
+ return (zio_vdev_suspend_io(zio));
- if (vd == NULL) {
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_start(zio);
- return;
- }
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP
+ */
+ if (vd == NULL)
+ return (vdev_mirror_ops.vdev_op_io_start(zio));
align = 1ULL << tvd->vdev_ashift;
if (zio->io_retries == 0 && vd == tvd)
zio->io_flags |= ZIO_FLAG_FAILFAST;
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
- vd->vdev_children == 0) {
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
zio->io_flags |= ZIO_FLAG_PHYSICAL;
zio->io_offset += VDEV_LABEL_START_SIZE;
}
@@ -1760,19 +1750,16 @@ zio_vdev_io_start(zio_t *zio)
P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
- vdev_io_start(zio);
-
- /* zio_next_stage_async() gets called from io completion interrupt */
+ return (vd->vdev_ops->vdev_op_io_start(zio));
}
-static void
+static int
zio_vdev_io_done(zio_t *zio)
{
if (zio->io_vd == NULL)
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_done(zio);
- else
- vdev_io_done(zio);
+ return (vdev_mirror_ops.vdev_op_io_done(zio));
+
+ return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
}
/* XXPOLICY */
@@ -1795,7 +1782,7 @@ zio_should_retry(zio_t *zio)
return (B_TRUE);
}
-static void
+static int
zio_vdev_io_assess(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1833,15 +1820,10 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
- dprintf("retry #%d for %s to %s offset %llx\n",
- zio->io_retries, zio_type_name[zio->io_type],
- vdev_description(vd), zio->io_offset);
-
- zio_next_stage_async(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
void
@@ -1876,7 +1858,7 @@ zio_vdev_io_bypass(zio_t *zio)
* Generate and verify checksums
* ==========================================================================
*/
-static void
+static int
zio_checksum_generate(zio_t *zio)
{
int checksum = zio->io_checksum;
@@ -1889,10 +1871,10 @@ zio_checksum_generate(zio_t *zio)
zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_gang_checksum_generate(zio_t *zio)
{
zio_cksum_t zc;
@@ -1905,10 +1887,10 @@ zio_gang_checksum_generate(zio_t *zio)
zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
-static void
+static int
zio_checksum_verify(zio_t *zio)
{
if (zio->io_bp != NULL) {
@@ -1918,7 +1900,7 @@ zio_checksum_verify(zio_t *zio)
zio->io_spa, zio->io_vd, zio, 0, 0);
}
- zio_next_stage(zio);
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
@@ -1949,20 +1931,15 @@ zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
* Define the pipeline
* ==========================================================================
*/
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
- panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
+typedef int zio_pipe_stage_t(zio_t *zio);
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
- zio_badop,
- zio_wait_children_ready,
+ NULL,
+ zio_wait_for_children_ready,
+ zio_read_init,
+ zio_issue_async,
zio_write_compress,
zio_checksum_generate,
- zio_gang_pipeline,
zio_get_gang_header,
zio_rewrite_gang_members,
zio_free_gang_members,
@@ -1972,116 +1949,63 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
zio_dva_claim,
zio_gang_checksum_generate,
zio_ready,
- zio_read_init,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
- zio_wait_children_done,
+ zio_wait_for_children_done,
zio_checksum_verify,
zio_read_gang_members,
zio_read_decompress,
zio_assess,
zio_done,
- zio_badop
+ NULL
};
/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread. In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
*/
void
-zio_next_stage(zio_t *zio)
+zio_execute(zio_t *zio)
{
- uint32_t pipeline = zio->io_pipeline;
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ uint32_t pipeline = zio->io_pipeline;
+ int rv;
- ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ /*
+ * If an error occurred outside the vdev stack,
+ * just execute the interlock stages to clean up.
+ */
+ if (zio->io_error &&
+ ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * See the comment in zio_next_stage_async() about per-CPU taskqs.
- */
- if (((1U << zio->io_stage) & zio->io_async_stages) &&
- (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
- !(zio->io_flags & ZIO_FLAG_METADATA)) {
- taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
-void
-zio_next_stage_async(zio_t *zio)
-{
- taskq_t *tq;
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;

- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
+ rv = zio_pipeline[zio->io_stage](zio);

- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
+ if (rv == ZIO_PIPELINE_STOP)
+ return;

- /*
- * For performance, we'll probably want two sets of task queues:
- * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
- * part is for read performance: since we have to make a pass over
- * the data to checksum it anyway, we want to do this on the same CPU
- * that issued the read, because (assuming CPU scheduling affinity)
- * that thread is probably still there. Getting this optimization
- * right avoids performance-hostile cache-to-cache transfers.
- *
- * Note that having two sets of task queues is also necessary for
- * correctness: if all of the issue threads get bogged down waiting
- * for dependent reads (e.g. metaslab freelist) to complete, then
- * there won't be any threads available to service I/O completion
- * interrupts.
- */
- if ((1U << zio->io_stage) & zio->io_async_stages) {
- if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
- tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- else
- tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
+ ASSERT(rv == ZIO_PIPELINE_CONTINUE);
}
}
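
One detail worth calling out in the loop above: when io_error is set and the current stage is outside ZIO_VDEV_IO_STAGES, the pipeline mask is narrowed to ZIO_ERROR_PIPELINE_MASK (the interlock stages), so a failed I/O skips its remaining work and runs only the bookkeeping that keeps parents, children, and waiters consistent. A tiny demonstration of that pruning, with invented stage names and masks:

#include <stdio.h>

enum { S_OPEN, S_COMPRESS, S_ISSUE, S_ASSESS, S_DONE };

#define INTERLOCK_MASK ((1U << S_ASSESS) | (1U << S_DONE))
#define FULL_PIPELINE  (INTERLOCK_MASK | (1U << S_COMPRESS) | (1U << S_ISSUE))

int
main(void)
{
    int io_error = 1;       /* pretend an earlier stage failed */
    int stage = S_COMPRESS; /* where the error was noticed */
    unsigned int pipeline = FULL_PIPELINE;

    /* Same pruning as zio_execute(): on error, run only interlocks. */
    if (io_error)
        pipeline &= INTERLOCK_MASK;

    while (stage < S_DONE) {
        while (((1U << ++stage) & pipeline) == 0)
            continue;
        printf("stage %d runs\n", stage); /* S_ASSESS, then S_DONE */
    }
    return (0);
}
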
-void
-zio_resubmit_stage_async(void *arg)
-{
- zio_t *zio = (zio_t *)(uintptr_t)arg;
-
- zio_next_stage_async(zio);
-}
-
static boolean_t
zio_io_should_fail(uint16_t range)
{