Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c              |  25
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c       |  26
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c           |  34
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c         |  15
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c              |  54
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c         |  13
-rw-r--r--  usr/src/uts/common/fs/zfs/space_map.c        |  15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h          |  10
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h     |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h         |   5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h    |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_context.h  |   1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h          |  13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_impl.h     |  16
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c             | 103
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c        | 160
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c        | 137
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c       |  44
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c      |   9
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_missing.c     |  15
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c       |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_root.c        |  40
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c        |  13
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c              | 406
24 files changed, 1027 insertions(+), 137 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index f1c2de5a07..aafce2d68e 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -2533,12 +2533,34 @@ arc_write_ready(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- if (callback->awcb_ready) {
+ if (zio->io_error == 0 && callback->awcb_ready) {
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
callback->awcb_ready(zio, buf, callback->awcb_private);
}
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum. It is
+ * the responsibility of the callback to handle the freeing
+ * and accounting for any re-write attempt. If we don't have a
+ * callback registered then simply free the block here.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ if (!BP_IS_HOLE(&zio->io_bp_orig) &&
+ callback->awcb_ready == NULL) {
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ &zio->io_bp_orig, NULL, NULL));
+ }
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
+ }
arc_cksum_compute(buf);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
static void
@@ -2635,7 +2657,6 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
priority, flags, zb);
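
The arc.c hunk above defers setting ARC_IO_IN_PROGRESS from arc_write() into
the write's ready callback, so a write that is retried after a pool failure
re-enters arc_write_ready() and must first discard ("thaw") the stale freeze
checksum before recomputing it. A minimal userland sketch of that re-entry
handling, with simplified stand-in types rather than the real ARC structures:

    #include <stddef.h>
    #include <stdlib.h>

    typedef struct hdr {
        int io_in_progress;      /* stands in for ARC_IO_IN_PROGRESS */
        unsigned *freeze_cksum;  /* stands in for b_freeze_cksum */
    } hdr_t;

    static void
    write_ready(hdr_t *h, const unsigned char *data, size_t n)
    {
        unsigned sum = 0;
        size_t i;

        if (h->io_in_progress && h->freeze_cksum != NULL) {
            /* re-write attempt: thaw by discarding the stale cksum */
            free(h->freeze_cksum);
            h->freeze_cksum = NULL;
        }

        /* recompute the checksum over the buffer's current contents */
        for (i = 0; i < n; i++)
            sum += data[i];
        h->freeze_cksum = malloc(sizeof (unsigned));
        if (h->freeze_cksum != NULL)
            *h->freeze_cksum = sum;

        /* mark in-progress only once the I/O is actually under way */
        h->io_in_progress = 1;
    }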
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 2758d84791..0f687ff66d 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -739,12 +739,26 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
int i;
+ ASSERT(bp == zio->io_bp);
+
/*
* Update rootbp fill count.
*/
bp->blk_fill = 1; /* count the meta-dnode */
for (i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+
+ BP_SET_TYPE(bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(bp, 0);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, NULL, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ }
}
/* ARGSUSED */
@@ -754,18 +768,6 @@ killer(zio_t *zio, arc_buf_t *abuf, void *arg)
objset_impl_t *os = arg;
ASSERT3U(zio->io_error, ==, 0);
-
- BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
- BP_SET_LEVEL(zio->io_bp, 0);
-
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
- BP_IDENTITY(&zio->io_bp_orig))) {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, NULL, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
- os->os_synctx);
- }
arc_release(os->os_phys_buf, &os->os_phys_buf);
}
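
The dmu_objset.c change moves the block-born/block-kill accounting from the
done callback ("killer") into ready(), because a failed-and-retried write can
land at a different DVA than the original attempt. A sketch of the identity
comparison that gates the accounting (stand-in types, not the real blkptr):

    #include <string.h>

    typedef struct blkptr {
        unsigned long long dva[2];   /* stand-in for the first DVA */
        unsigned long long birth;
    } blkptr_t;

    /*
     * Adjust dataset accounting only when the write actually moved
     * the block; kill the old copy if it was born this txg.
     */
    static void
    account_on_ready(const blkptr_t *bp, const blkptr_t *bp_orig,
        unsigned long long txg, void (*kill)(const blkptr_t *),
        void (*born)(const blkptr_t *))
    {
        if (memcmp(bp->dva, bp_orig->dva, sizeof (bp->dva)) != 0) {
            if (bp_orig->birth == txg)
                kill(bp_orig);
            born(bp);
        }
    }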
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 13fd8d4d9d..f89878facf 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -734,11 +734,30 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
dmu_tx_hold_t *txh;
uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
+ spa_t *spa = tx->tx_pool->dp_spa;
ASSERT3U(tx->tx_txg, ==, 0);
+
if (tx->tx_err)
return (tx->tx_err);
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE) {
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ txg_how != TXG_WAIT)
+ return (EIO);
+
+ return (ERESTART);
+ }
+
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
@@ -885,10 +904,19 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
void
dmu_tx_wait(dmu_tx_t *tx)
{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
ASSERT(tx->tx_txg == 0);
- ASSERT(tx->tx_lasttried_txg != 0);
- if (tx->tx_needassign_txh) {
+ /*
+ * It's possible that the pool has become active after this thread
+ * has tried to obtain a tx. If that's the case then its
+ * tx_lasttried_txg would not have been assigned.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE ||
+ tx->tx_lasttried_txg == 0) {
+ txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
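
The dmu_tx.c hunks above implement the failmode policy at transaction-assign
time: a failed pool either surfaces EIO (failmode=continue and the caller did
not ask to wait) or returns ERESTART so the caller blocks in dmu_tx_wait().
A compact sketch of that decision, assuming ERESTART is available in
<errno.h> as it is on Solaris:

    #include <errno.h>

    enum { FAILMODE_WAIT, FAILMODE_CONTINUE, FAILMODE_PANIC };

    static int
    try_assign(int pool_failed, int failmode, int caller_will_wait)
    {
        if (pool_failed) {
            if (failmode == FAILMODE_CONTINUE && !caller_will_wait)
                return (EIO);      /* surface the error to the VOP */
            return (ERESTART);     /* caller blocks in the wait path */
        }
        return (0);                /* proceed with normal assignment */
    }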
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 4fcc6bfd79..b2840e4e87 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -773,6 +773,20 @@ top:
all_zero = B_TRUE;
do {
vd = mg->mg_vd;
+ /*
+ * Don't allocate from faulted devices
+ */
+ if (!vdev_writeable(vd))
+ goto next;
+ /*
+ * Avoid writing single-copy data to a failing vdev
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && dshift == 3) {
+ all_zero = B_FALSE;
+ goto next;
+ }
ASSERT(mg->mg_class == mc);
@@ -828,6 +842,7 @@ top:
return (0);
}
+next:
mc->mc_rotor = mg->mg_next;
mc->mc_allocated = 0;
} while ((mg = mg->mg_next) != rotor);
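
The metaslab.c change teaches the allocator's rotor walk to skip unwriteable
vdevs outright, and to additionally steer the first DVA of single-copy data
away from vdevs with a history of write errors. A simplified sketch of that
selection loop (the vdev fields here are stand-ins):

    typedef struct vdev {
        int writeable;
        int write_errors;
        int healthy;
    } vdev_t;

    static int
    pick_vdev(vdev_t *rotor[], int nvdevs, int d, int single_copy)
    {
        int i;

        for (i = 0; i < nvdevs; i++) {
            vdev_t *vd = rotor[i];

            if (!vd->writeable)
                continue;            /* faulted: never allocate here */
            if (d == 0 && single_copy &&
                (vd->write_errors > 0 || !vd->healthy))
                continue;            /* avoid a failing device */
            return (i);
        }
        return (-1);                 /* no usable vdev in the rotor */
    }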
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index a838b0f45b..a780a2ca1f 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -362,6 +362,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
dmu_objset_close(os);
}
break;
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
+ intval > ZIO_FAILURE_MODE_PANIC))
+ error = EINVAL;
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
+ spa->spa_failmode = intval;
+ error = EIO;
+ }
+ break;
}
if (error)
@@ -477,6 +498,8 @@ spa_activate(spa_t *spa)
list_create(&spa->spa_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_dirty_node));
+ list_create(&spa->spa_zio_list, sizeof (zio_t),
+ offsetof(zio_t, zio_link_node));
txg_list_create(&spa->spa_vdev_txg_list,
offsetof(struct vdev, vdev_txg_node));
@@ -506,6 +529,7 @@ spa_deactivate(spa_t *spa)
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_dirty_list);
+ list_destroy(&spa->spa_zio_list);
for (t = 0; t < ZIO_TYPES; t++) {
taskq_destroy(spa->spa_zio_issue_taskq[t]);
@@ -1077,6 +1101,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa->spa_pool_props_object,
zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
sizeof (uint64_t), 1, &spa->spa_delegation);
+ (void) zap_lookup(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
+ sizeof (uint64_t), 1, &spa->spa_failmode);
}
/*
@@ -1618,6 +1646,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY);
+ spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
if (props)
spa_sync_props(spa, props, CRED(), tx);
@@ -3091,7 +3120,7 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
tvd->vdev_remove_wanted = 0;
vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
VDEV_AUX_NONE);
- vdev_clear(spa, tvd);
+ vdev_clear(spa, tvd, B_TRUE);
vdev_config_dirty(tvd->vdev_top);
}
spa_async_remove(spa, tvd);
@@ -3122,8 +3151,14 @@ spa_async_thread(spa_t *spa)
/*
* See if any devices need to be marked REMOVED.
+ *
+ * XXX - We avoid doing this when we are in
+ * I/O failure state since spa_vdev_enter() grabs
+ * the namespace lock and would not be able to obtain
+ * the writer config lock.
*/
- if (tasks & SPA_ASYNC_REMOVE) {
+ if (tasks & SPA_ASYNC_REMOVE &&
+ spa_state(spa) != POOL_STATE_IO_FAILURE) {
txg = spa_vdev_enter(spa);
spa_async_remove(spa, spa->spa_root_vdev);
(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -3379,7 +3414,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(nvpair_value_uint64(elem, &intval) == 0);
spa->spa_temporary = intval;
break;
-
default:
/*
* Set pool property values in the poolprops mos object.
@@ -3425,11 +3459,19 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT(0); /* not allowed */
}
- if (prop == ZPOOL_PROP_DELEGATION)
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
spa->spa_delegation = intval;
-
- if (prop == ZPOOL_PROP_BOOTFS)
+ break;
+ case ZPOOL_PROP_BOOTFS:
spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ default:
+ break;
+ }
}
/* log internal history if this is not a zpool create */
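
The spa.c property hunks range-check the new failmode value and, for a pool
that has already failed completely, apply it in core only, deliberately
returning EIO so spa_prop_set() skips the on-disk sync (which could block).
A sketch of that validation flow, with assumed constants matching the
ZIO_FAILURE_MODE_* range:

    #include <errno.h>

    #define FAILURE_MODE_WAIT   0
    #define FAILURE_MODE_PANIC  2

    static int
    validate_failmode(unsigned long long val, int pool_failed,
        unsigned char *failmode)
    {
        if (val > FAILURE_MODE_PANIC)   /* valid range is WAIT..PANIC */
            return (EINVAL);
        if (pool_failed) {
            *failmode = (unsigned char)val;  /* in-core change only */
            return (EIO);                    /* suppress the disk sync */
        }
        return (0);
    }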
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 8065ae85b6..5cb0890586 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -277,6 +277,8 @@ spa_add(const char *name, const char *altroot)
avl_add(&spa_namespace_avl, spa);
+ mutex_init(&spa->spa_zio_lock, NULL, MUTEX_DEFAULT, NULL);
+
/*
* Set the alternate root, if there is one.
*/
@@ -332,6 +334,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
mutex_destroy(&spa->spa_history_lock);
mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_zio_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -989,6 +992,16 @@ spa_get_asize(spa_t *spa, uint64_t lsize)
return (lsize * 6);
}
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint8_t
+spa_get_failmode(spa_t *spa)
+{
+ return (spa->spa_failmode);
+}
+
uint64_t
spa_version(spa_t *spa)
{
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index 9263b31172..a15e5ff815 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -298,6 +298,7 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
uint64_t *entry, *entry_map, *entry_map_end;
uint64_t bufsize, size, offset, end, space;
uint64_t mapstart = sm->sm_start;
+ int error = 0;
ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -335,9 +336,10 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
smo->smo_object, offset, size);
mutex_exit(sm->sm_lock);
- VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
- entry_map), ==, 0);
+ error = dmu_read(os, smo->smo_object, offset, size, entry_map);
mutex_enter(sm->sm_lock);
+ if (error != 0)
+ goto out;
entry_map_end = entry_map + (size / sizeof (uint64_t));
for (entry = entry_map; entry < entry_map_end; entry++) {
@@ -354,18 +356,19 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
}
VERIFY3U(sm->sm_space, ==, space);
+ sm->sm_loaded = B_TRUE;
+ sm->sm_ops = ops;
+out:
zio_buf_free(entry_map, bufsize);
sm->sm_loading = B_FALSE;
- sm->sm_loaded = B_TRUE;
- sm->sm_ops = ops;
cv_broadcast(&sm->sm_load_cv);
- if (ops != NULL)
+ if (!error && ops != NULL)
ops->smop_load(sm);
- return (0);
+ return (error);
}
void
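
The space_map.c change converts a hard VERIFY on dmu_read() into an error
return: the buffer is released and waiters are woken through a single 'out'
label whether or not the load succeeded, and the map is marked loaded only
on success. A sketch of that pattern:

    #include <errno.h>
    #include <stdlib.h>

    static int
    load_map(int (*read_chunk)(void *, size_t), size_t bufsize, int *loaded)
    {
        void *buf = malloc(bufsize);
        int error;

        if (buf == NULL)
            return (ENOMEM);
        error = read_chunk(buf, bufsize);
        if (error != 0)
            goto out;            /* propagate instead of panicking */
        *loaded = 1;             /* mark loaded only on success */
    out:
        free(buf);               /* cleanup runs on both paths */
        return (error);
    }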
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index cb5e09e4b0..032ead7f37 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -274,7 +274,7 @@ typedef struct blkptr {
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
-#define BP_ZERO(bp) \
+#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
@@ -282,11 +282,16 @@ typedef struct blkptr {
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_birth = 0; \
+}
+
+#define BP_ZERO(bp) \
+{ \
+ BP_ZERO_DVAS(bp) \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
(bp)->blk_pad[2] = 0; \
- (bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
@@ -423,6 +428,7 @@ extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_version(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_busy(void);
+extern uint8_t spa_get_failmode(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
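
The spa.h hunk splits BP_ZERO into two layers: BP_ZERO_DVAS clears only the
block's identity (the three DVAs plus blk_birth), which is exactly what a
write retry needs to force reallocation, while BP_ZERO adds the remaining
fields on top. A condensed illustration of the layering (fields abbreviated):

    typedef struct bp {
        unsigned long long dva[3][2];
        unsigned long long birth;
        unsigned long long prop;
        unsigned long long fill;
    } bp_t;

    #define BP_ZERO_DVAS(bp) {                      \
        (bp)->dva[0][0] = (bp)->dva[0][1] = 0;      \
        (bp)->dva[1][0] = (bp)->dva[1][1] = 0;      \
        (bp)->dva[2][0] = (bp)->dva[2][1] = 0;      \
        (bp)->birth = 0;                            \
    }

    #define BP_ZERO(bp) {                           \
        BP_ZERO_DVAS(bp)                            \
        (bp)->prop = 0;                             \
        (bp)->fill = 0;                             \
    }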
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 18371aa13f..0310f985b8 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -141,6 +141,10 @@ struct spa {
uint64_t spa_bootfs; /* default boot filesystem */
boolean_t spa_delegation; /* delegation on/off */
boolean_t spa_temporary; /* temporary on/off */
+ list_t spa_zio_list; /* zio error list */
+ kcondvar_t spa_zio_cv; /* resume I/O pipeline */
+ kmutex_t spa_zio_lock; /* zio error lock */
+ uint8_t spa_failmode; /* failure mode for the pool */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index c651d1eebb..dced3da5ff 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -54,6 +54,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_spare(vdev_t *);
+extern int vdev_probe(vdev_t *);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
@@ -89,10 +90,12 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_state_t *);
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
-extern void vdev_clear(spa_t *spa, vdev_t *vd);
+extern void vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted);
extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
extern int vdev_is_dead(vdev_t *vd);
+extern int vdev_readable(vdev_t *vd);
+extern int vdev_writeable(vdev_t *vd);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index e279bb2495..6fa21e83b0 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -60,6 +60,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
*/
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
+typedef int vdev_probe_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
@@ -68,6 +69,7 @@ typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
vdev_close_func_t *vdev_op_close;
+ vdev_probe_func_t *vdev_op_probe;
vdev_asize_func_t *vdev_op_asize;
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
@@ -174,6 +176,7 @@ struct vdev {
uint64_t vdev_unspare; /* unspare when resilvering done */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_is_failing; /* device errors seen */
/*
* For DTrace to work in userland (libzpool) context, these fields must
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
index 8a689e0760..a5be3e1303 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
@@ -62,6 +62,7 @@ extern "C" {
#include <sys/zfs_debug.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
+#include <sys/fm/util.h>
#define CPU_SEQID (CPU->cpu_seqid)
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 0f38aae47d..cc08976074 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -107,6 +107,10 @@ enum zio_compress {
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+#define ZIO_FAILURE_MODE_WAIT 0
+#define ZIO_FAILURE_MODE_CONTINUE 1
+#define ZIO_FAILURE_MODE_PANIC 2
+
#define ZIO_PRIORITY_NOW (zio_priority_table[0])
#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
@@ -144,6 +148,7 @@ enum zio_compress {
#define ZIO_FLAG_USER 0x20000
#define ZIO_FLAG_METADATA 0x40000
+#define ZIO_FLAG_WRITE_RETRY 0x80000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
@@ -217,6 +222,7 @@ struct zio {
zio_t *io_sibling_next;
zio_transform_t *io_transform_stack;
zio_t *io_logical;
+ list_node_t zio_link_node;
/* Callback info */
zio_done_func_t *io_ready;
@@ -242,8 +248,10 @@ struct zio {
/* Internal pipeline state */
int io_flags;
+ int io_orig_flags;
enum zio_type io_type;
enum zio_stage io_stage;
+ enum zio_stage io_orig_stage;
uint8_t io_stalled;
uint8_t io_priority;
struct dk_callback io_dk_callback;
@@ -252,6 +260,7 @@ struct zio {
int io_error;
uint32_t io_numerrors;
uint32_t io_pipeline;
+ uint32_t io_orig_pipeline;
uint32_t io_async_stages;
uint64_t io_children_notready;
uint64_t io_children_notdone;
@@ -320,6 +329,7 @@ extern void zio_data_buf_free(void *buf, size_t size);
*/
extern void zio_next_stage(zio_t *zio);
extern void zio_next_stage_async(zio_t *zio);
+extern void zio_resubmit_stage_async(void *);
extern void zio_wait_children_done(zio_t *zio);
/*
@@ -339,7 +349,8 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
-boolean_t zio_should_retry(zio_t *zio);
+extern boolean_t zio_should_retry(zio_t *zio);
+extern int zio_vdev_resume_io(spa_t *);
/*
* Initial setup and teardown.
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index d2ddbc34e9..a5a0bb54e8 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -61,6 +61,8 @@ typedef enum zio_stage {
ZIO_STAGE_READY, /* RWFCI */
+ ZIO_STAGE_READ_INIT, /* R---- */
+
ZIO_STAGE_VDEV_IO_START, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
@@ -71,6 +73,7 @@ typedef enum zio_stage {
ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
ZIO_STAGE_READ_DECOMPRESS, /* R---- */
+ ZIO_STAGE_ASSESS, /* RWFCI */
ZIO_STAGE_DONE /* RWFCI */
} zio_stage_t;
@@ -96,9 +99,14 @@ typedef enum zio_stage {
ZIO_VDEV_IO_PIPELINE | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
(1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
+#define ZIO_READ_GANG_PIPELINE \
+ ZIO_READ_PHYS_PIPELINE
+
#define ZIO_READ_PIPELINE \
+ (1U << ZIO_STAGE_READ_INIT) | \
ZIO_READ_PHYS_PIPELINE
#define ZIO_WRITE_PHYS_PIPELINE \
@@ -108,6 +116,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_READY) | \
ZIO_VDEV_IO_PIPELINE | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_WRITE_COMMON_PIPELINE \
@@ -149,6 +158,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_DVA_FREE) | \
(1U << ZIO_STAGE_READY) | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_CLAIM_PIPELINE \
@@ -160,6 +170,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_DVA_CLAIM) | \
(1U << ZIO_STAGE_READY) | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_IOCTL_PIPELINE \
@@ -168,16 +179,19 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_READY) | \
ZIO_VDEV_IO_PIPELINE | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
(1U << ZIO_STAGE_READY) | \
(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_ASSESS) | \
(1U << ZIO_STAGE_DONE))
#define ZIO_VDEV_CHILD_PIPELINE \
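
The zio_impl.h hunks add the new READ_INIT and ASSESS stages to the pipeline
masks. Each pipeline is one bit per stage, and advancing a zio means finding
the next set bit after the current stage. A simplified sketch of that
mechanism (stage list abbreviated):

    enum stage { S_OPEN, S_READY, S_VDEV_IO, S_ASSESS, S_DONE, S_NSTAGES };

    /* e.g. a write pipeline that now includes the assess stage */
    static const unsigned write_pipeline =
        (1U << S_READY) | (1U << S_VDEV_IO) |
        (1U << S_ASSESS) | (1U << S_DONE);

    static int
    next_stage(unsigned pipeline, int cur)
    {
        int s;

        for (s = cur + 1; s < S_NSTAGES; s++)
            if (pipeline & (1U << s))
                return (s);
        return (-1);    /* pipeline complete */
    }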
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 62ebf19a61..aed7d53ba1 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -793,6 +793,21 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
+int
+vdev_probe(vdev_t *vd)
+{
+ if (vd == NULL)
+ return (EINVAL);
+
+ /*
+ * Right now we only support status checks on the leaf vdevs.
+ */
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (vd->vdev_ops->vdev_op_probe(vd));
+
+ return (0);
+}
+
/*
* Prepare a virtual device for access.
*/
@@ -919,6 +934,17 @@ vdev_open(vdev_t *vd)
}
/*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ error = vdev_probe(vd);
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (error);
+ }
+
+ /*
* If this is a top-level vdev, compute the raidz-deflation
* ratio. Note, we hard-code in 128k (1<<17) because it is the
* current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
@@ -1467,6 +1493,17 @@ vdev_fault(spa_t *spa, uint64_t guid)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev fault request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1499,7 +1536,7 @@ vdev_fault(spa_t *spa, uint64_t guid)
*/
vdev_reopen(vd);
- if (!vdev_is_dead(vd)) {
+ if (vdev_readable(vd)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
}
@@ -1523,6 +1560,17 @@ vdev_degrade(spa_t *spa, uint64_t guid)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev degrade request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1564,6 +1612,17 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev online request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1612,6 +1671,17 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev offline request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1662,9 +1732,11 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
* Clear the error counts associated with this vdev. Unlike vdev_online() and
* vdev_offline(), we assume the spa config is locked. We also clear all
* children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ * If 'reopen_wanted' is set then attempt to reopen the vdev if it is
+ * faulted or degraded.
*/
void
-vdev_clear(spa_t *spa, vdev_t *vd)
+vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
{
int c;
@@ -1674,16 +1746,17 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_is_failing = B_FALSE;
for (c = 0; c < vd->vdev_children; c++)
- vdev_clear(spa, vd->vdev_child[c]);
+ vdev_clear(spa, vd->vdev_child[c], reopen_wanted);
/*
* If we're in the FAULTED state, then clear the persistent state and
* attempt to reopen the device. We also mark the vdev config dirty, so
* that the new faulted state is written out to disk.
*/
- if (vd->vdev_faulted || vd->vdev_degraded) {
+ if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) {
vd->vdev_faulted = vd->vdev_degraded = 0;
vdev_reopen(vd);
vdev_config_dirty(vd->vdev_top);
@@ -1696,6 +1769,20 @@ vdev_clear(spa_t *spa, vdev_t *vd)
}
int
+vdev_readable(vdev_t *vd)
+{
+ /* XXPOLICY */
+ return (!vdev_is_dead(vd));
+}
+
+int
+vdev_writeable(vdev_t *vd)
+{
+ return (vd->vdev_ops->vdev_op_leaf ?
+ !vd->vdev_is_failing : !vdev_is_dead(vd));
+}
+
+int
vdev_is_dead(vdev_t *vd)
{
return (vd->vdev_state < VDEV_STATE_DEGRADED);
@@ -1800,7 +1887,7 @@ vdev_stat_update(zio_t *zio)
if (flags & ZIO_FLAG_SPECULATIVE)
return;
- if (!vdev_is_dead(vd)) {
+ if (vdev_readable(vd)) {
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ) {
if (zio->io_error == ECKSUM)
@@ -1962,9 +2049,9 @@ vdev_propagate_state(vdev_t *vd)
if (vd->vdev_children > 0) {
for (c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
- if (vdev_is_dead(child))
+ if (vdev_is_dead(child) && !vdev_readable(child))
faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ else if (child->vdev_state <= VDEV_STATE_DEGRADED)
degraded++;
if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
@@ -2020,7 +2107,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* want here. This is limited to leaf devices, because otherwise
* closing the device will affect other children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
if (vd->vdev_removed &&
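
The vdev.c hunks introduce an asymmetry between readability and writeability:
a leaf that has seen errors (vdev_is_failing) remains readable, but
single-copy writes must steer around it, while interior vdevs fall back to
the plain dead check. A sketch of that split, with an assumed state ordering
mirroring vdev_state_t:

    typedef struct vdev {
        int is_leaf;
        int is_failing;
        int state;          /* stand-in for vdev_state */
    } vdev_t;

    #define STATE_DEGRADED  5   /* assumed ordering, as in vdev_state_t */

    static int
    vdev_is_dead(const vdev_t *vd)
    {
        return (vd->state < STATE_DEGRADED);
    }

    static int
    vdev_readable(const vdev_t *vd)
    {
        return (!vdev_is_dead(vd));
    }

    static int
    vdev_writeable(const vdev_t *vd)
    {
        return (vd->is_leaf ? !vd->is_failing : !vdev_is_dead(vd));
    }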
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index a957c3671c..8bdd4d1f95 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -45,14 +45,11 @@ typedef struct vdev_disk_buf {
} vdev_disk_buf_t;
static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+vdev_disk_open_common(vdev_t *vd)
{
vdev_disk_t *dvd;
- struct dk_minfo dkm;
- int error;
dev_t dev;
- char *physpath, *minorname;
- int otyp;
+ int error;
/*
* We must have a pathname, and it must be absolute.
@@ -166,17 +163,34 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
&dvd->vd_lh, zfs_li);
}
- if (error) {
+ if (error)
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+
+ return (error);
+}
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_disk_t *dvd;
+ struct dk_minfo dkm;
+ int error;
+ dev_t dev;
+ int otyp;
+
+ error = vdev_disk_open_common(vd);
+ if (error)
return (error);
- }
+ dvd = vd->vdev_tsd;
/*
* Once a device is opened, verify that the physical device path (if
* available) is up to date.
*/
if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
+ char *physpath, *minorname;
+
physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
minorname = NULL;
if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
@@ -252,6 +266,113 @@ vdev_disk_close(vdev_t *vd)
vd->vdev_tsd = NULL;
}
+static int
+vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
+ int flags)
+{
+ buf_t buf;
+ int error = 0;
+ vdev_disk_t *dvd;
+
+ if (vd == NULL || (dvd = vd->vdev_tsd) == NULL || dvd->vd_lh == NULL)
+ return (EINVAL);
+
+ ASSERT(flags & B_READ || flags & B_WRITE);
+
+ bioinit(&buf);
+ buf.b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
+ buf.b_bcount = size;
+ buf.b_un.b_addr = (void *)data;
+ buf.b_lblkno = lbtodb(offset);
+ buf.b_bufsize = size;
+
+ error = ldi_strategy(dvd->vd_lh, &buf);
+ ASSERT(error == 0);
+ error = biowait(&buf);
+
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, EIO);
+
+ return (error);
+}
+
+static int
+vdev_disk_probe(vdev_t *vd)
+{
+ uint64_t offset;
+ vdev_t *nvd;
+ int l, error = 0, retries = 0;
+ char *vl_pad;
+
+ if (vd == NULL)
+ return (EINVAL);
+
+ /* Hijack the current vdev */
+ nvd = vd;
+
+ /*
+ * Pick a random label to rewrite.
+ */
+ l = spa_get_random(VDEV_LABELS);
+ ASSERT(l < VDEV_LABELS);
+
+ offset = vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_pad));
+
+ vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
+
+ /*
+ * Try to read and write to a special location on the
+ * label. We use the existing vdev initially and only
+ * try to create and reopen it if we encounter a failure.
+ */
+ while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
+ offset, B_READ)) != 0 && retries == 0) {
+
+ nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ if (vd->vdev_path)
+ nvd->vdev_path = spa_strdup(vd->vdev_path);
+ if (vd->vdev_physpath)
+ nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (vd->vdev_devid)
+ nvd->vdev_devid = spa_strdup(vd->vdev_devid);
+ nvd->vdev_wholedisk = vd->vdev_wholedisk;
+ nvd->vdev_guid = vd->vdev_guid;
+ retries++;
+
+ error = vdev_disk_open_common(nvd);
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ nvd->vdev_stat.vs_aux);
+ break;
+ }
+ }
+
+ if (!error) {
+ error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
+ offset, B_WRITE);
+ }
+
+ /* Clean up if we allocated a new vdev */
+ if (retries) {
+ vdev_disk_close(nvd);
+ if (nvd->vdev_path)
+ spa_strfree(nvd->vdev_path);
+ if (nvd->vdev_physpath)
+ spa_strfree(nvd->vdev_physpath);
+ if (nvd->vdev_devid)
+ spa_strfree(nvd->vdev_devid);
+ kmem_free(nvd, sizeof (vdev_t));
+ }
+ kmem_free(vl_pad, VDEV_SKIP_SIZE);
+
+ /* Reset the failing flag */
+ if (!error)
+ vd->vdev_is_failing = B_FALSE;
+
+ return (error);
+}
+
static void
vdev_disk_io_intr(buf_t *bp)
{
@@ -289,7 +410,7 @@ vdev_disk_io_start(zio_t *zio)
zio_vdev_io_bypass(zio);
/* XXPOLICY */
- if (vdev_is_dead(vd)) {
+ if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
zio_next_stage_async(zio);
return;
@@ -369,7 +490,11 @@ vdev_disk_io_start(zio_t *zio)
bp->b_iodone = (int (*)())vdev_disk_io_intr;
/* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ else
+ error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
if (error) {
zio->io_error = error;
bioerror(bp, error);
@@ -386,10 +511,6 @@ vdev_disk_io_start(zio_t *zio)
static void
vdev_disk_io_done(zio_t *zio)
{
- vdev_t *vd = zio->io_vd;
- vdev_disk_t *dvd = vd->vdev_tsd;
- int state;
-
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
@@ -401,15 +522,23 @@ vdev_disk_io_done(zio_t *zio)
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
- * asynchronous removal of the device.
+ * asynchronous removal of the device. Otherwise, probe the device and
+ * make sure it's still functional.
*/
if (zio->io_error == EIO) {
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ int state;
+
state = DKIO_NONE;
- if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+ if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
FKIOCTL, kcred, NULL) == 0 &&
state != DKIO_INSERTED) {
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ } else if (vdev_probe(vd) != 0) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vd->vdev_is_failing = B_TRUE;
}
}
@@ -419,6 +548,7 @@ vdev_disk_io_done(zio_t *zio)
vdev_ops_t vdev_disk_ops = {
vdev_disk_open,
vdev_disk_close,
+ vdev_disk_probe,
vdev_default_asize,
vdev_disk_io_start,
vdev_disk_io_done,
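
The vdev_disk.c probe reads a throwaway label region; on failure it opens one
scratch handle and retries, then verifies with a write, and clears the
failing flag only on full success. A control-flow sketch of that loop (the
callbacks here are hypothetical stand-ins for the LDI read/write/open paths):

    static int
    probe(int (*rd)(int scratch), int (*wr)(int scratch),
        int (*reopen)(void), int *is_failing)
    {
        int scratch = 0, error;

        while ((error = rd(scratch)) != 0 && scratch == 0) {
            scratch = 1;                /* one retry on a fresh handle */
            if ((error = reopen()) != 0)
                return (error);         /* can't even open: give up */
        }
        if (error == 0)
            error = wr(scratch);        /* verify writes work too */
        if (error == 0)
            *is_failing = 0;            /* device answered: clear flag */
        return (error);
    }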
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index b8e79f8c0c..6f099b6629 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,11 +37,10 @@
*/
static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+vdev_file_open_common(vdev_t *vd)
{
vdev_file_t *vf;
vnode_t *vp;
- vattr_t vattr;
int error;
/*
@@ -61,8 +60,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* to local zone users, so the underlying devices should be as well.
*/
ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
- error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
- 0, &vp, 0, 0, rootdir);
+ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
+ spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir);
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -81,11 +80,26 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
}
#endif
+ return (0);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_file_t *vf;
+ vattr_t vattr;
+ int error;
+
+ if ((error = vdev_file_open_common(vd)) != 0)
+ return (error);
+
+ vf = vd->vdev_tsd;
+
/*
* Determine the physical size of the file.
*/
vattr.va_mask = AT_SIZE;
- error = VOP_GETATTR(vp, &vattr, 0, kcred);
+ error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred);
if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
@@ -115,6 +129,89 @@ vdev_file_close(vdev_t *vd)
vd->vdev_tsd = NULL;
}
+static int
+vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
+ enum uio_rw rw)
+{
+ vdev_file_t *vf;
+ ssize_t resid;
+ int error = 0;
+
+ if (vd == NULL || (vf = vd->vdev_tsd) == NULL || vf->vf_vnode == NULL)
+ return (EINVAL);
+
+ ASSERT(rw == UIO_READ || rw == UIO_WRITE);
+
+ error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, &resid);
+ if (error || resid != 0)
+ return (EIO);
+ return (0);
+}
+
+static int
+vdev_file_probe(vdev_t *vd)
+{
+ vdev_t *nvd;
+ char *vl_boot;
+ uint64_t offset;
+ int l, error = 0, retries = 0;
+
+ if (vd == NULL)
+ return (EINVAL);
+
+ /* Hijack the current vdev */
+ nvd = vd;
+
+ /*
+ * Pick a random label to rewrite.
+ */
+ l = spa_get_random(VDEV_LABELS);
+ ASSERT(l < VDEV_LABELS);
+
+ offset = vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_boot_header));
+
+ vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
+
+ while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
+ offset, UIO_READ)) != 0 && retries == 0) {
+
+ /*
+ * If we failed with the vdev that was passed in then
+ * try allocating a new one and try again.
+ */
+ nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ if (vd->vdev_path)
+ nvd->vdev_path = spa_strdup(vd->vdev_path);
+ error = vdev_file_open_common(nvd);
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ nvd->vdev_stat.vs_aux);
+ break;
+ }
+ retries++;
+ }
+
+ if ((spa_mode & FWRITE) && !error) {
+ error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
+ offset, UIO_WRITE);
+ }
+
+ if (retries) {
+ vdev_file_close(nvd);
+ if (nvd->vdev_path)
+ spa_strfree(nvd->vdev_path);
+ kmem_free(nvd, sizeof (vdev_t));
+ }
+ kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
+
+ if (!error)
+ vd->vdev_is_failing = B_FALSE;
+
+ return (error);
+}
+
static void
vdev_file_io_start(zio_t *zio)
{
@@ -127,7 +224,7 @@ vdev_file_io_start(zio_t *zio)
zio_vdev_io_bypass(zio);
/* XXPOLICY */
- if (vdev_is_dead(vd)) {
+ if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
zio_next_stage_async(zio);
return;
@@ -161,7 +258,11 @@ vdev_file_io_start(zio_t *zio)
return;
/* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ else
+ error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
if (error) {
zio->io_error = error;
zio_next_stage_async(zio);
@@ -182,6 +283,21 @@ vdev_file_io_start(zio_t *zio)
static void
vdev_file_io_done(zio_t *zio)
{
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ /*
+ * If this device is truly gone, then attempt to remove it
+ * from the configuration.
+ */
+ if (zio->io_error == EIO) {
+ vdev_t *vd = zio->io_vd;
+
+ if (vdev_probe(vd) != 0)
+ vd->vdev_is_failing = B_TRUE;
+ }
+
vdev_queue_io_done(zio);
#ifndef _KERNEL
@@ -189,15 +305,13 @@ vdev_file_io_done(zio_t *zio)
vdev_cache_write(zio);
#endif
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
zio_next_stage(zio);
}
vdev_ops_t vdev_file_ops = {
vdev_file_open,
vdev_file_close,
+ vdev_file_probe,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
@@ -214,6 +328,7 @@ vdev_ops_t vdev_file_ops = {
vdev_ops_t vdev_disk_ops = {
vdev_file_open,
vdev_file_close,
+ vdev_file_probe,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 4b22a68fee..070444a093 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -321,7 +321,7 @@ vdev_label_read_config(vdev_t *vd)
ASSERT(spa_config_held(spa, RW_READER) ||
spa_config_held(spa, RW_WRITER));
- if (vdev_is_dead(vd))
+ if (!vdev_readable(vd))
return (NULL);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
@@ -902,7 +902,9 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
zio_t *zio;
- int l, error;
+ int l, last_error = 0, error = 0;
+ uint64_t good_writes = 0;
+ boolean_t retry_avail = B_TRUE;
ASSERT(ub->ub_txg <= txg);
@@ -941,6 +943,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
}
(void) zio_wait(zio);
+retry:
/*
* Sync out the even labels (L0, L2) for every dirty vdev. If the
* system dies in the middle of this process, that's OK: all of the
@@ -954,11 +957,29 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
if (l & 1)
continue;
if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
+ last_error = error;
+ else
+ good_writes++;
}
}
/*
+ * If all the vdevs that are currently dirty have failed or the
+ * spa_dirty_list is empty then we dirty all the vdevs and try again.
+ * This is a last ditch effort to ensure that we get at least one
+ * update before proceeding to the uberblock.
+ */
+ if (good_writes == 0 && retry_avail) {
+ vdev_config_dirty(rvd);
+ retry_avail = B_FALSE;
+ last_error = 0;
+ goto retry;
+ }
+
+ if (good_writes == 0)
+ return (last_error);
+
+ /*
* Flush the new labels to disk. This ensures that all even-label
* updates are committed to stable storage before the uberblock update.
*/
@@ -986,8 +1007,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
* will be the newest, and the even labels (which had all
* been successfully committed) will be valid with respect
* to the new uberblocks.
+ *
+ * NOTE: We retry the uberblock update on the root vdev if our
+ * initial update attempt failed.
*/
- if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+ error = vdev_uberblock_sync_tree(spa, ub, uvd, txg);
+ if (error && uvd != rvd)
+ error = vdev_uberblock_sync_tree(spa, ub, rvd, txg);
+
+ if (error)
return (error);
/*
@@ -999,6 +1027,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ last_error = 0;
/*
* Sync out odd labels for every dirty vdev. If the system dies
* in the middle of this process, the even labels and the new
@@ -1013,10 +1042,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg)
if ((l & 1) == 0)
continue;
if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
+ last_error = error;
+ else
+ good_writes++;
}
}
+ if (good_writes == 0)
+ return (last_error);
+
/*
* Flush the new labels to disk. This ensures that all odd-label
* updates are committed to stable storage before the next
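
The vdev_label.c change relaxes label syncing from "every write must
succeed" to "at least one write must succeed": successes are counted, and if
every label update failed, one retry pass is made after re-dirtying all
vdevs before giving up with the last error. A sketch of that logic:

    static int
    sync_labels(int nvdevs, int (*sync_one)(int vd), void (*dirty_all)(void))
    {
        int good_writes = 0, last_error = 0, retried = 0;
        int vd, error;

    retry:
        for (vd = 0; vd < nvdevs; vd++) {
            if ((error = sync_one(vd)) != 0)
                last_error = error;
            else
                good_writes++;
        }
        if (good_writes == 0 && !retried) {
            dirty_all();        /* last-ditch: dirty every vdev */
            retried = 1;
            last_error = 0;
            goto retry;
        }
        return (good_writes == 0 ? last_error : 0);
    }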
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 73d1a83d94..45d326ae69 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -219,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio)
/*
* Try to find a child whose DTL doesn't contain the block to read.
* If a child is known to be completely inaccessible (indicated by
- * vdev_is_dead() returning B_TRUE), don't even try.
+ * vdev_readable() returning B_FALSE), don't even try.
*/
for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
if (c >= mm->mm_children)
@@ -227,7 +227,7 @@ vdev_mirror_child_select(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (vdev_is_dead(mc->mc_vd)) {
+ if (vdev_is_dead(mc->mc_vd) && !vdev_readable(mc->mc_vd)) {
mc->mc_error = ENXIO;
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
@@ -464,6 +464,7 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
vdev_ops_t vdev_mirror_ops = {
vdev_mirror_open,
vdev_mirror_close,
+ NULL,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
@@ -475,6 +476,7 @@ vdev_ops_t vdev_mirror_ops = {
vdev_ops_t vdev_replacing_ops = {
vdev_mirror_open,
vdev_mirror_close,
+ NULL,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
@@ -486,6 +488,7 @@ vdev_ops_t vdev_replacing_ops = {
vdev_ops_t vdev_spare_ops = {
vdev_mirror_open,
vdev_mirror_close,
+ NULL,
vdev_default_asize,
vdev_mirror_io_start,
vdev_mirror_io_done,
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index b35f4a5bcd..3aa831c46d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -77,9 +76,17 @@ vdev_missing_io_done(zio_t *zio)
zio_next_stage(zio);
}
+/* ARGSUSED */
+static int
+vdev_missing_probe(vdev_t *vd)
+{
+ return (0);
+}
+
vdev_ops_t vdev_missing_ops = {
vdev_missing_open,
vdev_missing_close,
+ vdev_missing_probe,
vdev_default_asize,
vdev_missing_io_start,
vdev_missing_io_done,
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 0c86630765..73a3ae2565 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -686,7 +686,7 @@ vdev_raidz_io_start(zio_t *zio)
for (c = rm->rm_cols - 1; c >= 0; c--) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
- if (vdev_is_dead(cvd)) {
+ if (!vdev_readable(cvd)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
@@ -1228,6 +1228,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
+ NULL,
vdev_raidz_asize,
vdev_raidz_io_start,
vdev_raidz_io_done,
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index 0e8752c6ce..77829c0aa3 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,18 +44,17 @@
* probably fine. Adding bean counters during alloc/free can make this
* future guesswork more accurate.
*/
-/*ARGSUSED*/
static int
too_many_errors(vdev_t *vd, int numerrors)
{
- return (numerrors > 0);
+ ASSERT3U(numerrors, <=, vd->vdev_children);
+ return (numerrors == vd->vdev_children);
}
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- vdev_t *cvd;
- int c, error;
+ int c;
int lasterror = 0;
int numerrors = 0;
@@ -65,7 +64,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
}
for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_t *cvd = vd->vdev_child[c];
+ int error;
if ((error = vdev_open(cvd)) != 0) {
lasterror = error;
@@ -74,9 +74,15 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
}
}
- if (too_many_errors(vd, numerrors)) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
+ if (numerrors > 0) {
+ if (!too_many_errors(vd, numerrors)) {
+ /* XXX - should not be explicitly setting this state */
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+ VDEV_AUX_NO_REPLICAS);
+ } else {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
}
*asize = 0;
@@ -97,18 +103,24 @@ vdev_root_close(vdev_t *vd)
static void
vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
{
- if (too_many_errors(vd, faulted))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded != 0)
+ if (faulted) {
+ if (too_many_errors(vd, faulted))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+ VDEV_AUX_NO_REPLICAS);
+ } else if (degraded) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
+ } else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
}
vdev_ops_t vdev_root_ops = {
vdev_root_open,
vdev_root_close,
+ NULL,
vdev_default_asize,
NULL, /* io_start - not applicable to the root */
NULL, /* io_done - not applicable to the root */
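
The vdev_root.c change redefines too_many_errors(): the root only becomes
unopenable when *every* child has failed; a partial failure now leaves the
pool up but faulted/degraded. A tiny sketch of the new state decision:

    static int
    root_state(int faulted_children, int nchildren)
    {
        if (faulted_children == nchildren)
            return (-1);    /* CANT_OPEN: no replicas at all */
        if (faulted_children > 0)
            return (1);     /* FAULTED children, pool still usable */
        return (0);         /* healthy */
    }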
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 4a5e68b878..54158d03f2 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -2073,6 +2073,17 @@ zfs_ioc_clear(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
+ /*
+ * Try to resume any I/Os which may have been suspended
+ * as a result of a complete pool failure.
+ */
+ if (!list_is_empty(&spa->spa_zio_list)) {
+ if (zio_vdev_resume_io(spa) != 0) {
+ spa_close(spa, FTAG);
+ return (EIO);
+ }
+ }
+
txg = spa_vdev_enter(spa);
if (zc->zc_guid == 0) {
@@ -2083,7 +2094,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
return (ENODEV);
}
- vdev_clear(spa, vd);
+ vdev_clear(spa, vd, B_TRUE);
(void) spa_vdev_exit(spa, NULL, txg, 0);
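
The zfs_ioctl.c hunk orders 'zpool clear' so that suspended I/O is resumed
before any device state is cleared; if the resume fails the ioctl returns
EIO and leaves the pool in the failed state. A sketch of that ordering
(helper names here are hypothetical):

    #include <errno.h>

    static int
    ioc_clear(int have_suspended_io, int (*resume_io)(void),
        void (*clear_vdev_state)(void))
    {
        if (have_suspended_io && resume_io() != 0)
            return (EIO);       /* pool is still failed: don't clear */
        clear_vdev_state();     /* safe to reset error counters now */
        return (0);
    }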
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 40670d1321..103c9d9cad 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -66,6 +66,14 @@ uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
+uint16_t zio_io_fail_shift = 0;
+
+/* Enable/disable the write-retry logic */
+int zio_write_retry = 1;
+
+/* Taskq to handle reissuing of I/Os */
+taskq_t *zio_taskq;
+int zio_resume_threads = 4;
typedef struct zio_sync_pass {
int zp_defer_free; /* defer frees after this pass */
@@ -79,6 +87,8 @@ zio_sync_pass_t zio_sync_pass = {
1, /* zp_rewrite */
};
+static boolean_t zio_io_should_fail(uint16_t);
+
/*
* ==========================================================================
* I/O kmem caches
@@ -92,6 +102,34 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
extern vmem_t *zio_alloc_arena;
#endif
+/*
+ * Determine if we are allowed to issue the IO based on the
+ * pool state. If we must wait then block until we are told
+ * that we may continue.
+ */
+#define ZIO_ENTER(spa) { \
+ if (spa->spa_state == POOL_STATE_IO_FAILURE) { \
+ mutex_enter(&spa->spa_zio_lock); \
+ while (spa->spa_state == POOL_STATE_IO_FAILURE) \
+ cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \
+ mutex_exit(&spa->spa_zio_lock); \
+ } \
+}
+
+/*
+ * An allocation zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define IO_IS_ALLOCATING(zio) \
+ ((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE || \
+ (zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+
+/*
+ * The only way to tell that a zio is a rewrite is by looking for
+ * the gang pipeline stage.
+ */
+#define IO_IS_REWRITE(zio) \
+ ((zio)->io_pipeline & (1U << ZIO_STAGE_GANG_PIPELINE))
+
void
zio_init(void)
{
@@ -153,6 +191,9 @@ zio_init(void)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+ zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
+ maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+
zio_inject_init();
}
@@ -177,6 +218,8 @@ zio_fini(void)
zio_data_buf_cache[c] = NULL;
}
+ taskq_destroy(zio_taskq);
+
kmem_cache_destroy(zio_cache);
zio_inject_fini();
@@ -386,9 +429,27 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
mutex_exit(&pio->io_lock);
}
+ /*
+ * Save off the original state in case we need to retry later.
+ */
+ zio->io_orig_stage = zio->io_stage;
+ zio->io_orig_pipeline = zio->io_pipeline;
+ zio->io_orig_flags = zio->io_flags;
+
return (zio);
}
+static void
+zio_reset(zio_t *zio)
+{
+ zio_clear_transform_stack(zio);
+
+ zio->io_flags = zio->io_orig_flags;
+ zio->io_stage = zio->io_orig_stage;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
+}
+
zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
int flags)
@@ -417,6 +478,13 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+ /*
+ * If the user has specified that we allow I/Os to continue
+ * then attempt to satisfy the read.
+ */
+ if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
+ ZIO_ENTER(spa);
+
zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
@@ -429,22 +497,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
*/
zio->io_bp = &zio->io_bp_copy;
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
-
- zio_push_transform(zio, cbuf, csize, csize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
- }
-
- if (BP_IS_GANG(bp)) {
- uint64_t gsize = SPA_GANGBLOCKSIZE;
- void *gbuf = zio_buf_alloc(gsize);
-
- zio_push_transform(zio, gbuf, gsize, gsize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
- }
-
return (zio);
}
@@ -462,6 +514,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
ASSERT(compress >= ZIO_COMPRESS_OFF &&
compress < ZIO_COMPRESS_FUNCTIONS);
+ ZIO_ENTER(spa);
+
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
@@ -515,6 +569,16 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
return (zio);
}
+static void
+zio_write_allocate_ready(zio_t *zio)
+{
+ /* Free up the previous block */
+ if (!BP_IS_HOLE(&zio->io_bp_orig)) {
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ &zio->io_bp_orig, NULL, NULL));
+ }
+}
+
static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
@@ -533,6 +597,7 @@ zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
zio->io_checksum = checksum;
zio->io_compress = ZIO_COMPRESS_OFF;
+ zio->io_ready = zio_write_allocate_ready;
return (zio);
}
@@ -649,6 +714,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *zio;
blkptr_t blk;
+ ZIO_ENTER(vd->vdev_spa);
+
zio_phys_bp_init(vd, &blk, offset, size, checksum);
zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
@@ -676,6 +743,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *zio;
blkptr_t blk;
+ ZIO_ENTER(vd->vdev_spa);
+
zio_phys_bp_init(vd, &blk, offset, size, checksum);
zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
@@ -801,6 +870,7 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
mutex_enter(&pio->io_lock);
if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
pio->io_error = zio->io_error;
+ ASSERT3U(*countp, >, 0);
if (--*countp == 0 && pio->io_stalled == stage) {
pio->io_stalled = 0;
mutex_exit(&pio->io_lock);
@@ -825,6 +895,27 @@ zio_wait_children_done(zio_t *zio)
}
static void
+zio_read_init(zio_t *zio)
+{
+ if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(zio->io_bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (BP_IS_GANG(zio->io_bp)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+ zio_next_stage(zio);
+}
+
+static void
zio_ready(zio_t *zio)
{
zio_t *pio = zio->io_parent;
@@ -843,9 +934,151 @@ zio_ready(zio_t *zio)
}
static void
-zio_done(zio_t *zio)
+zio_vdev_retry_io(zio_t *zio)
{
zio_t *pio = zio->io_parent;
+
+ /*
+ * Preserve the failed bp so that the io_ready() callback can
+ * update the accounting accordingly. The callback will also be
+ * responsible for freeing the previously allocated block, if one
+ * exists.
+ */
+ zio->io_bp_orig = *zio->io_bp;
+
+ /*
+ * We must zero out the old DVA and blk_birth before reallocating
+ * the bp. We don't want to do this if this is a rewrite however.
+ */
+ if (!IO_IS_REWRITE(zio)) {
+ BP_ZERO_DVAS(zio->io_bp);
+ }
+
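+	/* Reset the pipeline state so the write can be reissued from scratch. */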
+ zio_reset(zio);
+
+ if (pio) {
+ /*
+ * Let the parent know that we will
+ * re-alloc the write (=> new bp info).
+ */
+ mutex_enter(&pio->io_lock);
+ pio->io_children_notready++;
+
+ /*
+ * If the parent I/O is still in the open stage, then
+ * don't bother telling it to retry since it hasn't
+ * progressed far enough for it to care.
+ */
+ if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
+ pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
+
+ ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
+ mutex_exit(&pio->io_lock);
+ }
+
+ /*
+ * We are getting ready to process the retry request so clear
+ * the flag and the zio's current error status.
+ */
+ zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
+ zio->io_error = 0;
+ zio_next_stage_async(zio);
+}
+
+int
+zio_vdev_resume_io(spa_t *spa)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_zio_lock);
+
+ /*
+	 * Probe all of the vdevs that have experienced an I/O error.
+	 * If we are still unable to verify the integrity of a vdev
+	 * then we prevent the resume from proceeding.
+ */
+ for (zio = list_head(&spa->spa_zio_list); zio != NULL;
+ zio = list_next(&spa->spa_zio_list, zio)) {
+ int error = 0;
+
+ /* We only care about I/Os that must succeed */
+ if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
+ continue;
+ error = vdev_probe(zio->io_vd);
+ if (error) {
+ mutex_exit(&spa->spa_zio_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * Clear the vdev stats so that I/O can flow.
+ */
+ vdev_clear(spa, NULL, B_FALSE);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
+ list_remove(&spa->spa_zio_list, zio);
+ zio->io_error = 0;
+
+ /*
+ * If we are resuming an allocating I/O then we force it
+ * to retry and let it resume operation where it left off.
+ * Otherwise, go back to the ready stage and pick up from
+ * there.
+ */
+ if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
+ zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
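+			/*
+			 * Back up one stage so that the resubmit, which
+			 * advances to the next stage, re-runs the stage
+			 * this I/O was suspended in.
+			 */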
+ zio->io_stage--;
+ } else {
+ zio->io_stage = ZIO_STAGE_READY;
+ }
+
+ (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
+ zio, TQ_SLEEP);
+ }
+ mutex_exit(&spa->spa_zio_lock);
+
+ /*
+	 * Wait for the taskq to finish and recheck the pool state since
+ * it's possible that a resumed I/O has failed again.
+ */
+ taskq_wait(zio_taskq);
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
+ mutex_enter(&spa->spa_zio_lock);
+ cv_broadcast(&spa->spa_zio_cv);
+ mutex_exit(&spa->spa_zio_lock);
+
+ return (0);
+}
+
+static void
+zio_vdev_suspend_io(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ /*
+	 * We've experienced an unrecoverable failure, so set the
+	 * pool state accordingly and queue all failed I/Os.
+ */
+ spa->spa_state = POOL_STATE_IO_FAILURE;
+
+ mutex_enter(&spa->spa_zio_lock);
+ list_insert_tail(&spa->spa_zio_list, zio);
+
+#ifndef _KERNEL
+ /* Used to notify ztest that the pool has suspended */
+ cv_broadcast(&spa->spa_zio_cv);
+#endif
+ mutex_exit(&spa->spa_zio_lock);
+}
+
+static void
+zio_assess(zio_t *zio)
+{
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
vdev_t *vd = zio->io_vd;
@@ -868,6 +1101,14 @@ zio_done(zio_t *zio)
}
}
+ /*
+ * Some child I/O has indicated that a retry is necessary, so
+ * we set an error on the I/O and let the logic below do the
+ * rest.
+ */
+ if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
+ zio->io_error = ERESTART;
+
if (vd != NULL)
vdev_stat_update(zio);
@@ -879,8 +1120,7 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO,
- zio->io_spa, vd, zio, 0, 0);
+ zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
if ((zio->io_error == EIO ||
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
@@ -890,32 +1130,80 @@ zio_done(zio_t *zio)
* appropriately. Also, generate a logical data
* ereport.
*/
- spa_log_error(zio->io_spa, zio);
+ spa_log_error(spa, zio);
- zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, zio, 0, 0);
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
+ 0, 0);
}
/*
- * For I/O requests that cannot fail, panic appropriately.
+	 * If this is an allocating I/O then we retry it on another
+	 * vdev unless the pool is out of space. We handle this
+	 * condition based on the spa's failmode property.
+ */
+ if (zio_write_retry && zio->io_error != ENOSPC &&
+ IO_IS_ALLOCATING(zio) &&
+ zio->io_flags & ZIO_FLAG_WRITE_RETRY) {
+ zio_vdev_retry_io(zio);
+ return;
+ }
+ ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
+
+ /*
+ * For I/O requests that cannot fail, we carry out
+ * the requested behavior based on the failmode pool
+ * property.
+ *
+ * XXX - Need to differentiate between an ENOSPC as
+ * a result of vdev failures vs. a full pool.
*/
if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
char *blkbuf;
+#ifdef ZFS_DEBUG
blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
if (blkbuf) {
sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
bp ? bp : &zio->io_bp_copy);
}
- panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
- "%d", zio->io_error == ECKSUM ?
+ cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
+ "%s): error %d", zio->io_error == ECKSUM ?
"bad checksum" : "I/O failure",
zio_type_name[zio->io_type],
vdev_description(vd),
(u_longlong_t)zio->io_offset,
- zio, blkbuf ? blkbuf : "", zio->io_error);
+			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
+			if (blkbuf)
+				kmem_free(blkbuf, BP_SPRINTF_LEN);
+#endif
+
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
+ fm_panic("Pool '%s' has encountered an "
+ "uncorrectable I/O failure and the "
+ "failure mode property for this pool "
+ "is set to panic.", spa_name(spa));
+ } else {
+ cmn_err(CE_WARN, "Pool '%s' has encountered "
+ "an uncorrectable I/O error. Manual "
+ "intervention is required.",
+ spa_name(spa));
+ zio_vdev_suspend_io(zio);
+ }
+ return;
}
}
+ ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
+ ASSERT(zio->io_children_notready == 0);
+ zio_next_stage(zio);
+}
+
+static void
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
zio_clear_transform_stack(zio);
if (zio->io_done)
@@ -1099,7 +1387,7 @@ zio_get_gang_header(zio_t *zio)
zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
+ ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
zio_wait_children_done(zio);
}
@@ -1244,7 +1532,7 @@ zio_write_allocate_gang_member_done(zio_t *zio)
mutex_exit(&pio->io_lock);
}
-static void
+static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
blkptr_t *bp = zio->io_bp;
@@ -1266,9 +1554,8 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
B_FALSE);
- if (error == ENOSPC)
- panic("can't allocate gang block header");
- ASSERT(error == 0);
+ if (error)
+ return (error);
for (d = 0; d < gbh_ndvas; d++)
DVA_SET_GANG(&dva[d], 1);
@@ -1296,8 +1583,9 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
if (error == 0)
break;
ASSERT3U(error, ==, ENOSPC);
+ /* XXX - free up previous allocations? */
if (maxalloc == SPA_MINBLOCKSIZE)
- panic("really out of space");
+ return (error);
maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
}
@@ -1336,6 +1624,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
* to be stable.
*/
zio_wait_children_done(zio);
+ return (0);
}
/*
@@ -1358,10 +1647,23 @@ zio_dva_allocate(zio_t *zio)
/* For testing, make some blocks above a certain size be gang blocks */
if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
- zio_write_allocate_gang_members(zio, mc);
+ error = zio_write_allocate_gang_members(zio, mc);
+ if (error)
+ zio->io_error = error;
return;
}
+ /*
+	 * For testing purposes, we force I/Os to retry. We don't allow
+	 * retries beyond the first pass, since I/Os issued in later
+	 * passes are non-allocating writes. We do this after the gang
+	 * block test above so that gang I/Os don't inherit the retry flag.
+ */
+ if (zio_io_fail_shift &&
+ spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
+ zio_io_should_fail(zio_io_fail_shift))
+ zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
+
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
@@ -1369,11 +1671,11 @@ zio_dva_allocate(zio_t *zio)
if (error == 0) {
bp->blk_birth = zio->io_txg;
- } else if (error == ENOSPC) {
- if (zio->io_size == SPA_MINBLOCKSIZE)
- panic("really, truly out of space");
- zio_write_allocate_gang_members(zio, mc);
- return;
+ } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
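+		/* Retry the write as a gang block of smaller allocations. */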
+ error = zio_write_allocate_gang_members(zio, mc);
+ if (error == 0)
+ return;
+ zio->io_error = error;
} else {
zio->io_error = error;
}
@@ -1413,6 +1715,18 @@ zio_vdev_io_start(zio_t *zio)
vdev_t *tvd = vd ? vd->vdev_top : NULL;
blkptr_t *bp = zio->io_bp;
uint64_t align;
+ spa_t *spa = zio->io_spa;
+
+ /*
+	 * If the pool is already in a failure state then just suspend
+	 * this I/O until the problem is resolved; it will be reissued
+	 * when the pool resumes.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
+ zio->io_type == ZIO_TYPE_WRITE) {
+ zio_vdev_suspend_io(zio);
+ return;
+ }
if (vd == NULL) {
/* The mirror_ops handle multiple DVAs in a single BP */
@@ -1662,6 +1976,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
zio_dva_claim,
zio_gang_checksum_generate,
zio_ready,
+ zio_read_init,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
@@ -1669,6 +1984,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
zio_checksum_verify,
zio_read_gang_members,
zio_read_decompress,
+ zio_assess,
zio_done,
zio_badop
};
@@ -1762,12 +2078,20 @@ zio_next_stage_async(zio_t *zio)
}
}
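+/*
+ * Taskq callback used by zio_vdev_resume_io() to restart a
+ * suspended I/O in its pipeline.
+ */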
+void
+zio_resubmit_stage_async(void *arg)
+{
+ zio_t *zio = (zio_t *)(uintptr_t)arg;
+
+ zio_next_stage_async(zio);
+}
+
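+/*
+ * Fault-injection helper: fires once every 2^range calls.
+ */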
static boolean_t
-zio_alloc_should_fail(void)
+zio_io_should_fail(uint16_t range)
{
static uint16_t allocs = 0;
- return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
+ return (P2PHASE(allocs++, 1U<<range) == 0);
}
/*
@@ -1781,7 +2105,7 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
spa_config_enter(spa, RW_READER, FTAG);
- if (zio_zil_fail_shift && zio_alloc_should_fail()) {
+ if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
spa_config_exit(spa, FTAG);
return (ENOSPC);
}