Diffstat (limited to 'usr/src/uts/common/fs')
24 files changed, 1027 insertions, 137 deletions
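
Before the per-file hunks, a brief orientation: this patch introduces a pool-wide 'failmode' property (the ZIO_FAILURE_MODE_WAIT/CONTINUE/PANIC values added to sys/zio.h), gates new I/O with a ZIO_ENTER() check that blocks on spa_zio_cv while the pool is in POOL_STATE_IO_FAILURE, parks failed zios on spa_zio_list, and resumes them from zio_vdev_resume_io() once 'zpool clear' has re-probed the vdevs. The listing below is a minimal user-level sketch of that gate-and-resume pattern only; it assumes pthreads in place of the kernel mutex/cv primitives, and pool_t, pool_io_enter(), and pool_resume() are invented names for illustration, not part of the patch.

    /*
     * Standalone sketch (not part of the patch) of the failmode gating
     * this change adds to the ZIO pipeline.  pthreads stand in for the
     * kernel mutex/cv; pool_t and the function names are illustrative.
     */
    #include <pthread.h>
    #include <stdio.h>

    enum { FAILMODE_WAIT, FAILMODE_CONTINUE, FAILMODE_PANIC };

    typedef struct pool {
            pthread_mutex_t p_lock;         /* ~ spa_zio_lock */
            pthread_cond_t  p_cv;           /* ~ spa_zio_cv */
            int             p_failed;       /* ~ POOL_STATE_IO_FAILURE */
            int             p_failmode;     /* ~ spa_failmode */
    } pool_t;

    /* Rough analogue of ZIO_ENTER(): gate new I/O on the pool state. */
    static void
    pool_io_enter(pool_t *p, int is_read)
    {
            /* In "continue" mode reads are still attempted (may get EIO). */
            if (is_read && p->p_failmode == FAILMODE_CONTINUE)
                    return;
            pthread_mutex_lock(&p->p_lock);
            while (p->p_failed)
                    pthread_cond_wait(&p->p_cv, &p->p_lock);
            pthread_mutex_unlock(&p->p_lock);
    }

    /* Rough analogue of zio_vdev_resume_io(): clear state, wake waiters. */
    static void
    pool_resume(pool_t *p)
    {
            pthread_mutex_lock(&p->p_lock);
            p->p_failed = 0;
            pthread_cond_broadcast(&p->p_cv);
            pthread_mutex_unlock(&p->p_lock);
    }

    int
    main(void)
    {
            pool_t p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
                1, FAILMODE_CONTINUE };

            pool_io_enter(&p, 1);   /* read: not blocked in continue mode */
            pool_resume(&p);        /* as if 'zpool clear' succeeded */
            pool_io_enter(&p, 0);   /* write: pool healthy again, proceeds */
            (void) printf("resumed\n");
            return (0);
    }

The same trade-off appears in the patch itself: 'wait' blocks callers on a condition variable so nothing observes a half-failed pool, while 'continue' lets reads and non-blocking transaction assigns (dmu_tx_try_assign()) fail with EIO rather than hang.
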
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index f1c2de5a07..aafce2d68e 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -2533,12 +2533,34 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { + if (zio->io_error == 0 && callback->awcb_ready) { ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); callback->awcb_ready(zio, buf, callback->awcb_private); } + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. It is + * the responsibility of the callback to handle the freeing + * and accounting for any re-write attempt. If we don't have a + * callback registered then simply free the block here. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + if (!BP_IS_HOLE(&zio->io_bp_orig) && + callback->awcb_ready == NULL) { + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + &zio->io_bp_orig, NULL, NULL)); + } + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); + } arc_cksum_compute(buf); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2635,7 +2657,6 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, priority, flags, zb); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 2758d84791..0f687ff66d 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -739,12 +739,26 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; int i; + ASSERT(bp == zio->io_bp); + /* * Update rootbp fill count. */ bp->blk_fill = 1; /* count the meta-dnode */ for (i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + + BP_SET_TYPE(bp, DMU_OT_OBJSET); + BP_SET_LEVEL(bp, 0); + + /* We must do this after we've set the bp's type and level */ + if (!DVA_EQUAL(BP_IDENTITY(bp), + BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, NULL, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + } } /* ARGSUSED */ @@ -754,18 +768,6 @@ killer(zio_t *zio, arc_buf_t *abuf, void *arg) objset_impl_t *os = arg; ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); - } arc_release(os->os_phys_buf, &os->os_phys_buf); } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 13fd8d4d9d..f89878facf 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -734,11 +734,30 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + spa_t *spa = tx->tx_pool->dp_spa; ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) return (tx->tx_err); + if (spa_state(spa) == POOL_STATE_IO_FAILURE) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; @@ -885,10 +904,19 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) void dmu_tx_wait(dmu_tx_t *tx) { + spa_t *spa = tx->tx_pool->dp_spa; + ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - if (tx->tx_needassign_txh) { + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then his + * tx_lasttried_txg would not have been assigned. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE || + tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 4fcc6bfd79..b2840e4e87 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -773,6 +773,20 @@ top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* + * Dont allocate from faulted devices + */ + if (!vdev_writeable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } ASSERT(mg->mg_class == mc); @@ -828,6 +842,7 @@ top: return (0); } +next: mc->mc_rotor = mg->mg_next; mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index a838b0f45b..a780a2ca1f 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -362,6 +362,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) dmu_objset_close(os); } break; + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = EINVAL; + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. 
+ */ + if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { + spa->spa_failmode = intval; + error = EIO; + } + break; } if (error) @@ -477,6 +498,8 @@ spa_activate(spa_t *spa) list_create(&spa->spa_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_dirty_node)); + list_create(&spa->spa_zio_list, sizeof (zio_t), + offsetof(zio_t, zio_link_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); @@ -506,6 +529,7 @@ spa_deactivate(spa_t *spa) txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_dirty_list); + list_destroy(&spa->spa_zio_list); for (t = 0; t < ZIO_TYPES; t++) { taskq_destroy(spa->spa_zio_issue_taskq[t]); @@ -1077,6 +1101,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_DELEGATION), sizeof (uint64_t), 1, &spa->spa_delegation); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + sizeof (uint64_t), 1, &spa->spa_failmode); } /* @@ -1618,6 +1646,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); if (props) spa_sync_props(spa, props, CRED(), tx); @@ -3091,7 +3120,7 @@ spa_async_remove(spa_t *spa, vdev_t *vd) tvd->vdev_remove_wanted = 0; vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, tvd); + vdev_clear(spa, tvd, B_TRUE); vdev_config_dirty(tvd->vdev_top); } spa_async_remove(spa, tvd); @@ -3122,8 +3151,14 @@ spa_async_thread(spa_t *spa) /* * See if any devices need to be marked REMOVED. + * + * XXX - We avoid doing this when we are in + * I/O failure state since spa_vdev_enter() grabs + * the namespace lock and would not be able to obtain + * the writer config lock. */ - if (tasks & SPA_ASYNC_REMOVE) { + if (tasks & SPA_ASYNC_REMOVE && + spa_state(spa) != POOL_STATE_IO_FAILURE) { txg = spa_vdev_enter(spa); spa_async_remove(spa, spa->spa_root_vdev); (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -3379,7 +3414,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &intval) == 0); spa->spa_temporary = intval; break; - default: /* * Set pool property values in the poolprops mos object. @@ -3425,11 +3459,19 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(0); /* not allowed */ } - if (prop == ZPOOL_PROP_DELEGATION) + switch (prop) { + case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; - - if (prop == ZPOOL_PROP_BOOTFS) + break; + case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + default: + break; + } } /* log internal history if this is not a zpool create */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 8065ae85b6..5cb0890586 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -277,6 +277,8 @@ spa_add(const char *name, const char *altroot) avl_add(&spa_namespace_avl, spa); + mutex_init(&spa->spa_zio_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Set the alternate root, if there is one. 
*/ @@ -332,6 +334,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_sync_bplist.bpl_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_zio_lock); kmem_free(spa, sizeof (spa_t)); } @@ -989,6 +992,16 @@ spa_get_asize(spa_t *spa, uint64_t lsize) return (lsize * 6); } +/* + * Return the failure mode that has been set to this pool. The default + * behavior will be to block all I/Os when a complete failure occurs. + */ +uint8_t +spa_get_failmode(spa_t *spa) +{ + return (spa->spa_failmode); +} + uint64_t spa_version(spa_t *spa) { diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 9263b31172..a15e5ff815 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -298,6 +298,7 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; uint64_t mapstart = sm->sm_start; + int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -335,9 +336,10 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - VERIFY3U(dmu_read(os, smo->smo_object, offset, size, - entry_map), ==, 0); + error = dmu_read(os, smo->smo_object, offset, size, entry_map); mutex_enter(sm->sm_lock); + if (error != 0) + goto out; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -354,18 +356,19 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, } VERIFY3U(sm->sm_space, ==, space); + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; +out: zio_buf_free(entry_map, bufsize); sm->sm_loading = B_FALSE; - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; cv_broadcast(&sm->sm_load_cv); - if (ops != NULL) + if (!error && ops != NULL) ops->smop_load(sm); - return (0); + return (error); } void diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index cb5e09e4b0..032ead7f37 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -274,7 +274,7 @@ typedef struct blkptr { #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) #define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -282,11 +282,16 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_birth = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp) \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_pad[2] = 0; \ - (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } @@ -423,6 +428,7 @@ extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_version(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 18371aa13f..0310f985b8 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -141,6 +141,10 @@ struct spa { uint64_t spa_bootfs; /* default boot filesystem */ boolean_t 
spa_delegation; /* delegation on/off */ boolean_t spa_temporary; /* temporary on/off */ + list_t spa_zio_list; /* zio error list */ + kcondvar_t spa_zio_cv; /* resume I/O pipeline */ + kmutex_t spa_zio_lock; /* zio error lock */ + uint8_t spa_failmode; /* failure mode for the pool */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index c651d1eebb..dced3da5ff 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -54,6 +54,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_spare(vdev_t *); +extern int vdev_probe(vdev_t *); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); @@ -89,10 +90,12 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); -extern void vdev_clear(spa_t *spa, vdev_t *vd); +extern void vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); extern int vdev_is_dead(vdev_t *vd); +extern int vdev_readable(vdev_t *vd); +extern int vdev_writeable(vdev_t *vd); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index e279bb2495..6fa21e83b0 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -60,6 +60,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); +typedef int vdev_probe_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); @@ -68,6 +69,7 @@ typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; + vdev_probe_func_t *vdev_op_probe; vdev_asize_func_t *vdev_op_asize; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; @@ -174,6 +176,7 @@ struct vdev { uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_is_failing; /* device errors seen */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index 8a689e0760..a5be3e1303 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -62,6 +62,7 @@ extern "C" { #include <sys/zfs_debug.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> +#include <sys/fm/util.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 0f38aae47d..cc08976074 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -107,6 +107,10 @@ enum zio_compress { #define 
ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) @@ -144,6 +148,7 @@ enum zio_compress { #define ZIO_FLAG_USER 0x20000 #define ZIO_FLAG_METADATA 0x40000 +#define ZIO_FLAG_WRITE_RETRY 0x80000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -217,6 +222,7 @@ struct zio { zio_t *io_sibling_next; zio_transform_t *io_transform_stack; zio_t *io_logical; + list_node_t zio_link_node; /* Callback info */ zio_done_func_t *io_ready; @@ -242,8 +248,10 @@ struct zio { /* Internal pipeline state */ int io_flags; + int io_orig_flags; enum zio_type io_type; enum zio_stage io_stage; + enum zio_stage io_orig_stage; uint8_t io_stalled; uint8_t io_priority; struct dk_callback io_dk_callback; @@ -252,6 +260,7 @@ struct zio { int io_error; uint32_t io_numerrors; uint32_t io_pipeline; + uint32_t io_orig_pipeline; uint32_t io_async_stages; uint64_t io_children_notready; uint64_t io_children_notdone; @@ -320,6 +329,7 @@ extern void zio_data_buf_free(void *buf, size_t size); */ extern void zio_next_stage(zio_t *zio); extern void zio_next_stage_async(zio_t *zio); +extern void zio_resubmit_stage_async(void *); extern void zio_wait_children_done(zio_t *zio); /* @@ -339,7 +349,8 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); -boolean_t zio_should_retry(zio_t *zio); +extern boolean_t zio_should_retry(zio_t *zio); +extern int zio_vdev_resume_io(spa_t *); /* * Initial setup and teardown. diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index d2ddbc34e9..a5a0bb54e8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -61,6 +61,8 @@ typedef enum zio_stage { ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READ_INIT, /* R---- */ + ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ @@ -71,6 +73,7 @@ typedef enum zio_stage { ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ ZIO_STAGE_READ_DECOMPRESS, /* R---- */ + ZIO_STAGE_ASSESS, /* RWFCI */ ZIO_STAGE_DONE /* RWFCI */ } zio_stage_t; @@ -96,9 +99,14 @@ typedef enum zio_stage { ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) +#define ZIO_READ_GANG_PIPELINE \ + ZIO_READ_PHYS_PIPELINE + #define ZIO_READ_PIPELINE \ + (1U << ZIO_STAGE_READ_INIT) | \ ZIO_READ_PHYS_PIPELINE #define ZIO_WRITE_PHYS_PIPELINE \ @@ -108,6 +116,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READY) | \ ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WRITE_COMMON_PIPELINE \ @@ -149,6 +158,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DVA_FREE) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_CLAIM_PIPELINE \ @@ -160,6 +170,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DVA_CLAIM) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_IOCTL_PIPELINE \ @@ -168,16 +179,19 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READY) | \ ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_VDEV_CHILD_PIPELINE \ diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 62ebf19a61..aed7d53ba1 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -793,6 +793,21 @@ vdev_metaslab_fini(vdev_t *vd) } } +int +vdev_probe(vdev_t *vd) +{ + if (vd == NULL) + return (EINVAL); + + /* + * Right now we only support status checks on the leaf vdevs. + */ + if (vd->vdev_ops->vdev_op_leaf) + return (vd->vdev_ops->vdev_op_probe(vd)); + + return (0); +} + /* * Prepare a virtual device for access. */ @@ -919,6 +934,17 @@ vdev_open(vdev_t *vd) } /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + error = vdev_probe(vd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_OPEN_FAILED); + return (error); + } + + /* * If this is a top-level vdev, compute the raidz-deflation * ratio. Note, we hard-code in 128k (1<<17) because it is the * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE @@ -1467,6 +1493,17 @@ vdev_fault(spa_t *spa, uint64_t guid) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. 
+ */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1499,7 +1536,7 @@ vdev_fault(spa_t *spa, uint64_t guid) */ vdev_reopen(vd); - if (!vdev_is_dead(vd)) { + if (vdev_readable(vd)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } @@ -1523,6 +1560,17 @@ vdev_degrade(spa_t *spa, uint64_t guid) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1564,6 +1612,17 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1612,6 +1671,17 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1662,9 +1732,11 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. + * If reopen is specified then attempt to reopen the vdev if the vdev is + * faulted or degraded. */ void -vdev_clear(spa_t *spa, vdev_t *vd) +vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted) { int c; @@ -1674,16 +1746,17 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_is_failing = B_FALSE; for (c = 0; c < vd->vdev_children; c++) - vdev_clear(spa, vd->vdev_child[c]); + vdev_clear(spa, vd->vdev_child[c], reopen_wanted); /* * If we're in the FAULTED state, then clear the persistent state and * attempt to reopen the device. We also mark the vdev config dirty, so * that the new faulted state is written out to disk. */ - if (vd->vdev_faulted || vd->vdev_degraded) { + if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) { vd->vdev_faulted = vd->vdev_degraded = 0; vdev_reopen(vd); vdev_config_dirty(vd->vdev_top); @@ -1696,6 +1769,20 @@ vdev_clear(spa_t *spa, vdev_t *vd) } int +vdev_readable(vdev_t *vd) +{ + /* XXPOLICY */ + return (!vdev_is_dead(vd)); +} + +int +vdev_writeable(vdev_t *vd) +{ + return (vd->vdev_ops->vdev_op_leaf ? 
+ !vd->vdev_is_failing : !vdev_is_dead(vd)); +} + +int vdev_is_dead(vdev_t *vd) { return (vd->vdev_state < VDEV_STATE_DEGRADED); @@ -1800,7 +1887,7 @@ vdev_stat_update(zio_t *zio) if (flags & ZIO_FLAG_SPECULATIVE) return; - if (!vdev_is_dead(vd)) { + if (vdev_readable(vd)) { mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ) { if (zio->io_error == ECKSUM) @@ -1962,9 +2049,9 @@ vdev_propagate_state(vdev_t *vd) if (vd->vdev_children > 0) { for (c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; - if (vdev_is_dead(child)) + if (vdev_is_dead(child) && !vdev_readable(child)) faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) + else if (child->vdev_state <= VDEV_STATE_DEGRADED) degraded++; if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) @@ -2020,7 +2107,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * want here. This is limited to leaf devices, because otherwise * closing the device will affect other children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index a957c3671c..8bdd4d1f95 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -45,14 +45,11 @@ typedef struct vdev_disk_buf { } vdev_disk_buf_t; static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_disk_open_common(vdev_t *vd) { vdev_disk_t *dvd; - struct dk_minfo dkm; - int error; dev_t dev; - char *physpath, *minorname; - int otyp; + int error; /* * We must have a pathname, and it must be absolute. @@ -166,17 +163,34 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) &dvd->vd_lh, zfs_li); } - if (error) { + if (error) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + + return (error); +} + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_disk_t *dvd; + struct dk_minfo dkm; + int error; + dev_t dev; + int otyp; + + error = vdev_disk_open_common(vd); + if (error) return (error); - } + dvd = vd->vdev_tsd; /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + char *physpath, *minorname; + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && @@ -252,6 +266,113 @@ vdev_disk_close(vdev_t *vd) vd->vdev_tsd = NULL; } +static int +vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, + int flags) +{ + buf_t buf; + int error = 0; + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd == NULL || dvd == NULL || dvd->vd_lh == NULL) + return (EINVAL); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bioinit(&buf); + buf.b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; + buf.b_bcount = size; + buf.b_un.b_addr = (void *)data; + buf.b_lblkno = lbtodb(offset); + buf.b_bufsize = size; + + error = ldi_strategy(dvd->vd_lh, &buf); + ASSERT(error == 0); + error = biowait(&buf); + + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, EIO); + + return (error); +} + +static int +vdev_disk_probe(vdev_t *vd) +{ + uint64_t offset; + vdev_t *nvd; + int l, error = 0, retries = 0; + char *vl_pad; + + if (vd == NULL) + return (EINVAL); + + /* Hijack the current vdev */ + nvd = vd; + + /* + * Pick a random label to rewrite. 
+ */ + l = spa_get_random(VDEV_LABELS); + ASSERT(l < VDEV_LABELS); + + offset = vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad)); + + vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP); + + /* + * Try to read and write to a special location on the + * label. We use the existing vdev initially and only + * try to create and reopen it if we encounter a failure. + */ + while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, + offset, B_READ)) != 0 && retries == 0) { + + nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + if (vd->vdev_path) + nvd->vdev_path = spa_strdup(vd->vdev_path); + if (vd->vdev_physpath) + nvd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (vd->vdev_devid) + nvd->vdev_devid = spa_strdup(vd->vdev_devid); + nvd->vdev_wholedisk = vd->vdev_wholedisk; + nvd->vdev_guid = vd->vdev_guid; + retries++; + + error = vdev_disk_open_common(nvd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + nvd->vdev_stat.vs_aux); + break; + } + } + + if (!error) { + error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, + offset, B_WRITE); + } + + /* Clean up if we allocated a new vdev */ + if (retries) { + vdev_disk_close(nvd); + if (nvd->vdev_path) + spa_strfree(nvd->vdev_path); + if (nvd->vdev_physpath) + spa_strfree(nvd->vdev_physpath); + if (nvd->vdev_devid) + spa_strfree(nvd->vdev_devid); + kmem_free(nvd, sizeof (vdev_t)); + } + kmem_free(vl_pad, VDEV_SKIP_SIZE); + + /* Reset the failing flag */ + if (!error) + vd->vdev_is_failing = B_FALSE; + + return (error); +} + static void vdev_disk_io_intr(buf_t *bp) { @@ -289,7 +410,7 @@ vdev_disk_io_start(zio_t *zio) zio_vdev_io_bypass(zio); /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; zio_next_stage_async(zio); return; @@ -369,7 +490,11 @@ vdev_disk_io_start(zio_t *zio) bp->b_iodone = (int (*)())vdev_disk_io_intr; /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; bioerror(bp, error); @@ -386,10 +511,6 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; - int state; - vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -401,15 +522,23 @@ vdev_disk_io_done(zio_t *zio) /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an - * asynchronous removal of the device. + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still functional. 
*/ if (zio->io_error == EIO) { + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + state = DKIO_NONE; - if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (vdev_probe(vd) != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + vd->vdev_is_failing = B_TRUE; } } @@ -419,6 +548,7 @@ vdev_disk_io_done(zio_t *zio) vdev_ops_t vdev_disk_ops = { vdev_disk_open, vdev_disk_close, + vdev_disk_probe, vdev_default_asize, vdev_disk_io_start, vdev_disk_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index b8e79f8c0c..6f099b6629 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,11 +37,10 @@ */ static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_file_open_common(vdev_t *vd) { vdev_file_t *vf; vnode_t *vp; - vattr_t vattr; int error; /* @@ -61,8 +60,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, - 0, &vp, 0, 0, rootdir); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -81,11 +80,26 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) } #endif + return (0); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_file_t *vf; + vattr_t vattr; + int error; + + if ((error = vdev_file_open_common(vd)) != 0) + return (error); + + vf = vd->vdev_tsd; + /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vp, &vattr, 0, kcred); + error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); @@ -115,6 +129,89 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +static int +vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, + enum uio_rw rw) +{ + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + int error = 0; + + if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) + return (EINVAL); + + ASSERT(rw == UIO_READ || rw == UIO_WRITE); + + error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid); + if (error || resid != 0) + return (EIO); + return (0); +} + +static int +vdev_file_probe(vdev_t *vd) +{ + vdev_t *nvd; + char *vl_boot; + uint64_t offset; + int l, error = 0, retries = 0; + + if (vd == NULL) + return (EINVAL); + + /* Hijack the current vdev */ + nvd = vd; + + /* + * Pick a random label to rewrite. 
+ */ + l = spa_get_random(VDEV_LABELS); + ASSERT(l < VDEV_LABELS); + + offset = vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_boot_header)); + + vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); + + while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, + offset, UIO_READ)) != 0 && retries == 0) { + + /* + * If we failed with the vdev that was passed in then + * try allocating a new one and try again. + */ + nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + if (vd->vdev_path) + nvd->vdev_path = spa_strdup(vd->vdev_path); + error = vdev_file_open_common(nvd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + nvd->vdev_stat.vs_aux); + break; + } + retries++; + } + + if ((spa_mode & FWRITE) && !error) { + error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, + offset, UIO_WRITE); + } + + if (retries) { + vdev_file_close(nvd); + if (nvd->vdev_path) + spa_strfree(nvd->vdev_path); + kmem_free(nvd, sizeof (vdev_t)); + } + kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); + + if (!error) + vd->vdev_is_failing = B_FALSE; + + return (error); +} + static void vdev_file_io_start(zio_t *zio) { @@ -127,7 +224,7 @@ vdev_file_io_start(zio_t *zio) zio_vdev_io_bypass(zio); /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; zio_next_stage_async(zio); return; @@ -161,7 +258,11 @@ vdev_file_io_start(zio_t *zio) return; /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; zio_next_stage_async(zio); @@ -182,6 +283,21 @@ vdev_file_io_start(zio_t *zio) static void vdev_file_io_done(zio_t *zio) { + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + /* + * If this device is truely gone, then attempt to remove it + * from the configuration. 
+ */ + if (zio->io_error == EIO) { + vdev_t *vd = zio->io_vd; + + if (vdev_probe(vd) != 0) + vd->vdev_is_failing = B_TRUE; + } + vdev_queue_io_done(zio); #ifndef _KERNEL @@ -189,15 +305,13 @@ vdev_file_io_done(zio_t *zio) vdev_cache_write(zio); #endif - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - zio_next_stage(zio); } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, + vdev_file_probe, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, @@ -214,6 +328,7 @@ vdev_ops_t vdev_file_ops = { vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, + vdev_file_probe, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 4b22a68fee..070444a093 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -321,7 +321,7 @@ vdev_label_read_config(vdev_t *vd) ASSERT(spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); - if (vdev_is_dead(vd)) + if (!vdev_readable(vd)) return (NULL); vp = zio_buf_alloc(sizeof (vdev_phys_t)); @@ -902,7 +902,9 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; zio_t *zio; - int l, error; + int l, last_error = 0, error = 0; + uint64_t good_writes = 0; + boolean_t retry_avail = B_TRUE; ASSERT(ub->ub_txg <= txg); @@ -941,6 +943,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) } (void) zio_wait(zio); +retry: /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the @@ -954,11 +957,29 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) if (l & 1) continue; if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); + last_error = error; + else + good_writes++; } } /* + * If all the vdevs that are currently dirty have failed or the + * spa_dirty_list is empty then we dirty all the vdevs and try again. + * This is a last ditch effort to ensure that we get at least one + * update before proceeding to the uberblock. + */ + if (good_writes == 0 && retry_avail) { + vdev_config_dirty(rvd); + retry_avail = B_FALSE; + last_error = 0; + goto retry; + } + + if (good_writes == 0) + return (last_error); + + /* * Flush the new labels to disk. This ensures that all even-label * updates are committed to stable storage before the uberblock update. */ @@ -986,8 +1007,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. + * + * NOTE: We retry to an uberblock update on the root if we were + * failed our initial update attempt. */ - if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + error = vdev_uberblock_sync_tree(spa, ub, uvd, txg); + if (error && uvd != rvd) + error = vdev_uberblock_sync_tree(spa, ub, rvd, txg); + + if (error) return (error); /* @@ -999,6 +1027,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + last_error = 0; /* * Sync out odd labels for every dirty vdev. 
If the system dies * in the middle of this process, the even labels and the new @@ -1013,10 +1042,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) if ((l & 1) == 0) continue; if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); + last_error = error; + else + good_writes++; } } + if (good_writes == 0) + return (last_error); + /* * Flush the new labels to disk. This ensures that all odd-label * updates are committed to stable storage before the next diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 73d1a83d94..45d326ae69 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -219,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio) /* * Try to find a child whose DTL doesn't contain the block to read. * If a child is known to be completely inaccessible (indicated by - * vdev_is_dead() returning B_TRUE), don't even try. + * vdev_readable() returning B_FALSE), don't even try. */ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { if (c >= mm->mm_children) @@ -227,7 +227,7 @@ vdev_mirror_child_select(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (vdev_is_dead(mc->mc_vd)) { + if (vdev_is_dead(mc->mc_vd) && !vdev_readable(mc->mc_vd)) { mc->mc_error = ENXIO; mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; @@ -464,6 +464,7 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) vdev_ops_t vdev_mirror_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, @@ -475,6 +476,7 @@ vdev_ops_t vdev_mirror_ops = { vdev_ops_t vdev_replacing_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, @@ -486,6 +488,7 @@ vdev_ops_t vdev_replacing_ops = { vdev_ops_t vdev_spare_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c index b35f4a5bcd..3aa831c46d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_missing.c +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -77,9 +76,17 @@ vdev_missing_io_done(zio_t *zio) zio_next_stage(zio); } +/* ARGSUSED */ +static int +vdev_missing_probe(vdev_t *vd) +{ + return (0); +} + vdev_ops_t vdev_missing_ops = { vdev_missing_open, vdev_missing_close, + vdev_missing_probe, vdev_default_asize, vdev_missing_io_start, vdev_missing_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 0c86630765..73a3ae2565 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -686,7 +686,7 @@ vdev_raidz_io_start(zio_t *zio) for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (vdev_is_dead(cvd)) { + if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -1228,6 +1228,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, + NULL, vdev_raidz_asize, vdev_raidz_io_start, vdev_raidz_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index 0e8752c6ce..77829c0aa3 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,18 +44,17 @@ * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ -/*ARGSUSED*/ static int too_many_errors(vdev_t *vd, int numerrors) { - return (numerrors > 0); + ASSERT3U(numerrors, <=, vd->vdev_children); + return (numerrors == vd->vdev_children); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -65,7 +64,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; + int error; if ((error = vdev_open(cvd)) != 0) { lasterror = error; @@ -74,9 +74,15 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } } - if (too_many_errors(vd, numerrors)) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); + if (numerrors > 0) { + if (!too_many_errors(vd, numerrors)) { + /* XXX - should not be explicitly setting this state */ + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_NO_REPLICAS); + } else { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } } *asize = 0; @@ -97,18 +103,24 @@ vdev_root_close(vdev_t *vd) static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { - if (too_many_errors(vd, faulted)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded != 0) + if (faulted) { + if (too_many_errors(vd, faulted)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_NO_REPLICAS); + } else if (degraded) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else + } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } } vdev_ops_t vdev_root_ops = { vdev_root_open, vdev_root_close, + NULL, vdev_default_asize, NULL, /* io_start - not applicable to the root */ NULL, /* io_done - not applicable to the root */ diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c 
b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 4a5e68b878..54158d03f2 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -2073,6 +2073,17 @@ zfs_ioc_clear(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); + /* + * Try to resume any I/Os which may have been suspended + * as a result of a complete pool failure. + */ + if (!list_is_empty(&spa->spa_zio_list)) { + if (zio_vdev_resume_io(spa) != 0) { + spa_close(spa, FTAG); + return (EIO); + } + } + txg = spa_vdev_enter(spa); if (zc->zc_guid == 0) { @@ -2083,7 +2094,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) return (ENODEV); } - vdev_clear(spa, vd); + vdev_clear(spa, vd, B_TRUE); (void) spa_vdev_exit(spa, NULL, txg, 0); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 40670d1321..103c9d9cad 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -66,6 +66,14 @@ uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; /* Force an allocation failure when non-zero */ uint16_t zio_zil_fail_shift = 0; +uint16_t zio_io_fail_shift = 0; + +/* Enable/disable the write-retry logic */ +int zio_write_retry = 1; + +/* Taskq to handle reissuing of I/Os */ +taskq_t *zio_taskq; +int zio_resume_threads = 4; typedef struct zio_sync_pass { int zp_defer_free; /* defer frees after this pass */ @@ -79,6 +87,8 @@ zio_sync_pass_t zio_sync_pass = { 1, /* zp_rewrite */ }; +static boolean_t zio_io_should_fail(uint16_t); + /* * ========================================================================== * I/O kmem caches @@ -92,6 +102,34 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif +/* + * Determine if we are allowed to issue the IO based on the + * pool state. If we must wait then block until we are told + * that we may continue. + */ +#define ZIO_ENTER(spa) { \ + if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ + mutex_enter(&spa->spa_zio_lock); \ + while (spa->spa_state == POOL_STATE_IO_FAILURE) \ + cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ + mutex_exit(&spa->spa_zio_lock); \ + } \ +} + +/* + * An allocation zio is one that either currently has the DVA allocate + * stage set or will have it later in it's lifetime. + */ +#define IO_IS_ALLOCATING(zio) \ + ((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE || \ + (zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) + +/* + * The only way to tell is by looking for the gang pipeline stage + */ +#define IO_IS_REWRITE(zio) \ + ((zio)->io_pipeline & (1U << ZIO_STAGE_GANG_PIPELINE)) + void zio_init(void) { @@ -153,6 +191,9 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + zio_taskq = taskq_create("zio_taskq", zio_resume_threads, + maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); + zio_inject_init(); } @@ -177,6 +218,8 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + taskq_destroy(zio_taskq); + kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -386,9 +429,27 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_exit(&pio->io_lock); } + /* + * Save off the original state incase we need to retry later. 
+ */ + zio->io_orig_stage = zio->io_stage; + zio->io_orig_pipeline = zio->io_pipeline; + zio->io_orig_flags = zio->io_flags; + return (zio); } +static void +zio_reset(zio_t *zio) +{ + zio_clear_transform_stack(zio); + + zio->io_flags = zio->io_orig_flags; + zio->io_stage = zio->io_orig_stage; + zio->io_pipeline = zio->io_orig_pipeline; + zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); +} + zio_t * zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, int flags) @@ -417,6 +478,13 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, ASSERT3U(size, ==, BP_GET_LSIZE(bp)); + /* + * If the user has specified that we allow I/Os to continue + * then attempt to satisfy the read. + */ + if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + ZIO_ENTER(spa); + zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); @@ -429,22 +497,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, */ zio->io_bp = &zio->io_bp_copy; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); - - zio_push_transform(zio, cbuf, csize, csize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; - } - - if (BP_IS_GANG(bp)) { - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - zio_push_transform(zio, gbuf, gsize, gsize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; - } - return (zio); } @@ -462,6 +514,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, ASSERT(compress >= ZIO_COMPRESS_OFF && compress < ZIO_COMPRESS_FUNCTIONS); + ZIO_ENTER(spa); + zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); @@ -515,6 +569,16 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, return (zio); } +static void +zio_write_allocate_ready(zio_t *zio) +{ + /* Free up the previous block */ + if (!BP_IS_HOLE(&zio->io_bp_orig)) { + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + &zio->io_bp_orig, NULL, NULL)); + } +} + static zio_t * zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, @@ -533,6 +597,7 @@ zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, zio->io_checksum = checksum; zio->io_compress = ZIO_COMPRESS_OFF; + zio->io_ready = zio_write_allocate_ready; return (zio); } @@ -649,6 +714,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t *zio; blkptr_t blk; + ZIO_ENTER(vd->vdev_spa); + zio_phys_bp_init(vd, &blk, offset, size, checksum); zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, @@ -676,6 +743,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t *zio; blkptr_t blk; + ZIO_ENTER(vd->vdev_spa); + zio_phys_bp_init(vd, &blk, offset, size, checksum); zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, @@ -801,6 +870,7 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) mutex_enter(&pio->io_lock); if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) pio->io_error = zio->io_error; + ASSERT3U(*countp, >, 0); if (--*countp == 0 && pio->io_stalled == stage) { pio->io_stalled = 0; mutex_exit(&pio->io_lock); @@ -825,6 +895,27 @@ zio_wait_children_done(zio_t *zio) } static void +zio_read_init(zio_t *zio) +{ + if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) { + 
uint64_t csize = BP_GET_PSIZE(zio->io_bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; + } + + if (BP_IS_GANG(zio->io_bp)) { + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + zio_push_transform(zio, gbuf, gsize, gsize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; + } + zio_next_stage(zio); +} + +static void zio_ready(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -843,9 +934,151 @@ zio_ready(zio_t *zio) } static void -zio_done(zio_t *zio) +zio_vdev_retry_io(zio_t *zio) { zio_t *pio = zio->io_parent; + + /* + * Preserve the failed bp so that the io_ready() callback can + * update the accounting accordingly. The callback will also be + * responsible for freeing the previously allocated block, if one + * exists. + */ + zio->io_bp_orig = *zio->io_bp; + + /* + * We must zero out the old DVA and blk_birth before reallocating + * the bp. We don't want to do this if this is a rewrite however. + */ + if (!IO_IS_REWRITE(zio)) { + BP_ZERO_DVAS(zio->io_bp); + } + + zio_reset(zio); + + if (pio) { + /* + * Let the parent know that we will + * re-alloc the write (=> new bp info). + */ + mutex_enter(&pio->io_lock); + pio->io_children_notready++; + + /* + * If the parent I/O is still in the open stage, then + * don't bother telling it to retry since it hasn't + * progressed far enough for it to care. + */ + if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) + pio->io_flags |= ZIO_FLAG_WRITE_RETRY; + + ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE); + mutex_exit(&pio->io_lock); + } + + /* + * We are getting ready to process the retry request so clear + * the flag and the zio's current error status. + */ + zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; + zio->io_error = 0; + zio_next_stage_async(zio); +} + +int +zio_vdev_resume_io(spa_t *spa) +{ + zio_t *zio; + + mutex_enter(&spa->spa_zio_lock); + + /* + * Probe all of vdevs that have experienced an I/O error. + * If we are still unable to verify the integrity of the vdev + * then we prevent the resume from proceeeding. + */ + for (zio = list_head(&spa->spa_zio_list); zio != NULL; + zio = list_next(&spa->spa_zio_list, zio)) { + int error = 0; + + /* We only care about I/Os that must succeed */ + if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) + continue; + error = vdev_probe(zio->io_vd); + if (error) { + mutex_exit(&spa->spa_zio_lock); + return (error); + } + } + + /* + * Clear the vdev stats so that I/O can flow. + */ + vdev_clear(spa, NULL, B_FALSE); + + spa->spa_state = POOL_STATE_ACTIVE; + while ((zio = list_head(&spa->spa_zio_list)) != NULL) { + list_remove(&spa->spa_zio_list, zio); + zio->io_error = 0; + + /* + * If we are resuming an allocating I/O then we force it + * to retry and let it resume operation where it left off. + * Otherwise, go back to the ready stage and pick up from + * there. + */ + if (zio_write_retry && IO_IS_ALLOCATING(zio)) { + zio->io_flags |= ZIO_FLAG_WRITE_RETRY; + zio->io_stage--; + } else { + zio->io_stage = ZIO_STAGE_READY; + } + + (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async, + zio, TQ_SLEEP); + } + mutex_exit(&spa->spa_zio_lock); + + /* + * Wait for the taskqs to finish and recheck the pool state since + * it's possible that a resumed I/O has failed again. 
+ */ + taskq_wait(zio_taskq); + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + + mutex_enter(&spa->spa_zio_lock); + cv_broadcast(&spa->spa_zio_cv); + mutex_exit(&spa->spa_zio_lock); + + return (0); +} + +static void +zio_vdev_suspend_io(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + /* + * We've experienced an unrecoverable failure so + * set the pool state accordingly and queue all + * failed IOs. + */ + spa->spa_state = POOL_STATE_IO_FAILURE; + + mutex_enter(&spa->spa_zio_lock); + list_insert_tail(&spa->spa_zio_list, zio); + +#ifndef _KERNEL + /* Used to notify ztest that the pool has suspended */ + cv_broadcast(&spa->spa_zio_cv); +#endif + mutex_exit(&spa->spa_zio_lock); +} + +static void +zio_assess(zio_t *zio) +{ spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; @@ -868,6 +1101,14 @@ zio_done(zio_t *zio) } } + /* + * Some child I/O has indicated that a retry is necessary, so + * we set an error on the I/O and let the logic below do the + * rest. + */ + if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) + zio->io_error = ERESTART; + if (vd != NULL) vdev_stat_update(zio); @@ -879,8 +1120,7 @@ zio_done(zio_t *zio) * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, vd, zio, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && @@ -890,32 +1130,80 @@ zio_done(zio_t *zio) * appropriately. Also, generate a logical data * ereport. */ - spa_log_error(zio->io_spa, zio); + spa_log_error(spa, zio); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, zio, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); } /* - * For I/O requests that cannot fail, panic appropriately. + * If we are an allocating I/O then we retry on another + * vdev unless the pool is out of space. We handle this + * condition based on the spa's failmode property. + */ + if (zio_write_retry && zio->io_error != ENOSPC && + IO_IS_ALLOCATING(zio) && + zio->io_flags & ZIO_FLAG_WRITE_RETRY) { + zio_vdev_retry_io(zio); + return; + } + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); + + /* + * For I/O requests that cannot fail, we carry out + * the requested behavior based on the failmode pool + * property. + * + * XXX - Need to differentiate between an ENOSPC as + * a result of vdev failures vs. a full pool. */ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { char *blkbuf; +#ifdef ZFS_DEBUG blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); if (blkbuf) { sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp ? bp : &zio->io_bp_copy); } - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " - "%d", zio->io_error == ECKSUM ? + cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p " + "%s): error %d", zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", zio_type_name[zio->io_type], vdev_description(vd), (u_longlong_t)zio->io_offset, - zio, blkbuf ? blkbuf : "", zio->io_error); + (void *)zio, blkbuf ? blkbuf : "", zio->io_error); +#endif + + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { + fm_panic("Pool '%s' has encountered an " + "uncorrectable I/O failure and the " + "failure mode property for this pool " + "is set to panic.", spa_name(spa)); + } else { + cmn_err(CE_WARN, "Pool '%s' has encountered " + "an uncorrectable I/O error. 
Manual " + "intervention is required.", + spa_name(spa)); + zio_vdev_suspend_io(zio); + } + return; + } + } + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); + ASSERT(zio->io_children_notready == 0); + zio_next_stage(zio); +} + +static void +zio_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_children_notready == 0); + ASSERT(zio->io_children_notdone == 0); + zio_clear_transform_stack(zio); if (zio->io_done) @@ -1099,7 +1387,7 @@ zio_get_gang_header(zio_t *zio) zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, NULL, NULL, ZIO_TYPE_READ, zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); + ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); zio_wait_children_done(zio); } @@ -1244,7 +1532,7 @@ zio_write_allocate_gang_member_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static void +static int zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) { blkptr_t *bp = zio->io_bp; @@ -1266,9 +1554,8 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error == ENOSPC) - panic("can't allocate gang block header"); - ASSERT(error == 0); + if (error) + return (error); for (d = 0; d < gbh_ndvas; d++) DVA_SET_GANG(&dva[d], 1); @@ -1296,8 +1583,9 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) if (error == 0) break; ASSERT3U(error, ==, ENOSPC); + /* XXX - free up previous allocations? */ if (maxalloc == SPA_MINBLOCKSIZE) - panic("really out of space"); + return (error); maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); } @@ -1336,6 +1624,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) * to be stable. */ zio_wait_children_done(zio); + return (0); } /* @@ -1358,10 +1647,23 @@ zio_dva_allocate(zio_t *zio) /* For testing, make some blocks above a certain size be gang blocks */ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { - zio_write_allocate_gang_members(zio, mc); + error = zio_write_allocate_gang_members(zio, mc); + if (error) + zio->io_error = error; return; } + /* + * For testing purposes, we force I/Os to retry. We don't allow + * retries beyond the first pass since those I/Os are non-allocating + * writes. We do this after the gang block testing block so that + * they don't inherit the retry flag. + */ + if (zio_io_fail_shift && + spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && + zio_io_should_fail(zio_io_fail_shift)) + zio->io_flags |= ZIO_FLAG_WRITE_RETRY; + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, @@ -1369,11 +1671,11 @@ zio_dva_allocate(zio_t *zio) if (error == 0) { bp->blk_birth = zio->io_txg; - } else if (error == ENOSPC) { - if (zio->io_size == SPA_MINBLOCKSIZE) - panic("really, truly out of space"); - zio_write_allocate_gang_members(zio, mc); - return; + } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + error = zio_write_allocate_gang_members(zio, mc); + if (error == 0) + return; + zio->io_error = error; } else { zio->io_error = error; } @@ -1413,6 +1715,18 @@ zio_vdev_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd ? vd->vdev_top : NULL; blkptr_t *bp = zio->io_bp; uint64_t align; + spa_t *spa = zio->io_spa; + + /* + * If the pool is already in a failure state then just suspend + * this I/O until the problem is resolved. We will reissue it + * at that time.
+ */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE && + zio->io_type == ZIO_TYPE_WRITE) { + zio_vdev_suspend_io(zio); + return; + } if (vd == NULL) { /* The mirror_ops handle multiple DVAs in a single BP */ @@ -1662,6 +1976,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_dva_claim, zio_gang_checksum_generate, zio_ready, + zio_read_init, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, @@ -1669,6 +1984,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_checksum_verify, zio_read_gang_members, zio_read_decompress, + zio_assess, zio_done, zio_badop }; @@ -1762,12 +2078,20 @@ zio_next_stage_async(zio_t *zio) } } +void +zio_resubmit_stage_async(void *arg) +{ + zio_t *zio = (zio_t *)(uintptr_t)arg; + + zio_next_stage_async(zio); +} + static boolean_t -zio_alloc_should_fail(void) +zio_io_should_fail(uint16_t range) { static uint16_t allocs = 0; - return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); + return (P2PHASE(allocs++, 1U<<range) == 0); } /* @@ -1781,7 +2105,7 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, spa_config_enter(spa, RW_READER, FTAG); - if (zio_zil_fail_shift && zio_alloc_should_fail()) { + if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { spa_config_exit(spa, FTAG); return (ENOSPC); } |
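Note on the failure-handling flow in the zio.c hunks above: when an I/O that is not marked ZIO_FLAG_CANFAIL hits an uncorrectable error, zio_assess() either calls fm_panic() (failmode=panic) or hands the zio to zio_vdev_suspend_io(), which sets POOL_STATE_IO_FAILURE and queues it on spa_zio_list; zio_vdev_resume_io() later probes the affected vdevs, clears their error state, and re-dispatches the queued zios through the zio taskq. The following is a minimal, hypothetical user-space sketch of that queue-and-resume idea only; it is not code from the patch, none of the types or names (fake_pool, fake_io, assess_io, resume_pool) exist in ZFS, and the continue-mode behavior shown is a simplifying assumption rather than the pipeline's actual semantics.

/*
 * Hypothetical user-space model of the suspend/resume pattern described
 * above. Assumed policy: "continue" returns the error, "wait" queues the
 * I/O until an explicit resume, "panic" aborts the process.
 */
#include <stdio.h>
#include <stdlib.h>

enum failmode { FM_WAIT, FM_CONTINUE, FM_PANIC };

struct fake_io {
	int		id;
	int		error;		/* simulated device error */
	int		can_fail;	/* analogue of ZIO_FLAG_CANFAIL */
	struct fake_io	*next;
};

struct fake_pool {
	enum failmode	failmode;
	int		suspended;
	struct fake_io	*failed;	/* queue of suspended I/Os */
};

/* Queue a must-succeed I/O until the fault is repaired. */
static void
suspend_io(struct fake_pool *p, struct fake_io *io)
{
	io->next = p->failed;
	p->failed = io;
	p->suspended = 1;
	printf("io %d suspended (pool waiting for recovery)\n", io->id);
}

/* Decide what to do with a completed I/O, based on the pool's failmode. */
static void
assess_io(struct fake_pool *p, struct fake_io *io)
{
	if (io->error == 0 || io->can_fail) {
		printf("io %d done, error %d\n", io->id, io->error);
		return;
	}
	switch (p->failmode) {
	case FM_PANIC:
		fprintf(stderr, "unrecoverable I/O failure, aborting\n");
		abort();
	case FM_CONTINUE:
		printf("io %d failed, returning error to caller\n", io->id);
		return;
	case FM_WAIT:
	default:
		suspend_io(p, io);
	}
}

/* Re-issue everything that was queued once the fault is repaired. */
static void
resume_pool(struct fake_pool *p)
{
	struct fake_io *io;

	while ((io = p->failed) != NULL) {
		p->failed = io->next;
		io->error = 0;		/* pretend the retry succeeds */
		printf("io %d re-issued after resume\n", io->id);
	}
	p->suspended = 0;
}

int
main(void)
{
	struct fake_pool pool = { FM_WAIT, 0, NULL };
	struct fake_io a = { 1, 5, 0, NULL };	/* must-succeed, fails */
	struct fake_io b = { 2, 0, 0, NULL };	/* healthy I/O */

	assess_io(&pool, &a);	/* queued on the pool's failed list */
	assess_io(&pool, &b);	/* completes normally */
	resume_pool(&pool);	/* operator clears the fault, queue drains */
	return (0);
}

In the patch itself the same three steps run inside the zio pipeline rather than in helper routines like these: zio_assess() makes the decision, zio_vdev_suspend_io() does the queueing, and zio_vdev_resume_io() performs the re-dispatch.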