Diffstat (limited to 'usr/src/uts/common/fs')
24 files changed, 1027 insertions, 137 deletions
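
Before the per-file hunks, a brief orientation: this patch introduces a pool-wide 'failmode' property (the ZIO_FAILURE_MODE_WAIT/CONTINUE/PANIC values added to sys/zio.h), gates new I/O with a ZIO_ENTER() check that blocks on spa_zio_cv while the pool is in POOL_STATE_IO_FAILURE, parks failed zios on spa_zio_list, and resumes them from zio_vdev_resume_io() once 'zpool clear' has re-probed the vdevs. The listing below is a minimal user-level sketch of that gate-and-resume pattern only; it assumes pthreads in place of the kernel mutex/cv primitives, and pool_t, pool_io_enter(), and pool_resume() are invented names for illustration, not part of the patch.

    /*
     * Standalone sketch (not part of the patch) of the failmode gating
     * this change adds to the ZIO pipeline.  pthreads stand in for the
     * kernel mutex/cv; pool_t and the function names are illustrative.
     */
    #include <pthread.h>
    #include <stdio.h>

    enum { FAILMODE_WAIT, FAILMODE_CONTINUE, FAILMODE_PANIC };

    typedef struct pool {
            pthread_mutex_t p_lock;         /* ~ spa_zio_lock */
            pthread_cond_t  p_cv;           /* ~ spa_zio_cv */
            int             p_failed;       /* ~ POOL_STATE_IO_FAILURE */
            int             p_failmode;     /* ~ spa_failmode */
    } pool_t;

    /* Rough analogue of ZIO_ENTER(): gate new I/O on the pool state. */
    static void
    pool_io_enter(pool_t *p, int is_read)
    {
            /* In "continue" mode reads are still attempted (may get EIO). */
            if (is_read && p->p_failmode == FAILMODE_CONTINUE)
                    return;
            pthread_mutex_lock(&p->p_lock);
            while (p->p_failed)
                    pthread_cond_wait(&p->p_cv, &p->p_lock);
            pthread_mutex_unlock(&p->p_lock);
    }

    /* Rough analogue of zio_vdev_resume_io(): clear state, wake waiters. */
    static void
    pool_resume(pool_t *p)
    {
            pthread_mutex_lock(&p->p_lock);
            p->p_failed = 0;
            pthread_cond_broadcast(&p->p_cv);
            pthread_mutex_unlock(&p->p_lock);
    }

    int
    main(void)
    {
            pool_t p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
                1, FAILMODE_CONTINUE };

            pool_io_enter(&p, 1);   /* read: not blocked in continue mode */
            pool_resume(&p);        /* as if 'zpool clear' succeeded */
            pool_io_enter(&p, 0);   /* write: pool healthy again, proceeds */
            (void) printf("resumed\n");
            return (0);
    }

The same trade-off appears in the patch itself: 'wait' blocks callers on a condition variable so nothing observes a half-failed pool, while 'continue' lets reads and non-blocking transaction assigns (dmu_tx_try_assign()) fail with EIO rather than hang.
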
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index f1c2de5a07..aafce2d68e 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -2533,12 +2533,34 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { + if (zio->io_error == 0 && callback->awcb_ready) { ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); callback->awcb_ready(zio, buf, callback->awcb_private); } + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. It is + * the responsibility of the callback to handle the freeing + * and accounting for any re-write attempt. If we don't have a + * callback registered then simply free the block here. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + if (!BP_IS_HOLE(&zio->io_bp_orig) && + callback->awcb_ready == NULL) { + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + &zio->io_bp_orig, NULL, NULL)); + } + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); + } arc_cksum_compute(buf); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2635,7 +2657,6 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, priority, flags, zb); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 2758d84791..0f687ff66d 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -739,12 +739,26 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; int i; + ASSERT(bp == zio->io_bp); + /* * Update rootbp fill count. */ bp->blk_fill = 1; /* count the meta-dnode */ for (i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + + BP_SET_TYPE(bp, DMU_OT_OBJSET); + BP_SET_LEVEL(bp, 0); + + /* We must do this after we've set the bp's type and level */ + if (!DVA_EQUAL(BP_IDENTITY(bp), + BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, NULL, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + } } /* ARGSUSED */ @@ -754,18 +768,6 @@ killer(zio_t *zio, arc_buf_t *abuf, void *arg) objset_impl_t *os = arg; ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); - } arc_release(os->os_phys_buf, &os->os_phys_buf); } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 13fd8d4d9d..f89878facf 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -734,11 +734,30 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + spa_t *spa = tx->tx_pool->dp_spa; ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) return (tx->tx_err); + if (spa_state(spa) == POOL_STATE_IO_FAILURE) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; @@ -885,10 +904,19 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) void dmu_tx_wait(dmu_tx_t *tx) { + spa_t *spa = tx->tx_pool->dp_spa; + ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - if (tx->tx_needassign_txh) { + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then his + * tx_lasttried_txg would not have been assigned. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE || + tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 4fcc6bfd79..b2840e4e87 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -773,6 +773,20 @@ top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* + * Dont allocate from faulted devices + */ + if (!vdev_writeable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } ASSERT(mg->mg_class == mc); @@ -828,6 +842,7 @@ top: return (0); } +next: mc->mc_rotor = mg->mg_next; mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index a838b0f45b..a780a2ca1f 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -362,6 +362,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) dmu_objset_close(os); } break; + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = EINVAL; + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. 
+ */ + if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { + spa->spa_failmode = intval; + error = EIO; + } + break; } if (error) @@ -477,6 +498,8 @@ spa_activate(spa_t *spa) list_create(&spa->spa_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_dirty_node)); + list_create(&spa->spa_zio_list, sizeof (zio_t), + offsetof(zio_t, zio_link_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); @@ -506,6 +529,7 @@ spa_deactivate(spa_t *spa) txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_dirty_list); + list_destroy(&spa->spa_zio_list); for (t = 0; t < ZIO_TYPES; t++) { taskq_destroy(spa->spa_zio_issue_taskq[t]); @@ -1077,6 +1101,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_DELEGATION), sizeof (uint64_t), 1, &spa->spa_delegation); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + sizeof (uint64_t), 1, &spa->spa_failmode); } /* @@ -1618,6 +1646,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); if (props) spa_sync_props(spa, props, CRED(), tx); @@ -3091,7 +3120,7 @@ spa_async_remove(spa_t *spa, vdev_t *vd) tvd->vdev_remove_wanted = 0; vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, tvd); + vdev_clear(spa, tvd, B_TRUE); vdev_config_dirty(tvd->vdev_top); } spa_async_remove(spa, tvd); @@ -3122,8 +3151,14 @@ spa_async_thread(spa_t *spa) /* * See if any devices need to be marked REMOVED. + * + * XXX - We avoid doing this when we are in + * I/O failure state since spa_vdev_enter() grabs + * the namespace lock and would not be able to obtain + * the writer config lock. */ - if (tasks & SPA_ASYNC_REMOVE) { + if (tasks & SPA_ASYNC_REMOVE && + spa_state(spa) != POOL_STATE_IO_FAILURE) { txg = spa_vdev_enter(spa); spa_async_remove(spa, spa->spa_root_vdev); (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -3379,7 +3414,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &intval) == 0); spa->spa_temporary = intval; break; - default: /* * Set pool property values in the poolprops mos object. @@ -3425,11 +3459,19 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(0); /* not allowed */ } - if (prop == ZPOOL_PROP_DELEGATION) + switch (prop) { + case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; - - if (prop == ZPOOL_PROP_BOOTFS) + break; + case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + default: + break; + } } /* log internal history if this is not a zpool create */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 8065ae85b6..5cb0890586 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -277,6 +277,8 @@ spa_add(const char *name, const char *altroot) avl_add(&spa_namespace_avl, spa); + mutex_init(&spa->spa_zio_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Set the alternate root, if there is one. 
*/ @@ -332,6 +334,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_sync_bplist.bpl_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_zio_lock); kmem_free(spa, sizeof (spa_t)); } @@ -989,6 +992,16 @@ spa_get_asize(spa_t *spa, uint64_t lsize) return (lsize * 6); } +/* + * Return the failure mode that has been set to this pool. The default + * behavior will be to block all I/Os when a complete failure occurs. + */ +uint8_t +spa_get_failmode(spa_t *spa) +{ + return (spa->spa_failmode); +} + uint64_t spa_version(spa_t *spa) { diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 9263b31172..a15e5ff815 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -298,6 +298,7 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; uint64_t mapstart = sm->sm_start; + int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -335,9 +336,10 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - VERIFY3U(dmu_read(os, smo->smo_object, offset, size, - entry_map), ==, 0); + error = dmu_read(os, smo->smo_object, offset, size, entry_map); mutex_enter(sm->sm_lock); + if (error != 0) + goto out; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -354,18 +356,19 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, } VERIFY3U(sm->sm_space, ==, space); + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; +out: zio_buf_free(entry_map, bufsize); sm->sm_loading = B_FALSE; - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; cv_broadcast(&sm->sm_load_cv); - if (ops != NULL) + if (!error && ops != NULL) ops->smop_load(sm); - return (0); + return (error); } void diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index cb5e09e4b0..032ead7f37 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -274,7 +274,7 @@ typedef struct blkptr { #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) #define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -282,11 +282,16 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_birth = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp) \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_pad[2] = 0; \ - (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } @@ -423,6 +428,7 @@ extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_version(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 18371aa13f..0310f985b8 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -141,6 +141,10 @@ struct spa { uint64_t spa_bootfs; /* default boot filesystem */ boolean_t 
spa_delegation; /* delegation on/off */ boolean_t spa_temporary; /* temporary on/off */ + list_t spa_zio_list; /* zio error list */ + kcondvar_t spa_zio_cv; /* resume I/O pipeline */ + kmutex_t spa_zio_lock; /* zio error lock */ + uint8_t spa_failmode; /* failure mode for the pool */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index c651d1eebb..dced3da5ff 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -54,6 +54,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_spare(vdev_t *); +extern int vdev_probe(vdev_t *); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); @@ -89,10 +90,12 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); -extern void vdev_clear(spa_t *spa, vdev_t *vd); +extern void vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); extern int vdev_is_dead(vdev_t *vd); +extern int vdev_readable(vdev_t *vd); +extern int vdev_writeable(vdev_t *vd); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index e279bb2495..6fa21e83b0 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -60,6 +60,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); +typedef int vdev_probe_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); @@ -68,6 +69,7 @@ typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; + vdev_probe_func_t *vdev_op_probe; vdev_asize_func_t *vdev_op_asize; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; @@ -174,6 +176,7 @@ struct vdev { uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_is_failing; /* device errors seen */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index 8a689e0760..a5be3e1303 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -62,6 +62,7 @@ extern "C" { #include <sys/zfs_debug.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> +#include <sys/fm/util.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 0f38aae47d..cc08976074 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -107,6 +107,10 @@ enum zio_compress { #define 
ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) @@ -144,6 +148,7 @@ enum zio_compress { #define ZIO_FLAG_USER 0x20000 #define ZIO_FLAG_METADATA 0x40000 +#define ZIO_FLAG_WRITE_RETRY 0x80000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -217,6 +222,7 @@ struct zio { zio_t *io_sibling_next; zio_transform_t *io_transform_stack; zio_t *io_logical; + list_node_t zio_link_node; /* Callback info */ zio_done_func_t *io_ready; @@ -242,8 +248,10 @@ struct zio { /* Internal pipeline state */ int io_flags; + int io_orig_flags; enum zio_type io_type; enum zio_stage io_stage; + enum zio_stage io_orig_stage; uint8_t io_stalled; uint8_t io_priority; struct dk_callback io_dk_callback; @@ -252,6 +260,7 @@ struct zio { int io_error; uint32_t io_numerrors; uint32_t io_pipeline; + uint32_t io_orig_pipeline; uint32_t io_async_stages; uint64_t io_children_notready; uint64_t io_children_notdone; @@ -320,6 +329,7 @@ extern void zio_data_buf_free(void *buf, size_t size); */ extern void zio_next_stage(zio_t *zio); extern void zio_next_stage_async(zio_t *zio); +extern void zio_resubmit_stage_async(void *); extern void zio_wait_children_done(zio_t *zio); /* @@ -339,7 +349,8 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); -boolean_t zio_should_retry(zio_t *zio); +extern boolean_t zio_should_retry(zio_t *zio); +extern int zio_vdev_resume_io(spa_t *); /* * Initial setup and teardown. diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index d2ddbc34e9..a5a0bb54e8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -61,6 +61,8 @@ typedef enum zio_stage { ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READ_INIT, /* R---- */ + ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ @@ -71,6 +73,7 @@ typedef enum zio_stage { ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ ZIO_STAGE_READ_DECOMPRESS, /* R---- */ + ZIO_STAGE_ASSESS, /* RWFCI */ ZIO_STAGE_DONE /* RWFCI */ } zio_stage_t; @@ -96,9 +99,14 @@ typedef enum zio_stage { ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) +#define ZIO_READ_GANG_PIPELINE \ + ZIO_READ_PHYS_PIPELINE + #define ZIO_READ_PIPELINE \ + (1U << ZIO_STAGE_READ_INIT) | \ ZIO_READ_PHYS_PIPELINE #define ZIO_WRITE_PHYS_PIPELINE \ @@ -108,6 +116,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READY) | \ ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WRITE_COMMON_PIPELINE \ @@ -149,6 +158,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DVA_FREE) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_CLAIM_PIPELINE \ @@ -160,6 +170,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DVA_CLAIM) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_IOCTL_PIPELINE \ @@ -168,16 +179,19 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READY) | \ ZIO_VDEV_IO_PIPELINE | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ (1U << ZIO_STAGE_READY) | \ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ (1U << ZIO_STAGE_DONE)) #define ZIO_VDEV_CHILD_PIPELINE \ diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 62ebf19a61..aed7d53ba1 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -793,6 +793,21 @@ vdev_metaslab_fini(vdev_t *vd) } } +int +vdev_probe(vdev_t *vd) +{ + if (vd == NULL) + return (EINVAL); + + /* + * Right now we only support status checks on the leaf vdevs. + */ + if (vd->vdev_ops->vdev_op_leaf) + return (vd->vdev_ops->vdev_op_probe(vd)); + + return (0); +} + /* * Prepare a virtual device for access. */ @@ -919,6 +934,17 @@ vdev_open(vdev_t *vd) } /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + error = vdev_probe(vd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_OPEN_FAILED); + return (error); + } + + /* * If this is a top-level vdev, compute the raidz-deflation * ratio. Note, we hard-code in 128k (1<<17) because it is the * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE @@ -1467,6 +1493,17 @@ vdev_fault(spa_t *spa, uint64_t guid) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. 
+ */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1499,7 +1536,7 @@ vdev_fault(spa_t *spa, uint64_t guid) */ vdev_reopen(vd); - if (!vdev_is_dead(vd)) { + if (vdev_readable(vd)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } @@ -1523,6 +1560,17 @@ vdev_degrade(spa_t *spa, uint64_t guid) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1564,6 +1612,17 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1612,6 +1671,17 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_t *rvd, *vd; uint64_t txg; + /* + * Disregard a vdev fault request if the pool has + * experienced a complete failure. + * + * XXX - We do this here so that we don't hold the + * spa_namespace_lock in the event that we can't get + * the RW_WRITER spa_config_lock. + */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; @@ -1662,9 +1732,11 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. + * If reopen is specified then attempt to reopen the vdev if the vdev is + * faulted or degraded. */ void -vdev_clear(spa_t *spa, vdev_t *vd) +vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted) { int c; @@ -1674,16 +1746,17 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_is_failing = B_FALSE; for (c = 0; c < vd->vdev_children; c++) - vdev_clear(spa, vd->vdev_child[c]); + vdev_clear(spa, vd->vdev_child[c], reopen_wanted); /* * If we're in the FAULTED state, then clear the persistent state and * attempt to reopen the device. We also mark the vdev config dirty, so * that the new faulted state is written out to disk. */ - if (vd->vdev_faulted || vd->vdev_degraded) { + if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) { vd->vdev_faulted = vd->vdev_degraded = 0; vdev_reopen(vd); vdev_config_dirty(vd->vdev_top); @@ -1696,6 +1769,20 @@ vdev_clear(spa_t *spa, vdev_t *vd) } int +vdev_readable(vdev_t *vd) +{ + /* XXPOLICY */ + return (!vdev_is_dead(vd)); +} + +int +vdev_writeable(vdev_t *vd) +{ + return (vd->vdev_ops->vdev_op_leaf ? 
+ !vd->vdev_is_failing : !vdev_is_dead(vd)); +} + +int vdev_is_dead(vdev_t *vd) { return (vd->vdev_state < VDEV_STATE_DEGRADED); @@ -1800,7 +1887,7 @@ vdev_stat_update(zio_t *zio) if (flags & ZIO_FLAG_SPECULATIVE) return; - if (!vdev_is_dead(vd)) { + if (vdev_readable(vd)) { mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ) { if (zio->io_error == ECKSUM) @@ -1962,9 +2049,9 @@ vdev_propagate_state(vdev_t *vd) if (vd->vdev_children > 0) { for (c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; - if (vdev_is_dead(child)) + if (vdev_is_dead(child) && !vdev_readable(child)) faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) + else if (child->vdev_state <= VDEV_STATE_DEGRADED) degraded++; if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) @@ -2020,7 +2107,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * want here. This is limited to leaf devices, because otherwise * closing the device will affect other children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index a957c3671c..8bdd4d1f95 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -45,14 +45,11 @@ typedef struct vdev_disk_buf { } vdev_disk_buf_t; static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_disk_open_common(vdev_t *vd) { vdev_disk_t *dvd; - struct dk_minfo dkm; - int error; dev_t dev; - char *physpath, *minorname; - int otyp; + int error; /* * We must have a pathname, and it must be absolute. @@ -166,17 +163,34 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) &dvd->vd_lh, zfs_li); } - if (error) { + if (error) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + + return (error); +} + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_disk_t *dvd; + struct dk_minfo dkm; + int error; + dev_t dev; + int otyp; + + error = vdev_disk_open_common(vd); + if (error) return (error); - } + dvd = vd->vdev_tsd; /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + char *physpath, *minorname; + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && @@ -252,6 +266,113 @@ vdev_disk_close(vdev_t *vd) vd->vdev_tsd = NULL; } +static int +vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, + int flags) +{ + buf_t buf; + int error = 0; + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd == NULL || dvd == NULL || dvd->vd_lh == NULL) + return (EINVAL); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bioinit(&buf); + buf.b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; + buf.b_bcount = size; + buf.b_un.b_addr = (void *)data; + buf.b_lblkno = lbtodb(offset); + buf.b_bufsize = size; + + error = ldi_strategy(dvd->vd_lh, &buf); + ASSERT(error == 0); + error = biowait(&buf); + + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, EIO); + + return (error); +} + +static int +vdev_disk_probe(vdev_t *vd) +{ + uint64_t offset; + vdev_t *nvd; + int l, error = 0, retries = 0; + char *vl_pad; + + if (vd == NULL) + return (EINVAL); + + /* Hijack the current vdev */ + nvd = vd; + + /* + * Pick a random label to rewrite. 
+ */ + l = spa_get_random(VDEV_LABELS); + ASSERT(l < VDEV_LABELS); + + offset = vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad)); + + vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP); + + /* + * Try to read and write to a special location on the + * label. We use the existing vdev initially and only + * try to create and reopen it if we encounter a failure. + */ + while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, + offset, B_READ)) != 0 && retries == 0) { + + nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + if (vd->vdev_path) + nvd->vdev_path = spa_strdup(vd->vdev_path); + if (vd->vdev_physpath) + nvd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (vd->vdev_devid) + nvd->vdev_devid = spa_strdup(vd->vdev_devid); + nvd->vdev_wholedisk = vd->vdev_wholedisk; + nvd->vdev_guid = vd->vdev_guid; + retries++; + + error = vdev_disk_open_common(nvd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + nvd->vdev_stat.vs_aux); + break; + } + } + + if (!error) { + error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, + offset, B_WRITE); + } + + /* Clean up if we allocated a new vdev */ + if (retries) { + vdev_disk_close(nvd); + if (nvd->vdev_path) + spa_strfree(nvd->vdev_path); + if (nvd->vdev_physpath) + spa_strfree(nvd->vdev_physpath); + if (nvd->vdev_devid) + spa_strfree(nvd->vdev_devid); + kmem_free(nvd, sizeof (vdev_t)); + } + kmem_free(vl_pad, VDEV_SKIP_SIZE); + + /* Reset the failing flag */ + if (!error) + vd->vdev_is_failing = B_FALSE; + + return (error); +} + static void vdev_disk_io_intr(buf_t *bp) { @@ -289,7 +410,7 @@ vdev_disk_io_start(zio_t *zio) zio_vdev_io_bypass(zio); /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; zio_next_stage_async(zio); return; @@ -369,7 +490,11 @@ vdev_disk_io_start(zio_t *zio) bp->b_iodone = (int (*)())vdev_disk_io_intr; /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; bioerror(bp, error); @@ -386,10 +511,6 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; - int state; - vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -401,15 +522,23 @@ vdev_disk_io_done(zio_t *zio) /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an - * asynchronous removal of the device. + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still functional. 
*/ if (zio->io_error == EIO) { + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + state = DKIO_NONE; - if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (vdev_probe(vd) != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + vd->vdev_is_failing = B_TRUE; } } @@ -419,6 +548,7 @@ vdev_disk_io_done(zio_t *zio) vdev_ops_t vdev_disk_ops = { vdev_disk_open, vdev_disk_close, + vdev_disk_probe, vdev_default_asize, vdev_disk_io_start, vdev_disk_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index b8e79f8c0c..6f099b6629 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,11 +37,10 @@ */ static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_file_open_common(vdev_t *vd) { vdev_file_t *vf; vnode_t *vp; - vattr_t vattr; int error; /* @@ -61,8 +60,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, - 0, &vp, 0, 0, rootdir); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -81,11 +80,26 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) } #endif + return (0); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_file_t *vf; + vattr_t vattr; + int error; + + if ((error = vdev_file_open_common(vd)) != 0) + return (error); + + vf = vd->vdev_tsd; + /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vp, &vattr, 0, kcred); + error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); @@ -115,6 +129,89 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +static int +vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, + enum uio_rw rw) +{ + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + int error = 0; + + if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) + return (EINVAL); + + ASSERT(rw == UIO_READ || rw == UIO_WRITE); + + error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid); + if (error || resid != 0) + return (EIO); + return (0); +} + +static int +vdev_file_probe(vdev_t *vd) +{ + vdev_t *nvd; + char *vl_boot; + uint64_t offset; + int l, error = 0, retries = 0; + + if (vd == NULL) + return (EINVAL); + + /* Hijack the current vdev */ + nvd = vd; + + /* + * Pick a random label to rewrite. 
+ */ + l = spa_get_random(VDEV_LABELS); + ASSERT(l < VDEV_LABELS); + + offset = vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_boot_header)); + + vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); + + while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, + offset, UIO_READ)) != 0 && retries == 0) { + + /* + * If we failed with the vdev that was passed in then + * try allocating a new one and try again. + */ + nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + if (vd->vdev_path) + nvd->vdev_path = spa_strdup(vd->vdev_path); + error = vdev_file_open_common(nvd); + if (error) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + nvd->vdev_stat.vs_aux); + break; + } + retries++; + } + + if ((spa_mode & FWRITE) && !error) { + error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, + offset, UIO_WRITE); + } + + if (retries) { + vdev_file_close(nvd); + if (nvd->vdev_path) + spa_strfree(nvd->vdev_path); + kmem_free(nvd, sizeof (vdev_t)); + } + kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); + + if (!error) + vd->vdev_is_failing = B_FALSE; + + return (error); +} + static void vdev_file_io_start(zio_t *zio) { @@ -127,7 +224,7 @@ vdev_file_io_start(zio_t *zio) zio_vdev_io_bypass(zio); /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; zio_next_stage_async(zio); return; @@ -161,7 +258,11 @@ vdev_file_io_start(zio_t *zio) return; /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; zio_next_stage_async(zio); @@ -182,6 +283,21 @@ vdev_file_io_start(zio_t *zio) static void vdev_file_io_done(zio_t *zio) { + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + /* + * If this device is truely gone, then attempt to remove it + * from the configuration. 
+ */ + if (zio->io_error == EIO) { + vdev_t *vd = zio->io_vd; + + if (vdev_probe(vd) != 0) + vd->vdev_is_failing = B_TRUE; + } + vdev_queue_io_done(zio); #ifndef _KERNEL @@ -189,15 +305,13 @@ vdev_file_io_done(zio_t *zio) vdev_cache_write(zio); #endif - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - zio_next_stage(zio); } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, + vdev_file_probe, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, @@ -214,6 +328,7 @@ vdev_ops_t vdev_file_ops = { vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, + vdev_file_probe, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 4b22a68fee..070444a093 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -321,7 +321,7 @@ vdev_label_read_config(vdev_t *vd) ASSERT(spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); - if (vdev_is_dead(vd)) + if (!vdev_readable(vd)) return (NULL); vp = zio_buf_alloc(sizeof (vdev_phys_t)); @@ -902,7 +902,9 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; zio_t *zio; - int l, error; + int l, last_error = 0, error = 0; + uint64_t good_writes = 0; + boolean_t retry_avail = B_TRUE; ASSERT(ub->ub_txg <= txg); @@ -941,6 +943,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) } (void) zio_wait(zio); +retry: /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the @@ -954,11 +957,29 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) if (l & 1) continue; if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); + last_error = error; + else + good_writes++; } } /* + * If all the vdevs that are currently dirty have failed or the + * spa_dirty_list is empty then we dirty all the vdevs and try again. + * This is a last ditch effort to ensure that we get at least one + * update before proceeding to the uberblock. + */ + if (good_writes == 0 && retry_avail) { + vdev_config_dirty(rvd); + retry_avail = B_FALSE; + last_error = 0; + goto retry; + } + + if (good_writes == 0) + return (last_error); + + /* * Flush the new labels to disk. This ensures that all even-label * updates are committed to stable storage before the uberblock update. */ @@ -986,8 +1007,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. + * + * NOTE: We retry to an uberblock update on the root if we were + * failed our initial update attempt. */ - if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + error = vdev_uberblock_sync_tree(spa, ub, uvd, txg); + if (error && uvd != rvd) + error = vdev_uberblock_sync_tree(spa, ub, rvd, txg); + + if (error) return (error); /* @@ -999,6 +1027,7 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + last_error = 0; /* * Sync out odd labels for every dirty vdev. 
If the system dies * in the middle of this process, the even labels and the new @@ -1013,10 +1042,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) if ((l & 1) == 0) continue; if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); + last_error = error; + else + good_writes++; } } + if (good_writes == 0) + return (last_error); + /* * Flush the new labels to disk. This ensures that all odd-label * updates are committed to stable storage before the next diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 73d1a83d94..45d326ae69 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -219,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio) /* * Try to find a child whose DTL doesn't contain the block to read. * If a child is known to be completely inaccessible (indicated by - * vdev_is_dead() returning B_TRUE), don't even try. + * vdev_readable() returning B_FALSE), don't even try. */ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { if (c >= mm->mm_children) @@ -227,7 +227,7 @@ vdev_mirror_child_select(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (vdev_is_dead(mc->mc_vd)) { + if (vdev_is_dead(mc->mc_vd) && !vdev_readable(mc->mc_vd)) { mc->mc_error = ENXIO; mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; @@ -464,6 +464,7 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) vdev_ops_t vdev_mirror_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, @@ -475,6 +476,7 @@ vdev_ops_t vdev_mirror_ops = { vdev_ops_t vdev_replacing_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, @@ -486,6 +488,7 @@ vdev_ops_t vdev_replacing_ops = { vdev_ops_t vdev_spare_ops = { vdev_mirror_open, vdev_mirror_close, + NULL, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c index b35f4a5bcd..3aa831c46d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_missing.c +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -77,9 +76,17 @@ vdev_missing_io_done(zio_t *zio) zio_next_stage(zio); } +/* ARGSUSED */ +static int +vdev_missing_probe(vdev_t *vd) +{ + return (0); +} + vdev_ops_t vdev_missing_ops = { vdev_missing_open, vdev_missing_close, + vdev_missing_probe, vdev_default_asize, vdev_missing_io_start, vdev_missing_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 0c86630765..73a3ae2565 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -686,7 +686,7 @@ vdev_raidz_io_start(zio_t *zio) for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (vdev_is_dead(cvd)) { + if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -1228,6 +1228,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, + NULL, vdev_raidz_asize, vdev_raidz_io_start, vdev_raidz_io_done, diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index 0e8752c6ce..77829c0aa3 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,18 +44,17 @@ * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ -/*ARGSUSED*/ static int too_many_errors(vdev_t *vd, int numerrors) { - return (numerrors > 0); + ASSERT3U(numerrors, <=, vd->vdev_children); + return (numerrors == vd->vdev_children); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -65,7 +64,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; + int error; if ((error = vdev_open(cvd)) != 0) { lasterror = error; @@ -74,9 +74,15 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } } - if (too_many_errors(vd, numerrors)) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); + if (numerrors > 0) { + if (!too_many_errors(vd, numerrors)) { + /* XXX - should not be explicitly setting this state */ + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_NO_REPLICAS); + } else { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } } *asize = 0; @@ -97,18 +103,24 @@ vdev_root_close(vdev_t *vd) static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { - if (too_many_errors(vd, faulted)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded != 0) + if (faulted) { + if (too_many_errors(vd, faulted)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_NO_REPLICAS); + } else if (degraded) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else + } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } } vdev_ops_t vdev_root_ops = { vdev_root_open, vdev_root_close, + NULL, vdev_default_asize, NULL, /* io_start - not applicable to the root */ NULL, /* io_done - not applicable to the root */ diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c 
b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 4a5e68b878..54158d03f2 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -2073,6 +2073,17 @@ zfs_ioc_clear(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); + /* + * Try to resume any I/Os which may have been suspended + * as a result of a complete pool failure. + */ + if (!list_is_empty(&spa->spa_zio_list)) { + if (zio_vdev_resume_io(spa) != 0) { + spa_close(spa, FTAG); + return (EIO); + } + } + txg = spa_vdev_enter(spa); if (zc->zc_guid == 0) { @@ -2083,7 +2094,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) return (ENODEV); } - vdev_clear(spa, vd); + vdev_clear(spa, vd, B_TRUE); (void) spa_vdev_exit(spa, NULL, txg, 0); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 40670d1321..103c9d9cad 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -66,6 +66,14 @@ uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; /* Force an allocation failure when non-zero */ uint16_t zio_zil_fail_shift = 0; +uint16_t zio_io_fail_shift = 0; + +/* Enable/disable the write-retry logic */ +int zio_write_retry = 1; + +/* Taskq to handle reissuing of I/Os */ +taskq_t *zio_taskq; +int zio_resume_threads = 4; typedef struct zio_sync_pass { int zp_defer_free; /* defer frees after this pass */ @@ -79,6 +87,8 @@ zio_sync_pass_t zio_sync_pass = { 1, /* zp_rewrite */ }; +static boolean_t zio_io_should_fail(uint16_t); + /* * ========================================================================== * I/O kmem caches @@ -92,6 +102,34 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif +/* + * Determine if we are allowed to issue the IO based on the + * pool state. If we must wait then block until we are told + * that we may continue. + */ +#define ZIO_ENTER(spa) { \ + if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ + mutex_enter(&spa->spa_zio_lock); \ + while (spa->spa_state == POOL_STATE_IO_FAILURE) \ + cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ + mutex_exit(&spa->spa_zio_lock); \ + } \ +} + +/* + * An allocation zio is one that either currently has the DVA allocate + * stage set or will have it later in it's lifetime. + */ +#define IO_IS_ALLOCATING(zio) \ + ((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE || \ + (zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) + +/* + * The only way to tell is by looking for the gang pipeline stage + */ +#define IO_IS_REWRITE(zio) \ + ((zio)->io_pipeline & (1U << ZIO_STAGE_GANG_PIPELINE)) + void zio_init(void) { @@ -153,6 +191,9 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + zio_taskq = taskq_create("zio_taskq", zio_resume_threads, + maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); + zio_inject_init(); } @@ -177,6 +218,8 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + taskq_destroy(zio_taskq); + kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -386,9 +429,27 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_exit(&pio->io_lock); } + /* + * Save off the original state incase we need to retry later. 
+ */ + zio->io_orig_stage = zio->io_stage; + zio->io_orig_pipeline = zio->io_pipeline; + zio->io_orig_flags = zio->io_flags; + return (zio); } +static void +zio_reset(zio_t *zio) +{ + zio_clear_transform_stack(zio); + + zio->io_flags = zio->io_orig_flags; + zio->io_stage = zio->io_orig_stage; + zio->io_pipeline = zio->io_orig_pipeline; + zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); +} + zio_t * zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, int flags) @@ -417,6 +478,13 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, ASSERT3U(size, ==, BP_GET_LSIZE(bp)); + /* + * If the user has specified that we allow I/Os to continue + * then attempt to satisfy the read. + */ + if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + ZIO_ENTER(spa); + zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); @@ -429,22 +497,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, */ zio->io_bp = &zio->io_bp_copy; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); - - zio_push_transform(zio, cbuf, csize, csize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; - } - - if (BP_IS_GANG(bp)) { - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - zio_push_transform(zio, gbuf, gsize, gsize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; - } - return (zio); } @@ -462,6 +514,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, ASSERT(compress >= ZIO_COMPRESS_OFF && compress < ZIO_COMPRESS_FUNCTIONS); + ZIO_ENTER(spa); + zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); @@ -515,6 +569,16 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, return (zio); } +static void +zio_write_allocate_ready(zio_t *zio) +{ + /* Free up the previous block */ + if (!BP_IS_HOLE(&zio->io_bp_orig)) { + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + &zio->io_bp_orig, NULL, NULL)); + } +} + static zio_t * zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, @@ -533,6 +597,7 @@ zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, zio->io_checksum = checksum; zio->io_compress = ZIO_COMPRESS_OFF; + zio->io_ready = zio_write_allocate_ready; return (zio); } @@ -649,6 +714,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t *zio; blkptr_t blk; + ZIO_ENTER(vd->vdev_spa); + zio_phys_bp_init(vd, &blk, offset, size, checksum); zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, @@ -676,6 +743,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t *zio; blkptr_t blk; + ZIO_ENTER(vd->vdev_spa); + zio_phys_bp_init(vd, &blk, offset, size, checksum); zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, @@ -801,6 +870,7 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) mutex_enter(&pio->io_lock); if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) pio->io_error = zio->io_error; + ASSERT3U(*countp, >, 0); if (--*countp == 0 && pio->io_stalled == stage) { pio->io_stalled = 0; mutex_exit(&pio->io_lock); @@ -825,6 +895,27 @@ zio_wait_children_done(zio_t *zio) } static void +zio_read_init(zio_t *zio) +{ + if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) { + 
uint64_t csize = BP_GET_PSIZE(zio->io_bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; + } + + if (BP_IS_GANG(zio->io_bp)) { + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + zio_push_transform(zio, gbuf, gsize, gsize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; + } + zio_next_stage(zio); +} + +static void zio_ready(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -843,9 +934,151 @@ zio_ready(zio_t *zio) } static void -zio_done(zio_t *zio) +zio_vdev_retry_io(zio_t *zio) { zio_t *pio = zio->io_parent; + + /* + * Preserve the failed bp so that the io_ready() callback can + * update the accounting accordingly. The callback will also be + * responsible for freeing the previously allocated block, if one + * exists. + */ + zio->io_bp_orig = *zio->io_bp; + + /* + * We must zero out the old DVA and blk_birth before reallocating + * the bp. We don't want to do this if this is a rewrite however. + */ + if (!IO_IS_REWRITE(zio)) { + BP_ZERO_DVAS(zio->io_bp); + } + + zio_reset(zio); + + if (pio) { + /* + * Let the parent know that we will + * re-alloc the write (=> new bp info). + */ + mutex_enter(&pio->io_lock); + pio->io_children_notready++; + + /* + * If the parent I/O is still in the open stage, then + * don't bother telling it to retry since it hasn't + * progressed far enough for it to care. + */ + if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) + pio->io_flags |= ZIO_FLAG_WRITE_RETRY; + + ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE); + mutex_exit(&pio->io_lock); + } + + /* + * We are getting ready to process the retry request so clear + * the flag and the zio's current error status. + */ + zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; + zio->io_error = 0; + zio_next_stage_async(zio); +} + +int +zio_vdev_resume_io(spa_t *spa) +{ + zio_t *zio; + + mutex_enter(&spa->spa_zio_lock); + + /* + * Probe all of vdevs that have experienced an I/O error. + * If we are still unable to verify the integrity of the vdev + * then we prevent the resume from proceeeding. + */ + for (zio = list_head(&spa->spa_zio_list); zio != NULL; + zio = list_next(&spa->spa_zio_list, zio)) { + int error = 0; + + /* We only care about I/Os that must succeed */ + if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) + continue; + error = vdev_probe(zio->io_vd); + if (error) { + mutex_exit(&spa->spa_zio_lock); + return (error); + } + } + + /* + * Clear the vdev stats so that I/O can flow. + */ + vdev_clear(spa, NULL, B_FALSE); + + spa->spa_state = POOL_STATE_ACTIVE; + while ((zio = list_head(&spa->spa_zio_list)) != NULL) { + list_remove(&spa->spa_zio_list, zio); + zio->io_error = 0; + + /* + * If we are resuming an allocating I/O then we force it + * to retry and let it resume operation where it left off. + * Otherwise, go back to the ready stage and pick up from + * there. + */ + if (zio_write_retry && IO_IS_ALLOCATING(zio)) { + zio->io_flags |= ZIO_FLAG_WRITE_RETRY; + zio->io_stage--; + } else { + zio->io_stage = ZIO_STAGE_READY; + } + + (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async, + zio, TQ_SLEEP); + } + mutex_exit(&spa->spa_zio_lock); + + /* + * Wait for the taskqs to finish and recheck the pool state since + * it's possible that a resumed I/O has failed again. 
+ */ + taskq_wait(zio_taskq); + if (spa_state(spa) == POOL_STATE_IO_FAILURE) + return (EIO); + + mutex_enter(&spa->spa_zio_lock); + cv_broadcast(&spa->spa_zio_cv); + mutex_exit(&spa->spa_zio_lock); + + return (0); +} + +static void +zio_vdev_suspend_io(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + /* + * We've experienced an unrecoverable failure so + * set the pool state accordingly and queue all + * failed IOs. + */ + spa->spa_state = POOL_STATE_IO_FAILURE; + + mutex_enter(&spa->spa_zio_lock); + list_insert_tail(&spa->spa_zio_list, zio); + +#ifndef _KERNEL + /* Used to notify ztest that the pool has suspended */ + cv_broadcast(&spa->spa_zio_cv); +#endif + mutex_exit(&spa->spa_zio_lock); +} + +static void +zio_assess(zio_t *zio) +{ spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; @@ -868,6 +1101,14 @@ zio_done(zio_t *zio) } } + /* + * Some child I/O has indicated that a retry is necessary, so + * we set an error on the I/O and let the logic below do the + * rest. + */ + if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) + zio->io_error = ERESTART; + if (vd != NULL) vdev_stat_update(zio); @@ -879,8 +1120,7 @@ zio_done(zio_t *zio) * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, vd, zio, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && @@ -890,32 +1130,80 @@ zio_done(zio_t *zio) * appropriately. Also, generate a logical data * ereport. */ - spa_log_error(zio->io_spa, zio); + spa_log_error(spa, zio); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, zio, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); } /* - * For I/O requests that cannot fail, panic appropriately. + * If we are an allocating I/O then we retry on another + * vdev unless the pool is out of space. We handle this + * condition based on the spa's failmode property. + */ + if (zio_write_retry && zio->io_error != ENOSPC && + IO_IS_ALLOCATING(zio) && + zio->io_flags & ZIO_FLAG_WRITE_RETRY) { + zio_vdev_retry_io(zio); + return; + } + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); + + /* + * For I/O requests that cannot fail, we carry out + * the requested behavior based on the failmode pool + * property. + * + * XXX - Need to differentiate between an ENOSPC as + * a result of vdev failures vs. a full pool. */ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { char *blkbuf; +#ifdef ZFS_DEBUG blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); if (blkbuf) { sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp ? bp : &zio->io_bp_copy); } - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " - "%d", zio->io_error == ECKSUM ? + cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p " + "%s): error %d", zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", zio_type_name[zio->io_type], vdev_description(vd), (u_longlong_t)zio->io_offset, - zio, blkbuf ? blkbuf : "", zio->io_error); + (void *)zio, blkbuf ? blkbuf : "", zio->io_error); +#endif + + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { + fm_panic("Pool '%s' has encountered an " + "uncorrectable I/O failure and the " + "failure mode property for this pool " + "is set to panic.", spa_name(spa)); + } else { + cmn_err(CE_WARN, "Pool '%s' has encountered " + "an uncorrectable I/O error. 
Manual " + "intervention is required.", + spa_name(spa)); + zio_vdev_suspend_io(zio); + } + return; + } + } + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); + ASSERT(zio->io_children_notready == 0); + zio_next_stage(zio); +} + +static void +zio_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_children_notready == 0); + ASSERT(zio->io_children_notdone == 0); + zio_clear_transform_stack(zio); if (zio->io_done) @@ -1099,7 +1387,7 @@ zio_get_gang_header(zio_t *zio) zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, NULL, NULL, ZIO_TYPE_READ, zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); + ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); zio_wait_children_done(zio); } @@ -1244,7 +1532,7 @@ zio_write_allocate_gang_member_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static void +static int zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) { blkptr_t *bp = zio->io_bp; @@ -1266,9 +1554,8 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error == ENOSPC) - panic("can't allocate gang block header"); - ASSERT(error == 0); + if (error) + return (error); for (d = 0; d < gbh_ndvas; d++) DVA_SET_GANG(&dva[d], 1); @@ -1296,8 +1583,9 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) if (error == 0) break; ASSERT3U(error, ==, ENOSPC); + /* XXX - free up previous allocations? */ if (maxalloc == SPA_MINBLOCKSIZE) - panic("really out of space"); + return (error); maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); } @@ -1336,6 +1624,7 @@ zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) * to be stable. */ zio_wait_children_done(zio); + return (0); } /* @@ -1358,10 +1647,23 @@ zio_dva_allocate(zio_t *zio) /* For testing, make some blocks above a certain size be gang blocks */ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { - zio_write_allocate_gang_members(zio, mc); + error = zio_write_allocate_gang_members(zio, mc); + if (error) + zio->io_error = error; return; } + /* + * For testing purposes, we force I/Os to retry. We don't allow + * retries beyond the first pass since those I/Os are non-allocating + * writes. We do this after the gang block testing block so that + * they don't inherit the retry flag. + */ + if (zio_io_fail_shift && + spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && + zio_io_should_fail(zio_io_fail_shift)) + zio->io_flags |= ZIO_FLAG_WRITE_RETRY; + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, @@ -1369,11 +1671,11 @@ zio_dva_allocate(zio_t *zio) if (error == 0) { bp->blk_birth = zio->io_txg; - } else if (error == ENOSPC) { - if (zio->io_size == SPA_MINBLOCKSIZE) - panic("really, truly out of space"); - zio_write_allocate_gang_members(zio, mc); - return; + } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + error = zio_write_allocate_gang_members(zio, mc); + if (error == 0) + return; + zio->io_error = error; } else { zio->io_error = error; } @@ -1413,6 +1715,18 @@ zio_vdev_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd ? vd->vdev_top : NULL; blkptr_t *bp = zio->io_bp; uint64_t align; + spa_t *spa = zio->io_spa; + + /* + * If the pool is already in a failure state then just suspend + * this I/O until the problem is resolved. We will reissue it + * at that time.
+ */ + if (spa_state(spa) == POOL_STATE_IO_FAILURE && + zio->io_type == ZIO_TYPE_WRITE) { + zio_vdev_suspend_io(zio); + return; + } if (vd == NULL) { /* The mirror_ops handle multiple DVAs in a single BP */ @@ -1662,6 +1976,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_dva_claim, zio_gang_checksum_generate, zio_ready, + zio_read_init, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, @@ -1669,6 +1984,7 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_checksum_verify, zio_read_gang_members, zio_read_decompress, + zio_assess, zio_done, zio_badop }; @@ -1762,12 +2078,20 @@ zio_next_stage_async(zio_t *zio) } } +void +zio_resubmit_stage_async(void *arg) +{ + zio_t *zio = (zio_t *)(uintptr_t)arg; + + zio_next_stage_async(zio); +} + static boolean_t -zio_alloc_should_fail(void) +zio_io_should_fail(uint16_t range) { static uint16_t allocs = 0; - return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); + return (P2PHASE(allocs++, 1U<<range) == 0); } /* @@ -1781,7 +2105,7 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, spa_config_enter(spa, RW_READER, FTAG); - if (zio_zil_fail_shift && zio_alloc_should_fail()) { + if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { spa_config_exit(spa, FTAG); return (ENOSPC); } |
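Note on the failure-handling flow in the zio.c hunks above: when an I/O that is not marked ZIO_FLAG_CANFAIL hits an uncorrectable error, zio_assess() either calls fm_panic() (failmode=panic) or hands the zio to zio_vdev_suspend_io(), which sets POOL_STATE_IO_FAILURE and queues it on spa_zio_list; zio_vdev_resume_io() later probes the affected vdevs, clears their error state, and re-dispatches the queued zios through the zio taskq. The following is a minimal, hypothetical user-space sketch of that queue-and-resume idea only; it is not code from the patch, none of the types or names (fake_pool, fake_io, assess_io, resume_pool) exist in ZFS, and the continue-mode behavior shown is a simplifying assumption rather than the pipeline's actual semantics.

/*
 * Hypothetical user-space model of the suspend/resume pattern described
 * above. Assumed policy: "continue" returns the error, "wait" queues the
 * I/O until an explicit resume, "panic" aborts the process.
 */
#include <stdio.h>
#include <stdlib.h>

enum failmode { FM_WAIT, FM_CONTINUE, FM_PANIC };

struct fake_io {
	int		id;
	int		error;		/* simulated device error */
	int		can_fail;	/* analogue of ZIO_FLAG_CANFAIL */
	struct fake_io	*next;
};

struct fake_pool {
	enum failmode	failmode;
	int		suspended;
	struct fake_io	*failed;	/* queue of suspended I/Os */
};

/* Queue a must-succeed I/O until the fault is repaired. */
static void
suspend_io(struct fake_pool *p, struct fake_io *io)
{
	io->next = p->failed;
	p->failed = io;
	p->suspended = 1;
	printf("io %d suspended (pool waiting for recovery)\n", io->id);
}

/* Decide what to do with a completed I/O, based on the pool's failmode. */
static void
assess_io(struct fake_pool *p, struct fake_io *io)
{
	if (io->error == 0 || io->can_fail) {
		printf("io %d done, error %d\n", io->id, io->error);
		return;
	}
	switch (p->failmode) {
	case FM_PANIC:
		fprintf(stderr, "unrecoverable I/O failure, aborting\n");
		abort();
	case FM_CONTINUE:
		printf("io %d failed, returning error to caller\n", io->id);
		return;
	case FM_WAIT:
	default:
		suspend_io(p, io);
	}
}

/* Re-issue everything that was queued once the fault is repaired. */
static void
resume_pool(struct fake_pool *p)
{
	struct fake_io *io;

	while ((io = p->failed) != NULL) {
		p->failed = io->next;
		io->error = 0;		/* pretend the retry succeeds */
		printf("io %d re-issued after resume\n", io->id);
	}
	p->suspended = 0;
}

int
main(void)
{
	struct fake_pool pool = { FM_WAIT, 0, NULL };
	struct fake_io a = { 1, 5, 0, NULL };	/* must-succeed, fails */
	struct fake_io b = { 2, 0, 0, NULL };	/* healthy I/O */

	assess_io(&pool, &a);	/* queued on the pool's failed list */
	assess_io(&pool, &b);	/* completes normally */
	resume_pool(&pool);	/* operator clears the fault, queue drains */
	return (0);
}

In the patch itself the same three steps run inside the zio pipeline rather than in helper routines like these: zio_assess() makes the decision, zio_vdev_suspend_io() does the queueing, and zio_vdev_resume_io() performs the re-dispatch.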