diff options
author | eschrock <none@none> | 2007-06-12 13:18:17 -0700 |
---|---|---|
committer | eschrock <none@none> | 2007-06-12 13:18:17 -0700 |
commit | 3d7072f8bd27709dba14f6fe336f149d25d9e207 (patch) | |
tree | d325ae63ce74901b55494e8a0dc011b9e2e13d43 /usr/src/uts/common/fs | |
parent | a5b881a79e40ec2c21d682e676b130a1ee3d2a73 (diff) | |
download | illumos-gate-3d7072f8bd27709dba14f6fe336f149d25d9e207.tar.gz |
PSARC 2007/197 ZFS hotplug
PSARC 2007/283 FMA for ZFS Phase 2
6401126 ZFS DE should verify that diagnosis is still valid before solving cases
6500545 ZFS does not handle changes in devids
6508521 zpool online should warn when it is being used incorrectly
6509807 ZFS checksum ereports are not being posted
6514712 zfs_nicenum() doesn't work with perfectly-sized buffers
6520510 media state doesn't get updated properly on device removal
6520513 ZFS should have better support for device removal
6520514 vdev state should be controlled through a single ioctl()
6520519 ZFS should diagnose faulty devices
6520947 ZFS DE should close cases which no longer apply
6521393 ZFS case timeout should be FMD_TYPE_TIME
6521624 fmd_hash_walk() can dump core when given a bad address
6521946 ZFS DE needlessly subscribes to faults
6522085 ZFS dictionary files contain spelling errors
6523185 vdev_reopen() doesn't correctly propagate state
6523555 'zpool online' should be less chatty unless something goes wrong
6527379 zpool(1M) should not try to open faulted devices
6527700 ZFS should post a sysevent when topology changes
6528194 lofi should support force unmap and DKIO_DEV_GONE
6528732 ZFS should store physical device path in addition to /dev path
6532635 ZFS keeps devices open unnecessarily
6532979 bad argument to ZFS_IOC_VDEV_ATTACH can panic system
6567983 deadlock with spa_scrub_thread() and spa_namespace_lock
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c | 243 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 8 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 9 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h | 8 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 12 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zfs_context.h | 9 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 401 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_cache.c | 20 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_disk.c | 73 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_label.c | 25 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_fm.c | 53 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_ioctl.c | 61 |
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 19 |
13 files changed, 698 insertions, 243 deletions
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index dfdf0c846e..6963bcecab 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -424,6 +424,24 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) } /* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -438,6 +456,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t pool_guid; uint64_t version; zio_t *zio; + uint64_t autoreplace = 0; spa->spa_load_state = state; @@ -711,11 +730,25 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) if (error == 0) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); } /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace) + spa_check_removed(spa->spa_root_vdev); + + /* * Load the vdev state for all toplevel vdevs. 
*/ vdev_load(rvd); @@ -795,7 +828,7 @@ out: * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. + * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some @@ -879,6 +912,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) } spa_open_ref(spa, tag); + + /* + * If we just loaded the pool, resilver anything that's out of date. + */ + if (loaded && (spa_mode & FWRITE)) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + if (locked) mutex_exit(&spa_namespace_lock); @@ -890,12 +930,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) spa_config_exit(spa, FTAG); } - /* - * If we just loaded the pool, resilver anything that's out of date. - */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - return (0); } @@ -1219,7 +1253,7 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) dmu_tx_commit(tx); - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1325,14 +1359,14 @@ spa_import(const char *pool, nvlist_t *config, const char *altroot) */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - mutex_exit(&spa_namespace_lock); - /* * Resilver anything that's out of date. 
*/ if (spa_mode & FWRITE) VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + return (0); } @@ -1476,6 +1510,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) } } + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); @@ -1657,7 +1693,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to + * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) @@ -1685,7 +1721,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1818,9 +1857,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); /* - * Kick off a resilver to update newvd. + * Kick off a resilver to update newvd. We need to grab the namespace + * lock because spa_scrub() needs to post a sysevent with the pool name. */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } @@ -1973,7 +2015,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * Reevaluate the parent vdev state. 
*/ - vdev_propagate_state(cvd->vdev_parent); + vdev_propagate_state(cvd); /* * If the device we just detached was smaller than the others, it may be @@ -1996,6 +2038,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + error = spa_vdev_exit(spa, vd, txg, 0); /* @@ -2098,20 +2142,24 @@ out: } /* - * Find any device that's done replacing, so we can detach it. + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * current spared, so we can detach it. */ static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) +spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; int c; for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } + /* + * Check for a completed replacement. + */ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; @@ -2125,11 +2173,29 @@ spa_vdev_replace_done_hunt(vdev_t *vd) mutex_exit(&newvd->vdev_dtl_lock); } + /* + * Check for a completed resilver with the 'unspare' flag set. 
+ */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + return (NULL); } static void -spa_vdev_replace_done(spa_t *spa) +spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd; vdev_t *pvd; @@ -2138,7 +2204,7 @@ spa_vdev_replace_done(spa_t *spa) spa_config_enter(spa, RW_READER, FTAG); - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; /* * If we have just finished replacing a hot spared device, then @@ -2449,6 +2515,9 @@ spa_scrub_thread(spa_t *spa) vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); spa_errlog_rotate(spa); + if (scrub_type == POOL_SCRUB_RESILVER && complete) + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); @@ -2457,7 +2526,7 @@ spa_scrub_thread(spa_t *spa) * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * If we were told to restart, our final act is to start a new scrub. 
@@ -2568,7 +2637,7 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ if (type == POOL_SCRUB_RESILVER) { type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } } else { /* @@ -2593,6 +2662,8 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) mintxg = ss->ss_start - 1; ss = avl_last(&rvd->vdev_dtl_map.sm_root); maxtxg = MIN(ss->ss_end, maxtxg); + + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } mutex_exit(&rvd->vdev_dtl_lock); @@ -2624,29 +2695,29 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ static void -spa_async_reopen(spa_t *spa) +spa_async_remove(spa_t *spa, vdev_t *vd) { - vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; int c; - spa_config_enter(spa, RW_WRITER, FTAG); - - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_remove_wanted) { + tvd->vdev_remove_wanted = 0; + vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + vdev_clear(spa, tvd); + vdev_config_dirty(tvd->vdev_top); } + spa_async_remove(spa, tvd); } - - spa_config_exit(spa, FTAG); } static void spa_async_thread(spa_t *spa) { int tasks; + uint64_t txg; ASSERT(spa->spa_sync_on); @@ -2665,28 +2736,40 @@ spa_async_thread(spa_t *spa) } /* - * See if any devices need to be reopened. + * See if any devices need to be marked REMOVED. */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); + if (tasks & SPA_ASYNC_REMOVE) { + txg = spa_vdev_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + (void) spa_vdev_exit(spa, NULL, txg, 0); + } /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); /* - * Kick off a scrub. 
+ * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING + * scrub which can become a resilver), we need to hold + * spa_namespace_lock() because the sysevent we post via + * spa_event_notify() needs to get the name of the pool. */ - if (tasks & SPA_ASYNC_SCRUB) + if (tasks & SPA_ASYNC_SCRUB) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Kick off a resilver. */ - if (tasks & SPA_ASYNC_RESILVER) + if (tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Let the world know that we're done. @@ -2810,7 +2893,7 @@ spa_sync_spares(spa_t *spa, dmu_tx_t *tx) /* * Update the MOS nvlist describing the list of available spares. * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. + * valid and the vdevs are labeled appropriately. 
*/ if (spa->spa_spares_object == 0) { spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, @@ -2869,6 +2952,7 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair_t *nvpair; objset_t *mos = spa->spa_meta_objset; uint64_t zapobj; + uint64_t intval; mutex_enter(&spa->spa_props_lock); if (spa->spa_pool_props_object == 0) { @@ -2886,14 +2970,23 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: VERIFY(nvlist_lookup_uint64(nvp, nvpair_name(nvpair), &spa->spa_bootfs) == 0); VERIFY(zap_update(mos, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, &spa->spa_bootfs, tx) == 0); break; + + case ZPOOL_PROP_AUTOREPLACE: + VERIFY(nvlist_lookup_uint64(nvp, + nvpair_name(nvpair), &intval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, + &intval, tx) == 0); + break; } } } @@ -3191,7 +3284,7 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) zap_attribute_t za; objset_t *mos = spa->spa_meta_objset; zfs_source_t src; - zfs_prop_t prop; + zpool_prop_t prop; nvlist_t *propval; uint64_t value; int err; @@ -3215,14 +3308,14 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (za.za_integer_length) { case 8: - if (zfs_prop_default_numeric(prop) == + if (zpool_prop_default_numeric(prop) == za.za_first_integer) src = ZFS_SRC_DEFAULT; else src = ZFS_SRC_LOCAL; value = za.za_first_integer; - if (prop == ZFS_PROP_BOOTFS) { + if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; char strval[MAXPATHLEN]; @@ -3274,7 +3367,61 @@ spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, 
spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. + */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } + } + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 3e51849766..c08c58cffe 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -590,13 +590,15 @@ spa_config_held(spa_t *spa, krw_t rw) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa_namespace_lock); + /* - * Suspend scrub 
activity while we mess with the config. + * Suspend scrub activity while we mess with the config. We must do + * this after acquiring the namespace lock to avoid a 3-way deadlock + * with spa_scrub_stop() and the scrub thread. */ spa_scrub_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, RW_WRITER, spa); return (spa_last_synced_txg(spa) + 1); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 2bcf4c8a32..8c2a286847 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -330,8 +330,8 @@ extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_REMOVE 0x01 +#define SPA_ASYNC_RESILVER_DONE 0x02 #define SPA_ASYNC_SCRUB 0x04 #define SPA_ASYNC_RESILVER 0x08 #define SPA_ASYNC_CONFIG_UPDATE 0x10 @@ -452,6 +452,8 @@ extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); @@ -469,6 +471,9 @@ extern int spa_get_props(spa_t *spa, nvlist_t **nvp); extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern boolean_t spa_has_bootfs(spa_t *spa); +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); + #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 3120811625..c651d1eebb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -84,8 +84,11 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern void vdev_io_start(zio_t *zio); extern void vdev_io_done(zio_t *zio); -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); @@ -95,6 +98,7 @@ extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); extern int vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 0891fcc0ad..4e83497420 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -140,7 +140,7 @@ struct vdev { txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ + boolean_t vdev_remove_wanted; /* async remove wanted? 
*/ list_node_t vdev_dirty_node; /* config dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ @@ -151,14 +151,17 @@ struct vdev { space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ uint64_t vdev_fault_arg; /* fault injection paramater */ int vdev_fault_mask; /* zio types to fault */ uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? 
*/ uint64_t vdev_isspare; /* was a hot spare */ @@ -167,6 +170,9 @@ struct vdev { uint64_t vdev_not_present; /* not present during import */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index 2f0e3e792d..8a689e0760 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -61,6 +60,8 @@ extern "C" { #include <sys/zone.h> #include <sys/uio.h> #include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fbb77774c2..9b2ec04710 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. 
@@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the nparity propery for RAID-Z vdevs. @@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -500,6 +487,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -507,6 +495,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + ASSERT(!list_link_active(&vd->vdev_dirty_node)); /* @@ -535,7 +524,37 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. 
+ */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; } @@ -781,13 +797,12 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -798,16 +813,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 
vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -905,8 +929,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. 
*/ (void) vdev_validate(vd); /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd) return (vd->vdev_ops->vdev_op_type); } +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid) if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted cause the toplevel vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (!vdev_is_dead(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. 
+ */ +int +vdev_degrade(spa_t *spa, uint64_t guid) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("ONLINE: %s\n", vdev_description(vd)); + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) { + (void) spa_vdev_exit(spa, NULL, txg, 0); + return (0); + } + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *newstate) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? + B_TRUE : B_FALSE; + vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? 
+ B_TRUE : B_FALSE; vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; vdev_config_dirty(vd->vdev_top); (void) spa_vdev_exit(spa, NULL, txg, 0); + /* + * Must hold spa_namespace_lock in order to post resilver sysevent + * w/pool name. + */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *rvd, *vd; uint64_t txg; @@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("OFFLINE: %s\n", vdev_description(vd)); - /* * If the device isn't already offline, try to offline it. */ @@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? + B_TRUE : B_FALSE; vdev_config_dirty(vd->vdev_top); @@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd) for (c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state, then clear the persistent state and + * attempt to reopen the device. We also mark the vdev config dirty, so + * that the new faulted state is written out to disk. 
+ */ + if (vd->vdev_faulted || vd->vdev_degraded) { + vd->vdev_faulted = vd->vdev_degraded = 0; + vdev_reopen(vd); + vdev_config_dirty(vd->vdev_top); + + if (vd->vdev_faulted) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, + B_TRUE) == 0); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } int vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } int @@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio) break; } - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } - return (error); } @@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (vdev_is_dead(child)) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. 
- */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; - if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device. Otherwise, we keep accessible + * but invalid devices open forever. We don't call vdev_close() itself, + * because that implies some extra checks (offline, etc) that we don't + * want here. This is limited to leaf devices, because otherwise + * closing the device will affect other children. + */ + if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. 
+ */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to @@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. 
*/ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && vd != vd->vdev_spa->spa_root_vdev) { const char *class; @@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_ereport_post(class, vd->vdev_spa, vd, NULL, save_state, 0); } - } - if (isopen) - return; + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); + if (!isopen) + vdev_propagate_state(vd); } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index 2d8795c660..d7d8755f92 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -352,6 +352,18 @@ vdev_cache_write(zio_t *zio) } void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void vdev_cache_init(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; @@ -371,12 +383,8 @@ void vdev_cache_fini(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); + vdev_cache_purge(vd); avl_destroy(&vc->vc_offset_tree); avl_destroy(&vc->vc_lastused_tree); diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index b965b1c5f0..5789312667 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,6 +50,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_disk_t *dvd; struct dk_minfo dkm; int error; + dev_t dev; + char *physpath, *minorname; + int otyp; /* * We must have a pathname, and it must be absolute. @@ -141,12 +144,57 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode, kcred, &dvd->vd_lh, zfs_li); + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. 
+ */ + if (error) { + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + } + if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { @@ -191,10 +239,6 @@ vdev_disk_close(vdev_t *vd) if (dvd == NULL) return; - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? 
vd->vdev_devid : "<none>"); - if (dvd->vd_minor != NULL) ddi_devid_str_free(dvd->vd_minor); @@ -340,6 +384,10 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -348,6 +396,21 @@ vdev_disk_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. + */ + if (zio->io_error == EIO) { + state = DKIO_NONE; + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && + state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 9d9f5556fa..f7c51a1594 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -62,7 +62,7 @@ * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. 
Assuming we have - * labels and an uberblock with the following transacation groups: + * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ @@ -209,6 +209,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid) == 0); + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -285,9 +289,18 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); } return (nv); @@ -496,7 +509,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. + * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); @@ -605,7 +618,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it + * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. 
*/ if (error == 0 && !vd->vdev_isspare && diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index a886d614d8..146c4ec438 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -117,9 +117,11 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, /* * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. + * only generate errors from the final failure. Checksum errors are + * generated after the pipeline stage responsible for retrying the I/O + * (VDEV_IO_ASSESS), so this only applies to standard I/O errors. */ - if (zio && zio_should_retry(zio)) + if (zio && zio_should_retry(zio) && zio->io_error != ECKSUM) return; /* @@ -292,13 +294,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. This will - * then trigger the DE to close the associated case, if any. 
- */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL nvlist_t *resource; @@ -308,7 +305,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) return; (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); + ZFS_ERROR_CLASS, name); VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); VERIFY(nvlist_add_uint64(resource, @@ -322,3 +319,37 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) fm_nvlist_destroy(resource, FM_NVA_FREE); #endif } + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_OK); +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. 
+ */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index fccfc1355e..74d033001b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -439,7 +439,9 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); + spa_config_enter(spa, RW_READER, FTAG); error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + spa_config_exit(spa, FTAG); spa_close(spa, FTAG); @@ -618,28 +620,35 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } @@ -1096,7 +1105,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) } switch (prop) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: /* * A bootable filesystem can not be on a RAIDZ pool * nor a striped pool with more than 1 device. 
@@ -1115,8 +1124,8 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) VERIFY(nvpair_value_string(elem, &strval) == 0); if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); break; } @@ -1126,9 +1135,6 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) objnum = dmu_objset_id(os); dmu_objset_close(os); break; - - default: - error = EINVAL; } if (error) @@ -1137,10 +1143,11 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (error == 0) { if (reset_bootfs) { VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING) == 0); VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), + objnum) == 0); } error = spa_set_props(spa, nvl); } @@ -1565,23 +1572,24 @@ zfs_ioc_clear(zfs_cmd_t *zc) spa_t *spa; vdev_t *vd; int error; + uint64_t txg; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_config_enter(spa, RW_WRITER, FTAG); + txg = spa_vdev_enter(spa); if (zc->zc_guid == 0) { vd = NULL; } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, ENODEV); spa_close(spa, FTAG); return (ENODEV); } vdev_clear(spa, vd); - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, 0); spa_close(spa, FTAG); @@ -1620,8 +1628,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_setpath, 
zfs_secpolicy_config, pool_name }, diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 42c30d7edd..130e697d60 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1519,25 +1519,6 @@ zio_vdev_io_assess(zio_t *zio) return; } - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } - zio_next_stage(zio); } |