author    | eschrock <none@none> | 2007-06-12 13:18:17 -0700
committer | eschrock <none@none> | 2007-06-12 13:18:17 -0700
commit    | 3d7072f8bd27709dba14f6fe336f149d25d9e207 (patch)
tree      | d325ae63ce74901b55494e8a0dc011b9e2e13d43 /usr/src/uts/common
parent    | a5b881a79e40ec2c21d682e676b130a1ee3d2a73 (diff)
download  | illumos-joyent-3d7072f8bd27709dba14f6fe336f149d25d9e207.tar.gz
PSARC 2007/197 ZFS hotplug
PSARC 2007/283 FMA for ZFS Phase 2
6401126 ZFS DE should verify that diagnosis is still valid before solving cases
6500545 ZFS does not handle changes in devids
6508521 zpool online should warn when it is being used incorrectly
6509807 ZFS checksum ereports are not being posted
6514712 zfs_nicenum() doesn't work with perfectly-sized buffers
6520510 media state doesn't get updated properly on device removal
6520513 ZFS should have better support for device removal
6520514 vdev state should be controlled through a single ioctl()
6520519 ZFS should diagnose faulty devices
6520947 ZFS DE should close cases which no longer apply
6521393 ZFS case timeout should be FMD_TYPE_TIME
6521624 fmd_hash_walk() can dump core when given a bad address
6521946 ZFS DE needlessly subscribes to faults
6522085 ZFS dictionary files contain spelling errors
6523185 vdev_reopen() doesn't correctly propagate state
6523555 'zpool online' should be less chatty unless something goes wrong
6527379 zpool(1M) should not try to open faulted devices
6527700 ZFS should post a sysevent when topology changes
6528194 lofi should support force unmap and DKIO_DEV_GONE
6528732 ZFS should store physical device path in addition to /dev path
6532635 ZFS keeps devices open unnecessarily
6532979 bad argument to ZFS_IOC_VDEV_ATTACH can panic system
6567983 deadlock with spa_scrub_thread() and spa_namespace_lock
Diffstat (limited to 'usr/src/uts/common')
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c             | 243
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c        |   8
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h         |   9
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h        |   8
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h   |  12
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zfs_context.h |   9
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c            | 401
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_cache.c      |  20
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_disk.c       |  73
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_label.c      |  25
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_fm.c          |  53
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_ioctl.c       |  61
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c             |  19
-rw-r--r-- | usr/src/uts/common/io/lofi.c                | 192
-rw-r--r-- | usr/src/uts/common/io/scsi/targets/sd.c     | 263
-rw-r--r-- | usr/src/uts/common/sys/dditypes.h           |   5
-rw-r--r-- | usr/src/uts/common/sys/fm/fs/zfs.h          |   4
-rw-r--r-- | usr/src/uts/common/sys/fs/zfs.h             |  69
-rw-r--r-- | usr/src/uts/common/sys/lofi.h               |  26
-rw-r--r-- | usr/src/uts/common/sys/sysevent/eventdefs.h |  12
20 files changed, 1082 insertions, 430 deletions
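The kernel side of this change posts the new EC_ZFS sysevents through spa_event_notify() (see the spa.c diff below). Not part of the commit: a minimal sketch of a privileged userland subscriber for those events, built on the documented libsysevent(3LIB) calls. The file name watch_zfs.c and the printed output are illustrative assumptions; EC_ZFS and the ZFS_EV_* attribute names come from the headers this commit touches (sys/sysevent/eventdefs.h, sys/fm/fs/zfs.h).

    /*
     * Illustrative only -- not part of this commit.  A privileged daemon
     * that subscribes to the EC_ZFS sysevents posted by spa_event_notify().
     * Build (as an assumption): cc watch_zfs.c -lsysevent -lnvpair
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <libsysevent.h>
    #include <libnvpair.h>
    #include <sys/sysevent/eventdefs.h>
    #include <sys/fm/fs/zfs.h>

    static void
    zfs_event_handler(sysevent_t *ev)
    {
            nvlist_t *attr;
            char *pool = NULL;

            /* Subclass is e.g. ESC_ZFS_VDEV_REMOVE or ESC_ZFS_RESILVER_FINISH */
            (void) printf("event %s.%s\n", sysevent_get_class_name(ev),
                sysevent_get_subclass_name(ev));

            if (sysevent_get_attr_list(ev, &attr) == 0) {
                    if (nvlist_lookup_string(attr, ZFS_EV_POOL_NAME, &pool) == 0)
                            (void) printf("\tpool: %s\n", pool);
                    nvlist_free(attr);
            }
    }

    int
    main(void)
    {
            sysevent_handle_t *shp;
            const char *subclasses[] = { EC_SUB_ALL };  /* all ESC_ZFS_* events */

            if ((shp = sysevent_bind_handle(zfs_event_handler)) == NULL) {
                    perror("sysevent_bind_handle");     /* requires privilege */
                    return (1);
            }
            if (sysevent_subscribe_event(shp, EC_ZFS, subclasses, 1) != 0) {
                    perror("sysevent_subscribe_event");
                    return (1);
            }
            (void) pause();     /* events arrive on libsysevent's thread */
            return (0);
    }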
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index dfdf0c846e..6963bcecab 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -424,6 +424,24 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) } /* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -438,6 +456,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t pool_guid; uint64_t version; zio_t *zio; + uint64_t autoreplace = 0; spa->spa_load_state = state; @@ -711,11 +730,25 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) if (error == 0) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); } /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace) + spa_check_removed(spa->spa_root_vdev); + + /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); @@ -795,7 +828,7 @@ out: * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. + * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some @@ -879,6 +912,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) } spa_open_ref(spa, tag); + + /* + * If we just loaded the pool, resilver anything that's out of date. + */ + if (loaded && (spa_mode & FWRITE)) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + if (locked) mutex_exit(&spa_namespace_lock); @@ -890,12 +930,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) spa_config_exit(spa, FTAG); } - /* - * If we just loaded the pool, resilver anything that's out of date. 
- */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - return (0); } @@ -1219,7 +1253,7 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) dmu_tx_commit(tx); - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1325,14 +1359,14 @@ spa_import(const char *pool, nvlist_t *config, const char *altroot) */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - mutex_exit(&spa_namespace_lock); - /* * Resilver anything that's out of date. */ if (spa_mode & FWRITE) VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + return (0); } @@ -1476,6 +1510,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) } } + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); @@ -1657,7 +1693,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to + * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) @@ -1685,7 +1721,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1818,9 +1857,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); /* - * Kick off a resilver to update newvd. + * Kick off a resilver to update newvd. We need to grab the namespace + * lock because spa_scrub() needs to post a sysevent with the pool name. */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } @@ -1973,7 +2015,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * Reevaluate the parent vdev state. */ - vdev_propagate_state(cvd->vdev_parent); + vdev_propagate_state(cvd); /* * If the device we just detached was smaller than the others, it may be @@ -1996,6 +2038,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + error = spa_vdev_exit(spa, vd, txg, 0); /* @@ -2098,20 +2142,24 @@ out: } /* - * Find any device that's done replacing, so we can detach it. + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * current spared, so we can detach it. 
*/ static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) +spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; int c; for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } + /* + * Check for a completed replacement. + */ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; @@ -2125,11 +2173,29 @@ spa_vdev_replace_done_hunt(vdev_t *vd) mutex_exit(&newvd->vdev_dtl_lock); } + /* + * Check for a completed resilver with the 'unspare' flag set. + */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + return (NULL); } static void -spa_vdev_replace_done(spa_t *spa) +spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd; vdev_t *pvd; @@ -2138,7 +2204,7 @@ spa_vdev_replace_done(spa_t *spa) spa_config_enter(spa, RW_READER, FTAG); - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; /* * If we have just finished replacing a hot spared device, then @@ -2449,6 +2515,9 @@ spa_scrub_thread(spa_t *spa) vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); spa_errlog_rotate(spa); + if (scrub_type == POOL_SCRUB_RESILVER && complete) + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); @@ -2457,7 +2526,7 @@ spa_scrub_thread(spa_t *spa) * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * If we were told to restart, our final act is to start a new scrub. 
@@ -2568,7 +2637,7 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ if (type == POOL_SCRUB_RESILVER) { type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } } else { /* @@ -2593,6 +2662,8 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) mintxg = ss->ss_start - 1; ss = avl_last(&rvd->vdev_dtl_map.sm_root); maxtxg = MIN(ss->ss_end, maxtxg); + + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } mutex_exit(&rvd->vdev_dtl_lock); @@ -2624,29 +2695,29 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ static void -spa_async_reopen(spa_t *spa) +spa_async_remove(spa_t *spa, vdev_t *vd) { - vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; int c; - spa_config_enter(spa, RW_WRITER, FTAG); - - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_remove_wanted) { + tvd->vdev_remove_wanted = 0; + vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + vdev_clear(spa, tvd); + vdev_config_dirty(tvd->vdev_top); } + spa_async_remove(spa, tvd); } - - spa_config_exit(spa, FTAG); } static void spa_async_thread(spa_t *spa) { int tasks; + uint64_t txg; ASSERT(spa->spa_sync_on); @@ -2665,28 +2736,40 @@ spa_async_thread(spa_t *spa) } /* - * See if any devices need to be reopened. + * See if any devices need to be marked REMOVED. */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); + if (tasks & SPA_ASYNC_REMOVE) { + txg = spa_vdev_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + (void) spa_vdev_exit(spa, NULL, txg, 0); + } /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); /* - * Kick off a scrub. + * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING + * scrub which can become a resilver), we need to hold + * spa_namespace_lock() because the sysevent we post via + * spa_event_notify() needs to get the name of the pool. */ - if (tasks & SPA_ASYNC_SCRUB) + if (tasks & SPA_ASYNC_SCRUB) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Kick off a resilver. */ - if (tasks & SPA_ASYNC_RESILVER) + if (tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Let the world know that we're done. @@ -2810,7 +2893,7 @@ spa_sync_spares(spa_t *spa, dmu_tx_t *tx) /* * Update the MOS nvlist describing the list of available spares. * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. + * valid and the vdevs are labeled appropriately. 
*/ if (spa->spa_spares_object == 0) { spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, @@ -2869,6 +2952,7 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair_t *nvpair; objset_t *mos = spa->spa_meta_objset; uint64_t zapobj; + uint64_t intval; mutex_enter(&spa->spa_props_lock); if (spa->spa_pool_props_object == 0) { @@ -2886,14 +2970,23 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: VERIFY(nvlist_lookup_uint64(nvp, nvpair_name(nvpair), &spa->spa_bootfs) == 0); VERIFY(zap_update(mos, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, &spa->spa_bootfs, tx) == 0); break; + + case ZPOOL_PROP_AUTOREPLACE: + VERIFY(nvlist_lookup_uint64(nvp, + nvpair_name(nvpair), &intval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, + &intval, tx) == 0); + break; } } } @@ -3191,7 +3284,7 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) zap_attribute_t za; objset_t *mos = spa->spa_meta_objset; zfs_source_t src; - zfs_prop_t prop; + zpool_prop_t prop; nvlist_t *propval; uint64_t value; int err; @@ -3215,14 +3308,14 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (za.za_integer_length) { case 8: - if (zfs_prop_default_numeric(prop) == + if (zpool_prop_default_numeric(prop) == za.za_first_integer) src = ZFS_SRC_DEFAULT; else src = ZFS_SRC_LOCAL; value = za.za_first_integer; - if (prop == ZFS_PROP_BOOTFS) { + if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; char strval[MAXPATHLEN]; @@ -3274,7 +3367,61 @@ spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. 
+ */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } + } + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 3e51849766..c08c58cffe 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -590,13 +590,15 @@ spa_config_held(spa_t *spa, krw_t rw) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa_namespace_lock); + /* - * Suspend scrub activity while we mess with the config. + * Suspend scrub activity while we mess with the config. We must do + * this after acquiring the namespace lock to avoid a 3-way deadlock + * with spa_scrub_stop() and the scrub thread. */ spa_scrub_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, RW_WRITER, spa); return (spa_last_synced_txg(spa) + 1); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 2bcf4c8a32..8c2a286847 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -330,8 +330,8 @@ extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_REMOVE 0x01 +#define SPA_ASYNC_RESILVER_DONE 0x02 #define SPA_ASYNC_SCRUB 0x04 #define SPA_ASYNC_RESILVER 0x08 #define SPA_ASYNC_CONFIG_UPDATE 0x10 @@ -452,6 +452,8 @@ extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); @@ -469,6 +471,9 @@ extern int spa_get_props(spa_t *spa, nvlist_t **nvp); extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern boolean_t spa_has_bootfs(spa_t *spa); +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); + #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 3120811625..c651d1eebb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -84,8 +84,11 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern void vdev_io_start(zio_t *zio); extern void vdev_io_done(zio_t *zio); -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); @@ -95,6 +98,7 @@ extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); extern int vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 0891fcc0ad..4e83497420 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -140,7 +140,7 @@ struct vdev { txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ + boolean_t vdev_remove_wanted; /* async remove wanted? */ list_node_t vdev_dirty_node; /* config dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ @@ -151,14 +151,17 @@ struct vdev { space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ uint64_t vdev_fault_arg; /* fault injection paramater */ int vdev_fault_mask; /* zio types to fault */ uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? 
*/ uint64_t vdev_isspare; /* was a hot spare */ @@ -167,6 +170,9 @@ struct vdev { uint64_t vdev_not_present; /* not present during import */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index 2f0e3e792d..8a689e0760 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,6 +60,8 @@ extern "C" { #include <sys/zone.h> #include <sys/uio.h> #include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fbb77774c2..9b2ec04710 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. @@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the nparity propery for RAID-Z vdevs. 
@@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -500,6 +487,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -507,6 +495,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + ASSERT(!list_link_active(&vd->vdev_dirty_node)); /* @@ -535,7 +524,37 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; } @@ -781,13 +797,12 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -798,16 +813,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + 
ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -905,8 +929,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. */ (void) vdev_validate(vd); /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd) return (vd->vdev_ops->vdev_op_type); } +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid) if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted cause the toplevel vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (!vdev_is_dead(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. 
The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("ONLINE: %s\n", vdev_description(vd)); + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) { + (void) spa_vdev_exit(spa, NULL, txg, 0); + return (0); + } + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *newstate) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? + B_TRUE : B_FALSE; + vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? + B_TRUE : B_FALSE; vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; vdev_config_dirty(vd->vdev_top); (void) spa_vdev_exit(spa, NULL, txg, 0); + /* + * Must hold spa_namespace_lock in order to post resilver sysevent + * w/pool name. + */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *rvd, *vd; uint64_t txg; @@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("OFFLINE: %s\n", vdev_description(vd)); - /* * If the device isn't already offline, try to offline it. */ @@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? + B_TRUE : B_FALSE; vdev_config_dirty(vd->vdev_top); @@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd) for (c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state, then clear the persistent state and + * attempt to reopen the device. We also mark the vdev config dirty, so + * that the new faulted state is written out to disk. 
+ */ + if (vd->vdev_faulted || vd->vdev_degraded) { + vd->vdev_faulted = vd->vdev_degraded = 0; + vdev_reopen(vd); + vdev_config_dirty(vd->vdev_top); + + if (vd->vdev_faulted) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, + B_TRUE) == 0); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } int vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } int @@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio) break; } - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } - return (error); } @@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (vdev_is_dead(child)) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; - if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device. Otherwise, we keep accessible + * but invalid devices open forever. We don't call vdev_close() itself, + * because that implies some extra checks (offline, etc) that we don't + * want here. This is limited to leaf devices, because otherwise + * closing the device will affect other children. + */ + if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. 
+ */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to @@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. */ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && vd != vd->vdev_spa->spa_root_vdev) { const char *class; @@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_ereport_post(class, vd->vdev_spa, vd, NULL, save_state, 0); } - } - if (isopen) - return; + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); + if (!isopen) + vdev_propagate_state(vd); } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index 2d8795c660..d7d8755f92 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -352,6 +352,18 @@ vdev_cache_write(zio_t *zio) } void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void vdev_cache_init(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; @@ -371,12 +383,8 @@ void vdev_cache_fini(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); + vdev_cache_purge(vd); avl_destroy(&vc->vc_offset_tree); avl_destroy(&vc->vc_lastused_tree); diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index b965b1c5f0..5789312667 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -50,6 +50,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_disk_t *dvd; struct dk_minfo dkm; int error; + dev_t dev; + char *physpath, *minorname; + int otyp; /* * We must have a pathname, and it must be absolute. @@ -141,12 +144,57 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode, kcred, &dvd->vd_lh, zfs_li); + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + } + if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { @@ -191,10 +239,6 @@ vdev_disk_close(vdev_t *vd) if (dvd == NULL) return; - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? vd->vdev_devid : "<none>"); - if (dvd->vd_minor != NULL) ddi_devid_str_free(dvd->vd_minor); @@ -340,6 +384,10 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -348,6 +396,21 @@ vdev_disk_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. 
+ */ + if (zio->io_error == EIO) { + state = DKIO_NONE; + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && + state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 9d9f5556fa..f7c51a1594 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -62,7 +62,7 @@ * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transacation groups: + * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ @@ -209,6 +209,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid) == 0); + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -285,9 +289,18 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); } return (nv); @@ -496,7 +509,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. + * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); @@ -605,7 +618,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it + * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. */ if (error == 0 && !vd->vdev_isspare && diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index a886d614d8..146c4ec438 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -117,9 +117,11 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, /* * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. + * only generate errors from the final failure. 
Checksum errors are + * generated after the pipeline stage responsible for retrying the I/O + * (VDEV_IO_ASSESS), so this only applies to standard I/O errors. */ - if (zio && zio_should_retry(zio)) + if (zio && zio_should_retry(zio) && zio->io_error != ECKSUM) return; /* @@ -292,13 +294,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. This will - * then trigger the DE to close the associated case, if any. - */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL nvlist_t *resource; @@ -308,7 +305,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) return; (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); + ZFS_ERROR_CLASS, name); VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); VERIFY(nvlist_add_uint64(resource, @@ -322,3 +319,37 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) fm_nvlist_destroy(resource, FM_NVA_FREE); #endif } + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_OK); +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. 
+ */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index fccfc1355e..74d033001b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -439,7 +439,9 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); + spa_config_enter(spa, RW_READER, FTAG); error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + spa_config_exit(spa, FTAG); spa_close(spa, FTAG); @@ -618,28 +620,35 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } @@ -1096,7 +1105,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) } switch (prop) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: /* * A bootable filesystem can not be on a RAIDZ pool * nor a striped pool with more than 1 device. 
@@ -1115,8 +1124,8 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) VERIFY(nvpair_value_string(elem, &strval) == 0); if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); break; } @@ -1126,9 +1135,6 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) objnum = dmu_objset_id(os); dmu_objset_close(os); break; - - default: - error = EINVAL; } if (error) @@ -1137,10 +1143,11 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (error == 0) { if (reset_bootfs) { VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING) == 0); VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), + objnum) == 0); } error = spa_set_props(spa, nvl); } @@ -1565,23 +1572,24 @@ zfs_ioc_clear(zfs_cmd_t *zc) spa_t *spa; vdev_t *vd; int error; + uint64_t txg; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_config_enter(spa, RW_WRITER, FTAG); + txg = spa_vdev_enter(spa); if (zc->zc_guid == 0) { vd = NULL; } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, ENODEV); spa_close(spa, FTAG); return (ENODEV); } vdev_clear(spa, vd); - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, 0); spa_close(spa, FTAG); @@ -1620,8 +1628,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 42c30d7edd..130e697d60 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1519,25 +1519,6 @@ zio_vdev_io_assess(zio_t *zio) return; } - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } - zio_next_stage(zio); } diff --git a/usr/src/uts/common/io/lofi.c b/usr/src/uts/common/io/lofi.c index 4af7fe70b4..1a068ef3ee 100644 --- a/usr/src/uts/common/io/lofi.c +++ b/usr/src/uts/common/io/lofi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,6 +59,14 @@ * controller to talk to, and that didn't seem easy to fake. Or possibly even * necessary, since we have mkfs_pcfs now). 
* + * Normally, a lofi device cannot be detached if it is open (i.e. busy). To + * support simulation of hotplug events, an optional force flag is provided. + * If a lofi device is open when a force detach is requested, then the + * underlying file is closed and any subsequent operations return EIO. When the + * device is closed for the last time, it will be cleaned up at that time. In + * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is + * detached but not removed. + * * Known problems: * * UFS logging. Mounting a UFS filesystem image "logging" @@ -207,7 +215,38 @@ mark_closed(struct lofi_state *lsp, int otyp) } } -/*ARGSUSED3*/ +static void +lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp, + cred_t *credp) +{ + dev_t newdev; + char namebuf[50]; + + if (lsp->ls_vp) { + (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp); + VN_RELE(lsp->ls_vp); + lsp->ls_vp = NULL; + } + + newdev = makedevice(getmajor(dev), minor); + (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); + (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); + + (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); + ddi_remove_minor_node(lofi_dip, namebuf); + (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); + ddi_remove_minor_node(lofi_dip, namebuf); + + kmem_free(lsp->ls_filename, lsp->ls_filename_sz); + taskq_destroy(lsp->ls_taskq); + if (lsp->ls_kstat) { + kstat_delete(lsp->ls_kstat); + mutex_destroy(&lsp->ls_kstat_lock); + } + ddi_soft_state_free(lofi_statep, minor); +} + +/*ARGSUSED*/ static int lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) { @@ -244,6 +283,11 @@ lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) return (EINVAL); } + if (lsp->ls_vp == NULL) { + mutex_exit(&lofi_lock); + return (ENXIO); + } + if (mark_opened(lsp, otyp) == -1) { mutex_exit(&lofi_lock); return (EINVAL); @@ -253,16 +297,13 @@ lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) return (0); } -/*ARGSUSED3*/ +/*ARGSUSED*/ static int lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) { minor_t minor; struct lofi_state *lsp; -#ifdef lint - flag = flag; -#endif mutex_enter(&lofi_lock); minor = getminor(dev); lsp = ddi_get_soft_state(lofi_statep, minor); @@ -271,6 +312,13 @@ lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) return (EINVAL); } mark_closed(lsp, otyp); + + /* + * If we have forcibly closed the underlying device, and this is the + * last close, then tear down the rest of the device. + */ + if (minor != 0 && lsp->ls_vp == NULL && !is_opened(lsp)) + lofi_free_handle(dev, minor, lsp, credp); mutex_exit(&lofi_lock); return (0); } @@ -312,7 +360,9 @@ lofi_strategy_task(void *arg) * we have the rw_lock. So instead we page, unless it's not * mapable or it's a character device. */ - if (((lsp->ls_vp->v_flag & VNOMAP) == 0) && + if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { + error = EIO; + } else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) && (lsp->ls_vp->v_type != VCHR)) { /* * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on @@ -400,6 +450,12 @@ lofi_strategy_task(void *arg) kstat_runq_exit(kioptr); mutex_exit(lsp->ls_kstat->ks_lock); } + + mutex_enter(&lsp->ls_vp_lock); + if (--lsp->ls_vp_iocount == 0) + cv_broadcast(&lsp->ls_vp_cv); + mutex_exit(&lsp->ls_vp_lock); + bioerror(bp, error); biodone(bp); } @@ -422,6 +478,14 @@ lofi_strategy(struct buf *bp) * queues were incredibly easy so they win. 
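Taken together, lofi_free_handle(), the ls_vp == NULL check in lofi_open(), and the last-close teardown in lofi_close() give lofi the same detach-while-open semantics as a yanked disk. From userland the new behavior would be triggered roughly like this (a hypothetical lofiadm-style sketch; LOFI_UNMAP_FILE_MINOR and /dev/lofictl are assumed from the existing lofi interface, li_force is the flag this patch adds to struct lofi_ioctl in sys/lofi.h below):

    #include <sys/types.h>
    #include <sys/lofi.h>
    #include <fcntl.h>
    #include <errno.h>
    #include <unistd.h>
    #include <stropts.h>    /* ioctl() */

    static int
    lofi_force_unmap(uint32_t minor)
    {
        struct lofi_ioctl li = { 0 };
        int fd, err = 0;

        if ((fd = open("/dev/lofictl", O_RDWR)) < 0)
            return (errno);

        li.li_minor = minor;
        li.li_force = B_TRUE;   /* close the vnode even if busy */
        if (ioctl(fd, LOFI_UNMAP_FILE_MINOR, &li) != 0)
            err = errno;
        (void) close(fd);
        return (err);
    }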
*/ lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev)); + mutex_enter(&lsp->ls_vp_lock); + if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { + bioerror(bp, EIO); + biodone(bp); + mutex_exit(&lsp->ls_vp_lock); + return (0); + } + offset = bp->b_lblkno * DEV_BSIZE; /* offset within file */ if (offset == lsp->ls_vp_size) { /* EOF */ @@ -433,13 +497,18 @@ lofi_strategy(struct buf *bp) bioerror(bp, ENXIO); } biodone(bp); + mutex_exit(&lsp->ls_vp_lock); return (0); } if (offset > lsp->ls_vp_size) { bioerror(bp, ENXIO); biodone(bp); + mutex_exit(&lsp->ls_vp_lock); return (0); } + lsp->ls_vp_iocount++; + mutex_exit(&lsp->ls_vp_lock); + if (lsp->ls_kstat) { mutex_enter(lsp->ls_kstat->ks_lock); kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); @@ -720,15 +789,15 @@ lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, struct lofi_state *lsp; struct lofi_ioctl *klip; int error; - char namebuf[50]; struct vnode *vp; int64_t Nblocks_prop_val; int64_t Size_prop_val; vattr_t vattr; int flag; enum vtype v_type; - dev_t newdev; int zalloced = 0; + dev_t newdev; + char namebuf[50]; klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) @@ -846,6 +915,9 @@ lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; kstat_install(lsp->ls_kstat); } + cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); + mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); + /* * save open mode so file can be closed properly and vnode counts * updated correctly. @@ -911,8 +983,6 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, struct lofi_state *lsp; struct lofi_ioctl *klip; minor_t minor; - char namebuf[20]; - dev_t newdev; klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) @@ -930,38 +1000,51 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, return (ENXIO); } lsp = ddi_get_soft_state(lofi_statep, minor); - if (lsp == NULL) { + if (lsp == NULL || lsp->ls_vp == NULL) { mutex_exit(&lofi_lock); free_lofi_ioctl(klip); return (ENXIO); } + if (is_opened(lsp)) { + /* + * If the 'force' flag is set, then we forcibly close the + * underlying file. Subsequent operations will fail, and the + * DKIOCSTATE ioctl will return DKIO_DEV_GONE. When the device + * is last closed, the device will be cleaned up appropriately. + * + * This is complicated by the fact that we may have outstanding + * dispatched I/Os. Rather than having a single mutex to + * serialize all I/O, we keep a count of the number of + * outstanding I/O requests, as well as a flag to indicate that + * no new I/Os should be dispatched. We set the flag, wait for + * the number of outstanding I/Os to reach 0, and then close the + * underlying vnode. 
+ */ + if (klip->li_force) { + mutex_enter(&lsp->ls_vp_lock); + lsp->ls_vp_closereq = B_TRUE; + while (lsp->ls_vp_iocount > 0) + cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); + (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, + credp); + VN_RELE(lsp->ls_vp); + lsp->ls_vp = NULL; + cv_broadcast(&lsp->ls_vp_cv); + mutex_exit(&lsp->ls_vp_lock); + mutex_exit(&lofi_lock); + klip->li_minor = minor; + (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); + free_lofi_ioctl(klip); + return (0); + } mutex_exit(&lofi_lock); free_lofi_ioctl(klip); return (EBUSY); } - /* - * Use saved open mode to properly update vnode counts - */ - (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp); - VN_RELE(lsp->ls_vp); - lsp->ls_vp = NULL; - newdev = makedevice(getmajor(dev), minor); - (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); - (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); - (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); - ddi_remove_minor_node(lofi_dip, namebuf); - (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); - ddi_remove_minor_node(lofi_dip, namebuf); + lofi_free_handle(dev, minor, lsp, credp); - kmem_free(lsp->ls_filename, lsp->ls_filename_sz); - taskq_destroy(lsp->ls_taskq); - if (lsp->ls_kstat) { - kstat_delete(lsp->ls_kstat); - mutex_destroy(&lsp->ls_kstat_lock); - } - ddi_soft_state_free(lofi_statep, minor); klip->li_minor = minor; mutex_exit(&lofi_lock); (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); @@ -973,7 +1056,7 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, * get the filename given the minor number, or the minor number given * the name. */ -/*ARGSUSED3*/ +/*ARGSUSED*/ static int lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, struct cred *credp, int ioctl_flag) @@ -983,9 +1066,6 @@ lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, int error; minor_t minor; -#ifdef lint - dev = dev; -#endif klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) return (EFAULT); @@ -1089,6 +1169,13 @@ lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, if (lsp == NULL) return (ENXIO); + /* + * We explicitly allow DKIOCSTATE, but all other ioctls should fail with + * EIO as if the device was no longer present. + */ + if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) + return (EIO); + /* these are for faking out utilities like newfs */ switch (cmd) { case DKIOCGVTOC: @@ -1125,11 +1212,34 @@ lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, return (EFAULT); return (0); case DKIOCSTATE: - /* the file is always there */ - dkstate = DKIO_INSERTED; - error = ddi_copyout(&dkstate, (void *)arg, - sizeof (enum dkio_state), flag); - if (error) + /* + * Normally, lofi devices are always in the INSERTED state. If + * a device is forcefully unmapped, then the device transitions + * to the DKIO_DEV_GONE state. + */ + if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), + flag) != 0) + return (EFAULT); + + mutex_enter(&lsp->ls_vp_lock); + while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || + (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) { + /* + * By virtue of having the device open, we know that + * 'lsp' will remain valid when we return. + */ + if (!cv_wait_sig(&lsp->ls_vp_cv, + &lsp->ls_vp_lock)) { + mutex_exit(&lsp->ls_vp_lock); + return (EINTR); + } + } + + dkstate = (lsp->ls_vp != NULL ? 
DKIO_INSERTED : DKIO_DEV_GONE); + mutex_exit(&lsp->ls_vp_lock); + + if (ddi_copyout(&dkstate, (void *)arg, + sizeof (dkstate), flag) != 0) return (EFAULT); return (0); default: diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index a43f9baf5c..90ea1d608f 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -81,11 +81,11 @@ char _depends_on[] = "misc/scsi misc/cmlb"; * Define the interconnect type, to allow the driver to distinguish * between parallel SCSI (sd) and fibre channel (ssd) behaviors. * - * This is really for backward compatability. In the future, the driver + * This is really for backward compatibility. In the future, the driver * should actually check the "interconnect-type" property as reported by * the HBA; however at present this property is not defined by all HBAs, * so we will use this #define (1) to permit the driver to run in - * backward-compatability mode; and (2) to print a notification message + * backward-compatibility mode; and (2) to print a notification message * if an FC HBA does not support the "interconnect-type" property. The * behavior of the driver will be to assume parallel SCSI behaviors unless * the "interconnect-type" property is defined by the HBA **AND** has a @@ -136,7 +136,7 @@ static char *sd_config_list = "sd-config-list"; #if (defined(__fibre)) /* * These #defines are to avoid namespace collisions that occur because this - * code is currently used to compile two seperate driver modules: sd and ssd. + * code is currently used to compile two separate driver modules: sd and ssd. * All global variables need to be treated this way (even if declared static) * in order to allow the debugger to resolve the names properly. * It is anticipated that in the near future the ssd module will be obsoleted, @@ -539,7 +539,7 @@ static sd_tunables tst_properties = { }; #endif -/* This is similiar to the ANSI toupper implementation */ +/* This is similar to the ANSI toupper implementation */ #define SD_TOUPPER(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A' : (C)) /* @@ -797,7 +797,7 @@ static int sd_pm_idletime = 1; #if (defined(__fibre)) /* * These #defines are to avoid namespace collisions that occur because this - * code is currently used to compile two seperate driver modules: sd and ssd. + * code is currently used to compile two separate driver modules: sd and ssd. * All function names need to be treated this way (even if declared static) * in order to allow the debugger to resolve the names properly. * It is anticipated that in the near future the ssd module will be obsoleted, @@ -1674,7 +1674,7 @@ struct sd_sense_info { }; /* - * Table of function pointers for iostart-side routines. Seperate "chains" + * Table of function pointers for iostart-side routines. Separate "chains" * of layered function calls are formed by placing the function pointers * sequentially in the desired order. Functions are called according to an * incrementing table index ordering. The last function in each chain must @@ -1683,9 +1683,9 @@ struct sd_sense_info { * * Note: It may seem more natural to organize both the iostart and iodone * functions together, into an array of structures (or some similar - * organization) with a common index, rather than two seperate arrays which + * organization) with a common index, rather than two separate arrays which * must be maintained in synchronization. 
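The reworked DKIOCSTATE case above gives lofi the standard dkio(7I) contract: the caller copies in the last state it observed, and the ioctl blocks on ls_vp_cv until the device leaves that state. A sketch of the userland side (an assumed consumer, not part of this patch):

    #include <stdio.h>
    #include <sys/dkio.h>
    #include <stropts.h>    /* ioctl() */

    static void
    watch_state(int fd)
    {
        /* DKIO_NONE makes the first call return the current state at once */
        enum dkio_state state = DKIO_NONE;

        for (;;) {
            if (ioctl(fd, DKIOCSTATE, &state) != 0)
                return;     /* EINTR from cv_wait_sig(), or error */
            if (state == DKIO_DEV_GONE) {
                (void) printf("backing file force-detached\n");
                return;
            }
        }
    }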
The purpose of this division is - * to achiece improved performance: individual arrays allows for more + * to achieve improved performance: individual arrays allows for more * effective cache line utilization on certain platforms. */ @@ -2139,7 +2139,7 @@ _init(void) sd_label = mod_modname(&modlinkage); err = ddi_soft_state_init(&sd_state, sizeof (struct sd_lun), - SD_MAXUNIT); + SD_MAXUNIT); if (err != 0) { return (err); @@ -2481,9 +2481,9 @@ sdprobe(dev_info_t *devi) */ if (sd_dtype_optical_bind < 0) { - sd_dtype_optical_bind = ddi_prop_get_int - (DDI_DEV_T_ANY, devi, 0, - "optical-device-bind", 1); + sd_dtype_optical_bind = ddi_prop_get_int + (DDI_DEV_T_ANY, devi, 0, + "optical-device-bind", 1); } if (sd_dtype_optical_bind == 0) { @@ -3611,11 +3611,11 @@ sd_process_sdconf_file(struct sd_lun *un) * * This function reads the data list from the sd.conf file and pulls * the values that can have numeric values as arguments and places - * the values in the apropriate sd_tunables member. + * the values in the appropriate sd_tunables member. * Since the order of the data list members varies across platforms * This function reads them from the data list in a platform specific * order and places them into the correct sd_tunable member that is - * a consistant across all platforms. + * consistent across all platforms. */ static void sd_get_tunables_from_conf(struct sd_lun *un, int flags, int *data_list, @@ -4024,7 +4024,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_NO_READ_HEADER) { un->un_f_cfg_no_read_header = TRUE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_set_vers1_properties: no_read_header set\n"); + "sd_set_vers1_properties: no_read_header set\n"); } if (flags & SD_CONF_BSET_READ_CD_XD4) { un->un_f_cfg_read_cd_xd4 = TRUE; @@ -4054,7 +4054,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) ASSERT(prop_list != NULL); if (prop_list->sdt_not_rdy_retries) { un->un_notready_retry_count = - prop_list->sdt_not_rdy_retries; + prop_list->sdt_not_rdy_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: not ready retry count" " set to %d\n", un->un_notready_retry_count); @@ -4074,8 +4074,8 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) case CTYPE_CCS: un->un_ctype = prop_list->sdt_ctype; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_set_vers1_properties: ctype set to " - "CTYPE_CCS\n"); + "sd_set_vers1_properties: ctype set to " + "CTYPE_CCS\n"); break; case CTYPE_ROD: /* RW optical */ un->un_ctype = prop_list->sdt_ctype; @@ -4095,7 +4095,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_BSY_RETRY_COUNT) { ASSERT(prop_list != NULL); un->un_busy_retry_count = - prop_list->sdt_busy_retries; + prop_list->sdt_busy_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "busy retry count set to %d\n", @@ -4106,7 +4106,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_RST_RETRIES) { ASSERT(prop_list != NULL); un->un_reset_retry_count = - prop_list->sdt_reset_retries; + prop_list->sdt_reset_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "reset retry count set to %d\n", @@ -4117,7 +4117,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_RSV_REL_TIME) { ASSERT(prop_list != NULL); un->un_reserve_release_time = - prop_list->sdt_reserv_rel_time; + 
prop_list->sdt_reserv_rel_time; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "reservation release timeout set to %d\n", @@ -4177,7 +4177,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) /* * Function: sd_is_lsi() * - * Description: Check for lsi devices, step throught the static device + * Description: Check for lsi devices, step through the static device * table to match vid/pid. * * Args: un - ptr to sd_lun @@ -5691,8 +5691,8 @@ sd_pm_idletimeout_handler(void *arg) un->un_pm_idle_timeid = NULL; } else { un->un_pm_idle_timeid = - timeout(sd_pm_idletimeout_handler, un, - (drv_usectohz((clock_t)300000))); /* 300 ms. */ + timeout(sd_pm_idletimeout_handler, un, + (drv_usectohz((clock_t)300000))); /* 300 ms. */ } mutex_exit(&un->un_pm_mutex); mutex_exit(SD_MUTEX(un)); @@ -6509,7 +6509,7 @@ sd_unit_attach(dev_info_t *devi) if (un->un_f_is_fibre == TRUE) { if (scsi_ifgetcap(SD_ADDRESS(un), "scsi-version", 1) == - SCSI_VERSION_3) { + SCSI_VERSION_3) { switch (un->un_interconnect_type) { case SD_INTERCONNECT_FIBRE: case SD_INTERCONNECT_SSA: @@ -6530,7 +6530,7 @@ sd_unit_attach(dev_info_t *devi) /* * Set un_retry_count with SD_RETRY_COUNT, this is ok for Sparc - * with seperate binary for sd and ssd. + * with separate binary for sd and ssd. * * x86 has 1 binary, un_retry_count is set base on connection type. * The hardcoded values will go away when Sparc uses 1 binary @@ -6552,7 +6552,7 @@ sd_unit_attach(dev_info_t *devi) */ un->un_notready_retry_count = ISCD(un) ? CD_NOT_READY_RETRY_COUNT(un) - : DISK_NOT_READY_RETRY_COUNT(un); + : DISK_NOT_READY_RETRY_COUNT(un); /* * Set the busy retry count to the default value of un_retry_count. @@ -6603,16 +6603,16 @@ sd_unit_attach(dev_info_t *devi) un->un_f_allow_bus_device_reset = TRUE; } else { if (ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, - "allow-bus-device-reset", 1) != 0) { + "allow-bus-device-reset", 1) != 0) { un->un_f_allow_bus_device_reset = TRUE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p Bus device reset enabled\n", - un); + "sd_unit_attach: un:0x%p Bus device reset " + "enabled\n", un); } else { un->un_f_allow_bus_device_reset = FALSE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p Bus device reset disabled\n", - un); + "sd_unit_attach: un:0x%p Bus device reset " + "disabled\n", un); } } @@ -7197,11 +7197,12 @@ sd_unit_attach(dev_info_t *devi) * register or not. */ if (un->un_f_is_fibre) { - if (strcmp(un->un_node_type, DDI_NT_BLOCK_CHAN)) { - sd_init_event_callbacks(un); - SD_TRACE(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p event callbacks inserted", un); - } + if (strcmp(un->un_node_type, DDI_NT_BLOCK_CHAN)) { + sd_init_event_callbacks(un); + SD_TRACE(SD_LOG_ATTACH_DETACH, un, + "sd_unit_attach: un:0x%p event callbacks inserted", + un); + } } #endif @@ -7324,7 +7325,7 @@ cmlb_attach_failed: } if (un->un_f_is_fibre == FALSE) { - (void) scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 0, 1); + (void) scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 0, 1); } spinup_failed: @@ -7746,8 +7747,8 @@ sd_unit_detach(dev_info_t *devi) */ if (un->un_f_is_fibre == TRUE) { if ((un->un_insert_event != NULL) && - (ddi_remove_event_handler(un->un_insert_cb_id) != - DDI_SUCCESS)) { + (ddi_remove_event_handler(un->un_insert_cb_id) != + DDI_SUCCESS)) { /* * Note: We are returning here after having done * substantial cleanup above. This is consistent @@ -7755,14 +7756,14 @@ sd_unit_detach(dev_info_t *devi) * be the right thing to do. 
*/ SD_ERROR(SD_LOG_ATTACH_DETACH, un, - "sd_dr_detach: Cannot cancel insert event\n"); + "sd_dr_detach: Cannot cancel insert event\n"); goto err_remove_event; } un->un_insert_event = NULL; if ((un->un_remove_event != NULL) && - (ddi_remove_event_handler(un->un_remove_cb_id) != - DDI_SUCCESS)) { + (ddi_remove_event_handler(un->un_remove_cb_id) != + DDI_SUCCESS)) { /* * Note: We are returning here after having done * substantial cleanup above. This is consistent @@ -7770,7 +7771,7 @@ sd_unit_detach(dev_info_t *devi) * be the right thing to do. */ SD_ERROR(SD_LOG_ATTACH_DETACH, un, - "sd_dr_detach: Cannot cancel remove event\n"); + "sd_dr_detach: Cannot cancel remove event\n"); goto err_remove_event; } un->un_remove_event = NULL; @@ -8270,7 +8271,7 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * will fail. mode_cache_scsi3 is a superset of mode_caching. */ buflen = hdrlen + MODE_BLK_DESC_LENGTH + - sizeof (struct mode_cache_scsi3); + sizeof (struct mode_cache_scsi3); header = kmem_zalloc(buflen, KM_SLEEP); @@ -8332,8 +8333,8 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * length of the sense data returned. */ sbuflen = hdrlen + MODE_BLK_DESC_LENGTH + - sizeof (struct mode_page) + - (int)mode_caching_page->mode_page.length; + sizeof (struct mode_page) + + (int)mode_caching_page->mode_page.length; /* * Set the caching bits as requested. @@ -8353,7 +8354,7 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * drive supports it. */ save_pg = mode_caching_page->mode_page.ps ? - SD_SAVE_PAGE : SD_DONTSAVE_PAGE; + SD_SAVE_PAGE : SD_DONTSAVE_PAGE; /* Clear reserved bits before mode select. */ mode_caching_page->mode_page.ps = 0; @@ -8964,7 +8965,7 @@ sdopen(dev_t *dev_p, int flag, int otyp, cred_t *cred_p) cp = &un->un_ocmap.chkd[0]; while (cp < &un->un_ocmap.chkd[OCSIZE]) { if (*cp != (uchar_t)0) { - break; + break; } cp++; } @@ -9116,7 +9117,7 @@ sdclose(dev_t dev, int flag, int otyp, cred_t *cred_p) if (un->un_state == SD_STATE_OFFLINE) { if (un->un_f_is_fibre == FALSE) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "offline\n"); + CE_WARN, "offline\n"); } mutex_exit(SD_MUTEX(un)); cmlb_invalidate(un->un_cmlbhandle, @@ -9838,7 +9839,7 @@ sdawrite(dev_t dev, struct aio_req *aio, cred_t *cred_p) * +----> SCSA ---->+ * * - * This code is based upon the following presumtions: + * This code is based upon the following presumptions: * * - iostart and iodone functions operate on buf(9S) structures. These * functions perform the necessary operations on the buf(9S) and pass @@ -9903,7 +9904,7 @@ static int sd_taskq_maxalloc = SD_TASKQ_MAXALLOC; /* * The following task queue is being created for the write part of * read-modify-write of non-512 block size devices. - * Limit the number of threads to 1 for now. This number has been choosen + * Limit the number of threads to 1 for now. This number has been chosen * considering the fact that it applies only to dvd ram drives/MO drives * currently. Performance for which is not main criteria at this stage. * Note: It needs to be explored if we can use a single taskq in future @@ -10487,7 +10488,7 @@ sd_uscsi_iodone(int index, struct sd_lun *un, struct buf *bp) /* * Function: sd_mapblockaddr_iostart * - * Description: Verify request lies withing the partition limits for + * Description: Verify request lies within the partition limits for * the indicated minor device. Issue "overrun" buf if * request would exceed partition range. Converts * partition-relative block address to absolute. 
@@ -10610,7 +10611,7 @@ sd_mapblockaddr_iostart(int index, struct sd_lun *un, struct buf *bp) ASSERT(bp->b_bcount >= resid); bp = sd_bioclone_alloc(bp, count, blocknum, - (int (*)(struct buf *)) sd_mapblockaddr_iodone); + (int (*)(struct buf *)) sd_mapblockaddr_iodone); xp = SD_GET_XBUF(bp); /* Update for 'new' bp! */ ASSERT(xp != NULL); } @@ -11756,7 +11757,7 @@ sd_setup_rw_pkt(struct sd_lun *un, */ blockcount -= SD_BYTES2TGTBLOCKS(un, - return_pktp->pkt_resid); + return_pktp->pkt_resid); } cdbp = (union scsi_cdb *)return_pktp->pkt_cdbp; @@ -11767,7 +11768,7 @@ sd_setup_rw_pkt(struct sd_lun *un, */ cdbp->scc_cmd = cp->sc_grpmask | ((bp->b_flags & B_READ) ? - SCMD_READ : SCMD_WRITE); + SCMD_READ : SCMD_WRITE); SD_FILL_SCSI1_LUN(un, return_pktp); @@ -12738,7 +12739,7 @@ sd_start_cmds(struct sd_lun *un, struct buf *immed_bp) if ((un->un_state != SD_STATE_SUSPENDED) && (un->un_state != SD_STATE_PM_CHANGING)) { New_state(un, SD_STATE_NORMAL); - } + } xp = SD_GET_XBUF(bp); ASSERT(xp != NULL); @@ -13012,8 +13013,8 @@ got_pkt: SD_UPDATE_KSTATS(un, kstat_runq_exit, bp); bp = sd_mark_rqs_idle(un, xp); sd_retry_command(un, bp, SD_RETRIES_STANDARD, - NULL, NULL, EIO, SD_BSY_TIMEOUT / 500, - kstat_waitq_enter); + NULL, NULL, EIO, SD_BSY_TIMEOUT / 500, + kstat_waitq_enter); goto exit; } @@ -13081,7 +13082,7 @@ got_pkt: * for this condition? */ sd_set_retry_bp(un, bp, SD_BSY_TIMEOUT / 500, - kstat_runq_back_to_waitq); + kstat_runq_back_to_waitq); goto exit; case TRAN_FATAL_ERROR: @@ -13180,8 +13181,8 @@ sd_return_command(struct sd_lun *un, struct buf *bp) * Note:x86: check for the "sdrestart failed" case. */ if (((xp->xb_pkt_flags & SD_XB_USCSICMD) != SD_XB_USCSICMD) && - (geterror(bp) == 0) && (xp->xb_dma_resid != 0) && - (xp->xb_pktp->pkt_resid == 0)) { + (geterror(bp) == 0) && (xp->xb_dma_resid != 0) && + (xp->xb_pktp->pkt_resid == 0)) { if (sd_setup_next_xfer(un, bp, pktp, xp) != 0) { /* @@ -13407,7 +13408,7 @@ sd_return_failed_command_no_restart(struct sd_lun *un, struct buf *bp, * is queued for a delayed retry. May be NULL if no kstat * update is desired. * - * Context: May be called from interupt context. + * Context: May be called from interrupt context. */ static void @@ -13639,7 +13640,7 @@ sd_retry_command(struct sd_lun *un, struct buf *bp, int retry_check_flag, xp->xb_ua_retry_count++; SD_TRACE(SD_LOG_IO_CORE | SD_LOG_ERROR, un, "sd_retry_command: retry count:%d\n", - xp->xb_ua_retry_count); + xp->xb_ua_retry_count); break; case SD_RETRIES_BUSY: @@ -14220,22 +14221,22 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) switch (scsi_ifgetcap(SD_ADDRESS(un), "auto-rqsense", 1)) { case 0: SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: HBA supports ARQ\n"); + "sd_alloc_rqs: HBA supports ARQ\n"); /* * ARQ is supported by this HBA but currently is not * enabled. Attempt to enable it and if successful then * mark this instance as ARQ enabled. */ if (scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 1, 1) - == 1) { + == 1) { /* Successfully enabled ARQ in the HBA */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: ARQ enabled\n"); + "sd_alloc_rqs: ARQ enabled\n"); un->un_f_arq_enabled = TRUE; } else { /* Could not enable ARQ in the HBA */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: failed ARQ enable\n"); + "sd_alloc_rqs: failed ARQ enable\n"); un->un_f_arq_enabled = FALSE; } break; @@ -14245,7 +14246,7 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) * Just mark ARQ as enabled for this instance. 
*/ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: ARQ already enabled\n"); + "sd_alloc_rqs: ARQ already enabled\n"); un->un_f_arq_enabled = TRUE; break; default: @@ -14254,7 +14255,7 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) * instance. */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: HBA does not support ARQ\n"); + "sd_alloc_rqs: HBA does not support ARQ\n"); un->un_f_arq_enabled = FALSE; break; } @@ -14304,7 +14305,7 @@ sd_free_rqs(struct sd_lun *un) /* * Function: sd_reduce_throttle * - * Description: Reduces the maximun # of outstanding commands on a + * Description: Reduces the maximum # of outstanding commands on a * target to the current number of outstanding commands. * Queues a tiemout(9F) callback to restore the limit * after a specified interval has elapsed. @@ -14344,7 +14345,7 @@ sd_reduce_throttle(struct sd_lun *un, int throttle_type) } if (un->un_ncmds_in_transport > 0) { - un->un_throttle = un->un_ncmds_in_transport; + un->un_throttle = un->un_ncmds_in_transport; } } else { @@ -14423,9 +14424,10 @@ sd_restore_throttle(void *arg) (throttle < un->un_saved_throttle) ? throttle : un->un_saved_throttle; if (un->un_throttle < un->un_saved_throttle) { - un->un_reset_throttle_timeid = - timeout(sd_restore_throttle, - un, SD_QFULL_THROTTLE_RESET_INTERVAL); + un->un_reset_throttle_timeid = + timeout(sd_restore_throttle, + un, + SD_QFULL_THROTTLE_RESET_INTERVAL); } } } @@ -14565,11 +14567,16 @@ sdintr(struct scsi_pkt *pktp) #endif /* - * If pkt_reason is CMD_DEV_GONE, just fail the command + * If pkt_reason is CMD_DEV_GONE, fail the command, and update the media + * state if needed. */ if (pktp->pkt_reason == CMD_DEV_GONE) { scsi_log(SD_DEVINFO(un), sd_label, CE_CONT, - "Device is gone\n"); + "Device is gone\n"); + if (un->un_mediastate != DKIO_DEV_GONE) { + un->un_mediastate = DKIO_DEV_GONE; + cv_broadcast(&un->un_state_cv); + } sd_return_failed_command(un, bp, EIO); goto exit; } @@ -14682,7 +14689,7 @@ sdintr(struct scsi_pkt *pktp) } else if (xp->xb_pkt_flags & SD_XB_USCSICMD) { SD_UPDATE_B_RESID(bp, pktp); SD_TRACE(SD_LOG_IO_CORE | SD_LOG_ERROR, un, - "sdintr: returning uscsi command\n"); + "sdintr: returning uscsi command\n"); } else { goto not_successful; } @@ -15320,7 +15327,7 @@ sense_failed: */ sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_failed_msg, msgp, EIO, - un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, NULL); + un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, NULL); #else sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_failed_msg, msgp, EIO, SD_RETRY_DELAY, NULL); @@ -15566,7 +15573,7 @@ sd_print_sense_msg(struct sd_lun *un, struct buf *bp, void *arg, int code) sensep = xp->xb_sense_data; if (scsi_sense_info_uint64(sensep, SENSE_LENGTH, - (uint64_t *)&err_blkno)) { + (uint64_t *)&err_blkno)) { /* * We retrieved the error block number from the information * portion of the sense data. 
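Most of the sd.c churn in these hunks is mechanical re-indentation, but the sdintr() change is substantive: CMD_DEV_GONE now drives un_mediastate to DKIO_DEV_GONE and wakes DKIOCSTATE waiters, mirroring the lofi behavior above. The surrounding sd_alloc_rqs() logic is the usual auto-request-sense capability dance; stripped of its logging it reduces to this (an illustrative condensation, assuming <sys/scsi/scsi.h>):

    static boolean_t
    arq_enable(struct scsi_address *ap)
    {
        switch (scsi_ifgetcap(ap, "auto-rqsense", 1)) {
        case 0:     /* supported but disabled: try to turn it on */
            return (scsi_ifsetcap(ap, "auto-rqsense", 1, 1) == 1);
        case 1:     /* already enabled by the HBA */
            return (B_TRUE);
        default:    /* capability not supported */
            return (B_FALSE);
        }
    }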
@@ -15657,7 +15664,7 @@ sd_sense_key_no_sense(struct sd_lun *un, struct buf *bp, SD_UPDATE_ERRSTATS(un, sd_softerrs); sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_msg, - &si, EIO, (clock_t)0, NULL); + &si, EIO, (clock_t)0, NULL); } @@ -15803,21 +15810,21 @@ sd_sense_key_not_ready(struct sd_lun *un, */ if (un->un_f_is_fibre == TRUE) { if (((sd_level_mask & SD_LOGMASK_DIAG) || - (xp->xb_retry_count > 0)) && - (un->un_startstop_timeid == NULL)) { + (xp->xb_retry_count > 0)) && + (un->un_startstop_timeid == NULL)) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "logical unit not ready, " - "resetting disk\n"); + CE_WARN, "logical unit not ready, " + "resetting disk\n"); sd_reset_target(un, pktp); } } else { if (((sd_level_mask & SD_LOGMASK_DIAG) || - (xp->xb_retry_count > - un->un_reset_retry_count)) && - (un->un_startstop_timeid == NULL)) { + (xp->xb_retry_count > + un->un_reset_retry_count)) && + (un->un_startstop_timeid == NULL)) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "logical unit not ready, " - "resetting disk\n"); + CE_WARN, "logical unit not ready, " + "resetting disk\n"); sd_reset_target(un, pktp); } } @@ -16856,8 +16863,8 @@ sd_pkt_status_check_condition(struct sd_lun *un, struct buf *bp, * when SD_RETRY_DELAY change in sddef.h */ sd_retry_command(un, bp, SD_RETRIES_STANDARD, NULL, NULL, EIO, - un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, - NULL); + un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, + NULL); #else sd_retry_command(un, bp, SD_RETRIES_STANDARD, NULL, NULL, EIO, SD_RETRY_DELAY, NULL); @@ -17821,13 +17828,13 @@ sd_send_scsi_START_STOP_UNIT(struct sd_lun *un, int flag, int path_flag) case STATUS_CHECK: if (ucmd_buf.uscsi_rqstatus == STATUS_GOOD) { switch (scsi_sense_key( - (uint8_t *)&sense_buf)) { + (uint8_t *)&sense_buf)) { case KEY_ILLEGAL_REQUEST: status = ENOTSUP; break; case KEY_NOT_READY: if (scsi_sense_asc( - (uint8_t *)&sense_buf) + (uint8_t *)&sense_buf) == 0x3A) { status = ENXIO; } @@ -18111,7 +18118,7 @@ sd_send_scsi_TEST_UNIT_READY(struct sd_lun *un, int flag) } if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_NOT_READY) && + KEY_NOT_READY) && (scsi_sense_asc((uint8_t *)&sense_buf) == 0x3A)) { status = ENXIO; } @@ -18200,7 +18207,7 @@ sd_send_scsi_PERSISTENT_RESERVE_IN(struct sd_lun *un, uchar_t usr_cmd, case STATUS_CHECK: if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { status = ENOTSUP; } break; @@ -18345,7 +18352,7 @@ sd_send_scsi_PERSISTENT_RESERVE_OUT(struct sd_lun *un, uchar_t usr_cmd, case STATUS_CHECK: if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { status = ENOTSUP; } break; @@ -18493,7 +18500,7 @@ sd_send_scsi_SYNCHRONIZE_CACHE_biodone(struct buf *bp) case STATUS_CHECK: if ((uscmd->uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key(sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { /* Ignore Illegal Request error */ mutex_enter(SD_MUTEX(un)); un->un_f_sync_cache_supported = FALSE; @@ -18627,7 +18634,7 @@ sd_send_scsi_GET_CONFIGURATION(struct sd_lun *un, struct uscsi_cmd *ucmdbuf, * Function: sd_send_scsi_feature_GET_CONFIGURATION * * Description: Issues the get configuration command to the device to - * retrieve a specfic feature. Called from + * retrieve a specific feature. Called from * sd_check_for_writable_cd & sd_set_mmc_caps. 
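The sense-handling hunks above repeat one pattern: a CHECK CONDITION with sense key NOT READY and ASC 0x3A (medium not present) is mapped to ENXIO, while ILLEGAL REQUEST is mapped to ENOTSUP. As a standalone sketch (illustrative only; the driver's real code also inspects uscsi_rqstatus first):

    static int
    classify_sense(uint8_t *sensep)
    {
        switch (scsi_sense_key(sensep)) {
        case KEY_ILLEGAL_REQUEST:
            return (ENOTSUP);       /* command not supported */
        case KEY_NOT_READY:
            if (scsi_sense_asc(sensep) == 0x3A)
                return (ENXIO);     /* medium not present */
            /*FALLTHROUGH*/
        default:
            return (EIO);
        }
    }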
* Arguments: un * ucmdbuf @@ -19934,7 +19941,7 @@ skip_ready_valid: * the drive speed. Thus EINVAL would be returned * if a set request was made for an mmc device. * We no longer support get or set speed for - * mmc but need to remain consistant with regard + * mmc but need to remain consistent with regard * to the error code returned. */ err = EINVAL; @@ -20030,7 +20037,7 @@ skip_ready_valid: if (!un->un_f_sync_cache_supported || !un->un_f_write_cache_enabled) { err = un->un_f_sync_cache_supported ? - 0 : ENOTSUP; + 0 : ENOTSUP; mutex_exit(SD_MUTEX(un)); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback != NULL) { @@ -20135,7 +20142,7 @@ skip_ready_valid: mutex_exit(SD_MUTEX(un)); err = sd_cache_control(un, SD_CACHE_NOCHANGE, - SD_CACHE_ENABLE); + SD_CACHE_ENABLE); mutex_enter(SD_MUTEX(un)); @@ -20194,7 +20201,7 @@ sd_dkio_ctrl_info(dev_t dev, caddr_t arg, int flag) } info = (struct dk_cinfo *) - kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); + kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); switch (un->un_ctype) { case CTYPE_CDROM: @@ -20302,8 +20309,8 @@ sd_get_media_info(dev_t dev, caddr_t arg, int flag) /* Allow SCMD_GET_CONFIGURATION to MMC devices only */ if (un->un_f_mmc_cap == TRUE) { rtn = sd_send_scsi_GET_CONFIGURATION(un, &com, rqbuf, - SENSE_LENGTH, out_data, SD_PROFILE_HEADER_LEN, - SD_PATH_STANDARD); + SENSE_LENGTH, out_data, SD_PROFILE_HEADER_LEN, + SD_PATH_STANDARD); if (rtn) { /* @@ -21395,17 +21402,17 @@ sd_mhdioc_inresv(dev_t dev, caddr_t arg, int flag) * SCSI-2 * The cluster software takes ownership of a multi-hosted disk by issuing the * MHIOCTKOWN ioctl to the disk driver. It releases ownership by issuing the - * MHIOCRELEASE ioctl.Closely related is the MHIOCENFAILFAST ioctl -- a cluster, - * just after taking ownership of the disk with the MHIOCTKOWN ioctl then issues - * the MHIOCENFAILFAST ioctl. This ioctl "enables failfast" in the driver. The - * meaning of failfast is that if the driver (on this host) ever encounters the - * scsi error return code RESERVATION_CONFLICT from the device, it should - * immediately panic the host. The motivation for this ioctl is that if this - * host does encounter reservation conflict, the underlying cause is that some - * other host of the cluster has decided that this host is no longer in the - * cluster and has seized control of the disks for itself. Since this host is no - * longer in the cluster, it ought to panic itself. The MHIOCENFAILFAST ioctl - * does two things: + * MHIOCRELEASE ioctl. Closely related is the MHIOCENFAILFAST ioctl -- a + * cluster, just after taking ownership of the disk with the MHIOCTKOWN ioctl + * then issues the MHIOCENFAILFAST ioctl. This ioctl "enables failfast" in the + * driver. The meaning of failfast is that if the driver (on this host) ever + * encounters the scsi error return code RESERVATION_CONFLICT from the device, + * it should immediately panic the host. The motivation for this ioctl is that + * if this host does encounter reservation conflict, the underlying cause is + * that some other host of the cluster has decided that this host is no longer + * in the cluster and has seized control of the disks for itself. Since this + * host is no longer in the cluster, it ought to panic itself. 
The + * MHIOCENFAILFAST ioctl does two things: * (a) it sets a flag that will cause any returned RESERVATION_CONFLICT * error to panic the host * (b) it sets up a periodic timer to test whether this host still has @@ -22498,7 +22505,7 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) if (sd_send_polled_RQS(un) == SD_FAILURE) { SD_INFO(SD_LOG_DUMP, un, - "sddump: sd_send_polled_RQS failed\n"); + "sddump: sd_send_polled_RQS failed\n"); } mutex_enter(SD_MUTEX(un)); } @@ -22530,8 +22537,8 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) tgt_blkno = tgt_byte_offset / un->un_tgt_blocksize; tgt_nblk = ((tgt_byte_offset + tgt_byte_count + - (un->un_tgt_blocksize - 1)) / - un->un_tgt_blocksize) - tgt_blkno; + (un->un_tgt_blocksize - 1)) / + un->un_tgt_blocksize) - tgt_blkno; /* * Invoke the routine which is going to do read part @@ -22604,8 +22611,8 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) #if defined(__i386) || defined(__amd64) blkno = oblkno + - ((wr_bp->b_bcount - dma_resid) / - un->un_tgt_blocksize); + ((wr_bp->b_bcount - dma_resid) / + un->un_tgt_blocksize); nblk = dma_resid / un->un_tgt_blocksize; if (wr_pktp) { @@ -23025,7 +23032,7 @@ sd_ddi_scsi_poll(struct scsi_pkt *pkt) } else if ((sensep != NULL) && (scsi_sense_key(sensep) == - KEY_UNIT_ATTENTION)) { + KEY_UNIT_ATTENTION)) { /* Unit Attention - try again */ busy_count += (SD_SEC_TO_CSEC - 1); /* 1 */ continue; @@ -24453,7 +24460,7 @@ sr_read_tocentry(dev_t dev, caddr_t data, int flag) * READ HEADER command failed, since this is * obsoleted in one spec, its better to return * -1 for an invlid track so that we can still - * recieve the rest of the TOC data. + * receive the rest of the TOC data. */ entry->cdte_datamode = (uchar_t)-1; } @@ -26486,7 +26493,7 @@ sd_setup_next_xfer(struct sd_lun *un, struct buf *bp, /* * Function: sd_panic_for_res_conflict * - * Description: Call panic with a string formated with "Reservation Conflict" + * Description: Call panic with a string formatted with "Reservation Conflict" * and a human readable identifier indicating the SD instance * that experienced the reservation conflict. * @@ -26526,7 +26533,7 @@ static uint_t sd_fault_injection_on = 0; * faultinjection ioctls to inject errors into the * layer model * - * Arguments: cmd - the ioctl cmd recieved + * Arguments: cmd - the ioctl cmd received * arg - the arguments from user and returns */ @@ -26878,7 +26885,7 @@ sd_faultinjection(struct scsi_pkt *pktp) /* if injection is off return */ if (sd_fault_injection_on == 0 || - un->sd_fi_fifo_start == un->sd_fi_fifo_end) { + un->sd_fi_fifo_start == un->sd_fi_fifo_end) { mutex_exit(SD_MUTEX(un)); return; } @@ -27164,7 +27171,7 @@ sd_faultinjection(struct scsi_pkt *pktp) * Firewire hard disks now have partition kstats * * ------------------------------------------------------ - * removable media hotplugable | kstat + * removable media hotpluggable | kstat * ------------------------------------------------------ * false false | Yes * false true | Yes @@ -27366,7 +27373,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) */ un->un_f_pkstats_enabled = (ddi_prop_get_int(DDI_DEV_T_ANY, SD_DEVINFO(un), DDI_PROP_DONTPASS, - "enable-partition-kstats", 1)); + "enable-partition-kstats", 1)); /* * Check if HBA has set the "pm-capable" property. 
diff --git a/usr/src/uts/common/sys/dditypes.h b/usr/src/uts/common/sys/dditypes.h index f38a1c29d1..52b6198972 100644 --- a/usr/src/uts/common/sys/dditypes.h +++ b/usr/src/uts/common/sys/dditypes.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,9 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/isa_defs.h> +#ifndef _ASM +#include <sys/types.h> +#endif #ifdef __cplusplus extern "C" { diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h index aa5c7ee0d7..8af2701aff 100644 --- a/usr/src/uts/common/sys/fm/fs/zfs.h +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_RESOURCE_OK "ok" +#define FM_RESOURCE_REMOVED "removed" +#define FM_RESOURCE_AUTOREPLACE "autoreplace" #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 354e837212..deecc0d36a 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -54,7 +54,7 @@ typedef enum { /* * Properties are identified by these constants and must be added to the - * end of this list to ensure that external conumsers are not affected + * end of this list to ensure that external consumers are not affected * by the change. The property list also determines how 'zfs get' will * display them. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. 
@@ -96,11 +96,16 @@ typedef enum { ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, - ZFS_PROP_BOOTFS + ZPOOL_PROP_BOOTFS, + ZPOOL_PROP_AUTOREPLACE, + ZPOOL_PROP_NAME } zfs_prop_t; typedef zfs_prop_t zpool_prop_t; +#define ZPOOL_PROP_CONT ZFS_PROP_CONT +#define ZPOOL_PROP_INVAL ZFS_PROP_INVAL + #define ZFS_PROP_VALUE "value" #define ZFS_PROP_SOURCE "source" @@ -123,17 +128,18 @@ boolean_t zfs_prop_user(const char *); int zfs_prop_readonly(zfs_prop_t); const char *zfs_prop_default_string(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); -const char *zpool_prop_to_name(zfs_prop_t); +const char *zpool_prop_to_name(zpool_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); int zfs_prop_inheritable(zfs_prop_t); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); +uint64_t zpool_prop_default_numeric(zpool_prop_t); /* * Property Iterator */ typedef zfs_prop_t (*zfs_prop_f)(zfs_prop_t, void *); -typedef zfs_prop_f zpool_prop_f; +typedef zpool_prop_t (*zpool_prop_f)(zpool_prop_t, void *); extern zfs_prop_t zfs_prop_iter(zfs_prop_f, void *, boolean_t); extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); @@ -201,7 +207,6 @@ extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_STATS "stats" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" -#define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" @@ -210,6 +215,17 @@ extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSPARE "unspare" +#define ZPOOL_CONFIG_PHYS_PATH "phys_path" +/* + * The persistent vdev state is stored as separate values rather than a single + * 'vdev_state' entry. This is because a device can be in multiple states, such + * as offline and degraded. + */ +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_FAULTED "faulted" +#define ZPOOL_CONFIG_DEGRADED "degraded" +#define ZPOOL_CONFIG_REMOVED "removed" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -243,11 +259,15 @@ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; +#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY + /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
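Because the persistent state is split into independent boolean entries (a faulted device may also be recorded as offline), the label-writing side presumably just emits one uint64 per asserted state. A sketch, assuming per-vdev flags along the lines of vdev_offline, vdev_faulted, vdev_degraded and vdev_removed (the actual encoding lives in vdev_label.c, which is not shown in this diff):

    static void
    store_vdev_state(nvlist_t *nv, vdev_t *vd)
    {
        if (vd->vdev_offline)
            VERIFY(nvlist_add_uint64(nv,
                ZPOOL_CONFIG_OFFLINE, 1ULL) == 0);
        if (vd->vdev_faulted)
            VERIFY(nvlist_add_uint64(nv,
                ZPOOL_CONFIG_FAULTED, 1ULL) == 0);
        if (vd->vdev_degraded)
            VERIFY(nvlist_add_uint64(nv,
                ZPOOL_CONFIG_DEGRADED, 1ULL) == 0);
        if (vd->vdev_removed)
            VERIFY(nvlist_add_uint64(nv,
                ZPOOL_CONFIG_REMOVED, 1ULL) == 0);
    }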
@@ -262,7 +282,8 @@ typedef enum vdev_aux { VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ - VDEV_AUX_SPARED /* hot spare used in another pool */ + VDEV_AUX_SPARED, /* hot spare used in another pool */ + VDEV_AUX_ERR_EXCEEDED /* too many errors */ } vdev_aux_t; /* @@ -369,8 +390,7 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_LOG_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, - ZFS_IOC_VDEV_ONLINE, - ZFS_IOC_VDEV_OFFLINE, + ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, @@ -427,6 +447,39 @@ typedef enum { #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" +/* + * Flags for ZFS_IOC_VDEV_SET_STATE + */ +#define ZFS_ONLINE_CHECKREMOVE 0x1 +#define ZFS_ONLINE_UNSPARE 0x2 +#define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_OFFLINE_TEMPORARY 0x1 + +/* + * Sysevent payload members. ZFS will generate the following sysevents with the + * given payloads: + * + * ESC_ZFS_RESILVER_START + * ESC_ZFS_RESILVER_END + * ESC_ZFS_POOL_DESTROY + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_VDEV_REMOVE + * ESC_ZFS_VDEV_CLEAR + * ESC_ZFS_VDEV_CHECK + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) + * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 + */ +#define ZFS_EV_POOL_NAME "pool_name" +#define ZFS_EV_POOL_GUID "pool_guid" +#define ZFS_EV_VDEV_PATH "vdev_path" +#define ZFS_EV_VDEV_GUID "vdev_guid" + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/lofi.h b/usr/src/uts/common/sys/lofi.h index a5f0eb1d97..362af884e3 100644 --- a/usr/src/uts/common/sys/lofi.h +++ b/usr/src/uts/common/sys/lofi.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -92,13 +91,20 @@ extern "C" { * ioctl(ld, LOFI_GET_MAXMINOR, &li); * maxminor = li.li_minor; * + * If the 'li_force' flag is set for any of the LOFI_UNMAP_* commands, then if + * the device is busy, the underlying vnode will be closed, and any subsequent + * operations will fail. It will behave as if the device had been forcibly + * removed, so the DKIOCSTATE ioctl will return DKIO_DEV_GONE. When the device + * is last closed, it will be torn down. + * * Oh, and last but not least: these ioctls are totally private and only * for use by lofiadm(1M). 
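Assembling the payload documented above is plain nvlist work. A kernel-side sketch (attribute names come from this diff; the use of ddi_log_sysevent() and the other details are assumptions, since the real posting path is spa_event_notify() in spa.c):

    static void
    post_zfs_event(dev_info_t *dip, spa_t *spa, vdev_t *vd, char *subclass)
    {
        nvlist_t *attr;

        VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_string(attr, ZFS_EV_POOL_NAME,
            spa_name(spa)) == 0);
        VERIFY(nvlist_add_uint64(attr, ZFS_EV_POOL_GUID,
            spa_guid(spa)) == 0);
        if (vd != NULL) {
            VERIFY(nvlist_add_uint64(attr, ZFS_EV_VDEV_GUID,
                vd->vdev_guid) == 0);
            if (vd->vdev_path != NULL)  /* optional, per the comment */
                VERIFY(nvlist_add_string(attr, ZFS_EV_VDEV_PATH,
                    vd->vdev_path) == 0);
        }

        (void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_ZFS, subclass,
            attr, NULL, DDI_SLEEP);
        nvlist_free(attr);
    }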
 * */ struct lofi_ioctl { - uint32_t li_minor; + uint32_t li_minor; + boolean_t li_force; char li_filename[MAXPATHLEN + 1]; }; @@ -134,9 +140,13 @@ extern uint32_t lofi_max_files; ((vtype == VREG) || (vtype == VBLK) || (vtype == VCHR)) struct lofi_state { - char *ls_filename; /* filename to open */ - size_t ls_filename_sz; - struct vnode *ls_vp; /* open vnode */ + char *ls_filename; /* filename to open */ + size_t ls_filename_sz; + struct vnode *ls_vp; /* open vnode */ + kmutex_t ls_vp_lock; /* protects ls_vp */ + kcondvar_t ls_vp_cv; /* signal changes to ls_vp */ + uint32_t ls_vp_iocount; /* # pending I/O requests */ + boolean_t ls_vp_closereq; /* force close requested */ u_offset_t ls_vp_size; uint32_t ls_blk_open; uint32_t ls_chr_open; diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index 7e8eff763f..69f01b9af4 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -51,6 +51,7 @@ extern "C" { #define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ #define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ #define EC_FM "EC_fm" /* FMA error report event */ +#define EC_ZFS "EC_zfs" /* ZFS event */ /* * The following event class is reserved for exclusive use @@ -215,6 +216,17 @@ extern "C" { #define ESC_ACPIEV_LOW "ESC_acpiev_low" #define ESC_ACPIEV_STATE_CHANGE "ESC_acpiev_state_change" +/* + * ZFS subclass definitions. Supporting attributes (name/value pairs) are found + * in sys/fs/zfs.h + */ +#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start" +#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish" +#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove" +#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy" +#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" +#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" + #ifdef __cplusplus } #endif
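Finally, the new EC_ZFS class is consumable from userland with the stock libsysevent(3LIB) interfaces; a minimal watcher might look like this (an assumed consumer, not part of this patch):

    #include <stdio.h>
    #include <unistd.h>
    #include <libsysevent.h>
    #include <sys/sysevent/eventdefs.h>

    static void
    zfs_event_handler(sysevent_t *ev)
    {
        (void) printf("EC_zfs event: %s\n",
            sysevent_get_subclass_name(ev));
    }

    int
    main(void)
    {
        sysevent_handle_t *shp;
        const char *subclasses[] =
            { ESC_ZFS_VDEV_REMOVE, ESC_ZFS_VDEV_CHECK };

        if ((shp = sysevent_bind_handle(zfs_event_handler)) == NULL)
            return (1);
        if (sysevent_subscribe_event(shp, EC_ZFS, subclasses, 2) != 0) {
            sysevent_unbind_handle(shp);
            return (1);
        }
        for (;;)
            (void) pause(); /* events arrive on libsysevent's thread */
    }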