summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs/zfs/vdev.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/fs/zfs/vdev.c')
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c401
1 files changed, 294 insertions, 107 deletions
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index fbb77774c2..9b2ec04710 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
txg_list_create(&vd->vdev_dtl_list,
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
+ vdev_queue_init(vd);
+ vdev_cache_init(vd);
return (vd);
}
/*
- * Free a vdev_t that has been removed from service.
- */
-static void
-vdev_free_common(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- if (vd->vdev_path)
- spa_strfree(vd->vdev_path);
- if (vd->vdev_devid)
- spa_strfree(vd->vdev_devid);
-
- if (vd->vdev_isspare)
- spa_spare_remove(vd);
-
- txg_list_destroy(&vd->vdev_ms_list);
- txg_list_destroy(&vd->vdev_dtl_list);
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_unload(&vd->vdev_dtl_map);
- space_map_destroy(&vd->vdev_dtl_map);
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- space_map_destroy(&vd->vdev_dtl_scrub);
- mutex_exit(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_stat_lock);
-
- if (vd == spa->spa_root_vdev)
- spa->spa_root_vdev = NULL;
-
- kmem_free(vd, sizeof (vdev_t));
-}
-
-/*
* Allocate a new vdev. The 'alloctype' is used to control whether we are
* creating a new vdev or loading an existing one - the behavior is slightly
* different for each case.
@@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vd->vdev_path = spa_strdup(vd->vdev_path);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
vd->vdev_devid = spa_strdup(vd->vdev_devid);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ &vd->vdev_physpath) == 0)
+ vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
/*
* Set the nparity propery for RAID-Z vdevs.
@@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
/*
- * If we're a leaf vdev, try to load the DTL object and offline state.
+ * If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ &vd->vdev_unspare);
+ /*
+ * When importing a pool, we want to ignore the persistent fault
+ * state, as the diagnosis made on another system may not be
+ * valid in the current context.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ &vd->vdev_faulted);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ &vd->vdev_degraded);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ &vd->vdev_removed);
+ }
}
/*
@@ -500,6 +487,7 @@ void
vdev_free(vdev_t *vd)
{
int c;
+ spa_t *spa = vd->vdev_spa;
/*
* vdev_free() implies closing the vdev first. This is simpler than
@@ -507,6 +495,7 @@ vdev_free(vdev_t *vd)
*/
vdev_close(vd);
+
ASSERT(!list_link_active(&vd->vdev_dirty_node));
/*
@@ -535,7 +524,37 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_parent == NULL);
- vdev_free_common(vd);
+ /*
+ * Clean up vdev structure.
+ */
+ vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_unload(&vd->vdev_dtl_map);
+ space_map_destroy(&vd->vdev_dtl_map);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ space_map_destroy(&vd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
}
/*
@@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
vdev_config_dirty(tvd);
}
- tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
- svd->vdev_reopen_wanted = 0;
-
tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
svd->vdev_deflate_ratio = 0;
}
@@ -781,13 +797,12 @@ vdev_open(vdev_t *vd)
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
- if (vd->vdev_ops->vdev_op_leaf) {
- vdev_cache_init(vd);
- vdev_queue_init(vd);
- vd->vdev_cache_active = B_TRUE;
- }
-
- if (vd->vdev_offline) {
+ if (!vd->vdev_removed && vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+ return (ENXIO);
+ } else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
return (ENXIO);
@@ -798,16 +813,25 @@ vdev_open(vdev_t *vd)
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, ENXIO);
- dprintf("%s = %d, osize %llu, state = %d\n",
- vdev_description(vd), error, osize, vd->vdev_state);
-
if (error) {
+ if (vd->vdev_removed &&
+ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+ vd->vdev_removed = B_FALSE;
+
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
return (error);
}
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vd->vdev_removed = B_FALSE;
+
+ if (vd->vdev_degraded) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ } else {
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+ }
for (c = 0; c < vd->vdev_children; c++)
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
@@ -905,8 +929,7 @@ vdev_open(vdev_t *vd)
/*
* Called once the vdevs are all opened, this routine validates the label
* contents. This needs to be done before vdev_load() so that we don't
- * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
- * won't succeed if the device has been changed underneath.
+ * inadvertently do repair I/Os to the wrong device.
*
* This function will only return failure if one of the vdevs indicates that it
* has since been destroyed or exported. This is only possible if
@@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd)
{
vd->vdev_ops->vdev_op_close(vd);
- if (vd->vdev_cache_active) {
- vdev_cache_fini(vd);
- vdev_queue_fini(vd);
- vd->vdev_cache_active = B_FALSE;
- }
+ vdev_cache_purge(vd);
/*
* We record the previous state before we close it, so that if we are
@@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd)
* Call vdev_validate() here to make sure we have the same device.
* Otherwise, a device with an invalid label could be successfully
* opened in response to vdev_reopen().
- *
- * The downside to this is that if the user is simply experimenting by
- * overwriting an entire disk, we'll fault the device rather than
- * demonstrate self-healing capabilities. On the other hand, with
- * proper FMA integration, the series of errors we'd see from the device
- * would result in a faulted device anyway. Given that this doesn't
- * model any real-world corruption, it's better to catch this here and
- * correctly identify that the device has either changed beneath us, or
- * is corrupted beyond recognition.
*/
(void) vdev_validate(vd);
/*
- * Reassess root vdev's health.
+ * Reassess parent vdev's health.
*/
- vdev_propagate_state(spa->spa_root_vdev);
+ vdev_propagate_state(vd);
}
int
@@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd)
return (vd->vdev_ops->vdev_op_type);
}
+/*
+ * Mark the given vdev faulted. A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ */
int
-vdev_online(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid)
if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ /*
+ * Faulted state takes precedence over degraded.
+ */
+ vd->vdev_faulted = 1ULL;
+ vd->vdev_degraded = 0ULL;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+
+ /*
+ * If marking the vdev as faulted cause the toplevel vdev to become
+ * unavailable, then back off and simply mark the vdev as degraded
+ * instead.
+ */
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_degraded = 1ULL;
+ vd->vdev_faulted = 0ULL;
+
+ /*
+ * If we reopen the device and it's not dead, only then do we
+ * mark it degraded.
+ */
+ vdev_reopen(vd);
+
+ if (!vdev_is_dead(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ }
+ }
+
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ return (0);
+}
+
+/*
+ * Mark the given vdev degraded. A degraded vdev is purely an indication to the
+ * user that something is wrong. The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- dprintf("ONLINE: %s\n", vdev_description(vd));
+ /*
+ * If the vdev is already faulted, then don't do anything.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+ return (0);
+ }
+
+ vd->vdev_degraded = 1ULL;
+ if (!vdev_is_dead(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ return (0);
+}
+
+/*
+ * Online the given vdev. If 'unspare' is set, it implies two things. First,
+ * any attached spare device should be detached when the device finishes
+ * resilvering. Second, the online should be treated like a 'test' online case,
+ * so no FMA events are generated if the device fails to open.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+ vdev_state_t *newstate)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
+ vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
+ B_TRUE : B_FALSE;
+ vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
+ B_TRUE : B_FALSE;
vdev_reopen(vd->vdev_top);
+ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+ if (newstate)
+ *newstate = vd->vdev_state;
+ if ((flags & ZFS_ONLINE_UNSPARE) &&
+ !vdev_is_dead(vd) && vd->vdev_parent &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
vdev_config_dirty(vd->vdev_top);
(void) spa_vdev_exit(spa, NULL, txg, 0);
+ /*
+ * Must hold spa_namespace_lock in order to post resilver sysevent
+ * w/pool name.
+ */
+ mutex_enter(&spa_namespace_lock);
VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+ mutex_exit(&spa_namespace_lock);
return (0);
}
int
-vdev_offline(spa_t *spa, uint64_t guid, int istmp)
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- dprintf("OFFLINE: %s\n", vdev_description(vd));
-
/*
* If the device isn't already offline, try to offline it.
*/
@@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp)
}
}
- vd->vdev_tmpoffline = istmp;
+ vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
+ B_TRUE : B_FALSE;
vdev_config_dirty(vd->vdev_top);
@@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd)
for (c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
+
+ /*
+ * If we're in the FAULTED state, then clear the persistent state and
+ * attempt to reopen the device. We also mark the vdev config dirty, so
+ * that the new faulted state is written out to disk.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ vd->vdev_faulted = vd->vdev_degraded = 0;
+ vdev_reopen(vd);
+ vdev_config_dirty(vd->vdev_top);
+
+ if (vd->vdev_faulted)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER,
+ B_TRUE) == 0);
+
+ spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
+ }
}
int
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+ return (vd->vdev_state < VDEV_STATE_DEGRADED);
}
int
@@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio)
break;
}
- if (error != 0) {
- dprintf("returning %d for type %d on %s state %d offset %llx\n",
- error, zio->io_type, vdev_description(vd),
- vd->vdev_state, zio->io_offset);
- }
-
return (error);
}
@@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd)
int c;
vdev_t *child;
- for (c = 0; c < vd->vdev_children; c++) {
- child = vd->vdev_child[c];
- if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
- faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
- degraded++;
-
- if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
- corrupted++;
- }
+ if (vd->vdev_children > 0) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+ if (vdev_is_dead(child))
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
- vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
- /*
- * Root special: if there is a toplevel vdev that cannot be
- * opened due to corrupted metadata, then propagate the root
- * vdev's aux state as 'corrupt' rather than 'insufficient
- * replicas'.
- */
- if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
- vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd &&
+ rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
+
+ if (vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
}
/*
@@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_state = state;
vd->vdev_stat.vs_aux = aux;
- if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we are setting the vdev state to anything but an open state, then
+ * always close the underlying device. Otherwise, we keep accessible
+ * but invalid devices open forever. We don't call vdev_close() itself,
+ * because that implies some extra checks (offline, etc) that we don't
+ * want here. This is limited to leaf devices, because otherwise
+ * closing the device will affect other children.
+ */
+ if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_removed &&
+ state == VDEV_STATE_CANT_OPEN &&
+ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+ /*
+ * If the previous state is set to VDEV_STATE_REMOVED, then this
+ * device was previously marked removed and someone attempted to
+ * reopen it. If this failed due to a nonexistent device, then
+ * keep the device in the REMOVED state. We also let this be if
+ * it is one of our special test online cases, which is only
+ * attempting to online the device and shouldn't generate an FMA
+ * fault.
+ */
+ vd->vdev_state = VDEV_STATE_REMOVED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ } else if (state == VDEV_STATE_REMOVED) {
+ /*
+ * Indicate to the ZFS DE that this device has been removed, and
+ * any recent errors should be ignored.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
+ vd->vdev_removed = B_TRUE;
+ } else if (state == VDEV_STATE_CANT_OPEN) {
/*
* If we fail to open a vdev during an import, we mark it as
* "not available", which signifies that it was never there to
@@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* that this is part of a vdev_reopen(). In this case, we don't
* want to post the ereport if the device was already in the
* CANT_OPEN state beforehand.
+ *
+ * If the 'checkremove' flag is set, then this is an attempt to
+ * online the device in response to an insertion event. If we
+ * hit this case, then we have detected an insertion event for a
+ * faulted or offline device that wasn't in the removed state.
+ * In this scenario, we don't post an ereport because we are
+ * about to replace the device, or attempt an online with
+ * vdev_forcefault, which will generate the fault for us.
*/
- if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
vd != vd->vdev_spa->spa_root_vdev) {
const char *class;
@@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
zfs_ereport_post(class, vd->vdev_spa,
vd, NULL, save_state, 0);
}
- }
- if (isopen)
- return;
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
- if (vd->vdev_parent != NULL)
- vdev_propagate_state(vd->vdev_parent);
+ if (!isopen)
+ vdev_propagate_state(vd);
}