diff options
Diffstat (limited to 'usr/src/uts/common/fs/zfs/vdev.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 401 |
1 files changed, 294 insertions, 107 deletions
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fbb77774c2..9b2ec04710 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. @@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the nparity propery for RAID-Z vdevs. 
@@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -500,6 +487,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -507,6 +495,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + ASSERT(!list_link_active(&vd->vdev_dirty_node)); /* @@ -535,7 +524,37 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. 
+ */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; } @@ -781,13 +797,12 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -798,16 +813,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 
vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -905,8 +929,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. 
*/ (void) vdev_validate(vd); /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd) return (vd->vdev_ops->vdev_op_type); } +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid) if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted causes the toplevel vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (!vdev_is_dead(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. 
+ */ +int +vdev_degrade(spa_t *spa, uint64_t guid) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("ONLINE: %s\n", vdev_description(vd)); + /* + * If the vdev is already faulted or degraded, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) { + (void) spa_vdev_exit(spa, NULL, txg, 0); + return (0); + } + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *newstate) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? + B_TRUE : B_FALSE; + vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? 
+ B_TRUE : B_FALSE; vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; vdev_config_dirty(vd->vdev_top); (void) spa_vdev_exit(spa, NULL, txg, 0); + /* + * Must hold spa_namespace_lock in order to post resilver sysevent + * w/pool name. + */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *rvd, *vd; uint64_t txg; @@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("OFFLINE: %s\n", vdev_description(vd)); - /* * If the device isn't already offline, try to offline it. */ @@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? + B_TRUE : B_FALSE; vdev_config_dirty(vd->vdev_top); @@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd) for (c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state, then clear the persistent state and + * attempt to reopen the device. We also mark the vdev config dirty, so + * that the new faulted state is written out to disk. 
+ */ + if (vd->vdev_faulted || vd->vdev_degraded) { + vd->vdev_faulted = vd->vdev_degraded = 0; + vdev_reopen(vd); + vdev_config_dirty(vd->vdev_top); + + if (vd->vdev_faulted) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, + B_TRUE) == 0); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } int vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } int @@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio) break; } - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } - return (error); } @@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (vdev_is_dead(child)) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. 
- */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; - if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device. Otherwise, we keep accessible + * but invalid devices open forever. We don't call vdev_close() itself, + * because that implies some extra checks (offline, etc) that we don't + * want here. This is limited to leaf devices, because otherwise + * closing the device will affect other children. + */ + if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. 
+ */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to @@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. */ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && vd != vd->vdev_spa->spa_root_vdev) { const char *class; @@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_ereport_post(class, vd->vdev_spa, vd, NULL, save_state, 0); } - } - if (isopen) - return; + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); + if (!isopen) + vdev_propagate_state(vd); } |