summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs/zfs/vdev.c
diff options
context:
space:
mode:
authorgw25295 <none@none>2007-10-24 20:00:39 -0700
committergw25295 <none@none>2007-10-24 20:00:39 -0700
commit0a4e9518a44f226be6d39383330b5b1792d2f184 (patch)
treed651529cab845c82f673ed686ffa5e3a625fbb1c /usr/src/uts/common/fs/zfs/vdev.c
parent1f7be8d9c56cac9b6eeebaed96fe8763d1e90dd6 (diff)
downloadillumos-gate-0a4e9518a44f226be6d39383330b5b1792d2f184.tar.gz
PSARC 2007/567 zpool failmode property
6322646 ZFS should gracefully handle all devices failing (when writing)
6413847 vdev label write failure should be handled more gracefully
6417772 need nicer message on write failure
6417779 ZFS: I/O failure (write on ...) -- need to reallocate writes
6467927 Node gets into a panic loop when devices are fenced off
6565042 ZFS should gracefully handle all devices failing (when reading)
6596239 Stop issuing IOs to a vdev that is going to be removed
Diffstat (limited to 'usr/src/uts/common/fs/zfs/vdev.c')
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c103
1 files changed, 95 insertions, 8 deletions
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 62ebf19a61..aed7d53ba1 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -793,6 +793,21 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
+/*
+ * Probe a vdev.  Returns EINVAL when 'vd' is NULL.  For a leaf vdev the
+ * result of its vdev_op_probe operation is returned; for any other vdev
+ * the function returns 0 without calling a probe operation.
+ */
+int
+vdev_probe(vdev_t *vd)
+{
+	if (vd == NULL)
+		return (EINVAL);
+
+	/*
+	 * Status checks are only supported on leaf vdevs for now, so
+	 * anything that is not a leaf is reported as success.
+	 */
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (0);
+
+	return (vd->vdev_ops->vdev_op_probe(vd));
+}
+
/*
* Prepare a virtual device for access.
*/
@@ -919,6 +934,17 @@ vdev_open(vdev_t *vd)
}
/*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ error = vdev_probe(vd);
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (error);
+ }
+
+ /*
* If this is a top-level vdev, compute the raidz-deflation
* ratio. Note, we hard-code in 128k (1<<17) because it is the
* current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
@@ -1467,6 +1493,17 @@ vdev_fault(spa_t *spa, uint64_t guid)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev fault request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1499,7 +1536,7 @@ vdev_fault(spa_t *spa, uint64_t guid)
*/
vdev_reopen(vd);
- if (!vdev_is_dead(vd)) {
+ if (vdev_readable(vd)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
}
@@ -1523,6 +1560,17 @@ vdev_degrade(spa_t *spa, uint64_t guid)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev degrade request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1564,6 +1612,17 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev online request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1612,6 +1671,17 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_t *rvd, *vd;
uint64_t txg;
+ /*
+ * Disregard a vdev offline request if the pool has
+ * experienced a complete failure.
+ *
+ * XXX - We do this here so that we don't hold the
+ * spa_namespace_lock in the event that we can't get
+ * the RW_WRITER spa_config_lock.
+ */
+ if (spa_state(spa) == POOL_STATE_IO_FAILURE)
+ return (EIO);
+
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
@@ -1662,9 +1732,11 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
* Clear the error counts associated with this vdev. Unlike vdev_online() and
* vdev_offline(), we assume the spa config is locked. We also clear all
* children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ * If 'reopen_wanted' is set, then also attempt to reopen the vdev if it is
+ * faulted or degraded.
*/
void
-vdev_clear(spa_t *spa, vdev_t *vd)
+vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
{
int c;
@@ -1674,16 +1746,17 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_is_failing = B_FALSE;
for (c = 0; c < vd->vdev_children; c++)
- vdev_clear(spa, vd->vdev_child[c]);
+ vdev_clear(spa, vd->vdev_child[c], reopen_wanted);
/*
* If we're in the FAULTED state, then clear the persistent state and
* attempt to reopen the device. We also mark the vdev config dirty, so
* that the new faulted state is written out to disk.
*/
- if (vd->vdev_faulted || vd->vdev_degraded) {
+ if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) {
vd->vdev_faulted = vd->vdev_degraded = 0;
vdev_reopen(vd);
vdev_config_dirty(vd->vdev_top);
@@ -1696,6 +1769,20 @@ vdev_clear(spa_t *spa, vdev_t *vd)
}
int
+vdev_readable(vdev_t *vd)
+{
+ /* XXPOLICY */
+ return (!vdev_is_dead(vd));
+}
+
+/*
+ * Report whether 'vd' may be written to.  A leaf vdev is judged by its
+ * vdev_is_failing flag; any other vdev is judged by vdev_is_dead().
+ * Returns nonzero when writeable, 0 otherwise.
+ */
+int
+vdev_writeable(vdev_t *vd)
+{
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (!vd->vdev_is_failing);
+
+	return (!vdev_is_dead(vd));
+}
+
+int
vdev_is_dead(vdev_t *vd)
{
return (vd->vdev_state < VDEV_STATE_DEGRADED);
@@ -1800,7 +1887,7 @@ vdev_stat_update(zio_t *zio)
if (flags & ZIO_FLAG_SPECULATIVE)
return;
- if (!vdev_is_dead(vd)) {
+ if (vdev_readable(vd)) {
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ) {
if (zio->io_error == ECKSUM)
@@ -1962,9 +2049,9 @@ vdev_propagate_state(vdev_t *vd)
if (vd->vdev_children > 0) {
for (c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
- if (vdev_is_dead(child))
+ if (vdev_is_dead(child) && !vdev_readable(child))
faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ else if (child->vdev_state <= VDEV_STATE_DEGRADED)
degraded++;
if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
@@ -2020,7 +2107,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* want here. This is limited to leaf devices, because otherwise
* closing the device will affect other children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
if (vd->vdev_removed &&