PSARC 2007/197 ZFS hotplug

PSARC 2007/283 FMA for ZFS Phase 2 6401126 ZFS DE should verify that diagnosis is still valid before solving cases 6500545 ZFS does not handle changes in devids 6508521 zpool online should warn when it is being used incorrectly 6509807 ZFS checksum ereports are not being posted 6514712 zfs_nicenum() doesn't work with perfectly-sized buffers 6520510 media state doesn't get updated properly on device removal 6520513 ZFS should have better support for device removal 6520514 vdev state should be controlled through a single ioctl() 6520519 ZFS should diagnose faulty devices 6520947 ZFS DE should close cases which no longer apply 6521393 ZFS case timeout should be FMD_TYPE_TIME 6521624 fmd_hash_walk() can dump core when given a bad address 6521946 ZFS DE needlessly subscribes to faults 6522085 ZFS dictionary files contain spelling errors 6523185 vdev_reopen() doesn't correctly propagate state 6523555 'zpool online' should be less chatty unless something goes wrong 6527379 zpool(1M) should not try to open faulted devices 6527700 ZFS should post a sysevent when topology changes 6528194 lofi should support force unmap and DKIO_DEV_GONE 6528732 ZFS should store physical device path in addition to /dev path 6532635 ZFS keeps devices open unnecessarily 6532979 bad argument to ZFS_IOC_VDEV_ATTACH can panic system 6567983 deadlock with spa_scrub_thread() and spa_namespace_lock
author: eschrock <none@none> 2007-06-12 13:18:17 -0700
committer: eschrock <none@none> 2007-06-12 13:18:17 -0700
commit: 3d7072f8bd27709dba14f6fe336f149d25d9e207 (patch)
tree: d325ae63ce74901b55494e8a0dc011b9e2e13d43 /usr/src/uts/common/fs/zfs/vdev.c
parent: a5b881a79e40ec2c21d682e676b130a1ee3d2a73 (diff)
download: illumos-joyent-3d7072f8bd27709dba14f6fe336f149d25d9e207.tar.gz
1 files changed, 294 insertions, 107 deletions
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index fbb77774c2..9b2ec04710 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	txg_list_create(&vd->vdev_dtl_list,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
+	vdev_queue_init(vd);
+	vdev_cache_init(vd);
 
 	return (vd);
 }
 
 /*
- * Free a vdev_t that has been removed from service.
- */
-static void
-vdev_free_common(vdev_t *vd)
-{
-	spa_t *spa = vd->vdev_spa;
-
-	if (vd->vdev_path)
-		spa_strfree(vd->vdev_path);
-	if (vd->vdev_devid)
-		spa_strfree(vd->vdev_devid);
-
-	if (vd->vdev_isspare)
-		spa_spare_remove(vd);
-
-	txg_list_destroy(&vd->vdev_ms_list);
-	txg_list_destroy(&vd->vdev_dtl_list);
-	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_unload(&vd->vdev_dtl_map);
-	space_map_destroy(&vd->vdev_dtl_map);
-	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-	space_map_destroy(&vd->vdev_dtl_scrub);
-	mutex_exit(&vd->vdev_dtl_lock);
-	mutex_destroy(&vd->vdev_dtl_lock);
-	mutex_destroy(&vd->vdev_stat_lock);
-
-	if (vd == spa->spa_root_vdev)
-		spa->spa_root_vdev = NULL;
-
-	kmem_free(vd, sizeof (vdev_t));
-}
-
-/*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
@@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 		vd->vdev_path = spa_strdup(vd->vdev_path);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+	    &vd->vdev_physpath) == 0)
+		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 
 	/*
 	 * Set the nparity propery for RAID-Z vdevs.
@@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	}
 
 	/*
-	 * If we're a leaf vdev, try to load the DTL object and offline state.
+	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 		    &vd->vdev_dtl.smo_object);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+		    &vd->vdev_unspare);
+		/*
+		 * When importing a pool, we want to ignore the persistent fault
+		 * state, as the diagnosis made on another system may not be
+		 * valid in the current context.
+		 */
+		if (spa->spa_load_state == SPA_LOAD_OPEN) {
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+			    &vd->vdev_faulted);
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+			    &vd->vdev_degraded);
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+			    &vd->vdev_removed);
+		}
 	}
 
 	/*
@@ -500,6 +487,7 @@ void
 vdev_free(vdev_t *vd)
 {
 	int c;
+	spa_t *spa = vd->vdev_spa;
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
@@ -507,6 +495,7 @@ vdev_free(vdev_t *vd)
 	 */
 	vdev_close(vd);
 
+
 	ASSERT(!list_link_active(&vd->vdev_dirty_node));
 
 	/*
@@ -535,7 +524,37 @@ vdev_free(vdev_t *vd)
 
 	ASSERT(vd->vdev_parent == NULL);
 
-	vdev_free_common(vd);
+	/*
+	 * Clean up vdev structure.
+	 */
+	vdev_queue_fini(vd);
+	vdev_cache_fini(vd);
+
+	if (vd->vdev_path)
+		spa_strfree(vd->vdev_path);
+	if (vd->vdev_devid)
+		spa_strfree(vd->vdev_devid);
+	if (vd->vdev_physpath)
+		spa_strfree(vd->vdev_physpath);
+
+	if (vd->vdev_isspare)
+		spa_spare_remove(vd);
+
+	txg_list_destroy(&vd->vdev_ms_list);
+	txg_list_destroy(&vd->vdev_dtl_list);
+	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_unload(&vd->vdev_dtl_map);
+	space_map_destroy(&vd->vdev_dtl_map);
+	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_scrub);
+	mutex_exit(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_stat_lock);
+
+	if (vd == spa->spa_root_vdev)
+		spa->spa_root_vdev = NULL;
+
+	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
@@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 		vdev_config_dirty(tvd);
 	}
 
-	tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
-	svd->vdev_reopen_wanted = 0;
-
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 }
@@ -781,13 +797,12 @@ vdev_open(vdev_t *vd)
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 
-	if (vd->vdev_ops->vdev_op_leaf) {
-		vdev_cache_init(vd);
-		vdev_queue_init(vd);
-		vd->vdev_cache_active = B_TRUE;
-	}
-
-	if (vd->vdev_offline) {
+	if (!vd->vdev_removed && vd->vdev_faulted) {
+		ASSERT(vd->vdev_children == 0);
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_ERR_EXCEEDED);
+		return (ENXIO);
+	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (ENXIO);
@@ -798,16 +813,25 @@ vdev_open(vdev_t *vd)
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, ENXIO);
 
-	dprintf("%s = %d, osize %llu, state = %d\n",
-	    vdev_description(vd), error, osize, vd->vdev_state);
-
 	if (error) {
+		if (vd->vdev_removed &&
+		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+			vd->vdev_removed = B_FALSE;
+
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    vd->vdev_stat.vs_aux);
 		return (error);
 	}
 
-	vd->vdev_state = VDEV_STATE_HEALTHY;
+	vd->vdev_removed = B_FALSE;
+
+	if (vd->vdev_degraded) {
+		ASSERT(vd->vdev_children == 0);
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+		    VDEV_AUX_ERR_EXCEEDED);
+	} else {
+		vd->vdev_state = VDEV_STATE_HEALTHY;
+	}
 
 	for (c = 0; c < vd->vdev_children; c++)
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
@@ -905,8 +929,7 @@ vdev_open(vdev_t *vd)
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents.  This needs to be done before vdev_load() so that we don't
- * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
- * won't succeed if the device has been changed underneath.
+ * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
@@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd)
 {
 	vd->vdev_ops->vdev_op_close(vd);
 
-	if (vd->vdev_cache_active) {
-		vdev_cache_fini(vd);
-		vdev_queue_fini(vd);
-		vd->vdev_cache_active = B_FALSE;
-	}
+	vdev_cache_purge(vd);
 
 	/*
 	 * We record the previous state before we close it, so  that if we are
@@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd)
 	 * Call vdev_validate() here to make sure we have the same device.
 	 * Otherwise, a device with an invalid label could be successfully
 	 * opened in response to vdev_reopen().
-	 *
-	 * The downside to this is that if the user is simply experimenting by
-	 * overwriting an entire disk, we'll fault the device rather than
-	 * demonstrate self-healing capabilities.  On the other hand, with
-	 * proper FMA integration, the series of errors we'd see from the device
-	 * would result in a faulted device anyway.  Given that this doesn't
-	 * model any real-world corruption, it's better to catch this here and
-	 * correctly identify that the device has either changed beneath us, or
-	 * is corrupted beyond recognition.
 	 */
 	(void) vdev_validate(vd);
 
 	/*
-	 * Reassess root vdev's health.
+	 * Reassess parent vdev's health.
 	 */
-	vdev_propagate_state(spa->spa_root_vdev);
+	vdev_propagate_state(vd);
 }
 
 int
@@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd)
 	return (vd->vdev_ops->vdev_op_type);
 }
 
+/*
+ * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ */
 int
-vdev_online(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid)
 {
 	vdev_t *rvd, *vd;
 	uint64_t txg;
@@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid)
 
 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
+	/*
+	 * Faulted state takes precedence over degraded.
+	 */
+	vd->vdev_faulted = 1ULL;
+	vd->vdev_degraded = 0ULL;
+	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
+	    VDEV_AUX_ERR_EXCEEDED);
+
+	/*
+	 * If marking the vdev as faulted cause the toplevel vdev to become
+	 * unavailable, then back off and simply mark the vdev as degraded
+	 * instead.
+	 */
+	if (vdev_is_dead(vd->vdev_top)) {
+		vd->vdev_degraded = 1ULL;
+		vd->vdev_faulted = 0ULL;
+
+		/*
+		 * If we reopen the device and it's not dead, only then do we
+		 * mark it degraded.
+		 */
+		vdev_reopen(vd);
+
+		if (!vdev_is_dead(vd)) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+			    VDEV_AUX_ERR_EXCEEDED);
+		}
+	}
+
+	vdev_config_dirty(vd->vdev_top);
+
+	(void) spa_vdev_exit(spa, NULL, txg, 0);
+
+	return (0);
+}
+
+/*
+ * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
+ * user that something is wrong.  The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid)
+{
+	vdev_t *rvd, *vd;
+	uint64_t txg;
+
+	txg = spa_vdev_enter(spa);
+
+	rvd = spa->spa_root_vdev;
+
+	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
-	dprintf("ONLINE: %s\n", vdev_description(vd));
+	/*
+	 * If the vdev is already faulted, then don't do anything.
+	 */
+	if (vd->vdev_faulted || vd->vdev_degraded) {
+		(void) spa_vdev_exit(spa, NULL, txg, 0);
+		return (0);
+	}
+
+	vd->vdev_degraded = 1ULL;
+	if (!vdev_is_dead(vd))
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+		    VDEV_AUX_ERR_EXCEEDED);
+	vdev_config_dirty(vd->vdev_top);
+
+	(void) spa_vdev_exit(spa, NULL, txg, 0);
+
+	return (0);
+}
+
+/*
+ * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
+ * any attached spare device should be detached when the device finishes
+ * resilvering.  Second, the online should be treated like a 'test' online case,
+ * so no FMA events are generated if the device fails to open.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+    vdev_state_t *newstate)
+{
+	vdev_t *rvd, *vd;
+	uint64_t txg;
+
+	txg = spa_vdev_enter(spa);
+
+	rvd = spa->spa_root_vdev;
+
+	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
+	vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
+	    B_TRUE : B_FALSE;
+	vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
+	    B_TRUE : B_FALSE;
 	vdev_reopen(vd->vdev_top);
+	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+	if (newstate)
+		*newstate = vd->vdev_state;
+	if ((flags & ZFS_ONLINE_UNSPARE) &&
+	    !vdev_is_dead(vd) && vd->vdev_parent &&
+	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+	    vd->vdev_parent->vdev_child[0] == vd)
+		vd->vdev_unspare = B_TRUE;
 
 	vdev_config_dirty(vd->vdev_top);
 
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
+	/*
+	 * Must hold spa_namespace_lock in order to post resilver sysevent
+	 * w/pool name.
+	 */
+	mutex_enter(&spa_namespace_lock);
 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 int
-vdev_offline(spa_t *spa, uint64_t guid, int istmp)
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *rvd, *vd;
 	uint64_t txg;
@@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp)
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
-	dprintf("OFFLINE: %s\n", vdev_description(vd));
-
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
@@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp)
 		}
 	}
 
-	vd->vdev_tmpoffline = istmp;
+	vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
+	    B_TRUE : B_FALSE;
 
 	vdev_config_dirty(vd->vdev_top);
 
@@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 
 	for (c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
+
+	/*
+	 * If we're in the FAULTED state, then clear the persistent state and
+	 * attempt to reopen the device.  We also mark the vdev config dirty, so
+	 * that the new faulted state is written out to disk.
+	 */
+	if (vd->vdev_faulted || vd->vdev_degraded) {
+		vd->vdev_faulted = vd->vdev_degraded = 0;
+		vdev_reopen(vd);
+		vdev_config_dirty(vd->vdev_top);
+
+		if (vd->vdev_faulted)
+			VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER,
+			    B_TRUE) == 0);
+
+		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
+	}
 }
 
 int
 vdev_is_dead(vdev_t *vd)
 {
-	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+	return (vd->vdev_state < VDEV_STATE_DEGRADED);
 }
 
 int
@@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio)
 		break;
 	}
 
-	if (error != 0) {
-		dprintf("returning %d for type %d on %s state %d offset %llx\n",
-		    error, zio->io_type, vdev_description(vd),
-		    vd->vdev_state, zio->io_offset);
-	}
-
 	return (error);
 }
 
@@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd)
 	int c;
 	vdev_t *child;
 
-	for (c = 0; c < vd->vdev_children; c++) {
-		child = vd->vdev_child[c];
-		if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
-			faulted++;
-		else if (child->vdev_state == VDEV_STATE_DEGRADED)
-			degraded++;
-
-		if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
-			corrupted++;
-	}
+	if (vd->vdev_children > 0) {
+		for (c = 0; c < vd->vdev_children; c++) {
+			child = vd->vdev_child[c];
+			if (vdev_is_dead(child))
+				faulted++;
+			else if (child->vdev_state == VDEV_STATE_DEGRADED)
+				degraded++;
 
-	vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+				corrupted++;
+		}
 
-	/*
-	 * Root special: if there is a toplevel vdev that cannot be
-	 * opened due to corrupted metadata, then propagate the root
-	 * vdev's aux state as 'corrupt' rather than 'insufficient
-	 * replicas'.
-	 */
-	if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
-		vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
-		    VDEV_AUX_CORRUPT_DATA);
+		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+		/*
+		 * Root special: if there is a toplevel vdev that cannot be
+		 * opened due to corrupted metadata, then propagate the root
+		 * vdev's aux state as 'corrupt' rather than 'insufficient
+		 * replicas'.
+		 */
+		if (corrupted && vd == rvd &&
+		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+	}
+
+	if (vd->vdev_parent)
+		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
@@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
-	if (state == VDEV_STATE_CANT_OPEN) {
+	/*
+	 * If we are setting the vdev state to anything but an open state, then
+	 * always close the underlying device.  Otherwise, we keep accessible
+	 * but invalid devices open forever.  We don't call vdev_close() itself,
+	 * because that implies some extra checks (offline, etc) that we don't
+	 * want here.  This is limited to leaf devices, because otherwise
+	 * closing the device will affect other children.
+	 */
+	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+		vd->vdev_ops->vdev_op_close(vd);
+
+	if (vd->vdev_removed &&
+	    state == VDEV_STATE_CANT_OPEN &&
+	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+		/*
+		 * If the previous state is set to VDEV_STATE_REMOVED, then this
+		 * device was previously marked removed and someone attempted to
+		 * reopen it.  If this failed due to a nonexistent device, then
+		 * keep the device in the REMOVED state.  We also let this be if
+		 * it is one of our special test online cases, which is only
+		 * attempting to online the device and shouldn't generate an FMA
+		 * fault.
+		 */
+		vd->vdev_state = VDEV_STATE_REMOVED;
+		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+	} else if (state == VDEV_STATE_REMOVED) {
+		/*
+		 * Indicate to the ZFS DE that this device has been removed, and
+		 * any recent errors should be ignored.
+		 */
+		zfs_post_remove(vd->vdev_spa, vd);
+		vd->vdev_removed = B_TRUE;
+	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import, we mark it as
 		 * "not available", which signifies that it was never there to
@@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
+		 *
+		 * If the 'checkremove' flag is set, then this is an attempt to
+		 * online the device in response to an insertion event.  If we
+		 * hit this case, then we have detected an insertion event for a
+		 * faulted or offline device that wasn't in the removed state.
+		 * In this scenario, we don't post an ereport because we are
+		 * about to replace the device, or attempt an online with
+		 * vdev_forcefault, which will generate the fault for us.
 		 */
-		if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
+		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != vd->vdev_spa->spa_root_vdev) {
 			const char *class;
 
@@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 			zfs_ereport_post(class, vd->vdev_spa,
 			    vd, NULL, save_state, 0);
 		}
-	}
 
-	if (isopen)
-		return;
+		/* Erase any notion of persistent removed state */
+		vd->vdev_removed = B_FALSE;
+	} else {
+		vd->vdev_removed = B_FALSE;
+	}
 
-	if (vd->vdev_parent != NULL)
-		vdev_propagate_state(vd->vdev_parent);
+	if (!isopen)
+		vdev_propagate_state(vd);
 }
author	eschrock <none@none>	2007-06-12 13:18:17 -0700
committer	eschrock <none@none>	2007-06-12 13:18:17 -0700
commit	3d7072f8bd27709dba14f6fe336f149d25d9e207 (patch)
tree	d325ae63ce74901b55494e8a0dc011b9e2e13d43 /usr/src/uts/common/fs/zfs/vdev.c
parent	a5b881a79e40ec2c21d682e676b130a1ee3d2a73 (diff)
download	illumos-joyent-3d7072f8bd27709dba14f6fe336f149d25d9e207.tar.gz