Diffstat (limited to 'usr/src/uts')
 usr/src/uts/common/fs/zfs/metaslab.c          |  50
 usr/src/uts/common/fs/zfs/spa.c               | 217
 usr/src/uts/common/fs/zfs/spa_config.c        |   7
 usr/src/uts/common/fs/zfs/spa_misc.c          |  49
 usr/src/uts/common/fs/zfs/sys/metaslab.h      |   4
 usr/src/uts/common/fs/zfs/sys/metaslab_impl.h |   1
 usr/src/uts/common/fs/zfs/sys/spa.h           |   3
 usr/src/uts/common/fs/zfs/sys/vdev.h          |   1
 usr/src/uts/common/fs/zfs/sys/vdev_impl.h     |   8
 usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h     |   1
 usr/src/uts/common/fs/zfs/sys/zio.h           |   1
 usr/src/uts/common/fs/zfs/vdev.c              | 135
 usr/src/uts/common/fs/zfs/vdev_label.c        |  46
 usr/src/uts/common/fs/zfs/vdev_missing.c      |  17
 usr/src/uts/common/fs/zfs/zio_inject.c        |  39
 usr/src/uts/common/sys/fs/zfs.h               |  10
 16 files changed, 505 insertions(+), 84 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 77556ac5d7..3ebde10240 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -57,12 +57,13 @@ int metaslab_df_free_pct = 30;
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
@@ -126,6 +127,32 @@ metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
mg->mg_class = NULL;
}
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
+}
+
/*
* ==========================================================================
* Metaslab groups
@@ -634,6 +661,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_tx_t *tx;
int t;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
/*
@@ -721,6 +750,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_t *vd = mg->mg_vd;
int t;
+ ASSERT(!vd->vdev_ishole);
+
mutex_enter(&msp->ms_lock);
/*
@@ -932,10 +963,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (flags & METASLAB_HINTBP_AVOID)
- mg = vd->vdev_mg->mg_next;
- else
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists (i.e. removed). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
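
The new metaslab_class_validate() must be called with a spa_config lock held, per the assertion above. A minimal sketch of a caller under that assumption — the helper itself is hypothetical, while spa_normal_class() and spa_log_class() are the existing accessors from spa.h:

    /*
     * Hypothetical helper: validate both allocation classes while
     * holding SCL_ALL as writer, mirroring spa_vdev_config_exit().
     */
    static void
    spa_check_classes(spa_t *spa)
    {
            spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
            ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
            ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
            spa_config_exit(spa, SCL_ALL, FTAG);
    }
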
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 53e1ac0f4a..f503592396 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -42,6 +42,7 @@
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -578,8 +579,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t];
@@ -1101,26 +1102,23 @@ spa_check_removed(vdev_t *vd)
* that the label does not contain the most up-to-date information.
*/
void
-spa_load_log_state(spa_t *spa)
+spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *ovd, *rvd = spa->spa_root_vdev;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
-
- for (int c = 0; c < children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
+ /*
+ * Load the original root vdev tree from the passed config.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ if (cvd->vdev_islog)
+ vdev_load_log_state(cvd, ovd->vdev_child[c]);
}
- nvlist_free(nv);
+ vdev_free(ovd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
@@ -1151,7 +1149,7 @@ static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
int error = 0;
- nvlist_t *nvroot = NULL;
+ nvlist_t *nvconfig, *nvroot = NULL;
vdev_t *rvd;
uberblock_t *ub = &spa->spa_uberblock;
uint64_t config_cache_txg = spa->spa_config_txg;
@@ -1306,23 +1304,22 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
goto out;
}
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
if (!mosconfig) {
- nvlist_t *newconfig;
uint64_t hostid;
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(newconfig,
+ VERIFY(nvlist_lookup_string(nvconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
#ifdef _KERNEL
@@ -1347,12 +1344,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
}
}
- spa_config_set(spa, newconfig);
+ spa_config_set(spa, nvconfig);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
- return (spa_load(spa, newconfig, state, B_TRUE));
+ return (spa_load(spa, nvconfig, state, B_TRUE));
}
if (zap_lookup(spa->spa_meta_objset,
@@ -1471,7 +1468,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
}
- spa_load_log_state(spa);
+ VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_load_log_state(spa, nvroot);
+ nvlist_free(nvconfig);
if (spa_check_logs(spa)) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -2910,7 +2910,7 @@ spa_reset(char *pool)
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -2951,9 +2951,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* Transfer each new top-level vdev from vd to rvd.
*/
for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
@@ -3136,6 +3146,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
tvd = newvd->vdev_top;
@@ -3444,16 +3455,127 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result, we use
+ * the spa_vdev_config_[enter/exit] functions, which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
+/*
+ * Initial phase of device removal - stop future allocations from this device.
+ */
+void
+spa_vdev_remove_start(spa_t *spa, vdev_t *vd)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * Remove our vdev from the allocatable vdevs
+ */
+ if (mg)
+ metaslab_class_remove(mg->mg_class, mg);
+}
+
+/*
+ * Evacuate the device.
+ */
+int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+ uint64_t txg;
+ int error;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer
+ * since we need to do I/O, but we do keep the
+ * spa_namespace_lock held. Once this completes, the device
+ * should no longer have any blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ /*
+ * Evacuate the device.
+ */
+ if (error = dmu_objset_find(spa_name(spa),
+ zil_vdev_offline, NULL, DS_FIND_CHILDREN)) {
+ uint64_t txg;
+
+ txg = spa_vdev_config_enter(spa);
+ metaslab_class_add(spa->spa_log_class,
+ vd->vdev_mg);
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Remove any remaining MOS metadata associated with the device.
+ */
+ txg = spa_vdev_config_enter(spa);
+ vd->vdev_removing = B_TRUE;
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_config_dirty(vd);
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+void
+spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ uint64_t id = vd->vdev_id;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ vdev_free(vd);
+
+ /*
+ * It's possible that another thread is trying to do a spa_vdev_add()
+ * at the same time we're trying to remove it. As a result the
+ * added vdev may not have initialized its metaslabs yet.
+ */
+ if (mg != NULL)
+ metaslab_group_destroy(mg);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
* Remove a device from the pool. Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
nvlist_t **spares, **l2cache, *nv;
- uint_t nspares, nl2cache;
uint64_t txg = 0;
+ uint_t nspares, nl2cache;
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
@@ -3489,6 +3611,29 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+
+ /*
+ * XXX - Once we have bp-rewrite this should
+ * become the common case.
+ */
+
+ /*
+ * 1. Stop allocations
+ * 2. Evacuate the device (i.e. kill off stubby and
+ * metadata) and wait for it to complete (i.e. sync).
+ * 3. Cleanup the vdev namespace.
+ */
+ spa_vdev_remove_start(spa, vd);
+
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
+ return (error);
+ txg = spa_vdev_config_enter(spa);
+
+ spa_vdev_remove_done(spa, vd);
+
} else if (vd != NULL) {
/*
* Normal vdevs cannot be removed (yet).
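
Taken together, log-device removal is a three-phase protocol: phases 1 and 3 hold SCL_ALL as writer, while the evacuation phase drops the config lock (but never spa_namespace_lock) so it can issue I/O. A condensed sketch of the sequence, with the error paths simplified relative to the code above:

    uint64_t txg;
    int error;

    txg = spa_vdev_enter(spa);                      /* namespace lock + SCL_ALL */

    spa_vdev_remove_start(spa, vd);                 /* 1. stop new allocations */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);  /* sync config, keep namespace lock */

    error = spa_vdev_remove_evacuate(spa, vd);      /* 2. evacuate; does I/O */

    txg = spa_vdev_config_enter(spa);               /* re-grab SCL_ALL */
    if (error == 0)
            spa_vdev_remove_done(spa, vd);          /* 3. clean up the namespace */

    return (spa_vdev_exit(spa, NULL, txg, error));  /* final sync, drop both locks */
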
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index b2063bba13..d611e0aa9b 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -383,6 +383,13 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vd = vd->vdev_top; /* label contains top config */
}
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace as older pools will
+ * just ignore it.
+ */
+ vdev_top_config_generate(spa, config);
+
nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 9384db4ae9..38474c194d 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -836,6 +836,18 @@ uint64_t
spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
@@ -843,14 +855,14 @@ spa_vdev_enter(spa_t *spa)
}
/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
*/
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
@@ -870,9 +882,23 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
config_changed = B_TRUE;
}
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa->spa_normal_class) == 0);
+ ASSERT(metaslab_class_validate(spa->spa_log_class) == 0);
+
spa_config_exit(spa, SCL_ALL, spa);
/*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag);
+
+ /*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
@@ -892,7 +918,18 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
if (config_changed)
spa_config_sync(spa, B_FALSE, B_TRUE);
+}
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
return (error);
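
The enter/exit split preserves one invariant: spa_namespace_lock stays held across the whole operation, while SCL_ALL may be taken and dropped repeatedly, syncing a txg each time. An illustrative nesting for a hypothetical two-step reconfiguration:

    uint64_t txg;

    mutex_enter(&spa_namespace_lock);               /* the part spa_vdev_enter() adds */

    txg = spa_vdev_config_enter(spa);               /* SCL_ALL as writer */
    /* ... first batch of vdev changes ... */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);  /* waits for txg to sync */

    txg = spa_vdev_config_enter(spa);               /* second config transaction */
    /* ... second batch ... */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

    mutex_exit(&spa_namespace_lock);
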
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 5d3e11c971..78a5f94952 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -57,10 +57,12 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index d67dea7e97..bdf9559631 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -37,6 +37,7 @@ extern "C" {
#endif
struct metaslab_class {
+ spa_t *mc_spa;
metaslab_group_t *mc_rotor;
uint64_t mc_allocated;
space_map_ops_t *mc_ops;
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index bccee25da9..b4165b24c8 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -430,6 +430,9 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index a76cecb4b2..ecf6c2fe17 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -122,6 +122,7 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 23780430df..bb2f98c33e 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -129,6 +129,7 @@ struct vdev {
boolean_t vdev_expanding; /* expand the vdev? */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
@@ -143,10 +144,12 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
+ boolean_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
/*
* Leaf vdev state.
@@ -248,6 +251,8 @@ typedef struct vdev_label {
/*
* Allocate or free a vdev
*/
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
@@ -264,7 +269,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -280,6 +285,7 @@ extern vdev_ops_t vdev_raidz_ops;
extern vdev_ops_t vdev_disk_ops;
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index b7a2f57cbc..37615ba35f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -117,6 +117,7 @@ typedef struct zinject_record {
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
} zinject_record_t;
#define ZINJECT_NULL 0x1
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index a85a1cdfcb..305c697697 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -442,6 +442,7 @@ extern int zio_inject_fault(char *name, int flags, int *id,
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 415cd4a9e9..9c8aa43425 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = {
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -281,7 +282,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -293,7 +294,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
@@ -318,6 +319,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -397,6 +399,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -472,6 +477,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -705,6 +716,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -772,9 +784,14 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -1105,6 +1122,12 @@ vdev_open(vdev_t *vd)
vd->vdev_state = VDEV_STATE_HEALTHY;
}
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1393,6 +1416,7 @@ void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
if (flags & VDD_METASLAB)
@@ -1502,7 +1526,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole)
return;
if (vd->vdev_ops->vdev_op_leaf) {
@@ -1592,6 +1616,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1619,6 +1645,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1755,7 +1783,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1812,10 +1840,48 @@ vdev_validate_aux(vdev_t *vd)
}
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ ASSERT(!vd->vdev_ishole);
+
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
}
@@ -1828,6 +1894,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1838,6 +1906,9 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ if (vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -2110,7 +2181,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2139,7 +2218,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
}
boolean_t
@@ -2391,7 +2470,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
* Don't count non-normal (e.g. intent log) space as part of
* the pool's capacity.
*/
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+ if (vd->vdev_islog)
return;
mutex_enter(&rvd->vdev_stat_lock);
@@ -2472,7 +2551,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2546,6 +2626,12 @@ vdev_propagate_state(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2739,32 +2825,31 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (int c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* the txg threads have not been started yet.
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ vdev_reopen(nvd->vdev_top);
}
}
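
Because vdev_is_dead() now also covers holes and missing devices, iteration code can stay hole-unaware, as the comment in that hunk intends. An illustrative loop (rvd is assumed to be the root vdev):

    for (int c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];

            if (vdev_is_dead(tvd))
                    continue;       /* skips holes and missing vdevs too */

            /* ... safe to issue I/O or gather stats against tvd ... */
    }
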
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 06cb720128..87adc01622 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -287,6 +287,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
vdev_get_stats(vd, &vs);
@@ -298,6 +302,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
nvlist_t **child;
int c;
+ ASSERT(!vd->vdev_ishole);
+
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
@@ -329,11 +335,45 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ idx = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
@@ -516,6 +556,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
crtxg, reason)) != 0)
return (error);
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
@@ -976,6 +1019,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
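
A consumer of the two new config pairs can rebuild hole placeholders before walking the child array. A sketch under that assumption — the enclosing function and the loop body are hypothetical, but the nvlist accessors and pair names match the code above:

    uint64_t nchildren;
    uint64_t *holes;
    uint_t nholes;

    VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
        &nchildren) == 0);
    if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
        &holes, &nholes) == 0) {
            for (uint_t i = 0; i < nholes; i++) {
                    /* slot holes[i] is a hole; later vdev ids stay stable */
            }
    }
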
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index 731f7d3dce..e1bf7d86a3 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
+ *psize = 0;
+ *ashift = 0;
return (0);
}
@@ -83,3 +83,14 @@ vdev_ops_t vdev_missing_ops = {
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
+
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
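
vdev_hole_ops reuses the missing vdev's no-op callbacks; only the type string differs, and that string is what config parsing keys on. A sketch of the existing vdev_getops() logic in vdev.c, which resolves VDEV_TYPE_HOLE against the ops table shown earlier:

    static vdev_ops_t *
    vdev_getops(const char *type)
    {
            vdev_ops_t *ops, **opspp;

            for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
                    if (strcmp(ops->vdev_op_type, type) == 0)
                            break;

            return (ops);
    }
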
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index f8e6880c90..c5ff55243a 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -96,6 +96,30 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
}
/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
*/
@@ -126,8 +150,9 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_spa != handler->zi_spa)
continue;
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
+ /* Ignore device errors and panic injection */
+ if (handler->zi_record.zi_guid != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/* If this handler matches, return EIO */
@@ -170,8 +195,9 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults */
- if (handler->zi_record.zi_start == 0)
+ /* Ignore device only faults or panic injection */
+ if (handler->zi_record.zi_start == 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/*
@@ -205,8 +231,9 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore label specific faults */
- if (handler->zi_record.zi_start != 0)
+ /* Ignore label specific faults or panic injection */
+ if (handler->zi_record.zi_start != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
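
With zi_func carried in the injection record, a test can request a panic the next time a tagged config change completes; spa_vdev_config_exit() passes its caller's FTAG (the enclosing function name) as the tag. A kernel-side sketch of arming such a handler — pool_name is an assumption, and the zinject(1M) ioctl plumbing that normally reaches zio_inject_fault() is elided:

    zinject_record_t record;
    int id, error;

    bzero(&record, sizeof (record));
    /* panic when spa_vdev_remove() finishes a config update */
    (void) strlcpy(record.zi_func, "spa_vdev_remove",
        sizeof (record.zi_func));

    error = zio_inject_fault(pool_name, 0, &id, &record);
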
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index b88fb5419a..de0d67176e 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -295,14 +295,15 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_16 16ULL
#define SPA_VERSION_17 17ULL
#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_18
-#define SPA_VERSION_STRING "18"
+#define SPA_VERSION SPA_VERSION_19
+#define SPA_VERSION_STRING "19"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -342,6 +343,7 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -401,6 +403,9 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
@@ -422,6 +427,7 @@ typedef enum zfs_cache_type {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
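
SPA_VERSION_HOLES follows the usual version-gating idiom: features check the pool version before acting, as the vdev_alloc() hunk above does for hole vdevs. From userland, the equivalent capability check might look like this sketch (zhp is an assumed zpool_handle_t; zpool_get_prop_int() is the libzfs accessor):

    uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
    boolean_t supports_holes = (version >= SPA_VERSION_HOLES);
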