17 files changed, 1630 insertions, 637 deletions
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 56d316a16d..a772c24e65 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -151,9 +151,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
-    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
-    char **ereport);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
+    boolean_t reloading);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
@@ -177,6 +176,54 @@ boolean_t	spa_load_verify_dryrun = B_FALSE;
 #define	TRYIMPORT_NAME	"$import"
 
 /*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+boolean_t	spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+uint64_t	zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant for pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+/*
+ * In the case where config was assembled by scanning device paths (/dev/dsks
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t	zfs_max_missing_tvds_scan = 0;
+
+/*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
@@ -1291,6 +1338,12 @@ spa_unload(spa_t *spa)
 		spa->spa_vdev_removal = NULL;
 	}
 
+	if (spa->spa_condense_zthr != NULL) {
+		ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
+		zthr_destroy(spa->spa_condense_zthr);
+		spa->spa_condense_zthr = NULL;
+	}
+
 	spa_condense_fini(spa);
 
 	bpobj_close(&spa->spa_deferred_bpobj);
@@ -1633,13 +1686,34 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 }
 
 /*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t tvds = 0;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+		if (vd->vdev_islog)
+			continue;
+		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+			tvds++;
+	}
+
+	return (tvds);
+}
+
+/*
  * Checks to see if the given vdev could not be opened, in which case we post a
  * sysevent to notify the autoreplace code that the device has been removed.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
-	for (int c = 0; c < vd->vdev_children; c++)
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
@@ -1649,38 +1723,14 @@ spa_check_removed(vdev_t *vd)
 	}
 }
 
-static void
-spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
-{
-	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
-
-	vd->vdev_top_zap = mvd->vdev_top_zap;
-	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
-
-	for (uint64_t i = 0; i < vd->vdev_children; i++) {
-		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
-	}
-}
-
-/*
- * Validate the current config against the MOS config
- */
-static boolean_t
-spa_config_valid(spa_t *spa, nvlist_t *config)
+static int
+spa_check_for_missing_logs(spa_t *spa)
 {
-	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
-	nvlist_t *nv;
-
-	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
-
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
-
-	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If we're doing a normal import, then build up any additional
-	 * diagnostic information about missing devices in this config.
+	 * diagnostic information about missing log devices.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
@@ -1691,109 +1741,52 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
 		    KM_SLEEP);
 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-		for (int c = 0; c < rvd->vdev_children; c++) {
+		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
-			vdev_t *mtvd  = mrvd->vdev_child[c];
 
-			if (tvd->vdev_ops == &vdev_missing_ops &&
-			    mtvd->vdev_ops != &vdev_missing_ops &&
-			    mtvd->vdev_islog)
-				child[idx++] = vdev_config_generate(spa, mtvd,
-				    B_FALSE, 0);
+			/*
+			 * We consider a device as missing only if it failed
+			 * to open (i.e. offline or faulted is not considered
+			 * as missing).
+			 */
+			if (tvd->vdev_islog &&
+			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+				child[idx++] = vdev_config_generate(spa, tvd,
+				    B_FALSE, VDEV_CONFIG_MISSING);
+			}
 		}
 
-		if (idx) {
-			VERIFY(nvlist_add_nvlist_array(nv,
-			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
-			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
-			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+		if (idx > 0) {
+			fnvlist_add_nvlist_array(nv,
+			    ZPOOL_CONFIG_CHILDREN, child, idx);
+			fnvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
 
-			for (int i = 0; i < idx; i++)
+			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(nv);
 		kmem_free(child, rvd->vdev_children * sizeof (char **));
-	}
-
-	/*
-	 * Compare the root vdev tree with the information we have
-	 * from the MOS config (mrvd). Check each top-level vdev
-	 * with the corresponding MOS config top-level (mtvd).
-	 */
-	for (int c = 0; c < rvd->vdev_children; c++) {
-		vdev_t *tvd = rvd->vdev_child[c];
-		vdev_t *mtvd  = mrvd->vdev_child[c];
-
-		/*
-		 * Resolve any "missing" vdevs in the current configuration.
-		 * Also trust the MOS config about any "indirect" vdevs.
-		 * If we find that the MOS config has more accurate information
-		 * about the top-level vdev then use that vdev instead.
-		 */
-		if ((tvd->vdev_ops == &vdev_missing_ops &&
-		    mtvd->vdev_ops != &vdev_missing_ops) ||
-		    (mtvd->vdev_ops == &vdev_indirect_ops &&
-		    tvd->vdev_ops != &vdev_indirect_ops)) {
 
-			/*
-			 * Device specific actions.
-			 */
-			if (mtvd->vdev_islog) {
-				if (!(spa->spa_import_flags &
-				    ZFS_IMPORT_MISSING_LOG)) {
-					continue;
-				}
+		if (idx > 0) {
+			spa_load_failed(spa, "some log devices are missing");
+			return (SET_ERROR(ENXIO));
+		}
+	} else {
+		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *tvd = rvd->vdev_child[c];
 
+			if (tvd->vdev_islog &&
+			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
-			} else if (mtvd->vdev_ops != &vdev_indirect_ops) {
-				continue;
-			}
-
-			/*
-			 * Swap the missing vdev with the data we were
-			 * able to obtain from the MOS config.
-			 */
-			vdev_remove_child(rvd, tvd);
-			vdev_remove_child(mrvd, mtvd);
-
-			vdev_add_child(rvd, mtvd);
-			vdev_add_child(mrvd, tvd);
-
-			vdev_reopen(rvd);
-		} else {
-			if (mtvd->vdev_islog) {
-				/*
-				 * Load the slog device's state from the MOS
-				 * config since it's possible that the label
-				 * does not contain the most up-to-date
-				 * information.
-				 */
-				vdev_load_log_state(tvd, mtvd);
-				vdev_reopen(tvd);
+				spa_load_note(spa, "some log devices are "
+				    "missing, ZIL is dropped.");
+				break;
 			}
-
-			/*
-			 * Per-vdev ZAP info is stored exclusively in the MOS.
-			 */
-			spa_config_valid_zaps(tvd, mtvd);
 		}
-
-		/*
-		 * Never trust this info from userland; always use what's
-		 * in the MOS.  This prevents it from getting out of sync
-		 * with the rest of the info in the MOS.
-		 */
-		tvd->vdev_removing = mtvd->vdev_removing;
-		tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
 	}
 
-	vdev_free(mrvd);
-	spa_config_exit(spa, SCL_ALL, FTAG);
-
-	/*
-	 * Ensure we were able to validate the config.
-	 */
-	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
+	return (0);
 }
 
 /*
@@ -2092,6 +2085,16 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 	return (SET_ERROR(err));
 }
 
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+	ASSERT(spa_writeable(spa));
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_start_indirect_condensing_thread(spa);
+}
+
 /*
  * Fix up config after a partly-completed split.  This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
@@ -2175,53 +2178,15 @@ spa_try_repair(spa_t *spa, nvlist_t *config)
 }
 
 static int
-spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
-    boolean_t trust_config)
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 {
-	nvlist_t *config = spa->spa_config;
 	char *ereport = FM_EREPORT_ZFS_POOL;
-	char *comment;
 	int error;
-	uint64_t pool_guid;
-	nvlist_t *nvl;
-
-	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
-		return (SET_ERROR(EINVAL));
-
-	ASSERT(spa->spa_comment == NULL);
-	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
-		spa->spa_comment = spa_strdup(comment);
-
-	/*
-	 * Versioning wasn't explicitly added to the label until later, so if
-	 * it's not present treat it as the initial version.
-	 */
-	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
-	    &spa->spa_ubsync.ub_version) != 0)
-		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
-
-	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-	    &spa->spa_config_txg);
 
-	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
-	    spa_guid_exists(pool_guid, 0)) {
-		error = SET_ERROR(EEXIST);
-	} else {
-		spa->spa_config_guid = pool_guid;
-
-		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
-		    &nvl) == 0) {
-			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
-			    KM_SLEEP) == 0);
-		}
-
-		nvlist_free(spa->spa_load_info);
-		spa->spa_load_info = fnvlist_alloc();
+	spa->spa_load_state = state;
 
-		gethrestime(&spa->spa_loaded_ts);
-		error = spa_load_impl(spa, pool_guid, config, state, type,
-		    trust_config, &ereport);
-	}
+	gethrestime(&spa->spa_loaded_ts);
+	error = spa_load_impl(spa, type, &ereport, B_FALSE);
 
 	/*
 	 * Don't count references from objsets that are already closed
@@ -2273,13 +2238,80 @@ vdev_count_verify_zaps(vdev_t *vd)
 }
 
 static int
-spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
-    spa_import_type_t type)
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+	uint64_t hostid;
+	char *hostname;
+	uint64_t myhostid = 0;
+
+	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+		hostname = fnvlist_lookup_string(mos_config,
+		    ZPOOL_CONFIG_HOSTNAME);
+
+		myhostid = zone_get_hostid(NULL);
+
+		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+			cmn_err(CE_WARN, "pool '%s' could not be "
+			    "loaded as it was last accessed by "
+			    "another system (host: %s hostid: 0x%llx). "
+			    "See: http://illumos.org/msg/ZFS-8000-EY",
+			    spa_name(spa), hostname, (u_longlong_t)hostid);
+			spa_load_failed(spa, "hostid verification failed: pool "
+			    "last accessed by host: %s (hostid: 0x%llx)",
+			    hostname, (u_longlong_t)hostid);
+			return (SET_ERROR(EBADF));
+		}
+	}
+
+	return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
-	nvlist_t *nvtree = NULL;
+	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
 	int parse;
 	vdev_t *rvd;
+	uint64_t pool_guid;
+	char *comment;
+
+	/*
+	 * Versioning wasn't explicitly added to the label until later, so if
+	 * it's not present treat it as the initial version.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &spa->spa_ubsync.ub_version) != 0)
+		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+		spa_load_failed(spa, "invalid config provided: '%s' missing",
+		    ZPOOL_CONFIG_POOL_GUID);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
+	    SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
+		spa_load_failed(spa, "a pool with guid %llu is already open",
+		    (u_longlong_t)pool_guid);
+		return (SET_ERROR(EEXIST));
+	}
+
+	spa->spa_config_guid = pool_guid;
+
+	nvlist_free(spa->spa_load_info);
+	spa->spa_load_info = fnvlist_alloc();
+
+	ASSERT(spa->spa_comment == NULL);
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+		spa->spa_comment = spa_strdup(comment);
+
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &spa->spa_config_txg);
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+		spa->spa_config_splitting = fnvlist_dup(nvl);
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
@@ -2287,9 +2319,6 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (SET_ERROR(EINVAL));
 	}
 
-	parse = (type == SPA_IMPORT_EXISTING ?
-	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
-
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
@@ -2307,6 +2336,8 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	parse = (type == SPA_IMPORT_EXISTING ?
+	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
@@ -2327,71 +2358,105 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	return (0);
 }
 
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ */
 static int
 spa_ld_open_vdevs(spa_t *spa)
 {
 	int error = 0;
 
+	/*
+	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
+	 * missing/unopenable for the root vdev to be still considered openable.
+	 */
+	if (spa->spa_trust_config) {
+		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+	} else {
+		spa->spa_missing_tvds_allowed = 0;
+	}
+
+	spa->spa_missing_tvds_allowed =
+	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_ALL, FTAG);
+
+	if (spa->spa_missing_tvds != 0) {
+		spa_load_note(spa, "vdev tree has %lld missing top-level "
+		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+		if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
+			/*
+			 * Although theoretically we could allow users to open
+			 * incomplete pools in RW mode, we'd need to add a lot
+			 * of extra logic (e.g. adjust pool space to account
+			 * for missing vdevs).
+			 * This limitation also prevents users from accidentally
+			 * opening the pool in RW mode during data recovery and
+			 * damaging it further.
+			 */
+			spa_load_note(spa, "pools with missing top-level "
+			    "vdevs can only be opened in read-only mode.");
+			error = SET_ERROR(ENXIO);
+		} else {
+			spa_load_note(spa, "current settings allow for maximum "
+			    "%lld missing top-level vdevs at this stage.",
+			    (u_longlong_t)spa->spa_missing_tvds_allowed);
+		}
+	}
 	if (error != 0) {
 		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
 		    error);
 	}
+	if (spa->spa_missing_tvds != 0 || error != 0)
+		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
 
 	return (error);
 }
 
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ */
 static int
-spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type,
-    boolean_t trust_config)
+spa_ld_validate_vdevs(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
-	/*
-	 * We need to validate the vdev labels against the configuration that
-	 * we have in hand, which is dependent on the setting of trust_config.
-	 * If trust_config is true then we're validating the vdev labels based
-	 * on that config.  Otherwise, we're validating against the cached
-	 * config (zpool.cache) that was read when we loaded the zfs module, and
-	 * then later we will recursively call spa_load() and validate against
-	 * the vdev config.
-	 *
-	 * If we're assembling a new pool that's been split off from an
-	 * existing pool, the labels haven't yet been updated so we skip
-	 * validation for now.
-	 */
-	if (type != SPA_IMPORT_ASSEMBLE) {
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		error = vdev_validate(rvd, trust_config);
-		spa_config_exit(spa, SCL_ALL, FTAG);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	error = vdev_validate(rvd);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 
-		if (error != 0) {
-			spa_load_failed(spa, "vdev_validate failed [error=%d]",
-			    error);
-			return (error);
-		}
+	if (error != 0) {
+		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+		return (error);
+	}
 
-		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-			spa_load_failed(spa, "cannot open vdev tree after "
-			    "invalidating some vdevs");
-			return (SET_ERROR(ENXIO));
-		}
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		spa_load_failed(spa, "cannot open vdev tree after invalidating "
+		    "some vdevs");
+		vdev_dbgmsg_print_tree(rvd, 2);
+		return (SET_ERROR(ENXIO));
 	}
 
 	return (0);
 }
 
 static int
-spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
-    boolean_t trust_config)
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *label;
 	uberblock_t *ub = &spa->spa_uberblock;
-	uint64_t children;
 
 	/*
 	 * Find the best uberblock.
@@ -2484,26 +2549,9 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
 		nvlist_free(unsup_feat);
 	}
 
-	/*
-	 * If the vdev guid sum doesn't match the uberblock, we have an
-	 * incomplete configuration.  We first check to see if the pool
-	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
-	 * If it is, defer the vdev_guid_sum check till later so we
-	 * can handle missing vdevs.
-	 */
-	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
-	    &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE &&
-	    rvd->vdev_guid_sum != ub->ub_guid_sum) {
-		spa_load_failed(spa, "guid sum in config doesn't match guid "
-		    "sum in uberblock (%llu != %llu)",
-		    (u_longlong_t)rvd->vdev_guid_sum,
-		    (u_longlong_t)ub->ub_guid_sum);
-		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
-	}
-
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		spa_try_repair(spa, config);
+		spa_try_repair(spa, spa->spa_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
@@ -2542,49 +2590,167 @@ spa_ld_open_rootbp(spa_t *spa)
 }
 
 static int
-spa_ld_validate_config(spa_t *spa, spa_import_type_t type)
+spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
+    boolean_t reloading)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+	nvlist_t *nv, *mos_config, *policy;
+	int error = 0, copy_error;
+	uint64_t healthy_tvds, healthy_tvds_mos;
+	uint64_t mos_config_txg;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 	    != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
-	 * Validate the config, using the MOS config to fill in any
-	 * information which might be missing.  If we fail to validate
-	 * the config then declare the pool unfit for use. If we're
-	 * assembling a pool from a split, the log is not transferred
-	 * over.
+	 * If we're assembling a pool from a split, the config provided is
+	 * already trusted so there is nothing to do.
 	 */
-	if (type != SPA_IMPORT_ASSEMBLE) {
-		nvlist_t *mos_config;
-		if (load_nvlist(spa, spa->spa_config_object, &mos_config)
-		    != 0) {
-			spa_load_failed(spa, "unable to retrieve MOS config");
-			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-		}
+	if (type == SPA_IMPORT_ASSEMBLE)
+		return (0);
+
+	healthy_tvds = spa_healthy_core_tvds(spa);
+
+	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+	    != 0) {
+		spa_load_failed(spa, "unable to retrieve MOS config");
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
 
-		if (!spa_config_valid(spa, mos_config)) {
+	/*
+	 * If we are doing an open, pool owner wasn't verified yet, thus do
+	 * the verification here.
+	 */
+	if (spa->spa_load_state == SPA_LOAD_OPEN) {
+		error = spa_verify_host(spa, mos_config);
+		if (error != 0) {
 			nvlist_free(mos_config);
-			spa_load_failed(spa, "mismatch between config provided "
-			    "and config stored in MOS");
-			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
-			    ENXIO));
+			return (error);
 		}
-		nvlist_free(mos_config);
+	}
+
+	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
+	/*
+	 * Build a new vdev tree from the trusted config
+	 */
+	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+	/*
+	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
+	 * obtained by scanning /dev/dsk, then it will have the right vdev
+	 * paths. We update the trusted MOS config with this information.
+	 * We first try to copy the paths with vdev_copy_path_strict, which
+	 * succeeds only when both configs have exactly the same vdev tree.
+	 * If that fails, we fall back to a more flexible method that has a
+	 * best effort policy.
+	 */
+	copy_error = vdev_copy_path_strict(rvd, mrvd);
+	if (copy_error != 0 || spa_load_print_vdev_tree) {
+		spa_load_note(spa, "provided vdev tree:");
+		vdev_dbgmsg_print_tree(rvd, 2);
+		spa_load_note(spa, "MOS vdev tree:");
+		vdev_dbgmsg_print_tree(mrvd, 2);
+	}
+	if (copy_error != 0) {
+		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+		    "back to vdev_copy_path_relaxed");
+		vdev_copy_path_relaxed(rvd, mrvd);
+	}
+
+	vdev_close(rvd);
+	vdev_free(rvd);
+	spa->spa_root_vdev = mrvd;
+	rvd = mrvd;
+	spa_config_exit(spa, SCL_ALL, FTAG);
+
+	/*
+	 * We will use spa_config if we decide to reload the spa or if spa_load
+	 * fails and we rewind. We must thus regenerate the config using the
+	 * MOS information with the updated paths. Rewind policy is an import
+	 * setting and is not in the MOS. We copy it over to our new, trusted
+	 * config.
+	 */
+	mos_config_txg = fnvlist_lookup_uint64(mos_config,
+	    ZPOOL_CONFIG_POOL_TXG);
+	nvlist_free(mos_config);
+	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY,
+	    &policy) == 0)
+		fnvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy);
+	spa_config_set(spa, mos_config);
+	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+	/*
+	 * Now that we got the config from the MOS, we should be more strict
+	 * in checking blkptrs and can make assumptions about the consistency
+	 * of the vdev tree. spa_trust_config must be set to true before opening
+	 * vdevs in order for them to be writeable.
+	 */
+	spa->spa_trust_config = B_TRUE;
+
+	/*
+	 * Open and validate the new vdev tree
+	 */
+	error = spa_ld_open_vdevs(spa);
+	if (error != 0)
+		return (error);
+
+	error = spa_ld_validate_vdevs(spa);
+	if (error != 0)
+		return (error);
+
+	if (copy_error != 0 || spa_load_print_vdev_tree) {
+		spa_load_note(spa, "final vdev tree:");
+		vdev_dbgmsg_print_tree(rvd, 2);
+	}
+
+	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
 		/*
-		 * Now that we've validated the config, check the state of the
-		 * root vdev.  If it can't be opened, it indicates one or
-		 * more toplevel vdevs are faulted.
+		 * Sanity check to make sure that we are indeed loading the
+		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+		 * in the config provided and they happened to be the only ones
+		 * to have the latest uberblock, we could involuntarily perform
+		 * an extreme rewind.
 		 */
-		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-			spa_load_failed(spa, "some top vdevs are unavailable");
-			return (SET_ERROR(ENXIO));
+		healthy_tvds_mos = spa_healthy_core_tvds(spa);
+		if (healthy_tvds_mos - healthy_tvds >=
+		    SPA_SYNC_MIN_VDEVS) {
+			spa_load_note(spa, "config provided misses too many "
+			    "top-level vdevs compared to MOS (%lld vs %lld). ",
+			    (u_longlong_t)healthy_tvds,
+			    (u_longlong_t)healthy_tvds_mos);
+			spa_load_note(spa, "vdev tree:");
+			vdev_dbgmsg_print_tree(rvd, 2);
+			if (reloading) {
+				spa_load_failed(spa, "config was already "
+				    "provided from MOS. Aborting.");
+				return (spa_vdev_err(rvd,
+				    VDEV_AUX_CORRUPT_DATA, EIO));
+			}
+			spa_load_note(spa, "spa must be reloaded using MOS "
+			    "config");
+			return (SET_ERROR(EAGAIN));
 		}
 	}
 
+	error = spa_check_for_missing_logs(spa);
+	if (error != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+		    "guid sum (%llu != %llu)",
+		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+		    (u_longlong_t)rvd->vdev_guid_sum);
+		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+		    ENXIO));
+	}
+
 	return (0);
 }
 
@@ -2751,62 +2917,6 @@ spa_ld_load_special_directories(spa_t *spa)
 }
 
 static int
-spa_ld_prepare_for_reload(spa_t *spa, int orig_mode)
-{
-	vdev_t *rvd = spa->spa_root_vdev;
-
-	uint64_t hostid;
-	nvlist_t *policy = NULL;
-	nvlist_t *mos_config;
-
-	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
-		spa_load_failed(spa, "unable to retrieve MOS config");
-		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-	}
-
-	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
-	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-		char *hostname;
-		unsigned long myhostid = 0;
-
-		VERIFY(nvlist_lookup_string(mos_config,
-		    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
-
-#ifdef	_KERNEL
-		myhostid = zone_get_hostid(NULL);
-#else	/* _KERNEL */
-		/*
-		 * We're emulating the system's hostid in userland, so
-		 * we can't use zone_get_hostid().
-		 */
-		(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
-#endif	/* _KERNEL */
-		if (hostid != 0 && myhostid != 0 &&
-		    hostid != myhostid) {
-			nvlist_free(mos_config);
-			cmn_err(CE_WARN, "pool '%s' could not be "
-			    "loaded as it was last accessed by "
-			    "another system (host: %s hostid: 0x%lx). "
-			    "See: http://illumos.org/msg/ZFS-8000-EY",
-			    spa_name(spa), hostname,
-			    (unsigned long)hostid);
-			return (SET_ERROR(EBADF));
-		}
-	}
-	if (nvlist_lookup_nvlist(spa->spa_config,
-	    ZPOOL_REWIND_POLICY, &policy) == 0)
-		VERIFY(nvlist_add_nvlist(mos_config,
-		    ZPOOL_REWIND_POLICY, policy) == 0);
-
-	spa_config_set(spa, mos_config);
-	spa_unload(spa);
-	spa_deactivate(spa);
-	spa_activate(spa, orig_mode);
-
-	return (0);
-}
-
-static int
 spa_ld_get_props(spa_t *spa)
 {
 	int error = 0;
@@ -2933,6 +3043,19 @@ spa_ld_get_props(spa_t *spa)
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
+	/*
+	 * If we are importing a pool with missing top-level vdevs,
+	 * we enforce that the pool doesn't panic or get suspended on
+	 * error since the likelihood of missing data is extremely high.
+	 */
+	if (spa->spa_missing_tvds > 0 &&
+	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+		spa_load_note(spa, "forcing failmode to 'continue' "
+		    "as some top level vdevs are missing");
+		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+	}
+
 	return (0);
 }
 
@@ -3063,9 +3186,15 @@ spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
 	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
 		boolean_t missing = spa_check_logs(spa);
 		if (missing) {
-			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
-			spa_load_failed(spa, "spa_check_logs failed");
-			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+			if (spa->spa_missing_tvds != 0) {
+				spa_load_note(spa, "spa_check_logs failed "
+				    "so dropping the logs");
+			} else {
+				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+				spa_load_failed(spa, "spa_check_logs failed");
+				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+				    ENXIO));
+			}
 		}
 	}
 
@@ -3121,7 +3250,8 @@ spa_ld_claim_log_blocks(spa_t *spa)
 }
 
 static void
-spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+    boolean_t reloading)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	int need_update = B_FALSE;
@@ -3133,7 +3263,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
 	 * If this is a verbatim import, trust the current
 	 * in-core spa_config and update the disk labels.
 	 */
-	if (config_cache_txg != spa->spa_config_txg ||
+	if (reloading || config_cache_txg != spa->spa_config_txg ||
 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
@@ -3151,6 +3281,24 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+	int mode = spa->spa_mode;
+	int async_suspended = spa->spa_async_suspended;
+
+	spa_unload(spa);
+	spa_deactivate(spa);
+	spa_activate(spa, mode);
+
+	/*
+	 * We save the value of spa_async_suspended as it gets reset to 0 by
+	 * spa_unload(). We want to restore it back to the original value before
+	 * returning as we might be calling spa_async_resume() later.
+	 */
+	spa->spa_async_suspended = async_suspended;
+}
+
 /*
  * Load an existing storage pool, using the config provided. This config
  * describes which vdevs are part of the pool and is later validated against
@@ -3158,32 +3306,35 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
  * config stored in the MOS.
  */
 static int
-spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
-    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
-    char **ereport)
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
+    boolean_t reloading)
 {
 	int error = 0;
-	uint64_t config_cache_txg = spa->spa_config_txg;
-	int orig_mode = spa->spa_mode;
 	boolean_t missing_feat_write = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
-	spa->spa_load_state = state;
-	spa_load_note(spa, "LOADING");
+	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	/*
-	 * If this is an untrusted config, first access the pool in read-only
-	 * mode. We will then retrieve a trusted copy of the config from the MOS
-	 * and use it to reopen the pool in read-write mode.
+	 * Never trust the config that is provided unless we are assembling
+	 * a pool following a split.
+	 * This means don't trust blkptrs and the vdev tree in general. This
+	 * also effectively puts the spa in read-only mode since
+	 * spa_writeable() checks for spa_trust_config to be true.
+	 * We will later load a trusted config from the MOS.
 	 */
-	if (!trust_config)
-		spa->spa_mode = FREAD;
+	if (type != SPA_IMPORT_ASSEMBLE)
+		spa->spa_trust_config = B_FALSE;
+
+	if (reloading)
+		spa_load_note(spa, "RELOADING");
+	else
+		spa_load_note(spa, "LOADING");
 
 	/*
 	 * Parse the config provided to create a vdev tree.
 	 */
-	error = spa_ld_parse_config(spa, pool_guid, config, type);
+	error = spa_ld_parse_config(spa, type);
 	if (error != 0)
 		return (error);
 
@@ -3201,10 +3352,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	/*
 	 * Read the label of each vdev and make sure that the GUIDs stored
 	 * there match the GUIDs in the config provided.
+	 * If we're assembling a new pool that's been split off from an
+	 * existing pool, the labels haven't yet been updated so we skip
+	 * validation for now.
 	 */
-	error = spa_ld_validate_vdevs(spa, type, trust_config);
-	if (error != 0)
-		return (error);
+	if (type != SPA_IMPORT_ASSEMBLE) {
+		error = spa_ld_validate_vdevs(spa);
+		if (error != 0)
+			return (error);
+	}
 
 	/*
 	 * Read vdev labels to find the best uberblock (i.e. latest, unless
@@ -3213,7 +3369,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	 * label with the best uberblock and verify that our version of zfs
 	 * supports them all.
 	 */
-	error = spa_ld_select_uberblock(spa, config, type, trust_config);
+	error = spa_ld_select_uberblock(spa, type);
 	if (error != 0)
 		return (error);
 
@@ -3227,13 +3383,21 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (error);
 
 	/*
-	 * Retrieve the config stored in the MOS and use it to validate the
-	 * config provided. Also extract some information from the MOS config
-	 * to update our vdev tree.
+	 * Retrieve the trusted config stored in the MOS and use it to create
+	 * a new, exact version of the vdev tree, then reopen all vdevs.
 	 */
-	error = spa_ld_validate_config(spa, type);
-	if (error != 0)
+	error = spa_ld_load_trusted_config(spa, type, reloading);
+	if (error == EAGAIN) {
+		VERIFY(!reloading);
+		/*
+		 * Redo the loading process with the trusted config if it is
+		 * too different from the untrusted config.
+		 */
+		spa_ld_prepare_for_reload(spa);
+		return (spa_load_impl(spa, type, ereport, B_TRUE));
+	} else if (error != 0) {
 		return (error);
+	}
 
 	/*
 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
@@ -3264,19 +3428,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (error);
 
 	/*
-	 * If the config provided is not trusted, discard it and use the config
-	 * from the MOS to reload the pool.
-	 */
-	if (!trust_config) {
-		error = spa_ld_prepare_for_reload(spa, orig_mode);
-		if (error != 0)
-			return (error);
-
-		spa_load_note(spa, "RELOADING");
-		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
-	}
-
-	/*
 	 * Retrieve pool properties from the MOS.
 	 */
 	error = spa_ld_get_props(spa);
@@ -3312,7 +3463,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (error);
 
 	if (missing_feat_write) {
-		ASSERT(state == SPA_LOAD_TRYIMPORT);
+		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
@@ -3344,21 +3495,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	 * pool. If we are importing the pool in read-write mode, a few
 	 * additional steps must be performed to finish the import.
 	 */
-	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
-		ASSERT(state != SPA_LOAD_TRYIMPORT);
+		uint64_t config_cache_txg = spa->spa_config_txg;
 
-		/*
-		 * We must check this before we start the sync thread, because
-		 * we only want to start a condense thread for condense
-		 * operations that were in progress when the pool was
-		 * imported.  Once we start syncing, spa_sync() could
-		 * initiate a condense (and start a thread for it).  In
-		 * that case it would be wrong to start a second
-		 * condense thread.
-		 */
-		boolean_t condense_in_progress =
-		    (spa->spa_condensing_indirect != NULL);
+		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Traverse the ZIL and claim all blocks.
@@ -3385,7 +3526,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		 * next sync, we would update the config stored in vdev labels
 		 * and the cachefile (by default /etc/zfs/zpool.cache).
 		 */
-		spa_ld_check_for_config_update(spa, config_cache_txg);
+		spa_ld_check_for_config_update(spa, config_cache_txg,
+		    reloading);
 
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
@@ -3411,15 +3553,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
-		/*
-		 * Note: unlike condensing, we don't need an analogous
-		 * "removal_in_progress" dance because no other thread
-		 * can start a removal while we hold the spa_namespace_lock.
-		 */
 		spa_restart_removal(spa);
 
-		if (condense_in_progress)
-			spa_condense_indirect_restart(spa);
+		spa_spawn_aux_threads(spa);
 	}
 
 	spa_load_note(spa, "LOADED");
@@ -3428,7 +3564,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 }
 
 static int
-spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
+spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
 	int mode = spa->spa_mode;
 
@@ -3443,7 +3579,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
 	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
 	    (u_longlong_t)spa->spa_load_max_txg);
 
-	return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config));
+	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
 }
 
 /*
@@ -3454,8 +3590,8 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
  * spa_load().
  */
 static int
-spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
-    uint64_t max_request, int rewind_flags)
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+    int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
@@ -3472,8 +3608,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
-	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
-	    trust_config);
+	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
 	if (load_error == 0)
 		return (0);
 
@@ -3514,7 +3649,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
-		rewind_error = spa_load_retry(spa, state, trust_config);
+		rewind_error = spa_load_retry(spa, state);
 	}
 
 	spa->spa_extreme_rewind = B_FALSE;
@@ -3593,9 +3728,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 
 		zfs_dbgmsg("spa_open_common: opening %s", pool);
-		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+		error = spa_load_best(spa, state, policy.zrp_txg,
 		    policy.zrp_request);
 
 		if (error == EBADF) {
@@ -4328,6 +4464,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
+	spa_spawn_aux_threads(spa);
+
 	spa_write_cachefile(spa, B_FALSE, B_TRUE);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 
@@ -4616,18 +4754,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 	if (policy.zrp_request & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
-	/*
-	 * Pass off the heavy lifting to spa_load().  Pass TRUE for trust_config
-	 * because the user-supplied config is actually the one to trust when
-	 * doing an import.
-	 */
-	if (state != SPA_LOAD_RECOVER)
-		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 
-	zfs_dbgmsg("spa_import: importing %s%s", pool,
-	    (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : "");
-	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
-	    policy.zrp_request);
+	if (state != SPA_LOAD_RECOVER) {
+		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+		zfs_dbgmsg("spa_import: importing %s", pool);
+	} else {
+		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+		    "(RECOVERY MODE)", pool, (longlong_t)policy.zrp_txg);
+	}
+	error = spa_load_best(spa, state, policy.zrp_txg, policy.zrp_request);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
@@ -4745,10 +4881,11 @@ nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *config = NULL;
-	char *poolname;
+	char *poolname, *cachefile;
 	spa_t *spa;
 	uint64_t state;
 	int error;
+	zpool_rewind_policy_t policy;
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 		return (NULL);
@@ -4763,14 +4900,30 @@ spa_tryimport(nvlist_t *tryconfig)
 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 
-	zfs_dbgmsg("spa_tryimport: importing %s", poolname);
-
 	/*
-	 * Pass off the heavy lifting to spa_load().
-	 * Pass TRUE for trust_config because the user-supplied config
-	 * is actually the one to trust when doing an import.
+	 * Rewind pool if a max txg was provided. Note that even though we
+	 * retrieve the complete rewind policy, only the rewind txg is relevant
+	 * for tryimport.
 	 */
-	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
+	zpool_get_rewind_policy(spa->spa_config, &policy);
+	if (policy.zrp_txg != UINT64_MAX) {
+		spa->spa_load_max_txg = policy.zrp_txg;
+		spa->spa_extreme_rewind = B_TRUE;
+		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+		    poolname, (longlong_t)policy.zrp_txg);
+	} else {
+		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+	}
+
+	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+	    == 0) {
+		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+	} else {
+		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+	}
+
+	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
@@ -5775,8 +5928,10 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
+	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+
 	/* create the new pool from the disks of the original pool */
-	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
 	if (error)
 		goto out;
 
@@ -6250,12 +6405,15 @@ spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
-	while (spa->spa_async_thread != NULL ||
-	    spa->spa_condense_thread != NULL)
+	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_vdev_remove_suspend(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && zthr_isrunning(condense_thread))
+		VERIFY0(zthr_cancel(condense_thread));
 }
 
 void
@@ -6266,6 +6424,10 @@ spa_async_resume(spa_t *spa)
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
 	spa_restart_removal(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+		zthr_resume(condense_thread);
 }
 
 static boolean_t
@@ -7079,7 +7241,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
-			vdev_t *svd[SPA_DVAS_PER_BP];
+			vdev_t *svd[SPA_SYNC_MIN_VDEVS];
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = spa_get_random(children);
@@ -7090,7 +7252,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 				    !vdev_is_concrete(vd))
 					continue;
 				svd[svdcount++] = vd;
-				if (svdcount == SPA_DVAS_PER_BP)
+				if (svdcount == SPA_SYNC_MIN_VDEVS)
 					break;
 			}
 			error = vdev_config_sync(svd, svdcount, txg);
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index e118317adb..1fe675337f 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -339,7 +339,8 @@ void
 spa_config_set(spa_t *spa, nvlist_t *config)
 {
 	mutex_enter(&spa->spa_props_lock);
-	nvlist_free(spa->spa_config);
+	if (spa->spa_config != NULL && spa->spa_config != config)
+		nvlist_free(spa->spa_config);
 	spa->spa_config = config;
 	mutex_exit(&spa->spa_props_lock);
 }
@@ -386,15 +387,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 		    spa->spa_comment);
 	}
 
-#ifdef	_KERNEL
 	hostid = zone_get_hostid(NULL);
-#else	/* _KERNEL */
-	/*
-	 * We're emulating the system's hostid in userland, so we can't use
-	 * zone_get_hostid().
-	 */
-	(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
-#endif	/* _KERNEL */
+
 	if (hostid != 0) {
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
 	}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 33857db7ac..e79664dd33 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -365,7 +365,8 @@ spa_load_failed(spa_t *spa, const char *fmt, ...)
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
-	zfs_dbgmsg("spa_load(%s): FAILED: %s", spa->spa_name, buf);
+	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
 }
 
 /*PRINTFLIKE2*/
@@ -379,7 +380,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
-	zfs_dbgmsg("spa_load(%s): %s", spa->spa_name, buf);
+	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
 }
 
 /*
@@ -620,6 +622,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_load_max_txg = UINT64_MAX;
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
+	spa->spa_trust_config = B_TRUE;
 
 	hdlr.cyh_func = spa_deadman;
 	hdlr.cyh_arg = spa;
@@ -2013,7 +2016,7 @@ spa_is_root(spa_t *spa)
 boolean_t
 spa_writeable(spa_t *spa)
 {
-	return (!!(spa->spa_mode & FWRITE));
+	return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
 }
 
 /*
@@ -2161,3 +2164,21 @@ spa_get_last_removal_txg(spa_t *spa)
 
 	return (ret);
 }
+
+boolean_t
+spa_trust_config(spa_t *spa)
+{
+	return (spa->spa_trust_config);
+}
+
+uint64_t
+spa_missing_tvds_allowed(spa_t *spa)
+{
+	return (spa->spa_missing_tvds_allowed);
+}
+
+void
+spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+{
+	spa->spa_missing_tvds = missing;
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index aff251b80e..d6d48ddfbf 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -322,6 +322,7 @@ typedef enum bp_embedded_type {
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
+#define	SPA_SYNC_MIN_VDEVS 3		/* min vdevs to update during sync */
 
 /*
  * A block is a hole when it has either 1) never been written to, or
@@ -836,11 +837,16 @@ extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
+extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
+    const blkptr_t *bp);
 typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg);
 extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
     spa_remap_cb_t callback, void *arg);
 extern uint64_t spa_get_last_removal_txg(spa_t *spa);
+extern boolean_t spa_trust_config(spa_t *spa);
+extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
+extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 955568c05b..1e440a6767 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -43,6 +43,7 @@
 #include <sys/bplist.h>
 #include <sys/bpobj.h>
 #include <sys/zfeature.h>
+#include <sys/zthr.h>
 #include <zfeature_common.h>
 
 #ifdef	__cplusplus
@@ -181,6 +182,15 @@ typedef enum spa_all_vdev_zap_action {
 	AVZ_ACTION_INITIALIZE
 } spa_avz_action_t;
 
+typedef enum spa_config_source {
+	SPA_CONFIG_SRC_NONE = 0,
+	SPA_CONFIG_SRC_SCAN,		/* scan of path (default: /dev/dsk) */
+	SPA_CONFIG_SRC_CACHEFILE,	/* any cachefile */
+	SPA_CONFIG_SRC_TRYIMPORT,	/* returned from call to tryimport */
+	SPA_CONFIG_SRC_SPLIT,		/* new pool in a pool split */
+	SPA_CONFIG_SRC_MOS		/* MOS, but not always from right txg */
+} spa_config_source_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
@@ -199,6 +209,8 @@ struct spa {
 	uint8_t		spa_sync_on;		/* sync threads are running */
 	spa_load_state_t spa_load_state;	/* current load operation */
 	boolean_t	spa_indirect_vdevs_loaded; /* mappings loaded? */
+	boolean_t	spa_trust_config;	/* do we trust vdev tree? */
+	spa_config_source_t spa_config_source;	/* where config comes from? */
 	uint64_t	spa_import_flags;	/* import specific flags */
 	spa_taskqs_t	spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
 	dsl_pool_t	*spa_dsl_pool;
@@ -259,13 +271,15 @@ struct spa {
 	int		spa_async_suspended;	/* async tasks suspended */
 	kcondvar_t	spa_async_cv;		/* wait for thread_exit() */
 	uint16_t	spa_async_tasks;	/* async task mask */
+	uint64_t	spa_missing_tvds;	/* unopenable tvds on load */
+	uint64_t	spa_missing_tvds_allowed; /* allow loading spa? */
 
 	spa_removing_phys_t spa_removing_phys;
 	spa_vdev_removal_t *spa_vdev_removal;
 
 	spa_condensing_indirect_phys_t	spa_condensing_indirect_phys;
 	spa_condensing_indirect_t	*spa_condensing_indirect;
-	kthread_t	*spa_condense_thread;	/* thread doing condense. */
+	zthr_t		*spa_condense_zthr;	/* zthr doing condense. */
 
 	char		*spa_root;		/* alternate root directory */
 	uint64_t	spa_ena;		/* spa-wide ereport ENA */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 1297590eb3..91d4db62f9 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -48,10 +48,13 @@ typedef enum vdev_dtl_type {
 extern boolean_t zfs_nocacheflush;
 
 extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
+extern void vdev_dbgmsg_print_tree(vdev_t *, int);
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
 extern boolean_t vdev_uses_zvols(vdev_t *);
-extern int vdev_validate(vdev_t *, boolean_t);
+extern int vdev_validate(vdev_t *);
+extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
+extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
 extern void vdev_reopen(vdev_t *);
@@ -99,6 +102,7 @@ extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
+extern boolean_t vdev_children_are_offline(vdev_t *vd);
 
 extern void vdev_space_update(vdev_t *vd,
     int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
@@ -140,7 +144,8 @@ typedef enum vdev_config_flag {
 	VDEV_CONFIG_SPARE = 1 << 0,
 	VDEV_CONFIG_L2CACHE = 1 << 1,
 	VDEV_CONFIG_REMOVING = 1 << 2,
-	VDEV_CONFIG_MOS = 1 << 3
+	VDEV_CONFIG_MOS = 1 << 3,
+	VDEV_CONFIG_MISSING = 1 << 4
 } vdev_config_flag_t;
 
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 12d8df93af..5035e6ab9d 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -406,7 +406,6 @@ extern void vdev_remove_parent(vdev_t *cvd);
 /*
  * vdev sync load and sync
  */
-extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
 extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern int vdev_load(vdev_t *vd);
 extern int vdev_dtl_load(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
index 5b1e3056be..45cf4d8ec5 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
@@ -76,7 +76,7 @@ extern int spa_remove_init(spa_t *);
 extern void spa_restart_removal(spa_t *);
 extern int spa_condense_init(spa_t *);
 extern void spa_condense_fini(spa_t *);
-extern void spa_condense_indirect_restart(spa_t *);
+extern void spa_start_indirect_condensing_thread(spa_t *);
 extern void spa_vdev_condense_suspend(spa_t *);
 extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
 extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 48121d4c50..828cc2a92b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -208,6 +208,9 @@ enum zio_flag {
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
 
+#define	ZIO_CHILD_BIT(x)		(1 << (x))
+#define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1 << (x)))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
@@ -216,6 +219,14 @@ enum zio_child {
 	ZIO_CHILD_TYPES
 };
 
+#define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define	ZIO_CHILD_ALL_BITS					\
+	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | 		\
+	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h
new file mode 100644
index 0000000000..6bfb6b6c0d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zthr.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZTHR_H
+#define	_SYS_ZTHR_H
+
+typedef struct zthr zthr_t;
+typedef int (zthr_func_t)(void *, zthr_t *);
+typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
+
+struct zthr {
+	kthread_t	*zthr_thread;
+	kmutex_t	zthr_lock;
+	kcondvar_t	zthr_cv;
+	boolean_t	zthr_cancel;
+
+	zthr_checkfunc_t	*zthr_checkfunc;
+	zthr_func_t	*zthr_func;
+	void		*zthr_arg;
+	int		zthr_rc;
+};
+
+extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc,
+    zthr_func_t *func, void *arg);
+extern void zthr_exit(zthr_t *t, int rc);
+extern void zthr_destroy(zthr_t *t);
+
+extern void zthr_wakeup(zthr_t *t);
+extern int zthr_cancel(zthr_t *t);
+extern void zthr_resume(zthr_t *t);
+
+extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_isrunning(zthr_t *t);
+
+#endif /* _SYS_ZTHR_H */
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 656008bf9e..73306aec85 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -77,6 +77,8 @@ int zfs_scrub_limit = 10;
  */
 int metaslabs_per_vdev = 200;
 
+boolean_t vdev_validate_skip = B_FALSE;
+
 /*PRINTFLIKE2*/
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@@ -99,6 +101,57 @@ vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 	}
 }
 
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+	char state[20];
+
+	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+		    vd->vdev_ops->vdev_op_type);
+		return;
+	}
+
+	switch (vd->vdev_state) {
+	case VDEV_STATE_UNKNOWN:
+		(void) snprintf(state, sizeof (state), "unknown");
+		break;
+	case VDEV_STATE_CLOSED:
+		(void) snprintf(state, sizeof (state), "closed");
+		break;
+	case VDEV_STATE_OFFLINE:
+		(void) snprintf(state, sizeof (state), "offline");
+		break;
+	case VDEV_STATE_REMOVED:
+		(void) snprintf(state, sizeof (state), "removed");
+		break;
+	case VDEV_STATE_CANT_OPEN:
+		(void) snprintf(state, sizeof (state), "can't open");
+		break;
+	case VDEV_STATE_FAULTED:
+		(void) snprintf(state, sizeof (state), "faulted");
+		break;
+	case VDEV_STATE_DEGRADED:
+		(void) snprintf(state, sizeof (state), "degraded");
+		break;
+	case VDEV_STATE_HEALTHY:
+		(void) snprintf(state, sizeof (state), "healthy");
+		break;
+	default:
+		(void) snprintf(state, sizeof (state), "<state %u>",
+		    (uint_t)vd->vdev_state);
+	}
+
+	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+	    "", vd->vdev_id, vd->vdev_ops->vdev_op_type,
+	    vd->vdev_islog ? " (log)" : "",
+	    (u_longlong_t)vd->vdev_guid,
+	    vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++)
+		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
 /*
  * Given a vdev type, return the appropriate ops vector.
  */
@@ -1287,8 +1340,13 @@ vdev_open(vdev_t *vd)
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
-		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-		    vd->vdev_stat.vs_aux);
+		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
+			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
+			    vd->vdev_stat.vs_aux);
+		} else {
+			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    vd->vdev_stat.vs_aux);
+		}
 		return (error);
 	}
 
@@ -1453,29 +1511,29 @@ vdev_open(vdev_t *vd)
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
- * contents.  This needs to be done before vdev_load() so that we don't
+ * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
- * If 'strict' is false ignore the spa guid check. This is necessary because
- * if the machine crashed during a re-guid the new guid might have been written
- * to all of the vdev labels, but not the cached config. The strict check
- * will be performed when the pool is opened again using the mos config.
- *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  */
 int
-vdev_validate(vdev_t *vd, boolean_t strict)
+vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
-	uint64_t guid = 0, top_guid;
+	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
+	nvlist_t *nvl;
+	uint64_t txg;
 
-	for (int c = 0; c < vd->vdev_children; c++)
-		if (vdev_validate(vd->vdev_child[c], strict) != 0)
+	if (vdev_validate_skip)
+		return (0);
+
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
+		if (vdev_validate(vd->vdev_child[c]) != 0)
 			return (SET_ERROR(EBADF));
 
 	/*
@@ -1483,115 +1541,276 @@ vdev_validate(vdev_t *vd, boolean_t strict)
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
-	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
-		uint64_t aux_guid = 0;
-		nvlist_t *nvl;
-		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
-		    spa_last_synced_txg(spa) : -1ULL;
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
+		return (0);
 
-		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
-			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_BAD_LABEL);
-			vdev_dbgmsg(vd, "vdev_validate: failed reading config");
-			return (0);
-		}
+	/*
+	 * If we are performing an extreme rewind, we allow for a label that
+	 * was modified at a point after the current txg.
+	 */
+	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0)
+		txg = UINT64_MAX;
+	else
+		txg = spa_last_synced_txg(spa);
 
-		/*
-		 * Determine if this vdev has been split off into another
-		 * pool.  If so, then refuse to open it.
-		 */
-		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
-		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
-			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_SPLIT_POOL);
-			nvlist_free(label);
-			vdev_dbgmsg(vd, "vdev_validate: vdev split into other "
-			    "pool");
-			return (0);
-		}
+	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_BAD_LABEL);
+		vdev_dbgmsg(vd, "vdev_validate: failed reading config");
+		return (0);
+	}
 
-		if (strict && (nvlist_lookup_uint64(label,
-		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
-		    guid != spa_guid(spa))) {
-			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_CORRUPT_DATA);
-			nvlist_free(label);
-			vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid "
-			    "doesn't match config (%llu != %llu)",
-			    (u_longlong_t)guid,
-			    (u_longlong_t)spa_guid(spa));
-			return (0);
-		}
+	/*
+	 * Determine if this vdev has been split off into another
+	 * pool.  If so, then refuse to open it.
+	 */
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_SPLIT_POOL);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
+		return (0);
+	}
 
-		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
-		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
-		    &aux_guid) != 0)
-			aux_guid = 0;
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+		    ZPOOL_CONFIG_POOL_GUID);
+		return (0);
+	}
 
-		/*
-		 * If this vdev just became a top-level vdev because its
-		 * sibling was detached, it will have adopted the parent's
-		 * vdev guid -- but the label may or may not be on disk yet.
-		 * Fortunately, either version of the label will have the
-		 * same top guid, so if we're a top-level vdev, we can
-		 * safely compare to that instead.
-		 *
-		 * If we split this vdev off instead, then we also check the
-		 * original pool's guid.  We don't want to consider the vdev
-		 * corrupt if it is partway through a split operation.
-		 */
-		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
-		    &guid) != 0 ||
-		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
-		    &top_guid) != 0 ||
-		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
-		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
-			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_CORRUPT_DATA);
-			nvlist_free(label);
-			vdev_dbgmsg(vd, "vdev_validate: config guid doesn't "
-			    "match label guid (%llu != %llu)",
-			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid);
-			return (0);
+	/*
+	 * If config is not trusted then ignore the spa guid check. This is
+	 * necessary because if the machine crashed during a re-guid the new
+	 * guid might have been written to all of the vdev labels, but not the
+	 * cached config. The check will be performed again once we have the
+	 * trusted config from the MOS.
+	 */
+	if (spa->spa_trust_config && guid != spa_guid(spa)) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
+		    "match config (%llu != %llu)", (u_longlong_t)guid,
+		    (u_longlong_t)spa_guid(spa));
+		return (0);
+	}
+
+	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+	    &aux_guid) != 0)
+		aux_guid = 0;
+
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+		    ZPOOL_CONFIG_GUID);
+		return (0);
+	}
+
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
+	    != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+		    ZPOOL_CONFIG_TOP_GUID);
+		return (0);
+	}
+
+	/*
+	 * If this vdev just became a top-level vdev because its sibling was
+	 * detached, it will have adopted the parent's vdev guid -- but the
+	 * label may or may not be on disk yet. Fortunately, either version
+	 * of the label will have the same top guid, so if we're a top-level
+	 * vdev, we can safely compare to that instead.
+	 * However, if the config comes from a cachefile that failed to update
+	 * after the detach, a top-level vdev will appear as a non top-level
+	 * vdev in the config. Also relax the constraints if we perform an
+	 * extreme rewind.
+	 *
+	 * If we split this vdev off instead, then we also check the
+	 * original pool's guid. We don't want to consider the vdev
+	 * corrupt if it is partway through a split operation.
+	 */
+	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+		boolean_t mismatch = B_FALSE;
+		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+				mismatch = B_TRUE;
+		} else {
+			if (vd->vdev_guid != top_guid &&
+			    vd->vdev_top->vdev_guid != guid)
+				mismatch = B_TRUE;
 		}
 
-		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
-		    &state) != 0) {
+		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
-			vdev_dbgmsg(vd, "vdev_validate: '%s' missing",
-			    ZPOOL_CONFIG_POOL_STATE);
+			vdev_dbgmsg(vd, "vdev_validate: config guid "
+			    "doesn't match label guid");
+			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+			    (u_longlong_t)vd->vdev_guid,
+			    (u_longlong_t)vd->vdev_top->vdev_guid);
+			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+			    "aux_guid %llu", (u_longlong_t)guid,
+			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
+	}
 
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+	    &state) != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+		    ZPOOL_CONFIG_POOL_STATE);
+		return (0);
+	}
 
-		/*
-		 * If this is a verbatim import, no need to check the
-		 * state of the pool.
-		 */
-		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
-		    spa_load_state(spa) == SPA_LOAD_OPEN &&
-		    state != POOL_STATE_ACTIVE) {
-			vdev_dbgmsg(vd, "vdev_validate: invalid pool state "
-			    "(%llu) for spa %s", (u_longlong_t)state,
-			    spa->spa_name);
-			return (SET_ERROR(EBADF));
+	nvlist_free(label);
+
+	/*
+	 * If this is a verbatim import, no need to check the
+	 * state of the pool.
+	 */
+	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+	    spa_load_state(spa) == SPA_LOAD_OPEN &&
+	    state != POOL_STATE_ACTIVE) {
+		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+		    "for spa %s", (u_longlong_t)state, spa->spa_name);
+		return (SET_ERROR(EBADF));
+	}
+
+	/*
+	 * If we were able to open and validate a vdev that was
+	 * previously marked permanently unavailable, clear that state
+	 * now.
+	 */
+	if (vd->vdev_not_present)
+		vd->vdev_not_present = 0;
+
+	return (0);
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+	if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+		if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+			zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+			    "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+			    dvd->vdev_path, svd->vdev_path);
+			spa_strfree(dvd->vdev_path);
+			dvd->vdev_path = spa_strdup(svd->vdev_path);
 		}
+	} else if (svd->vdev_path != NULL) {
+		dvd->vdev_path = spa_strdup(svd->vdev_path);
+		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+		    (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+	}
+}
 
-		/*
-		 * If we were able to open and validate a vdev that was
-		 * previously marked permanently unavailable, clear that state
-		 * now.
-		 */
-		if (vd->vdev_not_present)
-			vd->vdev_not_present = 0;
+/*
+ * Recursively copy vdev paths from one vdev to another. Source and destination
+ * vdev trees must have same geometry otherwise return error. Intended to copy
+ * paths from userland config into MOS config.
+ */
+int
+vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
+{
+	if ((svd->vdev_ops == &vdev_missing_ops) ||
+	    (svd->vdev_ishole && dvd->vdev_ishole) ||
+	    (dvd->vdev_ops == &vdev_indirect_ops))
+		return (0);
+
+	if (svd->vdev_ops != dvd->vdev_ops) {
+		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
+		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (svd->vdev_guid != dvd->vdev_guid) {
+		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
+		    "%llu)", (u_longlong_t)svd->vdev_guid,
+		    (u_longlong_t)dvd->vdev_guid);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (svd->vdev_children != dvd->vdev_children) {
+		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
+		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
+		    (u_longlong_t)dvd->vdev_children);
+		return (SET_ERROR(EINVAL));
 	}
 
+	for (uint64_t i = 0; i < svd->vdev_children; i++) {
+		int error = vdev_copy_path_strict(svd->vdev_child[i],
+		    dvd->vdev_child[i]);
+		if (error != 0)
+			return (error);
+	}
+
+	if (svd->vdev_ops->vdev_op_leaf)
+		vdev_copy_path_impl(svd, dvd);
+
 	return (0);
 }
 
+static void
+vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
+{
+	ASSERT(stvd->vdev_top == stvd);
+	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
+
+	for (uint64_t i = 0; i < dvd->vdev_children; i++) {
+		vdev_copy_path_search(stvd, dvd->vdev_child[i]);
+	}
+
+	if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
+		return;
+
+	/*
+	 * The idea here is that while a vdev can shift positions within
+	 * a top vdev (when replacing, attaching mirror, etc.) it cannot
+	 * step outside of it.
+	 */
+	vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
+
+	if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
+		return;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	vdev_copy_path_impl(vd, dvd);
+}
+
+/*
+ * Recursively copy vdev paths from one root vdev to another. Source and
+ * destination vdev trees may differ in geometry. For each destination leaf
+ * vdev, search a vdev with the same guid and top vdev id in the source.
+ * Intended to copy paths from userland config into MOS config.
+ */
+void
+vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
+{
+	uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
+	ASSERT(srvd->vdev_ops == &vdev_root_ops);
+	ASSERT(drvd->vdev_ops == &vdev_root_ops);
+
+	for (uint64_t i = 0; i < children; i++) {
+		vdev_copy_path_search(srvd->vdev_child[i],
+		    drvd->vdev_child[i]);
+	}
+}
+
 /*
  * Close a virtual device.
  */
@@ -1687,7 +1906,7 @@ vdev_reopen(vdev_t *vd)
 		    !l2arc_vdev_present(vd))
 			l2arc_add_vdev(spa, vd);
 	} else {
-		(void) vdev_validate(vd, B_TRUE);
+		(void) vdev_validate(vd);
 	}
 
 	/*
@@ -3551,6 +3770,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+	ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
 /*
  * Check the vdev configuration to ensure that it's capable of supporting
  * a root pool. We do not support partial configuration.
@@ -3591,35 +3823,6 @@ vdev_is_concrete(vdev_t *vd)
 }
 
 /*
- * Load the state from the original vdev tree (ovd) which
- * we've retrieved from the MOS config object. If the original
- * vdev was offline or faulted then we transfer that state to the
- * device in the current vdev tree (nvd).
- */
-void
-vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
-{
-	spa_t *spa = nvd->vdev_spa;
-
-	ASSERT(nvd->vdev_top->vdev_islog);
-	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
-
-	for (int c = 0; c < nvd->vdev_children; c++)
-		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
-
-	if (nvd->vdev_ops->vdev_op_leaf) {
-		/*
-		 * Restore the persistent vdev state
-		 */
-		nvd->vdev_offline = ovd->vdev_offline;
-		nvd->vdev_faulted = ovd->vdev_faulted;
-		nvd->vdev_degraded = ovd->vdev_degraded;
-		nvd->vdev_removed = ovd->vdev_removed;
-	}
-}
-
-/*
  * Determine if a log device has valid content.  If the vdev was
  * removed or faulted in the MOS config then we know that
  * the content on the log device has already been written to the pool.
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c
index 1025c8090e..5c4ea02047 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,6 +30,8 @@
 #include <sys/dmu_tx.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
+#include <sys/abd.h>
+#include <sys/zthr.h>
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed.  Since
@@ -475,7 +477,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
 
 static void
 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
-    uint32_t *obsolete_counts, uint64_t start_index)
+    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t mapi = start_index;
@@ -490,7 +492,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
 	    (u_longlong_t)vd->vdev_id,
 	    (u_longlong_t)mapi);
 
-	while (mapi < old_num_entries && !spa_shutting_down(spa)) {
+	while (mapi < old_num_entries) {
+
+		if (zthr_iscancelled(zthr)) {
+			zfs_dbgmsg("pausing condense of vdev %llu "
+			    "at index %llu", (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)mapi);
+			break;
+		}
+
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    &old_mapping->vim_entries[mapi];
 		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
@@ -508,18 +518,30 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
 
 		mapi++;
 	}
-	if (spa_shutting_down(spa)) {
-		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
-		    (u_longlong_t)vd->vdev_id,
-		    (u_longlong_t)mapi);
-	}
 }
 
-static void
-spa_condense_indirect_thread(void *arg)
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
 {
-	vdev_t *vd = arg;
-	spa_t *spa = vd->vdev_spa;
+	spa_t *spa = arg;
+
+	return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static int
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+	spa_t *spa = arg;
+	vdev_t *vd;
+
+	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+	ASSERT3P(vd, !=, NULL);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
@@ -593,25 +615,24 @@ spa_condense_indirect_thread(void *arg)
 		}
 	}
 
-	spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
+	spa_condense_indirect_generate_new_mapping(vd, counts,
+	    start_index, zthr);
 
 	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
 
 	/*
-	 * We may have bailed early from generate_new_mapping(), if
-	 * the spa is shutting down.  In this case, do not complete
-	 * the condense.
+	 * If the zthr has received a cancellation signal while running
+	 * in generate_new_mapping() or at any point after that, then bail
+	 * early. We don't want to complete the condense if the spa is
+	 * shutting down.
 	 */
-	if (!spa_shutting_down(spa)) {
-		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
-		    spa_condense_indirect_complete_sync, sci, 0,
-		    ZFS_SPACE_CHECK_NONE));
-	}
+	if (zthr_iscancelled(zthr))
+		return (0);
+
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
 
-	mutex_enter(&spa->spa_async_lock);
-	spa->spa_condense_thread = NULL;
-	cv_broadcast(&spa->spa_async_cv);
-	mutex_exit(&spa->spa_async_lock);
+	return (0);
 }
 
 /*
@@ -664,9 +685,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
 	    (u_longlong_t)scip->scip_next_mapping_object);
 
-	ASSERT3P(spa->spa_condense_thread, ==, NULL);
-	spa->spa_condense_thread = thread_create(NULL, 0,
-	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+	zthr_wakeup(spa->spa_condense_zthr);
 }
 
 /*
@@ -743,24 +762,12 @@ spa_condense_fini(spa_t *spa)
 	}
 }
 
-/*
- * Restart the condense - called when the pool is opened.
- */
 void
-spa_condense_indirect_restart(spa_t *spa)
+spa_start_indirect_condensing_thread(spa_t *spa)
 {
-	vdev_t *vd;
-	ASSERT(spa->spa_condensing_indirect != NULL);
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	vd = vdev_lookup_top(spa,
-	    spa->spa_condensing_indirect_phys.scip_vdev);
-	ASSERT(vd != NULL);
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-
-	ASSERT3P(spa->spa_condense_thread, ==, NULL);
-	spa->spa_condense_thread = thread_create(NULL, 0,
-	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
-	    minclsyspri);
+	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
+	    spa_condense_indirect_thread, spa);
 }
 
 /*
@@ -845,6 +852,57 @@ rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
 }
 
 /*
+ * Given an indirect vdev and an extent on that vdev, it duplicates the
+ * physical entries of the indirect mapping that correspond to the extent
+ * to a new array and returns a pointer to it. In addition, copied_entries
+ * is populated with the number of mapping entries that were duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+    uint64_t asize, uint64_t *copied_entries)
+{
+	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	uint64_t entries = 0;
+
+	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+	vdev_indirect_mapping_entry_phys_t *first_mapping =
+	    vdev_indirect_mapping_entry_for_offset(vim, offset);
+	ASSERT3P(first_mapping, !=, NULL);
+
+	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+	while (asize > 0) {
+		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+		uint64_t inner_size = MIN(asize, size - inner_offset);
+
+		offset += inner_size;
+		asize -= inner_size;
+		entries++;
+		m++;
+	}
+
+	size_t copy_length = entries * sizeof (*first_mapping);
+	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+	bcopy(first_mapping, duplicate_mappings, copy_length);
+	*copied_entries = entries;
+
+	return (duplicate_mappings);
+}
+
+/*
  * Goes through the relevant indirect mappings until it hits a concrete vdev
  * and issues the callback. On the way to the concrete vdev, if any other
  * indirect vdevs are encountered, then the callback will also be called on
@@ -884,24 +942,42 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
+		uint64_t num_entries = 0;
+
+		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+		ASSERT(rs->rs_asize > 0);
 
 		/*
-		 * Note: this can be called from open context
-		 * (eg. zio_read()), so we need the rwlock to prevent
-		 * the mapping from being changed by condensing.
+		 * Note: As this function can be called from open context
+		 * (e.g. zio_read()), we need the following rwlock to
+		 * prevent the mapping from being changed by condensing.
+		 *
+		 * So we grab the lock and we make a copy of the entries
+		 * that are relevant to the extent that we are working on.
+		 * Once that is done, we drop the lock and iterate over
+		 * our copy of the mapping. Once we are done with the with
+		 * the remap segment and we free it, we also free our copy
+		 * of the indirect mapping entries that are relevant to it.
+		 *
+		 * This way we don't need to wait until the function is
+		 * finished with a segment, to condense it. In addition, we
+		 * don't need a recursive rwlock for the case that a call to
+		 * vdev_indirect_remap() needs to call itself (through the
+		 * codepath of its callback) for the same vdev in the middle
+		 * of its execution.
 		 */
 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
 		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
 		ASSERT3P(vim, !=, NULL);
 
-		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
-		ASSERT(rs->rs_asize > 0);
-
 		vdev_indirect_mapping_entry_phys_t *mapping =
-		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
+		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
+		    rs->rs_offset, rs->rs_asize, &num_entries);
 		ASSERT3P(mapping, !=, NULL);
+		ASSERT3U(num_entries, >, 0);
+		rw_exit(&v->vdev_indirect_rwlock);
 
-		while (rs->rs_asize > 0) {
+		for (uint64_t i = 0; i < num_entries; i++) {
 			/*
 			 * Note: the vdev_indirect_mapping can not change
 			 * while we are running.  It only changes while the
@@ -910,20 +986,23 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 			 * function is only called for frees, which also only
 			 * happen from syncing context.
 			 */
+			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
+
+			ASSERT3P(m, !=, NULL);
+			ASSERT3U(rs->rs_asize, >, 0);
 
-			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
-			uint64_t dst_offset =
-			    DVA_GET_OFFSET(&mapping->vimep_dst);
-			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
+			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
 
 			ASSERT3U(rs->rs_offset, >=,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
+			    DVA_MAPPING_GET_SRC_OFFSET(m));
 			ASSERT3U(rs->rs_offset, <,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
+			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
 			ASSERT3U(dst_vdev, !=, v->vdev_id);
 
 			uint64_t inner_offset = rs->rs_offset -
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
+			    DVA_MAPPING_GET_SRC_OFFSET(m);
 			uint64_t inner_size =
 			    MIN(rs->rs_asize, size - inner_offset);
 
@@ -964,10 +1043,10 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
-			mapping++;
 		}
+		VERIFY0(rs->rs_asize);
 
-		rw_exit(&v->vdev_indirect_rwlock);
+		kmem_free(mapping, num_entries * sizeof (*mapping));
 		kmem_free(rs, sizeof (remap_segment_t));
 	}
 	list_destroy(&stack);
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index be3b89ee37..7ccc672437 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -265,7 +265,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    vd->vdev_wholedisk);
 
-	if (vd->vdev_not_present)
+	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
 	if (vd->vdev_isspare)
@@ -1062,6 +1062,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
 		    "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
 
 		*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+		if (*config == NULL && spa->spa_extreme_rewind) {
+			vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
+			    "Trying again without txg restrictions.");
+			*config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
+		}
 		if (*config == NULL) {
 			vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
 		}
@@ -1088,7 +1093,7 @@ vdev_uberblock_sync_done(zio_t *zio)
 static void
 vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 {
-	for (int c = 0; c < vd->vdev_children; c++)
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index a2a4925942..9d181a874e 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -84,9 +84,32 @@ vdev_mirror_map_alloc(zio_t *zio)
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
+		dva_t dva_copy[SPA_DVAS_PER_BP];
 
 		c = BP_GET_NDVAS(zio->io_bp);
 
+		/*
+		 * If we do not trust the pool config, some DVAs might be
+		 * invalid or point to vdevs that do not exist. We skip them.
+		 */
+		if (!spa_trust_config(spa)) {
+			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+			int j = 0;
+			for (int i = 0; i < c; i++) {
+				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
+					dva_copy[j++] = dva[i];
+			}
+			if (j == 0) {
+				zio->io_vsd = NULL;
+				zio->io_error = ENXIO;
+				return (NULL);
+			}
+			if (j < c) {
+				dva = dva_copy;
+				c = j;
+			}
+		}
+
 		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
 		mm->mm_children = c;
 		mm->mm_resilvering = B_FALSE;
@@ -201,7 +224,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
 	}
 
 	if (numerrors == vd->vdev_children) {
-		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		if (vdev_children_are_offline(vd))
+			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
+		else
+			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
@@ -311,6 +337,13 @@ vdev_mirror_io_start(zio_t *zio)
 
 	mm = vdev_mirror_map_alloc(zio);
 
+	if (mm == NULL) {
+		ASSERT(!spa_trust_config(zio->io_spa));
+		ASSERT(zio->io_type == ZIO_TYPE_READ);
+		zio_execute(zio);
+		return;
+	}
+
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
@@ -381,6 +414,9 @@ vdev_mirror_io_done(zio_t *zio)
 	int good_copies = 0;
 	int unexpected_errors = 0;
 
+	if (mm == NULL)
+		return;
+
 	for (c = 0; c < mm->mm_children; c++) {
 		mc = &mm->mm_child[c];
 
@@ -486,13 +522,19 @@ vdev_mirror_io_done(zio_t *zio)
 static void
 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 {
-	if (faulted == vd->vdev_children)
-		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
-		    VDEV_AUX_NO_REPLICAS);
-	else if (degraded + faulted != 0)
+	if (faulted == vd->vdev_children) {
+		if (vdev_children_are_offline(vd)) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+			    VDEV_AUX_CHILDREN_OFFLINE);
+		} else {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_NO_REPLICAS);
+		}
+	} else if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
-	else
+	} else {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+	}
 }
 
 vdev_ops_t vdev_mirror_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index 6971b942f0..b3433c2424 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -37,6 +37,23 @@
  * Virtual device vector for the pool's root vdev.
  */
 
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+	uint64_t tvds = 0;
+
+	for (uint64_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+		    cvd->vdev_ops != &vdev_indirect_ops) {
+			tvds++;
+		}
+	}
+
+	return (tvds);
+}
+
 /*
  * We should be able to tolerate one failure with absolutely no damage
  * to our metadata.  Two failures will take out space maps, a bunch of
@@ -46,17 +63,28 @@
  * probably fine.  Adding bean counters during alloc/free can make this
  * future guesswork more accurate.
  */
-static int
-too_many_errors(vdev_t *vd, int numerrors)
+static boolean_t
+too_many_errors(vdev_t *vd, uint64_t numerrors)
 {
-	ASSERT3U(numerrors, <=, vd->vdev_children);
-	return (numerrors > 0);
+	uint64_t tvds;
+
+	if (numerrors == 0)
+		return (B_FALSE);
+
+	tvds = vdev_root_core_tvds(vd);
+	ASSERT3U(numerrors, <=, tvds);
+
+	if (numerrors == tvds)
+		return (B_TRUE);
+
+	return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
 }
 
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *ashift)
 {
+	spa_t *spa = vd->vdev_spa;
 	int lasterror = 0;
 	int numerrors = 0;
 
@@ -76,6 +104,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
 		}
 	}
 
+	if (spa_load_state(spa) != SPA_LOAD_NONE)
+		spa_set_missing_tvds(spa, numerrors);
+
 	if (too_many_errors(vd, numerrors)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
@@ -101,7 +132,7 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 	if (too_many_errors(vd, faulted)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
-	} else if (degraded) {
+	} else if (degraded || faulted) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 	} else {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 7bde5a8391..29cdf08d52 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -444,21 +444,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 }
 
 static boolean_t
-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
-	uint64_t *countp = &zio->io_children[child][wait];
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
-	if (*countp != 0) {
-		zio->io_stage >>= 1;
-		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
-		zio->io_stall = countp;
-		waiting = B_TRUE;
+	for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+			continue;
+
+		uint64_t *countp = &zio->io_children[c][wait];
+		if (*countp != 0) {
+			zio->io_stage >>= 1;
+			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+			zio->io_stall = countp;
+			waiting = B_TRUE;
+			break;
+		}
 	}
 	mutex_exit(&zio->io_lock);
-
 	return (waiting);
 }
 
@@ -693,6 +698,13 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 	}
 
 	/*
+	 * Do not verify individual DVAs if the config is not trusted. This
+	 * will be done once the zio is executed in vdev_mirror_map_alloc.
+	 */
+	if (!spa->spa_trust_config)
+		return;
+
+	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the blk_birth and
@@ -741,6 +753,36 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 	}
 }
 
+boolean_t
+zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
+{
+	uint64_t vdevid = DVA_GET_VDEV(dva);
+
+	if (vdevid >= spa->spa_root_vdev->vdev_children)
+		return (B_FALSE);
+
+	vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+	if (vd == NULL)
+		return (B_FALSE);
+
+	if (vd->vdev_ops == &vdev_hole_ops)
+		return (B_FALSE);
+
+	if (vd->vdev_ops == &vdev_missing_ops) {
+		return (B_FALSE);
+	}
+
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t asize = DVA_GET_ASIZE(dva);
+
+	if (BP_IS_GANG(bp))
+		asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+	if (offset + asize > vd->vdev_asize)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
@@ -1267,9 +1309,10 @@ zio_write_compress(zio_t *zio)
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
@@ -2112,8 +2155,9 @@ zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2434,8 +2478,9 @@ zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
@@ -3038,10 +3083,14 @@ zio_vdev_io_start(zio_t *zio)
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
-	if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
-		ASSERT(zio->io_flags &
-		    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
-		    ZIO_FLAG_INDUCE_DAMAGE));
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		ASSERT(spa->spa_trust_config);
+
+		if (zio->io_vd->vdev_removing) {
+			ASSERT(zio->io_flags &
+			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+			    ZIO_FLAG_INDUCE_DAMAGE));
+		}
 	}
 
 	/*
@@ -3146,8 +3195,9 @@ zio_vdev_io_done(zio_t *zio)
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 
@@ -3213,8 +3263,9 @@ zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
@@ -3429,9 +3480,10 @@ zio_ready(zio_t *zio)
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+	    ZIO_WAIT_READY)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
@@ -3571,11 +3623,9 @@ zio_done(zio_t *zio)
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c
new file mode 100644
index 0000000000..9beb7e128f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zthr.c
@@ -0,0 +1,319 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There is be a single point of reference in memory or on disk that
+ *    indicates whether the operation should run/is running or is
+ *    stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ *    to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ *    (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ *    running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ *    a new cycle to start.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ *    the zthr should start working or go to sleep. The function should
+ *    return TRUE when the zthr needs to work or FALSE to let it sleep,
+ *    and should adhere to the following signature:
+ *    boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ *    it is not sleeping. The function should adhere to the following
+ *    signature type:
+ *    int func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ *    implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal lock (zthr_lock) and the allowed cancellation
+ * windows. We want to hold the zthr_lock while running checkfunc
+ * but not while running func. This way the zthr can be cancelled
+ * while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ *     zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ *
+ * After that you should be able to wakeup, cancel, and resume the
+ * zthr from another thread using zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR cancellation
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ *     zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ *     zthr_resume(zthr_pointer);
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and everytime it wakes up [see ZTHR
+ * state transitions below].
+ *
+ * At times, waiting for the zthr's func to finish its job may take
+ * time. This may be very time-consuming for some operations that
+ * need to cancel the SPA's zthrs (e.g spa_export). For this scenario
+ * the user can explicitly make their ZTHR function aware of incoming
+ * cancellation signals using zthr_iscancelled(). A common pattern for
+ * that looks like this:
+ *
+ * int
+ * func_name(void *args, zthr_t *t)
+ * {
+ *     ... <unpack args> ...
+ *     while (!work_done && !zthr_iscancelled(t)) {
+ *         ... <do more work> ...
+ *     }
+ *     return (0);
+ * }
+ *
+ * == ZTHR exit
+ *
+ * For the rare cases where the zthr wants to stop running voluntarily
+ * while running its ZTHR function (func), we provide zthr_exit().
+ * When a zthr has voluntarily stopped running, it can be resumed with
+ * zthr_resume(), just like it would if it was cancelled by some other
+ * thread.
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc..). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely cleanup a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ *    zthr creation
+ *      +
+ *      |
+ *      |      woke up
+ *      |   +--------------+ sleep
+ *      |   |                  ^
+ *      |   |                  |
+ *      |   |                  | FALSE
+ *      |   |                  |
+ *      v   v     FALSE        +
+ *   cancelled? +---------> checkfunc?
+ *      +   ^                  +
+ *      |   |                  |
+ *      |   |                  | TRUE
+ *      |   |                  |
+ *      |   |  func returned   v
+ *      |   +---------------+ func
+ *      |
+ *      | TRUE
+ *      |
+ *      v
+ *   zthr stopped running
+ *
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
+
+void
+zthr_exit(zthr_t *t, int rc)
+{
+	ASSERT3P(t->zthr_thread, ==, curthread);
+	mutex_enter(&t->zthr_lock);
+	t->zthr_thread = NULL;
+	t->zthr_rc = rc;
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+	thread_exit();
+}
+
+static void
+zthr_procedure(void *arg)
+{
+	zthr_t *t = arg;
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+	while (!t->zthr_cancel) {
+		if (t->zthr_checkfunc(t->zthr_arg, t)) {
+			mutex_exit(&t->zthr_lock);
+			rc = t->zthr_func(t->zthr_arg, t);
+			mutex_enter(&t->zthr_lock);
+		} else {
+			/* go to sleep */
+			cv_wait(&t->zthr_cv, &t->zthr_lock);
+		}
+	}
+	mutex_exit(&t->zthr_lock);
+
+	zthr_exit(t, rc);
+}
+
+zthr_t *
+zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
+{
+	zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+	mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	t->zthr_checkfunc = checkfunc;
+	t->zthr_func = func;
+	t->zthr_arg = arg;
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+	mutex_exit(&t->zthr_lock);
+
+	return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+	VERIFY3P(t->zthr_thread, ==, NULL);
+	mutex_destroy(&t->zthr_lock);
+	cv_destroy(&t->zthr_cv);
+	kmem_free(t, sizeof (*t));
+}
+
+/*
+ * Note: If the zthr is not sleeping and misses the wakeup
+ * (e.g it is running its ZTHR function), it will check if
+ * there is work to do before going to sleep using its checker
+ * function [see ZTHR state transition in ZTHR block comment].
+ * Thus, missing the wakeup still yields the expected behavior.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, !=, NULL);
+
+	mutex_enter(&t->zthr_lock);
+	cv_broadcast(&t->zthr_cv);
+	mutex_exit(&t->zthr_lock);
+}
+
+/*
+ * Note: If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op.
+ */
+int
+zthr_cancel(zthr_t *t)
+{
+	int rc = 0;
+
+	mutex_enter(&t->zthr_lock);
+
+	/* broadcast in case the zthr is sleeping */
+	cv_broadcast(&t->zthr_cv);
+
+	t->zthr_cancel = B_TRUE;
+	while (t->zthr_thread != NULL)
+		cv_wait(&t->zthr_cv, &t->zthr_lock);
+	t->zthr_cancel = B_FALSE;
+	rc = t->zthr_rc;
+	mutex_exit(&t->zthr_lock);
+
+	return (rc);
+}
+
+void
+zthr_resume(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, ==, NULL);
+
+	mutex_enter(&t->zthr_lock);
+
+	ASSERT3P(&t->zthr_checkfunc, !=, NULL);
+	ASSERT3P(&t->zthr_func, !=, NULL);
+	ASSERT(!t->zthr_cancel);
+
+	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
+	    0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&t->zthr_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread has signal it to stop running.
+ *
+ * returns TRUE if we are in the middle of trying to cancel
+ *     this thread.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+	boolean_t cancelled;
+
+	ASSERT3P(t->zthr_thread, ==, curthread);
+
+	mutex_enter(&t->zthr_lock);
+	cancelled = t->zthr_cancel;
+	mutex_exit(&t->zthr_lock);
+
+	return (cancelled);
+}
+
+boolean_t
+zthr_isrunning(zthr_t *t)
+{
+	boolean_t running;
+
+	mutex_enter(&t->zthr_lock);
+	running = (t->zthr_thread != NULL);
+	mutex_exit(&t->zthr_lock);
+
+	return (running);
+}