author     Jerry Jelinek <jerry.jelinek@joyent.com>   2018-04-17 12:47:02 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>   2018-04-17 12:47:02 +0000
commit     f43293b2f5505423bbff409e6bcac2caa468e4de (patch)
tree       39d58d968f31585fd49fb87d5ab8b60666c62a08
parent     c680934c282c791f151f4efe173c53604bac7c93 (diff)
parent     a3b5583021b7b45676bf1f0cc68adf7a97900b56 (diff)
[illumos-gate merge]
commit a3b5583021b7b45676bf1f0cc68adf7a97900b56
    9192 explicitly pass good_writes to vdev_uberblock/label_sync
commit 3a4b1be953ee5601bab748afa07c26ed4996cde6
    9290 device removal reduces redundancy of mirrors
commit ff9e88cea66c73818b035c646e3bb615ba3ff2bd
    9452 ptable_dcmd() needs a little cleanup
commit a800027ae7e1ce605f8a3187c6fcad94986988f3
    9450 OS Unit Test Suite is in the ostest package
commit ed1b18c7444d27a1c8637e59ba0b8b360902050b
    9346 autofs: comparison between pointer and zero character constant
commit 4e72ade1d48747d1105e26d42fc4787278f8f35e
    9462 panic in smbfs_delmap_callback
 usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c        |   4
 usr/src/cmd/zdb/zdb.c                              |  18
 usr/src/cmd/ztest/ztest.c                          |  58
 usr/src/lib/libzfs/common/libzfs_pool.c            |   2
 usr/src/lib/libzpool/common/llib-lzpool            |   3
 usr/src/test/os-tests/doc/README                   |   2
 usr/src/uts/common/fs/autofs/auto_subr.c           |   6
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h        |   1
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c |  40
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c  | 135
 usr/src/uts/common/fs/zfs/dsl_scan.c               |  11
 usr/src/uts/common/fs/zfs/metaslab.c               |   2
 usr/src/uts/common/fs/zfs/spa.c                    |   7
 usr/src/uts/common/fs/zfs/spa_misc.c               |   5
 usr/src/uts/common/fs/zfs/sys/vdev_removal.h       |   2
 usr/src/uts/common/fs/zfs/sys/zio.h                |   2
 usr/src/uts/common/fs/zfs/vdev.c                   |  26
 usr/src/uts/common/fs/zfs/vdev_indirect.c          | 552
 usr/src/uts/common/fs/zfs/vdev_label.c             |  28
 usr/src/uts/common/fs/zfs/vdev_mirror.c            |  29
 usr/src/uts/common/fs/zfs/vdev_removal.c           | 361
 usr/src/uts/common/fs/zfs/zio.c                    |  54
 22 files changed, 1028 insertions(+), 320 deletions(-)
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
index c6ac1d2967..15f4697b91 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
@@ -835,7 +835,6 @@ do_ptable_dcmd(pfn_t pfn, uint64_t level)
int entry;
uintptr_t pagesize;
x86pte_t pte;
- x86pte_t buf;
physaddr_t paddr;
size_t len;
@@ -899,11 +898,10 @@ found_it:
paddr = mmu_ptob((physaddr_t)pfn);
for (entry = 0; entry < mmu.ptes_per_table; ++entry) {
- len = mdb_pread(&buf, mmu.pte_size,
+ len = mdb_pread(&pte, mmu.pte_size,
paddr + entry * mmu.pte_size);
if (len != mmu.pte_size)
return (DCMD_ERR);
- pte = buf;
if (pte == 0)
continue;
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 6ce4263db8..cb02698ceb 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -3006,7 +3006,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
@@ -3022,13 +3022,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
svr->svr_allocd_segs, SM_ALLOC));
/*
- * Clear everything past what has been synced,
- * because we have not allocated mappings for it yet.
+ * Clear everything past what has been synced unless
+ * it's past the spacemap, because we have not allocated
+ * mappings for it yet.
*/
- range_tree_clear(svr->svr_allocd_segs,
- vdev_indirect_mapping_max_offset(vim),
- msp->ms_sm->sm_start + msp->ms_sm->sm_size -
- vdev_indirect_mapping_max_offset(vim));
+ uint64_t vim_max_offset =
+ vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > vim_max_offset)
+ range_tree_clear(svr->svr_allocd_segs,
+ vim_max_offset, sm_end - vim_max_offset);
}
zcb->zcb_removing_size +=
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index ff45ab193e..0b98eeef26 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -436,6 +436,7 @@ static ztest_ds_t *ztest_ds;
static kmutex_t ztest_vdev_lock;
static kmutex_t ztest_checkpoint_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
@@ -2880,7 +2881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* value. Don't bother trying to attach while we are in the middle
* of removal.
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
@@ -3055,16 +3056,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
vdev_t *vd;
uint64_t guid;
+ int error;
mutex_enter(&ztest_vdev_lock);
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Remove a random top-level vdev and wait for removal to finish.
+ */
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
guid = vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
- (void) spa_vdev_remove(spa, guid, B_FALSE);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ if (error == 0) {
+ ztest_device_removal_active = B_TRUE;
+ mutex_exit(&ztest_vdev_lock);
+
+ while (spa->spa_vdev_removal != NULL)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ } else {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+ /*
+ * The pool needs to be scrubbed after completing device removal.
+ * Failure to do so may result in checksum errors due to the
+ * strategy employed by ztest_fault_inject() when selecting which
+ * offsets are redundant and can be damaged.
+ */
+ error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+ ztest_device_removal_active = B_FALSE;
mutex_exit(&ztest_vdev_lock);
}
@@ -3203,7 +3237,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* that the metaslab_class space increased (because it decreases
* when the device removal completes).
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
@@ -4988,6 +5022,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
boolean_t islog = B_FALSE;
mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * Device removal is in progress, fault injection must be disabled
+ * until it completes and the pool is scrubbed. The fault injection
+ * strategy for damaging blocks does not take into account evacuated
+ * blocks which may have already been damaged.
+ */
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
maxfaults = MAXFAULTS();
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
@@ -5333,6 +5379,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
+ /*
+ * A scrub is already in progress as part of device removal.
+ */
+ if (ztest_device_removal_active)
+ return;
+
(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
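The ztest changes above serialize the rest of the test suite against an in-progress device removal: ztest_device_removal_active is set under ztest_vdev_lock for the whole window from spa_vdev_remove() until the post-removal scrub completes, and the attach/detach, LUN-growth, fault-injection and scrub operations bail out while it is set. A minimal sketch of the wait-then-scrub sequence, using only functions already referenced in the diff (an illustration with a hypothetical helper name, not the committed code):

static void
ztest_wait_removal_then_scrub(spa_t *spa)
{
	/* Removal is finished once the spa no longer has an svr. */
	while (spa->spa_vdev_removal != NULL)
		txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Scrub before fault injection is re-enabled, so damage done to
	 * since-evacuated offsets is repaired first.
	 */
	if (spa_scan(spa, POOL_SCAN_SCRUB) == 0) {
		while (dsl_scan_scrubbing(spa_get_dsl(spa)))
			txg_wait_synced(spa_get_dsl(spa), 0);
	}
}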
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 4378151cae..7321f419fc 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -2809,7 +2809,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
- "or pool has removing/removed vdevs"),
+ "or device removal is in progress"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 7b58c21513..9e52a46aee 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/* LINTLIBRARY */
@@ -39,6 +39,7 @@
#include <sys/dnode.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_scan.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/space_map.h>
diff --git a/usr/src/test/os-tests/doc/README b/usr/src/test/os-tests/doc/README
index e7d0b3f527..7e8f5236e8 100644
--- a/usr/src/test/os-tests/doc/README
+++ b/usr/src/test/os-tests/doc/README
@@ -36,7 +36,7 @@ and the testrunner without running a full nightly:
Then set the publisher on the test machine to point to your repository and
install the OS Unit Test Suite.
- test_machine# pkg install pkg:/system/test/zfstest
+ test_machine# pkg install pkg:/system/test/ostest
Note, the framework will be installed automatically, as the OS Unit Test Suite
depends on it.
diff --git a/usr/src/uts/common/fs/autofs/auto_subr.c b/usr/src/uts/common/fs/autofs/auto_subr.c
index c9da46b66b..4e3280dfe1 100644
--- a/usr/src/uts/common/fs/autofs/auto_subr.c
+++ b/usr/src/uts/common/fs/autofs/auto_subr.c
@@ -162,11 +162,13 @@ auto_lookup_aux(fnnode_t *fnp, char *name, cred_t *cred)
bzero(&link, sizeof (link));
error = auto_lookup_request(fnip, name, &link, TRUE, &mountreq, cred);
if (!error) {
- if (link.link != NULL || link.link != '\0') {
+ if (link.link != NULL) {
+ error = ENOENT;
/*
* This node should be a symlink
*/
- error = auto_perform_link(fnp, &link, cred);
+ if (*link.link != '\0')
+ error = auto_perform_link(fnp, &link, cred);
} else if (mountreq) {
/*
* The automount daemon is requesting a mount,
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
index 32dd58142e..8af2a5ee9d 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
@@ -115,6 +115,7 @@ typedef struct smbmntinfo {
struct vfs *smi_vfsp; /* mount back pointer to vfs */
struct smbnode *smi_root; /* the root node */
struct smb_share *smi_share; /* netsmb SMB share conn data */
+ struct taskq *smi_taskq; /* for async work */
kmutex_t smi_lock; /* mutex for flags, etc. */
uint32_t smi_flags; /* NFS-derived flag bits */
uint32_t smi_status; /* status bits for this mount */
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
index af3f44d164..0122d52115 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
@@ -52,6 +52,7 @@
#include <sys/statvfs.h>
#include <sys/errno.h>
#include <sys/debug.h>
+#include <sys/disp.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/policy.h>
@@ -60,6 +61,7 @@
#include <sys/vfs_opreg.h>
#include <sys/mntent.h>
#include <sys/priv.h>
+#include <sys/taskq.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tndb.h>
#include <inet/ip.h>
@@ -86,6 +88,12 @@
int smbfs_default_opt_acl = 0;
/*
+ * How many taskq threads per-mount should we use.
+ * Just one is fine (until we do more async work).
+ */
+int smbfs_tq_nthread = 1;
+
+/*
* Local functions definitions.
*/
int smbfsinit(int fstyp, char *name);
@@ -633,6 +641,14 @@ smbfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
smi->smi_root = rtnp;
/*
+ * Create a taskq for async work (i.e. putpage)
+ */
+ smi->smi_taskq = taskq_create_proc("smbfs",
+ smbfs_tq_nthread, minclsyspri,
+ smbfs_tq_nthread, smbfs_tq_nthread * 2,
+ zone->zone_zsched, TASKQ_PREPOPULATE);
+
+ /*
* NFS does other stuff here too:
* async worker threads
* init kstats
@@ -701,15 +717,6 @@ smbfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
vfsp->vfs_flag |= VFS_UNMOUNTED;
/*
- * Shutdown any outstanding I/O requests on this share,
- * and force a tree disconnect. The share object will
- * continue to hang around until smb_share_rele().
- * This should also cause most active nodes to be
- * released as their operations fail with EIO.
- */
- smb_share_kill(smi->smi_share);
-
- /*
* If we hold the root VP (and we normally do)
* then it's safe to release it now.
*/
@@ -732,6 +739,21 @@ smbfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
smbfs_destroy_table(vfsp);
/*
+ * Shutdown any outstanding I/O requests on this share,
+ * and force a tree disconnect. The share object will
+ * continue to hang around until smb_share_rele().
+ * This should also cause most active nodes to be
+ * released as their operations fail with EIO.
+ */
+ smb_share_kill(smi->smi_share);
+
+ /*
+ * Any async taskq work should be giving up.
+ * Wait for those to exit.
+ */
+ taskq_destroy(smi->smi_taskq);
+
+ /*
* Delete our kstats...
*
* Doing it here, rather than waiting until
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
index 07a69b21e5..23c9f8f15d 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
@@ -58,6 +58,7 @@
#include <sys/vfs_opreg.h>
#include <sys/policy.h>
#include <sys/sdt.h>
+#include <sys/taskq_impl.h>
#include <sys/zone.h>
#include <sys/vmsystm.h>
@@ -153,7 +154,7 @@ static int smbfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
enum seg_rw, cred_t *);
static int smbfs_putapage(vnode_t *, page_t *, u_offset_t *, size_t *,
int, cred_t *);
-static void smbfs_delmap_callback(struct as *, void *, uint_t);
+static void smbfs_delmap_async(void *);
/*
* Error flags used to pass information about certain special errors
@@ -4475,6 +4476,13 @@ done:
return (error);
}
+/*
+ * This uses addmap/delmap functions to hold the SMB FID open as long as
+ * there are pages mapped in this as/seg. Increment the FID refs when
+ * the mapping count goes from zero to non-zero, and release the FID ref
+ * when the mapping count goes from non-zero to zero.
+ */
+
/* ARGSUSED */
static int
smbfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -4504,79 +4512,76 @@ smbfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
}
/*
- * Use an address space callback to flush pages dirty pages after unmap,
- * which we can't do directly in smbfs_delmap due to locking issues.
+ * Args passed to smbfs_delmap_async
*/
typedef struct smbfs_delmap_args {
- vnode_t *vp;
- cred_t *cr;
- offset_t off;
- caddr_t addr;
- size_t len;
- uint_t prot;
- uint_t maxprot;
- uint_t flags;
- boolean_t dec_fidrefs;
+ taskq_ent_t dm_tqent;
+ cred_t *dm_cr;
+ vnode_t *dm_vp;
+ offset_t dm_off;
+ caddr_t dm_addr;
+ size_t dm_len;
+ uint_t dm_prot;
+ uint_t dm_maxprot;
+ uint_t dm_flags;
+ boolean_t dm_rele_fid;
} smbfs_delmap_args_t;
+/*
+ * We use delmap not only to release the SMB FID (as described above)
+ * but also to flush dirty pages as needed. Both of those do the actual
+ * work in an async taskq job to avoid interfering with locks held
+ * in the VM layer when this is called.
+ */
+
/* ARGSUSED */
static int
smbfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uint_t prot, uint_t maxprot, uint_t flags,
cred_t *cr, caller_context_t *ct)
{
- smbnode_t *np = VTOSMB(vp);
+ smbnode_t *np = VTOSMB(vp);
+ smbmntinfo_t *smi = VTOSMI(vp);
smbfs_delmap_args_t *dmapp;
- int error;
dmapp = kmem_zalloc(sizeof (*dmapp), KM_SLEEP);
- dmapp->vp = vp;
- dmapp->off = off;
- dmapp->addr = addr;
- dmapp->len = len;
- dmapp->prot = prot;
- dmapp->maxprot = maxprot;
- dmapp->flags = flags;
- dmapp->cr = cr;
- dmapp->dec_fidrefs = B_FALSE;
+ /*
+ * The VM layer may segvn_free the seg holding this vnode
+ * before our callback has a chance run, so take a hold on
+ * the vnode here and release it in the callback.
+ * (same for the cred)
+ */
+ crhold(cr);
+ VN_HOLD(vp);
+
+ dmapp->dm_vp = vp;
+ dmapp->dm_cr = cr;
+ dmapp->dm_off = off;
+ dmapp->dm_addr = addr;
+ dmapp->dm_len = len;
+ dmapp->dm_prot = prot;
+ dmapp->dm_maxprot = maxprot;
+ dmapp->dm_flags = flags;
+ dmapp->dm_rele_fid = B_FALSE;
/*
- * When r_mapcnt returns to zero, arrange for the
- * callback to decrement n_fidrefs
+ * Go ahead and decrement r_mapcount now, which is
+ * the primary purpose of this function.
+ *
+ * When r_mapcnt goes to zero, we need to call
+ * smbfs_rele_fid, but can't do that here, so
+ * set a flag telling the async task to do it.
*/
mutex_enter(&np->r_statelock);
np->r_mapcnt -= btopr(len);
ASSERT(np->r_mapcnt >= 0);
if (np->r_mapcnt == 0)
- dmapp->dec_fidrefs = B_TRUE;
+ dmapp->dm_rele_fid = B_TRUE;
mutex_exit(&np->r_statelock);
- error = as_add_callback(as, smbfs_delmap_callback, dmapp,
- AS_UNMAP_EVENT, addr, len, KM_SLEEP);
- if (error != 0) {
- /*
- * So sad, no callback is coming. Can't flush pages
- * in delmap (as locks). Just handle n_fidrefs.
- */
- cmn_err(CE_NOTE, "smbfs_delmap(%p) "
- "as_add_callback err=%d",
- (void *)vp, error);
-
- if (dmapp->dec_fidrefs) {
- struct smb_cred scred;
-
- (void) smbfs_rw_enter_sig(&np->r_lkserlock,
- RW_WRITER, 0);
- smb_credinit(&scred, dmapp->cr);
-
- smbfs_rele_fid(np, &scred);
-
- smb_credrele(&scred);
- smbfs_rw_exit(&np->r_lkserlock);
- }
- kmem_free(dmapp, sizeof (*dmapp));
- }
+ taskq_dispatch_ent(smi->smi_taskq, smbfs_delmap_async, dmapp, 0,
+ &dmapp->dm_tqent);
return (0);
}
@@ -4587,14 +4592,16 @@ smbfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
*/
/* ARGSUSED */
static void
-smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
+smbfs_delmap_async(void *varg)
{
+ smbfs_delmap_args_t *dmapp = varg;
+ cred_t *cr;
vnode_t *vp;
smbnode_t *np;
smbmntinfo_t *smi;
- smbfs_delmap_args_t *dmapp = arg;
- vp = dmapp->vp;
+ cr = dmapp->dm_cr;
+ vp = dmapp->dm_vp;
np = VTOSMB(vp);
smi = VTOSMI(vp);
@@ -4609,7 +4616,8 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
* unmount smbfs
*/
if (vn_has_cached_data(vp) && !vn_is_readonly(vp) &&
- dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
+ dmapp->dm_flags == MAP_SHARED &&
+ (dmapp->dm_maxprot & PROT_WRITE) != 0) {
mutex_enter(&np->r_statelock);
np->r_flags |= RDIRTY;
mutex_exit(&np->r_statelock);
@@ -4618,23 +4626,23 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
* Need to finish the putpage before we
* close the OtW FID needed for I/O.
*/
- (void) smbfs_putpage(vp, dmapp->off, dmapp->len, 0,
- dmapp->cr, NULL);
+ (void) smbfs_putpage(vp, dmapp->dm_off, dmapp->dm_len, 0,
+ dmapp->dm_cr, NULL);
}
if ((np->r_flags & RDIRECTIO) || (smi->smi_flags & SMI_DIRECTIO))
- (void) smbfs_putpage(vp, dmapp->off, dmapp->len,
- B_INVAL, dmapp->cr, NULL);
+ (void) smbfs_putpage(vp, dmapp->dm_off, dmapp->dm_len,
+ B_INVAL, dmapp->dm_cr, NULL);
/*
* If r_mapcnt went to zero, drop our FID ref now.
* On the last fidref, this does an OtW close.
*/
- if (dmapp->dec_fidrefs) {
+ if (dmapp->dm_rele_fid) {
struct smb_cred scred;
(void) smbfs_rw_enter_sig(&np->r_lkserlock, RW_WRITER, 0);
- smb_credinit(&scred, dmapp->cr);
+ smb_credinit(&scred, dmapp->dm_cr);
smbfs_rele_fid(np, &scred);
@@ -4642,7 +4650,10 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
smbfs_rw_exit(&np->r_lkserlock);
}
- (void) as_delete_callback(as, arg);
+ /* Release holds taken in smbfs_delmap */
+ VN_RELE(vp);
+ crfree(cr);
+
kmem_free(dmapp, sizeof (*dmapp));
}
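The smbfs_vnops.c change replaces the as_add_callback()-based flush with a job on the per-mount taskq. The key pattern: the taskq_ent_t is embedded in the argument block, so taskq_dispatch_ent() needs no allocation and cannot fail, and the vnode and cred are held before dispatch and released by the worker, since the segment may be freed before the job runs. A minimal standalone sketch of that pattern; the my_async_* names are hypothetical, while smbmntinfo_t/smi_taskq and the kernel primitives come from the diff:

typedef struct my_async_args {
	taskq_ent_t	 ma_tqent;	/* pre-allocated queue linkage */
	vnode_t		*ma_vp;
	cred_t		*ma_cr;
} my_async_args_t;

static void
my_async_worker(void *varg)
{
	my_async_args_t *ma = varg;

	/* ... do the work that could not be done under VM locks ... */

	/* Release the holds taken at dispatch time. */
	VN_RELE(ma->ma_vp);
	crfree(ma->ma_cr);
	kmem_free(ma, sizeof (*ma));
}

static void
my_async_dispatch(smbmntinfo_t *smi, vnode_t *vp, cred_t *cr)
{
	my_async_args_t *ma = kmem_zalloc(sizeof (*ma), KM_SLEEP);

	VN_HOLD(vp);
	crhold(cr);
	ma->ma_vp = vp;
	ma->ma_cr = cr;

	/* Cannot fail: the entry is embedded, so nothing is allocated here. */
	taskq_dispatch_ent(smi->smi_taskq, my_async_worker, ma, 0,
	    &ma->ma_tqent);
}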
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 0660d6fbfd..fbc1c4d08b 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -1988,7 +1988,16 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ needs_io = B_TRUE;
+ } else if (DVA_GET_GANG(&bp->blk_dva[d])) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 1f5a7fbd26..b6a5665936 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -3590,7 +3590,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;
if (spa->spa_vdev_removal != NULL &&
- spa->spa_vdev_removal->svr_vdev == vd &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 163f5e054e..9185c5b182 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -5509,8 +5509,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
- tvd->vdev_ashift !=
- spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+ tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
@@ -5626,10 +5625,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
- if (spa->spa_vdev_removal != NULL ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index fe0971a720..81dded7482 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1761,9 +1761,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
- vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
}
}
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
index a29ae58610..5fcd40b6e7 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
@@ -30,7 +30,7 @@ extern "C" {
#endif
typedef struct spa_vdev_removal {
- vdev_t *svr_vdev;
+ uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 273e5fcb0b..c12cb70906 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -552,7 +552,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- struct abd *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index e761ee4b39..71b690c123 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -867,6 +867,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c
index 304453aa94..3f2ff799b6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c
@@ -23,6 +23,7 @@
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
@@ -46,10 +47,11 @@
* "vdev_remap" operation that executes a callback on each contiguous
* segment of the new location. This function is used in multiple ways:
*
- * - reads and repair writes to this device use the callback to create
- * a child io for each mapped segment.
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
*
- * - frees and claims to this device use the callback to free or claim
+ * - frees and claims to this vdev use the callback to free or claim
* each mapped segment. (Note that we don't actually need to claim
* log blocks on indirect vdevs, because we don't allocate to
* removing vdevs. However, zdb uses zio_claim() for its leak
@@ -204,6 +206,94 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
/*
+ * If a split block contains more than this many segments, consider it too
+ * computationally expensive to check all (2^num_segments) possible
+ * combinations. Instead, try at most 2^_segments_max randomly-selected
+ * combinations.
+ *
+ * This is reasonable if only a few segment copies are damaged and the
+ * majority of segment copies are good. This allows all the segment copies to
+ * participate fairly in the reconstruction and prevents the repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_segments_max = 10;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ int is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ vdev_indirect_map_free,
+ zio_vsd_default_cksum_report
+};
+/*
* Mark the given offset and size as being obsolete.
*/
void
@@ -817,12 +907,6 @@ vdev_indirect_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
-}
-
-/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
@@ -1065,41 +1149,475 @@ vdev_indirect_child_io_done(zio_t *zio)
abd_put(zio->io_abd);
}
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
static void
-vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
ASSERT3P(vd, !=, NULL);
if (vd->vdev_ops == &vdev_indirect_ops)
return;
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
- abd_get_offset(zio->io_abd, split_offset),
- size, zio->io_type, zio->io_priority,
- 0, vdev_indirect_child_io_done, zio));
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+ * Note, we may read from a child whose DTL
+ * indicates that the data may not be present here.
+ * While this might result in a few i/os that will
+ * likely return incorrect data, it simplifies the
+ * code since we can treat scrub and resilver
+ * identically. (The incorrect data will be
+ * detected and ignored when we verify the
+ * checksum.)
+ */
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
}
static void
vdev_indirect_io_start(zio_t *zio)
{
spa_t *spa = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (zio->io_type != ZIO_TYPE_READ) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- ASSERT((zio->io_flags &
- (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
}
vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
- vdev_indirect_io_start_cb, zio);
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * Read one copy of each split segment, from the
+ * top-level vdev. Since we don't know the
+ * checksum of each split individually, the child
+ * zio can't ensure that we get the right data.
+ * E.g. if it's a mirror, it will just read from a
+ * random (healthy) leaf vdev. We have to verify
+ * the checksum in vdev_indirect_io_done().
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset),
+ is->is_size, zio->io_type,
+ zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
zio_execute(zio);
}
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = { 0 };
+ void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
+ abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+ void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+ abd_return_buf(ic->ic_data, bad_buf, is->is_size);
+ abd_return_buf(good_abd, good_buf, is->is_size);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
+
+ if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
+ flags |= ZIO_FLAG_SELF_HEAL;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (abd_cmp(good_child->ic_data, ic->ic_data,
+ is->is_size) == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ }
+ }
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually. We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly. (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ uint64_t attempts = 0;
+ uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+ int segments = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ segments++;
+
+ for (;;) {
+ /* copy data from splits to main zio */
+ int ret;
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ /*
+ * If this child failed, its ic_data will be NULL.
+ * Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_data == NULL) {
+ ret = EIO;
+ goto next;
+ }
+
+ abd_copy_off(zio->io_abd,
+ is->is_child[is->is_good_child].ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ /* See if this checksum matches. */
+ zio_bad_cksum_t zbc;
+ ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ /* Found a matching checksum. Issue repair i/os. */
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * Checksum failed; try a different combination of split
+ * children.
+ */
+ boolean_t more;
+next:
+ more = B_FALSE;
+ if (segments <= zfs_reconstruct_indirect_segments_max) {
+ /*
+ * There are relatively few segments, so
+ * deterministically check all combinations. We do
+ * this by adding one to the first split's
+ * good_child. If it overflows, then "carry over" to
+ * the next split (like counting in base is_children,
+ * but each digit can have a different base).
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child++;
+ if (is->is_good_child < is->is_children) {
+ more = B_TRUE;
+ break;
+ }
+ is->is_good_child = 0;
+ }
+ } else if (++attempts < attempts_max) {
+ /*
+ * There are too many combinations to try all of them
+ * in a reasonable amount of time, so try a fixed
+ * number of random combinations, after which we'll
+ * consider the block unrecoverable.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child =
+ spa_get_random(is->is_children);
+ }
+ more = B_TRUE;
+ }
+ if (!more) {
+ /* All combinations failed. */
+ zio->io_error = ret;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+ }
+}
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
vdev_ops_t vdev_indirect_ops = {
vdev_indirect_open,
vdev_indirect_close,
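The reconstruction comment in vdev_indirect.c describes enumerating combinations of split-segment copies the way one counts in a mixed-radix number: digit i runs from 0 to is_children - 1 of split i, and the low digit varies fastest. A standalone sketch of that counter in array form (the committed code walks the iv_splits list instead; this is only an illustration):

#include <stdbool.h>

/*
 * Advance choice[] to the next combination; return false once every
 * combination has been visited (the counter wraps back to all zeros).
 */
static bool
next_combination(int *choice, const int *base, int nsegments)
{
	for (int i = 0; i < nsegments; i++) {
		if (++choice[i] < base[i])
			return (true);	/* no carry; this is a new combination */
		choice[i] = 0;		/* digit overflowed; carry into the next */
	}
	return (false);
}

With bases {2, 2, 2} this visits exactly the eight rows of the table in the comment above; a 3-way mirror simply gives its digit base 3.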
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index d8a0762c42..d906860346 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -1117,10 +1117,13 @@ vdev_uberblock_sync_done(zio_t *zio)
* Write the uberblock to all labels of all leaves of the specified vdev.
*/
static void
-vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
{
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1138,7 +1141,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, zio->io_private,
+ vdev_uberblock_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
abd_free(ub_abd);
@@ -1152,10 +1155,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
zio_t *zio;
uint64_t good_writes = 0;
- zio = zio_root(spa, NULL, &good_writes, flags);
+ zio = zio_root(spa, NULL, NULL, flags);
for (int v = 0; v < svdcount; v++)
- vdev_uberblock_sync(zio, ub, svd[v], flags);
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
(void) zio_wait(zio);
@@ -1216,7 +1219,8 @@ vdev_label_sync_ignore_done(zio_t *zio)
* Write all even or odd labels to all leaves of the specified vdev.
*/
static void
-vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
@@ -1224,8 +1228,10 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
char *buf;
size_t buflen;
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1250,7 +1256,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
- vdev_label_sync_done, zio->io_private,
+ vdev_label_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
}
}
@@ -1282,7 +1288,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
- vdev_label_sync(vio, vd, l, txg, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
zio_nowait(vio);
}
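With issue 9192, vdev_uberblock_sync() and vdev_label_sync() take the good_writes counter as an explicit argument threaded through the recursion instead of reaching into the parent zio's io_private; each leaf write's done callback then increments the counter on success. A sketch of that counting side, with a hypothetical callback name (the real callbacks are vdev_uberblock_sync_done() and vdev_label_sync_done()):

static void
leaf_label_write_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	/* Only successful writes count toward the caller's success check. */
	if (zio->io_error == 0 && good_writes != NULL)
		atomic_inc_64(good_writes);
}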
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 9d181a874e..34a750fe4d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -345,12 +345,15 @@ vdev_mirror_io_start(zio_t *zio)
}
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -502,7 +505,21 @@ vdev_mirror_io_done(zio_t *zio)
if (mc->mc_error == 0) {
if (mc->mc_tried)
continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect vdev (in which case
+ * it could point to any other vdev, which
+ * might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c
index d00b5b35f7..fc613ff58a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_removal.c
+++ b/usr/src/uts/common/fs/zfs/vdev_removal.c
@@ -83,18 +83,12 @@ typedef struct vdev_copy_arg {
kmutex_t vca_lock;
} vdev_copy_arg_t;
-typedef struct vdev_copy_seg_arg {
- vdev_copy_arg_t *vcsa_copy_arg;
- uint64_t vcsa_txg;
- dva_t *vcsa_dest_dva;
- blkptr_t *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
/*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
@@ -176,7 +170,7 @@ spa_vdev_removal_create(vdev_t *vd)
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev = vd;
+ svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -218,9 +212,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
- vdev_t *vd = arg;
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
spa_vdev_removal_t *svr = NULL;
uint64_t txg = dmu_tx_get_txg(tx);
@@ -342,7 +337,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(spa->spa_vdev_removal, ==, NULL);
spa->spa_vdev_removal = svr;
svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
/*
@@ -383,21 +378,24 @@ spa_remove_init(spa_t *spa)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_t *vd = vdev_lookup_top(spa,
spa->spa_removing_phys.sr_removing_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- if (vd == NULL)
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
return (EINVAL);
+ }
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
ASSERT(vdev_is_concrete(vd));
spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT(svr->svr_vdev->vdev_removing);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
spa->spa_meta_objset, vic->vic_mapping_object);
vd->vdev_indirect_births = vdev_indirect_births_open(
spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
spa->spa_vdev_removal = svr;
}
@@ -450,15 +448,8 @@ spa_restart_removal(spa_t *spa)
if (!spa_writeable(spa))
return;
- vdev_t *vd = svr->svr_vdev;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_removing);
-
- zfs_dbgmsg("restarting removal of %llu at count=%llu",
- vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
0, &p0, TS_RUN, minclsyspri);
}
@@ -479,7 +470,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
- ASSERT3P(vd, ==, svr->svr_vdev);
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
mutex_enter(&svr->svr_lock);
@@ -662,7 +653,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
if (state == DSS_FINISHED) {
spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
@@ -705,7 +696,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
uint64_t txg = dmu_tx_get_txg(tx);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -733,64 +724,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
spa_sync_removing_state(spa, tx);
}
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_nullzio_done(zio_t *zio)
+{
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
- spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ vdev_copy_arg_t *vca = zio->io_private;
+
abd_free(zio->io_abd);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
-
- ASSERT0(zio->io_error);
- kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
- kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- dva_t *dest_dva = vcsa->vcsa_dest_dva;
- uint64_t txg = vcsa->vcsa_txg;
- spa_t *spa = zio->io_spa;
- vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva));
- blkptr_t *bp = NULL;
- dva_t *dva = NULL;
- uint64_t size = zio->io_size;
-
- ASSERT3P(dest_vd, !=, NULL);
- ASSERT0(zio->io_error);
-
- vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- bp = vcsa->vcsa_dest_bp;
- dva = bp->blk_dva;
-
- BP_ZERO(bp);
-
- /* initialize with dest_dva */
- bcopy(dest_dva, dva, sizeof (dva_t));
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
- txg, bp, zio->io_abd, size,
- spa_vdev_copy_segment_write_done, vcsa,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ zio_nowait(zio_unique_parent(zio));
}
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children).
+ */
+ source_child_vd =
+ source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
+}
+
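
A worked example of the wrap-around above, with hypothetical child counts: copying a 2-way source mirror onto a 3-way destination mirror issues one write per destination child, and the read feeding destination child 2 wraps back to source child 0, so one source side is read twice rather than a copy being dropped:

/*
 * Sketch: source child chosen for each destination child when both
 * vdevs are mirrors (dest_id % source children, as above).
 *
 *	dest_id 0 -> 0 % 2 = source child 0
 *	dest_id 1 -> 1 % 2 = source child 1
 *	dest_id 2 -> 2 % 2 = source child 0	(read twice)
 */
static vdev_t *
copy_source_child(vdev_t *source_vd, int dest_id)
{
	return (source_vd->vdev_child[dest_id % source_vd->vdev_children]);
}
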
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
static int
spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
@@ -799,10 +854,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_entry_t *entry;
- vdev_copy_seg_arg_t *private;
dva_t dst = { 0 };
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
@@ -826,51 +878,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
*/
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
entry->vime_mapping.vimep_dst = dst;
- private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
- private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- private->vcsa_txg = txg;
- private->vcsa_copy_arg = vca;
-
/*
- * This lock is eventually released by the donefunc for the
- * zio_write_phys that finishes copying the data.
+ * See comment before spa_vdev_copy_one_child().
*/
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- /*
- * Do logical I/O, letting the redundancy vdevs (like mirror)
- * handle their own I/O instead of duplicating that code here.
- */
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], start);
- DVA_SET_GANG(&dva[0], 0);
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
- bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_vdev_copy_segment_read_done, private,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_nullzio_done, NULL, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
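
The null zio above is hung off spa->spa_txg_zio[txg & TXG_MASK], the per-txg root zio, so spa_sync() for that txg will not complete until all of this segment's reads and writes do. A minimal sketch of the ring indexing, assuming the usual ZFS txg.h definitions (TXG_SIZE is a small power of two and TXG_MASK is TXG_SIZE - 1); the helper is hypothetical:

/*
 * Sketch: a monotonically increasing txg number is mapped onto the
 * small per-spa array of root zios with a power-of-two mask.
 */
static zio_t *
txg_root_zio(spa_t *spa, uint64_t txg)
{
	return (spa->spa_txg_zio[txg & TXG_MASK]);
}
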
@@ -888,8 +917,8 @@ static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
- vdev_t *vd = svr->svr_vdev;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
@@ -918,37 +947,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
}
static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
- ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
- ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
- ASSERT(vd->vdev_indirect_mapping != NULL);
- ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
- vd->vdev_indirect_mapping = NULL;
-
- ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
- ASSERT(vd->vdev_indirect_births != NULL);
- ivd->vdev_indirect_births = vd->vdev_indirect_births;
- vd->vdev_indirect_births = NULL;
-
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
- ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
- /*
- * We cannot use space_map_{open,close} because we hold all
- * the config locks as writer.
- */
- ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
- ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
- vd->vdev_obsolete_sm = NULL;
- }
-}
-
-static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
@@ -983,17 +981,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
vd->vdev_leaf_zap = 0;
vdev_remove_child(ivd, vd);
vdev_compact_children(ivd);
- vdev_indirect_state_transfer(ivd, vd);
-
- svr->svr_vdev = ivd;
-
- ASSERT(!ivd->vdev_removing);
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1016,9 +1010,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
* context by the removal thread after we have copied all vdev's data.
*/
static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
{
- spa_t *spa = vd->vdev_spa;
uint64_t txg;
/*
@@ -1026,8 +1019,12 @@ vdev_remove_complete(vdev_t *vd)
* vdev_metaslab_fini()
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
-
txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
vd->vdev_id, txg);
@@ -1047,6 +1044,10 @@ vdev_remove_complete(vdev_t *vd)
/*
* We now release the locks, allowing spa_sync to run and finish the
* removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
*/
(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -1068,6 +1069,8 @@ vdev_remove_complete(vdev_t *vd)
*/
vdev_config_dirty(spa->spa_root_vdev);
(void) spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_event_post(ev);
}
/*
@@ -1078,7 +1081,7 @@ vdev_remove_complete(vdev_t *vd)
* this size again this txg.
*/
static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
uint64_t *max_alloc, dmu_tx_t *tx)
{
uint64_t txg = dmu_tx_get_txg(tx);
@@ -1117,7 +1120,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
while (length > 0) {
uint64_t mylen = MIN(length, thismax);
- int error = spa_vdev_copy_segment(svr->svr_vdev,
+ int error = spa_vdev_copy_segment(vd,
offset, mylen, txg, vca, &zal);
if (error == ENOSPC) {
@@ -1175,12 +1178,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
static void
spa_vdev_remove_thread(void *arg)
{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
uint64_t max_alloc = zfs_remove_max_segment;
uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
@@ -1188,7 +1193,6 @@ spa_vdev_remove_thread(void *arg)
ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_removing);
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(svr->svr_vdev, ==, vd);
ASSERT(vim != NULL);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1270,6 +1274,17 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
/*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
* This delay will pause the removal around the point
* specified by zfs_remove_max_bytes_pause. We do this
* solely from the test suite or during debugging.
@@ -1295,11 +1310,19 @@ spa_vdev_remove_thread(void *arg)
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);
+ /*
+			 * Reacquire the spa config lock (SCL_CONFIG). The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
if (txg != last_txg)
max_alloc = zfs_remove_max_segment;
last_txg = txg;
- spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
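
The two hunks above (the spa_config_exit() before dmu_tx_assign() and the spa_config_enter()/vdev_lookup_top() after it) form one pattern; a condensed sketch, using a hypothetical helper and eliding tx creation:

/*
 * Sketch: never hold the config lock across dmu_tx_assign(TXG_WAIT),
 * and re-resolve the vdev_t afterwards because it may have been
 * replaced by a concurrent attach/detach while the lock was dropped.
 */
static uint64_t
remove_thread_assign_tx(spa_t *spa, spa_vdev_removal_t *svr, dmu_tx_t *tx,
    vdev_t **vdp)
{
	spa_config_exit(spa, SCL_CONFIG, FTAG);		/* let writers in */
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));		/* may wait for a sync */
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	*vdp = vdev_lookup_top(spa, svr->svr_vdev_id);
	return (txg);
}
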
@@ -1307,6 +1330,9 @@ spa_vdev_remove_thread(void *arg)
}
mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
/*
* Wait for all copies to finish before cleaning up the vca.
*/
@@ -1324,7 +1350,7 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(vd);
+ vdev_remove_complete(spa);
}
}
@@ -1365,7 +1391,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
objset_t *mos = spa->spa_meta_objset;
@@ -1438,8 +1464,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* because we have not allocated mappings for it yet.
*/
uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- range_tree_clear(svr->svr_allocd_segs, syncd,
- msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
mutex_exit(&svr->svr_lock);
}
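
The new guard matters because the operands are unsigned: vdev_indirect_mapping_max_offset() can already lie past the end of this metaslab's space map, and the old expression would then wrap to an enormous length. A standalone illustration with made-up values:

/*
 * Sketch: uint64_t subtraction wraps instead of going negative.
 */
uint64_t sm_end = 0x1000;
uint64_t syncd  = 0x1800;		/* already synced past this metaslab */
uint64_t bogus  = sm_end - syncd;	/* wraps to 0xfffffffffffff800 */

/* The fix: only clear when something actually remains to be cleared. */
if (sm_end > syncd)
	range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd);
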
@@ -1500,7 +1529,7 @@ spa_vdev_remove_cancel(spa_t *spa)
if (spa->spa_vdev_removal == NULL)
return (ENOTACTIVE);
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0,
@@ -1811,7 +1840,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
vdev_remove_initiate_sync,
- vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return (0);
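
With the removal now tracked by vdev id instead of by pointer, the id is smuggled through the sync task's void *arg via uintptr_t. The receiving side is not shown in this diff; a sketch of what vdev_remove_initiate_sync would do with the argument, assumed from the calling convention above:

/*
 * Sketch (assumed shape): recover the id from the opaque argument and
 * re-resolve the vdev_t in syncing context.
 */
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* ... set up spa->spa_vdev_removal for vd ... */
}
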
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index f2c511ef77..2390a1ee90 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -1075,17 +1075,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
done != NULL);
- /*
- * In the common case, where the parent zio was to a normal vdev,
- * the child zio must be to a child vdev of that vdev. Otherwise,
- * the child zio must be to a top-level vdev.
- */
- if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
- ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
- } else {
- ASSERT3P(vd, ==, vd->vdev_top);
- }
-
if (type == ZIO_TYPE_READ && bp != NULL) {
/*
* If we have the bp, then the child should perform the
@@ -1145,7 +1134,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- int type, zio_priority_t priority, enum zio_flag flags,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -3097,9 +3086,13 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(spa->spa_trust_config);
if (zio->io_vd->vdev_removing) {
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_INDUCE_DAMAGE));
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
}
}
@@ -3160,18 +3153,37 @@ zio_vdev_io_start(zio_t *zio)
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
- * This prevents spurious resilvering with nested replication.
- * For example, given a mirror of mirrors, (A+B)+(C+D), if only
- * A is out of date, we'll read from C+D, then use the data to
- * resilver A+B -- but we don't actually want to resilver B, just A.
- * The top-level mirror has no way to know this, so instead we just
- * discard unnecessary repairs as we work our way down the vdev tree.
- * The same logic applies to any form of nested replication:
- * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);