author     Jerry Jelinek <jerry.jelinek@joyent.com>   2018-04-17 12:47:02 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>   2018-04-17 12:47:02 +0000
commit     f43293b2f5505423bbff409e6bcac2caa468e4de (patch)
tree       39d58d968f31585fd49fb87d5ab8b60666c62a08
parent     c680934c282c791f151f4efe173c53604bac7c93 (diff)
parent     a3b5583021b7b45676bf1f0cc68adf7a97900b56 (diff)
[illumos-gate merge]
commit a3b5583021b7b45676bf1f0cc68adf7a97900b56
    9192 explicitly pass good_writes to vdev_uberblock/label_sync
commit 3a4b1be953ee5601bab748afa07c26ed4996cde6
    9290 device removal reduces redundancy of mirrors
commit ff9e88cea66c73818b035c646e3bb615ba3ff2bd
    9452 ptable_dcmd() needs a little cleanup
commit a800027ae7e1ce605f8a3187c6fcad94986988f3
    9450 OS Unit Test Suite is in the ostest package
commit ed1b18c7444d27a1c8637e59ba0b8b360902050b
    9346 autofs: comparison between pointer and zero character constant
commit 4e72ade1d48747d1105e26d42fc4787278f8f35e
    9462 panic in smbfs_delmap_callback
 usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c        |   4
 usr/src/cmd/zdb/zdb.c                              |  18
 usr/src/cmd/ztest/ztest.c                          |  58
 usr/src/lib/libzfs/common/libzfs_pool.c            |   2
 usr/src/lib/libzpool/common/llib-lzpool            |   3
 usr/src/test/os-tests/doc/README                   |   2
 usr/src/uts/common/fs/autofs/auto_subr.c           |   6
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h        |   1
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c |  40
 usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c  | 135
 usr/src/uts/common/fs/zfs/dsl_scan.c               |  11
 usr/src/uts/common/fs/zfs/metaslab.c               |   2
 usr/src/uts/common/fs/zfs/spa.c                    |   7
 usr/src/uts/common/fs/zfs/spa_misc.c               |   5
 usr/src/uts/common/fs/zfs/sys/vdev_removal.h       |   2
 usr/src/uts/common/fs/zfs/sys/zio.h                |   2
 usr/src/uts/common/fs/zfs/vdev.c                   |  26
 usr/src/uts/common/fs/zfs/vdev_indirect.c          | 552
 usr/src/uts/common/fs/zfs/vdev_label.c             |  28
 usr/src/uts/common/fs/zfs/vdev_mirror.c            |  29
 usr/src/uts/common/fs/zfs/vdev_removal.c           | 361
 usr/src/uts/common/fs/zfs/zio.c                    |  54
 22 files changed, 1028 insertions(+), 320 deletions(-)
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
index c6ac1d2967..15f4697b91 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
@@ -835,7 +835,6 @@ do_ptable_dcmd(pfn_t pfn, uint64_t level)
int entry;
uintptr_t pagesize;
x86pte_t pte;
- x86pte_t buf;
physaddr_t paddr;
size_t len;
@@ -899,11 +898,10 @@ found_it:
paddr = mmu_ptob((physaddr_t)pfn);
for (entry = 0; entry < mmu.ptes_per_table; ++entry) {
- len = mdb_pread(&buf, mmu.pte_size,
+ len = mdb_pread(&pte, mmu.pte_size,
paddr + entry * mmu.pte_size);
if (len != mmu.pte_size)
return (DCMD_ERR);
- pte = buf;
if (pte == 0)
continue;
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 6ce4263db8..cb02698ceb 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -3006,7 +3006,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
@@ -3022,13 +3022,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
svr->svr_allocd_segs, SM_ALLOC));
/*
- * Clear everything past what has been synced,
- * because we have not allocated mappings for it yet.
+ * Clear everything past what has been synced unless
+ * it's past the spacemap, because we have not allocated
+ * mappings for it yet.
*/
- range_tree_clear(svr->svr_allocd_segs,
- vdev_indirect_mapping_max_offset(vim),
- msp->ms_sm->sm_start + msp->ms_sm->sm_size -
- vdev_indirect_mapping_max_offset(vim));
+ uint64_t vim_max_offset =
+ vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > vim_max_offset)
+ range_tree_clear(svr->svr_allocd_segs,
+ vim_max_offset, sm_end - vim_max_offset);
}
zcb->zcb_removing_size +=
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index ff45ab193e..0b98eeef26 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -436,6 +436,7 @@ static ztest_ds_t *ztest_ds;
static kmutex_t ztest_vdev_lock;
static kmutex_t ztest_checkpoint_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
@@ -2880,7 +2881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* value. Don't bother trying to attach while we are in the middle
* of removal.
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
@@ -3055,16 +3056,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
vdev_t *vd;
uint64_t guid;
+ int error;
mutex_enter(&ztest_vdev_lock);
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Remove a random top-level vdev and wait for removal to finish.
+ */
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
guid = vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
- (void) spa_vdev_remove(spa, guid, B_FALSE);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ if (error == 0) {
+ ztest_device_removal_active = B_TRUE;
+ mutex_exit(&ztest_vdev_lock);
+
+ while (spa->spa_vdev_removal != NULL)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ } else {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+ /*
+ * The pool needs to be scrubbed after completing device removal.
+ * Failure to do so may result in checksum errors due to the
+ * strategy employed by ztest_fault_inject() when selecting which
+ * offsets are redundant and can be damaged.
+ */
+ error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+ ztest_device_removal_active = B_FALSE;
mutex_exit(&ztest_vdev_lock);
}
@@ -3203,7 +3237,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* that the metaslab_class space increased (because it decreases
* when the device removal completes).
*/
- if (spa->spa_vdev_removal != NULL) {
+ if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
@@ -4988,6 +5022,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
boolean_t islog = B_FALSE;
mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * Device removal is in progress, fault injection must be disabled
+ * until it completes and the pool is scrubbed. The fault injection
+ * strategy for damaging blocks does not take into account evacuated
+ * blocks which may have already been damaged.
+ */
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
maxfaults = MAXFAULTS();
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
@@ -5333,6 +5379,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
+ /*
+ * A scrub is already in progress as part of device removal.
+ */
+ if (ztest_device_removal_active)
+ return;
+
(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
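The ztest changes above serialize the rest of the test suite against an in-progress device removal: ztest_device_removal_active is set under ztest_vdev_lock for the whole window from spa_vdev_remove() until the post-removal scrub completes, and the attach/detach, LUN-growth, fault-injection and scrub operations bail out while it is set. A minimal sketch of the wait-then-scrub sequence, using only functions already referenced in the diff (an illustration with a hypothetical helper name, not the committed code):

static void
ztest_wait_removal_then_scrub(spa_t *spa)
{
	/* Removal is finished once the spa no longer has an svr. */
	while (spa->spa_vdev_removal != NULL)
		txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Scrub before fault injection is re-enabled, so damage done to
	 * since-evacuated offsets is repaired first.
	 */
	if (spa_scan(spa, POOL_SCAN_SCRUB) == 0) {
		while (dsl_scan_scrubbing(spa_get_dsl(spa)))
			txg_wait_synced(spa_get_dsl(spa), 0);
	}
}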
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 4378151cae..7321f419fc 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -2809,7 +2809,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
- "or pool has removing/removed vdevs"),
+ "or device removal is in progress"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 7b58c21513..9e52a46aee 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/* LINTLIBRARY */
@@ -39,6 +39,7 @@
#include <sys/dnode.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_scan.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/space_map.h>
diff --git a/usr/src/test/os-tests/doc/README b/usr/src/test/os-tests/doc/README
index e7d0b3f527..7e8f5236e8 100644
--- a/usr/src/test/os-tests/doc/README
+++ b/usr/src/test/os-tests/doc/README
@@ -36,7 +36,7 @@ and the testrunner without running a full nightly:
Then set the publisher on the test machine to point to your repository and
install the OS Unit Test Suite.
- test_machine# pkg install pkg:/system/test/zfstest
+ test_machine# pkg install pkg:/system/test/ostest
Note, the framework will be installed automatically, as the OS Unit Test Suite
depends on it.
diff --git a/usr/src/uts/common/fs/autofs/auto_subr.c b/usr/src/uts/common/fs/autofs/auto_subr.c
index c9da46b66b..4e3280dfe1 100644
--- a/usr/src/uts/common/fs/autofs/auto_subr.c
+++ b/usr/src/uts/common/fs/autofs/auto_subr.c
@@ -162,11 +162,13 @@ auto_lookup_aux(fnnode_t *fnp, char *name, cred_t *cred)
bzero(&link, sizeof (link));
error = auto_lookup_request(fnip, name, &link, TRUE, &mountreq, cred);
if (!error) {
- if (link.link != NULL || link.link != '\0') {
+ if (link.link != NULL) {
+ error = ENOENT;
/*
* This node should be a symlink
*/
- error = auto_perform_link(fnp, &link, cred);
+ if (*link.link != '\0')
+ error = auto_perform_link(fnp, &link, cred);
} else if (mountreq) {
/*
* The automount daemon is requesting a mount,
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
index 32dd58142e..8af2a5ee9d 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs.h
@@ -115,6 +115,7 @@ typedef struct smbmntinfo {
struct vfs *smi_vfsp; /* mount back pointer to vfs */
struct smbnode *smi_root; /* the root node */
struct smb_share *smi_share; /* netsmb SMB share conn data */
+ struct taskq *smi_taskq; /* for async work */
kmutex_t smi_lock; /* mutex for flags, etc. */
uint32_t smi_flags; /* NFS-derived flag bits */
uint32_t smi_status; /* status bits for this mount */
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
index af3f44d164..0122d52115 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vfsops.c
@@ -52,6 +52,7 @@
#include <sys/statvfs.h>
#include <sys/errno.h>
#include <sys/debug.h>
+#include <sys/disp.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/policy.h>
@@ -60,6 +61,7 @@
#include <sys/vfs_opreg.h>
#include <sys/mntent.h>
#include <sys/priv.h>
+#include <sys/taskq.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tndb.h>
#include <inet/ip.h>
@@ -86,6 +88,12 @@
int smbfs_default_opt_acl = 0;
/*
+ * How many taskq threads per-mount should we use.
+ * Just one is fine (until we do more async work).
+ */
+int smbfs_tq_nthread = 1;
+
+/*
* Local functions definitions.
*/
int smbfsinit(int fstyp, char *name);
@@ -633,6 +641,14 @@ smbfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
smi->smi_root = rtnp;
/*
+ * Create a taskq for async work (i.e. putpage)
+ */
+ smi->smi_taskq = taskq_create_proc("smbfs",
+ smbfs_tq_nthread, minclsyspri,
+ smbfs_tq_nthread, smbfs_tq_nthread * 2,
+ zone->zone_zsched, TASKQ_PREPOPULATE);
+
+ /*
* NFS does other stuff here too:
* async worker threads
* init kstats
@@ -701,15 +717,6 @@ smbfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
vfsp->vfs_flag |= VFS_UNMOUNTED;
/*
- * Shutdown any outstanding I/O requests on this share,
- * and force a tree disconnect. The share object will
- * continue to hang around until smb_share_rele().
- * This should also cause most active nodes to be
- * released as their operations fail with EIO.
- */
- smb_share_kill(smi->smi_share);
-
- /*
* If we hold the root VP (and we normally do)
* then it's safe to release it now.
*/
@@ -732,6 +739,21 @@ smbfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
smbfs_destroy_table(vfsp);
/*
+ * Shutdown any outstanding I/O requests on this share,
+ * and force a tree disconnect. The share object will
+ * continue to hang around until smb_share_rele().
+ * This should also cause most active nodes to be
+ * released as their operations fail with EIO.
+ */
+ smb_share_kill(smi->smi_share);
+
+ /*
+ * Any async taskq work should be giving up.
+ * Wait for those to exit.
+ */
+ taskq_destroy(smi->smi_taskq);
+
+ /*
* Delete our kstats...
*
* Doing it here, rather than waiting until
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
index 07a69b21e5..23c9f8f15d 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
@@ -58,6 +58,7 @@
#include <sys/vfs_opreg.h>
#include <sys/policy.h>
#include <sys/sdt.h>
+#include <sys/taskq_impl.h>
#include <sys/zone.h>
#include <sys/vmsystm.h>
@@ -153,7 +154,7 @@ static int smbfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
enum seg_rw, cred_t *);
static int smbfs_putapage(vnode_t *, page_t *, u_offset_t *, size_t *,
int, cred_t *);
-static void smbfs_delmap_callback(struct as *, void *, uint_t);
+static void smbfs_delmap_async(void *);
/*
* Error flags used to pass information about certain special errors
@@ -4475,6 +4476,13 @@ done:
return (error);
}
+/*
+ * This uses addmap/delmap functions to hold the SMB FID open as long as
+ * there are pages mapped in this as/seg. Increment the FID refs when
+ * the mapping count goes from zero to non-zero, and release the FID ref
+ * when the mapping count goes from non-zero to zero.
+ */
+
/* ARGSUSED */
static int
smbfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -4504,79 +4512,76 @@ smbfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
}
/*
- * Use an address space callback to flush pages dirty pages after unmap,
- * which we can't do directly in smbfs_delmap due to locking issues.
+ * Args passed to smbfs_delmap_async
*/
typedef struct smbfs_delmap_args {
- vnode_t *vp;
- cred_t *cr;
- offset_t off;
- caddr_t addr;
- size_t len;
- uint_t prot;
- uint_t maxprot;
- uint_t flags;
- boolean_t dec_fidrefs;
+ taskq_ent_t dm_tqent;
+ cred_t *dm_cr;
+ vnode_t *dm_vp;
+ offset_t dm_off;
+ caddr_t dm_addr;
+ size_t dm_len;
+ uint_t dm_prot;
+ uint_t dm_maxprot;
+ uint_t dm_flags;
+ boolean_t dm_rele_fid;
} smbfs_delmap_args_t;
+/*
+ * We use delmap not only to release the SMB FID (as described above)
+ * but also to flush dirty pages as needed. Both of those do the actual
+ * work in an async taskq job to avoid interfering with locks held
+ * in the VM layer when this is called.
+ */
+
/* ARGSUSED */
static int
smbfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uint_t prot, uint_t maxprot, uint_t flags,
cred_t *cr, caller_context_t *ct)
{
- smbnode_t *np = VTOSMB(vp);
+ smbnode_t *np = VTOSMB(vp);
+ smbmntinfo_t *smi = VTOSMI(vp);
smbfs_delmap_args_t *dmapp;
- int error;
dmapp = kmem_zalloc(sizeof (*dmapp), KM_SLEEP);
- dmapp->vp = vp;
- dmapp->off = off;
- dmapp->addr = addr;
- dmapp->len = len;
- dmapp->prot = prot;
- dmapp->maxprot = maxprot;
- dmapp->flags = flags;
- dmapp->cr = cr;
- dmapp->dec_fidrefs = B_FALSE;
+ /*
+ * The VM layer may segvn_free the seg holding this vnode
+ * before our callback has a chance run, so take a hold on
+ * the vnode here and release it in the callback.
+ * (same for the cred)
+ */
+ crhold(cr);
+ VN_HOLD(vp);
+
+ dmapp->dm_vp = vp;
+ dmapp->dm_cr = cr;
+ dmapp->dm_off = off;
+ dmapp->dm_addr = addr;
+ dmapp->dm_len = len;
+ dmapp->dm_prot = prot;
+ dmapp->dm_maxprot = maxprot;
+ dmapp->dm_flags = flags;
+ dmapp->dm_rele_fid = B_FALSE;
/*
- * When r_mapcnt returns to zero, arrange for the
- * callback to decrement n_fidrefs
+ * Go ahead and decrement r_mapcount now, which is
+ * the primary purpose of this function.
+ *
+ * When r_mapcnt goes to zero, we need to call
+ * smbfs_rele_fid, but can't do that here, so
+ * set a flag telling the async task to do it.
*/
mutex_enter(&np->r_statelock);
np->r_mapcnt -= btopr(len);
ASSERT(np->r_mapcnt >= 0);
if (np->r_mapcnt == 0)
- dmapp->dec_fidrefs = B_TRUE;
+ dmapp->dm_rele_fid = B_TRUE;
mutex_exit(&np->r_statelock);
- error = as_add_callback(as, smbfs_delmap_callback, dmapp,
- AS_UNMAP_EVENT, addr, len, KM_SLEEP);
- if (error != 0) {
- /*
- * So sad, no callback is coming. Can't flush pages
- * in delmap (as locks). Just handle n_fidrefs.
- */
- cmn_err(CE_NOTE, "smbfs_delmap(%p) "
- "as_add_callback err=%d",
- (void *)vp, error);
-
- if (dmapp->dec_fidrefs) {
- struct smb_cred scred;
-
- (void) smbfs_rw_enter_sig(&np->r_lkserlock,
- RW_WRITER, 0);
- smb_credinit(&scred, dmapp->cr);
-
- smbfs_rele_fid(np, &scred);
-
- smb_credrele(&scred);
- smbfs_rw_exit(&np->r_lkserlock);
- }
- kmem_free(dmapp, sizeof (*dmapp));
- }
+ taskq_dispatch_ent(smi->smi_taskq, smbfs_delmap_async, dmapp, 0,
+ &dmapp->dm_tqent);
return (0);
}
@@ -4587,14 +4592,16 @@ smbfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
*/
/* ARGSUSED */
static void
-smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
+smbfs_delmap_async(void *varg)
{
+ smbfs_delmap_args_t *dmapp = varg;
+ cred_t *cr;
vnode_t *vp;
smbnode_t *np;
smbmntinfo_t *smi;
- smbfs_delmap_args_t *dmapp = arg;
- vp = dmapp->vp;
+ cr = dmapp->dm_cr;
+ vp = dmapp->dm_vp;
np = VTOSMB(vp);
smi = VTOSMI(vp);
@@ -4609,7 +4616,8 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
* unmount smbfs
*/
if (vn_has_cached_data(vp) && !vn_is_readonly(vp) &&
- dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
+ dmapp->dm_flags == MAP_SHARED &&
+ (dmapp->dm_maxprot & PROT_WRITE) != 0) {
mutex_enter(&np->r_statelock);
np->r_flags |= RDIRTY;
mutex_exit(&np->r_statelock);
@@ -4618,23 +4626,23 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
* Need to finish the putpage before we
* close the OtW FID needed for I/O.
*/
- (void) smbfs_putpage(vp, dmapp->off, dmapp->len, 0,
- dmapp->cr, NULL);
+ (void) smbfs_putpage(vp, dmapp->dm_off, dmapp->dm_len, 0,
+ dmapp->dm_cr, NULL);
}
if ((np->r_flags & RDIRECTIO) || (smi->smi_flags & SMI_DIRECTIO))
- (void) smbfs_putpage(vp, dmapp->off, dmapp->len,
- B_INVAL, dmapp->cr, NULL);
+ (void) smbfs_putpage(vp, dmapp->dm_off, dmapp->dm_len,
+ B_INVAL, dmapp->dm_cr, NULL);
/*
* If r_mapcnt went to zero, drop our FID ref now.
* On the last fidref, this does an OtW close.
*/
- if (dmapp->dec_fidrefs) {
+ if (dmapp->dm_rele_fid) {
struct smb_cred scred;
(void) smbfs_rw_enter_sig(&np->r_lkserlock, RW_WRITER, 0);
- smb_credinit(&scred, dmapp->cr);
+ smb_credinit(&scred, dmapp->dm_cr);
smbfs_rele_fid(np, &scred);
@@ -4642,7 +4650,10 @@ smbfs_delmap_callback(struct as *as, void *arg, uint_t event)
smbfs_rw_exit(&np->r_lkserlock);
}
- (void) as_delete_callback(as, arg);
+ /* Release holds taken in smbfs_delmap */
+ VN_RELE(vp);
+ crfree(cr);
+
kmem_free(dmapp, sizeof (*dmapp));
}
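The smbfs_vnops.c change replaces the as_add_callback()-based flush with a job on the per-mount taskq. The key pattern: the taskq_ent_t is embedded in the argument block, so taskq_dispatch_ent() needs no allocation and cannot fail, and the vnode and cred are held before dispatch and released by the worker, since the segment may be freed before the job runs. A minimal standalone sketch of that pattern; the my_async_* names are hypothetical, while smbmntinfo_t/smi_taskq and the kernel primitives come from the diff:

typedef struct my_async_args {
	taskq_ent_t	 ma_tqent;	/* pre-allocated queue linkage */
	vnode_t		*ma_vp;
	cred_t		*ma_cr;
} my_async_args_t;

static void
my_async_worker(void *varg)
{
	my_async_args_t *ma = varg;

	/* ... do the work that could not be done under VM locks ... */

	/* Release the holds taken at dispatch time. */
	VN_RELE(ma->ma_vp);
	crfree(ma->ma_cr);
	kmem_free(ma, sizeof (*ma));
}

static void
my_async_dispatch(smbmntinfo_t *smi, vnode_t *vp, cred_t *cr)
{
	my_async_args_t *ma = kmem_zalloc(sizeof (*ma), KM_SLEEP);

	VN_HOLD(vp);
	crhold(cr);
	ma->ma_vp = vp;
	ma->ma_cr = cr;

	/* Cannot fail: the entry is embedded, so nothing is allocated here. */
	taskq_dispatch_ent(smi->smi_taskq, my_async_worker, ma, 0,
	    &ma->ma_tqent);
}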
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 0660d6fbfd..fbc1c4d08b 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -1988,7 +1988,16 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
/* if it's a resilver, this may not be in the target range */
if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ needs_io = B_TRUE;
+ } else if (DVA_GET_GANG(&bp->blk_dva[d])) {
/*
* Gang members may be spread across multiple
* vdevs, so the best estimate we have is the
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 1f5a7fbd26..b6a5665936 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -3590,7 +3590,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;
if (spa->spa_vdev_removal != NULL &&
- spa->spa_vdev_removal->svr_vdev == vd &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 163f5e054e..9185c5b182 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -5509,8 +5509,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
- tvd->vdev_ashift !=
- spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+ tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
@@ -5626,10 +5625,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
- if (spa->spa_vdev_removal != NULL ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index fe0971a720..81dded7482 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1761,9 +1761,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
- vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
}
}
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
index a29ae58610..5fcd40b6e7 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_removal.h
@@ -30,7 +30,7 @@ extern "C" {
#endif
typedef struct spa_vdev_removal {
- vdev_t *svr_vdev;
+ uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 273e5fcb0b..c12cb70906 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -552,7 +552,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- struct abd *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index e761ee4b39..71b690c123 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -867,6 +867,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c
index 304453aa94..3f2ff799b6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c
@@ -23,6 +23,7 @@
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
@@ -46,10 +47,11 @@
* "vdev_remap" operation that executes a callback on each contiguous
* segment of the new location. This function is used in multiple ways:
*
- * - reads and repair writes to this device use the callback to create
- * a child io for each mapped segment.
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
*
- * - frees and claims to this device use the callback to free or claim
+ * - frees and claims to this vdev use the callback to free or claim
* each mapped segment. (Note that we don't actually need to claim
* log blocks on indirect vdevs, because we don't allocate to
* removing vdevs. However, zdb uses zio_claim() for its leak
@@ -204,6 +206,94 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
/*
+ * If a split block contains more than this many segments, consider it too
+ * computationally expensive to check all (2^num_segments) possible
+ * combinations. Instead, try at most 2^_segments_max randomly-selected
+ * combinations.
+ *
+ * This is reasonable if only a few segment copies are damaged and the
+ * majority of segment copies are good. This allows all the segment copies to
+ * participate fairly in the reconstruction and prevents the repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_segments_max = 10;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ int is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ vdev_indirect_map_free,
+ zio_vsd_default_cksum_report
+};
+/*
* Mark the given offset and size as being obsolete.
*/
void
@@ -817,12 +907,6 @@ vdev_indirect_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
-}
-
-/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
@@ -1065,41 +1149,475 @@ vdev_indirect_child_io_done(zio_t *zio)
abd_put(zio->io_abd);
}
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
static void
-vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
ASSERT3P(vd, !=, NULL);
if (vd->vdev_ops == &vdev_indirect_ops)
return;
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
- abd_get_offset(zio->io_abd, split_offset),
- size, zio->io_type, zio->io_priority,
- 0, vdev_indirect_child_io_done, zio));
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+ * Note, we may read from a child whose DTL
+ * indicates that the data may not be present here.
+ * While this might result in a few i/os that will
+ * likely return incorrect data, it simplifies the
+ * code since we can treat scrub and resilver
+ * identically. (The incorrect data will be
+ * detected and ignored when we verify the
+ * checksum.)
+ */
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
}
static void
vdev_indirect_io_start(zio_t *zio)
{
spa_t *spa = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (zio->io_type != ZIO_TYPE_READ) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- ASSERT((zio->io_flags &
- (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
}
vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
- vdev_indirect_io_start_cb, zio);
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * Read one copy of each split segment, from the
+ * top-level vdev. Since we don't know the
+ * checksum of each split individually, the child
+ * zio can't ensure that we get the right data.
+ * E.g. if it's a mirror, it will just read from a
+ * random (healthy) leaf vdev. We have to verify
+ * the checksum in vdev_indirect_io_done().
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset),
+ is->is_size, zio->io_type,
+ zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
zio_execute(zio);
}
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = { 0 };
+ void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
+ abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+ void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+ abd_return_buf(ic->ic_data, bad_buf, is->is_size);
+ abd_return_buf(good_abd, good_buf, is->is_size);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
+
+ if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
+ flags |= ZIO_FLAG_SELF_HEAL;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (abd_cmp(good_child->ic_data, ic->ic_data,
+ is->is_size) == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ }
+ }
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually. We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly. (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ uint64_t attempts = 0;
+ uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+ int segments = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ segments++;
+
+ for (;;) {
+ /* copy data from splits to main zio */
+ int ret;
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ /*
+ * If this child failed, its ic_data will be NULL.
+ * Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_data == NULL) {
+ ret = EIO;
+ goto next;
+ }
+
+ abd_copy_off(zio->io_abd,
+ is->is_child[is->is_good_child].ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ /* See if this checksum matches. */
+ zio_bad_cksum_t zbc;
+ ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ /* Found a matching checksum. Issue repair i/os. */
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * Checksum failed; try a different combination of split
+ * children.
+ */
+ boolean_t more;
+next:
+ more = B_FALSE;
+ if (segments <= zfs_reconstruct_indirect_segments_max) {
+ /*
+ * There are relatively few segments, so
+ * deterministically check all combinations. We do
+ * this by adding one to the first split's
+ * good_child. If it overflows, then "carry over" to
+ * the next split (like counting in base is_children,
+ * but each digit can have a different base).
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child++;
+ if (is->is_good_child < is->is_children) {
+ more = B_TRUE;
+ break;
+ }
+ is->is_good_child = 0;
+ }
+ } else if (++attempts < attempts_max) {
+ /*
+ * There are too many combinations to try all of them
+ * in a reasonable amount of time, so try a fixed
+ * number of random combinations, after which we'll
+ * consider the block unrecoverable.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child =
+ spa_get_random(is->is_children);
+ }
+ more = B_TRUE;
+ }
+ if (!more) {
+ /* All combinations failed. */
+ zio->io_error = ret;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+ }
+}
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
vdev_ops_t vdev_indirect_ops = {
vdev_indirect_open,
vdev_indirect_close,
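The reconstruction comment in vdev_indirect.c describes enumerating combinations of split-segment copies the way one counts in a mixed-radix number: digit i runs from 0 to is_children - 1 of split i, and the low digit varies fastest. A standalone sketch of that counter in array form (the committed code walks the iv_splits list instead; this is only an illustration):

#include <stdbool.h>

/*
 * Advance choice[] to the next combination; return false once every
 * combination has been visited (the counter wraps back to all zeros).
 */
static bool
next_combination(int *choice, const int *base, int nsegments)
{
	for (int i = 0; i < nsegments; i++) {
		if (++choice[i] < base[i])
			return (true);	/* no carry; this is a new combination */
		choice[i] = 0;		/* digit overflowed; carry into the next */
	}
	return (false);
}

With bases {2, 2, 2} this visits exactly the eight rows of the table in the comment above; a 3-way mirror simply gives its digit base 3.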
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index d8a0762c42..d906860346 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -1117,10 +1117,13 @@ vdev_uberblock_sync_done(zio_t *zio)
* Write the uberblock to all labels of all leaves of the specified vdev.
*/
static void
-vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
{
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1138,7 +1141,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, zio->io_private,
+ vdev_uberblock_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
abd_free(ub_abd);
@@ -1152,10 +1155,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
zio_t *zio;
uint64_t good_writes = 0;
- zio = zio_root(spa, NULL, &good_writes, flags);
+ zio = zio_root(spa, NULL, NULL, flags);
for (int v = 0; v < svdcount; v++)
- vdev_uberblock_sync(zio, ub, svd[v], flags);
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
(void) zio_wait(zio);
@@ -1216,7 +1219,8 @@ vdev_label_sync_ignore_done(zio_t *zio)
* Write all even or odd labels to all leaves of the specified vdev.
*/
static void
-vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
@@ -1224,8 +1228,10 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
char *buf;
size_t buflen;
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1250,7 +1256,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
- vdev_label_sync_done, zio->io_private,
+ vdev_label_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
}
}
@@ -1282,7 +1288,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
- vdev_label_sync(vio, vd, l, txg, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
zio_nowait(vio);
}
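With issue 9192, vdev_uberblock_sync() and vdev_label_sync() take the good_writes counter as an explicit argument threaded through the recursion instead of reaching into the parent zio's io_private; each leaf write's done callback then increments the counter on success. A sketch of that counting side, with a hypothetical callback name (the real callbacks are vdev_uberblock_sync_done() and vdev_label_sync_done()):

static void
leaf_label_write_done(zio_t *zio)
{
	uint64_t *good_writes = zio->io_private;

	/* Only successful writes count toward the caller's success check. */
	if (zio->io_error == 0 && good_writes != NULL)
		atomic_inc_64(good_writes);
}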
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 9d181a874e..34a750fe4d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -345,12 +345,15 @@ vdev_mirror_io_start(zio_t *zio)
}
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -502,7 +505,21 @@ vdev_mirror_io_done(zio_t *zio)
if (mc->mc_error == 0) {
if (mc->mc_tried)
continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect vdev (in which case
+ * it could point to any other vdev, which
+ * might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c
index d00b5b35f7..fc613ff58a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_removal.c
+++ b/usr/src/uts/common/fs/zfs/vdev_removal.c
@@ -83,18 +83,12 @@ typedef struct vdev_copy_arg {
kmutex_t vca_lock;
} vdev_copy_arg_t;
-typedef struct vdev_copy_seg_arg {
- vdev_copy_arg_t *vcsa_copy_arg;
- uint64_t vcsa_txg;
- dva_t *vcsa_dest_dva;
- blkptr_t *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
/*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
@@ -176,7 +170,7 @@ spa_vdev_removal_create(vdev_t *vd)
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev = vd;
+ svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -218,9 +212,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
- vdev_t *vd = arg;
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
spa_vdev_removal_t *svr = NULL;
uint64_t txg = dmu_tx_get_txg(tx);
@@ -342,7 +337,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(spa->spa_vdev_removal, ==, NULL);
spa->spa_vdev_removal = svr;
svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
/*
@@ -383,21 +378,24 @@ spa_remove_init(spa_t *spa)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_t *vd = vdev_lookup_top(spa,
spa->spa_removing_phys.sr_removing_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- if (vd == NULL)
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
return (EINVAL);
+ }
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
ASSERT(vdev_is_concrete(vd));
spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT(svr->svr_vdev->vdev_removing);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
spa->spa_meta_objset, vic->vic_mapping_object);
vd->vdev_indirect_births = vdev_indirect_births_open(
spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
spa->spa_vdev_removal = svr;
}
@@ -450,15 +448,8 @@ spa_restart_removal(spa_t *spa)
if (!spa_writeable(spa))
return;
- vdev_t *vd = svr->svr_vdev;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_removing);
-
- zfs_dbgmsg("restarting removal of %llu at count=%llu",
- vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
0, &p0, TS_RUN, minclsyspri);
}
@@ -479,7 +470,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
- ASSERT3P(vd, ==, svr->svr_vdev);
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
mutex_enter(&svr->svr_lock);
@@ -662,7 +653,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
if (state == DSS_FINISHED) {
spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
@@ -705,7 +696,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
uint64_t txg = dmu_tx_get_txg(tx);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -733,64 +724,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
spa_sync_removing_state(spa, tx);
}
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_nullzio_done(zio_t *zio)
+{
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
- spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ vdev_copy_arg_t *vca = zio->io_private;
+
abd_free(zio->io_abd);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
-
- ASSERT0(zio->io_error);
- kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
- kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- dva_t *dest_dva = vcsa->vcsa_dest_dva;
- uint64_t txg = vcsa->vcsa_txg;
- spa_t *spa = zio->io_spa;
- vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva));
- blkptr_t *bp = NULL;
- dva_t *dva = NULL;
- uint64_t size = zio->io_size;
-
- ASSERT3P(dest_vd, !=, NULL);
- ASSERT0(zio->io_error);
-
- vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- bp = vcsa->vcsa_dest_bp;
- dva = bp->blk_dva;
-
- BP_ZERO(bp);
-
- /* initialize with dest_dva */
- bcopy(dest_dva, dva, sizeof (dva_t));
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
- txg, bp, zio->io_abd, size,
- spa_vdev_copy_segment_write_done, vcsa,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ zio_nowait(zio_unique_parent(zio));
}
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children).
+ */
+ source_child_vd =
+ source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
+}
+
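
A worked example of the wrap-around above, with hypothetical child counts: copying a 2-way source mirror onto a 3-way destination mirror issues one write per destination child, and the read feeding destination child 2 wraps back to source child 0, so one source side is read twice rather than a copy being dropped:

/*
 * Sketch: source child chosen for each destination child when both
 * vdevs are mirrors (dest_id % source children, as above).
 *
 *	dest_id 0 -> 0 % 2 = source child 0
 *	dest_id 1 -> 1 % 2 = source child 1
 *	dest_id 2 -> 2 % 2 = source child 0	(read twice)
 */
static vdev_t *
copy_source_child(vdev_t *source_vd, int dest_id)
{
	return (source_vd->vdev_child[dest_id % source_vd->vdev_children]);
}
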
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
static int
spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
@@ -799,10 +854,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_entry_t *entry;
- vdev_copy_seg_arg_t *private;
dva_t dst = { 0 };
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
@@ -826,51 +878,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
*/
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
entry->vime_mapping.vimep_dst = dst;
- private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
- private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- private->vcsa_txg = txg;
- private->vcsa_copy_arg = vca;
-
/*
- * This lock is eventually released by the donefunc for the
- * zio_write_phys that finishes copying the data.
+ * See comment before spa_vdev_copy_one_child().
*/
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- /*
- * Do logical I/O, letting the redundancy vdevs (like mirror)
- * handle their own I/O instead of duplicating that code here.
- */
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], start);
- DVA_SET_GANG(&dva[0], 0);
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
- bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_vdev_copy_segment_read_done, private,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_nullzio_done, NULL, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
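
The null zio above is hung off spa->spa_txg_zio[txg & TXG_MASK], the per-txg root zio, so spa_sync() for that txg will not complete until all of this segment's reads and writes do. A minimal sketch of the ring indexing, assuming the usual ZFS txg.h definitions (TXG_SIZE is a small power of two and TXG_MASK is TXG_SIZE - 1); the helper is hypothetical:

/*
 * Sketch: a monotonically increasing txg number is mapped onto the
 * small per-spa array of root zios with a power-of-two mask.
 */
static zio_t *
txg_root_zio(spa_t *spa, uint64_t txg)
{
	return (spa->spa_txg_zio[txg & TXG_MASK]);
}
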
@@ -888,8 +917,8 @@ static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
- vdev_t *vd = svr->svr_vdev;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
@@ -918,37 +947,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
}
static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
- ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
- ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
- ASSERT(vd->vdev_indirect_mapping != NULL);
- ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
- vd->vdev_indirect_mapping = NULL;
-
- ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
- ASSERT(vd->vdev_indirect_births != NULL);
- ivd->vdev_indirect_births = vd->vdev_indirect_births;
- vd->vdev_indirect_births = NULL;
-
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
- ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
- /*
- * We cannot use space_map_{open,close} because we hold all
- * the config locks as writer.
- */
- ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
- ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
- vd->vdev_obsolete_sm = NULL;
- }
-}
-
-static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
@@ -983,17 +981,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
vd->vdev_leaf_zap = 0;
vdev_remove_child(ivd, vd);
vdev_compact_children(ivd);
- vdev_indirect_state_transfer(ivd, vd);
-
- svr->svr_vdev = ivd;
-
- ASSERT(!ivd->vdev_removing);
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1016,9 +1010,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
* context by the removal thread after we have copied all vdev's data.
*/
static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
{
- spa_t *spa = vd->vdev_spa;
uint64_t txg;
/*
@@ -1026,8 +1019,12 @@ vdev_remove_complete(vdev_t *vd)
* vdev_metaslab_fini()
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
-
txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
vd->vdev_id, txg);
@@ -1047,6 +1044,10 @@ vdev_remove_complete(vdev_t *vd)
/*
* We now release the locks, allowing spa_sync to run and finish the
* removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
*/
(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -1068,6 +1069,8 @@ vdev_remove_complete(vdev_t *vd)
*/
vdev_config_dirty(spa->spa_root_vdev);
(void) spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_event_post(ev);
}
/*
@@ -1078,7 +1081,7 @@ vdev_remove_complete(vdev_t *vd)
* this size again this txg.
*/
static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
uint64_t *max_alloc, dmu_tx_t *tx)
{
uint64_t txg = dmu_tx_get_txg(tx);
@@ -1117,7 +1120,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
while (length > 0) {
uint64_t mylen = MIN(length, thismax);
- int error = spa_vdev_copy_segment(svr->svr_vdev,
+ int error = spa_vdev_copy_segment(vd,
offset, mylen, txg, vca, &zal);
if (error == ENOSPC) {
@@ -1175,12 +1178,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
static void
spa_vdev_remove_thread(void *arg)
{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
uint64_t max_alloc = zfs_remove_max_segment;
uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
@@ -1188,7 +1193,6 @@ spa_vdev_remove_thread(void *arg)
ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_removing);
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(svr->svr_vdev, ==, vd);
ASSERT(vim != NULL);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1270,6 +1274,17 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
/*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
* This delay will pause the removal around the point
* specified by zfs_remove_max_bytes_pause. We do this
* solely from the test suite or during debugging.
@@ -1295,11 +1310,19 @@ spa_vdev_remove_thread(void *arg)
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);
+ /*
+			 * Reacquire the spa config lock (SCL_CONFIG). The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
if (txg != last_txg)
max_alloc = zfs_remove_max_segment;
last_txg = txg;
- spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
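
The two hunks above (the spa_config_exit() before dmu_tx_assign() and the spa_config_enter()/vdev_lookup_top() after it) form one pattern; a condensed sketch, using a hypothetical helper and eliding tx creation:

/*
 * Sketch: never hold the config lock across dmu_tx_assign(TXG_WAIT),
 * and re-resolve the vdev_t afterwards because it may have been
 * replaced by a concurrent attach/detach while the lock was dropped.
 */
static uint64_t
remove_thread_assign_tx(spa_t *spa, spa_vdev_removal_t *svr, dmu_tx_t *tx,
    vdev_t **vdp)
{
	spa_config_exit(spa, SCL_CONFIG, FTAG);		/* let writers in */
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));		/* may wait for a sync */
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	*vdp = vdev_lookup_top(spa, svr->svr_vdev_id);
	return (txg);
}
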
@@ -1307,6 +1330,9 @@ spa_vdev_remove_thread(void *arg)
}
mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
/*
* Wait for all copies to finish before cleaning up the vca.
*/
@@ -1324,7 +1350,7 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(vd);
+ vdev_remove_complete(spa);
}
}
@@ -1365,7 +1391,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
objset_t *mos = spa->spa_meta_objset;
@@ -1438,8 +1464,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* because we have not allocated mappings for it yet.
*/
uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- range_tree_clear(svr->svr_allocd_segs, syncd,
- msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
mutex_exit(&svr->svr_lock);
}
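
The new guard matters because the operands are unsigned: vdev_indirect_mapping_max_offset() can already lie past the end of this metaslab's space map, and the old expression would then wrap to an enormous length. A standalone illustration with made-up values:

/*
 * Sketch: uint64_t subtraction wraps instead of going negative.
 */
uint64_t sm_end = 0x1000;
uint64_t syncd  = 0x1800;		/* already synced past this metaslab */
uint64_t bogus  = sm_end - syncd;	/* wraps to 0xfffffffffffff800 */

/* The fix: only clear when something actually remains to be cleared. */
if (sm_end > syncd)
	range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd);
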
@@ -1500,7 +1529,7 @@ spa_vdev_remove_cancel(spa_t *spa)
if (spa->spa_vdev_removal == NULL)
return (ENOTACTIVE);
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0,
@@ -1811,7 +1840,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
vdev_remove_initiate_sync,
- vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return (0);
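
With the removal now tracked by vdev id instead of by pointer, the id is smuggled through the sync task's void *arg via uintptr_t. The receiving side is not shown in this diff; a sketch of what vdev_remove_initiate_sync would do with the argument, assumed from the calling convention above:

/*
 * Sketch (assumed shape): recover the id from the opaque argument and
 * re-resolve the vdev_t in syncing context.
 */
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* ... set up spa->spa_vdev_removal for vd ... */
}
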
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index f2c511ef77..2390a1ee90 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -1075,17 +1075,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
done != NULL);
- /*
- * In the common case, where the parent zio was to a normal vdev,
- * the child zio must be to a child vdev of that vdev. Otherwise,
- * the child zio must be to a top-level vdev.
- */
- if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
- ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
- } else {
- ASSERT3P(vd, ==, vd->vdev_top);
- }
-
if (type == ZIO_TYPE_READ && bp != NULL) {
/*
* If we have the bp, then the child should perform the
@@ -1145,7 +1134,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- int type, zio_priority_t priority, enum zio_flag flags,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -3097,9 +3086,13 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(spa->spa_trust_config);
if (zio->io_vd->vdev_removing) {
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_INDUCE_DAMAGE));
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
}
}
@@ -3160,18 +3153,37 @@ zio_vdev_io_start(zio_t *zio)
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
- * This prevents spurious resilvering with nested replication.
- * For example, given a mirror of mirrors, (A+B)+(C+D), if only
- * A is out of date, we'll read from C+D, then use the data to
- * resilver A+B -- but we don't actually want to resilver B, just A.
- * The top-level mirror has no way to know this, so instead we just
- * discard unnecessary repairs as we work our way down the vdev tree.
- * The same logic applies to any form of nested replication:
- * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);