summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith M Wesolowski <wesolows@foobazco.org>2013-08-07 22:53:10 +0000
committerKeith M Wesolowski <wesolows@foobazco.org>2013-08-07 22:53:13 +0000
commit8d105dfdc34afdf1754018fe761746fa9c7ddb29 (patch)
treef2ae1738ba44bd9942a8f94a9a0e965e327ef02b
parent1bfc1ecf10a7a4ecdd3e74fedc5fea9a61f9bd57 (diff)
parentb4952e17e8858d3225793b28788278de9fe6038d (diff)
downloadillumos-joyent-8d105dfdc34afdf1754018fe761746fa9c7ddb29.tar.gz
[illumos-gate merge]
commit b4952e17e8858d3225793b28788278de9fe6038d 3956 ::vdev -r should work with pipelines 3957 ztest should update the cachefile before killing itself 3958 multiple scans can lead to partial resilvering 3959 ddt entries are not always resilvered 3960 dsl_scan can skip over dedup-ed blocks if physical birth != logical birth 3961 freed gang blocks are not resilvered and can cause pool to suspend 3962 ztest should print out zfs debug buffer before exiting commit be9000cc677e0a8d04e5be45c61d7370fc8c7b54 3955 ztest failure: assertion refcount_count(&tx->tx_space_written) + delta <= tx->tx_space_towrite commit 2c1e2b44148432fb7a509dd216a99299b6740250 3949 ztest fault injection should avoid resilvering devices 3950 ztest: deadman fires when we're doing a scan 3951 ztest hang when running dedup test 3952 ztest: ztest_reguid test and ztest_fault_inject don't place nice together commit 98144673ce45bddc6d5dbe7e2afab720c660b5d7 3992 mdb ::stacks segv
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/findstack.c5
-rw-r--r--usr/src/cmd/mdb/common/modules/zfs/zfs.c2
-rw-r--r--usr/src/cmd/ztest/ztest.c74
-rw-r--r--usr/src/lib/libzpool/common/llib-lzpool1
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c4
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scan.c21
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c7
-rw-r--r--usr/src/uts/common/fs/zfs/spa_config.c9
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_scan.h31
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h4
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_debug.h3
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c100
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c5
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_label.c107
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_debug.c15
-rw-r--r--usr/src/uts/common/sys/fs/zfs.h4
16 files changed, 289 insertions, 103 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/findstack.c b/usr/src/cmd/mdb/common/modules/genunix/findstack.c
index 223764eda9..12b90b8f7d 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/findstack.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/findstack.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
*/
#include <mdb/mdb_modapi.h>
@@ -340,6 +341,8 @@ stacks_cleanup(int force)
mdb_free(stacks_array,
stacks_array_size * sizeof (*stacks_array));
+ mdb_free(stacks_hash, STACKS_HSIZE * sizeof (*stacks_hash));
+
} else if (stacks_array != NULL) {
for (idx = 0; idx < stacks_array_size; idx++) {
if ((cur = stacks_array[idx]) != NULL) {
@@ -360,6 +363,8 @@ stacks_cleanup(int force)
stacks_array_size = 0;
stacks_state = STACKS_STATE_CLEAN;
+ stacks_hash = NULL;
+ stacks_array = NULL;
}
/*ARGSUSED*/
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index b04b05d75a..b3cf1d5f0b 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -1140,7 +1140,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int stats,
}
if (flags & DCMD_PIPE_OUT) {
- mdb_printf("%#lr", addr);
+ mdb_printf("%#lr\n", addr);
} else {
if (vdev.vdev_path != NULL) {
if (mdb_readstr(desc, sizeof (desc),
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 3aca1fe0c9..390061a8c5 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@@ -184,6 +184,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
+extern uint64_t zfs_deadman_synctime;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -363,7 +364,7 @@ ztest_info_t ztest_info[] = {
{ ztest_fault_inject, 1, &zopt_sometimes },
{ ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
- { ztest_reguid, 1, &zopt_sometimes },
+ { ztest_reguid, 1, &zopt_rarely },
{ ztest_spa_rename, 1, &zopt_rarely },
{ ztest_scrub, 1, &zopt_rarely },
{ ztest_spa_upgrade, 1, &zopt_rarely },
@@ -766,6 +767,16 @@ ztest_kill(ztest_shared_t *zs)
{
zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
+
+ /*
+ * Before we kill off ztest, make sure that the config is updated.
+ * See comment above spa_config_sync().
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
+ mutex_exit(&spa_namespace_lock);
+
+ zfs_dbgmsg_print(FTAG);
(void) kill(getpid(), SIGKILL);
}
@@ -2730,7 +2741,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
uint64_t leaf, top;
uint64_t ashift = ztest_get_ashift();
uint64_t oldguid, pguid;
- size_t oldsize, newsize;
+ uint64_t oldsize, newsize;
char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
int replacing;
int oldvd_has_siblings = B_FALSE;
@@ -2889,8 +2900,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",
- oldpath, (longlong_t)oldsize, newpath,
- (longlong_t)newsize, replacing, error, expected_error);
+ oldpath, oldsize, newpath,
+ newsize, replacing, error, expected_error);
}
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
@@ -3604,6 +3615,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
else
dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+ /* This accounts for setting the checksum/compression. */
+ dmu_tx_hold_bonus(tx, bigobj);
+
txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
if (txg == 0) {
umem_free(packbuf, packsize);
@@ -4754,6 +4768,14 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
ASSERT(leaves >= 1);
/*
+ * Grab the name lock as reader. There are some operations
+ * which don't like to have their vdevs changed while
+ * they are in progress (i.e. spa_change_guid). Those
+ * operations will have grabbed the name lock as writer.
+ */
+ (void) rw_rdlock(&ztest_name_lock);
+
+ /*
* We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
@@ -4782,7 +4804,14 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
if (vd0 != NULL && vd0->vdev_top->vdev_islog)
islog = B_TRUE;
- if (vd0 != NULL && maxfaults != 1) {
+ /*
+ * If the top-level vdev needs to be resilvered
+ * then we only allow faults on the device that is
+ * resilvering.
+ */
+ if (vd0 != NULL && maxfaults != 1 &&
+ (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+ vd0->vdev_resilver_txg != 0)) {
/*
* Make vd0 explicitly claim to be unreadable,
* or unwriteable, or reach behind its back
@@ -4813,6 +4842,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
if (sav->sav_count == 0) {
spa_config_exit(spa, SCL_STATE, FTAG);
+ (void) rw_unlock(&ztest_name_lock);
return;
}
vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
@@ -4826,6 +4856,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
}
spa_config_exit(spa, SCL_STATE, FTAG);
+ (void) rw_unlock(&ztest_name_lock);
/*
* If we can tolerate two or more faults, or we're dealing
@@ -5290,16 +5321,33 @@ static void *
ztest_deadman_thread(void *arg)
{
ztest_shared_t *zs = arg;
- int grace = 300;
- hrtime_t delta;
+ spa_t *spa = ztest_spa;
+ hrtime_t delta, total = 0;
- delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+ for (;;) {
+ delta = (zs->zs_thread_stop - zs->zs_thread_start) /
+ NANOSEC + zfs_deadman_synctime;
- (void) poll(NULL, 0, (int)(1000 * delta));
+ (void) poll(NULL, 0, (int)(1000 * delta));
- fatal(0, "failed to complete within %d seconds of deadline", grace);
+ /*
+ * If the pool is suspended then fail immediately. Otherwise,
+ * check to see if the pool is making any progress. If
+ * vdev_deadman() discovers that there hasn't been any recent
+ * I/Os then it will end up aborting the tests.
+ */
+ if (spa_suspended(spa)) {
+ fatal(0, "aborting test after %llu seconds because "
+ "pool has transitioned to a suspended state.",
+ zfs_deadman_synctime);
+ return (NULL);
+ }
+ vdev_deadman(spa->spa_root_vdev);
- return (NULL);
+ total += zfs_deadman_synctime;
+ (void) printf("ztest has been running for %lld seconds\n",
+ total);
+ }
}
static void
@@ -5613,6 +5661,7 @@ ztest_run(ztest_shared_t *zs)
zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+ zfs_dbgmsg_print(FTAG);
umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
@@ -6024,6 +6073,7 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
+ zfs_deadman_synctime = 300;
ztest_fd_rand = open("/dev/urandom", O_RDONLY);
ASSERT3S(ztest_fd_rand, >=, 0);
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index d3864d2a9a..7e61b55a91 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -64,3 +64,4 @@
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern boolean_t zfeature_checks_disable;
+extern uint64_t zfs_deadman_synctime;
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 84e518dfd1..d0dd730ed4 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -450,12 +450,12 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkid = off >> dn->dn_datablkshift;
nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_maxblkid) {
+ if (blkid > dn->dn_maxblkid) {
rw_exit(&dn->dn_struct_rwlock);
return;
}
if (blkid + nblks > dn->dn_maxblkid)
- nblks = dn->dn_maxblkid - blkid;
+ nblks = dn->dn_maxblkid - blkid + 1;
}
l0span = nblks; /* save for later use to calc level > 1 overhead */
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 34816986a1..e1859cc34c 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -194,6 +194,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
scn->scn_phys.scn_errors = 0;
scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
scn->scn_restart_txg = 0;
+ scn->scn_done_txg = 0;
spa_scan_stat_init(spa);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -769,7 +770,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
* Don't scan it now unless we need to because something
* under it was modified.
*/
- if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
}
if (buf)
@@ -1214,7 +1215,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);
@@ -1457,6 +1458,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
+ if (scn->scn_done_txg == tx->tx_txg) {
+ ASSERT(!scn->scn_pausing);
+ /* finished with scan. */
+ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
+ dsl_scan_done(scn, B_TRUE, tx);
+ ASSERT3U(spa->spa_scrub_inflight, ==, 0);
+ dsl_scan_sync_state(scn, tx);
+ return;
+ }
+
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1492,9 +1503,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
if (!scn->scn_pausing) {
- /* finished with scan. */
- zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
- dsl_scan_done(scn, B_TRUE, tx);
+ scn->scn_done_txg = tx->tx_txg + 1;
+ zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
+ tx->tx_txg, scn->scn_done_txg);
}
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 7334d39516..13eaaecbf7 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -757,6 +757,7 @@ spa_change_guid(spa_t *spa)
int error;
uint64_t guid;
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
@@ -769,6 +770,7 @@ spa_change_guid(spa_t *spa)
}
mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
@@ -4444,7 +4446,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
/* mark the device being resilvered */
- newvd->vdev_resilvering = B_TRUE;
+ newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
@@ -4674,7 +4676,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
if (pvd->vdev_ops == &vdev_spare_ops)
cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
- cvd->vdev_resilvering = B_FALSE;
}
@@ -5391,6 +5392,8 @@ spa_vdev_resilver_done(spa_t *spa)
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index d97fc32fbf..47bb595908 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -198,7 +198,12 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
/*
* Synchronize pool configuration to disk. This must be called with the
- * namespace lock held.
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicity import
+ * would be required.
*/
void
spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
index aae8c312af..bf8c5ac824 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
@@ -72,11 +72,42 @@ typedef enum dsl_scan_flags {
DSF_VISIT_DS_AGAIN = 1<<0,
} dsl_scan_flags_t;
+/*
+ * Every pool will have one dsl_scan_t and this structure will contain
+ * in-memory information about the scan and a pointer to the on-disk
+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
+ * is contained on-disk to allow the scan to resume in the event of a reboot
+ * or panic. This structure maintains information about the behavior of a
+ * running scan, some caching information, and how it should traverse the pool.
+ *
+ * The following members of this structure direct the behavior of the scan:
+ *
+ * scn_pausing - a scan that cannot be completed in a single txg or
+ * has exceeded its allotted time will need to pause.
+ * When this flag is set the scanner will stop traversing
+ * the pool and write out the current state to disk.
+ *
+ * scn_restart_txg - directs the scanner to either restart or start a
+ * a scan at the specified txg value.
+ *
+ * scn_done_txg - when a scan completes its traversal it will set
+ * the completion txg to the next txg. This is necessary
+ * to ensure that any blocks that were freed during
+ * the scan but have not yet been processed (i.e deferred
+ * frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees which are
+ * a special kind of traversal. Deferred free can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
boolean_t scn_pausing;
uint64_t scn_restart_txg;
+ uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
zio_t *scn_zio_root;
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 02e3e838c3..9f68ac2968 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -174,7 +174,7 @@ struct vdev {
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
- uint64_t vdev_resilvering; /* persistent resilvering state */
+ uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
index 14eb2abdc1..c4dcfaec65 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@@ -77,6 +77,7 @@ typedef struct zfs_dbgmsg {
extern void zfs_dbgmsg_init(void);
extern void zfs_dbgmsg_fini(void);
extern void zfs_dbgmsg(const char *fmt, ...);
+extern void zfs_dbgmsg_print(const char *tag);
#ifndef _KERNEL
extern int dprintf_find_string(const char *string);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 7a409bd7ed..dc5eb627f1 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -521,8 +521,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
- &vd->vdev_resilvering);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ &vd->vdev_resilver_txg);
/*
* When importing a pool, we want to ignore the persistent fault
@@ -1663,6 +1663,75 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
}
/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
+ return (ss->ss_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
+ return (ss->ss_end);
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ ASSERT0(scn->scn_phys.scn_errors);
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_resilver_txg == 0 ||
+ vd->vdev_dtl[DTL_MISSING].sm_space == 0)
+ return (B_TRUE);
+
+ /*
+ * When a resilver is initiated the scan will assign the scn_max_txg
+ * value to the highest txg value that exists in all DTLs. If this
+ * device's max DTL is not part of this scan (i.e. it is not in
+ * the range (scn_min_txg, scn_max_txg] then it is not eligible
+ * for excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
* Reassess DTLs after a config change or scrub completion.
*/
void
@@ -1685,9 +1754,17 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If we've completed a scan cleanly then determine
+ * if this vdev should remove any DTLs. We only want to
+ * excise regions on vdevs that were available during
+ * the entire duration of this scan.
+ */
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
- (scn && scn->scn_phys.scn_errors == 0))) {
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+ vdev_dtl_should_excise(vd)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -1726,6 +1803,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
else
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering and no longer has any
+ * DTLs then reset its resilvering flag.
+ */
+ if (vd->vdev_resilver_txg != 0 &&
+ vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
+ vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
+ vd->vdev_resilver_txg = 0;
+
mutex_exit(&vd->vdev_dtl_lock);
if (txg != 0)
@@ -1902,12 +1989,9 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
mutex_enter(&vd->vdev_dtl_lock);
if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
vdev_writeable(vd)) {
- space_seg_t *ss;
- ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
- thismin = ss->ss_start - 1;
- ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
- thismax = ss->ss_end;
+ thismin = vdev_dtl_min(vd);
+ thismax = vdev_dtl_max(vd);
needed = B_TRUE;
}
mutex_exit(&vd->vdev_dtl_lock);
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index 2e67544345..a05abeb9d9 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -185,7 +185,6 @@ vdev_file_io_strategy(void *arg)
static int
vdev_file_io_start(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
vdev_t *vd = zio->io_vd;
vdev_file_t *vf = vd->vdev_tsd;
vdev_buf_t *vb;
@@ -224,8 +223,8 @@ vdev_file_io_start(zio_t *zio)
bp->b_private = vf->vf_vnode;
bp->b_iodone = (int (*)())vdev_file_io_intr;
- spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
- vdev_file_io_strategy, bp, 0, &zio->io_tqent);
+ VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
+ TQ_SLEEP), !=, 0);
return (ZIO_PIPELINE_STOP);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 904918c3a4..ee9921f94a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -216,30 +216,25 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
{
nvlist_t *nv = NULL;
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ nv = fnvlist_alloc();
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- vd->vdev_ops->vdev_op_type) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
- == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
if (vd->vdev_path != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
- vd->vdev_path) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
if (vd->vdev_devid != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
- vd->vdev_devid) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
if (vd->vdev_physpath != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- vd->vdev_physpath) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
if (vd->vdev_fru != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
- vd->vdev_fru) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
@@ -260,59 +255,54 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
* that only support a single parity device -- older software
* will just ignore it.
*/
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
- vd->vdev_nparity) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
if (vd->vdev_not_present)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
- vd->vdev_ashift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
- vd->vdev_islog) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
if (vd->vdev_removing)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
- vd->vdev_removing) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
}
if (vd->vdev_dtl_smo.smo_object != 0)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- vd->vdev_dtl_smo.smo_object) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ vd->vdev_dtl_smo.smo_object);
if (vd->vdev_crtxg)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
- vd->vdev_crtxg) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
if (getstats) {
vdev_stat_t vs;
pool_scan_stat_t ps;
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
/* provide either current or previous scan information */
if (spa_scan_get_stats(spa, &ps) == 0) {
- VERIFY(nvlist_add_uint64_array(nv,
+ fnvlist_add_uint64_array(nv,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
- sizeof (pool_scan_stat_t) / sizeof (uint64_t))
- == 0);
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
}
}
@@ -342,8 +332,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
}
if (idx) {
- VERIFY(nvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
}
for (c = 0; c < idx; c++)
@@ -355,26 +345,20 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
const char *aux = NULL;
if (vd->vdev_offline && !vd->vdev_tmpoffline)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- B_TRUE) == 0);
- if (vd->vdev_resilvering)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
if (vd->vdev_faulted)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
if (vd->vdev_removed)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
if (vd->vdev_unspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
if (vd->vdev_ishole)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
switch (vd->vdev_stat.vs_aux) {
case VDEV_AUX_ERR_EXCEEDED:
@@ -387,12 +371,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
}
if (aux != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
- aux) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
- vd->vdev_orig_guid) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
}
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c
index 44824e15a0..26ea561eb1 100644
--- a/usr/src/uts/common/fs/zfs/zfs_debug.c
+++ b/usr/src/uts/common/fs/zfs/zfs_debug.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -94,3 +94,16 @@ zfs_dbgmsg(const char *fmt, ...)
}
mutex_exit(&zfs_dbgmsgs_lock);
}
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 6be8aee742..f6eb9aa490 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
@@ -522,7 +522,7 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
#define ZPOOL_CONFIG_REMOVING "removing"
-#define ZPOOL_CONFIG_RESILVERING "resilvering"
+#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
#define ZPOOL_CONFIG_COMMENT "comment"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */