summary | refs | log | tree | commit | diff
path: root/usr/src
diff options
context:
space:
mode:
author: George Wilson <George.Wilson@Sun.COM> 2009-09-29 07:29:35 -0700
committer: George Wilson <George.Wilson@Sun.COM> 2009-09-29 07:29:35 -0700
commit: 8f18d1fadf6a0c20fac9ff7259a5368faa3c3bfb (patch)
tree: a4cf4175bd17bcccaad3442e7d46e3809c1eeade /usr/src
parent: 6033a7923719b64e630b12cfb0a4fc4c3a2c9e23 (diff)
download: illumos-joyent-8f18d1fadf6a0c20fac9ff7259a5368faa3c3bfb.tar.gz
6846163 ZFS continues to use faulted logzilla, bringing system to a crawl
6872547 ztest LUN expansion test fails
6873635 zdb should be able to open a pool with a failed slog
6873654 system panics when a slog device is offlined
6875236 zdb should be able to dump the spa history
Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/cmd/zdb/zdb.c 87
-rw-r--r-- usr/src/cmd/zinject/zinject.c 63
-rw-r--r-- usr/src/cmd/zpool/zpool_main.c 43
-rw-r--r-- usr/src/cmd/ztest/ztest.c 61
-rw-r--r-- usr/src/lib/libzfs/common/libzfs.h 4
-rw-r--r-- usr/src/lib/libzfs/common/libzfs_pool.c 45
-rw-r--r-- usr/src/lib/libzfs/common/mapfile-vers 2
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_scrub.c 2
-rw-r--r-- usr/src/uts/common/fs/zfs/spa.c 4
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_misc.c 15
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa.h 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa_impl.h 2
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev.h 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h 2
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev.c 102
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_ioctl.c 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_vfsops.c 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zil.c 33
-rw-r--r-- usr/src/uts/common/fs/zfs/zio.c 5
-rw-r--r-- usr/src/uts/common/fs/zfs/zio_inject.c 20
-rw-r--r-- usr/src/uts/common/sys/fs/zfs.h 2
21 files changed, 379 insertions, 121 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index f0d3fa77a4..d58332a969 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -100,6 +100,7 @@ usage(void)
(void) fprintf(stderr, " -u uberblock\n");
(void) fprintf(stderr, " -d datasets\n");
(void) fprintf(stderr, " -C cached pool configuration\n");
+ (void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -m metaslabs\n");
@@ -504,7 +505,7 @@ dump_dtl(vdev_t *vd, int indent)
char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
char prefix[256];
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
required = vdev_dtl_required(vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
@@ -534,6 +535,67 @@ dump_dtl(vdev_t *vd, int indent)
dump_dtl(vd->vdev_child[c], indent + 4);
}
+static void
+dump_history(spa_t *spa)
+{
+ nvlist_t **events = NULL;
+ char buf[SPA_MAXBLOCKSIZE];
+ uint64_t resid, off = 0;
+ uint64_t len = sizeof (buf);
+ uint_t num = 0;
+ int error;
+ time_t tsec;
+ struct tm t;
+ char tbuf[30];
+ char internalstr[MAXPATHLEN];
+
+ do {
+ if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+ (void) fprintf(stderr, "Unable to read history: "
+ "error %d\n", error);
+ return;
+ }
+
+ if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+ break;
+
+ off -= resid;
+ } while (len != 0);
+
+ (void) printf("\nHistory:\n");
+ for (int i = 0; i < num; i++) {
+ uint64_t time, txg, ievent;
+ char *cmd, *intstr;
+
+ if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+ &time) != 0)
+ continue;
+ if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+ &cmd) != 0) {
+ if (nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+ continue;
+ verify(nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_TXG, &txg) == 0);
+ verify(nvlist_lookup_string(events[i],
+ ZPOOL_HIST_INT_STR, &intstr) == 0);
+ if (ievent >= LOG_END)
+ continue;
+
+ (void) snprintf(internalstr,
+ sizeof (internalstr),
+ "[internal %s txg:%lld] %s",
+ hist_event_table[ievent], txg,
+ intstr);
+ cmd = internalstr;
+ }
+ tsec = time;
+ (void) localtime_r(&tsec, &t);
+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ (void) printf("%s %s\n", tbuf, cmd);
+ }
+}
+
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@ -1791,6 +1853,9 @@ dump_zpool(spa_t *spa)
if (dump_opt['s'])
show_pool_stats(spa);
+ if (dump_opt['h'])
+ dump_history(spa);
+
if (rc != 0)
exit(rc);
}
@@ -2256,11 +2321,12 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
+ while ((c = getopt(argc, argv, "udhibcmsvCLS:U:lRep:t:")) != -1) {
switch (c) {
case 'u':
case 'd':
case 'i':
+ case 'h':
case 'b':
case 'c':
case 'm':
@@ -2415,6 +2481,23 @@ main(int argc, char **argv)
B_TRUE, FTAG, &os);
} else {
error = spa_open(argv[0], &spa, FTAG);
+ if (error) {
+ /*
+ * If we're missing the log device then
+ * try opening the pool after clearing the
+ * log state.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(argv[0])) != NULL &&
+ spa->spa_log_state == SPA_LOG_MISSING) {
+ spa->spa_log_state = SPA_LOG_CLEAR;
+ error = 0;
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ if (!error)
+ error = spa_open(argv[0], &spa, FTAG);
+ }
}
}
diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c
index 5528ac330a..c673cf9ea1 100644
--- a/usr/src/cmd/zinject/zinject.c
+++ b/usr/src/cmd/zinject/zinject.c
@@ -227,11 +227,15 @@ usage(void)
"\t\tfunctions which call spa_vdev_config_exit(), or \n"
"\t\tspa_vdev_exit() will trigger a panic.\n"
"\n"
- "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
+ "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F]\n"
+ "\t [-T <read|write|free|claim|all> pool\n"
"\t\tInject a fault into a particular device or the device's\n"
"\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n"
"\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
"\n"
+ "\tzinject -d device -A <degrade|fault> pool\n"
+ "\t\tPerform a specific action on a particular device\n"
+ "\n"
"\tzinject -b objset:object:level:blkid pool\n"
"\n"
"\t\tInject an error into pool 'pool' with the numeric bookmark\n"
@@ -497,6 +501,22 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
}
int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+ zfs_cmd_t zc;
+
+ ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+ (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+ zc.zc_guid = record->zi_guid;
+ zc.zc_cookie = cmd;
+
+ if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ return (0);
+
+ return (1);
+}
+
+int
main(int argc, char **argv)
{
int c;
@@ -509,6 +529,8 @@ main(int argc, char **argv)
int quiet = 0;
int error = 0;
int domount = 0;
+ int io_type = ZIO_TYPES;
+ int action = VDEV_STATE_UNKNOWN;
err_type_t type = TYPE_INVAL;
err_type_t label = TYPE_INVAL;
zinject_record_t record = { 0 };
@@ -546,11 +568,24 @@ main(int argc, char **argv)
return (0);
}
- while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:p:")) != -1) {
+ while ((c = getopt(argc, argv,
+ ":aA:b:d:f:Fqhc:t:T:l:mr:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
break;
+ case 'A':
+ if (strcasecmp(optarg, "degrade") == 0) {
+ action = VDEV_STATE_DEGRADED;
+ } else if (strcasecmp(optarg, "fault") == 0) {
+ action = VDEV_STATE_FAULTED;
+ } else {
+ (void) fprintf(stderr, "invalid action '%s': "
+ "must be 'degrade' or 'fault'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
case 'b':
raw = optarg;
break;
@@ -611,6 +646,25 @@ main(int argc, char **argv)
case 'r':
range = optarg;
break;
+ case 'T':
+ if (strcasecmp(optarg, "read") == 0) {
+ io_type = ZIO_TYPE_READ;
+ } else if (strcasecmp(optarg, "write") == 0) {
+ io_type = ZIO_TYPE_WRITE;
+ } else if (strcasecmp(optarg, "free") == 0) {
+ io_type = ZIO_TYPE_FREE;
+ } else if (strcasecmp(optarg, "claim") == 0) {
+ io_type = ZIO_TYPE_CLAIM;
+ } else if (strcasecmp(optarg, "all") == 0) {
+ io_type = ZIO_TYPES;
+ } else {
+ (void) fprintf(stderr, "invalid I/O type "
+ "'%s': must be 'read', 'write', 'free', "
+ "'claim' or 'all'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
case 't':
if ((type = name_to_type(optarg)) == TYPE_INVAL &&
!MOS_TYPE(type)) {
@@ -708,10 +762,15 @@ main(int argc, char **argv)
return (1);
}
+ record.zi_iotype = io_type;
if (translate_device(pool, device, label, &record) != 0)
return (1);
if (!error)
error = ENXIO;
+
+ if (action != VDEV_STATE_UNKNOWN)
+ return (perform_action(pool, &record, action));
+
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
record.zi_func[0] != '\0') {
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 41bd4794c7..42478db280 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -3624,49 +3624,6 @@ typedef struct hist_cbdata {
int internal;
} hist_cbdata_t;
-char *hist_event_table[LOG_END] = {
- "invalid event",
- "pool create",
- "vdev add",
- "pool remove",
- "pool destroy",
- "pool export",
- "pool import",
- "vdev attach",
- "vdev replace",
- "vdev detach",
- "vdev online",
- "vdev offline",
- "vdev upgrade",
- "pool clear",
- "pool scrub",
- "pool property set",
- "create",
- "clone",
- "destroy",
- "destroy_begin_sync",
- "inherit",
- "property set",
- "quota set",
- "permission update",
- "permission remove",
- "permission who remove",
- "promote",
- "receive",
- "rename",
- "reservation set",
- "replay_inc_sync",
- "replay_full_sync",
- "rollback",
- "snapshot",
- "filesystem version upgrade",
- "refquota set",
- "refreservation set",
- "pool scrub done",
- "user hold",
- "user release",
-};
-
/*
* Print out the command history for a specific pool.
*/
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 4def729b40..05a1b8ec4b 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -203,13 +203,13 @@ uint64_t zopt_rarely = 60; /* every 60 seconds */
ztest_info_t ztest_info[] = {
{ ztest_dmu_read_write, 1, &zopt_always },
- { ztest_dmu_read_write_zcopy, 1, &zopt_always },
{ ztest_dmu_write_parallel, 30, &zopt_always },
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
{ ztest_dmu_commit_callbacks, 10, &zopt_always },
{ ztest_zap, 30, &zopt_always },
{ ztest_fzap, 30, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
+ { ztest_dmu_read_write_zcopy, 1, &zopt_sometimes },
{ ztest_dsl_prop_get_set, 1, &zopt_sometimes },
{ ztest_dmu_objset_create_destroy, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
@@ -1245,8 +1245,8 @@ online_vdev(vdev_t *vd, void *arg)
{
spa_t *spa = vd->vdev_spa;
vdev_t *tvd = vd->vdev_top;
- vdev_t *pvd = vd->vdev_parent;
uint64_t guid = vd->vdev_guid;
+ uint64_t generation = spa->spa_config_generation + 1;
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -1262,10 +1262,14 @@ online_vdev(vdev_t *vd, void *arg)
* vdev may have been detached/replaced while we were
* trying to online it.
*/
- if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
- if (zopt_verbose >= 6) {
- (void) printf("vdev %p has disappeared, was "
- "guid %llu\n", (void *)vd, (u_longlong_t)guid);
+ if (generation != spa->spa_config_generation) {
+ if (zopt_verbose >= 5) {
+ (void) printf("vdev configuration has changed, "
+ "guid %llu, state %llu, expected gen %llu, "
+ "got gen %llu\n", (u_longlong_t)guid,
+ (u_longlong_t)tvd->vdev_state,
+ (u_longlong_t)generation,
+ (u_longlong_t)spa->spa_config_generation);
}
return (vd);
}
@@ -1309,7 +1313,6 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
uint64_t spa_newsize, spa_cursize, ms_count;
(void) mutex_lock(&ztest_shared->zs_vdev_lock);
- mutex_enter(&spa_namespace_lock);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
while (tvd == NULL || tvd->vdev_islog) {
@@ -1330,12 +1333,12 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
psize = vd->vdev_psize;
/*
- * We only try to expand the vdev if it's less than 4x its
- * original size and it has a valid psize.
+ * We only try to expand the vdev if it's healthy, less than 4x its
+ * original size, and it has a valid psize.
*/
- if (psize == 0 || psize >= 4 * zopt_vdev_size) {
+ if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+ psize == 0 || psize >= 4 * zopt_vdev_size) {
spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&spa_namespace_lock);
(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
return;
}
@@ -1361,16 +1364,14 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
tvd->vdev_state != VDEV_STATE_HEALTHY) {
if (zopt_verbose >= 5) {
(void) printf("Could not expand LUN because "
- "some vdevs were not healthy\n");
+ "the vdev configuration changed.\n");
}
(void) spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&spa_namespace_lock);
(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
return;
}
(void) spa_config_exit(spa, SCL_STATE, spa);
- mutex_exit(&spa_namespace_lock);
/*
* Expanding the LUN will update the config asynchronously,
@@ -3486,6 +3487,7 @@ ztest_fault_inject(ztest_args_t *za)
int maxfaults = zopt_maxfaults;
vdev_t *vd0 = NULL;
uint64_t guid0 = 0;
+ boolean_t islog = B_FALSE;
ASSERT(leaves >= 1);
@@ -3513,6 +3515,9 @@ ztest_fault_inject(ztest_args_t *za)
zopt_dir, zopt_pool, top * leaves + leaf);
vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+ if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+ islog = B_TRUE;
+
if (vd0 != NULL && maxfaults != 1) {
/*
* Make vd0 explicitly claim to be unreadable,
@@ -3558,22 +3563,38 @@ ztest_fault_inject(ztest_args_t *za)
spa_config_exit(spa, SCL_STATE, FTAG);
- if (maxfaults == 0)
- return;
-
/*
- * If we can tolerate two or more faults, randomly online/offline vd0.
+ * If we can tolerate two or more faults, or we're dealing
+ * with a slog, randomly online/offline vd0.
*/
- if (maxfaults >= 2 && guid0 != 0) {
+ if ((maxfaults >= 2 || islog) && guid0 != 0) {
if (ztest_random(10) < 6) {
int flags = (ztest_random(2) == 0 ?
ZFS_OFFLINE_TEMPORARY : 0);
+
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between offlining a slog and
+ * destroying a dataset. Offlining the slog will
+ * grab a reference on the dataset which may cause
+ * dmu_objset_destroy() to fail with EBUSY thus
+ * leaving the dataset in an inconsistent state.
+ */
+ if (islog)
+ (void) rw_wrlock(&ztest_shared->zs_name_lock);
+
VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+ if (islog)
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
} else {
(void) vdev_online(spa, guid0, 0, NULL);
}
}
+ if (maxfaults == 0)
+ return;
+
/*
* We have at least single-fault tolerance, so inject data corruption.
*/
@@ -3921,7 +3942,7 @@ static void
ztest_resume(spa_t *spa)
{
if (spa_suspended(spa)) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
vdev_clear(spa, NULL);
(void) spa_vdev_state_exit(spa, NULL, 0);
(void) zio_resume(spa);
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 546c8e451f..41ad44af77 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -332,10 +332,14 @@ extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **);
*/
struct zfs_cmd;
+extern const char *hist_event_table[LOG_END];
+
extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
boolean_t verbose);
extern int zpool_upgrade(zpool_handle_t *, uint64_t);
extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+ nvlist_t ***, uint_t *);
extern void zpool_set_history_str(const char *subcommand, int argc,
char **argv, char *history_str);
extern int zpool_stage_history(libzfs_handle_t *, const char *);
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index da19f7a780..a860c85788 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -42,6 +42,49 @@
#include "zfs_prop.h"
#include "libzfs_impl.h"
+const char *hist_event_table[LOG_END] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+};
+
static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
#if defined(__i386) || defined(__amd64)
@@ -2804,7 +2847,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
* into 'records'. 'leftover' is set to the number of bytes that weren't
* processed as there wasn't a complete record.
*/
-static int
+int
zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
nvlist_t ***records, uint_t *numrecords)
{
diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers
index 1877ef5391..aa04db73e0 100644
--- a/usr/src/lib/libzfs/common/mapfile-vers
+++ b/usr/src/lib/libzfs/common/mapfile-vers
@@ -45,6 +45,7 @@ SUNWprivate_1.1 {
fletcher_4_byteswap;
fletcher_4_incremental_native;
fletcher_4_incremental_byteswap;
+ hist_event_table;
libzfs_errno;
libzfs_error_action;
libzfs_error_description;
@@ -170,6 +171,7 @@ SUNWprivate_1.1 {
zpool_get_prop_int;
zpool_get_state;
zpool_get_status;
+ zpool_history_unpack;
zpool_import;
zpool_import_props;
zpool_import_status;
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c
index 03ebb90bbd..d3b11a7643 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c
@@ -1033,7 +1033,7 @@ dsl_pool_scrub_clean(dsl_pool_t *dp)
* spa_scrub_reopen flag indicates that vdev_open() should not
* attempt to start another scrub.
*/
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa->spa_scrub_reopen = B_TRUE;
vdev_reopen(spa->spa_root_vdev);
spa->spa_scrub_reopen = B_FALSE;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 20bd8482be..e242926b5a 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -3958,7 +3958,7 @@ spa_async_thread(spa_t *spa)
* See if any devices need to be marked REMOVED.
*/
if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_remove(spa, spa->spa_root_vdev);
for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
@@ -3977,7 +3977,7 @@ spa_async_thread(spa_t *spa)
* See if any devices need to be probed.
*/
if (tasks & SPA_ASYNC_PROBE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 38474c194d..38779fc3ab 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -880,6 +880,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
dsl_pool_scrub_restart(spa->spa_dsl_pool);
config_changed = B_TRUE;
+ spa->spa_config_generation++;
}
/*
@@ -939,18 +940,24 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
* Lock the given spa_t for the purpose of changing vdev state.
*/
void
-spa_vdev_state_enter(spa_t *spa)
+spa_vdev_state_enter(spa_t *spa, int oplocks)
{
- spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+ int locks = SCL_STATE_ALL | oplocks;
+
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ spa->spa_vdev_locks = locks;
}
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
- if (vd != NULL)
+ if (vd != NULL) {
vdev_state_dirty(vd->vdev_top);
+ spa->spa_config_generation++;
+ }
- spa_config_exit(spa, SCL_STATE_ALL, spa);
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
/*
* If anything changed, wait for it to sync. This ensures that,
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index b4165b24c8..7a1175a43c 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -411,6 +411,7 @@ extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
+#define SCL_NONE 0x00
#define SCL_CONFIG 0x01
#define SCL_STATE 0x02
#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */
@@ -436,7 +437,7 @@ extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
/* Accessor functions */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 9c224d752f..ac272083b4 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -122,6 +122,7 @@ struct spa {
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
uint64_t spa_sync_bplist_obj; /* object for deferred frees */
bplist_t spa_sync_bplist; /* deferred-free bplist */
@@ -172,6 +173,7 @@ struct spa {
spa_log_state_t spa_log_state; /* log state */
uint64_t spa_autoexpand; /* lun expansion on/off */
boolean_t spa_autoreplace; /* autoreplace set in open */
+ int spa_vdev_locks; /* locks grabbed */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index ecf6c2fe17..e6f5c43f3b 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -80,7 +80,6 @@ extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 71dfaf5ab7..32668aae13 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -118,6 +118,8 @@ typedef struct zinject_record {
uint32_t zi_freq;
uint32_t zi_failfast;
char zi_func[MAXNAMELEN];
+ uint32_t zi_iotype;
+ uint32_t zi_pad; /* 64-bit alignment */
} zinject_record_t;
#define ZINJECT_NULL 0x1
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 9c8aa43425..d4e158dad7 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1935,7 +1935,7 @@ vdev_fault(spa_t *spa, uint64_t guid)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1955,7 +1955,8 @@ vdev_fault(spa_t *spa, uint64_t guid)
* unavailable, then back off and simply mark the vdev as degraded
* instead.
*/
- if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+ if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog &&
+ vd->vdev_aux == NULL) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -1984,7 +1985,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2017,7 +2018,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2064,12 +2065,33 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
}
int
+vdev_offline_log(spa_t *spa)
+{
+ int error = 0;
+
+ if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ NULL, DS_FIND_CHILDREN)) == 0) {
+
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
+int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *vd, *tvd;
- int error;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
- spa_vdev_state_enter(spa);
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -2078,6 +2100,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
/*
* If the device isn't already offline, try to offline it.
@@ -2093,6 +2117,38 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
+ * If the top-level is a slog and it's had allocations
+ * then proceed. We check that the vdev's metaslab
+ * group is not NULL since it's possible that we may
+ * have just added this vdev and have not yet initialized
+ * its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_class_remove(spa->spa_log_class, mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = vdev_offline_log(spa);
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_class_add(spa->spa_log_class, mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+ }
+
+ /*
* Offline this device and reopen its top-level vdev.
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
@@ -2107,28 +2163,18 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_class_add(spa->spa_log_class, mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
- if (!tvd->vdev_islog || !vdev_is_dead(tvd))
- return (spa_vdev_state_exit(spa, vd, 0));
-
- (void) spa_vdev_state_exit(spa, vd, 0);
-
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
- NULL, DS_FIND_CHILDREN);
- if (error) {
- (void) vdev_online(spa, guid, 0, NULL);
- return (error);
- }
- /*
- * If we successfully offlined the log device then we need to
- * sync out the current txg so that the "stubby" block can be
- * removed by zil_sync().
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- return (0);
+ return (spa_vdev_state_exit(spa, vd, 0));
}
/*
@@ -2356,6 +2402,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
+ /*
+ * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 40d848fe1a..4ffb13207b 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -2983,7 +2983,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if (zc->zc_guid == 0) {
vd = NULL;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index d8868ffc0d..8f51a0dbf2 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -951,7 +951,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
if (zil_disable) {
- zil_destroy(zfsvfs->z_log, 0);
+ zil_destroy(zfsvfs->z_log, B_FALSE);
zfsvfs->z_log = NULL;
}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 1845715a85..625ec719fb 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -77,6 +77,8 @@ boolean_t zfs_nocacheflush = B_FALSE;
static kmem_cache_t *zil_lwb_cache;
+static boolean_t zil_empty(zilog_t *zilog);
+
static int
zil_dva_compare(const void *x1, const void *x2)
{
@@ -436,23 +438,12 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
mutex_enter(&zilog->zl_lock);
- /*
- * It is possible for the ZIL to get the previously mounted zilog
- * structure of the same dataset if quickly remounted and the dbuf
- * eviction has not completed. In this case we can see a non
- * empty lwb list and keep_first will be set. We fix this by
- * clearing the keep_first. This will be slower but it's very rare.
- */
- if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
- keep_first = B_FALSE;
-
ASSERT3U(zilog->zl_destroy_txg, <, txg);
zilog->zl_destroy_txg = txg;
- zilog->zl_keep_first = keep_first;
if (!list_is_empty(&zilog->zl_lwb_list)) {
ASSERT(zh->zh_claim_txg == 0);
- ASSERT(!keep_first);
+ zilog->zl_keep_first = B_FALSE;
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
@@ -461,9 +452,23 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
kmem_cache_free(zil_lwb_cache, lwb);
}
} else {
- if (!keep_first) {
+ zilog->zl_keep_first = keep_first;
+ if (zh->zh_flags & ZIL_REPLAY_NEEDED) {
+ ASSERT(!keep_first);
(void) zil_parse(zilog, zil_free_log_block,
zil_free_log_record, tx, zh->zh_claim_txg);
+ } else {
+ /*
+ * Would like to assert zil_empty() but that
+ * would force us to read the log chain which
+ * requires us to do I/O to the log. This is
+ * overkill since we really just want to destroy
+ * the chain anyway.
+ */
+ if (!keep_first) {
+ blkptr_t bp = zh->zh_log;
+ zio_free_blk(zilog->zl_spa, &bp, txg);
+ }
}
}
mutex_exit(&zilog->zl_lock);
@@ -746,7 +751,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 8798699fc8..98c593e375 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -2196,8 +2196,9 @@ zio_done(zio_t *zio)
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
- if ((zio->io_error == EIO ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == lio) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index c5ff55243a..078b40b87a 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -184,7 +184,7 @@ zio_handle_label_injection(zio_t *zio, int error)
int label;
int ret = 0;
- if (offset + zio->io_size > VDEV_LABEL_START_SIZE &&
+ if (offset >= VDEV_LABEL_START_SIZE &&
offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
return (0);
@@ -226,6 +226,18 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
inject_handler_t *handler;
int ret = 0;
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
@@ -243,6 +255,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
continue;
}
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
if (handler->zi_record.zi_error == error) {
/*
* For a failed open, pretend like the device
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index de0d67176e..8b663badcb 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -692,7 +692,7 @@ typedef enum {
/*
* Note: This is encoded on-disk, so new events must be added to the
* end, and unused events can not be removed. Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
*/
typedef enum history_internal_events {
LOG_NO_EVENT = 0,