summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author George Wilson <George.Wilson@Sun.COM> 2009-09-21 10:38:24 -0700
committer George Wilson <George.Wilson@Sun.COM> 2009-09-21 10:38:24 -0700
commit 88ecc943b4eb72f7c4fbbd8435997b85ef171fc3 (patch)
tree ebceb7c59c849c35d63917995146dc8ad430fa31
parent 53520bfd0d8e6401efee237b91e682ab66f77eef (diff)
download illumos-joyent-88ecc943b4eb72f7c4fbbd8435997b85ef171fc3.tar.gz
6574286 removing a slog doesn't work
6856566 zpool import -F can cause panic
6863456 system panic by load_nvlist(spa, spa->spa_config_object, &nv) == 0 while running zfs test suite
6882947 dump_nvlist() should live in libnvpair
-rw-r--r-- usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c | 2
-rw-r--r-- usr/src/cmd/fstyp/fstyp.c | 148
-rw-r--r-- usr/src/cmd/power/handlers.c | 2
-rw-r--r-- usr/src/cmd/zdb/zdb.c | 62
-rw-r--r-- usr/src/cmd/zinject/zinject.c | 69
-rw-r--r-- usr/src/cmd/zpool/zpool_main.c | 37
-rw-r--r-- usr/src/cmd/zpool/zpool_util.c | 16
-rw-r--r-- usr/src/cmd/zpool/zpool_util.h | 1
-rw-r--r-- usr/src/cmd/ztest/ztest.c | 82
-rw-r--r-- usr/src/grub/capability | 2
-rw-r--r-- usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h | 6
-rw-r--r-- usr/src/lib/libnvpair/libnvpair.c | 155
-rw-r--r-- usr/src/lib/libnvpair/libnvpair.h | 5
-rw-r--r-- usr/src/lib/libnvpair/mapfile-vers | 1
-rw-r--r-- usr/src/lib/libzfs/common/libzfs.h | 3
-rw-r--r-- usr/src/lib/libzfs/common/libzfs_import.c | 138
-rw-r--r-- usr/src/lib/libzfs/common/libzfs_pool.c | 110
-rw-r--r-- usr/src/uts/common/fs/zfs/metaslab.c | 50
-rw-r--r-- usr/src/uts/common/fs/zfs/spa.c | 217
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_config.c | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_misc.c | 49
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/metaslab.h | 4
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa.h | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 8
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zio.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev.c | 135
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_label.c | 46
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_missing.c | 17
-rw-r--r-- usr/src/uts/common/fs/zfs/zio_inject.c | 39
-rw-r--r-- usr/src/uts/common/sys/fs/zfs.h | 10
33 files changed, 1047 insertions, 381 deletions
diff --git a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c
index 3f0a6eee43..bc35ad9cfb 100644
--- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c
+++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c
@@ -166,7 +166,7 @@ replace_with_spare(zpool_handle_t *zhp, nvlist_t *vdev)
return;
}
- dev_name = zpool_vdev_name(NULL, zhp, vdev);
+ dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
/*
* Try to replace each spare, ending when we successfully
diff --git a/usr/src/cmd/fstyp/fstyp.c b/usr/src/cmd/fstyp/fstyp.c
index fb81b0edbb..464a3114a4 100644
--- a/usr/src/cmd/fstyp/fstyp.c
+++ b/usr/src/cmd/fstyp/fstyp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -50,7 +50,6 @@
static const char *getmodfsname();
static char *getexecpathname();
-static void dump_nvlist(nvlist_t *list, int indent);
static boolean_t dos_to_dev(char *path, char **devpath, int *num);
static boolean_t find_dos_drive(int fd, int num, off_t *offset);
static void run_legacy_cmds(int fd, char *device, int vflag);
@@ -177,151 +176,6 @@ out:
}
-#define NVP(elem, type, vtype, ptype, format) { \
- vtype value; \
-\
- (void) nvpair_value_##type(elem, &value); \
- (void) printf("%*s%s: " format "\n", indent, "", \
- nvpair_name(elem), (ptype)value); \
-}
-
-#define NVPA(elem, type, vtype, ptype, format) { \
- uint_t i, count; \
- vtype *value; \
-\
- (void) nvpair_value_##type(elem, &value, &count); \
- for (i = 0; i < count; i++) { \
- (void) printf("%*s%s[%d]: " format "\n", indent, "", \
- nvpair_name(elem), i, (ptype)value[i]); \
- } \
-}
-
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
- nvpair_t *elem = NULL;
- boolean_t bool_value;
- nvlist_t *nvlist_value;
- nvlist_t **nvlist_array_value;
- uint_t i, count;
-
- if (list == NULL) {
- return;
- }
-
- while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
- switch (nvpair_type(elem)) {
- case DATA_TYPE_BOOLEAN_VALUE:
- (void) nvpair_value_boolean_value(elem, &bool_value);
- (void) printf("%*s%s: %s\n", indent, "",
- nvpair_name(elem), bool_value ? "true" : "false");
- break;
-
- case DATA_TYPE_BYTE:
- NVP(elem, byte, uchar_t, int, "%u");
- break;
-
- case DATA_TYPE_INT8:
- NVP(elem, int8, int8_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT8:
- NVP(elem, uint8, uint8_t, int, "%u");
- break;
-
- case DATA_TYPE_INT16:
- NVP(elem, int16, int16_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT16:
- NVP(elem, uint16, uint16_t, int, "%u");
- break;
-
- case DATA_TYPE_INT32:
- NVP(elem, int32, int32_t, long, "%ld");
- break;
-
- case DATA_TYPE_UINT32:
- NVP(elem, uint32, uint32_t, ulong_t, "%lu");
- break;
-
- case DATA_TYPE_INT64:
- NVP(elem, int64, int64_t, longlong_t, "%lld");
- break;
-
- case DATA_TYPE_UINT64:
- NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
- break;
-
- case DATA_TYPE_STRING:
- NVP(elem, string, char *, char *, "'%s'");
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- NVPA(elem, byte_array, uchar_t, int, "%u");
- break;
-
- case DATA_TYPE_INT8_ARRAY:
- NVPA(elem, int8_array, int8_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT8_ARRAY:
- NVPA(elem, uint8_array, uint8_t, int, "%u");
- break;
-
- case DATA_TYPE_INT16_ARRAY:
- NVPA(elem, int16_array, int16_t, int, "%d");
- break;
-
- case DATA_TYPE_UINT16_ARRAY:
- NVPA(elem, uint16_array, uint16_t, int, "%u");
- break;
-
- case DATA_TYPE_INT32_ARRAY:
- NVPA(elem, int32_array, int32_t, long, "%ld");
- break;
-
- case DATA_TYPE_UINT32_ARRAY:
- NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
- break;
-
- case DATA_TYPE_UINT64_ARRAY:
- NVPA(elem, uint64_array, uint64_t, u_longlong_t,
- "%llu");
- break;
-
- case DATA_TYPE_STRING_ARRAY:
- NVPA(elem, string_array, char *, char *, "'%s'");
- break;
-
- case DATA_TYPE_NVLIST:
- (void) nvpair_value_nvlist(elem, &nvlist_value);
- (void) printf("%*s%s:\n", indent, "",
- nvpair_name(elem));
- dump_nvlist(nvlist_value, indent + 4);
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- (void) nvpair_value_nvlist_array(elem,
- &nvlist_array_value, &count);
- for (i = 0; i < count; i++) {
- (void) printf("%*s%s[%u]:\n", indent, "",
- nvpair_name(elem), i);
- dump_nvlist(nvlist_array_value[i], indent + 4);
- }
- break;
-
- default:
- (void) printf(gettext("bad config type %d for %s\n"),
- nvpair_type(elem), nvpair_name(elem));
- }
- }
-}
-
/*
* If the executable is a fs-specific hardlink, /usr/lib/fs/<fsname>/fstyp,
* return that fsname; otherwise return NULL.
diff --git a/usr/src/cmd/power/handlers.c b/usr/src/cmd/power/handlers.c
index 5d2d51851c..ba66f288ae 100644
--- a/usr/src/cmd/power/handlers.c
+++ b/usr/src/cmd/power/handlers.c
@@ -1043,7 +1043,7 @@ ztop(char *arg, char *diskname)
libzfs_fini(lzfs);
return (-1);
}
- vname = zpool_vdev_name(lzfs, zpool_handle, child[0]);
+ vname = zpool_vdev_name(lzfs, zpool_handle, child[0], B_FALSE);
if (vname == NULL) {
mesg(MERR, "couldn't determine vdev name\n");
zpool_close(zpool_handle);
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 068dd228a8..f0d3fa77a4 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -146,68 +146,6 @@ fatal(const char *fmt, ...)
exit(1);
}
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
- nvpair_t *elem = NULL;
-
- while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
- switch (nvpair_type(elem)) {
- case DATA_TYPE_STRING:
- {
- char *value;
-
- VERIFY(nvpair_value_string(elem, &value) == 0);
- (void) printf("%*s%s='%s'\n", indent, "",
- nvpair_name(elem), value);
- }
- break;
-
- case DATA_TYPE_UINT64:
- {
- uint64_t value;
-
- VERIFY(nvpair_value_uint64(elem, &value) == 0);
- (void) printf("%*s%s=%llu\n", indent, "",
- nvpair_name(elem), (u_longlong_t)value);
- }
- break;
-
- case DATA_TYPE_NVLIST:
- {
- nvlist_t *value;
-
- VERIFY(nvpair_value_nvlist(elem, &value) == 0);
- (void) printf("%*s%s\n", indent, "",
- nvpair_name(elem));
- dump_nvlist(value, indent + 4);
- }
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- {
- nvlist_t **value;
- uint_t c, count;
-
- VERIFY(nvpair_value_nvlist_array(elem, &value,
- &count) == 0);
-
- for (c = 0; c < count; c++) {
- (void) printf("%*s%s[%u]\n", indent, "",
- nvpair_name(elem), c);
- dump_nvlist(value[c], indent + 8);
- }
- }
- break;
-
- default:
-
- (void) printf("bad config type %d for %s\n",
- nvpair_type(elem), nvpair_name(elem));
- }
- }
-}
-
/* ARGSUSED */
static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c
index 09c377ef8d..5528ac330a 100644
--- a/usr/src/cmd/zinject/zinject.c
+++ b/usr/src/cmd/zinject/zinject.c
@@ -222,6 +222,11 @@ usage(void)
"\t\tClear the particular record (if given a numeric ID), or\n"
"\t\tall records if 'all' is specificed.\n"
"\n"
+ "\tzinject -p <function name> pool\n"
+ "\t\tInject a panic fault at the specified function. Only \n"
+ "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+ "\t\tspa_vdev_exit() will trigger a panic.\n"
+ "\n"
"\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
"\t\tInject a fault into a particular device or the device's\n"
"\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n"
@@ -295,7 +300,7 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;
- if (record->zi_guid != 0)
+ if (record->zi_guid != 0 || record->zi_func[0] != '\0')
return (0);
if (*count == 0) {
@@ -327,7 +332,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;
- if (record->zi_guid == 0)
+ if (record->zi_guid == 0 || record->zi_func[0] != '\0')
return (0);
if (*count == 0) {
@@ -343,6 +348,27 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}
+/*
+ * Handler-iteration callback (passed to iter_handlers()) that prints one
+ * table row for a panic-injection record.
+ *
+ * Records whose zi_func string is empty are skipped. 'data' points to an
+ * int running count of rows printed so far; the column header is emitted
+ * while that count is still zero, and the count is incremented for every
+ * row printed. Always returns 0.
+ */
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ /* Not a panic record: no function name was set. */
+ if (record->zi_func[0] == '\0')
+ return (0);
+
+ /* First matching record: print the table header once. */
+ if (*count == 0) {
+ (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");
+ (void) printf("--- --------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %s\n", id, pool, record->zi_func);
+
+ return (0);
+}
+
/*
* Print all registered error handlers. Returns the number of handlers
* registered.
@@ -356,6 +382,9 @@ print_all_handlers(void)
(void) printf("\n");
count = 0;
(void) iter_handlers(print_data_handler, &count);
+ (void) printf("\n");
+ count = 0;
+ (void) iter_handlers(print_panic_handler, &count);
return (count);
}
@@ -443,6 +472,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
if (record->zi_guid) {
(void) printf(" vdev: %llx\n",
(u_longlong_t)record->zi_guid);
+ } else if (record->zi_func[0] != '\0') {
+ (void) printf(" panic function: %s\n",
+ record->zi_func);
} else {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
@@ -514,7 +546,7 @@ main(int argc, char **argv)
return (0);
}
- while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
+ while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -569,6 +601,10 @@ main(int argc, char **argv)
case 'm':
domount = 1;
break;
+ case 'p':
+ (void) strlcpy(record.zi_func, optarg,
+ sizeof (record.zi_func));
+ break;
case 'q':
quiet = 1;
break;
@@ -617,7 +653,7 @@ main(int argc, char **argv)
* '-c' is invalid with any other options.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0) {
+ level != 0 || record.zi_func[0] != '\0') {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -649,7 +685,7 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0) {
+ level != 0 || record.zi_func[0] != '\0') {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -677,7 +713,8 @@ main(int argc, char **argv)
if (!error)
error = ENXIO;
} else if (raw != NULL) {
- if (range != NULL || type != TYPE_INVAL || level != 0) {
+ if (range != NULL || type != TYPE_INVAL || level != 0 ||
+ record.zi_func[0] != '\0') {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@@ -704,10 +741,28 @@ main(int argc, char **argv)
return (1);
if (!error)
error = EIO;
+ } else if (record.zi_func[0] != '\0') {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || device != NULL) {
+ (void) fprintf(stderr, "panic (-p) incompatible with "
+ "other options\n");
+ usage();
+ return (2);
+ }
+
+ if (argc != 1) {
+ (void) fprintf(stderr, "panic (-p) injection requires "
+ "a single pool name\n");
+ usage();
+ return (2);
+ }
+
+ (void) strcpy(pool, argv[0]);
+ dataset[0] = '\0';
} else if (type == TYPE_INVAL) {
if (flags == 0) {
(void) fprintf(stderr, "at least one of '-b', '-d', "
- "'-t', '-a', or '-u' must be specified\n");
+ "'-t', '-a', '-p', or '-u' must be specified\n");
usage();
return (2);
}
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index c78dc6f646..41bd4794c7 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -343,7 +343,7 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
if ((is_log && !print_logs) || (!is_log && print_logs))
continue;
- vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
print_vdev_tree(zhp, vname, child[c], indent + 2,
B_FALSE);
free(vname);
@@ -944,7 +944,7 @@ zpool_do_export(int argc, char **argv)
static int
max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
{
- char *name = zpool_vdev_name(g_zfs, zhp, nv);
+ char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE);
nvlist_t **child;
uint_t c, children;
int ret;
@@ -1144,14 +1144,16 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf("\n");
for (c = 0; c < children; c++) {
- uint64_t is_log = B_FALSE;
+ uint64_t islog = B_FALSE, ishole = B_FALSE;
- /* Don't print logs here */
+ /* Don't print logs or holes here */
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log);
- if (is_log)
+ &islog);
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &ishole);
+ if (islog || ishole)
continue;
- vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
print_status_config(zhp, vname, child[c],
namewidth, depth + 2, isspare);
free(vname);
@@ -1172,7 +1174,8 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
char *type, *vname;
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
- if (strcmp(type, VDEV_TYPE_MISSING) == 0)
+ if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(type, VDEV_TYPE_HOLE) == 0)
return;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
@@ -1224,7 +1227,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
if (is_log)
continue;
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE);
print_import_config(vname, child[c], namewidth, depth + 2);
free(vname);
}
@@ -1233,7 +1236,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
&child, &children) == 0) {
(void) printf(gettext("\tcache\n"));
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
@@ -1243,7 +1246,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
&child, &children) == 0) {
(void) printf(gettext("\tspares\n"));
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
(void) printf("\t %s\n", vname);
free(vname);
}
@@ -1278,7 +1281,7 @@ print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
&is_log);
if (!is_log)
continue;
- name = zpool_vdev_name(g_zfs, zhp, child[c]);
+ name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
if (verbose)
print_status_config(zhp, name, child[c], namewidth,
2, B_FALSE);
@@ -1964,7 +1967,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
return;
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
@@ -1985,7 +1988,8 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
(void) printf("%-*s - - - - - "
"-\n", cb->cb_namewidth, "cache");
for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
@@ -2996,7 +3000,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
(void) printf(gettext("\tspares\n"));
for (i = 0; i < nspares; i++) {
- name = zpool_vdev_name(g_zfs, zhp, spares[i]);
+ name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE);
print_status_config(zhp, name, spares[i],
namewidth, 2, B_TRUE);
free(name);
@@ -3016,7 +3020,7 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
(void) printf(gettext("\tcache\n"));
for (i = 0; i < nl2cache; i++) {
- name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
+ name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE);
print_status_config(zhp, name, l2cache[i],
namewidth, 2, B_FALSE);
free(name);
@@ -3573,6 +3577,7 @@ zpool_do_upgrade(int argc, char **argv)
(void) printf(gettext(" 16 stmf property support\n"));
(void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
(void) printf(gettext(" 18 snapshot user holds\n"));
+ (void) printf(gettext(" 19 Log device removal\n"));
(void) printf(gettext("For more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
diff --git a/usr/src/cmd/zpool/zpool_util.c b/usr/src/cmd/zpool/zpool_util.c
index bc34e41a4c..c7a002efb1 100644
--- a/usr/src/cmd/zpool/zpool_util.c
+++ b/usr/src/cmd/zpool/zpool_util.c
@@ -49,22 +49,6 @@ safe_malloc(size_t size)
}
/*
- * Same as above, but for strdup()
- */
-char *
-zpool_safe_strdup(const char *str)
-{
- char *ret;
-
- if ((ret = strdup(str)) == NULL) {
- (void) fprintf(stderr, "internal error: out of memory\n");
- exit(1);
- }
-
- return (ret);
-}
-
-/*
* Display an out of memory error message and abort the current program.
*/
void
diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h
index 2bcefee62b..c86b2e7405 100644
--- a/usr/src/cmd/zpool/zpool_util.h
+++ b/usr/src/cmd/zpool/zpool_util.h
@@ -37,7 +37,6 @@ extern "C" {
* Basic utility functions
*/
void *safe_malloc(size_t);
-char *zpool_safe_strdup(const char *);
void zpool_no_memory(void);
uint_t num_logs(nvlist_t *nv);
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 4cf36302c0..81b53a68bc 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -92,6 +92,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/refcount.h>
@@ -231,7 +232,7 @@ ztest_info_t ztest_info[] = {
typedef struct ztest_shared {
mutex_t zs_vdev_lock;
rwlock_t zs_name_lock;
- uint64_t zs_vdev_primaries;
+ uint64_t zs_vdev_next_leaf;
uint64_t zs_vdev_aux;
uint64_t zs_enospc_count;
hrtime_t zs_start_time;
@@ -558,7 +559,7 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
(void) sprintf(path, ztest_aux_template,
zopt_dir, zopt_pool, aux, vdev);
} else {
- vdev = ztest_shared->zs_vdev_primaries++;
+ vdev = ztest_shared->zs_vdev_next_leaf++;
(void) sprintf(path, ztest_dev_template,
zopt_dir, zopt_pool, vdev);
}
@@ -850,6 +851,26 @@ vdev_lookup_by_path(vdev_t *vd, const char *path)
}
/*
+ * Find the first available hole which can be used as a top-level.
+ */
+int
+find_vdev_hole(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ /*
+ * Sanity-check via spa_config_held() that the caller holds SCL_VDEV
+ * (presumably at least as reader -- confirm against the
+ * spa_config_held() contract).
+ */
+ ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+
+ /* Stop at the first child of the root vdev that is marked as a hole. */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+
+ if (cvd->vdev_ishole)
+ break;
+ }
+ /*
+ * If no child is a hole the loop ran to completion, so the value
+ * returned equals vdev_children (one past the last existing child).
+ */
+ return (c);
+}
+
+/*
* Verify that vdev_add() works as expected.
*/
void
@@ -857,6 +878,7 @@ ztest_vdev_add_remove(ztest_args_t *za)
{
spa_t *spa = za->za_spa;
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ uint64_t guid;
nvlist_t *nvroot;
int error;
@@ -864,26 +886,52 @@ ztest_vdev_add_remove(ztest_args_t *za)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- ztest_shared->zs_vdev_primaries =
- spa->spa_root_vdev->vdev_children * leaves;
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
/*
- * Make 1/4 of the devices be log devices.
+ * If we have slogs then remove them 1/4 of the time.
*/
- nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
- ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+ if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ /*
+ * Grab the guid from the head of the log class rotor.
+ */
+ guid = spa->spa_log_class->mc_rotor->mg_vd->vdev_guid;
- error = spa_vdev_add(spa, nvroot);
- nvlist_free(nvroot);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between removing a slog (dmu_objset_find)
+ * and destroying a dataset. Removing the slog will
+ * grab a reference on the dataset which may cause
+ * dmu_objset_destroy() to fail with EBUSY thus
+ * leaving the dataset in an inconsistent state.
+ */
+ (void) rw_wrlock(&ztest_shared->zs_name_lock);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
- if (error == ENOSPC)
- ztest_record_enospc("spa_vdev_add");
- else if (error != 0)
- fatal(0, "spa_vdev_add() = %d", error);
+ if (error && error != EEXIST)
+ fatal(0, "spa_vdev_remove() = %d", error);
+ } else {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * Make 1/4 of the devices be log devices.
+ */
+ nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+ ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+ }
+
+ (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
}
/*
@@ -4004,7 +4052,7 @@ ztest_init(char *pool)
* Create the storage pool.
*/
(void) spa_destroy(pool);
- ztest_shared->zs_vdev_primaries = 0;
+ ztest_shared->zs_vdev_next_leaf = 0;
nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
0, zopt_raidz, zopt_mirrors, 1);
error = spa_create(pool, nvroot, NULL, NULL, NULL);
diff --git a/usr/src/grub/capability b/usr/src/grub/capability
index 5d81d0e2f3..25987fb4f7 100644
--- a/usr/src/grub/capability
+++ b/usr/src/grub/capability
@@ -40,7 +40,7 @@
# This file and the associated version are Solaris specific and are
# not a part of the open source distribution of GRUB.
#
-VERSION=11
+VERSION=12
dboot
xVM
zfs
diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
index 4e4a72c139..612a0c4c2d 100644
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
@@ -27,7 +27,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 18ULL
+#define SPA_VERSION 19ULL
/*
* The following are configuration names used in the nvlist describing a pool's
@@ -61,6 +61,9 @@
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -78,6 +81,7 @@
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_L2CACHE "l2cache"
diff --git a/usr/src/lib/libnvpair/libnvpair.c b/usr/src/lib/libnvpair/libnvpair.c
index 0845cb08cf..57915cd737 100644
--- a/usr/src/lib/libnvpair/libnvpair.c
+++ b/usr/src/lib/libnvpair/libnvpair.c
@@ -19,14 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <unistd.h>
#include <strings.h>
+#include <libintl.h>
#include <sys/types.h>
#include <sys/inttypes.h>
#include "libnvpair.h"
@@ -272,6 +271,156 @@ nvlist_print(FILE *fp, nvlist_t *nvl)
nvlist_print_with_indent(fp, nvl, 0);
}
+
+/*
+ * Printing helpers for dump_nvlist(). Both macros expand to a braced block
+ * and use a variable named 'indent' from the enclosing scope for the
+ * left-hand padding, so they can only be used where such a variable exists.
+ *
+ * elem   - the nvpair to print
+ * type   - suffix of the nvpair_value_<type>() accessor to call
+ * vtype  - C type of the value (NVP) or of one array element (NVPA)
+ * ptype  - type the value is cast to before being handed to printf()
+ * format - printf conversion matching ptype
+ *
+ * The return value of the accessor is deliberately ignored.
+ */
+
+/* Print a single-valued pair as "name: value". */
+#define NVP(elem, type, vtype, ptype, format) { \
+ vtype value; \
+\
+ (void) nvpair_value_##type(elem, &value); \
+ (void) printf("%*s%s: " format "\n", indent, "", \
+ nvpair_name(elem), (ptype)value); \
+}
+
+/*
+ * Print an array-valued pair, one "name[i]: value" line per element.
+ * The index is a uint_t, so it is printed with %u.
+ */
+#define NVPA(elem, type, vtype, ptype, format) { \
+ uint_t i, count; \
+ vtype *value; \
+\
+ (void) nvpair_value_##type(elem, &value, &count); \
+ for (i = 0; i < count; i++) { \
+ (void) printf("%*s%s[%u]: " format "\n", indent, "", \
+ nvpair_name(elem), i, (ptype)value[i]); \
+ } \
+}
+
+/*
+ * Similar to nvlist_print() but handles arrays slightly differently.
+ */
+void
+dump_nvlist(nvlist_t *list, int indent)
+{
+ nvpair_t *elem = NULL;
+ boolean_t bool_value;
+ nvlist_t *nvlist_value;
+ nvlist_t **nvlist_array_value;
+ uint_t i, count;
+
+ /* Nothing to print for a NULL list. */
+ if (list == NULL) {
+ return;
+ }
+
+ /*
+ * Visit every pair in the list and print it to stdout, padded on the
+ * left by 'indent' columns. elem starts out NULL for the first call
+ * to nvlist_next_nvpair().
+ */
+ while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ /* Single-valued types: one "name: value" line each. */
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(elem, &bool_value);
+ (void) printf("%*s%s: %s\n", indent, "",
+ nvpair_name(elem), bool_value ? "true" : "false");
+ break;
+
+ case DATA_TYPE_BYTE:
+ NVP(elem, byte, uchar_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT8:
+ NVP(elem, int8, int8_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT8:
+ NVP(elem, uint8, uint8_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT16:
+ NVP(elem, int16, int16_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT16:
+ NVP(elem, uint16, uint16_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT32:
+ NVP(elem, int32, int32_t, long, "%ld");
+ break;
+
+ case DATA_TYPE_UINT32:
+ NVP(elem, uint32, uint32_t, ulong_t, "%lu");
+ break;
+
+ case DATA_TYPE_INT64:
+ NVP(elem, int64, int64_t, longlong_t, "%lld");
+ break;
+
+ case DATA_TYPE_UINT64:
+ NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
+ break;
+
+ case DATA_TYPE_STRING:
+ NVP(elem, string, char *, char *, "'%s'");
+ break;
+
+ /* Array types: one "name[i]: value" line per element. */
+ case DATA_TYPE_BYTE_ARRAY:
+ NVPA(elem, byte_array, uchar_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ NVPA(elem, int8_array, int8_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT8_ARRAY:
+ NVPA(elem, uint8_array, uint8_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ NVPA(elem, int16_array, int16_t, int, "%d");
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ NVPA(elem, uint16_array, uint16_t, int, "%u");
+ break;
+
+ case DATA_TYPE_INT32_ARRAY:
+ NVPA(elem, int32_array, int32_t, long, "%ld");
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ NVPA(elem, uint64_array, uint64_t, u_longlong_t,
+ "%llu");
+ break;
+
+ case DATA_TYPE_STRING_ARRAY:
+ NVPA(elem, string_array, char *, char *, "'%s'");
+ break;
+
+ /* Nested list: print its name, then recurse 4 columns deeper. */
+ case DATA_TYPE_NVLIST:
+ (void) nvpair_value_nvlist(elem, &nvlist_value);
+ (void) printf("%*s%s:\n", indent, "",
+ nvpair_name(elem));
+ dump_nvlist(nvlist_value, indent + 4);
+ break;
+
+ /* Array of lists: each member gets a "name[i]:" line and is recursed into. */
+ case DATA_TYPE_NVLIST_ARRAY:
+ (void) nvpair_value_nvlist_array(elem,
+ &nvlist_array_value, &count);
+ for (i = 0; i < count; i++) {
+ (void) printf("%*s%s[%u]:\n", indent, "",
+ nvpair_name(elem), i);
+ dump_nvlist(nvlist_array_value[i], indent + 4);
+ }
+ break;
+
+ /* Any type not handled above is reported rather than dumped. */
+ default:
+ (void) printf(dgettext(TEXT_DOMAIN, "bad config type "
+ "%d for %s\n"), nvpair_type(elem),
+ nvpair_name(elem));
+ }
+ }
+}
+
/*
* Determine if string 'value' matches 'nvp' value. The 'value' string is
* converted, depending on the type of 'nvp', prior to match. For numeric
diff --git a/usr/src/lib/libnvpair/libnvpair.h b/usr/src/lib/libnvpair/libnvpair.h
index e655e0d406..15c1c78167 100644
--- a/usr/src/lib/libnvpair/libnvpair.h
+++ b/usr/src/lib/libnvpair/libnvpair.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _LIBNVPAIR_H
#define _LIBNVPAIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#include <stdlib.h>
#include <stdio.h>
@@ -40,6 +38,7 @@ extern "C" {
void nvlist_print(FILE *, nvlist_t *);
int nvpair_value_match(nvpair_t *, int, char *, char **);
int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
+void dump_nvlist(nvlist_t *, int);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/libnvpair/mapfile-vers b/usr/src/lib/libnvpair/mapfile-vers
index a6d56b5ca8..52f9fcfaec 100644
--- a/usr/src/lib/libnvpair/mapfile-vers
+++ b/usr/src/lib/libnvpair/mapfile-vers
@@ -166,6 +166,7 @@ SUNW_1.1 {
SUNWprivate_1.1 {
global:
+ dump_nvlist;
nvlist_add_hrtime;
nvlist_lookup_hrtime;
nvlist_print;
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 81f556d816..546c8e451f 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -332,7 +332,8 @@ extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **);
*/
struct zfs_cmd;
-extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
+extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
+ boolean_t verbose);
extern int zpool_upgrade(zpool_handle_t *, uint64_t);
extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
extern void zpool_set_history_str(const char *subcommand, int argc,
diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c
index d67776889d..f5793390ea 100644
--- a/usr/src/lib/libzfs/common/libzfs_import.c
+++ b/usr/src/lib/libzfs/common/libzfs_import.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Pool import support functions.
*
@@ -388,8 +386,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
}
if (err) {
- (void) zpool_standard_error(hdl, errno,
- dgettext(TEXT_DOMAIN, "cannot discover pools"));
zcmd_free_nvlists(&zc);
return (NULL);
}
@@ -404,6 +400,21 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
}
/*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+ /*
+ * Linear search of the first 'holes' entries of hole_array for 'id'.
+ * Returns B_TRUE when the id is listed and B_FALSE otherwise; with
+ * holes == 0 the array is never dereferenced. The index is unsigned
+ * to match the type of 'holes'.
+ */
+ for (uint_t c = 0; c < holes; c++) {
+
+ /* Top-level is a hole */
+ if (hole_array[c] == id)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
* Convert our list of pools into the definitive set of configurations. We
* start by picking the best config for each toplevel vdev. Once that's done,
* we assemble the toplevel vdevs into a full config for the pool. We make a
@@ -425,17 +436,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
uint64_t version, guid;
uint_t children = 0;
nvlist_t **child = NULL;
+ uint_t holes;
+ uint64_t *hole_array, max_id;
uint_t c;
boolean_t isactive;
uint64_t hostid;
nvlist_t *nvl;
boolean_t found_one = B_FALSE;
+ boolean_t valid_top_config = B_FALSE;
if (nvlist_alloc(&ret, 0, 0) != 0)
goto nomem;
for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
- uint64_t id;
+ uint64_t id, max_txg = 0;
if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
goto nomem;
@@ -463,6 +477,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
}
+ /*
+ * We rely on the fact that the max txg for the
+ * pool will contain the most up-to-date information
+ * about the valid top-levels in the vdev namespace.
+ */
+ if (best_txg > max_txg) {
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ DATA_TYPE_UINT64);
+ (void) nvlist_remove(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ DATA_TYPE_UINT64_ARRAY);
+
+ max_txg = best_txg;
+ hole_array = NULL;
+ holes = 0;
+ max_id = 0;
+ valid_top_config = B_FALSE;
+
+ if (nvlist_lookup_uint64(tmp,
+ ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+ verify(nvlist_add_uint64(config,
+ ZPOOL_CONFIG_VDEV_CHILDREN,
+ max_id) == 0);
+ valid_top_config = B_TRUE;
+ }
+
+ if (nvlist_lookup_uint64_array(tmp,
+ ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+ &holes) == 0) {
+ verify(nvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_HOLE_ARRAY,
+ hole_array, holes) == 0);
+ }
+ }
+
if (!config_seen) {
/*
* Copy the relevant pieces of data to the pool
@@ -522,6 +572,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
&id) == 0);
+
if (id >= children) {
nvlist_t **newchild;
@@ -542,17 +593,82 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
+ /*
+ * If we have information about all the top-levels then
+ * clean up the nvlist which we've constructed. This
+ * means removing any extraneous devices that are
+ * beyond the valid range or adding devices to the end
+ * of our array which appear to be missing.
+ */
+ if (valid_top_config) {
+ if (max_id < children) {
+ for (c = max_id; c < children; c++)
+ nvlist_free(child[c]);
+ children = max_id;
+ } else if (max_id > children) {
+ nvlist_t **newchild;
+
+ newchild = zfs_alloc(hdl, (max_id) *
+ sizeof (nvlist_t *));
+ if (newchild == NULL)
+ goto nomem;
+
+ for (c = 0; c < children; c++)
+ newchild[c] = child[c];
+
+ free(child);
+ child = newchild;
+ children = max_id;
+ }
+ }
+
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&guid) == 0);
/*
+ * The vdev namespace may contain holes as a result of
+ * device removal. We must add them back into the vdev
+ * tree before we process any missing devices.
+ */
+ if (holes > 0) {
+ ASSERT(valid_top_config);
+
+ for (c = 0; c < children; c++) {
+ nvlist_t *holey;
+
+ if (child[c] != NULL ||
+ !vdev_is_hole(hole_array, holes, c))
+ continue;
+
+ if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+ 0) != 0)
+ goto nomem;
+
+ /*
+ * Holes in the namespace are treated as
+ * "hole" top-level vdevs and have a
+ * special flag set on them.
+ */
+ if (nvlist_add_string(holey,
+ ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_ID, c) != 0 ||
+ nvlist_add_uint64(holey,
+ ZPOOL_CONFIG_GUID, 0ULL) != 0)
+ goto nomem;
+ child[c] = holey;
+ }
+ }
+
+ /*
* Look for any missing top-level vdevs. If this is the case,
* create a faked up 'missing' vdev as a placeholder. We cannot
* simply compress the child array, because the kernel performs
* certain checks to make sure the vdev IDs match their location
* in the configuration.
*/
- for (c = 0; c < children; c++)
+ for (c = 0; c < children; c++) {
if (child[c] == NULL) {
nvlist_t *missing;
if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
@@ -570,6 +686,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
}
child[c] = missing;
}
+ }
/*
* Put all of this pool's top-level vdevs into a root vdev.
@@ -636,8 +753,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
continue;
}
- if ((nvl = refresh_config(hdl, config)) == NULL)
- goto error;
+ if ((nvl = refresh_config(hdl, config)) == NULL) {
+ nvlist_free(config);
+ config = NULL;
+ continue;
+ }
nvlist_free(config);
config = nvl;
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index de6587ec40..da19f7a780 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -1063,7 +1063,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device '%s' contains an EFI label and "
"cannot be used on root pools."),
- zpool_vdev_name(hdl, NULL, spares[s]));
+ zpool_vdev_name(hdl, NULL, spares[s],
+ B_FALSE));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
}
}
@@ -1419,8 +1420,9 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
/*
* Search for the requested value. We special case the search
- * for ZPOOL_CONFIG_PATH when it's a wholedisk. Otherwise,
- * all other searches are simple string compares.
+ * for ZPOOL_CONFIG_PATH when it's a wholedisk and when
+ * looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+ * Otherwise, all other searches are simple string compares.
*/
if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) {
uint64_t wholedisk = 0;
@@ -1437,6 +1439,52 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
return (nv);
break;
}
+ } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+ char *type, *idx, *end, *p;
+ uint64_t id, vdev_id;
+
+ /*
+ * Determine our vdev type, keeping in mind
+ * that the srchval is composed of a type and
+ * vdev id pair (i.e. mirror-4).
+ */
+ if ((type = strdup(srchval)) == NULL)
+ return (NULL);
+
+ if ((p = strrchr(type, '-')) == NULL) {
+ free(type);
+ break;
+ }
+ idx = p + 1;
+ *p = '\0';
+
+ /*
+ * If the types don't match then keep looking.
+ */
+ if (strncmp(val, type, strlen(val)) != 0) {
+ free(type);
+ break;
+ }
+
+ verify(strncmp(type, VDEV_TYPE_RAIDZ,
+ strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(type, VDEV_TYPE_MIRROR,
+ strlen(VDEV_TYPE_MIRROR)) == 0);
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &id) == 0);
+
+ errno = 0;
+ vdev_id = strtoull(idx, &end, 10);
+
+ free(type);
+ if (errno != 0)
+ return (NULL);
+
+ /*
+ * Now verify that we have the correct vdev id.
+ */
+ if (vdev_id == id)
+ return (nv);
}
/*
@@ -1522,6 +1570,18 @@ zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
return (ret);
}
+/*
+ * Determine if we have an "interior" top-level vdev (i.e. mirror/raidz).
+ */
+boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+ if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
@@ -1536,6 +1596,8 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
guid = strtoull(path, &end, 10);
if (guid != 0 && *end == '\0') {
verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+ } else if (zpool_vdev_is_interior(path)) {
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
} else if (path[0] != '/') {
(void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
@@ -2038,7 +2100,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
- if ((newname = zpool_vdev_name(NULL, NULL, child[0])) == NULL)
+ if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
return (-1);
/*
@@ -2235,24 +2297,34 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare, l2cache;
+ boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl;
+ uint64_t version;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
- NULL)) == 0)
+ &islog)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
- if (!avail_spare && !l2cache) {
+ /*
+ * XXX - this should just go away.
+ */
+ if (!avail_spare && !l2cache && !islog) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "only inactive hot spares or cache devices "
- "can be removed"));
+ "only inactive hot spares, cache, top-level, "
+ "or log devices can be removed"));
return (zfs_error(hdl, EZFS_NODEVICE, msg));
}
+ version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+ if (islog && version < SPA_VERSION_HOLES) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgrade to support log removal"));
+ return (zfs_error(hdl, EZFS_BADVERSION, msg));
+ }
+
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
@@ -2420,7 +2492,8 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
* of these checks.
*/
char *
-zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
+zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
+ boolean_t verbose)
{
char *path, *devid;
uint64_t value;
@@ -2499,6 +2572,20 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
(u_longlong_t)value);
path = buf;
}
+
+ /*
+ * We identify each top-level vdev by using a <type-id>
+ * naming convention.
+ */
+ if (verbose) {
+ uint64_t id;
+
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &id) == 0);
+ (void) snprintf(buf, sizeof (buf), "%s-%llu", path,
+ (u_longlong_t)id);
+ path = buf;
+ }
}
return (zfs_strdup(hdl, path));
@@ -3036,6 +3123,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(type, VDEV_TYPE_FILE) == 0 ||
strcmp(type, VDEV_TYPE_LOG) == 0 ||
+ strcmp(type, VDEV_TYPE_HOLE) == 0 ||
strcmp(type, VDEV_TYPE_MISSING) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"vdev type '%s' is not supported"), type);
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 77556ac5d7..3ebde10240 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -57,12 +57,13 @@ int metaslab_df_free_pct = 30;
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
@@ -126,6 +127,32 @@ metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
mg->mg_class = NULL;
}
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
+}
+
/*
* ==========================================================================
* Metaslab groups
@@ -634,6 +661,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_tx_t *tx;
int t;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
/*
@@ -721,6 +750,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_t *vd = mg->mg_vd;
int t;
+ ASSERT(!vd->vdev_ishole);
+
mutex_enter(&msp->ms_lock);
/*
@@ -932,10 +963,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (flags & METASLAB_HINTBP_AVOID)
- mg = vd->vdev_mg->mg_next;
- else
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists (i.e. removed). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 53e1ac0f4a..f503592396 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -42,6 +42,7 @@
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -578,8 +579,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t];
@@ -1101,26 +1102,23 @@ spa_check_removed(vdev_t *vd)
* that the label does not contain the most up-to-date information.
*/
void
-spa_load_log_state(spa_t *spa)
+spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *ovd, *rvd = spa->spa_root_vdev;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
-
- for (int c = 0; c < children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
+ /*
+ * Load the original root vdev tree from the passed config.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ if (cvd->vdev_islog)
+ vdev_load_log_state(cvd, ovd->vdev_child[c]);
}
- nvlist_free(nv);
+ vdev_free(ovd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
@@ -1151,7 +1149,7 @@ static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
int error = 0;
- nvlist_t *nvroot = NULL;
+ nvlist_t *nvconfig, *nvroot = NULL;
vdev_t *rvd;
uberblock_t *ub = &spa->spa_uberblock;
uint64_t config_cache_txg = spa->spa_config_txg;
@@ -1306,23 +1304,22 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
goto out;
}
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
if (!mosconfig) {
- nvlist_t *newconfig;
uint64_t hostid;
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(newconfig,
+ VERIFY(nvlist_lookup_string(nvconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
#ifdef _KERNEL
@@ -1347,12 +1344,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
}
}
- spa_config_set(spa, newconfig);
+ spa_config_set(spa, nvconfig);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
- return (spa_load(spa, newconfig, state, B_TRUE));
+ return (spa_load(spa, nvconfig, state, B_TRUE));
}
if (zap_lookup(spa->spa_meta_objset,
@@ -1471,7 +1468,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
}
- spa_load_log_state(spa);
+ VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_load_log_state(spa, nvroot);
+ nvlist_free(nvconfig);
if (spa_check_logs(spa)) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -2910,7 +2910,7 @@ spa_reset(char *pool)
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -2951,9 +2951,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* Transfer each new top-level vdev from vd to rvd.
*/
for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
@@ -3136,6 +3146,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
tvd = newvd->vdev_top;
@@ -3444,16 +3455,127 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
+/*
+ * Initial phase of device removal - stop future allocations from this device.
+ */
+void
+spa_vdev_remove_start(spa_t *spa, vdev_t *vd)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * Remove our vdev from the allocatable vdevs
+ */
+ if (mg)
+ metaslab_class_remove(mg->mg_class, mg);
+}
+
+/*
+ * Evacuate the device.
+ */
+int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+ uint64_t txg;
+ int error;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer
+ * since we need to do I/O but we do keep the
+ * spa_namespace_lock held. Once this completes the device
+ * should no longer have any blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ /*
+ * Evacuate the device.
+ */
+ if (error = dmu_objset_find(spa_name(spa),
+ zil_vdev_offline, NULL, DS_FIND_CHILDREN)) {
+ uint64_t txg;
+
+ txg = spa_vdev_config_enter(spa);
+ metaslab_class_add(spa->spa_log_class,
+ vd->vdev_mg);
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Remove any remaining MOS metadata associated with the device.
+ */
+ txg = spa_vdev_config_enter(spa);
+ vd->vdev_removing = B_TRUE;
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_config_dirty(vd);
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+void
+spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ uint64_t id = vd->vdev_id;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ vdev_free(vd);
+
+ /*
+ * It's possible that another thread is trying to do a spa_vdev_add()
+ * at the same time we're trying to remove it. As a result the
+ * added vdev may not have initialized its metaslabs yet.
+ */
+ if (mg != NULL)
+ metaslab_group_destroy(mg);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
* Remove a device from the pool. Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
nvlist_t **spares, **l2cache, *nv;
- uint_t nspares, nl2cache;
uint64_t txg = 0;
+ uint_t nspares, nl2cache;
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
@@ -3489,6 +3611,29 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+
+ /*
+ * XXX - Once we have bp-rewrite this should
+ * become the common case.
+ */
+
+ /*
+ * 1. Stop allocations
+ * 2. Evacuate the device (i.e. kill off stubby and
+ * metadata) and wait for it to complete (i.e. sync).
+ * 3. Cleanup the vdev namespace.
+ */
+ spa_vdev_remove_start(spa, vd);
+
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
+ return (error);
+ txg = spa_vdev_config_enter(spa);
+
+ spa_vdev_remove_done(spa, vd);
+
} else if (vd != NULL) {
/*
* Normal vdevs cannot be removed (yet).
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index b2063bba13..d611e0aa9b 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -383,6 +383,13 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vd = vd->vdev_top; /* label contains top config */
}
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace as older pools will
+ * just ignore it.
+ */
+ vdev_top_config_generate(spa, config);
+
nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 9384db4ae9..38474c194d 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -836,6 +836,18 @@ uint64_t
spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
@@ -843,14 +855,14 @@ spa_vdev_enter(spa_t *spa)
}
/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
*/
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
@@ -870,9 +882,23 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
config_changed = B_TRUE;
}
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa->spa_normal_class) == 0);
+ ASSERT(metaslab_class_validate(spa->spa_log_class) == 0);
+
spa_config_exit(spa, SCL_ALL, spa);
/*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag);
+
+ /*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
@@ -892,7 +918,18 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
if (config_changed)
spa_config_sync(spa, B_FALSE, B_TRUE);
+}
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions
+ * have synced to disk, and then update the global configuration cache with
+ * the new information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
return (error);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 5d3e11c971..78a5f94952 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -57,10 +57,12 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index d67dea7e97..bdf9559631 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -37,6 +37,7 @@ extern "C" {
#endif
struct metaslab_class {
+ spa_t *mc_spa;
metaslab_group_t *mc_rotor;
uint64_t mc_allocated;
space_map_ops_t *mc_ops;
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index bccee25da9..b4165b24c8 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -430,6 +430,9 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index a76cecb4b2..ecf6c2fe17 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -122,6 +122,7 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 23780430df..bb2f98c33e 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -129,6 +129,7 @@ struct vdev {
boolean_t vdev_expanding; /* expand the vdev? */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
@@ -143,10 +144,12 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
+ boolean_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
/*
* Leaf vdev state.
@@ -248,6 +251,8 @@ typedef struct vdev_label {
/*
* Allocate or free a vdev
*/
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
@@ -264,7 +269,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -280,6 +285,7 @@ extern vdev_ops_t vdev_raidz_ops;
extern vdev_ops_t vdev_disk_ops;
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index b7a2f57cbc..37615ba35f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -117,6 +117,7 @@ typedef struct zinject_record {
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
} zinject_record_t;
#define ZINJECT_NULL 0x1
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index a85a1cdfcb..305c697697 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -442,6 +442,7 @@ extern int zio_inject_fault(char *name, int flags, int *id,
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 415cd4a9e9..9c8aa43425 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = {
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -281,7 +282,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -293,7 +294,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
@@ -318,6 +319,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -397,6 +399,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -472,6 +477,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the txg in which this vdev was created, if the config has one.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -705,6 +716,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -772,9 +784,14 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -1105,6 +1122,12 @@ vdev_open(vdev_t *vd)
vd->vdev_state = VDEV_STATE_HEALTHY;
}
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1393,6 +1416,7 @@ void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
if (flags & VDD_METASLAB)
@@ -1502,7 +1526,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole)
return;
if (vd->vdev_ops->vdev_op_leaf) {
@@ -1592,6 +1616,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1619,6 +1645,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1755,7 +1783,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1812,10 +1840,48 @@ vdev_validate_aux(vdev_t *vd)
}
/*
+ * Free the MOS objects that back this vdev's space accounting, all within
+ * a single transaction assigned to txg:
+ *   - the DTL space map object (vdev_dtl_smo), if one exists;
+ *   - the space map object of every instantiated metaslab in vdev_ms[];
+ *   - the metaslab array object (vdev_ms_array), after which
+ *     vdev_ms_shift is reset to 0 as well.
+ * Each freed object number is zeroed in the in-core structure. The space
+ * maps are asserted to hold no allocated space (smo_alloc == 0) before
+ * they are freed, and the return values of dmu_object_free() are
+ * deliberately discarded. vdev_sync() calls this when vdev_removing is
+ * set on the vdev.
+ */
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ ASSERT(!vd->vdev_ishole);
+
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
}
@@ -1828,6 +1894,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1838,6 +1906,9 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ if (vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -2110,7 +2181,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2139,7 +2218,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
}
boolean_t
@@ -2391,7 +2470,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
* Don't count non-normal (e.g. intent log) space as part of
* the pool's capacity.
*/
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+ if (vd->vdev_islog)
return;
mutex_enter(&rvd->vdev_stat_lock);
@@ -2472,7 +2551,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2546,6 +2626,12 @@ vdev_propagate_state(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2739,32 +2825,31 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (int c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* the txg threads have not been started yet.
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ vdev_reopen(nvd->vdev_top);
}
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 06cb720128..87adc01622 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -287,6 +287,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
vdev_get_stats(vd, &vs);
@@ -298,6 +302,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
nvlist_t **child;
int c;
+ ASSERT(!vd->vdev_ishole);
+
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
@@ -329,11 +335,45 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. The child indices of all
+ * top-level vdevs that are holes are recorded in ZPOOL_CONFIG_HOLE_ARRAY
+ * (the array is added even when there are no holes, in which case it is
+ * empty). Additionally, add the number of top-level children that
+ * currently exist as ZPOOL_CONFIG_VDEV_CHILDREN.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t idx;
+
+ /* Worst case every top-level vdev is a hole, so size for all of them. */
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ idx = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ /* idx is the number of hole entries filled in by the loop above. */
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ /* The nvlist holds its own copy of the data; free the scratch array. */
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
@@ -516,6 +556,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
crtxg, reason)) != 0)
return (error);
+ /* Track the txg in which this vdev was created */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
@@ -976,6 +1019,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index 731f7d3dce..e1bf7d86a3 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
+ *psize = 0;
+ *ashift = 0;
return (0);
}
@@ -83,3 +83,14 @@ vdev_ops_t vdev_missing_ops = {
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
+
/*
+ * Operations vector for vdevs of type VDEV_TYPE_HOLE. It reuses
+ * vdev_missing_open, vdev_missing_close, vdev_missing_io_start and
+ * vdev_missing_io_done, together with vdev_default_asize, and is marked
+ * as a leaf vdev.
+ * NOTE(review): the NULL entry is presumably an optional callback slot in
+ * vdev_ops_t that holes do not need -- confirm against the structure
+ * definition.
+ */
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index f8e6880c90..c5ff55243a 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -96,6 +96,30 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
}
/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ *
+ * Walks inject_handlers with inject_lock held as reader. Handlers that
+ * belong to a different spa are skipped; for the rest, tag is compared
+ * against the handler's zi_record.zi_func with strcmp(), and the first
+ * exact match calls panic() with the tag in the message. If no handler
+ * matches, the lock is dropped and the function simply returns.
+ *
+ * NOTE(review): panic() is called with inject_lock still held; this is
+ * presumably harmless because panic() does not return -- confirm.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
*/
@@ -126,8 +150,9 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_spa != handler->zi_spa)
continue;
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
+ /* Ignore device errors and panic injection */
+ if (handler->zi_record.zi_guid != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/* If this handler matches, return EIO */
@@ -170,8 +195,9 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults */
- if (handler->zi_record.zi_start == 0)
+ /* Ignore device only faults or panic injection */
+ if (handler->zi_record.zi_start == 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/*
@@ -205,8 +231,9 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore label specific faults */
- if (handler->zi_record.zi_start != 0)
+ /* Ignore label specific faults or panic injection */
+ if (handler->zi_record.zi_start != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index b88fb5419a..de0d67176e 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -295,14 +295,15 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_16 16ULL
#define SPA_VERSION_17 17ULL
#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_18
-#define SPA_VERSION_STRING "18"
+#define SPA_VERSION SPA_VERSION_19
+#define SPA_VERSION_STRING "19"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -342,6 +343,7 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -401,6 +403,9 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
@@ -422,6 +427,7 @@ typedef enum zfs_cache_type {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"