From 1195e687f1c03c8d57417b5999578922e20a3554 Mon Sep 17 00:00:00 2001 From: Mark J Musante Date: Mon, 4 Jan 2010 17:24:41 -0500 Subject: PSARC/2009/511 zpool split 5097228 provide 'zpool split' to create new pool by breaking all mirrors 6880831 memory leak in zpool add 6891438 zfs_ioc_userspace_upgrade could reference uninitialised error variable 6891441 zvol_create_minor sets local variable zv but never references it 6891442 spa_import() sets local variable spa but never references it 6895446 vdevs left open after removing slogs or offlining device/file --- usr/src/cmd/mdb/common/modules/zfs/zfs.c | 8 +- usr/src/cmd/zinject/zinject.c | 8 +- usr/src/cmd/zpool/zpool_main.c | 157 +++++- usr/src/cmd/zpool/zpool_util.h | 4 +- usr/src/cmd/zpool/zpool_vdev.c | 48 +- usr/src/cmd/ztest/ztest.c | 174 +++++- usr/src/lib/libzfs/common/libzfs.h | 11 + usr/src/lib/libzfs/common/libzfs_pool.c | 265 ++++++++- usr/src/lib/libzfs/common/libzfs_util.c | 5 +- usr/src/lib/libzfs/common/mapfile-vers | 3 +- usr/src/uts/common/fs/zfs/spa.c | 868 ++++++++++++++++++++++-------- usr/src/uts/common/fs/zfs/spa_config.c | 21 +- usr/src/uts/common/fs/zfs/spa_misc.c | 22 +- usr/src/uts/common/fs/zfs/sys/spa.h | 11 +- usr/src/uts/common/fs/zfs/sys/spa_impl.h | 4 +- usr/src/uts/common/fs/zfs/sys/vdev.h | 7 +- usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 5 +- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 4 +- usr/src/uts/common/fs/zfs/sys/zio.h | 4 +- usr/src/uts/common/fs/zfs/vdev.c | 89 +-- usr/src/uts/common/fs/zfs/vdev_label.c | 18 +- usr/src/uts/common/fs/zfs/zfs_ioctl.c | 45 +- usr/src/uts/common/fs/zfs/zio_inject.c | 7 +- usr/src/uts/common/fs/zfs/zvol.c | 4 +- usr/src/uts/common/sys/fs/zfs.h | 13 +- 25 files changed, 1479 insertions(+), 326 deletions(-) (limited to 'usr/src') diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index df1fb44332..55c2b7133f 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1040,6 +1040,12 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int stats, case VDEV_AUX_BAD_LOG: aux = "BAD_LOG"; break; + case VDEV_AUX_EXTERNAL: + aux = "EXTERNAL"; + break; + case VDEV_AUX_SPLIT_POOL: + aux = "SPLIT_POOL"; + break; default: aux = "UNKNOWN"; break; diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c index b892c418a8..caa1b75a6a 100644 --- a/usr/src/cmd/zinject/zinject.c +++ b/usr/src/cmd/zinject/zinject.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -855,14 +855,16 @@ main(int argc, char **argv) return (2); } - if (argc != 1) { + if (argc < 1 || argc > 2) { (void) fprintf(stderr, "panic (-p) injection requires " - "a single pool name\n"); + "a single pool name and an optional id\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); + if (argv[1] != NULL) + record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_duration != 0) { if (nowrites == 0) { diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index e7a9dfb3f4..6ded428fe3 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -69,6 +69,7 @@ static int zpool_do_clear(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); static int zpool_do_scrub(int, char **); @@ -121,7 +122,8 @@ typedef enum { HELP_STATUS, HELP_UPGRADE, HELP_GET, - HELP_SET + HELP_SET, + HELP_SPLIT } zpool_help_t; @@ -158,6 +160,7 @@ static zpool_command_t command_table[] = { { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { NULL }, @@ -235,6 +238,10 @@ get_usage(zpool_help_t idx) { " ...\n")); case HELP_SET: return (gettext("\tset \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] " + "[ ...]\n")); } abort(); @@ -1132,6 +1139,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("external device fault")); break; + case VDEV_AUX_SPLIT_POOL: + (void) printf(gettext("split into new pool")); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -1622,7 +1633,7 @@ zpool_do_import(int argc, char **argv) char *cachefile = NULL; /* check options */ - while ((c = getopt(argc, argv, ":aCc:d:DEfFno:p:rR:VX")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -2696,6 +2707,146 @@ zpool_do_detach(int argc, char **argv) return (ret); } +/* + * zpool split [-n] [-o prop=val] ... + * [-o mntopt] ... + * [-R altroot] [ ...] + * + * -n Do not split the pool, but display the resulting layout if + * it were to be split. + * -o Set property=value, or set mount options. + * -R Mount the split-off pool under an alternate root. + * + * Splits the named pool and gives it the new pool name. Devices to be split + * off may be listed, provided that no more than one device is specified + * per top-level vdev mirror. The newly split pool is left in an exported + * state unless -R is specified. + * + * Restrictions: the top-level of the pool pool must only be made up of + * mirrors; all devices in the pool must be healthy; no device may be + * undergoing a resilvering operation. + */ +int +zpool_do_split(int argc, char **argv) +{ + char *srcpool, *newpool, *propval; + char *mntopts = NULL; + splitflags_t flags; + int c, ret = 0; + zpool_handle_t *zhp; + nvlist_t *config, *props = NULL; + + flags.dryrun = B_FALSE; + flags.import = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, ":R:no:")) != -1) { + switch (c) { + case 'R': + flags.import = B_TRUE; + if (add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + } else { + mntopts = optarg; + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + break; + } + } + + if (!flags.import && mntopts != NULL) { + (void) fprintf(stderr, gettext("setting mntopts is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("Missing new pool name\n")); + usage(B_FALSE); + } + + srcpool = argv[0]; + newpool = argv[1]; + + argc -= 2; + argv += 2; + + if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) + return (1); + + config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); + if (config == NULL) { + ret = 1; + } else { + if (flags.dryrun) { + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), newpool); + print_vdev_tree(NULL, newpool, config, 0, B_FALSE); + } + nvlist_free(config); + } + + zpool_close(zhp); + + if (ret != 0 || flags.dryrun || !flags.import) + return (ret); + + /* + * The split was successful. Now we need to open the new + * pool and import it. + */ + if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) + return (1); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + ret = 1; + (void) fprintf(stderr, gettext("Split was succssful, but " + "the datasets could not all be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } + zpool_close(zhp); + + return (ret); +} + + + /* * zpool online ... */ diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h index c86b2e7405..a18b8b705f 100644 --- a/usr/src/cmd/zpool/zpool_util.h +++ b/usr/src/cmd/zpool/zpool_util.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,6 +46,8 @@ uint_t num_logs(nvlist_t *nv); nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, boolean_t isreplace, boolean_t dryrun, int argc, char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); /* * Pool list functions diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c index 621519197d..3c725d232c 100644 --- a/usr/src/cmd/zpool/zpool_vdev.c +++ b/usr/src/cmd/zpool/zpool_vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1360,6 +1360,52 @@ construct_spec(int argc, char **argv) return (nvroot); } +nvlist_t * +split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, + splitflags_t flags, int argc, char **argv) +{ + nvlist_t *newroot = NULL, **child; + uint_t c, children; + + if (argc > 0) { + if ((newroot = construct_spec(argc, argv)) == NULL) { + (void) fprintf(stderr, gettext("Unable to build a " + "pool from the specified devices\n")); + return (NULL); + } + + if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* avoid any tricks in the spec */ + verify(nvlist_lookup_nvlist_array(newroot, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + for (c = 0; c < children; c++) { + char *path; + const char *type; + int min, max; + + verify(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &path) == 0); + if ((type = is_grouping(path, &min, &max)) != NULL) { + (void) fprintf(stderr, gettext("Cannot use " + "'%s' as a device for splitting\n"), type); + nvlist_free(newroot); + return (NULL); + } + } + } + + if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { + if (newroot != NULL) + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} /* * Get and validate the contents of the given vdev specification. This ensures diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index 63b858b4fb..ed36d90279 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -125,9 +125,9 @@ static int zopt_verbose = 0; static int zopt_init = 1; static char *zopt_dir = "/tmp"; static uint64_t zopt_time = 300; /* 5 minutes */ -static int zopt_maxfaults; #define BT_MAGIC 0x123456789abcdefULL +#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -251,6 +251,7 @@ ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; +ztest_func_t ztest_split_pool; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -265,6 +266,7 @@ ztest_info_t ztest_info[] = { { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, + { ztest_split_pool, 1, &zopt_always }, { ztest_zil_commit, 1, &zopt_incessant }, { ztest_dmu_read_write_zcopy, 1, &zopt_often }, { ztest_dmu_objset_create_destroy, 1, &zopt_often }, @@ -318,6 +320,8 @@ typedef struct ztest_shared { mutex_t zs_vdev_lock; rwlock_t zs_name_lock; ztest_info_t zs_info[ZTEST_FUNCS]; + uint64_t zs_splits; + uint64_t zs_mirrors; ztest_ds_t zs_zd[]; } ztest_shared_t; @@ -592,7 +596,6 @@ process_options(int argc, char **argv) zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs : UINT64_MAX >> 2); - zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1; } static void @@ -2134,12 +2137,13 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = zs->zs_spa; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2177,7 +2181,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) * Make 1/4 of the devices be log devices. */ nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1); + ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -2273,6 +2277,99 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); } +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + /* ensure we have a useable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || zopt_raidz > 1) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 0; + VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) rw_wrlock(&zs->zs_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) rw_unlock(&zs->zs_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + +} + /* * Verify that we can attach and detach devices. */ @@ -2286,7 +2383,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; @@ -2299,6 +2396,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int error, expected_error; VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2315,15 +2413,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) /* * Pick a random leaf within it. */ - leaf = ztest_random(leaves); + leaf = ztest_random(leaves) + zs->zs_splits; /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; - if (zopt_mirrors >= 1) { + if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); - ASSERT(oldvd->vdev_children >= zopt_mirrors); + ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / zopt_raidz]; } if (zopt_raidz > 1) { @@ -4268,7 +4366,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) spa_t *spa = zs->zs_spa; int fd; uint64_t offset; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t bad = 0x1990c0ffeedecade; uint64_t top, leaf; char path0[MAXPATHLEN]; @@ -4276,11 +4374,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; - int maxfaults = zopt_maxfaults; + int maxfaults; + int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + maxfaults = MAXFAULTS(); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + mirror_save = zs->zs_mirrors; + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + ASSERT(leaves >= 1); /* @@ -4293,7 +4398,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); - leaf = ztest_random(leaves); + leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, @@ -4302,7 +4407,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, sizeof (path0), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + 0); + zopt_dir, zopt_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf); @@ -4405,13 +4510,22 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (offset >= fsize) continue; - if (zopt_verbose >= 7) - (void) printf("injecting bad word into %s," - " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + if (mirror_save != zs->zs_mirrors) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + (void) close(fd); + return; + } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); + + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + + if (zopt_verbose >= 7) + (void) printf("injected bad word into %s," + " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); @@ -4992,7 +5106,7 @@ ztest_run(ztest_shared_t *zs) * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ - if (zopt_maxfaults == 0) + if (MAXFAULTS() == 0) spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; else spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; @@ -5202,6 +5316,23 @@ print_time(hrtime_t t, char *timebuf) (void) sprintf(timebuf, "%llus", s); } +static nvlist_t * +make_random_props() +{ + nvlist_t *props; + + if (ztest_random(2) == 0) + return (NULL); + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); + + (void) printf("props:\n"); + dump_nvlist(props, 4); + + return (props); +} + /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. @@ -5210,7 +5341,7 @@ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; - nvlist_t *nvroot; + nvlist_t *nvroot, *props; VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); @@ -5222,9 +5353,12 @@ ztest_init(ztest_shared_t *zs) */ (void) spa_destroy(zs->zs_pool); ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = zopt_mirrors; nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - 0, zopt_raidz, zopt_mirrors, 1); - VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + 0, zopt_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index 03870019d5..0a4c734dbe 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -120,6 +120,7 @@ enum { EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ + EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_UNKNOWN }; @@ -214,6 +215,14 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, extern int zpool_destroy(zpool_handle_t *); extern int zpool_add(zpool_handle_t *, nvlist_t *); +typedef struct splitflags { + /* do not split, but return the config that would be split off */ + int dryrun : 1; + + /* after splitting, import the pool */ + int import : 1; +} splitflags_t; + /* * Functions to manipulate pool and vdev state */ @@ -227,6 +236,8 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, + splitflags_t); extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index 0b28814a50..3c0f46815b 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -84,6 +84,7 @@ const char *hist_event_table[LOG_END] = { "pool scrub done", "user hold", "user release", + "pool split", }; static int read_efi_label(nvlist_t *config, diskaddr_t *sb); @@ -231,6 +232,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); + else if (aux == VDEV_AUX_SPLIT_POOL) + return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: @@ -2074,8 +2077,15 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { + if (errno == EINVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " + "from this pool into a new one. Use '%s' " + "instead"), "zpool detach"); + return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); + } return (zpool_standard_error(hdl, errno, msg)); + } *newstate = zc.zc_cookie; return (0); @@ -2467,6 +2477,257 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) return (-1); } +/* + * Find a mirror vdev in the source nvlist. + * + * The mchild array contains a list of disks in one of the top-level mirrors + * of the source pool. The schild array contains a list of disks that the + * user specified on the command line. We loop over the mchild array to + * see if any entry in the schild array matches. + * + * If a disk in the mchild array is found in the schild array, we return + * the index of that entry. Otherwise we return -1. + */ +static int +find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, + nvlist_t **schild, uint_t schildren) +{ + uint_t mc; + + for (mc = 0; mc < mchildren; mc++) { + uint_t sc; + char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, + mchild[mc], B_FALSE); + + for (sc = 0; sc < schildren; sc++) { + char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, + schild[sc], B_FALSE); + boolean_t result = (strcmp(mpath, spath) == 0); + + free(spath); + if (result) { + free(mpath); + return (mc); + } + } + + free(mpath); + } + + return (-1); +} + +/* + * Split a mirror pool. If newroot points to null, then a new nvlist + * is generated and it is the responsibility of the caller to free it. + */ +int +zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, + nvlist_t *props, splitflags_t flags) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; + nvlist_t **varray = NULL, *zc_props = NULL; + uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t vers; + boolean_t freelist = B_FALSE, memory_err = B_TRUE; + int retval = 0; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); + + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("Internal error: unable to " + "retrieve pool configuration\n")); + return (-1); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) + == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); + + if (props) { + if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, + props, vers, B_TRUE, msg)) == NULL) + return (-1); + } + + if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool is missing vdev tree")); + if (zc_props) + nvlist_free(zc_props); + return (-1); + } + + varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); + vcount = 0; + + if (*newroot == NULL || + nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, + &newchild, &newchildren) != 0) + newchildren = 0; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE, is_hole = B_FALSE; + char *type; + nvlist_t **mchild, *vdev; + uint_t mchildren; + int entry; + + /* + * Unlike cache & spares, slogs are stored in the + * ZPOOL_CONFIG_CHILDREN array. We filter them out here. + */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + if (is_log || is_hole) { + /* + * Create a hole vdev and put it in the config. + */ + if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) + goto out; + if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0) + goto out; + if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, + 1) != 0) + goto out; + if (lastlog == 0) + lastlog = vcount; + varray[vcount++] = vdev; + continue; + } + lastlog = 0; + verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) + == 0); + if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool must be composed only of mirrors\n")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + verify(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + + /* find or add an entry for this top-level vdev */ + if (newchildren > 0 && + (entry = find_vdev_entry(zhp, mchild, mchildren, + newchild, newchildren)) >= 0) { + /* We found a disk that the user specified. */ + vdev = mchild[entry]; + ++found; + } else { + /* User didn't specify a disk for this vdev. */ + vdev = mchild[mchildren - 1]; + } + + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + } + + /* did we find every disk the user specified? */ + if (found != newchildren) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " + "include at most one disk from each mirror")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + /* Prepare the nvlist for populating. */ + if (*newroot == NULL) { + if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) + goto out; + freelist = B_TRUE; + if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0) + goto out; + } else { + verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); + } + + /* Add all the children we found */ + if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, + lastlog == 0 ? vcount : lastlog) != 0) + goto out; + + /* + * If we're just doing a dry run, exit now with success. + */ + if (flags.dryrun) { + memory_err = B_FALSE; + freelist = B_FALSE; + goto out; + } + + /* now build up the config list & call the ioctl */ + if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) + goto out; + + if (nvlist_add_nvlist(newconfig, + ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || + nvlist_add_string(newconfig, + ZPOOL_CONFIG_POOL_NAME, newname) != 0 || + nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) + goto out; + + /* + * The new pool is automatically part of the namespace unless we + * explicitly export it. + */ + if (!flags.import) + zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); + if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) + goto out; + if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto out; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { + retval = zpool_standard_error(hdl, errno, msg); + goto out; + } + + freelist = B_FALSE; + memory_err = B_FALSE; + +out: + if (varray != NULL) { + int v; + + for (v = 0; v < vcount; v++) + nvlist_free(varray[v]); + free(varray); + } + zcmd_free_nvlists(&zc); + if (zc_props) + nvlist_free(zc_props); + if (newconfig) + nvlist_free(newconfig); + if (freelist) { + nvlist_free(*newroot); + *newroot = NULL; + } + + if (retval != 0) + return (retval); + + if (memory_err) + return (no_memory(hdl)); + + return (0); +} + /* * Remove the given device. Currently, this is supported only for hot spares * and level 2 cache devices. diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index f7beab37dd..a400dc9c1e 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -220,6 +220,9 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); + case EZFS_POSTSPLIT_ONLINE: + return (dgettext(TEXT_DOMAIN, "disk was split from this pool " + "into a new one")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers index d3006b09c7..88fb30fb44 100644 --- a/usr/src/lib/libzfs/common/mapfile-vers +++ b/usr/src/lib/libzfs/common/mapfile-vers @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -216,6 +216,7 @@ SUNWprivate_1.1 { zpool_vdev_offline; zpool_vdev_online; zpool_vdev_remove; + zpool_vdev_split; zprop_free_list; zprop_get_list; zprop_iter; diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 4c73d6aa1e..d1e770eb08 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -113,6 +113,9 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport); uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ id_t zio_taskq_psrset_bind = PS_NONE; @@ -1313,7 +1316,7 @@ spa_check_logs(spa_t *spa) case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { - spa->spa_log_state = SPA_LOG_MISSING; + spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; @@ -1321,6 +1324,64 @@ spa_check_logs(spa_t *spa) return (0); } +static boolean_t +spa_passivate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + if (!spa_has_slogs(spa)) + return (B_FALSE); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } + } + + return (slog_found); +} + +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); + } +} + +int +spa_offline_log(spa_t *spa) +{ + int error = 0; + + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { + + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} + static void spa_aux_check_removed(spa_aux_vdev_t *sav) { @@ -1423,24 +1484,179 @@ spa_load_verify(spa_t *spa) return (verify_ok ? 0 : EIO); } +/* + * Find a value in the pool props object. + */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. + */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +{ + return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val)); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (err); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, and rejoined to the original pool. + */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. + */ + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); + } + + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, + boolean_t mosconfig) +{ + nvlist_t *config = spa->spa_config; + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + uint64_t pool_guid; + nvlist_t *nvl; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) + return (EINVAL); + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + } else { + spa->spa_load_guid = pool_guid; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, + &nvl) == 0) { + VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, + KM_SLEEP) == 0); + } + + error = spa_load_impl(spa, pool_guid, config, state, type, + mosconfig, &ereport); + } + + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error && error != EBADF) + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ static int -spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) +spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport) { int error = 0; nvlist_t *nvconfig, *nvroot = NULL; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - uint64_t autoreplace = 0; int orig_mode = spa->spa_mode; - char *ereport = FM_EREPORT_ZFS_POOL; - nvlist_t *config = spa->spa_config; + int parse; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1453,29 +1669,11 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) spa->spa_load_state = state; - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; - } - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = SPA_VERSION_INITIAL; - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; - } + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) + return (EINVAL); - spa->spa_load_guid = pool_guid; + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); /* * Create "The Godfather" zio to hold all async IOs @@ -1489,15 +1687,17 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) - goto out; + return (error); ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } /* * Try to open all vdevs, loading each label in the process. @@ -1506,26 +1706,31 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) error = vdev_open(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) - goto out; + return (error); /* * We need to validate the vdev labels against the configuration that * we have in hand, which is dependent on the setting of mosconfig. If * mosconfig is true then we're validating the vdev labels based on - * that config. Otherwise, we're validating against the cached config + * that config. Otherwise, we're validating against the cached config * (zpool.cache) that was read when we loaded the zfs module, and then * later we will recursively call spa_load() and validate against * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; + if (error != 0) + return (error); + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); } /* @@ -1536,32 +1741,29 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; - } + if (ub->ub_txg == 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); /* * If the pool is newer than the code, we can't open it. */ - if (ub->ub_version > SPA_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; - } + if (ub->ub_version > SPA_VERSION) + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; + if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; } /* @@ -1576,29 +1778,15 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) spa->spa_claim_max_txg = spa->spa_first_txg; error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { uint64_t hostid; @@ -1628,8 +1816,7 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) "See: http://www.sun.com/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); - error = EBADF; - goto out; + return (EBADF); } } @@ -1638,163 +1825,106 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) spa_deactivate(spa); spa_activate(spa, orig_mode); - return (spa_load(spa, state, B_TRUE)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, + &spa->spa_deferred_bplist_obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, - sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, - sizeof (uint64_t), 1, &spa->spa_errlog_scrub); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, - sizeof (uint64_t), 1, &spa->spa_history); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ /* * Load any hot spares for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_spares.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, - VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_l2cache.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); - - if (error && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), - sizeof (uint64_t), 1, &spa->spa_bootfs); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), - sizeof (uint64_t), 1, &autoreplace); + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + spa->spa_autoreplace = (autoreplace != 0); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_DELEGATION), - sizeof (uint64_t), 1, &spa->spa_delegation); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), - sizeof (uint64_t), 1, &spa->spa_failmode); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), - sizeof (uint64_t), 1, &spa->spa_autoexpand); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO), - sizeof (uint64_t), 1, &spa->spa_dedup_ditto); } /* @@ -1833,47 +1963,39 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) * Check the state of the root vdev. If it can't be opened, it * indicates one or more toplevel vdevs are faulted. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); - if (error != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_update_dspace(spa); if (state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; - } + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); } /* - * Load the intent log state and check log integrity. + * Load the intent log state and check log integrity. If we're + * assembling a pool from a split, the log is not transferred over. */ - VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - spa_load_log_state(spa, nvroot); - nvlist_free(nvconfig); - - if (spa_check_logs(spa)) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LOG); - error = ENXIO; - ereport = FM_EREPORT_ZFS_LOG_REPLAY; - goto out; + if (type != SPA_IMPORT_ASSEMBLE) { + VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + spa_load_log_state(spa, nvroot); + nvlist_free(nvconfig); + + if (spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } } if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || @@ -1900,7 +2022,7 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) spa->spa_claiming = B_FALSE; - spa->spa_log_state = SPA_LOG_GOOD; + spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1954,17 +2076,7 @@ spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } - error = 0; -out: - - spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - - spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; - spa->spa_ena = 0; - - return (error); + return (0); } static int @@ -1978,7 +2090,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) spa_activate(spa, spa_mode_global); spa_async_suspend(spa); - return (spa_load(spa, state, mosconfig)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } static int @@ -1992,12 +2104,13 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; - spa->spa_log_state = SPA_LOG_CLEAR; + spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; } - load_error = rewind_error = spa_load(spa, state, mosconfig); + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, + mosconfig); if (load_error == 0) return (0); @@ -2015,7 +2128,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, /* Price of rolling back is discarding txgs, including log */ if (state == SPA_LOAD_RECOVER) - spa->spa_log_state = SPA_LOG_CLEAR; + spa_set_log_state(spa, SPA_LOG_CLEAR); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE; @@ -3016,7 +3129,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { + if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } @@ -3183,7 +3296,7 @@ spa_tryimport(nvlist_t *tryconfig) * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ - error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -3700,6 +3813,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) boolean_t unspare = B_FALSE; uint64_t unspare_guid; size_t len; + char *vdpath; txg = spa_vdev_enter(spa); @@ -3866,6 +3980,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ + vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; @@ -3875,6 +3990,10 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) error = spa_vdev_exit(spa, vd, txg, 0); + spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + "vdev=%s", vdpath); + spa_strfree(vdpath); + /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare @@ -3901,6 +4020,281 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +/* + * Split a set of devices from their mirrors, and create a new pool from them. + */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + if (!spa_writeable(spa)) + return (EROFS); + + txg = spa_vdev_enter(spa); + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_offline_log(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || vd->vdev_ishole) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = EINVAL; + break; + } + } + + /* which disk is going to be split? */ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = EINVAL; + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = ENODEV; + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + vml[c]->vdev_ishole || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = EINVAL; + break; + } + + if (vdev_dtl_required(vml[c])) { + error = EBUSY; + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; + } + vdev_reopen(spa->spa_root_vdev); + + /* + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. + */ + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); + + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + + newspa = spa_add(newname, config, altroot); + mutex_enter(&newspa->spa_vdev_top_lock); + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); + + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); + + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); + + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + if (error) + goto out; + + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } + + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + spa_config_sync(newspa, B_FALSE, B_TRUE); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + mutex_exit(&newspa->spa_vdev_top_lock); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_internal_log(LOG_POOL_VDEV_DETACH, + spa, tx, CRED(), "vdev=%s", + vml[c]->vdev_path); + vdev_free(vml[c]); + } + } + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), + "split new pool %s from pool %s", newname, spa_name(spa)); + + kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); + +out: + mutex_exit(&newspa->spa_vdev_top_lock); + spa_unload(newspa); + spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, 0); + + kmem_free(vml, children * sizeof (vdev_t *)); + return (error); +} + static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index 176543c173..68a40bec89 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -335,6 +335,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; + uint64_t split_guid; if (vd == NULL) { vd = rvd; @@ -391,6 +392,14 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting) == 0); } /* @@ -400,6 +409,16 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) */ vdev_top_config_generate(spa, config); + /* + * If we're splitting, record the original pool's guid. + */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid) == 0); + } + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index cff272abda..5a48dc6093 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -498,6 +498,8 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + nvlist_free(spa->spa_config_splitting); + avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -909,7 +911,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) * transactionally. */ if (zio_injection_enabled) - zio_handle_panic_injection(spa, tag); + zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures @@ -1117,6 +1119,22 @@ spa_get_random(uint64_t range) return (r % range); } +uint64_t +spa_generate_guid(spa_t *spa) +{ + uint64_t guid = spa_get_random(-1ULL); + + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); + } + + return (guid); +} + void sprintf_blkptr(char *buf, const blkptr_t *bp) { diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index c0d5ded3a2..e338b0a0fc 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -401,6 +401,11 @@ typedef struct blkptr { (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ ARC_BUFC_METADATA : ARC_BUFC_DATA); +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; + /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, @@ -440,6 +445,8 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -537,6 +544,7 @@ typedef enum spa_log_state { extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_offline_log(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); @@ -579,6 +587,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); +extern uint64_t spa_generate_guid(spa_t *spa); extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index a5d21508a3..9df395bf55 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -113,6 +113,7 @@ struct spa { avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ @@ -212,6 +213,7 @@ struct spa { uint64_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 2f9dbef0e0..3bf5ba8042 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,6 +74,8 @@ extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); @@ -139,7 +141,8 @@ typedef enum { VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ + VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ + VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 20c828b9e9..f46c08c6fb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,6 +112,7 @@ struct vdev { uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ @@ -174,6 +175,7 @@ struct vdev { boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ uint8_t vdev_cant_read; /* vdev is failing all reads */ @@ -251,6 +253,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 /* * Allocate or free a vdev diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 103e5d3050..90eecb812f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -269,6 +269,8 @@ typedef struct zfs_useracct { #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + #ifdef _KERNEL typedef struct zfs_creat { diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index fb749f8f18..d201278b11 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -522,7 +522,7 @@ extern int zio_inject_fault(char *name, int flags, int *id, extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); -extern void zio_handle_panic_injection(spa_t *spa, char *tag); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 07aebf5347..48082c8bf9 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -300,15 +300,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -482,7 +479,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -494,6 +492,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (parent && !parent->vdev_parent) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? spa_log_class(spa) : spa_normal_class(spa), vd); @@ -778,6 +777,7 @@ vdev_remove_parent(vdev_t *cvd) */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } @@ -1282,7 +1282,7 @@ vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) @@ -1295,6 +1295,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1302,6 +1304,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1310,6 +1324,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1317,12 +1336,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1372,8 +1395,12 @@ vdev_close(vdev_t *vd) ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ if (pvd != NULL && pvd->vdev_reopening) - vd->vdev_reopening = pvd->vdev_reopening; + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); @@ -1406,7 +1433,8 @@ vdev_reopen(vdev_t *vd) ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - vd->vdev_reopening = B_TRUE; + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -2133,24 +2161,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline_log(spa_t *spa) -{ - int error = 0; - - if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN)) == 0) { - - /* - * We successfully offlined the log device, sync out the - * current txg so that the "stubby" block can be removed - * by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - } - return (error); -} - static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { @@ -2198,7 +2208,7 @@ top: metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); - error = vdev_offline_log(spa); + error = spa_offline_log(spa); spa_vdev_state_enter(spa, SCL_ALLOC); @@ -3035,3 +3045,22 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_config_dirty(vd); } } + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index e8f8e43e02..d11b3df7c6 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -354,6 +354,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (aux != NULL) VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux) == 0); + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid) == 0); + } } return (nv); @@ -590,7 +595,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Determine if the vdev is in use. */ - if (reason != VDEV_LABEL_REMOVE && + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); @@ -616,7 +621,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ if (reason == VDEV_LABEL_SPARE) return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && @@ -681,7 +687,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { - label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 5500b27e2d..14e479463d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1397,6 +1397,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) * l2cache and spare devices are ok to be added to a rootpool. */ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { + nvlist_free(config); spa_close(spa, FTAG); return (EDOM); } @@ -1501,6 +1502,41 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_vdev_split(zfs_cmd_t *zc) +{ + spa_t *spa; + nvlist_t *config, *props = NULL; + int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) { + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { @@ -3839,7 +3875,7 @@ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; - int error; + int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { @@ -4131,6 +4167,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); + nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, @@ -4324,7 +4361,9 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, B_TRUE }, { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE } + B_FALSE }, + { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE } }; int diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index e8f8f7b723..fa040ea4b3 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,7 +101,7 @@ zio_match_handler(zbookmark_t *zb, uint64_t type, * specified by tag. */ void -zio_handle_panic_injection(spa_t *spa, char *tag) +zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) { inject_handler_t *handler; @@ -113,7 +113,8 @@ zio_handle_panic_injection(spa_t *spa, char *tag) if (spa != handler->zi_spa) continue; - if (strcmp(tag, handler->zi_record.zi_func) == 0) + if (handler->zi_record.zi_type == type && + strcmp(tag, handler->zi_record.zi_func) == 0) panic("Panic requested in function %s\n", tag); } diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 3419d708c4..3733e495ec 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -456,7 +456,7 @@ zvol_create_minor(const char *name) mutex_enter(&zvol_state_lock); - if ((zv = zvol_minor_lookup(name)) != NULL) { + if (zvol_minor_lookup(name) != NULL) { mutex_exit(&zvol_state_lock); return (EEXIST); } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 800f790453..3889fd9b12 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -454,6 +454,10 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" +#define ZPOOL_CONFIG_SPLIT "splitcfg" +#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" +#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" +#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ @@ -539,7 +543,8 @@ typedef enum vdev_aux { VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ - VDEV_AUX_EXTERNAL /* external diagnosis */ + VDEV_AUX_EXTERNAL, /* external diagnosis */ + VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ } vdev_aux_t; /* @@ -707,7 +712,8 @@ typedef enum zfs_ioc { ZFS_IOC_HOLD, ZFS_IOC_RELEASE, ZFS_IOC_GET_HOLDS, - ZFS_IOC_OBJSET_RECVD_PROPS + ZFS_IOC_OBJSET_RECVD_PROPS, + ZFS_IOC_VDEV_SPLIT } zfs_ioc_t; /* @@ -825,6 +831,7 @@ typedef enum history_internal_events { LOG_POOL_SCRUB_DONE, LOG_DS_USER_HOLD, LOG_DS_USER_RELEASE, + LOG_POOL_SPLIT, LOG_END } history_internal_events_t; -- cgit v1.2.3