summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2018-04-26 11:49:22 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2018-04-26 11:49:22 +0000
commiteee119ebc2fe042c8bf9adde74d41245472ce616 (patch)
tree5ab74a6b2ffcee42faf13a7e6ce8917da8fba80a
parent40a63c2461142314f475a7498badf6596f950c2b (diff)
parent094e47e980b0796b94b1b8f51f462a64d246e516 (diff)
downloadillumos-joyent-eee119ebc2fe042c8bf9adde74d41245472ce616.tar.gz
[illumos-gate merge]
commit 094e47e980b0796b94b1b8f51f462a64d246e516 9102 zfs should be able to initialize storage devices commit 5850749aaf781e7f284cedc8429eb16adf367802 9501 bootadm update-archive -nvC does more ZFS work than required
-rw-r--r--usr/src/cmd/boot/bootadm/bootadm.c3
-rw-r--r--usr/src/cmd/truss/codes.c2
-rw-r--r--usr/src/cmd/zpool/zpool_main.c155
-rw-r--r--usr/src/cmd/ztest/ztest.c96
-rw-r--r--usr/src/lib/libzfs/common/libzfs.h5
-rw-r--r--usr/src/lib/libzfs/common/libzfs_pool.c94
-rw-r--r--usr/src/lib/libzfs/common/libzfs_util.c7
-rw-r--r--usr/src/lib/libzfs/common/mapfile-vers1
-rw-r--r--usr/src/lib/libzfs_core/common/libzfs_core.c37
-rw-r--r--usr/src/lib/libzfs_core/common/libzfs_core.h4
-rw-r--r--usr/src/lib/libzfs_core/common/mapfile-vers7
-rw-r--r--usr/src/lib/libzpool/common/llib-lzpool1
-rw-r--r--usr/src/man/man1m/zpool.1m31
-rw-r--r--usr/src/pkg/manifests/system-test-zfstest.mf39
-rw-r--r--usr/src/test/zfs-tests/include/commands.cfg1
-rw-r--r--usr/src/test/zfs-tests/runfiles/delphix.run13
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile21
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh31
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib43
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh68
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh78
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh66
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh74
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh64
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh60
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh52
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh63
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh74
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh59
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh88
-rw-r--r--usr/src/uts/common/Makefile.files1
-rw-r--r--usr/src/uts/common/fs/zfs/metaslab.c23
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c158
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c7
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab_impl.h10
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h31
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_initialize.h46
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_priority.h3
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c44
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c1
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c4
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_indirect.c1
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_initialize.c791
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c3
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_missing.c4
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c16
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c75
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_removal.c13
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_root.c3
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c80
-rw-r--r--usr/src/uts/common/sys/fs/zfs.h36
52 files changed, 2661 insertions, 28 deletions
diff --git a/usr/src/cmd/boot/bootadm/bootadm.c b/usr/src/cmd/boot/bootadm/bootadm.c
index b2bfc4e849..0444c5cda7 100644
--- a/usr/src/cmd/boot/bootadm/bootadm.c
+++ b/usr/src/cmd/boot/bootadm/bootadm.c
@@ -25,6 +25,7 @@
* Copyright (c) 2015 by Delphix. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
/*
@@ -4074,7 +4075,7 @@ update_archive(char *root, char *opt)
/*
* Never update non-BE root in update_all
*/
- if (!is_be(root) && bam_update_all)
+ if (bam_update_all && !is_be(root))
return (BAM_SUCCESS);
/*
* root must belong to a boot archive based OS,
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 7c510311f9..1e384f2f10 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -1288,6 +1288,8 @@ const struct ioc {
"zfs_cmd_t" },
{ (uint_t)ZFS_IOC_CHANNEL_PROGRAM, "ZFS_IOC_CHANNEL_PROGRAM",
"zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_POOL_INITIALIZE, "ZFS_IOC_POOL_INITIALIZE",
+ "zfs_cmd_t" },
/* kssl ioctls */
{ (uint_t)KSSL_ADD_ENTRY, "KSSL_ADD_ENTRY",
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index aa0463608b..01af9604a1 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -85,6 +85,7 @@ static int zpool_do_detach(int, char **);
static int zpool_do_replace(int, char **);
static int zpool_do_split(int, char **);
+static int zpool_do_initialize(int, char **);
static int zpool_do_scrub(int, char **);
static int zpool_do_import(int, char **);
@@ -134,6 +135,7 @@ typedef enum {
HELP_ONLINE,
HELP_REPLACE,
HELP_REMOVE,
+ HELP_INITIALIZE,
HELP_SCRUB,
HELP_STATUS,
HELP_UPGRADE,
@@ -185,6 +187,7 @@ static zpool_command_t command_table[] = {
{ "replace", zpool_do_replace, HELP_REPLACE },
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
+ { "initialize", zpool_do_initialize, HELP_INITIALIZE },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ NULL },
{ "import", zpool_do_import, HELP_IMPORT },
@@ -258,6 +261,8 @@ get_usage(zpool_help_t idx)
return (gettext("\tremove [-nps] <pool> <device> ...\n"));
case HELP_REOPEN:
return (gettext("\treopen <pool>\n"));
+ case HELP_INITIALIZE:
+ return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
case HELP_STATUS:
@@ -1590,6 +1595,43 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
"resilvering" : "repairing");
}
+ if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
+ !vs->vs_scan_removing) {
+ char zbuf[1024];
+ char tbuf[256];
+ struct tm zaction_ts;
+
+ time_t t = vs->vs_initialize_action_time;
+ int initialize_pct = 100;
+ if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) {
+ initialize_pct = (vs->vs_initialize_bytes_done * 100 /
+ (vs->vs_initialize_bytes_est + 1));
+ }
+
+ (void) localtime_r(&t, &zaction_ts);
+ (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+ switch (vs->vs_initialize_state) {
+ case VDEV_INITIALIZE_SUSPENDED:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", suspended, started at %s", tbuf);
+ break;
+ case VDEV_INITIALIZE_ACTIVE:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", started at %s", tbuf);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ (void) snprintf(zbuf, sizeof (zbuf),
+ ", completed at %s", tbuf);
+ break;
+ }
+
+ (void) printf(gettext(" (%d%% initialized%s)"),
+ initialize_pct, zbuf);
+ }
+
(void) printf("\n");
for (c = 0; c < children; c++) {
@@ -4166,6 +4208,119 @@ zpool_do_scrub(int argc, char **argv)
return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}
+/*
+ * Walk the vdev tree rooted at nvroot and record every vdev that has no
+ * children in res.  Each such leaf is added as a boolean entry keyed by the
+ * name zpool_vdev_name() produces for it.
+ */
+static void
+zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
+{
+	nvlist_t **kids;
+	uint_t nkids = 0;
+	uint_t idx;
+
+	/*
+	 * The lookup result is deliberately ignored: if no child array is
+	 * filled in, nkids keeps its initial 0 and nvroot is treated as a leaf.
+	 */
+	(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids);
+
+	if (nkids != 0) {
+		/* interior vdev: descend into each child */
+		for (idx = 0; idx < nkids; idx++)
+			zpool_collect_leaves(zhp, kids[idx], res);
+		return;
+	}
+
+	/* zpool_vdev_name() hands back a string that we must free */
+	char *name = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE);
+	fnvlist_add_boolean(res, name);
+	free(name);
+}
+
+/*
+ * zpool initialize [-cs] <pool> [<vdev> ...]
+ * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
+ * if none specified.
+ *
+ * -c Cancel. Ends active initializing.
+ * -s Suspend. Initializing can then be restarted with no flags.
+ *
+ * The long forms --cancel and --suspend are accepted as well. -c and -s are
+ * mutually exclusive; with neither, the command is POOL_INITIALIZE_DO.
+ *
+ * Returns -1 if the pool name is missing or the pool cannot be opened;
+ * otherwise returns whatever zpool_initialize() returns.
+ */
+int
+zpool_do_initialize(int argc, char **argv)
+{
+ int c;
+ char *poolname;
+ zpool_handle_t *zhp;
+ nvlist_t *vdevs;
+ int err = 0;
+
+ struct option long_options[] = {
+ {"cancel", no_argument, NULL, 'c'},
+ {"suspend", no_argument, NULL, 's'},
+ {0, 0, 0, 0}
+ };
+
+ pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO;
+ while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) {
+ switch (c) {
+ case 'c':
+ /* cmd_type is no longer the default: -c or -s was already seen */
+ if (cmd_type != POOL_INITIALIZE_DO) {
+ (void) fprintf(stderr, gettext("-c cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_CANCEL;
+ break;
+ case 's':
+ /* same exclusivity check as for -c */
+ if (cmd_type != POOL_INITIALIZE_DO) {
+ (void) fprintf(stderr, gettext("-s cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_SUSPEND;
+ break;
+ case '?':
+ /*
+ * Report the offending option character when optopt is set,
+ * otherwise the argv word that getopt_long() just consumed.
+ */
+ if (optopt != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("invalid option '%s'\n"),
+ argv[optind - 1]);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ /* skip past the options; argv[0] is now the pool name */
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ return (-1);
+ }
+
+ poolname = argv[0];
+ zhp = zpool_open(g_zfs, poolname);
+ if (zhp == NULL)
+ return (-1);
+
+ /* vdevs is used as a set: only the pair names matter */
+ vdevs = fnvlist_alloc();
+ if (argc == 1) {
+ /* no individual leaf vdevs specified, so add them all */
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ zpool_collect_leaves(zhp, nvroot, vdevs);
+ } else {
+ int i;
+ for (i = 1; i < argc; i++) {
+ fnvlist_add_boolean(vdevs, argv[i]);
+ }
+ }
+
+ err = zpool_initialize(zhp, cmd_type, vdevs);
+
+ fnvlist_free(vdevs);
+ zpool_close(zhp);
+
+ return (err);
+}
+
typedef struct status_cbdata {
int cb_count;
boolean_t cb_allpools;
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index dab209f157..c10186aa38 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -103,6 +103,7 @@
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
@@ -346,6 +347,7 @@ ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
ztest_func_t ztest_spa_checkpoint_create_discard;
+ztest_func_t ztest_initialize;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -389,7 +391,8 @@ ztest_info_t ztest_info[] = {
&ztest_opts.zo_vdevtime },
{ ztest_device_removal, 1, &zopt_sometimes },
{ ztest_remap_blocks, 1, &zopt_sometimes },
- { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }
+ { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely },
+ { ztest_initialize, 1, &zopt_sometimes }
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -5473,6 +5476,97 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
rw_exit(&ztest_name_lock);
}
+/*
+ * Pick a random leaf beneath vd by choosing a random eligible child at each
+ * level.  A child is skipped when its top-level vdev is being removed, or
+ * when it is childless and either not concrete or detached.  Returns vd
+ * itself when it is NULL or has no children.
+ */
+static vdev_t *
+ztest_random_concrete_vdev_leaf(vdev_t *vd)
+{
+	if (vd == NULL || vd->vdev_children == 0)
+		return (vd);
+
+	vdev_t *candidates[vd->vdev_children];
+	int ncandidates = 0, c;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *kid = vd->vdev_child[c];
+
+		if (kid->vdev_top->vdev_removing)
+			continue;
+
+		/* childless vdevs must be concrete and still attached */
+		if (kid->vdev_children == 0 &&
+		    (!vdev_is_concrete(kid) || kid->vdev_detached))
+			continue;
+
+		candidates[ncandidates++] = kid;
+	}
+	VERIFY(ncandidates > 0);
+
+	uint64_t pick = ztest_random(ncandidates);
+	return (ztest_random_concrete_vdev_leaf(candidates[pick]));
+}
+
+/*
+ * Issue a randomly chosen initialize command against a randomly chosen leaf
+ * vdev of ztest_spa and, at verbosity 4 or higher, print what was attempted.
+ * The whole operation runs with ztest_vdev_lock held.
+ */
+/* ARGSUSED */
+void
+ztest_initialize(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ int error = 0;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* Random leaf vdev */
+ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+ if (rand_vd == NULL) {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The random vdev we've selected may change as soon as we
+ * drop the spa_config_lock. We create local copies of things
+ * we're interested in.
+ */
+ uint64_t guid = rand_vd->vdev_guid;
+ char *path = strdup(rand_vd->vdev_path);
+ boolean_t active = rand_vd->vdev_initialize_thread != NULL;
+
+ zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /* any value below POOL_INITIALIZE_FUNCS may be chosen */
+ uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
+ error = spa_vdev_initialize(spa, guid, cmd);
+ /*
+ * Reporting only. Note that "active" is the snapshot taken before the
+ * call, and that the cancel and suspend messages do not consult error.
+ */
+ switch (cmd) {
+ case POOL_INITIALIZE_CANCEL:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Cancel initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_DO:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Start initialize %s", path);
+ if (active && error == 0)
+ (void) printf(" failed (already active)");
+ else if (error != 0)
+ (void) printf(" failed (error %d)", error);
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Suspend initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ }
+ /* path is our strdup() copy from above */
+ free(path);
+ mutex_exit(&ztest_vdev_lock);
+}
+
/*
* Verify pool integrity by running zdb.
*/
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index e8cfd75816..8fc19ba61e 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -137,6 +137,9 @@ typedef enum zfs_error {
EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
+ EZFS_TOOMANY, /* argument list too long */
+ EZFS_INITIALIZING, /* currently initializing */
+ EZFS_NO_INITIALIZE, /* no active initialize */
EZFS_UNKNOWN
} zfs_error_t;
@@ -262,6 +265,8 @@ typedef struct splitflags {
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
+extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
+ nvlist_t *);
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen(zpool_handle_t *);
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 7321f419fc..ae23e06184 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -1971,6 +1971,100 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
}
}
+/*
+ * Map a per-vdev errno reported for an initialize request onto the matching
+ * EZFS_* code.  Values with no mapping are returned unchanged.
+ */
+static int
+xlate_init_err(int err)
+{
+	if (err == ENODEV)
+		return (EZFS_NODEVICE);
+
+	/* both "not a usable vdev" and "not writeable" map to a bad device */
+	if (err == EINVAL || err == EROFS)
+		return (EZFS_BADDEV);
+
+	if (err == EBUSY)
+		return (EZFS_INITIALIZING);
+
+	if (err == ESRCH)
+		return (EZFS_NO_INITIALIZE);
+
+	return (err);
+}
+
+/*
+ * Begin, suspend, or cancel initialization (writing to all free blocks) of
+ * the given vdevs in the given pool.
+ *
+ * vds is an nvlist whose pair names are vdev names; the pair values are
+ * ignored.  Each name is resolved with zpool_find_vdev() and its guid is
+ * handed to lzc_initialize().
+ *
+ * Returns 0 on success.  If a name cannot be resolved, or refers to a spare
+ * or cache device, nothing is submitted and the zfs_error() result for
+ * EZFS_NODEVICE, EZFS_ISSPARE or EZFS_ISL2CACHE is returned.  If the request
+ * fails with per-vdev errors, each is reported through zfs_error_fmt() and
+ * -1 is returned; any other failure is reported through
+ * zpool_standard_error().
+ */
+int
+zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
+    nvlist_t *vds)
+{
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *errlist = NULL;
+	nvlist_t *vd_errlist = NULL;
+	boolean_t spare, cache;
+	nvlist_t *tgt;
+	nvpair_t *elem;
+
+	/* translate vdev names to guids */
+	nvlist_t *vdev_guids = fnvlist_alloc();
+	nvlist_t *guids_to_paths = fnvlist_alloc();
+
+	for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(vds, elem)) {
+		char *vd_path = nvpair_name(elem);
+		tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
+
+		if ((tgt == NULL) || cache || spare) {
+			(void) snprintf(msg, sizeof (msg),
+			    dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
+			    vd_path);
+			int err = (tgt == NULL) ? EZFS_NODEVICE :
+			    (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+			fnvlist_free(vdev_guids);
+			fnvlist_free(guids_to_paths);
+			return (zfs_error(hdl, err, msg));
+		}
+
+		uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+		fnvlist_add_uint64(vdev_guids, vd_path, guid);
+
+		/*
+		 * Per-vdev errors come back keyed by stringified guid, so
+		 * remember which name each guid was requested under.
+		 */
+		(void) snprintf(msg, sizeof (msg), "%llu",
+		    (unsigned long long)guid);
+		fnvlist_add_string(guids_to_paths, msg, vd_path);
+	}
+
+	int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
+	    &errlist);
+	fnvlist_free(vdev_guids);
+
+	if (err == 0) {
+		fnvlist_free(guids_to_paths);
+		nvlist_free(errlist);
+		return (0);
+	}
+
+	/*
+	 * The result list only carries a ZPOOL_INITIALIZE_VDEVS entry when
+	 * there are per-vdev errors; pool-level failures leave it empty.
+	 * Use the non-asserting lookup so that case falls through to
+	 * zpool_standard_error() instead of aborting.
+	 */
+	if (errlist != NULL && nvlist_lookup_nvlist(errlist,
+	    ZPOOL_INITIALIZE_VDEVS, &vd_errlist) != 0)
+		vd_errlist = NULL;
+
+	if (vd_errlist == NULL) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "operation failed"));
+		fnvlist_free(guids_to_paths);
+		nvlist_free(errlist);
+		return (zpool_standard_error(hdl, err, msg));
+	}
+
+	for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(vd_errlist, elem)) {
+		int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
+		char *path = fnvlist_lookup_string(guids_to_paths,
+		    nvpair_name(elem));
+		(void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
+		    path);
+	}
+
+	/* vd_errlist and its pairs live inside errlist, so free it last */
+	fnvlist_free(guids_to_paths);
+	nvlist_free(errlist);
+	return (-1);
+}
+
/*
* This provides a very minimal check whether a given string is likely a
* c#t#d# style string. Users of this are expected to do their own
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
index 61f3127662..3c00b33b02 100644
--- a/usr/src/lib/libzfs/common/libzfs_util.c
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -252,6 +252,13 @@ libzfs_error_description(libzfs_handle_t *hdl)
return (dgettext(TEXT_DOMAIN, "device removal in progress"));
case EZFS_VDEV_TOO_BIG:
return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
+ case EZFS_TOOMANY:
+ return (dgettext(TEXT_DOMAIN, "argument list too long"));
+ case EZFS_INITIALIZING:
+ return (dgettext(TEXT_DOMAIN, "currently initializing"));
+ case EZFS_NO_INITIALIZE:
+ return (dgettext(TEXT_DOMAIN, "there is no active "
+ "initialization"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers
index fc3e63d45b..5b38fc3eae 100644
--- a/usr/src/lib/libzfs/common/mapfile-vers
+++ b/usr/src/lib/libzfs/common/mapfile-vers
@@ -218,6 +218,7 @@ SYMBOL_VERSION SUNWprivate_1.1 {
zpool_import_props;
zpool_import_status;
zpool_in_use;
+ zpool_initialize;
zpool_is_bootable;
zpool_iter;
zpool_label_disk;
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c
index d09304fbbb..ac25820c61 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c
@@ -1038,3 +1038,40 @@ lzc_channel_program_nosync(const char *pool, const char *program,
return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
memlimit, argnvl, outnvl));
}
+
+/*
+ * Start, cancel or suspend initializing, as selected by cmd_type.
+ *
+ * vdevs is a list of (<key>, guid) pairs where guid is a uint64 vdev GUID;
+ * the keys are ignored.
+ *
+ * Errors tied to individual vdev arguments are returned in an nvlist stored
+ * under the key "vdevs".  Each entry is a (guid, errno) pair in which the
+ * guid is stringified with PRIu64 and the errno is an int64_t holding one of:
+ * - ENODEV if the device was not found
+ * - EINVAL if the device is not a leaf or is not concrete (e.g. missing)
+ * - EROFS if the device is not writeable
+ * - EBUSY if a start was requested but the device is already being
+ *   initialized
+ * - ESRCH if a cancel/suspend was requested but the device is not being
+ *   initialized
+ *
+ * When the errlist is empty the return value is:
+ * - EINVAL if one or more arguments was invalid
+ * - Other spa_open failures
+ * - 0 if the operation succeeded
+ */
+int
+lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
+    nvlist_t *vdevs, nvlist_t **errlist)
+{
+	nvlist_t *innvl = fnvlist_alloc();
+	int rc;
+
+	fnvlist_add_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
+	fnvlist_add_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, vdevs);
+
+	rc = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, innvl, errlist);
+
+	/* the request list is ours; the caller keeps vdevs and *errlist */
+	fnvlist_free(innvl);
+
+	return (rc);
+}
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h
index 8c6743f503..d4a9a49cc1 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h
@@ -31,6 +31,8 @@
#include <libnvpair.h>
#include <sys/param.h>
#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
#ifdef __cplusplus
extern "C" {
@@ -56,6 +58,8 @@ int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
int lzc_bookmark(nvlist_t *, nvlist_t **);
int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
+int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
+ nvlist_t **);
int lzc_snaprange_space(const char *, const char *, uint64_t *);
diff --git a/usr/src/lib/libzfs_core/common/mapfile-vers b/usr/src/lib/libzfs_core/common/mapfile-vers
index 7f63f041b6..588eb76e97 100644
--- a/usr/src/lib/libzfs_core/common/mapfile-vers
+++ b/usr/src/lib/libzfs_core/common/mapfile-vers
@@ -37,6 +37,13 @@
$mapfile_version 2
+SYMBOL_VERSION ILLUMOS_0.3 {
+ global:
+
+ lzc_initialize;
+} ILLUMOS_0.1;
+
+
SYMBOL_VERSION ILLUMOS_0.2 {
global:
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 9e52a46aee..43938533ca 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -45,6 +45,7 @@
#include <sys/space_map.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m
index e4305fc7af..728f73de9d 100644
--- a/usr/src/man/man1m/zpool.1m
+++ b/usr/src/man/man1m/zpool.1m
@@ -106,6 +106,11 @@
.Ar pool Ns | Ns Ar id
.Op Ar newpool
.Nm
+.Cm initialize
+.Op Fl cs
+.Ar pool
+.Op Ar device Ns ...
+.Nm
.Cm iostat
.Op Fl v
.Op Fl T Sy u Ns | Ns Sy d
@@ -1333,6 +1338,32 @@ to fully rewind.
.El
.It Xo
.Nm
+.Cm initialize
+.Op Fl cs
+.Ar pool
+.Op Ar device Ns ...
+.Xc
+Begins initializing by writing to all unallocated regions on the specified
+devices, or all eligible devices in the pool if no individual devices are
+specified.
+Only leaf data or log devices may be initialized.
+.Bl -tag -width Ds
+.It Fl c, -cancel
+Cancel initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no cancellation will occur on any device.
+.It Fl s, -suspend
+Suspend initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no suspension will occur on any device.
+Initializing can then be resumed by running
+.Nm zpool Cm initialize
+with no flags on the relevant target devices.
+.El
+.It Xo
+.Nm
.Cm iostat
.Op Fl v
.Op Fl T Sy u Ns | Ns Sy d
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index faf818c66f..47ed5a1f33 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -79,6 +79,7 @@ dir path=opt/zfs-tests/tests/functional/cli_root/zpool_get
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_history
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_import
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles
+dir path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_labelclear
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_offline
dir path=opt/zfs-tests/tests/functional/cli_root/zpool_online
@@ -1570,6 +1571,44 @@ file \
file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos \
mode=0555
+file path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib \
+ mode=0444
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized \
+ mode=0555
file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg \
mode=0444
diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg
index b85717523a..c9097f0fc2 100644
--- a/usr/src/test/zfs-tests/include/commands.cfg
+++ b/usr/src/test/zfs-tests/include/commands.cfg
@@ -73,6 +73,7 @@ export USR_BIN_FILES='awk
mpstat
mv
nawk
+ od
pack
pagesize
pax
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index ff77d8f1f2..e37f606fe0 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -296,6 +296,19 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
tests = ['zpool_labelclear_active', 'zpool_labelclear_exported']
pre =
post =
+[/opt/zfs-tests/tests/functional/cli_root/zpool_initialize]
+tests = ['zpool_initialize_attach_detach_add_remove',
+ 'zpool_initialize_import_export',
+ 'zpool_initialize_offline_export_import_online',
+ 'zpool_initialize_online_offline',
+ 'zpool_initialize_split',
+ 'zpool_initialize_start_and_cancel_neg',
+ 'zpool_initialize_start_and_cancel_pos',
+ 'zpool_initialize_suspend_resume',
+ 'zpool_initialize_unsupported_vdevs',
+ 'zpool_initialize_verify_checksums',
+ 'zpool_initialize_verify_initialized']
+pre =
[/opt/zfs-tests/tests/functional/cli_root/zpool_offline]
tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg']
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile
new file mode 100644
index 0000000000..36f8a12f70
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+# Pull in the master definitions before the variables below are set.
+include $(SRC)/Makefile.master
+
+# Directory under the zfs-tests package root that this test group uses.
+ROOTOPTPKG = $(ROOT)/opt/zfs-tests
+TARGETDIR = $(ROOTOPTPKG)/tests/functional/cli_root/zpool_initialize
+
+# NOTE(review): the rules that consume TARGETDIR presumably live in
+# Makefile.com, which is not visible here -- confirm.
+include $(SRC)/test/zfs-tests/Makefile.com
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh
new file mode 100644
index 0000000000..d9f9570f47
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh
@@ -0,0 +1,31 @@
+#!/usr/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_cleanup
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
new file mode 100644
index 0000000000..0f4e7f0fa9
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
@@ -0,0 +1,43 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+function initialize_prog_line # pool disk
+{
+ typeset pool="$1"
+ typeset disk="$2"
+ zpool status "$pool" | grep "$disk" | grep "initialized"
+}
+
+function initialize_progress # pool disk
+{
+ initialize_prog_line "$1" "$2" | \
+ sed 's/.*(\([0-9]\{1,\}\)% initialized.*/\1/g'
+}
+
+#
+# Forcibly destroy $TESTPOOL and $TESTPOOL1 if they exist.
+#
+function cleanup
+{
+ if poolexists $TESTPOOL; then
+ log_must zpool destroy -f $TESTPOOL
+ fi
+
+ if poolexists $TESTPOOL1; then
+ log_must zpool destroy -f $TESTPOOL1
+ fi
+}
+# Runs whenever this library is sourced. NOTE(review): log_onexit is defined
+# elsewhere and presumably registers cleanup as the exit handler -- confirm.
+log_onexit cleanup
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh
new file mode 100644
index 0000000000..2a695025d2
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh
@@ -0,0 +1,68 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Detaching/attaching, adding/removing data devices works with initializing.
+#
+# STRATEGY:
+# 1. Create a single-disk pool.
+# 2. Start initializing.
+# 3. Attach a second disk, ensure initializing continues.
+# 4. Detach the second disk, ensure initializing continues.
+# 5. Add a second disk, ensure initializing continues.
+# 6. Remove the first disk, ensure initializing stops.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL $DISK1
+
+log_must zpool initialize $TESTPOOL $DISK1
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool attach $TESTPOOL $DISK1 $DISK2
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Lost initializing progress on demotion to child vdev"
+progress="$new_progress"
+
+log_must zpool detach $TESTPOOL $DISK2
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Lost initializing progress on promotion to top vdev"
+progress="$new_progress"
+
+log_must zpool add $TESTPOOL $DISK2
+log_must zpool remove $TESTPOOL $DISK1
+[[ -z "$(initialize_prog_line $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing continued after initiating removal"
+
+log_pass "Initializing worked as expected across attach/detach and add/remove"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh
new file mode 100644
index 0000000000..386d2a5dc2
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing automatically resumes across import/export.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Export the pool.
+# 4. Import the pool.
+# 5. Verify that initializing resumes and progress does not regress.
+# 6. Suspend initializing.
+# 7. Repeat steps 3-4.
+# 8. Verify that progress does not regress but initializing is still suspended.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+sleep 2
+
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Initializing lost progress after import"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool initialize -s $TESTPOOL $DISK1
+action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+ log_fail "Initializing action date did not persist across export/import"
+
+[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing lost progress after import"
+
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_pass "Initializing retains state as expected across export/import"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh
new file mode 100644
index 0000000000..dedd466e4e
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh
@@ -0,0 +1,66 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Miscellaneous complex sequences of operations function as expected.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing, offline, export, import, online and verify that
+# initializing state is preserved / initializing behaves as expected
+# at each step.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool initialize $TESTPOOL $DISK1
+log_must zpool offline $TESTPOOL $DISK1
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import"
+[[ "$new_progress" -ge "$progress" ]] || \
+ log_fail "Initializing lost progress after import"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool online $TESTPOOL $DISK1
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ "$new_progress" -ge "$progress" ]] || \
+ log_fail "Initializing lost progress after online"
+
+log_pass "Initializing behaves as expected at each step of:" \
+ "initialize + offline + export + import + online"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh
new file mode 100644
index 0000000000..55bd3188c9
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing automatically resumes across offline/online.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing one of the disks and verify that initializing is active.
+# 3. Offline the disk.
+# 4. Online the disk.
+# 5. Verify that initializing resumes and progress does not regress.
+# 6. Suspend initializing.
+# 7. Repeat steps 3-4 and verify that initializing does not resume.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+log_must zpool initialize $TESTPOOL $DISK1
+
+log_must zpool offline $TESTPOOL $DISK1
+
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Initializing did not start"
+
+log_must zpool online $TESTPOOL $DISK1
+
+new_progress="$(initialize_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && \
+ log_fail "Initializing did not restart after onlining"
+[[ "$progress" -le "$new_progress" ]] || \
+ log_fail "Initializing lost progress after onlining"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool initialize -s $TESTPOOL $DISK1
+action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool offline $TESTPOOL $DISK1
+log_must zpool online $TESTPOOL $DISK1
+new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \
+ sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+ log_fail "Initializing action date did not persist across offline/online"
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_pass "Initializing performs as expected across offline/online"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh
new file mode 100644
index 0000000000..69b27c26c9
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing state is preserved across zpool split.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start initializing both devices.
+# 3. Split the pool. Ensure initializing continues on the original.
+# 4. Import the new pool. Ensure initializing resumes on it.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+POOL2="${TESTPOOL}_split"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool initialize $TESTPOOL $DISK1 $DISK2
+orig_prog1="$(initialize_progress $TESTPOOL $DISK1)"
+orig_prog2="$(initialize_progress $TESTPOOL $DISK2)"
+[[ -z "$orig_prog1" ]] && log_fail "Initializing did not start"
+
+log_must zpool split $TESTPOOL $TESTPOOL1 $DISK2
+
+# Ensure initializing continued as expected on the original pool.
+[[ "$(initialize_progress $TESTPOOL $DISK1)" -ge "$orig_prog1" ]] || \
+ log_fail "Initializing lost progress on original pool"
+log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool import $TESTPOOL1
+# The split pool holds only DISK2; check that disk, not DISK1, for resumption.
+[[ "$(initialize_progress $TESTPOOL1 $DISK2)" -ge "$orig_prog2" ]] || \
+ log_fail "Initializing lost progress on split pool"
+log_mustnot eval "initialize_prog_line $TESTPOOL1 $DISK2 | grep suspended"
+
+log_pass "Initializing behaves as expected on zpool split"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh
new file mode 100644
index 0000000000..59b266d321
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Cancelling and suspending initialize doesn't work if not all specified vdevs
+# are being initialized.
+#
+# STRATEGY:
+# 1. Create a three-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Try to cancel and suspend initializing on the non-initializing disks.
+# 4. Try to re-initialize the currently initializing disk.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+DISK3="$(echo $DISKS | cut -d' ' -f3)"
+
+log_must zpool list -v
+log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3
+log_must zpool initialize $TESTPOOL $DISK1
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initialize did not start"
+
+log_mustnot zpool initialize -c $TESTPOOL $DISK2
+log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool initialize -s $TESTPOOL $DISK2
+log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool initialize $TESTPOOL $DISK1
+
+log_pass "Nonsensical initialize operations fail"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh
new file mode 100644
index 0000000000..5003b5f10b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh
@@ -0,0 +1,52 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Starting and stopping an initialize works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Cancel initializing and verify that initializing is not active.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initialize did not start"
+
+log_must zpool initialize -c $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initialize did not stop"
+
+log_pass "Initialize start + cancel works"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh
new file mode 100644
index 0000000000..bce3da5267
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Suspending and resuming initializing works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start initializing and verify that initializing is active.
+# 3. Wait 5 seconds, then suspend initializing and verify that the progress
+# reporting says so.
+# 4. Wait 3 seconds and ensure initializing progress doesn't advance.
+# 5. Restart initializing and verify that the progress doesn't regress.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool initialize $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initializing did not start"
+
+sleep 5
+log_must zpool initialize -s $TESTPOOL
+log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended"
+progress="$(initialize_progress $TESTPOOL $DISK1)"
+
+sleep 3
+[[ "$progress" -eq "$(initialize_progress $TESTPOOL $DISK1)" ]] || \
+ log_fail "Initializing progress advanced while suspended"
+
+log_must zpool initialize $TESTPOOL $DISK1
+[[ "$progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] ||
+ log_fail "Initializing progress regressed after resuming"
+
+log_pass "Suspend + resume initializing works as expected"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh
new file mode 100644
index 0000000000..bd4ca069c4
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Attempting to initialize unsupported vdevs should fail.
+#
+# STRATEGY:
+# 1. Create a pool with the following configuration:
+# root
+# mirror
+# vdev0
+# vdev1 (offline)
+# spare
+# vdev2
+# cache
+# vdev3 (first disk in $DISKS)
+# 2. Try to initialize mirror-0, vdev1, vdev2, and vdev3. Ensure all fail.
+#
+function cleanup
+{
+ if datasetexists $TESTPOOL; then
+ log_must zpool destroy -f $TESTPOOL
+ fi
+ if [[ -d $TESTDIR ]]; then
+ log_must rm -rf $TESTDIR
+ fi
+}
+log_onexit cleanup
+
+log_must mkdir $TESTDIR
+set -A FDISKS
+for n in {0..2}; do
+ log_must mkfile $MINVDEVSIZE $TESTDIR/vdev$n
+ FDISKS+=("$TESTDIR/vdev$n")
+done
+FDISKS+=("${DISKS%% *}")
+
+log_must zpool create $TESTPOOL mirror ${FDISKS[0]} ${FDISKS[1]} \
+ spare ${FDISKS[2]} cache ${FDISKS[3]}
+
+log_must zpool offline $TESTPOOL ${FDISKS[1]}
+
+log_mustnot zpool initialize $TESTPOOL mirror-0
+for n in {1..3}; do
+ log_mustnot zpool initialize $TESTPOOL ${FDISKS[$n]}
+done
+
+log_pass "Attempting to initialize failed on unsupported devices"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh
new file mode 100644
index 0000000000..6cc82b9baa
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh
@@ -0,0 +1,59 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# Initializing does not cause file corruption.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Write data to the pool.
+# 3. Start initializing and verify that initializing is active.
+# 4. Write more data to the pool.
+# 5. Run zdb to validate checksums.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must /usr/bin/dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30
+log_must sync
+
+log_must zpool initialize $TESTPOOL
+
+log_must zdb -cc $TESTPOOL
+
+[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \
+ log_fail "Initializing did not start"
+
+log_must /usr/bin/dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30
+log_must sync
+
+log_must zdb -cc $TESTPOOL
+
+log_pass "Initializing does not corrupt existing or new data"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh
new file mode 100644
index 0000000000..8d20e13e0a
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh
@@ -0,0 +1,88 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+
+#
+# DESCRIPTION:
+# After initializing, the disk is actually initialized.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Initialize the disk to completion.
+# 3. Load all metaslabs that don't have a spacemap, and make sure the entire
+# metaslab has been filled with the initializing pattern (deadbeef).
+#
+
+function cleanup
+{
+ mdb -kwe "zfs_initialize_value/Z $ORIG_PATTERN"
+ zpool import -d $TESTDIR $TESTPOOL
+
+ if datasetexists $TESTPOOL ; then
+ zpool destroy -f $TESTPOOL
+ fi
+ if [[ -d "$TESTDIR" ]]; then
+ rm -rf "$TESTDIR"
+ fi
+}
+log_onexit cleanup
+PATTERN="deadbeefdeadbeef"
+SMALLFILE="$TESTDIR/smallfile"
+
+ORIG_PATTERN=$(mdb -ke "zfs_initialize_value/J" | tail -1 | awk '{print $NF}')
+log_must mdb -kwe "zfs_initialize_value/Z $PATTERN"
+
+log_must mkdir "$TESTDIR"
+log_must mkfile $MINVDEVSIZE "$SMALLFILE"
+log_must zpool create $TESTPOOL "$SMALLFILE"
+log_must zpool initialize $TESTPOOL
+
+while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do
+ sleep 0.5
+done
+
+log_must zpool export $TESTPOOL
+
+spacemaps=0
+bs=512
+while read -r sm; do
+ typeset offset="$(echo $sm | cut -d ' ' -f1)"
+ typeset size="$(echo $sm | cut -d ' ' -f2)"
+
+ spacemaps=$((spacemaps + 1))
+ offset=$(((4 * 1024 * 1024) + 16#$offset))
+ out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \
+ count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad)
+ echo "$out" | log_must egrep "$PATTERN|\*|$size"
+done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \
+ awk '{print $4, $8}')"
+
+if [[ $spacemaps -eq 0 ]];then
+ log_fail "Did not find any empty space maps to check"
+else
+ log_pass "Initializing wrote appropriate amount to disk"
+fi
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index ad668b9e71..4c6be296cb 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1469,6 +1469,7 @@ ZFS_COMMON_OBJS += \
vdev_indirect.o \
vdev_indirect_births.o \
vdev_indirect_mapping.o \
+ vdev_initialize.o \
vdev_label.o \
vdev_mirror.o \
vdev_missing.o \
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index d2c6830cc7..17613c41f9 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -640,6 +640,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -686,6 +688,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
+ mutex_destroy(&mg->mg_ms_initialize_lock);
+ cv_destroy(&mg->mg_ms_initialize_cv);
for (int i = 0; i < mg->mg_allocators; i++) {
refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1546,6 +1550,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
@@ -2725,6 +2730,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
if (msp->ms_loaded &&
+ msp->ms_initializing == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
@@ -2974,6 +2980,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_initializing);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -3034,9 +3041,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
}
/*
- * If the selected metaslab is condensing, skip it.
+ * If the selected metaslab is condensing or being
+ * initialized, skip it.
*/
- if (msp->ms_condensing)
+ if (msp->ms_condensing || msp->ms_initializing > 0)
continue;
*was_active = msp->ms_allocator != -1;
@@ -3201,7 +3209,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
- * to disk.
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
@@ -3210,6 +3220,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
+ } else if (msp->ms_initializing > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_INITIALIZING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 9185c5b182..b71710bbd7 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -54,6 +54,7 @@
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
@@ -413,8 +414,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
dp = spa_get_dsl(spa);
dsl_pool_config_enter(dp, FTAG);
- if (err = dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &ds)) {
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0) {
dsl_pool_config_exit(dp, FTAG);
break;
}
@@ -569,7 +571,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
break;
}
- if (error = dmu_objset_hold(strval, FTAG, &os))
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error != 0)
break;
/*
@@ -1155,8 +1158,10 @@ spa_activate(spa_t *spa, int mode)
spa_create_zio_taskqs(spa);
}
- for (size_t i = 0; i < TXG_SIZE; i++)
- spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
@@ -1315,6 +1320,11 @@ spa_unload(spa_t *spa)
*/
spa_async_suspend(spa);
+ if (spa->spa_root_vdev) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
/*
* Stop syncing.
*/
@@ -1330,10 +1340,10 @@ spa_unload(spa_t *spa)
* calling taskq_wait(mg_taskq).
*/
if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
@@ -1367,7 +1377,7 @@ spa_unload(spa_t *spa)
bpobj_close(&spa->spa_deferred_bpobj);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
/*
* Close all vdevs.
@@ -1429,7 +1439,7 @@ spa_unload(spa_t *spa)
spa->spa_comment = NULL;
}
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
@@ -3866,6 +3876,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
spa_restart_removal(spa);
spa_spawn_aux_threads(spa);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_load_note(spa, "LOADED");
@@ -5347,6 +5361,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
* in which case we can modify its state.
*/
if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+
/*
* Objsets may be open only because they're dirty, so we
* have to force it to sync before checking spa_refcnt.
@@ -5381,6 +5396,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
}
/*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
* We want this to be reflected on every label,
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
@@ -6070,6 +6097,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (error);
}
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+ /*
+ * Start, cancel or suspend initialization of the leaf vdev identified
+ * by guid, as selected by cmd_type (POOL_INITIALIZE_DO,
+ * POOL_INITIALIZE_CANCEL or POOL_INITIALIZE_SUSPEND). Returns 0 on
+ * success, or:
+ *   ENODEV - no vdev with that guid, or it has been detached
+ *   EINVAL - the vdev is not a leaf or is not concrete
+ *   EROFS  - the vdev is not writeable
+ *   EBUSY  - DO was requested but an initialize thread already
+ *            exists or the top-level vdev is being removed
+ *   ESRCH  - CANCEL/SUSPEND was requested but the vdev is not in a
+ *            state that can be canceled/suspended
+ * Any other cmd_type panics.
+ *
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /*
+ * Look up vdev and ensure it's a leaf. Every failure path below
+ * drops the config lock and then the namespace lock before
+ * returning.
+ */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * The vdev's initialize lock is taken before the config lock is
+ * dropped above, and is held until the requested action has been
+ * carried out.
+ *
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * the thread is not exited.
+ *
+ * A cancel is only accepted while the state is ACTIVE or SUSPENDED,
+ * and a suspend only while it is ACTIVE.
+ */
+ if (cmd_type == POOL_INITIALIZE_DO &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_DO:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /*
+ * Sync out the initializing state. This wait happens with the
+ * namespace lock still held; only the vdev's initialize lock has
+ * been released at this point.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
@@ -6277,6 +6384,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ /*
+ * Temporarily stop the initializing activity. We set
+ * the state to ACTIVE so that we know to resume
+ * the initializing once the split has completed.
+ */
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+ }
+ }
+
newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
/* create the new pool from the disks of the original pool */
@@ -6364,6 +6484,10 @@ out:
if (vml[c] != NULL)
vml[c]->vdev_offline = B_FALSE;
}
+
+ /* restart initializing disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
vdev_reopen(spa->spa_root_vdev);
nvlist_free(spa->spa_config_splitting);
@@ -6739,6 +6863,14 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_RESILVER)
dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
/*
* Let the world know that we're done.
*/
@@ -7384,8 +7516,9 @@ spa_sync(spa_t *spa, uint64_t txg)
* Wait for i/os issued in open context that need to complete
* before this txg syncs.
*/
- VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
/*
* Lock out configuration changes.
@@ -7674,7 +7807,8 @@ spa_sync(spa_t *spa, uint64_t txg)
/*
* Update usable space statistics.
*/
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
vdev_sync_done(vd, txg);
spa_update_dspace(spa);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 8a348af53c..41342f37ea 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -39,6 +39,7 @@
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
@@ -1196,6 +1197,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index 6a02f7c800..3c4ce37303 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -68,7 +68,8 @@ typedef enum trace_alloc_type {
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_INITIALIZING = -9ULL
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
@@ -270,6 +271,11 @@ struct metaslab_group {
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_initializing;
+ boolean_t mg_initialize_updating;
+ kmutex_t mg_ms_initialize_lock;
+ kcondvar_t mg_ms_initialize_cv;
};
/*
@@ -360,6 +366,8 @@ struct metaslab {
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
+ uint64_t ms_initializing; /* leaves initializing this ms */
+
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
* modify ms_loaded.
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 55c306d0a4..34f02ed430 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -650,6 +650,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
/*
* Controls the behavior of spa_vdev_remove().
@@ -665,6 +666,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 252069c5a4..71753cf24f 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -79,6 +79,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
uint64_t offset, uint64_t size, void *arg);
typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -90,6 +96,11 @@ typedef struct vdev_ops {
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -232,6 +243,24 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
/*
* Values stored in the config for an indirect or removing vdev.
@@ -435,6 +464,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+ range_seg_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h b/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h
new file mode 100644
index 0000000000..db4b0572cd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INITIALIZE_H
+#define _SYS_VDEV_INITIALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vdev_initialize(vdev_t *vd);
+extern void vdev_initialize_stop(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_all(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_restart(vdev_t *vd);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INITIALIZE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_priority.h b/usr/src/uts/common/fs/zfs/sys/zio_priority.h
index 42ce1ea898..7bd0995728 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_priority.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_priority.h
@@ -13,7 +13,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
@@ -29,6 +29,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
+ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 71b690c123..0c0057e9b6 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -49,6 +49,7 @@
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
/*
* Virtual device management.
@@ -183,6 +184,14 @@ vdev_getops(const char *type)
return (ops);
}
+/*
+ * Default range translation: the output range is the input range, unchanged.
+ * The vdev argument is not consulted.
+ */
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+	res->rs_end = in->rs_end;
+	res->rs_start = in->rs_start;
+}
+
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
@@ -453,6 +462,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
}
@@ -725,6 +739,7 @@ void
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
/*
* vdev_free() implies closing the vdev first. This is simpler than
@@ -743,6 +758,7 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ ASSERT(vd->vdev_initialize_thread == NULL);
/*
* Discard allocation state.
@@ -815,6 +831,10 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
@@ -2841,7 +2861,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
ASSERT(vdev_is_concrete(vd));
- while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
metaslab_sync_done(msp, txg);
if (reassess)
@@ -3067,6 +3088,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3361,8 +3391,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't have the
+ * initializing locks held, this is only an estimate (although a
+ * fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
+ }
/*
* Report expandable space on top-level, non-auxillary devices only.
* The expandable space is reported in terms of metaslab sized units
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index b63285d9fa..e4b86b419b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -853,6 +853,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_hold,
vdev_disk_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index f93b646fd8..96534436bb 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -263,6 +263,7 @@ vdev_ops_t vdev_file_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -282,6 +283,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c
index 3f2ff799b6..f093a6920f 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c
@@ -1628,6 +1628,7 @@ vdev_ops_t vdev_indirect_ops = {
NULL,
NULL,
vdev_indirect_remap,
+ NULL,
VDEV_TYPE_INDIRECT, /* name of this vdev type */
B_FALSE /* leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c
new file mode 100644
index 0000000000..559c0153d6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c
@@ -0,0 +1,791 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+/*
+ * Decide whether initialization of this leaf should stop. It should stop
+ * when an exit has been requested, the vdev is no longer writeable, the
+ * vdev has been detached, or its top-level vdev is being removed.
+ */
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+	if (vd->vdev_initialize_exit_wanted)
+		return (B_TRUE);
+	if (!vdev_writeable(vd))
+		return (B_TRUE);
+	if (vd->vdev_detached)
+		return (B_TRUE);
+	if (vd->vdev_top->vdev_removing)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * Sync task that persists a leaf vdev's initialize progress in its
+ * leaf ZAP. arg points to a heap-allocated copy of the vdev's guid;
+ * it is always freed here, before the vdev is looked up.
+ *
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_vdev_config_exit(),
+ * stop the initializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ *
+ * Nothing is written if the vdev can no longer be found, if its
+ * top-level vdev is being removed, or if it is not concrete.
+ * Otherwise:
+ *  - the offset recorded for this txg is read and its slot reset to
+ *    0; if it was non-zero it becomes vdev_initialize_last_offset
+ *    and is stored in the ZAP;
+ *  - the action time is stored if it is non-zero;
+ *  - the current initialize state is always stored.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
+
+/*
+ * Switch vd to new_state, schedule a sync task to persist the change, and
+ * log the transition to the pool history. The caller must hold
+ * vdev_initialize_lock. A request for the state the vdev is already in is a
+ * no-op. An unrecognized new_state panics.
+ */
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+
+	if (vd->vdev_initialize_state == new_state)
+		return;
+
+	spa_t *spa = vd->vdev_spa;
+
+	/*
+	 * The sync task gets its own copy of the guid and is responsible
+	 * for freeing it.
+	 */
+	uint64_t *guid_copy = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	*guid_copy = vd->vdev_guid;
+
+	/*
+	 * Stamp the time of this transition, except when we are leaving the
+	 * SUSPENDED state: in that case the existing timestamp is preserved.
+	 */
+	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+		vd->vdev_initialize_action_time = gethrestime_sec();
+	}
+	vd->vdev_initialize_state = new_state;
+
+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+	    guid_copy, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+	/* Record the transition in the pool history. */
+	switch (new_state) {
+	case VDEV_INITIALIZE_ACTIVE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s activated", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_SUSPENDED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s suspended", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_CANCELED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s canceled", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_COMPLETE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s complete", vd->vdev_path);
+		break;
+	default:
+		panic("invalid state %llu", (unsigned long long)new_state);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Completion callback for the writes issued by vdev_initialize_write().
+ * Updates the progress accounting under vdev_initialize_io_lock, gives back
+ * the in-flight slot and wakes any waiter, then drops the SCL_STATE_ALL hold
+ * that was taken (with the vdev as tag) when the write was issued.
+ */
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	if (zio->io_error != ENXIO || vdev_writeable(vd)) {
+		/*
+		 * Initializing is best-effort: any other I/O error is only
+		 * counted, and vdev_probe is relied upon to decide whether
+		 * the errors are more critical. The bytes are counted as
+		 * done either way.
+		 */
+		if (zio->io_error != 0)
+			vd->vdev_stat.vs_initialize_errors++;
+
+		vd->vdev_initialize_bytes_done += zio->io_orig_size;
+	} else {
+		/*
+		 * The write failed because the vdev became unavailable, so
+		 * pull this txg's recorded offset back to the start of the
+		 * failed write. (This works because spa_sync waits on
+		 * spa_txg_zio before it runs sync tasks.)
+		 */
+		uint64_t *txg_off =
+		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+		if (zio->io_offset < *txg_off)
+			*txg_off = zio->io_offset;
+	}
+
+	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+	vd->vdev_initialize_inflight--;
+	cv_broadcast(&vd->vdev_initialize_io_cv);
+	mutex_exit(&vd->vdev_initialize_io_lock);
+
+	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Takes care of physical writing and limiting # of concurrent ZIOs.
+ *
+ * Issues one asynchronous physical write of "size" bytes of "data" at
+ * offset "start" on vd, as a child of the spa_txg_zio root for the txg the
+ * write's tx is assigned to. Blocks while the number of in-flight
+ * initializing writes is at or above zfs_initialize_limit.
+ *
+ * Returns 0 once the zio has been dispatched; completion is handled by
+ * vdev_initialize_cb. Returns EINTR, without issuing any I/O, if
+ * vdev_initialize_should_stop() says initialization should stop.
+ */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * Limit inflight initializing I/Os: wait for a free slot, then claim
+ * it. The slot is given back by vdev_initialize_cb, or below if we
+ * bail out before issuing the write.
+ */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+ * This is the first write of this txg: the txg's offset slot
+ * is still zero, so schedule the sync task that will persist
+ * the progress. The sync task frees the guid copy.
+ */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, 2,
+ ZFS_SPACE_CHECK_RESERVED, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ *
+ * If we have been asked to stop, give back the in-flight slot,
+ * drop the config lock taken above, commit the tx and fail with
+ * EINTR without issuing the write.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /*
+ * vdev_initialize_cb releases SCL_STATE_ALL, so the config lock is
+ * intentionally still held when we return on this path.
+ */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Convert a logical range into the physical range it occupies on vd.
+ *
+ * The walk starts at vd (normally a leaf) and recurses through vdev_parent
+ * until it reaches the top-level vdev, where the physical range is seeded
+ * with the logical range. On the way back down, each level asks its
+ * parent's vdev_op_xlate to narrow the range computed so far to the part
+ * that belongs to this child.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+	if (vd == vd->vdev_top) {
+		/*
+		 * Base case: at the top-level vdev the physical range starts
+		 * out identical to the logical one.
+		 */
+		physical_rs->rs_start = logical_rs->rs_start;
+		physical_rs->rs_end = logical_rs->rs_end;
+		return;
+	}
+
+	/* Resolve the range as seen by our parent first. */
+	vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+
+	vdev_t *parent = vd->vdev_parent;
+	ASSERT3P(parent, !=, NULL);
+	ASSERT3P(parent->vdev_ops->vdev_op_xlate, !=, NULL);
+
+	/*
+	 * Then let the parent's type-specific translator map that range
+	 * onto this child, and hand the result back through physical_rs.
+	 */
+	range_seg_t xlated = { 0 };
+	parent->vdev_ops->vdev_op_xlate(vd, physical_rs, &xlated);
+
+	physical_rs->rs_start = xlated.rs_start;
+	physical_rs->rs_end = xlated.rs_end;
+}
+
+/*
+ * abd_iterate_func() callback that stamps zfs_initialize_value into every
+ * 64-bit word of the chunk at buf. len must be a multiple of
+ * sizeof (uint64_t) and buf must be 8-byte aligned; the ABD allocation
+ * guarantees both. Always returns 0.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+	ASSERT0(len % sizeof (uint64_t));
+
+	uint64_t *word = buf;
+	uint64_t *limit = (uint64_t *)((char *)buf + len);
+
+	while (word < limit)
+		*word++ = zfs_initialize_value;
+
+	return (0);
+}
+
+/*
+ * Allocate an ABD of zfs_initialize_chunk_size bytes and fill it with the
+ * initialize pattern (see vdev_initialize_block_fill()). The caller owns the
+ * returned buffer and releases it with vdev_initialize_block_free().
+ */
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+	/* Allocate ABD for filler data */
+	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+	/* The fill callback writes whole 64-bit words only. */
+	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+	    vdev_initialize_block_fill, NULL);
+
+	return (data);
+}
+
+/*
+ * Release a pattern buffer obtained from vdev_initialize_block_alloc().
+ */
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
+
+/*
+ * Issue initializing writes for every segment currently held in
+ * vd->vdev_initialize_tree. Each segment is cut into pieces of at most
+ * zfs_initialize_chunk_size bytes and each piece is handed to
+ * vdev_initialize_write() together with the pattern buffer data.
+ *
+ * Returns 0 once every write has been issued, or the first non-zero error
+ * from vdev_initialize_write().
+ */
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+	avl_tree_t *segs = &vd->vdev_initialize_tree->rt_root;
+	range_seg_t *seg = avl_first(segs);
+
+	while (seg != NULL) {
+		uint64_t seg_len = seg->rs_end - seg->rs_start;
+
+		/* Number of chunk-sized writes needed, rounding up. */
+		uint64_t nwrites =
+		    ((seg_len - 1) / zfs_initialize_chunk_size) + 1;
+
+		for (uint64_t n = 0; n < nwrites; n++) {
+			uint64_t skip = n * zfs_initialize_chunk_size;
+
+			/*
+			 * Tree offsets are biased by VDEV_LABEL_START_SIZE
+			 * (presumably to step over the front labels).
+			 */
+			uint64_t offset =
+			    VDEV_LABEL_START_SIZE + seg->rs_start + skip;
+			uint64_t len =
+			    MIN(seg_len - skip, zfs_initialize_chunk_size);
+
+			int error = vdev_initialize_write(vd, offset, len,
+			    data);
+			if (error != 0)
+				return (error);
+		}
+
+		seg = AVL_NEXT(segs, seg);
+	}
+	return (0);
+}
+
+/*
+ * Ensure msp is loaded before its allocatable tree is examined. The caller
+ * must hold msp->ms_lock. A failed metaslab_load() is fatal (VERIFY0).
+ */
+static void
+vdev_initialize_ms_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/* Presumably waits out a load that is already in progress. */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return;
+
+	VERIFY0(metaslab_load(msp));
+}
+
+/*
+ * Sleep on mg_ms_initialize_cv until no other thread has
+ * mg_initialize_updating set. The caller must hold mg_ms_initialize_lock.
+ */
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+
+	for (;;) {
+		if (!mg->mg_initialize_updating)
+			break;
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+	}
+}
+
+/*
+ * Account for one more metaslab being initialized in this group, sleeping
+ * on mg_ms_initialize_cv while the group is already at max_initialize_ms.
+ * The caller must hold mg_ms_initialize_lock and must already have set
+ * mg_initialize_updating.
+ */
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+	ASSERT(mg->mg_initialize_updating);
+
+	/* Block until the group is below its limit. */
+	while (!(mg->mg_ms_initializing < max_initialize_ms))
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+
+	mg->mg_ms_initializing += 1;
+	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ *
+ * The caller must not hold msp->ms_lock; it is taken here, nested inside
+ * the group's mg_ms_initialize_lock. May sleep while another thread is
+ * updating the group or while the group is at its limit.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_initialize_lock);
+
+ /*
+ * To keep an accurate count of how many metaslabs are being
+ * initialized in a specific metaslab group, we only allow one thread
+ * to mark the metaslab group at a time. This ensures that the value of
+ * mg_ms_initializing will be accurate when we decide to mark a metaslab
+ * group as being initialized. To do this we force all other threads
+ * to wait till the metaslab group's mg_initialize_updating flag is no
+ * longer set.
+ */
+ vdev_initialize_mg_wait(mg);
+ mg->mg_initialize_updating = B_TRUE;
+ /* Only the first marker of a metaslab counts against the group. */
+ if (msp->ms_initializing == 0) {
+ vdev_initialize_mg_mark(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_initializing++;
+ mutex_exit(&msp->ms_lock);
+
+ /* Let the next waiter in vdev_initialize_mg_wait() proceed. */
+ mg->mg_initialize_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+/*
+ * Undo one vdev_initialize_ms_mark() on msp. When the metaslab's last
+ * marker goes away, the group's mg_ms_initializing count is dropped and
+ * waiters on mg_ms_initialize_cv are woken. The caller must not hold
+ * msp->ms_lock.
+ */
+static void
+vdev_initialize_ms_unmark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+ /* Same lock order as vdev_initialize_ms_mark(): group, then metaslab. */
+ mutex_enter(&mg->mg_ms_initialize_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_initializing == 0) {
+ mg->mg_ms_initializing--;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+/*
+ * Recompute vd->vdev_initialize_bytes_est and vd->vdev_initialize_bytes_done
+ * from scratch, based on vd->vdev_initialize_last_offset.
+ *
+ * Every metaslab of the top-level vdev is classified by translating its
+ * range to a physical range on vd:
+ *   - not reached yet: its free space counts toward the estimate only;
+ *   - entirely behind last_offset: its free space counts as estimated and
+ *     done;
+ *   - otherwise the metaslab is loaded and its allocatable segments are
+ *     walked individually.
+ * For the first two cases the free space is derived from the space map and,
+ * on raidz, divided by the number of children.
+ *
+ * The caller must hold SCL_CONFIG (reader or writer) and vd must have a
+ * leaf ZAP.
+ */
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	vd->vdev_initialize_bytes_est = 0;
+	vd->vdev_initialize_bytes_done = 0;
+
+	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+
+		uint64_t ms_free = msp->ms_size -
+		    space_map_allocated(msp->ms_sm);
+
+		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+			ms_free /= vd->vdev_top->vdev_children;
+
+		/*
+		 * Convert the metaslab range to a physical range
+		 * on our vdev. We use this to determine if we are
+		 * in the middle of this metaslab range.
+		 */
+		range_seg_t logical_rs, physical_rs;
+		logical_rs.rs_start = msp->ms_start;
+		logical_rs.rs_end = msp->ms_start + msp->ms_size;
+		vdev_xlate(vd, &logical_rs, &physical_rs);
+
+		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+			vd->vdev_initialize_bytes_est += ms_free;
+			mutex_exit(&msp->ms_lock);
+			continue;
+		} else if (vd->vdev_initialize_last_offset >
+		    physical_rs.rs_end) {
+			vd->vdev_initialize_bytes_done += ms_free;
+			vd->vdev_initialize_bytes_est += ms_free;
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		/*
+		 * If we get here, we're in the middle of initializing this
+		 * metaslab. Load it and walk the free tree for more accurate
+		 * progress estimation.
+		 */
+		vdev_initialize_ms_load(msp);
+
+		avl_tree_t *free_segs = &msp->ms_allocatable->rt_root;
+		for (range_seg_t *rs = avl_first(free_segs); rs;
+		    rs = AVL_NEXT(free_segs, rs)) {
+			logical_rs.rs_start = rs->rs_start;
+			logical_rs.rs_end = rs->rs_end;
+			vdev_xlate(vd, &logical_rs, &physical_rs);
+
+			uint64_t size = physical_rs.rs_end -
+			    physical_rs.rs_start;
+			vd->vdev_initialize_bytes_est += size;
+
+			/*
+			 * last_offset is the end of the most recent write,
+			 * so a segment ending exactly there is complete.
+			 * This matches vdev_initialize_range_add(), which
+			 * skips segments with rs_end <= last_offset.
+			 */
+			if (vd->vdev_initialize_last_offset >=
+			    physical_rs.rs_end) {
+				vd->vdev_initialize_bytes_done += size;
+			} else if (vd->vdev_initialize_last_offset >
+			    physical_rs.rs_start) {
+				/* Partially written segment. */
+				vd->vdev_initialize_bytes_done +=
+				    vd->vdev_initialize_last_offset -
+				    physical_rs.rs_start;
+			}
+		}
+		mutex_exit(&msp->ms_lock);
+	}
+}
+
+/*
+ * Load vd's persisted initialize position and refresh its progress
+ * counters. When the vdev is ACTIVE or SUSPENDED, the last written offset
+ * is looked up in the leaf ZAP; a missing entry (ENOENT) leaves the
+ * in-core value untouched. The caller must hold SCL_CONFIG as reader or
+ * writer.
+ */
+static void
+vdev_initialize_load(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	switch (vd->vdev_initialize_state) {
+	case VDEV_INITIALIZE_ACTIVE:
+	case VDEV_INITIALIZE_SUSPENDED: {
+		objset_t *mos = vd->vdev_spa->spa_meta_objset;
+		int err = zap_lookup(mos, vd->vdev_leaf_zap,
+		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+		    sizeof (vd->vdev_initialize_last_offset), 1,
+		    &vd->vdev_initialize_last_offset);
+		ASSERT(err == 0 || err == ENOENT);
+		break;
+	}
+	default:
+		break;
+	}
+
+	vdev_initialize_calculate_progress(vd);
+}
+
+
+/*
+ * Convert the logical range into a physical range and add it to our
+ * avl tree.
+ *
+ * Used as a range_tree_walk() callback: arg is the leaf vdev being
+ * initialized, start/size describe one logical segment. Segments that lie
+ * entirely at or below vdev_initialize_last_offset are dropped, and a
+ * segment straddling that offset is trimmed to begin there.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ /* A leaf that is its own top-level vdev needs no translation. */
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_start == physical_rs.rs_start);
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_end == physical_rs.rs_end);
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs.rs_start,
+ (u_longlong_t)physical_rs.rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs.rs_end);
+ ASSERT3U(physical_rs.rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs.rs_start = vd->vdev_initialize_last_offset;
+ }
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ /*
+ * With raidz, it's possible that the logical range does not live on
+ * this leaf vdev. We only add the physical range to this vdev's tree
+ * if it has a length greater than 0.
+ */
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+}
+
+/*
+ * Body of the per-leaf initializing thread; arg is the leaf vdev.
+ *
+ * Walks every metaslab of the top-level vdev in order. For each one it
+ * marks the metaslab, collects its allocatable segments (translated to this
+ * leaf) into vd->vdev_initialize_tree, and writes the pattern buffer over
+ * them with SCL_CONFIG dropped. The walk ends when the vdev is detached,
+ * a write returns an error, or all metaslabs have been visited. Afterwards
+ * the thread waits for in-flight writes, marks the vdev COMPLETE unless a
+ * stop was requested or the vdev is not writeable, waits for a txg to sync
+ * and finally clears vdev_initialize_thread and wakes any waiter.
+ */
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ /* Start from 0 unless vdev_initialize_load() finds a saved offset. */
+ vd->vdev_initialize_last_offset = 0;
+ vdev_initialize_load(vd);
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ vdev_initialize_ms_mark(msp);
+ mutex_enter(&msp->ms_lock);
+ vdev_initialize_ms_load(msp);
+
+ /* Queue this metaslab's free segments for writing. */
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ /* The writes are issued without SCL_CONFIG held. */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ error = vdev_initialize_ranges(vd, deadbeef);
+ vdev_initialize_ms_unmark(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ /* Wait for every outstanding initializing write to complete. */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ /* A requested stop has already recorded its own target state. */
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ /* Signal vdev_initialize_stop() that the thread is gone. */
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+}
+
+/*
+ * Start initializing a device: record the ACTIVE state and create the
+ * initializing thread. Caller must hold vdev_initialize_lock.
+ * Device must be a concrete leaf that is not detached, not already
+ * initializing, has no stop pending, and whose top-level vdev is not being
+ * removed.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+ /* The thread clears its pointer and broadcasts on the cv when done. */
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+/*
+ * Recursive worker for vdev_initialize_stop_all(): stop initializing on
+ * every concrete leaf at or below vd, leaving each in tgt_state.
+ */
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+	/* Anything that is not a concrete leaf just forwards to its children. */
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		for (uint64_t c = 0; c < vd->vdev_children; c++) {
+			vdev_initialize_stop_all_impl(vd->vdev_child[c],
+			    tgt_state);
+		}
+		return;
+	}
+
+	mutex_enter(&vd->vdev_initialize_lock);
+	vdev_initialize_stop(vd, tgt_state);
+	mutex_exit(&vd->vdev_initialize_lock);
+}
+
+/*
+ * Stop initializing on the whole vdev tree rooted at vd, leaving every
+ * affected leaf in tgt_state with its initialize thread pointer cleared.
+ * If the pool's sync machinery is running, also wait for a txg to sync so
+ * the new state is on disk before returning.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+	vdev_initialize_stop_all_impl(vd, tgt_state);
+
+	spa_t *spa = vd->vdev_spa;
+	if (!spa->spa_sync_on)
+		return;
+
+	/* Make sure that our state has been synced to disk */
+	txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+/*
+ * Reload the persisted initialize state for vd and all of its descendants
+ * and resume initializing where appropriate.
+ *
+ * For each vdev with a leaf ZAP, the state and action time are read from
+ * the ZAP (a missing entry leaves VDEV_INITIALIZE_NONE / 0). A suspended
+ * or offline vdev only has its progress loaded for reporting; an ACTIVE,
+ * writeable vdev gets a new initializing thread. The caller must hold
+ * spa_namespace_lock and must not hold any spa config lock as writer.
+ */
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = (time_t)timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ vdev_initialize_load(vd);
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ /* Visit the rest of the tree regardless of this vdev's own state. */
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 34a750fe4d..133558d3d3 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -564,6 +564,7 @@ vdev_ops_t vdev_mirror_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -578,6 +579,7 @@ vdev_ops_t vdev_replacing_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -592,6 +594,7 @@ vdev_ops_t vdev_spare_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index d7d017fb8f..c761de8a20 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
/*
@@ -89,6 +89,7 @@ vdev_ops_t vdev_missing_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -103,6 +104,7 @@ vdev_ops_t vdev_hole_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 9f962350db..37de37e4b6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -152,6 +152,8 @@ uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -413,6 +415,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_scrub_min_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_min_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -474,6 +478,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -694,8 +700,8 @@ again:
}
/*
- * For LBA-ordered queues (async / scrub), issue the i/o which follows
- * the most recently issued i/o in LBA (offset) order.
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
@@ -755,13 +761,15 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 60360a0a46..0e6dfcc2c0 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -38,6 +38,10 @@
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+
/*
* Virtual device vector for RAID-Z.
*
@@ -1884,6 +1888,39 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+/*
+ * Debug-only cross-check of vdev_xlate(): translate the zio's logical range
+ * to the child backing column col and assert that the result agrees with
+ * the offset and size recorded in the raidz map. Compiles to nothing
+ * unless ZFS_DEBUG is defined.
+ */
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+	raidz_col_t *rc = &rm->rm_col[col];
+	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+	range_seg_t logical_rs, physical_rs;
+	logical_rs.rs_start = zio->io_offset;
+	logical_rs.rs_end = logical_rs.rs_start +
+	    vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+	vdev_xlate(cvd, &logical_rs, &physical_rs);
+	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+
+	/*
+	 * The translated end normally equals rc_offset + rc_size, but an
+	 * optional I/O at the end is not accounted for in rc_size, in which
+	 * case the translated range is exactly one sector longer.
+	 */
+	if (physical_rs.rs_end <= rc->rc_offset + rc->rc_size) {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+		    rc->rc_size + (1 << tvd->vdev_ashift));
+	}
+#endif
+}
+
/*
* Start an IO operation on a RAIDZ VDev
*
@@ -1926,6 +1963,12 @@ vdev_raidz_io_start(zio_t *zio)
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Verify logical to physical translation.
+ */
+ vdev_raidz_io_verify(zio, rm, c);
+
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
@@ -2555,6 +2598,37 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
+/*
+ * vdev_op_xlate implementation for raidz: given a sector-aligned range in
+ * on the raidz vdev, compute in res the range it covers on child cvd.
+ * Each bound is converted to a sector index, mapped to the number of rows
+ * that place a sector on cvd's column before that index, and converted
+ * back to bytes.
+ */
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+	vdev_t *raidvd = cvd->vdev_parent;
+	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+	uint64_t width = raidvd->vdev_children;
+	uint64_t tgt_col = cvd->vdev_id;
+
+	/* make sure the offsets are block-aligned */
+	ASSERT0(in->rs_start % (1 << ashift));
+	ASSERT0(in->rs_end % (1 << ashift));
+	uint64_t b_start = in->rs_start >> ashift;
+	uint64_t b_end = in->rs_end >> ashift;
+
+	/* Guard each subtraction against underflow. */
+	uint64_t start_row = (b_start > tgt_col) ?
+	    ((b_start - tgt_col - 1) / width) + 1 : 0;
+	uint64_t end_row = (b_end > tgt_col) ?
+	    ((b_end - tgt_col - 1) / width) + 1 : 0;
+
+	res->rs_start = start_row << ashift;
+	res->rs_end = end_row << ashift;
+
+	ASSERT3U(res->rs_start, <=, in->rs_start);
+	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
+
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
@@ -2565,6 +2639,7 @@ vdev_ops_t vdev_raidz_ops = {
NULL,
NULL,
NULL,
+ vdev_raidz_xlate,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c
index fc613ff58a..f913432bd0 100644
--- a/usr/src/uts/common/fs/zfs/vdev_removal.c
+++ b/usr/src/uts/common/fs/zfs/vdev_removal.c
@@ -44,6 +44,7 @@
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
/*
* This file contains the necessary logic to remove vdevs from a
@@ -1021,6 +1022,7 @@ vdev_remove_complete(spa_t *spa)
txg_wait_synced(spa->spa_dsl_pool, 0);
txg = spa_vdev_enter(spa);
vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1659,6 +1661,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
/* Make sure these changes are sync'ed */
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ /* Stop initializing */
+ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
*txg = spa_vdev_config_enter(spa);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -1819,6 +1824,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
*/
error = spa_reset_logs(spa);
+ /*
+ * We stop any initializing that is currently in progress but leave
+ * the state as "active". This will allow the initializing to resume
+ * if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
*txg = spa_vdev_config_enter(spa);
/*
@@ -1830,6 +1842,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
return (error);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index b3433c2424..edb52d6ca7 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -149,6 +149,7 @@ vdev_ops_t vdev_root_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 4acd2cf86a..a62e41d939 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -189,6 +189,8 @@
#include <sys/zcp.h>
#include <sys/zio_checksum.h>
#include <sys/vdev_removal.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -3745,6 +3747,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
}
/*
+ * innvl: {
+ * vdevs: {
+ * guid 1, guid 2, ...
+ * },
+ * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ * [func: EINVAL (if provided command type didn't make sense)],
+ * [vdevs: {
+ * guid1: errno, (see function body for possible errnos)
+ * ...
+ * }]
+ * }
+ *
+ */
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	spa_t *spa;
+	int error;
+
+	/*
+	 * Returns the spa_open() error if the pool cannot be opened, EINVAL
+	 * for malformed input or when at least one vdev rejected the
+	 * command, and 0 otherwise. Per-vdev errnos are reported in outnvl
+	 * under ZPOOL_INITIALIZE_VDEVS, keyed by the guid as a decimal
+	 * string.
+	 */
+	error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	uint64_t cmd_type;
+	if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+	    &cmd_type) != 0) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+	    cmd_type == POOL_INITIALIZE_DO ||
+	    cmd_type == POOL_INITIALIZE_SUSPEND)) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	nvlist_t *vdev_guids;
+	if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+	    &vdev_guids) != 0) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * The guid list comes from the caller. Reject any entry that is not
+	 * a uint64 before acting on the list: the fnvpair accessor used
+	 * below asserts on a type mismatch instead of returning an error.
+	 */
+	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+		uint64_t vdev_guid;
+
+		if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
+	}
+
+	nvlist_t *vdev_errlist = fnvlist_alloc();
+	int total_errors = 0;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+		uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+		error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
+		if (error != 0) {
+			char guid_as_str[MAXNAMELEN];
+
+			(void) snprintf(guid_as_str, sizeof (guid_as_str),
+			    "%llu", (unsigned long long)vdev_guid);
+			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+			total_errors++;
+		}
+	}
+	/* Only report the per-vdev list when something actually failed. */
+	if (fnvlist_size(vdev_errlist) > 0) {
+		fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+		    vdev_errlist);
+	}
+	fnvlist_free(vdev_errlist);
+
+	spa_close(spa, FTAG);
+	return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
* fsname is name of dataset to rollback (to most recent snapshot)
*
* innvl may contain name of expected target snapshot
@@ -5907,6 +5983,10 @@ zfs_ioctl_init(void)
zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 790514ddef..4325a502fe 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -626,6 +626,13 @@ typedef struct zpool_load_policy {
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
+#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
+ "com.delphix:next_offset_to_initialize"
+#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
+ "com.delphix:vdev_initialize_state"
+#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
+ "com.delphix:vdev_initialize_action_time"
+
/*
* This is needed in userland to report the minimum necessary device size.
*
@@ -723,6 +730,15 @@ typedef enum pool_scrub_cmd {
POOL_SCRUB_FLAGS_END
} pool_scrub_cmd_t;
+/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+ POOL_INITIALIZE_DO, /* presumably: start or resume initializing */
+ POOL_INITIALIZE_CANCEL, /* presumably: cancel initializing */
+ POOL_INITIALIZE_SUSPEND, /* presumably: suspend initializing */
+ POOL_INITIALIZE_FUNCS /* not a command; looks like a count sentinel */
+} pool_initialize_func_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -796,6 +812,14 @@ typedef struct pool_checkpoint_stat {
uint64_t pcs_space; /* checkpointed space */
} pool_checkpoint_stat_t;
+typedef enum {
+ VDEV_INITIALIZE_NONE, /* default when no state is stored in the ZAP */
+ VDEV_INITIALIZE_ACTIVE, /* set by vdev_initialize() when it starts */
+ VDEV_INITIALIZE_CANCELED, /* stop target honored even with no thread */
+ VDEV_INITIALIZE_SUSPENDED, /* progress is loaded but not resumed */
+ VDEV_INITIALIZE_COMPLETE /* set by the thread when it finishes */
+} vdev_initializing_state_t;
+
/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
@@ -814,10 +838,15 @@ typedef struct vdev_stat {
uint64_t vs_read_errors; /* read errors */
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
+ uint64_t vs_initialize_errors; /* initializing errors */
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
+ uint64_t vs_initialize_bytes_done; /* bytes initialized */
+ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
+ uint64_t vs_initialize_state; /* vdev_initializing_state_t */
+ uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
} vdev_stat_t;
@@ -945,6 +974,7 @@ typedef enum zfs_ioc {
ZFS_IOC_REMAP,
ZFS_IOC_POOL_CHECKPOINT,
ZFS_IOC_POOL_DISCARD_CHECKPOINT,
+ ZFS_IOC_POOL_INITIALIZE,
ZFS_IOC_LAST
} zfs_ioc_t;
@@ -1008,6 +1038,12 @@ typedef enum {
#define ZPOOL_HIST_ERRNO "errno"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
+ */
+#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
+#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1