author    Tony Hutter <hutter2@llnl.gov> 2020-02-11 06:20:40 -0700
committer Jerry Jelinek <jerry.jelinek@joyent.com> 2020-02-13 10:45:47 -0700
commit    dd50e0cc4cbe1474096300fe52e9855769c0d478 (patch)
tree      2ec7eb7dda66d419ab4de535b103b6f680e239f6
parent    3df9f0641f28754051d5e82c6457527cf4af1258 (diff)
download  illumos-joyent-dd50e0cc4cbe1474096300fe52e9855769c0d478.tar.gz
11682 zpool iostat and status improvements
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Hajo Möller <dasjoe@gmail.com>
Portions contributed by: Damian Wojslaw <damian@wojslaw.pl>
Portions contributed by: kpande <github@tripleback.net>
Portions contributed by: Anatoly Borodin <anatoly.borodin@gmail.com>
Portions contributed by: Gregor Kopka <mailfrom-github@kopka.net>
Portions contributed by: George Melikov <mail@gmelikov.ru>
Portions contributed by: George G <gg7@users.noreply.github.com>
Portions contributed by: DeHackEd <DeHackEd@users.noreply.github.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alek Pinchuk <apinchuk@datto.com>
Reviewed by: Allan Jude <allanjude@freebsd.org>
Reviewed by: Olaf Faaland <faaland1@llnl.gov>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Andy Fiddaman <omnios@citrus-it.co.uk>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
-rw-r--r--  usr/src/cmd/zpool/Makefile | 3
-rw-r--r--  usr/src/cmd/zpool/zpool_iter.c | 67
-rw-r--r--  usr/src/cmd/zpool/zpool_main.c | 1708
-rw-r--r--  usr/src/cmd/zpool/zpool_util.c | 28
-rw-r--r--  usr/src/cmd/zpool/zpool_util.h | 8
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_config.c | 14
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_pool.c | 12
-rw-r--r--  usr/src/lib/libzpool/common/kernel.c | 77
-rw-r--r--  usr/src/lib/libzpool/common/util.c | 8
-rw-r--r--  usr/src/man/man1m/zpool.1m | 148
-rw-r--r--  usr/src/pkg/manifests/system-test-zfstest.mf | 9
-rw-r--r--  usr/src/test/zfs-tests/include/libtest.shlib | 4
-rw-r--r--  usr/src/test/zfs-tests/include/zpool_script.shlib | 49
-rw-r--r--  usr/src/test/zfs-tests/runfiles/delphix.run | 5
-rw-r--r--  usr/src/test/zfs-tests/runfiles/omnios.run | 5
-rw-r--r--  usr/src/test/zfs-tests/runfiles/openindiana.run | 5
-rw-r--r--  usr/src/test/zfs-tests/runfiles/smartos.run | 5
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh | 13
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh | 4
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib | 8
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh | 2
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh | 7
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh | 6
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh | 76
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib | 5
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/fault/Makefile | 21
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh | 36
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/fault/fault.cfg | 57
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/fault/setup.ksh | 34
-rwxr-xr-x  usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh | 84
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh | 10
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib | 49
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 20
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 139
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_priority.h | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c | 247
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c | 171
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_fm.c | 196
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 33
-rw-r--r--  usr/src/uts/common/sys/fm/fs/zfs.h | 2
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h | 113
47 files changed, 3093 insertions, 428 deletions
diff --git a/usr/src/cmd/zpool/Makefile b/usr/src/cmd/zpool/Makefile
index 55d8abc80f..4b7e6600c8 100644
--- a/usr/src/cmd/zpool/Makefile
+++ b/usr/src/cmd/zpool/Makefile
@@ -40,7 +40,8 @@ STAT_COMMON_OBJS = timestamp.o
STAT_COMMON_SRCS = $(STAT_COMMON_OBJS:%.o=$(STATCOMMONDIR)/%.c)
SRCS += $(STAT_COMMON_SRCS)
-LDLIBS += -lzfs -lnvpair -ldevid -lefi -ldiskmgt -luutil -lumem -lzutil
+LDLIBS += -lzfs -lnvpair -ldevid -lefi -ldiskmgt -luutil -lumem -lzutil \
+ -lm -lzpool
INCS += -I../../common/zfs -I../../uts/common/fs/zfs -I$(STATCOMMONDIR)
INCS += -I../../lib/libzutil/common
diff --git a/usr/src/cmd/zpool/zpool_iter.c b/usr/src/cmd/zpool/zpool_iter.c
index c05c665ada..e69f9778e0 100644
--- a/usr/src/cmd/zpool/zpool_iter.c
+++ b/usr/src/cmd/zpool/zpool_iter.c
@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
+#include <sys/sysmacros.h>
#include <libzfs.h>
#include <libzutil.h>
@@ -253,3 +254,69 @@ for_each_pool(int argc, char **argv, boolean_t unavail,
return (ret);
}
+
+static int
+for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func,
+ void *data)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ int ret = 0;
+ int i;
+ char *type;
+
+ const char *list[] = {
+ ZPOOL_CONFIG_SPARES,
+ ZPOOL_CONFIG_L2CACHE,
+ ZPOOL_CONFIG_CHILDREN
+ };
+
+ for (i = 0; i < ARRAY_SIZE(list); i++) {
+ if (nvlist_lookup_nvlist_array(nv, list[i], &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++) {
+ uint64_t ishole = 0;
+
+ (void) nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole);
+
+ if (ishole)
+ continue;
+
+ ret |= for_each_vdev_cb(zhp, child[c], func,
+ data);
+ }
+ }
+ }
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (ret);
+
+ /* Don't run our function on root vdevs */
+ if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
+ ret |= func(zhp, nv, data);
+ }
+
+ return (ret);
+}
+
+/*
+ * This is the equivalent of for_each_pool() for vdevs. It iterates through
+ * all vdevs in the pool, ignoring root vdevs and holes, calling func() on
+ * each one.
+ *
+ * @zhp: Zpool handle
+ * @func: Function to call on each vdev
+ * @data: Custom data to pass to the function
+ */
+int
+for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data)
+{
+ nvlist_t *config, *nvroot = NULL;
+
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ }
+ return (for_each_vdev_cb(zhp, nvroot, func, data));
+}
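
A minimal usage sketch of the new iterator follows; it is illustrative only
and not part of the commit (count_vdev_cb is an invented name). A
pool_vdev_iter_f callback receives the pool handle, the vdev's config
nvlist, and the caller's data pointer, and for_each_vdev() ORs the
callbacks' return values together:

	/* Hypothetical callback: count every vdev the iterator visits. */
	static int
	count_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *data)
	{
		unsigned int *count = data;

		(*count)++;	/* called for each non-root, non-hole vdev */
		return (0);
	}

	/* ... later, with an open zpool_handle_t *zhp: */
	unsigned int count = 0;
	(void) for_each_vdev(zhp, count_vdev_cb, &count);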
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 23269c20d6..053b5cce86 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -29,7 +29,7 @@
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
- * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
* Copyright (c) 2012 by Cyril Plisko. All rights reserved.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -56,6 +56,9 @@
#include <sys/fs/zfs.h>
#include <sys/stat.h>
#include <sys/debug.h>
+#include <math.h>
+#include <sys/sysmacros.h>
+#include <sys/termios.h>
#include <libzfs.h>
#include <libzutil.h>
@@ -160,6 +163,85 @@ typedef enum {
} zpool_help_t;
+/*
+ * Flags for stats to display with "zpool iostat"
+ */
+enum iostat_type {
+ IOS_DEFAULT = 0,
+ IOS_LATENCY = 1,
+ IOS_QUEUES = 2,
+ IOS_L_HISTO = 3,
+ IOS_RQ_HISTO = 4,
+ IOS_COUNT, /* always last element */
+};
+
+/* iostat_type entries as bitmasks */
+#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT)
+#define IOS_LATENCY_M (1ULL << IOS_LATENCY)
+#define IOS_QUEUES_M (1ULL << IOS_QUEUES)
+#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO)
+#define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO)
+
+/* Mask of all the histo bits */
+#define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M)
+
+/*
+ * Lookup table for iostat flags to nvlist names. Basically a list
+ * of all the nvlists a flag requires. Also specifies the order in
+ * which data gets printed in zpool iostat.
+ */
+static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
+ [IOS_L_HISTO] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ NULL},
+ [IOS_LATENCY] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ NULL},
+ [IOS_QUEUES] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ NULL},
+ [IOS_RQ_HISTO] = {
+ ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+ ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+ ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+ ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+ ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+ NULL},
+};
+
+
+/*
+ * Given a cb->cb_flags with a histogram bit set, return the iostat_type.
+ * Right now, only one histo bit is ever set at one time, so we can
+ * just do a highbit64(a)
+ */
+#define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1)
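/*
 * Illustration (not part of this commit): IOS_LATENCY is bit 1 and
 * IOS_QUEUES is bit 2, so requesting both yields
 * cb_flags == IOS_LATENCY_M | IOS_QUEUES_M == 0x6.  With only the latency
 * histogram requested, IOS_HISTO_IDX(IOS_L_HISTO_M) == highbit64(0x8) - 1
 * == 3 == IOS_L_HISTO.
 */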
+
typedef struct zpool_command {
const char *name;
int (*func)(int, char **);
@@ -216,7 +298,7 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
};
-#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
+#define NCOMMAND (ARRAY_SIZE(command_table))
#define VDEV_ALLOC_CLASS_LOGS "logs"
@@ -265,8 +347,10 @@ get_usage(zpool_help_t idx)
"[-R root] [-F [-n]] [-t]\n"
"\t [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
case HELP_IOSTAT:
- return (gettext("\tiostat [-gLPv] [-T d|u] [pool] ... "
- "[interval [count]]\n"));
+ return (gettext("\tiostat "
+ "[[-lq]|[-rw]] [-T d | u] [-ghHLpPvy]\n"
+ "\t [pool] ..."
+ " [[-n] interval [count]]\n"));
case HELP_LABELCLEAR:
return (gettext("\tlabelclear [-f] <vdev>\n"));
case HELP_LIST:
@@ -294,7 +378,8 @@ get_usage(zpool_help_t idx)
return (gettext("\ttrim [-d] [-r <rate>] [-c | -s] <pool> "
"[<device> ...]\n"));
case HELP_STATUS:
- return (gettext("\tstatus [-igLPvxD] [-T d|u] [pool] ... "
+ return (gettext("\tstatus "
+ "[-igLpPsvxD] [-T d|u] [pool] ... "
"[interval [count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade\n"
@@ -1621,10 +1706,12 @@ typedef struct status_cbdata {
int cb_namewidth;
boolean_t cb_allpools;
boolean_t cb_verbose;
+ boolean_t cb_literal;
boolean_t cb_explain;
boolean_t cb_first;
boolean_t cb_dedup_stats;
boolean_t cb_print_status;
+ boolean_t cb_print_slow_ios;
boolean_t cb_print_vdev_init;
boolean_t cb_print_vdev_trim;
} status_cbdata_t;
@@ -1778,10 +1865,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
name, state);
if (!isspare) {
- zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
- zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
- zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
- (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+ if (cb->cb_literal) {
+ printf(" %5llu %5llu %5llu",
+ (u_longlong_t)vs->vs_read_errors,
+ (u_longlong_t)vs->vs_write_errors,
+ (u_longlong_t)vs->vs_checksum_errors);
+ } else {
+ zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+ zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+ zfs_nicenum(vs->vs_checksum_errors, cbuf,
+ sizeof (cbuf));
+ printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+ }
+
+ if (cb->cb_print_slow_ios) {
+ if (children == 0) {
+ /* Only leaf vdevs have slow IOs */
+ zfs_nicenum(vs->vs_slow_ios, rbuf,
+ sizeof (rbuf));
+ } else {
+ (void) snprintf(rbuf, sizeof (rbuf), "-");
+ }
+
+ if (cb->cb_literal)
+ printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+ else
+ printf(" %5s", rbuf);
+ }
+
}
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
@@ -2382,7 +2493,8 @@ show_import(nvlist_t *config)
(void) printf(gettext(" config:\n\n"));
- cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0);
+ cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name),
+ VDEV_NAME_TYPE_ID);
if (cb.cb_namewidth < 10)
cb.cb_namewidth = 10;
@@ -3112,44 +3224,737 @@ zpool_do_sync(int argc, char **argv)
}
typedef struct iostat_cbdata {
- boolean_t cb_verbose;
+ uint64_t cb_flags;
int cb_name_flags;
int cb_namewidth;
int cb_iteration;
+ char **cb_vdev_names; /* Only show these vdevs */
+ unsigned int cb_vdev_names_count;
+ boolean_t cb_verbose;
+ boolean_t cb_literal;
boolean_t cb_scripted;
zpool_list_t *cb_list;
} iostat_cbdata_t;
+/* iostat labels */
+typedef struct name_and_columns {
+ const char *name; /* Column name */
+ unsigned int columns; /* Center name to this number of columns */
+} name_and_columns_t;
+
+#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */
+
+static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
+ {NULL}},
+ [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+ {"asyncq_wait", 2}, {"scrub"}, {"trim", 1}, {NULL}},
+ [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
+ {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
+ {"trimq_write", 2}, {NULL}},
+ [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+ {"asyncq_wait", 2}, {NULL}},
+ [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2},
+ {"async_read", 2}, {"async_write", 2}, {"scrub", 2},
+ {"trim", 2}, {NULL}},
+
+};
+
+/* Shorthand - if "columns" field not set, default to 1 column */
+static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {NULL}},
+ [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}},
+ [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
+ {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"},
+ {"pend"}, {"activ"}, {NULL}},
+ [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}},
+ [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
+ {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}},
+};
+
+static const char *histo_to_title[] = {
+ [IOS_L_HISTO] = "latency",
+ [IOS_RQ_HISTO] = "req_size",
+};
+
+/*
+ * Return the number of labels in a null-terminated name_and_columns_t
+ * array.
+ *
+ */
+static unsigned int
+label_array_len(const name_and_columns_t *labels)
+{
+ int i = 0;
+
+ while (labels[i].name)
+ i++;
+
+ return (i);
+}
+
+/*
+ * Return the number of strings in a null-terminated string array.
+ * For example:
+ *
+ * const char *foo[] = {"bar", "baz", NULL};
+ *
+ * returns 2
+ */
+static uint64_t
+str_array_len(const char *array[])
+{
+ uint64_t i = 0;
+ while (array[i])
+ i++;
+
+ return (i);
+}
+
+
+/*
+ * Return a default column width for default/latency/queue columns. This does
+ * not include histograms, which have their columns autosized.
+ */
+static unsigned int
+default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
+{
+ unsigned long column_width = 5; /* Normal niceprint */
+ static unsigned long widths[] = {
+ /*
+ * Choose some sane default column sizes for printing the
+ * raw numbers.
+ */
+ [IOS_DEFAULT] = 15, /* 1PB capacity */
+ [IOS_LATENCY] = 10, /* 1B ns = 10sec */
+ [IOS_QUEUES] = 6, /* 1M queue entries */
+ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */
+ [IOS_RQ_HISTO] = 6, /* 1M queue entries */
+ };
+
+ if (cb->cb_literal)
+ column_width = widths[type];
+
+ return (column_width);
+}
+
+/*
+ * Print the column labels, i.e:
+ *
+ * capacity operations bandwidth
+ * alloc free read write read write ...
+ *
+ * If force_column_width is set, use it for the column width. If not set, use
+ * the default column width.
+ */
+void
+print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
+{
+ int i, idx, s;
+ int text_start, rw_column_width, spaces_to_end;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ unsigned int column_width = force_column_width;
+
+ /* For each bit set in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ if (!force_column_width)
+ column_width = default_column_width(cb, idx);
+ /* Print our top labels centered over "read write" label. */
+ for (i = 0; i < label_array_len(labels[idx]); i++) {
+ const char *name = labels[idx][i].name;
+ /*
+ * We treat labels[][].columns == 0 as shorthand
+ * for one column. It makes writing out the label
+ * tables more concise.
+ */
+ unsigned int columns = MAX(1, labels[idx][i].columns);
+ unsigned int slen = strlen(name);
+
+ rw_column_width = (column_width * columns) +
+ (2 * (columns - 1));
+
+ text_start = (int)((rw_column_width) / columns -
+ slen / columns);
+ if (text_start < 0)
+ text_start = 0;
+
+ printf(" "); /* Two spaces between columns */
+
+ /* Space from beginning of column to label */
+ for (s = 0; s < text_start; s++)
+ printf(" ");
+
+ printf("%s", name);
+
+ /* Print space after label to end of column */
+ spaces_to_end = rw_column_width - text_start - slen;
+ if (spaces_to_end < 0)
+ spaces_to_end = 0;
+
+ for (s = 0; s < spaces_to_end; s++)
+ printf(" ");
+ }
+ }
+}
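/*
 * Worked example (illustrative, not part of this commit): with
 * column_width == 5 and a label spanning columns == 2,
 * rw_column_width == 5 * 2 + 2 == 12.  Centering "operations" (slen == 10)
 * gives text_start == 12/2 - 10/2 == 1 and spaces_to_end == 12 - 1 - 10
 * == 1, i.e. one space of padding on each side of the label.
 */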
+
+/*
+ * Utility function to print out a line of dashes like:
+ *
+ * -------------------------------- ----- ----- ----- ----- -----
+ *
+ * ...or a dashed named-row line like:
+ *
+ * logs - - - - -
+ *
+ * @cb: iostat data
+ *
+ * @force_column_width If non-zero, use the value as the column width.
+ * Otherwise use the default column widths.
+ *
+ * @name: Print a dashed named-row line starting
+ * with @name. Otherwise, print a regular
+ * dashed line.
+ */
+static void
+print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *name)
+{
+ int i;
+ unsigned int namewidth;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ int idx;
+ const name_and_columns_t *labels;
+ const char *title;
+
+
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+ } else if (cb->cb_vdev_names_count) {
+ title = "vdev";
+ } else {
+ title = "pool";
+ }
+
+ namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+ name ? strlen(name) : 0);
+
+
+ if (name) {
+ printf("%-*s", namewidth, name);
+ } else {
+ for (i = 0; i < namewidth; i++)
+ (void) printf("-");
+ }
+
+ /* For each bit in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ unsigned int column_width;
+ idx = lowbit64(f) - 1;
+ if (force_column_width)
+ column_width = force_column_width;
+ else
+ column_width = default_column_width(cb, idx);
+
+ labels = iostat_bottom_labels[idx];
+ for (i = 0; i < label_array_len(labels); i++) {
+ if (name)
+ printf(" %*s-", column_width - 1, " ");
+ else
+ printf(" %.*s", column_width,
+ "--------------------");
+ }
+ }
+}
+
+
+static void
+print_iostat_separator_impl(iostat_cbdata_t *cb,
+ unsigned int force_column_width)
+{
+ print_iostat_dashes(cb, force_column_width, NULL);
+}
+
static void
print_iostat_separator(iostat_cbdata_t *cb)
{
- int i = 0;
+ print_iostat_separator_impl(cb, 0);
+}
+
+static void
+print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *histo_vdev_name)
+{
+ unsigned int namewidth;
+ const char *title;
+
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+ } else if (cb->cb_vdev_names_count) {
+ title = "vdev";
+ } else {
+ title = "pool";
+ }
+
+ namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+ histo_vdev_name ? strlen(histo_vdev_name) : 0);
+
+ if (histo_vdev_name)
+ printf("%-*s", namewidth, histo_vdev_name);
+ else
+ printf("%*s", namewidth, "");
- for (i = 0; i < cb->cb_namewidth; i++)
- (void) printf("-");
- (void) printf(" ----- ----- ----- ----- ----- -----\n");
+
+ print_iostat_labels(cb, force_column_width, iostat_top_labels);
+ printf("\n");
+
+ printf("%-*s", namewidth, title);
+
+ print_iostat_labels(cb, force_column_width, iostat_bottom_labels);
+
+ printf("\n");
+
+ print_iostat_separator_impl(cb, force_column_width);
+
+ printf("\n");
}
static void
print_iostat_header(iostat_cbdata_t *cb)
{
- (void) printf("%*s capacity operations bandwidth\n",
- cb->cb_namewidth, "");
- (void) printf("%-*s alloc free read write read write\n",
- cb->cb_namewidth, "pool");
- print_iostat_separator(cb);
+ print_iostat_header_impl(cb, 0, NULL);
}
/*
* Display a single statistic.
*/
static void
-print_one_stat(uint64_t value)
+print_one_stat(uint64_t value, enum zfs_nicenum_format format,
+ unsigned int column_size, boolean_t scripted)
{
char buf[64];
- zfs_nicenum(value, buf, sizeof (buf));
- (void) printf(" %5s", buf);
+ zfs_nicenum_format(value, buf, sizeof (buf), format);
+
+ if (scripted)
+ printf("\t%s", buf);
+ else
+ printf(" %*s", column_size, buf);
+}
+
+/*
+ * Calculate the default vdev stats
+ *
+ * Subtract oldvs from newvs and save the resulting stats in calcvs; the
+ * scaling factor is applied later, when the stats are printed.
+ */
+static void
+calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs,
+ vdev_stat_t *calcvs)
+{
+ int i;
+
+ memcpy(calcvs, newvs, sizeof (*calcvs));
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++)
+ calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]);
+
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++)
+ calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]);
+}
+
+/*
+ * Internal representation of the extended iostats data.
+ *
+ * The extended iostat stats are exported in nvlists as either uint64_t arrays
+ * or single uint64_t's. We make both look like arrays to make them easier
+ * to process. In order to make single uint64_t's look like arrays, we set
+ * __data to the stat data, and then set *data = &__data with count = 1. Then,
+ * we can just use *data and count.
+ */
+struct stat_array {
+ uint64_t *data;
+ uint_t count; /* Number of entries in data[] */
+ uint64_t __data; /* Only used when data is a single uint64_t */
+};
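/*
 * Sketch (illustrative, not part of this commit) of how a scalar stat is
 * wrapped so callers can treat every stat uniformly as an array:
 *
 *	struct stat_array sa;
 *	sa.__data = 42;			(the single uint64_t value)
 *	sa.data = &sa.__data;
 *	sa.count = 1;			(so sa.data[0] == 42)
 */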
+
+static uint64_t
+stat_histo_max(struct stat_array *nva, unsigned int len)
+{
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array64_max(nva[i].data, nva[i].count));
+
+ return (max);
+}
+
+/*
+ * Helper function to lookup a uint64_t array or uint64_t value and store its
+ * data as a stat_array. If the nvpair is a single uint64_t value, then we make
+ * it look like a one element array to make it easier to process.
+ */
+static int
+nvpair64_to_stat_array(nvlist_t *nvl, const char *name,
+ struct stat_array *nva)
+{
+ nvpair_t *tmp;
+ int ret;
+
+ verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0);
+ switch (nvpair_type(tmp)) {
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvpair_value_uint64(tmp, &nva->__data);
+ nva->data = &nva->__data;
+ nva->count = 1;
+ break;
+ default:
+ /* Not a uint64_t */
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * Given a list of nvlist names, look up the extended stats in newnv and oldnv,
+ * subtract them, and return the results in a newly allocated stat_array.
+ * You must free the returned array after you are done with it with
+ * free_calc_stats().
+ *
+ * Additionally, you can set "oldnv" to NULL if you simply want the newnv
+ * values.
+ */
+static struct stat_array *
+calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ nvlist_t *oldnvx = NULL, *newnvx;
+ struct stat_array *oldnva, *newnva, *calcnva;
+ int i, j;
+ unsigned int alloc_size = (sizeof (struct stat_array)) * len;
+
+ /* Extract our extended stats nvlist from the main list */
+ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &newnvx) == 0);
+ if (oldnv) {
+ verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &oldnvx) == 0);
+ }
+
+ newnva = safe_malloc(alloc_size);
+ oldnva = safe_malloc(alloc_size);
+ calcnva = safe_malloc(alloc_size);
+
+ for (j = 0; j < len; j++) {
+ verify(nvpair64_to_stat_array(newnvx, names[j],
+ &newnva[j]) == 0);
+ calcnva[j].count = newnva[j].count;
+ alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]);
+ calcnva[j].data = safe_malloc(alloc_size);
+ memcpy(calcnva[j].data, newnva[j].data, alloc_size);
+
+ if (oldnvx) {
+ verify(nvpair64_to_stat_array(oldnvx, names[j],
+ &oldnva[j]) == 0);
+ for (i = 0; i < oldnva[j].count; i++)
+ calcnva[j].data[i] -= oldnva[j].data[i];
+ }
+ }
+ free(newnva);
+ free(oldnva);
+ return (calcnva);
+}
+
+static void
+free_calc_stats(struct stat_array *nva, unsigned int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ free(nva[i].data);
+
+ free(nva);
+}
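/*
 * Sketch (illustrative, not part of this commit) of the intended
 * allocate/use/free pairing; "names" and "len" stand for any of the nvlist
 * name tables defined above:
 *
 *	struct stat_array *nva;
 *
 *	nva = calc_and_alloc_stats_ex(names, len, oldnv, newnv);
 *	... read nva[0 .. len-1].data and .count ...
 *	free_calc_stats(nva, len);
 */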
+
+static void
+print_iostat_histo(struct stat_array *nva, unsigned int len,
+ iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth,
+ double scale)
+{
+ int i, j;
+ char buf[6];
+ uint64_t val;
+ enum zfs_nicenum_format format;
+ unsigned int buckets;
+ unsigned int start_bucket;
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ /* All these histos are the same size, so just use nva[0].count */
+ buckets = nva[0].count;
+
+ if (cb->cb_flags & IOS_RQ_HISTO_M) {
+ /* Start at 512 - req size should never be lower than this */
+ start_bucket = 9;
+ } else {
+ start_bucket = 0;
+ }
+
+ for (j = start_bucket; j < buckets; j++) {
+ /* Print histogram bucket label */
+ if (cb->cb_flags & IOS_L_HISTO_M) {
+ /* Ending range of this bucket */
+ val = (1ULL << (j + 1)) - 1;
+ zfs_nicetime(val, buf, sizeof (buf));
+ } else {
+ /* Request size (starting range of bucket) */
+ val = (1UL << j);
+ zfs_nicenum(val, buf, sizeof (buf));
+ }
+
+ if (cb->cb_scripted)
+ printf("%llu", (u_longlong_t)val);
+ else
+ printf("%-*s", namewidth, buf);
+
+ /* Print the values on the line */
+ for (i = 0; i < len; i++) {
+ print_one_stat(nva[i].data[j] * scale, format,
+ column_width, cb->cb_scripted);
+ }
+ printf("\n");
+ }
+}
+
+static void
+print_solid_separator(unsigned int length)
+{
+ while (length--)
+ printf("-");
+ printf("\n");
+}
+
+static void
+print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv, double scale, const char *name)
+{
+ unsigned int column_width;
+ unsigned int namewidth;
+ unsigned int entire_width;
+ enum iostat_type type;
+ struct stat_array *nva;
+ const char **names;
+ unsigned int names_len;
+
+ /* What type of histo are we? */
+ type = IOS_HISTO_IDX(cb->cb_flags);
+
+ /* Get NULL-terminated array of nvlist names for our histo */
+ names = vsx_type_to_nvlist[type];
+ names_len = str_array_len(names); /* num of names */
+
+ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv);
+
+ if (cb->cb_literal) {
+ column_width = MAX(5,
+ (unsigned int) log10(stat_histo_max(nva, names_len)) + 1);
+ } else {
+ column_width = 5;
+ }
+
+ namewidth = MAX(cb->cb_namewidth,
+ strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]));
+
+ /*
+ * Calculate the entire line width of what we're printing. The
+ * +2 is for the two spaces between columns:
+ */
+ /* read write */
+ /* ----- ----- */
+ /* |___| <---------- column_width */
+ /* */
+ /* |__________| <--- entire_width */
+ /* */
+ entire_width = namewidth + (column_width + 2) *
+ label_array_len(iostat_bottom_labels[type]);
+
+ if (cb->cb_scripted)
+ printf("%s\n", name);
+ else
+ print_iostat_header_impl(cb, column_width, name);
+
+ print_iostat_histo(nva, names_len, cb, column_width,
+ namewidth, scale);
+
+ free_calc_stats(nva, names_len);
+ if (!cb->cb_scripted)
+ print_solid_separator(entire_width);
+}
+
+/*
+ * Calculate the average latency of a power-of-two latency histogram
+ */
+static uint64_t
+single_histo_average(uint64_t *histo, unsigned int buckets)
+{
+ int i;
+ uint64_t count = 0, total = 0;
+
+ for (i = 0; i < buckets; i++) {
+ /*
+ * Our buckets are power-of-two latency ranges. Use the
+ * midpoint latency of each bucket to calculate the average.
+ * For example:
+ *
+ * Bucket Midpoint
+ * 8ns-15ns: 12ns
+ * 16ns-31ns: 24ns
+ * ...
+ */
+ if (histo[i] != 0) {
+ total += histo[i] * (((1UL << i) + ((1UL << i)/2)));
+ count += histo[i];
+ }
+ }
+
+ /* Prevent divide by zero */
+ return (count == 0 ? 0 : total / count);
+}
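/*
 * Worked example (illustrative, not part of this commit): histo[3] == 2 and
 * histo[4] == 1 means two I/Os landed in the 8ns-15ns bucket and one in the
 * 16ns-31ns bucket.  The bucket midpoints are 12ns and 24ns, so the average
 * is (2 * 12 + 1 * 24) / (2 + 1) == 16ns.
 */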
+
+static void
+print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ };
+
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_QUEUES);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ val = nva[i].data[0];
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+static void
+print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ };
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_LATENCY);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAWTIME;
+ else
+ format = ZFS_NICENUM_TIME;
+
+ /* Print our avg latencies on the line */
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ /* Compute average latency for a latency histo */
+ val = single_histo_average(nva[i].data, nva[i].count);
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+/*
+ * Print default statistics (capacity/operations/bandwidth)
+ */
+static void
+print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale)
+{
+ unsigned int column_width = default_column_width(cb, IOS_DEFAULT);
+ enum zfs_nicenum_format format;
+ char na; /* char to print for "not applicable" values */
+
+ if (cb->cb_literal) {
+ format = ZFS_NICENUM_RAW;
+ na = '0';
+ } else {
+ format = ZFS_NICENUM_1024;
+ na = '-';
+ }
+
+ /* only toplevel vdevs have capacity stats */
+ if (vs->vs_space == 0) {
+ if (cb->cb_scripted)
+ printf("\t%c\t%c", na, na);
+ else
+ printf(" %*c %*c", column_width, na, column_width,
+ na);
+ } else {
+ print_one_stat(vs->vs_alloc, format, column_width,
+ cb->cb_scripted);
+ print_one_stat(vs->vs_space - vs->vs_alloc, format,
+ column_width, cb->cb_scripted);
+ }
+
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
}
static const char *class_name[] = {
@@ -3162,21 +3967,27 @@ static const char *class_name[] = {
* Print out all the statistics for the given vdev. This can either be the
* toplevel configuration, or called recursively. If 'name' is NULL, then this
* is a verbose output, and we don't want to display the toplevel pool stats.
+ *
+ * Returns the number of stat lines printed.
*/
-static void
+unsigned int
print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
{
nvlist_t **oldchild, **newchild;
- uint_t c, children;
- vdev_stat_t *oldvs, *newvs;
+ uint_t c, children, oldchildren;
+ vdev_stat_t *oldvs, *newvs, *calcvs;
vdev_stat_t zerovs = { 0 };
char *vname;
+ int i;
+ int ret = 0;
uint64_t tdelta;
double scale;
if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
- return;
+ return (0);
+
+ calcvs = safe_malloc(sizeof (*calcvs));
if (oldnv != NULL) {
verify(nvlist_lookup_uint64_array(oldnv,
@@ -3185,54 +3996,98 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
oldvs = &zerovs;
}
+ /* Do we only want to see a specific vdev? */
+ for (i = 0; i < cb->cb_vdev_names_count; i++) {
+ /* Yes we do. Is this the vdev? */
+ if (strcmp(name, cb->cb_vdev_names[i]) == 0) {
+ /*
+ * This is our vdev. Since it is the only vdev we
+ * will be displaying, make depth = 0 so that it
+ * doesn't get indented.
+ */
+ depth = 0;
+ break;
+ }
+ }
+
+ if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) {
+ /* Couldn't match the name */
+ goto children;
+ }
+
+
verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&newvs, &c) == 0);
- if (strlen(name) + depth > cb->cb_namewidth)
- (void) printf("%*s%s", depth, "", name);
- else
- (void) printf("%*s%s%*s", depth, "", name,
- (int)(cb->cb_namewidth - strlen(name) - depth), "");
+ /*
+ * Print the vdev name unless it's a histogram. Histograms
+ * display the vdev name in the header itself.
+ */
+ if (!(cb->cb_flags & IOS_ANYHISTO_M)) {
+ if (cb->cb_scripted) {
+ printf("%s", name);
+ } else {
+ if (strlen(name) + depth > cb->cb_namewidth)
+ (void) printf("%*s%s", depth, "", name);
+ else
+ (void) printf("%*s%s%*s", depth, "", name,
+ (int)(cb->cb_namewidth - strlen(name) -
+ depth), "");
+ }
+ }
+ /* Calculate our scaling factor */
tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
-
- if (tdelta == 0)
- scale = 1.0;
- else
- scale = (double)NANOSEC / tdelta;
-
- /* only toplevel vdevs have capacity stats */
- if (newvs->vs_space == 0) {
- (void) printf(" - -");
+ if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) {
+ /*
+ * If we specify printing histograms with no time interval, then
+ * print the histogram numbers over the entire lifetime of the
+ * vdev.
+ */
+ scale = 1;
} else {
- print_one_stat(newvs->vs_alloc);
- print_one_stat(newvs->vs_space - newvs->vs_alloc);
+ if (tdelta == 0)
+ scale = 1.0;
+ else
+ scale = (double)NANOSEC / tdelta;
}
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
- oldvs->vs_ops[ZIO_TYPE_READ])));
+ if (cb->cb_flags & IOS_DEFAULT_M) {
+ calc_default_iostats(oldvs, newvs, calcvs);
+ print_iostat_default(calcvs, cb, scale);
+ }
+ if (cb->cb_flags & IOS_LATENCY_M)
+ print_iostat_latency(cb, oldnv, newnv);
+ if (cb->cb_flags & IOS_QUEUES_M)
+ print_iostat_queues(cb, oldnv, newnv);
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ printf("\n");
+ print_iostat_histos(cb, oldnv, newnv, scale, name);
+ }
- print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
- oldvs->vs_ops[ZIO_TYPE_WRITE])));
+ if (!(cb->cb_flags & IOS_ANYHISTO_M))
+ printf("\n");
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
- oldvs->vs_bytes[ZIO_TYPE_READ])));
+ ret++;
- print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
- oldvs->vs_bytes[ZIO_TYPE_WRITE])));
+children:
- (void) printf("\n");
+ free(calcvs);
if (!cb->cb_verbose)
- return;
+ return (ret);
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
&newchild, &children) != 0)
- return;
+ return (ret);
- if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
- &oldchild, &c) != 0)
- return;
+ if (oldnv) {
+ if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
+ &oldchild, &oldchildren) != 0)
+ return (ret);
+
+ children = MIN(oldchildren, children);
+ }
/*
* print normal top-level devices
@@ -3254,7 +4109,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
free(vname);
}
@@ -3264,6 +4119,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
*/
for (uint_t n = 0; n < 3; n++) {
boolean_t printed = B_FALSE;
+
for (c = 0; c < children; c++) {
uint64_t islog = B_FALSE;
char *bias = NULL;
@@ -3285,11 +4141,10 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
continue;
if (!printed) {
- if (!cb->cb_scripted) {
- (void) printf(
- "%-*s - - - -"
- " - -",
- cb->cb_namewidth, class_name[n]);
+ if ((!(cb->cb_flags & IOS_ANYHISTO_M)) &&
+ !cb->cb_scripted && !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0,
+ class_name[n]);
}
printf("\n");
printed = B_TRUE;
@@ -3297,7 +4152,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ?
+ ret += print_vdev_stats(zhp, vname, oldnv ?
oldchild[c] : NULL, newchild[c], cb, depth + 2);
free(vname);
}
@@ -3309,23 +4164,33 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
*/
if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
&newchild, &children) != 0)
- return;
+ return (ret);
- if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
- &oldchild, &c) != 0)
- return;
+ if (oldnv) {
+ if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
+ &oldchild, &oldchildren) != 0)
+ return (ret);
+
+ children = MIN(oldchildren, children);
+ }
if (children > 0) {
- (void) printf("%-*s - - - - - "
- "-\n", cb->cb_namewidth, "cache");
+ if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted &&
+ !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0, "cache");
+ }
+ printf("\n");
+
for (c = 0; c < children; c++) {
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
- print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
- newchild[c], cb, depth + 2);
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c]
+ : NULL, newchild[c], cb, depth + 2);
free(vname);
}
}
+
+ return (ret);
}
static int
@@ -3355,6 +4220,7 @@ print_iostat(zpool_handle_t *zhp, void *data)
iostat_cbdata_t *cb = data;
nvlist_t *oldconfig, *newconfig;
nvlist_t *oldnvroot, *newnvroot;
+ int ret;
newconfig = zpool_get_config(zhp, &oldconfig);
@@ -3370,63 +4236,79 @@ print_iostat(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
&oldnvroot) == 0);
- /*
- * Print out the statistics for the pool.
- */
- print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0);
-
- if (cb->cb_verbose)
+ ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot,
+ cb, 0);
+ if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) &&
+ !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) {
print_iostat_separator(cb);
+ printf("\n");
+ }
- return (0);
+ return (ret);
}
-int
-get_namewidth(zpool_handle_t *zhp, void *data)
+static int
+get_columns(void)
+{
+ struct winsize ws;
+ int columns = 80;
+ int error;
+
+ if (isatty(STDOUT_FILENO)) {
+ error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws);
+ if (error == 0)
+ columns = ws.ws_col;
+ } else {
+ columns = 999;
+ }
+
+ return (columns);
+}
+
+/*
+ * Return the required length of the pool/vdev name column. The minimum
+ * allowed width and output formatting flags must be provided.
+ */
+static int
+get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose)
{
- iostat_cbdata_t *cb = data;
nvlist_t *config, *nvroot;
+ int width = min_width;
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- if (!cb->cb_verbose)
- cb->cb_namewidth = strlen(zpool_get_name(zhp));
- else
- cb->cb_namewidth = max_width(zhp, nvroot, 0,
- cb->cb_namewidth, cb->cb_name_flags);
+ unsigned int poolname_len = strlen(zpool_get_name(zhp));
+ if (verbose == B_FALSE) {
+ width = MAX(poolname_len, min_width);
+ } else {
+ width = MAX(poolname_len,
+ max_width(zhp, nvroot, 0, min_width, flags));
+ }
}
- /*
- * The width must fall into the range [10,38]. The upper limit is the
- * maximum we can have and still fit in 80 columns.
- */
- if (cb->cb_namewidth < 10)
- cb->cb_namewidth = 10;
- if (cb->cb_namewidth > 38)
- cb->cb_namewidth = 38;
-
- return (0);
+ return (width);
}
/*
* Parse the input string, get the 'interval' and 'count' value if there is one.
*/
static void
-get_interval_count(int *argcp, char **argv, unsigned long *iv,
+get_interval_count(int *argcp, char **argv, float *iv,
unsigned long *cnt)
{
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
int argc = *argcp, errno;
/*
* Determine if the last argument is an integer or a pool name
*/
- if (argc > 0 && zfs_isnumber(argv[argc - 1])) {
+ if (argc > 0 && isnumber(argv[argc - 1])) {
char *end;
errno = 0;
- interval = strtoul(argv[argc - 1], &end, 10);
+ interval = strtof(argv[argc - 1], &end);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
@@ -3452,12 +4334,12 @@ get_interval_count(int *argcp, char **argv, unsigned long *iv,
* If the last argument is also an integer, then we have both a count
* and an interval.
*/
- if (argc > 0 && zfs_isnumber(argv[argc - 1])) {
+ if (argc > 0 && isnumber(argv[argc - 1])) {
char *end;
errno = 0;
count = interval;
- interval = strtoul(argv[argc - 1], &end, 10);
+ interval = strtof(argv[argc - 1], &end);
if (*end == '\0' && errno == 0) {
if (interval == 0) {
@@ -3492,13 +4374,296 @@ get_timestamp_arg(char c)
}
/*
- * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]]
+ * Return the stat flags that are supported on all pools by both the module
+ * and zpool iostat. "*data" should be initialized to all 0xFFs before running.
+ * It will get ANDed down until only the flags that are supported on all pools
+ * remain.
+ */
+static int
+get_stat_flags_cb(zpool_handle_t *zhp, void *data)
+{
+ uint64_t *mask = data;
+ nvlist_t *config, *nvroot, *nvx;
+ uint64_t flags = 0;
+ int i, j;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ /* Default stats are always supported, but for completeness.. */
+ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS))
+ flags |= IOS_DEFAULT_M;
+
+ /* Get our extended stats nvlist from the main list */
+ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &nvx) != 0) {
+ /*
+ * No extended stats; they're probably running an older
+ * module. No big deal, we support that too.
+ */
+ goto end;
+ }
+
+ /* For each extended stat, make sure all its nvpairs are supported */
+ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) {
+ if (!vsx_type_to_nvlist[j][0])
+ continue;
+
+ /* Start off by assuming the flag is supported, then check */
+ flags |= (1ULL << j);
+ for (i = 0; vsx_type_to_nvlist[j][i]; i++) {
+ if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) {
+ /* flag isn't supported */
+ flags = flags & ~(1ULL << j);
+ break;
+ }
+ }
+ }
+end:
+ *mask = *mask & flags;
+ return (0);
+}
+
+/*
+ * Return a bitmask of stats that are supported on all pools by both the module
+ * and zpool iostat.
+ */
+static uint64_t
+get_stat_flags(zpool_list_t *list)
+{
+ uint64_t mask = -1;
+
+ /*
+ * get_stat_flags_cb() will lop off bits from "mask" until only the
+ * flags that are supported on all pools remain.
+ */
+ (void) pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask);
+ return (mask);
+}
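/*
 * Illustration (not part of this commit): if one pool supports
 * {default, latency, queues} but another was last probed by an older module
 * and reports only {default}, the mask starts at all ones and is ANDed down
 * to IOS_DEFAULT_M -- only stats that every pool can report survive.
 */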
+
+/*
+ * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise.
+ */
+static int
+is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data)
+{
+ iostat_cbdata_t *cb = cb_data;
+ char *name;
+
+ name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags);
+
+ if (strcmp(name, cb->cb_vdev_names[0]) == 0)
+ return (1); /* match */
+
+ return (0);
+}
+
+/*
+ * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise.
+ */
+static int
+is_vdev(zpool_handle_t *zhp, void *cb_data)
+{
+ return (for_each_vdev(zhp, is_vdev_cb, cb_data));
+}
+
+/*
+ * Check if vdevs are in a pool
+ *
+ * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise
+ * return 0. If pool_name is NULL, then search all pools.
+ */
+static int
+are_vdevs_in_pool(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ char **tmp_name;
+ int ret = 0;
+ int i;
+ int pool_count = 0;
+
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ if (pool_name)
+ pool_count = 1;
+
+ /* Temporarily hijack cb_vdev_names for a second... */
+ tmp_name = cb->cb_vdev_names;
+
+ /* Go through our list of prospective vdev names */
+ for (i = 0; i < argc; i++) {
+ cb->cb_vdev_names = argv + i;
+
+ /* Is this name a vdev in our pools? */
+ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL,
+ is_vdev, cb);
+ if (!ret) {
+ /* No match */
+ break;
+ }
+ }
+
+ cb->cb_vdev_names = tmp_name;
+
+ return (ret);
+}
+
+static int
+is_pool_cb(zpool_handle_t *zhp, void *data)
+{
+ char *name = data;
+ if (strcmp(name, zpool_get_name(zhp)) == 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Do we have a pool named *name? If so, return 1, otherwise 0.
+ */
+static int
+is_pool(char *name)
+{
+ return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name));
+}
+
+/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */
+static int
+are_all_pools(int argc, char **argv)
+{
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ while (--argc >= 0)
+ if (!is_pool(argv[argc]))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Helper function to print out vdev/pool names we can't resolve. Used for an
+ * error message.
+ */
+static void
+error_list_unresolved_vdevs(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ int i;
+ char *name;
+ char *str;
+ for (i = 0; i < argc; i++) {
+ name = argv[i];
+
+ if (is_pool(name))
+ str = gettext("pool");
+ else if (are_vdevs_in_pool(1, &name, pool_name, cb))
+ str = gettext("vdev in this pool");
+ else if (are_vdevs_in_pool(1, &name, NULL, cb))
+ str = gettext("vdev in another pool");
+ else
+ str = gettext("unknown");
+
+ fprintf(stderr, "\t%s (%s)\n", name, str);
+ }
+}
+
+/*
+ * Same as get_interval_count(), but with additional checks to not misinterpret
+ * guids as interval/count values. Assumes VDEV_NAME_GUID is set in
+ * cb.cb_name_flags.
+ */
+static void
+get_interval_count_filter_guids(int *argc, char **argv, float *interval,
+ unsigned long *count, iostat_cbdata_t *cb)
+{
+ char **tmpargv = argv;
+ int argc_for_interval = 0;
+
+ /* Is the last arg an interval value? Or a guid? */
+ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) {
+ /*
+ * The last arg is not a guid, so it's probably an
+ * interval value.
+ */
+ argc_for_interval++;
+
+ if (*argc >= 2 &&
+ !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) {
+ /*
+ * The 2nd to last arg is not a guid, so it's probably
+ * an interval value.
+ */
+ argc_for_interval++;
+ }
+ }
+
+ /* Point to our list of possible intervals */
+ tmpargv = &argv[*argc - argc_for_interval];
+
+ *argc = *argc - argc_for_interval;
+ get_interval_count(&argc_for_interval, tmpargv,
+ interval, count);
+}
+
+/*
+ * Floating point sleep(). Allows you to pass in a floating point value for
+ * seconds.
+ */
+static void
+fsleep(float sec)
+{
+ struct timespec req;
+ req.tv_sec = floor(sec);
+ req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC;
+ (void) nanosleep(&req, NULL);
+}
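/*
 * Illustration (not part of this commit): fsleep(1.5) produces
 * req.tv_sec == 1 and req.tv_nsec == 0.5 * NANOSEC == 500000000, i.e. a
 * 1.5 second sleep, which lets "zpool iostat 1.5" take fractional
 * intervals.
 */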
+
+/*
+ * Set the minimum pool/vdev name column width. The width must be at least 10,
+ * but may be as large as the terminal width minus 42 so it still fits on one line.
+ */
+static int
+get_namewidth_iostat(zpool_handle_t *zhp, void *data)
+{
+ iostat_cbdata_t *cb = data;
+ int width, columns;
+
+ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags,
+ cb->cb_verbose);
+ columns = get_columns();
+
+ if (width < 10)
+ width = 10;
+ if (width > columns - 42)
+ width = columns - 42;
+
+ cb->cb_namewidth = width;
+
+ return (0);
+}
+
+/*
+ * zpool iostat [-ghHLpPvy] [[-lq]|[-r|-w]] [-n] [-T d|u]
+ * [[ pool ...]|[pool vdev ...]|[vdev ...]]
+ * [interval [count]]
*
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
* -P Display full path for vdev name.
* -v Display statistics for individual vdevs
+ * -h Display help
+ * -p Display values in parsable (exact) format.
+ * -H Scripted mode. Don't display headers, and separate properties
+ * by a single tab.
+ * -l Display average latency
+ * -q Display queue depths
+ * -w Display latency histograms
+ * -r Display request size histogram
* -T Display a timestamp in date(1) or Unix format
+ * -n Only print headers once
*
* This command can be tricky because we want to be able to deal with pool
* creation/destruction as well as vdev configuration changes. The bulk of this
@@ -3512,16 +4677,29 @@ zpool_do_iostat(int argc, char **argv)
int c;
int ret;
int npools;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
+ int winheight = 24;
+ struct winsize win;
zpool_list_t *list;
boolean_t verbose = B_FALSE;
+ boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE;
+ boolean_t queues = B_FALSE, parseable = B_FALSE, scripted = B_FALSE;
+ boolean_t omit_since_boot = B_FALSE;
boolean_t guid = B_FALSE;
boolean_t follow_links = B_FALSE;
boolean_t full_name = B_FALSE;
+ boolean_t headers_once = B_FALSE;
iostat_cbdata_t cb = { 0 };
+ /* Used for printing error message */
+ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q',
+ [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'};
+
+ uint64_t unsupported_flags;
+
/* check options */
- while ((c = getopt(argc, argv, "gLPT:v")) != -1) {
+ while ((c = getopt(argc, argv, "gLPT:vyhplqrwnH")) != -1) {
switch (c) {
case 'g':
guid = B_TRUE;
@@ -3538,6 +4716,33 @@ zpool_do_iostat(int argc, char **argv)
case 'v':
verbose = B_TRUE;
break;
+ case 'p':
+ parseable = B_TRUE;
+ break;
+ case 'l':
+ latency = B_TRUE;
+ break;
+ case 'q':
+ queues = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 'w':
+ l_histo = B_TRUE;
+ break;
+ case 'r':
+ rq_histo = B_TRUE;
+ break;
+ case 'y':
+ omit_since_boot = B_TRUE;
+ break;
+ case 'n':
+ headers_once = B_TRUE;
+ break;
+ case 'h':
+ usage(B_FALSE);
+ break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@@ -3548,7 +4753,70 @@ zpool_do_iostat(int argc, char **argv)
argc -= optind;
argv += optind;
- get_interval_count(&argc, argv, &interval, &count);
+ cb.cb_literal = parseable;
+ cb.cb_scripted = scripted;
+
+ if (guid)
+ cb.cb_name_flags |= VDEV_NAME_GUID;
+ if (follow_links)
+ cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ if (full_name)
+ cb.cb_name_flags |= VDEV_NAME_PATH;
+ cb.cb_iteration = 0;
+ cb.cb_namewidth = 0;
+ cb.cb_verbose = verbose;
+
+ /* Get our interval and count values (if any) */
+ if (guid) {
+ get_interval_count_filter_guids(&argc, argv, &interval,
+ &count, &cb);
+ } else {
+ get_interval_count(&argc, argv, &interval, &count);
+ }
+
+ if (argc == 0) {
+ /* No args, so just print the defaults. */
+ } else if (are_all_pools(argc, argv)) {
+ /* All the args are pool names */
+ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) {
+ /* All the args are vdevs */
+ cb.cb_vdev_names = argv;
+ cb.cb_vdev_names_count = argc;
+ argc = 0; /* No pools to process */
+ } else if (are_all_pools(1, argv)) {
+ /* The first arg is a pool name */
+ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) {
+ /* ...and the rest are vdev names */
+ cb.cb_vdev_names = argv + 1;
+ cb.cb_vdev_names_count = argc - 1;
+ argc = 1; /* One pool to process */
+ } else {
+ fprintf(stderr, gettext("Expected either a list of "));
+ fprintf(stderr, gettext("pools, or list of vdevs in"));
+ fprintf(stderr, " \"%s\", ", argv[0]);
+ fprintf(stderr, gettext("but got:\n"));
+ error_list_unresolved_vdevs(argc - 1, argv + 1,
+ argv[0], &cb);
+ fprintf(stderr, "\n");
+ usage(B_FALSE);
+ return (1);
+ }
+ } else {
+ /*
+ * The args don't make sense. The first arg isn't a pool name,
+ * nor are all the args vdevs.
+ */
+ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n"));
+ fprintf(stderr, "\n");
+ return (1);
+ }
+
+ if (cb.cb_vdev_names_count != 0) {
+ /*
+ * If user specified vdevs, it implies verbose.
+ */
+ cb.cb_verbose = B_TRUE;
+ }
/*
* Construct the list of all interesting pools.
@@ -3568,60 +4836,156 @@ zpool_do_iostat(int argc, char **argv)
return (1);
}
+ if ((l_histo || rq_histo) && (queues || latency)) {
+ pool_list_free(list);
+ (void) fprintf(stderr,
+ gettext("[-r|-w] isn't allowed with [-q|-l]\n"));
+ usage(B_FALSE);
+ return (1);
+ }
+
+ if (l_histo && rq_histo) {
+ pool_list_free(list);
+ (void) fprintf(stderr,
+ gettext("Only one of [-r|-w] can be passed at a time\n"));
+ usage(B_FALSE);
+ return (1);
+ }
+
/*
* Enter the main iostat loop.
*/
cb.cb_list = list;
- cb.cb_verbose = verbose;
- if (guid)
- cb.cb_name_flags |= VDEV_NAME_GUID;
- if (follow_links)
- cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
- if (full_name)
- cb.cb_name_flags |= VDEV_NAME_PATH;
- cb.cb_iteration = 0;
- cb.cb_namewidth = 0;
- for (;;) {
- pool_list_update(list);
+ if (l_histo) {
+ /*
+ * Histogram tables look out of place when you try to display
+ * them with the other stats, so make a rule that you can only
+ * print histograms by themselves.
+ */
+ cb.cb_flags = IOS_L_HISTO_M;
+ } else if (rq_histo) {
+ cb.cb_flags = IOS_RQ_HISTO_M;
+ } else {
+ cb.cb_flags = IOS_DEFAULT_M;
+ if (latency)
+ cb.cb_flags |= IOS_LATENCY_M;
+ if (queues)
+ cb.cb_flags |= IOS_QUEUES_M;
+ }
+ /*
+ * See if the module supports all the stats we want to display.
+ */
+ unsupported_flags = cb.cb_flags & ~get_stat_flags(list);
+ if (unsupported_flags) {
+ uint64_t f;
+ int idx;
+ fprintf(stderr,
+ gettext("The loaded zfs module doesn't support:"));
+
+ /* for each bit set in unsupported_flags */
+ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ fprintf(stderr, " -%c", flag_to_arg[idx]);
+ }
+
+ fprintf(stderr, ". Try running a newer module.\n");
+ pool_list_free(list);
+
+ return (1);
+ }
+
+ for (;;) {
if ((npools = pool_list_count(list)) == 0)
- break;
+ (void) fprintf(stderr, gettext("no pools available\n"));
+ else {
+ /*
+ * If this is the first iteration and -y was supplied
+ * we skip any printing.
+ */
+ boolean_t skip = (omit_since_boot &&
+ cb.cb_iteration == 0);
- /*
- * Refresh all statistics. This is done as an explicit step
- * before calculating the maximum name width, so that any
- * configuration changes are properly accounted for.
- */
- (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb);
+ /*
+ * Refresh all statistics. This is done as an
+ * explicit step before calculating the maximum name
+ * width, so that any configuration changes are
+ * properly accounted for.
+ */
+ (void) pool_list_iter(list, B_FALSE, refresh_iostat,
+ &cb);
- /*
- * Iterate over all pools to determine the maximum width
- * for the pool / device name column across all pools.
- */
- cb.cb_namewidth = 0;
- (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
+ /*
+ * Iterate over all pools to determine the maximum width
+ * for the pool / device name column across all pools.
+ */
+ cb.cb_namewidth = 0;
+ (void) pool_list_iter(list, B_FALSE,
+ get_namewidth_iostat, &cb);
- if (timestamp_fmt != NODATE)
- print_timestamp(timestamp_fmt);
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
- /*
- * If it's the first time, or verbose mode, print the header.
- */
- if (++cb.cb_iteration == 1 || verbose)
- print_iostat_header(&cb);
+ /*
+ * Check terminal size so we can print headers
+ * even when terminal window has its height
+ * changed.
+ */
+ if (headers_once == B_FALSE) {
+ if (ioctl(1, TIOCGWINSZ, &win) != -1) {
+ if (win.ws_row <= 0) {
+ headers_once = B_TRUE;
+ } else {
+ winheight = win.ws_row;
+ }
+ }
+ }
+ /*
+ * Are we connected to TTY? If not, headers_once
+ * should be true, to avoid breaking scripts.
+ */
+ if (isatty(fileno(stdout)) == 0)
+ headers_once = B_TRUE;
- (void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
+ /*
+ * If it's the first time and we're not skipping it, or
+ * if exactly one of skip and verbose is set, print the
+ * header.
+ *
+ * The histogram code explicitly prints its header on
+ * every vdev, so skip this for histograms.
+ */
+ if (((++cb.cb_iteration == 1 && !skip) ||
+ (skip != verbose) ||
+ (!headers_once &&
+ (cb.cb_iteration % winheight) == 0)) &&
+ (!(cb.cb_flags & IOS_ANYHISTO_M)) &&
+ !cb.cb_scripted)
+ print_iostat_header(&cb);
+
+ if (skip) {
+ (void) fsleep(interval);
+ continue;
+ }
- /*
- * If there's more than one pool, and we're not in verbose mode
- * (which prints a separator for us), then print a separator.
- */
- if (npools > 1 && !verbose)
- print_iostat_separator(&cb);
+ (void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
- if (verbose)
- (void) printf("\n");
+ /*
+ * If there's more than one pool, and we're not in
+ * verbose mode (which prints a separator for us),
+ * then print a separator.
+ *
+ * In addition, if we're printing specific vdevs then
+ * we also want an ending separator.
+ */
+ if (((npools > 1 && !verbose &&
+ !(cb.cb_flags & IOS_ANYHISTO_M)) ||
+ (!(cb.cb_flags & IOS_ANYHISTO_M) &&
+ cb.cb_vdev_names_count)) &&
+ !cb.cb_scripted) {
+ print_iostat_separator(&cb);
+ }
+ }
/*
* Flush the output so that redirection to a file isn't buffered
@@ -3635,7 +4999,7 @@ zpool_do_iostat(int argc, char **argv)
if (count != 0 && --count == 0)
break;
- (void) sleep(interval);
+ (void) fsleep(interval);
}
pool_list_free(list);
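
The iostat loop above (and, further below, zpool list and zpool status) now
sleeps via fsleep() instead of sleep(), which is what allows interval to be a
fractional number of seconds. The fsleep() body itself is not part of these
hunks; the following is a minimal sketch of such a helper, assuming a
nanosleep()-based implementation along the lines of the one OpenZFS carries in
zpool_util.c:

#include <math.h>
#include <time.h>

/* Sleep for a possibly fractional number of seconds, e.g. fsleep(0.5). */
static void
fsleep(float sec)
{
	struct timespec req;

	req.tv_sec = (time_t)floorf(sec);
	req.tv_nsec = (long)((sec - (float)req.tv_sec) * 1000000000L);
	(void) nanosleep(&req, NULL);
}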
@@ -3992,6 +5356,27 @@ list_callback(zpool_handle_t *zhp, void *data)
}
/*
+ * Set the minimum pool/vdev name column width. The width must be at least 9,
+ * but may be as large as needed.
+ */
+static int
+get_namewidth_list(zpool_handle_t *zhp, void *data)
+{
+ list_cbdata_t *cb = data;
+ int width;
+
+ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags,
+ cb->cb_verbose);
+
+ if (width < 9)
+ width = 9;
+
+ cb->cb_namewidth = width;
+
+ return (0);
+}
+
+/*
* zpool list [-gHLP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
*
* -g Display guid for individual vdev name.
@@ -4018,7 +5403,8 @@ zpool_do_list(int argc, char **argv)
"name,size,allocated,free,checkpoint,expandsize,fragmentation,"
"capacity,dedupratio,health,altroot";
char *props = default_props;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
zpool_list_t *list;
boolean_t first = B_TRUE;
@@ -4079,7 +5465,7 @@ zpool_do_list(int argc, char **argv)
break;
cb.cb_namewidth = 0;
- (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
+ (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb);
if (timestamp_fmt != NODATE)
print_timestamp(timestamp_fmt);
@@ -4097,7 +5483,7 @@ zpool_do_list(int argc, char **argv)
break;
pool_list_free(list);
- (void) sleep(interval);
+ (void) fsleep(interval);
}
if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
@@ -5942,6 +7328,9 @@ status_callback(zpool_handle_t *zhp, void *data)
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
"CKSUM");
+ if (cbp->cb_print_slow_ios)
+ (void) printf(" %5s", gettext("SLOW"));
+
print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
B_FALSE);
@@ -6003,12 +7392,14 @@ status_callback(zpool_handle_t *zhp, void *data)
}
/*
- * zpool status [-igLPtvx] [-T d|u] [pool] ... [interval [count]]
+ * zpool status [-igLpPstvx] [-T d|u] [pool] ... [interval [count]]
*
* -i Display vdev initialization status.
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
+ * -p Display values in parsable (exact) format.
* -P Display full path for vdev name.
+ * -s Display slow IOs column.
* -v Display complete error logs
* -x Display only pools with potential problems
* -D Display dedup status (undocumented)
@@ -6022,11 +7413,12 @@ zpool_do_status(int argc, char **argv)
{
int c;
int ret;
- unsigned long interval = 0, count = 0;
+ float interval = 0;
+ unsigned long count = 0;
status_cbdata_t cb = { 0 };
/* check options */
- while ((c = getopt(argc, argv, "igLPvxDtT:")) != -1) {
+ while ((c = getopt(argc, argv, "igLpPsvxDtT:")) != -1) {
switch (c) {
case 'i':
cb.cb_print_vdev_init = B_TRUE;
@@ -6037,9 +7429,15 @@ zpool_do_status(int argc, char **argv)
case 'L':
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
break;
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
case 'P':
cb.cb_name_flags |= VDEV_NAME_PATH;
break;
+ case 's':
+ cb.cb_print_slow_ios = B_TRUE;
+ break;
case 'v':
cb.cb_verbose = B_TRUE;
break;
@@ -6094,7 +7492,7 @@ zpool_do_status(int argc, char **argv)
if (count != 0 && --count == 0)
break;
- (void) sleep(interval);
+ (void) fsleep(interval);
}
return (0);
diff --git a/usr/src/cmd/zpool/zpool_util.c b/usr/src/cmd/zpool/zpool_util.c
index c7a002efb1..e4281af210 100644
--- a/usr/src/cmd/zpool/zpool_util.c
+++ b/usr/src/cmd/zpool/zpool_util.c
@@ -29,6 +29,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
+#include <ctype.h>
+#include <sys/sysmacros.h>
#include "zpool_util.h"
@@ -84,3 +86,29 @@ num_logs(nvlist_t *nv)
}
return (nlogs);
}
+
+/* Find the max element in an array of uint64_t values */
+uint64_t
+array64_max(uint64_t array[], unsigned int len)
+{
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array[i]);
+
+ return (max);
+}
+
+/*
+ * Return 1 if "str" is a number string, 0 otherwise. Works for integer and
+ * floating point numbers. Note that the empty string is also accepted.
+ */
+int
+isnumber(char *str)
+{
+ for (; *str; str++)
+ if (!(isdigit(*str) || (*str == '.')))
+ return (0);
+
+ return (1);
+}
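
A hypothetical standalone harness (not part of this commit) exercising the two
helpers above. Note the edge cases of isnumber() as written: because it only
checks that every character is a digit or a dot, it returns 1 for strings
containing more than one dot, such as "1.2.3".

#include <stdio.h>
#include <inttypes.h>

/* assumes array64_max() and isnumber() from zpool_util.c are linked in */
int
main(void)
{
	uint64_t v[] = { 3, 42, 7 };

	(void) printf("max = %" PRIu64 "\n", array64_max(v, 3));	/* 42 */

	/* prints "1 1 0 1 1" -- "" and "1.2.3" both pass */
	(void) printf("%d %d %d %d %d\n", isnumber("10"), isnumber("0.5"),
	    isnumber("abc"), isnumber(""), isnumber("1.2.3"));
	return (0);
}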
diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h
index 3aeb9b5431..e4c93acf39 100644
--- a/usr/src/cmd/zpool/zpool_util.h
+++ b/usr/src/cmd/zpool/zpool_util.h
@@ -38,6 +38,10 @@ extern "C" {
void *safe_malloc(size_t);
void zpool_no_memory(void);
uint_t num_logs(nvlist_t *nv);
+uint64_t array64_max(uint64_t array[], unsigned int len);
+int highbit64(uint64_t i);
+int lowbit64(uint64_t i);
+int isnumber(char *str);
/*
* Virtual device functions
@@ -55,6 +59,10 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
zpool_iter_f, void *);
+/* Vdev list functions */
+typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
+int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
+
typedef struct zpool_list zpool_list_t;
zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
diff --git a/usr/src/lib/libzfs/common/libzfs_config.c b/usr/src/lib/libzfs/common/libzfs_config.c
index b33d86432d..e6c7ae025d 100644
--- a/usr/src/lib/libzfs/common/libzfs_config.c
+++ b/usr/src/lib/libzfs/common/libzfs_config.c
@@ -312,21 +312,9 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
zhp->zpool_config_size = zc.zc_nvlist_dst_size;
if (zhp->zpool_config != NULL) {
- uint64_t oldtxg, newtxg;
-
- verify(nvlist_lookup_uint64(zhp->zpool_config,
- ZPOOL_CONFIG_POOL_TXG, &oldtxg) == 0);
- verify(nvlist_lookup_uint64(config,
- ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0);
-
nvlist_free(zhp->zpool_old_config);
- if (oldtxg != newtxg) {
- nvlist_free(zhp->zpool_config);
- zhp->zpool_old_config = NULL;
- } else {
- zhp->zpool_old_config = zhp->zpool_config;
- }
+ zhp->zpool_old_config = zhp->zpool_config;
}
zhp->zpool_config = config;
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index f1784dae9c..2986fc1b8c 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -3945,10 +3945,18 @@ char *
zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
int name_flags)
{
- char *path, *env;
+ char *path, *type, *env;
uint64_t value;
char buf[64];
+ /*
+ * vdev_name will be "root"/"root-0" for the root vdev, but it is the
+ * zpool name that will be displayed to the user.
+ */
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+ if (zhp != NULL && strcmp(type, "root") == 0)
+ return (zfs_strdup(hdl, zpool_get_name(zhp)));
+
env = getenv("ZPOOL_VDEV_NAME_PATH");
if (env && (strtoul(env, NULL, 0) > 0 ||
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
@@ -4066,7 +4074,7 @@ after_open:
return (tmp);
}
} else {
- verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
+ path = type;
/*
* If it's a raidz device, we need to stick in the parity level.
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c
index 0d43302861..d26955d83d 100644
--- a/usr/src/lib/libzpool/common/kernel.c
+++ b/usr/src/lib/libzpool/common/kernel.c
@@ -422,6 +422,83 @@ kobj_get_filesize(struct _buf *file, uint64_t *size)
/*
* =========================================================================
+ * misc routines
+ * =========================================================================
+ */
+
+/*
+ * Find lowest one bit set.
+ * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
+ * This is basically a reimplementation of ffsll(), which is GNU specific.
+ */
+int
+lowbit64(uint64_t i)
+{
+ register int h = 64;
+ if (i == 0)
+ return (0);
+
+ if (i & 0x00000000ffffffffULL)
+ h -= 32;
+ else
+ i >>= 32;
+
+ if (i & 0x0000ffff)
+ h -= 16;
+ else
+ i >>= 16;
+
+ if (i & 0x00ff)
+ h -= 8;
+ else
+ i >>= 8;
+
+ if (i & 0x0f)
+ h -= 4;
+ else
+ i >>= 4;
+
+ if (i & 0x3)
+ h -= 2;
+ else
+ i >>= 2;
+
+ if (i & 0x1)
+ h -= 1;
+
+ return (h);
+}
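
lowbit64() is what the unsupported-flags loop in zpool_do_iostat() above leans
on: clearing the lowest set bit after each pass visits every set bit exactly
once. A standalone sketch of the same idiom (the flag value here is
illustrative, not one of the real IOS_* masks):

#include <stdio.h>
#include <stdint.h>

/* assumes lowbit64() as defined above */
int
main(void)
{
	uint64_t flags = 0x29;	/* bits 0, 3 and 5 set */
	uint64_t f;
	int idx = 0;

	for (f = flags; f != 0; f &= ~(1ULL << idx)) {
		idx = lowbit64(f) - 1;	/* 0-based bit number */
		(void) printf("bit %d is set\n", idx);	/* 0, then 3, then 5 */
	}
	return (0);
}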
+
+int
+highbit64(uint64_t i)
+{
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+ if (i & 0xffffffff00000000ULL) {
+ h += 32; i >>= 32;
+ }
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+}
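
highbit64() is the natural fit for the power-of-two request-size buckets that
iostat -r reports: the bucket for a given size is simply the position of its
highest set bit. A sketch of that bucketing (the exact bucket layout used by
the real histograms lives in the vdev stats code, so treat the math below as
illustrative):

#include <stdio.h>
#include <stdint.h>

/* assumes highbit64() as defined above */
int
main(void)
{
	uint64_t sizes[] = { 512, 4096, 131072 };
	int i;

	for (i = 0; i < 3; i++) {
		int bucket = highbit64(sizes[i]) - 1; /* log2 for powers of 2 */
		(void) printf("%llu bytes -> bucket %d\n",
		    (unsigned long long)sizes[i], bucket);	/* 9, 12, 17 */
	}
	return (0);
}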
+
+/*
+ * =========================================================================
* kernel emulation setup & teardown
* =========================================================================
*/
diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c
index 8525b5f299..65ffa91ebb 100644
--- a/usr/src/lib/libzpool/common/util.c
+++ b/usr/src/lib/libzpool/common/util.c
@@ -23,6 +23,7 @@
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright 2020 Joyent, Inc.
*/
#include <assert.h>
@@ -48,7 +49,7 @@ static void
show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
{
vdev_stat_t *vs;
- vdev_stat_t v0 = { 0 };
+ vdev_stat_t *v0 = NULL;
uint64_t sec;
uint64_t is_log = 0;
nvlist_t **child;
@@ -56,6 +57,8 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
char used[6], avail[6];
char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
+ v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL);
+
if (indent == 0 && desc != NULL) {
(void) printf(" "
" capacity operations bandwidth ---- errors ----\n");
@@ -72,7 +75,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
&bias);
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) != 0)
- vs = &v0;
+ vs = v0;
if (bias != NULL) {
(void) snprintf(bias_suffix, sizeof (bias_suffix),
@@ -105,6 +108,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
rops, wops, rbytes, wbytes, rerr, werr, cerr);
}
+ umem_free(v0, sizeof (*v0));
if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0)
return;
diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m
index e8f10ea5b2..fdbf7e741b 100644
--- a/usr/src/man/man1m/zpool.1m
+++ b/usr/src/man/man1m/zpool.1m
@@ -23,8 +23,8 @@
.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Datto Inc.
-.\" Copyright (c) 2017 George Melikov. All Rights Reserved.
-.\" Copyright 2019 Joyent, Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2020 Joyent, Inc.
.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
.\"
.Dd August 30, 2019
@@ -116,10 +116,10 @@
.Op Ar device Ns ...
.Nm
.Cm iostat
-.Op Fl v
+.Op Oo Fl lq Oc | Ns Fl rw
.Op Fl T Sy u Ns | Ns Sy d
-.Op Fl gLP
-.Oo Ar pool Oc Ns ...
+.Op Fl ghHLnpPvy
+.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc
.Op Ar interval Op Ar count
.Nm
.Cm labelclear
@@ -184,7 +184,7 @@
.Ar pool newpool
.Nm
.Cm status
-.Op Fl DigLPtvx
+.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -1606,25 +1606,48 @@ with no flags on the relevant target devices.
.It Xo
.Nm
.Cm iostat
+.Op Oo Fl lq Oc | Ns Fl rw
.Op Fl T Sy u Ns | Ns Sy d
-.Op Fl gLPv
-.Oo Ar pool Oc Ns ...
+.Op Fl ghHLnpPvy
+.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc
.Op Ar interval Op Ar count
.Xc
-Displays I/O statistics for the given pools.
+Displays I/O statistics for the given pools/vdevs.
+Physical I/Os may be observed via
+.Xr iostat 1M .
+If writes are located nearby, they may be merged into a single larger operation.
+Additional I/O may be generated depending on the level of vdev redundancy.
+To filter output, you may pass in a list of pools, a pool and list of vdevs
+in that pool, or a list of any vdevs from any pool.
+If no items are specified, statistics for every pool in the system are shown.
When given an
.Ar interval ,
the statistics are printed every
.Ar interval
seconds until ^C is pressed.
-If no
-.Ar pool Ns s
-are specified, statistics for every pool in the system is shown.
+If the
+.Fl n
+flag is specified, the headers are displayed only once; otherwise they are
+displayed periodically.
If
.Ar count
is specified, the command exits after
.Ar count
reports are printed.
+The first report printed is always the statistics since boot regardless of
+whether
+.Ar interval
+and
+.Ar count
+are passed.
+Also note that the units of
+.Sy K ,
+.Sy M ,
+.Sy G ...
+that are printed in the report are in base 1024.
+To get the raw values, use the
+.Fl p
+flag.
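
For scripts that do their own scaling, the suffixes above amount to repeated
division by 1024. A minimal userland sketch of that conversion (illustrative
only; the zpool binary itself formats these values with the zfs_nicenum()
routines from libzfs):

#include <stdio.h>
#include <stdint.h>

/* Scale a raw count into the base-1024 units zpool iostat prints. */
static void
nice1024(uint64_t n, char *buf, size_t len)
{
	const char *units = " KMGTPE";
	int i = 0;

	while (n >= 10240 && units[i + 1] != '\0') {
		n /= 1024;
		i++;
	}
	(void) snprintf(buf, len, "%llu%c", (unsigned long long)n, units[i]);
}

int
main(void)
{
	char buf[16];

	nice1024(1536ULL * 1024 * 1024, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints "1536M" */
	return (0);
}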
.Bl -tag -width Ds
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
@@ -1644,20 +1667,99 @@ Display vdev initialization status.
Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool
detach/offline/remove/replace commands.
+.It Fl H
+Scripted mode.
+Do not display headers, and separate fields by a single tab instead of
+arbitrary space.
.It Fl L
Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
-.Pa /dev/disk/
+.Pa /dev/dsk/
path used to open it.
+.It Fl n
+Print headers only once when passed.
+.It Fl p
+Display numbers in parsable (exact) values.
+Time values are in nanoseconds.
.It Fl P
Display full paths for vdevs instead of only the last component of
the path.
This can be used in conjunction with the
.Fl L
flag.
+.It Fl r
+Print request size histograms for the leaf vdev's IO.
+This includes histograms of individual IOs (ind) and aggregate IOs (agg).
+These stats can be useful for observing how well IO aggregation is working.
+Note that TRIM IOs may exceed 16M, but will be counted as 16M.
.It Fl v
Verbose statistics.
Reports usage statistics for individual vdevs within the pool, in addition to
the pool-wide statistics.
+.It Fl y
+Omit statistics since boot.
+Normally the first line of output reports the statistics since boot.
+This option suppresses that first line of output and instead waits one
+.Ar interval
+before printing the first report.
+.It Fl w
+Display latency histograms:
+.Pp
+.Ar total_wait :
+Total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Amount of time IO spent in synchronous priority queues.
+Does not include disk time.
+.Ar asyncq_wait :
+Amount of time IO spent in asynchronous priority queues.
+Does not include disk time.
+.Ar scrub :
+Amount of time IO spent in scrub queue.
+Does not include disk time.
+.It Fl l
+Include average latency statistics:
+.Pp
+.Ar total_wait :
+Average total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Average disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Average amount of time IO spent in synchronous priority queues.
+Does not include disk time.
+.Ar asyncq_wait :
+Average amount of time IO spent in asynchronous priority queues.
+Does not include disk time.
+.Ar scrub :
+Average queuing time in scrub queue.
+Does not include disk time.
+.Ar trim :
+Average queuing time in trim queue.
+Does not include disk time.
+.It Fl q
+Include active queue statistics.
+Each priority queue has both pending
+.Pq Ar pend
+and active
+.Pq Ar activ
+IOs.
+Pending IOs are waiting to be issued to the disk, and active IOs have been
+issued to disk and are waiting for completion.
+These stats are broken out by priority queue:
+.Pp
+.Ar syncq_read/write :
+Current number of entries in synchronous priority
+queues.
+.Ar asyncq_read/write :
+Current number of entries in asynchronous priority queues.
+.Ar scrubq_read :
+Current number of entries in scrub queue.
+.Ar trimq_write :
+Current number of entries in trim queue.
+.Pp
+All queue statistics are instantaneous measurements of the number of
+entries in the queues.
+If you specify an interval, the measurements will be sampled from the end of
+the interval.
.El
.It Xo
.Nm
@@ -1731,12 +1833,12 @@ flag.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
-.Fl u
+.Sy u
for a printed representation of the internal representation of time.
See
.Xr time 2 .
Specify
-.Fl d
+.Sy d
for standard date format.
See
.Xr date 1 .
@@ -2021,7 +2123,7 @@ and automatically import it.
.It Xo
.Nm
.Cm status
-.Op Fl DigLPtvx
+.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -2054,23 +2156,33 @@ Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
+.It Fl p
+Display numbers in parsable (exact) values.
.It Fl P
Display full paths for vdevs instead of only the last component of
the path.
This can be used in conjunction with the
.Fl L
flag.
+.It Fl s
+Display the number of leaf vdev slow IOs.
+This is the number of IOs that didn't complete in
+.Sy zio_slow_io_ms
+milliseconds (default 30 seconds).
+This does not necessarily mean the IOs failed to complete, just that they
+took an unreasonably long time.
+This may indicate a problem with the underlying storage.
.It Fl t
Display vdev TRIM status.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
-.Fl u
+.Sy u
for a printed representation of the internal representation of time.
See
.Xr time 2 .
Specify
-.Fl d
+.Sy d
for standard date format.
See
.Xr date 1 .
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index d4432b45a1..693f83af3e 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -112,6 +112,7 @@ dir path=opt/zfs-tests/tests/functional/ctime
dir path=opt/zfs-tests/tests/functional/delegate
dir path=opt/zfs-tests/tests/functional/devices
dir path=opt/zfs-tests/tests/functional/exec
+dir path=opt/zfs-tests/tests/functional/fault
dir path=opt/zfs-tests/tests/functional/features
dir path=opt/zfs-tests/tests/functional/features/async_destroy
dir path=opt/zfs-tests/tests/functional/grow_pool
@@ -209,6 +210,7 @@ file path=opt/zfs-tests/include/default.cfg mode=0444
file path=opt/zfs-tests/include/libtest.shlib mode=0444
file path=opt/zfs-tests/include/math.shlib mode=0444
file path=opt/zfs-tests/include/properties.shlib mode=0444
+file path=opt/zfs-tests/include/zpool_script.shlib mode=0444
file path=opt/zfs-tests/runfiles/delphix.run mode=0444
file path=opt/zfs-tests/runfiles/longevity.run mode=0444
file path=opt/zfs-tests/runfiles/omnios.run mode=0444
@@ -2375,6 +2377,9 @@ file \
file \
path=opt/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg \
mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos \
+ mode=0555
file path=opt/zfs-tests/tests/functional/cli_user/zpool_list/cleanup mode=0555
file path=opt/zfs-tests/tests/functional/cli_user/zpool_list/setup mode=0555
file \
@@ -2443,6 +2448,10 @@ file path=opt/zfs-tests/tests/functional/exec/exec_001_pos mode=0555
file path=opt/zfs-tests/tests/functional/exec/exec_002_neg mode=0555
file path=opt/zfs-tests/tests/functional/exec/mmap_exec mode=0555
file path=opt/zfs-tests/tests/functional/exec/setup mode=0555
+file path=opt/zfs-tests/tests/functional/fault/cleanup mode=0555
+file path=opt/zfs-tests/tests/functional/fault/fault.cfg mode=0444
+file path=opt/zfs-tests/tests/functional/fault/setup mode=0555
+file path=opt/zfs-tests/tests/functional/fault/zpool_status_-s mode=0555
file \
path=opt/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos \
mode=0555
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index 725c971a4c..2edf9123ab 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -26,7 +26,7 @@
# Copyright (c) 2017 by Tim Chase. All rights reserved.
# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2017 Datto Inc.
-# Copyright 2019 Joyent, Inc.
+# Copyright 2020 Joyent, Inc.
#
. ${STF_TOOLS}/contrib/include/logapi.shlib
@@ -1568,7 +1568,7 @@ function get_disklist # pool
disklist=$(zpool iostat -v $1 | nawk '(NR >4) {print $1}' | \
grep -v "\-\-\-\-\-" | \
- egrep -v -e "^(mirror|raidz1|raidz2|spare|log|cache)$")
+ egrep -v -e "^(mirror|raidz[1-3]|spare|log|cache|special|dedup)$")
echo $disklist
}
diff --git a/usr/src/test/zfs-tests/include/zpool_script.shlib b/usr/src/test/zfs-tests/include/zpool_script.shlib
new file mode 100644
index 0000000000..25daa3f06a
--- /dev/null
+++ b/usr/src/test/zfs-tests/include/zpool_script.shlib
@@ -0,0 +1,49 @@
+#
+# Common functions used by the zpool_status and zpool_iostat tests for running
+# scripts with the -c option.
+#
+# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+function test_zpool_script {
+ script="$1"
+ testpool="$2"
+ cmd="$3"
+ wholecmd="$cmd $script $testpool"
+ out="$($wholecmd)"
+
+ # Default number of columns that get printed without -c
+ if echo "$cmd" | grep -q iostat ; then
+ # iostat
+ dcols=7
+ else
+ # status
+ dcols=5
+ fi
+
+ # Get the new column name that the script created
+ col="$(echo "$out" | \
+ awk '/^pool +alloc +free +read +write +/ {print $8} \
+ /NAME +STATE +READ +WRITE +CKSUM/ {print $6}')"
+
+ if [ -z "$col" ] ; then
+ log_fail "'$wholecmd' created no new columns"
+ fi
+
+ # Count the number of columns for each vdev. Each script should produce
+ # at least one new column value. Even if scripts return blank, zpool
+ # will convert the blank to a '-' to make things awk-able. Normal
+ # zpool iostat -v output is 7 columns, so if the script ran correctly
+ # we should see more than that.
+ if ! newcols=$(echo "$out" | \
+ awk '/\/dev/{print NF-'$dcols'; if (NF <= '$dcols') {exit 1}}' | \
+ head -n 1) ; \
+ then
+ log_fail "'$wholecmd' didn't create a new column value"
+ else
+ log_note "'$wholecmd' passed ($newcols new columns)"
+ fi
+}
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index beb23d9d82..ef1c80efcc 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -430,7 +430,7 @@ user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_iostat]
tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
- 'zpool_iostat_003_neg']
+ 'zpool_iostat_003_neg', 'zpool_iostat_004_pos']
user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_list]
@@ -459,6 +459,9 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos']
[/opt/zfs-tests/tests/functional/exec]
tests = ['exec_001_pos', 'exec_002_neg']
+[/opt/zfs-tests/tests/functional/fault]
+tests = ['zpool_status_-s']
+
[/opt/zfs-tests/tests/functional/features/async_destroy]
tests = ['async_destroy_001_pos']
diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run
index af9f29e8ca..010a16af3a 100644
--- a/usr/src/test/zfs-tests/runfiles/omnios.run
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run
@@ -429,7 +429,7 @@ user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_iostat]
tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
- 'zpool_iostat_003_neg']
+ 'zpool_iostat_003_neg', 'zpool_iostat_004_pos']
user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_list]
@@ -458,6 +458,9 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos']
[/opt/zfs-tests/tests/functional/exec]
tests = ['exec_001_pos', 'exec_002_neg']
+[/opt/zfs-tests/tests/functional/fault]
+tests = ['zpool_status_-s']
+
[/opt/zfs-tests/tests/functional/features/async_destroy]
tests = ['async_destroy_001_pos']
diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run
index 78923b6a1f..71bad54e0f 100644
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run
@@ -429,7 +429,7 @@ user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_iostat]
tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
- 'zpool_iostat_003_neg']
+ 'zpool_iostat_003_neg', 'zpool_iostat_004_pos']
user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_list]
@@ -461,6 +461,9 @@ tests = ['exec_001_pos', 'exec_002_neg']
[/opt/zfs-tests/tests/functional/features/async_destroy]
tests = ['async_destroy_001_pos']
+[/opt/zfs-tests/tests/functional/fault]
+tests = ['zpool_status_-s']
+
[/opt/zfs-tests/tests/functional/grow_pool]
tests = ['grow_pool_001_pos']
diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run
index 52fbf045dc..7b1c27b214 100644
--- a/usr/src/test/zfs-tests/runfiles/smartos.run
+++ b/usr/src/test/zfs-tests/runfiles/smartos.run
@@ -377,7 +377,7 @@ user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_iostat]
tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
- 'zpool_iostat_003_neg']
+ 'zpool_iostat_003_neg', 'zpool_iostat_004_pos']
user =
[/opt/zfs-tests/tests/functional/cli_user/zpool_list]
@@ -397,6 +397,9 @@ tests = ['devices_003_pos']
[/opt/zfs-tests/tests/functional/exec]
tests = ['exec_001_pos', 'exec_002_neg']
+[/opt/zfs-tests/tests/functional/fault]
+tests = ['zpool_status_-s']
+
[/opt/zfs-tests/tests/functional/features/async_destroy]
tests = ['async_destroy_001_pos']
diff --git a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
index 0c631e0eea..78d40ce56d 100755
--- a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
@@ -14,7 +14,6 @@
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018 by Delphix. All rights reserved.
-# Copyright 2019 Joyent, Inc.
#
. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
@@ -39,17 +38,17 @@ do
if [ "$type" = "mirror" ]; then
log_must zpool add $TESTPOOL special mirror \
$CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
- log_must zpool clear $TESTPOOL $CLASS_DISK0
- log_must zpool clear $TESTPOOL $CLASS_DISK1
- log_must zpool clear $TESTPOOL $CLASS_DISK2
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
elif [ "$type" = "raidz" ]; then
log_must zpool add $TESTPOOL special mirror \
$CLASS_DISK0 $CLASS_DISK1
- log_must zpool clear $TESTPOOL $CLASS_DISK0
- log_must zpool clear $TESTPOOL $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
else
log_must zpool add $TESTPOOL special $CLASS_DISK0
- log_must zpool clear $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
fi
log_must zpool destroy -f $TESTPOOL
diff --git a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
index b871034394..106a6d933a 100755
--- a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
@@ -14,7 +14,6 @@
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018 by Delphix. All rights reserved.
-# Copyright 2019 Joyent, Inc.
#
. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
@@ -36,8 +35,7 @@ log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \
special mirror $CLASS_DISK0 $CLASS_DISK1
log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
log_must sleep 10
-log_must zpool status $TESTPOOL | nawk -v dev=$CLASS_DISK2 \
- 'BEGIN {res=1} {if ($1 == dev) res=0} END {exit res}'
+log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
log_must zpool destroy -f $TESTPOOL
log_pass $claim
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
index c32f72b504..ff06248588 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
@@ -343,6 +343,14 @@ function get_last_txg_synced
{
typeset pool=$1
+ if is_linux; then
+ txg=$(tail "/proc/spl/kstat/zfs/$pool/txgs" |
+ awk '$3=="C" {print $1}' | tail -1)
+ [[ "$txg" ]] || txg=0
+ echo $txg
+ return 0
+ fi
+
typeset spas
spas=$(mdb -k -e "::spa")
[[ $? -ne 0 ]] && return 1
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
index 9c2fb74ed4..b605ceb8ee 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
@@ -33,4 +33,4 @@
DISK=${DISKS%% *}
-default_setup $DISK
+default_raidz_setup $DISKS
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
index 903ea1c5f7..c67031780b 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
@@ -33,13 +33,13 @@
#
# DESCRIPTION:
-# Verify that 'zpool iostat [interval [count]' can be executed as non-root.
+# Verify that 'zpool iostat [interval [count]]' can be executed as non-root.
#
# STRATEGY:
# 1. Set the interval to 1 and count to 4.
# 2. Sleep for 4 seconds.
# 3. Verify that the output has 4 records.
-#
+# 4. Set interval to 0.5 and count to 1 to test floating point intervals.
+#
verify_runnable "both"
@@ -68,4 +68,7 @@ if [[ $stat_count -ne 4 ]]; then
log_fail "zpool iostat [pool_name] [interval] [count] failed"
fi
+# Test a floating point interval value
+log_must zpool iostat -v 0.5 1
+
log_pass "zpool iostat [pool_name ...] [interval] [count] passed"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
index 0b4a87f66e..c42ddf000f 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
@@ -51,13 +51,15 @@ else
fi
set -A args "" "-?" "-f" "nonexistpool" "$TESTPOOL/$TESTFS" \
- "$testpool 1.23" "$testpool 0" "$testpool -1" "$testpool 1 0" \
- "$testpool 0 0"
+ "$testpool 0" "$testpool -1" "$testpool 1 0" \
+ "$testpool 0 0" "$testpool -wl" "$testpool -wq" "$testpool -wr" \
+ "$testpool -rq" "$testpool -lr"
log_assert "Executing 'zpool iostat' with bad options fails"
typeset -i i=1
while [[ $i -lt ${#args[*]} ]]; do
+ log_assert "doing $ZPOOL iostat ${args[i]}"
log_mustnot zpool iostat ${args[i]}
((i = i + 1))
done
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
new file mode 100755
index 0000000000..25d1c845b6
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
@@ -0,0 +1,76 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+# Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Executing 'zpool iostat' command with various combinations of extended
+# stats (-lqwr), parsable/script options (-pH), and misc lists of pools
+# and vdevs.
+#
+# STRATEGY:
+# 1. Create an array of mixed 'zpool iostat' options.
+# 2. Execute each element of the array.
+# 3. Verify each command succeeds.
+#
+
+verify_runnable "both"
+
+typeset testpool
+if is_global_zone ; then
+ testpool=$TESTPOOL
+else
+ testpool=${TESTPOOL%%/*}
+fi
+
+set -A args "" "-v" "-q" "-l" "-lq $TESTPOOL" "-ql ${DISKS[0]} ${DISKS[1]}" \
+ "-w $TESTPOOL ${DISKS[0]} ${DISKS[1]}" \
+ "-wp $TESTPOOL" \
+ "-qlH $TESTPOOL ${DISKS[0]}" \
+ "-vpH ${DISKS[0]}" \
+ "-wpH ${DISKS[0]}" \
+ "-r ${DISKS[0]}" \
+ "-rpH ${DISKS[0]}"
+
+log_assert "Executing 'zpool iostat' with extended stat options succeeds"
+log_note "testpool: $TESTPOOL, disks $DISKS"
+
+typeset -i i=1
+while [[ $i -lt ${#args[*]} ]]; do
+ log_note "doing $ZPOOL iostat ${args[i]}"
+ log_must zpool iostat ${args[i]}
+ ((i = i + 1))
+done
+
+log_pass "Executing 'zpool iostat' with extended stat options succeeds"
diff --git a/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib b/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
index 24f7c7e018..a68e22038f 100644
--- a/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
@@ -26,6 +26,7 @@
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright 2020 Joyent, Inc.
#
. $STF_SUITE/tests/functional/devices/devices.cfg
@@ -42,6 +43,10 @@ function create_dev_file
typeset filetype=$1
typeset filename=$2
+ # On illumos we need access to the root zpool to get the major/minor
+ # numbers here.
+ export __ZFS_POOL_EXCLUDE=""
+
case $filetype in
b)
devtype=$(df -n / | awk '{print $3}')
diff --git a/usr/src/test/zfs-tests/tests/functional/fault/Makefile b/usr/src/test/zfs-tests/tests/functional/fault/Makefile
new file mode 100644
index 0000000000..00d2de10a3
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/fault/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+include $(SRC)/Makefile.master
+
+ROOTOPTPKG = $(ROOT)/opt/zfs-tests
+TARGETDIR = $(ROOTOPTPKG)/tests/functional/fault
+
+include $(SRC)/test/zfs-tests/Makefile.com
diff --git a/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh
new file mode 100755
index 0000000000..45b94723a5
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh
@@ -0,0 +1,36 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+verify_runnable "global"
+
+cleanup_devices $DISKS
+
+zed_stop
+zed_cleanup resilver_finish-start-scrub.sh
+
+log_pass
diff --git a/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg b/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg
new file mode 100644
index 0000000000..25601a71a3
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg
@@ -0,0 +1,57 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}')
+export DISKSARRAY=$DISKS
+export FSIZE=10M
+export MAXTIMEOUT=30
+
+export SDSIZE=256
+export SDHOSTS=1
+export SDTGTS=1
+export SDLUNS=1
+
+export DISK1=$(echo $DISKS | nawk '{print $1}')
+export DISK2=$(echo $DISKS | nawk '{print $2}')
+export DISK3=$(echo $DISKS | nawk '{print $3}')
+
+if is_linux; then
+ set_slice_prefix
+ set_device_dir
+ devs_id[0]=$(get_persistent_disk_name $DISK1)
+ devs_id[1]=$(get_persistent_disk_name $DISK2)
+ devs_id[2]=$(get_persistent_disk_name $DISK3)
+ export devs_id
+else
+ DEV_DSKDIR="/dev"
+fi
+
+export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \
+ $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4"
+export SPARE_FILE="$TEST_BASE_DIR/spare-1"
+export FAULT_FILE="$TEST_BASE_DIR/file-1"
diff --git a/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh b/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh
new file mode 100755
index 0000000000..b78ee8ccdc
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+verify_runnable "global"
+
+zed_setup resilver_finish-start-scrub.sh
+zed_start
+
+log_pass
diff --git a/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh b/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
new file mode 100755
index 0000000000..b90db23c36
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh
@@ -0,0 +1,84 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Copyright 2019 Joyent, Inc.
+#
+
+# DESCRIPTION:
+# Verify zpool status -s (slow IOs) works
+#
+# STRATEGY:
+# 1. Create a file
+# 2. Inject slow IOs into the pool
+# 3. Verify we can see the slow IOs with "zpool status -s".
+# 4. Verify we can see delay events.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/zpool_script.shlib
+
+# ZoL/illumos porting note: the commands commented out below rely on the ZoL
+# "event" support, which has not yet been ported to illumos.
+
+DISK=${DISKS%% *}
+
+verify_runnable "both"
+
+log_must zpool create $TESTPOOL mirror ${DISKS}
+
+function cleanup
+{
+ log_must zinject -c all
+ log_must set_tunable32 zio_slow_io_ms $OLD_SLOW_IO
+# log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
+ log_must destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# log_must zpool events -c
+
+# Mark any IOs greater than 10ms as slow IOs
+OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
+# OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
+log_must set_tunable32 zio_slow_io_ms 10
+# log_must set_tunable64 zfs_slow_io_events_per_second 1000
+
+# Create 20ms IOs
+log_must zinject -d $DISK -D20:100 $TESTPOOL
+log_must mkfile 1048576 /$TESTPOOL/testfile
+log_must zpool sync $TESTPOOL
+
+log_must zinject -c all
+SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
+#DELAY_EVENTS=$(zpool events | grep delay | wc -l)
+
+# if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
+# log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+if [ $SLOW_IOS -gt 0 ] ; then
+ log_pass "Correctly saw $SLOW_IOS slow IOs"
+else
+# log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+ log_fail "Only saw $SLOW_IOS slow IOs"
+fi
diff --git a/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
index 497529f94f..ccf4312e76 100644
--- a/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
@@ -26,6 +26,7 @@
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright 2020 Joyent, Inc.
#
. $STF_SUITE/include/libtest.shlib
@@ -201,8 +202,13 @@ function get_mntpt_val #dataset src index
mnt_val=`get_prop mountpoint $dset`
- mod_prop_val=${mnt_val##*/}
- new_path="/"$mod_prop_val$new_path
+ if [[ $dset == $src ]]; then
+ new_path=$mnt_val$new_path
+ else
+ mod_prop_val=${mnt_val##*/}
+ new_path="/"$mod_prop_val$new_path
+ fi
+
dataset=$dset
done
diff --git a/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib b/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
index 77ee7fe0eb..92687c3eba 100644
--- a/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
@@ -19,7 +19,7 @@
. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
#
-# Get the actual on disk disk for the provided file.
+# Get the actual size on disk for the provided file.
#
function get_size_mb
{
@@ -29,47 +29,6 @@ function get_size_mb
}
#
-# Use mdb to get the approximate number of trim IOs issued for the pool.
-# This really is just used to ensure that trim IO has occured and is a
-# temporary solution until illumos supports zpool iostat histograms.
-#
-function get_illumos_trim_io
-{
- typeset pool="${1-:$TESTPOOL}"
- typeset spa
- typeset vdevs
- typeset total_trim
- typeset v
- typeset trimmed
- typeset b
-
- # Get vdevs for the pool
- spa=$(mdb -ke '::spa' | awk -v pool=$pool '{if ($3 == pool) print $1}')
- vdevs=$(mdb -ke "$spa::spa -v" | awk '{
- if ($4 == "DESCRIPTION") {st=1; next}
- if (st == 1) print $1
- }')
-
- # Get trim counts for each vdev
- total_trim=0
- for v in $vdevs
- do
- b=$(mdb -ke "$v::print vdev_t vdev_trim_bytes_done" | \
- awk '{print $3}')
- trimmed=$(mdb -e "$b=E")
- trimmed=$((trimmed / 4096))
- total_trim=$((total_trim + trimmed))
-
- b=$(mdb -ke "$v::print vdev_t vdev_autotrim_bytes_done" | \
- awk '{print $3}')
- trimmed=$(mdb -e "$b=E")
- trimmed=$((trimmed / 4096))
- total_trim=$((total_trim + trimmed))
- done
- echo -n "$total_trim"
-}
-
-#
# Get the number of trim IOs issued for the pool (ind or agg).
#
function get_trim_io
@@ -106,11 +65,7 @@ function verify_trim_io
typeset min_trim_ios=${3:-100}
typeset ios
- if is_linux; then
- ios=$(get_trim_io $pool $type)
- else
- ios=$(get_illumos_trim_io $pool $type)
- fi
+ ios=$(get_trim_io $pool $type)
if [[ $ios -ge $min_trim_ios ]]; then
log_note "Issued $ios $type trim IOs for pool $pool"
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 05fd29810b..60f739d1b4 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -1428,6 +1428,7 @@ spa_unload(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
+ spa_import_progress_remove(spa);
spa_load_note(spa, "UNLOADING");
/*
@@ -2372,6 +2373,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
int error;
spa->spa_load_state = state;
+ (void) spa_import_progress_set_state(spa, spa_load_state(spa));
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, type, &ereport);
@@ -2394,6 +2396,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
spa->spa_ena = 0;
+ (void) spa_import_progress_set_state(spa, spa_load_state(spa));
+
return (error);
}
@@ -2617,6 +2621,9 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
while (gethrtime() < import_expire) {
+ (void) spa_import_progress_set_mmp_check(spa,
+ NSEC2SEC(import_expire - gethrtime()));
+
vdev_uberblock_load(rvd, ub, &mmp_label);
if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
@@ -2983,6 +2990,10 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
}
+ if (spa->spa_load_max_txg != UINT64_MAX) {
+ (void) spa_import_progress_set_max_txg(spa,
+ (u_longlong_t)spa->spa_load_max_txg);
+ }
spa_load_note(spa, "using uberblock with txg=%llu",
(u_longlong_t)ub->ub_txg);
@@ -3918,6 +3929,8 @@ spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
if (error != 0)
return (error);
+ spa_import_progress_add(spa);
+
/*
* Now that we have the vdev tree, try to open each vdev. This involves
* opening the underlying physical device, retrieving its geometry and
@@ -4348,6 +4361,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
+ spa_import_progress_remove(spa);
spa_load_note(spa, "LOADED");
return (0);
@@ -4408,6 +4422,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
* from previous txgs when spa_load fails.
*/
ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ spa_import_progress_remove(spa);
return (load_error);
}
@@ -4419,6 +4434,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
if (rewind_flags & ZPOOL_NEVER_REWIND) {
nvlist_free(config);
+ spa_import_progress_remove(spa);
return (load_error);
}
@@ -4461,6 +4477,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
if (state == SPA_LOAD_RECOVER) {
ASSERT3P(loadinfo, ==, NULL);
+ spa_import_progress_remove(spa);
return (rewind_error);
} else {
/* Store the rewind info as part of the initial load info */
@@ -4471,6 +4488,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
fnvlist_free(spa->spa_load_info);
spa->spa_load_info = loadinfo;
+ spa_import_progress_remove(spa);
return (load_error);
}
}
@@ -4740,6 +4758,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
== 0);
vdev_get_stats(vd, vs);
+ vdev_config_generate_stats(vd, l2cache[i]);
}
}
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 45e1978803..9dac4e2ddc 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -28,6 +28,7 @@
* Copyright (c) 2017 Datto Inc.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright 2020 Joyent, Inc.
*/
#include <sys/zfs_context.h>
@@ -635,6 +636,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_imp_kstat_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -844,6 +846,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_suspend_lock);
mutex_destroy(&spa->spa_vdev_top_lock);
mutex_destroy(&spa->spa_iokstat_lock);
+ mutex_destroy(&spa->spa_imp_kstat_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -2046,6 +2049,140 @@ spa_dirty_data(spa_t *spa)
/*
* ==========================================================================
+ * SPA Import Progress Routines
+ * The illumos implementation of these differs from OpenZFS. OpenZFS
+ * uses the Linux /proc fs, whereas we use a kstat on the spa.
+ * ==========================================================================
+ */
+
+typedef struct spa_import_progress {
+ kstat_named_t sip_load_state;
+ kstat_named_t sip_mmp_sec_remaining; /* MMP activity check */
+ kstat_named_t sip_load_max_txg; /* rewind txg */
+} spa_import_progress_t;
+
+static void
+spa_import_progress_init(void)
+{
+}
+
+static void
+spa_import_progress_destroy(void)
+{
+}
+
+void spa_import_progress_add(spa_t *);
+
+int
+spa_import_progress_set_state(spa_t *spa, spa_load_state_t load_state)
+{
+ if (spa->spa_imp_kstat == NULL)
+ spa_import_progress_add(spa);
+
+ mutex_enter(&spa->spa_imp_kstat_lock);
+ if (spa->spa_imp_kstat != NULL) {
+ spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data;
+ if (sip != NULL)
+ sip->sip_load_state.value.ui64 = (uint64_t)load_state;
+ }
+ mutex_exit(&spa->spa_imp_kstat_lock);
+
+ return (0);
+}
+
+int
+spa_import_progress_set_max_txg(spa_t *spa, uint64_t load_max_txg)
+{
+ if (spa->spa_imp_kstat == NULL)
+ spa_import_progress_add(spa);
+
+ mutex_enter(&spa->spa_imp_kstat_lock);
+ if (spa->spa_imp_kstat != NULL) {
+ spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data;
+ if (sip != NULL)
+ sip->sip_load_max_txg.value.ui64 = load_max_txg;
+ }
+ mutex_exit(&spa->spa_imp_kstat_lock);
+
+ return (0);
+}
+
+int
+spa_import_progress_set_mmp_check(spa_t *spa, uint64_t mmp_sec_remaining)
+{
+ if (spa->spa_imp_kstat == NULL)
+ spa_import_progress_add(spa);
+
+ mutex_enter(&spa->spa_imp_kstat_lock);
+ if (spa->spa_imp_kstat != NULL) {
+ spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data;
+ if (sip != NULL)
+ sip->sip_mmp_sec_remaining.value.ui64 =
+ mmp_sec_remaining;
+ }
+ mutex_exit(&spa->spa_imp_kstat_lock);
+
+ return (0);
+}
+
+/*
+ * A new import is in progress. Add an entry.
+ */
+void
+spa_import_progress_add(spa_t *spa)
+{
+ char *poolname = NULL;
+ spa_import_progress_t *sip;
+
+ mutex_enter(&spa->spa_imp_kstat_lock);
+ if (spa->spa_imp_kstat != NULL) {
+ sip = spa->spa_imp_kstat->ks_data;
+ sip->sip_load_state.value.ui64 = (uint64_t)spa_load_state(spa);
+ mutex_exit(&spa->spa_imp_kstat_lock);
+ return;
+ }
+
+ (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
+ &poolname);
+ if (poolname == NULL)
+ poolname = spa_name(spa);
+
+ spa->spa_imp_kstat = kstat_create("zfs_import", 0, poolname,
+ "zfs_misc", KSTAT_TYPE_NAMED,
+ sizeof (spa_import_progress_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (spa->spa_imp_kstat != NULL) {
+ sip = kmem_alloc(sizeof (spa_import_progress_t), KM_SLEEP);
+ spa->spa_imp_kstat->ks_data = sip;
+
+ sip->sip_load_state.value.ui64 = (uint64_t)spa_load_state(spa);
+
+ kstat_named_init(&sip->sip_load_state,
+ "spa_load_state", KSTAT_DATA_UINT64);
+ kstat_named_init(&sip->sip_mmp_sec_remaining,
+ "mmp_sec_remaining", KSTAT_DATA_UINT64);
+ kstat_named_init(&sip->sip_load_max_txg,
+ "spa_load_max_txg", KSTAT_DATA_UINT64);
+ spa->spa_imp_kstat->ks_lock = &spa->spa_imp_kstat_lock;
+ kstat_install(spa->spa_imp_kstat);
+ }
+ mutex_exit(&spa->spa_imp_kstat_lock);
+}
+
+void
+spa_import_progress_remove(spa_t *spa)
+{
+ if (spa->spa_imp_kstat != NULL) {
+ void *data = spa->spa_imp_kstat->ks_data;
+
+ kstat_delete(spa->spa_imp_kstat);
+ spa->spa_imp_kstat = NULL;
+ kmem_free(data, sizeof (spa_import_progress_t));
+ }
+}
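
Since the illumos version exports import progress as a kstat rather than a
Linux /proc file, a consumer polls it with libkstat. A minimal userland sketch
(link with -lkstat; "tank" is a placeholder pool name and error handling is
abbreviated), matching the module/instance/name triple used by
spa_import_progress_add() above:

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;
	kstat_named_t *kn;

	if (kc == NULL)
		return (1);
	/* matches kstat_create("zfs_import", 0, poolname, ...) */
	ksp = kstat_lookup(kc, "zfs_import", 0, "tank");
	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
		kn = kstat_data_lookup(ksp, "spa_load_state");
		if (kn != NULL)
			(void) printf("spa_load_state = %llu\n",
			    (u_longlong_t)kn->value.ui64);
	}
	(void) kstat_close(kc);
	return (0);
}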
+
+/*
+ * ==========================================================================
* Initialization and Termination
* ==========================================================================
*/
@@ -2122,6 +2259,7 @@ spa_init(int mode)
spa_config_load();
l2arc_start();
scan_init();
+ spa_import_progress_init();
}
void
@@ -2141,6 +2279,7 @@ spa_fini(void)
unique_fini();
zfs_refcount_fini();
scan_fini();
+ spa_import_progress_destroy();
avl_destroy(&spa_namespace_avl);
avl_destroy(&spa_spare_avl);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index cb736db5dd..e017462613 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -900,6 +900,12 @@ typedef struct spa_iostats {
kstat_named_t autotrim_bytes_failed;
} spa_iostats_t;
+extern int spa_import_progress_set_state(spa_t *, spa_load_state_t);
+extern int spa_import_progress_set_max_txg(spa_t *, uint64_t);
+extern int spa_import_progress_set_mmp_check(spa_t *, uint64_t);
+extern void spa_import_progress_add(spa_t *);
+extern void spa_import_progress_remove(spa_t *);
+
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
@@ -1053,9 +1059,11 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const struct zbookmark_phys *zb);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
const struct zbookmark_phys *zb, struct zio *zio, uint64_t stateoroffset,
uint64_t length);
+extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd,
+ zio_t *zio);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index bc00528fa9..45a78717da 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -399,6 +399,14 @@ struct spa {
int spa_queued;
} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
+ /*
+ * The following two members diverge from OpenZFS. Upstream import
+ * status is built around the Linux /proc fs. On illumos we use a kstat
+ * to track import status. spa_imp_kstat_lock protects spa_imp_kstat.
+ */
+ kmutex_t spa_imp_kstat_lock;
+ struct kstat *spa_imp_kstat; /* kstat for import status */
+
/* arc_memory_throttle() parameters during low memory condition */
uint64_t spa_lowmem_page_load; /* memory load during txg */
uint64_t spa_lowmem_last_txg; /* txg window start */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 707e177fc3..a6de7e6f2c 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -99,6 +99,7 @@ extern void vdev_deadman(vdev_t *vd);
extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs);
+extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
@@ -173,6 +174,7 @@ extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
extern int vdev_label_number(uint64_t psise, uint64_t offset);
extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
offset, uint64_t size, zio_done_func_t *done, void *private, int flags);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 8e61572a50..b9b538455c 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -229,6 +229,7 @@ struct vdev {
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
vdev_stat_t vdev_stat; /* virtual device statistics */
+ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
boolean_t vdev_nonrot; /* true if solid state */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 7d3e0579c2..d03106b942 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -465,6 +465,9 @@ struct zio {
hrtime_t io_timestamp;
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
+ hrtime_t io_delta; /* vdev queue service delta */
+ hrtime_t io_delay; /* device access time (disk or file) */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
@@ -649,7 +652,7 @@ extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
/* If we have the good data in hand, this function can be used */
-extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
uint64_t length, const abd_t *good_data, const abd_t *bad_data,
struct zio_bad_cksum *info);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_priority.h b/usr/src/uts/common/fs/zfs/sys/zio_priority.h
index 7d91a646d1..feb23fafd6 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_priority.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_priority.h
@@ -14,6 +14,7 @@
*/
/*
* Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
@@ -22,6 +23,10 @@
extern "C" {
#endif
+/*
+ * NOTE: If ZIO_PRIORITY_NUM_QUEUEABLE changes, update ZIO_PRIORITY_N_QUEUEABLE
+ * in uts/common/sys/fs/zfs.h to match.
+ */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
@@ -32,7 +37,6 @@ typedef enum zio_priority {
ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_TRIM, /* trim I/O (discard) */
ZIO_PRIORITY_NUM_QUEUEABLE,
-
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
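
Since ZIO_PRIORITY_N_QUEUEABLE in sys/fs/zfs.h (added below) mirrors
ZIO_PRIORITY_NUM_QUEUEABLE by hand, drift between the two would silently
change the array sizes in vdev_stat_ex_t. A compile-time guard in any kernel
file that can see both headers would catch that; the following sketch uses
the illumos CTASSERT macro and is not an assertion added by this change:

#include <sys/debug.h>		/* CTASSERT */
#include <sys/fs/zfs.h>		/* ZIO_PRIORITY_N_QUEUEABLE */
#include <sys/zio_priority.h>	/* ZIO_PRIORITY_NUM_QUEUEABLE */

/* Break the build if the userland mirror of the constant drifts. */
CTASSERT(ZIO_PRIORITY_N_QUEUEABLE == ZIO_PRIORITY_NUM_QUEUEABLE);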
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 52a7a62e4a..01e892f4c4 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -3505,6 +3505,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -3628,6 +3629,51 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
return (B_TRUE);
}
+static void
+vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
+{
+ for (int t = 0; t < VS_ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+
+ cvs->vs_scan_removing = cvd->vdev_removing;
+}
+
+/*
+ * Aggregate a child vdev's extended stats into the parent's totals.
+ */
+static void
+vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
+{
+ int t, b;
+ for (t = 0; t < ZIO_TYPES; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
+ vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
+ vsx->vsx_total_histo[t][b] +=
+ cvsx->vsx_total_histo[t][b];
+ }
+ }
+
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
+ vsx->vsx_queue_histo[t][b] +=
+ cvsx->vsx_queue_histo[t][b];
+ }
+ vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
+ vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
+ vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
+ vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
+ }
+}
+
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
@@ -3652,81 +3698,121 @@ vdev_is_spacemap_addressable(vdev_t *vd)
/*
* Get statistics for the given vdev.
*/
-void
-vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+static void
+vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *tvd = vd->vdev_top;
+ int t;
+ /*
+ * If we're getting stats on a non-leaf vdev (e.g. the root),
+ * aggregate the I/O counts and extended stats over all of its
+ * children.
+ */
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ if (vs) {
+ memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
+ memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
+ }
+ if (vsx)
+ memset(vsx, 0, sizeof (*vsx));
- ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+ vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
- mutex_enter(&vd->vdev_stat_lock);
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
- vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf) {
- vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ vdev_get_stats_ex_impl(cvd, cvs, cvsx);
+ if (vs)
+ vdev_get_child_stat(cvd, vs, cvs);
+ if (vsx)
+ vdev_get_child_stat_ex(cvd, vsx, cvsx);
+ }
+ } else {
/*
- * Report intializing progress. Since we don't have the
- * initializing locks held, this is only an estimate (although a
- * fairly accurate one).
+ * We're a leaf. Just copy our ZIO active queue stats in. The
+ * other leaf stats are updated in vdev_stat_update().
*/
- vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
- vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
- vs->vs_initialize_state = vd->vdev_initialize_state;
- vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
- }
+ if (!vsx)
+ return;
- /*
- * Report manual TRIM progress. Since we don't have
- * the manual TRIM locks held, this is only an
- * estimate (although fairly accurate one).
- */
- vs->vs_trim_notsup = !vd->vdev_has_trim;
- vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
- vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
- vs->vs_trim_state = vd->vdev_trim_state;
- vs->vs_trim_action_time = vd->vdev_trim_action_time;
+ memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
- /*
- * Report expandable space on top-level, non-auxillary devices only.
- * The expandable space is reported in terms of metaslab sized units
- * since that determines how much space the pool can expand.
- */
- if (vd->vdev_aux == NULL && tvd != NULL) {
- vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
- spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
- }
- if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
- vdev_is_concrete(vd)) {
- vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
- vd->vdev_mg->mg_fragmentation : 0;
+ for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
+ vsx->vsx_active_queue[t] =
+ vd->vdev_queue.vq_class[t].vqc_active;
+ vsx->vsx_pend_queue[t] = avl_numnodes(
+ &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ }
}
- if (vd->vdev_ops->vdev_op_leaf)
- vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
+}
- /*
- * If we're getting stats on the root vdev, aggregate the I/O counts
- * over all top-level vdevs (i.e. the direct children of the root).
- */
- if (vd == rvd) {
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
- vdev_stat_t *cvs = &cvd->vdev_stat;
+void
+vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ vdev_t *tvd = vd->vdev_top;
+ mutex_enter(&vd->vdev_stat_lock);
+ if (vs) {
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_rsize += VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
- for (int t = 0; t < VS_ZIO_TYPES; t++) {
- vs->vs_ops[t] += cvs->vs_ops[t];
- vs->vs_bytes[t] += cvs->vs_bytes[t];
- }
- cvs->vs_scan_removing = cvd->vdev_removing;
+ /*
+ * Report manual TRIM progress. Since we don't have
+ * the manual TRIM locks held, this is only an
+ * estimate (although a fairly accurate one).
+ */
+ vs->vs_trim_notsup = !vd->vdev_has_trim;
+ vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+ vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+ vs->vs_trim_state = vd->vdev_trim_state;
+ vs->vs_trim_action_time = vd->vdev_trim_action_time;
+ }
+ /*
+ * Report expandable space on top-level, non-auxiliary devices
+ * only. The expandable space is reported in terms of metaslab
+ * sized units since that determines how much space the pool
+ * can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL) {
+ vs->vs_esize = P2ALIGN(
+ vd->vdev_max_asize - vd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
}
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
+ }
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
+
+ vdev_get_stats_ex_impl(vd, vs, vsx);
mutex_exit(&vd->vdev_stat_lock);
}
void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_get_stats_ex(vd, vs, NULL);
+}
+
+void
vdev_clear_stats(vdev_t *vd)
{
mutex_enter(&vd->vdev_stat_lock);
@@ -3758,6 +3844,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_t *pvd;
uint64_t txg = zio->io_txg;
vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
@@ -3808,18 +3895,42 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vs->vs_self_healed += psize;
}
- zio_type_t vs_type = type;
-
/*
- * TRIM ops and bytes are reported to user space as
- * ZIO_TYPE_IOCTL. This is done to preserve the
- * vdev_stat_t structure layout for user space.
+ * The bytes/ops/histograms are recorded at the leaf level and
+ * aggregated into the higher level vdevs in vdev_get_stats().
*/
- if (type == ZIO_TYPE_TRIM)
- vs_type = ZIO_TYPE_IOCTL;
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+ zio_type_t vs_type = type;
+
+ /*
+ * TRIM ops and bytes are reported to user space as
+ * ZIO_TYPE_IOCTL. This is done to preserve the
+ * vdev_stat_t structure layout for user space.
+ */
+ if (type == ZIO_TYPE_TRIM)
+ vs_type = ZIO_TYPE_IOCTL;
+
+ vs->vs_ops[vs_type]++;
+ vs->vs_bytes[vs_type] += psize;
+
+ if (flags & ZIO_FLAG_DELEGATED) {
+ vsx->vsx_agg_histo[zio->io_priority]
+ [RQ_HISTO(zio->io_size)]++;
+ } else {
+ vsx->vsx_ind_histo[zio->io_priority]
+ [RQ_HISTO(zio->io_size)]++;
+ }
- vs->vs_ops[vs_type]++;
- vs->vs_bytes[vs_type] += psize;
+ if (zio->io_delta && zio->io_delay) {
+ vsx->vsx_queue_histo[zio->io_priority]
+ [L_HISTO(zio->io_delta - zio->io_delay)]++;
+ vsx->vsx_disk_histo[type]
+ [L_HISTO(zio->io_delay)]++;
+ vsx->vsx_total_histo[type]
+ [L_HISTO(zio->io_delta)]++;
+ }
+ }
mutex_exit(&vd->vdev_stat_lock);
return;
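
The RQ_HISTO() and L_HISTO() macros used in vdev_stat_update() above (defined
later in sys/fs/zfs.h) are power-of-two bucketers: bucket = floor(log2(value)),
clamped to the last bucket. A standalone sketch of the math, with highbit64()
re-implemented locally for illustration:

/*
 * Userland sketch of the histogram bucket math; not part of this change.
 */
#include <stdio.h>
#include <stdint.h>

#define	VDEV_L_HISTO_BUCKETS	37	/* latency buckets */
#define	VDEV_RQ_HISTO_BUCKETS	25	/* request size buckets */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static int
highbit64(uint64_t i)		/* highest set bit, counting from 1 */
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

#define	HISTO(val, buckets) \
	((val) != 0 ? MIN(highbit64(val) - 1, (buckets) - 1) : 0)

int
main(void)
{
	/* A 131072-byte (2^17) request lands in size bucket 17. */
	(void) printf("rq bucket: %d\n",
	    HISTO(131072ULL, VDEV_RQ_HISTO_BUCKETS));
	/* A 30 ms latency (30000000 ns, < 2^25) lands in bucket 24. */
	(void) printf("lat bucket: %d\n",
	    HISTO(30000000ULL, VDEV_L_HISTO_BUCKETS));
	return (0);
}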
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index bb377f08ce..6235b06f17 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -23,7 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
- * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
/*
@@ -211,6 +211,169 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
}
+/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+ nvlist_t *nvx;
+ vdev_stat_t *vs;
+ vdev_stat_ex_t *vsx;
+
+ vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+ vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+ vdev_get_stats_ex(vd, vs, vsx);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+ /*
+ * Add extended stats into a special extended stats nvlist. This keeps
+ * all the extended stats nicely grouped together. The extended stats
+ * nvlist is then added to the main nvlist.
+ */
+ nvx = fnvlist_alloc();
+
+ /* ZIOs in flight to disk */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
+
+ /* ZIOs pending */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
+
+ /* Histograms */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
+
+ /* Request sizes */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
+
+ /* IO delays */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+
+ /* Add extended stats nvlist to main nvlist */
+ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
+
+ nvlist_free(nvx);
+ kmem_free(vs, sizeof (*vs));
+ kmem_free(vsx, sizeof (*vsx));
+}
+
static void
root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
{
@@ -386,11 +549,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
}
if (getstats) {
- vdev_stat_t vs;
-
- vdev_get_stats(vd, &vs);
- fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
+ vdev_config_generate_stats(vd, nv);
root_vdev_actions_getprogress(vd, nv);
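
On the consuming side these keys come back to userland in each vdev's config
nvlist. A hedged sketch of reading two of the extended stats with libnvpair
(the nvlist "nv" is assumed to be a vdev config obtained through libzfs; the
helper is illustrative, not code from this change):

/*
 * Sketch: pull extended stats out of a vdev config nvlist "nv", as
 * produced by vdev_config_generate_stats() above.
 */
#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

static void
print_sync_read_stats(nvlist_t *nv)
{
	nvlist_t *nvx;
	uint64_t active;
	uint64_t *histo;
	uint_t buckets;

	if (nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0)
		return;		/* no extended stats present */

	if (nvlist_lookup_uint64(nvx,
	    ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, &active) == 0)
		(void) printf("sync reads in flight: %llu\n",
		    (u_longlong_t)active);

	if (nvlist_lookup_uint64_array(nvx,
	    ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, &histo, &buckets) == 0)
		(void) printf("latency histogram buckets: %u\n", buckets);
}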
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 7f9795ac6f..4c6515c43d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -829,6 +829,7 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_pending_remove(vq, zio);
+ zio->io_delta = gethrtime() - zio->io_timestamp;
vq->vq_io_complete_ts = gethrtime();
while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
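
Together with io_delay (armed in zio_vdev_io_start() and finalized in
zio_vdev_io_done() below), this gives each leaf zio two nested intervals:
io_delta spans queue entry to completion, io_delay spans only the device
access, and their difference is time spent queued. For example, if a zio is
queued at t=0, issued to the device at t=3ms and completes at t=5ms, then
io_delta = 5 ms and io_delay = 2 ms, so vdev_stat_update() files
L_HISTO(3 ms) into vsx_queue_histo, L_HISTO(2 ms) into vsx_disk_histo, and
L_HISTO(5 ms) into vsx_total_histo.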
diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c
index 87846292b1..dd854c12e1 100644
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c
@@ -102,7 +102,11 @@
* ereport with information about the differences.
*/
#ifdef _KERNEL
-static void
+
+/*
+ * Return B_TRUE if the event actually posted, B_FALSE if not.
+ */
+static boolean_t
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
zio_t *zio, uint64_t stateoroffset, uint64_t size)
@@ -112,88 +116,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
uint64_t ena;
char class[64];
- /*
- * If we are doing a spa_tryimport() or in recovery mode,
- * ignore errors.
- */
- if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
- spa_load_state(spa) == SPA_LOAD_RECOVER)
- return;
-
- /*
- * If we are in the middle of opening a pool, and the previous attempt
- * failed, don't bother logging any new ereports - we're just going to
- * get the same diagnosis anyway.
- */
- if (spa_load_state(spa) != SPA_LOAD_NONE &&
- spa->spa_last_open_failed)
- return;
-
- if (zio != NULL) {
- /*
- * If this is not a read or write zio, ignore the error. This
- * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
- if (zio->io_type != ZIO_TYPE_READ &&
- zio->io_type != ZIO_TYPE_WRITE)
- return;
-
- /*
- * Ignore any errors from speculative I/Os, as failure is an
- * expected result.
- */
- if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- /*
- * If this I/O is not a retry I/O, don't post an ereport.
- * Otherwise, we risk making bad diagnoses based on B_FAILFAST
- * I/Os.
- */
- if (zio->io_error == EIO &&
- !(zio->io_flags & ZIO_FLAG_IO_RETRY))
- return;
-
- if (vd != NULL) {
- /*
- * If the vdev has already been marked as failing due
- * to a failed probe, then ignore any subsequent I/O
- * errors, as the DE will automatically fault the vdev
- * on the first such failure. This also catches cases
- * where vdev_remove_wanted is set and the device has
- * not yet been asynchronously placed into the REMOVED
- * state.
- */
- if (zio->io_vd == vd && !vdev_accessible(vd, zio))
- return;
-
- /*
- * Ignore checksum errors for reads from DTL regions of
- * leaf vdevs.
- */
- if (zio->io_type == ZIO_TYPE_READ &&
- zio->io_error == ECKSUM &&
- vd->vdev_ops->vdev_op_leaf &&
- vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
- return;
- }
- }
-
- /*
- * For probe failure, we want to avoid posting ereports if we've
- * already removed the device in the meantime.
- */
- if (vd != NULL &&
- strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
- (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
- return;
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (B_FALSE);
if ((ereport = fm_nvlist_create(NULL)) == NULL)
- return;
+ return (B_FALSE);
if ((detector = fm_nvlist_create(NULL)) == NULL) {
fm_nvlist_destroy(ereport, FM_NVA_FREE);
- return;
+ return (B_FALSE);
}
/*
@@ -336,7 +267,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
/*
* Payload for I/Os with corresponding logical information.
*/
- if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
+ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
DATA_TYPE_UINT64, zb->zb_objset,
@@ -346,11 +277,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_INT64, zb->zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+ }
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;
*detector_out = detector;
+ return (B_TRUE);
}
/* if it's <= 128 bytes, save the corruption directly */
@@ -674,26 +607,110 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
}
#endif
-void
+/*
+ * Make sure our event is still valid for the given zio/vdev/pool. For example,
+ * we don't want to keep logging events for a faulted or missing vdev.
+ */
+boolean_t
+zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
+{
+#ifdef _KERNEL
+ /*
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
+ return (B_FALSE);
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return (B_FALSE);
+
+ if (zio != NULL) {
+ /*
+ * If this is not a read or write zio, ignore the error. This
+ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return (B_FALSE);
+
+ if (vd != NULL) {
+ /*
+ * If the vdev has already been marked as failing due
+ * to a failed probe, then ignore any subsequent I/O
+ * errors, as the DE will automatically fault the vdev
+ * on the first such failure. This also catches cases
+ * where vdev_remove_wanted is set and the device has
+ * not yet been asynchronously placed into the REMOVED
+ * state.
+ */
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+ return (B_FALSE);
+
+ /*
+ * Ignore checksum errors for reads from DTL regions of
+ * leaf vdevs.
+ */
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == ECKSUM &&
+ vd->vdev_ops->vdev_op_leaf &&
+ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+ return (B_FALSE);
+ }
+ }
+
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return (B_FALSE);
+
+ /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
+ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+ (zio != NULL) && (!zio->io_timestamp)) {
+ return (B_FALSE);
+ }
+#endif
+ return (B_TRUE);
+}
+
+/*
+ * Return 0 if the event was posted, or EINVAL if there was a problem
+ * posting it. (Upstream can also return EBUSY for rate-limited events;
+ * this illumos version does not rate limit, so EBUSY is never returned.)
+ */
+int
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
const struct zbookmark_phys *zb, zio_t *zio, uint64_t stateoroffset,
uint64_t size)
{
+ int rc = 0;
#ifdef _KERNEL
nvlist_t *ereport = NULL;
nvlist_t *detector = NULL;
- zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
- zb, zio, stateoroffset, size);
+ if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+ zb, zio, stateoroffset, size))
+ return (SET_ERROR(EINVAL)); /* couldn't post event */
if (ereport == NULL)
- return;
+ return (SET_ERROR(EINVAL));
fm_ereport_post(ereport, EVCH_SLEEP);
fm_nvlist_destroy(ereport, FM_NVA_FREE);
fm_nvlist_destroy(detector, FM_NVA_FREE);
#endif
+ return (rc);
}
void
@@ -786,21 +803,21 @@ zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
#endif
}
-void
+int
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{
+ int rc = 0;
#ifdef _KERNEL
nvlist_t *ereport = NULL;
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
- zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
- spa, vd, zb, zio, offset, length);
-
- if (ereport == NULL)
- return;
+ if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+ spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
+ return (SET_ERROR(EINVAL));
+ }
info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
B_FALSE);
@@ -814,6 +831,7 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (info != NULL)
kmem_free(info, sizeof (*info));
#endif
+ return (rc);
}
static void
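
With zfs_ereport_post() and zfs_ereport_post_checksum() now returning status,
a caller can distinguish a delivered event from a filtered one. A
hypothetical caller fragment (the surrounding context and the counter are
invented for illustration; the existing callers in this change mostly ignore
the return value):

/*
 * Hypothetical caller sketch: count an ereport as delivered only when
 * zfs_ereport_post() reports success. "my_dropped_ereports" is an
 * invented counter, not part of this change.
 */
int err = zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
    &zio->io_bookmark, zio, 0, 0);
if (err != 0)
	my_dropped_ereports++;	/* EINVAL: filtered or not constructed */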
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index ed9d6e86f5..ff0e4bbded 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -42,6 +42,7 @@
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/time.h>
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
@@ -77,6 +78,9 @@ extern vmem_t *zio_alloc_arena;
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
+
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define COMPARE_META_LEVEL 0x80000000ul
@@ -3388,6 +3392,8 @@ zio_vdev_io_start(zio_t *zio)
uint64_t align;
spa_t *spa = zio->io_spa;
+ zio->io_delay = 0;
+
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -3505,6 +3511,7 @@ zio_vdev_io_start(zio_t *zio)
zio_interrupt(zio);
return (ZIO_PIPELINE_STOP);
}
+ zio->io_delay = gethrtime();
}
vd->vdev_ops->vdev_op_io_start(zio);
@@ -3525,6 +3532,9 @@ zio_vdev_io_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+ if (zio->io_delay)
+ zio->io_delay = gethrtime() - zio->io_delay;
+
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
vdev_queue_io_done(zio);
@@ -4228,6 +4238,29 @@ zio_done(zio_t *zio)
vdev_stat_update(zio, psize);
+ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+ if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+ /*
+ * We want to only increment our slow IO counters if
+ * the IO is valid (i.e. not if the drive is removed).
+ *
+ * zfs_ereport_post() will also do these checks, but
+ * it can also have other failures, so we need to
+ * increment the slow_io counters independently of it.
+ */
+ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, zio)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0, 0);
+ }
+ }
+ }
+
if (zio->io_error) {
/*
* If this I/O is attached to a particular vdev,
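
The threshold here is the zio_slow_io_ms global declared near the top of this
file (30000 ms by default). As an ordinary zfs module global on illumos it
should be adjustable through the usual mechanisms; both lines below follow
standard tunable conventions and are assumptions, not interfaces documented
by this change:

	set zfs:zio_slow_io_ms = 1000			(/etc/system, applied at boot)
	echo 'zio_slow_io_ms/W 0t1000' | mdb -kw	(live, as root)

Lowering the threshold (here to one second) makes the slow-I/O counters and
"zpool status -s" output react much sooner, which is convenient when testing.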
diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h
index c3eb950326..8e56f244cd 100644
--- a/usr/src/uts/common/sys/fm/fs/zfs.h
+++ b/usr/src/uts/common/sys/fm/fs/zfs.h
@@ -36,6 +36,7 @@ extern "C" {
#define FM_EREPORT_ZFS_AUTHENTICATION "authentication"
#define FM_EREPORT_ZFS_IO "io"
#define FM_EREPORT_ZFS_DATA "data"
+#define FM_EREPORT_ZFS_DELAY "delay"
#define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
@@ -61,6 +62,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 1bc421e33b..93a2b5887a 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -24,7 +24,7 @@
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
*/
@@ -35,6 +35,15 @@
#define _SYS_FS_ZFS_H
#include <sys/time.h>
+/*
+ * In OpenZFS we include sys/zio_priority.h to get the enum value of
+ * ZIO_PRIORITY_NUM_QUEUEABLE, which is used for the various array sizes in
+ * the structure definitions below. However, in illumos zio_priority.h is not
+ * readily available to the userland code where we have a very large number of
+ * files including sys/zfs.h. Thus, we define ZIO_PRIORITY_N_QUEUEABLE here and
+ * this should be kept in sync if ZIO_PRIORITY_NUM_QUEUEABLE changes.
+ */
+#define ZIO_PRIORITY_N_QUEUEABLE 8
#ifdef __cplusplus
extern "C" {
@@ -601,6 +610,55 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
+
+/* container nvlist of extended stats */
+#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex"
+
+/* Active queue read/write stats */
+#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
+#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue"
+
+/* Queue sizes */
+#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
+#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
+#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue"
+
+/* Latency read/write histogram stats */
+#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
+#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo"
+
+/* Request size histograms */
+#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
+#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
+#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
+#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
+#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
+#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"
+
+/* Number of slow IOs */
+#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
+
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
@@ -1001,6 +1059,7 @@ typedef struct vdev_stat {
uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
+ uint64_t vs_slow_ios; /* slow IOs */
uint64_t vs_trim_errors; /* trimming errors */
uint64_t vs_trim_notsup; /* supported by device */
uint64_t vs_trim_bytes_done; /* bytes trimmed */
@@ -1010,6 +1069,58 @@ typedef struct vdev_stat {
} vdev_stat_t;
/*
+ * Extended stats
+ *
+ * These are stats which aren't included in the original iostat output. For
+ * convenience, they are grouped together in vdev_stat_ex, although each stat
+ * is individually exported as an nvlist entry.
+ */
+typedef struct vdev_stat_ex {
+ /* Number of ZIOs issued to disk and waiting to finish */
+ uint64_t vsx_active_queue[ZIO_PRIORITY_N_QUEUEABLE];
+
+ /* Number of ZIOs pending to be issued to disk */
+ uint64_t vsx_pend_queue[ZIO_PRIORITY_N_QUEUEABLE];
+
+ /*
+ * Below are the histograms for various latencies. Buckets are in
+ * units of nanoseconds.
+ */
+
+ /*
+ * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in
+ * before this.
+ */
+#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */
+#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */
+
+ /* Amount of time in ZIO queue (ns) */
+ uint64_t vsx_queue_histo[ZIO_PRIORITY_N_QUEUEABLE]
+ [VDEV_L_HISTO_BUCKETS];
+
+ /* Total ZIO latency (ns). Includes queuing and disk access time */
+ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
+
+ /* Amount of time to read/write the disk (ns) */
+ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
+
+ /* "lookup the bucket for a value" macro */
+#define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \
+ buckets - 1) : 0)
+#define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS)
+#define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS)
+
+ /* Physical IO histogram */
+ uint64_t vsx_ind_histo[ZIO_PRIORITY_N_QUEUEABLE]
+ [VDEV_RQ_HISTO_BUCKETS];
+
+ /* Delegated (aggregated) physical IO histogram */
+ uint64_t vsx_agg_histo[ZIO_PRIORITY_N_QUEUEABLE]
+ [VDEV_RQ_HISTO_BUCKETS];
+
+} vdev_stat_ex_t;
+
+/*
* DDT statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/