author    | Tony Hutter <hutter2@llnl.gov>           | 2020-02-11 06:20:40 -0700
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2020-02-13 10:45:47 -0700
commit    | dd50e0cc4cbe1474096300fe52e9855769c0d478 (patch)
tree      | 2ec7eb7dda66d419ab4de535b103b6f680e239f6
parent    | 3df9f0641f28754051d5e82c6457527cf4af1258 (diff)
download  | illumos-joyent-dd50e0cc4cbe1474096300fe52e9855769c0d478.tar.gz
11682 zpool iostat and status improvements
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Hajo Möller <dasjoe@gmail.com>
Portions contributed by: Damian Wojslaw <damian@wojslaw.pl>
Portions contributed by: kpande <github@tripleback.net>
Portions contributed by: Anatoly Borodin <anatoly.borodin@gmail.com>
Portions contributed by: Gregor Kopka <mailfrom-github@kopka.net>
Portions contributed by: George Melikov <mail@gmelikov.ru>
Portions contributed by: George G <gg7@users.noreply.github.com>
Portions contributed by: DeHackEd <DeHackEd@users.noreply.github.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alek Pinchuk <apinchuk@datto.com>
Reviewed by: Allan Jude <allanjude@freebsd.org>
Reviewed by: Olaf Faaland <faaland1@llnl.gov>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Andy Fiddaman <omnios@citrus-it.co.uk>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
47 files changed, 3093 insertions, 428 deletions
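
The average latencies printed by the new "zpool iostat -l" are derived
from the power-of-two latency histograms exported under
ZPOOL_CONFIG_VDEV_STATS_EX: bucket i counts I/Os whose latency fell in
[2^i, 2^(i+1)) ns, and the average weights each bucket by its midpoint.
A minimal standalone sketch of that calculation, mirroring
single_histo_average() from the diff below (the bucket counts here are
hypothetical), compiles with any C compiler:

#include <stdio.h>
#include <stdint.h>

static uint64_t
histo_average(const uint64_t *histo, unsigned int buckets)
{
	uint64_t count = 0, total = 0;
	unsigned int i;

	for (i = 0; i < buckets; i++) {
		if (histo[i] != 0) {
			/* Midpoint of bucket i is 2^i + 2^i/2 */
			total += histo[i] *
			    ((1ULL << i) + ((1ULL << i) / 2));
			count += histo[i];
		}
	}

	/* Prevent divide by zero */
	return (count == 0 ? 0 : total / count);
}

int
main(void)
{
	uint64_t histo[15] = { 0 };

	histo[13] = 100;	/* 100 I/Os in the 8192-16383 ns bucket */
	histo[14] = 50;		/* 50 I/Os in the 16384-32767 ns bucket */

	(void) printf("avg latency: %llu ns\n",
	    (unsigned long long)histo_average(histo, 15));
	return (0);
}

With these counts the result is 16384 ns, i.e.
(100 * 12288 + 50 * 24576) / 150.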
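
Several of the new printing routines (print_iostat_labels(),
print_iostat_dashes(), the unsupported-flag error path in
zpool_do_iostat()) walk a flag bitmask one set bit at a time using the
lowbit64() helper this change adds to libzpool, which returns bit
number + 1 of the lowest set bit, or 0 (essentially the GNU-specific
ffsll()). A standalone sketch of that idiom, expressing lowbit64() via
__builtin_ffsll() for brevity and redefining a few of the commit's flag
masks locally, is:

#include <stdio.h>
#include <stdint.h>

#define	IOS_DEFAULT_M	(1ULL << 0)
#define	IOS_LATENCY_M	(1ULL << 1)
#define	IOS_QUEUES_M	(1ULL << 2)

static int
lowbit64(uint64_t i)
{
	/* Bit number + 1 of the lowest set bit, or 0 if i == 0 */
	return (__builtin_ffsll((long long)i));
}

int
main(void)
{
	uint64_t flags = IOS_DEFAULT_M | IOS_QUEUES_M;
	uint64_t f;
	int idx;

	/* For each bit set in flags, lowest first, clearing as we go */
	for (f = flags; f; f &= ~(1ULL << idx)) {
		idx = lowbit64(f) - 1;
		(void) printf("flag bit %d is set\n", idx);
	}
	return (0);
}

For flags == IOS_DEFAULT_M | IOS_QUEUES_M this prints bits 0 and 2,
matching how the iostat code selects which stat columns to emit.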
diff --git a/usr/src/cmd/zpool/Makefile b/usr/src/cmd/zpool/Makefile index 55d8abc80f..4b7e6600c8 100644 --- a/usr/src/cmd/zpool/Makefile +++ b/usr/src/cmd/zpool/Makefile @@ -40,7 +40,8 @@ STAT_COMMON_OBJS = timestamp.o STAT_COMMON_SRCS = $(STAT_COMMON_OBJS:%.o=$(STATCOMMONDIR)/%.c) SRCS += $(STAT_COMMON_SRCS) -LDLIBS += -lzfs -lnvpair -ldevid -lefi -ldiskmgt -luutil -lumem -lzutil +LDLIBS += -lzfs -lnvpair -ldevid -lefi -ldiskmgt -luutil -lumem -lzutil \ + -lm -lzpool INCS += -I../../common/zfs -I../../uts/common/fs/zfs -I$(STATCOMMONDIR) INCS += -I../../lib/libzutil/common diff --git a/usr/src/cmd/zpool/zpool_iter.c b/usr/src/cmd/zpool/zpool_iter.c index c05c665ada..e69f9778e0 100644 --- a/usr/src/cmd/zpool/zpool_iter.c +++ b/usr/src/cmd/zpool/zpool_iter.c @@ -32,6 +32,7 @@ #include <stdio.h> #include <stdlib.h> #include <strings.h> +#include <sys/sysmacros.h> #include <libzfs.h> #include <libzutil.h> @@ -253,3 +254,69 @@ for_each_pool(int argc, char **argv, boolean_t unavail, return (ret); } + +static int +for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], func, + data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * This is the equivalent of for_each_pool() for vdevs. It iterates thorough + * all vdevs in the pool, ignoring root vdevs and holes, calling func() on + * each one. + * + * @zhp: Zpool handle + * @func: Function to call on each vdev + * @data: Custom data to pass to the function + */ +int +for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) +{ + nvlist_t *config, *nvroot; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + return (for_each_vdev_cb(zhp, nvroot, func, data)); +} diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 23269c20d6..053b5cce86 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -29,7 +29,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
*/ @@ -56,6 +56,9 @@ #include <sys/fs/zfs.h> #include <sys/stat.h> #include <sys/debug.h> +#include <math.h> +#include <sys/sysmacros.h> +#include <sys/termios.h> #include <libzfs.h> #include <libzutil.h> @@ -160,6 +163,85 @@ typedef enum { } zpool_help_t; +/* + * Flags for stats to display with "zpool iostats" + */ +enum iostat_type { + IOS_DEFAULT = 0, + IOS_LATENCY = 1, + IOS_QUEUES = 2, + IOS_L_HISTO = 3, + IOS_RQ_HISTO = 4, + IOS_COUNT, /* always last element */ +}; + +/* iostat_type entries as bitmasks */ +#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) +#define IOS_LATENCY_M (1ULL << IOS_LATENCY) +#define IOS_QUEUES_M (1ULL << IOS_QUEUES) +#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) +#define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) + +/* Mask of all the histo bits */ +#define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) + +/* + * Lookup table for iostat flags to nvlist names. Basically a list + * of all the nvlists a flag requires. Also specifies the order in + * which data gets printed in zpool iostat. + */ +static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { + [IOS_L_HISTO] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_LATENCY] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_QUEUES] = { + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + NULL}, + [IOS_RQ_HISTO] = { + ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + NULL}, +}; + + +/* + * Given a cb->cb_flags with a histogram bit set, return the iostat_type. + * Right now, only one histo bit is ever set at one time, so we can + * just do a highbit64(a) + */ +#define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) + typedef struct zpool_command { const char *name; int (*func)(int, char **); @@ -216,7 +298,7 @@ static zpool_command_t command_table[] = { { "sync", zpool_do_sync, HELP_SYNC }, }; -#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) +#define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" @@ -265,8 +347,10 @@ get_usage(zpool_help_t idx) "[-R root] [-F [-n]] [-t]\n" "\t [--rewind-to-checkpoint] <pool | id> [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-gLPv] [-T d|u] [pool] ... " - "[interval [count]]\n")); + return (gettext("\tiostat " + "[[-lq]|[-rw]] [-T d | u] [-ghHLpPvy]\n" + "\t [pool] ..." 
+ " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] <vdev>\n")); case HELP_LIST: @@ -294,7 +378,8 @@ get_usage(zpool_help_t idx) return (gettext("\ttrim [-d] [-r <rate>] [-c | -s] <pool> " "[<device> ...]\n")); case HELP_STATUS: - return (gettext("\tstatus [-igLPvxD] [-T d|u] [pool] ... " + return (gettext("\tstatus " + "[-igLpPsvxD] [-T d|u] [pool] ... " "[interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -1621,10 +1706,12 @@ typedef struct status_cbdata { int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; + boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; + boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; } status_cbdata_t; @@ -1778,10 +1865,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, name, state); if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + if (cb->cb_literal) { + printf(" %5llu %5llu %5llu", + (u_longlong_t)vs->vs_read_errors, + (u_longlong_t)vs->vs_write_errors, + (u_longlong_t)vs->vs_checksum_errors); + } else { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, + sizeof (cbuf)); + printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (cb->cb_print_slow_ios) { + if (children == 0) { + /* Only leafs vdevs have slow IOs */ + zfs_nicenum(vs->vs_slow_ios, rbuf, + sizeof (rbuf)); + } else { + (void) snprintf(rbuf, sizeof (rbuf), "-"); + } + + if (cb->cb_literal) + printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + else + printf(" %5s", rbuf); + } + } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, @@ -2382,7 +2493,8 @@ show_import(nvlist_t *config) (void) printf(gettext(" config:\n\n")); - cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0); + cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), + VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; @@ -3112,44 +3224,737 @@ zpool_do_sync(int argc, char **argv) } typedef struct iostat_cbdata { - boolean_t cb_verbose; + uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; + char **cb_vdev_names; /* Only show these vdevs */ + unsigned int cb_vdev_names_count; + boolean_t cb_verbose; + boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; } iostat_cbdata_t; +/* iostat labels */ +typedef struct name_and_columns { + const char *name; /* Column name */ + unsigned int columns; /* Center name to this number of columns */ +} name_and_columns_t; + +#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ + +static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, + {NULL}}, + [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + {"asyncq_wait", 2}, {"scrub"}, {"trim", 1}, {NULL}}, + [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, + {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, + {"trimq_write", 2}, {NULL}}, + [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + {"asyncq_wait", 2}, {NULL}}, + [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, + {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, + 
{"trim", 2}, {NULL}}, + +}; + +/* Shorthand - if "columns" field not set, default to 1 column */ +static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, + {"write"}, {NULL}}, + [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, + [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, + {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, + {"pend"}, {"activ"}, {NULL}}, + [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, + [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, + {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, +}; + +static const char *histo_to_title[] = { + [IOS_L_HISTO] = "latency", + [IOS_RQ_HISTO] = "req_size", +}; + +/* + * Return the number of labels in a null-terminated name_and_columns_t + * array. + * + */ +static unsigned int +label_array_len(const name_and_columns_t *labels) +{ + int i = 0; + + while (labels[i].name) + i++; + + return (i); +} + +/* + * Return the number of strings in a null-terminated string array. + * For example: + * + * const char foo[] = {"bar", "baz", NULL} + * + * returns 2 + */ +static uint64_t +str_array_len(const char *array[]) +{ + uint64_t i = 0; + while (array[i]) + i++; + + return (i); +} + + +/* + * Return a default column width for default/latency/queue columns. This does + * not include histograms, which have their columns autosized. + */ +static unsigned int +default_column_width(iostat_cbdata_t *cb, enum iostat_type type) +{ + unsigned long column_width = 5; /* Normal niceprint */ + static unsigned long widths[] = { + /* + * Choose some sane default column sizes for printing the + * raw numbers. + */ + [IOS_DEFAULT] = 15, /* 1PB capacity */ + [IOS_LATENCY] = 10, /* 1B ns = 10sec */ + [IOS_QUEUES] = 6, /* 1M queue entries */ + [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ + [IOS_RQ_HISTO] = 6, /* 1M queue entries */ + }; + + if (cb->cb_literal) + column_width = widths[type]; + + return (column_width); +} + +/* + * Print the column labels, i.e: + * + * capacity operations bandwidth + * alloc free read write read write ... + * + * If force_column_width is set, use it for the column width. If not set, use + * the default column width. + */ +void +print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, + const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) +{ + int i, idx, s; + int text_start, rw_column_width, spaces_to_end; + uint64_t flags = cb->cb_flags; + uint64_t f; + unsigned int column_width = force_column_width; + + /* For each bit set in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + if (!force_column_width) + column_width = default_column_width(cb, idx); + /* Print our top labels centered over "read write" label. */ + for (i = 0; i < label_array_len(labels[idx]); i++) { + const char *name = labels[idx][i].name; + /* + * We treat labels[][].columns == 0 as shorthand + * for one column. It makes writing out the label + * tables more concise. 
+ */ + unsigned int columns = MAX(1, labels[idx][i].columns); + unsigned int slen = strlen(name); + + rw_column_width = (column_width * columns) + + (2 * (columns - 1)); + + text_start = (int)((rw_column_width) / columns - + slen / columns); + if (text_start < 0) + text_start = 0; + + printf(" "); /* Two spaces between columns */ + + /* Space from beginning of column to label */ + for (s = 0; s < text_start; s++) + printf(" "); + + printf("%s", name); + + /* Print space after label to end of column */ + spaces_to_end = rw_column_width - text_start - slen; + if (spaces_to_end < 0) + spaces_to_end = 0; + + for (s = 0; s < spaces_to_end; s++) + printf(" "); + } + } +} + +/* + * Utility function to print out a line of dashes like: + * + * -------------------------------- ----- ----- ----- ----- ----- + * + * ...or a dashed named-row line like: + * + * logs - - - - - + * + * @cb: iostat data + * + * @force_column_width If non-zero, use the value as the column width. + * Otherwise use the default column widths. + * + * @name: Print a dashed named-row line starting + * with @name. Otherwise, print a regular + * dashed line. + */ +static void +print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *name) +{ + int i; + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + uint64_t f; + int idx; + const name_and_columns_t *labels; + const char *title; + + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + name ? strlen(name) : 0); + + + if (name) { + printf("%-*s", namewidth, name); + } else { + for (i = 0; i < namewidth; i++) + (void) printf("-"); + } + + /* For each bit in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + unsigned int column_width; + idx = lowbit64(f) - 1; + if (force_column_width) + column_width = force_column_width; + else + column_width = default_column_width(cb, idx); + + labels = iostat_bottom_labels[idx]; + for (i = 0; i < label_array_len(labels); i++) { + if (name) + printf(" %*s-", column_width - 1, " "); + else + printf(" %.*s", column_width, + "--------------------"); + } + } +} + + +static void +print_iostat_separator_impl(iostat_cbdata_t *cb, + unsigned int force_column_width) +{ + print_iostat_dashes(cb, force_column_width, NULL); +} + static void print_iostat_separator(iostat_cbdata_t *cb) { - int i = 0; + print_iostat_separator_impl(cb, 0); +} + +static void +print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *histo_vdev_name) +{ + unsigned int namewidth; + const char *title; + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + histo_vdev_name ? 
strlen(histo_vdev_name) : 0); + + if (histo_vdev_name) + printf("%-*s", namewidth, histo_vdev_name); + else + printf("%*s", namewidth, ""); - for (i = 0; i < cb->cb_namewidth; i++) - (void) printf("-"); - (void) printf(" ----- ----- ----- ----- ----- -----\n"); + + print_iostat_labels(cb, force_column_width, iostat_top_labels); + printf("\n"); + + printf("%-*s", namewidth, title); + + print_iostat_labels(cb, force_column_width, iostat_bottom_labels); + + printf("\n"); + + print_iostat_separator_impl(cb, force_column_width); + + printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { - (void) printf("%*s capacity operations bandwidth\n", - cb->cb_namewidth, ""); - (void) printf("%-*s alloc free read write read write\n", - cb->cb_namewidth, "pool"); - print_iostat_separator(cb); + print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void -print_one_stat(uint64_t value) +print_one_stat(uint64_t value, enum zfs_nicenum_format format, + unsigned int column_size, boolean_t scripted) { char buf[64]; - zfs_nicenum(value, buf, sizeof (buf)); - (void) printf(" %5s", buf); + zfs_nicenum_format(value, buf, sizeof (buf), format); + + if (scripted) + printf("\t%s", buf); + else + printf(" %*s", column_size, buf); +} + +/* + * Calculate the default vdev stats + * + * Subtract oldvs from newvs, apply a scaling factor, and save the resulting + * stats into calcvs. + */ +static void +calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, + vdev_stat_t *calcvs) +{ + int i; + + memcpy(calcvs, newvs, sizeof (*calcvs)); + for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) + calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); + + for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) + calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); +} + +/* + * Internal representation of the extended iostats data. + * + * The extended iostat stats are exported in nvlists as either uint64_t arrays + * or single uint64_t's. We make both look like arrays to make them easier + * to process. In order to make single uint64_t's look like arrays, we set + * __data to the stat data, and then set *data = &__data with count = 1. Then, + * we can just use *data and count. + */ +struct stat_array { + uint64_t *data; + uint_t count; /* Number of entries in data[] */ + uint64_t __data; /* Only used when data is a single uint64_t */ +}; + +static uint64_t +stat_histo_max(struct stat_array *nva, unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array64_max(nva[i].data, nva[i].count)); + + return (max); +} + +/* + * Helper function to lookup a uint64_t array or uint64_t value and store its + * data as a stat_array. If the nvpair is a single uint64_t value, then we make + * it look like a one element array to make it easier to process. + */ +static int +nvpair64_to_stat_array(nvlist_t *nvl, const char *name, + struct stat_array *nva) +{ + nvpair_t *tmp; + int ret; + + verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); + switch (nvpair_type(tmp)) { + case DATA_TYPE_UINT64_ARRAY: + ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); + break; + case DATA_TYPE_UINT64: + ret = nvpair_value_uint64(tmp, &nva->__data); + nva->data = &nva->__data; + nva->count = 1; + break; + default: + /* Not a uint64_t */ + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * Given a list of nvlist names, look up the extended stats in newnv and oldnv, + * subtract them, and return the results in a newly allocated stat_array. 
+ * You must free the returned array after you are done with it with + * free_calc_stats(). + * + * Additionally, you can set "oldnv" to NULL if you simply want the newnv + * values. + */ +static struct stat_array * +calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, + nvlist_t *newnv) +{ + nvlist_t *oldnvx = NULL, *newnvx; + struct stat_array *oldnva, *newnva, *calcnva; + int i, j; + unsigned int alloc_size = (sizeof (struct stat_array)) * len; + + /* Extract our extended stats nvlist from the main list */ + verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &newnvx) == 0); + if (oldnv) { + verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &oldnvx) == 0); + } + + newnva = safe_malloc(alloc_size); + oldnva = safe_malloc(alloc_size); + calcnva = safe_malloc(alloc_size); + + for (j = 0; j < len; j++) { + verify(nvpair64_to_stat_array(newnvx, names[j], + &newnva[j]) == 0); + calcnva[j].count = newnva[j].count; + alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); + calcnva[j].data = safe_malloc(alloc_size); + memcpy(calcnva[j].data, newnva[j].data, alloc_size); + + if (oldnvx) { + verify(nvpair64_to_stat_array(oldnvx, names[j], + &oldnva[j]) == 0); + for (i = 0; i < oldnva[j].count; i++) + calcnva[j].data[i] -= oldnva[j].data[i]; + } + } + free(newnva); + free(oldnva); + return (calcnva); +} + +static void +free_calc_stats(struct stat_array *nva, unsigned int len) +{ + int i; + for (i = 0; i < len; i++) + free(nva[i].data); + + free(nva); +} + +static void +print_iostat_histo(struct stat_array *nva, unsigned int len, + iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, + double scale) +{ + int i, j; + char buf[6]; + uint64_t val; + enum zfs_nicenum_format format; + unsigned int buckets; + unsigned int start_bucket; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + /* All these histos are the same size, so just use nva[0].count */ + buckets = nva[0].count; + + if (cb->cb_flags & IOS_RQ_HISTO_M) { + /* Start at 512 - req size should never be lower than this */ + start_bucket = 9; + } else { + start_bucket = 0; + } + + for (j = start_bucket; j < buckets; j++) { + /* Print histogram bucket label */ + if (cb->cb_flags & IOS_L_HISTO_M) { + /* Ending range of this bucket */ + val = (1ULL << (j + 1)) - 1; + zfs_nicetime(val, buf, sizeof (buf)); + } else { + /* Request size (starting range of bucket) */ + val = (1UL << j); + zfs_nicenum(val, buf, sizeof (buf)); + } + + if (cb->cb_scripted) + printf("%llu", (u_longlong_t)val); + else + printf("%-*s", namewidth, buf); + + /* Print the values on the line */ + for (i = 0; i < len; i++) { + print_one_stat(nva[i].data[j] * scale, format, + column_width, cb->cb_scripted); + } + printf("\n"); + } +} + +static void +print_solid_separator(unsigned int length) +{ + while (length--) + printf("-"); + printf("\n"); +} + +static void +print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale, const char *name) +{ + unsigned int column_width; + unsigned int namewidth; + unsigned int entire_width; + enum iostat_type type; + struct stat_array *nva; + const char **names; + unsigned int names_len; + + /* What type of histo are we? 
*/ + type = IOS_HISTO_IDX(cb->cb_flags); + + /* Get NULL-terminated array of nvlist names for our histo */ + names = vsx_type_to_nvlist[type]; + names_len = str_array_len(names); /* num of names */ + + nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); + + if (cb->cb_literal) { + column_width = MAX(5, + (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); + } else { + column_width = 5; + } + + namewidth = MAX(cb->cb_namewidth, + strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); + + /* + * Calculate the entire line width of what we're printing. The + * +2 is for the two spaces between columns: + */ + /* read write */ + /* ----- ----- */ + /* |___| <---------- column_width */ + /* */ + /* |__________| <--- entire_width */ + /* */ + entire_width = namewidth + (column_width + 2) * + label_array_len(iostat_bottom_labels[type]); + + if (cb->cb_scripted) + printf("%s\n", name); + else + print_iostat_header_impl(cb, column_width, name); + + print_iostat_histo(nva, names_len, cb, column_width, + namewidth, scale); + + free_calc_stats(nva, names_len); + if (!cb->cb_scripted) + print_solid_separator(entire_width); +} + +/* + * Calculate the average latency of a power-of-two latency histogram + */ +static uint64_t +single_histo_average(uint64_t *histo, unsigned int buckets) +{ + int i; + uint64_t count = 0, total = 0; + + for (i = 0; i < buckets; i++) { + /* + * Our buckets are power-of-two latency ranges. Use the + * midpoint latency of each bucket to calculate the average. + * For example: + * + * Bucket Midpoint + * 8ns-15ns: 12ns + * 16ns-31ns: 24ns + * ... + */ + if (histo[i] != 0) { + total += histo[i] * (((1UL << i) + ((1UL << i)/2))); + count += histo[i]; + } + } + + /* Prevent divide by zero */ + return (count == 0 ? 0 : total / count); +} + +static void +print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + }; + + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_QUEUES); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = nva[i].data[0]; + print_one_stat(val, format, column_width, cb->cb_scripted); + } + + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +static void +print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + }; + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_LATENCY); + enum zfs_nicenum_format 
format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAWTIME; + else + format = ZFS_NICENUM_TIME; + + /* Print our avg latencies on the line */ + for (i = 0; i < ARRAY_SIZE(names); i++) { + /* Compute average latency for a latency histo */ + val = single_histo_average(nva[i].data, nva[i].count); + print_one_stat(val, format, column_width, cb->cb_scripted); + } + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +/* + * Print default statistics (capacity/operations/bandwidth) + */ +static void +print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) +{ + unsigned int column_width = default_column_width(cb, IOS_DEFAULT); + enum zfs_nicenum_format format; + char na; /* char to print for "not applicable" values */ + + if (cb->cb_literal) { + format = ZFS_NICENUM_RAW; + na = '0'; + } else { + format = ZFS_NICENUM_1024; + na = '-'; + } + + /* only toplevel vdevs have capacity stats */ + if (vs->vs_space == 0) { + if (cb->cb_scripted) + printf("\t%c\t%c", na, na); + else + printf(" %*c %*c", column_width, na, column_width, + na); + } else { + print_one_stat(vs->vs_alloc, format, column_width, + cb->cb_scripted); + print_one_stat(vs->vs_space - vs->vs_alloc, format, + column_width, cb->cb_scripted); + } + + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); } static const char *class_name[] = { @@ -3162,21 +3967,27 @@ static const char *class_name[] = { * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. + * + * Returns the number of stat lines printed. */ -static void +unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; - uint_t c, children; - vdev_stat_t *oldvs, *newvs; + uint_t c, children, oldchildren; + vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; + int i; + int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) - return; + return (0); + + calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, @@ -3185,54 +3996,98 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, oldvs = &zerovs; } + /* Do we only want to see a specific vdev? */ + for (i = 0; i < cb->cb_vdev_names_count; i++) { + /* Yes we do. Is this the vdev? */ + if (strcmp(name, cb->cb_vdev_names[i]) == 0) { + /* + * This is our vdev. Since it is the only vdev we + * will be displaying, make depth = 0 so that it + * doesn't get indented. 
+ */ + depth = 0; + break; + } + } + + if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { + /* Couldn't match the name */ + goto children; + } + + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); - if (strlen(name) + depth > cb->cb_namewidth) - (void) printf("%*s%s", depth, "", name); - else - (void) printf("%*s%s%*s", depth, "", name, - (int)(cb->cb_namewidth - strlen(name) - depth), ""); + /* + * Print the vdev name unless it's is a histogram. Histograms + * display the vdev name in the header itself. + */ + if (!(cb->cb_flags & IOS_ANYHISTO_M)) { + if (cb->cb_scripted) { + printf("%s", name); + } else { + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - + depth), ""); + } + } + /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; - - if (tdelta == 0) - scale = 1.0; - else - scale = (double)NANOSEC / tdelta; - - /* only toplevel vdevs have capacity stats */ - if (newvs->vs_space == 0) { - (void) printf(" - -"); + if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { + /* + * If we specify printing histograms with no time interval, then + * print the histogram numbers over the entire lifetime of the + * vdev. + */ + scale = 1; } else { - print_one_stat(newvs->vs_alloc); - print_one_stat(newvs->vs_space - newvs->vs_alloc); + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; } - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - - oldvs->vs_ops[ZIO_TYPE_READ]))); + if (cb->cb_flags & IOS_DEFAULT_M) { + calc_default_iostats(oldvs, newvs, calcvs); + print_iostat_default(calcvs, cb, scale); + } + if (cb->cb_flags & IOS_LATENCY_M) + print_iostat_latency(cb, oldnv, newnv); + if (cb->cb_flags & IOS_QUEUES_M) + print_iostat_queues(cb, oldnv, newnv); + if (cb->cb_flags & IOS_ANYHISTO_M) { + printf("\n"); + print_iostat_histos(cb, oldnv, newnv, scale, name); + } - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - - oldvs->vs_ops[ZIO_TYPE_WRITE]))); + if (!(cb->cb_flags & IOS_ANYHISTO_M)) + printf("\n"); - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - - oldvs->vs_bytes[ZIO_TYPE_READ]))); + ret++; - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - - oldvs->vs_bytes[ZIO_TYPE_WRITE]))); +children: - (void) printf("\n"); + free(calcvs); if (!cb->cb_verbose) - return; + return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) - return; + return (ret); - if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, - &oldchild, &c) != 0) - return; + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } /* * print normal top-level devices @@ -3254,7 +4109,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + ret += print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } @@ -3264,6 +4119,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; + for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; @@ -3285,11 +4141,10 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, continue; if (!printed) { - if (!cb->cb_scripted) { - (void) printf( - "%-*s - - - -" - " - -", - cb->cb_namewidth, class_name[n]); + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && + !cb->cb_scripted && !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, + class_name[n]); } printf("\n"); printed = B_TRUE; @@ -3297,7 +4152,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } @@ -3309,23 +4164,33 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) - return; + return (ret); - if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, - &oldchild, &c) != 0) - return; + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } if (children > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "cache"); + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "cache"); + } + printf("\n"); + for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, - newchild[c], cb, depth + 2); + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] + : NULL, newchild[c], cb, depth + 2); free(vname); } } + + return (ret); } static int @@ -3355,6 +4220,7 @@ print_iostat(zpool_handle_t *zhp, void *data) iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; + int ret; newconfig = zpool_get_config(zhp, &oldconfig); @@ -3370,63 +4236,79 @@ print_iostat(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); - /* - * Print out the statistics for the pool. - */ - print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); - - if (cb->cb_verbose) + ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, + cb, 0); + if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && + !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); + printf("\n"); + } - return (0); + return (ret); } -int -get_namewidth(zpool_handle_t *zhp, void *data) +static int +get_columns(void) +{ + struct winsize ws; + int columns = 80; + int error; + + if (isatty(STDOUT_FILENO)) { + error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); + if (error == 0) + columns = ws.ws_col; + } else { + columns = 999; + } + + return (columns); +} + +/* + * Return the required length of the pool/vdev name column. The minimum + * allowed width and output formatting flags must be provided. 
+ */ +static int +get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { - iostat_cbdata_t *cb = data; nvlist_t *config, *nvroot; + int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if (!cb->cb_verbose) - cb->cb_namewidth = strlen(zpool_get_name(zhp)); - else - cb->cb_namewidth = max_width(zhp, nvroot, 0, - cb->cb_namewidth, cb->cb_name_flags); + unsigned int poolname_len = strlen(zpool_get_name(zhp)); + if (verbose == B_FALSE) { + width = MAX(poolname_len, min_width); + } else { + width = MAX(poolname_len, + max_width(zhp, nvroot, 0, min_width, flags)); + } } - /* - * The width must fall into the range [10,38]. The upper limit is the - * maximum we can have and still fit in 80 columns. - */ - if (cb->cb_namewidth < 10) - cb->cb_namewidth = 10; - if (cb->cb_namewidth > 38) - cb->cb_namewidth = 38; - - return (0); + return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void -get_interval_count(int *argcp, char **argv, unsigned long *iv, +get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; int argc = *argcp, errno; /* * Determine if the last argument is an integer or a pool name */ - if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -3452,12 +4334,12 @@ get_interval_count(int *argcp, char **argv, unsigned long *iv, * If the last argument is also an integer, then we have both a count * and an interval. */ - if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; - interval = strtoul(argv[argc - 1], &end, 10); + interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { @@ -3492,13 +4374,296 @@ get_timestamp_arg(char c) } /* - * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]] + * Return stat flags that are supported by all pools by both the module and + * zpool iostat. "*data" should be initialized to all 0xFFs before running. + * It will get ANDed down until only the flags that are supported on all pools + * remain. + */ +static int +get_stat_flags_cb(zpool_handle_t *zhp, void *data) +{ + uint64_t *mask = data; + nvlist_t *config, *nvroot, *nvx; + uint64_t flags = 0; + int i, j; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + /* Default stats are always supported, but for completeness.. */ + if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) + flags |= IOS_DEFAULT_M; + + /* Get our extended stats nvlist from the main list */ + if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, + &nvx) != 0) { + /* + * No extended stats; they're probably running an older + * module. No big deal, we support that too. 
+ */ + goto end; + } + + /* For each extended stat, make sure all its nvpairs are supported */ + for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { + if (!vsx_type_to_nvlist[j][0]) + continue; + + /* Start off by assuming the flag is supported, then check */ + flags |= (1ULL << j); + for (i = 0; vsx_type_to_nvlist[j][i]; i++) { + if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { + /* flag isn't supported */ + flags = flags & ~(1ULL << j); + break; + } + } + } +end: + *mask = *mask & flags; + return (0); +} + +/* + * Return a bitmask of stats that are supported on all pools by both the module + * and zpool iostat. + */ +static uint64_t +get_stat_flags(zpool_list_t *list) +{ + uint64_t mask = -1; + + /* + * get_stat_flags_cb() will lop off bits from "mask" until only the + * flags that are supported on all pools remain. + */ + (void) pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); + return (mask); +} + +/* + * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. + */ +static int +is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +{ + iostat_cbdata_t *cb = cb_data; + char *name; + + name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); + + if (strcmp(name, cb->cb_vdev_names[0]) == 0) + return (1); /* match */ + + return (0); +} + +/* + * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. + */ +static int +is_vdev(zpool_handle_t *zhp, void *cb_data) +{ + return (for_each_vdev(zhp, is_vdev_cb, cb_data)); +} + +/* + * Check if vdevs are in a pool + * + * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise + * return 0. If pool_name is NULL, then search all pools. + */ +static int +are_vdevs_in_pool(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + char **tmp_name; + int ret = 0; + int i; + int pool_count = 0; + + if ((argc == 0) || !*argv) + return (0); + + if (pool_name) + pool_count = 1; + + /* Temporarily hijack cb_vdev_names for a second... */ + tmp_name = cb->cb_vdev_names; + + /* Go though our list of prospective vdev names */ + for (i = 0; i < argc; i++) { + cb->cb_vdev_names = argv + i; + + /* Is this name a vdev in our pools? */ + ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, + is_vdev, cb); + if (!ret) { + /* No match */ + break; + } + } + + cb->cb_vdev_names = tmp_name; + + return (ret); +} + +static int +is_pool_cb(zpool_handle_t *zhp, void *data) +{ + char *name = data; + if (strcmp(name, zpool_get_name(zhp)) == 0) + return (1); + + return (0); +} + +/* + * Do we have a pool named *name? If so, return 1, otherwise 0. + */ +static int +is_pool(char *name) +{ + return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); +} + +/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ +static int +are_all_pools(int argc, char **argv) +{ + if ((argc == 0) || !*argv) + return (0); + + while (--argc >= 0) + if (!is_pool(argv[argc])) + return (0); + + return (1); +} + +/* + * Helper function to print out vdev/pool names we can't resolve. Used for an + * error message. 
+ */ +static void +error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + int i; + char *name; + char *str; + for (i = 0; i < argc; i++) { + name = argv[i]; + + if (is_pool(name)) + str = gettext("pool"); + else if (are_vdevs_in_pool(1, &name, pool_name, cb)) + str = gettext("vdev in this pool"); + else if (are_vdevs_in_pool(1, &name, NULL, cb)) + str = gettext("vdev in another pool"); + else + str = gettext("unknown"); + + fprintf(stderr, "\t%s (%s)\n", name, str); + } +} + +/* + * Same as get_interval_count(), but with additional checks to not misinterpret + * guids as interval/count values. Assumes VDEV_NAME_GUID is set in + * cb.cb_name_flags. + */ +static void +get_interval_count_filter_guids(int *argc, char **argv, float *interval, + unsigned long *count, iostat_cbdata_t *cb) +{ + char **tmpargv = argv; + int argc_for_interval = 0; + + /* Is the last arg an interval value? Or a guid? */ + if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { + /* + * The last arg is not a guid, so it's probably an + * interval value. + */ + argc_for_interval++; + + if (*argc >= 2 && + !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { + /* + * The 2nd to last arg is not a guid, so it's probably + * an interval value. + */ + argc_for_interval++; + } + } + + /* Point to our list of possible intervals */ + tmpargv = &argv[*argc - argc_for_interval]; + + *argc = *argc - argc_for_interval; + get_interval_count(&argc_for_interval, tmpargv, + interval, count); +} + +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. + */ +static void +fsleep(float sec) +{ + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + (void) nanosleep(&req, NULL); +} + +/* + * Set the minimum pool/vdev name column width. The width must be at least 10, + * but may be as large as the column width - 42 so it still fits on one line. + */ +static int +get_namewidth_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + int width, columns; + + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + columns = get_columns(); + + if (width < 10) + width = 10; + if (width > columns - 42) + width = columns - 42; + + cb->cb_namewidth = width; + + return (0); +} + +/* + * zpool iostat [-ghHLpPvy] [[-lq]|[-r|-w]] [-n name] [-T d|u] + * [[ pool ...]|[pool vdev ...]|[vdev ...]] + * [interval [count]] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs + * -h Display help + * -p Display values in parsable (exact) format. + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -l Display average latency + * -q Display queue depths + * -w Display latency histograms + * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format + * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. 
The bulk of this @@ -3512,16 +4677,29 @@ zpool_do_iostat(int argc, char **argv) int c; int ret; int npools; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; + int winheight = 24; + struct winsize win; zpool_list_t *list; boolean_t verbose = B_FALSE; + boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; + boolean_t queues = B_FALSE, parseable = B_FALSE, scripted = B_FALSE; + boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; + boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; + /* Used for printing error message */ + const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', + [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; + + uint64_t unsupported_flags; + /* check options */ - while ((c = getopt(argc, argv, "gLPT:v")) != -1) { + while ((c = getopt(argc, argv, "gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'g': guid = B_TRUE; @@ -3538,6 +4716,33 @@ zpool_do_iostat(int argc, char **argv) case 'v': verbose = B_TRUE; break; + case 'p': + parseable = B_TRUE; + break; + case 'l': + latency = B_TRUE; + break; + case 'q': + queues = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'w': + l_histo = B_TRUE; + break; + case 'r': + rq_histo = B_TRUE; + break; + case 'y': + omit_since_boot = B_TRUE; + break; + case 'n': + headers_once = B_TRUE; + break; + case 'h': + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3548,7 +4753,70 @@ zpool_do_iostat(int argc, char **argv) argc -= optind; argv += optind; - get_interval_count(&argc, argv, &interval, &count); + cb.cb_literal = parseable; + cb.cb_scripted = scripted; + + if (guid) + cb.cb_name_flags |= VDEV_NAME_GUID; + if (follow_links) + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + if (full_name) + cb.cb_name_flags |= VDEV_NAME_PATH; + cb.cb_iteration = 0; + cb.cb_namewidth = 0; + cb.cb_verbose = verbose; + + /* Get our interval and count values (if any) */ + if (guid) { + get_interval_count_filter_guids(&argc, argv, &interval, + &count, &cb); + } else { + get_interval_count(&argc, argv, &interval, &count); + } + + if (argc == 0) { + /* No args, so just print the defaults. */ + } else if (are_all_pools(argc, argv)) { + /* All the args are pool names */ + } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { + /* All the args are vdevs */ + cb.cb_vdev_names = argv; + cb.cb_vdev_names_count = argc; + argc = 0; /* No pools to process */ + } else if (are_all_pools(1, argv)) { + /* The first arg is a pool name */ + if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { + /* ...and the rest are vdev names */ + cb.cb_vdev_names = argv + 1; + cb.cb_vdev_names_count = argc - 1; + argc = 1; /* One pool to process */ + } else { + fprintf(stderr, gettext("Expected either a list of ")); + fprintf(stderr, gettext("pools, or list of vdevs in")); + fprintf(stderr, " \"%s\", ", argv[0]); + fprintf(stderr, gettext("but got:\n")); + error_list_unresolved_vdevs(argc - 1, argv + 1, + argv[0], &cb); + fprintf(stderr, "\n"); + usage(B_FALSE); + return (1); + } + } else { + /* + * The args don't make sense. The first arg isn't a pool name, + * nor are all the args vdevs. + */ + fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); + fprintf(stderr, "\n"); + return (1); + } + + if (cb.cb_vdev_names_count != 0) { + /* + * If user specified vdevs, it implies verbose. 
+ */ + cb.cb_verbose = B_TRUE; + } /* * Construct the list of all interesting pools. @@ -3568,60 +4836,156 @@ zpool_do_iostat(int argc, char **argv) return (1); } + if ((l_histo || rq_histo) && (queues || latency)) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("[-r|-w] isn't allowed with [-q|-l]\n")); + usage(B_FALSE); + return (1); + } + + if (l_histo && rq_histo) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("Only one of [-r|-w] can be passed at a time\n")); + usage(B_FALSE); + return (1); + } + /* * Enter the main iostat loop. */ cb.cb_list = list; - cb.cb_verbose = verbose; - if (guid) - cb.cb_name_flags |= VDEV_NAME_GUID; - if (follow_links) - cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; - if (full_name) - cb.cb_name_flags |= VDEV_NAME_PATH; - cb.cb_iteration = 0; - cb.cb_namewidth = 0; - for (;;) { - pool_list_update(list); + if (l_histo) { + /* + * Histograms tables look out of place when you try to display + * them with the other stats, so make a rule that you can only + * print histograms by themselves. + */ + cb.cb_flags = IOS_L_HISTO_M; + } else if (rq_histo) { + cb.cb_flags = IOS_RQ_HISTO_M; + } else { + cb.cb_flags = IOS_DEFAULT_M; + if (latency) + cb.cb_flags |= IOS_LATENCY_M; + if (queues) + cb.cb_flags |= IOS_QUEUES_M; + } + /* + * See if the module supports all the stats we want to display. + */ + unsupported_flags = cb.cb_flags & ~get_stat_flags(list); + if (unsupported_flags) { + uint64_t f; + int idx; + fprintf(stderr, + gettext("The loaded zfs module doesn't support:")); + + /* for each bit set in unsupported_flags */ + for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + fprintf(stderr, " -%c", flag_to_arg[idx]); + } + + fprintf(stderr, ". Try running a newer module.\n"), + pool_list_free(list); + + return (1); + } + + for (;;) { if ((npools = pool_list_count(list)) == 0) - break; + (void) fprintf(stderr, gettext("no pools available\n")); + else { + /* + * If this is the first iteration and -y was supplied + * we skip any printing. + */ + boolean_t skip = (omit_since_boot && + cb.cb_iteration == 0); - /* - * Refresh all statistics. This is done as an explicit step - * before calculating the maximum name width, so that any - * configuration changes are properly accounted for. - */ - (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); + /* + * Refresh all statistics. This is done as an + * explicit step before calculating the maximum name + * width, so that any configuration changes are + * properly accounted for. + */ + (void) pool_list_iter(list, B_FALSE, refresh_iostat, + &cb); - /* - * Iterate over all pools to determine the maximum width - * for the pool / device name column across all pools. - */ - cb.cb_namewidth = 0; - (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); + /* + * Iterate over all pools to determine the maximum width + * for the pool / device name column across all pools. + */ + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, + get_namewidth_iostat, &cb); - if (timestamp_fmt != NODATE) - print_timestamp(timestamp_fmt); + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); - /* - * If it's the first time, or verbose mode, print the header. - */ - if (++cb.cb_iteration == 1 || verbose) - print_iostat_header(&cb); + /* + * Check terminal size so we can print headers + * even when terminal window has its height + * changed. 
+ */ + if (headers_once == B_FALSE) { + if (ioctl(1, TIOCGWINSZ, &win) != -1) { + if (win.ws_row <= 0) { + headers_once = B_TRUE; + } else { + winheight = win.ws_row; + } + } + } + /* + * Are we connected to TTY? If not, headers_once + * should be true, to avoid breaking scripts. + */ + if (isatty(fileno(stdout)) == 0) + headers_once = B_TRUE; - (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); + /* + * If it's the first time and we're not skipping it, + * or either skip or verbose mode, print the header. + * + * The histogram code explicitly prints its header on + * every vdev, so skip this for histograms. + */ + if (((++cb.cb_iteration == 1 && !skip) || + (skip != verbose) || + (!headers_once && + (cb.cb_iteration % winheight) == 0)) && + (!(cb.cb_flags & IOS_ANYHISTO_M)) && + !cb.cb_scripted) + print_iostat_header(&cb); + + if (skip) { + (void) fsleep(interval); + continue; + } - /* - * If there's more than one pool, and we're not in verbose mode - * (which prints a separator for us), then print a separator. - */ - if (npools > 1 && !verbose) - print_iostat_separator(&cb); + (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); - if (verbose) - (void) printf("\n"); + /* + * If there's more than one pool, and we're not in + * verbose mode (which prints a separator for us), + * then print a separator. + * + * In addition, if we're printing specific vdevs then + * we also want an ending separator. + */ + if (((npools > 1 && !verbose && + !(cb.cb_flags & IOS_ANYHISTO_M)) || + (!(cb.cb_flags & IOS_ANYHISTO_M) && + cb.cb_vdev_names_count)) && + !cb.cb_scripted) { + print_iostat_separator(&cb); + } + } /* * Flush the output so that redirection to a file isn't buffered @@ -3635,7 +4999,7 @@ zpool_do_iostat(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } pool_list_free(list); @@ -3992,6 +5356,27 @@ list_callback(zpool_handle_t *zhp, void *data) } /* + * Set the minimum pool/vdev name column width. The width must be at least 9, + * but may be as large as needed. + */ +static int +get_namewidth_list(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cb = data; + int width; + + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + + if (width < 9) + width = 9; + + cb->cb_namewidth = width; + + return (0); +} + +/* * zpool list [-gHLP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. 
@@ -4018,7 +5403,8 @@ zpool_do_list(int argc, char **argv) "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; @@ -4079,7 +5465,7 @@ zpool_do_list(int argc, char **argv) break; cb.cb_namewidth = 0; - (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); + (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); @@ -4097,7 +5483,7 @@ zpool_do_list(int argc, char **argv) break; pool_list_free(list); - (void) sleep(interval); + (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { @@ -5942,6 +7328,9 @@ status_callback(zpool_handle_t *zhp, void *data) cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); + if (cbp->cb_print_slow_ios) + (void) printf(" %5s", gettext("SLOW")); + print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE); @@ -6003,12 +7392,14 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-igLPtvx] [-T d|u] [pool] ... [interval [count]] + * zpool status [-igLpPstvx] [-T d|u] [pool] ... [interval [count]] * * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. + * -p Display values in parsable (exact) format. * -P Display full path for vdev name. + * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) @@ -6022,11 +7413,12 @@ zpool_do_status(int argc, char **argv) { int c; int ret; - unsigned long interval = 0, count = 0; + float interval = 0; + unsigned long count = 0; status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "igLPvxDtT:")) != -1) { + while ((c = getopt(argc, argv, "igLpPsvxDtT:")) != -1) { switch (c) { case 'i': cb.cb_print_vdev_init = B_TRUE; @@ -6037,9 +7429,15 @@ zpool_do_status(int argc, char **argv) case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; + case 'p': + cb.cb_literal = B_TRUE; + break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; + case 's': + cb.cb_print_slow_ios = B_TRUE; + break; case 'v': cb.cb_verbose = B_TRUE; break; @@ -6094,7 +7492,7 @@ zpool_do_status(int argc, char **argv) if (count != 0 && --count == 0) break; - (void) sleep(interval); + (void) fsleep(interval); } return (0); diff --git a/usr/src/cmd/zpool/zpool_util.c b/usr/src/cmd/zpool/zpool_util.c index c7a002efb1..e4281af210 100644 --- a/usr/src/cmd/zpool/zpool_util.c +++ b/usr/src/cmd/zpool/zpool_util.c @@ -29,6 +29,8 @@ #include <stdio.h> #include <stdlib.h> #include <strings.h> +#include <ctype.h> +#include <sys/sysmacros.h> #include "zpool_util.h" @@ -84,3 +86,29 @@ num_logs(nvlist_t *nv) } return (nlogs); } + +/* Find the max element in an array of uint64_t values */ +uint64_t +array64_max(uint64_t array[], unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array[i]); + + return (max); +} + +/* + * Return 1 if "str" is a number string, 0 otherwise. Works for integer and + * floating point numbers. 
+ */ +int +isnumber(char *str) +{ + for (; *str; str++) + if (!(isdigit(*str) || (*str == '.'))) + return (0); + + return (1); +} diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h index 3aeb9b5431..e4c93acf39 100644 --- a/usr/src/cmd/zpool/zpool_util.h +++ b/usr/src/cmd/zpool/zpool_util.h @@ -38,6 +38,10 @@ extern "C" { void *safe_malloc(size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); +uint64_t array64_max(uint64_t array[], unsigned int len); +int highbit64(uint64_t i); +int lowbit64(uint64_t i); +int isnumber(char *str); /* * Virtual device functions @@ -55,6 +59,10 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, zpool_iter_f, void *); +/* Vdev list functions */ +typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); +int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); + typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); diff --git a/usr/src/lib/libzfs/common/libzfs_config.c b/usr/src/lib/libzfs/common/libzfs_config.c index b33d86432d..e6c7ae025d 100644 --- a/usr/src/lib/libzfs/common/libzfs_config.c +++ b/usr/src/lib/libzfs/common/libzfs_config.c @@ -312,21 +312,9 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zhp->zpool_config_size = zc.zc_nvlist_dst_size; if (zhp->zpool_config != NULL) { - uint64_t oldtxg, newtxg; - - verify(nvlist_lookup_uint64(zhp->zpool_config, - ZPOOL_CONFIG_POOL_TXG, &oldtxg) == 0); - verify(nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0); - nvlist_free(zhp->zpool_old_config); - if (oldtxg != newtxg) { - nvlist_free(zhp->zpool_config); - zhp->zpool_old_config = NULL; - } else { - zhp->zpool_old_config = zhp->zpool_config; - } + zhp->zpool_old_config = zhp->zpool_config; } zhp->zpool_config = config; diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index f1784dae9c..2986fc1b8c 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -3945,10 +3945,18 @@ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, int name_flags) { - char *path, *env; + char *path, *type, *env; uint64_t value; char buf[64]; + /* + * vdev_name will be "root"/"root-0" for the root vdev, but it is the + * zpool name that will be displayed to the user. + */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (zhp != NULL && strcmp(type, "root") == 0) + return (zfs_strdup(hdl, zpool_get_name(zhp))); + env = getenv("ZPOOL_VDEV_NAME_PATH"); if (env && (strtoul(env, NULL, 0) > 0 || !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) @@ -4066,7 +4074,7 @@ after_open: return (tmp); } } else { - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); + path = type; /* * If it's a raidz device, we need to stick in the parity level. diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c index 0d43302861..d26955d83d 100644 --- a/usr/src/lib/libzpool/common/kernel.c +++ b/usr/src/lib/libzpool/common/kernel.c @@ -422,6 +422,83 @@ kobj_get_filesize(struct _buf *file, uint64_t *size) /* * ========================================================================= + * misc routines + * ========================================================================= + */ + +/* + * Find lowest one bit set. 
+ * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * This is basically a reimplementation of ffsll(), which is GNU specific. + */ +int +lowbit64(uint64_t i) +{ + register int h = 64; + if (i == 0) + return (0); + + if (i & 0x00000000ffffffffULL) + h -= 32; + else + i >>= 32; + + if (i & 0x0000ffff) + h -= 16; + else + i >>= 16; + + if (i & 0x00ff) + h -= 8; + else + i >>= 8; + + if (i & 0x0f) + h -= 4; + else + i >>= 4; + + if (i & 0x3) + h -= 2; + else + i >>= 2; + + if (i & 0x1) + h -= 1; + + return (h); +} + +int +highbit64(uint64_t i) +{ + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +/* + * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c index 8525b5f299..65ffa91ebb 100644 --- a/usr/src/lib/libzpool/common/util.c +++ b/usr/src/lib/libzpool/common/util.c @@ -23,6 +23,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017, Intel Corporation. + * Copyright 2020 Joyent, Inc. */ #include <assert.h> @@ -48,7 +49,7 @@ static void show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) { vdev_stat_t *vs; - vdev_stat_t v0 = { 0 }; + vdev_stat_t *v0 = { 0 }; uint64_t sec; uint64_t is_log = 0; nvlist_t **child; @@ -56,6 +57,8 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) char used[6], avail[6]; char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; + v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL); + if (indent == 0 && desc != NULL) { (void) printf(" " " capacity operations bandwidth ---- errors ----\n"); @@ -72,7 +75,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) &bias); if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) - vs = &v0; + vs = v0; if (bias != NULL) { (void) snprintf(bias_suffix, sizeof (bias_suffix), @@ -105,6 +108,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", rops, wops, rbytes, wbytes, rerr, werr, cerr); } + umem_free(v0, sizeof (*v0)); if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) return; diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m index e8f10ea5b2..fdbf7e741b 100644 --- a/usr/src/man/man1m/zpool.1m +++ b/usr/src/man/man1m/zpool.1m @@ -23,8 +23,8 @@ .\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Datto Inc. -.\" Copyright (c) 2017 George Melikov. All Rights Reserved. -.\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2020 Joyent, Inc. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" .Dd August 30, 2019 @@ -116,10 +116,10 @@ .Op Ar device Ns ... .Nm .Cm iostat -.Op Fl v +.Op Oo Fl lq Oc | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d -.Op Fl gLP -.Oo Ar pool Oc Ns ... +.Op Fl ghHLnpPvy +.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... 
Oc Oc
.Op Ar interval Op Ar count
.Nm
.Cm labelclear
@@ -184,7 +184,7 @@
.Ar pool newpool
.Nm
.Cm status
-.Op Fl DigLPtvx
+.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -1606,25 +1606,48 @@ with no flags on the relevant target devices.
.It Xo
.Nm
.Cm iostat
+.Op Oo Fl lq Oc | Ns Fl rw
.Op Fl T Sy u Ns | Ns Sy d
-.Op Fl gLPv
-.Oo Ar pool Oc Ns ...
+.Op Fl ghHLnpPvy
+.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc
.Op Ar interval Op Ar count
.Xc
-Displays I/O statistics for the given pools.
+Displays I/O statistics for the given pools/vdevs.
+Physical I/Os may be observed via
+.Xr iostat 1 .
+If writes are located nearby, they may be merged into a single larger operation.
+Additional I/O may be generated depending on the level of vdev redundancy.
+To filter output, you may pass in a list of pools, a pool and list of vdevs
+in that pool, or a list of any vdevs from any pool.
+If no items are specified, statistics for every pool in the system are shown.
When given an
.Ar interval ,
the statistics are printed every
.Ar interval
seconds until ^C is pressed.
-If no
-.Ar pool Ns s
-are specified, statistics for every pool in the system is shown.
+If the
+.Fl n
+flag is specified, the headers are displayed only once; otherwise they are
+displayed periodically.
If
.Ar count
is specified, the command exits after
.Ar count
reports are printed.
+The first report printed is always the statistics since boot regardless of
+whether
+.Ar interval
+and
+.Ar count
+are passed.
+Also note that the units of
+.Sy K ,
+.Sy M ,
+.Sy G ...
+that are printed in the report are in base 1024.
+To get the raw values, use the
+.Fl p
+flag.
.Bl -tag -width Ds
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
@@ -1644,20 +1667,99 @@ Display vdev initialization status.
Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool
detach/offline/remove/replace commands.
+.It Fl H
+Scripted mode.
+Do not display headers, and separate fields by a single tab instead of
+arbitrary space.
.It Fl L
Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
-.Pa /dev/disk/
+.Pa /dev/dsk/
path used to open it.
+.It Fl n
+Print headers only once when passed.
+.It Fl p
+Display numbers in parsable (exact) values.
+Time values are in nanoseconds.
.It Fl P
Display full paths for vdevs instead of only the last component of the path.
This can be used in conjunction with the
.Fl L
flag.
+.It Fl r
+Print request size histograms for the leaf vdev's IO.
+This includes histograms of individual IOs (ind) and aggregate IOs (agg).
+These stats can be useful for observing how well IO aggregation is working.
+Note that TRIM IOs may exceed 16M, but will be counted as 16M.
.It Fl v
Verbose statistics.
Reports usage statistics for individual vdevs within the pool, in addition to
the pool-wide statistics.
+.It Fl y
+Omit statistics since boot.
+Normally the first line of output reports the statistics since boot.
+This option suppresses that first line of output.
+.It Fl w
+Display latency histograms:
+.Pp
+.Ar total_wait :
+Total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Amount of time IO spent in synchronous priority queues.
+Does not include disk time.
+.Ar asyncq_wait :
+Amount of time IO spent in asynchronous priority queues.
+Does not include disk time.
+.Ar scrub :
+Amount of time IO spent in scrub queue.
+Does not include disk time.
+.It Fl l
+Include average latency statistics:
+.Pp
+.Ar total_wait :
+Average total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Average disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Average amount of time IO spent in synchronous priority queues.
+Does not include disk time.
+.Ar asyncq_wait :
+Average amount of time IO spent in asynchronous priority queues.
+Does not include disk time.
+.Ar scrub :
+Average queuing time in scrub queue.
+Does not include disk time.
+.Ar trim :
+Average queuing time in trim queue.
+Does not include disk time.
+.It Fl q
+Include active queue statistics.
+Each priority queue has both pending (
+.Ar pend )
+and active (
+.Ar activ )
+IOs.
+Pending IOs are waiting to be issued to the disk, and active IOs have been
+issued to disk and are waiting for completion.
+These stats are broken out by priority queue:
+.Pp
+.Ar syncq_read/write :
+Current number of entries in synchronous priority
+queues.
+.Ar asyncq_read/write :
+Current number of entries in asynchronous priority queues.
+.Ar scrubq_read :
+Current number of entries in scrub queue.
+.Ar trimq_write :
+Current number of entries in trim queue.
+.Pp
+All queue statistics are instantaneous measurements of the number of
+entries in the queues.
+If you specify an interval, the measurements will be sampled from the end of
+the interval.
.El
.It Xo
.Nm
@@ -1731,12 +1833,12 @@ flag.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
-.Fl u
+.Sy u
for a printed representation of the internal representation of time.
See
.Xr time 2 .
Specify
-.Fl d
+.Sy d
for standard date format.
See
.Xr date 1 .
@@ -2021,7 +2123,7 @@ and automatically import it.
.It Xo
.Nm
.Cm status
-.Op Fl DigLPtvx
+.Op Fl DigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -2054,23 +2156,33 @@ Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
+.It Fl p
+Display numbers in parsable (exact) values.
.It Fl P
Display full paths for vdevs instead of only the last component of the path.
This can be used in conjunction with the
.Fl L
flag.
+.It Fl s
+Display the number of leaf VDEV slow IOs.
+This is the number of IOs that didn't complete in
+.Sy zio_slow_io_ms
+milliseconds (default 30 seconds).
+This does not necessarily mean the IOs failed to complete, just that they
+took an unreasonably long time.
+This may indicate a problem with the underlying storage.
.It Fl t
Display vdev TRIM status.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
-.Fl u
+.Sy u
for a printed representation of the internal representation of time.
See
.Xr time 2 .
Specify
-.Fl d
+.Sy d
for standard date format.
See
.Xr date 1 .
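The SLOW column described above is backed by the vs_slow_ios counter (cleared in vdev_clear() later in this patch) and the io_delay field added to struct zio. The code that increments the counter is not visible in this excerpt; the following is only a hedged sketch of how an I/O completion path might classify a slow IO, assuming io_delay holds the device access time in nanoseconds and zio_slow_io_ms is the tunable named in the man page:

    /*
     * Sketch only, not the committed implementation: count an IO as
     * "slow" when its device access time exceeds the zio_slow_io_ms
     * tunable (30 seconds by default, per the man page text above).
     */
    if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms))
            vs->vs_slow_ios++;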
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index d4432b45a1..693f83af3e 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -112,6 +112,7 @@ dir path=opt/zfs-tests/tests/functional/ctime dir path=opt/zfs-tests/tests/functional/delegate dir path=opt/zfs-tests/tests/functional/devices dir path=opt/zfs-tests/tests/functional/exec +dir path=opt/zfs-tests/tests/functional/fault dir path=opt/zfs-tests/tests/functional/features dir path=opt/zfs-tests/tests/functional/features/async_destroy dir path=opt/zfs-tests/tests/functional/grow_pool @@ -209,6 +210,7 @@ file path=opt/zfs-tests/include/default.cfg mode=0444 file path=opt/zfs-tests/include/libtest.shlib mode=0444 file path=opt/zfs-tests/include/math.shlib mode=0444 file path=opt/zfs-tests/include/properties.shlib mode=0444 +file path=opt/zfs-tests/include/zpool_script.shlib mode=0444 file path=opt/zfs-tests/runfiles/delphix.run mode=0444 file path=opt/zfs-tests/runfiles/longevity.run mode=0444 file path=opt/zfs-tests/runfiles/omnios.run mode=0444 @@ -2375,6 +2377,9 @@ file \ file \ path=opt/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg \ mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos \ + mode=0555 file path=opt/zfs-tests/tests/functional/cli_user/zpool_list/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/cli_user/zpool_list/setup mode=0555 file \ @@ -2443,6 +2448,10 @@ file path=opt/zfs-tests/tests/functional/exec/exec_001_pos mode=0555 file path=opt/zfs-tests/tests/functional/exec/exec_002_neg mode=0555 file path=opt/zfs-tests/tests/functional/exec/mmap_exec mode=0555 file path=opt/zfs-tests/tests/functional/exec/setup mode=0555 +file path=opt/zfs-tests/tests/functional/fault/cleanup mode=0555 +file path=opt/zfs-tests/tests/functional/fault/fault.cfg mode=0444 +file path=opt/zfs-tests/tests/functional/fault/setup mode=0555 +file path=opt/zfs-tests/tests/functional/fault/zpool_status_-s mode=0555 file \ path=opt/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos \ mode=0555 diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index 725c971a4c..2edf9123ab 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -26,7 +26,7 @@ # Copyright (c) 2017 by Tim Chase. All rights reserved. # Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2017 Datto Inc. -# Copyright 2019 Joyent, Inc. +# Copyright 2020 Joyent, Inc. # . ${STF_TOOLS}/contrib/include/logapi.shlib @@ -1568,7 +1568,7 @@ function get_disklist # pool disklist=$(zpool iostat -v $1 | nawk '(NR >4) {print $1}' | \ grep -v "\-\-\-\-\-" | \ - egrep -v -e "^(mirror|raidz1|raidz2|spare|log|cache)$") + egrep -v -e "^(mirror|raidz[1-3]|spare|log|cache|special|dedup)$") echo $disklist } diff --git a/usr/src/test/zfs-tests/include/zpool_script.shlib b/usr/src/test/zfs-tests/include/zpool_script.shlib new file mode 100644 index 0000000000..25daa3f06a --- /dev/null +++ b/usr/src/test/zfs-tests/include/zpool_script.shlib @@ -0,0 +1,49 @@ +# +# Common functions used by the zpool_status and zpool_iostat tests for running +# scripts with the -c option. +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +function test_zpool_script { + script="$1" + testpool="$2" + cmd="$3" + wholecmd="$cmd $script $testpool" + out="$($wholecmd)" + + # Default number of columns that get printed without -c + if echo "$cmd" | grep -q iostat ; then + # iostat + dcols=7 + else + + # status + dcols=5 + fi + + # Get the new column name that the script created + col="$(echo "$out" | \ + awk '/^pool +alloc +free +read +write +/ {print $8} \ + /NAME +STATE +READ +WRITE +CKSUM/ {print $6}')" + + if [ -z "$col" ] ; then + log_fail "'$wholecmd' created no new columns" + fi + + # Count the number of columns for each vdev. Each script should produce + # at least one new column value. Even if scripts return blank, zpool + # will convert the blank to a '-' to make things awk-able. Normal + # zpool iostat -v output is 7 columns, so if the script ran correctly + # we should see more than that. + if ! newcols=$(echo "$out" | \ + awk '/\/dev/{print NF-'$dcols'; if (NF <= '$dcols') {exit 1}}' | \ + head -n 1) ; \ + then + log_fail "'$wholecmd' didn't create a new column value" + else + log_note "'$wholecmd' passed ($newcols new columns)" + fi +} diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index beb23d9d82..ef1c80efcc 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -430,7 +430,7 @@ user = [/opt/zfs-tests/tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] user = [/opt/zfs-tests/tests/functional/cli_user/zpool_list] @@ -459,6 +459,9 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] [/opt/zfs-tests/tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] +[/opt/zfs-tests/tests/functional/fault] +tests = ['zpool_status_-s'] + [/opt/zfs-tests/tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index af9f29e8ca..010a16af3a 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -429,7 +429,7 @@ user = [/opt/zfs-tests/tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] user = [/opt/zfs-tests/tests/functional/cli_user/zpool_list] @@ -458,6 +458,9 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] [/opt/zfs-tests/tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] +[/opt/zfs-tests/tests/functional/fault] +tests = ['zpool_status_-s'] + [/opt/zfs-tests/tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 78923b6a1f..71bad54e0f 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -429,7 +429,7 @@ user = [/opt/zfs-tests/tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] user = [/opt/zfs-tests/tests/functional/cli_user/zpool_list] @@ -461,6 +461,9 @@ tests = ['exec_001_pos', 'exec_002_neg'] [/opt/zfs-tests/tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] 
+[/opt/zfs-tests/tests/functional/fault] +tests = ['zpool_status_-s'] + [/opt/zfs-tests/tests/functional/grow_pool] tests = ['grow_pool_001_pos'] diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index 52fbf045dc..7b1c27b214 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -377,7 +377,7 @@ user = [/opt/zfs-tests/tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] user = [/opt/zfs-tests/tests/functional/cli_user/zpool_list] @@ -397,6 +397,9 @@ tests = ['devices_003_pos'] [/opt/zfs-tests/tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] +[/opt/zfs-tests/tests/functional/fault] +tests = ['zpool_status_-s'] + [/opt/zfs-tests/tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] diff --git a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh index 0c631e0eea..78d40ce56d 100755 --- a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh @@ -14,7 +14,6 @@ # # Copyright (c) 2017, Intel Corporation. # Copyright (c) 2018 by Delphix. All rights reserved. -# Copyright 2019 Joyent, Inc. # . $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib @@ -39,17 +38,17 @@ do if [ "$type" = "mirror" ]; then log_must zpool add $TESTPOOL special mirror \ $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 - log_must zpool clear $TESTPOOL $CLASS_DISK0 - log_must zpool clear $TESTPOOL $CLASS_DISK1 - log_must zpool clear $TESTPOOL $CLASS_DISK2 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 elif [ "$type" = "raidz" ]; then log_must zpool add $TESTPOOL special mirror \ $CLASS_DISK0 $CLASS_DISK1 - log_must zpool clear $TESTPOOL $CLASS_DISK0 - log_must zpool clear $TESTPOOL $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 else log_must zpool add $TESTPOOL special $CLASS_DISK0 - log_must zpool clear $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 fi log_must zpool destroy -f $TESTPOOL diff --git a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh index b871034394..106a6d933a 100755 --- a/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh @@ -14,7 +14,6 @@ # # Copyright (c) 2017, Intel Corporation. # Copyright (c) 2018 by Delphix. All rights reserved. -# Copyright 2019 Joyent, Inc. # . 
$STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
@@ -36,8 +35,7 @@ log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \
 special mirror $CLASS_DISK0 $CLASS_DISK1
log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
log_must sleep 10
-log_must zpool status $TESTPOOL | nawk -v dev=$CLASS_DISK2 \
- 'BEGIN {res=1} {if ($1 == dev) res=0} END {exit res}'
+log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
log_must zpool destroy -f $TESTPOOL
log_pass $claim
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
index c32f72b504..ff06248588 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
@@ -343,6 +343,14 @@ function get_last_txg_synced
{
 typeset pool=$1
+ if is_linux; then
+ txg=$(tail "/proc/spl/kstat/zfs/$pool/txgs" |
+ awk '$3=="C" {print $1}' | tail -1)
+ [[ "$txg" ]] || txg=0
+ echo $txg
+ return 0
+ fi
+
 typeset spas
 spas=$(mdb -k -e "::spa")
 [[ $? -ne 0 ]] && return 1
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
index 9c2fb74ed4..b605ceb8ee 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh
@@ -33,4 +33,4 @@
DISK=${DISKS%% *}
-default_setup $DISK
+default_raidz_setup $DISKS
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
index 903ea1c5f7..c67031780b 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
@@ -33,13 +33,13 @@
#
# DESCRIPTION:
-# Verify that 'zpool iostat [interval [count]' can be executed as non-root.
+# Verify that 'zpool iostat [interval [count]]' can be executed as non-root.
#
# STRATEGY:
# 1. Set the interval to 1 and count to 4.
# 2. Sleep for 4 seconds.
# 3. Verify that the output has 4 records.
-#
+# 4. Set interval to 0.5 and count to 1 to test floating point intervals.
verify_runnable "both"
@@ -68,4 +68,7 @@
if [[ $stat_count -ne 4 ]]; then
 log_fail "zpool iostat [pool_name] [interval] [count] failed"
fi
+# Test a floating point interval value
+log_must zpool iostat -v 0.5 1
+
log_pass "zpool iostat [pool_name ...] [interval] [count] passed"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
index 0b4a87f66e..c42ddf000f 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh
@@ -51,13 +51,15 @@
else
 fi
set -A args "" "-?"
"-f" "nonexistpool" "$TESTPOOL/$TESTFS" \
- "$testpool 1.23" "$testpool 0" "$testpool -1" "$testpool 1 0" \
- "$testpool 0 0"
+ "$testpool 0" "$testpool -1" "$testpool 1 0" \
+ "$testpool 0 0" "$testpool -wl" "$testpool -wq" "$testpool -wr" \
+ "$testpool -rq" "$testpool -lr"
log_assert "Executing 'zpool iostat' with bad options fails"
typeset -i i=1
while [[ $i -lt ${#args[*]} ]]; do
+ log_note "doing $ZPOOL iostat ${args[i]}"
 log_mustnot zpool iostat ${args[i]}
 ((i = i + 1))
done
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
new file mode 100755
index 0000000000..25d1c845b6
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh
@@ -0,0 +1,76 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+# Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Executing 'zpool iostat' command with various combinations of extended
+# stats (-lqwr), parsable/script options (-pH), and misc lists of pools
+# and vdevs.
+#
+# STRATEGY:
+# 1. Create an array of mixed 'zpool iostat' options.
+# 2. Execute each element of the array.
+# 3. Verify the command succeeds.
+#
+
+verify_runnable "both"
+
+typeset testpool
+if is_global_zone ; then
+ testpool=$TESTPOOL
+else
+ testpool=${TESTPOOL%%/*}
+fi
+
+set -A args "" "-v" "-q" "-l" "-lq $TESTPOOL" "-ql ${DISKS[0]} ${DISKS[1]}" \
+ "-w $TESTPOOL ${DISKS[0]} ${DISKS[1]}" \
+ "-wp $TESTPOOL" \
+ "-qlH $TESTPOOL ${DISKS[0]}" \
+ "-vpH ${DISKS[0]}" \
+ "-wpH ${DISKS[0]}" \
+ "-r ${DISKS[0]}" \
+ "-rpH ${DISKS[0]}"
+
+log_assert "Executing 'zpool iostat' with extended stat options succeeds"
+log_note "testpool: $TESTPOOL, disks $DISKS"
+
+typeset -i i=1
+while [[ $i -lt ${#args[*]} ]]; do
+ log_note "doing $ZPOOL iostat ${args[i]}"
+ log_must zpool iostat ${args[i]}
+ ((i = i + 1))
+done
+
+log_pass "Executing 'zpool iostat' with extended stat options succeeds"
diff --git a/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib b/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
index 24f7c7e018..a68e22038f 100644
--- a/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/devices/devices_common.kshlib
@@ -26,6 +26,7 @@
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright 2020 Joyent, Inc.
#
.
$STF_SUITE/tests/functional/devices/devices.cfg @@ -42,6 +43,10 @@ function create_dev_file typeset filetype=$1 typeset filename=$2 + # On illumos we need access to the root zpool to get the major/minor + # numbers here. + export __ZFS_POOL_EXCLUDE="" + case $filetype in b) devtype=$(df -n / | awk '{print $3}') diff --git a/usr/src/test/zfs-tests/tests/functional/fault/Makefile b/usr/src/test/zfs-tests/tests/functional/fault/Makefile new file mode 100644 index 0000000000..00d2de10a3 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/fault/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/fault + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh new file mode 100755 index 0000000000..45b94723a5 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/fault/cleanup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +verify_runnable "global" + +cleanup_devices $DISKS + +zed_stop +zed_cleanup resilver_finish-start-scrub.sh + +log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg b/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg new file mode 100644 index 0000000000..25601a71a3 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/fault/fault.cfg @@ -0,0 +1,57 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +export DISK_ARRAY_NUM=$(echo ${DISKS} | nawk '{print NF}') +export DISKSARRAY=$DISKS +export FSIZE=10M +export MAXTIMEOUT=30 + +export SDSIZE=256 +export SDHOSTS=1 +export SDTGTS=1 +export SDLUNS=1 + +export DISK1=$(echo $DISKS | nawk '{print $1}') +export DISK2=$(echo $DISKS | nawk '{print $2}') +export DISK3=$(echo $DISKS | nawk '{print $3}') + +if is_linux; then + set_slice_prefix + set_device_dir + devs_id[0]=$(get_persistent_disk_name $DISK1) + devs_id[1]=$(get_persistent_disk_name $DISK2) + devs_id[2]=$(get_persistent_disk_name $DISK3) + export devs_id +else + DEV_DSKDIR="/dev" +fi + +export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \ + $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4" +export SPARE_FILE="$TEST_BASE_DIR/spare-1" +export FAULT_FILE="$TEST_BASE_DIR/file-1" diff --git a/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh b/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh new file mode 100755 index 0000000000..b78ee8ccdc --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/fault/setup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +verify_runnable "global" + +zed_setup resilver_finish-start-scrub.sh +zed_start + +log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh b/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh new file mode 100755 index 0000000000..b90db23c36 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/fault/zpool_status_-s.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+# Copyright 2019 Joyent, Inc.
+#
+
+# DESCRIPTION:
+# Verify zpool status -s (slow IOs) works
+#
+# STRATEGY:
+# 1. Create a file
+# 2. Inject slow IOs into the pool
+# 3. Verify we can see the slow IOs with "zpool status -s".
+# 4. Verify we can see delay events.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/zpool_script.shlib
+
+# ZoL/illumos Porting Note: The commands commented out below depend on the
+# ZoL "event" support, which has not yet been ported to illumos.
+
+DISK=${DISKS%% *}
+
+verify_runnable "both"
+
+log_must zpool create $TESTPOOL mirror ${DISKS}
+
+function cleanup
+{
+ log_must zinject -c all
+ log_must set_tunable32 zio_slow_io_ms $OLD_SLOW_IO
+# log_must set_tunable64 zfs_slow_io_events_per_second $OLD_SLOW_IO_EVENTS
+ log_must destroy_pool $TESTPOOL
+}
+
+log_onexit cleanup
+
+# log_must zpool events -c
+
+# Mark any IOs greater than 10ms as slow IOs
+OLD_SLOW_IO=$(get_tunable zio_slow_io_ms)
+# OLD_SLOW_IO_EVENTS=$(get_tunable zfs_slow_io_events_per_second)
+log_must set_tunable32 zio_slow_io_ms 10
+# log_must set_tunable64 zfs_slow_io_events_per_second 1000
+
+# Create 20ms IOs
+log_must zinject -d $DISK -D20:100 $TESTPOOL
+log_must mkfile 1048576 /$TESTPOOL/testfile
+log_must zpool sync $TESTPOOL
+
+log_must zinject -c all
+SLOW_IOS=$(zpool status -sp | grep "$DISK" | awk '{print $6}')
+#DELAY_EVENTS=$(zpool events | grep delay | wc -l)
+
+# if [ $SLOW_IOS -gt 0 ] && [ $DELAY_EVENTS -gt 0 ] ; then
+# log_pass "Correctly saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+if [ $SLOW_IOS -gt 0 ] ; then
+ log_pass "Correctly saw $SLOW_IOS slow IOs"
+else
+# log_fail "Only saw $SLOW_IOS slow IOs and $DELAY_EVENTS delay events"
+ log_fail "Only saw $SLOW_IOS slow IOs"
+fi
diff --git a/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
index 497529f94f..ccf4312e76 100644
--- a/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh
@@ -26,6 +26,7 @@
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright 2020 Joyent, Inc.
#
. $STF_SUITE/include/libtest.shlib
@@ -201,8 +202,13 @@ function get_mntpt_val #dataset src index
 mnt_val=`get_prop mountpoint $dset`
- mod_prop_val=${mnt_val##*/}
- new_path="/"$mod_prop_val$new_path
+ if [[ $dset == $src ]]; then
+ new_path=$mnt_val$new_path
+ else
+ mod_prop_val=${mnt_val##*/}
+ new_path="/"$mod_prop_val$new_path
+ fi
+
 dataset=$dset
 done
diff --git a/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib b/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
index 77ee7fe0eb..92687c3eba 100644
--- a/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/trim/trim.kshlib
@@ -19,7 +19,7 @@
. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
#
-# Get the actual on disk disk for the provided file.
+# Get the actual size on disk for the provided file.
#
function get_size_mb
{
@@ -29,47 +29,6 @@ function get_size_mb
}
#
-# Use mdb to get the approximate number of trim IOs issued for the pool.
-# This really is just used to ensure that trim IO has occured and is a -# temporary solution until illumos supports zpool iostat histograms. -# -function get_illumos_trim_io -{ - typeset pool="${1-:$TESTPOOL}" - typeset spa - typeset vdevs - typeset total_trim - typeset v - typeset trimmed - typeset b - - # Get vdevs for the pool - spa=$(mdb -ke '::spa' | awk -v pool=$pool '{if ($3 == pool) print $1}') - vdevs=$(mdb -ke "$spa::spa -v" | awk '{ - if ($4 == "DESCRIPTION") {st=1; next} - if (st == 1) print $1 - }') - - # Get trim counts for each vdev - total_trim=0 - for v in $vdevs - do - b=$(mdb -ke "$v::print vdev_t vdev_trim_bytes_done" | \ - awk '{print $3}') - trimmed=$(mdb -e "$b=E") - trimmed=$((trimmed / 4096)) - total_trim=$((total_trim + trimmed)) - - b=$(mdb -ke "$v::print vdev_t vdev_autotrim_bytes_done" | \ - awk '{print $3}') - trimmed=$(mdb -e "$b=E") - trimmed=$((trimmed / 4096)) - total_trim=$((total_trim + trimmed)) - done - echo -n "$total_trim" -} - -# # Get the number of trim IOs issued for the pool (ind or agg). # function get_trim_io @@ -106,11 +65,7 @@ function verify_trim_io typeset min_trim_ios=${3:-100} typeset ios - if is_linux; then - ios=$(get_trim_io $pool $type) - else - ios=$(get_illumos_trim_io $pool $type) - fi + ios=$(get_trim_io $pool $type) if [[ $ios -ge $min_trim_ios ]]; then log_note "Issued $ios $type trim IOs for pool $pool" diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 05fd29810b..60f739d1b4 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1428,6 +1428,7 @@ spa_unload(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); + spa_import_progress_remove(spa); spa_load_note(spa, "UNLOADING"); /* @@ -2372,6 +2373,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) int error; spa->spa_load_state = state; + (void) spa_import_progress_set_state(spa, spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -2394,6 +2396,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; + (void) spa_import_progress_set_state(spa, spa_load_state(spa)); + return (error); } @@ -2617,6 +2621,9 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { + (void) spa_import_progress_set_mmp_check(spa, + NSEC2SEC(import_expire - gethrtime())); + vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || @@ -2983,6 +2990,10 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } + if (spa->spa_load_max_txg != UINT64_MAX) { + (void) spa_import_progress_set_max_txg(spa, + (u_longlong_t)spa->spa_load_max_txg); + } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); @@ -3918,6 +3929,8 @@ spa_ld_mos_init(spa_t *spa, spa_import_type_t type) if (error != 0) return (error); + spa_import_progress_add(spa); + /* * Now that we have the vdev tree, try to open each vdev. 
This involves
 * opening the underlying physical device, retrieving its geometry and
@@ -4348,6 +4361,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
 spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
+ spa_import_progress_remove(spa);
 spa_load_note(spa, "LOADED");
 return (0);
@@ -4408,6 +4422,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
 * from previous txgs when spa_load fails.
 */
 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ spa_import_progress_remove(spa);
 return (load_error);
 }
@@ -4419,6 +4434,7 @@
 if (rewind_flags & ZPOOL_NEVER_REWIND) {
 nvlist_free(config);
+ spa_import_progress_remove(spa);
 return (load_error);
 }
@@ -4461,6 +4477,7 @@
 if (state == SPA_LOAD_RECOVER) {
 ASSERT3P(loadinfo, ==, NULL);
+ spa_import_progress_remove(spa);
 return (rewind_error);
 } else {
 /* Store the rewind info as part of the initial load info */
@@ -4471,6 +4488,7 @@
 fnvlist_free(spa->spa_load_info);
 spa->spa_load_info = loadinfo;
+ spa_import_progress_remove(spa);
 return (load_error);
 }
 }
@@ -4740,6 +4758,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0);
 vdev_get_stats(vd, vs);
+ vdev_config_generate_stats(vd, l2cache[i]);
+
 }
 }
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 45e1978803..9dac4e2ddc 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -28,6 +28,7 @@
 * Copyright (c) 2017 Datto Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
+ * Copyright 2020 Joyent, Inc.
 */
#include <sys/zfs_context.h>
@@ -635,6 +636,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
 mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_imp_kstat_lock, NULL, MUTEX_DEFAULT, NULL);
 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -844,6 +846,7 @@ spa_remove(spa_t *spa)
 mutex_destroy(&spa->spa_suspend_lock);
 mutex_destroy(&spa->spa_vdev_top_lock);
 mutex_destroy(&spa->spa_iokstat_lock);
+ mutex_destroy(&spa->spa_imp_kstat_lock);
 kmem_free(spa, sizeof (spa_t));
}
@@ -2046,6 +2049,140 @@ spa_dirty_data(spa_t *spa)
/*
 * ==========================================================================
+ * SPA Import Progress Routines
+ * The illumos implementation of these is different from OpenZFS. OpenZFS
+ * uses the Linux /proc fs, whereas we use a kstat on the spa.
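+ * While an import is in progress, the values can be read from userland with
+ * kstat(1M); the kstat is created under module "zfs_import" with the pool
+ * name as the kstat name (see spa_import_progress_add() below).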
+ * ========================================================================== + */ + +typedef struct spa_import_progress { + kstat_named_t sip_load_state; + kstat_named_t sip_mmp_sec_remaining; /* MMP activity check */ + kstat_named_t sip_load_max_txg; /* rewind txg */ +} spa_import_progress_t; + +static void +spa_import_progress_init(void) +{ +} + +static void +spa_import_progress_destroy(void) +{ +} + +void spa_import_progress_add(spa_t *); + +int +spa_import_progress_set_state(spa_t *spa, spa_load_state_t load_state) +{ + if (spa->spa_imp_kstat == NULL) + spa_import_progress_add(spa); + + mutex_enter(&spa->spa_imp_kstat_lock); + if (spa->spa_imp_kstat != NULL) { + spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data; + if (sip != NULL) + sip->sip_load_state.value.ui64 = (uint64_t)load_state; + } + mutex_exit(&spa->spa_imp_kstat_lock); + + return (0); +} + +int +spa_import_progress_set_max_txg(spa_t *spa, uint64_t load_max_txg) +{ + if (spa->spa_imp_kstat == NULL) + spa_import_progress_add(spa); + + mutex_enter(&spa->spa_imp_kstat_lock); + if (spa->spa_imp_kstat != NULL) { + spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data; + if (sip != NULL) + sip->sip_load_max_txg.value.ui64 = load_max_txg; + } + mutex_exit(&spa->spa_imp_kstat_lock); + + return (0); +} + +int +spa_import_progress_set_mmp_check(spa_t *spa, uint64_t mmp_sec_remaining) +{ + if (spa->spa_imp_kstat == NULL) + spa_import_progress_add(spa); + + mutex_enter(&spa->spa_imp_kstat_lock); + if (spa->spa_imp_kstat != NULL) { + spa_import_progress_t *sip = spa->spa_imp_kstat->ks_data; + if (sip != NULL) + sip->sip_mmp_sec_remaining.value.ui64 = + mmp_sec_remaining; + } + mutex_exit(&spa->spa_imp_kstat_lock); + + return (0); +} + +/* + * A new import is in progress. Add an entry. + */ +void +spa_import_progress_add(spa_t *spa) +{ + char *poolname = NULL; + spa_import_progress_t *sip; + + mutex_enter(&spa->spa_imp_kstat_lock); + if (spa->spa_imp_kstat != NULL) { + sip = spa->spa_imp_kstat->ks_data; + sip->sip_load_state.value.ui64 = (uint64_t)spa_load_state(spa); + mutex_exit(&spa->spa_imp_kstat_lock); + return; + } + + (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, + &poolname); + if (poolname == NULL) + poolname = spa_name(spa); + + spa->spa_imp_kstat = kstat_create("zfs_import", 0, poolname, + "zfs_misc", KSTAT_TYPE_NAMED, + sizeof (spa_import_progress_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (spa->spa_imp_kstat != NULL) { + sip = kmem_alloc(sizeof (spa_import_progress_t), KM_SLEEP); + spa->spa_imp_kstat->ks_data = sip; + + sip->sip_load_state.value.ui64 = (uint64_t)spa_load_state(spa); + + kstat_named_init(&sip->sip_load_state, + "spa_load_state", KSTAT_DATA_UINT64); + kstat_named_init(&sip->sip_mmp_sec_remaining, + "mmp_sec_remaining", KSTAT_DATA_UINT64); + kstat_named_init(&sip->sip_load_max_txg, + "spa_load_max_txg", KSTAT_DATA_UINT64); + spa->spa_imp_kstat->ks_lock = &spa->spa_imp_kstat_lock; + kstat_install(spa->spa_imp_kstat); + } + mutex_exit(&spa->spa_imp_kstat_lock); +} + +void +spa_import_progress_remove(spa_t *spa) +{ + if (spa->spa_imp_kstat != NULL) { + void *data = spa->spa_imp_kstat->ks_data; + + kstat_delete(spa->spa_imp_kstat); + spa->spa_imp_kstat = NULL; + kmem_free(data, sizeof (spa_import_progress_t)); + } +} + +/* + * ========================================================================== * Initialization and Termination * ========================================================================== */ @@ -2122,6 +2259,7 @@ spa_init(int mode) 
spa_config_load(); l2arc_start(); scan_init(); + spa_import_progress_init(); } void @@ -2141,6 +2279,7 @@ spa_fini(void) unique_fini(); zfs_refcount_fini(); scan_fini(); + spa_import_progress_destroy(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index cb736db5dd..e017462613 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -900,6 +900,12 @@ typedef struct spa_iostats { kstat_named_t autotrim_bytes_failed; } spa_iostats_t; +extern int spa_import_progress_set_state(spa_t *, spa_load_state_t); +extern int spa_import_progress_set_max_txg(spa_t *, uint64_t); +extern int spa_import_progress_set_mmp_check(spa_t *, uint64_t); +extern void spa_import_progress_add(spa_t *); +extern void spa_import_progress_remove(spa_t *); + /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); @@ -1053,9 +1059,11 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const struct zbookmark_phys *zb); -extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, +extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, const struct zbookmark_phys *zb, struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd, + zio_t *zio); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index bc00528fa9..45a78717da 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -399,6 +399,14 @@ struct spa { int spa_queued; } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; + /* + * The following two members diverge from OpenZFS. Upstream import + * status is built around the Linux /proc fs. On illumos we use a kstat + * to track import status. spa_imp_kstat_lock protects spa_imp_kstat. 
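+ * The kstat exports the spa_load_state, mmp_sec_remaining, and
+ * spa_load_max_txg values maintained by the spa_import_progress_set_*()
+ * routines in spa_misc.c.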
+ */ + kmutex_t spa_imp_kstat_lock; + struct kstat *spa_imp_kstat; /* kstat for import status */ + /* arc_memory_throttle() parameters during low memory condition */ uint64_t spa_lowmem_page_load; /* memory load during txg */ uint64_t spa_lowmem_last_txg; /* txg window start */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 707e177fc3..a6de7e6f2c 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -99,6 +99,7 @@ extern void vdev_deadman(vdev_t *vd); extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs); +extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); @@ -173,6 +174,7 @@ extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern int vdev_label_number(uint64_t psise, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); +extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 8e61572a50..b9b538455c 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -229,6 +229,7 @@ struct vdev { vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ + vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 7d3e0579c2..d03106b942 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -465,6 +465,9 @@ struct zio { hrtime_t io_timestamp; hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; + hrtime_t io_delta; /* vdev queue service delta */ + hrtime_t io_delay; /* Device access time (disk or */ + /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; @@ -649,7 +652,7 @@ extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ -extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, +extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_priority.h b/usr/src/uts/common/fs/zfs/sys/zio_priority.h index 7d91a646d1..feb23fafd6 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_priority.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_priority.h @@ -14,6 +14,7 @@ */ /* * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright 2020 Joyent, Inc. 
*/ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -22,6 +23,10 @@ extern "C" { #endif +/* + * NOTE: If ZIO_PRIORITY_NUM_QUEUEABLE changes, update ZIO_PRIORITY_N_QUEUEABLE + * in uts/common/sys/fs/zfs.h to match. + */ typedef enum zio_priority { ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ @@ -32,7 +37,6 @@ typedef enum zio_priority { ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ ZIO_PRIORITY_NUM_QUEUEABLE, - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ } zio_priority_t; diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 52a7a62e4a..01e892f4c4 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -3505,6 +3505,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -3628,6 +3629,51 @@ vdev_accessible(vdev_t *vd, zio_t *zio) return (B_TRUE); } +static void +vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) +{ + for (int t = 0; t < VS_ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + + cvs->vs_scan_removing = cvd->vdev_removing; +} + +/* + * Get extended stats + */ +static void +vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) +{ + int t, b; + for (t = 0; t < ZIO_TYPES; t++) { + for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) + vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { + vsx->vsx_total_histo[t][b] += + cvsx->vsx_total_histo[t][b]; + } + } + + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { + vsx->vsx_queue_histo[t][b] += + cvsx->vsx_queue_histo[t][b]; + } + vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; + vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) + vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) + vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; + } + +} + boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { @@ -3652,81 +3698,121 @@ vdev_is_spacemap_addressable(vdev_t *vd) /* * Get statistics for the given vdev. */ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +static void +vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd = vd->vdev_top; + int t; + /* + * If we're getting stats on the root vdev, aggregate the I/O counts + * over all top-level vdevs (i.e. the direct children of the root). 
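+ * The aggregation is recursive (via vdev_get_child_stat() and
+ * vdev_get_child_stat_ex() above), so interior vdevs such as mirrors and
+ * raidz groups also report the sums of their children; leaf vdevs copy
+ * their own counters in the else branch below.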
+ */ + if (!vd->vdev_ops->vdev_op_leaf) { + if (vs) { + memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); + memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); + } + if (vsx) + memset(vsx, 0, sizeof (*vsx)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) { - vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + vdev_get_stats_ex_impl(cvd, cvs, cvsx); + if (vs) + vdev_get_child_stat(cvd, vs, cvs); + if (vsx) + vdev_get_child_stat_ex(cvd, vsx, cvsx); + + } + } else { /* - * Report intializing progress. Since we don't have the - * initializing locks held, this is only an estimate (although a - * fairly accurate one). + * We're a leaf. Just copy our ZIO active queue stats in. The + * other leaf stats are updated in vdev_stat_update(). */ - vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; - vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; - vs->vs_initialize_state = vd->vdev_initialize_state; - vs->vs_initialize_action_time = vd->vdev_initialize_action_time; - } + if (!vsx) + return; - /* - * Report manual TRIM progress. Since we don't have - * the manual TRIM locks held, this is only an - * estimate (although fairly accurate one). - */ - vs->vs_trim_notsup = !vd->vdev_has_trim; - vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; - vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; - vs->vs_trim_state = vd->vdev_trim_state; - vs->vs_trim_action_time = vd->vdev_trim_action_time; + memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - /* - * Report expandable space on top-level, non-auxillary devices only. - * The expandable space is reported in terms of metaslab sized units - * since that determines how much space the pool can expand. - */ - if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); - } - if (vd->vdev_aux == NULL && vd == vd->vdev_top && - vdev_is_concrete(vd)) { - vs->vs_fragmentation = (vd->vdev_mg != NULL) ? - vd->vdev_mg->mg_fragmentation : 0; + for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { + vsx->vsx_active_queue[t] = + vd->vdev_queue.vq_class[t].vqc_active; + vsx->vsx_pend_queue[t] = avl_numnodes( + &vd->vdev_queue.vq_class[t].vqc_queued_tree); + } } - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_resilver_deferred = vd->vdev_resilver_deferred; +} - /* - * If we're getting stats on the root vdev, aggregate the I/O counts - * over all top-level vdevs (i.e. the direct children of the root). - */ - if (vd == rvd) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - vdev_stat_t *cvs = &cvd->vdev_stat; +void +vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + vdev_t *tvd = vd->vdev_top; + mutex_enter(&vd->vdev_stat_lock); + if (vs) { + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_rsize += VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE; + /* + * Report initializing progress. 
Since we don't + * have the initializing locks held, this is only + * an estimate (although a fairly accurate one). + */ + vs->vs_initialize_bytes_done = + vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = + vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = + vd->vdev_initialize_action_time; - for (int t = 0; t < VS_ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - cvs->vs_scan_removing = cvd->vdev_removing; + /* + * Report manual TRIM progress. Since we don't have + * the manual TRIM locks held, this is only an + * estimate (although fairly accurate one). + */ + vs->vs_trim_notsup = !vd->vdev_has_trim; + vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; + vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; + vs->vs_trim_state = vd->vdev_trim_state; + vs->vs_trim_action_time = vd->vdev_trim_action_time; + } + /* + * Report expandable space on top-level, non-auxiliary devices + * only. The expandable space is reported in terms of metaslab + * sized units since that determines how much space the pool + * can expand. + */ + if (vd->vdev_aux == NULL && tvd != NULL) { + vs->vs_esize = P2ALIGN( + vd->vdev_max_asize - vd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); } + if (vd->vdev_aux == NULL && vd == vd->vdev_top && + vdev_is_concrete(vd)) { + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? + vd->vdev_mg->mg_fragmentation : 0; + } + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } + + vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + return (vdev_get_stats_ex(vd, vs, NULL)); +} + +void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); @@ -3758,6 +3844,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; + vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; @@ -3808,18 +3895,42 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_self_healed += psize; } - zio_type_t vs_type = type; - /* - * TRIM ops and bytes are reported to user space as - * ZIO_TYPE_IOCTL. This is done to preserve the - * vdev_stat_t structure layout for user space. + * The bytes/ops/histograms are recorded at the leaf level and + * aggregated into the higher level vdevs in vdev_get_stats(). */ - if (type == ZIO_TYPE_TRIM) - vs_type = ZIO_TYPE_IOCTL; + if (vd->vdev_ops->vdev_op_leaf && + (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { + zio_type_t vs_type = type; + + /* + * TRIM ops and bytes are reported to user space as + * ZIO_TYPE_IOCTL. This is done to preserve the + * vdev_stat_t structure layout for user space. 
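The expandable-space hunk above rounds raw growth down to whole metaslabs with P2ALIGN() from sys/sysmacros.h, since a pool can only grow by metaslab-sized units. A worked example under assumed numbers (512 MB metaslabs, i.e. vdev_ms_shift == 29):

	uint64_t growth = 1300ULL << 20;		/* 1300 MB of raw headroom */
	uint64_t esize = P2ALIGN(growth, 1ULL << 29);	/* 1024 MB: two metaslabs */
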
+ */ + if (type == ZIO_TYPE_TRIM) + vs_type = ZIO_TYPE_IOCTL; + + vs->vs_ops[vs_type]++; + vs->vs_bytes[vs_type] += psize; + + if (flags & ZIO_FLAG_DELEGATED) { + vsx->vsx_agg_histo[zio->io_priority] + [RQ_HISTO(zio->io_size)]++; + } else { + vsx->vsx_ind_histo[zio->io_priority] + [RQ_HISTO(zio->io_size)]++; + } - vs->vs_ops[vs_type]++; - vs->vs_bytes[vs_type] += psize; + if (zio->io_delta && zio->io_delay) { + vsx->vsx_queue_histo[zio->io_priority] + [L_HISTO(zio->io_delta - zio->io_delay)]++; + vsx->vsx_disk_histo[type] + [L_HISTO(zio->io_delay)]++; + vsx->vsx_total_histo[type] + [L_HISTO(zio->io_delta)]++; + } + } mutex_exit(&vd->vdev_stat_lock); return; diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index bb377f08ce..6235b06f17 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ /* @@ -211,6 +211,169 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } +/* + * Generate the nvlist representing this vdev's stats + */ +void +vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) +{ + nvlist_t *nvx; + vdev_stat_t *vs; + vdev_stat_ex_t *vsx; + + vs = kmem_alloc(sizeof (*vs), KM_SLEEP); + vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); + + vdev_get_stats_ex(vd, vs, vsx); + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); + + /* + * Add extended stats into a special extended stats nvlist. This keeps + * all the extended stats nicely grouped together. The extended stats + * nvlist is then added to the main nvlist. 
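vdev_config_generate_stats() publishes every extended stat under a single nested nvlist keyed by ZPOOL_CONFIG_VDEV_STATS_EX. A sketch of how userland might read one histogram back out (assumed consumer code, not part of this change):

	nvlist_t *nvx;
	uint64_t *histo;
	uint_t buckets;

	if (nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) == 0 &&
	    nvlist_lookup_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
	    &histo, &buckets) == 0) {
		/* histo[b] counts reads whose total latency was ~2^b ns */
	}
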
+ */ + nvx = fnvlist_alloc(); + + /* ZIOs in flight to disk */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); + + /* ZIOs pending */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); + + /* Histograms */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); + + /* Request sizes */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); + + 
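The request-size histograms attached here come in two families: vsx_ind_histo counts I/Os issued individually, and vsx_agg_histo counts I/Os the vdev queue delegated into an aggregate (see the ZIO_FLAG_DELEGATED test in vdev_stat_update() earlier in this change). One derived metric a consumer might compute from them (editor's sketch):

	uint64_t ind = 0, agg = 0;
	for (int b = 0; b < VDEV_RQ_HISTO_BUCKETS; b++) {
		ind += vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ][b];
		agg += vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ][b];
	}
	double frac_agg = (ind + agg) ? (double)agg / (ind + agg) : 0.0;
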
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); + + /* IO delays */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + + /* Add extended stats nvlist to main nvlist */ + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); + + nvlist_free(nvx); + kmem_free(vs, sizeof (*vs)); + kmem_free(vsx, sizeof (*vsx)); +} + static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { @@ -386,11 +549,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, } if (getstats) { - vdev_stat_t vs; - - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); + vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 7f9795ac6f..4c6515c43d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -829,6 +829,7 @@ vdev_queue_io_done(zio_t *zio) vdev_queue_pending_remove(vq, zio); + zio->io_delta = gethrtime() - zio->io_timestamp; vq->vq_io_complete_ts = gethrtime(); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index 87846292b1..dd854c12e1 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -102,7 +102,11 @@ * ereport with information about the differences. */ #ifdef _KERNEL -static void + +/* + * Return B_TRUE if the event actually posted, B_FALSE if not. 
+ */ +static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) @@ -112,88 +116,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, uint64_t ena; char class[64]; - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - /* - * Ignore any errors from speculative I/Os, as failure is an - * expected result. - */ - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - /* - * If this I/O is not a retry I/O, don't post an ereport. - * Otherwise, we risk making bad diagnoses based on B_FAILFAST - * I/Os. - */ - if (zio->io_error == EIO && - !(zio->io_flags & ZIO_FLAG_IO_RETRY)) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. - */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. - */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (B_FALSE); if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; + return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; + return (B_FALSE); } /* @@ -336,7 +267,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, /* * Payload for I/Os with corresponding logical information. 
*/ - if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, @@ -346,11 +277,13 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); + } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; + return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ @@ -674,26 +607,110 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } #endif -void +/* + * Make sure our event is still valid for the given zio/vdev/pool. For example, + * we don't want to keep logging events for a faulted or missing vdev. + */ +boolean_t +zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) +{ +#ifdef _KERNEL + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return (B_FALSE); + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return (B_FALSE); + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return (B_FALSE); + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return (B_FALSE); + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return (B_FALSE); + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return (B_FALSE); + + /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ + if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && + (zio != NULL) && (!zio->io_timestamp)) { + return (B_FALSE); + } +#endif + return (B_TRUE); +} + +/* + * Return 0 if event was posted, EINVAL if there was a problem posting it or + * EBUSY if the event was rate limited. 
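With zfs_ereport_post() now returning an error code, callers can tell whether an event was actually delivered; note that this port only returns 0 or EINVAL, with EBUSY apparently reserved for the rate-limited case in upstream OpenZFS. A sketch of the calling pattern (hypothetical caller):

	if (zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, vd,
	    &zio->io_bookmark, zio, 0, 0) != 0) {
		/* event was invalid for this pool/vdev state; nothing posted */
	}
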
+ */ +int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const struct zbookmark_phys *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; - zfs_ereport_start(&ereport, &detector, subclass, spa, vd, - zb, zio, stateoroffset, size); + if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size)) + return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) - return; + return (SET_ERROR(EINVAL)); fm_ereport_post(ereport, EVCH_SLEEP); fm_nvlist_destroy(ereport, FM_NVA_FREE); fm_nvlist_destroy(detector, FM_NVA_FREE); #endif + return (rc); } void @@ -786,21 +803,21 @@ zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) #endif } -void +int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { + int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; - zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, - spa, vd, zb, zio, offset, length); - - if (ereport == NULL) - return; + if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length) || (ereport == NULL)) { + return (SET_ERROR(EINVAL)); + } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); @@ -814,6 +831,7 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (info != NULL) kmem_free(info, sizeof (*info)); #endif + return (rc); } static void diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index ed9d6e86f5..ff0e4bbded 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -42,6 +42,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/time.h> #include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> @@ -77,6 +78,9 @@ extern vmem_t *zio_alloc_arena; #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +/* Mark IOs as "slow" if they take longer than 30 seconds */ +int zio_slow_io_ms = (30 * MILLISEC); + #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul @@ -3388,6 +3392,8 @@ zio_vdev_io_start(zio_t *zio) uint64_t align; spa_t *spa = zio->io_spa; + zio->io_delay = 0; + ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); @@ -3505,6 +3511,7 @@ zio_vdev_io_start(zio_t *zio) zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } + zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); @@ -3525,6 +3532,9 @@ zio_vdev_io_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); + if (zio->io_delay) + zio->io_delay = gethrtime() - zio->io_delay; + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); @@ -4228,6 +4238,29 @@ zio_done(zio_t *zio) vdev_stat_update(zio, psize); + if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { + if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { + /* + * We want to only increment our slow IO counters if + * the IO is valid (i.e. not if the drive is removed). + * + * zfs_ereport_post() will also do these checks, but + * it can also have other failures, so we need to + * increment the slow_io counters independent of it. 
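zio_done() counts an I/O as slow once its device access time reaches zio_slow_io_ms, 30 seconds by default; the check is just a millisecond-to-nanosecond conversion and a compare:

	boolean_t slow = (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms));

Since the cutoff is a global int, it should be tunable in the usual way, e.g. "set zfs:zio_slow_io_ms = 10000" in /etc/system to flag I/Os slower than 10 seconds (assumed illumos tunable syntax; the patch itself does not document one).
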
+ */ + if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_slow_ios++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0, 0); + } + } + } + if (zio->io_error) { /* * If this I/O is attached to a particular vdev, diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h index c3eb950326..8e56f244cd 100644 --- a/usr/src/uts/common/sys/fm/fs/zfs.h +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -36,6 +36,7 @@ extern "C" { #define FM_EREPORT_ZFS_AUTHENTICATION "authentication" #define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" @@ -61,6 +62,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 1bc421e33b..93a2b5887a 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -24,7 +24,7 @@ * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -35,6 +35,15 @@ #define _SYS_FS_ZFS_H #include <sys/time.h> +/* + * In OpenZFS we include sys/zio_priority.h to get the enum value of + * ZIO_PRIORITY_NUM_QUEUEABLE, which is used for the various array sizes in + * the structure definitions below. However, in illumos zio_priority.h is not + * readily available to the userland code where we have a very large number of + * files including sys/zfs.h. Thus, we define ZIO_PRIORITY_N_QUEUEABLE here and + * this should be kept in sync if ZIO_PRIORITY_NUM_QUEUEABLE changes. 
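The note above relies on hand-maintained agreement between the two constants. In kernel code that can see both sys/fs/zfs.h and sys/zio_priority.h, a compile-time assertion could enforce the pairing; a sketch, not part of the commit:

	#include <sys/debug.h>	/* CTASSERT */

	CTASSERT(ZIO_PRIORITY_N_QUEUEABLE == ZIO_PRIORITY_NUM_QUEUEABLE);
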
+ */ +#define ZIO_PRIORITY_N_QUEUEABLE 8 #ifdef __cplusplus extern "C" { @@ -601,6 +610,55 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ + +/* container nvlist of extended stats */ +#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" + +/* Active queue read/write stats */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" +#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" + +/* Queue sizes */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" +#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" + +/* Latency read/write histogram stats */ +#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" +#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" + +/* Request size histograms */ +#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" +#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" +#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" +#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" +#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" + +/* Number of slow IOs */ +#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" + #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" @@ -1001,6 +1059,7 @@ typedef struct vdev_stat { uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ + uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* supported by device */ uint64_t vs_trim_bytes_done; /* bytes 
trimmed */ @@ -1010,6 +1069,58 @@ typedef struct vdev_stat { } vdev_stat_t; /* + * Extended stats + * + * These are stats which aren't included in the original iostat output. For + * convenience, they are grouped together in vdev_stat_ex, although each stat + * is individually exported as a nvlist. + */ +typedef struct vdev_stat_ex { + /* Number of ZIOs issued to disk and waiting to finish */ + uint64_t vsx_active_queue[ZIO_PRIORITY_N_QUEUEABLE]; + + /* Number of ZIOs pending to be issued to disk */ + uint64_t vsx_pend_queue[ZIO_PRIORITY_N_QUEUEABLE]; + + /* + * Below are the histograms for various latencies. Buckets are in + * units of nanoseconds. + */ + + /* + * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in + * before this. + */ +#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ +#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ + + /* Amount of time in ZIO queue (ns) */ + uint64_t vsx_queue_histo[ZIO_PRIORITY_N_QUEUEABLE] + [VDEV_L_HISTO_BUCKETS]; + + /* Total ZIO latency (ns). Includes queuing and disk access time */ + uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; + + /* Amount of time to read/write the disk (ns) */ + uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; + + /* "lookup the bucket for a value" macro */ +#define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ + buckets - 1) : 0) +#define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) +#define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) + + /* Physical IO histogram */ + uint64_t vsx_ind_histo[ZIO_PRIORITY_N_QUEUEABLE] + [VDEV_RQ_HISTO_BUCKETS]; + + /* Delegated (aggregated) physical IO histogram */ + uint64_t vsx_agg_histo[ZIO_PRIORITY_N_QUEUEABLE] + [VDEV_RQ_HISTO_BUCKETS]; + +} vdev_stat_ex_t; + +/* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */
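A worked example of the bucket macros defined in vdev_stat_ex above (editor's illustration): highbit64() returns the one-based index of a value's highest set bit, so HISTO() maps a nonzero value to floor(log2(value)), clamped to the last bucket.

	int b = L_HISTO(1500000);	/* 1.5 ms: highbit64() == 21, bucket 20 */
	int top = L_HISTO(1ULL << 60);	/* clamps to VDEV_L_HISTO_BUCKETS - 1 */

Bucket 20 therefore counts I/Os whose latency fell in [2^20, 2^21) ns, roughly 1.0 to 2.1 ms.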