summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorTom Caputi <tcaputi@datto.com>2019-03-15 17:14:31 -0400
committerToomas Soome <tsoome@me.com>2019-05-13 23:49:15 +0300
commit12a8814c13fbb1d6d58616cf090ea5815dc107f9 (patch)
tree3f1b36f6702e76bf3b0636d6c3d9a8943d06470c /usr/src
parenta3874b8b1fe5103fc1f961609557c0587435fec0 (diff)
downloadillumos-gate-12a8814c13fbb1d6d58616cf090ea5815dc107f9.tar.gz
10566 Multiple DVA Scrubbing Fix
Portions contributed by: Toomas Soome <tsoome@me.com> Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/zinject/zinject.c176
-rw-r--r--usr/src/pkg/manifests/system-test-zfstest.mf9
-rw-r--r--usr/src/test/zfs-tests/include/libtest.shlib18
-rw-r--r--usr/src/test/zfs-tests/runfiles/delphix.run3
-rw-r--r--usr/src/test/zfs-tests/runfiles/omnios.run3
-rw-r--r--usr/src/test/zfs-tests/runfiles/openindiana.run3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh152
-rwxr-xr-xusr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh77
-rwxr-xr-xusr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh133
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scan.c199
-rw-r--r--usr/src/uts/common/fs/zfs/metaslab.c2
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h4
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h1
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h4
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c22
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c10
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c3
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c365
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c33
-rw-r--r--usr/src/uts/common/fs/zfs/zio_inject.c42
22 files changed, 1077 insertions, 187 deletions
diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c
index efae04675e..1c0b3199bd 100644
--- a/usr/src/cmd/zinject/zinject.c
+++ b/usr/src/cmd/zinject/zinject.c
@@ -47,48 +47,48 @@
*
* This form of the command looks like:
*
- * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
+ * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
*
*
* DATA FAULTS
*
* We begin with a tuple of the form:
*
- * <type,level,range,object>
+ * <type,level,range,object>
*
- * type A string describing the type of data to target. Each type
- * implicitly describes how to interpret 'object'. Currently,
- * the following values are supported:
+ * type A string describing the type of data to target. Each type
+ * implicitly describes how to interpret 'object'. Currently,
+ * the following values are supported:
*
- * data User data for a file
- * dnode Dnode for a file or directory
+ * data User data for a file
+ * dnode Dnode for a file or directory
*
* The following MOS objects are special. Instead of injecting
* errors on a particular object or blkid, we inject errors across
* all objects of the given type.
*
- * mos Any data in the MOS
- * mosdir object directory
- * config pool configuration
- * bpobj blkptr list
- * spacemap spacemap
- * metaslab metaslab
- * errlog persistent error log
+ * mos Any data in the MOS
+ * mosdir object directory
+ * config pool configuration
+ * bpobj blkptr list
+ * spacemap spacemap
+ * metaslab metaslab
+ * errlog persistent error log
*
- * level Object level. Defaults to '0', not applicable to all types. If
- * a range is given, this corresponds to the indirect block
- * corresponding to the specific range.
+ * level Object level. Defaults to '0', not applicable to all types. If
+ * a range is given, this corresponds to the indirect block
+ * corresponding to the specific range.
*
* range A numerical range [start,end) within the object. Defaults to
* the full size of the file.
*
- * object A string describing the logical location of the object. For
- * files and directories (currently the only supported types),
- * this is the path of the object on disk.
+ * object A string describing the logical location of the object. For
+ * files and directories (currently the only supported types),
+ * this is the path of the object on disk.
*
* This is translated, via libzpool, into the following internal representation:
*
- * <type,objset,object,level,range>
+ * <type,objset,object,level,range>
*
* These types should be self-explanatory. This tuple is then passed to the
* kernel via a special ioctl() to initiate fault injection for the given
@@ -98,12 +98,12 @@
*
* The command itself takes one of the forms:
*
- * zinject
- * zinject <-a | -u pool>
- * zinject -c <id|all>
- * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ * zinject
+ * zinject <-a | -u pool>
+ * zinject -c <id|all>
+ * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
* [-r range] <object>
- * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
+ * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
*
* With no arguments, the command prints all currently registered injection
* handlers, with their numeric identifiers.
@@ -288,8 +288,8 @@ usage(void)
"\t\tspecified by the remaining tuple. Each number is in\n"
"\t\thexadecimal, and only one block can be specified.\n"
"\n"
- "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
- "\t [-a] [-m] [-u] [-f freq] <object>\n"
+ "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
+ "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
"\n"
"\t\tInject an error into the object specified by the '-t' option\n"
"\t\tand the object descriptor. The 'object' parameter is\n"
@@ -297,7 +297,10 @@ usage(void)
"\n"
"\t\t-q\tQuiet mode. Only print out the handler number added.\n"
"\t\t-e\tInject a specific error. Must be either 'io' or\n"
- "\t\t\t'checksum'. Default is 'io'.\n"
+ "\t\t\t'checksum', or 'decompress'. Default is 'io'.\n"
+ "\t\t-C\tInject the given error only into specific DVAs. The\n"
+ "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
+ "\t\t\tseparated by commas (ex. '0,2').\n"
"\t\t-l\tInject error at a particular block level. Default is "
"0.\n"
"\t\t-m\tAutomatically remount underlying filesystem.\n"
@@ -358,17 +361,19 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
return (0);
if (*count == 0) {
- (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
- "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
+ (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
+ "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
+ "LVL", "DVAs", "RANGE");
(void) printf("--- --------------- ------ "
- "------ -------- --- ---------------\n");
+ "------ -------- --- ---- ----------------\n");
}
*count += 1;
- (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
- (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
- type_to_name(record->zi_type), record->zi_level);
+ (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ",
+ id, pool, (u_longlong_t)record->zi_objset,
+ (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
+ record->zi_level, record->zi_dvas);
if (record->zi_start == 0 &&
record->zi_end == -1ULL)
@@ -598,6 +603,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
(void) printf(" range: [%llu, %llu)\n",
(u_longlong_t)record->zi_start,
(u_longlong_t)record->zi_end);
+ (void) printf(" dvas: 0x%x\n", record->zi_dvas);
}
}
@@ -649,6 +655,59 @@ parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
return (0);
}
+/*
+ * This function converts a string specifier for DVAs into a bit mask.
+ * The DVAs provided by the user should be 0-indexed and separated by
+ * a comma. For example:
+ * "1" -> 0b0010 (0x2)
+ * "0,1" -> 0b0011 (0x3)
+ * "0,1,2" -> 0b0111 (0x7)
+ */
+static int
+parse_dvas(const char *str, uint32_t *dvas_out)
+{
+ const char *c = str;
+ uint32_t mask = 0;
+ boolean_t need_delim = B_FALSE;
+
+ /* max string length is 5 ("0,1,2") */
+ if (strlen(str) > 5 || strlen(str) == 0)
+ return (EINVAL);
+
+ while (*c != '\0') {
+ switch (*c) {
+ case '0':
+ case '1':
+ case '2':
+ /* check for comma between DVAs */
+ if (need_delim)
+ return (EINVAL);
+
+ /* check if this DVA has been set already */
+ if (mask & (1 << ((*c) - '0')))
+ return (EINVAL);
+
+ mask |= (1 << ((*c) - '0'));
+ need_delim = B_TRUE;
+ break;
+ case ',':
+ need_delim = B_FALSE;
+ break;
+ default:
+ /* check for invalid character */
+ return (EINVAL);
+ }
+ c++;
+ }
+
+ /* check for dangling delimiter */
+ if (!need_delim)
+ return (EINVAL);
+
+ *dvas_out = mask;
+ return (0);
+}
+
int
main(int argc, char **argv)
{
@@ -675,6 +734,7 @@ main(int argc, char **argv)
int dur_secs = 0;
int ret;
int flags = 0;
+ uint32_t dvas = 0;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "internal error: failed to "
@@ -705,7 +765,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
- ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+ ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -728,6 +788,17 @@ main(int argc, char **argv)
case 'c':
cancel = optarg;
break;
+ case 'C':
+ ret = parse_dvas(optarg, &dvas);
+ if (ret != 0) {
+ (void) fprintf(stderr, "invalid DVA list '%s': "
+ "DVAs should be 0 indexed and separated by "
+ "commas.\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
case 'd':
device = optarg;
break;
@@ -887,7 +958,8 @@ main(int argc, char **argv)
* '-c' is invalid with any other options.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -919,7 +991,8 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ dvas != 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -953,7 +1026,8 @@ main(int argc, char **argv)
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
- record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@@ -983,7 +1057,8 @@ main(int argc, char **argv)
error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || device != NULL) {
+ level != 0 || device != NULL || record.zi_freq > 0 ||
+ dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
usage();
@@ -1002,6 +1077,15 @@ main(int argc, char **argv)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "hardware failure (-I) "
+ "incompatible with other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n");
@@ -1055,6 +1139,18 @@ main(int argc, char **argv)
return (1);
}
+ if (dvas != 0) {
+ if (error == EACCES || error == EINVAL) {
+ (void) fprintf(stderr, "the '-C' option may "
+ "not be used with logical data errors "
+ "'decrypt' and 'decompress'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_dvas = dvas;
+ }
+
record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_record(type, argv[0], range, level, &record, pool,
dataset) != 0)
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index bd2df5aec1..aeb7288efc 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -1449,6 +1449,9 @@ file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg \
mode=0555
file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos \
+ mode=0555
+file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos \
mode=0555
file \
@@ -1750,6 +1753,12 @@ file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos \
mode=0555
file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies \
+ mode=0555
+file \
+ path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device \
+ mode=0555
+file \
path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing \
mode=0555
file path=opt/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_001_pos \
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index 295620102a..bedd71c4b4 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -1747,6 +1747,24 @@ function wait_for_degraded
}
#
+# Wait for a pool to be scrubbed
+#
+# $1 pool name
+# $2 number of seconds to wait (optional)
+#
+# Returns true when pool has been scrubbed, or false if there's a timeout or if
+# no scrub was done.
+#
+function wait_scrubbed
+{
+ typeset pool=${1:-$TESTPOOL}
+ while true ; do
+ is_pool_scrubbed $pool && break
+ log_must sleep 1
+ done
+}
+
+#
# Use create_pool()/destroy_pool() to clean up the information
# in the given disk to avoid slice overlapping.
#
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index d501a4d2a0..2c5e4529b1 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -259,6 +259,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg',
'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos',
'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg',
+ 'zpool_create_024_pos',
'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
'zpool_create_tempname']
@@ -342,7 +343,7 @@ tests = ['zpool_replace_001_neg']
[/opt/zfs-tests/tests/functional/cli_root/zpool_scrub]
tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
- 'zpool_scrub_004_pos', 'zpool_scrub_005_pos']
+ 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies']
[/opt/zfs-tests/tests/functional/cli_root/zpool_set]
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg']
diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run
index c908e3868c..255a8d0cb0 100644
--- a/usr/src/test/zfs-tests/runfiles/omnios.run
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run
@@ -252,6 +252,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg',
'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos',
'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg',
+ 'zpool_create_024_pos',
'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
'zpool_create_tempname']
@@ -312,7 +313,7 @@ tests = ['zpool_replace_001_neg']
[/opt/zfs-tests/tests/functional/cli_root/zpool_scrub]
tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
- 'zpool_scrub_004_pos', 'zpool_scrub_005_pos']
+ 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies']
[/opt/zfs-tests/tests/functional/cli_root/zpool_set]
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg']
diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run
index 83fbf29375..4005a19b11 100644
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run
@@ -252,6 +252,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg',
'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos',
'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg',
+ 'zpool_create_024_pos',
'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
'zpool_create_tempname']
@@ -312,7 +313,7 @@ tests = ['zpool_replace_001_neg']
[/opt/zfs-tests/tests/functional/cli_root/zpool_scrub]
tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
- 'zpool_scrub_004_pos', 'zpool_scrub_005_pos']
+ 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies']
[/opt/zfs-tests/tests/functional/cli_root/zpool_set]
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg']
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh
new file mode 100644
index 0000000000..5b464c3c24
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh
@@ -0,0 +1,152 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.cfg
+
+#
+# DESCRIPTION:
+# Many 'zpool create' and 'zpool destroy' must succeed concurrently.
+#
+# STRATEGY:
+# 1. Create N process each of which create/destroy a pool M times.
+# 2. Allow all process to run to completion.
+# 3. Verify all pools and their vdevs were destroyed.
+#
+
+verify_runnable "global"
+
+if is_32bit; then
+ log_unsupported "Test case runs slowly on 32 bit"
+fi
+
+function cleanup
+{
+ if [[ -n "$child_pids" ]]; then
+ for wait_pid in $child_pids; do
+ kill $wait_pid 2>/dev/null
+ done
+ fi
+
+ if [[ -n "$child_pools" ]]; then
+ for pool in $child_pools; do
+ typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img"
+ typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img"
+
+ if poolexists $pool; then
+ destroy_pool $pool
+ fi
+
+ rm -f $vdev0 $vdev1
+ done
+ fi
+}
+
+log_onexit cleanup
+
+log_assert "Many 'zpool create' and 'zpool destroy' must succeed concurrently."
+
+child_pids=""
+child_pools=""
+
+function zpool_stress
+{
+ typeset pool=$1
+ typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img"
+ typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img"
+ typeset -i iters=$2
+ typeset retry=10
+ typeset j=0
+
+ truncate -s $FILESIZE $vdev0
+ truncate -s $FILESIZE $vdev1
+
+ while [[ $j -lt $iters ]]; do
+ ((j = j + 1))
+ sleep 1
+
+ zpool create $pool $vdev0 $vdev1
+ if [ $? -ne 0 ]; then
+ return 1;
+ fi
+
+ # The 'zpool destroy' command is retried because it can
+ # transiently return EBUSY when blkid is concurrently
+ # probing new volumes and therefore has them open.
+ typeset k=0;
+ while [[ $k -lt $retry ]]; do
+ ((k = k + 1))
+
+ zpool destroy $pool
+ if [ $? -eq 0 ]; then
+ break;
+ elif [ $k -eq $retry ]; then
+ return 1;
+ fi
+
+ sleep 3
+ done
+ done
+
+ rm -f $vdev0 $vdev1
+ return 0
+}
+
+# 1. Create 128 process each of which create/destroy a pool 5 times.
+typeset i=0
+while [[ $i -lt 128 ]]; do
+ typeset uuid=$(uuidgen | cut -c1-13)
+
+ zpool_stress $TESTPOOL-$uuid 5 &
+ typeset pid=$!
+
+ child_pids="$child_pids $pid"
+ child_pools="$child_pools $TESTPOOL-$uuid"
+ ((i = i + 1))
+done
+
+# 2. Allow all process to run to completion.
+wait
+
+# 3. Verify all pools and their vdevs were destroyed.
+for pool in $child_pools; do
+ typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img"
+ typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img"
+
+ if poolexists $pool; then
+ log_fail "pool $pool exists"
+ fi
+
+ if [ -e $vdev0 ]; then
+ log_fail "pool vdev $vdev0 exists"
+ fi
+
+ if [ -e $vdev1 ]; then
+ log_fail "pool vdev $vdev1 exists"
+ fi
+done
+
+log_pass "Many 'zpool create' and 'zpool destroy' must succeed concurrently."
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh
new file mode 100755
index 0000000000..d62b3afb8f
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh
@@ -0,0 +1,77 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 Datto, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Scrubs and self-healing should be able to repair data from additional
+# copies that may be stored.
+#
+#
+# STRATEGY:
+# 1. Create a dataset with copies=3
+# 2. Write a file to the dataset
+# 3. zinject errors into the first and second DVAs of that file
+# 4. Scrub and verify the scrub repaired all errors
+# 5. Read the file normally to check that self healing also works
+# 6. Remove the zinject handler
+# 7. Scrub again and confirm 0 bytes were scrubbed
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ destroy_dataset $TESTPOOL/$TESTFS2
+ log_must zinject -c all
+}
+log_onexit cleanup
+
+log_assert "Scrubs and self healing must work with additional copies"
+
+log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2
+typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2)
+log_must mkfile 10m $mntpnt/file
+log_must zpool sync $TESTPOOL
+
+log_must zinject -a -t data -C 0,1 -e io $mntpnt/file
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+log_must zinject -c all
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+
+zpool status
+
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must check_pool_status $TESTPOOL "scan" "repaired 0"
+
+log_pass "Scrubs and self healing work with additional copies"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh
new file mode 100755
index 0000000000..7a07e64334
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh
@@ -0,0 +1,133 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_reopen/zpool_reopen.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+# Scrubbing a pool with offline devices correctly preserves DTL entries
+#
+# STRATEGY:
+# 1. Create the pool
+# 2. Offline the first device
+# 3. Write to the pool
+# 4. Scrub the pool
+# 5. Online the first device and offline the second device
+# 6. Scrub the pool again
+# 7. Verify data integrity
+#
+# NOTE:
+# Ported from script used to reproduce issue #5806
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+ log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4
+}
+
+#
+# Update to [online|offline] $device status on $pool synchronously
+#
+function zpool_do_sync # <status> <pool> <device>
+{
+ status="$1"
+ pool="$2"
+ device="$3"
+
+ if [[ $status != "online" && $status != "offline" ]]; then
+ log_fail "zpool_do_sync: invalid status $status"
+ fi
+
+ log_must zpool $status $pool $device
+ for i in {1..10}; do
+ check_state $pool $device $status && return 0
+ done
+ log_fail "Failed to $status device $device"
+}
+
+#
+# Start a scrub on $pool and wait for its completion
+#
+function zpool_scrub_sync # <pool>
+{
+ pool="$1"
+
+ log_must zpool scrub $pool
+ while ! is_pool_scrubbed $pool; do
+ sleep 1
+ done
+}
+
+log_assert "Scrubbing a pool with offline devices correctly preserves DTLs"
+log_onexit cleanup
+
+DEVSIZE='128m'
+FILESIZE='100m'
+TESTDIR="$TEST_BASE_DIR/zpool_scrub_offline_device"
+DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
+DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
+DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"
+RESILVER_TIMEOUT=40
+
+# 1. Create the pool
+log_must truncate -s $DEVSIZE $DISK1
+log_must truncate -s $DEVSIZE $DISK2
+log_must truncate -s $DEVSIZE $DISK3
+log_must truncate -s $DEVSIZE $DISK4
+poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \
+ raidz2 $DISK1 $DISK2 $DISK3 $DISK4
+
+# 2. Offline the first device
+zpool_do_sync 'offline' $TESTPOOL2 $DISK1
+
+# 3. Write to the pool
+log_must mkfile $FILESIZE "$TESTDIR/data.bin"
+
+# 4. Scrub the pool
+zpool_scrub_sync $TESTPOOL2
+
+# 5. Online the first device and offline the second device
+zpool_do_sync 'online' $TESTPOOL2 $DISK1
+zpool_do_sync 'offline' $TESTPOOL2 $DISK2
+log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT
+
+# 6. Scrub the pool again
+zpool_scrub_sync $TESTPOOL2
+
+# 7. Verify data integrity
+cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
+if [[ $cksum != 0 ]]; then
+ log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)"
+fi
+
+log_pass "Scrubbing a pool with offline devices correctly preserves DTLs"
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 00bd1498a2..ca82195178 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -249,24 +249,43 @@ typedef enum {
*/
typedef struct scan_io {
/* fields from blkptr_t */
- uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
- uint32_t sio_asize;
+ uint32_t sio_nr_dvas;
/* fields from zio_t */
- int sio_flags;
+ uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
- avl_node_t sio_addr_node; /* link into issueing queue */
+ avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow this for the variable number of elements.
+ */
+ dva_t sio_dva[0];
} scan_io_t;
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
@@ -275,6 +294,7 @@ struct dsl_scan_io_queue {
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
void
scan_init(void)
@@ -328,14 +368,22 @@ scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) sprintf(name, "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
}
void
scan_fini(void)
{
- kmem_cache_destroy(sio_cache);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
}
static inline boolean_t
@@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
}
static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
}
int
@@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
+ /* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
+ sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
break;
}
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
- scan_io_t srch_sio, *sio, *next_sio;
+ scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
@@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- srch_sio.sio_offset = rs->rs_start;
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+ while (sio != NULL &&
+ SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
- bytes_issued += sio->sio_asize;
+ bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
@@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
+ SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
@@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg)
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
- seg_start = first_sio->sio_offset;
+ seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
@@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
}
/*
@@ -3426,7 +3501,7 @@ static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
@@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
@@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y)
* based on LBA-order (from lowest to highest).
*/
static int
-io_addr_compare(const void *x, const void *y)
+sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
+ return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
+ q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
- avl_create(&q->q_sios_by_addr, io_addr_compare,
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
return (q);
@@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_free(sio, sizeof (*sio));
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
}
+ ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
@@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
+ scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
@@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
return;
}
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
@@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
* be done with issuing the zio's it gathered and will
* signal us.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio != NULL) {
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
+ sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
mutex_exit(q_lock);
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 1c004f87f3..d0b9f6960f 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
- if (metaslab_lba_weighting_enabled) {
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 7a44ac86b0..0a5cec2644 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -2036,6 +2036,7 @@ spa_init(int mode)
dmu_init();
zil_init();
vdev_cache_stat_init();
+ vdev_mirror_stat_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -2052,6 +2053,7 @@ spa_fini(void)
spa_evict_all();
vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 4ff552447e..53b9e4ef5d 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
+/* vdev mirror */
+extern void vdev_mirror_stat_init(void);
+extern void vdev_mirror_stat_fini(void);
+
/* Initialization and termination */
extern void spa_init(int flags);
extern void spa_fini(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 0c0bc874c1..e21989641b 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 4e1b09c27d..a91927dbb6 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -225,6 +225,7 @@ struct vdev {
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 824d1d8bb7..70916c45b7 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -318,13 +318,15 @@ typedef struct zinject_record {
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
- uint32_t zi_pad;
+ uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
#define ZINJECT_FLUSH_ARC 0x2
#define ZINJECT_UNLOAD_SPA 0x4
+#define ZI_NO_DVA (-1)
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index c7dca83777..4971e9e79e 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1478,19 +1478,27 @@ vdev_open_children(vdev_t *vd)
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
+retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
- return;
+ } else {
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ if (tq == NULL)
+ goto retry_sync;
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+
+ taskq_destroy(tq);
}
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- for (int c = 0; c < children; c++)
- VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
- TQ_SLEEP) != TASKQID_INVALID);
+ vd->vdev_nonrot = B_TRUE;
- taskq_destroy(tq);
+ for (int c = 0; c < children; c++)
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}
/*
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 93462ee2ba..3f137c5d59 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -606,6 +606,16 @@ skip_open:
*/
vd->vdev_nowritecache = B_FALSE;
+ /* Inform the ZIO pipeline that we are non-rotational */
+ vd->vdev_nonrot = B_FALSE;
+ if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state")) {
+ if (ldi_prop_get_int(dvd->vd_lh,
+ LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state", B_FALSE) != 0)
+ vd->vdev_nonrot = B_TRUE;
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index 3aaebe8505..806716200a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vattr_t vattr;
int error;
+ /* Rotational optimizations only make sense on block devices */
+ vd->vdev_nonrot = B_TRUE;
+
/*
* We must have a pathname, and it must be absolute.
*/
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index f489bb1967..f654bf9afb 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -38,6 +38,65 @@
#include <sys/fs/zfs.h>
/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
* Virtual device vector for mirroring.
*/
@@ -45,48 +104,182 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
int mm_children;
int mm_resilvering;
- int mm_preferred;
int mm_root;
- mirror_child_t mm_child[1];
+ mirror_child_t mm_child[];
} mirror_map_t;
int vdev_mirror_shift = 21;
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs which are more likely
+ * to have a higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
- vdev_mirror_map_free,
- zio_vsd_default_cksum_report
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
};
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering i.e.
+ * vdev_resilver_txg != 0, because in testing, performance was slightly
+ * worse overall when resilvering with it than without it.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
- int c, d;
+ int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = NULL;
dva_t dva_copy[SPA_DVAS_PER_BP];
- c = BP_GET_NDVAS(zio->io_bp);
+ if (spa->spa_dsl_pool != NULL) {
+ scn = spa->spa_dsl_pool->dp_scan;
+ }
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ scn != NULL &&
+ scn->scn_is_sorted &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool)) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
/*
* If we do not trust the pool config, some DVAs might be
@@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio)
}
}
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_resilvering = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
- int replacing;
-
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
/*
* If we are resilvering, then we should handle scrub reads
* differently; we shouldn't issue them to the resilvering
@@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio)
* automatically removed from the pool after the user replaces
* the device that originally failed.
*/
- replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- /*
- * If a spa load is in progress, then spa_dsl_pool may be
- * uninitialized. But we shouldn't be resilvering during a spa
- * load anyway.
- */
- if (replacing &&
- (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
- dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
- mm->mm_resilvering = B_TRUE;
- } else {
- mm->mm_resilvering = B_FALSE;
- }
-
- mm->mm_preferred = mm->mm_resilvering ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
@@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio)
}
mutex_exit(&zio->io_lock);
}
+
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
@@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio)
}
/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+ * which could lead to wear leveling issues on SSD's, we
+ * use the I/O offset as a pseudo random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * preferring vdevs based on determined load.
+ *
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
*/
@@ -284,43 +490,64 @@ static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
uint64_t txg = zio->io_txg;
- int i, c;
+ int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_readable() returning B_FALSE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (!vdev_readable(mc->mc_vd)) {
+
+ if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
- return (c);
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
}
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
- for (c = 0; c < mm->mm_children; c++)
+ for (c = 0; c < mm->mm_children; c++) {
if (!mm->mm_child[c].mc_tried)
return (c);
+ }
/*
* Every child failed. There's no place left to look.
@@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio)
mirror_child_t *mc;
int c, children;
- mm = vdev_mirror_map_alloc(zio);
+ mm = vdev_mirror_map_init(zio);
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 0643c05f57..a89e06ebbf 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+
+ vq->vq_last_offset = 0;
}
void
@@ -701,7 +703,7 @@ again:
*/
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
- search.io_offset = vq->vq_last_offset + 1;
+ search.io_offset = vq->vq_last_offset - 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
@@ -729,7 +731,7 @@ again:
}
vdev_queue_pending_add(vq, zio);
- vq->vq_last_offset = zio->io_offset;
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
return (zio);
}
@@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
+ spa_t *spa = zio->io_spa;
+ zio_priority_t oldpri = zio->io_priority;
+
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0);
+ spa->spa_queue_stats[oldpri].spa_queued--;
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ mutex_exit(&spa->spa_iokstat_lock);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
+
+/*
+ * As these two methods are only used for load calculations we're not
+ * concerned if we get an incorrect value on 32bit platforms due to lack of
+ * vq_lock mutex use here, instead we prefer to keep it lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index 26f59af996..71b859bc3d 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -102,7 +102,7 @@ static int inject_next_id = 1;
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
@@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
return (record->zi_freq == 0 ||
spa_get_random(100) < record->zi_freq);
+ }
return (B_FALSE);
}
@@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
rw_exit(&inject_lock);
}
+
+/*
+ * If this is a physical I/O for a vdev child determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
@@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
- /* If this handler matches, return EIO */
+ /* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
+ zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}