diff options
author | Tom Caputi <tcaputi@datto.com> | 2019-03-15 17:14:31 -0400 |
---|---|---|
committer | Toomas Soome <tsoome@me.com> | 2019-05-13 23:49:15 +0300 |
commit | 12a8814c13fbb1d6d58616cf090ea5815dc107f9 (patch) | |
tree | 3f1b36f6702e76bf3b0636d6c3d9a8943d06470c /usr/src | |
parent | a3874b8b1fe5103fc1f961609557c0587435fec0 (diff) | |
download | illumos-gate-12a8814c13fbb1d6d58616cf090ea5815dc107f9.tar.gz |
10566 Multiple DVA Scrubbing Fix
Portions contributed by: Toomas Soome <tsoome@me.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
22 files changed, 1077 insertions, 187 deletions
diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c index efae04675e..1c0b3199bd 100644 --- a/usr/src/cmd/zinject/zinject.c +++ b/usr/src/cmd/zinject/zinject.c @@ -47,48 +47,48 @@ * * This form of the command looks like: * - * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool + * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool * * * DATA FAULTS * * We begin with a tuple of the form: * - * <type,level,range,object> + * <type,level,range,object> * - * type A string describing the type of data to target. Each type - * implicitly describes how to interpret 'object'. Currently, - * the following values are supported: + * type A string describing the type of data to target. Each type + * implicitly describes how to interpret 'object'. Currently, + * the following values are supported: * - * data User data for a file - * dnode Dnode for a file or directory + * data User data for a file + * dnode Dnode for a file or directory * * The following MOS objects are special. Instead of injecting * errors on a particular object or blkid, we inject errors across * all objects of the given type. * - * mos Any data in the MOS - * mosdir object directory - * config pool configuration - * bpobj blkptr list - * spacemap spacemap - * metaslab metaslab - * errlog persistent error log + * mos Any data in the MOS + * mosdir object directory + * config pool configuration + * bpobj blkptr list + * spacemap spacemap + * metaslab metaslab + * errlog persistent error log * - * level Object level. Defaults to '0', not applicable to all types. If - * a range is given, this corresponds to the indirect block - * corresponding to the specific range. + * level Object level. Defaults to '0', not applicable to all types. If + * a range is given, this corresponds to the indirect block + * corresponding to the specific range. * * range A numerical range [start,end) within the object. Defaults to * the full size of the file. * - * object A string describing the logical location of the object. For - * files and directories (currently the only supported types), - * this is the path of the object on disk. + * object A string describing the logical location of the object. For + * files and directories (currently the only supported types), + * this is the path of the object on disk. * * This is translated, via libzpool, into the following internal representation: * - * <type,objset,object,level,range> + * <type,objset,object,level,range> * * These types should be self-explanatory. This tuple is then passed to the * kernel via a special ioctl() to initiate fault injection for the given @@ -98,12 +98,12 @@ * * The command itself takes one of the forms: * - * zinject - * zinject <-a | -u pool> - * zinject -c <id|all> - * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] + * zinject + * zinject <-a | -u pool> + * zinject -c <id|all> + * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] * [-r range] <object> - * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool + * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool * * With no arguments, the command prints all currently registered injection * handlers, with their numeric identifiers. @@ -288,8 +288,8 @@ usage(void) "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexidecimal, and only one block can be specified.\n" "\n" - "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" - "\t [-a] [-m] [-u] [-f freq] <object>\n" + "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" + "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" @@ -297,7 +297,10 @@ usage(void) "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be either 'io' or\n" - "\t\t\t'checksum'. Default is 'io'.\n" + "\t\t\t'checksum', or 'decompress'. Default is 'io'.\n" + "\t\t-C\tInject the given error only into specific DVAs. The\n" + "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" + "\t\t\tseparated by commas (ex. '0,2').\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" @@ -358,17 +361,19 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", - "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); + (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s ", + "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", + "LVL", "DVAs", "RANGE"); (void) printf("--- --------------- ------ " - "------ -------- --- ---------------\n"); + "------ -------- --- ---- ----------------\n"); } *count += 1; - (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, - (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, - type_to_name(record->zi_type), record->zi_level); + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", + id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, type_to_name(record->zi_type), + record->zi_level, record->zi_dvas); if (record->zi_start == 0 && record->zi_end == -1ULL) @@ -598,6 +603,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record, (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); + (void) printf(" dvas: 0x%x\n", record->zi_dvas); } } @@ -649,6 +655,59 @@ parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) return (0); } +/* + * This function converts a string specifier for DVAs into a bit mask. + * The dva's provided by the user should be 0 indexed and separated by + * a comma. For example: + * "1" -> 0b0010 (0x2) + * "0,1" -> 0b0011 (0x3) + * "0,1,2" -> 0b0111 (0x7) + */ +static int +parse_dvas(const char *str, uint32_t *dvas_out) +{ + const char *c = str; + uint32_t mask = 0; + boolean_t need_delim = B_FALSE; + + /* max string length is 5 ("0,1,2") */ + if (strlen(str) > 5 || strlen(str) == 0) + return (EINVAL); + + while (*c != '\0') { + switch (*c) { + case '0': + case '1': + case '2': + /* check for pipe between DVAs */ + if (need_delim) + return (EINVAL); + + /* check if this DVA has been set already */ + if (mask & (1 << ((*c) - '0'))) + return (EINVAL); + + mask |= (1 << ((*c) - '0')); + need_delim = B_TRUE; + break; + case ',': + need_delim = B_FALSE; + break; + default: + /* check for invalid character */ + return (EINVAL); + } + c++; + } + + /* check for dangling delimiter */ + if (!need_delim) + return (EINVAL); + + *dvas_out = mask; + return (0); +} + int main(int argc, char **argv) { @@ -675,6 +734,7 @@ main(int argc, char **argv) int dur_secs = 0; int ret; int flags = 0; + uint32_t dvas = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "internal error: failed to " @@ -705,7 +765,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -728,6 +788,17 @@ main(int argc, char **argv) case 'c': cancel = optarg; break; + case 'C': + ret = parse_dvas(optarg, &dvas); + if (ret != 0) { + (void) fprintf(stderr, "invalid DVA list '%s': " + "DVAs should be 0 indexed and separated by " + "commas.\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'd': device = optarg; break; @@ -887,7 +958,8 @@ main(int argc, char **argv) * '-c' is invalid with any other options. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); @@ -919,7 +991,8 @@ main(int argc, char **argv) * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + dvas != 0) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); @@ -953,7 +1026,8 @@ main(int argc, char **argv) } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || - record.zi_cmd != ZINJECT_UNINITIALIZED) { + record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); @@ -983,7 +1057,8 @@ main(int argc, char **argv) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || device != NULL) { + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); @@ -1002,6 +1077,15 @@ main(int argc, char **argv) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "hardware failure (-I) " + "incompatible with other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); @@ -1055,6 +1139,18 @@ main(int argc, char **argv) return (1); } + if (dvas != 0) { + if (error == EACCES || error == EINVAL) { + (void) fprintf(stderr, "the '-C' option may " + "not be used with logical data errors " + "'decrypt' and 'decompress'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_dvas = dvas; + } + record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index bd2df5aec1..aeb7288efc 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -1449,6 +1449,9 @@ file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg \ mode=0555 file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos \ + mode=0555 +file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos \ mode=0555 file \ @@ -1750,6 +1753,12 @@ file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos \ mode=0555 file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device \ + mode=0555 +file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing \ mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_001_pos \ diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index 295620102a..bedd71c4b4 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -1747,6 +1747,24 @@ function wait_for_degraded } # +# Wait for a pool to be scrubbed +# +# $1 pool name +# $2 number of seconds to wait (optional) +# +# Returns true when pool has been scrubbed, or false if there's a timeout or if +# no scrub was done. +# +function wait_scrubbed +{ + typeset pool=${1:-$TESTPOOL} + while true ; do + is_pool_scrubbed $pool && break + log_must sleep 1 + done +} + +# # Use create_pool()/destroy_pool() to clean up the infomation in # in the given disk to avoid slice overlapping. # diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index d501a4d2a0..2c5e4529b1 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -259,6 +259,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', + 'zpool_create_024_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_tempname'] @@ -342,7 +343,7 @@ tests = ['zpool_replace_001_neg'] [/opt/zfs-tests/tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', - 'zpool_scrub_004_pos', 'zpool_scrub_005_pos'] + 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies'] [/opt/zfs-tests/tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg'] diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index c908e3868c..255a8d0cb0 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -252,6 +252,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', + 'zpool_create_024_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_tempname'] @@ -312,7 +313,7 @@ tests = ['zpool_replace_001_neg'] [/opt/zfs-tests/tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', - 'zpool_scrub_004_pos', 'zpool_scrub_005_pos'] + 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies'] [/opt/zfs-tests/tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg'] diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 83fbf29375..4005a19b11 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -252,6 +252,7 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_015_neg', 'zpool_create_016_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', + 'zpool_create_024_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_tempname'] @@ -312,7 +313,7 @@ tests = ['zpool_replace_001_neg'] [/opt/zfs-tests/tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', - 'zpool_scrub_004_pos', 'zpool_scrub_005_pos'] + 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_multiple_copies'] [/opt/zfs-tests/tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg'] diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh new file mode 100644 index 0000000000..5b464c3c24 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016, Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.cfg + +# +# DESCRIPTION: +# Many 'zpool create' and 'zpool destroy' must succeed concurrently. +# +# STRATEGY: +# 1. Create N process each of which create/destroy a pool M times. +# 2. Allow all process to run to completion. +# 3. Verify all pools and their vdevs were destroyed. +# + +verify_runnable "global" + +if is_32bit; then + log_unsupported "Test case runs slowly on 32 bit" +fi + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid 2>/dev/null + done + fi + + if [[ -n "$child_pools" ]]; then + for pool in $child_pools; do + typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img" + typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img" + + if poolexists $pool; then + destroy_pool $pool + fi + + rm -f $vdev0 $vdev1 + done + fi +} + +log_onexit cleanup + +log_assert "Many 'zpool create' and 'zpool destroy' must succeed concurrently." + +child_pids="" +child_pools="" + +function zpool_stress +{ + typeset pool=$1 + typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img" + typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img" + typeset -i iters=$2 + typeset retry=10 + typeset j=0 + + truncate -s $FILESIZE $vdev0 + truncate -s $FILESIZE $vdev1 + + while [[ $j -lt $iters ]]; do + ((j = j + 1)) + sleep 1 + + zpool create $pool $vdev0 $vdev1 + if [ $? -ne 0 ]; then + return 1; + fi + + # The 'zfs destroy' command is retried because it can + # transiently return EBUSY when blkid is concurrently + # probing new volumes and therefore has them open. + typeset k=0; + while [[ $k -lt $retry ]]; do + ((k = k + 1)) + + zpool destroy $pool + if [ $? -eq 0 ]; then + break; + elif [ $k -eq $retry ]; then + return 1; + fi + + sleep 3 + done + done + + rm -f $vdev0 $vdev1 + return 0 +} + +# 1. Create 128 process each of which create/destroy a pool 5 times. +typeset i=0 +while [[ $i -lt 128 ]]; do + typeset uuid=$(uuidgen | cut -c1-13) + + zpool_stress $TESTPOOL-$uuid 5 & + typeset pid=$! + + child_pids="$child_pids $pid" + child_pools="$child_pools $TESTPOOL-$uuid" + ((i = i + 1)) +done + +# 2. Allow all process to run to completion. +wait + +# 3. Verify all pools and their vdevs were destroyed. +for pool in $child_pools; do + typeset vdev0="$TEST_BASE_DIR/$pool-vdev0.img" + typeset vdev1="$TEST_BASE_DIR/$pool-vdev1.img" + + if poolexists $pool; then + log_fail "pool $pool exists" + fi + + if [ -e $vdev0 ]; then + log_fail "pool vdev $vdev0 exists" + fi + + if [ -e $vdev1 ]; then + log_fail "pool vdev $vdev1 exists" + fi +done + +log_pass "Many 'zpool create' and 'zpool destroy' must succeed concurrently." diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh new file mode 100755 index 0000000000..d62b3afb8f --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Scrubs and self-healing should be able to repair data from additional +# copies that may be stored. +# +# +# STRATEGY: +# 1. Create a dataset with copies=3 +# 2. Write a file to the dataset +# 3. zinject errors into the first and second DVAs of that file +# 4. Scrub and verify the scrub repaired all errors +# 7. Read the file normally to check that self healing also works +# 8. Remove the zinject handler +# 9. Scrub again and confirm 0 bytes were scrubbed +# + +verify_runnable "global" + +function cleanup +{ + destroy_dataset $TESTPOOL/$TESTFS2 + log_must zinject -c all +} +log_onexit cleanup + +log_assert "Scrubs and self healing must work with additional copies" + +log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2 +typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2) +log_must mkfile 10m $mntpnt/file +log_must zpool sync $TESTPOOL + +log_must zinject -a -t data -C 0,1 -e io $mntpnt/file + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must zinject -c all + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +zpool status + +log_must check_pool_status $TESTPOOL "errors" "No known data errors" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "scan" "repaired 0" + +log_pass "Scrubs and self healing work with additional copies" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh new file mode 100755 index 0000000000..7a07e64334 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh @@ -0,0 +1,133 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_reopen/zpool_reopen.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Scrubbing a pool with offline devices correctly preserves DTL entries +# +# STRATEGY: +# 1. Create the pool +# 2. Offline the first device +# 3. Write to the pool +# 4. Scrub the pool +# 5. Online the first device and offline the second device +# 6. Scrub the pool again +# 7. Verify data integrity +# +# NOTE: +# Ported from script used to reproduce issue #5806 +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 + log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4 +} + +# +# Update to [online|offline] $device status on $pool synchronously +# +function zpool_do_sync # <status> <pool> <device> +{ + status="$1" + pool="$2" + device="$3" + + if [[ $status != "online" && $status != "offline" ]]; then + log_fail "zpool_do_sync: invalid status $status" + fi + + log_must zpool $status $pool $device + for i in {1..10}; do + check_state $pool $device $status && return 0 + done + log_fail "Failed to $status device $device" +} + +# +# Start a scrub on $pool and wait for its completion +# +function zpool_scrub_sync # <pool> +{ + pool="$1" + + log_must zpool scrub $pool + while ! is_pool_scrubbed $pool; do + sleep 1 + done +} + +log_assert "Scrubbing a pool with offline devices correctly preserves DTLs" +log_onexit cleanup + +DEVSIZE='128m' +FILESIZE='100m' +TESTDIR="$TEST_BASE_DIR/zpool_scrub_offline_device" +DISK1="$TEST_BASE_DIR/zpool_disk1.dat" +DISK2="$TEST_BASE_DIR/zpool_disk2.dat" +DISK3="$TEST_BASE_DIR/zpool_disk3.dat" +DISK4="$TEST_BASE_DIR/zpool_disk4.dat" +RESILVER_TIMEOUT=40 + +# 1. Create the pool +log_must truncate -s $DEVSIZE $DISK1 +log_must truncate -s $DEVSIZE $DISK2 +log_must truncate -s $DEVSIZE $DISK3 +log_must truncate -s $DEVSIZE $DISK4 +poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 +log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \ + raidz2 $DISK1 $DISK2 $DISK3 $DISK4 + +# 2. Offline the first device +zpool_do_sync 'offline' $TESTPOOL2 $DISK1 + +# 3. Write to the pool +log_must mkfile $FILESIZE "$TESTDIR/data.bin" + +# 4. Scrub the pool +zpool_scrub_sync $TESTPOOL2 + +# 5. Online the first device and offline the second device +zpool_do_sync 'online' $TESTPOOL2 $DISK1 +zpool_do_sync 'offline' $TESTPOOL2 $DISK2 +log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT + +# 6. Scrub the pool again +zpool_scrub_sync $TESTPOOL2 + +# 7. Verify data integrity +cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') +if [[ $cksum != 0 ]]; then + log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)" +fi + +log_pass "Scrubbing a pool with offline devices correctly preserves DTLs" diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 00bd1498a2..ca82195178 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -249,24 +249,43 @@ typedef enum { */ typedef struct scan_io { /* fields from blkptr_t */ - uint64_t sio_offset; uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; - uint32_t sio_asize; + uint32_t sio_nr_dvas; /* fields from zio_t */ - int sio_flags; + uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { - avl_node_t sio_addr_node; /* link into issueing queue */ + avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; + + /* + * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, + * depending on how many were in the original bp. Only the + * first DVA is really used for sorting and issuing purposes. + * The other DVAs (if provided) simply exist so that the zio + * layer can find additional copies to repair from in the + * event of an error. This array must go at the end of the + * struct to allow this for the variable number of elements. + */ + dva_t sio_dva[0]; } scan_io_t; +#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) +#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) +#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) +#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) +#define SIO_GET_END_OFFSET(sio) \ + (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) +#define SIO_GET_MUSED(sio) \ + (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) + struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ @@ -275,6 +294,7 @@ struct dsl_scan_io_queue { range_tree_t *q_exts_by_addr; avl_tree_t q_exts_by_size; avl_tree_t q_sios_by_addr; + uint64_t q_sio_memused; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); -static kmem_cache_t *sio_cache; +static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; + +/* sio->sio_nr_dvas must be set so we know which cache to free from */ +static void +sio_free(scan_io_t *sio) +{ + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); +} + +/* It is up to the caller to set sio->sio_nr_dvas for freeing */ +static scan_io_t * +sio_alloc(unsigned short nr_dvas) +{ + ASSERT3U(nr_dvas, >, 0); + ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); + + return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); +} void scan_init(void) @@ -328,14 +368,22 @@ scan_init(void) */ fill_weight = zfs_scan_fill_weight; - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + char name[36]; + + (void) sprintf(name, "sio_cache_%d", i); + sio_cache[i] = kmem_cache_create(name, + (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), + 0, NULL, NULL, NULL, NULL, NULL, 0); + } } void scan_fini(void) { - kmem_cache_destroy(sio_cache); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + kmem_cache_destroy(sio_cache[i]); + } } static inline boolean_t @@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp) } static inline void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +sio2bp(const scan_io_t *sio, blkptr_t *bp) { bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; + + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; + sio->sio_nr_dvas = BP_GET_NDVAS(bp); + + /* + * Copy the DVAs to the sio. We need all copies of the block so + * that the self healing code can use the alternate copies if the + * first is corrupted. We want the DVA at index dva_i to be first + * in the sio since this is the primary one that we want to issue. + */ + for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { + sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; + } } int @@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ + /* # extents in exts_by_size = # in exts_by_addr */ mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); + sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) break; } - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; + sio2bp(sio, &bp); + bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); @@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { - scan_io_t srch_sio, *sio, *next_sio; + scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; @@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - srch_sio.sio_offset = rs->rs_start; + srch_sio = sio_alloc(1); + srch_sio->sio_nr_dvas = 1; + SIO_SET_OFFSET(srch_sio, rs->rs_start); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + while (sio != NULL && + SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); - bytes_issued += sio->sio_asize; + bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; @@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && sio->sio_offset < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); + SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { @@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg) first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); - seg_end = last_sio->sio_offset + last_sio->sio_asize; + seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) - seg_start = first_sio->sio_offset; + seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the @@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { + /* + * Update the spa's stats on how many bytes we have issued. + * Sequential scrubs create a zio for each DVA of the bp. Each + * of these will include all DVAs for repair purposes, but the + * zio code will only try the first one unless there is an issue. + * Therefore, we should only count the first DVA for these IOs. + */ + if (scn->scn_is_sorted) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); + DVA_GET_ASIZE(&bp->blk_dva[0])); + } else { + spa_t *spa = scn->scn_dp->dp_spa; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } } /* @@ -3426,7 +3501,7 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); + queue->q_sio_memused += SIO_GET_MUSED(sio); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); } /* @@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); + scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); scan_io_queue_insert_impl(queue, sio); } @@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y) * based on LBA-order (from lowest to highest). */ static int -io_addr_compare(const void *x, const void *y) +sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); + return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ @@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; + q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); - avl_create(&q->q_sios_by_addr, io_addr_compare, + avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); @@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_free(sio, sizeof (*sio)); + SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); + bytes_dequeued += SIO_GET_ASIZE(sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + sio_free(sio); } + ASSERT0(queue->q_sio_memused); atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); @@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; + scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; @@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) return; } - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; + srch_sio = sio_alloc(BP_GET_NDVAS(bp)); + bp2sio(bp, srch_sio, dva_i); + start = SIO_GET_OFFSET(srch_sio); + size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: @@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) * be done with issuing the zio's it gathered and will * signal us. */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio != NULL) { - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); @@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); + sio2bp(sio, &tmpbp); count_block(scn, dp->dp_blkstats, &tmpbp); - kmem_free(sio, sizeof (*sio)); + sio_free(sio); } mutex_exit(q_lock); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 1c004f87f3..d0b9f6960f 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - if (metaslab_lba_weighting_enabled) { + if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 7a44ac86b0..0a5cec2644 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -2036,6 +2036,7 @@ spa_init(int mode) dmu_init(); zil_init(); vdev_cache_stat_init(); + vdev_mirror_stat_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -2052,6 +2053,7 @@ spa_fini(void) spa_evict_all(); vdev_cache_stat_fini(); + vdev_mirror_stat_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 4ff552447e..53b9e4ef5d 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); +/* vdev mirror */ +extern void vdev_mirror_stat_init(void); +extern void vdev_mirror_stat_fini(void); + /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 0c0bc874c1..e21989641b 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); +extern int vdev_queue_length(vdev_t *vd); +extern uint64_t vdev_queue_last_offset(vdev_t *vd); + extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 4e1b09c27d..a91927dbb6 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -225,6 +225,7 @@ struct vdev { vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ + boolean_t vdev_nonrot; /* true if solid state */ int vdev_open_error; /* error on last open */ kthread_t *vdev_open_thread; /* thread opening children */ uint64_t vdev_crtxg; /* txg when top-level was added */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 824d1d8bb7..70916c45b7 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -318,13 +318,15 @@ typedef struct zinject_record { uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; - uint32_t zi_pad; + uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +#define ZI_NO_DVA (-1) + typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index c7dca83777..4971e9e79e 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -1478,19 +1478,27 @@ vdev_open_children(vdev_t *vd) * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { +retry_sync: for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); - return; + } else { + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + if (tq == NULL) + goto retry_sync; + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, + vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); + + taskq_destroy(tq); } - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], - TQ_SLEEP) != TASKQID_INVALID); + vd->vdev_nonrot = B_TRUE; - taskq_destroy(tq); + for (int c = 0; c < children; c++) + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 93462ee2ba..3f137c5d59 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -606,6 +606,16 @@ skip_open: */ vd->vdev_nowritecache = B_FALSE; + /* Inform the ZIO pipeline that we are non-rotational */ + vd->vdev_nonrot = B_FALSE; + if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + "device-solid-state")) { + if (ldi_prop_get_int(dvd->vd_lh, + LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + "device-solid-state", B_FALSE) != 0) + vd->vdev_nonrot = B_TRUE; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 3aaebe8505..806716200a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, vattr_t vattr; int error; + /* Rotational optimizations only make sense on block devices */ + vd->vdev_nonrot = B_TRUE; + /* * We must have a pathname, and it must be absolute. */ diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index f489bb1967..f654bf9afb 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -38,6 +38,65 @@ #include <sys/fs/zfs.h> /* + * Vdev mirror kstats + */ +static kstat_t *mirror_ksp = NULL; + +typedef struct mirror_stats { + kstat_named_t vdev_mirror_stat_rotating_linear; + kstat_named_t vdev_mirror_stat_rotating_offset; + kstat_named_t vdev_mirror_stat_rotating_seek; + kstat_named_t vdev_mirror_stat_non_rotating_linear; + kstat_named_t vdev_mirror_stat_non_rotating_seek; + + kstat_named_t vdev_mirror_stat_preferred_found; + kstat_named_t vdev_mirror_stat_preferred_not_found; +} mirror_stats_t; + +static mirror_stats_t mirror_stats = { + /* New I/O follows directly the last I/O */ + { "rotating_linear", KSTAT_DATA_UINT64 }, + /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */ + { "rotating_offset", KSTAT_DATA_UINT64 }, + /* New I/O requires random seek */ + { "rotating_seek", KSTAT_DATA_UINT64 }, + /* New I/O follows directly the last I/O (nonrot) */ + { "non_rotating_linear", KSTAT_DATA_UINT64 }, + /* New I/O requires random seek (nonrot) */ + { "non_rotating_seek", KSTAT_DATA_UINT64 }, + /* Preferred child vdev found */ + { "preferred_found", KSTAT_DATA_UINT64 }, + /* Preferred child vdev not found or equal load */ + { "preferred_not_found", KSTAT_DATA_UINT64 }, + +}; + +#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64) +#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val) +#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1) + +void +vdev_mirror_stat_init(void) +{ + mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats", + "misc", KSTAT_TYPE_NAMED, + sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (mirror_ksp != NULL) { + mirror_ksp->ks_data = &mirror_stats; + kstat_install(mirror_ksp); + } +} + +void +vdev_mirror_stat_fini(void) +{ + if (mirror_ksp != NULL) { + kstat_delete(mirror_ksp); + mirror_ksp = NULL; + } +} + +/* * Virtual device vector for mirroring. */ @@ -45,48 +104,182 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; int mm_children; int mm_resilvering; - int mm_preferred; int mm_root; - mirror_child_t mm_child[1]; + mirror_child_t mm_child[]; } mirror_map_t; int vdev_mirror_shift = 21; +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results + * as it will direct more reads to the non-rotating vdevs which are more likely + * to have a higher performance. + */ + +/* Rotating media load calculation configuration. */ +static int zfs_vdev_mirror_rotating_inc = 0; +static int zfs_vdev_mirror_rotating_seek_inc = 5; +static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024; + +/* Non-rotating media load calculation configuration. */ +static int zfs_vdev_mirror_non_rotating_inc = 0; +static int zfs_vdev_mirror_non_rotating_seek_inc = 1; + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof (int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_resilvering = resilvering; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return (mm); +} + static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { - vdev_mirror_map_free, - zio_vsd_default_cksum_report + .vsd_free = vdev_mirror_map_free, + .vsd_cksum_report = zio_vsd_default_cksum_report }; +static int +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) +{ + uint64_t last_offset; + int64_t offset_diff; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Fix zio_offset for leaf vdevs */ + if (vd->vdev_ops->vdev_op_leaf) + zio_offset += VDEV_LABEL_START_SIZE; + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + last_offset = vdev_queue_last_offset(vd); + + if (vd->vdev_nonrot) { + /* Non-rotating media. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear); + return (load + zfs_vdev_mirror_non_rotating_inc); + } + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O's can be aggregated into fewer operations on + * the device, thus avoiding unnecessary per-command overhead + * and boosting performance. + */ + MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek); + return (load + zfs_vdev_mirror_non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_linear); + return (load + zfs_vdev_mirror_rotating_inc); + } + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O issued to this vdev as they should incur less + * of a seek increment. + */ + offset_diff = (int64_t)(last_offset - zio_offset); + if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_offset); + return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); + } + + /* Apply the full seek increment to all other I/O's. */ + MIRROR_BUMP(vdev_mirror_stat_rotating_seek); + return (load + zfs_vdev_mirror_rotating_seek_inc); +} + static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) +vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; - int c, d; + int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; + dsl_scan_t *scn = NULL; dva_t dva_copy[SPA_DVAS_PER_BP]; - c = BP_GET_NDVAS(zio->io_bp); + if (spa->spa_dsl_pool != NULL) { + scn = spa->spa_dsl_pool->dp_scan; + } + /* + * The sequential scrub code sorts and issues all DVAs + * of a bp separately. Each of these IOs includes all + * original DVA copies so that repairs can be performed + * in the event of an error, but we only actually want + * to check the first DVA since the others will be + * checked by their respective sorted IOs. Only if we + * hit an error will we try all DVAs upon retrying. + * + * Note: This check is safe even if the user switches + * from a legacy scrub to a sequential one in the middle + * of processing, since scn_is_sorted isn't updated until + * all outstanding IOs from the previous scrub pass + * complete. + */ + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + !(zio->io_flags & ZIO_FLAG_IO_RETRY) && + scn != NULL && + scn->scn_is_sorted && + dsl_scan_scrubbing(spa->spa_dsl_pool)) { + c = 1; + } else { + c = BP_GET_NDVAS(zio->io_bp); + } /* * If we do not trust the pool config, some DVAs might be @@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio) } } - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_resilvering = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. - */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - int replacing; - - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering @@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio) * automatically removed from the pool after the user replaces * the device that originally failed. */ - replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - /* - * If a spa load is in progress, then spa_dsl_pool may be - * uninitialized. But we shouldn't be resilvering during a spa - * load anyway. - */ - if (replacing && - (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) && - dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) { - mm->mm_resilvering = B_TRUE; - } else { - mm->mm_resilvering = B_FALSE; - } - - mm->mm_preferred = mm->mm_resilvering ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; - mm->mm_root = B_FALSE; - + boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) && + spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && + dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); + mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, + B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; @@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio) } mutex_exit(&zio->io_lock); } + abd_free(zio->io_abd); mc->mc_error = zio->io_error; @@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio) } /* + * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int p) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int preferred; + int c; + + preferred = mm->mm_preferred[p]; + for (p--; p >= 0; p--) { + c = mm->mm_preferred[p]; + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, p)); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. + */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * prefering vdevs based on determined load. + * * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. */ @@ -284,43 +490,64 @@ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; uint64_t txg = zio->io_txg; - int i, c; + int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_readable() returning B_FALSE), don't even try. - */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (!vdev_readable(mc->mc_vd)) { + + if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) - return (c); - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_found); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_not_found); + return (vdev_mirror_preferred_child_randomize(zio)); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ - for (c = 0; c < mm->mm_children; c++) + for (c = 0; c < mm->mm_children; c++) { if (!mm->mm_child[c].mc_tried) return (c); + } /* * Every child failed. There's no place left to look. @@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_alloc(zio); + mm = vdev_mirror_map_init(zio); if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 0643c05f57..a89e06ebbf 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd) avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + + vq->vq_last_offset = 0; } void @@ -701,7 +703,7 @@ again: */ tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; - search.io_offset = vq->vq_last_offset + 1; + search.io_offset = vq->vq_last_offset - 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) @@ -729,7 +731,7 @@ again: } vdev_queue_pending_add(vq, zio); - vq->vq_last_offset = zio->io_offset; + vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } @@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) */ tree = vdev_queue_class_tree(vq, zio->io_priority); if (avl_find(tree, zio, NULL) == zio) { + spa_t *spa = zio->io_spa; + zio_priority_t oldpri = zio->io_priority; + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + + mutex_enter(&spa->spa_iokstat_lock); + ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0); + spa->spa_queue_stats[oldpri].spa_queued--; + spa->spa_queue_stats[zio->io_priority].spa_queued++; + mutex_exit(&spa->spa_iokstat_lock); } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } + +/* + * As these two methods are only used for load calculations we're not + * concerned if we get an incorrect value on 32bit platforms due to lack of + * vq_lock mutex use here, instead we prefer to keep it lock free for + * performance. + */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_last_offset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_last_offset); +} diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 26f59af996..71b859bc3d 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -102,7 +102,7 @@ static int inject_next_id = 1; * Returns true if the given record matches the I/O in progress. */ static boolean_t -zio_match_handler(zbookmark_phys_t *zb, uint64_t type, +zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* @@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - error == record->zi_error) + (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + error == record->zi_error) { return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); + } return (B_FALSE); } @@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) rw_exit(&inject_lock); } + +/* + * If this is a physical I/O for a vdev child determine which DVA it is + * for. We iterate backwards through the DVAs matching on the offset so + * that we end up with ZI_NO_DVA (-1) if we don't find a match. + */ +static int +zio_match_dva(zio_t *zio) +{ + int i = ZI_NO_DVA; + + if (zio->io_bp != NULL && zio->io_vd != NULL && + zio->io_child_type == ZIO_CHILD_VDEV) { + for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { + dva_t *dva = &zio->io_bp->blk_dva[i]; + uint64_t off = DVA_GET_OFFSET(dva); + vdev_t *vd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(dva)); + + /* Compensate for vdev label added to leaves */ + if (zio->io_vd->vdev_ops->vdev_op_leaf) + off += VDEV_LABEL_START_SIZE; + + if (zio->io_vd == vd && zio->io_offset == off) + break; + } + } + + return (i); +} + + /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. @@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error) handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; - /* If this handler matches, return EIO */ + /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { + zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } |