diff options
author | Mike Gerdts <mike.gerdts@joyent.com> | 2019-06-11 04:05:22 +0000 |
---|---|---|
committer | Dan McDonald <danmcd@joyent.com> | 2019-06-21 18:03:31 -0400 |
commit | b73ccab03ec36581b1ae5945ef1fee1d06c79ccf (patch) | |
tree | f1c4869b6f166226abc7054f914f0aab7f9cbb81 | |
parent | 29d33d4278d4226572dfc214775224334bc444a6 (diff) | |
download | illumos-joyent-b73ccab03ec36581b1ae5945ef1fee1d06c79ccf.tar.gz |
9318 vol_volsize_to_reservation does not account for raidz skip blocks
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r-- | usr/src/cmd/zfs/zfs_main.c | 7 | ||||
-rw-r--r-- | usr/src/lib/libzfs/common/libzfs.h | 3 | ||||
-rw-r--r-- | usr/src/lib/libzfs/common/libzfs_dataset.c | 189 | ||||
-rw-r--r-- | usr/src/pkg/manifests/system-test-zfstest.mf | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/runfiles/delphix.run | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/runfiles/omnios.run | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/runfiles/openindiana.run | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh | 197 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh | 130 |
9 files changed, 523 insertions, 15 deletions
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c index ca11041cba..283f3ff044 100644 --- a/usr/src/cmd/zfs/zfs_main.c +++ b/usr/src/cmd/zfs/zfs_main.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -876,10 +876,11 @@ zfs_do_create(int argc, char **argv) zpool_close(zpool_handle); goto error; } - zpool_close(zpool_handle); - volsize = zvol_volsize_to_reservation(volsize, real_props); + volsize = zvol_volsize_to_reservation(zpool_handle, volsize, + real_props); nvlist_free(real_props); + zpool_close(zpool_handle); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index 3dc5454c48..af5e5c35d5 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -671,7 +671,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *, extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); -extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); +extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, + nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index 1b2bf860e2..2ca09e51d4 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. @@ -1514,6 +1514,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; + zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, @@ -1528,7 +1529,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); - if ((zvol_volsize_to_reservation(old_volsize, props) != + if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); @@ -1539,7 +1540,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) fnvlist_free(props); return (-1); } - new_reservation = zvol_volsize_to_reservation(new_volsize, props); + new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), @@ -1594,7 +1595,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } - resvsize = zvol_volsize_to_reservation(volsize, props); + resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, + props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); @@ -5111,12 +5113,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) } /* - * Convert the zvol's volume size to an appropriate reservation. + * The theory of raidz space accounting + * + * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block + * will "reference" 128KB, even though it allocates more than that, to store the + * parity information (and perhaps skip sectors). This concept of the + * "referenced" (and other DMU space accounting) being lower than the allocated + * space by a constant factor is called "raidz deflation." + * + * As mentioned above, the constant factor for raidz deflation assumes a 128KB + * block size. However, zvols typically have a much smaller block size (default + * 8KB). These smaller blocks may require proportionally much more parity + * information (and perhaps skip sectors). In this case, the change to the + * "referenced" property may be much more than the logical block size. + * + * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written + * as follows. + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D8 | D16 | D24 | + * | P1 | D1 | D9 | D17 | D25 | + * | P2 | D2 | D10 | D18 | D26 | + * | P3 | D3 | D11 | D19 | D27 | + * | P4 | D4 | D12 | D20 | D28 | + * | P5 | D5 | D13 | D21 | D29 | + * | P6 | D6 | D14 | D22 | D30 | + * | P7 | D7 | D15 | D23 | D31 | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data + * sectors. The dataset's referenced will increase by 128k and the pool's + * allocated and free properties will be adjusted by 160k. + * + * A 4k block written to the same raidz vdev will require two 4k sectors. The + * blank cells represent unallocated space. + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | | | | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that the 4k block required one sector for parity and another + * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated + * and free properties will be adjusted by 8k. The dataset will not be charged + * 8k. Rather, it will be charged a value that is scaled according to the + * overhead of the 128k block on the same vdev. This 8k allocation will be + * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as + * calculated in the 128k block example above. + * + * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That + * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 + * allocations are a multiple of 3 sectors, and raidz3 allocations are a + * multiple of of 4 sectors. When a block does not fill the required number of + * sectors, skip blocks (sectors) are used. + * + * An 8k block being written to a raidz vdev may be written as follows: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | | + * +-------+-------+-------+-------+-------+ + * + * In order to maintain the nparity+1 allocation size, a skip block (S0) was + * added. For this 8k block, the pool's allocated and free properties are + * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / + * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in + * the 128k block example above. + * + * Compression may lead to a variety of block sizes being written for the same + * volume or file. There is no clear way to reserve just the amount of space + * that will be required, so the worst case (no compression) is assumed. + * Note that metadata blocks will typically be compressed, so the reservation + * size returned by zvol_volsize_to_reservation() will generally be slightly + * larger than the maximum that the volume can reference. + */ + +/* + * Derived from function of same name in uts/common/fs/zfs/vdev_raidz.c. + * Returns the amount of space (in bytes) that will be allocated for the + * specified block size. Note that the "referenced" space accounted will be less + * than this, but not necessarily equal to "blksize", due to RAIDZ deflation. + */ +static uint64_t +vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + uint64_t asize, ndata; + + ASSERT3U(ndisks, >, nparity); + ndata = ndisks - nparity; + asize = ((blksize - 1) >> ashift) + 1; + asize += nparity * ((asize + ndata - 1) / ndata); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +/* + * Determine how much space will be allocated if it lands on the most space- + * inefficient top-level vdev. Returns the size in bytes required to store one + * copy of the volume data. See theory comment above. + */ +static uint64_t +volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) +{ + nvlist_t *config, *tree, **vdevs; + uint_t nvdevs, v; + uint64_t ret = 0; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (nblocks * blksize); + } + + for (v = 0; v < nvdevs; v++) { + char *type; + uint64_t nparity, ashift, asize, tsize; + nvlist_t **disks; + uint_t ndisks; + uint64_t volsize; + + if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, + &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || + nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, + &nparity) != 0 || + nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, + &ashift) != 0 || + nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, + &disks, &ndisks) != 0) { + continue; + } + + /* allocation size for the "typical" 128k block */ + tsize = vdev_raidz_asize(ndisks, nparity, ashift, + SPA_OLD_MAXBLOCKSIZE); + /* allocation size for the blksize block */ + asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); + + /* + * Scale this size down as a ratio of 128k / tsize. See theory + * statement above. + */ + volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; + if (volsize > ret) { + ret = volsize; + } + } + + if (ret == 0) { + ret = nblocks * blksize; + } + + return (ret); +} + +/* + * Convert the zvol's volume size to an appropriate reservation. See theory + * comment above. + * * Note: If this routine is updated, it is necessary to update the ZFS test - * suite's shell version in reservation.kshlib. + * suite's shell version in reservation.shlib. */ uint64_t -zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) +zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, + nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; @@ -5132,7 +5298,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; - nblocks = volsize/volblocksize; + + nblocks = volsize / volblocksize; + /* + * Metadata defaults to using 128k blocks, not volblocksize blocks. For + * this reason, only the data blocks are scaled based on vdev config. + */ + volsize = volsize_from_vdevs(zph, nblocks, volblocksize); + /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index a29f2c6bc8..bf1e75980b 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -2521,6 +2521,9 @@ file path=opt/zfs-tests/tests/functional/refreserv/refreserv_002_pos mode=0555 file path=opt/zfs-tests/tests/functional/refreserv/refreserv_003_pos mode=0555 file path=opt/zfs-tests/tests/functional/refreserv/refreserv_004_pos mode=0555 file path=opt/zfs-tests/tests/functional/refreserv/refreserv_005_pos mode=0555 +file path=opt/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz \ + mode=0555 +file path=opt/zfs-tests/tests/functional/refreserv/refreserv_raidz mode=0555 file path=opt/zfs-tests/tests/functional/refreserv/setup mode=0555 file path=opt/zfs-tests/tests/functional/removal/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/removal/removal.kshlib mode=0444 diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index dbee1a5433..738fe89309 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', [/opt/zfs-tests/tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', - 'refreserv_004_pos', 'refreserv_005_pos'] + 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz', + 'refreserv_multi_raidz'] [/opt/zfs-tests/tests/functional/removal] pre = diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index 875b529e9e..926f65cb20 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', [/opt/zfs-tests/tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', - 'refreserv_004_pos', 'refreserv_005_pos'] + 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz', + 'refreserv_multi_raidz'] [/opt/zfs-tests/tests/functional/removal] pre = diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index f8c0c40328..f86d6d9a7b 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', [/opt/zfs-tests/tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', - 'refreserv_004_pos', 'refreserv_005_pos'] + 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz', + 'refreserv_multi_raidz'] [/opt/zfs-tests/tests/functional/removal] pre = diff --git a/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh new file mode 100644 index 0000000000..ec5e0bfc27 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh @@ -0,0 +1,197 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/refreserv/refreserv.cfg + +# +# DESCRIPTION: +# raidz refreservation=auto picks worst raidz vdev +# +# STRATEGY: +# 1. Create a pool with a single raidz vdev +# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# - create a volume +# - remember its refreservation +# - destroy the volume +# 3. Destroy the pool +# 4. Recreate the pool with one more disk in the vdev, then repeat steps +# 2 and 3. +# +# NOTES: +# 1. This test will use up to 14 disks but can cover the key concepts with +# 5 disks. +# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# + +verify_runnable "global" + +typeset -a alldisks=($DISKS) + +# The larger the volsize, the better zvol_volsize_to_reservation() is at +# guessing the right number - though it is horrible with tiny blocks. At 10M on +# ashift=12, the estimate may be over 26% too high. +volsize=100 + +function cleanup +{ + default_cleanup_noexit + default_setup_noexit "${alldisks[0]}" +} + +log_assert "raidz refreservation=auto picks worst raidz vdev" +log_onexit cleanup + +poolexists "$TESTPOOL" && log_must zpool destroy "$TESTPOOL" + +# Testing tiny block sizes on ashift=12 pools causes so much size inflation +# that small test disks may fill before creating small volumes. However, +# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for +# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. +bps=$(prtvtoc /dev/rdsk/${alldisks[0]} | + awk '$NF == "bytes/sector" { print $2; exit 0 }') +case "$bps" in +512) + allshifts=(9 10 17) + ;; +4096) + allshifts=(12 13 17) + ;; +*) + log_fail "bytes/sector != (512|4096)" + ;; +esac +log_note "Testing in ashift=${allshifts[0]} mode" + +typeset -A sizes= + +# +# Determine the refreservation for a $volsize MiB volume on each raidz type at +# various block sizes. +# +for parity in 1 2 3; do + raid=raidz$parity + typeset -A sizes["$raid"] + + # Ensure we hit scenarios with and without skip blocks + for ndisks in $((parity * 2)) $((parity * 2 + 1)); do + typeset -a disks=(${alldisks[0..$((ndisks - 1))]}) + + if (( ${#disks[@]} < ndisks )); then + log_note "Too few disks to test $raid-$ndisks" + continue + fi + + typeset -A sizes["$raid"]["$ndisks"] + + log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Gathering refreservation for $raid-$ndisks" \ + "volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m \ + -o volblocksize=$vbs "$vol" + + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test -n "$refres" + sizes["$raid"]["$ndisks"]["$vbs"]=$refres + + log_must zfs destroy "$vol" + done + + log_must zpool destroy "$TESTPOOL" + done +done + +# A little extra info is always helpful when diagnosing problems. To +# pretty-print what you find in the log, do this in ksh: +# typeset -A sizes=(...) +# print -v sizes +log_note "sizes=$(print -C sizes)" + +# +# Helper furnction for checking that refreservation is calculated properly in +# multi-vdev pools. "Properly" is defined as assuming that all vdevs are as +# space inefficient as the worst one. +# +function check_vdevs { + typeset raid=$1 + typeset nd1=$2 + typeset nd2=$3 + typeset -a disks1 disks2 + typeset vbs vol refres refres1 refres2 expect + + disks1=(${alldisks[0..$((nd1 - 1))]}) + disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]}) + if (( ${#disks2[@]} < nd2 )); then + log_note "Too few disks to test $raid-$nd1 + $raid=$nd2" + return + fi + + log_must zpool create -f "$TESTPOOL" \ + "$raid" "${disks1[@]}" "$raid" "${disks2[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol" + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test -n "$refres" + + refres1=${sizes["$raid"]["$nd1"]["$vbs"]} + refres2=${sizes["$raid"]["$nd2"]["$vbs"]} + + if (( refres1 > refres2 )); then + log_note "Expecting refres ($refres) to match refres" \ + "from $raid-$nd1 ($refres1)" + log_must test "$refres" -eq "$refres1" + else + log_note "Expecting refres ($refres) to match refres" \ + "from $raid-$nd1 ($refres2)" + log_must test "$refres" -eq "$refres2" + fi + + log_must zfs destroy "$vol" + done + + log_must zpool destroy "$TESTPOOL" +} + +# +# Verify that multi-vdev pools use the last optimistic size for all the +# permutations within a particular raidz variant. +# +for raid in "${!sizes[@]}"; do + # ksh likes to create a [0] item for us. Thanks, ksh! + [[ $raid == "0" ]] && continue + + for nd1 in "${!sizes["$raid"][@]}"; do + [[ $nd1 == "0" ]] && continue + + for nd2 in "${!sizes["$raid"][@]}"; do + [[ $nd2 == "0" ]] && continue + + check_vdevs "$raid" "$nd1" "$nd2" + done + done +done + +log_pass "raidz refreservation=auto picks worst raidz vdev" diff --git a/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh new file mode 100644 index 0000000000..8628d65e7e --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/refreserv/refreserv.cfg + +# +# DESCRIPTION: +# raidz refreservation=auto accounts for extra parity and skip blocks +# +# STRATEGY: +# 1. Create a pool with a single raidz vdev +# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# - create a volume +# - fully overwrite it +# - verify that referenced is less than or equal to reservation +# - destroy the volume +# 3. Destroy the pool +# 4. Recreate the pool with one more disk in the vdev, then repeat steps +# 2 and 3. +# 5. Repeat all steps above for raidz2 and raidz3. +# +# NOTES: +# 1. This test will use up to 14 disks but can cover the key concepts with +# 5 disks. +# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# + +verify_runnable "global" + +typeset -a alldisks=($DISKS) + +# The larger the volsize, the better zvol_volsize_to_reservation() is at +# guessing the right number. At 10M on ashift=12, the estimate may be over 26% +# too high. +volsize=100 + +function cleanup +{ + default_cleanup_noexit + default_setup_noexit "${alldisks[0]}" +} + +log_assert "raidz refreservation=auto accounts for extra parity and skip blocks" +log_onexit cleanup + +poolexists "$TESTPOOL" && log_must zpool destroy "$TESTPOOL" + +# Testing tiny block sizes on ashift=12 pools causes so much size inflation +# that small test disks may fill before creating small volumes. However, +# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for +# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. +bps=$(prtvtoc /dev/rdsk/${alldisks[0]} | + awk '$NF == "bytes/sector" { print $2; exit 0 }') +log_must test "$bps" -eq 512 -o "$bps" -eq 4096 +case "$bps" in +512) + allshifts=(9 10 17) + maxpct=151 + ;; +4096) + allshifts=(12 13 17) + maxpct=110 + ;; +*) + log_fail "bytes/sector != (512|4096)" + ;; +esac +log_note "Testing in ashift=${allshifts[0]} mode" + +# This loop handles all iterations of steps 1 through 4 described in strategy +# comment above, +for parity in 1 2 3; do + raid=raidz$parity + + # Ensure we hit scenarios with and without skip blocks + for ndisks in $((parity * 2)) $((parity * 2 + 1)); do + typeset -a disks=(${alldisks[0..$((ndisks - 1))]}) + + if (( ${#disks[@]} < ndisks )); then + log_note "Too few disks to test $raid-$ndisks" + continue + fi + + log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}" + + for bits in "${allshifts[@]}"; do + vbs=$((1 << bits)) + log_note "Testing $raid-$ndisks volblocksize=$vbs" + + vol=$TESTPOOL/$TESTVOL + log_must zfs create -V ${volsize}m \ + -o volblocksize=$vbs "$vol" + log_must dd if=/dev/zero of=/dev/zvol/dsk/$vol \ + bs=1024k count=$volsize + sync + + ref=$(zfs get -Hpo value referenced "$vol") + refres=$(zfs get -Hpo value refreservation "$vol") + log_must test -n "$ref" + log_must test -n "$refres" + + typeset -F2 deltapct=$((refres * 100.0 / ref)) + log_note "$raid-$ndisks refreservation $refres" \ + "is $deltapct% of reservation $res" + + log_must test "$ref" -le "$refres" + log_must test "$deltapct" -le $maxpct + + log_must zfs destroy "$vol" + done + + log_must zpool destroy "$TESTPOOL" + done +done + +log_pass "raidz refreservation=auto accounts for extra parity and skip blocks" |