summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Gerdts <mike.gerdts@joyent.com>2019-06-11 04:05:22 +0000
committerDan McDonald <danmcd@joyent.com>2019-06-21 18:03:31 -0400
commitb73ccab03ec36581b1ae5945ef1fee1d06c79ccf (patch)
treef1c4869b6f166226abc7054f914f0aab7f9cbb81
parent29d33d4278d4226572dfc214775224334bc444a6 (diff)
downloadillumos-joyent-b73ccab03ec36581b1ae5945ef1fee1d06c79ccf.tar.gz
9318 vol_volsize_to_reservation does not account for raidz skip blocks
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com> Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Matt Ahrens <matt@delphix.com> Reviewed by: Kody Kantor <kody.kantor@joyent.com> Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--usr/src/cmd/zfs/zfs_main.c7
-rw-r--r--usr/src/lib/libzfs/common/libzfs.h3
-rw-r--r--usr/src/lib/libzfs/common/libzfs_dataset.c189
-rw-r--r--usr/src/pkg/manifests/system-test-zfstest.mf3
-rw-r--r--usr/src/test/zfs-tests/runfiles/delphix.run3
-rw-r--r--usr/src/test/zfs-tests/runfiles/omnios.run3
-rw-r--r--usr/src/test/zfs-tests/runfiles/openindiana.run3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh197
-rw-r--r--usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh130
9 files changed, 523 insertions, 15 deletions
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index ca11041cba..283f3ff044 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -23,7 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@@ -876,10 +876,11 @@ zfs_do_create(int argc, char **argv)
zpool_close(zpool_handle);
goto error;
}
- zpool_close(zpool_handle);
- volsize = zvol_volsize_to_reservation(volsize, real_props);
+ volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
+ real_props);
nvlist_free(real_props);
+ zpool_close(zpool_handle);
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
&strval) != 0) {
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 3dc5454c48..af5e5c35d5 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -671,7 +671,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
-extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
+extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
+ nvlist_t *);
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index 1b2bf860e2..2ca09e51d4 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
@@ -1514,6 +1514,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
uint64_t new_reservation;
zfs_prop_t resv_prop;
nvlist_t *props;
+ zpool_handle_t *zph = zpool_handle(zhp);
/*
* If this is an existing volume, and someone is setting the volsize,
@@ -1528,7 +1529,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
- if ((zvol_volsize_to_reservation(old_volsize, props) !=
+ if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
old_reservation) || nvlist_exists(nvl,
zfs_prop_to_name(resv_prop))) {
fnvlist_free(props);
@@ -1539,7 +1540,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_free(props);
return (-1);
}
- new_reservation = zvol_volsize_to_reservation(new_volsize, props);
+ new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
fnvlist_free(props);
if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
@@ -1594,7 +1595,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
}
- resvsize = zvol_volsize_to_reservation(volsize, props);
+ resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
+ props);
fnvlist_free(props);
(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
@@ -5111,12 +5113,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
}
/*
- * Convert the zvol's volume size to an appropriate reservation.
+ * The theory of raidz space accounting
+ *
+ * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
+ * will "reference" 128KB, even though it allocates more than that, to store the
+ * parity information (and perhaps skip sectors). This concept of the
+ * "referenced" (and other DMU space accounting) being lower than the allocated
+ * space by a constant factor is called "raidz deflation."
+ *
+ * As mentioned above, the constant factor for raidz deflation assumes a 128KB
+ * block size. However, zvols typically have a much smaller block size (default
+ * 8KB). These smaller blocks may require proportionally much more parity
+ * information (and perhaps skip sectors). In this case, the change to the
+ * "referenced" property may be much more than the logical block size.
+ *
+ * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written
+ * as follows.
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * | P0 | D0 | D8 | D16 | D24 |
+ * | P1 | D1 | D9 | D17 | D25 |
+ * | P2 | D2 | D10 | D18 | D26 |
+ * | P3 | D3 | D11 | D19 | D27 |
+ * | P4 | D4 | D12 | D20 | D28 |
+ * | P5 | D5 | D13 | D21 | D29 |
+ * | P6 | D6 | D14 | D22 | D30 |
+ * | P7 | D7 | D15 | D23 | D31 |
+ * +-------+-------+-------+-------+-------+
+ *
+ * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
+ * sectors. The dataset's referenced will increase by 128k and the pool's
+ * allocated and free properties will be adjusted by 160k.
+ *
+ * A 4k block written to the same raidz vdev will require two 4k sectors. The
+ * blank cells represent unallocated space.
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * | P0 | D0 | | | |
+ * +-------+-------+-------+-------+-------+
+ *
+ * Above, notice that the 4k block required one sector for parity and another
+ * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
+ * and free properties will be adjusted by 8k. The dataset will not be charged
+ * 8k. Rather, it will be charged a value that is scaled according to the
+ * overhead of the 128k block on the same vdev. This 8k allocation will be
+ * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
+ * calculated in the 128k block example above.
+ *
+ * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
+ * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
+ * allocations are a multiple of 3 sectors, and raidz3 allocations are a
+ * multiple of of 4 sectors. When a block does not fill the required number of
+ * sectors, skip blocks (sectors) are used.
+ *
+ * An 8k block being written to a raidz vdev may be written as follows:
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * | P0 | D0 | D1 | S0 | |
+ * +-------+-------+-------+-------+-------+
+ *
+ * In order to maintain the nparity+1 allocation size, a skip block (S0) was
+ * added. For this 8k block, the pool's allocated and free properties are
+ * adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
+ * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
+ * the 128k block example above.
+ *
+ * Compression may lead to a variety of block sizes being written for the same
+ * volume or file. There is no clear way to reserve just the amount of space
+ * that will be required, so the worst case (no compression) is assumed.
+ * Note that metadata blocks will typically be compressed, so the reservation
+ * size returned by zvol_volsize_to_reservation() will generally be slightly
+ * larger than the maximum that the volume can reference.
+ */
+
+/*
+ * Derived from function of same name in uts/common/fs/zfs/vdev_raidz.c.
+ * Returns the amount of space (in bytes) that will be allocated for the
+ * specified block size. Note that the "referenced" space accounted will be less
+ * than this, but not necessarily equal to "blksize", due to RAIDZ deflation.
+ */
+static uint64_t
+vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+ uint64_t blksize)
+{
+ uint64_t asize, ndata;
+
+ ASSERT3U(ndisks, >, nparity);
+ ndata = ndisks - nparity;
+ asize = ((blksize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + ndata - 1) / ndata);
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
+
+/*
+ * Determine how much space will be allocated if it lands on the most space-
+ * inefficient top-level vdev. Returns the size in bytes required to store one
+ * copy of the volume data. See theory comment above.
+ */
+static uint64_t
+volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
+{
+ nvlist_t *config, *tree, **vdevs;
+ uint_t nvdevs, v;
+ uint64_t ret = 0;
+
+ config = zpool_get_config(zhp, NULL);
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
+ nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
+ &vdevs, &nvdevs) != 0) {
+ return (nblocks * blksize);
+ }
+
+ for (v = 0; v < nvdevs; v++) {
+ char *type;
+ uint64_t nparity, ashift, asize, tsize;
+ nvlist_t **disks;
+ uint_t ndisks;
+ uint64_t volsize;
+
+ if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
+ &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
+ nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
+ &nparity) != 0 ||
+ nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
+ &ashift) != 0 ||
+ nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
+ &disks, &ndisks) != 0) {
+ continue;
+ }
+
+ /* allocation size for the "typical" 128k block */
+ tsize = vdev_raidz_asize(ndisks, nparity, ashift,
+ SPA_OLD_MAXBLOCKSIZE);
+ /* allocation size for the blksize block */
+ asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
+
+ /*
+ * Scale this size down as a ratio of 128k / tsize. See theory
+ * statement above.
+ */
+ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
+ if (volsize > ret) {
+ ret = volsize;
+ }
+ }
+
+ if (ret == 0) {
+ ret = nblocks * blksize;
+ }
+
+ return (ret);
+}
+
+/*
+ * Convert the zvol's volume size to an appropriate reservation. See theory
+ * comment above.
+ *
* Note: If this routine is updated, it is necessary to update the ZFS test
- * suite's shell version in reservation.kshlib.
+ * suite's shell version in reservation.shlib.
*/
uint64_t
-zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
+zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
+ nvlist_t *props)
{
uint64_t numdb;
uint64_t nblocks, volblocksize;
@@ -5132,7 +5298,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize) != 0)
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
- nblocks = volsize/volblocksize;
+
+ nblocks = volsize / volblocksize;
+ /*
+ * Metadata defaults to using 128k blocks, not volblocksize blocks. For
+ * this reason, only the data blocks are scaled based on vdev config.
+ */
+ volsize = volsize_from_vdevs(zph, nblocks, volblocksize);
+
/* start with metadnode L0-L6 */
numdb = 7;
/* calculate number of indirects */
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index a29f2c6bc8..bf1e75980b 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -2521,6 +2521,9 @@ file path=opt/zfs-tests/tests/functional/refreserv/refreserv_002_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_003_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_004_pos mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/refreserv_005_pos mode=0555
+file path=opt/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/refreserv/refreserv_raidz mode=0555
file path=opt/zfs-tests/tests/functional/refreserv/setup mode=0555
file path=opt/zfs-tests/tests/functional/removal/cleanup mode=0555
file path=opt/zfs-tests/tests/functional/removal/removal.kshlib mode=0444
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index dbee1a5433..738fe89309 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
- 'refreserv_004_pos', 'refreserv_005_pos']
+ 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
+ 'refreserv_multi_raidz']
[/opt/zfs-tests/tests/functional/removal]
pre =
diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run
index 875b529e9e..926f65cb20 100644
--- a/usr/src/test/zfs-tests/runfiles/omnios.run
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run
@@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
- 'refreserv_004_pos', 'refreserv_005_pos']
+ 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
+ 'refreserv_multi_raidz']
[/opt/zfs-tests/tests/functional/removal]
pre =
diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run
index f8c0c40328..f86d6d9a7b 100644
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run
@@ -536,7 +536,8 @@ tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
[/opt/zfs-tests/tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
- 'refreserv_004_pos', 'refreserv_005_pos']
+ 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_raidz',
+ 'refreserv_multi_raidz']
[/opt/zfs-tests/tests/functional/removal]
pre =
diff --git a/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
new file mode 100644
index 0000000000..ec5e0bfc27
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh
@@ -0,0 +1,197 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/refreserv/refreserv.cfg
+
+#
+# DESCRIPTION:
+# raidz refreservation=auto picks worst raidz vdev
+#
+# STRATEGY:
+# 1. Create a pool with a single raidz vdev
+# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
+# - create a volume
+# - remember its refreservation
+# - destroy the volume
+# 3. Destroy the pool
+# 4. Recreate the pool with one more disk in the vdev, then repeat steps
+# 2 and 3.
+#
+# NOTES:
+# 1. This test will use up to 14 disks but can cover the key concepts with
+# 5 disks.
+# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
+#
+
+verify_runnable "global"
+
+typeset -a alldisks=($DISKS)
+
+# The larger the volsize, the better zvol_volsize_to_reservation() is at
+# guessing the right number - though it is horrible with tiny blocks. At 10M on
+# ashift=12, the estimate may be over 26% too high.
+volsize=100
+
+function cleanup
+{
+ default_cleanup_noexit
+ default_setup_noexit "${alldisks[0]}"
+}
+
+log_assert "raidz refreservation=auto picks worst raidz vdev"
+log_onexit cleanup
+
+poolexists "$TESTPOOL" && log_must zpool destroy "$TESTPOOL"
+
+# Testing tiny block sizes on ashift=12 pools causes so much size inflation
+# that small test disks may fill before creating small volumes. However,
+# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
+# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
+bps=$(prtvtoc /dev/rdsk/${alldisks[0]} |
+ awk '$NF == "bytes/sector" { print $2; exit 0 }')
+case "$bps" in
+512)
+ allshifts=(9 10 17)
+ ;;
+4096)
+ allshifts=(12 13 17)
+ ;;
+*)
+ log_fail "bytes/sector != (512|4096)"
+ ;;
+esac
+log_note "Testing in ashift=${allshifts[0]} mode"
+
+typeset -A sizes=
+
+#
+# Determine the refreservation for a $volsize MiB volume on each raidz type at
+# various block sizes.
+#
+for parity in 1 2 3; do
+ raid=raidz$parity
+ typeset -A sizes["$raid"]
+
+ # Ensure we hit scenarios with and without skip blocks
+ for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
+ typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
+
+ if (( ${#disks[@]} < ndisks )); then
+ log_note "Too few disks to test $raid-$ndisks"
+ continue
+ fi
+
+ typeset -A sizes["$raid"]["$ndisks"]
+
+ log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
+
+ for bits in "${allshifts[@]}"; do
+ vbs=$((1 << bits))
+ log_note "Gathering refreservation for $raid-$ndisks" \
+ "volblocksize=$vbs"
+
+ vol=$TESTPOOL/$TESTVOL
+ log_must zfs create -V ${volsize}m \
+ -o volblocksize=$vbs "$vol"
+
+ refres=$(zfs get -Hpo value refreservation "$vol")
+ log_must test -n "$refres"
+ sizes["$raid"]["$ndisks"]["$vbs"]=$refres
+
+ log_must zfs destroy "$vol"
+ done
+
+ log_must zpool destroy "$TESTPOOL"
+ done
+done
+
+# A little extra info is always helpful when diagnosing problems. To
+# pretty-print what you find in the log, do this in ksh:
+# typeset -A sizes=(...)
+# print -v sizes
+log_note "sizes=$(print -C sizes)"
+
+#
+# Helper furnction for checking that refreservation is calculated properly in
+# multi-vdev pools. "Properly" is defined as assuming that all vdevs are as
+# space inefficient as the worst one.
+#
+function check_vdevs {
+ typeset raid=$1
+ typeset nd1=$2
+ typeset nd2=$3
+ typeset -a disks1 disks2
+ typeset vbs vol refres refres1 refres2 expect
+
+ disks1=(${alldisks[0..$((nd1 - 1))]})
+ disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]})
+ if (( ${#disks2[@]} < nd2 )); then
+ log_note "Too few disks to test $raid-$nd1 + $raid=$nd2"
+ return
+ fi
+
+ log_must zpool create -f "$TESTPOOL" \
+ "$raid" "${disks1[@]}" "$raid" "${disks2[@]}"
+
+ for bits in "${allshifts[@]}"; do
+ vbs=$((1 << bits))
+ log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs"
+
+ vol=$TESTPOOL/$TESTVOL
+ log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol"
+ refres=$(zfs get -Hpo value refreservation "$vol")
+ log_must test -n "$refres"
+
+ refres1=${sizes["$raid"]["$nd1"]["$vbs"]}
+ refres2=${sizes["$raid"]["$nd2"]["$vbs"]}
+
+ if (( refres1 > refres2 )); then
+ log_note "Expecting refres ($refres) to match refres" \
+ "from $raid-$nd1 ($refres1)"
+ log_must test "$refres" -eq "$refres1"
+ else
+ log_note "Expecting refres ($refres) to match refres" \
+ "from $raid-$nd1 ($refres2)"
+ log_must test "$refres" -eq "$refres2"
+ fi
+
+ log_must zfs destroy "$vol"
+ done
+
+ log_must zpool destroy "$TESTPOOL"
+}
+
+#
+# Verify that multi-vdev pools use the last optimistic size for all the
+# permutations within a particular raidz variant.
+#
+for raid in "${!sizes[@]}"; do
+ # ksh likes to create a [0] item for us. Thanks, ksh!
+ [[ $raid == "0" ]] && continue
+
+ for nd1 in "${!sizes["$raid"][@]}"; do
+ [[ $nd1 == "0" ]] && continue
+
+ for nd2 in "${!sizes["$raid"][@]}"; do
+ [[ $nd2 == "0" ]] && continue
+
+ check_vdevs "$raid" "$nd1" "$nd2"
+ done
+ done
+done
+
+log_pass "raidz refreservation=auto picks worst raidz vdev"
diff --git a/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh
new file mode 100644
index 0000000000..8628d65e7e
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh
@@ -0,0 +1,130 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/refreserv/refreserv.cfg
+
+#
+# DESCRIPTION:
+# raidz refreservation=auto accounts for extra parity and skip blocks
+#
+# STRATEGY:
+# 1. Create a pool with a single raidz vdev
+# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
+# - create a volume
+# - fully overwrite it
+# - verify that referenced is less than or equal to reservation
+# - destroy the volume
+# 3. Destroy the pool
+# 4. Recreate the pool with one more disk in the vdev, then repeat steps
+# 2 and 3.
+# 5. Repeat all steps above for raidz2 and raidz3.
+#
+# NOTES:
+# 1. This test will use up to 14 disks but can cover the key concepts with
+# 5 disks.
+# 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
+#
+
+verify_runnable "global"
+
+typeset -a alldisks=($DISKS)
+
+# The larger the volsize, the better zvol_volsize_to_reservation() is at
+# guessing the right number. At 10M on ashift=12, the estimate may be over 26%
+# too high.
+volsize=100
+
+function cleanup
+{
+ default_cleanup_noexit
+ default_setup_noexit "${alldisks[0]}"
+}
+
+log_assert "raidz refreservation=auto accounts for extra parity and skip blocks"
+log_onexit cleanup
+
+poolexists "$TESTPOOL" && log_must zpool destroy "$TESTPOOL"
+
+# Testing tiny block sizes on ashift=12 pools causes so much size inflation
+# that small test disks may fill before creating small volumes. However,
+# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
+# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
+bps=$(prtvtoc /dev/rdsk/${alldisks[0]} |
+ awk '$NF == "bytes/sector" { print $2; exit 0 }')
+log_must test "$bps" -eq 512 -o "$bps" -eq 4096
+case "$bps" in
+512)
+ allshifts=(9 10 17)
+ maxpct=151
+ ;;
+4096)
+ allshifts=(12 13 17)
+ maxpct=110
+ ;;
+*)
+ log_fail "bytes/sector != (512|4096)"
+ ;;
+esac
+log_note "Testing in ashift=${allshifts[0]} mode"
+
+# This loop handles all iterations of steps 1 through 4 described in strategy
+# comment above,
+for parity in 1 2 3; do
+ raid=raidz$parity
+
+ # Ensure we hit scenarios with and without skip blocks
+ for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
+ typeset -a disks=(${alldisks[0..$((ndisks - 1))]})
+
+ if (( ${#disks[@]} < ndisks )); then
+ log_note "Too few disks to test $raid-$ndisks"
+ continue
+ fi
+
+ log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"
+
+ for bits in "${allshifts[@]}"; do
+ vbs=$((1 << bits))
+ log_note "Testing $raid-$ndisks volblocksize=$vbs"
+
+ vol=$TESTPOOL/$TESTVOL
+ log_must zfs create -V ${volsize}m \
+ -o volblocksize=$vbs "$vol"
+ log_must dd if=/dev/zero of=/dev/zvol/dsk/$vol \
+ bs=1024k count=$volsize
+ sync
+
+ ref=$(zfs get -Hpo value referenced "$vol")
+ refres=$(zfs get -Hpo value refreservation "$vol")
+ log_must test -n "$ref"
+ log_must test -n "$refres"
+
+ typeset -F2 deltapct=$((refres * 100.0 / ref))
+ log_note "$raid-$ndisks refreservation $refres" \
+ "is $deltapct% of reservation $res"
+
+ log_must test "$ref" -le "$refres"
+ log_must test "$deltapct" -le $maxpct
+
+ log_must zfs destroy "$vol"
+ done
+
+ log_must zpool destroy "$TESTPOOL"
+ done
+done
+
+log_pass "raidz refreservation=auto accounts for extra parity and skip blocks"