diff options
author | Gvozden Neskovic <neskovic@gmail.com> | 2020-06-01 06:10:07 -0600 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2020-06-01 15:05:58 -0600 |
commit | e86372a01d2d16a5dd4a64e144ed978ba17fe7dd (patch) | |
tree | bc899b99b0409baebbf09e9e392997a7a8596543 | |
parent | 82049ff560eed6fbdf4cf222d894467f5809f9b3 (diff) | |
download | illumos-joyent-e86372a01d2d16a5dd4a64e144ed978ba17fe7dd.tar.gz |
12668 ZFS support for vectorized algorithms on x86 (initial support)
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Jason King <jason.king@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
29 files changed, 4464 insertions, 192 deletions
diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime index 42964957d4..43978cdbd6 100644 --- a/exception_lists/check_rtime +++ b/exception_lists/check_rtime @@ -231,6 +231,7 @@ FORBIDDEN libfakekernel\.so FORBIDDEN_DEP usr/MACH(lib)/libzpool.so.1 FORBIDDEN_DEP usr/bin/amd64/ztest FORBIDDEN_DEP usr/bin/i86/ztest +FORBIDDEN_DEP usr/bin/raidz_test FORBIDDEN_DEP usr/bin/sparcv7/ztest FORBIDDEN_DEP usr/bin/sparcv9/ztest FORBIDDEN_DEP usr/lib/MACH(smbfs)/libfknsmb.so.1 diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 0f2cc306aa..7e0a681cd6 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -21,7 +21,7 @@ # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright 2019 Joyent, Inc. +# Copyright 2020 Joyent, Inc. # Copyright (c) 2012, 2015 by Delphix. All rights reserved. # Copyright (c) 2013 DEY Storage Systems, Inc. All rights reserved. # Copyright 2014 Garrett D'Amore <garrett@damore.org> @@ -346,6 +346,7 @@ COMMON_SUBDIRS= \ pwd \ pyzfs \ raidctl \ + raidz_test \ ramdiskadm \ rcap \ rcm_daemon \ diff --git a/usr/src/cmd/raidz_test/Makefile b/usr/src/cmd/raidz_test/Makefile new file mode 100644 index 0000000000..43e0c07829 --- /dev/null +++ b/usr/src/cmd/raidz_test/Makefile @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Joyent, Inc. +# + +include ../Makefile.cmd +include ../Makefile.cmd.64 + +PROG= raidz_test +OBJS= raidz_test.o raidz_bench.o +SRCS= $(OBJS:%.o=%.c) +POFILES= $(PROG:%=%.po) + +# No msg catalog here. 
+POFILE= + +LDLIBS += -lzpool -lfakekernel -lumem + +INCS += -I../../lib/libzpool/common +INCS += -I../../uts/common/fs/zfs + +CPPFLAGS.first = -I$(SRC)/lib/libfakekernel/common -D_FAKE_KERNEL +CPPFLAGS += -D_LARGEFILE64_SOURCE=1 +CPPFLAGS += $(INCS) + +CSTD = $(CSTD_GNU99) + +CERRWARN += -_gcc=-Wno-type-limits + +SMATCH=off + +.KEEP_STATE: + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $(PROG) $(OBJS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTPROG) + +clean: + $(RM) $(OBJS) + +_msg: $(MSGDOMAIN) $(POFILES) + $(CP) $(POFILES) $(MSGDOMAIN) + +$(MSGDOMAIN): + $(INS.dir) + +include ../Makefile.targ diff --git a/usr/src/cmd/raidz_test/raidz_bench.c b/usr/src/cmd/raidz_test/raidz_bench.c new file mode 100644 index 0000000000..9dc22af6fd --- /dev/null +++ b/usr/src/cmd/raidz_test/raidz_bench.c @@ -0,0 +1,228 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <stdio.h> +#include <strings.h> + +#include <sys/time.h> + +#include "raidz_test.h" + +#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32) +#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29) +#define BENCH_ASHIFT 12 +#define MIN_CS_SHIFT BENCH_ASHIFT +#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT + +static zio_t zio_bench; +static raidz_map_t *rm_bench; +static size_t max_data_size = SPA_MAXBLOCKSIZE; + +static void +bench_init_raidz_map(void) +{ + zio_bench.io_offset = 0; + zio_bench.io_size = max_data_size; + + /* + * To permit larger column sizes these have to be done + * allocated using aligned alloc instead of zio_abd_buf_alloc + */ + zio_bench.io_abd = raidz_alloc(max_data_size); + + init_zio_abd(&zio_bench); +} + +static void +bench_fini_raidz_maps(void) +{ + /* tear down golden zio */ + raidz_free(zio_bench.io_abd, max_data_size); + bzero(&zio_bench, sizeof (zio_t)); +} + +static inline void +run_gen_bench_impl(const char *impl) +{ + int fn, ncols; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + + /* Benchmark generate functions */ + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + fn + 1; + zio_bench.io_size = 1ULL << ds; + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + + /* estimate iteration count */ + iter_cnt = GEN_BENCH_MEMORY; + iter_cnt /= zio_bench.io_size; + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_generate_parity(rm_bench); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)disksize; + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + 
raidz_gen_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)(ncols), + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +void +run_gen_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_gen_bench_impl(*impl_name); + } +} + +static void +run_rec_bench_impl(const char *impl) +{ + int fn, ncols, nbad; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + static const int tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + PARITY_PQR; + zio_bench.io_size = 1ULL << ds; + + /* + * raidz block is too short to test + * the requested method + */ + if (zio_bench.io_size / rto_opts.rto_dcols < + (1ULL << BENCH_ASHIFT)) + continue; + + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + + /* estimate iteration count */ + iter_cnt = (REC_BENCH_MEMORY); + iter_cnt /= zio_bench.io_size; + + /* calculate how many bad columns there are */ + nbad = MIN(3, raidz_ncols(rm_bench) - + raidz_parity(rm_bench)); + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)(disksize); + d_bw /= (1024.0 * 1024.0 * 
elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_rec_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)ncols, + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +void +run_rec_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_rec_bench_impl(*impl_name); + } +} + +void +run_raidz_benchmark(void) +{ + bench_init_raidz_map(); + + run_gen_bench(); + run_rec_bench(); + + bench_fini_raidz_maps(); +} diff --git a/usr/src/cmd/raidz_test/raidz_test.c b/usr/src/cmd/raidz_test/raidz_test.c new file mode 100644 index 0000000000..8d025b479d --- /dev/null +++ b/usr/src/cmd/raidz_test/raidz_test.c @@ -0,0 +1,761 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + * Copyright 2020 Joyent, Inc. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <umem.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <assert.h> +#include <stdio.h> +#include <strings.h> +#include <unistd.h> +#include "raidz_test.h" + +static int *rand_data; +raidz_test_opts_t rto_opts; + +static char gdb[256]; +static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d"; + +#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) + +static void print_opts(raidz_test_opts_t *opts, boolean_t force) +{ + char *verbose; + switch (opts->rto_v) { + case 0: + verbose = "no"; + break; + case 1: + verbose = "info"; + break; + default: + verbose = "debug"; + break; + } + + if (force || opts->rto_v >= D_INFO) { + (void) fprintf(stdout, DBLSEP "Running with options:\n" + " (-a) zio ashift : %zu\n" + " (-o) zio offset : 1 << %zu\n" + " (-d) number of raidz data columns : %zu\n" + " (-s) size of DATA : 1 << %zu\n" + " (-S) sweep parameters : %s \n" + " (-v) verbose : %s \n\n", + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ + } +} + +static void usage(boolean_t requested) +{ + const raidz_test_opts_t *o = &rto_opts_defaults; + + FILE *fp = requested ? 
stdout : stderr; + + (void) fprintf(fp, "Usage:\n" + "\t[-a zio ashift (default: %zu)]\n" + "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" + "\t[-d number of raidz data columns (default: %zu)]\n" + "\t[-s zio size, exponent radix 2 (default: %zu)]\n" + "\t[-S parameter sweep (default: %s)]\n" + "\t[-t timeout for parameter sweep test]\n" + "\t[-B benchmark all raidz implementations]\n" + "\t[-v increase verbosity (default: %zu)]\n" + "\t[-h (print help)]\n" + "\t[-T test the test, see if failure would be detected]\n" + "\t[-D debug (attach gdb on SIGSEGV)]\n" + "", + o->rto_ashift, /* -a */ + ilog2(o->rto_offset), /* -o */ + o->rto_dcols, /* -d */ + ilog2(o->rto_dsize), /* -s */ + rto_opts.rto_sweep ? "yes" : "no", /* -S */ + o->rto_v); /* -d */ + + exit(requested ? 0 : 1); +} + +static void process_options(int argc, char **argv) +{ + size_t value; + int opt; + + raidz_test_opts_t *o = &rto_opts; + + bcopy(&rto_opts_defaults, o, sizeof (*o)); + + while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + value = 0; + + switch (opt) { + case 'a': + value = strtoull(optarg, NULL, 0); + o->rto_ashift = MIN(13, MAX(9, value)); + break; + case 'o': + value = strtoull(optarg, NULL, 0); + o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; + break; + case 'd': + value = strtoull(optarg, NULL, 0); + o->rto_dcols = MIN(255, MAX(1, value)); + break; + case 's': + value = strtoull(optarg, NULL, 0); + o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, + MAX(SPA_MINBLOCKSHIFT, value)); + break; + case 't': + value = strtoull(optarg, NULL, 0); + o->rto_sweep_timeout = value; + break; + case 'v': + o->rto_v++; + break; + case 'S': + o->rto_sweep = 1; + break; + case 'B': + o->rto_benchmark = 1; + break; + case 'D': + o->rto_gdb = 1; + break; + case 'T': + o->rto_sanity = 1; + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } +} + +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) +#define 
DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) + +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) +#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) + +static int +cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) +{ + int i, ret = 0; + + VERIFY(parity >= 1 && parity <= 3); + + for (i = 0; i < parity; i++) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i), + CODE_COL(rm, i)->abd_size) != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } + } + return (ret); +} + +static int +cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) +{ + int i, ret = 0; + int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + + for (i = 0; i < dcols; i++) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i), + DATA_COL(opts->rm_golden, i)->abd_size) != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } + } + return (ret); +} + +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *)data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + +static void +corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) +{ + int i; + raidz_col_t *col; + + for (i = 0; i < cnt; i++) { + col = &rm->rm_col[tgts[i]]; + (void) abd_iterate_func(col->rc_abd, 0, col->rc_size, + init_rand, NULL); + } +} + +void +init_zio_abd(zio_t *zio) +{ + (void) abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); +} + +static void +fini_raidz_map(zio_t **zio, raidz_map_t **rm) +{ + vdev_raidz_map_free(*rm); + raidz_free((*zio)->io_abd, (*zio)->io_size); + umem_free(*zio, sizeof (zio_t)); + + *zio = NULL; + *rm = NULL; +} + +static int +init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) +{ + int err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + const size_t total_ncols = opts->rto_dcols + parity; + + if (opts->rm_golden) { + 
fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + } + + opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; + opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; + + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); + + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); + + VERIFY0(vdev_raidz_impl_set("original")); + + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + + VERIFY(opts->zio_golden); + VERIFY(opts->rm_golden); + + vdev_raidz_generate_parity(opts->rm_golden); + vdev_raidz_generate_parity(rm_test); + + /* sanity check */ + err |= cmp_data(opts, rm_test); + err |= cmp_code(opts, rm_test, parity); + + if (err) + ERRMSG("initializing the golden copy ... 
[FAIL]!\n"); + + /* tear down raidz_map of test zio */ + fini_raidz_map(&zio_test, &rm_test); + + return (err); +} + +static raidz_map_t * +init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) +{ + raidz_map_t *rm = NULL; + const size_t alloc_dsize = opts->rto_dsize; + const size_t total_ncols = opts->rto_dcols + parity; + const int ccols[] = { 0, 1, 2 }; + + VERIFY(zio); + VERIFY(parity <= 3 && parity >= 1); + + *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + (*zio)->io_offset = 0; + (*zio)->io_size = alloc_dsize; + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); + + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + VERIFY(rm); + + /* Make sure code columns are destroyed */ + corrupt_colums(rm, ccols, parity); + + return (rm); +} + +static int +run_gen_check(raidz_test_opts_t *opts) +{ + char **impl_name; + int fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing parity generation...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (0 != vdev_raidz_impl_set(*impl_name)) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else { + LOG(D_INFO, "[SUPPORTED]\n"); + } + + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, fn+1); + VERIFY(rm_test); + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_gen_name[fn]); + + if (!opts->rto_sanity) + vdev_raidz_generate_parity(rm_test); + + if (cmp_code(opts, rm_test, fn+1) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + } else + LOG(D_INFO, "[PASS]\n"); + + fini_raidz_map(&zio_test, &rm_test); + } + } + + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); 
+ + return (err); +} + +static int +run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) +{ + int x0, x1, x2; + int tgtidx[3]; + int err = 0; + static const int rec_tgts[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); + + if (fn < RAIDZ_REC_PQ) { + /* can reconstruct 1 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d] ", x0); + + tgtidx[2] = x0 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+2, 1); + + if (!opts->rto_sanity) + (void) vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); + } + } + + } else if (fn < RAIDZ_REC_PQR) { + /* can reconstruct 2 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d] ", x0, x1); + + tgtidx[1] = x0 + raidz_parity(rm); + tgtidx[2] = x1 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+1, 2); + + if (!opts->rto_sanity) + (void) vdev_raidz_reconstruct(rm, + tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d %d]... 
" + "[FAIL]\n", x0, x1); + } + } + } + } else { + /* can reconstruct 3 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { + if (x2 >= + rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); + + tgtidx[0] = x0 + raidz_parity(rm); + tgtidx[1] = x1 + raidz_parity(rm); + tgtidx[2] = x2 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx, 3); + + if (!opts->rto_sanity) + (void) vdev_raidz_reconstruct( + rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, + "\nREC D[%d %d %d]... " + "[FAIL]\n", x0, x1, x2); + } + } + } + } + } + return (err); +} + +static int +run_rec_check(raidz_test_opts_t *opts) +{ + char **impl_name; + unsigned fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing data reconstruction...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (vdev_raidz_impl_set(*impl_name) != 0) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else + LOG(D_INFO, "[SUPPORTED]\n"); + + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); + /* generate parity */ + vdev_raidz_generate_parity(rm_test); + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_rec_name[fn]); + + if (run_rec_check_impl(opts, rm_test, fn) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + + } else + LOG(D_INFO, "[PASS]\n"); + + } + /* tear down test raidz_map */ + fini_raidz_map(&zio_test, &rm_test); + } + + 
fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_test(raidz_test_opts_t *opts) +{ + int err = 0; + + if (opts == NULL) + opts = &rto_opts; + + print_opts(opts, B_FALSE); + + err |= run_gen_check(opts); + err |= run_rec_check(opts); + + return (err); +} + +#define SWEEP_RUNNING 0 +#define SWEEP_FINISHED 1 +#define SWEEP_ERROR 2 +#define SWEEP_TIMEOUT 3 + +static int sweep_state = 0; +static raidz_test_opts_t failed_opts; + +static kmutex_t sem_mtx; +static kcondvar_t sem_cv; +static int max_free_slots; +static int free_slots; + +static void +sweep_thread(void *arg) +{ + int err = 0; + raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; + VERIFY(opts != NULL); + + err = run_test(opts); + + if (rto_opts.rto_sanity) { + /* 25% chance that a sweep test fails */ + if (rand() < (RAND_MAX/4)) + err = 1; + } + + if (0 != err) { + mutex_enter(&sem_mtx); + memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); + sweep_state = SWEEP_ERROR; + mutex_exit(&sem_mtx); + } + + umem_free(opts, sizeof (raidz_test_opts_t)); + + /* signal the next thread */ + mutex_enter(&sem_mtx); + free_slots++; + cv_signal(&sem_cv); + mutex_exit(&sem_mtx); + + thread_exit(); +} + +static int +run_sweep(void) +{ + static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; + static const size_t ashift_v[] = { 9, 12, 14 }; + static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), + 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; + + (void) setvbuf(stdout, NULL, _IONBF, 0); + + ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * + ARRAY_SIZE(dcols_v); + ulong_t tried_comb = 0; + hrtime_t time_diff, start_time = gethrtime(); + raidz_test_opts_t *opts; + int a, d, s; + + max_free_slots = free_slots = MAX(2, boot_ncpus); + + mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); + + for (s = 0; s < ARRAY_SIZE(size_v); s++) + for (a = 0; a < ARRAY_SIZE(ashift_v); a++) + for (d = 0; 
d < ARRAY_SIZE(dcols_v); d++) { + + if (size_v[s] < (1 << ashift_v[a])) { + total_comb--; + continue; + } + + if (++tried_comb % 20 == 0) + LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); + + /* wait for signal to start new thread */ + mutex_enter(&sem_mtx); + while (cv_timedwait_sig(&sem_cv, &sem_mtx, + ddi_get_lbolt() + hz)) { + + /* check if should stop the test (timeout) */ + time_diff = (gethrtime() - start_time) / NANOSEC; + if (rto_opts.rto_sweep_timeout > 0 && + time_diff >= rto_opts.rto_sweep_timeout) { + sweep_state = SWEEP_TIMEOUT; + rto_opts.rto_should_stop = B_TRUE; + mutex_exit(&sem_mtx); + goto exit; + } + + /* check if should stop the test (error) */ + if (sweep_state != SWEEP_RUNNING) { + mutex_exit(&sem_mtx); + goto exit; + } + + /* exit loop if a slot is available */ + if (free_slots > 0) { + break; + } + } + + free_slots--; + mutex_exit(&sem_mtx); + + opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); + opts->rto_ashift = ashift_v[a]; + opts->rto_dcols = dcols_v[d]; + opts->rto_offset = (1 << ashift_v[a]) * rand(); + opts->rto_dsize = size_v[s]; + opts->rto_v = 0; /* be quiet */ + + VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, + 0, NULL, TS_RUN, maxclsyspri), !=, NULL); + } + +exit: + LOG(D_ALL, "\nWaiting for test threads to finish...\n"); + mutex_enter(&sem_mtx); + VERIFY(free_slots <= max_free_slots); + while (free_slots < max_free_slots) { + (void) cv_wait(&sem_cv, &sem_mtx); + } + mutex_exit(&sem_mtx); + + if (sweep_state == SWEEP_ERROR) { + ERRMSG("Sweep test failed! Failed option: \n"); + print_opts(&failed_opts, B_TRUE); + } else { + if (sweep_state == SWEEP_TIMEOUT) + LOG(D_ALL, "Test timeout (%lus). Stopping...\n", + (ulong_t)rto_opts.rto_sweep_timeout); + + LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", + (ulong_t)tried_comb); + } + + mutex_destroy(&sem_mtx); + + return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); +} + +int +main(int argc, char **argv) +{ + size_t i; + int err = 0; + + /* init gdb string early */ + (void) sprintf(gdb, gdb_tmpl, getpid()); + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + dprintf_setup(&argc, argv); + + process_options(argc, argv); + + kernel_init(FREAD); + + /* setup random data because rand() is not reentrant */ + rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + srand((unsigned)time(NULL) * getpid()); + for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) + rand_data[i] = rand(); + + mprotect((void *)rand_data, SPA_MAXBLOCKSIZE, PROT_READ); + + if (rto_opts.rto_benchmark) { + run_raidz_benchmark(); + } else if (rto_opts.rto_sweep) { + err = run_sweep(); + } else { + err = run_test(NULL); + } + + umem_free(rand_data, SPA_MAXBLOCKSIZE); + kernel_fini(); + + return (err); +} diff --git a/usr/src/cmd/raidz_test/raidz_test.h b/usr/src/cmd/raidz_test/raidz_test.h new file mode 100644 index 0000000000..c91e521436 --- /dev/null +++ b/usr/src/cmd/raidz_test/raidz_test.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + * Copyright 2020 Joyent, Inc. 
+ */ + +#ifndef RAIDZ_TEST_H +#define RAIDZ_TEST_H + +#include <sys/spa.h> + +static const char *raidz_impl_names[] = { + "original", + "scalar", + "sse2", + "ssse3", + "avx2", + "avx512f", + "avx512bw", + "aarch64_neon", + "aarch64_neonx2", + "powerpc_altivec", + NULL +}; + +typedef struct raidz_test_opts { + size_t rto_ashift; + size_t rto_offset; + size_t rto_dcols; + size_t rto_dsize; + size_t rto_v; + size_t rto_sweep; + size_t rto_sweep_timeout; + size_t rto_benchmark; + size_t rto_sanity; + size_t rto_gdb; + + /* non-user options */ + boolean_t rto_should_stop; + + zio_t *zio_golden; + raidz_map_t *rm_golden; +} raidz_test_opts_t; + +static const raidz_test_opts_t rto_opts_defaults = { + .rto_ashift = 9, + .rto_offset = 1ULL << 0, + .rto_dcols = 8, + .rto_dsize = 1<<19, + .rto_v = 0, + .rto_sweep = 0, + .rto_benchmark = 0, + .rto_sanity = 0, + .rto_gdb = 0, + .rto_should_stop = B_FALSE +}; + +extern raidz_test_opts_t rto_opts; + +static inline size_t ilog2(size_t a) +{ + return (a > 1 ? 1 + ilog2(a >> 1) : 0); +} + + +#define D_ALL 0 +#define D_INFO 1 +#define D_DEBUG 2 + +#define LOG(lvl, a...) \ +{ \ + if (rto_opts.rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define LOG_OPT(lvl, opt, a...) \ +{ \ + if (opt->rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define ERRMSG(a...) (void) fprintf(stderr, a) + + +#define DBLSEP "================\n" +#define SEP "----------------\n" + + +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) + + +void init_zio_abd(zio_t *zio); + +void run_raidz_benchmark(void); + +#endif /* RAIDZ_TEST_H */ diff --git a/usr/src/pkg/manifests/system-file-system-zfs-tests.mf b/usr/src/pkg/manifests/system-file-system-zfs-tests.mf index 4e2b5f1add..d39248a2e4 100644 --- a/usr/src/pkg/manifests/system-file-system-zfs-tests.mf +++ b/usr/src/pkg/manifests/system-file-system-zfs-tests.mf @@ -22,6 +22,7 @@ # # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
# Copyright (c) 2012, 2015 by Delphix. All rights reserved. +# Copyright 2020 Joyent, Inc. # set name=pkg.fmri value=pkg:/system/file-system/zfs/tests@$(PKGVERS) @@ -53,6 +54,7 @@ $(i386_ONLY)file path=usr/bin/$(ARCH32)/zlook mode=0555 $(i386_ONLY)file path=usr/bin/$(ARCH32)/ztest mode=0555 file path=usr/bin/$(ARCH64)/zlook mode=0555 file path=usr/bin/$(ARCH64)/ztest mode=0555 +file path=usr/bin/raidz_test mode=0555 file path=usr/bin/zloop mode=0555 file path=usr/include/sys/fs/zut.h file path=usr/lib/devfsadm/linkmod/SUNW_zut_link.so group=sys diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 3a4d20d9bf..40629ff671 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -143,6 +143,7 @@ dir path=opt/zfs-tests/tests/functional/poolversion dir path=opt/zfs-tests/tests/functional/privilege dir path=opt/zfs-tests/tests/functional/projectquota dir path=opt/zfs-tests/tests/functional/quota +dir path=opt/zfs-tests/tests/functional/raidz dir path=opt/zfs-tests/tests/functional/redundancy dir path=opt/zfs-tests/tests/functional/refquota dir path=opt/zfs-tests/tests/functional/refreserv @@ -2798,6 +2799,10 @@ file path=opt/zfs-tests/tests/functional/quota/quota_004_pos mode=0555 file path=opt/zfs-tests/tests/functional/quota/quota_005_pos mode=0555 file path=opt/zfs-tests/tests/functional/quota/quota_006_neg mode=0555 file path=opt/zfs-tests/tests/functional/quota/setup mode=0555 +file path=opt/zfs-tests/tests/functional/raidz/cleanup mode=0555 +file path=opt/zfs-tests/tests/functional/raidz/raidz_001_neg mode=0555 +file path=opt/zfs-tests/tests/functional/raidz/raidz_002_pos mode=0555 +file path=opt/zfs-tests/tests/functional/raidz/setup mode=0555 file path=opt/zfs-tests/tests/functional/redundancy/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/redundancy/redundancy.cfg mode=0444 file 
path=opt/zfs-tests/tests/functional/redundancy/redundancy.kshlib \ diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg index 050d6caba7..f9b0bdf7ac 100644 --- a/usr/src/test/zfs-tests/include/commands.cfg +++ b/usr/src/test/zfs-tests/include/commands.cfg @@ -11,7 +11,7 @@ # # Copyright (c) 2016, 2018 by Delphix. All rights reserved. -# Copyright 2019 Joyent, Inc. +# Copyright 2020 Joyent, Inc. # # @@ -89,6 +89,7 @@ export USR_BIN_FILES='awk ps pwd python + raidz_test readlink rm rmdir diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index 8acd2710bf..3d1e71cb36 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -582,6 +582,9 @@ tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +[/opt/zfs-tests/tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos'] + [/opt/zfs-tests/tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index f287933c2f..35b87d44e6 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -581,6 +581,9 @@ tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +[/opt/zfs-tests/tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos'] + [/opt/zfs-tests/tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 
21d2055a8c..be80f55301 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -581,6 +581,9 @@ tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +[/opt/zfs-tests/tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos'] + [/opt/zfs-tests/tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', 'redundancy_004_neg'] diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index 92b9b18c57..2fe72661c5 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -509,6 +509,9 @@ tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +[/opt/zfs-tests/tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos'] + [/opt/zfs-tests/tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg'] diff --git a/usr/src/test/zfs-tests/tests/functional/raidz/Makefile b/usr/src/test/zfs-tests/tests/functional/raidz/Makefile new file mode 100644 index 0000000000..5d0bf4506a --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/raidz/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Joyent, Inc. 
+# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/raidz + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/raidz/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/raidz/cleanup.ksh new file mode 100755 index 0000000000..c92c54c270 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/raidz/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Gvozden Neskovic. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +# default_cleanup diff --git a/usr/src/test/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh b/usr/src/test/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh new file mode 100755 index 0000000000..4c105b9411 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh @@ -0,0 +1,38 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Gvozden Neskovic. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with the -T option to test the infrastructure. +# This option should make raidz_test return a non-zero exit status. +# + +log_mustnot raidz_test -T + +log_pass "raidz_test detects errors as espected." diff --git a/usr/src/test/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh new file mode 100755 index 0000000000..e238a881b0 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Gvozden Neskovic. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S to test all supported raidz +# implementations. This option will test several raidz block geometries +# and several zio parameters that affect raidz block layout. Data +# reconstruction performs all combinations of failed disks. Wall time +# is set to 5min, but actual runtime might be longer. +# + +log_must raidz_test -S -t 300 + +log_pass "raidz_test parameter sweep test succeeded." diff --git a/usr/src/test/zfs-tests/tests/functional/raidz/setup.ksh b/usr/src/test/zfs-tests/tests/functional/raidz/setup.ksh new file mode 100755 index 0000000000..4e155d24d5 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/raidz/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Gvozden Neskovic. All rights reserved. 
+# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 63f314ca93..783b436bf4 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1403,6 +1403,8 @@ ZFS_COMMON_OBJS += \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ + vdev_raidz_math.o \ + vdev_raidz_math_scalar.o \ vdev_removal.o \ vdev_root.o \ vdev_trim.o \ diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index 596545afd9..3aabaf3f01 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -12,6 +12,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright 2020 Joyent, Inc. */ /* @@ -761,7 +762,8 @@ abd_iter_map(struct abd_iter *aiter) } else { size_t index = abd_iter_scatter_chunk_index(aiter); offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; } aiter->iter_mapaddr = (char *)paddr + offset; @@ -990,3 +992,180 @@ abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) { return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); } + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. 
Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i]); + + if (dabd) + abd_iter_init(&daiter, dabd); + + ASSERT3S(dsize, >=, 0); + +#ifdef _KERNEL + kpreempt_disable(); +#endif + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + len = MIN(dsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). 
+ */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } +#ifdef _KERNEL + kpreempt_enable(); +#endif +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i]); + abd_iter_init(&xiters[i], tabds[i]); + } + +#ifdef _KERNEL + kpreempt_disable(); +#endif + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The 
iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } +#ifdef _KERNEL + kpreempt_enable(); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 9dac4e2ddc..c9ceeb6873 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -44,6 +44,7 @@ #include <sys/vdev_impl.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/metaslab.h> #include <sys/uberblock_impl.h> #include <sys/txg.h> @@ -2253,6 +2254,7 @@ spa_init(int mode) zil_init(); vdev_cache_stat_init(); vdev_mirror_stat_init(); + vdev_raidz_math_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -2271,6 +2273,7 @@ spa_fini(void) vdev_cache_stat_fini(); vdev_mirror_stat_fini(); + vdev_raidz_math_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h index 621635933e..23699c0420 100644 --- a/usr/src/uts/common/fs/zfs/sys/abd.h +++ b/usr/src/uts/common/fs/zfs/sys/abd.h @@ -103,6 +103,15 @@ int abd_cmp(abd_t *, abd_t *, size_t); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul); 
+ /* * Wrappers for calls with offsets of 0 */ diff --git a/usr/src/uts/common/fs/zfs/sys/simd.h b/usr/src/uts/common/fs/zfs/sys/simd.h new file mode 100644 index 0000000000..4494c7d02a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/simd.h @@ -0,0 +1,40 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _SIMD_H +#define _SIMD_H + +#if defined(__amd64__) || defined(__i386__) + +/* Temporarily disabled until subsequent work to turn this on. */ +#define kfpu_allowed() 0 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +#else + +/* Non-x86 CPUs currently always disallow kernel FPU support */ +#define kfpu_allowed() 0 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#endif + +#endif /* _SIMD_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..bf5c840139 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic <neskovic@compeng.uni-frankfurt.de>. + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zio; +struct raidz_map; +#if !defined(_KERNEL) +struct kernel_param {}; +#endif + +/* + * vdev_raidz interface + */ +struct raidz_map * vdev_raidz_map_alloc(struct zio *, uint64_t, + uint64_t, uint64_t); +void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity(struct raidz_map *); +int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); + +/* + * vdev_raidz_math interface + */ +void vdev_raidz_math_init(void); +void vdev_raidz_math_fini(void); +const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); +int vdev_raidz_math_generate(struct raidz_map *); +int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, + const int *, const int); +int vdev_raidz_impl_set(const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h new file mode 100644 index 0000000000..d8defc04ea --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h @@ -0,0 +1,351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#ifndef _VDEV_RAIDZ_H +#define _VDEV_RAIDZ_H + +#include <sys/types.h> +#include <sys/debug.h> +#include <sys/kstat.h> +#include <sys/abd.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define CODE_P (0U) +#define CODE_Q (1U) +#define CODE_R (2U) + +#define PARITY_P (1U) +#define PARITY_PQ (2U) +#define PARITY_PQR (3U) + +#define TARGET_X (0U) +#define TARGET_Y (1U) +#define TARGET_Z (2U) + +/* + * Parity generation methods indexes + */ +enum raidz_math_gen_op { + RAIDZ_GEN_P = 0, + RAIDZ_GEN_PQ, + RAIDZ_GEN_PQR, + RAIDZ_GEN_NUM = 3 +}; +/* + * Data reconstruction methods indexes + */ +enum raidz_rec_op { + RAIDZ_REC_P = 0, + RAIDZ_REC_Q, + RAIDZ_REC_R, + RAIDZ_REC_PQ, + RAIDZ_REC_PR, + RAIDZ_REC_QR, + RAIDZ_REC_PQR, + RAIDZ_REC_NUM = 7 +}; + +extern const char *raidz_gen_name[RAIDZ_GEN_NUM]; +extern const char *raidz_rec_name[RAIDZ_REC_NUM]; + +/* + * Methods used to define raidz implementation + * + * @raidz_gen_f Parity generation function + * @par1 pointer to raidz_map + * @raidz_rec_f Data reconstruction function + * @par1 pointer to raidz_map + * @par2 array of reconstruction targets + * @will_work_f Function returns TRUE if impl. 
is supported on the system + * @init_impl_f Function is called once on init + * @fini_impl_f Function is called once on fini + */ +typedef void (*raidz_gen_f)(void *); +typedef int (*raidz_rec_f)(void *, const int *); +typedef boolean_t (*will_work_f)(void); +typedef void (*init_impl_f)(void); +typedef void (*fini_impl_f)(void); + +#define RAIDZ_IMPL_NAME_MAX (20) + +typedef struct raidz_impl_ops { + init_impl_f init; + fini_impl_f fini; + raidz_gen_f gen[RAIDZ_GEN_NUM]; /* Parity generate functions */ + raidz_rec_f rec[RAIDZ_REC_NUM]; /* Data reconstruction functions */ + will_work_f is_supported; /* Support check function */ + char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ +} raidz_impl_ops_t; + +typedef struct raidz_col { + size_t rc_devidx; /* child device index for I/O */ + size_t rc_offset; /* device offset */ + size_t rc_size; /* I/O size */ + abd_t *rc_abd; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ + int rc_error; /* I/O error for this device */ + unsigned int rc_tried; /* Did we attempt this I/O column? */ + unsigned int rc_skipped; /* Did we skip this I/O column? 
*/ +} raidz_col_t; + +typedef struct raidz_map { + size_t rm_cols; /* Regular column count */ + size_t rm_scols; /* Count including skipped columns */ + size_t rm_bigcols; /* Number of oversized columns */ + size_t rm_asize; /* Actual total I/O size */ + size_t rm_missingdata; /* Count of missing data devices */ + size_t rm_missingparity; /* Count of missing parity devices */ + size_t rm_firstdatacol; /* First data column/parity count */ + size_t rm_nskip; /* Skipped sectors for padding */ + size_t rm_skipstart; /* Column index of padding start */ + void *rm_abd_copy; /* rm_asize-buffer of copied data */ + size_t rm_reports; /* # of referencing checksum reports */ + unsigned int rm_freed; /* map no longer has referencing ZIO */ + unsigned int rm_ecksuminjected; /* checksum error was injected */ + const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define RAIDZ_ORIGINAL_IMPL (INT_MAX) + +extern const raidz_impl_ops_t vdev_raidz_scalar_impl; + +/* + * Commonly used raidz_map helpers + * + * raidz_parity Returns parity of the RAIDZ block + * raidz_ncols Returns number of columns the block spans + * raidz_nbigcols Returns number of big columns + * raidz_col_p Returns pointer to a column + * raidz_col_size Returns size of a column + * raidz_big_size Returns size of big columns + * raidz_short_size Returns size of short columns + */ +#define raidz_parity(rm) ((rm)->rm_firstdatacol) +#define raidz_ncols(rm) ((rm)->rm_cols) +#define raidz_nbigcols(rm) ((rm)->rm_bigcols) +#define raidz_col_p(rm, c) ((rm)->rm_col + (c)) +#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) +#define raidz_big_size(rm) (raidz_col_size(rm, CODE_P)) +#define raidz_short_size(rm) (raidz_col_size(rm, raidz_ncols(rm)-1)) + +/* + * Macro defines an RAIDZ parity generation method + * + * @code parity the function produce + * @impl name of the implementation + */ +#define _RAIDZ_GEN_WRAP(code, impl) \ 
+static void \ +impl ## _gen_ ## code(void *rmp) \ +{ \ + raidz_map_t *rm = (raidz_map_t *) rmp; \ + raidz_generate_## code ## _impl(rm); \ +} + +/* + * Macro defines an RAIDZ data reconstruction method + * + * @code parity the function produce + * @impl name of the implementation + */ +#define _RAIDZ_REC_WRAP(code, impl) \ +static int \ +impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ +{ \ + raidz_map_t *rm = (raidz_map_t *) rmp; \ + return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ +} + +/* + * Define all gen methods for an implementation + * + * @impl name of the implementation + */ +#define DEFINE_GEN_METHODS(impl) \ + _RAIDZ_GEN_WRAP(p, impl); \ + _RAIDZ_GEN_WRAP(pq, impl); \ + _RAIDZ_GEN_WRAP(pqr, impl) + +/* + * Define all rec functions for an implementation + * + * @impl name of the implementation + */ +#define DEFINE_REC_METHODS(impl) \ + _RAIDZ_REC_WRAP(p, impl); \ + _RAIDZ_REC_WRAP(q, impl); \ + _RAIDZ_REC_WRAP(r, impl); \ + _RAIDZ_REC_WRAP(pq, impl); \ + _RAIDZ_REC_WRAP(pr, impl); \ + _RAIDZ_REC_WRAP(qr, impl); \ + _RAIDZ_REC_WRAP(pqr, impl) + +#define RAIDZ_GEN_METHODS(impl) \ +{ \ + [RAIDZ_GEN_P] = & impl ## _gen_p, \ + [RAIDZ_GEN_PQ] = & impl ## _gen_pq, \ + [RAIDZ_GEN_PQR] = & impl ## _gen_pqr \ +} + +#define RAIDZ_REC_METHODS(impl) \ +{ \ + [RAIDZ_REC_P] = & impl ## _rec_p, \ + [RAIDZ_REC_Q] = & impl ## _rec_q, \ + [RAIDZ_REC_R] = & impl ## _rec_r, \ + [RAIDZ_REC_PQ] = & impl ## _rec_pq, \ + [RAIDZ_REC_PR] = & impl ## _rec_pr, \ + [RAIDZ_REC_QR] = & impl ## _rec_qr, \ + [RAIDZ_REC_PQR] = & impl ## _rec_pqr \ +} + + +typedef struct raidz_impl_kstat { + uint64_t gen[RAIDZ_GEN_NUM]; /* gen method speed kiB/s */ + uint64_t rec[RAIDZ_REC_NUM]; /* rec method speed kiB/s */ +} raidz_impl_kstat_t; + +/* + * Enumerate various multiplication constants + * used in reconstruction methods + */ +typedef enum raidz_mul_info { + /* Reconstruct Q */ + MUL_Q_X = 0, + /* Reconstruct R */ + MUL_R_X = 0, + /* Reconstruct PQ */ + MUL_PQ_X = 0, + MUL_PQ_Y 
= 1, + /* Reconstruct PR */ + MUL_PR_X = 0, + MUL_PR_Y = 1, + /* Reconstruct QR */ + MUL_QR_XQ = 0, + MUL_QR_X = 1, + MUL_QR_YQ = 2, + MUL_QR_Y = 3, + /* Reconstruct PQR */ + MUL_PQR_XP = 0, + MUL_PQR_XQ = 1, + MUL_PQR_XR = 2, + MUL_PQR_YU = 3, + MUL_PQR_YP = 4, + MUL_PQR_YQ = 5, + + MUL_CNT = 6 +} raidz_mul_info_t; + +/* + * Powers of 2 in the Galois field. + */ +extern const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))); +/* Logs of 2 in the Galois field defined above. */ +extern const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))); + +/* + * Multiply a given number by 2 raised to the given power. + */ +static inline uint8_t +vdev_raidz_exp2(const uint8_t a, const unsigned exp) +{ + if (a == 0) + return (0); + + return (vdev_raidz_pow2[(exp + (unsigned) vdev_raidz_log2[a]) % 255]); +} + +/* + * Galois Field operations. + * + * gf_exp2 - computes 2 raised to the given power + * gf_exp4 - computes 4 raised to the given power + * gf_mul - multiplication + * gf_div - division + * gf_inv - multiplicative inverse + */ +typedef unsigned gf_t; +typedef unsigned gf_log_t; + +static inline gf_t +gf_mul(const gf_t a, const gf_t b) +{ + gf_log_t logsum; + + if (a == 0 || b == 0) + return (0); + + logsum = (gf_log_t) vdev_raidz_log2[a] + (gf_log_t) vdev_raidz_log2[b]; + + return ((gf_t) vdev_raidz_pow2[logsum % 255]); +} + +static inline gf_t +gf_div(const gf_t a, const gf_t b) +{ + gf_log_t logsum; + + ASSERT3U(b, >, 0); + if (a == 0) + return (0); + + logsum = (gf_log_t) 255 + (gf_log_t) vdev_raidz_log2[a] - + (gf_log_t) vdev_raidz_log2[b]; + + return ((gf_t) vdev_raidz_pow2[logsum % 255]); +} + +static inline gf_t +gf_inv(const gf_t a) +{ + gf_log_t logsum; + + ASSERT3U(a, >, 0); + + logsum = (gf_log_t) 255 - (gf_log_t) vdev_raidz_log2[a]; + + return ((gf_t) vdev_raidz_pow2[logsum]); +} + +static inline gf_t +gf_exp2(gf_log_t exp) +{ + return (vdev_raidz_pow2[exp % 255]); +} + +static inline gf_t +gf_exp4(gf_log_t exp) +{ + ASSERT3U(exp, <=, 255); 
+ return ((gf_t) vdev_raidz_pow2[(2 * exp) % 255]); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 10772d5265..e4db03ce89 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016 Gvozden Nešković. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2014 Integros [integros.com] */ @@ -35,6 +36,8 @@ #include <sys/abd.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -98,7 +101,7 @@ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) @@ -107,34 +110,6 @@ * or in concert to recover missing data columns. */ -typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ - uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ - abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ - int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ -} raidz_col_t; - -typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ - uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ -} raidz_map_t; - #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 @@ -153,7 +128,7 @@ typedef struct raidz_map { (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ - ((mask) & 0x1d1d1d1d1d1d1d1d); \ + ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ @@ -164,104 +139,7 @@ typedef struct raidz_map { #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) -/* - * Force reconstruction to use the general purpose method. - */ -int vdev_raidz_default_to_general; - -/* Powers of 2 in the Galois field defined above. 
*/ -static const uint8_t vdev_raidz_pow2[256] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, - 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, - 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, - 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, - 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, - 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, - 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, - 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, - 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, - 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, - 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, - 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, - 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, - 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, - 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, - 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, - 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, - 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, - 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, - 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, - 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, - 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, - 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, - 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, - 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, - 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, - 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, - 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, - 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, - 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, - 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 -}; -/* Logs of 2 in the Galois field defined above. 
*/ -static const uint8_t vdev_raidz_log2[256] = { - 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, - 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, - 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, - 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, - 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, - 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, - 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, - 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, - 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, - 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, - 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, - 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, - 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, - 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, - 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, - 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, - 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, - 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, - 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, - 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, - 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, - 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, - 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, - 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, - 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, - 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, - 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, - 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, - 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, - 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, - 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, - 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, -}; - -static void vdev_raidz_generate_parity(raidz_map_t *rm); - -/* - * Multiply a given number by 2 raised to the given power. 
- */ -static uint8_t -vdev_raidz_exp2(uint_t a, int exp) -{ - if (a == 0) - return (0); - - ASSERT(exp >= 0); - ASSERT(vdev_raidz_log2[a] > 0 || a == 1); - - exp += vdev_raidz_log2[a]; - if (exp > 255) - exp -= 255; - - return (vdev_raidz_pow2[exp]); -} - -static void +void vdev_raidz_map_free(raidz_map_t *rm) { int c; @@ -271,7 +149,6 @@ vdev_raidz_map_free(raidz_map_t *rm) if (rm->rm_col[c].rc_gdata != NULL) abd_free(rm->rm_col[c].rc_gdata); - } for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) @@ -311,7 +188,7 @@ static void vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; - size_t c = zcr->zcr_cbinfo; + const size_t c = zcr->zcr_cbinfo; size_t x, offset; const abd_t *good = NULL; @@ -459,19 +336,19 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. */ -static raidz_map_t * -vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t unit_shift, uint64_t dcols, uint64_t nparity) +raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, + uint64_t nparity) { raidz_map_t *rm; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> unit_shift; + uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> unit_shift; + uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. 
*/ - uint64_t o = (b / dcols) << unit_shift; + uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; @@ -530,7 +407,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, coff = o; if (col >= dcols) { col -= dcols; - coff += 1ULL << unit_shift; + coff += 1ULL << ashift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; @@ -543,29 +420,29 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; + rm->rm_col[c].rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << unit_shift; + rm->rm_col[c].rc_size = q << ashift; asize += rm->rm_col[c].rc_size; } - ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << ashift); + rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_abd = abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0, + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, rm->rm_col[c].rc_size); off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off, + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, rm->rm_col[c].rc_size); off += rm->rm_col[c].rc_size; } @@ -573,7 +450,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O bandwidth won't be + * during normal 
operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical @@ -593,7 +470,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -605,6 +482,9 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, rm->rm_skipstart = 1; } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + return (rm); } @@ -681,7 +561,6 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size); abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; @@ -793,9 +672,13 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * Generate RAID parity in the first virtual columns according to the number of * parity columns available. 
*/ -static void +void vdev_raidz_generate_parity(raidz_map_t *rm) { + /* Generate using the new math implementation */ + if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + return; + switch (rm->rm_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rm); @@ -873,8 +756,8 @@ vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { - *dst ^= *rq->q; + *dst ^= *rq->q; int j; uint8_t *b; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { @@ -1159,9 +1042,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | - * (V|I)' = | 0 0 0 1 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | @@ -1385,8 +1271,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, int i, j, x, cc, c; uint8_t *src; uint64_t ccount; - uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; @@ -1595,12 +1481,12 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) return (code); } -static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +int +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c; + int i, c, ret; int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1638,34 +1524,37 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) dt = &tgts[nbadparity]; + /* Reconstruct using the new math implementation */ + ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (ret != RAIDZ_ORIGINAL_IMPL) + return (ret); + /* * See if we can use any of our 
optimized reconstruction routines. */ - if (!vdev_raidz_default_to_general) { - switch (nbaddata) { - case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); - break; + ASSERT(rm->rm_firstdatacol > 2); + break; - case 2: - ASSERT(rm->rm_firstdatacol > 1); + case 2: + ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rm->rm_firstdatacol > 2); - break; - } + break; } code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); @@ -1821,11 +1710,16 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, * treat the on-disk format as if the only blocks are the complete 128 * KB size. */ - abd_t *abd = abd_get_from_buf(data - (offset - origoffset), + + /* First, fake a zio for vdev_raidz_map_alloc. 
*/ + zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + zio->io_offset = origoffset; + zio->io_size = SPA_OLD_MAXBLOCKSIZE; + zio->io_abd = abd_get_from_buf(data - (offset - origoffset), SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, - SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); coloffset = origoffset; @@ -1874,7 +1768,9 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, } vdev_raidz_map_free(rm); - abd_put(abd); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (zio_t)); + #endif /* KERNEL */ return (err); @@ -1965,8 +1861,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); zio->io_vsd = rm; @@ -2141,11 +2036,6 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -/* - * Keep statistics on all the ways that we used parity to correct data. 
- */ -static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; - static int vdev_raidz_worst_error(raidz_map_t *rm) { @@ -2251,7 +2141,6 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) */ code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); for (i = 0; i < n; i++) { c = tgts[i]; @@ -2466,8 +2355,6 @@ vdev_raidz_io_done(zio_t *zio) code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - /* * If we read more parity disks than were used * for reconstruction, confirm that the other @@ -2620,7 +2507,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe + * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c new file mode 100644 index 0000000000..2a1dac33c5 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c @@ -0,0 +1,571 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/zio.h> +#include <sys/debug.h> +#include <sys/zfs_debug.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <sys/simd.h> + +#ifndef isspace +#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || \ + (c) == '\r' || (c) == '\f' || (c) == '\013') +#endif + +extern boolean_t raidz_will_scalar_work(void); + +/* Opaque implementation with NULL methods to represent original methods */ +static const raidz_impl_ops_t vdev_raidz_original_impl = { + .name = "original", + .is_supported = raidz_will_scalar_work, +}; + +/* RAIDZ parity op that contain the fastest methods */ +static raidz_impl_ops_t vdev_raidz_fastest_impl = { + .name = "fastest" +}; + +/* All compiled in implementations */ +const raidz_impl_ops_t *raidz_all_maths[] = { + &vdev_raidz_original_impl, + &vdev_raidz_scalar_impl, +}; + +/* Indicate that benchmark has been completed */ +static boolean_t raidz_math_initialized = B_FALSE; + +/* Select raidz implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX - 1) +#define IMPL_ORIGINAL (0) +#define IMPL_SCALAR (1) + +#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) + +static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; +static uint32_t user_sel_impl = IMPL_FASTEST; + +/* Hold all supported implementations */ +static size_t raidz_supp_impl_cnt = 0; +static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; + +#if defined(_KERNEL) +/* + * kstats values for supported implementations + * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] + * + * PORTING NOTE: + * On illumos this is not a kstat. 
OpenZFS uses their home-grown kstat code + * which implements a free-form kstat using additional functionality that does + * not exist in illumos. Because there are no software consumers of this + * information, we omit a kstat API. If an administrator needs to see this + * data for some reason, they can use mdb. + * + * The format of the kstat data on OpenZFS would be a "header" that looks like + * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name" + * arrays, starting with the parity function "implementation" name): + * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr + * This is followed by a row for each parity function implementation, showing + * the "speed" values calculated for that implementation for each of the + * parity generation and reconstruction functions in the "raidz_all_maths" + * array. + */ +static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; + +#endif + +/* + * Returns the RAIDZ operations for raidz_map() parity calculations. When + * a SIMD implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. + */ +const raidz_impl_ops_t * +vdev_raidz_math_get_ops(void) +{ + /* + * illumos porting note: + * The following check from OpenZFS is disabled since we don't have + * this compiled in yet and we need to be able to change the + * implementation for the user-level test suite. 
+ * + * if (!kfpu_allowed()) + * return (&vdev_raidz_scalar_impl); + */ + + raidz_impl_ops_t *ops = NULL; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(raidz_math_initialized); + ops = &vdev_raidz_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through all supported implementations */ + ASSERT(raidz_math_initialized); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; + ops = raidz_supp_impl[idx]; + break; + case IMPL_ORIGINAL: + ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; + break; + case IMPL_SCALAR: + ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; + break; + default: + ASSERT3U(impl, <, raidz_supp_impl_cnt); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + if (impl < ARRAY_SIZE(raidz_all_maths)) + ops = raidz_supp_impl[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +/* + * Select parity generation method for raidz_map + */ +int +vdev_raidz_math_generate(raidz_map_t *rm) +{ + raidz_gen_f gen_parity = NULL; + + switch (raidz_parity(rm)) { + case 1: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; + break; + case 2: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; + break; + case 3: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; + break; + default: + gen_parity = NULL; + cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", + (uint_t)raidz_parity(rm)); + break; + } + + /* if method is NULL execute the original implementation */ + if (gen_parity == NULL) + return (RAIDZ_ORIGINAL_IMPL); + + gen_parity(rm); + + return (0); +} + +static raidz_rec_f +reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1 && parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if 
(parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } + } else if (nbaddata == 2 && + parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if (parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } else if (parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_R]); + } + } else if (nbaddata == 2) { + if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PR]); + } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_QR]); + } + } else if (nbaddata == 3 && + parity_valid[CODE_P] && parity_valid[CODE_Q] && + parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQR]); + } + return ((raidz_rec_f) NULL); +} + +/* + * Select data reconstruction method for raidz_map + * @parity_valid - Parity validity flag + * @dt - Failed data index array + * @nbaddata - Number of failed data columns + */ +int +vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, + const int *dt, const int nbaddata) +{ + raidz_rec_f rec_fn = NULL; + + switch (raidz_parity(rm)) { + case PARITY_P: + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQ: + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQR: + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", + (uint_t)raidz_parity(rm)); + break; + } + + if (rec_fn == NULL) + return (RAIDZ_ORIGINAL_IMPL); + 
else + return (rec_fn(rm, dt)); +} + +const char *raidz_gen_name[] = { + "gen_p", "gen_pq", "gen_pqr" +}; +const char *raidz_rec_name[] = { + "rec_p", "rec_q", "rec_r", + "rec_pq", "rec_pr", "rec_qr", "rec_pqr" +}; + +#if defined(_KERNEL) + +#define BENCH_D_COLS (8ULL) +#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) +#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ +#define BENCH_NS MSEC2NSEC(25) /* 25ms */ + +typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); + +static void +benchmark_gen_impl(raidz_map_t *rm, const int fn) +{ + (void) fn; + vdev_raidz_generate_parity(rm); +} + +static void +benchmark_rec_impl(raidz_map_t *rm, const int fn) +{ + static const int rec_tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); +} + +/* + * Benchmarking of all supported implementations (raidz_supp_impl_cnt) + * is performed by setting the rm_ops pointer and calling the top level + * generate/reconstruct methods of bench_rm. 
+ */ +static void +benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) +{ + uint64_t run_cnt, speed, best_speed = 0; + hrtime_t t_start, t_diff; + raidz_impl_ops_t *curr_impl; + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + int impl, i; + + for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { + /* set an implementation to benchmark */ + curr_impl = raidz_supp_impl[impl]; + bench_rm->rm_ops = curr_impl; + + run_cnt = 0; + t_start = gethrtime(); + + do { + for (i = 0; i < 25; i++, run_cnt++) + bench_fn(bench_rm, fn); + + t_diff = gethrtime() - t_start; + } while (t_diff < BENCH_NS); + + speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; + speed /= (t_diff * BENCH_COLS); + + if (bench_fn == benchmark_gen_impl) + raidz_impl_kstats[impl].gen[fn] = speed; + else + raidz_impl_kstats[impl].rec[fn] = speed; + + /* Update fastest implementation method */ + if (speed > best_speed) { + best_speed = speed; + + if (bench_fn == benchmark_gen_impl) { + fstat->gen[fn] = impl; + vdev_raidz_fastest_impl.gen[fn] = + curr_impl->gen[fn]; + } else { + fstat->rec[fn] = impl; + vdev_raidz_fastest_impl.rec[fn] = + curr_impl->rec[fn]; + } + } + } +} +#endif + +/* + * Initialize and benchmark all supported implementations. 
+ */ +static void +benchmark_raidz(void) +{ + raidz_impl_ops_t *curr_impl; + int i, c; + + /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + + if (curr_impl->init) + curr_impl->init(); + + if (curr_impl->is_supported()) + raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; + } + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +#if defined(_KERNEL) + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; + + /* Fake a zio and run the benchmark on a warmed up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + bench_zio->io_offset = 0; + bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_D_COLS + bench_parity, bench_parity); + + benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); + + vdev_raidz_map_free(bench_rm); + } + + /* Benchmark data reconstruction methods */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); + + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. 
+ */ + memcpy(&vdev_raidz_fastest_impl, + raidz_supp_impl[raidz_supp_impl_cnt - 1], + sizeof (vdev_raidz_fastest_impl)); + strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#endif /* _KERNEL */ +} + +void +vdev_raidz_math_init(void) +{ + /* Determine the fastest available implementation. */ + benchmark_raidz(); + + /* Finish initialization */ + atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); + raidz_math_initialized = B_TRUE; +} + +void +vdev_raidz_math_fini(void) +{ + raidz_impl_ops_t const *curr_impl; + + for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = raidz_all_maths[i]; + if (curr_impl->fini) + curr_impl->fini(); + } +} + +static const struct { + char *name; + uint32_t sel; +} math_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, + { "original", IMPL_ORIGINAL }, + { "scalar", IMPL_SCALAR } +}; + +/* + * Function sets desired raidz implementation. + * + * If we are called before init(), user preference will be saved in + * user_sel_impl, and applied in later init() call. This occurs when module + * parameter is specified on module load. Otherwise, directly update + * zfs_vdev_raidz_impl. + * + * @val Name of raidz implementation to use + * @return 0 on success, -EINVAL if the name is empty, too long or unknown
+ */ +int +vdev_raidz_impl_set(const char *val) +{ + int err = -EINVAL; + char req_name[RAIDZ_IMPL_NAME_MAX]; + uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); + size_t i; + + /* sanitize input */ + i = strnlen(val, RAIDZ_IMPL_NAME_MAX); + if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) + return (err); + + strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); + while (i > 0 && !!isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + /* Check mandatory options */ + for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { + if (strcmp(req_name, math_impl_opts[i].name) == 0) { + impl = math_impl_opts[i].sel; + err = 0; + break; + } + } + + /* check all supported impl if init() was already called */ + if (err != 0 && raidz_math_initialized) { + /* check all supported implementations */ + for (i = 0; i < raidz_supp_impl_cnt; i++) { + if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { + impl = i; + err = 0; + break; + } + } + } + + if (err == 0) { + if (raidz_math_initialized) + atomic_swap_32(&zfs_vdev_raidz_impl, impl); + else + atomic_swap_32(&user_sel_impl, impl); + } + + return (err); +} + +#if defined(_KERNEL) && defined(__linux__) + +static int +zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) +{ + return (vdev_raidz_impl_set(val)); +} + +static int +zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cnt = 0; + char *fmt; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + + ASSERT(raidz_math_initialized); + + /* list mandatory options */ + for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { + fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); + } + + /* list all supported implementations */ + for (i = 0; i < raidz_supp_impl_cnt; i++) { + fmt = (i == impl) ? 
"[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); + } + + return (cnt); +} + +module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, + zfs_vdev_raidz_impl_get, NULL, 0644); +MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); +#endif diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h new file mode 100644 index 0000000000..89c2082c4a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h @@ -0,0 +1,1477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#ifndef _VDEV_RAIDZ_MATH_IMPL_H +#define _VDEV_RAIDZ_MATH_IMPL_H + +#include <sys/types.h> + +#define raidz_inline inline __attribute__((always_inline)) +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +/* + * Functions calculate multiplication constants for data reconstruction. + * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and + * used parity columns for reconstruction. + * @rm RAIDZ map + * @tgtidx array of missing data indexes + * @coeff output array of coefficients. 
Array must be provided by + * user and must hold minimum MUL_CNT values. + */ +static noinline void +raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + + coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); +} + +static noinline void +raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + + coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); +} + +static noinline void +raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + gf_t a, b, e; + + a = gf_exp2(x + 255 - y); + b = gf_exp2(255 - (ncols - x - 1)); + e = a ^ 0x01; + + coeff[MUL_PQ_X] = gf_div(a, e); + coeff[MUL_PQ_Y] = gf_div(b, e); +} + +static noinline void +raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + + gf_t a, b, e; + + a = gf_exp4(x + 255 - y); + b = gf_exp4(255 - (ncols - x - 1)); + e = a ^ 0x01; + + coeff[MUL_PR_X] = gf_div(a, e); + coeff[MUL_PR_Y] = gf_div(b, e); +} + +static noinline void +raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + + gf_t nx, ny, nxxy, nxyy, d; + + nx = gf_exp2(ncols - x - 1); + ny = gf_exp2(ncols - y - 1); + nxxy = gf_mul(gf_mul(nx, nx), ny); + nxyy = gf_mul(gf_mul(nx, ny), ny); + d = nxxy ^ nxyy; + + coeff[MUL_QR_XQ] = ny; + coeff[MUL_QR_X] = gf_div(ny, d); + coeff[MUL_QR_YQ] = nx; + coeff[MUL_QR_Y] = gf_div(nx, d); +} + +static noinline void +raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned 
*coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + const unsigned z = tgtidx[TARGET_Z]; + + gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd; + + nx = gf_exp2(ncols - x - 1); + ny = gf_exp2(ncols - y - 1); + nz = gf_exp2(ncols - z - 1); + + nxx = gf_exp4(ncols - x - 1); + nyy = gf_exp4(ncols - y - 1); + nzz = gf_exp4(ncols - z - 1); + + nyyz = gf_mul(gf_mul(ny, nz), ny); + nyzz = gf_mul(nzz, ny); + + xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^ + gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz; + + yd = gf_inv(ny ^ nz); + + coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd); + coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd); + coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd); + coeff[MUL_PQR_YU] = nx; + coeff[MUL_PQR_YP] = gf_mul(nz, yd); + coeff[MUL_PQR_YQ] = yd; +} + +/* + * Method for zeroing a buffer (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @dsize Destination buffer size + * @private Unused + */ +static int +raidz_zero_abd_cb(void *dc, size_t dsize, void *private) +{ + v_t *dst = (v_t *)dc; + size_t i; + + ZERO_DEFINE(); + + (void) private; /* unused */ + + ZERO(ZERO_D); + + for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { + STORE(dst + i, ZERO_D); + STORE(dst + i + ZERO_STRIDE, ZERO_D); + } + + return (0); +} + +#define raidz_zero(dabd, size) \ +{ \ + abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ +} + +/* + * Method for copying two buffers (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. 
+ * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; + + COPY_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { + LOAD(src + i, COPY_D); + STORE(dst + i, COPY_D); + + LOAD(src + i + COPY_STRIDE, COPY_D); + STORE(dst + i + COPY_STRIDE, COPY_D); + } + + return (0); +} + + +#define raidz_copy(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ +} + +/* + * Method for adding (XORing) two buffers. + * Source and destination are XORed together and result is stored in + * destination buffer. This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; + + ADD_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { + LOAD(dst + i, ADD_D); + XOR_ACC(src + i, ADD_D); + STORE(dst + i, ADD_D); + + LOAD(dst + i + ADD_STRIDE, ADD_D); + XOR_ACC(src + i + ADD_STRIDE, ADD_D); + STORE(dst + i + ADD_STRIDE, ADD_D); + } + + return (0); +} + +#define raidz_add(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ +} + +/* + * Method for multiplying a buffer with a constant in GF(2^8). + * Symbols from buffer are multiplied by a constant and result is stored + * back in the same buffer. + * + * @dc In/Out data buffer. 
+ * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) + */ +static int +raidz_mul_abd_cb(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *)private); + v_t *d = (v_t *)dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros + */ +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). 
+ */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; + + /* add data column */ + raidz_add(pabd, dabd, size); + } + + raidz_math_end(); +} + + +/* + * Generate PQ parity (RAIDZ2) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pq_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, + q += GEN_PQ_STRIDE) { + LOAD(d, GEN_PQ_D); + P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); + Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); + } + for (; q < qend; q += GEN_PQ_STRIDE) { + Q_SYNDROME(GEN_PQ_C, q); + } +} + + +/* + * Generate PQ parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pq_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + + for (c = 3; c < ncols; c++) { + dabd = 
rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, + raidz_gen_pq_add); + } + + raidz_math_end(); +} + + +/* + * Generate PQR parity (RAIDZ3) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + v_t *r = (v_t *)c[CODE_R]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, + q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + LOAD(d, GEN_PQR_D); + P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); + Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); + R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); + } + for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + Q_SYNDROME(GEN_PQR_C, q); + R_SYNDROME(GEN_PQR_C, r); + } +} + + +/* + * Generate PQR parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pqr_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + + for (c = 4; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, + raidz_gen_pqr_add); + } + + raidz_math_end(); +} + + +/* + * DATA RECONSTRUCTION + * + * Data 
reconstruction process consists of two phases:
 *	- Syndrome calculation
 *	- Data reconstruction
 *
 * Syndrome is calculated by generating parity using available data columns
 * and zeros in places of erasure. Existing parity is added to corresponding
 * syndrome value to obtain the [P|Q|R]syn values from equation:
 *	P = Psyn + Dx + Dy + Dz
 *	Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
 *	R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
 *
 * For data reconstruction phase, the corresponding equations are solved
 * for missing data (Dx, Dy, Dz). This generally involves multiplying known
 * symbols by a coefficient and adding them together. The multiplication
 * constant coefficients are calculated ahead of the operation in
 * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions.
 *
 * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big"
 * and "short" columns.
 * For this reason, reconstruction is performed in a minimum of
 * two steps. First, from offset 0 to short_size, then from short_size to
 * big_size. Calculation functions REC_[*]_BLOCK() are implemented to work
 * over both ranges. The split also enables removal of conditional expressions
 * from loop bodies, improving throughput of SIMD implementations.
 * For the best performance, all functions marked with raidz_inline attribute
 * must be inlined by compiler.
+ * + * parity data + * columns columns + * <----------> <------------------> + * x y <----+ missing columns (x, y) + * | | + * +---+---+---+---+-v-+---+-v-+---+ ^ 0 + * | | | | | | | | | | + * | | | | | | | | | | + * | P | Q | R | D | D | D | D | D | | + * | | | | 0 | 1 | 2 | 3 | 4 | | + * | | | | | | | | | v + * | | | | | +---+---+---+ ^ short_size + * | | | | | | | + * +---+---+---+---+---+ v big_size + * <------------------> <----------> + * big columns short columns + * + */ + + + + +/* + * Reconstruct single data column using P parity + * + * @syn_method raidz_add_abd() + * @rec_method not applicable + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* copy P into target */ + raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + + dabd = rm->rm_col[c].rc_abd; + size = MIN(rm->rm_col[c].rc_size, xsize); + + raidz_add(xabd, dabd, size); + } + + raidz_math_end(); + + return (1 << CODE_P); +} + + +/* + * Generate Q syndrome (Qsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @xsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (xsize / sizeof (v_t)); + + SYN_Q_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_Q_D); + 
Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + Q_SYNDROME(SYN_Q_X, x); + } +} + + +/* + * Reconstruct single data column using Q parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_q_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); + } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); + + raidz_math_end(); + + return (1 << CODE_Q); +} + + +/* + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); + + SYN_R_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d 
+= SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); + } +} + + +/* + * Reconstruct single data column using R parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_r_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); + + raidz_math_end(); + + return (1 << CODE_R); +} + + +/* + * Generate P and Q syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend 
= y + (tsize / sizeof (v_t)); + + SYN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); + Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + Q_SYNDROME(SYN_PQ_X, y); + } +} + +/* + * Reconstruct data using PQ parity and PQ syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_PQ_DEFINE(); + + for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, + p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { + LOAD(x, REC_PQ_X); + LOAD(y, REC_PQ_Y); + + XOR_ACC(p, REC_PQ_X); + XOR_ACC(q, REC_PQ_Y); + + /* Save Pxy */ + COPY(REC_PQ_X, REC_PQ_T); + + /* Calc X */ + MUL(mul[MUL_PQ_X], REC_PQ_X); + MUL(mul[MUL_PQ_Y], REC_PQ_Y); + XOR(REC_PQ_Y, REC_PQ_X); + STORE(x, REC_PQ_X); + + /* Calc Y */ + XOR(REC_PQ_T, REC_PQ_X); + STORE(y, REC_PQ_X); + } +} + + +/* + * Reconstruct two data columns using PQ parity + * + * @syn_method raidz_syn_pq_abd() + * @rec_method raidz_rec_pq_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + 
rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + + unsigned coeff[MUL_CNT]; + raidz_rec_pq_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + 
} + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } +} + +/* + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_PR_DEFINE(); + + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); + + /* Save Pxy */ + COPY(REC_PR_X, REC_PR_T); + + /* Calc X */ + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); + XOR(REC_PR_Y, REC_PR_X); + STORE(x, REC_PR_X); + + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); + } +} + + +/* + * Reconstruct two data columns using PR parity + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_pr_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets are shorter then others. 
+ * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_QR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); + } +} + + +/* + * Reconstruct data using QR parity and QR syndromes + * + * 
@t syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns, read as c[CODE_P] and c[CODE_Q] + * NOTE(review): the QR caller passes { Q, R }, so 'p' presumably walks + * Q and 'q' walks R - confirm CODE_P/CODE_Q index slots 0 and 1. + * @mul array of multiplication constants + */ +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_QR_DEFINE(); + + for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, + p += REC_QR_STRIDE, q += REC_QR_STRIDE) { + LOAD(x, REC_QR_X); + LOAD(y, REC_QR_Y); + + XOR_ACC(p, REC_QR_X); + XOR_ACC(q, REC_QR_Y); + + /* Save Qxy; the X register is consumed while computing X */ + COPY(REC_QR_X, REC_QR_T); + + /* Calc X */ + MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ + MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ + STORE(x, REC_QR_X); + + /* Calc Y (T holds the saved Qxy) */ + MUL(mul[MUL_QR_YQ], REC_QR_T); /* Y = Q * yqm */ + XOR(REC_QR_Y, REC_QR_T); /* Y = R ^ Y */ + MUL(mul[MUL_QR_Y], REC_QR_T); /* Y = Y * ym */ + STORE(y, REC_QR_T); + } +} + + +/* + * Reconstruct two data columns using QR parity + * + * @syn_method raidz_syn_qr_abd() + * @rec_method raidz_rec_qr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_qr_coeff(rm, tgtidx, coeff); + + /* + * Check if one of the targets is shorter than the other. + * In this case, shorter target 
needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate Q and R syndromes */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + /* target column: contributes no data */ + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + + return ((1 << CODE_Q) | (1 << CODE_R)); +} + + +/* + * Generate P, Q, and R syndromes + * + * @c array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + v_t *z = (v_t *)c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x); + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + /* Data exhausted: only Q and R still advance, x is left untouched */ + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); 
+ } +} + + +/* + * Reconstruct data using PQR parity and PQR syndromes + * + * On entry t[TARGET_X], t[TARGET_Y] and t[TARGET_Z] hold the three syndromes; + * the reconstructed X, Y and Z values are stored back over them. + * + * @t syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns, read as c[CODE_P], c[CODE_Q] and c[CODE_R] + * @mul array of multiplication constants + */ +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + v_t *z = (v_t *)t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + const v_t *r = (v_t *)c[CODE_R]; + + REC_PQR_DEFINE(); + + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); + + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); + + /* Save Pxyz and Qxyz; both are needed again for Y and Z */ + COPY(REC_PQR_X, REC_PQR_XS); + COPY(REC_PQR_Y, REC_PQR_YS); + + /* Calc X */ + MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ + MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ + XOR(REC_PQR_Y, REC_PQR_X); /* X = Xp + Xq */ + MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ + XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ + STORE(x, REC_PQR_X); + + /* Calc Y */ + XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ + MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ + XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ + COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ + MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ + MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ + XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ + STORE(y, REC_PQR_YS); + + /* Calc Z */ + XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ + STORE(z, REC_PQR_YS); + } +} + + +/* + * Reconstruct three data columns using PQR parity + * + * @syn_method raidz_syn_pqr_abd() + * @rec_method raidz_rec_pqr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static 
raidz_inline int +raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t z = tgtidx[TARGET_Z]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + const size_t zsize = rm->rm_col[z].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *zabd = rm->rm_col[z].rc_abd; + abd_t *tabds[] = { xabd, yabd, zabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_pqr_coeff(rm, tgtidx, coeff); + + /* + * Check if some of the targets are shorter than the first one. + * In this case, shorter targets need to be replaced with + * a new xsize-long buffer so that syndromes can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + if (zsize < xsize) { + zabd = abd_alloc(xsize, B_FALSE); + tabds[2] = zabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + raidz_zero(zabd, xsize); + } + + /* generate P, Q and R syndromes */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y || c == z) { + /* target column: contributes no data */ + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + raidz_syn_pqr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, 
ysize); + if (zsize < xsize) + raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + + raidz_math_end(); + + /* Release the temporary full-length buffers allocated above */ + if (ysize < xsize) + abd_free(yabd); + if (zsize < xsize) + abd_free(zabd); + + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); +} + +#endif /* _VDEV_RAIDZ_MATH_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c new file mode 100644 index 0000000000..cd742e146c --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c @@ -0,0 +1,337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/vdev_raidz_impl.h> + +/* + * Provide native CPU scalar routines. + * Support 32bit and 64bit CPUs. + * + * NOTE(review): #if expressions are evaluated in intmax_t/uintmax_t, not in + * the CPU word size, so with a 64-bit uintmax_t only the second test can + * match and ELEM_SIZE is 8 even on 32-bit targets - confirm this is intended. + */ +#if ((~(0x0ULL)) >> 24) == 0xffULL +#define ELEM_SIZE 4 +typedef uint32_t iv_t; +#elif ((~(0x0ULL)) >> 56) == 0xffULL +#define ELEM_SIZE 8 +typedef uint64_t iv_t; +#endif + +/* + * Vector type used in scalar implementation + * + * The union is expected to be of native CPU register size. Since addition + * uses XOR operation, it can be performed on all byte elements at once. + * Multiplication requires per byte access. 
+ */ +typedef union { + iv_t e; + uint8_t b[ELEM_SIZE]; +} v_t; + +/* + * Precomputed lookup tables for multiplication by a constant + * + * Reconstruction path requires multiplication by constant factors. Instead of + * performing two step lookup (log & exp tables), a direct lookup can be used + * instead. Multiplication of element 'a' by a constant 'c' is obtained as: + * + * r = vdev_raidz_mul_lt[c_log][a]; + * + * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because + * they are faster to obtain while solving the syndrome equations. + * + * NOTE(review): raidz_init_scalar() fills row c with gf_mul(c, i) and MUL() + * selects the row with its first argument, which reads as if rows are keyed + * by the constant itself rather than by its logarithm - confirm against + * gf_mul() and the coefficient setup, the formula above may be stale. + * + * PERFORMANCE NOTE: + * Even though the complete lookup table uses 64kiB, only a relatively small + * portion of it is used at the same time. The following shows the number of + * accessed bytes for different cases: + * - 1 failed disk: 256B (1 mul. coefficient) + * - 2 failed disks: 512B (2 mul. coefficients) + * - 3 failed disks: 1536B (6 mul. coefficients) + * + * Size of actually accessed lookup table regions is only larger for + * reconstruction of 3 failed disks, when compared to traditional log/exp + * method. But since the result is obtained in one lookup step performance is + * doubled. + */ +static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256))); + +/* + * Fill vdev_raidz_mul_lt so that entry [c][i] holds gf_mul(c, i) for every + * pair of byte values. + */ +static void +raidz_init_scalar(void) +{ + int c, i; + for (c = 0; c < 256; c++) + for (i = 0; i < 256; i++) + vdev_raidz_mul_lt[c][i] = gf_mul(c, i); + +} + +/* Prefetch hooks expand to empty blocks in the scalar implementation */ +#define PREFETCHNTA(ptr, offset) {} +#define PREFETCH(ptr, offset) {} + +/* + * Primitive operations; each one works on a single v_t. XOR_ACC, LOAD and + * STORE take a pointer and access the first v_t it points to. + */ +#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e +#define XOR(src, acc) acc.e ^= src.e +#define ZERO(acc) acc.e = 0 +#define COPY(src, dst) dst = src +#define LOAD(src, val) val = ((v_t *)src)[0] +#define STORE(dst, val) ((v_t *)dst)[0] = val + +/* + * Constants used for optimized multiplication by 2. 
+ * + * mod: 0x1d repeated in every byte, XORed in where a byte's top bit was set + * mask: clears the low bit of every byte after the element-wide shift + * msb: selects the top bit of every byte + */ +static const struct { + iv_t mod; + iv_t mask; + iv_t msb; +} scalar_mul2_consts = { +#if ELEM_SIZE == 8 + .mod = 0x1d1d1d1d1d1d1d1dULL, + .mask = 0xfefefefefefefefeULL, + .msb = 0x8080808080808080ULL, +#else + .mod = 0x1d1d1d1dULL, + .mask = 0xfefefefeULL, + .msb = 0x80808080ULL, +#endif +}; + +/* Nothing to prepare for MUL2 in the scalar implementation */ +#define MUL2_SETUP() {} + +/* + * Multiply every byte of 'a' by 2 in one pass. _mask first collects the top + * bit of each byte; (_mask << 1) - (_mask >> 7) widens every collected bit to + * 0xff within its own byte. The element is then shifted left by one, bits + * that crossed into a neighbouring byte are masked off, and 'mod' is XORed + * into exactly the bytes whose top bit was set. + */ +#define MUL2(a) \ +{ \ + iv_t _mask; \ + \ + _mask = (a).e & scalar_mul2_consts.msb; \ + _mask = (_mask << 1) - (_mask >> 7); \ + (a).e = ((a).e << 1) & scalar_mul2_consts.mask; \ + (a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \ +} + +/* Multiply by 4 as two successive doublings */ +#define MUL4(a) \ +{ \ + MUL2(a); \ + MUL2(a); \ +} + +/* + * Replace every byte of 'a' with its entry in row vdev_raidz_mul_lt[c]. + * The switch on ELEM_SIZE handles bytes 7..4 only for 8-byte elements and + * then falls through to the four low bytes. + */ +#define MUL(c, a) \ +{ \ + const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \ + switch (ELEM_SIZE) { \ + case 8: \ + a.b[7] = mul_lt[a.b[7]]; \ + a.b[6] = mul_lt[a.b[6]]; \ + a.b[5] = mul_lt[a.b[5]]; \ + a.b[4] = mul_lt[a.b[4]]; \ + /* falls through */ \ + case 4: \ + a.b[3] = mul_lt[a.b[3]]; \ + a.b[2] = mul_lt[a.b[2]]; \ + a.b[1] = mul_lt[a.b[1]]; \ + a.b[0] = mul_lt[a.b[0]]; \ + break; \ + } \ +} + +/* No state to set up or tear down around a scalar math section */ +#define raidz_math_begin() {} +#define raidz_math_end() {} + +/* + * Register bindings used by the implementation template: every stride is 1 + * and every DEFINE macro declares plain v_t locals. + */ +#define SYN_STRIDE 1 + +#define ZERO_DEFINE() v_t d0 +#define ZERO_STRIDE 1 +#define ZERO_D d0 + +#define COPY_DEFINE() v_t d0 +#define COPY_STRIDE 1 +#define COPY_D d0 + +#define ADD_DEFINE() v_t d0 +#define ADD_STRIDE 1 +#define ADD_D d0 + +#define MUL_DEFINE() v_t d0 +#define MUL_STRIDE 1 +#define MUL_D d0 + +#define GEN_P_STRIDE 1 +#define GEN_P_DEFINE() v_t p0 +#define GEN_P_P p0 + +#define GEN_PQ_STRIDE 1 +#define GEN_PQ_DEFINE() v_t d0, c0 +#define GEN_PQ_D d0 +#define GEN_PQ_C c0 + +#define GEN_PQR_STRIDE 1 +#define GEN_PQR_DEFINE() v_t d0, c0 +#define GEN_PQR_D d0 +#define GEN_PQR_C c0 + +#define SYN_Q_DEFINE() v_t d0, x0 +#define SYN_Q_D d0 +#define SYN_Q_X x0 + + +#define SYN_R_DEFINE() v_t d0, x0 +#define SYN_R_D d0 +#define SYN_R_X x0 + + +#define SYN_PQ_DEFINE() v_t d0, x0 +#define SYN_PQ_D d0 +#define SYN_PQ_X x0 + + +#define REC_PQ_STRIDE 1 +#define REC_PQ_DEFINE() v_t x0, y0, t0 
+#define REC_PQ_X x0 +#define REC_PQ_Y y0 +#define REC_PQ_T t0 + + +#define SYN_PR_DEFINE() v_t d0, x0 +#define SYN_PR_D d0 +#define SYN_PR_X x0 + +#define REC_PR_STRIDE 1 +#define REC_PR_DEFINE() v_t x0, y0, t0 +#define REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 + +/* + * Pull in the implementation template; it is written in terms of the + * macros defined above. + */ +#include "vdev_raidz_math_impl.h" + +/* + * NOTE(review): presumably these emit the scalar gen/rec entry points that + * RAIDZ_GEN_METHODS(scalar)/RAIDZ_REC_METHODS(scalar) refer to - confirm. + */ +DEFINE_GEN_METHODS(scalar); +DEFINE_REC_METHODS(scalar); + +/* Report the scalar implementation as usable: it unconditionally is */ +boolean_t +raidz_will_scalar_work(void) +{ + return (B_TRUE); /* always */ +} + +/* Operations vector for the scalar implementation; it has no fini hook */ +const raidz_impl_ops_t vdev_raidz_scalar_impl = { + .init = raidz_init_scalar, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(scalar), + .rec = RAIDZ_REC_METHODS(scalar), + .is_supported = &raidz_will_scalar_work, + .name = "scalar" +}; + +/* Powers of 2 in the RAID-Z Galois field. 
*/ +const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; + +/* Logs of 2 in the RAID-Z Galois field. 
*/ +const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; |