author     Jerry Jelinek <jerry.jelinek@joyent.com>    2016-07-15 11:53:56 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2016-07-15 11:53:56 +0000
commit     9f5f51bfbb3b88deac203d4a5a721684d859fce3 (patch)
tree       0f0c7d7e31aad834a63b18a9e4887e87b55a790f
parent     3e5c385058213debeef48e2ae511fe2be800e9e9 (diff)
parent     4b5c8e93cab28d3c65ba9d407fd8f46e3be1db1c (diff)
download   illumos-joyent-9f5f51bfbb3b88deac203d4a5a721684d859fce3.tar.gz
[illumos-gate merge]
commit 4b5c8e93cab28d3c65ba9d407fd8f46e3be1db1c
7104 increase indirect block size
commit 470bc2d6d44a4a70ed9403c0bce321333e897c31
6327 devfsadm and bootadm taking long time for configs with large number of zvols
commit 25f7d993adbfb3452ac4625b3791670746d35ae3
7071 lzc_snapshot does not fill in errlist on ENOENT
commit dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
6950 ARC should cache compressed data
107 files changed, 4580 insertions, 2064 deletions
diff --git a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
index 4d4cb226c6..9b6d54c7b1 100644
--- a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
+++ b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
@@ -23,8 +23,8 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */

 #include <mdb/mdb_ctf.h>
@@ -912,6 +912,24 @@ mdb_ctf_offsetof_by_name(const char *type, const char *member)
 	return (off);
 }

+ssize_t
+mdb_ctf_sizeof_by_name(const char *type)
+{
+	mdb_ctf_id_t id;
+	ssize_t size;
+
+	if (mdb_ctf_lookup_by_name(type, &id) == -1) {
+		mdb_warn("couldn't find type %s", type);
+		return (-1);
+	}
+
+	if ((size = mdb_ctf_type_size(id)) == -1) {
+		mdb_warn("couldn't determine type size of %s", type);
+		return (-1);
+	}
+
+	return (size);
+}
+
 /*ARGSUSED*/
 static int
diff --git a/usr/src/cmd/mdb/common/mdb/mdb_ctf.h b/usr/src/cmd/mdb/common/mdb/mdb_ctf.h
index 85e60494d0..21f27d782b 100644
--- a/usr/src/cmd/mdb/common/mdb/mdb_ctf.h
+++ b/usr/src/cmd/mdb/common/mdb/mdb_ctf.h
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2015, Joyent, Inc.
  */

@@ -136,6 +136,7 @@ extern int mdb_ctf_member_info(mdb_ctf_id_t, const char *,
 extern int mdb_ctf_offsetof(mdb_ctf_id_t, const char *, ulong_t *);
 extern int mdb_ctf_num_members(mdb_ctf_id_t);
 extern int mdb_ctf_offsetof_by_name(const char *, const char *);
+extern ssize_t mdb_ctf_sizeof_by_name(const char *);
 extern ssize_t mdb_ctf_offset_to_name(mdb_ctf_id_t, ulong_t, char *, size_t,
     int, mdb_ctf_id_t *, ulong_t *);
diff --git a/usr/src/cmd/mdb/common/modules/conf/mapfile-extern b/usr/src/cmd/mdb/common/modules/conf/mapfile-extern
index 2491a7a8d5..73de8f1e41 100644
--- a/usr/src/cmd/mdb/common/modules/conf/mapfile-extern
+++ b/usr/src/cmd/mdb/common/modules/conf/mapfile-extern
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #
 # CDDL HEADER START
 #
@@ -67,6 +67,7 @@ SYMBOL_SCOPE {
 	mdb_ctf_module_lookup		{ FLAGS = EXTERN };
 	mdb_ctf_offsetof		{ FLAGS = EXTERN };
 	mdb_ctf_offsetof_by_name	{ FLAGS = EXTERN };
+	mdb_ctf_sizeof_by_name		{ FLAGS = EXTERN };
 	mdb_ctf_readsym			{ FLAGS = EXTERN };
 	mdb_ctf_type_cmp		{ FLAGS = EXTERN };
 	mdb_ctf_type_invalidate		{ FLAGS = EXTERN };
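The new mdb_ctf_sizeof_by_name() pairs with the existing mdb_ctf_offsetof_by_name(), letting a dmod compute a target-side structure size by name instead of baking in a compile-time sizeof. A minimal sketch of the usage pattern (the type name "foo_t" and the base/idx variables are hypothetical, not part of this change):

	ssize_t sz = mdb_ctf_sizeof_by_name("foo_t");
	if (sz == -1)
		return (WALK_ERR);	/* the helper already printed a warning */
	/* address of the idx'th element of a target-side foo_t array */
	uintptr_t addr = base + sz * idx;

This is exactly how the new multilist walker below strides through ml_sublists.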
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 697349b020..12f26d382c 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -42,6 +42,7 @@
 #include <ctype.h>
 #include <sys/zfs_acl.h>
 #include <sys/sa_impl.h>
+#include <sys/multilist.h>

 #ifdef _KERNEL
 #define	ZFS_OBJ_NAME	"zfs"
@@ -973,6 +974,7 @@ arc_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 	    "mfu_ghost_evictable_metadata", "evict_l2_cached",
 	    "evict_l2_eligible", "evict_l2_ineligible", "l2_read_bytes",
 	    "l2_write_bytes", "l2_size", "l2_asize", "l2_hdr_size",
+	    "compressed_size", "uncompressed_size", "overhead_size",
 	    NULL
 	};
@@ -1655,7 +1657,6 @@ metaslab_walk_step(mdb_walk_state_t *wsp)
 	return (wsp->walk_callback(msp, &ms, wsp->walk_cbdata));
 }

-/* ARGSUSED */
 static int
 metaslab_walk_init(mdb_walk_state_t *wsp)
 {
@@ -2183,6 +2184,69 @@ zio_state(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 	return (mdb_pwalk_dcmd("zio_root", "zio", argc, argv, addr));
 }

+typedef struct mdb_multilist {
+	uint64_t ml_num_sublists;
+	uintptr_t ml_sublists;
+} mdb_multilist_t;
+
+typedef struct multilist_walk_data {
+	uint64_t mwd_idx;
+	mdb_multilist_t mwd_ml;
+} multilist_walk_data_t;
+
+/* ARGSUSED */
+static int
+multilist_print_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	mdb_printf("%#lr\n", addr);
+	return (WALK_NEXT);
+}
+
+static int
+multilist_walk_step(mdb_walk_state_t *wsp)
+{
+	multilist_walk_data_t *mwd = wsp->walk_data;
+
+	if (mwd->mwd_idx >= mwd->mwd_ml.ml_num_sublists)
+		return (WALK_DONE);
+
+	wsp->walk_addr = mwd->mwd_ml.ml_sublists +
+	    mdb_ctf_sizeof_by_name("multilist_sublist_t") * mwd->mwd_idx +
+	    mdb_ctf_offsetof_by_name("multilist_sublist_t", "mls_list");
+
+	mdb_pwalk("list", multilist_print_cb, (void*)NULL, wsp->walk_addr);
+	mwd->mwd_idx++;
+
+	return (WALK_NEXT);
+}
+
+static int
+multilist_walk_init(mdb_walk_state_t *wsp)
+{
+	multilist_walk_data_t *mwd;
+
+	if (wsp->walk_addr == NULL) {
+		mdb_warn("must supply address of multilist_t\n");
+		return (WALK_ERR);
+	}
+
+	mwd = mdb_zalloc(sizeof (multilist_walk_data_t), UM_SLEEP | UM_GC);
+	if (mdb_ctf_vread(&mwd->mwd_ml, "multilist_t", "mdb_multilist_t",
+	    wsp->walk_addr, 0) == -1) {
+		return (WALK_ERR);
+	}
+
+	if (mwd->mwd_ml.ml_num_sublists == 0 ||
+	    mwd->mwd_ml.ml_sublists == NULL) {
+		mdb_warn("invalid or uninitialized multilist at %#lx\n",
+		    wsp->walk_addr);
+		return (WALK_ERR);
+	}
+
+	wsp->walk_data = mwd;
+	return (WALK_NEXT);
+}
+
 typedef struct txg_list_walk_data {
 	uintptr_t lw_head[TXG_SIZE];
 	int lw_txgoff;
@@ -3269,6 +3333,359 @@ rrwlock(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 	return (DCMD_OK);
 }

+typedef struct mdb_arc_buf_hdr_t {
+	uint16_t b_psize;
+	uint16_t b_lsize;
+	struct {
+		uint32_t	b_bufcnt;
+		uintptr_t	b_state;
+		uintptr_t	b_pdata;
+	} b_l1hdr;
+} mdb_arc_buf_hdr_t;
+
+enum arc_cflags {
+	ARC_CFLAG_VERBOSE	= 1 << 0,
+	ARC_CFLAG_ANON		= 1 << 1,
+	ARC_CFLAG_MRU		= 1 << 2,
+	ARC_CFLAG_MFU		= 1 << 3,
+	ARC_CFLAG_BUFS		= 1 << 4,
+};
+
+typedef struct arc_compression_stats_data {
+	GElf_Sym anon_sym;	/* ARC_anon symbol */
+	GElf_Sym mru_sym;	/* ARC_mru symbol */
+	GElf_Sym mrug_sym;	/* ARC_mru_ghost symbol */
+	GElf_Sym mfu_sym;	/* ARC_mfu symbol */
+	GElf_Sym mfug_sym;	/* ARC_mfu_ghost symbol */
+	GElf_Sym l2c_sym;	/* ARC_l2c_only symbol */
+	uint64_t *anon_c_hist;	/* histogram of compressed sizes in anon */
+	uint64_t *anon_u_hist;	/* histogram of uncompressed sizes in anon */
+	uint64_t *anon_bufs;	/* histogram of buffer counts in anon state */
+	uint64_t *mru_c_hist;	/* histogram of compressed sizes in mru */
+	uint64_t *mru_u_hist;	/* histogram of uncompressed sizes in mru */
+	uint64_t *mru_bufs;	/* histogram of buffer counts in mru */
+	uint64_t *mfu_c_hist;	/* histogram of compressed sizes in mfu */
+	uint64_t *mfu_u_hist;	/* histogram of uncompressed sizes in mfu */
+	uint64_t *mfu_bufs;	/* histogram of buffer counts in mfu */
+	uint64_t *all_c_hist;	/* histogram of compressed anon + mru + mfu */
+	uint64_t *all_u_hist;	/* histogram of uncompressed anon + mru + mfu */
+	uint64_t *all_bufs;	/* histogram of buffer counts in all states */
+	int arc_cflags;		/* arc compression flags, specified by user */
+	int hist_nbuckets;	/* number of buckets in each histogram */
+} arc_compression_stats_data_t;
+
+int
+highbit64(uint64_t i)
+{
+	int h = 1;
+
+	if (i == 0)
+		return (0);
+	if (i & 0xffffffff00000000ULL) {
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+}
+
+/* ARGSUSED */
+static int
+arc_compression_stats_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	arc_compression_stats_data_t *data = arg;
+	mdb_arc_buf_hdr_t hdr;
+	int cbucket, ubucket, bufcnt;
+
+	if (mdb_ctf_vread(&hdr, "arc_buf_hdr_t", "mdb_arc_buf_hdr_t",
+	    addr, 0) == -1) {
+		return (WALK_ERR);
+	}
+
+	/*
+	 * Headers in the ghost states, or the l2c_only state don't have
+	 * arc buffers linked off of them. Thus, their compressed size
+	 * is meaningless, so we skip these from the stats.
+	 */
+	if (hdr.b_l1hdr.b_state == data->mrug_sym.st_value ||
+	    hdr.b_l1hdr.b_state == data->mfug_sym.st_value ||
+	    hdr.b_l1hdr.b_state == data->l2c_sym.st_value) {
+		return (WALK_NEXT);
+	}
+
+	/*
+	 * The physical size (compressed) and logical size
+	 * (uncompressed) are in units of SPA_MINBLOCKSIZE. By default,
+	 * we use the log2 of this value (rounded down to the nearest
+	 * integer) to determine the bucket to assign this header to.
+	 * Thus, the histogram is logarithmic with respect to the size
+	 * of the header. For example, the following is a mapping of the
+	 * bucket numbers and the range of header sizes they correspond to:
+	 *
+	 *	0: 0 byte headers
+	 *	1: 512 byte headers
+	 *	2: [1024 - 2048) byte headers
+	 *	3: [2048 - 4096) byte headers
+	 *	4: [4096 - 8192) byte headers
+	 *	5: [8192 - 16384) byte headers
+	 *	6: [16384 - 32768) byte headers
+	 *	7: [32768 - 65536) byte headers
+	 *	8: [65536 - 131072) byte headers
+	 *	9: 131072 byte headers
+	 *
+	 * If the ARC_CFLAG_VERBOSE flag was specified, we use the
+	 * physical and logical sizes directly. Thus, the histogram will
+	 * no longer be logarithmic; instead it will be linear with
+	 * respect to the size of the header. The following is a mapping
+	 * of the first many bucket numbers and the header size they
+	 * correspond to:
+	 *
+	 *	0: 0 byte headers
+	 *	1: 512 byte headers
+	 *	2: 1024 byte headers
+	 *	3: 1536 byte headers
+	 *	4: 2048 byte headers
+	 *	5: 2560 byte headers
+	 *	6: 3072 byte headers
+	 *
+	 * And so on. Keep in mind that a range of sizes isn't used in
+	 * the case of linear scale because the headers can only
+	 * increment or decrement in sizes of 512 bytes. So, it's not
+	 * possible for a header to be sized in between what's listed
+	 * above.
+	 *
+	 * Also, the above mapping values were calculated assuming a
+	 * SPA_MINBLOCKSHIFT of 512 bytes and a SPA_MAXBLOCKSIZE of 128K.
+	 */
+
+	if (data->arc_cflags & ARC_CFLAG_VERBOSE) {
+		cbucket = hdr.b_psize;
+		ubucket = hdr.b_lsize;
+	} else {
+		cbucket = highbit64(hdr.b_psize);
+		ubucket = highbit64(hdr.b_lsize);
+	}
+
+	bufcnt = hdr.b_l1hdr.b_bufcnt;
+	if (bufcnt >= data->hist_nbuckets)
+		bufcnt = data->hist_nbuckets - 1;
+
+	/* Ensure we stay within the bounds of the histogram array */
+	ASSERT3U(cbucket, <, data->hist_nbuckets);
+	ASSERT3U(ubucket, <, data->hist_nbuckets);
+
+	if (hdr.b_l1hdr.b_state == data->anon_sym.st_value) {
+		data->anon_c_hist[cbucket]++;
+		data->anon_u_hist[ubucket]++;
+		data->anon_bufs[bufcnt]++;
+	} else if (hdr.b_l1hdr.b_state == data->mru_sym.st_value) {
+		data->mru_c_hist[cbucket]++;
+		data->mru_u_hist[ubucket]++;
+		data->mru_bufs[bufcnt]++;
+	} else if (hdr.b_l1hdr.b_state == data->mfu_sym.st_value) {
+		data->mfu_c_hist[cbucket]++;
+		data->mfu_u_hist[ubucket]++;
+		data->mfu_bufs[bufcnt]++;
+	}
+
+	data->all_c_hist[cbucket]++;
+	data->all_u_hist[ubucket]++;
+	data->all_bufs[bufcnt]++;
+
+	return (WALK_NEXT);
+}
+
+/* ARGSUSED */
+static int
+arc_compression_stats(uintptr_t addr, uint_t flags, int argc,
+    const mdb_arg_t *argv)
+{
+	arc_compression_stats_data_t data = { 0 };
+	unsigned int max_shifted = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
+	unsigned int hist_size;
+	char range[32];
+	int rc = DCMD_OK;
+
+	if (mdb_getopts(argc, argv,
+	    'v', MDB_OPT_SETBITS, ARC_CFLAG_VERBOSE, &data.arc_cflags,
+	    'a', MDB_OPT_SETBITS, ARC_CFLAG_ANON, &data.arc_cflags,
+	    'b', MDB_OPT_SETBITS, ARC_CFLAG_BUFS, &data.arc_cflags,
+	    'r', MDB_OPT_SETBITS, ARC_CFLAG_MRU, &data.arc_cflags,
+	    'f', MDB_OPT_SETBITS, ARC_CFLAG_MFU, &data.arc_cflags) != argc)
+		return (DCMD_USAGE);
+
+	if (mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_anon", &data.anon_sym) ||
+	    mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mru", &data.mru_sym) ||
+	    mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mru_ghost", &data.mrug_sym) ||
+	    mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mfu", &data.mfu_sym) ||
+	    mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mfu_ghost", &data.mfug_sym) ||
+	    mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_l2c_only", &data.l2c_sym)) {
+		mdb_warn("can't find arc state symbol");
+		return (DCMD_ERR);
+	}
+
+	/*
+	 * Determine the maximum expected size for any header, and use
+	 * this to determine the number of buckets needed for each
+	 * histogram. If ARC_CFLAG_VERBOSE is specified, this value is
+	 * used directly; otherwise the log2 of the maximum size is
+	 * used. Thus, if using a log2 scale there's a maximum of 10
+	 * possible buckets, while the linear scale (when using
+	 * ARC_CFLAG_VERBOSE) has a maximum of 257 buckets.
+	 */
+	if (data.arc_cflags & ARC_CFLAG_VERBOSE)
+		data.hist_nbuckets = max_shifted + 1;
+	else
+		data.hist_nbuckets = highbit64(max_shifted) + 1;
+
+	hist_size = sizeof (uint64_t) * data.hist_nbuckets;
+
+	data.anon_c_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.anon_u_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.anon_bufs = mdb_zalloc(hist_size, UM_SLEEP);
+
+	data.mru_c_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.mru_u_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.mru_bufs = mdb_zalloc(hist_size, UM_SLEEP);
+
+	data.mfu_c_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.mfu_u_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.mfu_bufs = mdb_zalloc(hist_size, UM_SLEEP);
+
+	data.all_c_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.all_u_hist = mdb_zalloc(hist_size, UM_SLEEP);
+	data.all_bufs = mdb_zalloc(hist_size, UM_SLEEP);
+
+	if (mdb_walk("arc_buf_hdr_t_full", arc_compression_stats_cb,
+	    &data) != 0) {
+		mdb_warn("can't walk arc_buf_hdr's");
+		rc = DCMD_ERR;
+		goto out;
+	}
+
+	if (data.arc_cflags & ARC_CFLAG_VERBOSE) {
+		rc = mdb_snprintf(range, sizeof (range),
+		    "[n*%llu, (n+1)*%llu)", SPA_MINBLOCKSIZE,
+		    SPA_MINBLOCKSIZE);
+	} else {
+		rc = mdb_snprintf(range, sizeof (range),
+		    "[2^(n-1)*%llu, 2^n*%llu)", SPA_MINBLOCKSIZE,
+		    SPA_MINBLOCKSIZE);
+	}
+
+	if (rc < 0) {
+		/* snprintf failed, abort the dcmd */
+		rc = DCMD_ERR;
+		goto out;
+	} else {
+		/* snprintf succeeded above, reset return code */
+		rc = DCMD_OK;
+	}
+
+	if (data.arc_cflags & ARC_CFLAG_ANON) {
+		if (data.arc_cflags & ARC_CFLAG_BUFS) {
+			mdb_printf("Histogram of the number of anon buffers "
+			    "that are associated with an arc hdr.\n");
+			dump_histogram(data.anon_bufs, data.hist_nbuckets, 0);
+			mdb_printf("\n");
+		}
+		mdb_printf("Histogram of compressed anon buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.anon_c_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+
+		mdb_printf("Histogram of uncompressed anon buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.anon_u_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+	}
+
+	if (data.arc_cflags & ARC_CFLAG_MRU) {
+		if (data.arc_cflags & ARC_CFLAG_BUFS) {
+			mdb_printf("Histogram of the number of mru buffers "
+			    "that are associated with an arc hdr.\n");
+			dump_histogram(data.mru_bufs, data.hist_nbuckets, 0);
+			mdb_printf("\n");
+		}
+		mdb_printf("Histogram of compressed mru buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.mru_c_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+
+		mdb_printf("Histogram of uncompressed mru buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.mru_u_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+	}
+
+	if (data.arc_cflags & ARC_CFLAG_MFU) {
+		if (data.arc_cflags & ARC_CFLAG_BUFS) {
+			mdb_printf("Histogram of the number of mfu buffers "
+			    "that are associated with an arc hdr.\n");
+			dump_histogram(data.mfu_bufs, data.hist_nbuckets, 0);
+			mdb_printf("\n");
+		}
+
+		mdb_printf("Histogram of compressed mfu buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.mfu_c_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+
+		mdb_printf("Histogram of uncompressed mfu buffers.\n"
+		    "Each bucket represents buffers of size: %s.\n", range);
+		dump_histogram(data.mfu_u_hist, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+	}
+
+	if (data.arc_cflags & ARC_CFLAG_BUFS) {
+		mdb_printf("Histogram of all buffers that "
+		    "are associated with an arc hdr.\n");
+		dump_histogram(data.all_bufs, data.hist_nbuckets, 0);
+		mdb_printf("\n");
+	}
+
+	mdb_printf("Histogram of all compressed buffers.\n"
+	    "Each bucket represents buffers of size: %s.\n", range);
+	dump_histogram(data.all_c_hist, data.hist_nbuckets, 0);
+	mdb_printf("\n");
+
+	mdb_printf("Histogram of all uncompressed buffers.\n"
+	    "Each bucket represents buffers of size: %s.\n", range);
+	dump_histogram(data.all_u_hist, data.hist_nbuckets, 0);
+
+out:
+	mdb_free(data.anon_c_hist, hist_size);
+	mdb_free(data.anon_u_hist, hist_size);
+	mdb_free(data.anon_bufs, hist_size);
+
+	mdb_free(data.mru_c_hist, hist_size);
+	mdb_free(data.mru_u_hist, hist_size);
+	mdb_free(data.mru_bufs, hist_size);
+
+	mdb_free(data.mfu_c_hist, hist_size);
+	mdb_free(data.mfu_u_hist, hist_size);
+	mdb_free(data.mfu_bufs, hist_size);
+
+	mdb_free(data.all_c_hist, hist_size);
+	mdb_free(data.all_u_hist, hist_size);
+	mdb_free(data.all_bufs, hist_size);
+
+	return (rc);
+}
+
 /*
  * MDB module linkage information:
  *
@@ -3339,6 +3756,14 @@ static const mdb_dcmd_t dcmds[] = {
 	    "print zfs debug log", dbgmsg},
 	{ "rrwlock", ":",
 	    "print rrwlock_t, including readers", rrwlock},
+	{ "arc_compression_stats", ":[-vabrf]\n"
+	    "\t-v verbose, display a linearly scaled histogram\n"
+	    "\t-a display ARC_anon state statistics individually\n"
+	    "\t-r display ARC_mru state statistics individually\n"
+	    "\t-f display ARC_mfu state statistics individually\n"
+	    "\t-b display histogram of buffer counts\n",
+	    "print a histogram of compressed arc buffer sizes",
+	    arc_compression_stats},
 	{ NULL }
 };

@@ -3364,6 +3789,8 @@ static const mdb_walker_t walkers[] = {
 	    spa_walk_init, spa_walk_step, NULL },
 	{ "metaslab", "given a spa_t *, walk all metaslab_t structures",
 	    metaslab_walk_init, metaslab_walk_step, NULL },
+	{ "multilist", "given a multilist_t *, walk all list_t structures",
+	    multilist_walk_init, multilist_walk_step, NULL },
 	{ "zfs_acl_node", "given a zfs_acl_t, walk all zfs_acl_nodes",
 	    zfs_acl_node_walk_init, zfs_acl_node_walk_step, NULL },
 	{ "zfs_acl_node_aces", "given a zfs_acl_node_t, walk all ACEs",
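To make the bucketing above concrete: b_psize and b_lsize count 512-byte (SPA_MINBLOCKSIZE) units, so a header holding 4K of compressed data has b_psize == 8 and falls in bucket highbit64(8) == 4, the [4096 - 8192) range; with -v the bucket is just the unit count itself. A sketch restating the dcmd's own logic (size_to_bucket is a hypothetical name, not part of the patch):

	static int
	size_to_bucket(uint16_t units, boolean_t verbose)
	{
		/* linear scale: one bucket per 512-byte step */
		if (verbose)
			return (units);
		/* log2 scale: floor(log2(units)) + 1, and 0 for empty headers */
		return (highbit64(units));
	}

The histograms themselves come from the arc_compression_stats dcmd registered above, with -a/-r/-f selecting per-state output and -b adding the buffer-count histograms.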
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 812453b826..deef7b4084 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -1264,7 +1264,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
 		}
 		if (!err)
 			ASSERT3U(fill, ==, BP_GET_FILL(bp));
-		(void) arc_buf_remove_ref(buf, &buf);
+		arc_buf_destroy(buf, &buf);
 	}

 	return (err);
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index a3800f60ce..b21d61d3a7 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -187,6 +187,7 @@ extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;

 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5378,6 +5379,12 @@ ztest_resume_thread(void *arg)
 		if (spa_suspended(spa))
 			ztest_resume(spa);
 		(void) poll(NULL, 0, 100);
+
+		/*
+		 * Periodically change the zfs_compressed_arc_enabled setting.
+		 */
+		if (ztest_random(10) == 0)
+			zfs_compressed_arc_enabled = ztest_random(2);
 	}

 	return (NULL);
 }
diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
index 822f67c923..a159cec627 100644
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
@@ -23,7 +23,7 @@
  */

 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */

@@ -81,6 +81,8 @@
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512) */
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger */

+#define	SPA_COMPRESSBITS	7
+
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
@@ -203,8 +205,10 @@ typedef struct blkptr {
 #define	BP_SET_PSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)

-#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 7)
-#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 7, x)
+#define	BP_GET_COMPRESS(bp)		\
+	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define	BP_SET_COMPRESS(bp, x)		\
+	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)

 #define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
 #define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)
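BF64_GET(x, low, len) reads a len-bit field starting at bit low, so the compression algorithm occupies bits 32-38 of blk_prop. A rough plain-C restatement, assuming the usual shift-and-mask semantics of that macro (this snippet is illustrative, not from the patch):

	uint64_t compress =
	    (bp->blk_prop >> 32) & ((1ULL << SPA_COMPRESSBITS) - 1);

Naming the field width once as SPA_COMPRESSBITS keeps BP_GET_COMPRESS() and BP_SET_COMPRESS() from drifting apart.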
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index e173e16658..d0421bea94 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -24,7 +24,7 @@
  */

 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */

 /* LINTLIBRARY */
@@ -67,3 +67,4 @@ extern uint64_t metaslab_df_alloc_threshold;
 extern boolean_t zfeature_checks_disable;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index 9516cd0254..4ea9b22ac3 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -143,6 +143,10 @@ dir path=opt/zfs-tests/tests/functional/zvol/zvol_ENOSPC
 dir path=opt/zfs-tests/tests/functional/zvol/zvol_cli
 dir path=opt/zfs-tests/tests/functional/zvol/zvol_misc
 dir path=opt/zfs-tests/tests/functional/zvol/zvol_swap
+dir path=opt/zfs-tests/tests/perf
+dir path=opt/zfs-tests/tests/perf/fio
+dir path=opt/zfs-tests/tests/perf/regression
+dir path=opt/zfs-tests/tests/perf/scripts
 file path=opt/zfs-tests/README mode=0444
 file path=opt/zfs-tests/bin/chg_usr_exec mode=0555
 file path=opt/zfs-tests/bin/devname2devid mode=0555
@@ -170,6 +174,7 @@ file path=opt/zfs-tests/include/properties.shlib mode=0555
 file path=opt/zfs-tests/runfiles/delphix.run mode=0444
 file path=opt/zfs-tests/runfiles/omnios.run mode=0444
 file path=opt/zfs-tests/runfiles/openindiana.run mode=0444
+file path=opt/zfs-tests/runfiles/perf-regression.run mode=0444
 file path=opt/zfs-tests/tests/functional/acl/acl.cfg mode=0555
 file path=opt/zfs-tests/tests/functional/acl/acl_common.kshlib mode=0555
 file path=opt/zfs-tests/tests/functional/acl/cifs/cifs.kshlib mode=0555
@@ -2247,8 +2252,28 @@ file path=opt/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_005_pos \
     mode=0555
 file path=opt/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_006_pos \
     mode=0555
+file path=opt/zfs-tests/tests/perf/fio/mkfiles.fio mode=0444
+file path=opt/zfs-tests/tests/perf/fio/random_reads.fio mode=0444
+file path=opt/zfs-tests/tests/perf/fio/random_readwrite.fio mode=0444
+file path=opt/zfs-tests/tests/perf/fio/random_writes.fio mode=0444
+file path=opt/zfs-tests/tests/perf/fio/sequential_reads.fio mode=0444
+file path=opt/zfs-tests/tests/perf/fio/sequential_writes.fio mode=0444
+file path=opt/zfs-tests/tests/perf/perf.shlib mode=0555
+file path=opt/zfs-tests/tests/perf/regression/random_reads mode=0555
+file path=opt/zfs-tests/tests/perf/regression/random_readwrite mode=0555
+file path=opt/zfs-tests/tests/perf/regression/random_writes mode=0555
+file path=opt/zfs-tests/tests/perf/regression/sequential_reads mode=0555
+file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached \
+    mode=0555
+file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached_clone \
+    mode=0555
+file path=opt/zfs-tests/tests/perf/regression/sequential_writes mode=0555
+file path=opt/zfs-tests/tests/perf/regression/setup mode=0555
+file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0555
+file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0555
 license cr_Sun license=cr_Sun
 license lic_CDDL license=lic_CDDL
+#depend fmri=benchmark/fio type=require
 depend fmri=system/file-system/zfs/tests type=require
 depend fmri=system/test/testrunner type=require
 depend fmri=system/xopen/xcu4 type=require
diff --git a/usr/src/test/test-runner/cmd/run.py b/usr/src/test/test-runner/cmd/run.py
index 0609a89aff..32c6245956 100644
--- a/usr/src/test/test-runner/cmd/run.py
+++ b/usr/src/test/test-runner/cmd/run.py
@@ -27,6 +27,7 @@ from select import select
 from subprocess import PIPE
 from subprocess import Popen
 from sys import argv
+from sys import maxint
 from sys import exit
 from threading import Timer
 from time import time
@@ -144,13 +145,16 @@ class Cmd(object):
     def __init__(self, pathname, outputdir=None, timeout=None, user=None):
         self.pathname = pathname
         self.outputdir = outputdir or 'BASEDIR'
-        self.timeout = timeout or 60
+        self.timeout = timeout
         self.user = user or ''
         self.killed = False
         self.result = Result()

+        if self.timeout == None:
+            self.timeout = 60
+
     def __str__(self):
-        return "Pathname: %s\nOutputdir: %s\nTimeout: %s\nUser: %s\n" % (
+        return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % (
             self.pathname, self.outputdir, self.timeout, self.user)

     def kill_cmd(self, proc):
@@ -227,6 +231,10 @@ class Cmd(object):
         try:
             self.result.starttime = time()
             proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
+
+            # Allow a special timeout value of 0 to mean infinity
+            if int(self.timeout) == 0:
+                self.timeout = maxint
             t = Timer(int(self.timeout), self.kill_cmd, [proc])
             t.start()
             self.result.stdout, self.result.stderr = self.collect_output(proc)
@@ -315,7 +323,7 @@ class Test(Cmd):
             pre_user = ' (as %s)' % (self.pre_user)
         if len(self.post_user):
             post_user = ' (as %s)' % (self.post_user)
-        return "Pathname: %s\nOutputdir: %s\nTimeout: %s\nPre: %s%s\nPost: " \
+        return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nPre: %s%s\nPost: " \
             "%s%s\nUser: %s\n" % (self.pathname, self.outputdir,
             self.timeout, self.pre, pre_user, self.post, post_user,
             self.user)
@@ -390,7 +398,7 @@ class TestGroup(Test):
             pre_user = ' (as %s)' % (self.pre_user)
         if len(self.post_user):
             post_user = ' (as %s)' % (self.post_user)
-        return "Pathname: %s\nOutputdir: %s\nTests: %s\nTimeout: %s\n" \
+        return "Pathname: %s\nOutputdir: %s\nTests: %s\nTimeout: %d\n" \
             "Pre: %s%s\nPost: %s%s\nUser: %s\n" % (self.pathname,
             self.outputdir, self.tests, self.timeout, self.pre, pre_user,
             self.post, post_user, self.user)
diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg
index c65cafb3df..33a6fe21dd 100644
--- a/usr/src/test/zfs-tests/include/commands.cfg
+++ b/usr/src/test/zfs-tests/include/commands.cfg
@@ -41,17 +41,20 @@ export DF="/usr/bin/df"
 export DIFF="/usr/bin/diff"
 export DIRCMP="/usr/bin/dircmp"
 export DIRNAME="/usr/bin/dirname"
+export DTRACE="/usr/sbin/dtrace"
 export DU="/usr/bin/du"
 export DUMPADM="/usr/sbin/dumpadm"
 export ECHO="/usr/bin/echo"
 export EGREP="/usr/bin/egrep"
 # Don't use $ENV here, because in ksh scripts it evaluates to
 # $HOME/.kshrc - likely not what you wanted.
+export FALSE="/usr/bin/false"
 export FDISK="/usr/sbin/fdisk"
 export FF="/usr/sbin/ff"
 export FGREP="/usr/bin/fgrep"
 export FILE="/usr/bin/file"
 export FIND="/usr/bin/find"
+export FIO="/usr/bin/fio"
 export FMADM="/usr/sbin/fmadm"
 export FMDUMP="/usr/sbin/fmdump"
 export FORMAT="/usr/sbin/format"
@@ -70,6 +73,7 @@ export GROUPS="/usr/bin/groups"
 export HEAD="/usr/bin/head"
 export HOSTNAME="/usr/bin/hostname"
 export ID="/usr/bin/id"
+export IOSTAT="/usr/bin/iostat"
 export ISAINFO="/usr/bin/isainfo"
 export KILL="/usr/bin/kill"
 export KSH="/usr/bin/ksh"
@@ -88,6 +92,7 @@ export MKNOD="/usr/sbin/mknod"
 export MODINFO="/usr/sbin/modinfo"
 export MODUNLOAD="/usr/sbin/modunload"
 export MOUNT="/usr/sbin/mount"
+export MPSTAT="/usr/bin/mpstat"
 export MV="/usr/bin/mv"
 export NAWK="/usr/bin/nawk"
 export NCHECK="/usr/sbin/ncheck"
@@ -108,6 +113,7 @@ export PSRINFO="/usr/sbin/psrinfo"
 export PWD="/usr/bin/pwd"
 export PYTHON="/usr/bin/python"
 export QUOTAON="/usr/sbin/quotaon"
+export READLINK="/usr/bin/readlink"
 export RCP="/usr/bin/rcp"
 export REBOOT="/usr/sbin/reboot"
 export RM="/usr/bin/rm"
@@ -130,6 +136,7 @@ export SWAPADD="/sbin/swapadd"
 export SYNC="/usr/bin/sync"
 export TAIL="/usr/bin/tail"
 export TAR="/usr/sbin/tar"
+export TIMEOUT="/usr/bin/timeout"
 export TOUCH="/usr/bin/touch"
 export TR="/usr/bin/tr"
 export TRUNCATE="/usr/bin/truncate"
@@ -148,11 +155,13 @@ export UNSHARE="/usr/sbin/unshare"
 export USERADD="/usr/sbin/useradd"
 export USERDEL="/usr/sbin/userdel"
 export USERMOD="/usr/sbin/usermod"
+export VMSTAT="/usr/bin/vmstat"
 export WAIT="/usr/bin/wait"
 export WC="/usr/bin/wc"
 export ZDB="/usr/sbin/zdb"
 export ZFS="/usr/sbin/zfs"
 export ZHACK="/usr/sbin/zhack"
+export ZINJECT="/usr/sbin/zinject"
 export ZLOGIN="/usr/sbin/zlogin"
 export ZLOOK="/usr/bin/zlook"
 export ZONEADM="/usr/sbin/zoneadm"
diff --git a/usr/src/test/zfs-tests/include/default.cfg b/usr/src/test/zfs-tests/include/default.cfg
index 61fb25e628..9318c31d35 100644
--- a/usr/src/test/zfs-tests/include/default.cfg
+++ b/usr/src/test/zfs-tests/include/default.cfg
@@ -81,12 +81,15 @@ export COMPRESSION_PROP=on
 export CHECKSUM_PROP=on

 # some common variables used by test scripts :
+export FIO_SCRIPTS=$STF_SUITE/tests/perf/fio
+export PERF_SCRIPTS=$STF_SUITE/tests/perf/scripts

 # some test pool names
 export TESTPOOL=testpool.$$
 export TESTPOOL1=testpool1.$$
 export TESTPOOL2=testpool2.$$
 export TESTPOOL3=testpool3.$$
+export PERFPOOL=perfpool

 # some test file system names
 export TESTFS=testfs.$$
@@ -130,6 +133,12 @@ export BIGVOLSIZE=1eb
 # Default to limit disks to be checked
 export MAX_FINDDISKSNUM=6

+# Default minimum size for file based vdevs in the test suite
+export MINVDEVSIZE=$((256 * 1024 * 1024))
+
+# Minimum vdev size possible as defined in the OS
+export SPA_MINDEVSIZE=$((64 * 1024 * 1024))
+
 export AUTO_SNAP=$($SVCS -a | $GREP auto-snapshot | $GREP online | $AWK \
     '{print $3}')
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index 30f10fcf9f..1d3202768b 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -1172,7 +1172,7 @@ function zfs_zones_setup #zone_name zone_root zone_ip
 	#
 	if verify_slog_support ; then
 		typeset sdevs="/var/tmp/sdev1 /var/tmp/sdev2"
-		log_must $MKFILE 100M $sdevs
+		log_must $MKFILE $MINVDEVSIZE $sdevs
 		log_must $ZPOOL add $pool_name log mirror $sdevs
 	fi
@@ -2088,7 +2088,7 @@ function verify_slog_support
 	typeset sdev=$dir/b

 	$MKDIR -p $dir
-	$MKFILE 64M $vdev $sdev
+	$MKFILE $MINVDEVSIZE $vdev $sdev

 	typeset -i ret=0
 	if ! $ZPOOL create -n $pool $vdev log $sdev > /dev/null 2>&1; then
@@ -2446,3 +2446,27 @@ function vdevs_in_pool

 	return 0;
 }
+
+function get_max
+{
+	typeset -l i max=$1
+	shift
+
+	for i in "$@"; do
+		max=$(echo $((max > i ? max : i)))
+	done
+
+	echo $max
+}
+
+function get_min
+{
+	typeset -l i min=$1
+	shift
+
+	for i in "$@"; do
+		min=$(echo $((min < i ? min : i)))
+	done
+
+	echo $min
+}
diff --git a/usr/src/test/zfs-tests/runfiles/Makefile b/usr/src/test/zfs-tests/runfiles/Makefile
index 0721285dba..039a0caff3 100644
--- a/usr/src/test/zfs-tests/runfiles/Makefile
+++ b/usr/src/test/zfs-tests/runfiles/Makefile
@@ -16,7 +16,10 @@

 include $(SRC)/Makefile.master

-SRCS = delphix.run openindiana.run omnios.run
+SRCS = delphix.run \
+	openindiana.run \
+	omnios.run \
+	perf-regression.run

 ROOTOPTPKG = $(ROOT)/opt/zfs-tests
 RUNFILES = $(ROOTOPTPKG)/runfiles
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index 796a2ebfbe..3b42bc01ef 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -443,7 +443,8 @@ tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos',
     'quota_004_pos', 'quota_005_pos', 'quota_006_neg']

 [/opt/zfs-tests/tests/functional/redundancy]
-tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos']
+tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos',
+    'redundancy_004_neg']

 [/opt/zfs-tests/tests/functional/refquota]
 tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
diff --git a/usr/src/test/zfs-tests/runfiles/perf-regression.run b/usr/src/test/zfs-tests/runfiles/perf-regression.run
new file mode 100644
index 0000000000..0095931ad5
--- /dev/null
+++ b/usr/src/test/zfs-tests/runfiles/perf-regression.run
@@ -0,0 +1,30 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+[DEFAULT]
+pre = setup
+quiet = False
+pre_user = root
+user = root
+timeout = 0
+post_user = root
+post = cleanup
+outputdir = /var/tmp/test_results
+
+[/opt/zfs-tests/tests/perf/regression]
+tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_cached',
+    'sequential_reads_cached_clone', 'random_reads', 'random_writes',
+    'random_readwrite']
+post =
diff --git a/usr/src/test/zfs-tests/tests/Makefile b/usr/src/test/zfs-tests/tests/Makefile
index 5502e74dfd..0f569c9400 100644
--- a/usr/src/test/zfs-tests/tests/Makefile
+++ b/usr/src/test/zfs-tests/tests/Makefile
@@ -10,11 +10,11 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 .PARALLEL: $(SUBDIRS)

-SUBDIRS = functional stress
+SUBDIRS = functional perf

 include $(SRC)/test/Makefile.com
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_001_pos.ksh
index abd3d0a7f5..8b0f98305b 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_001_pos.ksh
@@ -27,6 +27,10 @@
 # Copyright 2015 Nexenta Systems, Inc.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -62,7 +66,7 @@ log_onexit cleanup

 typeset VDEV=/bootfs_001_pos_a.$$.dat

-log_must $MKFILE 400m $VDEV
+log_must $MKFILE $MINVDEVSIZE $VDEV
 create_pool "$TESTPOOL" "$VDEV"
 log_must $ZFS create $TESTPOOL/$TESTFS
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh
index a894778773..2ee502a4b7 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh
@@ -27,6 +27,10 @@
 # Copyright 2015 Nexenta Systems, Inc.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_003_pos.ksh
index b16258901f..a488a35e02 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_003_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_003_pos.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -59,7 +63,7 @@ fi
 log_onexit cleanup
 log_assert "Valid pool names are accepted by zpool set bootfs"

-$MKFILE 64m /bootfs_003.$$.dat
+$MKFILE $MINVDEVSIZE /bootfs_003.$$.dat

 typeset -i i=0;
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_004_neg.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_004_neg.ksh
index b9166db82f..5c57545938 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_004_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_004_neg.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -74,7 +78,7 @@
 pools[${#pools[@]}]="$bigname"

-$MKFILE 64m /bootfs_004.$$.dat
+$MKFILE $MINVDEVSIZE /bootfs_004.$$.dat

 typeset -i i=0;
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_005_neg.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_005_neg.ksh
index fb7f084312..e555be23a3 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_005_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_005_neg.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh
index 2af3e7b0cf..45bf94d270 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -92,7 +96,7 @@ log_assert "Pools of correct vdev types accept boot property"

 log_onexit cleanup

-log_must $MKFILE 64m $VDEV1 $VDEV2 $VDEV3 $VDEV4
+log_must $MKFILE $MINVDEVSIZE $VDEV1 $VDEV2 $VDEV3 $VDEV4

 ## the following configurations are supported bootable pools
diff --git a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh
index 8951718abe..0bdefe6d10 100644
--- a/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -60,7 +64,7 @@ typeset COMP_FS=$TESTPOOL/COMP_FS
 log_onexit cleanup
 log_assert $assert_msg

-log_must $MKFILE 300m $VDEV
+log_must $MKFILE $MINVDEVSIZE $VDEV
 log_must $ZPOOL create $TESTPOOL $VDEV
 log_must $ZFS create $COMP_FS
diff --git a/usr/src/test/zfs-tests/tests/functional/cache/cache.cfg b/usr/src/test/zfs-tests/tests/functional/cache/cache.cfg
index 34d58483de..db00d94dff 100644
--- a/usr/src/test/zfs-tests/tests/functional/cache/cache.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/cache/cache.cfg
@@ -25,7 +25,7 @@
 #

 #
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -59,7 +59,7 @@ function set_disks

 set_disks

-export SIZE=64M
+export SIZE=$MINVDEVSIZE
 export VDIR=/disk.cache
 export VDIR2=/disk2.cache
diff --git a/usr/src/test/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh
index 98adfd08db..ea12cf2b6e 100644
--- a/usr/src/test/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -84,7 +84,7 @@ log_must $ZPOOL create $TESTPOOL $DISKS
 mntpnt=$(get_prop mountpoint $TESTPOOL)
 typeset -i i=0
 while ((i < 2)); do
-	log_must $MKFILE 64M $mntpnt/vdev$i
+	log_must $MKFILE $MINVDEVSIZE $mntpnt/vdev$i
 	eval vdev$i=$mntpnt/vdev$i
 	((i += 1))
 done
diff --git a/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib b/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib
index 3fd6c02aa1..91a6d132d2 100644
--- a/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib
@@ -25,7 +25,7 @@
 #

 #
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/tests/functional/clean_mirror/default.cfg
@@ -36,6 +36,32 @@
 # the contents of the mirror.
 # This code is sourced into each of these test cases.

+#
+# Synchronize all the data in pool
+#
+# $1 pool name
+#
+function sync_pool #pool
+{
+	typeset pool=$1
+
+	log_must $SYNC
+	log_must $SLEEP 2
+	# Flush all the pool data.
+	typeset -i ret
+	$ZPOOL scrub $pool >/dev/null 2>&1
+	ret=$?
+	(( $ret != 0 )) && \
+		log_fail "$ZPOOL scrub $pool failed."
+
+	while ! is_pool_scrubbed $pool; do
+		if is_pool_resilvered $pool ; then
+			log_fail "$pool should not be resilver completed."
+		fi
+		log_must $SLEEP 2
+	done
+}
+
 function overwrite_verify_mirror
 {
 	typeset AFFECTED_DEVICE=$1
@@ -60,6 +86,12 @@ function overwrite_verify_mirror

 	atfile=0

+	#
+	# Flush out the cache so that we ensure we're reading from disk.
+	#
+	log_must $ZPOOL export $TESTPOOL
+	log_must $ZPOOL import $TESTPOOL
+
 	typeset -i failedcount=0
 	while (( atfile < FILE_COUNT )); do
 		files[$atfile]=$TESTDIR/file.$atfile
@@ -75,4 +107,6 @@ function overwrite_verify_mirror
 		log_fail "of the $FILE_COUNT files $failedcount did not " \
 		    "have the same checksum before and after."
 	fi
+
+	sync_pool $TESTPOOL
 }
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh
index 579e747841..c59159f0cd 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh
@@ -11,7 +11,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 # DESCRIPTION
@@ -139,7 +139,7 @@ log_must snapexists $TESTPOOL/$TESTFS1@snap3

 log_note "zfs destroy for snapshots from different pools"
 VIRTUAL_DISK=/var/tmp/disk
-log_must $DD if=/dev/urandom of=$VIRTUAL_DISK bs=1M count=64
+log_must mkfile $MINVDEVSIZE $VIRTUAL_DISK

 log_must $ZPOOL create $TESTPOOL2 $VIRTUAL_DISK
 log_must poolexists $TESTPOOL2
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh
index 1dce6e9cc1..566da2272a 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -107,9 +107,9 @@ allds="$fs $clone $fssnap $volsnap"

 #create pool and datasets to guarantee testing under multiple pools and datasets.
 file=$TESTDIR1/poolfile
-typeset -i FILESIZE=104857600	#100M
-(( DFILESIZE = FILESIZE * 2 ))	# double of FILESIZE
-typeset -i VOLSIZE=10485760	#10M
+typeset FILESIZE=$MINVDEVSIZE
+(( DFILESIZE = $FILESIZE * 2 ))
+typeset -i VOLSIZE=10485760
 availspace=$(get_prop available $TESTPOOL)
 typeset -i i=0
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh
index 6010af18ae..b3f1b39671 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh
@@ -24,6 +24,11 @@
 # Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
+
+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib
@@ -63,8 +68,7 @@ log_assert "'zfs rename' should fail while datasets are within different pool."

 additional_setup

-typeset FILESIZE=64m
-log_must $MKFILE $FILESIZE $TESTDIR/$TESTFILE1
+log_must $MKFILE $MINVDEVSIZE $TESTDIR/$TESTFILE1
 create_pool $TESTPOOL1 $TESTDIR/$TESTFILE1

 for src in ${src_dataset[@]} ; do
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh
index 153a0166c7..20fbdd863b 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh
@@ -21,7 +21,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/tests/functional/cli_root/zfs_snapshot/zfs_snapshot.cfg
@@ -57,8 +57,8 @@ function cleanup
 log_assert "'zfs snapshot pool1@snap1 pool2@snap2' should fail since snapshots are in different pools."
 log_onexit cleanup

-log_must $MKFILE 64m $SNAPDEV1
-log_must $MKFILE 64m $SNAPDEV2
+log_must $MKFILE $MINVDEVSIZE $SNAPDEV1
+log_must $MKFILE $MINVDEVSIZE $SNAPDEV2

 log_must $ZPOOL create $SNAPPOOL1 $SNAPDEV1
 log_must $ZPOOL create $SNAPPOOL2 $SNAPDEV2
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh
index e8ed789b5a..f3fc2ecb7b 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh
@@ -24,6 +24,11 @@
 # Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
+
+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -65,7 +70,7 @@ vdev1=$TESTDIR/file1
 vdev2=$TESTDIR/file2
 vdev3=$TESTDIR/file3
 for vdev in $vdev1 $vdev2 $vdev3; do
-	$MKFILE 64m $vdev
+	$MKFILE $MINVDEVSIZE $vdev
 done

 set -A cmds "create $pool mirror $vdev1 $vdev2" "list $pool" "iostat $pool" \
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg
index 05da4cb795..d152c3e3f8 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg
@@ -25,18 +25,13 @@
 #

 #
-# Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 export DISK_ARRAY_NUM=0
 export DISK_ARRAY_LIMIT=4
 export DISKSARRAY=""

-#
-# Variables for zpool_add_006
-#
-export VDEVS_NUM=32
-
 function set_disks
 {
 	set -A disk_array $(find_disks $DISKS)
@@ -66,10 +61,7 @@ function set_disks

 set_disks

-export FILESIZE="100m"
-export FILESIZE1="150m"
-export SIZE="150m"
-export SIZE1="250m"
+export SIZE="$(((MINVDEVSIZE / (1024 * 1024)) * 2))m"
 export SLICE0=0
 export SLICE1=1
 export SLICE3=3
@@ -77,4 +69,4 @@ export SLICE4=4
 export SLICE5=5
 export SLICE6=6

-export VOLSIZE=64mb
+export VOLSIZE=$MINVDEVSIZE
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh
index c60814d40d..ad2aa685b5 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2014 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -38,7 +38,7 @@
 #
 # STRATEGY:
 # 1. Create a file based pool.
-# 2. Add 32 file based vdevs to it.
+# 2. Add 16 file based vdevs to it.
 # 3. Attempt to add a file based vdev that's too small; verify failure.
 #
@@ -61,11 +61,11 @@ log_onexit cleanup
 create_pool $TESTPOOL ${DISKS%% *}
 log_must $ZFS create -o mountpoint=$TESTDIR $TESTPOOL/$TESTFS

-log_must $MKFILE 64m $TESTDIR/file.00
+log_must $MKFILE $MINVDEVSIZE $TESTDIR/file.00
 create_pool "$TESTPOOL1" "$TESTDIR/file.00"

-vdevs_list=$($ECHO $TESTDIR/file.{01..32})
-log_must $MKFILE 64m $vdevs_list
+vdevs_list=$($ECHO $TESTDIR/file.{01..16})
+log_must $MKFILE $MINVDEVSIZE $vdevs_list

 log_must $ZPOOL add -f "$TESTPOOL1" $vdevs_list
 log_must vdevs_in_pool "$TESTPOOL1" "$vdevs_list"
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear.cfg
index 097a43b0ed..e6977350af 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear.cfg
@@ -25,9 +25,9 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

-export FILESIZE=100m
+export FILESIZE=$MINVDEVSIZE
 export BLOCKSZ=$(( 1024 * 1024 ))
 export NUM_WRITES=40
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh
index 7c7e489541..06420b362a 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -45,12 +45,12 @@ if [[ -n $DISK ]]; then
 	#
 	cleanup_devices $DISK

-	partition_disk $SIZE $DISK 7
+	partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 3))m $DISK 7
 else
 	for disk in `$ECHO $DISKSARRAY`; do
 		cleanup_devices $disk

-		partition_disk $SIZE $disk 7
+		partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 3))m $disk 7
 	done
 fi
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg
index 3798b929f1..e4c1e1a806 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg
@@ -25,7 +25,7 @@
 #

 #
-# Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -33,7 +33,6 @@
 export DISK_ARRAY_NUM=0
 export DISK_ARRAY_LIMIT=4
 export DISKSARRAY=""
-export VDEVS_NUM=32

 function set_disks
 {
@@ -57,10 +56,10 @@ function set_disks

 set_disks

-export FILESIZE="100m"
-export FILESIZE1="150m"
-export SIZE="200m"
-export SIZE1="250m"
+export FILESIZE="$MINVDEVSIZE"
+export FILESIZE1="$(($MINVDEVSIZE * 2))"
+export SIZE="$((MINVDEVSIZE / (1024 * 1024)))"m
+export SIZE1="$(($MINVDEVSIZE * 2 / (1024 * 1024)))m"
 export SLICE0=0
 export SLICE1=1
 export SLICE2=2
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh
index cd4fd38be5..a7cd67a979 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -52,11 +52,12 @@ function cleanup
 	clean_blockfile "$TESTDIR0 $TESTDIR1"

 	if [[ -n $DISK ]]; then
-		partition_disk $SIZE $DISK 7
+		partition_disk $((($MINVDEVSIZE / (1024 * 1024)) * 3))m $DISK 7
 	else
 		typeset disk=""
 		for disk in $DISK0 $DISK1; do
-			partition_disk $SIZE $disk 7
+			partition_disk \
+			    $((($MINVDEVSIZE / (1024 * 1024)) * 3))m $disk 7
 		done
 	fi
 }
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh
index ee604fce13..1ba6408222 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib
@@ -54,15 +54,15 @@ function cleanup
 	partition_disk $SIZE $disk 6
 }

-log_assert "Storage pools with $VDEVS_NUM file based vdevs can be created."
+log_assert "Storage pools with 16 file based vdevs can be created."
 log_onexit cleanup

 disk=${DISKS%% *}
 create_pool $TESTPOOL $disk
 log_must $ZFS create -o mountpoint=$TESTDIR $TESTPOOL/$TESTFS

-vdevs_list=$($ECHO $TESTDIR/file.{01..32})
-log_must $MKFILE 64m $vdevs_list
+vdevs_list=$($ECHO $TESTDIR/file.{01..16})
+log_must $MKFILE $MINVDEVSIZE $vdevs_list

 create_pool "$TESTPOOL1" $vdevs_list
 log_must vdevs_in_pool "$TESTPOOL1" "$vdevs_list"
@@ -74,7 +74,7 @@ else
 fi

 log_must $MKFILE 32m $TESTDIR/broken_file
-vdevs_list="$vdevs_list ${TESTDIR}/broken_file"
+vdevs_list="$vdevs_list $TESTDIR/broken_file"
 log_mustnot $ZPOOL create -f $TESTPOOL1 $vdevs_list

 log_pass "Storage pools with many file based vdevs can be created."
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh
index 70dcadc91c..6b3a183f5f 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -33,7 +37,7 @@
 #
 # STRATEGY:
 # 1. Create base filesystem to hold virtual disk files.
-# 2. Create several files >= 64M.
+# 2. Create several files == $MINVDEVSIZE.
 # 3. Verify 'zpool create' succeed with valid keywords combination.
 #
@@ -54,7 +58,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL)
 typeset -i i=0

 while ((i < 10)); do
-	log_must $MKFILE 64M $mntpnt/vdev$i
+	log_must $MKFILE $MINVDEVSIZE $mntpnt/vdev$i
 	eval vdev$i=$mntpnt/vdev$i

 	((i += 1))
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh
index 7b555de9bd..694397ff14 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -34,7 +34,7 @@
 #
 # DESCRIPTION:
-# 'zpool create' should return an error with VDEVs of size <64mb
+# 'zpool create' should return an error with VDEVs of size SPA_MINDEVSIZE
 #
 # STRATEGY:
 # 1. Create an array of parameters
@@ -42,7 +42,7 @@
 # 3. Verify an error is returned.
 #

-log_assert "'zpool create' should return an error with VDEVs <64mb"
+log_assert "'zpool create' should return an error with VDEVs SPA_MINDEVSIZE"

 verify_runnable "global"

@@ -69,9 +69,10 @@ create_pool $TESTPOOL $disk
 log_must $ZFS create $TESTPOOL/$TESTFS
 log_must $ZFS set mountpoint=$TESTDIR $TESTPOOL/$TESTFS

+typeset -l devsize=$(($SPA_MINDEVSIZE - 1024 * 1024))
 for files in $TESTDIR/file1 $TESTDIR/file2
 do
-	log_must $MKFILE 63m $files
+	log_must $MKFILE $devsize $files
 done

 set -A args \
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
index e481682f45..2ce6508174 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -103,7 +103,7 @@ for type in " " mirror raidz raidz2; do
 			    "expanded size: $expand_size"

 			# compare available pool size from zfs
-			if [[ $zfs_expand_size > $zfs_prev_size ]]; then
+			if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then
 				# check for zpool history for the pool size expansion
 				if [[ $type == " " ]]; then
 					typeset size_addition=$($ZPOOL history -il $TESTPOOL1 \
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh
index 91f2968817..14b9f21982 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -69,7 +73,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL)

 typeset -i i=0
 while ((i < 5)); do
-	log_must $MKFILE 64M $mntpnt/vdev$i
+	log_must $MKFILE $MINVDEVSIZE $mntpnt/vdev$i
 	eval vdev$i=$mntpnt/vdev$i
 	((i += 1))
 done
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
index baa67d05d1..08cb5f63e8 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
@@ -25,7 +25,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -59,8 +59,8 @@ esac
 export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2 ZFSSIDE_DISK2

 export FS_SIZE=2gb
-export FILE_SIZE=64m
-export SLICE_SIZE=128m
+export FILE_SIZE=$MINVDEVSIZE
+export SLICE_SIZE="$((MINVDEVSIZE / (1024 * 1024)))m"
 export MAX_NUM=5
 export GROUP_NUM=3
 export DEVICE_DIR=/dev_import-test
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_002_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_002_neg.ksh
index 4d2b4bed9c..943eb73a96 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_002_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_002_neg.ksh
@@ -25,6 +25,10 @@
 # Use is subject to license terms.
 #

+#
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+#
+
 . $STF_SUITE/include/libtest.shlib

 #
@@ -99,7 +103,7 @@ arguments[${#arguments[@]}]="bootfs=$bigname"
 # Create a pool called bootfs (so-called, so as to trip any clashes between
 # property name, and pool name)
 # Also create a filesystem in this pool
-log_must $MKFILE 64m /tmp/zpool_set_002.$$.dat
+log_must $MKFILE $MINVDEVSIZE /tmp/zpool_set_002.$$.dat

 log_must $ZPOOL create bootfs /tmp/zpool_set_002.$$.dat
 log_must $ZFS create bootfs/root
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_003_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_003_neg.ksh
index 02e20ffaef..dc5a284578 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_003_neg.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_003_neg.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -56,7 +56,7 @@ log_onexit cleanup

 log_assert "zpool set cannot set a readonly property"

-log_must $MKFILE 64m /tmp/zpool_set_003.$$.dat
+log_must $MKFILE $MINVDEVSIZE /tmp/zpool_set_003.$$.dat

 log_must $ZPOOL create $TESTPOOL /tmp/zpool_set_003.$$.dat

 typeset -i i=0;
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_user/misc/setup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_user/misc/setup.ksh
index c491a9381f..966421015d 100644
--- a/usr/src/test/zfs-tests/tests/functional/cli_user/misc/setup.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_user/misc/setup.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/tests/functional/cli_user/misc/misc.cfg
@@ -116,14 +116,14 @@
 then
 	# Now create several virtual disks to test zpool with

-	$MKFILE 100m /$TESTDIR/disk1.dat
-	$MKFILE 100m /$TESTDIR/disk2.dat
-	$MKFILE 100m /$TESTDIR/disk3.dat
-	$MKFILE 100m /$TESTDIR/disk-additional.dat
-	$MKFILE 100m /$TESTDIR/disk-export.dat
-	$MKFILE 100m /$TESTDIR/disk-offline.dat
-	$MKFILE 100m /$TESTDIR/disk-spare1.dat
-	$MKFILE 100m /$TESTDIR/disk-spare2.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk1.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk2.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk3.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk-additional.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk-export.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk-offline.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk-spare1.dat
+	$MKFILE $MINVDEVSIZE /$TESTDIR/disk-spare2.dat

 	# and create a pool we can perform attach remove replace,
 	# etc. operations with
diff --git a/usr/src/test/zfs-tests/tests/functional/history/history_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/history/history_001_pos.ksh
index e5a5c8327b..b65fa99d3c 100644
--- a/usr/src/test/zfs-tests/tests/functional/history/history_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/history/history_001_pos.ksh
@@ -25,7 +25,7 @@
 # Use is subject to license terms.
 #

-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/tests/functional/history/history_common.kshlib
@@ -63,8 +63,8 @@ mntpnt=$(get_prop mountpoint $TESTPOOL)
 VDEV1=$mntpnt/vdev1;	VDEV2=$mntpnt/vdev2;
 VDEV3=$mntpnt/vdev3;	VDEV4=$mntpnt/vdev4;

-log_must $MKFILE 64m $VDEV1 $VDEV2 $VDEV3
-log_must $MKFILE 100m $VDEV4
+log_must $MKFILE $MINVDEVSIZE $VDEV1 $VDEV2 $VDEV3
+log_must $MKFILE $(($MINVDEVSIZE * 2)) $VDEV4

 run_and_verify -p "$MPOOL" "$ZPOOL create $MPOOL mirror $VDEV1 $VDEV2"
 run_and_verify -p "$MPOOL" "$ZPOOL add -f $MPOOL spare $VDEV3"
diff --git a/usr/src/test/zfs-tests/tests/functional/history/history_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/history/history_003_pos.ksh
index 224ee159e0..01bba0c966 100644
--- a/usr/src/test/zfs-tests/tests/functional/history/history_003_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/history/history_003_pos.ksh
@@ -26,7 +26,7 @@
 #

 #
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 #

 . $STF_SUITE/include/libtest.shlib
@@ -36,11 +36,10 @@
 # zpool history will truncate on small pools, leaving pool creation intact
 #
 # STRATEGY:
-# 1. Create two 100M virtual disk files.
-# 2. Create test pool using the two virtual files.
-# 3. Loop 100 times to set and remove compression to test dataset.
-# 4.
Make sure 'zpool history' output is truncated -# 5. Verify that the initial pool creation is preserved. +# 1. Create a test pool on a file. +# 2. Loop 300 times to set and remove compression to test dataset. +# 3. Make sure 'zpool history' output is truncated +# 4. Verify that the initial pool creation is preserved. # verify_runnable "global" @@ -49,7 +48,6 @@ function cleanup { datasetexists $spool && log_must $ZPOOL destroy $spool [[ -f $VDEV0 ]] && log_must $RM -f $VDEV0 - [[ -f $VDEV1 ]] && log_must $RM -f $VDEV1 [[ -f $TMPFILE ]] && log_must $RM -f $TMPFILE } @@ -59,11 +57,11 @@ log_onexit cleanup mntpnt=$(get_prop mountpoint $TESTPOOL) (( $? != 0 )) && log_fail "get_prop mountpoint $TESTPOOL" -VDEV0=$mntpnt/vdev0; VDEV1=$mntpnt/vdev1 -log_must $MKFILE 100m $VDEV0 $VDEV1 +VDEV0=$mntpnt/vdev0 +log_must $MKFILE $MINVDEVSIZE $VDEV0 spool=smallpool.$$; sfs=smallfs.$$ -log_must $ZPOOL create $spool $VDEV0 $VDEV1 +log_must $ZPOOL create $spool $VDEV0 log_must $ZFS create $spool/$sfs typeset -i orig_count=$($ZPOOL history $spool | $WC -l) @@ -71,7 +69,7 @@ typeset orig_md5=$($ZPOOL history $spool | $HEAD -2 | $MD5SUM | \ $AWK '{print $1}') typeset -i i=0 -while ((i < 100)); do +while ((i < 300)); do $ZFS set compression=off $spool/$sfs $ZFS set compression=on $spool/$sfs $ZFS set compression=off $spool/$sfs diff --git a/usr/src/test/zfs-tests/tests/functional/online_offline/online_offline_003_neg.ksh b/usr/src/test/zfs-tests/tests/functional/online_offline/online_offline_003_neg.ksh index 95d423c73e..51eddbb815 100644 --- a/usr/src/test/zfs-tests/tests/functional/online_offline/online_offline_003_neg.ksh +++ b/usr/src/test/zfs-tests/tests/functional/online_offline/online_offline_003_neg.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -58,7 +58,7 @@ log_onexit cleanup specials_list="" for i in 0 1 2; do - $MKFILE 64m $TESTDIR/$TESTFILE1.$i + $MKFILE $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" done disk=($specials_list) diff --git a/usr/src/test/zfs-tests/tests/functional/poolversion/setup.ksh b/usr/src/test/zfs-tests/tests/functional/poolversion/setup.ksh index ce4f45bb72..0bb0caf2d9 100644 --- a/usr/src/test/zfs-tests/tests/functional/poolversion/setup.ksh +++ b/usr/src/test/zfs-tests/tests/functional/poolversion/setup.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -34,12 +34,12 @@ verify_runnable "global" # create a version 1 pool -log_must $MKFILE 64m /tmp/zpool_version_1.dat +log_must $MKFILE $MINVDEVSIZE /tmp/zpool_version_1.dat log_must $ZPOOL create -o version=1 $TESTPOOL /tmp/zpool_version_1.dat # create another version 1 pool -log_must $MKFILE 64m /tmp/zpool2_version_1.dat +log_must $MKFILE $MINVDEVSIZE /tmp/zpool2_version_1.dat log_must $ZPOOL create -o version=1 $TESTPOOL2 /tmp/zpool2_version_1.dat log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.cfg b/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.cfg index 079ff8b730..f49b7e76ae 100644 --- a/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.cfg +++ b/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.cfg @@ -25,7 +25,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
# export BASEDIR=/var/tmp/basedir.$$ @@ -34,7 +34,5 @@ export TESTFILE=testfile.$$ export PRE_RECORD_FILE=$BASEDIR/pre-record-file.$$ export PST_RECORD_FILE=$BASEDIR/pst-record-file.$$ -export DEV_SIZE=64M - export BLOCKSZ=$(( 1024 * 1024 )) export NUM_WRITES=40 diff --git a/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.kshlib index cb8271797b..3428eb9eca 100644 --- a/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/usr/src/test/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -25,7 +25,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -119,7 +119,7 @@ function setup_test_env destroy_pool $pool fi - log_must $MKFILE $DEV_SIZE $vdevs + log_must $MKFILE $MINVDEVSIZE $vdevs log_must $ZPOOL create -m $TESTDIR $pool $keyword $vdevs @@ -252,7 +252,7 @@ function replace_missing_devs typeset vdev for vdev in $@; do - log_must $MKFILE $DEV_SIZE $vdev + log_must $MKFILE $MINVDEVSIZE $vdev log_must $ZPOOL replace -f $pool $vdev $vdev while true; do if ! is_pool_resilvered $pool ; then @@ -278,19 +278,20 @@ function damage_devs typeset -i cnt=$2 typeset label="$3" typeset vdevs - typeset -i bs_count + typeset -i bs_count=$((64 * 1024)) vdevs=$(get_vdevs $pool $cnt) + typeset dev if [[ -n $label ]]; then - typeset dev for dev in $vdevs; do - bs_count=$($LS -l $dev | $AWK '{print $5}') - (( bs_count = bs_count/1024 - 512 )) $DD if=/dev/zero of=$dev seek=512 bs=1024 \ - count=$bs_count conv=notrunc >/dev/null 2>&1 + count=$bs_count conv=notrunc >/dev/null 2>&1 done else - log_must $MKFILE $DEV_SIZE $vdevs + for dev in $vdevs; do + $DD if=/dev/zero of=$dev bs=1024 count=$bs_count \ + conv=notrunc >/dev/null 2>&1 + done fi sync_pool $pool diff --git a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh index f874973f03..11b6eb3c47 100644 --- a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -128,7 +128,7 @@ function replace_test specials_list="" i=0 while [[ $i != 2 ]]; do - $MKFILE 100m $TESTDIR/$TESTFILE1.$i + $MKFILE $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -137,7 +137,7 @@ done # # Create a replacement disk special file. # -$MKFILE 100m $TESTDIR/$REPLACEFILE +$MKFILE $MINVDEVSIZE $TESTDIR/$REPLACEFILE for type in "" "raidz" "raidz1" "mirror"; do for op in "" "-f"; do diff --git a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh index 634b677ae3..19bbd3f129 100644 --- a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -128,7 +128,7 @@ function attach_test specials_list="" i=0 while [[ $i != 2 ]]; do - $MKFILE 100m $TESTDIR/$TESTFILE1.$i + $MKFILE $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -137,7 +137,7 @@ done # # Create a replacement disk special file. # -$MKFILE 100m $TESTDIR/$REPLACEFILE +$MKFILE $MINVDEVSIZE $TESTDIR/$REPLACEFILE for op in "" "-f"; do create_pool $TESTPOOL1 mirror $specials_list diff --git a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh index 80d1dd1226..04fe2685a8 100644 --- a/usr/src/test/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -125,7 +125,7 @@ function detach_test specials_list="" i=0 while [[ $i != 2 ]]; do - $MKFILE 100m $TESTDIR/$TESTFILE1.$i + $MKFILE $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_009_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_009_pos.ksh index 0e6fcf1323..8f201fde7f 100644 --- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_009_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_009_pos.ksh @@ -58,10 +58,10 @@ function cleanup log_assert "Verify zfs receive can handle out of space correctly." log_onexit cleanup -log_must $MKFILE 100M $TESTDIR/bfile -log_must $MKFILE 64M $TESTDIR/sfile +log_must $MKFILE $MINVDEVSIZE $TESTDIR/bfile +log_must $MKFILE $SPA_MINDEVSIZE $TESTDIR/sfile log_must $ZPOOL create bpool $TESTDIR/bfile log_must $ZPOOL create spool $TESTDIR/sfile # # Test out of space on sub-filesystem diff --git a/usr/src/test/zfs-tests/tests/functional/slog/setup.ksh b/usr/src/test/zfs-tests/tests/functional/slog/setup.ksh index a5dc70fff5..c945344932 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/setup.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/setup.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -45,6 +45,6 @@ if [[ -d $VDEV2 ]]; then log_must $RM -rf $VDIR2 fi log_must $MKDIR -p $VDIR $VDIR2 -log_must $MKFILE $SIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 +log_must $MKFILE $MINVDEVSIZE $VDEV $SDEV $LDEV $VDEV2 $SDEV2 $LDEV2 log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog.cfg b/usr/src/test/zfs-tests/tests/functional/slog/slog.cfg index 28cce1aae3..6a28880ade 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog.cfg +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog.cfg @@ -25,11 +25,9 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. 
# -export SIZE=64M - export VDIR=/disk-slog export VDIR2=/disk2-slog diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_012_neg.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_012_neg.ksh index 86047255c1..b982ffa112 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_012_neg.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_012_neg.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/slog/slog.kshlib @@ -60,7 +60,7 @@ do log_must $DD if=/dev/random of=$mntpnt/testfile.$$ count=100 ldev=$(random_get $LDEV) - log_must $MKFILE $SIZE $ldev + log_must $MKFILE $MINVDEVSIZE $ldev log_must $ZPOOL scrub $TESTPOOL log_must display_status $TESTPOOL diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh index 8a17bd0464..18970752e2 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh @@ -26,7 +26,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/slog/slog.kshlib @@ -81,14 +81,12 @@ log_must verify_slog_device $TESTPOOL $lofidev 'ONLINE' log_pass "Verify slog device can be disk, file, lofi device or any device " \ "that presents a block interface." -# Temp disable fore bug 6569095 # Add file which reside in the itself mntpnt=$(get_prop mountpoint $TESTPOOL) -log_must $MKFILE 100M $mntpnt/vdev +log_must $MKFILE $MINVDEVSIZE $mntpnt/vdev log_must $ZPOOL add $TESTPOOL $mntpnt/vdev -# Temp disable fore bug 6569072 # Add ZFS volume vol=$TESTPOOL/vol -log_must $ZFS create -V 64M $vol +log_must $ZFS create -V $MINVDEVSIZE $vol log_must $ZPOOL add $TESTPOOL /dev/zvol/dsk/$vol diff --git a/usr/src/test/zfs-tests/tests/perf/Makefile b/usr/src/test/zfs-tests/tests/perf/Makefile new file mode 100644 index 0000000000..7886eabad6 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/Makefile @@ -0,0 +1,44 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +.PARALLEL: $(SUBDIRS) + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf + +PROGS = perf.shlib + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) + +SUBDIRS = fio \ + regression \ + scripts + +include $(SRC)/test/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/perf/fio/Makefile b/usr/src/test/zfs-tests/tests/perf/fio/Makefile new file mode 100644 index 0000000000..012e286de1 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/fio + +FILES = mkfiles.fio \ + random_reads.fio \ + random_readwrite.fio \ + random_writes.fio \ + sequential_reads.fio \ + sequential_writes.fio + +CMDS = $(FILES:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0444 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio new file mode 100644 index 0000000000..f876bd63d3 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio @@ -0,0 +1,30 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +ioengine=psync +bs=1024k +rw=write +thread=1 +directory=/${TESTFS} +numjobs=${NUMJOBS} +filesize=${FILE_SIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio new file mode 100644 index 0000000000..25dd2ff838 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio @@ -0,0 +1,31 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=randread +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio new file mode 100644 index 0000000000..0b750260ff --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio @@ -0,0 +1,35 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +[global] +filename_format=file$jobnum +nrfiles=16 +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=randrw +rwmixread=80 +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bssplit=4k/50:8k/30:128k/10:1m/10 +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio new file mode 100644 index 0000000000..b1860a71dd --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +thread=1 +rw=randwrite +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +filesize=${FILESIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio new file mode 100644 index 0000000000..b7d9fea5f3 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio @@ -0,0 +1,31 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=read +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio new file mode 100644 index 0000000000..df1590cf11 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +thread=1 +rw=write +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +filesize=${FILESIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/perf.shlib b/usr/src/test/zfs-tests/tests/perf/perf.shlib new file mode 100644 index 0000000000..2b4d043349 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/perf.shlib @@ -0,0 +1,240 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# If neither is specified, do a nightly run. +[[ -z $PERF_REGRESSION_WEEKLY ]] && export PERF_REGRESSION_NIGHTLY=1 + +# Default runtime for each type of test run. +export PERF_RUNTIME_WEEKLY=$((30 * 60)) +export PERF_RUNTIME_NIGHTLY=$((10 * 60)) + +# Default fs creation options +export PERF_FS_OPTS=${PERF_FS_OPTS:-'-o recsize=8k -o compress=lz4' \ + ' -o checksum=sha256 -o redundant_metadata=most'} + +function get_sync_str +{ + typeset sync=$1 + typeset sync_str='' + + [[ $sync -eq 0 ]] && sync_str='async' + [[ $sync -eq 1 ]] && sync_str='sync' + echo $sync_str +} + +# +# This function will run fio in a loop, according to the .fio file passed +# in and a number of environment variables. The following variables can be +# set before launching zfstest to override the defaults. +# +# PERF_RUNTIME: The time in seconds each fio invocation should run. +# PERF_RUNTYPE: A human readable tag that appears in logs. The defaults are +# nightly and weekly. +# PERF_NTHREADS: A list of how many threads each fio invocation will use. +# PERF_SYNC_TYPES: Whether to use (O_SYNC) or not. 1 is sync IO, 0 is async IO. +# PERF_IOSIZES: A list of blocksizes in which each fio invocation will do IO. +# PERF_COLLECT_SCRIPTS: A comma delimited list of 'command args, logfile_tag' +# pairs that will be added to the scripts specified in each test. +# +function do_fio_run +{ + typeset script=$1 + typeset do_recreate=$2 + typeset clear_cache=$3 + typeset threads sync iosize + + for threads in $PERF_NTHREADS; do + for sync in $PERF_SYNC_TYPES; do + for iosize in $PERF_IOSIZES; do + log_note "Running with $threads" \ + "$(get_sync_str $sync) threads, $iosize ios" + + if $do_recreate; then + recreate_perfpool + log_must $ZFS create $PERF_FS_OPTS \ + $TESTFS + fi + + if $clear_cache; then + # Clear the ARC + $ZPOOL export $PERFPOOL + $ZPOOL import $PERFPOOL + fi + + export RUNTIME=$PERF_RUNTIME + export FILESIZE=$((TOTAL_SIZE / threads)) + export NUMJOBS=$threads + export SYNC_TYPE=$sync + export BLOCKSIZE=$iosize + $SYNC + + # Start the data collection + do_collect_scripts $threads $sync $iosize + + # Start the load + log_must $FIO $FIO_SCRIPTS/$script + done + done + done +} + +# +# This function iterates through the value pairs in $PERF_COLLECT_SCRIPTS. +# The script at index N is launched in the background, with its output +# redirected to a logfile containing the tag specified at index N + 1. 
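As a concrete illustration of that pairing, a caller can export an extra collector before launching a test; even indices become commands, odd indices become logfile tags. A sketch (the kstat collector and the "arcstats" tag are invented for this example; kstat(1M) is assumed present, as on illumos):

	#!/usr/bin/ksh
	# Add an ARC-stats collector alongside the defaults. do_collect_scripts
	# runs the even entry under $TIMEOUT and names its logfile after the
	# odd entry.
	export PERF_COLLECT_SCRIPTS="kstat -p zfs:0:arcstats 1, arcstats"

	# After parsing, collect_scripts grows by two entries:
	#   collect_scripts[n]     "kstat -p zfs:0:arcstats 1"   (command)
	#   collect_scripts[n+1]   "arcstats"                    (logfile tag)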
+# +function do_collect_scripts +{ + typeset threads=$1 + typeset sync=$2 + typeset iosize=$3 + + [[ -n $collect_scripts ]] || log_fail "No data collection scripts." + [[ -n $PERF_RUNTIME ]] || log_fail "No runtime specified." + + # This will be part of the output filename. + typeset sync_str=$(get_sync_str $sync) + typeset suffix="$sync_str.$iosize-ios.$threads-threads" + + # Add in user supplied scripts and logfiles, if any. + typeset oIFS=$IFS + IFS=',' + for item in $PERF_COLLECT_SCRIPTS; do + collect_scripts+=($($ECHO $item | $SED 's/^ *//g')) + done + IFS=$oIFS + + typeset idx=0 + while [[ $idx -lt "${#collect_scripts[@]}" ]]; do + typeset logbase="$(get_perf_output_dir)/$($BASENAME \ + $SUDO_COMMAND)" + typeset outfile="$logbase.${collect_scripts[$idx + 1]}.$suffix" + + $TIMEOUT $PERF_RUNTIME ${collect_scripts[$idx]} >$outfile 2>&1 & + ((idx += 2)) + done + + # Need to explicitly return 0 because timeout(1) will kill + # a child process and cause us to return non-zero. + return 0 +} + +# Find a place to deposit performance data collected while under load. +function get_perf_output_dir +{ + typeset dir="$(pwd)/perf_data" + [[ -d $dir ]] || $MKDIR -p $dir + + $ECHO $dir +} + +# +# Destroy and create the pool used for performance tests. The +# PERFPOOL_CREATE_CMD variable allows users to test with a custom pool +# configuration by specifying the pool creation command in their environment. +# If PERFPOOL_CREATE_CMD is empty, a pool using all available disks is created. +# +function recreate_perfpool +{ + [[ -n $PERFPOOL ]] || log_fail "The \$PERFPOOL variable isn't set." + + poolexists $PERFPOOL && destroy_pool $PERFPOOL + + if [[ -n $PERFPOOL_CREATE_CMD ]]; then + log_must $PERFPOOL_CREATE_CMD + else + log_must eval "$ZPOOL create -f $PERFPOOL $DISKS" + fi +} + +function get_max_arc_size +{ + typeset -l max_arc_size=$(dtrace -qn 'BEGIN { + printf("%u\n", `arc_stats.arcstat_c_max.value.ui64); + exit(0); + }') + + [[ $? -eq 0 ]] || log_fail "get_max_arc_size failed" + + echo $max_arc_size +} + +# Create a file with some information about how this system is configured. 
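One aside before the config dump function below: get_max_arc_size above shells out to DTrace to read arcstat_c_max, which needs DTrace privileges. The same value is exposed through kstats, so an equivalent sketch (assuming the illumos zfs:0:arcstats kstat) could read:

	#!/usr/bin/ksh
	# kstat -p prints "zfs:0:arcstats:c_max<TAB><value>"; keep field 2.
	typeset -l max_arc_size=$(kstat -p zfs:0:arcstats:c_max | awk '{print $2}')
	[[ -n $max_arc_size ]] || print -u2 "failed to read arcstat c_max"
	print $max_arc_size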
+function get_system_config +{ + typeset config=$PERF_DATA_DIR/$1 + + echo "{" >>$config + $DTRACE -qn 'BEGIN{ + printf(" \"ncpus\": %d,\n", `ncpus); + printf(" \"physmem\": %u,\n", `physmem * `_pagesize); + printf(" \"c_max\": %u,\n", `arc_stats.arcstat_c_max.value.ui64); + printf(" \"kmem_flags\": \"0x%x\",", `kmem_flags); + exit(0)}' >>$config + $ECHO " \"hostname\": \"$($UNAME -n)\"," >>$config + $ECHO " \"kernel version\": \"$($UNAME -v)\"," >>$config + $IOSTAT -En | $AWK 'BEGIN { + printf(" \"disks\": {\n"); first = 1} + /^c/ {disk = $1} + /^Size: [^0]/ {size = $2; + if (first != 1) {printf(",\n")} else {first = 0} + printf(" \"%s\": \"%s\"", disk, size)} + END {printf("\n },\n")}' >>$config + $SED -n 's/^set \(.*\)[ ]=[ ]\(.*\)/\1=\2/p' /etc/system | \ + $AWK -F= 'BEGIN {printf(" \"system\": {\n"); first = 1} + {if (first != 1) {printf(",\n")} else {first = 0}; + printf(" \"%s\": %s", $1, $2)} + END {printf("\n }\n")}' >>$config + echo "}" >>$config +} + +function num_jobs_by_cpu +{ + typeset ncpu=$($PSRINFO | $WC -l) + typeset num_jobs=$ncpu + + [[ $ncpu -gt 8 ]] && num_jobs=$($ECHO "$ncpu * 3 / 4" | $BC) + + $ECHO $num_jobs +} + +function pool_to_lun_list +{ + typeset pool=$1 + typeset ctd ctds devname lun + typeset lun_list=':' + + ctds=$($ZPOOL list -v $pool | $AWK '/c[0-9]*t[0-9a-fA-F]*d[0-9]*/ \ + {print $1}') + + for ctd in $ctds; do + # Get the device name as it appears in /etc/path_to_inst + devname=$($READLINK -f /dev/dsk/${ctd}s0 | $SED -n \ + 's/\/devices\([^:]*\):.*/\1/p') + # Add a string composed of the driver name and instance + # number to the list for comparison with dev_statname. + lun=$($SED 's/"//g' /etc/path_to_inst | $GREP $devname | $AWK \ + '{print $3$2}') + lun_list="$lun_list$lun:" + done + echo $lun_list +} + +# Create a perf_data directory to hold performance statistics and +# configuration information. +export PERF_DATA_DIR=$(get_perf_output_dir) +[[ -f $PERF_DATA_DIR/config.json ]] || get_system_config config.json diff --git a/usr/src/test/zfs-tests/tests/perf/regression/Makefile b/usr/src/test/zfs-tests/tests/perf/regression/Makefile new file mode 100644 index 0000000000..1b4aa2befe --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/Makefile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/regression + +PROGS = random_reads \ + random_readwrite \ + random_writes \ + sequential_reads \ + sequential_reads_cached \ + sequential_reads_cached_clone \ + sequential_writes \ + setup + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: %.ksh + $(INS.rename) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh new file mode 100644 index 0000000000..2395b89183 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is cleared with `zinject -a` prior to each run +# so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. 
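For reference, pool_to_lun_list (defined in perf.shlib above) flattens the pool's c#t#d# vdevs into a colon-delimited string of driver-plus-instance names that io.d later matches against dev_statname. A quick sanity check of the shape (the sd instance numbers are invented; STF_SUITE and PERFPOOL are assumed to be set as in a normal test run):

	#!/usr/bin/ksh
	. $STF_SUITE/tests/perf/perf.shlib
	# Expect something like ":sd1:sd2:sd3:" for a three-disk pool.
	print "$(pool_to_lun_list $PERFPOOL)"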
+lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random reads with $PERF_RUNTYPE settings" +do_fio_run random_reads.fio $FALSE $TRUE +log_pass "Measure IO stats during random read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh new file mode 100644 index 0000000000..be87e43319 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_readwrite job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read and write from are created prior to the first fio run, +# and used for all fio runs. The ARC is cleared with `zinject -a` prior to +# each run so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random read-write load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES='' # bssplit used instead +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES='' # bssplit used instead +fi + +# Layout the files to be used by the readwrite tests. Create as many files +# as the largest number of threads. An fio run with fewer threads will use +# a subset of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. 
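The logfiles these collectors produce are named from the suffix built in do_collect_scripts; a sketch of the resulting name for one combination of run parameters (all values invented for illustration):

	#!/usr/bin/ksh
	# Mirror the "<sync_str>.<iosize>-ios.<threads>-threads" suffix logic.
	typeset threads=64 sync=1 iosize=8k
	typeset sync_str=$([[ $sync -eq 1 ]] && print sync || print async)
	print "perf_data/random_readwrite.io.$sync_str.$iosize-ios.$threads-threads"
	# -> perf_data/random_readwrite.io.sync.8k-ios.64-threads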
+lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random reads and writes with $PERF_RUNTYPE settings" +do_fio_run random_readwrite.fio $FALSE $TRUE +log_pass "Measure IO stats during random read and write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh new file mode 100644 index 0000000000..a4469c32a8 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh @@ -0,0 +1,69 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_writes job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# Prior to each fio run the dataset is recreated, and fio writes new files +# into an otherwise empty pool. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random write load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +fi + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random writes with $PERF_RUNTYPE settings" +do_fio_run random_writes.fio $TRUE $FALSE +log_pass "Measure IO stats during random write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh new file mode 100644 index 0000000000..b04d06c939 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh @@ -0,0 +1,78 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is cleared with `zinject -a` prior to each run +# so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential reads with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $TRUE +log_pass "Measure IO stats during sequential read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh new file mode 100644 index 0000000000..70cddd4eed --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is not cleared to ensure that all data is cached. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Make sure the working set can be cached in the arc. Aim for 1/2 of arc. +export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential cached reads with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $FALSE +log_pass "Measure IO stats during sequential cached read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh new file mode 100644 index 0000000000..c4790f1fc4 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh @@ -0,0 +1,93 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. This test will exercise cached read performance from +# a clone filesystem. 
The data is initially cached in the ARC and then +# a snapshot and clone are created. All the performance runs are then +# initiated against the clone filesystem to exercise the performance of +# reads when the ARC has to create another buffer from a different dataset. +# It will also exercise the need to evict the duplicate buffer once the last +# reference on that buffer is released. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Make sure the working set can be cached in the arc. Aim for 1/2 of arc. +export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +log_note "Creating snapshot, $TESTSNAP, of $TESTFS" +create_snapshot $TESTFS $TESTSNAP +log_note "Creating clone, $PERFPOOL/$TESTCLONE, from $TESTFS@$TESTSNAP" +create_clone $TESTFS@$TESTSNAP $PERFPOOL/$TESTCLONE + +# +# Reset the TESTFS to point to the clone +# +export TESTFS=$PERFPOOL/$TESTCLONE + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential cached reads from $TESTFS with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $FALSE +log_pass "Measure IO stats during sequential cached read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh new file mode 100644 index 0000000000..5e587e8641 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh @@ -0,0 +1,69 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_writes job file. 
The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# Prior to each fio run the dataset is recreated, and fio writes new files +# into an otherwise empty pool. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +log_assert "Measure IO stats during sequential write load" +log_onexit cleanup + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k 128k 1m'} +fi + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential writes with $PERF_RUNTYPE settings" +do_fio_run sequential_writes.fio $TRUE $FALSE +log_pass "Measure IO stats during sequential write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh b/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh new file mode 100644 index 0000000000..1206bbdaac --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh @@ -0,0 +1,23 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" +verify_disk_count "$DISKS" 3 + +log_pass diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/Makefile b/usr/src/test/zfs-tests/tests/perf/scripts/Makefile new file mode 100644 index 0000000000..aad3dd5943 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/Makefile @@ -0,0 +1,37 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/scripts + +PROGS = io.d \ + prefetch_io.d + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/io.d b/usr/src/test/zfs-tests/tests/perf/scripts/io.d new file mode 100644 index 0000000000..bbcbf8dc54 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/io.d @@ -0,0 +1,109 @@ +#!/usr/sbin/dtrace -s + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +/* + * time: Seconds since the epoch + * @ops: The number of reads and writes per interval + * @bytes: Bytes read and written per interval + * @latencies: Mean read and write latency per interval in ns + * These aggregations are indexed with read/write for back end + * statistics and zfs_read/zfs_write for ZPL level statistics. + */ + +#pragma D option aggsortkey +#pragma D option quiet + +BEGIN +{ + @ops["read"] = count(); + @ops["write"] = count(); + @ops["zfs_read"] = count(); + @ops["zfs_write"] = count(); + @latencies["read"] = avg(0); + @latencies["write"] = avg(0); + @latencies["zfs_read"] = avg(0); + @latencies["zfs_write"] = avg(0); + @bytes["read"] = sum(0); + @bytes["write"] = sum(0); + @bytes["zfs_read"] = sum(0); + @bytes["zfs_write"] = sum(0); + clear(@ops); + clear(@latencies); + clear(@bytes); +} + +fbt:zfs:zfs_read:entry, +fbt:zfs:zfs_write:entry +{ + this->zp = (znode_t *)args[0]->v_data; + this->poolname = stringof(this->zp->z_zfsvfs->z_os->os_spa->spa_name); +} + +fbt:zfs:zfs_read:entry, +fbt:zfs:zfs_write:entry +/ this->poolname == $$1 / +{ + self->ts = timestamp; + @ops[probefunc] = count(); + @bytes[probefunc] = sum(args[1]->uio_resid); +} + +fbt:zfs:zfs_read:return, +fbt:zfs:zfs_write:return +/ self->ts != 0 / +{ + @latencies[probefunc] = avg(timestamp - self->ts); + self->ts = 0; +} + +io:::start +/ strstr($$2, args[1]->dev_statname) != NULL / +{ + start[args[0]->b_edev, args[0]->b_blkno] = timestamp; +} + +io:::done +/ start[args[0]->b_edev, args[0]->b_blkno] / +{ + this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno]; + this->name = args[0]->b_flags & B_READ ? 
"read" : "write"; + @ops[this->name] = count(); + @bytes[this->name] = sum(args[0]->b_bcount); + @latencies[this->name] = avg(this->elapsed); + start[args[0]->b_edev, args[0]->b_blkno] = 0; +} + +tick-$3s +{ + printf("%u\n", `time); + printa("ops_%-21s%@u\n", @ops); + printa("bytes_%-21s%@u\n", @bytes); + printa("latencies_%-21s%@u\n", @latencies); + + clear(@ops); + clear(@bytes); + clear(@latencies); +} + +ERROR +{ + trace(arg1); + trace(arg2); + trace(arg3); + trace(arg4); + trace(arg5); +} diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d new file mode 100644 index 0000000000..3689823362 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d @@ -0,0 +1,87 @@ +#!/usr/sbin/dtrace -Cs + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +/* + * prefetch_ios: Number of IOs the prefetcher issued + * @pf["prefetched_demand_reads"]: Number of demand reads already prefetched + * @pf["sync_wait_for_async"]: Number of times sync IO waited for prefetch IO + * @pf["demand"]: Number of non-prefetch read IOs + * @pf["logical"]: Logical (uncompressed) bytes read per interval + * @pf["physical"]: Physical (compressed) bytes read per interval + */ + +#pragma D option aggsortkey +#pragma D option quiet + +#define SPA_MINBLOCKSHIFT 9 +#define ARC_FLAGS_PREFETCH (1 << 3) +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + +BEGIN +{ + prefetch_ios = `arc_stats.arcstat_prefetch_data_misses.value.ui64; + prefetch_ios += `arc_stats.arcstat_prefetch_metadata_misses.value.ui64; + @pf["demand"] = sum(0); + @pf["logical"] = sum(0); + @pf["physical"] = sum(0); + @pf["prefetched_demand_reads"] = count(); + @pf["sync_wait_for_async"] = count(); + clear(@pf); +} + +arc_read:arc-demand-hit-predictive-prefetch +{ + @pf["prefetched_demand_reads"] = count(); +} + +arc_read:arc-sync-wait-for-async +{ + @pf["sync_wait_for_async"] = count(); +} + +arc_read_done:entry +/ args[0]->io_spa->spa_name == $$1 / +{ + this->zio = args[0]; + this->buf = (arc_buf_t *)this->zio->io_private; + this->hdr = this->buf->b_hdr; + @pf["demand"] = sum(this->hdr->b_flags & ARC_FLAGS_PREFETCH ? 0 : 1); + @pf["logical"] = sum(HDR_GET_LSIZE(this->hdr)); + @pf["physical"] = sum(HDR_GET_PSIZE(this->hdr)); +} + +tick-$2s +{ + this->new_prefetch_ios = + `arc_stats.arcstat_prefetch_data_misses.value.ui64 + + `arc_stats.arcstat_prefetch_metadata_misses.value.ui64; + printf("%u\n%-24s\t%u\n", `time, "prefetch_ios", + this->new_prefetch_ios - prefetch_ios); + printa("%-24s\t%@u\n", @pf); + prefetch_ios = this->new_prefetch_ios; + clear(@pf); +} + +ERROR +{ + trace(arg1); + trace(arg2); + trace(arg3); + trace(arg4); + trace(arg5); +} diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 407ad1d55b..a52606aeda 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -22,6 +22,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. * Copyright 2013, 2016 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. */ /* vnode ops for the /dev/zvol directory */ @@ -64,6 +65,13 @@ ddi_modhandle_t zfs_mod; int (*szcm)(char *); int (*szn2m)(char *, minor_t *); + +/* + * Enable/disable the creation of snapshot device nodes in /dev/zvol. By + * default, they are enabled, preserving the historical behavior. + */ +boolean_t devzvol_snaps_allowed = B_TRUE; + int sdev_zvol_create_minor(char *dsname) { @@ -172,12 +180,18 @@ again: int devzvol_objset_check(char *dsname, dmu_objset_type_t *type) { - boolean_t ispool; + boolean_t ispool, is_snapshot; zfs_cmd_t *zc; int rc; nvlist_t *nvl; size_t nvsz; + ispool = (strchr(dsname, '/') == NULL); + is_snapshot = (strchr(dsname, '@') != NULL); + + if (is_snapshot && !devzvol_snaps_allowed) + return (ENOTSUP); + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN); @@ -187,7 +201,6 @@ devzvol_objset_check(char *dsname, dmu_objset_type_t *type) zc->zc_nvlist_src_size = nvsz; fnvlist_free(nvl); - ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE; rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS : ZFS_IOC_OBJSET_STATS, zc, NULL); if (type && rc == 0) @@ -865,7 +878,8 @@ sdev_iter_datasets(struct vnode *dvp, int arg, char *name) goto skip; } if (arg == ZFS_IOC_DATASET_LIST_NEXT && - zc->zc_objset_stats.dds_type != DMU_OST_ZFS) + zc->zc_objset_stats.dds_type == DMU_OST_ZVOL && + devzvol_snaps_allowed) sdev_iter_snapshots(dvp, zc->zc_name); skip: (void) strcpy(zc->zc_name, name); diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5c06a1bb29..aa073419c5 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -120,9 +120,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. The data + * of a block in this state cannot be accessed directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the ARC will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat.
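+ *
+ * As a minimal sketch of that accounting (the real bookkeeping lives in
+ * arc_buf_alloc_impl() and arc_share_buf() below), every hdr with a
+ * cached b_pdata contributes
+ *
+ *	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ *	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+ *
+ * and each arc_buf_t that is not sharing the hdr's b_pdata adds
+ *
+ *	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));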
+ * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may point to either compressed or + * uncompressed data. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the ARC requires that it first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled, the L2ARC blocks are identical + * to the on-disk blocks in the main data pool.
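+ *
+ * A condensed sketch of the read-side sharing decision described above
+ * (adapted from arc_buf_alloc_impl() below; the byteswap check is
+ * omitted here):
+ *
+ *	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF &&
+ *	    !HDR_L2_WRITING(hdr))
+ *		buf->b_data = hdr->b_l1hdr.b_pdata;	(share b_pdata)
+ *	else
+ *		buf->b_data = arc_get_data_buf(hdr,
+ *		    HDR_GET_LSIZE(hdr), buf);		(private copy)
+ *
+ * Either way, what the L2ARC later writes out is the hdr's b_pdata
+ * itself.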
This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + #include <sys/spa.h> #include <sys/zio.h> +#include <sys/spa_impl.h> #include <sys/zio_compress.h> +#include <sys/zio_checksum.h> #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> @@ -152,10 +277,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - uint_t arc_reduce_dnlc_percent = 3; /* @@ -231,9 +352,10 @@ uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +boolean_t zfs_compressed_arc_enabled = B_TRUE; + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -274,7 +396,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. @@ -339,6 +461,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. 
This includes arc_buf_hdr_t structures @@ -483,20 +625,13 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; @@ -538,6 +673,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -571,20 +709,13 @@ static arc_stats_t arc_stats = { { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, @@ -657,8 +788,12 @@ static arc_state_t *arc_l2c_only; #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; @@ -718,6 +853,7 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing @@ -728,9 +864,10 @@ typedef struct l1arc_buf_hdr { #endif arc_buf_t *b_buf; - uint32_t b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; @@ -743,8 +880,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + void 
*b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; @@ -753,9 +889,6 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -764,20 +897,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. - */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. 
*/ l2arc_buf_hdr_t b_l2hdr; @@ -785,9 +935,6 @@ struct arc_buf_hdr { l1arc_buf_hdr_t b_l1hdr; }; -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -796,25 +943,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -867,16 +1024,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). 
- */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -918,12 +1065,10 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { @@ -935,7 +1080,7 @@ typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -943,21 +1088,22 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -975,14 +1121,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -1004,7 +1150,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -1042,13 +1188,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next 
= buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -1076,12 +1222,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1167,7 +1313,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1181,7 +1327,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1254,166 +1400,138 @@ retry: } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. - */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); +#define ARC_MINTIME (hz>>4) /* 62 ms */ -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); +} - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, 
HDR_GET_PSIZE(hdr)); + + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; + + void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } - return (equal); + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. 
+ */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - NULL, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -1452,7 +1570,7 @@ arc_buf_watch(arc_buf_t *buf) procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = buf->b_hdr->b_size; + ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); @@ -1463,11 +1581,14 @@ arc_buf_watch(arc_buf_t *buf) static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1489,29 +1610,29 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_thawed != NULL) - kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); - buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); + if (hdr->b_l1hdr.b_thawed != NULL) + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1519,53 +1640,246 @@ arc_buf_thaw(arc_buf_t *buf) void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL 
|| + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in a + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. + * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole, in which + * case it remains anonymous. + */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_remove(list, hdr); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { @@ -1582,15 +1896,9 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_insert(list, hdr); - - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } @@ -1605,8 +1913,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, { arc_state_t *old_state; int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; + uint32_t bufcnt; + boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* @@ -1619,20 +1927,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1640,25 +1948,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. 
- */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; /* * An L1 header always exists here, since if we're @@ -1669,38 +1969,38 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1709,34 +2009,53 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. 
+ * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. */ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3P(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1745,9 +2064,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3P(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -1825,39 +2164,85 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr; subsequent buffers will + * use arc_buf_clone(). + */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing its + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data.
+ */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} + +/* + * Used when allocating additional buffers. + */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -1874,7 +2259,7 @@ arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -1888,12 +2273,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -1902,171 +2287,106 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - - atomic_add_64(&arc_loaned_bytes, hdr->b_size); -} - -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - - /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. 
- */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); - } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); -} - -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. - */ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); - return; - } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - l2arc_data_free_t *df; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; - df->l2df_func = free_func; + df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } -/* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. - */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *hdr = buf->b_hdr; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); } +/* + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. + */ static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * Start sharing the data buffer. 
We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. */ - if (!HDR_HAS_L1HDR(hdr)) - return; + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); +} + +static void +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * The header does not have compression enabled. This can be due - * to the buffer not being compressible, or because we're - * freeing the buffer before the second phase of - * l2arc_write_buffer() has started (which does the compression - * step). In either case, b_tmp_cdata does not point to a - * separately compressed buffer, so there's nothing to free (it - * points to the same buffer as the arc_buf_t's b_data field). + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; - } + refcount_transfer_ownership(&state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - - ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); - - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size, zio_data_buf_free); - - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - hdr->b_l1hdr.b_tmp_cdata = NULL; + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* @@ -2074,54 +2394,41 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) * arc_buf_t off of the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); - /* free up data associated with the buf */ + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + /* + * We're about to change the hdr's b_flags. 
We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); } - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; - - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } - - (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; - /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. - */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ @@ -2129,13 +2436,53 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
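The removal loop above is easy to misread: it keeps walking the singly linked list even after splicing out the target, precisely so it can learn which buffer ends up last (the shared buffer must always sit at the tail). The same pointer-to-pointer pattern in isolation, with a generic node type standing in for arc_buf_t:

#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

/*
 * Splice 'victim' out of the singly linked list at *headp and return
 * the last surviving node (NULL if the list is now empty). One forward
 * pass, no back pointers -- the same shape as the hdr buf-list walk.
 */
static node_t *
remove_and_find_tail(node_t **headp, node_t *victim)
{
    node_t **npp = headp;
    node_t *last = NULL;

    while (*npp != NULL) {
        if (*npp == victim)
            *npp = victim->next;    /* unlink */
        if (*npp != NULL) {         /* advance, tracking the tail */
            last = *npp;
            npp = &(*npp)->next;
        }
    }
    victim->next = NULL;
    return (last);
}

Working through a pointer-to-pointer means removal at the head needs no special case, which is why the kernel loop has none either.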
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } - ASSERT(buf->b_efunc == NULL); + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; @@ -2143,54 +2490,224 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) } static void -arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. + */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
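The memory saving described above comes from allocating only a prefix of the full header. A toy illustration of that layout trick follows; the field names are invented and the real arc_buf_hdr_t is far larger, but the bcopy of HDR_L2ONLY_SIZE mirrors what arc_hdr_realloc() does below.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Invented miniature layout; the real arc_buf_hdr_t is much larger. */
typedef struct hdr {
    unsigned long b_spa;     /* identity: shared by both versions */
    unsigned long b_l2daddr; /* L2-only state: also shared */
    struct {                 /* L1 state: full headers only */
        void *b_pdata;
        unsigned b_bufcnt;
    } b_l1hdr;
} hdr_t;

#define HDR_L2ONLY_SIZE offsetof(hdr_t, b_l1hdr)

int
main(void)
{
    hdr_t full = { .b_spa = 42, .b_l2daddr = 4096 };
    hdr_t l2only;

    /* Demote: keep only the shared prefix, as the bcopy below does. */
    memcpy(&l2only, &full, HDR_L2ONLY_SIZE);
    (void) printf("kept %zu of %zu bytes (spa=%lu daddr=%lu)\n",
        HDR_L2ONLY_SIZE, sizeof (hdr_t), l2only.b_spa, l2only.b_l2daddr);
    return (0);
}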
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ ASSERT(HDR_HAS_L2HDR(hdr)); - list_remove(&dev->l2ad_buflist, hdr); + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, we must have been called from + * arc_evict_hdr(); as such, we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() + * The header has been reallocated so we need to re-insert it into any + * lists it was on. */ - arc_buf_l2_cdata_free(hdr); + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. - * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. 
Thus, we must be careful - * not to decrement them for this header either. + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. + */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); +} + +static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); + + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2198,13 +2715,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2228,40 +2748,22 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } - } #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { 
kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); + } } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -2275,133 +2777,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. - */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } int32_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. 
- */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. - */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2428,11 +2832,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. it's b_tmp_cdata field) during it's write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. @@ -2443,11 +2847,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -2460,6 +2866,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2479,7 +2886,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2487,37 +2893,39 @@ break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. + * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). 
+ */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -2761,12 +3169,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might @@ -2778,7 +3186,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -2802,8 +3210,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -3066,36 +3474,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ @@ -3115,9 +3500,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3282,7 +3664,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. 
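Several hunks above replace the plain arcs_lsize[type] counters with arcs_esize[type] refcounts, so the flush path now loops on refcount_count() until no evictable bytes remain. Below is a compact, self-contained model of that retry loop; the two helpers are invented stand-ins for refcount_count(&state->arcs_esize[type]) and arc_evict_state(), and the numbers are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint64_t esize = 1000;   /* stand-in for the evictable-byte refcount */

static uint64_t
evict_some(uint64_t wanted)
{
    /* Pretend lock misses cap each pass at 300 bytes. */
    uint64_t n = esize < wanted ? esize : wanted;
    if (n > 300)
        n = 300;
    esize -= n;
    return (n);
}

static uint64_t
flush_state(int retry)
{
    uint64_t evicted = 0;

    while (esize != 0) {
        evicted += evict_some(UINT64_MAX);
        if (!retry)
            break;  /* single pass: may leave bytes behind */
    }
    return (evicted);
}

int
main(void)
{
    (void) printf("evicted %llu\n", (unsigned long long)flush_state(1));
    return (0);
}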
*/ static boolean_t @@ -3370,6 +3752,20 @@ arc_reclaim_thread(void) int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3438,57 +3834,12 @@ arc_reclaim_thread(void) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } -static void -arc_user_evicts_thread(void) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_user_evicts_cv, - &arc_user_evicts_lock, ddi_get_lbolt() + hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - thread_exit(); -} - /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called @@ -3572,18 +3923,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. 
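The rewritten comment above describes a two-level throttle: past the target size, nudge the reclaim thread and keep going; past the hard limit, block until eviction catches up. A minimal pthread sketch of that shape follows; the mutex, condvars, and predicates are invented stand-ins for arc_reclaim_lock, arc_reclaim_thread_cv/arc_reclaim_waiters_cv, and arc_is_overflowing(), and the stub predicates here always return false so the unit compiles standalone.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_thread_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t reclaim_waiters_cv = PTHREAD_COND_INITIALIZER;

static bool is_overflowing(void) { return (false); }  /* hard limit hit? */
static bool past_target(void) { return (false); }     /* soft limit hit? */

static void
throttle_alloc(void)
{
    (void) pthread_mutex_lock(&reclaim_lock);
    while (is_overflowing()) {
        /* Hard limit: wake the evictor, then sleep until it catches up. */
        (void) pthread_cond_signal(&reclaim_thread_cv);
        (void) pthread_cond_wait(&reclaim_waiters_cv, &reclaim_lock);
    }
    if (past_target()) {
        /* Soft limit: nudge the evictor but do not block. */
        (void) pthread_cond_signal(&reclaim_thread_cv);
    }
    (void) pthread_mutex_unlock(&reclaim_lock);
    /* ...now safe to allocate the block... */
}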
*/ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -3623,12 +3973,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -3636,11 +3987,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -3653,9 +4002,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -3665,6 +4015,37 @@ arc_get_data_buf(arc_buf_t *buf) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } + return (datap); +} + +/* + * Free the arc data buffer. 
+ */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -3708,7 +4089,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; @@ -3742,7 +4123,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -3811,8 +4192,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -3821,7 +4202,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -3830,17 +4211,29 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + +static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -3860,31 +4253,32 @@ arc_read_done(zio_t *zio) arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = 
DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot_byteswap[bswap].ob_func; - func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); - arc_buf_watch(buf); + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -3898,31 +4292,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. 
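The callback loop above is the payoff of caching compressed data: decompression happens at most once per read, on behalf of the first waiter, and every other waiter receives a clone. A self-contained model of that dispatch follows, with simplified stand-ins for arc_buf_t, arc_buf_alloc_impl()/arc_decompress(), and arc_buf_clone(); error handling and the kernel allocators are omitted.

#include <stdlib.h>
#include <string.h>

typedef struct buf {
    char data[512];
} buf_t;

typedef struct cb {
    struct cb *next;
    int wants_data;     /* real code: acb_done != NULL */
    buf_t *buf;
} cb_t;

static buf_t *
alloc_and_decompress(void)
{
    /* Stand-in for arc_buf_alloc_impl() + arc_decompress(). */
    buf_t *b = malloc(sizeof (*b));
    memset(b->data, 0, sizeof (b->data));
    return (b);
}

static buf_t *
clone_buf(const buf_t *from)
{
    /* Stand-in for arc_buf_clone(): a plain copy of the first buffer. */
    buf_t *b = malloc(sizeof (*b));
    memcpy(b->data, from->data, sizeof (b->data));
    return (b);
}

/*
 * Decompress at most once per read, no matter how many waiters: the
 * first callback pays the decompression cost, later ones get clones,
 * and pure prefetches (no callback) get nothing at all.
 */
static void
read_done(cb_t *callbacks)
{
    buf_t *first = NULL;

    for (cb_t *acb = callbacks; acb != NULL; acb = acb->next) {
        if (!acb->wants_data)
            continue;
        acb->buf = (first == NULL) ?
            (first = alloc_and_decompress()) : clone_buf(first);
    }
}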
+ */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -3992,7 +4405,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4009,8 +4421,8 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4042,7 +4454,8 @@ top: ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4063,10 +4476,9 @@ top: acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } @@ -4089,34 +4501,36 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4126,20 +4540,21 @@ top: if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4149,26 +4564,9 @@ top: /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4179,54 +4577,60 @@ top: hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). 
*/ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + + /* + * If compression is enabled on the hdr, then we will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; } + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -4235,6 +4639,11 @@ top: vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4242,19 +4651,15 @@ top: * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4275,15 +4680,13 @@ top: cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + lsize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* @@ -4292,26 +4695,19 @@ top: * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
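A few hunks above, the miss path ORs ZIO_FLAG_RAW into zio_flags whenever the hdr is compressed, so the I/O bypasses the transform stages and lands the still-compressed physical bytes directly in b_pdata; arc_read_done() then decompresses on demand. The flag selection in isolation — ZIO_FLAG_RAW and ZIO_COMPRESS_OFF are the real names, but the bit value and the trimmed enum here are illustrative only:

#include <stdint.h>

enum zio_compress { ZIO_COMPRESS_OFF, ZIO_COMPRESS_LZ4 };
#define ZIO_FLAG_RAW (1u << 21)  /* illustrative bit position */

/*
 * Compressed hdr: read psize physical bytes with no transform (RAW).
 * Uncompressed hdr: read lsize logical bytes through the normal pipeline.
 */
static uint32_t
read_flags_for(enum zio_compress c, uint32_t zio_flags)
{
    if (c != ZIO_COMPRESS_OFF)
        zio_flags |= ZIO_FLAG_RAW;
    return (zio_flags);
}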
*/ - if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, buf->b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + size, hdr->b_l1hdr.b_pdata, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4341,8 +4737,8 @@ top: } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); /* * At this point, this read I/O has already missed in the ARC @@ -4361,20 +4757,6 @@ top: return (0); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. */ @@ -4390,85 +4772,38 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); - } else { + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); mutex_exit(hash_lock); - } - -} - -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); + mutex_exit(hash_lock); } - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); } /* @@ -4502,16 +4837,19 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4552,72 +4890,111 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? 
*/ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); + arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. + */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); - uint64_t *size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); - } - hdr->b_l1hdr.b_datacnt -= 1; + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
 nhdr->b_l1hdr.b_buf = buf;
- nhdr->b_l1hdr.b_datacnt = 1;
- nhdr->b_l1hdr.b_state = arc_anon;
- nhdr->b_l1hdr.b_arc_access = 0;
- nhdr->b_l1hdr.b_tmp_cdata = NULL;
- nhdr->b_freeze_cksum = NULL;
-
+ nhdr->b_l1hdr.b_bufcnt = 1;
 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 buf->b_hdr = nhdr;
+
 mutex_exit(&buf->b_evict_lock);
- (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
+ (void) refcount_add_many(&arc_anon->arcs_size,
+ HDR_GET_LSIZE(nhdr), buf);
 } else {
 mutex_exit(&buf->b_evict_lock);
 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
@@ -4631,8 +5008,6 @@ arc_release(arc_buf_t *buf, void *tag)
 buf_discard_identity(hdr);
 arc_buf_thaw(buf);
 }
- buf->b_efunc = NULL;
- buf->b_private = NULL;
 }
 
 int
@@ -4666,28 +5041,83 @@ arc_write_ready(zio_t *zio)
 arc_write_callback_t *callback = zio->io_private;
 arc_buf_t *buf = callback->awcb_buf;
 arc_buf_hdr_t *hdr = buf->b_hdr;
+ uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
 
 ASSERT(HDR_HAS_L1HDR(hdr));
 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_datacnt > 0);
- callback->awcb_ready(zio, buf, callback->awcb_private);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 
 /*
- * If the IO is already in progress, then this is a re-write
- * attempt, so we need to thaw and re-compute the cksum.
- * It is the responsibility of the callback to handle the
- * accounting for any re-write attempt.
+ * If we're reexecuting this zio because the pool suspended, then
+ * clean up any state that was previously set the first time the
+ * callback was invoked.
 */
- if (HDR_IO_IN_PROGRESS(hdr)) {
- mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
- if (hdr->b_freeze_cksum != NULL) {
- kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_freeze_cksum = NULL;
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(HDR_SHARED_DATA(hdr));
+
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_pdata(hdr);
+ }
 }
- mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
- arc_cksum_compute(buf, B_FALSE);
- hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_cksum_compute(buf);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
+ compress = BP_GET_COMPRESS(zio->io_bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+
+ /*
+ * If the hdr is compressed, then copy the compressed
+ * zio contents into arc_buf_hdr_t. Otherwise, copy the original
+ * data buf into the hdr. Ideally, we would like to always copy the
+ * io_data into b_pdata, but the user may have disabled compressed
+ * arc, thus the on-disk block may or may not match what we maintain
+ * in the hdr's b_pdata field.
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -4718,9 +5148,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -4728,7 +5160,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -4737,7 +5169,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -4771,19 +5203,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -4793,9 +5225,8 @@ arc_write_done(zio_t *zio) } zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, arc_done_func_t *ready, +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) @@ -4804,16 +5235,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; 
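The two hunks above hinge on the buf/hdr data-sharing scheme that compressed ARC introduces: when a block is cached uncompressed, the hdr's b_pdata and an arc_buf_t's b_data can reference the same allocation, and the relationship is severed whenever one side's view becomes stale. The following minimal C sketch models only that ownership hand-off; the toy_* names are invented for illustration, and the patch's real arc_share_buf()/arc_unshare_buf() additionally transfer refcount ownership and update checksum and kstat bookkeeping, which this sketch omits.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-ins for arc_buf_hdr_t and arc_buf_t. */
typedef struct toy_hdr {
	void *h_pdata;	/* hdr-owned data, akin to b_l1hdr.b_pdata */
	int h_shared;	/* akin to HDR_SHARED_DATA() */
} toy_hdr_t;

typedef struct toy_buf {
	toy_hdr_t *b_hdr;
	void *b_data;	/* consumer-visible, uncompressed data */
} toy_buf_t;

/* Uncompressed case: hdr and buf share one allocation (one copy cached). */
static void
toy_share_buf(toy_hdr_t *hdr, toy_buf_t *buf)
{
	assert(hdr->h_pdata == NULL && !hdr->h_shared);
	hdr->h_pdata = buf->b_data;
	hdr->h_shared = 1;
}

/* Sever the sharing relationship; the buf becomes sole owner of the data. */
static void
toy_unshare_buf(toy_hdr_t *hdr, toy_buf_t *buf)
{
	assert(hdr->h_shared && hdr->h_pdata == buf->b_data);
	hdr->h_pdata = NULL;
	hdr->h_shared = 0;
}

int
main(void)
{
	toy_hdr_t hdr = { NULL, 0 };
	toy_buf_t buf = { &hdr, malloc(512) };

	assert(buf.b_data != NULL);
	memset(buf.b_data, 0, 512);

	toy_share_buf(&hdr, &buf);
	assert(hdr.h_pdata == buf.b_data);	/* one copy, two views */

	toy_unshare_buf(&hdr, &buf);		/* hdr's view is now stale */
	assert(hdr.h_pdata == NULL);

	free(buf.b_data);
	return (0);
}

When a block is cached compressed, sharing is impossible (b_pdata holds the compressed image while b_data holds the uncompressed view), which is why arc_write_ready() above copies zio->io_data into b_pdata in that case, and why the arc_write() hunk below unshares or frees the stale b_pdata before issuing the write.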
@@ -4822,7 +5251,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -4917,12 +5369,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); @@ -4934,8 +5388,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -4988,7 +5444,7 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. 
*/ - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given @@ -5006,6 +5462,117 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + 
refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + void arc_init(void) { @@ -5022,9 +5589,6 @@ arc_init(void) cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -5058,6 +5622,7 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; @@ -5108,68 +5673,10 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - 
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - - refcount_create(&arc_anon->arcs_size); - refcount_create(&arc_mru->arcs_size); - refcount_create(&arc_mru_ghost->arcs_size); - refcount_create(&arc_mfu->arcs_size); - refcount_create(&arc_mfu_ghost->arcs_size); - refcount_create(&arc_l2c_only->arcs_size); - + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; - arc_eviction_list = NULL; - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + arc_reclaim_thread_exit = B_FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -5183,10 +5690,7 @@ arc_init(void) (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5209,10 +5713,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5220,22 +5724,10 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); - - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5246,27 +5738,7 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -5555,9 +6027,13 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -5580,13 +6056,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -5644,32 +6120,26 @@ top: */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. 
*/ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -5696,43 +6166,36 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. */ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -5745,7 +6208,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -5758,9 +6221,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -5910,12 +6374,11 @@ top: */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -5936,36 +6399,22 @@ top: * the delta by which the device hand has changed due to alignment). */ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, - buf_compress_minsz; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); - const boolean_t do_headroom_boost = *headroom_boost; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. 
*/ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; - - /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. @@ -5986,13 +6435,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -6007,7 +6454,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. @@ -6021,15 +6468,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -6053,63 +6492,76 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ZIO_FLAG_CANFAIL); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing). + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t size = arc_hdr_size(hdr); + + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). 
- * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. */ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + void *to_write; + if (!HDR_SHARED_DATA(hdr)) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(size); + } - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + l2arc_free_data_on_write(to_write, size, type); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, size, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); + write_sz += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + write_asize += size; /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * Keep the clock hand suitably device-aligned. */ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + size); + write_psize += asize; + dev->l2ad_hand += asize; mutex_exit(hash_lock); - write_sz += buf_sz; - write_asize += buf_a_sz; + (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -6126,104 +6578,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (0); } - mutex_enter(&dev->l2ad_mtx); - - /* - * Note that elsewhere in this file arcstat_l2_asize - * and the used space on l2ad_vdev are updated using b_asize, - * which is not necessarily rounded up to the device block size. - * Too keep accounting consistent we do the same here as well: - * stats_size accumulates the sum of b_asize of the written buffers, - * while write_asize accumulates the sum of b_asize rounded up - * to the device block size. - * The latter sum is used only to validate the corectness of the code. - */ - uint64_t stats_size = 0; - write_asize = 0; - - /* - * Now start writing the buffers. We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. - */ - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - if ((HDR_L2COMPRESS(hdr)) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(hdr)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. - */ - *headroom_boost = B_TRUE; - } - } - - /* - * Pick up the buffer data we had previously stashed away - * (and now potentially also compressed). - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - buf_sz = hdr->b_l2hdr.b_asize; - - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); - - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - uint64_t buf_a_sz; - - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - (void) zio_nowait(wzio); - - stats_size += buf_sz; - - /* - * Keep the clock hand suitably device-aligned. - */ - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_asize += buf_a_sz; - dev->l2ad_hand += buf_a_sz; - } - } - - mutex_exit(&dev->l2ad_mtx); - ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, stats_size); - vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. @@ -6242,182 +6602,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } /* - * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The original l2hdr contents were left - * untouched and are ready for writing to an L2 device. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). 
- */ -static boolean_t -l2arc_compress_buf(arc_buf_hdr_t *hdr) -{ - void *cdata; - size_t csize, len, rounded; - ASSERT(HDR_HAS_L2HDR(hdr)); - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - ASSERT3P(cdata, !=, NULL); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, - cdata, l2hdr->b_asize); - - rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } else if (csize > 0 && csize < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression failed, release the compressed buffer. - * l2hdr will be left unmodified. - */ - zio_data_buf_free(cdata, len); - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. - * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. - */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - uint64_t csize; - void *cdata; - - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. 
*/ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. - * This buffer serves as a temporary holder of compressed data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. - */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - enum zio_compress comp = hdr->b_l2hdr.b_compress; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (comp == ZIO_COMPRESS_OFF) { - /* - * In this case, b_tmp_cdata points to the same buffer - * as the arc_buf_t's b_data field. We don't want to - * free it, since the arc_buf_t will handle that. - */ - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else if (comp == ZIO_COMPRESS_EMPTY) { - /* - * In this case, b_tmp_cdata was compressed to an empty - * buffer, thus there's nothing to free and b_tmp_cdata - * should have been set to NULL in l2arc_write_buffers(). - */ - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - /* - * If the data was compressed, then we've allocated a - * temporary buffer for it, so now we need to release it. - */ - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - -} - -/* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ @@ -6429,7 +6613,6 @@ l2arc_feed_thread(void) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -6467,7 +6650,7 @@ l2arc_feed_thread(void) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -6500,7 +6683,7 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. @@ -6593,7 +6776,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 64a4cb74d0..dc5c0f5a2e 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -45,6 +45,9 @@ #include <sys/zfeature.h> #include <sys/blkptr.h> #include <sys/range_tree.h> +#include <sys/callb.h> + +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing @@ -52,7 +55,6 @@ */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -64,9 +66,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. 
+ * Dbufs are added to the dbuf cache once the last hold is released. If a
+ * dbuf is later accessed and still exists in the dbuf cache, then it will
+ * be removed from the cache and later re-added to the head of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ */
+static multilist_t dbuf_cache;
+static refcount_t dbuf_cache_size;
+uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+
+/* Cap the size of the dbuf cache to log2 fraction of arc size. */
+int dbuf_cache_max_shift = 5;
+
+/*
+ * The dbuf cache uses a three-stage eviction policy:
+ *	- A low water marker designates when the dbuf eviction thread
+ *	should stop evicting from the dbuf cache.
+ *	- When we reach the maximum size (aka mid water mark), we
+ *	signal the eviction thread to run.
+ *	- The high water mark indicates when the eviction thread
+ *	is unable to keep up with the incoming load and eviction must
+ *	happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ *                                                 (max size)
+ *                                      low water   mid water   hi water
+ * +----------------------------------------+----------+----------+
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * +----------------------------------------+----------+----------+
+ *                                     stop        signal      evict
+ *                                     evicting    eviction    directly
+ *                                                 thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e. max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
@@ -76,6 +145,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
 refcount_create(&db->db_holds);
 
 return (0);
@@ -88,6 +158,7 @@ dbuf_dest(void *vdb, void *unused)
 dmu_buf_impl_t *db = vdb;
 mutex_destroy(&db->db_mtx);
 cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
 refcount_destroy(&db->db_holds);
 }
@@ -117,8 +188,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 return (crc);
 }
 
-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
 ((dbuf)->db.db_object == (obj) && \
 (dbuf)->db_objset == (os) && \
@@ -129,7 +198,7 @@ dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
 dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t hv = dbuf_hash(os, obj, level, blkid);
 uint64_t idx = hv & h->hash_table_mask;
 dmu_buf_impl_t *db;
@@ -180,7 +249,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 uint64_t obj = db->db.db_object;
 int level = db->db_level;
 uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t hv = dbuf_hash(os, obj, level, blkid);
 uint64_t idx = hv & h->hash_table_mask;
 dmu_buf_impl_t *dbf;
@@ -212,7 +281,7 @@ static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+ uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
 db->db_level, db->db_blkid);
 uint64_t idx = hv & h->hash_table_mask;
 dmu_buf_impl_t *dbf, **dbp;
@@ -237,8 +306,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 atomic_dec_64(&dbuf_hash_count);
 }
 
-static arc_evict_func_t dbuf_do_evict;
-
 typedef enum {
 DBVU_EVICTING,
 DBVU_NOT_EVICTING
@@ -323,15 +390,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
 }
 }
 
-void
-dbuf_evict(dmu_buf_impl_t *db)
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db_data_pending == NULL);
+ dmu_buf_impl_t *db = obj;
+
+ /*
+ * The assumption here is that the hash value for a given
+ * dmu_buf_impl_t will remain constant throughout its lifetime
+ * (i.e. its objset, object, level and blkid fields don't change).
+ * Thus, we don't need to store the dbuf's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power of two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
+}
+
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+ uint64_t dbuf_cache_hiwater_bytes =
+ (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+
+ return (refcount_count(&dbuf_cache_size) >
+ dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ uint64_t dbuf_cache_lowater_bytes =
+ (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+
+ return (refcount_count(&dbuf_cache_size) >
+ dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+ int idx = multilist_get_random_index(&dbuf_cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
+
+ ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+ /*
+ * Set the thread's tsd to indicate that it's processing evictions.
+ * Once a thread stops evicting from the dbuf cache it will
+ * reset its tsd to NULL.
+ */
+ ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
+ (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
+
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+ while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+ db = multilist_sublist_prev(mls, db);
+ }
+
+ DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+ multilist_sublist_t *, mls);
+
+ if (db != NULL) {
+ multilist_sublist_remove(mls, db);
+ multilist_sublist_unlock(mls);
+ (void) refcount_remove_many(&dbuf_cache_size,
+ db->db.db_size, db);
+ dbuf_destroy(db);
+ } else {
+ multilist_sublist_unlock(mls);
+ }
+ (void) tsd_set(zfs_dbuf_evict_key, NULL);
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+static void
+dbuf_evict_thread(void)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&dbuf_evict_lock);
+ while (!dbuf_evict_thread_exit) {
+ while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_hires(&dbuf_evict_cv,
+ &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ /*
+ * Keep evicting as long as we're above the low water mark
+ * for the cache. We do this without holding the locks to
+ * minimize lock contention.
+ */
+ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ dbuf_evict_one();
+ }
+
+ mutex_enter(&dbuf_evict_lock);
+ }
- dbuf_clear(db);
- dbuf_destroy(db);
+ dbuf_evict_thread_exit = B_FALSE;
+ cv_broadcast(&dbuf_evict_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
+ thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+
+ /*
+ * We use thread-specific data to track when a thread has
+ * started processing evictions.
This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; + + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } void @@ -359,7 +592,7 @@ retry: goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); @@ -367,10 +600,30 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -382,8 +635,23 @@ dbuf_fini(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -541,7 +809,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -556,8 +824,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -568,6 +834,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -579,6 +846,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -647,7 +915,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -707,7 +975,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -744,8 +1012,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -862,7 +1128,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -918,9 +1184,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1044,7 +1311,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -1149,7 +1416,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1160,7 +1427,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1558,7 +1825,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1567,12 +1834,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); return (B_TRUE); } @@ -1736,7 +1999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1754,10 +2017,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1769,59 +2032,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. 
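The dbuf_fini() hunk a little further up tears down the new eviction thread with a flag-and-signal handshake: the caller sets dbuf_evict_thread_exit and signals, and the thread clears the flag and signals back before exiting, so dbuf_fini() cannot return while the thread is still running. Below is a minimal standalone rendering of that handshake, with pthreads standing in for the kernel mutex/condvar primitives; all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool exit_requested = false;	/* plays dbuf_evict_thread_exit */

static void *
worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&m);
	while (!exit_requested)
		pthread_cond_wait(&cv, &m);
	exit_requested = false;		/* acknowledge the shutdown request */
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&m);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* The dbuf_fini() side: request exit, then wait for the ack. */
	pthread_mutex_lock(&m);
	exit_requested = true;
	while (exit_requested) {
		pthread_cond_signal(&cv);
		pthread_cond_wait(&cv, &m);
	}
	pthread_mutex_unlock(&m);
	pthread_join(t, NULL);
	return (0);
}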
For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. - */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
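The code that follows this comment drops db_mtx before taking dn_dbufs_mtx, the classic way to avoid a lock-order inversion: mark the object unreachable while holding its own lock, then walk up to the container's lock. A minimal pthreads sketch of the same rule, with stand-in names (o_mtx for db_mtx, list_mtx for dn_dbufs_mtx):

#include <pthread.h>
#include <stdbool.h>

typedef struct obj {
	pthread_mutex_t o_mtx;
	bool		o_evicting;	/* stands in for DB_EVICTING */
} obj_t;

static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
obj_destroy(obj_t *o)	/* entered with o_mtx held, like dbuf_destroy() */
{
	o->o_evicting = true;		/* lookups will now skip this object */
	pthread_mutex_unlock(&o->o_mtx);

	pthread_mutex_lock(&list_mtx);	/* safe: o_mtx is no longer held */
	/* ... remove o from the containing list here ... */
	pthread_mutex_unlock(&list_mtx);
}

int
main(void)
{
	obj_t o = { PTHREAD_MUTEX_INITIALIZER, false };

	pthread_mutex_lock(&o.o_mtx);
	obj_destroy(&o);
	return (0);
}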
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1832,15 +2098,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* * If this dbuf is referenced from an indirect dbuf, @@ -1933,7 +2209,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -1982,7 +2258,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -2007,76 +2283,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. 
- */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2114,10 +2326,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2146,7 +2385,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2240,6 +2480,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2320,18 +2561,8 @@ top: return (SET_ERROR(ENOENT)); } - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } + if (db->db_buf != NULL) ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); - } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); @@ -2349,13 +2580,19 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(dn->dn_objset->os_spa, + arc_alloc_buf(dn->dn_objset->os_spa, db->db.db_size, db, type)); 
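In the dbuf_prefetch_indirect_done() hunk above, curblkid is derived by shifting the target block id right by epbs once per level of separation, since each indirect block fans out 2^epbs block pointers. A standalone worked example; the epbs, blkid, and level values are assumptions chosen for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Which level-`cur` indirect block covers a level-`tgt` block id.
 * With 128 KiB indirect blocks holding 1024 blkptrs, epbs == 10.
 */
static uint64_t
covering_blkid(uint64_t blkid, int epbs, int curlevel, int tgtlevel)
{
	return (blkid >> (epbs * (curlevel - tgtlevel)));
}

int
main(void)
{
	/* Level-0 blkid 5000000 is covered by level-2 indirect blkid 4. */
	printf("%llu\n",
	    (unsigned long long)covering_blkid(5000000, 10, 2, 0));
	return (0);
}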
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(refcount_is_zero(&db->db_holds)); + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -2429,7 +2666,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); + ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2500,8 +2737,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2546,55 +2785,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -2878,7 +3106,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3144,10 +3372,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3163,8 +3388,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3341,8 +3564,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, + &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 8ce9178ad2..74c6ede4fe 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1318,7 +1318,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1663,8 +1663,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2034,10 +2033,10 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/usr/src/uts/common/fs/zfs/dmu_diff.c b/usr/src/uts/common/fs/zfs/dmu_diff.c index 7665d1ca59..982b96132c 100644 --- a/usr/src/uts/common/fs/zfs/dmu_diff.c +++ b/usr/src/uts/common/fs/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -146,7 +146,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 78ab9624d3..b6ae9680ca 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -791,11 +787,17 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. + * to contain DN_MAX_OBJECT dnodes. Note that in order to + * ensure that we do not overflow 64 bits, there has to be + * a nlevels that gives us a number of blocks > DN_MAX_OBJECT + * but < 2^64. Therefore, + * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be + * less than (64 - log2(DN_MAX_OBJECT)) (16). 
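The reworked loop above now counts dnodes per subtree directly instead of multiplying by sizeof (dnode_phys_t), which is what keeps the shift below 64 bits. Rerunning it standalone with plausible example values (16 KiB dnode blocks, 128 KiB indirect blocks, and 3 block pointers on the meta-dnode, all assumed for illustration) shows how many levels are needed:

#include <stdint.h>
#include <stdio.h>

#define DNODE_SHIFT	9		/* 512-byte dnodes */
#define SPA_BLKPTRSHIFT	7		/* 128-byte block pointers */
#define DN_MAX_OBJECT	(1ULL << 48)	/* DN_MAX_OBJECT_SHIFT == 48 */

int
main(void)
{
	uint64_t nblkptr = 3;		/* assumed meta-dnode blkptr count */
	int datablkshift = 14, indblkshift = 17, levels = 1;

	/* Largest shift reached is 55, comfortably below 64 bits. */
	while (nblkptr << (datablkshift - DNODE_SHIFT +
	    (levels - 1) * (indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT)
		levels++;

	printf("meta-dnode needs %d levels\n", levels);	/* prints 6 */
	return (0);
}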
*/ - while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + while ((uint64_t)mdn->dn_nblkptr << + (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT * sizeof (dnode_phys_t)) + DN_MAX_OBJECT) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = @@ -1122,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 6c51e92915..156e46d7b7 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -608,7 +608,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -620,7 +620,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -644,7 +644,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; @@ -674,7 +674,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index 2822ca4525..5075818de4 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -379,7 +379,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -594,7 +594,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 024c64897b..4126da2279 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -509,7 +509,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 7179c41cbf..daf539ec5c 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); - 
dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); @@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 21a5787e42..0a382ee1d9 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -651,7 +651,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -677,7 +677,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -709,7 +709,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (0); diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index df0f256849..1e6229877b 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -227,4 +227,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} #endif /* ZFS_DEBUG */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 699b15cc58..a674eea6b0 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -341,9 +341,14 @@ int spa_asize_inflation = 24; * it is possible to run the pool completely out of space, causing it to * be permanently read-only. * + * Note that on very small pools, the slop space will be larger than + * 3.2%, in an effort to have it be at least spa_min_slop (128MB), + * but we never allow it to be more than half the pool size. + * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; +uint64_t spa_min_slop = 128 * 1024 * 1024; /* * ========================================================================== @@ -1637,14 +1642,16 @@ spa_get_asize(spa_t *spa, uint64_t lsize) /* * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), - * or at least 32MB. + * or at least 128MB, unless that would cause it to be more than half the + * pool size. * * See the comment above spa_slop_shift for details. 
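The new spa_get_slop_space() below this comment reduces to MAX(space >> 5, MIN(space >> 1, 128MB)). A standalone sketch with three assumed pool sizes covers each regime: the half-pool clamp, the 128 MB floor, and the 1/32nd fraction.

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* Mirrors spa_get_slop_space() with the defaults from this diff. */
static uint64_t
slop(uint64_t space)
{
	const uint64_t spa_min_slop = 128ULL << 20;	/* 128 MiB */
	const int spa_slop_shift = 5;			/* 1/32nd */

	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
}

int
main(void)
{
	/* 100 MiB pool -> 50 MiB: the floor is clamped to half the pool. */
	printf("%llu MiB\n", (unsigned long long)(slop(100ULL << 20) >> 20));
	/* 1 GiB pool -> 128 MiB: the spa_min_slop floor wins. */
	printf("%llu MiB\n", (unsigned long long)(slop(1ULL << 30) >> 20));
	/* 1 TiB pool -> 32768 MiB: 1/32nd of the pool wins. */
	printf("%llu MiB\n", (unsigned long long)(slop(1ULL << 40) >> 20));
	return (0);
}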
*/ uint64_t -spa_get_slop_space(spa_t *spa) { +spa_get_slop_space(spa_t *spa) +{ uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1)); + return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); } uint64_t diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index f886a2ae3a..b1e9456f5a 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -43,51 +43,83 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. 
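Returning to the HDR_SET_/HDR_GET_ size macros at the top of this arc.h hunk: logical and physical sizes are stored in 16-bit header fields in units of 512-byte sectors (SPA_MINBLOCKSHIFT), so a uint16_t suffices for buffers up to 2^16 * 512 = 32 MiB, the same reasoning as the SPA_PSIZEBITS comment later in this diff. A standalone round-trip of that encoding:

#include <assert.h>
#include <stdint.h>

#define SPA_MINBLOCKSHIFT	9	/* 512-byte sectors */

int
main(void)
{
	uint64_t psize = 128 * 1024;	/* 128 KiB, sector aligned */
	uint16_t b_psize = (uint16_t)(psize >> SPA_MINBLOCKSHIFT); /* 256 */

	assert(((uint64_t)b_psize << SPA_MINBLOCKSHIFT) == psize);
	return (0);
}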
*/ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. + */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { @@ -95,11 +127,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -119,19 +150,17 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -boolean_t arc_clear_callback(arc_buf_t *buf); - void 
arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 496412614b..6862599a65 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -36,6 +36,7 @@ #include <sys/zfs_context.h> #include <sys/refcount.h> #include <sys/zrlock.h> +#include <sys/multilist.h> #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. */ @@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index dfa3e576c5..98780b2566 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -58,7 +58,7 @@ extern "C" { */ #define DNODE_SHIFT 9 /* 512 bytes */ #define DN_MIN_INDBLKSHIFT 12 /* 4k */ -#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DN_MAX_INDBLKSHIFT 17 /* 128k */ #define DNODE_BLOCK_SHIFT 14 /* 16k */ #define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ #define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ @@ -88,6 +88,11 @@ extern "C" { #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) + +/* + * This is inaccurate if the indblkshift of the particular object is not the + * max. But it's only used by userland to calculate the zvol reservation. 
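The DN_MAX_INDBLKSHIFT bump from 14 to 17 above is the "7104 increase indirect block size" change in this merge: each indirect block grows from 16 KiB to 128 KiB, so its fan-out grows from 128 to 1024 block pointers (blkptr_t is 128 bytes, hence SPA_BLKPTRSHIFT of 7). A quick standalone check of that arithmetic:

#include <stdio.h>

#define SPA_BLKPTRSHIFT	7	/* blkptr_t is 128 bytes */

int
main(void)
{
	printf("16 KiB indirect block: %d blkptrs\n",
	    1 << (14 - SPA_BLKPTRSHIFT));	/* 128 */
	printf("128 KiB indirect block: %d blkptrs\n",
	    1 << (17 - SPA_BLKPTRSHIFT));	/* 1024 */
	return (0);
}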
+ */ #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) #define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index 27c39135e0..d13a87ea86 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -71,6 +71,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); void refcount_init(void); void refcount_fini(void); @@ -98,6 +99,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) #define refcount_init() #define refcount_fini() diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index d8840bf86d..da63812831 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -133,6 +133,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. @@ -375,8 +377,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 6d02c95b22..b9e14711c6 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -524,6 +524,10 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index 572b29d3cb..2f7579fd73 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -97,8 +97,12 @@ extern zio_checksum_t zio_checksum_edonr_byteswap; extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, 
zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index f997e63aba..9fdc7838be 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -491,6 +491,14 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) dsl_dataset_t *ds; dsl_pool_t *dp; + /* + * First do a quick check for root in the global zone, which + * is allowed to do all write_perms. This ensures that zfs_ioc_* + * will get to handle nonexistent datasets. + */ + if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) + return (0); + error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 7b71fc58b9..96bd320404 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -245,7 +246,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -282,7 +283,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index bfbcdfb511..e297003788 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -268,7 +268,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -286,7 +286,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -957,8 +957,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + void *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -2268,7 +2268,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index 8bd7e02bef..469af54477 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -293,20 +293,12 @@ 
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; - spa_t *spa = zio->io_spa; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); @@ -315,6 +307,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -354,35 +347,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) spa->spa_cksum_tmpls[checksum], &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); } /* diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 0356d7f05b..bd3adf67f8 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -601,6 +601,8 @@ typedef struct zpool_rewind_policy { /* * This is needed in userland to report the minimum necessary device size. + * + * Note that the zfs test suite uses 64MB vdevs. */ #define SPA_MINDEVSIZE (64ULL << 20)
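Finally, the new SPA_COMPRESSBITS constant earlier in this diff just names the width of the compression bitfield that BP_GET_COMPRESS/BP_SET_COMPRESS carve out of blk_prop at bit 32. The sketch below uses simplified stand-ins for the BF64_GET/BF64_SET macros (the real versions live in sys/spa.h) and an arbitrary example value of 15:

#include <assert.h>
#include <stdint.h>

#define SPA_COMPRESSBITS	7

/* Simplified stand-ins for illumos's BF64_GET/BF64_SET. */
#define BF64_GET(x, low, len) \
	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define BF64_SET(x, low, len, val) \
	((x) = ((x) & ~(((1ULL << (len)) - 1) << (low))) | \
	((uint64_t)(val) << (low)))

int
main(void)
{
	uint64_t blk_prop = 0;

	/* The compression function is a 7-bit field at bit 32. */
	BF64_SET(blk_prop, 32, SPA_COMPRESSBITS, 15);
	assert(BF64_GET(blk_prop, 32, SPA_COMPRESSBITS) == 15);
	return (0);
}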