diff options
author | George Amanakis <gamanakis@gmail.com> | 2020-07-30 18:40:44 -0500 |
---|---|---|
committer | Jason King <jason.king@joyent.com> | 2020-10-16 11:10:02 -0500 |
commit | f0a052391861a2b96cf28973c3b7f2854591aa79 (patch) | |
tree | 653d2330669b465bac1ab1c55b7e24c018cddc8a | |
parent | 6218f28969018904255fddf306e6489c7ae28bba (diff) | |
download | illumos-joyent-f0a052391861a2b96cf28973c3b7f2854591aa79.tar.gz |
3525 Persistent L2ARC
Portions contributed by: Saso Kiselkov <skiselkov@gmail.com>
Portions contributed by: Jorgen Lundman <lundman@lundman.net>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: Alexander Motin <mav@FreeBSD.org>
Portions contributed by: Jason King <jason.king@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Dan McDonald <danmcd@joyent.com>
28 files changed, 3928 insertions, 632 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index bbd637c49f..24fc8b9e76 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -58,8 +58,10 @@ #include <sys/dmu_traverse.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> +#include <zfs_fletcher.h> #include <sys/zfs_fuid.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/ddt.h> #include <sys/zfeature.h> #include <sys/abd.h> @@ -73,7 +75,6 @@ #include <libnvpair.h> #include <libzutil.h> -#include <zfs_fletcher.h> #include "zdb.h" @@ -2631,6 +2632,265 @@ dump_cachefile(const char *cachefile) nvlist_free(config); } +static void +print_l2arc_header(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device header\n"); + (void) printf("------------------------------------\n"); +} + +static void +print_l2arc_log_blocks(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device log blocks\n"); + (void) printf("------------------------------------\n"); +} + +static void +dump_l2arc_log_entries(uint64_t log_entries, + l2arc_log_ent_phys_t *le, uint64_t i) +{ + for (uint64_t j = 0; j < log_entries; j++) { + dva_t dva = le[j].le_dva; + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, + (u_longlong_t)DVA_GET_ASIZE(&dva), + (u_longlong_t)DVA_GET_VDEV(&dva), + (u_longlong_t)DVA_GET_OFFSET(&dva)); + (void) printf("|\t\t\t\tbirth: %llu\n", + (u_longlong_t)le[j].le_birth); + (void) printf("|\t\t\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tpsize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcompr: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); + (void) printf("|\t\t\t\ttype: %llu\n", + (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprotected: %llu\n", + 
(u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprefetch: %llu\n", + (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); + (void) printf("|\t\t\t\taddress: %llu\n", + (u_longlong_t)le[j].le_daddr); + (void) printf("|\n"); + } + (void) printf("\n"); +} + +static void +dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) +{ + (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr); + (void) printf("|\t\tpayload_asize: %llu\n", + (u_longlong_t)lbps.lbp_payload_asize); + (void) printf("|\t\tpayload_start: %llu\n", + (u_longlong_t)lbps.lbp_payload_start); + (void) printf("|\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tasize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tcompralgo: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); + (void) printf("|\t\tcksumalgo: %llu\n", + (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop)); + (void) printf("|\n\n"); +} + +static void +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) +{ + l2arc_log_blk_phys_t this_lb; + uint64_t asize; + l2arc_log_blkptr_t lbps[2]; + abd_t *abd; + zio_cksum_t cksum; + int failed = 0; + l2arc_dev_t dev; + + if (!dump_opt['q']) + print_l2arc_log_blocks(); + bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); + + dev.l2ad_evict = l2dhdr.dh_evict; + dev.l2ad_start = l2dhdr.dh_start; + dev.l2ad_end = l2dhdr.dh_end; + + if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { + /* no log blocks to read */ + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } + return; + } else { + dev.l2ad_hand = lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + } + + dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + for (;;) { + if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) + break; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = 
L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != + (ssize_t)asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } + break; + } + + fletcher_4_native(&this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { + failed++; + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } + break; + } + + switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); + zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &this_lb, + asize, sizeof (this_lb)); + abd_free(abd); + break; + default: + break; + } + + if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(&this_lb, sizeof (this_lb)); + if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); + break; + } + + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, + (u_longlong_t)this_lb.lb_magic); + dump_l2arc_log_blkptr(lbps[0]); + } + + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); + + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) + break; + + lbps[0] = lbps[1]; + lbps[1] = this_lb.lb_prev_lbp; + } + + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } +} + +static int +dump_l2arc_header(int fd) +{ + 
l2arc_dev_hdr_phys_t l2dhdr, rebuild; + int error = B_FALSE; + + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, sizeof (rebuild)); + + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), + VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { + error = B_TRUE; + } else { + if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); + + if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) + error = B_TRUE; + } + + if (error) { + (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); + } else if (!dump_opt['q']) { + print_l2arc_header(); + + (void) printf(" magic: %llu\n", + (u_longlong_t)l2dhdr.dh_magic); + (void) printf(" version: %llu\n", + (u_longlong_t)l2dhdr.dh_version); + (void) printf(" pool_guid: %llu\n", + (u_longlong_t)l2dhdr.dh_spa_guid); + (void) printf(" flags: %llu\n", + (u_longlong_t)l2dhdr.dh_flags); + (void) printf(" start_lbps[0]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[0].lbp_daddr); + (void) printf(" start_lbps[1]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[1].lbp_daddr); + (void) printf(" log_blk_ent: %llu\n", + (u_longlong_t)l2dhdr.dh_log_entries); + (void) printf(" start: %llu\n", + (u_longlong_t)l2dhdr.dh_start); + (void) printf(" end: %llu\n", + (u_longlong_t)l2dhdr.dh_end); + (void) printf(" evict: %llu\n", + (u_longlong_t)l2dhdr.dh_evict); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n\n", + (u_longlong_t)l2dhdr.dh_lb_count); + } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. 
Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. + */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); +} + static char curpath[PATH_MAX]; /* @@ -2860,7 +3120,6 @@ dump_config_from_label(zdb_label_t *label, size_t buflen, int l) static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { - vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; @@ -2896,10 +3155,11 @@ dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS]; - uint64_t psize, ashift; + uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; + boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; @@ -2989,6 +3249,15 @@ dump_label(const char *dev) ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; + /* If the device is a cache device read the header. */ + if (!read_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + read_l2arc_header = B_TRUE; + } + } + if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; @@ -3035,12 +3304,19 @@ dump_label(const char *dev) if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } + if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } + /* + * Dump the L2ARC header, if existent.
+ */ + if (read_l2arc_header) + error |= dump_l2arc_header(fd); + cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); @@ -3061,9 +3337,8 @@ dump_label(const char *dev) static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; -/*ARGSUSED*/ static int -dump_one_dir(const char *dsname, void *arg) +dump_one_dir(const char *dsname, void *arg __unused) { int error; objset_t *os; diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c index dc15aca0c0..257cd5e59e 100644 --- a/usr/src/lib/libzfs/common/libzfs_import.c +++ b/usr/src/lib/libzfs/common/libzfs_import.c @@ -60,6 +60,7 @@ #include <sys/vdev_impl.h> #include <libzutil.h> +#include <sys/arc_impl.h> #include "libzfs.h" #include "libzfs_impl.h" @@ -168,8 +169,10 @@ zpool_clear_label(int fd) struct stat64 statbuf; int l; vdev_label_t *label; + l2arc_dev_hdr_phys_t *l2dhdr; uint64_t size; - int labels_cleared = 0; + int labels_cleared = 0, header_cleared = 0; + boolean_t clear_l2arc_header = B_FALSE; if (fstat64(fd, &statbuf) == -1) return (0); @@ -179,8 +182,13 @@ zpool_clear_label(int fd) if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL) return (-1); + if ((l2dhdr = calloc(1, sizeof (l2arc_dev_hdr_phys_t))) == NULL) { + free(label); + return (-1); + } + for (l = 0; l < VDEV_LABELS; l++) { - uint64_t state, guid; + uint64_t state, guid, l2cache; nvlist_t *config; if (pread64(fd, label, sizeof (vdev_label_t), @@ -207,6 +215,15 @@ zpool_clear_label(int fd) continue; } + /* If the device is a cache device clear the header. */ + if (!clear_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + clear_l2arc_header = B_TRUE; + } + } + nvlist_free(config); /* @@ -224,7 +241,17 @@ zpool_clear_label(int fd) } } + /* Clear the L2ARC header. 
*/ + if (clear_l2arc_header) { + memset(l2dhdr, 0, sizeof (l2arc_dev_hdr_phys_t)); + if (pwrite64(fd, l2dhdr, sizeof (l2arc_dev_hdr_phys_t), + VDEV_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) { + header_cleared++; + } + } + free(label); + free(l2dhdr); if (labels_cleared == 0) return (-1); diff --git a/usr/src/man/man1m/zdb.1m b/usr/src/man/man1m/zdb.1m index 9720024f90..3ebaf049cb 100644 --- a/usr/src/man/man1m/zdb.1m +++ b/usr/src/man/man1m/zdb.1m @@ -174,16 +174,28 @@ If specified multiple times, display counts of each intent log transaction type. Examine the checkpointed state of the pool. Note, the on disk format of the pool is not reverted to the checkpointed state. .It Fl l Ar device -Read the vdev labels from the specified device and dump the unique configuration -nvlists(t). +Read the vdev labels and L2ARC header from the specified device. .Nm Fl l -will return 1 if an error occurred, 2 if no configuration nvlist could be -unpacked (errors or not), and 0 otherwise. -Specify multiple times to increase verbosity. +will return 0 if valid label was found, 1 if error occurred, and 2 if no valid +labels were found. +The presence of L2ARC header is indicated by a specific +sequence (L2ARC_DEV_HDR_MAGIC). +If there is an accounting error in the size or the number of L2ARC log blocks +.Nm Fl l +will return 1. +Each unique configuration is displayed only once. +.It Fl ll Ar device +In addition display label space usage stats. +If a valid L2ARC header was found also display the properties of log blocks +used for restoring L2ARC contents (persistent L2ARC). +.It Fl lll Ar device +Display every configuration, unique or not. +If a valid L2ARC header was found also display the properties of log entries in +log blocks used for restoring L2ARC contents (persistent L2ARC). .Pp If the .Fl q -option is also specified, don't dump the configurations or the uberblocks. +option is also specified, don't print the labels or the L2ARC header. 
.Pp If the .Fl u diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 8174cd0af3..5b9783052c 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -137,6 +137,7 @@ dir path=opt/zfs-tests/tests/functional/nestedfs dir path=opt/zfs-tests/tests/functional/no_space dir path=opt/zfs-tests/tests/functional/nopwrite dir path=opt/zfs-tests/tests/functional/online_offline +dir path=opt/zfs-tests/tests/functional/persist_l2arc dir path=opt/zfs-tests/tests/functional/pool_checkpoint dir path=opt/zfs-tests/tests/functional/pool_names dir path=opt/zfs-tests/tests/functional/poolversion @@ -408,6 +409,7 @@ file path=opt/zfs-tests/tests/functional/cache/cache_008_neg mode=0555 file path=opt/zfs-tests/tests/functional/cache/cache_009_pos mode=0555 file path=opt/zfs-tests/tests/functional/cache/cache_010_neg mode=0555 file path=opt/zfs-tests/tests/functional/cache/cache_011_pos mode=0555 +file path=opt/zfs-tests/tests/functional/cache/cache_012_pos mode=0555 file path=opt/zfs-tests/tests/functional/cache/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/cache/setup mode=0555 file path=opt/zfs-tests/tests/functional/cachefile/cachefile.cfg mode=0444 @@ -2695,6 +2697,26 @@ file path=opt/zfs-tests/tests/functional/online_offline/online_offline_002_neg \ file path=opt/zfs-tests/tests/functional/online_offline/online_offline_003_neg \ mode=0555 file path=opt/zfs-tests/tests/functional/online_offline/setup mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/cleanup mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg \ + mode=0444 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg \ + mode=0555 +file 
path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos \ + mode=0555 +file path=opt/zfs-tests/tests/functional/persist_l2arc/setup mode=0555 file \ path=opt/zfs-tests/tests/functional/pool_checkpoint/checkpoint_after_rewind \ mode=0555 diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index 2edf9123ab..2806f31027 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -2906,3 +2906,17 @@ function sha256digest fi return 0 } + +function get_arcstat # stat +{ + if is_linux; then + typeset stat=$1 + typeset zfs_arcstats="/proc/spl/kstat/zfs/arcstats" + [[ -f "$zfs_arcstats" ]] || return 1 + grep $stat $zfs_arcstats | awk '{print $3}' + return $? + else + kstat -p zfs::arcstats:$1 | awk '{ print $2 }' + return $? 
+ fi +} diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index 4c52926b16..6db4f62e66 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -74,7 +74,7 @@ tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', [/opt/zfs-tests/tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', - 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] + 'cache_009_pos', 'cache_010_neg', 'cache_011_pos', 'cache_012_pos'] [/opt/zfs-tests/tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', @@ -702,6 +702,12 @@ tests = [ 'userspace_001_pos', 'userspace_002_pos', 'userspace_003_pos', 'groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos' ] +[/opt/zfs-tests/tests/functional/persist_l2arc] +tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', + 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] +tags = ['functional', 'persist_l2arc'] + [/opt/zfs-tests/tests/functional/utils_test] tests = ['utils_test_001_pos', 'utils_test_002_pos', 'utils_test_003_pos', 'utils_test_004_pos', 'utils_test_005_pos', 'utils_test_006_pos', diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 27697582ef..426b215a60 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -74,7 +74,7 @@ tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', [/opt/zfs-tests/tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', - 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] + 'cache_009_pos', 
'cache_010_neg', 'cache_011_pos', 'cache_012_pos'] [/opt/zfs-tests/tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', @@ -702,6 +702,12 @@ tests = [ 'userspace_001_pos', 'userspace_002_pos', 'userspace_003_pos', 'groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos' ] +[/opt/zfs-tests/tests/functional/persist_l2arc] +tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', + 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] +tags = ['functional', 'persist_l2arc'] + [/opt/zfs-tests/tests/functional/utils_test] tests = ['utils_test_001_pos', 'utils_test_002_pos', 'utils_test_003_pos', 'utils_test_004_pos', 'utils_test_005_pos', 'utils_test_006_pos', diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index 30e8efc0d3..375e894870 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -45,7 +45,7 @@ tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', [/opt/zfs-tests/tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', - 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] + 'cache_009_pos', 'cache_010_neg', 'cache_011_pos', 'cache_012_pos'] [/opt/zfs-tests/tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', @@ -523,6 +523,12 @@ tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_005_pos', 'refreserv_raidz', 'refreserv_multi_raidz'] +[/opt/zfs-tests/tests/functional/persist_l2arc] +tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', + 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] +tags = 
['functional', 'persist_l2arc'] + [/opt/zfs-tests/tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] diff --git a/usr/src/test/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cache/cache_012_pos.ksh new file mode 100755 index 0000000000..66940b798f --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -0,0 +1,110 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/cache/cache.cfg +. $STF_SUITE/tests/functional/cache/cache.kshlib + +# +# DESCRIPTION: +# Looping around a cache device with l2arc_write_size exceeding +# the device size succeeds. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Set l2arc_write_max to a value larger than the cache device. +# 3. Create a file larger than the cache device and random read +# for 10 sec. +# 4. Verify that l2arc_write_max is set back to the default. +# 5. Set l2arc_write_max to a value less than the cache device size but +# larger than the default (64MB). +# 6. Record the l2_size. +# 7. Random read for 1 sec. +# 8. Record the l2_size again. +# 9. If (6) <= (8) then we have not looped around yet. +# 10. If (6) > (8) then we looped around. Break out of the loop and test. +# 11. Destroy pool. +# + +verify_runnable "global" + +log_assert "Looping around a cache device succeeds." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_write_max $write_max + log_must set_tunable32 l2arc_noprefetch $noprefetch +} +log_onexit cleanup + +typeset write_max=$(get_tunable l2arc_write_max) +typeset noprefetch=$(get_tunable l2arc_noprefetch) +log_must set_tunable32 l2arc_noprefetch 0 + +typeset VDEV="$VDIR/vdev.disk" +typeset VDEV_SZ=$(( 4 * 1024 * 1024 * 1024 )) +typeset VCACHE="$VDIR/vdev.cache" +typeset VCACHE_SZ=$(( $VDEV_SZ / 2 )) + +typeset fill_mb=$(( floor($VDEV_SZ * 3 / 4 ) )) +export DIRECTORY=/$TESTPOOL +export NUMJOBS=4 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) )) + +log_must set_tunable32 l2arc_write_max $(( $VCACHE_SZ * 2 )) + +log_must truncate -s $VCACHE_SZ $VCACHE +log_must truncate -s $VDEV_SZ $VDEV + +log_must zpool create -f $TESTPOOL $VDEV cache $VCACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +typeset write_max2=$(get_tunable l2arc_write_max) + +log_must test $write_max2 -eq $write_max + +log_must set_tunable32 l2arc_write_max $(( 64 * 1024 * 1024 )) +export RUNTIME=1 + +typeset do_once=true +while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do + typeset l2_size1=$(get_arcstat l2_size) + log_must fio $FIO_SCRIPTS/random_reads.fio + typeset l2_size2=$(get_arcstat l2_size) + do_once=false +done + +log_must test $l2_size1 -gt $l2_size2 + +log_must zpool destroy $TESTPOOL + +log_pass "Looping around a cache device succeeds." 
diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/Makefile b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/Makefile new file mode 100644 index 0000000000..f8b7917182 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Joyent, Inc. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/persist_l2arc + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh new file mode 100755 index 0000000000..828de38625 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +verify_runnable "global" + +if datasetexists $TESTPOOL ; then + log_must zpool destroy -f $TESTPOOL +fi + +log_must rm -rf $VDIR + +log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg new file mode 100644 index 0000000000..60bb246376 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +export SIZE=1G +export VDIR=$TESTDIR/disk.persist_l2arc +export VDEV="$VDIR/a" +export VDEV_CACHE="$VDIR/b" + +# fio options +export DIRECTORY=/$TESTPOOL +export NUMJOBS=4 +export RUNTIME=30 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh new file mode 100755 index 0000000000..f69ead3753 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Persistent L2ARC with an unencrypted ZFS file system succeeds +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Export and re-import pool without writing any data. +# 3. Create a random file in that pool and random read for 30 sec. +# 4. Export pool. +# 5. Read the amount of log blocks written from the header of the +# L2ARC device. +# 6. Import pool. +# 7. Read the amount of log blocks rebuilt in arcstats and compare to +# (5). +# 8. Check if the labels of the L2ARC device are intact. +# +# * We can predict the minimum bytes of L2ARC restored if we subtract +# from the effective size of the cache device the bytes l2arc_evict() +# evicts: +# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize +# wr_sz: l2arc_write_max + l2arc_write_boost (worst case) +# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) * +# sizeof (l2arc_log_blk_phys_t) +# min restored size: l2 - (wr_sz + blk_overhead) +# + +verify_runnable "global" + +log_assert "Persistent L2ARC with an unencrypted ZFS file system succeeds."
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch + log_must set_tunable32 l2arc_rebuild_blocks_min_l2size \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +typeset rebuild_blocks_min_l2size=$(get_tunable l2arc_rebuild_blocks_min_l2size) +log_must set_tunable32 l2arc_noprefetch 0 +log_must set_tunable32 l2arc_rebuild_blocks_min_l2size 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must zpool export $TESTPOOL +log_must zpool import -d $VDIR $TESTPOOL + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL + +sleep 2 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zdb -lll $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC with an unencrypted ZFS file system succeeds." 
diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh new file mode 100755 index 0000000000..79cefd8af4 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# Persistent L2ARC with an encrypted ZFS file system succeeds +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create an encrypted ZFS file system. +# 3. Create a random file in the encrypted file system and random +# read for 30 sec. +# 4. Export pool. +# 5. Read the amount of log blocks written from the header of the +# L2ARC device. +# 6. Import pool. +# 7. Mount the encrypted ZFS file system. +# 8. Read the amount of log blocks rebuilt in arcstats and compare to +# (5). +# 9. Check if the labels of the L2ARC device are intact. 
+# +# * We can predict the minimum bytes of L2ARC restored if we subtract +# from the effective size of the cache device the bytes l2arc_evict() +# evicts: +# l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize +# wr_sz: l2arc_write_max + l2arc_write_boost (worst case) +# blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) * +# sizeof (l2arc_log_blk_phys_t) +# min restored size: l2 - (wr_sz + blk_overhead) +# + +verify_runnable "global" + +log_assert "Persistent L2ARC with an encrypted ZFS file system succeeds." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch + log_must set_tunable32 l2arc_rebuild_blocks_min_l2size \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +typeset rebuild_blocks_min_l2size=$(get_tunable l2arc_rebuild_blocks_min_l2size) +log_must set_tunable32 l2arc_noprefetch 0 +log_must set_tunable32 l2arc_rebuild_blocks_min_l2size 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase $TESTPOOL/$TESTFS1" + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +sleep 2 + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL +log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" + +sleep 2 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - 
$l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC with an encrypted ZFS file system succeeds." diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh new file mode 100755 index 0000000000..7fe3d9ca21 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Persistent L2ARC fails as expected when l2arc_rebuild_enabled = 0 +# +# STRATEGY: +# 1. Set l2arc_rebuild_enabled = 0 +# 2. Create pool with a cache device. +# 3. Create a random file in that pool and random read for 30 sec. +# 4. Export pool. +# 5. Import pool. +# 6. Check in zpool iostat if the cache device has space allocated. +# 7. Read the file written in (3) and check if l2_hits in +# /proc/spl/kstat/zfs/arcstats increased. +# + +verify_runnable "global" + +log_assert "Persistent L2ARC fails as expected when l2arc_rebuild_enabled = 0." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_rebuild_enabled $rebuild_enabled + log_must set_tunable32 l2arc_noprefetch $noprefetch +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +log_must set_tunable32 l2arc_noprefetch 0 + +# disable L2ARC rebuild +typeset rebuild_enabled=$(get_tunable l2arc_rebuild_enabled) +log_must set_tunable32 l2arc_rebuild_enabled 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 2 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +typeset l2_success_start=$(get_arcstat l2_rebuild_success) + +log_must zpool import -d $VDIR $TESTPOOL +log_mustnot test "$(zpool iostat -Hpv $TESTPOOL $VDEV_CACHE | awk '{print $2}')" -gt 80000000 + +typeset l2_success_end=$(get_arcstat l2_rebuild_success) + +log_mustnot test $l2_success_end -gt $l2_success_start + +log_must zpool destroy -f $TESTPOOL +log_must set_tunable32 l2arc_rebuild_enabled $rebuild_enabled + +log_pass "Persistent L2ARC fails as expected when l2arc_rebuild_enabled = 0." diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh new file mode 100755 index 0000000000..b0529dccae --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Persistent L2ARC restores all written log blocks +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool, smaller than the cache device +# and random read for 30 sec. +# 3. Export pool. +# 4. Read amount of log blocks written. +# 5. Import pool. +# 6. Read amount of log blocks built. +# 7. Compare the two amounts +# 8. Read the file written in (2) and check if l2_hits in +# /proc/spl/kstat/zfs/arcstats increased. +# 9. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Persistent L2ARC restores all written log blocks." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +log_must set_tunable32 l2arc_noprefetch 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 2 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +typeset log_blk_start=$(get_arcstat l2_log_blk_writes) + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +sleep 2 + +typeset log_blk_end=$(get_arcstat l2_log_blk_writes) + +typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL + +typeset l2_hits_start=$(get_arcstat l2_hits) + +export RUNTIME=10 +log_must fio $FIO_SCRIPTS/random_reads.fio + +typeset l2_hits_end=$(get_arcstat l2_hits) + +typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \ + $(( $log_blk_end - $log_blk_start )) + +log_must test $l2_hits_end -gt $l2_hits_start + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC restores all written log blocks." diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh new file mode 100755 index 0000000000..4a9a8a114c --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh @@ -0,0 +1,108 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# Persistent L2ARC restores all written log blocks with encryption +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create an encrypted ZFS file system. +# 3. Create a random file in the encrypted file system, +# smaller than the cache device, and random read for 30 sec. +# 4. Export pool. +# 5. Read amount of log blocks written. +# 6. Import pool. +# 7. Mount the encrypted ZFS file system. +# 8. Read amount of log blocks built. +# 9. Compare the two amounts +# 10. Read the file written in (3) and check if l2_hits in +# /proc/spl/kstat/zfs/arcstats increased. +# 11. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Persistent L2ARC restores all written log blocks with encryption." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +log_must set_tunable32 l2arc_noprefetch 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 2 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +typeset log_blk_start=$(get_arcstat l2_log_blk_writes) + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase $TESTPOOL/$TESTFS1" + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL + +sleep 2 + +typeset log_blk_end=$(get_arcstat l2_log_blk_writes) + +typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL +log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" + +typeset l2_hits_start=$(get_arcstat l2_hits) + +export RUNTIME=10 +log_must fio $FIO_SCRIPTS/random_reads.fio + +typeset l2_hits_end=$(get_arcstat l2_hits) + +typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \ + $(( $log_blk_end - $log_blk_start )) + +log_must test $l2_hits_end -gt $l2_hits_start + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Persistent L2ARC restores all written log blocks with encryption." 
diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh new file mode 100755 index 0000000000..b7de5050c0 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not +# present. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool and random read for 30 sec. +# 3. Read the amount of log blocks written from the header of the +# L2ARC device. +# 4. Offline the L2ARC device and export pool. +# 5. Import pool and online the L2ARC device. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to +# (3). +# 7. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch + log_must set_tunable32 l2arc_rebuild_blocks_min_l2size \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +typeset rebuild_blocks_min_l2size=$(get_tunable l2arc_rebuild_blocks_min_l2size) +log_must set_tunable32 l2arc_noprefetch 0 +log_must set_tunable32 l2arc_rebuild_blocks_min_l2size 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool offline $TESTPOOL $VDEV_CACHE +log_must zpool export $TESTPOOL + +sleep 5 + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +log_must zpool import -d $VDIR $TESTPOOL +log_must zpool online $TESTPOOL $VDEV_CACHE + +sleep 5 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present." 
diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh new file mode 100755 index 0000000000..3c28d7a5fb --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool and random read for 30 sec. +# 3. Read the amount of log blocks written from the header of the +# L2ARC device. +# 4. Offline the L2ARC device. +# 5. Online the L2ARC device. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to +# (3). +# 7. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present." 
+ +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch + log_must set_tunable32 l2arc_rebuild_blocks_min_l2size \ + $rebuild_blocks_min_l2size +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +typeset rebuild_blocks_min_l2size=$(get_tunable l2arc_rebuild_blocks_min_l2size) +log_must set_tunable32 l2arc_noprefetch 0 +log_must set_tunable32 l2arc_rebuild_blocks_min_l2size 0 + +typeset fill_mb=800 +typeset cache_sz=$(( floor($fill_mb / 2) )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool offline $TESTPOOL $VDEV_CACHE + +sleep 10 + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +log_must zpool online $TESTPOOL $VDEV_CACHE + +sleep 10 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -gt 0 + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev present." 
diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh new file mode 100755 index 0000000000..c94b7ad9fe --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh @@ -0,0 +1,143 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +# +# DESCRIPTION: +# Off/onlining an L2ARC device restores all written blocks, vdev present. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool and random read for 30 sec. +# 3. Read the amount of log blocks written from the header of the +# L2ARC device. +# 4. Offline the L2ARC device. +# 5. Online the L2ARC device. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to +# (3). +# 7. Create another random file in that pool and random read for 30 sec. +# 8. Read the amount of log blocks written from the header of the +# L2ARC device. +# 9. Offline the L2ARC device. +# 10. Online the L2ARC device. +# 11. Read the amount of log blocks rebuilt in arcstats and compare to +# (8). +# 12. Check if the amount of log blocks on the cache device has +# increased. +# 13. Export the pool. +# 14. Read the amount of log blocks on the cache device. +# 15. Import the pool. +# 16. Read the amount of log blocks rebuilt in arcstats and compare to +# (14). 
+# 17. Check if the labels of the L2ARC device are intact. +# + +verify_runnable "global" + +log_assert "Off/onlining an L2ARC device restores all written blocks , vdev present." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 l2arc_noprefetch $noprefetch +} +log_onexit cleanup + +# l2arc_noprefetch is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable l2arc_noprefetch) +log_must set_tunable32 l2arc_noprefetch 0 + +typeset fill_mb=400 +typeset cache_sz=$(( 3 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool offline $TESTPOOL $VDEV_CACHE + +sleep 2 + +typeset l2_dh_log_blk1=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool online $TESTPOOL $VDEV_CACHE + +sleep 5 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk1 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk1 -gt 0 + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool offline $TESTPOOL $VDEV_CACHE + +sleep 2 + +typeset l2_dh_log_blk2=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print $2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool online $TESTPOOL $VDEV_CACHE + +sleep 5 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk2 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) + +log_must test $l2_dh_log_blk2 -gt $l2_dh_log_blk1 + +log_must zpool export $TESTPOOL + +typeset l2_dh_log_blk3=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ + awk '{print 
$2}') + +typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) + +log_must zpool import -d $VDIR $TESTPOOL + +sleep 5 + +typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) + +log_must test $l2_dh_log_blk3 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk3 -gt 0 + +log_must zdb -lq $VDEV_CACHE + +log_must zpool destroy -f $TESTPOOL + +log_pass "Off/onlining an L2ARC device restores all written blocks, vdev present." diff --git a/usr/src/test/zfs-tests/tests/functional/persist_l2arc/setup.ksh b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/setup.ksh new file mode 100755 index 0000000000..ef95c84cdd --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/persist_l2arc/setup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg + +verify_runnable "global" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must mkfile $SIZE $VDEV + +log_pass diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 48ae3aa829..9a962b420e 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -24,6 +24,8 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2020, George Amanakis. 
All rights reserved. */ /* @@ -293,6 +295,7 @@ #include <sys/kstat.h> #include <sys/zthr.h> #include <zfs_fletcher.h> +#include <sys/arc_impl.h> #include <sys/aggsum.h> #include <sys/cityhash.h> #include <sys/param.h> @@ -407,54 +410,6 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ boolean_t zfs_compressed_arc_enabled = B_TRUE; -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. 
- */ - -typedef struct arc_state { - /* - * list of evictable buffers - */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; - /* - * total amount of evictable data in this state - */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; - /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. - */ - zfs_refcount_t arcs_size; -} arc_state_t; - /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -463,263 +418,7 @@ static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. 
- */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach its target amount. - */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). - */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). 
- * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by various buffers and structures - * not actually backed with ARC buffers. This includes bonus - * buffers (allocated directly via zio_buf_* functions), - * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t - * cache), and dnode_t structures (allocated via dnode_t cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_other_size; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_memory_throttle_count; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { +arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, @@ -795,6 +494,22 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, @@ -805,14 +520,6 @@ static arc_stats_t arc_stats = { { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && 
\ @@ -843,6 +550,24 @@ static arc_stats_t arc_stats = { } \ } +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; @@ -852,29 +577,6 @@ static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. 
- */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. 
We want to use aggsum_t's for these @@ -895,182 +597,6 @@ static hrtime_t arc_growtime; static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_encrypted; - boolean_t acb_compressed; - boolean_t acb_noauth; - zbookmark_phys_t acb_zb; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the fields necessary for an L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. 
arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -/* - * Encrypted blocks will need to be stored encrypted on the L2ARC - * disk as they appear in the main pool. In order for this to work we - * need to pass around the encryption parameters so they can be used - * to write data to the L2ARC. This struct is only defined in the - * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED - * flag set. - */ -typedef struct arc_buf_hdr_crypt { - abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* number or encryped buffers */ - - /* dsobj for looking up encryption key for l2arc encryption */ - uint64_t b_dsobj; /* for looking up key */ - - /* encryption parameters */ - uint8_t b_salt[ZIO_DATA_SALT_LEN]; - uint8_t b_iv[ZIO_DATA_IV_LEN]; - - /* - * Technically this could be removed since we will always be able to - * get the mac from the bp when we need it. However, it is inconvenient - * for callers of arc code to have to pass a bp in all the time. This - * also allows us to assert that L2ARC data is properly encrypted to - * match the data in the main storage pool. 
- */ - uint8_t b_mac[ZIO_DATA_MAC_LEN]; -} arc_buf_hdr_crypt_t; - -typedef struct l2arc_dev l2arc_dev_t; - -typedef struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - - list_node_t b_l2node; -} l2arc_buf_hdr_t; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - - arc_buf_contents_t b_type; - arc_buf_hdr_t *b_hash_next; - arc_flags_t b_flags; - - /* - * This field stores the size of the data buffer after - * compression, and is set in the arc's zio completion handlers. - * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). - * - * While the block pointers can store up to 32MB in their psize - * field, we can only store up to 32MB minus 512B. This is due - * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. - * a field of zeros represents 512B in the bp). We can't use a - * bias of 1 since we need to reserve a psize of zero, here, to - * represent holes and embedded blocks. - * - * This isn't a problem in practice, since the maximum size of a - * buffer is limited to 16MB, so we never need to store 32MB in - * this field. Even in the upstream illumos code base, the - * maximum size of a buffer is limited to 16MB. - */ - uint16_t b_psize; - - /* - * This field stores the size of the data buffer before - * compression, and cannot change once set. It is in units - * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) - */ - uint16_t b_lsize; /* immutable */ - uint64_t b_spa; /* immutable */ - - /* L2ARC fields. Undefined when not in L2ARC. */ - l2arc_buf_hdr_t b_l2hdr; - /* L1ARC fields. Undefined when in l2arc_only state */ - l1arc_buf_hdr_t b_l1hdr; - /* - * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED - * is set and the L1 header exists. 
- */ - arc_buf_hdr_crypt_t b_crypt_hdr; -}; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -1192,20 +718,6 @@ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ -struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -}; - static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ @@ -1223,11 +735,6 @@ typedef struct l2arc_read_callback { abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; @@ -1240,6 +747,9 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ @@ -1259,6 +769,7 @@ static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t 
arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -1297,6 +808,9 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY_OR_LOCKED(hdr) \ + (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) + #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ @@ -1725,8 +1239,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon || - MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { @@ -2010,14 +1523,14 @@ arc_buf_freeze(arc_buf_t *buf) static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } @@ -2031,7 +1544,7 @@ arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so @@ -2124,7 +1637,7 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); @@ -2194,7 +1707,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const 
zbookmark_phys_t *zb) boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_pabd(hdr, B_FALSE); @@ -2314,7 +1827,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, @@ -2634,7 +2147,7 @@ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { + if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -3039,7 +2552,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually @@ -3159,6 +2672,58 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, return (buf); } +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. 
In this case do not write log blocks in L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. */ +static int l2arc_dev_hdr_read(l2arc_dev_t *dev); +static int l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io); +static zio_t *l2arc_log_blk_fetch(vdev_t *vd, + const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); +static void l2arc_log_blk_fetch_abort(zio_t *zio); + +/* L2ARC persistence block restoration routines. */ +static void l2arc_log_blk_restore(l2arc_dev_t *dev, + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, + l2arc_dev_t *dev); + +/* L2ARC persistence write I/O routines. */ +static void l2arc_dev_hdr_update(l2arc_dev_t *dev); +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + +/* L2ARC persistence auxiliary routines. */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, + const arc_buf_hdr_t *ab); +boolean_t l2arc_range_check_overlap(uint64_t bottom, + uint64_t top, uint64_t check); +static void l2arc_blk_fetch_done(zio_t *zio); +static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * Return a loaned arc buffer to the arc. 
@@ -3247,7 +2812,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the @@ -3280,7 +2845,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need @@ -3315,7 +2880,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_buf_t *lastbuf = NULL; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Remove the buf from the hdr list and locate the last @@ -3363,7 +2928,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -3841,7 +3406,6 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3852,6 +3416,42 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) } /* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. 
+ */ +arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, boolean_t protected, boolean_t prefetch) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + + return (hdr); +} + +/* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ @@ -3866,7 +3466,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3907,7 +3506,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, type, B_TRUE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -3966,9 +3564,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -3992,6 +3587,15 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } + /* 
+ * The header's identity can only be safely discarded once it is no + * longer discoverable. This requires removing it from the hash table + * and the l2arc header list. After this point the hash lock can not + * be used to protect the header. + */ + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); @@ -4005,9 +3609,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } #endif - if (hdr->b_l1hdr.b_pabd != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_pabd(hdr, B_FALSE); - } if (HDR_HAS_RABD(hdr)) arc_hdr_free_pabd(hdr, B_TRUE); @@ -4032,7 +3635,6 @@ void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); @@ -4042,7 +3644,9 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) return; } + kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); @@ -6886,8 +6490,8 @@ arc_write_done(zio_t *zio) ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); arc_hdr_destroy(exists); + mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { @@ -7659,6 +7263,103 @@ arc_fini(void) * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistence: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. 
+ * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) When writing to the L2ARC, we occasionally write a "l2arc log block", + * which is an additional piece of metadata which describes what's been + * written. This allows us to rebuild the arc_buf_hdr_t structures of the + * main ARC buffers. There are 2 linked-lists of log blocks headed by + * dh_start_lbps[2]. We alternate which chain we append to, so they are + * time-wise and offset-wise interleaved, but that is an optimization rather + * than for correctness. The log block also includes a pointer to the + * previous block in its chain. + * + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device + * for our header bookkeeping purposes. This contains a device header, + * which contains our top-level reference structures. We update it each + * time we write a new log block, so that we're able to locate it in the + * L2ARC device. If this write results in an inconsistent device header + * (e.g. due to power failure), we detect this by verifying the header's + * checksum and simply fail to reconstruct the L2ARC after reboot. + * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | ___two newest log block pointers__.__________ | + * | / \dh_start_lbps[1] | + * | / \ \dh_start_lbps[0]| + * |.___/__. V V | + * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| + * || hdr| ^ /^ /^ / / | + * |+------+ ...--\-------/ \-----/--\------/ / | + * | \--------------/ \--------------/ | + * +======================================================================+ + * + * As can be seen on the diagram, rather than using a simple linked list, + * we use a pair of linked lists with alternating elements. 
This is a + * performance enhancement due to the fact that we only find out the + * address of the next log block access once the current block has been + * completely read in. Obviously, this hurts performance, because we'd be + * keeping the device's I/O queue at only a 1 operation deep, thus + * incurring a large amount of I/O round-trip latency. Having two lists + * allows us to fetch two log blocks ahead of where we are currently + * rebuilding L2ARC buffers. + * + * On-device data structures: + * + * L2ARC device header: l2arc_dev_hdr_phys_t + * L2ARC log block: l2arc_log_blk_phys_t + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (writing + * a new log block every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed log block (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> + * ^ ^^^^^^^^^___________________________________ + * | \ + * <<nextwrite>> may overwrite this blk and/or its bufs --' + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process (see l2arc_rebuild). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update blocks which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. 
So even if the cache were populated by completely rotten + * blocks for data that had been long deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA with the + * birth TXG uniquely identify a block in space and time - once created, + * a block is immutable on disk. The worst thing we have done is wasted + * some time and memory at l2arc rebuild to reconstruct outdated ARC + * entries that will get dropped from the l2arc as it is being updated + * with new blocks. + * + * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write + * hand are not restored. This is done by saving the offset (in bytes) + * l2arc_evict() has evicted to in the L2ARC device header and taking it + * into account when restoring buffers. */ static boolean_t @@ -7679,9 +7380,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) } static uint64_t -l2arc_write_size(void) +l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size; + uint64_t size, dev_size; /* * Make sure our globals have meaningful values in case the user @@ -7698,6 +7399,25 @@ l2arc_write_size(void) if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. 
+ */ + dev_size = dev->l2ad_end - dev->l2ad_start; + if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { + cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " + "plus the overhead of log blocks (persistent L2ARC, " + "%" PRIu64 " bytes) exceeds the size of the cache device " + "(guid %" PRIu64 "), resetting them to the default (%d)", + l2arc_log_blk_overhead(size, dev), + dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + } + return (size); } @@ -7763,10 +7483,10 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev)); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) next = NULL; l2arc_dev_last = next; @@ -7815,16 +7535,20 @@ l2arc_do_free_on_write() static void l2arc_write_done(zio_t *zio) { - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; + l2arc_write_callback_t *cb; + l2arc_lb_abd_buf_t *abd_buf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; + list_t *buflist; + arc_buf_hdr_t *head, *hdr, *hdr_prev; + kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -7913,12 +7637,72 @@ top: mutex_exit(hash_lock); } + /* + * Free the allocated abd buffers for writing the log blocks. + * If the zio failed reclaim the allocated space and remove the + * pointers to these log blocks from the log block pointer list + * of the L2ARC device. 
+ */ + while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { + abd_free(abd_buf->abd); + zio_buf_free(abd_buf, sizeof (*abd_buf)); + if (zio->io_error != 0) { + lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = + L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + list_destroy(&cb->l2wcb_abd_list); + + if (zio->io_error != 0) { + /* + * Restore the lbps array in the header to its previous state. + * If the list of log block pointers is empty, zero out the + * log block pointers in the device header. + */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + if (lb_ptr_buf == NULL) { + /* + * If the list is empty zero out the device + * header. Otherwise zero out the second log + * block pointer in the header. + */ + if (i == 0) { + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + } else { + bzero(&l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + } + break; + } + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); + ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -8110,7 +7894,6 @@ l2arc_read_done(zio_t *zio) zio->io_private = hdr; arc_read_done(zio); } else { - mutex_exit(hash_lock); /* * Buffer didn't survive caching. 
Increment stats and * reissue to the original storage device. @@ -8135,10 +7918,24 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb)); + &cb->l2rcb_zb); + + /* + * Original ZIO will be freed, so we need to update + * ARC header with the new ZIO pointer to be used + * by zio_change_priority() in arc_read(). + */ + for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; + acb != NULL; acb = acb->acb_next) + acb->acb_zio_head = zio; + + mutex_exit(hash_lock); + zio_nowait(zio); + } else { + mutex_exit(hash_lock); } } @@ -8189,8 +7986,31 @@ l2arc_sublist_lock(int list_num) } /* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. + */ +static inline uint64_t +l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) +{ + if (dev->l2ad_log_entries == 0) { + return (0); + } else { + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; + + uint64_t log_blocks = (log_entries + + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; + + return (vdev_psize_to_asize(dev->l2ad_vdev, + sizeof (l2arc_log_blk_phys_t)) * log_blocks); + } +} + +/* * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. + * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. 
*/ @@ -8201,22 +8021,28 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; + l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + boolean_t rerun; buflist = &dev->l2ad_buflist; - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance, dev); - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { +top: + rerun = B_FALSE; + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. + * When there is no space to accommodate upcoming writes, + * evict to the end. Then bump the write and evict hands + * to the start and iterate. This iteration does not + * happen indefinitely as we make sure in + * l2arc_write_size() that when the write hand is reset, + * the write size does not exceed the end of the device. */ + rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; @@ -8224,11 +8050,68 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); -top: + /* + * This check has to be placed after deciding whether to iterate + * (rerun). + */ + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + goto out; + } + + /* + * When rebuilding L2ARC we retrieve the evict hand from the header of + * the device. Of note, l2arc_evict() does not actually delete buffers + * from the cache device, but keeping track of the evict hand will be + * useful when TRIM is implemented. 
+ */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + +retry: mutex_enter(&dev->l2ad_mtx); + /* + * We have to account for evicted log blocks. Run vdev_space_update() + * on log blocks whose offset (in bytes) is before the evicted offset + * (in bytes) by searching in the list of pointers to log blocks + * present in the L2ARC device. + */ + for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; + lb_ptr_buf = lb_ptr_buf_prev) { + + lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + + /* + * We don't worry about log blocks left behind (ie + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() + * will never write more than l2arc_evict() evicts. + */ + if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { + break; + } else { + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); + ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* @@ -8244,7 +8127,7 @@ top: mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto retry; } /* @@ -8256,7 +8139,7 @@ top: ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || + if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, @@ -8293,6 +8176,26 @@ top: mutex_exit(hash_lock); } 
mutex_exit(&dev->l2ad_mtx); + +out: + /* + * We need to check if we evict all buffers, otherwise we may iterate + * unnecessarily. + */ + if (!all && rerun) { + /* + * Bump device hand to the device start if it is approaching the + * end. l2arc_evict() has already evicted ahead for this case. + */ + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + goto top; + } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8412,6 +8315,17 @@ error: return (ret); } +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_put(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + /* * Find and write ARC buffers to the L2ARC device. * @@ -8421,17 +8335,18 @@ error: * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -8483,7 +8398,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { + if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. 
*/ @@ -8583,6 +8498,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } @@ -8616,6 +8538,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) mutex_exit(hash_lock); + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + (void) zio_nowait(wzio); } @@ -8630,28 +8560,36 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. + */ + l2arc_dev_hdr_update(dev); + return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; + /* + * Update the device header after the zio completes as + * l2arc_write_done() may have updated the memory holding the log block + * pointers in the device header. 
+ */ + l2arc_dev_hdr_update(dev); + return (write_asize); } @@ -8728,7 +8666,7 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(); + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -8756,7 +8694,17 @@ l2arc_feed_thread(void *unused) boolean_t l2arc_vdev_present(vdev_t *vd) { - l2arc_dev_t *dev; + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. + */ +static l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; @@ -8766,7 +8714,7 @@ l2arc_vdev_present(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - return (dev != NULL); + return (dev); } /* @@ -8776,7 +8724,8 @@ l2arc_vdev_present(vdev_t *vd) void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { - l2arc_dev_t *adddev; + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); @@ -8786,11 +8735,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* @@ -8800,8 +8755,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_l2hdr.b_l2node)); + /* + * This is a list of pointers to log blocks that are still present + * on the device. + */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list @@ -8810,6 +8774,87 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); + + /* + * Decide if vdev is eligible for L2ARC rebuild + */ + l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); +} + +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t l2dhdr_asize; + spa_t *spa; + int err; + boolean_t l2dhdr_valid = B_TRUE; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + spa = dev->l2ad_spa; + l2dhdr = dev->l2ad_dev_hdr; + l2dhdr_asize = dev->l2ad_dev_hdr_asize; + + /* + * The L2ARC has to hold at least the payload of one log block for + * them to be restored (persistent L2ARC). The payload of a log block + * depends on the amount of its log entries. We always write log blocks + * with 1022 entries. How many of them are committed or restored depends + * on the size of the L2ARC device. Thus the maximum payload of + * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device + * is less than that, we reduce the amount of committed and restored + * log entries per block so as to enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header, if an error is returned do not rebuild L2ARC. 
+ */ + if ((err = l2arc_dev_hdr_read(dev)) != 0) + l2dhdr_valid = B_FALSE; + + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (spa_writeable(spa)) { + /* + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. + */ + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } /* @@ -8818,24 +8863,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *dev, *nextdev, *remdev = NULL; + l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } + remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* + * Cancel any ongoing or scheduled rebuild. 
+ */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* * Remove device from global list */ + mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); @@ -8846,8 +8896,13 @@ l2arc_remove_vdev(vdev_t *vd) */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -8861,6 +8916,8 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -8885,6 +8942,8 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); @@ -8915,3 +8974,901 @@ l2arc_stop(void) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } + +/* + * Punches out rebuild threads for the L2ARC devices in a spa. 
This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, + (void (*)(void *))l2arc_dev_rebuild_start, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. + */ +static void +l2arc_dev_rebuild_start(l2arc_dev_t *dev) +{ + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. 
+ * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. 
At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + goto out; + } + + spa_config_exit(spa, SCL_L2ARC, vd); + lock_held = B_FALSE; + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); + + /* + * log block restored, include its pointer in the list of + * pointers to log blocks present in the L2ARC device. + */ + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), + KM_SLEEP); + bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(vd, asize, 0, 0); + + /* BEGIN CSTYLED */ + /* + * Protection against loops of log blocks: + * + * l2ad_hand l2ad_evict + * V V + * l2ad_start |=======================================| l2ad_end + * -----|||----|||---|||----||| + * (3) (2) (1) (0) + * ---|||---|||----|||---||| + * (7) (6) (5) (4) + * + * In this situation the pointer of log block (4) passes + * l2arc_log_blkptr_valid() but the log block should not be + * restored as it is overwritten by the payload of 
log block + * (0). Only log blocks (0)-(3) should be restored. We check + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. + */ + /* END CSTYLED */ + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) + goto out; + + for (;;) { + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); + mutex_exit(&l2arc_rebuild_thr_lock); + err = SET_ERROR(ECANCELED); + goto out; + } + mutex_exit(&l2arc_rebuild_thr_lock); + if (spa_config_tryenter(spa, SCL_L2ARC, vd, + RW_READER)) { + lock_held = B_TRUE; + break; + } + /* + * L2ARC config lock held by somebody as writer, + * possibly due to them trying to remove us. They'll + * likely want us to shut down, so after a little + * delay, we check l2ad_rebuild_cancel and retry + * the lock again. + */ + delay(1); + } + + /* + * Continue with the next log block.
+ */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + kmem_free(this_lb, sizeof (*this_lb)); + kmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + } else if (err != 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and writes + * it to dev->l2ad_dev_hdr. On success, this function returns 0, otherwise the appropriate + * error code is returned.
+ */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents. + * + * This function implements a simple fetcher to make sure that while + * we're processing one buffer the L2ARC is already fetching the next + * one in the chain. + * + * The arguments this_lp and next_lp point to the current and next log block + * address in the block chain. 
Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. + * + * The `this_io' and `next_io' arguments are used for block fetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * fetched IO is returned in `next_io'. On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. + * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no fetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the fetching IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of fetch IOs. + */ +static int +l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io) +{ + int err = 0; + zio_cksum_t cksum; + abd_t *abd = NULL; + uint64_t asize; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log block in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, + this_lb); + } + + /* + * Peek to see if we can start issuing the next IO immediately. 
+ */ + if (l2arc_log_blkptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log block early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log block. + */ + *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, + next_lb); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " + "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, + dev->l2ad_vdev->vdev_guid); + goto cleanup; + } + + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, + dev->l2ad_hand, dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, asize, sizeof (*this_lb))) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + 
l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_asize, uint64_t lb_daddr) +{ + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here + * + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. 
+ */ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. + */ +static void +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); + uint64_t asize; + + /* + * Do all the allocation before grabbing any locks, this lets us + * sleep if memory is full and we don't have to deal with failed + * allocations. + */ + hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, + dev, le->le_dva, le->le_daddr, + L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_COMPRESS((le)->le_prop), + L2BLK_GET_PROTECTED((le)->le_prop), + L2BLK_GET_PREFETCH((le)->le_prop)); + asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); + + /* + * vdev_space_update() has to be called before arc_hdr_destroy() to + * avoid underflow since the latter also calls the former. 
+ */ + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); + + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + mutex_exit(&dev->l2ad_mtx); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. */ + arc_hdr_destroy(hdr); + /* + * If the buffer is already cached, check whether it has + * L2ARC metadata. If not, enter them and update the flag. + * This is important is case of onlining a cache device, since + * we previously evicted all L2ARC metadata from ARC. + */ + if (!HDR_HAS_L2HDR(exists)) { + arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); + exists->b_l2hdr.b_dev = dev; + exists->b_l2hdr.b_daddr = le->le_daddr; + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, exists); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(exists), exists); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); + } + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + } + + mutex_exit(hash_lock); +} + +/* + * Starts an asynchronous read IO to read a log block. This is used in log + * block reconstruction to start reading the next block before we are done + * decoding and reconstructing the current block, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain newly allocated memory buffers for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed). If you wish to abort this + * zio, you should do so using l2arc_log_blk_fetch_abort, which takes + * care of disposing of the allocated buffers correctly. 
+ */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t asize; + zio_t *pio; + l2arc_read_callback_t *cb; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Creates a zio to update the device header on an l2arc device. 
+ */ +static void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); + l2dhdr->dh_flags = 0; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. 
+ */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb)); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. 
+ */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + _NOTE(CONSTCOND) + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_put(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. 
+ */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. + */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* BEGIN CSTYLED */ + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + /* END CSTYLED */ + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) 
|| + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. + */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + + if (dev->l2ad_log_entries == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, dev->l2ad_log_entries); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. 
Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * <check>--------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------<bottom>============<top>--------------| + * + * bottom > top: Looped-around case: + * <check>--------+------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |===============<top>---------------<bottom>===========| + * ^ ^ + * | (or here?) | + * +---------------+---------<check> + * + * top == bottom : Just a single address comparison. + */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 21a8595b72..ced6a3ead5 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -4364,6 +4364,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } spa_import_progress_remove(spa); + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_load_note(spa, "LOADED"); return (0); @@ -7648,6 +7650,17 @@ spa_async_thread(void *arg) } /* + * Kick off L2 cache rebuilding. + */ + if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); + l2arc_spa_rebuild_start(spa); + spa_config_exit(spa, SCL_L2ARC, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* * Let the world know that we're done. 
*/ mutex_enter(&spa->spa_async_lock); diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 1ef3bb79ca..ddcbfa748d 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -248,10 +248,14 @@ void arc_fini(void); void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, + uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); +void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h new file mode 100644 index 0000000000..0c18849b59 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h @@ -0,0 +1,857 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + */ + +#ifndef _SYS_ARC_IMPL_H +#define _SYS_ARC_IMPL_H + +#include <sys/arc.h> +#include <sys/multilist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recently used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will acquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
+ */ + +typedef struct arc_state { + /* + * list of evictable buffers + */ + multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + zfs_refcount_t arcs_size; +} arc_state_t; + +/* + * Callback record for an ARC read (acb_done is an arc_read_done_func_t). + * Records are chained to one another through acb_next. + */ +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_read_done_func_t *acb_done; + arc_buf_t *acb_buf; + boolean_t acb_encrypted; + boolean_t acb_compressed; + boolean_t acb_noauth; + zbookmark_phys_t acb_zb; + zio_t *acb_zio_dummy; + zio_t *acb_zio_head; + arc_callback_t *acb_next; +}; + +/* + * Callback record for an ARC write: a caller-private pointer, the + * ready/children_ready/physdone/done functions and the associated arc_buf_t. + */ +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an 
L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * Used for debugging with kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ + void *b_thawed; +#endif + + arc_buf_t *b_buf; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ + kcondvar_t b_cv; + uint8_t b_byteswap; + + /* protected by arc state mutex */ + arc_state_t *b_state; + multilist_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + zfs_refcount_t b_refcnt; + + arc_callback_t *b_acb; + abd_t *b_pabd; +} l1arc_buf_hdr_t; + +typedef enum l2arc_dev_hdr_flags_t { + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ +} l2arc_dev_hdr_flags_t; + +/* + * Pointer used in persistent L2ARC (for pointing to log blocks). + */ +typedef struct l2arc_log_blkptr { + /* + * Offset of log block within the device, in bytes + */ + uint64_t lbp_daddr; + /* + * Aligned payload size (in bytes) of the log block + */ + uint64_t lbp_payload_asize; + /* + * Offset in bytes of the first buffer in the payload + */ + uint64_t lbp_payload_start; + /* + * lbp_prop has the following format: + * * logical size (in bytes) + * * aligned (after compression) size (in bytes) + * * compression algorithm (LZ4, or off when l2arc_log_blk_commit() + * finds that compression does not shrink the block) + * * checksum algorithm (used for lbp_cksum) + * The fields are accessed with the L2BLK_GET_*()/L2BLK_SET_*() macros. + */ + uint64_t lbp_prop; + zio_cksum_t lbp_cksum; /* checksum of log */ +} l2arc_log_blkptr_t; + +/* + * The persistent L2ARC device header. + * Byte order of magic determines whether 64-bit bswap of fields is necessary. 
+ */ +typedef struct l2arc_dev_hdr_phys { + uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ + uint64_t dh_version; /* Persistent L2ARC version */ + + /* + * Global L2ARC device state and metadata. + */ + uint64_t dh_spa_guid; + uint64_t dh_vdev_guid; + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ + uint64_t dh_evict; /* evicted offset in bytes */ + uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ + /* + * Used in zdb.c for determining if a log block is valid, in the same + * way that l2arc_rebuild() does. + */ + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ + /* + * Start of log block chain. [0] -> newest log, [1] -> one older (used + * for initiating prefetch). + */ + l2arc_log_blkptr_t dh_start_lbps[2]; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ + zio_eck_t dh_tail; +} l2arc_dev_hdr_phys_t; +CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); + +/* + * A single ARC buffer header entry in a l2arc_log_blk_phys_t. + */ +typedef struct l2arc_log_ent_phys { + dva_t le_dva; /* dva of buffer */ + uint64_t le_birth; /* birth txg of buffer */ + /* + * le_prop has the following format: + * * logical size (in bytes) + * * physical (compressed) size (in bytes) + * * compression algorithm + * * object type (used to restore arc_buf_contents_t) + * * protected status (used for encryption) + * * prefetch status (used in l2arc_read_done()) + */ + uint64_t le_prop; + uint64_t le_daddr; /* buf location on l2dev */ + /* + * We pad the size of each entry to a power of 2 so that the size of + * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, + * because of the L2BLK_SET_*SIZE macros. 
+ */ + const uint64_t le_pad[3]; /* pad to 64 bytes */ +} l2arc_log_ent_phys_t; + +#define L2ARC_LOG_BLK_MAX_ENTRIES (1022) + +/* + * A log block of up to 1022 ARC buffer log entries, chained into the + * persistent L2ARC metadata linked list. Byte order of magic determines + * whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_log_blk_phys { + uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ + /* + * There are 2 chains (headed by dh_start_lbps[2]), and this field + * points back to the previous block in this chain. We alternate + * which chain we append to, so they are time-wise and offset-wise + * interleaved, but that is an optimization rather than for + * correctness. + */ + l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ + /* + * Pad header section to 128 bytes + */ + uint64_t lb_pad[7]; + /* Payload */ + l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; +} l2arc_log_blk_phys_t; /* 64K total */ +/* + * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with + * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. + */ +CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), + 1ULL << SPA_MINBLOCKSHIFT)); +CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); +CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); + +/* + * These structures hold in-flight abd buffers for log blocks as they're being + * written to the L2ARC device. + */ +typedef struct l2arc_lb_abd_buf { + abd_t *abd; + list_node_t node; +} l2arc_lb_abd_buf_t; + +/* + * These structures hold pointers to log blocks present on the L2ARC device. 
+ */ +typedef struct l2arc_lb_ptr_buf { + l2arc_log_blkptr_t *lb_ptr; + list_node_t node; +} l2arc_lb_ptr_buf_t; + +/* Macros for getting and setting fields in le_prop and lbp_prop */ +#define L2BLK_GET_LSIZE(field) \ + BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_LSIZE(field, x) \ + BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_PSIZE(field) \ + BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_PSIZE(field, x) \ + BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_COMPRESS(field) \ + BF64_GET((field), 32, SPA_COMPRESSBITS) +#define L2BLK_SET_COMPRESS(field, x) \ + BF64_SET((field), 32, SPA_COMPRESSBITS, x) +#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) +#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) +#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) +#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) +#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) +#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) + +/* Swaps two pointer lvalues through a temporary void pointer. */ +#define PTR_SWAP(x, y) \ + do { \ + void *tmp = (x);\ + x = y; \ + y = tmp; \ + _NOTE(CONSTCOND)\ + } while (0) + +#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ +#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list 
node */ + zfs_refcount_t l2ad_alloc; /* allocated bytes */ + /* + * Persistence-related stuff + */ + l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ + uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ + int l2ad_log_ent_idx; /* index into cur log blk */ + /* Number of bytes in current log block's payload */ + uint64_t l2ad_log_blk_payload_asize; + /* + * Offset (in bytes) of the first buffer in current log block's + * payload. + */ + uint64_t l2ad_log_blk_payload_start; + /* Flag indicating whether a rebuild is scheduled or is going on */ + boolean_t l2ad_rebuild; + boolean_t l2ad_rebuild_cancel; + boolean_t l2ad_rebuild_began; + uint64_t l2ad_log_entries; /* entries per log blk */ + uint64_t l2ad_evict; /* evicted offset in bytes */ + /* List of pointers to log blocks present in the L2ARC device */ + list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; +} l2arc_dev_t; + +/* + * Encrypted blocks will need to be stored encrypted on the L2ARC + * disk as they appear in the main pool. In order for this to work we + * need to pass around the encryption parameters so they can be used + * to write data to the L2ARC. This struct is only defined in the + * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED + * flag set. 
+ */ +typedef struct arc_buf_hdr_crypt { + abd_t *b_rabd; /* raw encrypted data */ + dmu_object_type_t b_ot; /* object type */ + uint32_t b_ebufcnt; /* number of encrypted buffers */ + + /* dsobj for looking up encryption key for l2arc encryption */ + uint64_t b_dsobj; /* for looking up key */ + + /* encryption parameters */ + uint8_t b_salt[ZIO_DATA_SALT_LEN]; + uint8_t b_iv[ZIO_DATA_IV_LEN]; + + /* + * Technically this could be removed since we will always be able to + * get the mac from the bp when we need it. However, it is inconvenient + * for callers of arc code to have to pass a bp in all the time. This + * also allows us to assert that L2ARC data is properly encrypted to + * match the data in the main storage pool. + */ + uint8_t b_mac[ZIO_DATA_MAC_LEN]; +} arc_buf_hdr_crypt_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* in-flight list of log blocks */ + list_t l2wcb_abd_list; +} l2arc_write_callback_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. 
+ * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; + /* + * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED + * is set and the L1 header exists. + */ + arc_buf_hdr_crypt_t b_crypt_hdr; +}; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. 
+ */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). 
+ */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by various buffers and structures + * not actually backed with ARC buffers. This includes bonus + * buffers (allocated directly via zio_buf_* functions), + * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t + * cache), and dnode_t structures (allocated via dnode_t cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_other_size; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. 
+ */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. 
The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. + * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. 
See the comment above + * arcstat_mru_ghost_size for more details. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_l2_hdr_size; + /* + * Number of L2ARC log blocks written. These are used for restoring the + * L2ARC. Updated during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_log_blk_writes; + /* + * Moving average of the aligned size of the L2ARC log blocks, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC + * log blocks. 
+ */ + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; + /* + * Moving average of the ratio of the aligned size of L2ARC restored + * data to the aligned size of their metadata in L2ARC, both in bytes. + * Updated during L2ARC rebuild and during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_data_to_meta_ratio; + /* + * Number of times the L2ARC rebuild was successful for an L2ARC device. + */ + kstat_named_t arcstat_l2_rebuild_success; + /* + * Number of times the L2ARC rebuild failed because the device header + * was in an unsupported format or corrupted. + */ + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + /* + * Number of times the L2ARC rebuild failed because of IO errors + * while reading a log block. + */ + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + /* + * Number of times the L2ARC rebuild failed because of IO errors when + * reading the device header. + */ + kstat_named_t arcstat_l2_rebuild_abort_dh_errors; + /* + * Number of L2ARC log blocks which failed to be restored due to + * checksum errors. + */ + kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; + /* + * Number of times the L2ARC rebuild was aborted due to low system + * memory. + */ + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + /* Logical size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; + /* + * Number of L2ARC log entries (buffers) that were successfully + * restored in ARC. + */ + kstat_named_t arcstat_l2_rebuild_bufs; + /* + * Number of L2ARC log entries (buffers) already cached in ARC. These + * were not restored again. 
+ */ + kstat_named_t arcstat_l2_rebuild_bufs_precached; + /* + * Number of L2ARC log blocks that were restored successfully. Each + * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. + */ + kstat_named_t arcstat_l2_rebuild_log_blks; + kstat_named_t arcstat_memory_throttle_count; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; +} arc_stats_t; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. 
+ */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) + +extern arc_stats_t arc_stats; + +/* used in zdb.c */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 33cdfbeb4b..af8057be8f 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -792,6 +792,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 +#define SPA_ASYNC_L2CACHE_REBUILD 0x800 /* * Controls the behavior of spa_vdev_remove(). 
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 254af68099..cd05edcffa 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -2179,9 +2179,22 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) - l2arc_add_vdev(spa, vd); + vd->vdev_aux == &spa->spa_l2cache) { + /* + * When reopening we can assume the device label has + * already the attribute l2cache_persistent, since we've + * opened the device in the past and updated the label. + * In case the vdev is present we should evict all ARC + * buffers and pointers to log blocks and reclaim their + * space before restoring its contents to L2ARC. + */ + if (l2arc_vdev_present(vd)) { + l2arc_rebuild_vdev(vd, B_TRUE); + } else { + l2arc_add_vdev(spa, vd); + } + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + } } else { (void) vdev_validate(vd); } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index f870d6ce7c..819905a8d9 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -561,6 +561,11 @@ typedef enum zfs_key_location { #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 +/* Persistent L2ARC version */ +#define L2ARC_PERSISTENT_VERSION_1 1ULL +#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 +#define L2ARC_PERSISTENT_VERSION_STRING "1" + /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ |