diff options
author | George Amanakis <gamanakis@gmail.com> | 2020-07-30 18:40:44 -0500 |
---|---|---|
committer | Jason King <jason.king@joyent.com> | 2020-10-16 11:10:02 -0500 |
commit | f0a052391861a2b96cf28973c3b7f2854591aa79 (patch) | |
tree | 653d2330669b465bac1ab1c55b7e24c018cddc8a /usr/src/uts | |
parent | 6218f28969018904255fddf306e6489c7ae28bba (diff) | |
download | illumos-joyent-f0a052391861a2b96cf28973c3b7f2854591aa79.tar.gz |
3525 Persistent L2ARC
Portions contributed by: Saso Kiselkov <skiselkov@gmail.com>
Portions contributed by: Jorgen Lundman <lundman@lundman.net>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: Alexander Motin <mav@FreeBSD.org>
Portions contributed by: Jason King <jason.king@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts')
-rw-r--r-- | usr/src/uts/common/fs/zfs/arc.c | 2183 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c | 13 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/arc.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/arc_impl.h | 857 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 1 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 19 | ||||
-rw-r--r-- | usr/src/uts/common/sys/fs/zfs.h | 5 |
7 files changed, 2466 insertions, 616 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 48ae3aa829..9a962b420e 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -24,6 +24,8 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. */ /* @@ -293,6 +295,7 @@ #include <sys/kstat.h> #include <sys/zthr.h> #include <zfs_fletcher.h> +#include <sys/arc_impl.h> #include <sys/aggsum.h> #include <sys/cityhash.h> #include <sys/param.h> @@ -407,54 +410,6 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ boolean_t zfs_compressed_arc_enabled = B_TRUE; -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. - */ - -typedef struct arc_state { - /* - * list of evictable buffers - */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; - /* - * total amount of evictable data in this state - */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; - /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. - */ - zfs_refcount_t arcs_size; -} arc_state_t; - /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -463,263 +418,7 @@ static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach its target amount. - */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). - */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by various buffers and structures - * not actually backed with ARC buffers. This includes bonus - * buffers (allocated directly via zio_buf_* functions), - * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t - * cache), and dnode_t structures (allocated via dnode_t cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_other_size; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_memory_throttle_count; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { +arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, @@ -795,6 +494,22 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, @@ -805,14 +520,6 @@ static arc_stats_t arc_stats = { { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ @@ -843,6 +550,24 @@ static arc_stats_t arc_stats = { } \ } +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; @@ -852,29 +577,6 @@ static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. - */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. We want to use aggsum_t's for these @@ -895,182 +597,6 @@ static hrtime_t arc_growtime; static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_encrypted; - boolean_t acb_compressed; - boolean_t acb_noauth; - zbookmark_phys_t acb_zb; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the fields necessary for an L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -/* - * Encrypted blocks will need to be stored encrypted on the L2ARC - * disk as they appear in the main pool. In order for this to work we - * need to pass around the encryption parameters so they can be used - * to write data to the L2ARC. This struct is only defined in the - * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED - * flag set. - */ -typedef struct arc_buf_hdr_crypt { - abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* number or encryped buffers */ - - /* dsobj for looking up encryption key for l2arc encryption */ - uint64_t b_dsobj; /* for looking up key */ - - /* encryption parameters */ - uint8_t b_salt[ZIO_DATA_SALT_LEN]; - uint8_t b_iv[ZIO_DATA_IV_LEN]; - - /* - * Technically this could be removed since we will always be able to - * get the mac from the bp when we need it. However, it is inconvenient - * for callers of arc code to have to pass a bp in all the time. This - * also allows us to assert that L2ARC data is properly encrypted to - * match the data in the main storage pool. - */ - uint8_t b_mac[ZIO_DATA_MAC_LEN]; -} arc_buf_hdr_crypt_t; - -typedef struct l2arc_dev l2arc_dev_t; - -typedef struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - - list_node_t b_l2node; -} l2arc_buf_hdr_t; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - - arc_buf_contents_t b_type; - arc_buf_hdr_t *b_hash_next; - arc_flags_t b_flags; - - /* - * This field stores the size of the data buffer after - * compression, and is set in the arc's zio completion handlers. - * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). - * - * While the block pointers can store up to 32MB in their psize - * field, we can only store up to 32MB minus 512B. This is due - * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. - * a field of zeros represents 512B in the bp). We can't use a - * bias of 1 since we need to reserve a psize of zero, here, to - * represent holes and embedded blocks. - * - * This isn't a problem in practice, since the maximum size of a - * buffer is limited to 16MB, so we never need to store 32MB in - * this field. Even in the upstream illumos code base, the - * maximum size of a buffer is limited to 16MB. - */ - uint16_t b_psize; - - /* - * This field stores the size of the data buffer before - * compression, and cannot change once set. It is in units - * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) - */ - uint16_t b_lsize; /* immutable */ - uint64_t b_spa; /* immutable */ - - /* L2ARC fields. Undefined when not in L2ARC. */ - l2arc_buf_hdr_t b_l2hdr; - /* L1ARC fields. Undefined when in l2arc_only state */ - l1arc_buf_hdr_t b_l1hdr; - /* - * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED - * is set and the L1 header exists. - */ - arc_buf_hdr_crypt_t b_crypt_hdr; -}; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -1192,20 +718,6 @@ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ -struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -}; - static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ @@ -1223,11 +735,6 @@ typedef struct l2arc_read_callback { abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; @@ -1240,6 +747,9 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ @@ -1259,6 +769,7 @@ static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -1297,6 +808,9 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY_OR_LOCKED(hdr) \ + (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) + #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ @@ -1725,8 +1239,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon || - MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { @@ -2010,14 +1523,14 @@ arc_buf_freeze(arc_buf_t *buf) static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } @@ -2031,7 +1544,7 @@ arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so @@ -2124,7 +1637,7 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); @@ -2194,7 +1707,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_pabd(hdr, B_FALSE); @@ -2314,7 +1827,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, @@ -2634,7 +2147,7 @@ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { + if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -3039,7 +2552,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually @@ -3159,6 +2672,58 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, return (buf); } +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. In this case do not write log blocks in L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. */ +static int l2arc_dev_hdr_read(l2arc_dev_t *dev); +static int l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io); +static zio_t *l2arc_log_blk_fetch(vdev_t *vd, + const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); +static void l2arc_log_blk_fetch_abort(zio_t *zio); + +/* L2ARC persistence block restoration routines. */ +static void l2arc_log_blk_restore(l2arc_dev_t *dev, + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, + l2arc_dev_t *dev); + +/* L2ARC persistence write I/O routines. */ +static void l2arc_dev_hdr_update(l2arc_dev_t *dev); +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + +/* L2ARC persistence auxilliary routines. */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, + const arc_buf_hdr_t *ab); +boolean_t l2arc_range_check_overlap(uint64_t bottom, + uint64_t top, uint64_t check); +static void l2arc_blk_fetch_done(zio_t *zio); +static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * Return a loaned arc buffer to the arc. @@ -3247,7 +2812,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the @@ -3280,7 +2845,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need @@ -3315,7 +2880,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_buf_t *lastbuf = NULL; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Remove the buf from the hdr list and locate the last @@ -3363,7 +2928,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -3841,7 +3406,6 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3852,6 +3416,42 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) } /* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. + */ +arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, boolean_t protected, boolean_t prefetch) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + + return (hdr); +} + +/* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ @@ -3866,7 +3466,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3907,7 +3506,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, type, B_TRUE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -3966,9 +3564,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -3992,6 +3587,15 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } + /* + * The header's identity can only be safely discarded once it is no + * longer discoverable. This requires removing it from the hash table + * and the l2arc header list. After this point the hash lock can not + * be used to protect the header. + */ + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); @@ -4005,9 +3609,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } #endif - if (hdr->b_l1hdr.b_pabd != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_pabd(hdr, B_FALSE); - } if (HDR_HAS_RABD(hdr)) arc_hdr_free_pabd(hdr, B_TRUE); @@ -4032,7 +3635,6 @@ void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); @@ -4042,7 +3644,9 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) return; } + kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); @@ -6886,8 +6490,8 @@ arc_write_done(zio_t *zio) ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); arc_hdr_destroy(exists); + mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { @@ -7659,6 +7263,103 @@ arc_fini(void) * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistence: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. + * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) When writing to the L2ARC, we occasionally write a "l2arc log block", + * which is an additional piece of metadata which describes what's been + * written. This allows us to rebuild the arc_buf_hdr_t structures of the + * main ARC buffers. There are 2 linked-lists of log blocks headed by + * dh_start_lbps[2]. We alternate which chain we append to, so they are + * time-wise and offset-wise interleaved, but that is an optimization rather + * than for correctness. The log block also includes a pointer to the + * previous block in its chain. + * + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device + * for our header bookkeeping purposes. This contains a device header, + * which contains our top-level reference structures. We update it each + * time we write a new log block, so that we're able to locate it in the + * L2ARC device. If this write results in an inconsistent device header + * (e.g. due to power failure), we detect this by verifying the header's + * checksum and simply fail to reconstruct the L2ARC after reboot. + * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | ___two newest log block pointers__.__________ | + * | / \dh_start_lbps[1] | + * | / \ \dh_start_lbps[0]| + * |.___/__. V V | + * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| + * || hdr| ^ /^ /^ / / | + * |+------+ ...--\-------/ \-----/--\------/ / | + * | \--------------/ \--------------/ | + * +======================================================================+ + * + * As can be seen on the diagram, rather than using a simple linked list, + * we use a pair of linked lists with alternating elements. This is a + * performance enhancement due to the fact that we only find out the + * address of the next log block access once the current block has been + * completely read in. Obviously, this hurts performance, because we'd be + * keeping the device's I/O queue at only a 1 operation deep, thus + * incurring a large amount of I/O round-trip latency. Having two lists + * allows us to fetch two log blocks ahead of where we are currently + * rebuilding L2ARC buffers. + * + * On-device data structures: + * + * L2ARC device header: l2arc_dev_hdr_phys_t + * L2ARC log block: l2arc_log_blk_phys_t + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (writing + * a new log block every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed log block (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> + * ^ ^^^^^^^^^___________________________________ + * | \ + * <<nextwrite>> may overwrite this blk and/or its bufs --' + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process (see l2arc_rebuild). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update blocks which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. So even if the cache were populated by completely rotten + * blocks for data that had been long deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA with the + * birth TXG uniquely identify a block in space and time - once created, + * a block is immutable on disk. The worst thing we have done is wasted + * some time and memory at l2arc rebuild to reconstruct outdated ARC + * entries that will get dropped from the l2arc as it is being updated + * with new blocks. + * + * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write + * hand are not restored. This is done by saving the offset (in bytes) + * l2arc_evict() has evicted to in the L2ARC device header and taking it + * into account when restoring buffers. */ static boolean_t @@ -7679,9 +7380,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) } static uint64_t -l2arc_write_size(void) +l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size; + uint64_t size, dev_size; /* * Make sure our globals have meaningful values in case the user @@ -7698,6 +7399,25 @@ l2arc_write_size(void) if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. + */ + dev_size = dev->l2ad_end - dev->l2ad_start; + if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { + cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " + "plus the overhead of log blocks (persistent L2ARC, " + "%" PRIu64 " bytes) exceeds the size of the cache device " + "(guid %" PRIu64 "), resetting them to the default (%d)", + l2arc_log_blk_overhead(size, dev), + dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + } + return (size); } @@ -7763,10 +7483,10 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev)); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) next = NULL; l2arc_dev_last = next; @@ -7815,16 +7535,20 @@ l2arc_do_free_on_write() static void l2arc_write_done(zio_t *zio) { - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; + l2arc_write_callback_t *cb; + l2arc_lb_abd_buf_t *abd_buf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; + list_t *buflist; + arc_buf_hdr_t *head, *hdr, *hdr_prev; + kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -7913,12 +7637,72 @@ top: mutex_exit(hash_lock); } + /* + * Free the allocated abd buffers for writing the log blocks. + * If the zio failed reclaim the allocated space and remove the + * pointers to these log blocks from the log block pointer list + * of the L2ARC device. + */ + while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { + abd_free(abd_buf->abd); + zio_buf_free(abd_buf, sizeof (*abd_buf)); + if (zio->io_error != 0) { + lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = + L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + list_destroy(&cb->l2wcb_abd_list); + + if (zio->io_error != 0) { + /* + * Restore the lbps array in the header to its previous state. + * If the list of log block pointers is empty, zero out the + * log block pointers in the device header. + */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + if (lb_ptr_buf == NULL) { + /* + * If the list is empty zero out the device + * header. Otherwise zero out the second log + * block pointer in the header. + */ + if (i == 0) { + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + } else { + bzero(&l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + } + break; + } + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); + ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -8110,7 +7894,6 @@ l2arc_read_done(zio_t *zio) zio->io_private = hdr; arc_read_done(zio); } else { - mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. @@ -8135,10 +7918,24 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb)); + &cb->l2rcb_zb); + + /* + * Original ZIO will be freed, so we need to update + * ARC header with the new ZIO pointer to be used + * by zio_change_priority() in arc_read(). + */ + for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; + acb != NULL; acb = acb->acb_next) + acb->acb_zio_head = zio; + + mutex_exit(hash_lock); + zio_nowait(zio); + } else { + mutex_exit(hash_lock); } } @@ -8189,8 +7986,31 @@ l2arc_sublist_lock(int list_num) } /* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. + */ +static inline uint64_t +l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) +{ + if (dev->l2ad_log_entries == 0) { + return (0); + } else { + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; + + uint64_t log_blocks = (log_entries + + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; + + return (vdev_psize_to_asize(dev->l2ad_vdev, + sizeof (l2arc_log_blk_phys_t)) * log_blocks); + } +} + +/* * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. + * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ @@ -8201,22 +8021,28 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; + l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + boolean_t rerun; buflist = &dev->l2ad_buflist; - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance, dev); - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { +top: + rerun = B_FALSE; + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. + * When there is no space to accommodate upcoming writes, + * evict to the end. Then bump the write and evict hands + * to the start and iterate. This iteration does not + * happen indefinitely as we make sure in + * l2arc_write_size() that when the write hand is reset, + * the write size does not exceed the end of the device. */ + rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; @@ -8224,11 +8050,68 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); -top: + /* + * This check has to be placed after deciding whether to iterate + * (rerun). + */ + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + goto out; + } + + /* + * When rebuilding L2ARC we retrieve the evict hand from the header of + * the device. Of note, l2arc_evict() does not actually delete buffers + * from the cache device, but keeping track of the evict hand will be + * useful when TRIM is implemented. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + +retry: mutex_enter(&dev->l2ad_mtx); + /* + * We have to account for evicted log blocks. Run vdev_space_update() + * on log blocks whose offset (in bytes) is before the evicted offset + * (in bytes) by searching in the list of pointers to log blocks + * present in the L2ARC device. + */ + for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; + lb_ptr_buf = lb_ptr_buf_prev) { + + lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + + /* + * We don't worry about log blocks left behind (ie + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() + * will never write more than l2arc_evict() evicts. + */ + if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { + break; + } else { + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); + ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* @@ -8244,7 +8127,7 @@ top: mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto retry; } /* @@ -8256,7 +8139,7 @@ top: ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || + if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, @@ -8293,6 +8176,26 @@ top: mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); + +out: + /* + * We need to check if we evict all buffers, otherwise we may iterate + * unnecessarily. + */ + if (!all && rerun) { + /* + * Bump device hand to the device start if it is approaching the + * end. l2arc_evict() has already evicted ahead for this case. + */ + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + goto top; + } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8412,6 +8315,17 @@ error: return (ret); } +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_put(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + /* * Find and write ARC buffers to the L2ARC device. * @@ -8421,17 +8335,18 @@ error: * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -8483,7 +8398,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { + if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ @@ -8583,6 +8498,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } @@ -8616,6 +8538,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) mutex_exit(hash_lock); + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + (void) zio_nowait(wzio); } @@ -8630,28 +8560,36 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. + */ + l2arc_dev_hdr_update(dev); + return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; + /* + * Update the device header after the zio completes as + * l2arc_write_done() may have updated the memory holding the log block + * pointers in the device header. + */ + l2arc_dev_hdr_update(dev); + return (write_asize); } @@ -8728,7 +8666,7 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(); + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -8756,7 +8694,17 @@ l2arc_feed_thread(void *unused) boolean_t l2arc_vdev_present(vdev_t *vd) { - l2arc_dev_t *dev; + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. + */ +static l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; @@ -8766,7 +8714,7 @@ l2arc_vdev_present(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - return (dev != NULL); + return (dev); } /* @@ -8776,7 +8724,8 @@ l2arc_vdev_present(vdev_t *vd) void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { - l2arc_dev_t *adddev; + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); @@ -8786,11 +8735,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* @@ -8800,8 +8755,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + /* + * This is a list of pointers to log blocks that are still present + * on the device. + */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list @@ -8810,6 +8774,87 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); + + /* + * Decide if vdev is eligible for L2ARC rebuild + */ + l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); +} + +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t l2dhdr_asize; + spa_t *spa; + int err; + boolean_t l2dhdr_valid = B_TRUE; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + spa = dev->l2ad_spa; + l2dhdr = dev->l2ad_dev_hdr; + l2dhdr_asize = dev->l2ad_dev_hdr_asize; + + /* + * The L2ARC has to hold at least the payload of one log block for + * them to be restored (persistent L2ARC). The payload of a log block + * depends on the amount of its log entries. We always write log blocks + * with 1022 entries. How many of them are committed or restored depends + * on the size of the L2ARC device. Thus the maximum payload of + * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device + * is less than that, we reduce the amount of committed and restored + * log entries per block so as to enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header, if an error is returned do not rebuild L2ARC. + */ + if ((err = l2arc_dev_hdr_read(dev)) != 0) + l2dhdr_valid = B_FALSE; + + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (spa_writeable(spa)) { + /* + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. + */ + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } /* @@ -8818,24 +8863,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *dev, *nextdev, *remdev = NULL; + l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } + remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* + * Cancel any ongoing or scheduled rebuild. + */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* * Remove device from global list */ + mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); @@ -8846,8 +8896,13 @@ l2arc_remove_vdev(vdev_t *vd) */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -8861,6 +8916,8 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -8885,6 +8942,8 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); @@ -8915,3 +8974,901 @@ l2arc_stop(void) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } + +/* + * Punches out rebuild threads for the L2ARC devices in a spa. This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, + (void (*)(void *))l2arc_dev_rebuild_start, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. + */ +static void +l2arc_dev_rebuild_start(l2arc_dev_t *dev) +{ + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. + * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + goto out; + } + + spa_config_exit(spa, SCL_L2ARC, vd); + lock_held = B_FALSE; + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); + + /* + * log block restored, include its pointer in the list of + * pointers to log blocks present in the L2ARC device. + */ + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), + KM_SLEEP); + bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(vd, asize, 0, 0); + + /* BEGIN CSTYLED */ + /* + * Protection against loops of log blocks: + * + * l2ad_hand l2ad_evict + * V V + * l2ad_start |=======================================| l2ad_end + * -----|||----|||---|||----||| + * (3) (2) (1) (0) + * ---|||---|||----|||---||| + * (7) (6) (5) (4) + * + * In this situation the pointer of log block (4) passes + * l2arc_log_blkptr_valid() but the log block should not be + * restored as it is overwritten by the payload of log block + * (0). Only log blocks (0)-(3) should be restored. We check + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. + */ + /* END CSTYLED */ + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) + goto out; + + for (;;) { + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); + mutex_exit(&l2arc_rebuild_thr_lock); + err = SET_ERROR(ECANCELED); + goto out; + } + mutex_exit(&l2arc_rebuild_thr_lock); + if (spa_config_tryenter(spa, SCL_L2ARC, vd, + RW_READER)) { + lock_held = B_TRUE; + break; + } + /* + * L2ARC config lock held by somebody in writer, + * possibly due to them trying to remove us. They'll + * likely to want us to shut down, so after a little + * delay, we check l2ad_rebuild_cancel and retry + * the lock again. + */ + delay(1); + } + + /* + * Continue with the next log block. + */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + kmem_free(this_lb, sizeof (*this_lb)); + kmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + } else if (err != 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and writes + * it to `hdr'. On success, this function returns 0, otherwise the appropriate + * error code is returned. + */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents. + * + * This function implements a simple fetcher to make sure that while + * we're processing one buffer the L2ARC is already fetching the next + * one in the chain. + * + * The arguments this_lp and next_lp point to the current and next log block + * address in the block chain. Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. + * + * The `this_io' and `next_io' arguments are used for block fetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * fetched IO is returned in `next_io'. On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. + * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no fetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the fetching IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of fetch IOs. + */ +static int +l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io) +{ + int err = 0; + zio_cksum_t cksum; + abd_t *abd = NULL; + uint64_t asize; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log block in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, + this_lb); + } + + /* + * Peek to see if we can start issuing the next IO immediately. + */ + if (l2arc_log_blkptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log block early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log block. + */ + *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, + next_lb); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " + "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, + dev->l2ad_vdev->vdev_guid); + goto cleanup; + } + + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, + dev->l2ad_hand, dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, asize, sizeof (*this_lb))) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_asize, uint64_t lb_daddr) +{ + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here + * + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. + */ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. + */ +static void +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); + uint64_t asize; + + /* + * Do all the allocation before grabbing any locks, this lets us + * sleep if memory is full and we don't have to deal with failed + * allocations. + */ + hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, + dev, le->le_dva, le->le_daddr, + L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_COMPRESS((le)->le_prop), + L2BLK_GET_PROTECTED((le)->le_prop), + L2BLK_GET_PREFETCH((le)->le_prop)); + asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); + + /* + * vdev_space_update() has to be called before arc_hdr_destroy() to + * avoid underflow since the latter also calls the former. + */ + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); + + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + mutex_exit(&dev->l2ad_mtx); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. */ + arc_hdr_destroy(hdr); + /* + * If the buffer is already cached, check whether it has + * L2ARC metadata. If not, enter them and update the flag. + * This is important is case of onlining a cache device, since + * we previously evicted all L2ARC metadata from ARC. + */ + if (!HDR_HAS_L2HDR(exists)) { + arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); + exists->b_l2hdr.b_dev = dev; + exists->b_l2hdr.b_daddr = le->le_daddr; + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, exists); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(exists), exists); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); + ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); + } + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + } + + mutex_exit(hash_lock); +} + +/* + * Starts an asynchronous read IO to read a log block. This is used in log + * block reconstruction to start reading the next block before we are done + * decoding and reconstructing the current block, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain newly allocated memory buffers for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed). If you wish to abort this + * zio, you should do so using l2arc_log_blk_fetch_abort, which takes + * care of disposing of the allocated buffers correctly. + */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t asize; + zio_t *pio; + l2arc_read_callback_t *cb; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Creates a zio to update the device header on an l2arc device. + */ +static void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); + l2dhdr->dh_flags = 0; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. + */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb)); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. + */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + _NOTE(CONSTCOND) + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_put(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. + */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. + */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* BEGIN CSTYLED */ + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + /* END CSTYLED */ + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. + */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + + if (dev->l2ad_log_entries == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, dev->l2ad_log_entries); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * <check>--------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------<bottom>============<top>--------------| + * + * bottom > top: Looped-around case: + * <check>--------+------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |===============<top>---------------<bottom>===========| + * ^ ^ + * | (or here?) | + * +---------------+---------<check> + * + * top == bottom : Just a single address comparison. + */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 21a8595b72..ced6a3ead5 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -4364,6 +4364,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } spa_import_progress_remove(spa); + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_load_note(spa, "LOADED"); return (0); @@ -7648,6 +7650,17 @@ spa_async_thread(void *arg) } /* + * Kick off L2 cache rebuilding. + */ + if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); + l2arc_spa_rebuild_start(spa); + spa_config_exit(spa, SCL_L2ARC, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 1ef3bb79ca..ddcbfa748d 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -248,10 +248,14 @@ void arc_fini(void); void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, + uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); +void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h new file mode 100644 index 0000000000..0c18849b59 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h @@ -0,0 +1,857 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + */ + +#ifndef _SYS_ARC_IMPL_H +#define _SYS_ARC_IMPL_H + +#include <sys/arc.h> +#include <sys/multilist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recently used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + /* + * list of evictable buffers + */ + multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + zfs_refcount_t arcs_size; +} arc_state_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_read_done_func_t *acb_done; + arc_buf_t *acb_buf; + boolean_t acb_encrypted; + boolean_t acb_compressed; + boolean_t acb_noauth; + zbookmark_phys_t acb_zb; + zio_t *acb_zio_dummy; + zio_t *acb_zio_head; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * Used for debugging with kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ + void *b_thawed; +#endif + + arc_buf_t *b_buf; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ + kcondvar_t b_cv; + uint8_t b_byteswap; + + /* protected by arc state mutex */ + arc_state_t *b_state; + multilist_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + zfs_refcount_t b_refcnt; + + arc_callback_t *b_acb; + abd_t *b_pabd; +} l1arc_buf_hdr_t; + +typedef enum l2arc_dev_hdr_flags_t { + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ +} l2arc_dev_hdr_flags_t; + +/* + * Pointer used in persistent L2ARC (for pointing to log blocks). + */ +typedef struct l2arc_log_blkptr { + /* + * Offset of log block within the device, in bytes + */ + uint64_t lbp_daddr; + /* + * Aligned payload size (in bytes) of the log block + */ + uint64_t lbp_payload_asize; + /* + * Offset in bytes of the first buffer in the payload + */ + uint64_t lbp_payload_start; + /* + * lbp_prop has the following format: + * * logical size (in bytes) + * * aligned (after compression) size (in bytes) + * * compression algorithm (we always LZ4-compress l2arc logs) + * * checksum algorithm (used for lbp_cksum) + */ + uint64_t lbp_prop; + zio_cksum_t lbp_cksum; /* checksum of log */ +} l2arc_log_blkptr_t; + +/* + * The persistent L2ARC device header. + * Byte order of magic determines whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_dev_hdr_phys { + uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ + uint64_t dh_version; /* Persistent L2ARC version */ + + /* + * Global L2ARC device state and metadata. + */ + uint64_t dh_spa_guid; + uint64_t dh_vdev_guid; + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ + uint64_t dh_evict; /* evicted offset in bytes */ + uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ + /* + * Used in zdb.c for determining if a log block is valid, in the same + * way that l2arc_rebuild() does. + */ + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ + /* + * Start of log block chain. [0] -> newest log, [1] -> one older (used + * for initiating prefetch). + */ + l2arc_log_blkptr_t dh_start_lbps[2]; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ + zio_eck_t dh_tail; +} l2arc_dev_hdr_phys_t; +CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); + +/* + * A single ARC buffer header entry in a l2arc_log_blk_phys_t. + */ +typedef struct l2arc_log_ent_phys { + dva_t le_dva; /* dva of buffer */ + uint64_t le_birth; /* birth txg of buffer */ + /* + * le_prop has the following format: + * * logical size (in bytes) + * * physical (compressed) size (in bytes) + * * compression algorithm + * * object type (used to restore arc_buf_contents_t) + * * protected status (used for encryption) + * * prefetch status (used in l2arc_read_done()) + */ + uint64_t le_prop; + uint64_t le_daddr; /* buf location on l2dev */ + /* + * We pad the size of each entry to a power of 2 so that the size of + * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, + * because of the L2ARC_SET_*SIZE macros. + */ + const uint64_t le_pad[3]; /* pad to 64 bytes */ +} l2arc_log_ent_phys_t; + +#define L2ARC_LOG_BLK_MAX_ENTRIES (1022) + +/* + * A log block of up to 1022 ARC buffer log entries, chained into the + * persistent L2ARC metadata linked list. Byte order of magic determines + * whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_log_blk_phys { + uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ + /* + * There are 2 chains (headed by dh_start_lbps[2]), and this field + * points back to the previous block in this chain. We alternate + * which chain we append to, so they are time-wise and offset-wise + * interleaved, but that is an optimization rather than for + * correctness. + */ + l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ + /* + * Pad header section to 128 bytes + */ + uint64_t lb_pad[7]; + /* Payload */ + l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; +} l2arc_log_blk_phys_t; /* 64K total */ +/* + * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with + * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. + */ +CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), + 1ULL << SPA_MINBLOCKSHIFT)); +CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); +CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); + +/* + * These structures hold in-flight abd buffers for log blocks as they're being + * written to the L2ARC device. + */ +typedef struct l2arc_lb_abd_buf { + abd_t *abd; + list_node_t node; +} l2arc_lb_abd_buf_t; + +/* + * These structures hold pointers to log blocks present on the L2ARC device. + */ +typedef struct l2arc_lb_ptr_buf { + l2arc_log_blkptr_t *lb_ptr; + list_node_t node; +} l2arc_lb_ptr_buf_t; + +/* Macros for setting fields in le_prop and lbp_prop */ +#define L2BLK_GET_LSIZE(field) \ + BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_LSIZE(field, x) \ + BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_PSIZE(field) \ + BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_PSIZE(field, x) \ + BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_COMPRESS(field) \ + BF64_GET((field), 32, SPA_COMPRESSBITS) +#define L2BLK_SET_COMPRESS(field, x) \ + BF64_SET((field), 32, SPA_COMPRESSBITS, x) +#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) +#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) +#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) +#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) +#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) +#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) + +#define PTR_SWAP(x, y) \ + do { \ + void *tmp = (x);\ + x = y; \ + y = tmp; \ + _NOTE(CONSTCOND)\ + } while (0) + +#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ +#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ + zfs_refcount_t l2ad_alloc; /* allocated bytes */ + /* + * Persistence-related stuff + */ + l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ + uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ + int l2ad_log_ent_idx; /* index into cur log blk */ + /* Number of bytes in current log block's payload */ + uint64_t l2ad_log_blk_payload_asize; + /* + * Offset (in bytes) of the first buffer in current log block's + * payload. + */ + uint64_t l2ad_log_blk_payload_start; + /* Flag indicating whether a rebuild is scheduled or is going on */ + boolean_t l2ad_rebuild; + boolean_t l2ad_rebuild_cancel; + boolean_t l2ad_rebuild_began; + uint64_t l2ad_log_entries; /* entries per log blk */ + uint64_t l2ad_evict; /* evicted offset in bytes */ + /* List of pointers to log blocks present in the L2ARC device */ + list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; +} l2arc_dev_t; + +/* + * Encrypted blocks will need to be stored encrypted on the L2ARC + * disk as they appear in the main pool. In order for this to work we + * need to pass around the encryption parameters so they can be used + * to write data to the L2ARC. This struct is only defined in the + * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED + * flag set. + */ +typedef struct arc_buf_hdr_crypt { + abd_t *b_rabd; /* raw encrypted data */ + dmu_object_type_t b_ot; /* object type */ + uint32_t b_ebufcnt; /* number or encryped buffers */ + + /* dsobj for looking up encryption key for l2arc encryption */ + uint64_t b_dsobj; /* for looking up key */ + + /* encryption parameters */ + uint8_t b_salt[ZIO_DATA_SALT_LEN]; + uint8_t b_iv[ZIO_DATA_IV_LEN]; + + /* + * Technically this could be removed since we will always be able to + * get the mac from the bp when we need it. However, it is inconvenient + * for callers of arc code to have to pass a bp in all the time. This + * also allows us to assert that L2ARC data is properly encrypted to + * match the data in the main storage pool. + */ + uint8_t b_mac[ZIO_DATA_MAC_LEN]; +} arc_buf_hdr_crypt_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* in-flight list of log blocks */ + list_t l2wcb_abd_list; +} l2arc_write_callback_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; + /* + * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED + * is set and the L1 header exists. + */ + arc_buf_hdr_crypt_t b_crypt_hdr; +}; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. + */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by various buffers and structures + * not actually backed with ARC buffers. This includes bonus + * buffers (allocated directly via zio_buf_* functions), + * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t + * cache), and dnode_t structures (allocated via dnode_t cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_other_size; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. + * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. See the comment above + * arcstat_mru_ghost_size for more details. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_l2_hdr_size; + /* + * Number of L2ARC log blocks written. These are used for restoring the + * L2ARC. Updated during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_log_blk_writes; + /* + * Moving average of the aligned size of the L2ARC log blocks, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC + * log blocks. + */ + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; + /* + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. + * Updated during L2ARC rebuild and during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_data_to_meta_ratio; + /* + * Number of times the L2ARC rebuild was successful for an L2ARC device. + */ + kstat_named_t arcstat_l2_rebuild_success; + /* + * Number of times the L2ARC rebuild failed because the device header + * was in an unsupported format or corrupted. + */ + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + /* + * Number of times the L2ARC rebuild failed because of IO errors + * while reading a log block. + */ + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + /* + * Number of times the L2ARC rebuild failed because of IO errors when + * reading the device header. + */ + kstat_named_t arcstat_l2_rebuild_abort_dh_errors; + /* + * Number of L2ARC log blocks which failed to be restored due to + * checksum errors. + */ + kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; + /* + * Number of times the L2ARC rebuild was aborted due to low system + * memory. + */ + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + /* Logical size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; + /* + * Number of L2ARC log entries (buffers) that were successfully + * restored in ARC. + */ + kstat_named_t arcstat_l2_rebuild_bufs; + /* + * Number of L2ARC log entries (buffers) already cached in ARC. These + * were not restored again. + */ + kstat_named_t arcstat_l2_rebuild_bufs_precached; + /* + * Number of L2ARC log blocks that were restored successfully. Each + * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. + */ + kstat_named_t arcstat_l2_rebuild_log_blks; + kstat_named_t arcstat_memory_throttle_count; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; +} arc_stats_t; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) + +extern arc_stats_t arc_stats; + +/* used in zdb.c */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 33cdfbeb4b..af8057be8f 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -792,6 +792,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 +#define SPA_ASYNC_L2CACHE_REBUILD 0x800 /* * Controls the behavior of spa_vdev_remove(). diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 254af68099..cd05edcffa 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -2179,9 +2179,22 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) - l2arc_add_vdev(spa, vd); + vd->vdev_aux == &spa->spa_l2cache) { + /* + * When reopening we can assume the device label has + * already the attribute l2cache_persistent, since we've + * opened the device in the past and updated the label. + * In case the vdev is present we should evict all ARC + * buffers and pointers to log blocks and reclaim their + * space before restoring its contents to L2ARC. + */ + if (l2arc_vdev_present(vd)) { + l2arc_rebuild_vdev(vd, B_TRUE); + } else { + l2arc_add_vdev(spa, vd); + } + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + } } else { (void) vdev_validate(vd); } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index f870d6ce7c..819905a8d9 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -561,6 +561,11 @@ typedef enum zfs_key_location { #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 +/* Persistent L2ARC version */ +#define L2ARC_PERSISTENT_VERSION_1 1ULL +#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 +#define L2ARC_PERSISTENT_VERSION_STRING "1" + /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ |