author    George Amanakis <gamanakis@gmail.com>  2020-07-30 18:40:44 -0500
committer Jason King <jason.king@joyent.com>     2020-10-16 11:10:02 -0500
commit    f0a052391861a2b96cf28973c3b7f2854591aa79 (patch)
tree      653d2330669b465bac1ab1c55b7e24c018cddc8a /usr/src/uts
parent    6218f28969018904255fddf306e6489c7ae28bba (diff)
3525 Persistent L2ARC
Portions contributed by: Saso Kiselkov <skiselkov@gmail.com>
Portions contributed by: Jorgen Lundman <lundman@lundman.net>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: Alexander Motin <mav@FreeBSD.org>
Portions contributed by: Jason King <jason.king@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c           | 2183
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c           |   13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h       |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc_impl.h  |  857
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h       |    1
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c          |   19
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h           |    5
7 files changed, 2466 insertions, 616 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 48ae3aa829..9a962b420e 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -24,6 +24,8 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
*/
/*
@@ -293,6 +295,7 @@
#include <sys/kstat.h>
#include <sys/zthr.h>
#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>
#include <sys/param.h>
@@ -407,54 +410,6 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
boolean_t zfs_compressed_arc_enabled = B_TRUE;
-/*
- * Note that buffers can be in one of 6 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recently used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * ARC_l2c_only - exists in L2ARC but not other states
- * When there are no active references to the buffer, they are
- * linked onto a list in one of these arc states. These are
- * the only buffers that can be evicted or deleted. Within each
- * state there are multiple lists, one for meta-data and one for
- * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
- * etc.) is tracked separately so that it can be managed more
- * explicitly: favored over data, limited explicitly.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru list.
- *
- * The ARC_l2c_only state is for buffers that are in the second
- * level ARC but no longer in any of the ARC_m* lists. The second
- * level ARC itself may also contain buffers that are in any of
- * the ARC_m* states - meaning that a buffer can exist in two
- * places. The reason for the ARC_l2c_only state is to keep the
- * buffer header in the hash table, so that reads that hit the
- * second level ARC benefit from these fast lookups.
- */
-
-typedef struct arc_state {
- /*
- * list of evictable buffers
- */
- multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
- /*
- * total amount of evictable data in this state
- */
- zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
- /*
- * total amount of data in this state; this includes: evictable,
- * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
- */
- zfs_refcount_t arcs_size;
-} arc_state_t;
-
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -463,263 +418,7 @@ static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;
-typedef struct arc_stats {
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_deleted;
- /*
- * Number of buffers that could not be evicted because the hash lock
- * was held by another thread. The lock may not necessarily be held
- * by something using the same buffer, since hash locks are shared
- * by multiple buffers.
- */
- kstat_named_t arcstat_mutex_miss;
- /*
- * Number of buffers skipped when updating the access state due to the
- * header having already been released after acquiring the hash lock.
- */
- kstat_named_t arcstat_access_skip;
- /*
- * Number of buffers skipped because they have I/O in progress, are
- * indirect prefetch buffers that have not lived long enough, or are
- * not from the spa we're trying to evict from.
- */
- kstat_named_t arcstat_evict_skip;
- /*
- * Number of times arc_evict_state() was unable to evict enough
- * buffers to reach its target amount.
- */
- kstat_named_t arcstat_evict_not_enough;
- kstat_named_t arcstat_evict_l2_cached;
- kstat_named_t arcstat_evict_l2_eligible;
- kstat_named_t arcstat_evict_l2_ineligible;
- kstat_named_t arcstat_evict_l2_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_size;
- /*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
- * Note that the compressed bytes may match the uncompressed bytes
- * if the block is either not compressed or compressed arc is disabled.
- */
- kstat_named_t arcstat_compressed_size;
- /*
- * Uncompressed size of the data stored in b_pabd. If compressed
- * arc is disabled then this value will be identical to the stat
- * above.
- */
- kstat_named_t arcstat_uncompressed_size;
- /*
- * Number of bytes stored in all the arc_buf_t's. This is classified
- * as "overhead" since this data is typically short-lived and will
- * be evicted from the arc when it becomes unreferenced unless the
- * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
- * values have been set (see comment in dbuf.c for more information).
- */
- kstat_named_t arcstat_overhead_size;
- /*
- * Number of bytes consumed by internal ARC structures necessary
- * for tracking purposes; these structures are not actually
- * backed by ARC buffers. This includes arc_buf_hdr_t structures
- * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
- * caches), and arc_buf_t structures (allocated via arc_buf_t
- * cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_hdr_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_DATA. This is generally consumed by buffers backing
- * on disk user data (e.g. plain file contents).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_data_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_METADATA. This is generally consumed by buffers
- * backing on disk data that is used for internal ZFS
- * structures (e.g. ZAP, dnode, indirect blocks, etc).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_metadata_size;
- /*
- * Number of bytes consumed by various buffers and structures
- * not actually backed with ARC buffers. This includes bonus
- * buffers (allocated directly via zio_buf_* functions),
- * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
- * cache), and dnode_t structures (allocated via dnode_t cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_other_size;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_anon state. This includes *all* buffers in the arc_anon
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mru state. This includes *all* buffers in the arc_mru
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mru_ghost state. The key thing to note
- * here, is the fact that this size doesn't actually indicate
- * RAM consumption. The ghost lists only consist of headers and
- * don't actually have ARC buffers linked off of these headers.
- * Thus, *if* the headers had associated ARC buffers, these
- * buffers *would have* consumed this number of bytes.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mfu state. This includes *all* buffers in the arc_mfu
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_size;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
- * state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_METADATA, and reside in the
- * arc_mfu state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mfu_ghost state. See the comment above
- * arcstat_mru_ghost_size for more details.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_metadata;
- kstat_named_t arcstat_l2_hits;
- kstat_named_t arcstat_l2_misses;
- kstat_named_t arcstat_l2_feeds;
- kstat_named_t arcstat_l2_rw_clash;
- kstat_named_t arcstat_l2_read_bytes;
- kstat_named_t arcstat_l2_write_bytes;
- kstat_named_t arcstat_l2_writes_sent;
- kstat_named_t arcstat_l2_writes_done;
- kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_lock_retry;
- kstat_named_t arcstat_l2_evict_lock_retry;
- kstat_named_t arcstat_l2_evict_reading;
- kstat_named_t arcstat_l2_evict_l1cached;
- kstat_named_t arcstat_l2_free_on_write;
- kstat_named_t arcstat_l2_abort_lowmem;
- kstat_named_t arcstat_l2_cksum_bad;
- kstat_named_t arcstat_l2_io_error;
- kstat_named_t arcstat_l2_lsize;
- kstat_named_t arcstat_l2_psize;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_memory_throttle_count;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_meta_used;
- kstat_named_t arcstat_meta_limit;
- kstat_named_t arcstat_meta_max;
- kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_async_upgrade_sync;
- kstat_named_t arcstat_demand_hit_predictive_prefetch;
- kstat_named_t arcstat_demand_hit_prescient_prefetch;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = {
+arc_stats_t arc_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "demand_data_hits", KSTAT_DATA_UINT64 },
@@ -795,6 +494,22 @@ static arc_stats_t arc_stats = {
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_count", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_success", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
@@ -805,14 +520,6 @@ static arc_stats_t arc_stats = {
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val))
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
#define ARCSTAT_MAX(stat, val) { \
uint64_t m; \
while ((val) > (m = arc_stats.stat.value.ui64) && \
@@ -843,6 +550,24 @@ static arc_stats_t arc_stats = {
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
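For illustration, ARCSTAT_F_AVG is an exponential moving average with weight 1/ARCSTAT_F_AVG_FACTOR. A minimal userland sketch of the same arithmetic (plain C; the kstat machinery, the _NOTE annotation, and the single-writer caveat above are set aside):

#include <stdint.h>
#include <stdio.h>

#define	AVG_FACTOR	3	/* stands in for ARCSTAT_F_AVG_FACTOR */

/* Same update rule as ARCSTAT_F_AVG: x' = x - x/F + value/F. */
static uint64_t
f_avg_update(uint64_t x, uint64_t value)
{
	return (x - x / AVG_FACTOR + value / AVG_FACTOR);
}

int
main(void)
{
	uint64_t avg = 0;

	/* Feed a constant sample; the average converges toward it. */
	for (int i = 0; i < 8; i++) {
		avg = f_avg_update(avg, 3000);
		printf("step %d: avg = %llu\n", i, (unsigned long long)avg);
	}
	return (0);
}

With a constant input of 3000 the average climbs 1000, 1667, 2112, ... toward 3000; this is the smoothing used for stats such as l2_log_blk_avg_asize added above.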
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
@@ -852,29 +577,6 @@ static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;
/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
-#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-
-/* compressed size of entire arc */
-#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
-/* uncompressed size of entire arc */
-#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
-/* number of bytes in the arc from arc_buf_t's */
-#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
-
-/*
* There are also some ARC variables that we want to export, but that are
* updated so often that having the canonical representation be the statistic
* variable causes a performance bottleneck. We want to use aggsum_t's for these
@@ -895,182 +597,6 @@ static hrtime_t arc_growtime;
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback {
- void *acb_private;
- arc_read_done_func_t *acb_done;
- arc_buf_t *acb_buf;
- boolean_t acb_encrypted;
- boolean_t acb_compressed;
- boolean_t acb_noauth;
- zbookmark_phys_t acb_zb;
- zio_t *acb_zio_dummy;
- zio_t *acb_zio_head;
- arc_callback_t *acb_next;
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback {
- void *awcb_private;
- arc_write_done_func_t *awcb_ready;
- arc_write_done_func_t *awcb_children_ready;
- arc_write_done_func_t *awcb_physdone;
- arc_write_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-/*
- * ARC buffers are separated into multiple structs as a memory saving measure:
- * - Common fields struct, always defined, and embedded within it:
- * - L2-only fields, always allocated but undefined when not in L2ARC
- * - L1-only fields, only allocated when in L1ARC
- *
- * Buffer in L1 Buffer only in L2
- * +------------------------+ +------------------------+
- * | arc_buf_hdr_t | | arc_buf_hdr_t |
- * | | | |
- * | | | |
- * | | | |
- * +------------------------+ +------------------------+
- * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
- * | (undefined if L1-only) | | |
- * +------------------------+ +------------------------+
- * | l1arc_buf_hdr_t |
- * | |
- * | |
- * | |
- * | |
- * +------------------------+
- *
- * Because it's possible for the L2ARC to become extremely large, we can wind
- * up eating a lot of memory in L2ARC buffer headers, so the size of a header
- * is minimized by only allocating the fields necessary for an L1-cached buffer
- * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
- * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
- * words in pointers. arc_hdr_realloc() is used to switch a header between
- * these two allocation states.
- */
-typedef struct l1arc_buf_hdr {
- kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
-#ifdef ZFS_DEBUG
- /*
- * Used for debugging with kmem_flags - by allocating and freeing
- * b_thawed when the buffer is thawed, we get a record of the stack
- * trace that thawed it.
- */
- void *b_thawed;
-#endif
-
- arc_buf_t *b_buf;
- uint32_t b_bufcnt;
- /* for waiting on writes to complete */
- kcondvar_t b_cv;
- uint8_t b_byteswap;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- multilist_node_t b_arc_node;
-
- /* updated atomically */
- clock_t b_arc_access;
-
- /* self protecting */
- zfs_refcount_t b_refcnt;
-
- arc_callback_t *b_acb;
- abd_t *b_pabd;
-} l1arc_buf_hdr_t;
-
-/*
- * Encrypted blocks will need to be stored encrypted on the L2ARC
- * disk as they appear in the main pool. In order for this to work we
- * need to pass around the encryption parameters so they can be used
- * to write data to the L2ARC. This struct is only defined in the
- * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
- * flag set.
- */
-typedef struct arc_buf_hdr_crypt {
- abd_t *b_rabd; /* raw encrypted data */
- dmu_object_type_t b_ot; /* object type */
- uint32_t b_ebufcnt; /* number of encrypted buffers */
-
- /* dsobj for looking up encryption key for l2arc encryption */
- uint64_t b_dsobj; /* for looking up key */
-
- /* encryption parameters */
- uint8_t b_salt[ZIO_DATA_SALT_LEN];
- uint8_t b_iv[ZIO_DATA_IV_LEN];
-
- /*
- * Technically this could be removed since we will always be able to
- * get the mac from the bp when we need it. However, it is inconvenient
- * for callers of arc code to have to pass a bp in all the time. This
- * also allows us to assert that L2ARC data is properly encrypted to
- * match the data in the main storage pool.
- */
- uint8_t b_mac[ZIO_DATA_MAC_LEN];
-} arc_buf_hdr_crypt_t;
-
-typedef struct l2arc_dev l2arc_dev_t;
-
-typedef struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
-
- list_node_t b_l2node;
-} l2arc_buf_hdr_t;
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
-
- arc_buf_contents_t b_type;
- arc_buf_hdr_t *b_hash_next;
- arc_flags_t b_flags;
-
- /*
- * This field stores the size of the data buffer after
- * compression, and is set in the arc's zio completion handlers.
- * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
- *
- * While the block pointers can store up to 32MB in their psize
- * field, we can only store up to 32MB minus 512B. This is due
- * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
- * a field of zeros represents 512B in the bp). We can't use a
- * bias of 1 since we need to reserve a psize of zero, here, to
- * represent holes and embedded blocks.
- *
- * This isn't a problem in practice, since the maximum size of a
- * buffer is limited to 16MB, so we never need to store 32MB in
- * this field. Even in the upstream illumos code base, the
- * maximum size of a buffer is limited to 16MB.
- */
- uint16_t b_psize;
-
- /*
- * This field stores the size of the data buffer before
- * compression, and cannot change once set. It is in units
- * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
- */
- uint16_t b_lsize; /* immutable */
- uint64_t b_spa; /* immutable */
-
- /* L2ARC fields. Undefined when not in L2ARC. */
- l2arc_buf_hdr_t b_l2hdr;
- /* L1ARC fields. Undefined when in l2arc_only state */
- l1arc_buf_hdr_t b_l1hdr;
- /*
- * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
- * is set and the L1 header exists.
- */
- arc_buf_hdr_crypt_t b_crypt_hdr;
-};
-
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
@@ -1192,20 +718,6 @@ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
-struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-};
-
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
@@ -1223,11 +735,6 @@ typedef struct l2arc_read_callback {
abd_t *l2rcb_abd; /* temporary buffer */
} l2arc_read_callback_t;
-typedef struct l2arc_write_callback {
- l2arc_dev_t *l2wcb_dev; /* device info */
- arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
-} l2arc_write_callback_t;
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
abd_t *l2df_abd;
@@ -1240,6 +747,9 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
+static kmutex_t l2arc_rebuild_thr_lock;
+static kcondvar_t l2arc_rebuild_thr_cv;
+
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
typedef enum arc_fill_flags {
ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
@@ -1259,6 +769,7 @@ static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
+static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -1297,6 +808,9 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
((hdr)->b_dva.dva_word[0] == 0 && \
(hdr)->b_dva.dva_word[1] == 0)
+#define HDR_EMPTY_OR_LOCKED(hdr) \
+ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
+
#define HDR_EQUAL(spa, dva, birth, hdr) \
((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
@@ -1725,8 +1239,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
{
- ASSERT(hdr->b_l1hdr.b_state == arc_anon ||
- MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
if (!ARC_BUF_COMPRESSED(b)) {
@@ -2010,14 +1523,14 @@ arc_buf_freeze(arc_buf_t *buf)
static inline void
arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
hdr->b_flags |= flags;
}
static inline void
arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
hdr->b_flags &= ~flags;
}
@@ -2031,7 +1544,7 @@ arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
static void
arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Holes and embedded blocks will always have a psize = 0 so
@@ -2124,7 +1637,7 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
void *tmpbuf = NULL;
abd_t *abd = hdr->b_l1hdr.b_pabd;
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_AUTHENTICATED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
@@ -2194,7 +1707,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
boolean_t no_crypt = B_FALSE;
boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_ENCRYPTED(hdr));
arc_hdr_alloc_pabd(hdr, B_FALSE);
@@ -2314,7 +1827,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
ASSERT(HDR_ENCRYPTED(hdr));
ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
@@ -2634,7 +2147,7 @@ static void
add_reference(arc_buf_hdr_t *hdr, void *tag)
{
ASSERT(HDR_HAS_L1HDR(hdr));
- if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+ if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -3039,7 +2552,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Only honor requests for compressed bufs if the hdr is actually
@@ -3159,6 +2672,58 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
return (buf);
}
+/*
+ * Performance tuning of L2ARC persistence:
+ *
+ * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
+ * an L2ARC device (either at pool import or later) will attempt
+ * to rebuild L2ARC buffer contents.
+ * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
+ * whether log blocks are written to the L2ARC device. If the L2ARC
+ * device is less than 1GB, the amount of data l2arc_evict()
+ * evicts is significant compared to the amount of restored L2ARC
+ * data. In this case do not write log blocks in L2ARC in order
+ * not to waste space.
+ */
+int l2arc_rebuild_enabled = B_TRUE;
+unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+
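As a rough standalone sketch of how the second tunable gates persistence (the real check lives in l2arc_rebuild_vdev(), further down in this patch, and disables log blocks by setting l2ad_log_entries to 0; the names below are local stand-ins):

#include <stdint.h>
#include <stdio.h>

/* Local copy of the tunable above (1GB). */
static uint64_t min_l2size = 1024ULL * 1024 * 1024;

/*
 * Devices smaller than the threshold get no log blocks written, and
 * therefore no persistent L2ARC.
 */
static int
log_blocks_enabled(uint64_t dev_size)
{
	return (dev_size >= min_l2size);
}

int
main(void)
{
	printf("512MB L2ARC: %s\n", log_blocks_enabled(512ULL << 20) ?
	    "log blocks written" : "log blocks skipped");
	printf("4GB L2ARC: %s\n", log_blocks_enabled(4ULL << 30) ?
	    "log blocks written" : "log blocks skipped");
	return (0);
}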
+/* L2ARC persistence rebuild control routines. */
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+
+/* L2ARC persistence read I/O routines. */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
/*
* Return a loaned arc buffer to the arc.
@@ -3247,7 +2812,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!ARC_BUF_ENCRYPTED(buf));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Start sharing the data buffer. We transfer the
@@ -3280,7 +2845,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* We are no longer sharing this buffer so we need
@@ -3315,7 +2880,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_buf_t *lastbuf = NULL;
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Remove the buf from the hdr list and locate the last
@@ -3363,7 +2928,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@@ -3841,7 +3406,6 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
@@ -3852,6 +3416,42 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
}
/*
+ * Allocates an ARC buf header that's in an evicted & L2-cached state.
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc.
+ */
+arc_buf_hdr_t *
+arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
+ dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
+ enum zio_compress compress, boolean_t protected, boolean_t prefetch)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(size != 0);
+ hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
+ hdr->b_birth = birth;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
+ HDR_SET_LSIZE(hdr, size);
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+ if (prefetch)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+
+ hdr->b_dva = dva;
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = daddr;
+
+ return (hdr);
+}
+
+/*
* Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
* for bufs containing metadata.
*/
@@ -3866,7 +3466,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
@@ -3907,7 +3506,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
compression_type, type, B_TRUE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
@@ -3966,9 +3564,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- if (!HDR_EMPTY(hdr))
- buf_discard_identity(hdr);
-
if (HDR_HAS_L2HDR(hdr)) {
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
@@ -3992,6 +3587,15 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
mutex_exit(&dev->l2ad_mtx);
}
+ /*
+ * The header's identity can only be safely discarded once it is no
+ * longer discoverable. This requires removing it from the hash table
+ * and the l2arc header list. After this point the hash lock can not
+ * be used to protect the header.
+ */
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
if (HDR_HAS_L1HDR(hdr)) {
arc_cksum_free(hdr);
@@ -4005,9 +3609,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
}
#endif
- if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL)
arc_hdr_free_pabd(hdr, B_FALSE);
- }
if (HDR_HAS_RABD(hdr))
arc_hdr_free_pabd(hdr, B_TRUE);
@@ -4032,7 +3635,6 @@ void
arc_buf_destroy(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
if (hdr->b_l1hdr.b_state == arc_anon) {
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
@@ -4042,7 +3644,9 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
return;
}
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+
ASSERT3P(hdr, ==, buf->b_hdr);
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
@@ -6886,8 +6490,8 @@ arc_write_done(zio_t *zio)
ASSERT(zfs_refcount_is_zero(
&exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
arc_hdr_destroy(exists);
+ mutex_exit(hash_lock);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
@@ -7659,6 +7263,103 @@ arc_fini(void)
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
+ * which is an additional piece of metadata which describes what's been
+ * written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ * main ARC buffers. There are 2 linked-lists of log blocks headed by
+ * dh_start_lbps[2]. We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than for correctness. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen on the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: we only find out the address of the next
+ * log block once the current block has been completely read in, so a
+ * simple list would keep the device's I/O queue only one operation
+ * deep, incurring a large amount of I/O round-trip latency. Having two
+ * lists allows us to fetch two log blocks ahead of where we are
+ * currently rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA with the
+ * birth TXG uniquely identify a block in space and time - once created,
+ * a block is immutable on disk. The worst thing we have done is wasted
+ * some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
*/
static boolean_t
@@ -7679,9 +7380,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
}
static uint64_t
-l2arc_write_size(void)
+l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size;
+ uint64_t size, dev_size;
/*
* Make sure our globals have meaningful values in case the user
@@ -7698,6 +7399,25 @@ l2arc_write_size(void)
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+ /*
+ * Make sure the write size does not exceed the size of the cache
+ * device. This is important in l2arc_evict(), otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+ if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%" PRIu64 " bytes) exceeds the size of the cache device "
+ "(guid %" PRIu64 "), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
+ dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+ }
+
return (size);
}
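A standalone sketch of the clamping rule introduced here (the fixed overhead is a placeholder; the real code derives it from l2arc_log_blk_overhead() and re-adds l2arc_write_boost after resetting):

#include <stdint.h>
#include <stdio.h>

#define	L2ARC_WRITE_SIZE	(8ULL << 20)	/* 8MB default, as in arc.c */

/*
 * If the requested write size plus log block overhead would reach the
 * end of the device, fall back to the default so l2arc_evict() cannot
 * iterate forever.
 */
static uint64_t
clamped_write_size(uint64_t size, uint64_t overhead, uint64_t dev_size)
{
	if (size + overhead >= dev_size)
		size = L2ARC_WRITE_SIZE;
	return (size);
}

int
main(void)
{
	/* A 1GB write target on a 256MB device resets to the 8MB default. */
	printf("%llu\n", (unsigned long long)clamped_write_size(
	    1ULL << 30, 1ULL << 20, 256ULL << 20));
	return (0);
}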
@@ -7763,10 +7483,10 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
@@ -7815,16 +7535,20 @@ l2arc_do_free_on_write()
static void
l2arc_write_done(zio_t *zio)
{
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
dev = cb->l2wcb_dev;
+ l2dhdr = dev->l2ad_dev_hdr;
ASSERT3P(dev, !=, NULL);
head = cb->l2wcb_head;
ASSERT3P(head, !=, NULL);
@@ -7913,12 +7637,72 @@ top:
mutex_exit(hash_lock);
}
+ /*
+ * Free the allocated abd buffers for writing the log blocks.
+ * If the zio failed reclaim the allocated space and remove the
+ * pointers to these log blocks from the log block pointer list
+ * of the L2ARC device.
+ */
+ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+ abd_free(abd_buf->abd);
+ zio_buf_free(abd_buf, sizeof (*abd_buf));
+ if (zio->io_error != 0) {
+ lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+ /*
+ * L2BLK_GET_PSIZE returns aligned size for log
+ * blocks.
+ */
+ uint64_t asize =
+ L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+ bytes_dropped += asize;
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+ list_destroy(&cb->l2wcb_abd_list);
+
+ if (zio->io_error != 0) {
+ /*
+ * Restore the lbps array in the header to its previous state.
+ * If the list of log block pointers is empty, zero out the
+ * log block pointers in the device header.
+ */
+ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
+ for (int i = 0; i < 2; i++) {
+ if (lb_ptr_buf == NULL) {
+ /*
+ * If the list is empty zero out the device
+ * header. Otherwise zero out the second log
+ * block pointer in the header.
+ */
+ if (i == 0) {
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ } else {
+ bzero(&l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ }
+ break;
+ }
+ bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
+ lb_ptr_buf);
+ }
+ }
+
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
mutex_exit(&dev->l2ad_mtx);
+ ASSERT(dev->l2ad_vdev != NULL);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
l2arc_do_free_on_write();
@@ -8110,7 +7894,6 @@ l2arc_read_done(zio_t *zio)
zio->io_private = hdr;
arc_read_done(zio);
} else {
- mutex_exit(hash_lock);
/*
* Buffer didn't survive caching. Increment stats and
* reissue to the original storage device.
@@ -8135,10 +7918,24 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
- zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+ zio = zio_read(pio, zio->io_spa, zio->io_bp,
abd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
- &cb->l2rcb_zb));
+ &cb->l2rcb_zb);
+
+ /*
+ * Original ZIO will be freed, so we need to update
+ * ARC header with the new ZIO pointer to be used
+ * by zio_change_priority() in arc_read().
+ */
+ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
+ acb != NULL; acb = acb->acb_next)
+ acb->acb_zio_head = zio;
+
+ mutex_exit(hash_lock);
+ zio_nowait(zio);
+ } else {
+ mutex_exit(hash_lock);
}
}
@@ -8189,8 +7986,31 @@ l2arc_sublist_lock(int list_num)
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+ if (dev->l2ad_log_entries == 0) {
+ return (0);
+ } else {
+ uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+ uint64_t log_blocks = (log_entries +
+ dev->l2ad_log_entries - 1) /
+ dev->l2ad_log_entries;
+
+ return (vdev_psize_to_asize(dev->l2ad_vdev,
+ sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+ }
+}
+
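As a worked example of this overhead calculation (assuming 1022 log entries per block, as discussed later in this patch, and a placeholder 64KB allocated size per log block in place of vdev_psize_to_asize()):

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units, as in ZFS */

int
main(void)
{
	uint64_t write_sz = 8ULL << 20;		/* 8MB write */
	uint64_t entries_per_blk = 1022;	/* dev->l2ad_log_entries */
	uint64_t lb_asize = 64ULL << 10;	/* assumed log block asize */

	/* Worst case: one log entry per SPA_MINBLOCKSIZE of payload. */
	uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;	/* 16384 */
	uint64_t log_blocks =
	    (log_entries + entries_per_blk - 1) / entries_per_blk;	/* 17 */

	printf("overhead = %llu bytes\n",	/* 17 * 64KB ~= 1.06MB */
	    (unsigned long long)(log_blocks * lb_asize));
	return (0);
}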
+/*
* Evict buffers from the device write hand to the distance specified in
- * bytes. This distance may span populated buffers, it may span nothing.
+ * bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
@@ -8201,22 +8021,28 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ boolean_t rerun;
buflist = &dev->l2ad_buflist;
- if (!all && dev->l2ad_first) {
- /*
- * This is the first sweep through the device. There is
- * nothing to evict.
- */
- return;
- }
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance, dev);
- if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+top:
+ rerun = B_FALSE;
+ if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
/*
- * When nearing the end of the device, evict to the end
- * before the device write hand jumps to the start.
+ * When there is no space to accommodate upcoming writes,
+ * evict to the end. Then bump the write and evict hands
+ * to the start and iterate. This iteration does not
+ * happen indefinitely as we make sure in
+ * l2arc_write_size() that when the write hand is reset,
+ * the write size does not exceed the end of the device.
*/
+ rerun = B_TRUE;
taddr = dev->l2ad_end;
} else {
taddr = dev->l2ad_hand + distance;
@@ -8224,11 +8050,68 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
-top:
+ /*
+ * This check has to be placed after deciding whether to iterate
+ * (rerun).
+ */
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ goto out;
+ }
+
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand from the header of
+ * the device. Of note, l2arc_evict() does not actually delete buffers
+ * from the cache device, but keeping track of the evict hand will be
+ * useful when TRIM is implemented.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+
+retry:
mutex_enter(&dev->l2ad_mtx);
+ /*
+ * We have to account for evicted log blocks. Run vdev_space_update()
+ * on log blocks whose offset (in bytes) is before the evicted offset
+ * (in bytes) by searching in the list of pointers to log blocks
+ * present in the L2ARC device.
+ */
+ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+ lb_ptr_buf = lb_ptr_buf_prev) {
+
+ lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE(
+ (lb_ptr_buf->lb_ptr)->lbp_prop);
+
+ /*
+ * We don't worry about log blocks left behind (i.e.
+ * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
+ * will never write more than l2arc_evict() evicts.
+ */
+ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+ break;
+ } else {
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
+ ASSERT(!HDR_EMPTY(hdr));
hash_lock = HDR_LOCK(hdr);
/*
@@ -8244,7 +8127,7 @@ top:
mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
- goto top;
+ goto retry;
}
/*
@@ -8256,7 +8139,7 @@ top:
ASSERT(!HDR_L2_WRITING(hdr));
ASSERT(!HDR_L2_WRITE_HEAD(hdr));
- if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
+ if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
@@ -8293,6 +8176,26 @@ top:
mutex_exit(hash_lock);
}
mutex_exit(&dev->l2ad_mtx);
+
+out:
+ /*
+ * We need to check if we evict all buffers, otherwise we may iterate
+ * unnecessarily.
+ */
+ if (!all && rerun) {
+ /*
+ * Bump device hand to the device start if it is approaching the
+ * end. l2arc_evict() has already evicted ahead for this case.
+ */
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ goto top;
+ }
+
+ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
}
/*
@@ -8412,6 +8315,17 @@ error:
return (ret);
}
+static void
+l2arc_blk_fetch_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+
+ cb = zio->io_private;
+ if (cb->l2rcb_abd != NULL)
+ abd_put(cb->l2rcb_abd);
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
/*
* Find and write ARC buffers to the L2ARC device.
*
@@ -8421,17 +8335,18 @@ error:
* state between calls to this function.
*
* Returns the number of bytes actually written (which may be smaller than
- * the delta by which the device hand has changed due to alignment).
+ * the delta by which the device hand has changed due to alignment and the
+ * writing of log blocks).
*/
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
- l2arc_write_callback_t *cb;
- zio_t *pio, *wzio;
- uint64_t guid = spa_load_guid(spa);
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb = NULL;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
ASSERT3P(dev->l2ad_vdev, !=, NULL);
@@ -8483,7 +8398,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}
passed_sz += HDR_GET_LSIZE(hdr);
- if (passed_sz > headroom) {
+ if (l2arc_headroom != 0 && passed_sz > headroom) {
/*
* Searched too far.
*/
@@ -8583,6 +8498,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ /*
+ * Create a list to save allocated abd buffers
+ * for l2arc_log_blk_commit().
+ */
+ list_create(&cb->l2wcb_abd_list,
+ sizeof (l2arc_lb_abd_buf_t),
+ offsetof(l2arc_lb_abd_buf_t, node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
@@ -8616,6 +8538,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
mutex_exit(hash_lock);
+ /*
+ * Append buf info to current log and commit if full.
+ * arcstat_l2_{size,asize} kstats are updated
+ * internally.
+ */
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
+
(void) zio_nowait(wzio);
}
@@ -8630,28 +8560,36 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT0(write_lsize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
+
+ /*
+ * Although we did not write any buffers l2ad_evict may
+ * have advanced.
+ */
+ l2arc_dev_hdr_update(dev);
+
return (0);
}
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
+
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
ARCSTAT_INCR(arcstat_l2_psize, write_psize);
- /*
- * Bump device hand to the device start if it is approaching the end.
- * l2arc_evict() will already have evicted ahead for this case.
- */
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- dev->l2ad_hand = dev->l2ad_start;
- dev->l2ad_first = B_FALSE;
- }
-
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
+ /*
+ * Update the device header after the zio completes as
+ * l2arc_write_done() may have updated the memory holding the log block
+ * pointers in the device header.
+ */
+ l2arc_dev_hdr_update(dev);
+
return (write_asize);
}
@@ -8728,7 +8666,7 @@ l2arc_feed_thread(void *unused)
ARCSTAT_BUMP(arcstat_l2_feeds);
- size = l2arc_write_size();
+ size = l2arc_write_size(dev);
/*
* Evict L2ARC buffers that will be overwritten.
@@ -8756,7 +8694,17 @@ l2arc_feed_thread(void *unused)
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
- l2arc_dev_t *dev;
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+/*
+ * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
+ * the vdev_t isn't an L2ARC device.
+ */
+static l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
@@ -8766,7 +8714,7 @@ l2arc_vdev_present(vdev_t *vd)
}
mutex_exit(&l2arc_dev_mtx);
- return (dev != NULL);
+ return (dev);
}
/*
@@ -8776,7 +8724,8 @@ l2arc_vdev_present(vdev_t *vd)
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
- l2arc_dev_t *adddev;
+ l2arc_dev_t *adddev;
+ uint64_t l2dhdr_asize;
ASSERT(!l2arc_vdev_present(vd));
@@ -8786,11 +8735,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ /* leave extra size for an l2arc device header */
+ l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
+ MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
@@ -8800,8 +8755,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+ /*
+ * This is a list of pointers to log blocks that are still present
+ * on the device.
+ */
+ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
+ offsetof(l2arc_lb_ptr_buf_t, node));
+
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
zfs_refcount_create(&adddev->l2ad_alloc);
+ zfs_refcount_create(&adddev->l2ad_lb_asize);
+ zfs_refcount_create(&adddev->l2ad_lb_count);
/*
* Add device to global list
@@ -8810,6 +8774,87 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Decide if vdev is eligible for L2ARC rebuild
+ */
+ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
+}
+
+void
+l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
+{
+ l2arc_dev_t *dev = NULL;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ uint64_t l2dhdr_asize;
+ spa_t *spa;
+ int err;
+ boolean_t l2dhdr_valid = B_TRUE;
+
+ dev = l2arc_vdev_get(vd);
+ ASSERT3P(dev, !=, NULL);
+ spa = dev->l2ad_spa;
+ l2dhdr = dev->l2ad_dev_hdr;
+ l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+
+ /*
+	 * The L2ARC has to hold at least the payload of one log block for
+	 * its buffers to be restored (persistent L2ARC). The payload of a
+	 * log block depends on the number of its log entries. We always
+	 * write log blocks with 1022 entries. How many of them are committed
+	 * or restored depends on the size of the L2ARC device. Thus the
+	 * maximum payload of one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB.
+	 * If the L2ARC device is smaller than that, we reduce the number of
+	 * committed and restored log entries per block so as to enable
+	 * persistence.
+ */
+ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
+ dev->l2ad_log_entries = 0;
+ } else {
+ dev->l2ad_log_entries = MIN((dev->l2ad_end -
+ dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
+ L2ARC_LOG_BLK_MAX_ENTRIES);
+ }
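+
+	/*
+	 * A worked example of the sizing above (values are illustrative,
+	 * not defaults): on a 16 GB device, (l2ad_end - l2ad_start) >>
+	 * SPA_MAXBLOCKSHIFT is about 2^34 >> 24 = 1024, so l2ad_log_entries
+	 * is capped at L2ARC_LOG_BLK_MAX_ENTRIES (1022); on a 4 GB device
+	 * the same shift yields only about 256 entries per log block.
+	 */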
+
+ /*
+	 * Read the device header; if an error is returned, do not rebuild
+	 * the L2ARC.
+ */
+ if ((err = l2arc_dev_hdr_read(dev)) != 0)
+ l2dhdr_valid = B_FALSE;
+
+ if (l2dhdr_valid && dev->l2ad_log_entries > 0) {
+ /*
+ * If we are onlining a cache device (vdev_reopen) that was
+ * still present (l2arc_vdev_present()) and rebuild is enabled,
+ * we should evict all ARC buffers and pointers to log blocks
+ * and reclaim their space before restoring its contents to
+ * L2ARC.
+ */
+ if (reopen) {
+ if (!l2arc_rebuild_enabled) {
+ return;
+ } else {
+ l2arc_evict(dev, 0, B_TRUE);
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+ }
+ }
+		/*
+		 * Just mark the device as pending for a rebuild. We won't
+		 * be starting a rebuild inline here, as it would block pool
+		 * import. Instead, spa_load_impl() will hand that off to an
+		 * async task, which will call l2arc_spa_rebuild_start().
+		 */
+ dev->l2ad_rebuild = B_TRUE;
+ } else if (spa_writeable(spa)) {
+ /*
+ * In this case create a new header. We zero out the memory
+ * holding the header to reset dh_start_lbps.
+ */
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
/*
@@ -8818,24 +8863,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
void
l2arc_remove_vdev(vdev_t *vd)
{
- l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+ l2arc_dev_t *remdev = NULL;
/*
* Find the device by vdev
*/
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
- nextdev = list_next(l2arc_dev_list, dev);
- if (vd == dev->l2ad_vdev) {
- remdev = dev;
- break;
- }
- }
+ remdev = l2arc_vdev_get(vd);
ASSERT3P(remdev, !=, NULL);
/*
+ * Cancel any ongoing or scheduled rebuild.
+ */
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (remdev->l2ad_rebuild_began == B_TRUE) {
+ remdev->l2ad_rebuild_cancel = B_TRUE;
+ while (remdev->l2ad_rebuild == B_TRUE)
+ cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ /*
* Remove device from global list
*/
+ mutex_enter(&l2arc_dev_mtx);
list_remove(l2arc_dev_list, remdev);
l2arc_dev_last = NULL; /* may have been invalidated */
atomic_dec_64(&l2arc_ndev);
@@ -8846,8 +8896,13 @@ l2arc_remove_vdev(vdev_t *vd)
*/
l2arc_evict(remdev, 0, B_TRUE);
list_destroy(&remdev->l2ad_buflist);
+ ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
+ list_destroy(&remdev->l2ad_lbptr_list);
mutex_destroy(&remdev->l2ad_mtx);
zfs_refcount_destroy(&remdev->l2ad_alloc);
+ zfs_refcount_destroy(&remdev->l2ad_lb_asize);
+ zfs_refcount_destroy(&remdev->l2ad_lb_count);
+ kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -8861,6 +8916,8 @@ l2arc_init(void)
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -8885,6 +8942,8 @@ l2arc_fini(void)
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_rebuild_thr_lock);
+ cv_destroy(&l2arc_rebuild_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
@@ -8915,3 +8974,901 @@ l2arc_stop(void)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called after pool import from the spa async thread, since starting
+ * these threads directly from spa_import() will make them part of the
+ * "zpool import" context and delay process exit (and thus pool import).
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ l2arc_dev_t *dev =
+ l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ if (dev == NULL) {
+ /* Don't attempt a rebuild if the vdev is UNAVAIL */
+ continue;
+ }
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild_began = B_TRUE;
+ (void) thread_create(NULL, 0,
+ (void (*)(void *))l2arc_dev_rebuild_start,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ }
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+{
+ VERIFY(!dev->l2ad_rebuild_cancel);
+ VERIFY(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ dev->l2ad_rebuild_began = B_FALSE;
+ dev->l2ad_rebuild = B_FALSE;
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild: it reads the
+ * log block chain and restores each block's contents to memory,
+ * reconstructing arc_buf_hdr_t's.
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log block chain.
+ * 2) We encounter *any* error condition (cksum errors, I/O errors).
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ vdev_t *vd = dev->l2ad_vdev;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ zio_t *this_io = NULL, *next_io = NULL;
+ l2arc_log_blkptr_t lbps[2];
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ boolean_t lock_held;
+
+ this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+
+ /*
+ * We prevent device removal while issuing reads to the device,
+ * then during the rebuilding phases we drop this lock again so
+ * that a spa_unload or device remove can be initiated - this is
+ * safe, because the spa will signal us to stop before removing
+ * our device and wait for us to stop.
+ */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ lock_held = B_TRUE;
+
+ /*
+ * Retrieve the persistent L2ARC device state.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
+ dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
+ L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
+ dev->l2ad_start);
+ dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+ /*
+	 * If the ZFS module parameter l2arc_rebuild_enabled is false,
+	 * we do not start the rebuild process.
+ */
+ if (!l2arc_rebuild_enabled)
+ goto out;
+
+ /* Prepare the rebuild process */
+ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
+ this_lb, next_lb, this_io, &next_io)) != 0)
+ goto out;
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blocks, so the user may choose to offline/
+ * online the L2ARC dev at a later time (or re-import the pool)
+ * to reconstruct it (when there's less memory pressure).
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ lock_held = B_FALSE;
+
+ /*
+ * Now that we know that the next_lb checks out alright, we
+ * can start reconstruction from this log block.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr);
+
+ /*
+		 * Log block restored; include its pointer in the list of
+		 * pointers to log blocks present in the L2ARC device.
+ */
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+ KM_SLEEP);
+ bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(vd, asize, 0, 0);
+
+ /* BEGIN CSTYLED */
+ /*
+ * Protection against loops of log blocks:
+ *
+ * l2ad_hand l2ad_evict
+ * V V
+ * l2ad_start |=======================================| l2ad_end
+ * -----|||----|||---|||----|||
+ * (3) (2) (1) (0)
+ * ---|||---|||----|||---|||
+ * (7) (6) (5) (4)
+ *
+ * In this situation the pointer of log block (4) passes
+ * l2arc_log_blkptr_valid() but the log block should not be
+ * restored as it is overwritten by the payload of log block
+ * (0). Only log blocks (0)-(3) should be restored. We check
+ * whether l2ad_evict lies in between the payload starting
+ * offset of the next log block (lbps[1].lbp_payload_start)
+ * and the payload starting offset of the present log block
+ * (lbps[0].lbp_payload_start). If true and this isn't the
+ * first pass, we are looping from the beginning and we should
+ * stop.
+ */
+ /* END CSTYLED */
+ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+ lbps[0].lbp_payload_start, dev->l2ad_evict) &&
+ !dev->l2ad_first)
+ goto out;
+
+ for (;;) {
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild = B_FALSE;
+ cv_signal(&l2arc_rebuild_thr_cv);
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ err = SET_ERROR(ECANCELED);
+ goto out;
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+ RW_READER)) {
+ lock_held = B_TRUE;
+ break;
+ }
+ /*
+			 * The L2ARC config lock is held by somebody as
+			 * writer, possibly because they are trying to remove
+			 * us. They'll likely want us to shut down, so after a
+			 * little delay we check l2ad_rebuild_cancel and retry
+			 * the lock again.
+ */
+ delay(1);
+ }
+
+ /*
+ * Continue with the next log block.
+ */
+ lbps[0] = lbps[1];
+ lbps[1] = this_lb->lb_prev_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ this_io = next_io;
+ next_io = NULL;
+ }
+
+ if (this_io != NULL)
+ l2arc_log_blk_fetch_abort(this_io);
+out:
+ if (next_io != NULL)
+ l2arc_log_blk_fetch_abort(next_io);
+ kmem_free(this_lb, sizeof (*this_lb));
+ kmem_free(next_lb, sizeof (*next_lb));
+
+ if (!l2arc_rebuild_enabled) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "disabled");
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_success);
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "successful, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
+ /*
+ * No error but also nothing restored, meaning the lbps array
+ * in the device header points to invalid/non-present log
+ * blocks. Reset the header.
+ */
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "no valid log blocks");
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ } else if (err != 0) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "aborted, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ }
+
+ if (lock_held)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ return (err);
+}
+
+/*
+ * Attempts to read the device header on the provided L2ARC device and writes
+ * it to dev->l2ad_dev_hdr. On success this function returns 0, otherwise an
+ * appropriate error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev)
+{
+ int err;
+ uint64_t guid;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
+ ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_SPECULATIVE, B_FALSE));
+
+ abd_put(abd);
+
+ if (err != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ return (err);
+ }
+
+ if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
+
+ if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ l2dhdr->dh_spa_guid != guid ||
+ l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
+ l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
+ l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
+ l2dhdr->dh_end != dev->l2ad_end ||
+ !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
+ l2dhdr->dh_evict)) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool or from another
+ * version of persistent L2ARC.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log
+ * block address in the block chain. Similarly, this_lb and next_lb hold
+ * the l2arc_log_blk_phys_t's of the current and next L2ARC log blocks.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of fetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+ abd_t *abd = NULL;
+ uint64_t asize;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log block in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+ this_lb);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log block early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log block.
+ */
+ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+ next_lb);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+ "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
+ dev->l2ad_vdev->vdev_guid);
+ goto cleanup;
+ }
+
+ /*
+ * Make sure the buffer checks out.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
+ fletcher_4_native(this_lb, asize, NULL, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
+ zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
+ "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
+ this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid,
+ dev->l2ad_hand, dev->l2ad_evict);
+ err = SET_ERROR(ECKSUM);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
+ case ZIO_COMPRESS_OFF:
+ break;
+ case ZIO_COMPRESS_LZ4:
+ abd = abd_alloc_for_io(asize, B_TRUE);
+ abd_copy_from_buf_off(abd, this_lb, 0, asize);
+ if ((err = zio_decompress_data(
+ L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
+ abd, this_lb, asize, sizeof (*this_lb))) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight fetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_fetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ if (abd != NULL)
+ abd_free(abd);
+ return (err);
+}
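+
+/*
+ * A minimal sketch of the fetch protocol described above; this mirrors the
+ * loop in l2arc_rebuild() and the names are illustrative only:
+ *
+ *	zio_t *this_io = NULL, *next_io = NULL;
+ *	for (;;) {
+ *		if (l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb,
+ *		    next_lb, this_io, &next_io) != 0)
+ *			break;		/* on error, next_io is aborted */
+ *		...			/* restore this_lb's payload */
+ *		lbps[0] = lbps[1];
+ *		lbps[1] = this_lb->lb_prev_lbp;
+ *		PTR_SWAP(this_lb, next_lb);
+ *		this_io = next_io;	/* hand over the prefetched IO */
+ *		next_io = NULL;
+ *	}
+ *	if (this_io != NULL)
+ *		l2arc_log_blk_fetch_abort(this_io);
+ */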
+
+/*
+ * Restores the payload of a log block to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
+ uint64_t lb_asize, uint64_t lb_daddr)
+{
+ uint64_t size = 0, asize = 0;
+ uint64_t log_entries = dev->l2ad_log_entries;
+
+ for (int i = log_entries - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse temporal direction to preserve
+ * correct temporal ordering of buffers in the l2ad_buflist.
+ * l2arc_hdr_restore also does a list_insert_tail instead of
+ * list_insert_head on the l2ad_buflist:
+ *
+ * LIST l2ad_buflist LIST
+ * HEAD <------ (time) ------ TAIL
+ * direction +-----+-----+-----+-----+-----+ direction
+ * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
+ * fill +-----+-----+-----+-----+-----+
+ * ^ ^
+ * | |
+ * | |
+ * l2arc_feed_thread l2arc_rebuild
+ * will place new bufs here restores bufs here
+ *
+ * During l2arc_rebuild() the device is not used by
+ * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
+ */
+ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
+ asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
+ l2arc_hdr_restore(&lb->lb_entries[i], dev);
+ }
+
+ /*
+ * Record rebuild stats:
+ * size Logical size of restored buffers in the L2ARC
+ * asize Aligned size of restored buffers in the L2ARC
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+ uint64_t asize;
+
+ /*
+	 * Do all the allocation before grabbing any locks; this lets us
+ * sleep if memory is full and we don't have to deal with failed
+ * allocations.
+ */
+ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+ dev, le->le_dva, le->le_daddr,
+ L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+ L2BLK_GET_COMPRESS((le)->le_prop),
+ L2BLK_GET_PROTECTED((le)->le_prop),
+ L2BLK_GET_PREFETCH((le)->le_prop));
+ asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((le)->le_prop));
+
+ /*
+ * vdev_space_update() has to be called before arc_hdr_destroy() to
+ * avoid underflow since the latter also calls the former.
+ */
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr));
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ arc_hdr_destroy(hdr);
+ /*
+ * If the buffer is already cached, check whether it has
+		 * L2ARC metadata. If not, fill it in and update the flag.
+		 * This is important in case of onlining a cache device, since
+ * we previously evicted all L2ARC metadata from ARC.
+ */
+ if (!HDR_HAS_L2HDR(exists)) {
+ arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+ exists->b_l2hdr.b_dev = dev;
+ exists->b_l2hdr.b_daddr = le->le_daddr;
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, exists);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(exists), exists);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists));
+ ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists));
+ }
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ }
+
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain newly allocated memory buffers for the IO
+ * data which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
+ l2arc_log_blk_phys_t *lb)
+{
+ uint32_t asize;
+ zio_t *pio;
+ l2arc_read_callback_t *cb;
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
+ cb->l2rcb_abd = abd_get_from_buf(lb, asize);
+ pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
+ cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_fetch_abort(zio_t *zio)
+{
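+	/*
+	 * Waiting on the zio causes l2arc_blk_fetch_done() to run, which
+	 * releases the abd and the callback allocated by
+	 * l2arc_log_blk_fetch().
+	 */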
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device.
+ */
+static void
+l2arc_dev_hdr_update(l2arc_dev_t *dev)
+{
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+ int err;
+
+ VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
+
+ l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
+ l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
+ l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
+ l2dhdr->dh_log_entries = dev->l2ad_log_entries;
+ l2dhdr->dh_evict = dev->l2ad_evict;
+ l2dhdr->dh_start = dev->l2ad_start;
+ l2dhdr->dh_end = dev->l2ad_end;
+ l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
+ l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
+ l2dhdr->dh_flags = 0;
+ if (dev->l2ad_first)
+ l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
+
+ abd_put(abd);
+
+ if (err != 0) {
+ zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ }
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers() when the log block fills up.
+ * It allocates some memory to temporarily hold the serialized buffer to
+ * be written; this memory is then released in l2arc_write_done().
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ uint64_t psize, asize;
+ zio_t *wzio;
+ l2arc_lb_abd_buf_t *abd_buf;
+ uint8_t *tmpbuf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+
+ VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
+
+ tmpbuf = zio_buf_alloc(sizeof (*lb));
+ abd_buf = zio_buf_alloc(sizeof (*abd_buf));
+ abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
+
+ /* link the buffer into the block chain */
+ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
+ lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /*
+ * l2arc_log_blk_commit() may be called multiple times during a single
+ * l2arc_write_buffers() call. Save the allocated abd buffers in a list
+ * so we can free them in l2arc_write_done() later on.
+ */
+ list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
+
+ /* try to compress the buffer */
+ psize = zio_compress_data(ZIO_COMPRESS_LZ4,
+ abd_buf->abd, tmpbuf, sizeof (*lb));
+
+ /* a log block is never entirely zero */
+ ASSERT(psize != 0);
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ ASSERT(asize <= sizeof (*lb));
+
+ /*
+ * Update the start log block pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
+ l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
+ l2dhdr->dh_start_lbps[0].lbp_payload_asize =
+ dev->l2ad_log_blk_payload_asize;
+ l2dhdr->dh_start_lbps[0].lbp_payload_start =
+ dev->l2ad_log_blk_payload_start;
+ _NOTE(CONSTCOND)
+ L2BLK_SET_LSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
+ L2BLK_SET_PSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
+ L2BLK_SET_CHECKSUM(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_CHECKSUM_FLETCHER_4);
+ if (asize < sizeof (*lb)) {
+ /* compression succeeded */
+ bzero(tmpbuf + psize, asize - psize);
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, tmpbuf, sizeof (*lb));
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_OFF);
+ }
+
+ /* checksum what we're about to write */
+ fletcher_4_native(tmpbuf, asize, NULL,
+ &l2dhdr->dh_start_lbps[0].lbp_cksum);
+
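+	/*
+	 * Release the abd that wrapped the uncompressed log block (it was
+	 * only needed as input to zio_compress_data() above) and wrap
+	 * tmpbuf instead, which now holds the exact bytes going to disk.
+	 */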
+ abd_put(abd_buf->abd);
+
+ /* perform the write itself */
+ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
+ abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ dev->l2ad_hand += asize;
+ /*
+ * Include the committed log block's pointer in the list of pointers
+ * to log blocks present in the L2ARC device.
+ */
+ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+}
+
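+/*
+ * Illustration of the resulting on-disk chains (hypothetical commit order
+ * 1..5): since a new block's lb_prev_lbp is taken from dh_start_lbps[1]
+ * before the pointers are shifted above, block 5 points back to 3 and
+ * block 4 points back to 2. With dh_start_lbps[0] = 5 and
+ * dh_start_lbps[1] = 4, the loop in l2arc_rebuild() still visits
+ * 5, 4, 3, 2, 1 in reverse temporal order by alternating between the two
+ * chains.
+ */
+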
+/*
+ * Validates an L2ARC log block address to make sure that it can be read
+ * from the provided L2ARC device.
+ */
+boolean_t
+l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
+{
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ uint64_t end = lbp->lbp_daddr + asize - 1;
+ uint64_t start = lbp->lbp_payload_start;
+ boolean_t evicted = B_FALSE;
+
+ /* BEGIN CSTYLED */
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely (including its payload) between l2ad_start and
+ * l2ad_end
+ * - it has a valid size
+ * - neither the log block itself nor part of its payload was evicted
+ * by l2arc_evict():
+ *
+ * l2ad_hand l2ad_evict
+ * | | lbp_daddr
+ * | start | | end
+ * | | | | |
+ * V V V V V
+ * l2ad_start ============================================ l2ad_end
+ * --------------------------||||
+ * ^ ^
+ * | log block
+ * payload
+ */
+ /* END CSTYLED */
+ evicted =
+ l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
+ l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
+
+ return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
+ asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
+ (!evicted || dev->l2ad_first));
+}
+
+/*
+ * Inserts ARC buffer header `hdr' into the current L2ARC log block on
+ * the device. The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log block is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+
+ if (dev->l2ad_log_entries == 0)
+ return (B_FALSE);
+
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT3S(index, <, dev->l2ad_log_entries);
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ le = &lb->lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->le_dva = hdr->b_dva;
+ le->le_birth = hdr->b_birth;
+ le->le_daddr = hdr->b_l2hdr.b_daddr;
+ if (index == 0)
+ dev->l2ad_log_blk_payload_start = le->le_daddr;
+ L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
+ L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
+ L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
+ L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
+ L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
+ L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+
+ dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ HDR_GET_PSIZE(hdr));
+
+ return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
+}
+
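+/*
+ * The intended call pattern, as used by l2arc_write_buffers():
+ *
+ *	if (l2arc_log_blk_insert(dev, hdr))
+ *		l2arc_log_blk_commit(dev, pio, cb);
+ */
+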
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison; we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom -- Lower end of the range to check (written to earlier).
+ * top -- Upper end of the range to check (written to later).
+ * check -- The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
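+
+/*
+ * Concrete cases of the three-way conditional above (offsets are
+ * hypothetical):
+ *
+ *	l2arc_range_check_overlap(10, 50, 30);	ordered:	B_TRUE
+ *	l2arc_range_check_overlap(10, 50, 60);	ordered:	B_FALSE
+ *	l2arc_range_check_overlap(80, 20, 90);	wrapped:	B_TRUE
+ *	l2arc_range_check_overlap(80, 20, 10);	wrapped:	B_TRUE
+ *	l2arc_range_check_overlap(80, 20, 50);	wrapped:	B_FALSE
+ *	l2arc_range_check_overlap(40, 40, 40);	single address:	B_TRUE
+ */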
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 21a8595b72..ced6a3ead5 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -4364,6 +4364,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
}
spa_import_progress_remove(spa);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
spa_load_note(spa, "LOADED");
return (0);
@@ -7648,6 +7650,17 @@ spa_async_thread(void *arg)
}
/*
+ * Kick off L2 cache rebuilding.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+ l2arc_spa_rebuild_start(spa);
+ spa_config_exit(spa, SCL_L2ARC, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
* Let the world know that we're done.
*/
mutex_enter(&spa->spa_async_lock);
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 1ef3bb79ca..ddcbfa748d 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -248,10 +248,14 @@ void arc_fini(void);
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
+ uint64_t check);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
+void l2arc_spa_rebuild_start(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;
diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
new file mode 100644
index 0000000000..0c18849b59
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
@@ -0,0 +1,857 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ */
+
+#ifndef _SYS_ARC_IMPL_H
+#define _SYS_ARC_IMPL_H
+
+#include <sys/arc.h>
+#include <sys/multilist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Note that buffers can be in one of 6 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, it is
+ * linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
+ */
+
+typedef struct arc_state {
+ /*
+ * list of evictable buffers
+ */
+ multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ zfs_refcount_t arcs_size;
+} arc_state_t;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_read_done_func_t *acb_done;
+ arc_buf_t *acb_buf;
+ boolean_t acb_encrypted;
+ boolean_t acb_compressed;
+ boolean_t acb_noauth;
+ zbookmark_phys_t acb_zb;
+ zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_write_done_func_t *awcb_ready;
+ arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
+ arc_write_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * Used for debugging with kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
+ void *b_thawed;
+#endif
+
+ arc_buf_t *b_buf;
+ uint32_t b_bufcnt;
+ /* for waiting on writes to complete */
+ kcondvar_t b_cv;
+ uint8_t b_byteswap;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ multilist_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ zfs_refcount_t b_refcnt;
+
+ arc_callback_t *b_acb;
+ abd_t *b_pabd;
+} l1arc_buf_hdr_t;
+
+typedef enum l2arc_dev_hdr_flags_t {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+} l2arc_dev_hdr_flags_t;
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks).
+ */
+typedef struct l2arc_log_blkptr {
+ /*
+ * Offset of log block within the device, in bytes
+ */
+ uint64_t lbp_daddr;
+ /*
+ * Aligned payload size (in bytes) of the log block
+ */
+ uint64_t lbp_payload_asize;
+ /*
+ * Offset in bytes of the first buffer in the payload
+ */
+ uint64_t lbp_payload_start;
+ /*
+ * lbp_prop has the following format:
+ * * logical size (in bytes)
+ * * aligned (after compression) size (in bytes)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for lbp_cksum)
+ */
+ uint64_t lbp_prop;
+ zio_cksum_t lbp_cksum; /* checksum of log */
+} l2arc_log_blkptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
+ uint64_t dh_version; /* Persistent L2ARC version */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t dh_spa_guid;
+ uint64_t dh_vdev_guid;
+ uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
+ uint64_t dh_evict; /* evicted offset in bytes */
+ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
+ /*
+ * Used in zdb.c for determining if a log block is valid, in the same
+ * way that l2arc_rebuild() does.
+ */
+ uint64_t dh_start; /* mirror of l2ad_start */
+ uint64_t dh_end; /* mirror of l2ad_end */
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blkptr_t dh_start_lbps[2];
+ /*
+ * Aligned size of all log blocks as accounted by vdev_space_update().
+ */
+ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
+ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
+ const uint64_t dh_pad[32]; /* pad to 512 bytes */
+ zio_eck_t dh_tail;
+} l2arc_dev_hdr_phys_t;
+CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
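+
+/*
+ * Layout sanity sketch for the CTASSERT above: nine leading uint64_t
+ * fields (72 bytes), two 64-byte l2arc_log_blkptr_t entries (128 bytes),
+ * dh_lb_asize and dh_lb_count (16 bytes), dh_pad (256 bytes) and the
+ * 40-byte zio_eck_t tail sum to exactly 512 bytes (SPA_MINBLOCKSIZE).
+ */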
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t le_dva; /* dva of buffer */
+ uint64_t le_birth; /* birth txg of buffer */
+ /*
+ * le_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm
+ * * object type (used to restore arc_buf_contents_t)
+ * * protected status (used for encryption)
+ * * prefetch status (used in l2arc_read_done())
+ */
+ uint64_t le_prop;
+ uint64_t le_daddr; /* buf location on l2dev */
+ /*
+ * We pad the size of each entry to a power of 2 so that the size of
+ * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
+	 * because of the L2BLK_SET_*SIZE macros.
+ */
+ const uint64_t le_pad[3]; /* pad to 64 bytes */
+} l2arc_log_ent_phys_t;
+
+#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
+
+/*
+ * A log block of up to 1022 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list. Byte order of magic determines
+ * whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_log_blk_phys {
+ uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
+ /*
+ * There are 2 chains (headed by dh_start_lbps[2]), and this field
+ * points back to the previous block in this chain. We alternate
+ * which chain we append to, so they are time-wise and offset-wise
+ * interleaved, but that is an optimization rather than for
+ * correctness.
+ */
+ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
+ /*
+ * Pad header section to 128 bytes
+ */
+ uint64_t lb_pad[7];
+ /* Payload */
+ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
+} l2arc_log_blk_phys_t; /* 64K total */
+/*
+ * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
+ * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
+ */
+CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
+ 1ULL << SPA_MINBLOCKSHIFT));
+CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
+CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
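+
+/*
+ * Size sketch: the 128-byte header (8-byte lb_magic, 64-byte lb_prev_lbp
+ * and 56 bytes of lb_pad) plus 1022 entries of 64 bytes each comes to
+ * exactly 65536 bytes, i.e. the 64K noted above.
+ */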
+
+/*
+ * These structures hold in-flight abd buffers for log blocks as they're being
+ * written to the L2ARC device.
+ */
+typedef struct l2arc_lb_abd_buf {
+ abd_t *abd;
+ list_node_t node;
+} l2arc_lb_abd_buf_t;
+
+/*
+ * These structures hold pointers to log blocks present on the L2ARC device.
+ */
+typedef struct l2arc_lb_ptr_buf {
+ l2arc_log_blkptr_t *lb_ptr;
+ list_node_t node;
+} l2arc_lb_ptr_buf_t;
+
+/* Macros for setting fields in le_prop and lbp_prop */
+#define L2BLK_GET_LSIZE(field) \
+ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_LSIZE(field, x) \
+ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_PSIZE(field) \
+ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_PSIZE(field, x) \
+ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_COMPRESS(field) \
+ BF64_GET((field), 32, SPA_COMPRESSBITS)
+#define L2BLK_SET_COMPRESS(field, x) \
+ BF64_SET((field), 32, SPA_COMPRESSBITS, x)
+#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
+#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
+#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
+#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
+#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
+#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
+#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
+#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
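+
+/*
+ * Encoding sketch: the size fields are stored in units of SPA_MINBLOCKSIZE
+ * with a bias of 1. For a hypothetical 4K buffer, L2BLK_SET_PSIZE(prop,
+ * 4096) stores (4096 >> SPA_MINBLOCKSHIFT) - 1 == 7 in the field, and
+ * L2BLK_GET_PSIZE(prop) maps it back to 4096.
+ */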
+
+#define PTR_SWAP(x, y) \
+ do { \
+ void *tmp = (x);\
+ x = y; \
+ y = tmp; \
+ _NOTE(CONSTCOND)\
+ } while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
+#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ zfs_refcount_t l2ad_alloc; /* allocated bytes */
+ /*
+ * Persistence-related stuff
+ */
+ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
+ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* Number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /*
+ * Offset (in bytes) of the first buffer in current log block's
+ * payload.
+ */
+ uint64_t l2ad_log_blk_payload_start;
+ /* Flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+ boolean_t l2ad_rebuild_cancel;
+ boolean_t l2ad_rebuild_began;
+ uint64_t l2ad_log_entries; /* entries per log blk */
+ uint64_t l2ad_evict; /* evicted offset in bytes */
+ /* List of pointers to log blocks present in the L2ARC device */
+ list_t l2ad_lbptr_list;
+ /*
+ * Aligned size of all log blocks as accounted by vdev_space_update().
+ */
+ zfs_refcount_t l2ad_lb_asize;
+ /*
+ * Number of log blocks present on the device.
+ */
+ zfs_refcount_t l2ad_lb_count;
+} l2arc_dev_t;
+
+/*
+ * Encrypted blocks will need to be stored encrypted on the L2ARC
+ * disk as they appear in the main pool. In order for this to work we
+ * need to pass around the encryption parameters so they can be used
+ * to write data to the L2ARC. This struct is only defined in the
+ * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
+ * flag set.
+ */
+typedef struct arc_buf_hdr_crypt {
+ abd_t *b_rabd; /* raw encrypted data */
+ dmu_object_type_t b_ot; /* object type */
+	uint32_t b_ebufcnt; /* number of encrypted buffers */
+
+ /* dsobj for looking up encryption key for l2arc encryption */
+ uint64_t b_dsobj; /* for looking up key */
+
+ /* encryption parameters */
+ uint8_t b_salt[ZIO_DATA_SALT_LEN];
+ uint8_t b_iv[ZIO_DATA_IV_LEN];
+
+ /*
+ * Technically this could be removed since we will always be able to
+ * get the mac from the bp when we need it. However, it is inconvenient
+ * for callers of arc code to have to pass a bp in all the time. This
+ * also allows us to assert that L2ARC data is properly encrypted to
+ * match the data in the main storage pool.
+ */
+ uint8_t b_mac[ZIO_DATA_MAC_LEN];
+} arc_buf_hdr_crypt_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+
+ list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* in-flight list of log blocks */
+ list_t l2wcb_abd_list;
+} l2arc_write_callback_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+
+ arc_buf_contents_t b_type;
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /*
+ * This field stores the size of the data buffer after
+ * compression, and is set in the arc's zio completion handlers.
+ * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+ *
+ * While the block pointers can store up to 32MB in their psize
+ * field, we can only store up to 32MB minus 512B. This is due
+ * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+ * a field of zeros represents 512B in the bp). We can't use a
+ * bias of 1 since we need to reserve a psize of zero, here, to
+ * represent holes and embedded blocks.
+ *
+ * This isn't a problem in practice, since the maximum size of a
+ * buffer is limited to 16MB, so we never need to store 32MB in
+ * this field.
+ */
+ uint16_t b_psize;
+
+ /*
+ * This field stores the size of the data buffer before
+ * compression, and cannot change once set. It is in units
+ * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+ */
+ uint16_t b_lsize; /* immutable */
+ uint64_t b_spa; /* immutable */
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
+ /*
+ * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
+ * is set and the L1 header exists.
+ */
+ arc_buf_hdr_crypt_t b_crypt_hdr;
+};
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
+ kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped when updating the access state due to the
+ * header having already been released after acquiring the hash lock.
+ */
+ kstat_named_t arcstat_access_skip;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
+ kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach its target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
+ kstat_named_t arcstat_evict_l2_cached;
+ kstat_named_t arcstat_evict_l2_eligible;
+ kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_size;
+ /*
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
+ * Note that the compressed bytes may match the uncompressed bytes
+ * if the block is either not compressed or compressed arc is disabled.
+ */
+ kstat_named_t arcstat_compressed_size;
+ /*
+ * Uncompressed size of the data stored in b_pabd. If compressed
+ * arc is disabled then this value will be identical to the stat
+ * above.
+ */
+ kstat_named_t arcstat_uncompressed_size;
+ /*
+ * Number of bytes stored in all the arc_buf_t's. This is classified
+ * as "overhead" since this data is typically short-lived and will
+ * be evicted from the arc when it becomes unreferenced unless the
+ * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+ * values have been set (see comment in dbuf.c for more information).
+ */
+ kstat_named_t arcstat_overhead_size;
+ /*
+ * Number of bytes consumed by internal ARC structures necessary
+ * for tracking purposes; these structures are not actually
+ * backed by ARC buffers. This includes arc_buf_hdr_t structures
+ * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+ * caches), and arc_buf_t structures (allocated via arc_buf_t
+ * cache).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_hdr_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_DATA. This is generally consumed by buffers backing
+	 * on-disk user data (e.g. plain file contents).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_data_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_METADATA. This is generally consumed by buffers
+	 * backing on-disk data used for internal ZFS structures
+	 * (e.g. ZAP, dnode, indirect blocks).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_metadata_size;
+ /*
+ * Number of bytes consumed by various buffers and structures
+ * not actually backed with ARC buffers. This includes bonus
+ * buffers (allocated directly via zio_buf_* functions),
+ * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+ * cache), and dnode_t structures (allocated via dnode_t cache).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_other_size;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_anon state. This includes *all* buffers in the arc_anon
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mru state. This includes *all* buffers in the arc_mru
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mru_ghost state. The key thing to note
+	 * here is that this size doesn't actually indicate RAM
+	 * consumption. The ghost lists only consist of headers and
+ * don't actually have ARC buffers linked off of these headers.
+ * Thus, *if* the headers had associated ARC buffers, these
+ * buffers *would have* consumed this number of bytes.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mfu state. This includes *all* buffers in the arc_mfu
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_size;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+ * state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_METADATA, and reside in the
+ * arc_mfu state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mfu_ghost state. See the comment above
+ * arcstat_mru_ghost_size for more details.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_metadata;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_read_bytes;
+ kstat_named_t arcstat_l2_write_bytes;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_lock_retry;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_lsize;
+ kstat_named_t arcstat_l2_psize;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_l2_hdr_size;
+ /*
+ * Number of L2ARC log blocks written. These are used for restoring the
+ * L2ARC. Updated during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_writes;
+ /*
+ * Moving average of the aligned size of the L2ARC log blocks, in
+ * bytes. Updated during L2ARC rebuild and during writing of L2ARC
+ * log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_avg_asize;
+ /* Aligned size of L2ARC log blocks on L2ARC devices. */
+ kstat_named_t arcstat_l2_log_blk_asize;
+ /* Number of L2ARC log blocks present on L2ARC devices. */
+ kstat_named_t arcstat_l2_log_blk_count;
+ /*
+	 * Moving average of the ratio of the aligned size of L2ARC
+	 * restored data, in bytes, to the aligned size of its metadata
+	 * in L2ARC, in bytes.
+ * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ /*
+ * Number of times the L2ARC rebuild was successful for an L2ARC device.
+ */
+ kstat_named_t arcstat_l2_rebuild_success;
+ /*
+ * Number of times the L2ARC rebuild failed because the device header
+ * was in an unsupported format or corrupted.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors
+ * while reading a log block.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors when
+ * reading the device header.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
+ /*
+ * Number of L2ARC log blocks which failed to be restored due to
+ * checksum errors.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
+ /*
+ * Number of times the L2ARC rebuild was aborted due to low system
+ * memory.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ /* Logical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_size;
+ /* Aligned size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_asize;
+ /*
+ * Number of L2ARC log entries (buffers) that were successfully
+ * restored in ARC.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ /*
+	 * Number of L2ARC log entries (buffers) already cached in ARC.
+	 * These did not need to be restored again.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ /*
+ * Number of L2ARC log blocks that were restored successfully. Each
+ * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
+ */
+ kstat_named_t arcstat_l2_rebuild_log_blks;
+ kstat_named_t arcstat_memory_throttle_count;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
+ kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_async_upgrade_sync;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
+} arc_stats_t;
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This ensures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
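To make the two patterns above concrete, here is a hedged userland
analogue (the stats structure is simplified to bare counters; only the
macro shapes mirror the real ARCSTAT ones):

	#include <stdint.h>
	#include <stdio.h>

	static struct {
		uint64_t arcstat_hits;
		uint64_t arcstat_c;
	} ex_arc_stats;

	#define	EX_ARCSTAT_INCR(stat, val) \
		__atomic_add_fetch(&ex_arc_stats.stat, (val), __ATOMIC_RELAXED)
	#define	EX_ARCSTAT_BUMP(stat)	EX_ARCSTAT_INCR(stat, 1)

	/* Alias the exported stat so code reads naturally, no shadow copy. */
	#define	ex_arc_c	(ex_arc_stats.arcstat_c)

	int
	main(void)
	{
		EX_ARCSTAT_BUMP(arcstat_hits);	/* e.g. on a cache hit */
		ex_arc_c = 64ULL << 20;		/* set target size to 64MB */
		(void) printf("hits=%llu c=%llu\n",
		    (unsigned long long)ex_arc_stats.arcstat_hits,
		    (unsigned long long)ex_arc_c);
		return (0);
	}

Because ex_arc_c expands to the counter itself, reads and writes go
straight to the exported statistic, which is exactly the consistency
property the comment above describes.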
+
+extern arc_stats_t arc_stats;
+
+/* used in zdb.c */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 33cdfbeb4b..af8057be8f 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -792,6 +792,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
+#define SPA_ASYNC_L2CACHE_REBUILD 0x800
/*
* Controls the behavior of spa_vdev_remove().
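The SPA_ASYNC_* values are bits in a single mask, so several background
tasks can be requested before the async thread runs. A hedged sketch of
that pattern (the ex_* names are illustrative, not the spa.c
implementation):

	#include <stdint.h>
	#include <stdio.h>

	#define	EX_SPA_ASYNC_L2CACHE_REBUILD	0x800

	static uint64_t ex_async_tasks;

	static void
	ex_spa_async_request(uint64_t flags)
	{
		/* OR in the requested bits; the async thread clears them. */
		(void) __atomic_or_fetch(&ex_async_tasks, flags,
		    __ATOMIC_RELAXED);
	}

	int
	main(void)
	{
		ex_spa_async_request(EX_SPA_ASYNC_L2CACHE_REBUILD);
		if (ex_async_tasks & EX_SPA_ASYNC_L2CACHE_REBUILD)
			(void) printf("L2ARC rebuild requested\n");
		return (0);
	}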
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 254af68099..cd05edcffa 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -2179,9 +2179,22 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd))
- l2arc_add_vdev(spa, vd);
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+			 * When reopening we can assume the device label
+			 * already carries the l2cache_persistent attribute,
+			 * since we opened the device in the past and updated
+			 * the label. If the vdev is already present, we
+			 * should evict all ARC buffers and pointers to log
+			 * blocks and reclaim their space before restoring
+			 * its contents to L2ARC.
+ */
+ if (l2arc_vdev_present(vd)) {
+ l2arc_rebuild_vdev(vd, B_TRUE);
+ } else {
+ l2arc_add_vdev(spa, vd);
+ }
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ }
} else {
(void) vdev_validate(vd);
}
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index f870d6ce7c..819905a8d9 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -561,6 +561,11 @@ typedef enum zfs_key_location {
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5
+/* Persistent L2ARC version */
+#define L2ARC_PERSISTENT_VERSION_1 1ULL
+#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
+#define L2ARC_PERSISTENT_VERSION_STRING "1"
+
/* Rewind policy information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
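A hedged sketch of how the version constants above would gate a rebuild
(the function and its error handling are illustrative; the real check
lives in the L2ARC rebuild path in arc.c):

	#include <stdint.h>
	#include <stdio.h>

	#define	EX_L2ARC_PERSISTENT_VERSION	1ULL

	/* Accept only device-header versions this code understands. */
	static int
	ex_check_dh_version(uint64_t dh_version)
	{
		if (dh_version == 0 ||
		    dh_version > EX_L2ARC_PERSISTENT_VERSION)
			return (-1);	/* counted as rebuild_abort_unsupported */
		return (0);
	}

	int
	main(void)
	{
		(void) printf("v1: %s, v2: %s\n",
		    ex_check_dh_version(1) == 0 ? "ok" : "unsupported",
		    ex_check_dh_version(2) == 0 ? "ok" : "unsupported");
		return (0);
	}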