author    Chris Williamson <Chris.Williamson@delphix.com>    2014-12-29 19:12:23 -0800
committer Christopher Siden <chris@delphix.com>              2014-12-29 19:12:23 -0800
commit    89c86e32293a30cdd7af530c38b2073fee01411c (patch)
tree      2c9a3719ff5dce90ebfff1072dcba5729d3bcb77
parent    1fcc078ae7c0a359a9274d2a5a90547aceb213a6 (diff)
5408 managing ZFS cache devices requires lots of RAM
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Don Brady <dev.fs.zfs@gmail.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c     3
-rw-r--r--  usr/src/cmd/ztest/ztest.c                    9
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c           1410
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h         27
4 files changed, 883 insertions, 566 deletions
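
The core of this change is splitting the ARC buffer header into a small always-present portion (identity plus embedded L2-only fields) and an L1-only tail that is allocated only while a buffer is actually cached in memory. A minimal userland C sketch of the allocation trick, using illustrative names rather than the real illumos structures:

#include <stddef.h>
#include <stdlib.h>

/* Illustrative stand-ins for the split header. */
typedef struct l2_fields { long daddr; int asize; } l2_fields_t;
typedef struct l1_fields { void *buf; int refcnt; } l1_fields_t;

typedef struct hdr {
	long		dva[2];	/* common identity fields */
	l2_fields_t	l2;	/* always allocated */
	l1_fields_t	l1;	/* must be last: absent in L2-only headers */
} hdr_t;

#define	HDR_FULL_SIZE	(sizeof (hdr_t))
#define	HDR_L2ONLY_SIZE	(offsetof(hdr_t, l1))

/* An L2-only header is just a truncated allocation of the same struct. */
static hdr_t *
hdr_alloc(int l1cached)
{
	return (calloc(1, l1cached ? HDR_FULL_SIZE : HDR_L2ONLY_SIZE));
}

With a large cache device tracked by hundreds of thousands of headers, shaving the L1-only tail off each L2-resident header is where the RAM savings in the synopsis come from.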
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index c8b296847d..e940623d22 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -961,6 +961,9 @@ arc_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"p", "c", "c_min", "c_max", "size", "duplicate_buffers_size",
"arc_meta_used", "arc_meta_limit", "arc_meta_max",
"arc_meta_min", "hdr_size", "data_size", "other_size",
+ "evict_l2_cached", "evict_l2_eligible", "evict_l2_ineligible",
+ "l2_read_bytes", "l2_write_bytes", "l2_size", "l2_asize",
+ "l2_hdr_size",
NULL
};
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index a5f9600804..9fdbd2118c 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -3905,7 +3905,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
@@ -3929,7 +3929,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
- if (i != 5) {
+ if (i != 5 ||
+ chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
@@ -3973,7 +3974,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
@@ -3990,7 +3991,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
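A hedged reading of the repeated new guard in this file: the i == 5 iteration exercises the split-buffer path (the else branches above), which hands dmu_assign_arcbuf() two half-chunk buffers, and that is only possible when half a chunk is still at least SPA_MINBLOCKSIZE. The condition's intent, as a sketch rather than the literal ztest code:

/*
 * Take the ordinary single-buffer path either when this is not the
 * split-buffer iteration (i != 5), or when chunksize / 2 would fall
 * below the minimum block size.
 */
use_single_arcbuf = (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2));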
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 656b4212fe..012dd83cd7 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -111,7 +111,7 @@
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
*
* - L2ARC buflist creation
* - L2ARC buflist eviction
@@ -314,6 +314,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_hdr_miss;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
@@ -381,6 +382,7 @@ static arc_stats_t arc_stats = {
{ "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
@@ -472,8 +474,6 @@ static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -494,29 +494,53 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * used for debugging with kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
void *b_thawed;
+#endif
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- arc_flags_t b_flags;
uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
-
/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
@@ -527,8 +551,46 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
+ arc_callback_t *b_acb;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* real alloc'd buffer size depending on b_compress applied */
+ int32_t b_asize;
+
list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ /*
+ * Even though this checksum is only set/verified when a buffer is in
+ * the L1 cache, it needs to be in the set of common fields because it
+ * must be preserved from the time before a buffer is written out to
+ * L2ARC until after it is read back in.
+ */
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /* immutable */
+ int32_t b_size;
+ uint64_t b_spa;
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
};
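
Because b_l1hdr is the final member and is simply absent from the smaller allocation, it may only be touched when the header was allocated from the full cache; a hedged sketch of the access discipline (the helper is hypothetical, the flag and macro come from this diff):

/*
 * Hypothetical accessor: an L2-only header's allocation physically
 * ends at offsetof(arc_buf_hdr_t, b_l1hdr), so dereferencing the
 * tail without the flag check would read past the allocation.
 */
static l1arc_buf_hdr_t *
hdr_l1(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	return (&hdr->b_l1hdr);
}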
static arc_buf_t *arc_eviction_list;
@@ -545,22 +607,38 @@ static arc_buf_hdr_t arc_eviction_hdr;
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
-#define HDR_FREE_IN_PROGRESS(hdr) \
- ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS)
+
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
- ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \
- (hdr)->b_l2hdr != NULL)
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET 24
+#define HDR_COMPRESS_NBITS 7
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
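
Packing the compression function into bits 24..30 of b_flags is what lets the old b_compress field disappear from the L2 header. A standalone sketch of the equivalent bit-field logic, assuming the BF32_* macros extract and insert raw bit ranges:

#include <stdint.h>

#define	HDR_COMPRESS_OFFSET	24
#define	HDR_COMPRESS_NBITS	7

static uint32_t
hdr_get_compress(uint32_t flags)
{
	return ((flags >> HDR_COMPRESS_OFFSET) &
	    ((1U << HDR_COMPRESS_NBITS) - 1));
}

static uint32_t
hdr_set_compress(uint32_t flags, uint32_t cmp)
{
	uint32_t mask =
	    ((1U << HDR_COMPRESS_NBITS) - 1) << HDR_COMPRESS_OFFSET;

	/* Clear the old value, then insert the new one. */
	return ((flags & ~mask) | ((cmp << HDR_COMPRESS_OFFSET) & mask));
}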
+
/*
* Other sizes
*/
-#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
* Hash table routines
@@ -624,7 +702,7 @@ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
+struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
@@ -633,15 +711,15 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
+};
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
@@ -661,18 +739,6 @@ typedef struct l2arc_write_callback {
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
-struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- /* compression applied to buffer data */
- enum zio_compress b_compress;
- /* real alloc'd buffer size depending on b_compress applied */
- int b_asize;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
-};
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
@@ -691,12 +757,13 @@ static int arc_evict_needed(arc_buf_contents_t);
static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
static void arc_buf_watch(arc_buf_t *);
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *);
+static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
@@ -719,8 +786,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
#define BUF_EMPTY(buf) \
((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0 && \
- (buf)->b_cksum0 == 0)
+ (buf)->b_dva.dva_word[1] == 0)
#define BUF_EQUAL(spa, dva, birth, buf) \
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
@@ -733,7 +799,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
hdr->b_dva.dva_word[0] = 0;
hdr->b_dva.dva_word[1] = 0;
hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
}
static arc_buf_hdr_t *
@@ -763,6 +828,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
*/
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
@@ -775,8 +841,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
ASSERT(hdr->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- *lockp = hash_lock;
- mutex_enter(hash_lock);
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
fhdr = fhdr->b_hash_next, i++) {
if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
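
buf_hash_insert() now serves two callers: the normal path, which wants the hash lock taken and returned through lockp, and arc_hdr_realloc(), which already holds it. A sketch of the two calling conventions as they appear in this diff:

	kmutex_t *hash_lock;

	/* Normal insertion: the function enters the hash lock for us. */
	exists = buf_hash_insert(hdr, &hash_lock);

	/* Re-insertion during realloc: lock already held, pass NULL. */
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	(void) buf_hash_insert(nhdr, NULL);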
@@ -831,7 +903,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
/*
* Global data structures and functions for the buf kmem cache.
*/
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
@@ -843,7 +916,8 @@ buf_fini(void)
(buf_hash_table.ht_mask + 1) * sizeof (void *));
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -853,15 +927,27 @@ buf_fini(void)
*/
/* ARGSUSED */
static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
- bzero(hdr, sizeof (arc_buf_hdr_t));
- refcount_create(&hdr->b_refcnt);
- cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ bzero(hdr, HDR_FULL_SIZE);
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
return (0);
}
@@ -885,15 +971,25 @@ buf_cons(void *vbuf, void *unused, int kmflag)
*/
/* ARGSUSED */
static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(BUF_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
ASSERT(BUF_EMPTY(hdr));
- refcount_destroy(&hdr->b_refcnt);
- cv_destroy(&hdr->b_cv);
- mutex_destroy(&hdr->b_freeze_lock);
- arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
/* ARGSUSED */
@@ -947,8 +1043,11 @@ retry:
goto retry;
}
- hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+ NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
@@ -962,6 +1061,81 @@ retry:
}
}
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+ if (new == hdr_full_cache) {
+ nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+ } else {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /*
+ * We might be removing the L1hdr of a buffer which was just
+ * written out to L2ARC. If such a buffer is compressed then we
+ * need to free its b_tmp_cdata before destroying the header.
+ */
+ if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(hdr);
+ nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ buf_discard_identity(hdr);
+ hdr->b_freeze_cksum = NULL;
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+
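Only two transitions are legal through arc_hdr_realloc(), and both appear later in this diff: dropping a header to L2-only when its last L1 copy is evicted (arc_evict_ghost) and growing it back when an L2-resident block is read again (arc_read):

	/* L1+L2 cached -> L2-only, after the L1 data is gone */
	hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache);

	/* L2-only -> full, before bringing the buffer back into L1 */
	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);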
#define ARC_MINTIME (hz>>4) /* 62 ms */
static void
@@ -972,16 +1146,15 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL ||
- (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}
static int
@@ -990,10 +1163,10 @@ arc_cksum_equal(arc_buf_t *buf)
zio_cksum_t zc;
int equal;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return (equal);
}
@@ -1004,15 +1177,15 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
arc_buf_watch(buf);
}
@@ -1059,30 +1232,58 @@ arc_buf_watch(arc_buf_t *buf)
#endif
}
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ return (ARC_BUFC_METADATA);
+ } else {
+ return (ARC_BUFC_DATA);
+ }
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
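The buffer type now lives in a flag bit instead of the old b_type field, and these two helpers are inverses over the valid types; a standalone round-trip check, with an illustrative bit value rather than the real ARC_FLAG_BUFC_METADATA:

#include <assert.h>

enum bufc { BUFC_DATA, BUFC_METADATA };
#define	FLAG_BUFC_METADATA	(1U << 5)	/* illustrative bit position */

static unsigned int
bufc_to_flags(enum bufc t)
{
	return (t == BUFC_METADATA ? FLAG_BUFC_METADATA : 0);
}

static enum bufc
flags_to_bufc(unsigned int flags)
{
	return ((flags & FLAG_BUFC_METADATA) ? BUFC_METADATA : BUFC_DATA);
}

int
main(void)
{
	assert(flags_to_bufc(bufc_to_flags(BUFC_DATA)) == BUFC_DATA);
	assert(flags_to_bufc(bufc_to_flags(BUFC_METADATA)) == BUFC_METADATA);
	return (0);
}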
void
arc_buf_thaw(arc_buf_t *buf)
{
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_state != arc_anon)
+ if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+ if (HDR_IO_IN_PROGRESS(buf->b_hdr))
panic("modifying buffer while i/o in progress!");
arc_cksum_verify(buf);
}
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
+ buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
+#endif
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
arc_buf_unwatch(buf);
}
@@ -1099,7 +1300,7 @@ arc_buf_freeze(arc_buf_t *buf)
mutex_enter(hash_lock);
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_state == arc_anon);
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
@@ -1108,30 +1309,34 @@ arc_buf_freeze(arc_buf_t *buf)
static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(MUTEX_HELD(hash_lock));
+ arc_state_t *state = hdr->b_l1hdr.b_state;
- if ((refcount_add(&hdr->b_refcnt, tag) == 1) &&
- (hdr->b_state != arc_anon)) {
- uint64_t delta = hdr->b_size * hdr->b_datacnt;
- list_t *list = &hdr->b_state->arcs_list[hdr->b_type];
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
-
- ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx));
- mutex_enter(&hdr->b_state->arcs_mtx);
- ASSERT(list_link_active(&hdr->b_arc_node));
- list_remove(list, hdr);
- if (GHOST_STATE(hdr->b_state)) {
- ASSERT0(hdr->b_datacnt);
- ASSERT3P(hdr->b_buf, ==, NULL);
- delta = hdr->b_size;
+ if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
+ list_t *list = &state->arcs_list[arc_buf_type(hdr)];
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+
+ ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+ mutex_enter(&state->arcs_mtx);
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_remove(list, hdr);
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ delta = hdr->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
+ mutex_exit(&state->arcs_mtx);
}
- ASSERT(delta > 0);
- ASSERT3U(*size, >=, delta);
- atomic_add_64(size, -delta);
- mutex_exit(&hdr->b_state->arcs_mtx);
/* remove the prefetch flag if we get a reference */
- if (hdr->b_flags & ARC_FLAG_PREFETCH)
- hdr->b_flags &= ~ARC_FLAG_PREFETCH;
+ hdr->b_flags &= ~ARC_FLAG_PREFETCH;
}
}
@@ -1139,21 +1344,27 @@ static int
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
int cnt;
- arc_state_t *state = hdr->b_state;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
ASSERT(!GHOST_STATE(state));
- if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) &&
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
- ASSERT(!list_link_active(&hdr->b_arc_node));
- list_insert_head(&state->arcs_list[hdr->b_type], hdr);
- ASSERT(hdr->b_datacnt > 0);
- atomic_add_64(size, hdr->b_size * hdr->b_datacnt);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
+ atomic_add_64(size, hdr->b_size *
+ hdr->b_l1hdr.b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
@@ -1167,40 +1378,60 @@ static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
kmutex_t *hash_lock)
{
- arc_state_t *old_state = hdr->b_state;
- int64_t refcnt = refcount_count(&hdr->b_refcnt);
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t datacnt;
uint64_t from_delta, to_delta;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+ datacnt = hdr->b_l1hdr.b_datacnt;
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ datacnt = 0;
+ }
ASSERT(MUTEX_HELD(hash_lock));
ASSERT3P(new_state, !=, old_state);
- ASSERT(refcnt == 0 || hdr->b_datacnt > 0);
- ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state));
- ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon);
+ ASSERT(refcnt == 0 || datacnt > 0);
+ ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
+ ASSERT(old_state != arc_anon || datacnt <= 1);
- from_delta = to_delta = hdr->b_datacnt * hdr->b_size;
+ from_delta = to_delta = datacnt * hdr->b_size;
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
if (refcnt == 0) {
- if (old_state != arc_anon) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
- uint64_t *size = &old_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &old_state->arcs_lsize[buftype];
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
- ASSERT(list_link_active(&hdr->b_arc_node));
- list_remove(&old_state->arcs_list[hdr->b_type], hdr);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_remove(&old_state->arcs_list[buftype], hdr);
/*
* If prefetching out of the ghost cache,
* we will have a non-zero datacnt.
*/
- if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) {
+ if (GHOST_STATE(old_state) && datacnt == 0) {
/* ghost elements have a ghost size */
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
from_delta = hdr->b_size;
}
ASSERT3U(*size, >=, from_delta);
@@ -1209,20 +1440,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
}
- if (new_state != arc_anon) {
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
- uint64_t *size = &new_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &new_state->arcs_lsize[buftype];
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
- list_insert_head(&new_state->arcs_list[hdr->b_type],
- hdr);
+ list_insert_head(&new_state->arcs_list[buftype], hdr);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
- ASSERT(hdr->b_datacnt == 0);
- ASSERT(hdr->b_buf == NULL);
+ ASSERT0(datacnt);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
@@ -1236,20 +1473,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- /* adjust state sizes */
- if (to_delta)
+ /* adjust state sizes (ignore arc_l2c_only) */
+ if (to_delta && new_state != arc_l2c_only)
atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
+ if (from_delta && old_state != arc_l2c_only) {
ASSERT3U(old_state->arcs_size, >=, from_delta);
atomic_add_64(&old_state->arcs_size, -from_delta);
}
- hdr->b_state = new_state;
+ if (HDR_HAS_L1HDR(hdr))
+ hdr->b_l1hdr.b_state = new_state;
- /* adjust l2arc hdr stats */
- if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
- else if (old_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@@ -1305,31 +1544,36 @@ arc_space_return(uint64_t space, arc_space_type_t type)
}
arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
+ ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
hdr->b_size = size;
- hdr->b_type = type;
hdr->b_spa = spa_load_guid(spa);
- hdr->b_state = arc_anon;
- hdr->b_arc_access = 0;
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_next = NULL;
- hdr->b_buf = buf;
+
+ hdr->b_flags = arc_bufc_to_flags(type);
+ hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_datacnt = 1;
+
arc_get_data_buf(buf);
- hdr->b_datacnt = 1;
- hdr->b_flags = 0;
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- (void) refcount_add(&hdr->b_refcnt, tag);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
return (buf);
}
@@ -1362,8 +1606,9 @@ arc_return_buf(arc_buf_t *buf, void *tag)
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- (void) refcount_add(&hdr->b_refcnt, tag);
- (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
@@ -1372,12 +1617,12 @@ arc_return_buf(arc_buf_t *buf, void *tag)
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- hdr = buf->b_hdr;
- (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
- (void) refcount_remove(&hdr->b_refcnt, tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
buf->b_efunc = NULL;
buf->b_private = NULL;
@@ -1391,15 +1636,16 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- ASSERT(hdr->b_state != arc_anon);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ hdr->b_l1hdr.b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
@@ -1409,11 +1655,11 @@ arc_buf_clone(arc_buf_t *from)
* then track the size and number of duplicates. These stats will be
* updated as duplicate buffers are created and destroyed.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
+ if (HDR_ISTYPE_DATA(hdr)) {
ARCSTAT_BUMP(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
}
- hdr->b_datacnt += 1;
+ hdr->b_l1hdr.b_datacnt += 1;
return (buf);
}
@@ -1436,17 +1682,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
hdr = buf->b_hdr;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
mutex_exit(&buf->b_evict_lock);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
add_reference(hdr, hash_lock, tag);
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
}
@@ -1484,10 +1733,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
arc_buf_t **bufp;
/* free up data associated with the buf */
- if (buf->b_data) {
- arc_state_t *state = buf->b_hdr->b_state;
+ if (buf->b_data != NULL) {
+ arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@@ -1503,11 +1752,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
atomic_add_64(&arc_size, -size);
}
}
- if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
uint64_t *cnt = &state->arcs_lsize[type];
- ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
- ASSERT(state != arc_anon);
+ ASSERT(refcount_is_zero(
+ &buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
ASSERT3U(*cnt, >=, size);
atomic_add_64(cnt, -size);
@@ -1520,13 +1770,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
* If we're destroying a duplicate buffer make sure
* that the appropriate statistics are updated.
*/
- if (buf->b_hdr->b_datacnt > 1 &&
- buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
+ HDR_ISTYPE_DATA(buf->b_hdr)) {
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
}
- ASSERT(buf->b_hdr->b_datacnt > 0);
- buf->b_hdr->b_datacnt -= 1;
+ ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
+ buf->b_hdr->b_l1hdr.b_datacnt -= 1;
}
/* only remove the buf if requested */
@@ -1534,7 +1784,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
return;
/* remove the buf from the hdr list */
- for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
+ bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
buf->b_next = NULL;
@@ -1549,86 +1800,87 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- ASSERT3P(hdr->b_state, ==, arc_anon);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_datacnt > 0);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);
- if (l2hdr != NULL) {
- boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
if (!buflist_held) {
- mutex_enter(&l2arc_buflist_mtx);
- l2hdr = hdr->b_l2hdr;
+ mutex_enter(&l2hdr->b_dev->l2ad_mtx);
+ l2hdr = &hdr->b_l2hdr;
}
- if (l2hdr != NULL) {
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- vdev_space_update(l2hdr->b_dev->l2ad_vdev,
- -l2hdr->b_asize, 0, 0);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
- }
+ list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
+
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
if (!buflist_held)
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&l2hdr->b_dev->l2ad_mtx);
+
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}
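The old FREE_IN_PROGRESS dance against the global buflist mutex is replaced here by a conditional acquire of the per-device lock; a sketch of the pattern, with names taken from this diff:

	boolean_t held = MUTEX_HELD(&dev->l2ad_mtx);

	if (!held)
		mutex_enter(&dev->l2ad_mtx);
	/* ... unlink the header from dev->l2ad_buflist ... */
	if (!held)
		mutex_exit(&dev->l2ad_mtx);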
- if (!BUF_EMPTY(hdr)) {
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ if (!BUF_EMPTY(hdr))
buf_discard_identity(hdr);
- }
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
-
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- mutex_enter(&buf->b_evict_lock);
- ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
- hdr->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
- } else {
- arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
- }
- }
+
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- if (hdr->b_thawed) {
- kmem_free(hdr->b_thawed, 1);
- hdr->b_thawed = NULL;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+
+ if (buf->b_efunc != NULL) {
+ mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&buf->b_evict_lock);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
+ FALSE);
+ hdr->b_l1hdr.b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
+ TRUE);
+ }
+ }
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
+ }
+#endif
}
- ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
- ASSERT3P(hdr->b_acb, ==, NULL);
- kmem_cache_free(hdr_cache, hdr);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
}
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- int hashed = hdr->b_state != arc_anon;
+ int hashed = hdr->b_l1hdr.b_state != arc_anon;
ASSERT(buf->b_efunc == NULL);
ASSERT(buf->b_data != NULL);
@@ -1641,10 +1893,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
} else {
- ASSERT(buf == hdr->b_buf);
+ ASSERT(buf == hdr->b_l1hdr.b_buf);
ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
@@ -1658,7 +1910,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
*/
mutex_enter(&arc_eviction_mtx);
(void) remove_reference(hdr, NULL, tag);
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
mutex_exit(&arc_eviction_mtx);
if (destroy_hdr)
@@ -1678,34 +1930,35 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
kmutex_t *hash_lock = HDR_LOCK(hdr);
boolean_t no_callback = (buf->b_efunc == NULL);
- if (hdr->b_state == arc_anon) {
- ASSERT(hdr->b_datacnt == 1);
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
mutex_enter(hash_lock);
hdr = buf->b_hdr;
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(hdr->b_state != arc_anon);
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
ASSERT(buf->b_data != NULL);
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
if (no_callback)
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
- ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
- ASSERT(no_callback || hdr->b_datacnt > 1 ||
- refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
mutex_exit(hash_lock);
return (no_callback);
}
-int
+int32_t
arc_buf_size(arc_buf_t *buf)
{
return (buf->b_hdr->b_size);
@@ -1746,7 +1999,7 @@ arc_buf_eviction_needed(arc_buf_t *buf)
return (B_TRUE);
}
- if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+ if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
evict_needed = B_TRUE;
mutex_exit(&buf->b_evict_lock);
@@ -1783,8 +2036,20 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
- mutex_enter(&state->arcs_mtx);
+ /*
+ * The ghost list lock must be acquired first in order to prevent
+ * a 3 party deadlock:
+ *
+ * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
+ * l2ad_mtx in arc_hdr_realloc
+ * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
+ * - arc_evict acquires arc_*->arcs_mtx, followed by
+ * arc_*_ghost->arcs_mtx and forms a deadlock cycle.
+ *
+ * This situation is avoided by acquiring the ghost list lock first.
+ */
mutex_enter(&evicted_state->arcs_mtx);
+ mutex_enter(&state->arcs_mtx);
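With the ordering above, the per-thread acquisitions are ghost -> l2ad (arc_evict_ghost), l2ad -> state (l2arc_write_buffers), and now ghost -> state here, so the lock-order graph is acyclic. A runnable pthread sketch of the convention, with stand-in locks:

#include <pthread.h>

/* Stand-ins for arc_*_ghost->arcs_mtx and arc_*->arcs_mtx. */
static pthread_mutex_t ghost_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t state_mtx = PTHREAD_MUTEX_INITIALIZER;

/*
 * Every path that needs both locks takes the ghost-list lock first,
 * mirroring the mutex_enter() pair above; with no thread acquiring
 * them in the opposite order, the deadlock cycle cannot form.
 */
static void
evict_like_path(void)
{
	pthread_mutex_lock(&ghost_mtx);
	pthread_mutex_lock(&state_mtx);
	/* ... move headers between the two lists ... */
	pthread_mutex_unlock(&state_mtx);
	pthread_mutex_unlock(&ghost_mtx);
}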
/*
* Decide which "type" (data vs metadata) to recycle from.
@@ -1800,6 +2065,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
arc_buf_hdr_t *metadata_hdr =
list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
arc_buf_contents_t realtype;
+
if (data_hdr == NULL) {
realtype = ARC_BUFC_METADATA;
} else if (metadata_hdr == NULL) {
@@ -1808,13 +2074,13 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
realtype = ARC_BUFC_METADATA;
} else if (arc_meta_used <= arc_meta_min) {
realtype = ARC_BUFC_DATA;
+ } else if (HDR_HAS_L1HDR(data_hdr) &&
+ HDR_HAS_L1HDR(metadata_hdr) &&
+ data_hdr->b_l1hdr.b_arc_access <
+ metadata_hdr->b_l1hdr.b_arc_access) {
+ realtype = ARC_BUFC_DATA;
} else {
- if (data_hdr->b_arc_access <
- metadata_hdr->b_arc_access) {
- realtype = ARC_BUFC_DATA;
- } else {
- realtype = ARC_BUFC_METADATA;
- }
+ realtype = ARC_BUFC_METADATA;
}
if (realtype != type) {
/*
@@ -1835,8 +2101,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
(spa && hdr->b_spa != spa) ||
- (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) &&
- ddi_get_lbolt() - hdr->b_arc_access <
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
arc_min_prefetch_lifespan)) {
skipped++;
continue;
@@ -1861,11 +2127,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
*/
if (!recycle && count++ > arc_evict_iterations) {
list_insert_after(list, hdr, &marker);
- mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&state->arcs_mtx);
+ mutex_exit(&evicted_state->arcs_mtx);
kpreempt(KPREEMPT_SYNC);
- mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
+ mutex_enter(&state->arcs_mtx);
hdr_prev = list_prev(list, &marker);
list_remove(list, &marker);
count = 0;
@@ -1875,28 +2141,29 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
hash_lock = HDR_LOCK(hdr);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&hdr->b_refcnt));
- ASSERT(hdr->b_datacnt > 0);
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
- if (buf->b_data) {
+ if (buf->b_data != NULL) {
bytes_evicted += hdr->b_size;
- if (recycle && hdr->b_type == type &&
+ if (recycle &&
+ arc_buf_type(hdr) == type &&
hdr->b_size == bytes &&
!HDR_L2_WRITING(hdr)) {
stolen = buf->b_data;
recycle = FALSE;
}
}
- if (buf->b_efunc) {
+ if (buf->b_efunc != NULL) {
mutex_enter(&arc_eviction_mtx);
arc_buf_destroy(buf,
buf->b_data == stolen, FALSE);
- hdr->b_buf = buf->b_next;
+ hdr->b_l1hdr.b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
@@ -1909,7 +2176,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
}
}
- if (hdr->b_l2hdr) {
+ if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_cached,
hdr->b_size);
} else {
@@ -1923,7 +2190,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
}
}
- if (hdr->b_datacnt == 0) {
+ if (hdr->b_l1hdr.b_datacnt == 0) {
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
@@ -1939,8 +2206,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
}
}
- mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&state->arcs_mtx);
+ mutex_exit(&evicted_state->arcs_mtx);
if (bytes_evicted < bytes)
dprintf("only evicted %lld bytes from %x",
@@ -1982,7 +2249,7 @@ top:
mutex_enter(&state->arcs_mtx);
for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(list, hdr);
- if (hdr->b_type > ARC_BUFC_NUMTYPES)
+ if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
panic("invalid hdr=%p", (void *)hdr);
if (spa && hdr->b_spa != spa)
continue;
@@ -2014,16 +2281,23 @@ top:
}
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(!HDR_HAS_L1HDR(hdr) ||
+ hdr->b_l1hdr.b_buf == NULL);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += hdr->b_size;
- if (hdr->b_l2hdr != NULL) {
+ if (HDR_HAS_L2HDR(hdr)) {
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
*/
arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
mutex_exit(hash_lock);
} else {
arc_change_state(arc_anon, hdr, hash_lock);
@@ -2166,27 +2440,27 @@ arc_flush(spa_t *spa)
{
uint64_t guid = 0;
- if (spa)
+ if (spa != NULL)
guid = spa_load_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
+ if (spa != NULL)
break;
}
@@ -2406,7 +2680,8 @@ arc_kmem_reap_now(void)
}
}
kmem_cache_reap_now(buf_cache);
- kmem_cache_reap_now(hdr_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
kmem_cache_reap_now(range_seg_cache);
if (zio_arena != NULL) {
@@ -2590,9 +2865,9 @@ arc_evict_needed(arc_buf_contents_t type)
static void
arc_get_data_buf(arc_buf_t *buf)
{
- arc_state_t *state = buf->b_hdr->b_state;
+ arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
arc_adapt(size, state);
@@ -2618,8 +2893,7 @@ arc_get_data_buf(arc_buf_t *buf)
* will end up on the mru list; so steal space from there.
*/
if (state == arc_mfu_ghost)
- state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ?
- arc_mru : arc_mfu;
+ state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
else if (state == arc_mru_ghost)
state = arc_mru;
@@ -2651,19 +2925,20 @@ out:
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
- if (!GHOST_STATE(buf->b_hdr->b_state)) {
+ if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
arc_buf_hdr_t *hdr = buf->b_hdr;
- atomic_add_64(&hdr->b_state->arcs_size, size);
- if (list_link_active(&hdr->b_arc_node)) {
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+ atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
+ if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
+ size);
}
/*
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
- if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
@@ -2679,20 +2954,21 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
clock_t now;
ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- if (hdr->b_state == arc_anon) {
+ if (hdr->b_l1hdr.b_state == arc_anon) {
/*
* This buffer is not in the cache, and does not
* appear in our "ghost" list. Add the new buffer
* to the MRU state.
*/
- ASSERT(hdr->b_arc_access == 0);
- hdr->b_arc_access = ddi_get_lbolt();
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mru, hdr, hash_lock);
- } else if (hdr->b_state == arc_mru) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
now = ddi_get_lbolt();
/*
@@ -2703,14 +2979,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
- if (refcount_count(&hdr->b_refcnt) == 0) {
- ASSERT(list_link_active(&hdr->b_arc_node));
+ if (HDR_PREFETCH(hdr)) {
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ ASSERT(list_link_active(
+ &hdr->b_l1hdr.b_arc_node));
} else {
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
}
- hdr->b_arc_access = now;
+ hdr->b_l1hdr.b_arc_access = now;
return;
}
@@ -2719,18 +2996,18 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (now > hdr->b_arc_access + ARC_MINTIME) {
+ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- hdr->b_arc_access = now;
+ hdr->b_l1hdr.b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
}
ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (hdr->b_state == arc_mru_ghost) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
@@ -2738,9 +3015,9 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (hdr->b_flags & ARC_FLAG_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&hdr->b_refcnt) > 0)
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
@@ -2748,11 +3025,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
}
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (hdr->b_state == arc_mfu) {
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
@@ -2762,13 +3039,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
- ASSERT(refcount_count(&hdr->b_refcnt) == 0);
- ASSERT(list_link_active(&hdr->b_arc_node));
+ if ((HDR_PREFETCH(hdr)) != 0) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- hdr->b_arc_access = ddi_get_lbolt();
- } else if (hdr->b_state == arc_mfu_ghost) {
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
* This buffer has been accessed more than once but has
@@ -2776,26 +3053,26 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (hdr->b_flags & ARC_FLAG_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&hdr->b_refcnt));
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else if (hdr->b_state == arc_l2c_only) {
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
* This buffer is on the 2nd Level ARC.
*/
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
} else {
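
arc_access() is the ARC's promotion policy, and this hunk mechanically rewrites each state check for the new b_l1hdr layout; a condensed model of the transitions it performs, under the simplifying assumption that only the prefetch flag and the ARC_MINTIME age matter:

typedef enum { ANON, MRU, MRU_GHOST, MFU, MFU_GHOST, L2C_ONLY } model_state_t;

static model_state_t
next_state(model_state_t cur, int prefetch, int older_than_mintime)
{
	switch (cur) {
	case ANON:
		return (MRU);		/* first access */
	case MRU:
		if (prefetch)
			return (MRU);	/* stays put, access time updated */
		return (older_than_mintime ? MFU : MRU);
	case MRU_GHOST:
		return (prefetch ? MRU : MFU);
	case MFU:
		return (MFU);
	case MFU_GHOST:
		return (prefetch ? MRU : MFU);
	case L2C_ONLY:
		return (MFU);		/* re-admitted from the L2ARC */
	}
	return (cur);
}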
@@ -2866,11 +3143,11 @@ arc_read_done(zio_t *zio)
}
hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
- if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH))
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
hdr->b_flags &= ~ARC_FLAG_L2CACHE;
/* byteswap if necessary */
- callback_list = hdr->b_acb;
+ callback_list = hdr->b_l1hdr.b_acb;
ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
dmu_object_byteswap_t bswap =
@@ -2884,7 +3161,8 @@ arc_read_done(zio_t *zio)
arc_cksum_compute(buf, B_FALSE);
arc_buf_watch(buf);
- if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -2906,24 +3184,25 @@ arc_read_done(zio_t *zio)
abuf = NULL;
}
}
- hdr->b_acb = NULL;
+ hdr->b_l1hdr.b_acb = NULL;
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
if (abuf == buf) {
ASSERT(buf->b_efunc == NULL);
- ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
- ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
if (zio->io_error != 0) {
hdr->b_flags |= ARC_FLAG_IO_ERROR;
- if (hdr->b_state != arc_anon)
+ if (hdr->b_l1hdr.b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/*
@@ -2931,9 +3210,9 @@ arc_read_done(zio_t *zio)
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
- cv_broadcast(&hdr->b_cv);
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
- if (hash_lock) {
+ if (hash_lock != NULL) {
mutex_exit(hash_lock);
} else {
/*
@@ -2942,8 +3221,8 @@ arc_read_done(zio_t *zio)
* moved to the anonymous state (so that it won't show up
* in the cache).
*/
- ASSERT3P(hdr->b_state, ==, arc_anon);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/* execute each callback and free its structure */
@@ -3005,14 +3284,14 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
- if (hdr != NULL && hdr->b_datacnt > 0) {
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
if (*arc_flags & ARC_FLAG_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
goto top;
}
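
The ARC_FLAG_WAIT branch above is a classic wait-then-retry: sleep on the header's condvar while holding the hash lock, then restart the lookup from the top, because the header may have changed while we slept. A hedged pthreads rendering of the same shape (the names here are stand-ins, not arc.c code):

    #include <pthread.h>

    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
    static int io_in_progress;      /* stands in for HDR_IO_IN_PROGRESS() */

    static void
    read_when_ready(void)
    {
    top:
        pthread_mutex_lock(&hash_lock);
        if (io_in_progress) {
            /* like cv_wait(): atomically drops and reacquires the mutex */
            pthread_cond_wait(&io_cv, &hash_lock);
            pthread_mutex_unlock(&hash_lock);
            goto top;   /* the header may have moved; look it up again */
        }
        /* ... use the now-cached buffer ... */
        pthread_mutex_unlock(&hash_lock);
    }
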
@@ -3030,8 +3309,8 @@ top:
spa, NULL, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
- acb->acb_next = hdr->b_acb;
- hdr->b_acb = acb;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
return (0);
@@ -3040,7 +3319,8 @@ top:
return (0);
}
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
add_reference(hdr, hash_lock, private);
@@ -3049,7 +3329,7 @@ top:
* copy of the data so that we will be guaranteed
* that arc_release() will always succeed.
*/
- buf = hdr->b_buf;
+ buf = hdr->b_l1hdr.b_buf;
ASSERT(buf);
ASSERT(buf->b_data);
if (HDR_BUF_AVAILABLE(hdr)) {
@@ -3060,7 +3340,7 @@ top:
}
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
- refcount_count(&hdr->b_refcnt) == 0) {
+ refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
hdr->b_flags |= ARC_FLAG_PREFETCH;
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
@@ -3071,8 +3351,8 @@ top:
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
if (done)
@@ -3084,7 +3364,7 @@ top:
uint64_t addr = 0;
boolean_t devw = B_FALSE;
enum zio_compress b_compress = ZIO_COMPRESS_OFF;
- uint64_t b_asize = 0;
+ int32_t b_asize = 0;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -3095,7 +3375,6 @@ top:
if (!BP_IS_EMBEDDED(bp)) {
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
@@ -3119,11 +3398,20 @@ top:
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_FLAG_INDIRECT;
} else {
- /* this block is in the ghost cache */
- ASSERT(GHOST_STATE(hdr->b_state));
+ /*
+ * This block is in the ghost cache. If it was L2-only
+ * (and thus didn't have an L1 hdr), we realloc the
+ * header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+
+ ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT0(refcount_count(&hdr->b_refcnt));
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH)
@@ -3140,29 +3428,29 @@ top:
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_next = NULL;
- hdr->b_buf = buf;
- ASSERT(hdr->b_datacnt == 0);
- hdr->b_datacnt = 1;
+ hdr->b_l1hdr.b_buf = buf;
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ hdr->b_l1hdr.b_datacnt = 1;
arc_get_data_buf(buf);
arc_access(hdr, hash_lock);
}
- ASSERT(!GHOST_STATE(hdr->b_state));
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- ASSERT(hdr->b_acb == NULL);
- hdr->b_acb = acb;
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
+ hdr->b_l1hdr.b_acb = acb;
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
- if (hdr->b_l2hdr != NULL &&
- (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
- devw = hdr->b_l2hdr->b_dev->l2ad_writing;
- addr = hdr->b_l2hdr->b_daddr;
- b_compress = hdr->b_l2hdr->b_compress;
- b_asize = hdr->b_l2hdr->b_asize;
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ b_compress = HDR_GET_COMPRESS(hdr);
+ b_asize = hdr->b_l2hdr.b_asize;
/*
* Lock out device removal.
*/
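
The ghost-cache branch earlier in this hunk grows an L2-only header into a full one via arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache). The underlying idea, as a self-contained sketch (the struct layouts are illustrative, not the real arc_buf_hdr_t): allocate the larger layout, copy only the fields the two layouts share, and free the small one. Blocks that live only in L2ARC keep the small allocation.

    #include <stdlib.h>
    #include <string.h>

    typedef struct { unsigned long dva[2], birth; unsigned flags; } hdr_shared_t;
    typedef struct { hdr_shared_t s; /* l2hdr fields ... */ } hdr_l2only_t;
    typedef struct { hdr_shared_t s; /* l2hdr fields ... */ void *l1_buf; } hdr_full_t;

    static hdr_full_t *
    realloc_to_full(hdr_l2only_t *old)
    {
        hdr_full_t *nhdr = calloc(1, sizeof (*nhdr));

        if (nhdr == NULL)
            abort();
        /* copy just the shared prefix; the L1-only fields start zeroed */
        memcpy(&nhdr->s, &old->s, sizeof (old->s));
        free(old);
        return (nhdr);
    }
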
@@ -3182,8 +3470,8 @@ top:
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_phys_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
@@ -3196,7 +3484,7 @@ top:
* also have invalidated the vdev.
* 5. This isn't prefetch and l2arc_noprefetch is set.
*/
- if (hdr->b_l2hdr != NULL &&
+ if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
@@ -3288,8 +3576,9 @@ void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
ASSERT(buf->b_hdr != NULL);
- ASSERT(buf->b_hdr->b_state != arc_anon);
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
+ func == NULL);
ASSERT(buf->b_efunc == NULL);
ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
@@ -3313,7 +3602,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
if (hdr == NULL)
return;
if (HDR_BUF_AVAILABLE(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
add_reference(hdr, hash_lock, FTAG);
hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
mutex_exit(hash_lock);
@@ -3371,17 +3660,19 @@ arc_clear_callback(arc_buf_t *buf)
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
+ hdr->b_l1hdr.b_datacnt);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
buf->b_efunc = NULL;
buf->b_private = NULL;
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf, FALSE, TRUE);
} else {
- ASSERT(buf == hdr->b_buf);
+ ASSERT(buf == hdr->b_l1hdr.b_buf);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
mutex_exit(&buf->b_evict_lock);
}
@@ -3400,10 +3691,7 @@ arc_clear_callback(arc_buf_t *buf)
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock = NULL;
- l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
/*
* It would be nice to assert that if it's DMU metadata (level >
@@ -3412,56 +3700,87 @@ arc_release(arc_buf_t *buf, void *tag)
*/
mutex_enter(&buf->b_evict_lock);
- hdr = buf->b_hdr;
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(BUF_EMPTY(hdr));
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT(buf->b_efunc == NULL);
- } else {
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
+ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ ASSERT3P(buf->b_efunc, ==, NULL);
+ ASSERT3P(buf->b_private, ==, NULL);
+
+ hdr->b_l1hdr.b_arc_access = 0;
+ arc_buf_thaw(buf);
+
+ return;
}
- l2hdr = hdr->b_l2hdr;
- if (l2hdr) {
- mutex_enter(&l2arc_buflist_mtx);
- hdr->b_l2hdr = NULL;
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held; we must be careful not to reference state or the
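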
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+ list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}
- buf_size = hdr->b_size;
/*
* Do we have more than one buf?
*/
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
uint64_t spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(hdr);
uint32_t flags = hdr->b_flags;
- ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
/*
* Pull the data off of this hdr and attach it to
* a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
- bufp = &hdr->b_buf;
+ bufp = &hdr->b_l1hdr.b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
*bufp = buf->b_next;
buf->b_next = NULL;
- ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
- if (refcount_is_zero(&hdr->b_refcnt)) {
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3P(state, !=, arc_l2c_only);
+ ASSERT3U(state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ uint64_t *size = &state->arcs_lsize[type];
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
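
The datacnt > 1 path above splices one arc_buf_t out of the header's singly linked b_buf/b_next chain before hanging it off a fresh anonymous header. The unlink by itself, with illustrative types:

    struct buf { struct buf *b_next; };

    static void
    unlink_buf(struct buf **bufp, struct buf *buf)
    {
        /* advance bufp until it is the link that points at buf */
        while (*bufp != buf)
            bufp = &(*bufp)->b_next;
        /* splice buf out and terminate it */
        *bufp = buf->b_next;
        buf->b_next = NULL;
    }
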
@@ -3470,57 +3789,50 @@ arc_release(arc_buf_t *buf, void *tag)
* We're releasing a duplicate user data buffer, update
* our statistics accordingly.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
+ if (HDR_ISTYPE_DATA(hdr)) {
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size,
-hdr->b_size);
}
- hdr->b_datacnt -= 1;
+ hdr->b_l1hdr.b_datacnt -= 1;
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
- nhdr->b_type = type;
- nhdr->b_buf = buf;
- nhdr->b_state = arc_anon;
- nhdr->b_arc_access = 0;
+
nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
- nhdr->b_l2hdr = NULL;
- nhdr->b_datacnt = 1;
+ nhdr->b_flags |= arc_bufc_to_flags(type);
+ nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_datacnt = 1;
+ nhdr->b_l1hdr.b_state = arc_anon;
+ nhdr->b_l1hdr.b_arc_access = 0;
nhdr->b_freeze_cksum = NULL;
- (void) refcount_add(&nhdr->b_refcnt, tag);
+
+ (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
mutex_exit(&buf->b_evict_lock);
- ASSERT(refcount_count(&hdr->b_refcnt) == 1);
- ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock */
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- if (hdr->b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_arc_access = 0;
- if (hash_lock)
- mutex_exit(hash_lock);
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+ mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
-
- if (l2hdr) {
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- vdev_space_update(l2hdr->b_dev->l2ad_vdev,
- -l2hdr->b_asize, 0, 0);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -buf_size);
- mutex_exit(&l2arc_buflist_mtx);
- }
}
int
@@ -3529,7 +3841,8 @@ arc_released(arc_buf_t *buf)
int released;
mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
mutex_exit(&buf->b_evict_lock);
return (released);
}
@@ -3541,7 +3854,7 @@ arc_referenced(arc_buf_t *buf)
int referenced;
mutex_enter(&buf->b_evict_lock);
- referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
mutex_exit(&buf->b_evict_lock);
return (referenced);
}
@@ -3554,7 +3867,9 @@ arc_write_ready(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
callback->awcb_ready(zio, buf, callback->awcb_private);
/*
@@ -3564,12 +3879,12 @@ arc_write_ready(zio_t *zio)
* accounting for any re-write attempt.
*/
if (HDR_IO_IN_PROGRESS(hdr)) {
- mutex_enter(&hdr->b_freeze_lock);
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- mutex_exit(&hdr->b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
arc_cksum_compute(buf, B_FALSE);
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
@@ -3594,7 +3909,7 @@ arc_write_done(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_acb == NULL);
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
if (zio->io_error == 0) {
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
@@ -3602,7 +3917,6 @@ arc_write_done(zio_t *zio)
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
}
} else {
ASSERT(BUF_EMPTY(hdr));
@@ -3623,7 +3937,7 @@ arc_write_done(zio_t *zio)
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ if (exists != NULL) {
/*
* This can only happen if we overwrite for
* sync-to-convergence, because we remove
@@ -3633,7 +3947,8 @@ arc_write_done(zio_t *zio)
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad overwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
- ASSERT(refcount_is_zero(&exists->b_refcnt));
+ ASSERT(refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
mutex_exit(hash_lock);
arc_hdr_destroy(exists);
@@ -3647,22 +3962,22 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
} else {
/* Dedup */
- ASSERT(hdr->b_datacnt == 1);
- ASSERT(hdr->b_state == arc_anon);
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
}
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (!exists && hdr->b_state == arc_anon)
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
}
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
@@ -3682,8 +3997,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT(ready != NULL);
ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == NULL);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
if (l2arc)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (l2arc_compress)
@@ -3903,25 +4219,35 @@ arc_init(void)
mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
buf_init();
@@ -4000,7 +4326,7 @@ arc_fini(void)
buf_fini();
- ASSERT(arc_loaned_bytes == 0);
+ ASSERT0(arc_loaned_bytes);
}
/*
@@ -4158,7 +4484,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
* 3. has an I/O in progress (it may be an incomplete read).
* 4. is flagged not eligible (zfs property).
*/
- if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL ||
+ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
return (B_FALSE);
@@ -4211,20 +4537,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
return (next);
}
-static void
-l2arc_hdr_stat_add(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
- ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
- ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
/*
* Cycle through L2ARC devices. This is how L2ARC load balances.
* If a device is returned, this also returns holding the spa config lock.
@@ -4321,7 +4633,6 @@ l2arc_write_done(zio_t *zio)
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *hdr, *hdr_prev;
- l2arc_buf_hdr_t *abl2;
kmutex_t *hash_lock;
int64_t bytes_dropped = 0;
@@ -4331,7 +4642,7 @@ l2arc_write_done(zio_t *zio)
ASSERT(dev != NULL);
head = cb->l2wcb_head;
ASSERT(head != NULL);
- buflist = dev->l2ad_buflist;
+ buflist = &dev->l2ad_buflist;
ASSERT(buflist != NULL);
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
@@ -4339,41 +4650,43 @@ l2arc_write_done(zio_t *zio)
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
/*
* All writes completed, or an error was hit.
*/
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
- abl2 = hdr->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
- * of eviction. Its ARC_L2_WRITING flag will be
+ * of eviction. Its ARC_FLAG_L2_WRITING flag will be
* left set, denying reads to this buffer.
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
+ /*
+ * It's possible that this buffer got evicted from the L1 cache
+ * before we grabbed the vdev + hash locks, in which case
+ * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
+ * Only free the buffer if we still have an L1 hdr.
+ */
+ if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(hdr);
+
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- bytes_dropped += abl2->b_asize;
- hdr->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
}
@@ -4387,8 +4700,9 @@ l2arc_write_done(zio_t *zio)
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
- kmem_cache_free(hdr_cache, head);
- mutex_exit(&l2arc_buflist_mtx);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
@@ -4526,16 +4840,12 @@ static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
int64_t bytes_evicted = 0;
- buflist = dev->l2ad_buflist;
-
- if (buflist == NULL)
- return;
+ buflist = &dev->l2ad_buflist;
if (!all && dev->l2ad_first) {
/*
@@ -4558,7 +4868,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
uint64_t, taddr, boolean_t, all);
top:
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
@@ -4568,7 +4878,7 @@ top:
* Missed the hash lock. Retry.
*/
ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
@@ -4584,9 +4894,9 @@ top:
continue;
}
- if (!all && hdr->b_l2hdr != NULL &&
- (hdr->b_l2hdr->b_daddr > taddr ||
- hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ if (!all && HDR_HAS_L2HDR(hdr) &&
+ (hdr->b_l2hdr.b_daddr > taddr ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
* or the end of the device.
@@ -4595,15 +4905,8 @@ top:
break;
}
- if (HDR_FREE_IN_PROGRESS(hdr)) {
- /*
- * Already on the path to destruction.
- */
- mutex_exit(hash_lock);
- continue;
- }
-
- if (hdr->b_state == arc_l2c_only) {
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ if (!HDR_HAS_L1HDR(hdr)) {
ASSERT(!HDR_L2_READING(hdr));
/*
* This doesn't exist in the ARC. Destroy.
@@ -4613,6 +4916,8 @@ top:
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
} else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
/*
* Invalidate issued or about to be issued
* reads, since we may be about to write
@@ -4623,28 +4928,18 @@ top:
hdr->b_flags |= ARC_FLAG_L2_EVICTED;
}
- /*
- * Tell ARC this no longer exists in L2ARC.
- */
- if (hdr->b_l2hdr != NULL) {
- abl2 = hdr->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- bytes_evicted += abl2->b_asize;
- hdr->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- }
+ /* Tell ARC this no longer exists in L2ARC. */
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
list_remove(buflist, hdr);
- /*
- * This may have been leftover after a
- * failed write.
- */
+ /* This may have been leftover after a failed write. */
hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
}
mutex_exit(hash_lock);
}
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
dev->l2ad_evict = taddr;
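
l2arc_evict() above takes each header's hash lock with mutex_tryenter() while already holding l2ad_mtx; on failure it backs all the way out, waits for the hash lock to come free, and restarts, rather than blocking with the device lock held. A hedged pthreads rendering of that back-off (names are stand-ins):

    #include <pthread.h>

    static pthread_mutex_t l2ad_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    evict_one(void)
    {
    top:
        pthread_mutex_lock(&l2ad_mtx);
        if (pthread_mutex_trylock(&hash_lock) != 0) {
            /* don't block with the device lock held; back off and retry */
            pthread_mutex_unlock(&l2ad_mtx);
            pthread_mutex_lock(&hash_lock);     /* wait out the holder */
            pthread_mutex_unlock(&hash_lock);
            goto top;
        }
        /* ... remove the header from the device's buflist ... */
        pthread_mutex_unlock(&hash_lock);
        pthread_mutex_unlock(&l2ad_mtx);
    }
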
@@ -4685,8 +4980,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
pio = NULL;
write_sz = write_asize = write_psize = 0;
full = B_FALSE;
- head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
+ head->b_flags |= ARC_FLAG_HAS_L2HDR;
/*
* We will want to try to compress buffers that are at least 2x the
@@ -4697,7 +4993,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Copy buffers for L2ARC writing.
*/
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
for (int try = 0; try <= 3; try++) {
uint64_t passed_sz = 0;
@@ -4719,7 +5015,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; hdr; hdr = hdr_prev) {
- l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
uint64_t buf_sz;
@@ -4762,7 +5057,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
- list_insert_head(dev->l2ad_buflist, head);
+ list_insert_head(&dev->l2ad_buflist, head);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -4775,33 +5070,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Create and add a new L2ARC header.
*/
- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
- l2hdr->b_dev = dev;
+ hdr->b_l2hdr.b_dev = dev;
hdr->b_flags |= ARC_FLAG_L2_WRITING;
-
/*
* Temporarily stash the data buffer in b_tmp_cdata.
* The subsequent write step will pick it up from
- * there. This is because can't access hdr->b_buf
+ * there. This is because we can't access b_l1hdr.b_buf
* without holding the hash_lock, which we in turn
* can't access without holding the ARC list locks
* (which we want to avoid during compression/writing).
*/
- l2hdr->b_compress = ZIO_COMPRESS_OFF;
- l2hdr->b_asize = hdr->b_size;
- l2hdr->b_tmp_cdata = hdr->b_buf->b_data;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ hdr->b_l2hdr.b_asize = hdr->b_size;
+ hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
buf_sz = hdr->b_size;
- hdr->b_l2hdr = l2hdr;
+ hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
- list_insert_head(dev->l2ad_buflist, hdr);
+ list_insert_head(&dev->l2ad_buflist, hdr);
/*
* Compute and store the buffer cksum before
* writing. On debug the cksum is verified first.
*/
- arc_cksum_verify(hdr->b_buf);
- arc_cksum_compute(hdr->b_buf, B_TRUE);
+ arc_cksum_verify(hdr->b_l1hdr.b_buf);
+ arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
mutex_exit(hash_lock);
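
The b_tmp_cdata comment above describes a two-phase handoff: stage the data pointer while the hash lock is held, then let the later compression/write step read the staged copy without any lock, relying on ARC_FLAG_L2_WRITING to keep it stable. Reduced to a sketch (types and names are illustrative):

    struct hdr {
        int flags;          /* FLAG_L2_WRITING keeps tmp_cdata stable */
        void *buf_data;     /* only safe to dereference under hash_lock */
        void *tmp_cdata;    /* staged copy, read later without the lock */
    };

    #define FLAG_L2_WRITING 0x1

    static void
    stage_for_write(struct hdr *h)      /* caller holds hash_lock */
    {
        h->flags |= FLAG_L2_WRITING;
        h->tmp_cdata = h->buf_data;
    }

    static void *
    write_source(struct hdr *h)         /* no hash_lock needed here */
    {
        return (h->tmp_cdata);          /* stable while L2_WRITING is set */
    }
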
@@ -4817,8 +5110,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
- mutex_exit(&l2arc_buflist_mtx);
- kmem_cache_free(hdr_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
@@ -4827,24 +5121,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* and work backwards, retracing the course of the buffer selector
* loop above.
*/
- for (hdr = list_prev(dev->l2ad_buflist, head); hdr;
- hdr = list_prev(dev->l2ad_buflist, hdr)) {
- l2arc_buf_hdr_t *l2hdr;
+ for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
+ hdr = list_prev(&dev->l2ad_buflist, hdr)) {
uint64_t buf_sz;
/*
* We shouldn't need to lock the buffer here, since we flagged
* it as ARC_FLAG_L2_WRITING in the previous step, but we must
* take care to only access its L2 cache parameters. In
- * particular, hdr->b_buf may be invalid by now due to
+ * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
* ARC eviction.
*/
- l2hdr = hdr->b_l2hdr;
- l2hdr->b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- if ((hdr->b_flags & ARC_FLAG_L2COMPRESS) &&
- l2hdr->b_asize >= buf_compress_minsz) {
- if (l2arc_compress_buf(l2hdr)) {
+ if ((HDR_L2COMPRESS(hdr)) &&
+ hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
+ if (l2arc_compress_buf(hdr)) {
/*
* If compression succeeded, enable headroom
* boost on the next scan cycle.
@@ -4857,8 +5149,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* Pick up the buffer data we had previously stashed away
* (and now potentially also compressed).
*/
- buf_data = l2hdr->b_tmp_cdata;
- buf_sz = l2hdr->b_asize;
+ buf_data = hdr->b_l1hdr.b_tmp_cdata;
+ buf_sz = hdr->b_l2hdr.b_asize;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
@@ -4883,7 +5175,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
}
}
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
@@ -4911,7 +5203,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
* size in l2hdr->b_asize. This routine tries to compress the data and
* depending on the compression result there are three possible outcomes:
* *) The buffer was incompressible. The original l2hdr contents were left
@@ -4929,17 +5221,21 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* buffer was incompressible).
*/
static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+l2arc_compress_buf(arc_buf_hdr_t *hdr)
{
void *cdata;
size_t csize, len, rounded;
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
- ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
- ASSERT(l2hdr->b_tmp_cdata != NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
+ ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
len = l2hdr->b_asize;
cdata = zio_data_buf_alloc(len);
- csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+ ASSERT3P(cdata, !=, NULL);
+ csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
cdata, l2hdr->b_asize);
rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
@@ -4951,9 +5247,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
- l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
l2hdr->b_asize = 0;
- l2hdr->b_tmp_cdata = NULL;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
return (B_TRUE);
} else if (csize > 0 && csize < len) {
@@ -4961,9 +5257,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
* Compression succeeded, we'll keep the cdata around for
* writing and release it afterwards.
*/
- l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
l2hdr->b_asize = csize;
- l2hdr->b_tmp_cdata = cdata;
+ hdr->b_l1hdr.b_tmp_cdata = cdata;
ARCSTAT_BUMP(arcstat_l2_compress_successes);
return (B_TRUE);
} else {
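
The three outcomes documented for l2arc_compress_buf() reduce to a small decision on the rounded compressed size, as a self-contained sketch (MINBLOCK stands in for SPA_MINBLOCKSIZE, ROUNDUP for P2ROUNDUP; enum names are shortened):

    #include <stddef.h>

    enum outcome { OUT_EMPTY, OUT_COMPRESSED, OUT_INCOMPRESSIBLE };

    #define MINBLOCK        512     /* stand-in for SPA_MINBLOCKSIZE */
    #define ROUNDUP(x, a)   (((x) + (a) - 1) / (a) * (a))

    static enum outcome
    classify(size_t csize, size_t len)
    {
        /* pad the compressed size up to the block size, as above */
        csize = ROUNDUP(csize, (size_t)MINBLOCK);
        if (csize == 0)
            return (OUT_EMPTY);             /* all zeros: write nothing */
        if (csize < len)
            return (OUT_COMPRESSED);        /* keep the LZ4 cdata */
        return (OUT_INCOMPRESSIBLE);        /* drop cdata, write raw */
    }
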
@@ -5008,9 +5304,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
* need to fill its io_data after we're done restoring the
* buffer's contents.
*/
- ASSERT(hdr->b_buf != NULL);
- bzero(hdr->b_buf->b_data, hdr->b_size);
- zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+ ASSERT(hdr->b_l1hdr.b_buf != NULL);
+ bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
+ zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
} else {
ASSERT(zio->io_data != NULL);
/*
@@ -5048,17 +5344,17 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
-
- if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
/*
* If the data was compressed, then we've allocated a
* temporary buffer for it, so now we need to release it.
*/
- ASSERT(l2hdr->b_tmp_cdata != NULL);
- zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size);
+ ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
+ zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
+ hdr->b_size);
}
- l2hdr->b_tmp_cdata = NULL;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
}
/*
@@ -5199,13 +5495,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
* This is a list of all ARC buffers that are still valid on the
* device.
*/
- adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
- list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l2node));
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
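
l2arc_add_vdev() now embeds both the buflist and its mutex in the device itself, so each cache device is locked independently instead of through one global l2arc_buflist_mtx. The shape of that per-device state, approximated with pthreads (the struct is illustrative, not the real l2arc_dev_t):

    #include <pthread.h>

    struct hdr_node;                        /* opaque list node, illustrative */

    struct l2dev {
        pthread_mutex_t l2ad_mtx;           /* one lock per cache device */
        struct hdr_node *l2ad_buflist;      /* embedded per-device list */
    };

    static void
    l2dev_init(struct l2dev *d)
    {
        pthread_mutex_init(&d->l2ad_mtx, NULL);
        d->l2ad_buflist = NULL;
    }

    static void
    l2dev_fini(struct l2dev *d)
    {
        /* tear down in reverse order of l2dev_init() */
        d->l2ad_buflist = NULL;
        pthread_mutex_destroy(&d->l2ad_mtx);
    }
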
@@ -5251,8 +5547,8 @@ l2arc_remove_vdev(vdev_t *vd)
* Clear all buflists and ARC references. L2ARC device flush.
*/
l2arc_evict(remdev, 0, B_TRUE);
- list_destroy(remdev->l2ad_buflist);
- kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ list_destroy(&remdev->l2ad_buflist);
+ mutex_destroy(&remdev->l2ad_mtx);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -5267,7 +5563,6 @@ l2arc_init(void)
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
l2arc_dev_list = &L2ARC_dev_list;
@@ -5292,7 +5587,6 @@ l2arc_fini(void)
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
- mutex_destroy(&l2arc_buflist_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
list_destroy(l2arc_dev_list);
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 83ea6f7fbc..4e9e8e202a 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -70,10 +70,29 @@ typedef enum arc_flags
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
- ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */
- ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */
+ ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 16,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 17,
+ ARC_FLAG_HAS_L2HDR = 1 << 18,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included so that MDB can
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
} arc_flags_t;
struct arc_buf {
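
The seven ARC_FLAG_COMPRESS_* placeholders above reserve bits 24-30 for the compression field that the arc.c hunks read and write through HDR_GET_COMPRESS()/HDR_SET_COMPRESS(). A plausible packing, sketched with plain shift-and-mask macros (the real definitions may differ):

    #include <assert.h>
    #include <stdint.h>

    #define COMPRESS_SHIFT  24
    #define COMPRESS_MASK   (0x7fU << COMPRESS_SHIFT)   /* bits 24..30 */

    #define GET_COMPRESS(flags) \
        (((flags) & COMPRESS_MASK) >> COMPRESS_SHIFT)
    #define SET_COMPRESS(flags, c) \
        ((flags) = ((flags) & ~COMPRESS_MASK) | ((uint32_t)(c) << COMPRESS_SHIFT))

    int
    main(void)
    {
        uint32_t flags = 0;
        SET_COMPRESS(flags, 15);    /* e.g. an enum zio_compress value */
        assert(GET_COMPRESS(flags) == 15);
        return (0);
    }
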