Diffstat (limited to 'usr/src/uts/common/fs/zfs/arc.c')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 169
1 file changed, 61 insertions(+), 108 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 293c9afebd..67847d2d99 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -128,7 +128,6 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
-#include <sys/dsl_pool.h>
#include <sys/zfs_zone.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
@@ -150,6 +149,10 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern kmutex_t zfs_write_limit_lock;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -158,12 +161,6 @@ typedef enum arc_reclaim_strategy {
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
-/*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
- */
-int arc_evict_iterations = 100;
-
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
@@ -179,11 +176,6 @@ static int arc_shrink_shift = 5;
*/
static int arc_min_prefetch_lifespan;
-/*
- * If this percent of memory is free, don't throttle.
- */
-int arc_lotsfree_percent = 10;
-
static int arc_dead;
/*
@@ -479,7 +471,6 @@ typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
- arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
@@ -1174,7 +1165,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
uint64_t from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT3P(new_state, !=, old_state);
+ ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1789,8 +1780,6 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
- arc_buf_hdr_t marker = { 0 };
- int count = 0;
ASSERT(state == arc_mru || state == arc_mfu);
@@ -1814,33 +1803,6 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
-
- /* ignore markers */
- if (ab->b_spa == 0)
- continue;
-
- /*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
- *
- * If we are looking for a buffer to recycle, we are in
- * the hot code path, so don't sleep.
- */
- if (!recycle && count++ > arc_evict_iterations) {
- list_insert_after(list, ab, &marker);
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&state->arcs_mtx);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(&state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
- ab_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
- continue;
- }
-
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
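The block removed above interleaved a "drop and reacquire the lock" break into the eviction walk: every arc_evict_iterations passes it parked a dummy marker header in the list, released both arcs_mtx locks, yielded the CPU, and resumed from whatever node then preceded the marker. A rough, self-contained sketch of that pattern follows, using a plain doubly linked list and a pthread mutex; every name in it (struct node, walk_with_breaks, DROP_LOCK_EVERY) is invented for illustration and is not part of the ARC code.

/*
 * Illustrative sketch only, not the removed illumos code: a dummy marker
 * node records the walk position so the list lock can be dropped every
 * few iterations and other threads get a chance to run.
 */
#include <pthread.h>
#include <sched.h>
#include <stddef.h>

#define DROP_LOCK_EVERY 100             /* counterpart of arc_evict_iterations */

struct node {
        struct node *prev, *next;
        int is_marker;                  /* ARC flagged markers with b_spa == 0 */
};

void
walk_with_breaks(struct node *tail, pthread_mutex_t *lock)
{
        struct node marker = { .is_marker = 1 };
        struct node *n, *prev;
        int count = 0;

        pthread_mutex_lock(lock);
        for (n = tail; n != NULL; n = prev) {
                prev = n->prev;

                if (n->is_marker)       /* skip markers left by other walkers */
                        continue;

                if (count++ > DROP_LOCK_EVERY) {
                        /* Park the marker just tail-ward of n. */
                        marker.prev = n;
                        marker.next = n->next;
                        if (n->next != NULL)
                                n->next->prev = &marker;
                        n->next = &marker;

                        /* Let other threads at the list, then resume. */
                        pthread_mutex_unlock(lock);
                        sched_yield();
                        pthread_mutex_lock(lock);

                        /* The list may have changed; restart from the marker. */
                        prev = marker.prev;
                        if (marker.next != NULL)
                                marker.next->prev = marker.prev;
                        if (marker.prev != NULL)
                                marker.prev->next = marker.next;
                        count = 0;
                        continue;
                }

                /* ... examine/evict n here ... */
        }
        pthread_mutex_unlock(lock);
}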
@@ -1922,11 +1884,25 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
- * Note: we have just evicted some data into the ghost state,
- * potentially putting the ghost size over the desired size. Rather
- * that evicting from the ghost list in this hot code path, leave
- * this chore to the arc_reclaim_thread().
+ * We have just evicted some data into the ghost state, make
+ * sure we also adjust the ghost state size if necessary.
*/
+ if (arc_no_grow &&
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+ arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete =
+ MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+ arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
return (stolen);
}
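The restored block above trims the ghost lists directly in arc_evict() when arc_no_grow is set and the combined ghost size exceeds the target arc_c: it first charges the overshoot of anon + MRU + MRU-ghost against the MRU ghost list, and otherwise shrinks the MFU ghost list by the combined excess, handing the result to arc_evict_ghost(). A minimal sketch of just that arithmetic, with plain integers standing in for the real arc_state_t fields and a hypothetical helper name, might look like this:

/*
 * Illustrative sketch with invented names: the "todelete" arithmetic
 * restored above, reduced to plain integer inputs.
 */
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static int64_t
ghost_todelete(int64_t anon_size, int64_t mru_size, int64_t mru_ghost_size,
    int64_t mru_ghost_lsize, int64_t mfu_ghost_size, int64_t mfu_ghost_lsize,
    int64_t arc_c)
{
        /* How far anon + MRU + MRU-ghost overshoot the target size. */
        int64_t mru_over = anon_size + mru_size + mru_ghost_size - arc_c;

        if (mru_over > 0 && mru_ghost_lsize > 0) {
                /* Trim the MRU ghost list first, but no more than it holds. */
                return (MIN(mru_ghost_lsize, mru_over));
        } else if (mfu_ghost_lsize > 0) {
                /* Otherwise shrink the MFU ghost list by the combined excess. */
                return (MIN(mfu_ghost_lsize,
                    mru_ghost_size + mfu_ghost_size - arc_c));
        }
        return (0);
}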
@@ -1944,15 +1920,12 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
- int count = 0;
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
- if (ab->b_type > ARC_BUFC_NUMTYPES)
- panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
@@ -1964,23 +1937,6 @@ top:
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
-
- /*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
- */
- if (count++ > arc_evict_iterations) {
- list_insert_after(list, ab, &marker);
- mutex_exit(&state->arcs_mtx);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(&state->arcs_mtx);
- ab_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
- continue;
- }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -2016,10 +1972,8 @@ top:
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
- } else {
+ } else
bufs_skipped += 1;
- }
-
}
mutex_exit(&state->arcs_mtx);
@@ -2883,7 +2837,7 @@ arc_read_done(zio_t *zio)
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
- void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
+ void *private, int priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
@@ -3494,18 +3448,6 @@ arc_write_ready(zio_t *zio)
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
-/*
- * The SPA calls this callback for each physical write that happens on behalf
- * of a logical write. See the comment in dbuf_write_physdone() for details.
- */
-static void
-arc_write_physdone(zio_t *zio)
-{
- arc_write_callback_t *cb = zio->io_private;
- if (cb->awcb_physdone != NULL)
- cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
-}
-
static void
arc_write_done(zio_t *zio)
{
@@ -3586,9 +3528,8 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
- arc_done_func_t *done, void *private, zio_priority_t priority,
- int zio_flags, const zbookmark_t *zb)
+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
+ void *private, int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -3605,20 +3546,18 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
- callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_physdone, arc_write_done, callback,
- priority, zio_flags, zb);
+ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
@@ -3629,8 +3568,7 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
-
- if (freemem > physmem * arc_lotsfree_percent / 100)
+ if (available_memory >= zfs_write_limit_max)
return (0);
if (txg > last_txg) {
@@ -3654,6 +3592,20 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
return (SET_ERROR(EAGAIN));
}
page_load = 0;
+
+ if (arc_size > arc_c_min) {
+ uint64_t evictable_memory =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+ }
+
+ if (inflight_data > available_memory / 4) {
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (SET_ERROR(ERESTART));
+ }
#endif
return (0);
}
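The lines restored above let arc_memory_throttle() count evictable ARC data (down to the arc_c_min floor) as available memory, then push back with ERESTART once the caller's in-flight dirty data exceeds a quarter of that total. A standalone sketch of just that test, under assumed inputs and with an invented function name, could read:

/*
 * Illustrative sketch, invented names: the throttle test restored above,
 * reduced to plain arithmetic on caller-supplied sizes.
 */
#include <errno.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static int
memory_throttle_check(uint64_t available_memory, uint64_t inflight_data,
    uint64_t arc_size, uint64_t arc_c_min, uint64_t evictable_memory)
{
        /*
         * Data the ARC could give back counts as available memory, but
         * only down to the ARC floor (arc_c_min).
         */
        if (arc_size > arc_c_min)
                available_memory += MIN(evictable_memory, arc_size - arc_c_min);

        /* Push back once in-flight dirty data exceeds a quarter of that. */
        if (inflight_data > available_memory / 4)
                return (ERESTART);      /* real code: SET_ERROR(ERESTART) */

        return (0);
}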
@@ -3671,6 +3623,15 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
int error;
uint64_t anon_size;
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (SET_ERROR(ERESTART));
+ }
+#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c)
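The ZFS_DEBUG block restored above injects a roughly 1-in-10,000 spurious ERESTART via spa_get_random() so callers of arc_tempreserve_space() are exercised against transient failure. A tiny user-space approximation of the same idea, using rand() in place of spa_get_random() and a made-up helper name, is sketched below:

/*
 * Illustrative sketch: the same fail-once-in-a-while idea in user space;
 * maybe_fail() is a hypothetical name, not part of the patch.
 */
#include <errno.h>
#include <stdlib.h>

static int
maybe_fail(void)
{
#ifdef ZFS_DEBUG
        /* About 1 call in 10,000 fails for no reason; callers must cope. */
        if (rand() % 10000 == 0)
                return (ERESTART);
#endif
        return (0);
}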
@@ -3688,8 +3649,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- error = arc_memory_throttle(reserve, txg);
- if (error != 0)
+ if (error = arc_memory_throttle(reserve, anon_size, txg))
return (error);
/*
@@ -3835,20 +3795,11 @@ arc_init(void)
arc_dead = FALSE;
arc_warm = B_FALSE;
- /*
- * Calculate maximum amount of dirty data per pool.
- *
- * If it has been set by /etc/system, take that.
- * Otherwise, use a percentage of physical memory defined by
- * zfs_dirty_data_max_percent (default 10%) with a cap at
- * zfs_dirty_data_max_max (default 4GB).
- */
- if (zfs_dirty_data_max == 0) {
- zfs_dirty_data_max = physmem * PAGESIZE *
- zfs_dirty_data_max_percent / 100;
- zfs_dirty_data_max = MIN(zfs_dirty_data_max,
- zfs_dirty_data_max_max);
- }
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
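With the hunk above, arc_init() falls back to deriving zfs_write_limit_max from physical memory when it was not set in /etc/system: ptob(physmem) shifted right by zfs_write_limit_shift (historically 3 on illumos, i.e. one eighth of RAM). A quick worked example follows; the 8 GiB figure is an assumption for illustration, not a value from this patch.

/*
 * Illustrative arithmetic only; the memory size and shift are assumed
 * example values.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t physmem_bytes = 8ULL << 30;    /* assume ptob(physmem) == 8 GiB */
        int zfs_write_limit_shift = 3;          /* historical default: 1/8 of RAM */

        /* zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift */
        uint64_t zfs_write_limit_max = physmem_bytes >> zfs_write_limit_shift;

        printf("zfs_write_limit_max = %llu bytes (1 GiB)\n",
            (unsigned long long)zfs_write_limit_max);
        return (0);
}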
@@ -3889,6 +3840,8 @@ arc_fini(void)
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
mutex_destroy(&arc_l2c_only->arcs_mtx);
+ mutex_destroy(&zfs_write_limit_lock);
+
buf_fini();
ASSERT(arc_loaned_bytes == 0);