diff options
-rw-r--r-- | usr/src/cmd/mdb/common/modules/zfs/zfs.c | 2 | ||||
-rw-r--r-- | usr/src/cmd/stat/arcstat/arcstat.pl | 7 | ||||
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 20 | ||||
-rw-r--r-- | usr/src/lib/libzpool/common/llib-lzpool | 4 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh | 10 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/arc.c | 396 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dbuf.c | 18 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zthr.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 3 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_label.c | 12 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zthr.c | 28 |
12 files changed, 297 insertions, 211 deletions
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 0c99391147..06ce396ef8 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -396,7 +396,7 @@ zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "zfs_read_chunk_size", "zfs_nocacheflush", "zil_replay_disable", - "metaslab_gang_bang", + "metaslab_force_ganging", "metaslab_df_alloc_threshold", "metaslab_df_free_pct", "zio_injection_enabled", diff --git a/usr/src/cmd/stat/arcstat/arcstat.pl b/usr/src/cmd/stat/arcstat/arcstat.pl index 8f13221910..d4f12a9e1c 100644 --- a/usr/src/cmd/stat/arcstat/arcstat.pl +++ b/usr/src/cmd/stat/arcstat/arcstat.pl @@ -166,8 +166,11 @@ sub init { detailed_usage() if $vflag; @hdr = @xhdr if $xflag; #reset headers to xhdr - # check if L2ARC exists + # we want to capture the stats here, so that we can use them to check + # if an L2ARC device exists; but more importantly, so that we print + # the stats since boot as the first line of output from main(). snap_stats(); + if (defined $cur{"l2_size"}) { $l2exist = 1; } @@ -353,12 +356,12 @@ sub main { if ($count > 0) { $count_flag = 1; } while (1) { print_header() if ($i == 0); - snap_stats(); calculate(); print_values(); last if ($count_flag == 1 && $count-- <= 1); $i = (($i == $hdr_intr) && (not $raw_output)) ? 0 : $i+1; sleep($int); + snap_stats(); } close($out) if defined $out; } diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index b53d4a091a..1522c75485 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -162,7 +162,7 @@ typedef struct ztest_shared_opts { int zo_init; uint64_t zo_time; uint64_t zo_maxloops; - uint64_t zo_metaslab_gang_bang; + uint64_t zo_metaslab_force_ganging; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { @@ -184,10 +184,10 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ - .zo_metaslab_gang_bang = 32 << 10 + .zo_metaslab_force_ganging = 32 << 10 }; -extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; @@ -565,12 +565,12 @@ usage(boolean_t requested) const ztest_shared_opts_t *zo = &ztest_opts_defaults; char nice_vdev_size[NN_NUMBUF_SZ]; - char nice_gang_bang[NN_NUMBUF_SZ]; + char nice_force_ganging[NN_NUMBUF_SZ]; FILE *fp = requested ? stdout : stderr; nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); - nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang, - sizeof (nice_gang_bang)); + nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, + sizeof (nice_force_ganging)); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" @@ -605,7 +605,7 @@ usage(boolean_t requested) zo->zo_raidz_parity, /* -R */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ - nice_gang_bang, /* -g */ + nice_force_ganging, /* -g */ zo->zo_init, /* -i */ (u_longlong_t)zo->zo_killrate, /* -k */ zo->zo_pool, /* -p */ @@ -674,8 +674,8 @@ process_options(int argc, char **argv) zo->zo_threads = MAX(1, value); break; case 'g': - zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, - value); + zo->zo_metaslab_force_ganging = + MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; @@ -6422,7 +6422,7 @@ main(int argc, char **argv) zs = ztest_shared; if (fd_data_str) { - metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang; + metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool index 27d01fd54d..871facace3 100644 --- a/usr/src/lib/libzpool/common/llib-lzpool +++ b/usr/src/lib/libzpool/common/llib-lzpool @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ /* LINTLIBRARY */ @@ -64,7 +64,7 @@ #include <sys/abd.h> #include <libcmdutils.h> -extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern boolean_t zfeature_checks_disable; extern uint64_t zfs_deadman_synctime_ms; diff --git a/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh b/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh index 53251592ff..bfcb0151ca 100644 --- a/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh +++ b/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh @@ -15,26 +15,26 @@ # # -# Copyright (c) 2014, 2016 by Delphix. All rights reserved. +# Copyright (c) 2014, 2017 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/removal/removal.kshlib -function set_metaslab_gang_bang # new_value +function set_metaslab_force_ganging # new_value { typeset new_value=$1 - echo "metaslab_gang_bang/W $new_value" | mdb -kw + echo "metaslab_force_ganging/W $new_value" | mdb -kw } function cleanup { - log_must set_metaslab_gang_bang 0t$((2**17 + 1)) + log_must set_metaslab_force_ganging 0t$((2**17 + 1)) default_cleanup_noexit } default_setup_noexit "$DISKS" -log_must set_metaslab_gang_bang 0t$((2**12)) +log_must set_metaslab_force_ganging 0t$((2**14)) log_onexit cleanup FILE_CONTENTS="Leeloo Dallas mul-ti-pass." diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 50462066dc..565934e6ed 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -275,6 +275,7 @@ #endif #include <sys/callb.h> #include <sys/kstat.h> +#include <sys/zthr.h> #include <zfs_fletcher.h> #include <sys/aggsum.h> #include <sys/cityhash.h> @@ -285,10 +286,22 @@ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif -static kmutex_t arc_reclaim_lock; -static kcondvar_t arc_reclaim_thread_cv; -static boolean_t arc_reclaim_thread_exit; -static kcondvar_t arc_reclaim_waiters_cv; +/* + * This thread's job is to keep enough free memory in the system, by + * calling arc_kmem_reap_now() plus arc_shrink(), which improves + * arc_available_memory(). + */ +static zthr_t *arc_reap_zthr; + +/* + * This thread's job is to keep arc_size under arc_c, by calling + * arc_adjust(), which improves arc_is_overflowing(). + */ +static zthr_t *arc_adjust_zthr; + +static kmutex_t arc_adjust_lock; +static kcondvar_t arc_adjust_waiters_cv; +static boolean_t arc_adjust_needed = B_FALSE; uint_t arc_reduce_dnlc_percent = 3; @@ -302,19 +315,23 @@ uint_t arc_reduce_dnlc_percent = 3; int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ -static int arc_grow_retry = 60; +int arc_grow_retry = 60; -/* number of milliseconds before attempting a kmem-cache-reap */ -static int arc_kmem_cache_reap_retry_ms = 1000; +/* + * Minimum time between calls to arc_kmem_reap_soon(). Note that this will + * be converted to ticks, so with the default hz=100, a setting of 15 ms + * will actually wait 2 ticks, or 20ms. + */ +int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 3; +int zfs_arc_overflow_shift = 3; /* shift of arc_c for calculating both min and max arc_p */ -static int arc_p_min_shift = 4; +int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 7; +int arc_shrink_shift = 7; /* * log2(fraction of ARC which must be free to allow growing). @@ -339,7 +356,7 @@ static int arc_min_prefetch_lifespan; */ int arc_lotsfree_percent = 10; -static int arc_dead; +static boolean_t arc_initialized; /* * The arc has filled available memory and has now warmed up. @@ -841,6 +858,7 @@ aggsum_t astat_other_size; aggsum_t astat_l2_hdr_size; static int arc_no_grow; /* Don't try to grow cache size */ +static hrtime_t arc_growtime; static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; @@ -1400,8 +1418,8 @@ hdr_recl(void *unused) * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ - if (!arc_dead) - cv_signal(&arc_reclaim_thread_cv); + if (arc_initialized) + zthr_wakeup(arc_reap_zthr); } static void @@ -2558,7 +2576,7 @@ arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); - arc_loaned_bytes_update(size); + arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } @@ -2570,7 +2588,7 @@ arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type); - arc_loaned_bytes_update(psize); + arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } @@ -3414,13 +3432,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * function should proceed in this case). * * If threads are left sleeping, due to not - * using cv_broadcast, they will be woken up - * just before arc_reclaim_thread() sleeps. + * using cv_broadcast here, they will be woken + * up via cv_broadcast in arc_adjust_cb() just + * before arc_adjust_zthr sleeps. */ - mutex_enter(&arc_reclaim_lock); + mutex_enter(&arc_adjust_lock); if (!arc_is_overflowing()) - cv_signal(&arc_reclaim_waiters_cv); - mutex_exit(&arc_reclaim_lock); + cv_signal(&arc_adjust_waiters_cv); + mutex_exit(&arc_adjust_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } @@ -3893,8 +3912,8 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } -void -arc_shrink(int64_t to_free) +static void +arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); if (arc_c > arc_c_min) { @@ -3913,8 +3932,13 @@ arc_shrink(int64_t to_free) ASSERT((int64_t)arc_p >= 0); } - if (asize > arc_c) - (void) arc_adjust(); + if (asize > arc_c) { + /* See comment in arc_adjust_cb_check() on why lock+flag */ + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = B_TRUE; + mutex_exit(&arc_adjust_lock); + zthr_wakeup(arc_adjust_zthr); + } } typedef enum free_memory_reason_t { @@ -4066,7 +4090,7 @@ arc_reclaim_needed(void) } static void -arc_kmem_reap_now(void) +arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -4092,16 +4116,6 @@ arc_kmem_reap_now(void) #endif #endif - /* - * If a kmem reap is already active, don't schedule more. We must - * check for this because kmem_cache_reap_soon() won't actually - * block on the cache being reaped (this is to prevent callers from - * becoming implicitly blocked by a system-wide kmem reap -- which, - * on a system with many, many full magazines, can take minutes). - */ - if (kmem_cache_reap_active()) - return; - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; @@ -4127,139 +4141,162 @@ arc_kmem_reap_now(void) } } +/* ARGSUSED */ +static boolean_t +arc_adjust_cb_check(void *arg, zthr_t *zthr) +{ + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date(the arc_adjust_zthr has a maximum sleep + * time of 1 second); but that should suffice. The + * arc_state_t structures can be queried directly if more + * accurate information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + + /* + * We have to rely on arc_get_data_impl() to tell us when to adjust, + * rather than checking if we are overflowing here, so that we are + * sure to not leave arc_get_data_impl() waiting on + * arc_adjust_waiters_cv. If we have become "not overflowing" since + * arc_get_data_impl() checked, we need to wake it up. We could + * broadcast the CV here, but arc_get_data_impl() may have not yet + * gone to sleep. We would need to use a mutex to ensure that this + * function doesn't broadcast until arc_get_data_impl() has gone to + * sleep (e.g. the arc_adjust_lock). However, the lock ordering of + * such a lock would necessarily be incorrect with respect to the + * zthr_lock, which is held before this function is called, and is + * held by arc_get_data_impl() when it calls zthr_wakeup(). + */ + return (arc_adjust_needed); +} + /* - * Threads can block in arc_get_data_impl() waiting for this thread to evict - * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_impl() are sleeping while holding the hash lock for their - * particular arc header. Thus, we must be careful to never sleep on a - * hash lock in this thread. This is to prevent the following deadlock: - * - * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", - * waiting for the reclaim thread to signal it. - * - * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, - * fails, and goes to sleep forever. - * - * This possible deadlock is avoided by always acquiring a hash lock - * using mutex_tryenter() from arc_reclaim_thread(). + * Keep arc_size under arc_c by running arc_adjust which evicts data + * from the ARC. */ /* ARGSUSED */ -static void -arc_reclaim_thread(void *unused) +static int +arc_adjust_cb(void *arg, zthr_t *zthr) { - hrtime_t growtime = 0; - hrtime_t kmem_reap_time = 0; - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); + uint64_t evicted = 0; - mutex_enter(&arc_reclaim_lock); - while (!arc_reclaim_thread_exit) { - uint64_t evicted = 0; + /* Evict from cache */ + evicted = arc_adjust(); + /* + * If evicted is zero, we couldn't evict anything + * via arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. Additionally, zthr_iscancelled() is + * checked here so that if the arc is shutting down, the + * broadcast will wake any remaining arc adjust waiters. + */ + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && + evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; + if (!arc_adjust_needed) { /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any waiters. */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); + cv_broadcast(&arc_adjust_waiters_cv); + } + mutex_exit(&arc_adjust_lock); - mutex_exit(&arc_reclaim_lock); + return (0); +} +/* ARGSUSED */ +static boolean_t +arc_reap_cb_check(void *arg, zthr_t *zthr) +{ + int64_t free_memory = arc_available_memory(); + + /* + * If a kmem reap is already active, don't schedule more. We must + * check for this because kmem_cache_reap_soon() won't actually + * block on the cache being reaped (this is to prevent callers from + * becoming implicitly blocked by a system-wide kmem reap -- which, + * on a system with many, many full magazines, can take minutes). + */ + if (!kmem_cache_reap_active() && + free_memory < 0) { + arc_no_grow = B_TRUE; + arc_warm = B_TRUE; /* - * We call arc_adjust() before (possibly) calling - * arc_kmem_reap_now(), so that we can wake up - * arc_get_data_impl() sooner. + * Wait at least zfs_grow_retry (default 60) seconds + * before considering growing. */ - evicted = arc_adjust(); + arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + return (B_TRUE); + } else if (free_memory < arc_c >> arc_no_grow_shift) { + arc_no_grow = B_TRUE; + } else if (gethrtime() >= arc_growtime) { + arc_no_grow = B_FALSE; + } - int64_t free_memory = arc_available_memory(); - if (free_memory < 0) { - hrtime_t curtime = gethrtime(); - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; + return (B_FALSE); +} - /* - * Wait at least zfs_grow_retry (default 60) seconds - * before considering growing. - */ - growtime = curtime + SEC2NSEC(arc_grow_retry); +/* + * Keep enough free memory in the system by reaping the ARC's kmem + * caches. To cause more slabs to be reapable, we may reduce the + * target size of the cache (arc_c), causing the arc_adjust_cb() + * to free more buffers. + */ +/* ARGSUSED */ +static int +arc_reap_cb(void *arg, zthr_t *zthr) +{ + int64_t free_memory; - /* - * Wait at least arc_kmem_cache_reap_retry_ms - * between arc_kmem_reap_now() calls. Without - * this check it is possible to end up in a - * situation where we spend lots of time - * reaping caches, while we're near arc_c_min. - */ - if (curtime >= kmem_reap_time) { - arc_kmem_reap_now(); - kmem_reap_time = gethrtime() + - MSEC2NSEC(arc_kmem_cache_reap_retry_ms); - } + /* + * Kick off asynchronous kmem_reap()'s of all our caches. + */ + arc_kmem_reap_soon(); - /* - * If we are still low on memory, shrink the ARC - * so that we have arc_shrink_min free space. - */ - free_memory = arc_available_memory(); + /* + * Wait at least arc_kmem_cache_reap_retry_ms between + * arc_kmem_reap_soon() calls. Without this check it is possible to + * end up in a situation where we spend lots of time reaping + * caches, while we're near arc_c_min. Waiting here also gives the + * subsequent free memory check a chance of finding that the + * asynchronous reap has already freed enough memory, and we don't + * need to call arc_reduce_target_size(). + */ + delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { + /* + * Reduce the target size as needed to maintain the amount of free + * memory in the system at a fraction of the arc_size (1/128th by + * default). If oversubscribed (free_memory < 0) then reduce the + * target arc_size by the deficit amount plus the fractional + * amount. If free memory is positive but less then the fractional + * amount, reduce by what is needed to hit the fractional amount. + */ + free_memory = arc_available_memory(); + + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + if (to_free > 0) { #ifdef _KERNEL - to_free = MAX(to_free, ptob(needfree)); + to_free = MAX(to_free, ptob(needfree)); #endif - arc_shrink(to_free); - } - } else if (free_memory < arc_c >> arc_no_grow_shift) { - arc_no_grow = B_TRUE; - } else if (gethrtime() >= growtime) { - arc_no_grow = B_FALSE; - } - - mutex_enter(&arc_reclaim_lock); - - /* - * If evicted is zero, we couldn't evict anything via - * arc_adjust(). This could be due to hash lock - * collisions, but more likely due to the majority of - * arc buffers being unevictable. Therefore, even if - * arc_size is above arc_c, another pass is unlikely to - * be helpful and could potentially cause us to enter an - * infinite loop. - */ - if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) { - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any threads before we go to sleep. - */ - cv_broadcast(&arc_reclaim_waiters_cv); - - /* - * Block until signaled, or after one second (we - * might need to perform arc_kmem_reap_now() - * even if we aren't being signalled) - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_hires(&arc_reclaim_thread_cv, - &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); - } + arc_reduce_target_size(to_free); } - arc_reclaim_thread_exit = B_FALSE; - cv_broadcast(&arc_reclaim_thread_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ - thread_exit(); + return (0); } /* @@ -4303,11 +4340,15 @@ arc_adapt(int bytes, arc_state_t *state) } ASSERT((int64_t)arc_p >= 0); + /* + * Wake reap thread if we do not have any available memory + */ if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thread_cv); + zthr_wakeup(arc_reap_zthr); return; } + if (arc_no_grow) return; @@ -4411,7 +4452,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * overflowing; thus we don't use a while loop here. */ if (arc_is_overflowing()) { - mutex_enter(&arc_reclaim_lock); + mutex_enter(&arc_adjust_lock); /* * Now that we've acquired the lock, we may no longer be @@ -4425,11 +4466,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * shouldn't cause any harm. */ if (arc_is_overflowing()) { - cv_signal(&arc_reclaim_thread_cv); - cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); + arc_adjust_needed = B_TRUE; + zthr_wakeup(arc_adjust_zthr); + (void) cv_wait(&arc_adjust_waiters_cv, + &arc_adjust_lock); } - - mutex_exit(&arc_reclaim_lock); + mutex_exit(&arc_adjust_lock); } VERIFY3U(hdr->b_type, ==, type); @@ -6090,10 +6132,8 @@ arc_init(void) #else uint64_t allmem = (physmem * PAGESIZE) / 2; #endif - - mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); - cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -6182,9 +6222,14 @@ arc_init(void) arc_c = arc_c_min; arc_state_init(); - buf_init(); - arc_reclaim_thread_exit = B_FALSE; + /* + * The arc must be "uninitialized", so that hdr_recl() (which is + * registered by buf_init()) will not access arc_reap_zthr before + * it is created. + */ + ASSERT(!arc_initialized); + buf_init(); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -6195,10 +6240,12 @@ arc_init(void) kstat_install(arc_ksp); } - (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); + arc_adjust_zthr = zthr_create(arc_adjust_cb_check, + arc_adjust_cb, NULL); + arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, + arc_reap_cb, NULL, SEC2NSEC(1)); - arc_dead = B_FALSE; + arc_initialized = B_TRUE; arc_warm = B_FALSE; /* @@ -6220,31 +6267,24 @@ arc_init(void) void arc_fini(void) { - mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = B_TRUE; - /* - * The reclaim thread will set arc_reclaim_thread_exit back to - * B_FALSE when it is finished exiting; we're waiting for that. - */ - while (arc_reclaim_thread_exit) { - cv_signal(&arc_reclaim_thread_cv); - cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); - } - mutex_exit(&arc_reclaim_lock); - /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); - arc_dead = B_TRUE; + arc_initialized = B_FALSE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } - mutex_destroy(&arc_reclaim_lock); - cv_destroy(&arc_reclaim_thread_cv); - cv_destroy(&arc_reclaim_waiters_cv); + (void) zthr_cancel(arc_adjust_zthr); + zthr_destroy(arc_adjust_zthr); + + (void) zthr_cancel(arc_reap_zthr); + zthr_destroy(arc_reap_zthr); + + mutex_destroy(&arc_adjust_lock); + cv_destroy(&arc_adjust_waiters_cv); arc_state_fini(); buf_fini(); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 88dad24550..3e10c1e211 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -85,10 +85,10 @@ static boolean_t dbuf_evict_thread_exit; */ static multilist_t *dbuf_cache; static refcount_t dbuf_cache_size; -uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; +uint64_t dbuf_cache_max_bytes = 0; -/* Cap the size of the dbuf cache to log2 fraction of arc size. */ -int dbuf_cache_max_shift = 5; +/* Set the default size of the dbuf cache to log2 fraction of arc size. */ +int dbuf_cache_shift = 5; /* * The dbuf cache uses a three-stage eviction policy: @@ -600,11 +600,15 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* - * Setup the parameters for the dbuf cache. We cap the size of the - * dbuf cache to 1/32nd (default) of the size of the ARC. + * Setup the parameters for the dbuf cache. We set the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. If the value + * has been set in /etc/system and it's not greater than the size of + * the ARC, then we honor that value. */ - dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, - arc_max_bytes() >> dbuf_cache_max_shift); + if (dbuf_cache_max_bytes == 0 || + dbuf_cache_max_bytes >= arc_max_bytes()) { + dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; + } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 3512d62704..88d81c9540 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -41,7 +41,7 @@ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) uint64_t metaslab_aliquot = 512ULL << 10; -uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * Since we can touch multiple metaslabs (and their respective space maps) @@ -3088,7 +3088,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) { + if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); return (SET_ERROR(ENOSPC)); } diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h index 62da2eea81..ce6033ecb6 100644 --- a/usr/src/uts/common/fs/zfs/sys/zthr.h +++ b/usr/src/uts/common/fs/zfs/sys/zthr.h @@ -29,6 +29,7 @@ struct zthr { kmutex_t zthr_lock; kcondvar_t zthr_cv; boolean_t zthr_cancel; + hrtime_t zthr_wait_time; zthr_checkfunc_t *zthr_checkfunc; zthr_func_t *zthr_func; @@ -38,6 +39,9 @@ struct zthr { extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg); +extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, hrtime_t nano_wait); + extern void zthr_exit(zthr_t *t, int rc); extern void zthr_destroy(zthr_t *t); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 388d13716c..e23e4a01c1 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -1585,7 +1585,8 @@ vdev_validate(vdev_t *vd) if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config"); + vdev_dbgmsg(vd, "vdev_validate: failed reading config for " + "txg %llu", (u_longlong_t)txg); return (0); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 21677dcdc2..d8a0762c42 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -540,6 +540,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; + uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -565,8 +566,6 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), &label, 0) == 0) { - uint64_t label_txg = 0; - /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been @@ -597,6 +596,15 @@ retry: goto retry; } + /* + * We found a valid label but it didn't pass txg restrictions. + */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + abd_free(vp_abd); return (config); diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c index 7772386c73..f26f911cb0 100644 --- a/usr/src/uts/common/fs/zfs/zthr.c +++ b/usr/src/uts/common/fs/zfs/zthr.c @@ -47,6 +47,10 @@ * 3] When the zthr is done, it changes the indicator to stopped, allowing * a new cycle to start. * + * Besides being awakened by other threads, a zthr can be configured + * during creation to wakeup on it's own after a specified interval + * [see zthr_create_timer()]. + * * == ZTHR creation * * Every zthr needs three inputs to start running: @@ -74,6 +78,9 @@ * * To start a zthr: * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); + * or + * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, + * args, max_sleep); * * After that you should be able to wakeup, cancel, and resume the * zthr from another thread using zthr_pointer. @@ -189,7 +196,13 @@ zthr_procedure(void *arg) mutex_enter(&t->zthr_lock); } else { /* go to sleep */ - cv_wait(&t->zthr_cv, &t->zthr_lock); + if (t->zthr_wait_time == 0) { + cv_wait(&t->zthr_cv, &t->zthr_lock); + } else { + (void) cv_timedwait_hires(&t->zthr_cv, + &t->zthr_lock, t->zthr_wait_time, + MSEC2NSEC(1), 0); + } } } mutex_exit(&t->zthr_lock); @@ -200,6 +213,18 @@ zthr_procedure(void *arg) zthr_t * zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) { + return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); +} + +/* + * Create a zthr with specified maximum sleep time. If the time + * in sleeping state exceeds max_sleep, a wakeup(do the check and + * start working if required) will be triggered. + */ +zthr_t * +zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, + void *arg, hrtime_t max_sleep) +{ zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); @@ -208,6 +233,7 @@ zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) t->zthr_checkfunc = checkfunc; t->zthr_func = func; t->zthr_arg = arg; + t->zthr_wait_time = max_sleep; t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri); |