author    Jerry Jelinek <jerry.jelinek@joyent.com>  2018-03-22 12:42:28 +0000
committer Jerry Jelinek <jerry.jelinek@joyent.com>  2018-03-22 12:42:28 +0000
commit    2136ccc7721a074a369d80492678baaa7cf47a67 (patch)
tree      fb0a00a29d3519e22f4b96ff7c969e9136522884
parent    cbbb96d54a2a10694551e3a98692b9386cb21e13 (diff)
parent    b6bf6e1540f30bd97b8d6e2c21d95e17841e0f23 (diff)
[illumos-gate merge]
commit b6bf6e1540f30bd97b8d6e2c21d95e17841e0f23
    9189 Add debug to vdev_label_read_config when txg check fails
commit de753e34f9c399037936e8bc547d823bba9d4b0d
    9284 arc_reclaim_thread has 2 jobs
commit 243952c7eeef020886e3e2e3df99a513df40584a
    9280 Assertion failure while running removal_with_ganging test with 4K devices
commit 0a052a62e5b55bf138983a4b3ef88eb752006ae4
    9322 Print stats since boot for first line of arcstat.pl
commit 268bbb2a2fa79c36d4695d13a595ba50a7754b76
    9188 increase size of dbuf cache to reduce indirect block decompression
commit 9be12bd737714550277bd02b0c693db560976990
    9321 arc_loan_compressed_buf() can increment arc_loaned_bytes by the wrong value

Conflicts:
    usr/src/uts/common/fs/zfs/arc.c
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c                                    2
-rw-r--r--  usr/src/cmd/stat/arcstat/arcstat.pl                                         7
-rw-r--r--  usr/src/cmd/ztest/ztest.c                                                  20
-rw-r--r--  usr/src/lib/libzpool/common/llib-lzpool                                     4
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh   10
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c                                           396
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c                                           18
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c                                        4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zthr.h                                        4
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c                                            3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c                                     12
-rw-r--r--  usr/src/uts/common/fs/zfs/zthr.c                                           28
12 files changed, 297 insertions, 211 deletions
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 0c99391147..06ce396ef8 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -396,7 +396,7 @@ zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"zfs_read_chunk_size",
"zfs_nocacheflush",
"zil_replay_disable",
- "metaslab_gang_bang",
+ "metaslab_force_ganging",
"metaslab_df_alloc_threshold",
"metaslab_df_free_pct",
"zio_injection_enabled",
diff --git a/usr/src/cmd/stat/arcstat/arcstat.pl b/usr/src/cmd/stat/arcstat/arcstat.pl
index 8f13221910..d4f12a9e1c 100644
--- a/usr/src/cmd/stat/arcstat/arcstat.pl
+++ b/usr/src/cmd/stat/arcstat/arcstat.pl
@@ -166,8 +166,11 @@ sub init {
detailed_usage() if $vflag;
@hdr = @xhdr if $xflag; #reset headers to xhdr
- # check if L2ARC exists
+ # we want to capture the stats here, so that we can use them to check
+ # if an L2ARC device exists; but more importantly, so that we print
+ # the stats since boot as the first line of output from main().
snap_stats();
+
if (defined $cur{"l2_size"}) {
$l2exist = 1;
}
@@ -353,12 +356,12 @@ sub main {
if ($count > 0) { $count_flag = 1; }
while (1) {
print_header() if ($i == 0);
- snap_stats();
calculate();
print_values();
last if ($count_flag == 1 && $count-- <= 1);
$i = (($i == $hdr_intr) && (not $raw_output)) ? 0 : $i+1;
sleep($int);
+ snap_stats();
}
close($out) if defined $out;
}
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index b53d4a091a..1522c75485 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -162,7 +162,7 @@ typedef struct ztest_shared_opts {
int zo_init;
uint64_t zo_time;
uint64_t zo_maxloops;
- uint64_t zo_metaslab_gang_bang;
+ uint64_t zo_metaslab_force_ganging;
} ztest_shared_opts_t;
static const ztest_shared_opts_t ztest_opts_defaults = {
@@ -184,10 +184,10 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_init = 1,
.zo_time = 300, /* 5 minutes */
.zo_maxloops = 50, /* max loops during spa_freeze() */
- .zo_metaslab_gang_bang = 32 << 10
+ .zo_metaslab_force_ganging = 32 << 10
};
-extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
@@ -565,12 +565,12 @@ usage(boolean_t requested)
const ztest_shared_opts_t *zo = &ztest_opts_defaults;
char nice_vdev_size[NN_NUMBUF_SZ];
- char nice_gang_bang[NN_NUMBUF_SZ];
+ char nice_force_ganging[NN_NUMBUF_SZ];
FILE *fp = requested ? stdout : stderr;
nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
- nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang,
- sizeof (nice_gang_bang));
+ nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging,
+ sizeof (nice_force_ganging));
(void) fprintf(fp, "Usage: %s\n"
"\t[-v vdevs (default: %llu)]\n"
@@ -605,7 +605,7 @@ usage(boolean_t requested)
zo->zo_raidz_parity, /* -R */
zo->zo_datasets, /* -d */
zo->zo_threads, /* -t */
- nice_gang_bang, /* -g */
+ nice_force_ganging, /* -g */
zo->zo_init, /* -i */
(u_longlong_t)zo->zo_killrate, /* -k */
zo->zo_pool, /* -p */
@@ -674,8 +674,8 @@ process_options(int argc, char **argv)
zo->zo_threads = MAX(1, value);
break;
case 'g':
- zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
- value);
+ zo->zo_metaslab_force_ganging =
+ MAX(SPA_MINBLOCKSIZE << 1, value);
break;
case 'i':
zo->zo_init = value;
@@ -6422,7 +6422,7 @@ main(int argc, char **argv)
zs = ztest_shared;
if (fd_data_str) {
- metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
+ metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
metaslab_df_alloc_threshold =
zs->zs_metaslab_df_alloc_threshold;
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 27d01fd54d..871facace3 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
/* LINTLIBRARY */
@@ -64,7 +64,7 @@
#include <sys/abd.h>
#include <libcmdutils.h>
-extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern boolean_t zfeature_checks_disable;
extern uint64_t zfs_deadman_synctime_ms;
diff --git a/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh b/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh
index 53251592ff..bfcb0151ca 100644
--- a/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/removal/removal_with_ganging.ksh
@@ -15,26 +15,26 @@
#
#
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/removal/removal.kshlib
-function set_metaslab_gang_bang # new_value
+function set_metaslab_force_ganging # new_value
{
typeset new_value=$1
- echo "metaslab_gang_bang/W $new_value" | mdb -kw
+ echo "metaslab_force_ganging/W $new_value" | mdb -kw
}
function cleanup
{
- log_must set_metaslab_gang_bang 0t$((2**17 + 1))
+ log_must set_metaslab_force_ganging 0t$((2**17 + 1))
default_cleanup_noexit
}
default_setup_noexit "$DISKS"
-log_must set_metaslab_gang_bang 0t$((2**12))
+log_must set_metaslab_force_ganging 0t$((2**14))
log_onexit cleanup
FILE_CONTENTS="Leeloo Dallas mul-ti-pass."
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 50462066dc..565934e6ed 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -275,6 +275,7 @@
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <sys/zthr.h>
#include <zfs_fletcher.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>
@@ -285,10 +286,22 @@ boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
-static kmutex_t arc_reclaim_lock;
-static kcondvar_t arc_reclaim_thread_cv;
-static boolean_t arc_reclaim_thread_exit;
-static kcondvar_t arc_reclaim_waiters_cv;
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which
+ * improves arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
+
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_adjust(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_adjust_zthr;
+
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
uint_t arc_reduce_dnlc_percent = 3;
@@ -302,19 +315,23 @@ uint_t arc_reduce_dnlc_percent = 3;
int zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
+int arc_grow_retry = 60;
-/* number of milliseconds before attempting a kmem-cache-reap */
-static int arc_kmem_cache_reap_retry_ms = 1000;
+/*
+ * Minimum time between calls to arc_kmem_reap_soon(). Note that this will
+ * be converted to ticks, so with the default hz=100, a setting of 15 ms
+ * will actually wait 2 ticks, or 20ms.
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 3;
+int zfs_arc_overflow_shift = 3;
/* shift of arc_c for calculating both min and max arc_p */
-static int arc_p_min_shift = 4;
+int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 7;
+int arc_shrink_shift = 7;
/*
* log2(fraction of ARC which must be free to allow growing).
@@ -339,7 +356,7 @@ static int arc_min_prefetch_lifespan;
*/
int arc_lotsfree_percent = 10;
-static int arc_dead;
+static boolean_t arc_initialized;
/*
* The arc has filled available memory and has now warmed up.
@@ -841,6 +858,7 @@ aggsum_t astat_other_size;
aggsum_t astat_l2_hdr_size;
static int arc_no_grow; /* Don't try to grow cache size */
+static hrtime_t arc_growtime;
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
@@ -1400,8 +1418,8 @@ hdr_recl(void *unused)
* umem calls the reclaim func when we destroy the buf cache,
* which is after we do arc_fini().
*/
- if (!arc_dead)
- cv_signal(&arc_reclaim_thread_cv);
+ if (arc_initialized)
+ zthr_wakeup(arc_reap_zthr);
}
static void
@@ -2558,7 +2576,7 @@ arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
- arc_loaned_bytes_update(size);
+ arc_loaned_bytes_update(arc_buf_size(buf));
return (buf);
}
@@ -2570,7 +2588,7 @@ arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
psize, lsize, compression_type);
- arc_loaned_bytes_update(psize);
+ arc_loaned_bytes_update(arc_buf_size(buf));
return (buf);
}
@@ -3414,13 +3432,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* function should proceed in this case).
*
* If threads are left sleeping, due to not
- * using cv_broadcast, they will be woken up
- * just before arc_reclaim_thread() sleeps.
+ * using cv_broadcast here, they will be woken
+ * up via cv_broadcast in arc_adjust_cb() just
+ * before arc_adjust_zthr sleeps.
*/
- mutex_enter(&arc_reclaim_lock);
+ mutex_enter(&arc_adjust_lock);
if (!arc_is_overflowing())
- cv_signal(&arc_reclaim_waiters_cv);
- mutex_exit(&arc_reclaim_lock);
+ cv_signal(&arc_adjust_waiters_cv);
+ mutex_exit(&arc_adjust_lock);
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
}
@@ -3893,8 +3912,8 @@ arc_flush(spa_t *spa, boolean_t retry)
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
}
-void
-arc_shrink(int64_t to_free)
+static void
+arc_reduce_target_size(int64_t to_free)
{
uint64_t asize = aggsum_value(&arc_size);
if (arc_c > arc_c_min) {
@@ -3913,8 +3932,13 @@ arc_shrink(int64_t to_free)
ASSERT((int64_t)arc_p >= 0);
}
- if (asize > arc_c)
- (void) arc_adjust();
+ if (asize > arc_c) {
+ /* See comment in arc_adjust_cb_check() on why lock+flag */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ mutex_exit(&arc_adjust_lock);
+ zthr_wakeup(arc_adjust_zthr);
+ }
}
typedef enum free_memory_reason_t {
@@ -4066,7 +4090,7 @@ arc_reclaim_needed(void)
}
static void
-arc_kmem_reap_now(void)
+arc_kmem_reap_soon(void)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
@@ -4092,16 +4116,6 @@ arc_kmem_reap_now(void)
#endif
#endif
- /*
- * If a kmem reap is already active, don't schedule more. We must
- * check for this because kmem_cache_reap_soon() won't actually
- * block on the cache being reaped (this is to prevent callers from
- * becoming implicitly blocked by a system-wide kmem reap -- which,
- * on a system with many, many full magazines, can take minutes).
- */
- if (kmem_cache_reap_active())
- return;
-
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
prev_cache = zio_buf_cache[i];
@@ -4127,139 +4141,162 @@ arc_kmem_reap_now(void)
}
}
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+ /*
+ * This is necessary in order for the mdb ::arc dcmd to
+ * show up to date information. Since the ::arc command
+ * does not call the kstat's update function, without
+ * this call, the command may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this change, the data might be up to 1 second
+ * out of date (the arc_adjust_zthr has a maximum sleep
+ * time of 1 second); but that should suffice. The
+ * arc_state_t structures can be queried directly if more
+ * accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+ /*
+ * We have to rely on arc_get_data_impl() to tell us when to adjust,
+ * rather than checking if we are overflowing here, so that we are
+ * sure to not leave arc_get_data_impl() waiting on
+ * arc_adjust_waiters_cv. If we have become "not overflowing" since
+ * arc_get_data_impl() checked, we need to wake it up. We could
+ * broadcast the CV here, but arc_get_data_impl() may have not yet
+ * gone to sleep. We would need to use a mutex to ensure that this
+ * function doesn't broadcast until arc_get_data_impl() has gone to
+ * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
+ * such a lock would necessarily be incorrect with respect to the
+ * zthr_lock, which is held before this function is called, and is
+ * held by arc_get_data_impl() when it calls zthr_wakeup().
+ */
+ return (arc_adjust_needed);
+}
+
/*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
- * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_impl() are sleeping while holding the hash lock for their
- * particular arc header. Thus, we must be careful to never sleep on a
- * hash lock in this thread. This is to prevent the following deadlock:
- *
- * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
- * waiting for the reclaim thread to signal it.
- *
- * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
- * fails, and goes to sleep forever.
- *
- * This possible deadlock is avoided by always acquiring a hash lock
- * using mutex_tryenter() from arc_reclaim_thread().
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
*/
/* ARGSUSED */
-static void
-arc_reclaim_thread(void *unused)
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
{
- hrtime_t growtime = 0;
- hrtime_t kmem_reap_time = 0;
- callb_cpr_t cpr;
-
- CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
+ uint64_t evicted = 0;
- mutex_enter(&arc_reclaim_lock);
- while (!arc_reclaim_thread_exit) {
- uint64_t evicted = 0;
+ /* Evict from cache */
+ evicted = arc_adjust();
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc adjust waiters.
+ */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_adjust_needed) {
/*
- * This is necessary in order for the mdb ::arc dcmd to
- * show up to date information. Since the ::arc command
- * does not call the kstat's update function, without
- * this call, the command may show stale stats for the
- * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
- * with this change, the data might be up to 1 second
- * out of date; but that should suffice. The arc_state_t
- * structures can be queried directly if more accurate
- * information is needed.
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * up any waiters.
*/
- if (arc_ksp != NULL)
- arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+ cv_broadcast(&arc_adjust_waiters_cv);
+ }
+ mutex_exit(&arc_adjust_lock);
- mutex_exit(&arc_reclaim_lock);
+ return (0);
+}
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() &&
+ free_memory < 0) {
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
/*
- * We call arc_adjust() before (possibly) calling
- * arc_kmem_reap_now(), so that we can wake up
- * arc_get_data_impl() sooner.
+ * Wait at least arc_grow_retry (default 60) seconds
+ * before considering growing.
*/
- evicted = arc_adjust();
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
- int64_t free_memory = arc_available_memory();
- if (free_memory < 0) {
- hrtime_t curtime = gethrtime();
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
+ return (B_FALSE);
+}
- /*
- * Wait at least zfs_grow_retry (default 60) seconds
- * before considering growing.
- */
- growtime = curtime + SEC2NSEC(arc_grow_retry);
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
- /*
- * Wait at least arc_kmem_cache_reap_retry_ms
- * between arc_kmem_reap_now() calls. Without
- * this check it is possible to end up in a
- * situation where we spend lots of time
- * reaping caches, while we're near arc_c_min.
- */
- if (curtime >= kmem_reap_time) {
- arc_kmem_reap_now();
- kmem_reap_time = gethrtime() +
- MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
- }
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
- /*
- * If we are still low on memory, shrink the ARC
- * so that we have arc_shrink_min free space.
- */
- free_memory = arc_available_memory();
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
- int64_t to_free =
- (arc_c >> arc_shrink_shift) - free_memory;
- if (to_free > 0) {
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+ * amount. If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
#ifdef _KERNEL
- to_free = MAX(to_free, ptob(needfree));
+ to_free = MAX(to_free, ptob(needfree));
#endif
- arc_shrink(to_free);
- }
- } else if (free_memory < arc_c >> arc_no_grow_shift) {
- arc_no_grow = B_TRUE;
- } else if (gethrtime() >= growtime) {
- arc_no_grow = B_FALSE;
- }
-
- mutex_enter(&arc_reclaim_lock);
-
- /*
- * If evicted is zero, we couldn't evict anything via
- * arc_adjust(). This could be due to hash lock
- * collisions, but more likely due to the majority of
- * arc buffers being unevictable. Therefore, even if
- * arc_size is above arc_c, another pass is unlikely to
- * be helpful and could potentially cause us to enter an
- * infinite loop.
- */
- if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) {
- /*
- * We're either no longer overflowing, or we
- * can't evict anything more, so we should wake
- * up any threads before we go to sleep.
- */
- cv_broadcast(&arc_reclaim_waiters_cv);
-
- /*
- * Block until signaled, or after one second (we
- * might need to perform arc_kmem_reap_now()
- * even if we aren't being signalled)
- */
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_hires(&arc_reclaim_thread_cv,
- &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
- }
+ arc_reduce_target_size(to_free);
}
- arc_reclaim_thread_exit = B_FALSE;
- cv_broadcast(&arc_reclaim_thread_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
- thread_exit();
+ return (0);
}
/*
@@ -4303,11 +4340,15 @@ arc_adapt(int bytes, arc_state_t *state)
}
ASSERT((int64_t)arc_p >= 0);
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thread_cv);
+ zthr_wakeup(arc_reap_zthr);
return;
}
+
if (arc_no_grow)
return;
@@ -4411,7 +4452,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
* overflowing; thus we don't use a while loop here.
*/
if (arc_is_overflowing()) {
- mutex_enter(&arc_reclaim_lock);
+ mutex_enter(&arc_adjust_lock);
/*
* Now that we've acquired the lock, we may no longer be
@@ -4425,11 +4466,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
* shouldn't cause any harm.
*/
if (arc_is_overflowing()) {
- cv_signal(&arc_reclaim_thread_cv);
- cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+ (void) cv_wait(&arc_adjust_waiters_cv,
+ &arc_adjust_lock);
}
-
- mutex_exit(&arc_reclaim_lock);
+ mutex_exit(&arc_adjust_lock);
}
VERIFY3U(hdr->b_type, ==, type);
@@ -6090,10 +6132,8 @@ arc_init(void)
#else
uint64_t allmem = (physmem * PAGESIZE) / 2;
#endif
-
- mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
@@ -6182,9 +6222,14 @@ arc_init(void)
arc_c = arc_c_min;
arc_state_init();
- buf_init();
- arc_reclaim_thread_exit = B_FALSE;
+ /*
+ * The arc must be "uninitialized", so that hdr_recl() (which is
+ * registered by buf_init()) will not access arc_reap_zthr before
+ * it is created.
+ */
+ ASSERT(!arc_initialized);
+ buf_init();
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -6195,10 +6240,12 @@ arc_init(void)
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
+ arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
+ arc_adjust_cb, NULL);
+ arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+ arc_reap_cb, NULL, SEC2NSEC(1));
- arc_dead = B_FALSE;
+ arc_initialized = B_TRUE;
arc_warm = B_FALSE;
/*
@@ -6220,31 +6267,24 @@ arc_init(void)
void
arc_fini(void)
{
- mutex_enter(&arc_reclaim_lock);
- arc_reclaim_thread_exit = B_TRUE;
- /*
- * The reclaim thread will set arc_reclaim_thread_exit back to
- * B_FALSE when it is finished exiting; we're waiting for that.
- */
- while (arc_reclaim_thread_exit) {
- cv_signal(&arc_reclaim_thread_cv);
- cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
- }
- mutex_exit(&arc_reclaim_lock);
-
/* Use B_TRUE to ensure *all* buffers are evicted */
arc_flush(NULL, B_TRUE);
- arc_dead = B_TRUE;
+ arc_initialized = B_FALSE;
if (arc_ksp != NULL) {
kstat_delete(arc_ksp);
arc_ksp = NULL;
}
- mutex_destroy(&arc_reclaim_lock);
- cv_destroy(&arc_reclaim_thread_cv);
- cv_destroy(&arc_reclaim_waiters_cv);
+ (void) zthr_cancel(arc_adjust_zthr);
+ zthr_destroy(arc_adjust_zthr);
+
+ (void) zthr_cancel(arc_reap_zthr);
+ zthr_destroy(arc_reap_zthr);
+
+ mutex_destroy(&arc_adjust_lock);
+ cv_destroy(&arc_adjust_waiters_cv);
arc_state_fini();
buf_fini();
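
For readers following the lock-ordering argument in arc_adjust_cb_check() above, here is a minimal user-space sketch of the handshake between a waiter and the adjust thread. pthreads stand in for kmutex_t/kcondvar_t, plain bools stand in for arc_adjust_needed and arc_is_overflowing(); names are illustrative only, not the kernel API.

/*
 * Minimal sketch of the adjust handshake, assuming user-space pthreads
 * in place of kmutex_t/kcondvar_t; names are illustrative only.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t adjust_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t adjust_waiters_cv = PTHREAD_COND_INITIALIZER;
static bool adjust_needed = false;	/* arc_adjust_needed analogue */
static bool overflowing = true;		/* arc_is_overflowing() analogue */

/* Plays the role of arc_adjust_cb(): evict, then wake waiters when done. */
static void *
adjust_thread(void *arg)
{
	pthread_mutex_lock(&adjust_lock);
	while (adjust_needed) {
		overflowing = false;		/* "evict" (arc_adjust()) */
		adjust_needed = overflowing;	/* another pass needed? */
		if (!adjust_needed) {
			/* cv_broadcast(&arc_adjust_waiters_cv) analogue */
			pthread_cond_broadcast(&adjust_waiters_cv);
		}
	}
	pthread_mutex_unlock(&adjust_lock);
	return (NULL);
}

/* Plays the role of arc_get_data_impl(): flag the work, then wait. */
int
main(void)
{
	pthread_t tid;

	pthread_mutex_lock(&adjust_lock);
	adjust_needed = true;		/* set the flag before the "wakeup" */
	pthread_mutex_unlock(&adjust_lock);

	pthread_create(&tid, NULL, adjust_thread, NULL);  /* zthr_wakeup() analogue */

	pthread_mutex_lock(&adjust_lock);
	while (overflowing)
		pthread_cond_wait(&adjust_waiters_cv, &adjust_lock);
	pthread_mutex_unlock(&adjust_lock);

	pthread_join(tid, NULL);
	(void) printf("no longer overflowing\n");
	return (0);
}

Unlike the kernel code, the sketch re-checks its condition in a while loop around the wait; arc_get_data_impl() can get away with a single check for the reasons spelled out in its own comment.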
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 88dad24550..3e10c1e211 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -85,10 +85,10 @@ static boolean_t dbuf_evict_thread_exit;
*/
static multilist_t *dbuf_cache;
static refcount_t dbuf_cache_size;
-uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+uint64_t dbuf_cache_max_bytes = 0;
-/* Cap the size of the dbuf cache to log2 fraction of arc size. */
-int dbuf_cache_max_shift = 5;
+/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+int dbuf_cache_shift = 5;
/*
* The dbuf cache uses a three-stage eviction policy:
@@ -600,11 +600,15 @@ retry:
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
/*
- * Setup the parameters for the dbuf cache. We cap the size of the
- * dbuf cache to 1/32nd (default) of the size of the ARC.
+ * Setup the parameters for the dbuf cache. We set the size of the
+ * dbuf cache to 1/32nd (default) of the size of the ARC. If the value
+ * has been set in /etc/system and it's not greater than the size of
+ * the ARC, then we honor that value.
*/
- dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
- arc_max_bytes() >> dbuf_cache_max_shift);
+ if (dbuf_cache_max_bytes == 0 ||
+ dbuf_cache_max_bytes >= arc_max_bytes()) {
+ dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
+ }
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
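
A practical consequence of the dbuf change above: dbuf_cache_max_bytes now defaults to 0, meaning "derive from the ARC", and an administrator-supplied value is honored only when it is below arc_max_bytes(). A hedged example of pinning it from /etc/system (the value shown is illustrative):

* Cap the dbuf cache at 256 MB instead of arc_c_max / 32
* (a value of 0, the new default, means "derive from ARC size")
set zfs:dbuf_cache_max_bytes = 0x10000000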
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 3512d62704..88d81c9540 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -41,7 +41,7 @@
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
uint64_t metaslab_aliquot = 512ULL << 10;
-uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
+uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* Since we can touch multiple metaslabs (and their respective space maps)
@@ -3088,7 +3088,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
- if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
+ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
return (SET_ERROR(ENOSPC));
}
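
For ad-hoc experiments on a live system, the renamed tunable can be poked the same way the removal test above does it; a sketch, with illustrative values (0t16384 = 2^14, matching the test):

# force ~25% of allocations >= 16K to gang ((lbolt & 3) == 0 gate above)
echo "metaslab_force_ganging/W 0t16384" | mdb -kw
# restore the default of SPA_MAXBLOCKSIZE + 1 (0t131073 = 2^17 + 1)
echo "metaslab_force_ganging/W 0t131073" | mdb -kw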
diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h
index 62da2eea81..ce6033ecb6 100644
--- a/usr/src/uts/common/fs/zfs/sys/zthr.h
+++ b/usr/src/uts/common/fs/zfs/sys/zthr.h
@@ -29,6 +29,7 @@ struct zthr {
kmutex_t zthr_lock;
kcondvar_t zthr_cv;
boolean_t zthr_cancel;
+ hrtime_t zthr_wait_time;
zthr_checkfunc_t *zthr_checkfunc;
zthr_func_t *zthr_func;
@@ -38,6 +39,9 @@ struct zthr {
extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc,
zthr_func_t *func, void *arg);
+extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg, hrtime_t nano_wait);
+
extern void zthr_exit(zthr_t *t, int rc);
extern void zthr_destroy(zthr_t *t);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 388d13716c..e23e4a01c1 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1585,7 +1585,8 @@ vdev_validate(vdev_t *vd)
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
- vdev_dbgmsg(vd, "vdev_validate: failed reading config");
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
+ "txg %llu", (u_longlong_t)txg);
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 21677dcdc2..d8a0762c42 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -540,6 +540,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE;
@@ -565,8 +566,6 @@ retry:
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
&label, 0) == 0) {
- uint64_t label_txg = 0;
-
/*
* Auxiliary vdevs won't have txg values in their
* labels and newly added vdevs may not have been
@@ -597,6 +596,15 @@ retry:
goto retry;
}
+ /*
+ * We found a valid label but it didn't pass txg restrictions.
+ */
+ if (config == NULL && label_txg != 0) {
+ vdev_dbgmsg(vd, "label discarded as txg is too large "
+ "(%llu > %llu)", (u_longlong_t)label_txg,
+ (u_longlong_t)txg);
+ }
+
abd_free(vp_abd);
return (config);
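
The new message lands in the in-kernel ZFS debug buffer rather than on the console; one way to read that buffer on illumos is the zfs mdb module's dcmd:

# dump the ZFS kernel debug message buffer
echo "::zfs_dbgmsg" | mdb -k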
diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c
index 7772386c73..f26f911cb0 100644
--- a/usr/src/uts/common/fs/zfs/zthr.c
+++ b/usr/src/uts/common/fs/zfs/zthr.c
@@ -47,6 +47,10 @@
* 3] When the zthr is done, it changes the indicator to stopped, allowing
* a new cycle to start.
*
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
* == ZTHR creation
*
* Every zthr needs three inputs to start running:
@@ -74,6 +78,9 @@
*
* To start a zthr:
* zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ * args, max_sleep);
*
* After that you should be able to wakeup, cancel, and resume the
* zthr from another thread using zthr_pointer.
@@ -189,7 +196,13 @@ zthr_procedure(void *arg)
mutex_enter(&t->zthr_lock);
} else {
/* go to sleep */
- cv_wait(&t->zthr_cv, &t->zthr_lock);
+ if (t->zthr_wait_time == 0) {
+ cv_wait(&t->zthr_cv, &t->zthr_lock);
+ } else {
+ (void) cv_timedwait_hires(&t->zthr_cv,
+ &t->zthr_lock, t->zthr_wait_time,
+ MSEC2NSEC(1), 0);
+ }
}
}
mutex_exit(&t->zthr_lock);
@@ -200,6 +213,18 @@ zthr_procedure(void *arg)
zthr_t *
zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
{
+ return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with a specified maximum sleep time.  If the time
+ * spent sleeping exceeds max_sleep, a wakeup (do the check and
+ * start working if required) will be triggered.
+ */
+zthr_t *
+zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
+ void *arg, hrtime_t max_sleep)
+{
zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
@@ -208,6 +233,7 @@ zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg)
t->zthr_checkfunc = checkfunc;
t->zthr_func = func;
t->zthr_arg = arg;
+ t->zthr_wait_time = max_sleep;
t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
0, &p0, TS_RUN, minclsyspri);
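
Putting the new interface together, a hedged sketch in the style of the usage notes above: work_is_pending(), do_work(), and my_arg are hypothetical stand-ins, and error handling is elided.

/* Illustrative only: a self-waking zthr, as arc_reap_zthr now uses. */
static boolean_t
example_cb_check(void *arg, zthr_t *zthr)
{
	/* Returning B_TRUE tells zthr_procedure() to invoke example_cb(). */
	return (work_is_pending(arg));		/* hypothetical predicate */
}

static int
example_cb(void *arg, zthr_t *zthr)
{
	do_work(arg);				/* hypothetical worker */
	return (0);
}

	/* Re-run the check at least once a second, even with no wakeup. */
	zthr_t *t = zthr_create_timer(example_cb_check, example_cb,
	    my_arg, SEC2NSEC(1));

	/* ... */

	/* Teardown mirrors arc_fini(): cancel the thread, then destroy it. */
	(void) zthr_cancel(t);
	zthr_destroy(t);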