summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPrakash Surya <prakash.surya@delphix.com>2017-08-03 08:36:51 -0700
committerPrakash Surya <prakash.surya@delphix.com>2017-08-30 22:29:47 -0700
commit216d7723a1a58124cf95c4950d51d5f99d3f4128 (patch)
tree4073acbe5f83fe597239db0f4793b5bd753e7a85
parentc81a25e9d3950dc5fab08d21f8be56d463b32c7a (diff)
downloadillumos-joyent-216d7723a1a58124cf95c4950d51d5f99d3f4128.tar.gz
8558 lwp_create() returns EAGAIN on system with more than 80K ZFS filesystems
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c37
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_pool.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil_impl.h1
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c12
-rw-r--r--usr/src/uts/common/os/taskq.c20
-rw-r--r--usr/src/uts/common/sys/taskq_impl.h3
6 files changed, 58 insertions, 17 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 2d88fe1377..ac7079fdc7 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -132,6 +132,36 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
*/
int zfs_sync_taskq_batch_pct = 75;
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that need to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@@ -172,6 +202,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
TASKQ_THREADS_CPU_PCT);
+ dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+ zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+ zfs_zil_clean_taskq_minalloc,
+ zfs_zil_clean_taskq_maxalloc,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
@@ -322,6 +358,7 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
+ taskq_destroy(dp->dp_zil_clean_taskq);
taskq_destroy(dp->dp_sync_taskq);
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 8291e470a1..4ed37b8469 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -122,6 +122,8 @@ typedef struct dsl_pool {
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
taskq_t *dp_sync_taskq;
+ taskq_t *dp_zil_clean_taskq;
+ txg_list_t dp_early_sync_tasks;
/*
* Protects administrative changes (properties, namespace)
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index 1613033daf..0618133124 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -124,7 +124,6 @@ struct zilog {
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
- taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 7e63626455..3aee0ce2b5 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -1394,8 +1394,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
return;
}
ASSERT3U(itxg->itxg_txg, <=, synced_txg);
- ASSERT(itxg->itxg_txg != 0);
- ASSERT(zilog->zl_clean_taskq != NULL);
+ ASSERT3U(itxg->itxg_txg, !=, 0);
clean_me = itxg->itxg_itxs;
itxg->itxg_itxs = NULL;
itxg->itxg_txg = 0;
@@ -1406,7 +1405,9 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
* free it in-line. This should be rare. Note, using TQ_SLEEP
* created a bad performance problem.
*/
- if (taskq_dispatch(zilog->zl_clean_taskq,
+ ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+ ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+ if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
(void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
zil_itxg_clean(clean_me);
}
@@ -1835,13 +1836,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
{
zilog_t *zilog = dmu_objset_zil(os);
- ASSERT(zilog->zl_clean_taskq == NULL);
ASSERT(zilog->zl_get_data == NULL);
ASSERT(list_is_empty(&zilog->zl_lwb_list));
zilog->zl_get_data = get_data;
- zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
- 2, 2, TASKQ_PREPOPULATE);
return (zilog);
}
@@ -1875,8 +1873,6 @@ zil_close(zilog_t *zilog)
zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
VERIFY(!zilog_is_dirty(zilog));
- taskq_destroy(zilog->zl_clean_taskq);
- zilog->zl_clean_taskq = NULL;
zilog->zl_get_data = NULL;
/*
diff --git a/usr/src/uts/common/os/taskq.c b/usr/src/uts/common/os/taskq.c
index f11f9cf3d7..814b738581 100644
--- a/usr/src/uts/common/os/taskq.c
+++ b/usr/src/uts/common/os/taskq.c
@@ -25,6 +25,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
/*
@@ -596,6 +597,7 @@ struct taskq_kstat {
kstat_named_t tq_nactive;
kstat_named_t tq_pri;
kstat_named_t tq_nthreads;
+ kstat_named_t tq_nomem;
} taskq_kstat = {
{ "pid", KSTAT_DATA_UINT64 },
{ "tasks", KSTAT_DATA_UINT64 },
@@ -606,6 +608,7 @@ struct taskq_kstat {
{ "nactive", KSTAT_DATA_UINT64 },
{ "priority", KSTAT_DATA_UINT64 },
{ "threads", KSTAT_DATA_UINT64 },
+ { "nomem", KSTAT_DATA_UINT64 },
};
struct taskq_d_kstat {
@@ -1157,6 +1160,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
+ tq->tq_nomem++;
mutex_exit(&tq->tq_lock);
return (NULL);
}
@@ -1270,7 +1274,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket);
} else {
- TQ_STAT(bucket, tqs_nomem);
+ tq->tq_nomem++;
}
}
@@ -1282,7 +1286,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
if ((tqe = taskq_ent_alloc(tq, flags)) != NULL) {
TQ_ENQUEUE(tq, tqe, func, arg);
} else {
- TQ_STAT(bucket, tqs_nomem);
+ tq->tq_nomem++;
}
}
mutex_exit(&tq->tq_lock);
@@ -2171,13 +2175,14 @@ taskq_bucket_extend(void *arg)
taskq_t *tq = b->tqbucket_taskq;
int nthreads;
+ mutex_enter(&tq->tq_lock);
+
if (! ENOUGH_MEMORY()) {
- TQ_STAT(b, tqs_nomem);
+ tq->tq_nomem++;
+ mutex_exit(&tq->tq_lock);
return;
}
- mutex_enter(&tq->tq_lock);
-
/*
* Observe global taskq limits on the number of threads.
*/
@@ -2192,7 +2197,7 @@ taskq_bucket_extend(void *arg)
if (tqe == NULL) {
mutex_enter(&tq->tq_lock);
- TQ_STAT(b, tqs_nomem);
+ tq->tq_nomem++;
tq->tq_tcreates--;
mutex_exit(&tq->tq_lock);
return;
@@ -2254,6 +2259,7 @@ taskq_kstat_update(kstat_t *ksp, int rw)
tqsp->tq_nalloc.value.ui64 = tq->tq_nalloc;
tqsp->tq_pri.value.ui64 = tq->tq_pri;
tqsp->tq_nthreads.value.ui64 = tq->tq_nthreads;
+ tqsp->tq_nomem.value.ui64 = tq->tq_nomem;
return (0);
}
@@ -2277,6 +2283,7 @@ taskq_d_kstat_update(kstat_t *ksp, int rw)
tqsp->tqd_bnactive.value.ui64 = tq->tq_active;
tqsp->tqd_btotaltime.value.ui64 = tq->tq_totaltime;
tqsp->tqd_pri.value.ui64 = tq->tq_pri;
+ tqsp->tqd_nomem.value.ui64 = tq->tq_nomem;
tqsp->tqd_hits.value.ui64 = 0;
tqsp->tqd_misses.value.ui64 = 0;
@@ -2298,7 +2305,6 @@ taskq_d_kstat_update(kstat_t *ksp, int rw)
tqsp->tqd_tdeaths.value.ui64 += b->tqbucket_stat.tqs_tdeaths;
tqsp->tqd_maxthreads.value.ui64 +=
b->tqbucket_stat.tqs_maxthreads;
- tqsp->tqd_nomem.value.ui64 += b->tqbucket_stat.tqs_nomem;
tqsp->tqd_disptcreates.value.ui64 +=
b->tqbucket_stat.tqs_disptcreates;
tqsp->tqd_totaltime.value.ui64 += b->tqbucket_totaltime;
diff --git a/usr/src/uts/common/sys/taskq_impl.h b/usr/src/uts/common/sys/taskq_impl.h
index b75427152a..dcb7290155 100644
--- a/usr/src/uts/common/sys/taskq_impl.h
+++ b/usr/src/uts/common/sys/taskq_impl.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_TASKQ_IMPL_H
@@ -66,7 +67,6 @@ typedef struct tqstat {
uint_t tqs_tcreates; /* threads created */
uint_t tqs_tdeaths; /* threads died */
uint_t tqs_maxthreads; /* max # of alive threads */
- uint_t tqs_nomem; /* # of times there were no memory */
uint_t tqs_disptcreates;
} tqstat_t;
@@ -142,6 +142,7 @@ struct taskq {
*/
kstat_t *tq_kstat; /* Exported statistics */
hrtime_t tq_totaltime; /* Time spent processing tasks */
+ uint64_t tq_nomem; /* # of times there was no memory */
uint64_t tq_tasks; /* Total # of tasks posted */
uint64_t tq_executed; /* Total # of tasks executed */
int tq_maxtasks; /* Max number of tasks in the queue */