| author | Prakash Surya <prakash.surya@delphix.com> | 2017-08-03 08:36:51 -0700 |
|---|---|---|
| committer | Prakash Surya <prakash.surya@delphix.com> | 2017-08-30 22:29:47 -0700 |
| commit | 216d7723a1a58124cf95c4950d51d5f99d3f4128 | |
| tree | 4073acbe5f83fe597239db0f4793b5bd753e7a85 | |
| parent | c81a25e9d3950dc5fab08d21f8be56d463b32c7a | |
8558 lwp_create() returns EAGAIN on system with more than 80K ZFS filesystems
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c      | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_pool.h  |  2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zil_impl.h  |  1
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c           | 12
-rw-r--r--  usr/src/uts/common/os/taskq.c             | 20
-rw-r--r--  usr/src/uts/common/sys/taskq_impl.h       |  3

6 files changed, 58 insertions, 17 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 2d88fe1377..ac7079fdc7 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -132,6 +132,36 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  */
 int zfs_sync_taskq_batch_pct = 75;
 
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that needs to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc" tunable).
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -172,6 +202,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
             zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
             TASKQ_THREADS_CPU_PCT);
 
+        dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+            zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+            zfs_zil_clean_taskq_minalloc,
+            zfs_zil_clean_taskq_maxalloc,
+            TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
         cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
@@ -322,6 +358,7 @@ dsl_pool_close(dsl_pool_t *dp)
         txg_list_destroy(&dp->dp_sync_tasks);
         txg_list_destroy(&dp->dp_dirty_dirs);
 
+        taskq_destroy(dp->dp_zil_clean_taskq);
         taskq_destroy(dp->dp_sync_taskq);
 
         /*
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 8291e470a1..4ed37b8469 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -122,6 +122,8 @@ typedef struct dsl_pool {
         txg_list_t dp_dirty_dirs;
         txg_list_t dp_sync_tasks;
         taskq_t *dp_sync_taskq;
+        taskq_t *dp_zil_clean_taskq;
+        txg_list_t dp_early_sync_tasks;
 
         /*
          * Protects administrative changes (properties, namespace)
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index 1613033daf..0618133124 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -124,7 +124,6 @@ struct zilog {
         list_t          zl_lwb_list;    /* in-flight log write list */
         kmutex_t        zl_vdev_lock;   /* protects zl_vdev_tree */
         avl_tree_t      zl_vdev_tree;   /* vdevs to flush in zil_commit() */
-        taskq_t         *zl_clean_taskq; /* runs lwb and itx clean tasks */
         avl_tree_t      zl_bp_tree;     /* track bps during log parse */
         clock_t         zl_replay_time; /* lbolt of when replay started */
         uint64_t        zl_replay_blks; /* number of log blocks replayed */
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 7e63626455..3aee0ce2b5 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -1394,8 +1394,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
                 return;
         }
         ASSERT3U(itxg->itxg_txg, <=, synced_txg);
-        ASSERT(itxg->itxg_txg != 0);
-        ASSERT(zilog->zl_clean_taskq != NULL);
+        ASSERT3U(itxg->itxg_txg, !=, 0);
         clean_me = itxg->itxg_itxs;
         itxg->itxg_itxs = NULL;
         itxg->itxg_txg = 0;
@@ -1406,7 +1405,9 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
          * free it in-line. This should be rare. Note, using TQ_SLEEP
          * created a bad performance problem.
          */
-        if (taskq_dispatch(zilog->zl_clean_taskq,
+        ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+        ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+        if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
             (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
                 zil_itxg_clean(clean_me);
 }
@@ -1835,13 +1836,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 {
         zilog_t *zilog = dmu_objset_zil(os);
 
-        ASSERT(zilog->zl_clean_taskq == NULL);
         ASSERT(zilog->zl_get_data == NULL);
         ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
         zilog->zl_get_data = get_data;
-        zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
-            2, 2, TASKQ_PREPOPULATE);
 
         return (zilog);
 }
@@ -1875,8 +1873,6 @@ zil_close(zilog_t *zilog)
                 zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
         VERIFY(!zilog_is_dirty(zilog));
 
-        taskq_destroy(zilog->zl_clean_taskq);
-        zilog->zl_clean_taskq = NULL;
         zilog->zl_get_data = NULL;
 
         /*
diff --git a/usr/src/uts/common/os/taskq.c b/usr/src/uts/common/os/taskq.c
index f11f9cf3d7..814b738581 100644
--- a/usr/src/uts/common/os/taskq.c
+++ b/usr/src/uts/common/os/taskq.c
@@ -25,6 +25,7 @@
 
 /*
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
  */
 
 /*
@@ -596,6 +597,7 @@ struct taskq_kstat {
         kstat_named_t   tq_nactive;
         kstat_named_t   tq_pri;
         kstat_named_t   tq_nthreads;
+        kstat_named_t   tq_nomem;
 } taskq_kstat = {
         { "pid",                KSTAT_DATA_UINT64 },
         { "tasks",              KSTAT_DATA_UINT64 },
@@ -606,6 +608,7 @@ struct taskq_kstat {
         { "nactive",            KSTAT_DATA_UINT64 },
         { "priority",           KSTAT_DATA_UINT64 },
         { "threads",            KSTAT_DATA_UINT64 },
+        { "nomem",              KSTAT_DATA_UINT64 },
 };
 
 struct taskq_d_kstat {
@@ -1157,6 +1160,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
                 TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
 
                 if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
+                        tq->tq_nomem++;
                         mutex_exit(&tq->tq_lock);
                         return (NULL);
                 }
@@ -1270,7 +1274,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
                 if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
                         TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket);
                 } else {
-                        TQ_STAT(bucket, tqs_nomem);
+                        tq->tq_nomem++;
                 }
         }
 
@@ -1282,7 +1286,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
                 if ((tqe = taskq_ent_alloc(tq, flags)) != NULL) {
                         TQ_ENQUEUE(tq, tqe, func, arg);
                 } else {
-                        TQ_STAT(bucket, tqs_nomem);
+                        tq->tq_nomem++;
                 }
         }
         mutex_exit(&tq->tq_lock);
@@ -2171,13 +2175,14 @@ taskq_bucket_extend(void *arg)
         taskq_t *tq = b->tqbucket_taskq;
         int nthreads;
 
+        mutex_enter(&tq->tq_lock);
+
         if (! ENOUGH_MEMORY()) {
-                TQ_STAT(b, tqs_nomem);
+                tq->tq_nomem++;
+                mutex_exit(&tq->tq_lock);
                 return;
         }
 
-        mutex_enter(&tq->tq_lock);
-
         /*
          * Observe global taskq limits on the number of threads.
          */
@@ -2192,7 +2197,7 @@ taskq_bucket_extend(void *arg)
         if (tqe == NULL) {
                 mutex_enter(&tq->tq_lock);
-                TQ_STAT(b, tqs_nomem);
+                tq->tq_nomem++;
                 tq->tq_tcreates--;
                 mutex_exit(&tq->tq_lock);
                 return;
         }
@@ -2254,6 +2259,7 @@ taskq_kstat_update(kstat_t *ksp, int rw)
         tqsp->tq_nalloc.value.ui64 = tq->tq_nalloc;
         tqsp->tq_pri.value.ui64 = tq->tq_pri;
         tqsp->tq_nthreads.value.ui64 = tq->tq_nthreads;
+        tqsp->tq_nomem.value.ui64 = tq->tq_nomem;
 
         return (0);
 }
@@ -2277,6 +2283,7 @@ taskq_d_kstat_update(kstat_t *ksp, int rw)
         tqsp->tqd_bnactive.value.ui64 = tq->tq_active;
         tqsp->tqd_btotaltime.value.ui64 = tq->tq_totaltime;
         tqsp->tqd_pri.value.ui64 = tq->tq_pri;
+        tqsp->tqd_nomem.value.ui64 = tq->tq_nomem;
 
         tqsp->tqd_hits.value.ui64 = 0;
         tqsp->tqd_misses.value.ui64 = 0;
@@ -2298,7 +2305,6 @@ taskq_d_kstat_update(kstat_t *ksp, int rw)
                 tqsp->tqd_tdeaths.value.ui64 += b->tqbucket_stat.tqs_tdeaths;
                 tqsp->tqd_maxthreads.value.ui64 +=
                     b->tqbucket_stat.tqs_maxthreads;
-                tqsp->tqd_nomem.value.ui64 += b->tqbucket_stat.tqs_nomem;
                 tqsp->tqd_disptcreates.value.ui64 +=
                     b->tqbucket_stat.tqs_disptcreates;
                 tqsp->tqd_totaltime.value.ui64 += b->tqbucket_totaltime;
diff --git a/usr/src/uts/common/sys/taskq_impl.h b/usr/src/uts/common/sys/taskq_impl.h
index b75427152a..dcb7290155 100644
--- a/usr/src/uts/common/sys/taskq_impl.h
+++ b/usr/src/uts/common/sys/taskq_impl.h
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_TASKQ_IMPL_H
@@ -66,7 +67,6 @@ typedef struct tqstat {
         uint_t          tqs_tcreates;   /* threads created */
         uint_t          tqs_tdeaths;    /* threads died */
         uint_t          tqs_maxthreads; /* max # of alive threads */
-        uint_t          tqs_nomem;      /* # of times there were no memory */
         uint_t          tqs_disptcreates;
 } tqstat_t;
 
@@ -142,6 +142,7 @@ struct taskq {
          */
         kstat_t         *tq_kstat;      /* Exported statistics */
         hrtime_t        tq_totaltime;   /* Time spent processing tasks */
+        uint64_t        tq_nomem;       /* # of times there was no memory */
         uint64_t        tq_tasks;       /* Total # of tasks posted */
         uint64_t        tq_executed;    /* Total # of tasks executed */
         int             tq_maxtasks;    /* Max number of tasks in the queue */
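
The heart of the change is visible in zil_clean(): previously every zilog owned a single-threaded "zil_clean" taskq, one kernel thread per mounted filesystem, which is what exhausted lwp_create() at 80K+ ZFS filesystems. Now every dispatch goes to the pool-wide dp_zil_clean_taskq with TQ_NOSLEEP, and if no pre-allocated taskq entry is available the itxg list is cleaned synchronously instead. The userland sketch below models that dispatch-or-clean-inline pattern with POSIX threads; taskq_sketch_t, dispatch_nosleep(), and clean_itxs() are hypothetical stand-ins, not illumos interfaces.

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PREALLOC 8              /* models zfs_zil_clean_taskq_minalloc */

    typedef void task_func_t(void *);

    typedef struct {                /* hypothetical stand-in for taskq_t */
            task_func_t *tq_func[PREALLOC];
            void *tq_arg[PREALLOC];
            int tq_head, tq_tail, tq_count;
            int tq_done;
            uint64_t tq_nomem;      /* failed dispatches, like the new kstat */
            pthread_mutex_t tq_lock;
            pthread_cond_t tq_cv;
    } taskq_sketch_t;

    /*
     * Models taskq_dispatch(..., TQ_NOSLEEP): when no pre-allocated
     * entry is free, fail immediately (return 0) instead of blocking.
     */
    static int
    dispatch_nosleep(taskq_sketch_t *tq, task_func_t *func, void *arg)
    {
            pthread_mutex_lock(&tq->tq_lock);
            if (tq->tq_count == PREALLOC) {
                    tq->tq_nomem++;         /* count it, as tq_nomem++ does */
                    pthread_mutex_unlock(&tq->tq_lock);
                    return (0);
            }
            tq->tq_func[tq->tq_tail] = func;
            tq->tq_arg[tq->tq_tail] = arg;
            tq->tq_tail = (tq->tq_tail + 1) % PREALLOC;
            tq->tq_count++;
            pthread_cond_signal(&tq->tq_cv);
            pthread_mutex_unlock(&tq->tq_lock);
            return (1);
    }

    static void *
    worker(void *arg)               /* one shared taskq thread */
    {
            taskq_sketch_t *tq = arg;

            pthread_mutex_lock(&tq->tq_lock);
            for (;;) {
                    while (tq->tq_count == 0 && !tq->tq_done)
                            pthread_cond_wait(&tq->tq_cv, &tq->tq_lock);
                    if (tq->tq_count == 0)
                            break;  /* drained and shutting down */
                    task_func_t *func = tq->tq_func[tq->tq_head];
                    void *a = tq->tq_arg[tq->tq_head];
                    tq->tq_head = (tq->tq_head + 1) % PREALLOC;
                    tq->tq_count--;
                    pthread_mutex_unlock(&tq->tq_lock);
                    func(a);        /* cleanup runs off the caller's path */
                    pthread_mutex_lock(&tq->tq_lock);
            }
            pthread_mutex_unlock(&tq->tq_lock);
            return (NULL);
    }

    static void
    clean_itxs(void *arg)           /* stands in for zil_itxg_clean() */
    {
            free(arg);
    }

    int
    main(void)
    {
            static taskq_sketch_t tq = {
                    .tq_lock = PTHREAD_MUTEX_INITIALIZER,
                    .tq_cv = PTHREAD_COND_INITIALIZER
            };
            pthread_t tid;

            pthread_create(&tid, NULL, worker, &tq);
            for (int i = 0; i < 100000; i++) {  /* one "filesystem" each */
                    void *itxs = malloc(64);
                    /* The zil_clean() pattern: dispatch, else clean inline. */
                    if (dispatch_nosleep(&tq, clean_itxs, itxs) == 0)
                            clean_itxs(itxs);   /* rare synchronous fallback */
            }
            pthread_mutex_lock(&tq.tq_lock);
            tq.tq_done = 1;
            pthread_cond_signal(&tq.tq_cv);
            pthread_mutex_unlock(&tq.tq_lock);
            pthread_join(tid, NULL);
            printf("dispatch failures handled inline: %llu\n",
                (unsigned long long)tq.tq_nomem);
            return (0);
    }

The large minalloc in the real code (1024 pre-populated entries) exists precisely to make the fallback branch rare, since running zil_itxg_clean() inline stalls spa_sync(); whether it fires is now observable, because each failed taskq_ent_alloc() bumps the taskq-wide tq_nomem counter that this commit exports through the "nomem" kstat.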
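Sizing-wise, the new taskq is created with TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT, so zfs_zil_clean_taskq_nthr_pct = 100 is read as a percentage of online CPUs rather than an absolute thread count; thread usage therefore scales with machine size instead of with the number of filesystems. A rough sketch of that percentage interpretation follows (the exact clamping and rounding live in taskq_create(), so treat this as an approximation, and taskq_cpu_pct_nthreads() is an illustrative name, not a kernel function):

    #include <stdio.h>

    /* Approximate TASKQ_THREADS_CPU_PCT sizing: pct of online CPUs. */
    static int
    taskq_cpu_pct_nthreads(int ncpus_online, int pct)
    {
            int nthreads = ncpus_online * pct / 100;
            return (nthreads < 1 ? 1 : nthreads);   /* at least one thread */
    }

    int
    main(void)
    {
            /* 100% of 16 CPUs -> ~16 threads, regardless of dataset count. */
            printf("%d\n", taskq_cpu_pct_nthreads(16, 100));
            /* Old scheme: 80,000 filesystems -> 80,000 zil_clean threads. */
            return (0);
    }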
