summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Leventhal <ahl@delphix.com>2013-03-01 15:46:07 -0800
committerChristopher Siden <chris.siden@delphix.com>2013-03-01 15:46:25 -0800
commit0689f76c08c5e553ff25ac43a852b56c430bb61e (patch)
tree4a0f9c228e147faa8aa3a4d75e6881ccaaafbe32
parentaec3cf542eb81ffe3fe027b69143a82fdc7810cd (diff)
downloadillumos-gate-0689f76c08c5e553ff25ac43a852b56c430bb61e.tar.gz
3582 zfs_delay() should support a variable resolution
3584 DTrace sdt probes for ZFS txg states Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Dan McDonald <danmcd@nexenta.com> Reviewed by: Richard Elling <richard.elling@dey-sys.com> Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c2
-rw-r--r--usr/src/lib/libzpool/common/kernel.c35
-rw-r--r--usr/src/lib/libzpool/common/sys/zfs_context.h2
-rw-r--r--usr/src/uts/common/conf/param.c8
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dir.c3
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c18
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scan.c8
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c6
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg.h9
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg_impl.h12
-rw-r--r--usr/src/uts/common/fs/zfs/txg.c27
-rw-r--r--usr/src/uts/common/os/condvar.c2
-rw-r--r--usr/src/uts/common/sys/condvar.h2
-rw-r--r--usr/src/uts/common/sys/time.h3
14 files changed, 94 insertions, 43 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
index aba4794942..83279f7db2 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c
@@ -35,8 +35,6 @@
#define IPADDRL sizeof (struct in_addr)
#define RARPRETRIES 5
-#define MSEC2NSEC(msec) ((msec) * 1000000)
-#define NSEC2MSEC(nsec) ((nsec) / 1000000)
/*
* The following value (8) is determined to work reliably in switched 10/100MB
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c
index 96280941a6..4dd614f7c1 100644
--- a/usr/src/lib/libzpool/common/kernel.c
+++ b/usr/src/lib/libzpool/common/kernel.c
@@ -329,6 +329,41 @@ top:
return (1);
}
+/*ARGSUSED*/
+clock_t
+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ int error;
+ timestruc_t ts;
+ hrtime_t delta;
+
+ ASSERT(flag == 0);
+
+top:
+ delta = tim - gethrtime();
+ if (delta <= 0)
+ return (-1);
+
+ ts.tv_sec = delta / NANOSEC;
+ ts.tv_nsec = delta % NANOSEC;
+
+ ASSERT(mutex_owner(mp) == curthread);
+ mp->m_owner = NULL;
+ error = cond_reltimedwait(cv, &mp->m_lock, &ts);
+ mp->m_owner = curthread;
+
+ if (error == ETIME)
+ return (-1);
+
+ if (error == EINTR)
+ goto top;
+
+ ASSERT(error == 0);
+
+ return (1);
+}
+
void
cv_signal(kcondvar_t *cv)
{
diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h
index 7802da4e2b..a46c657a1d 100644
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h
@@ -254,6 +254,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
extern void cv_destroy(kcondvar_t *cv);
extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag);
extern void cv_signal(kcondvar_t *cv);
extern void cv_broadcast(kcondvar_t *cv);
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index d72cfb0b8f..a01b879756 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -695,10 +695,10 @@ param_init(void)
* should re-evaluate their usage and specify the appropriate
* resolution.
*/
- time_res[TR_NANOSEC] = SEC;
- time_res[TR_MICROSEC] = MILLISEC;
- time_res[TR_MILLISEC] = MICROSEC;
- time_res[TR_SEC] = NANOSEC;
+ time_res[TR_NANOSEC] = NANOSEC / NANOSEC;
+ time_res[TR_MICROSEC] = NANOSEC / MICROSEC;
+ time_res[TR_MILLISEC] = NANOSEC / MILLISEC;
+ time_res[TR_SEC] = NANOSEC / SEC;
time_res[TR_CLOCK_TICK] = nsec_per_tick;
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 1e7ba6d6cb..c53c3c8e87 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -738,7 +738,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
- txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ MSEC2NSEC(10), MSEC2NSEC(10));
err = ERESTART;
}
dsl_pool_memory_pressure(dd->dd_pool);
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 6af631679e..c83068a001 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -58,6 +58,9 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
+hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
+
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@@ -511,12 +514,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* Weight the throughput calculation towards the current value:
* thru = 3/4 old_thru + 1/4 new_thru
*
- * Note: write_time is in nanosecs, so write_time/MICROSEC
- * yields millisecs
+ * Note: write_time is in nanosecs while dp_throughput is expressed in
+ * bytes per millisecond.
*/
ASSERT(zfs_write_limit_min > 0);
- if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
- uint64_t throughput = data_written / (write_time / MICROSEC);
+ if (data_written > zfs_write_limit_min / 8 &&
+ write_time > MSEC2NSEC(1)) {
+ uint64_t throughput = data_written / NSEC2MSEC(write_time);
if (dp->dp_throughput)
dp->dp_throughput = throughput / 4 +
@@ -614,8 +618,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
* the caller 1 clock tick. This will slow down the "fill"
* rate until the sync process can catch up with us.
*/
- if (reserved && reserved > (write_limit - (write_limit >> 3)))
- txg_delay(dp, tx->tx_txg, 1);
+ if (reserved && reserved > (write_limit - (write_limit >> 3))) {
+ txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
+ zfs_throttle_resolution);
+ }
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index 3de3c6e4d7..167f3825e5 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -403,7 +403,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (elapsed_nanosecs / MICROSEC > mintime &&
+ (NSEC2MSEC(elapsed_nanosecs) > mintime &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa)) {
if (zb) {
@@ -1308,7 +1308,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn)
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
}
@@ -1433,7 +1433,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
"free_bpobj/bptree txg %llu",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
- (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
(longlong_t)tx->tx_txg);
scn->scn_visited_this_txg = 0;
/*
@@ -1481,7 +1481,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
zfs_dbgmsg("visited %llu blocks in %llums",
(longlong_t)scn->scn_visited_this_txg,
- (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+ (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
if (!scn->scn_pausing) {
/* finished with scan. */
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 733d2609e5..1663abbb5e 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -499,8 +499,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
hdlr.cyh_arg = spa;
hdlr.cyh_level = CY_LOW_LEVEL;
- spa->spa_deadman_synctime = zfs_deadman_synctime *
- zfs_txg_synctime_ms * MICROSEC;
+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
+ zfs_txg_synctime_ms);
/*
* This determines how often we need to check for hung I/Os after
@@ -508,7 +508,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
* an expensive operation we don't want to check too frequently.
* Instead wait for 5 synctimes before checking again.
*/
- when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+ when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
when.cyt_when = CY_INFINITY;
mutex_enter(&cpu_lock);
spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h
index 2df33f0fb0..1529e5ac6d 100644
--- a/usr/src/uts/common/fs/zfs/sys/txg.h
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h
@@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp);
extern void txg_rele_to_sync(txg_handle_t *txghp);
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first). This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+ hrtime_t resolution);
/*
* Wait until the given transaction group has finished syncing.
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
index 7b356eac12..8f1b21b3d4 100644
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
#ifndef _SYS_TXG_IMPL_H
#define _SYS_TXG_IMPL_H
@@ -36,14 +40,14 @@ extern "C" {
struct tx_cpu {
kmutex_t tc_lock;
kcondvar_t tc_cv[TXG_SIZE];
- uint64_t tc_count[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
- char tc_pad[16];
+ char tc_pad[16]; /* pad to fill 3 cache lines */
};
typedef struct tx_state {
- tx_cpu_t *tx_cpu; /* protects right to enter txg */
- kmutex_t tx_sync_lock; /* protects tx_state_t */
+ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
+ kmutex_t tx_sync_lock; /* protects the rest of this struct */
uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 58690e325f..232cdd961e 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -232,7 +232,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
}
static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
CALLB_CPR_SAFE_BEGIN(cpr);
@@ -353,6 +353,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
@@ -475,6 +478,7 @@ txg_sync_thread(dsl_pool_t *dp)
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_quiesce_more_cv);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -488,6 +492,7 @@ txg_sync_thread(dsl_pool_t *dp)
mutex_enter(&tx->tx_sync_lock);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_done_cv);
/*
@@ -536,21 +541,22 @@ txg_quiesce_thread(dsl_pool_t *dp)
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
tx->tx_quiesced_txg = txg;
+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
cv_broadcast(&tx->tx_quiesce_done_cv);
}
}
/*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiesing or quiesced. Abort
- * the delay if this txg stalls or enters the quiesing state.
+ * Delay this thread by delay nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiesing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiesing state.
*/
void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
tx_state_t *tx = &dp->dp_tx;
- clock_t timeout = ddi_get_lbolt() + ticks;
+ hrtime_t start = gethrtime();
/* don't delay if this txg could transition to quiesing immediately */
if (tx->tx_open_txg > txg ||
@@ -563,10 +569,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
return;
}
- while (ddi_get_lbolt() < timeout &&
- tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
- (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
- timeout);
+ while (gethrtime() - start < delay &&
+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+ &tx->tx_sync_lock, delay, resolution, 0);
+ }
mutex_exit(&tx->tx_sync_lock);
}
diff --git a/usr/src/uts/common/os/condvar.c b/usr/src/uts/common/os/condvar.c
index 60ff344d82..e9c418ffbd 100644
--- a/usr/src/uts/common/os/condvar.c
+++ b/usr/src/uts/common/os/condvar.c
@@ -43,8 +43,6 @@
#include <sys/sdt.h>
#include <sys/callo.h>
-clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, int);
-
/*
* CV_MAX_WAITERS is the maximum number of waiters we track; once
* the number becomes higher than that, we look at the sleepq to
diff --git a/usr/src/uts/common/sys/condvar.h b/usr/src/uts/common/sys/condvar.h
index 56e660e5e2..ef72c3c567 100644
--- a/usr/src/uts/common/sys/condvar.h
+++ b/usr/src/uts/common/sys/condvar.h
@@ -94,6 +94,8 @@ extern void cv_destroy(kcondvar_t *);
extern void cv_wait(kcondvar_t *, kmutex_t *);
extern void cv_wait_stop(kcondvar_t *, kmutex_t *, int);
extern clock_t cv_timedwait(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t,
+ int);
extern clock_t cv_reltimedwait(kcondvar_t *, kmutex_t *, clock_t, time_res_t);
extern int cv_wait_sig(kcondvar_t *, kmutex_t *);
extern clock_t cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h
index fcb9a290fe..9fc4704860 100644
--- a/usr/src/uts/common/sys/time.h
+++ b/usr/src/uts/common/sys/time.h
@@ -236,6 +236,9 @@ struct itimerval32 {
#define MICROSEC 1000000
#define NANOSEC 1000000000
+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
+
#endif /* !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) */
#ifndef _ASM