author | Rafael Vanoni <rafael.vanoni@sun.com> | 2009-11-13 01:32:32 -0800 |
---|---|---|
committer | Rafael Vanoni <rafael.vanoni@sun.com> | 2009-11-13 01:32:32 -0800 |
commit | d3d50737e566cade9a08d73d2af95105ac7cd960 (patch) | |
tree | 399b76a3f6bf107e2ff506d8f9c3333654b29fc7 /usr/src/uts/common/os/clock.c | |
parent | 1eff5f7761619411b3c31280fcd96cefc32968b7 (diff) | |
download | illumos-joyent-d3d50737e566cade9a08d73d2af95105ac7cd960.tar.gz |
PSARC/2009/396 Tickless Kernel Architecture / lbolt decoupling
6860030 tickless clock requires a clock() decoupled lbolt / lbolt64
Portions contributed by Chad Mynhier <cmynhier@gmail.com>
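For consumers, the migration is mechanical: every direct read of the removed lbolt / lbolt64 variables becomes a call to ddi_get_lbolt() or ddi_get_lbolt64(), exactly as the delay_common() and delay_sig() hunks in the diff below do. A minimal driver-context sketch of that pattern (the deadline_after() helper is illustrative, not part of the patch):

```c
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

/*
 * Sketch: compute a timeout deadline the way delay_common() does after
 * this patch; "ticks" is a caller-supplied number of clock ticks.
 */
static clock_t
deadline_after(clock_t ticks)
{
	/* Before the patch this read the global: deadline = lbolt + ticks; */
	return (ddi_get_lbolt() + ticks);
}
```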
Diffstat (limited to 'usr/src/uts/common/os/clock.c')
-rw-r--r-- | usr/src/uts/common/os/clock.c | 424 |
1 file changed, 374 insertions, 50 deletions
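Besides the lbolt rework itself, the patch also decouples the deadman watchdog from lbolt: clock() now increments a private deadman_counter once per second, and deadman() snapshots that counter (cpu_deadman_counter) instead of watching lbolt advance. A user-level sketch of that liveness-check pattern, with illustrative names only (heartbeat, snapshot, countdown are not kernel symbols):

```c
#include <stdint.h>
#include <stdio.h>

#define	DEADMAN_SECONDS	10

static volatile int64_t heartbeat;	/* bumped once per second by the "clock" */
static int64_t snapshot;		/* last value the watchdog saw */
static int countdown = DEADMAN_SECONDS;

/* Called once per second; returns 1 once the clock appears to be hung. */
static int
deadman_check(void)
{
	if (heartbeat != snapshot) {
		/* The clock is still beating; rearm the countdown. */
		snapshot = heartbeat;
		countdown = DEADMAN_SECONDS;
		return (0);
	}
	return (--countdown == 0);
}

int
main(void)
{
	int sec, hung = 0;

	/* Simulate a clock whose heartbeat stops after 3 seconds. */
	for (sec = 1; sec <= 20 && !hung; sec++) {
		if (sec <= 3)
			heartbeat++;
		hung = deadman_check();
	}
	(void) printf("declared hung after %d seconds\n", sec - 1);
	return (0);
}
```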
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 384e17b57d..c0c581a215 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -21,13 +21,11 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

-
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

-
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/types.h>
@@ -76,6 +74,9 @@
#include <sys/timex.h>
#include <sys/inttypes.h>

+#include <sys/sunddi.h>
+#include <sys/clock_impl.h>
+
/*
 * clock() is called straight from the clock cyclic; see clock_init().
 *
@@ -239,11 +240,73 @@ int32_t pps_calcnt = 0; /* calibration intervals */
int32_t pps_errcnt = 0; /* calibration errors */
int32_t pps_stbcnt = 0; /* stability limit exceeded */

-/* The following variables require no explicit locking */
-volatile clock_t lbolt; /* time in Hz since last boot */
-volatile int64_t lbolt64; /* lbolt64 won't wrap for 2.9 billion yrs */
-
kcondvar_t lbolt_cv;
+
+/*
+ * Hybrid lbolt implementation:
+ *
+ * The service historically provided by the lbolt and lbolt64 variables has
+ * been replaced by the ddi_get_lbolt() and ddi_get_lbolt64() routines, and the
+ * original symbols removed from the system. The once clock driven variables are
+ * now implemented in an event driven fashion, backed by gethrtime() coarsed to
+ * the appropriate clock resolution. The default event driven implementation is
+ * complemented by a cyclic driven one, active only during periods of intense
+ * activity around the DDI lbolt routines, when a lbolt specific cyclic is
+ * reprogramed to fire at a clock tick interval to serve consumers of lbolt who
+ * rely on the original low cost of consulting a memory position.
+ *
+ * The implementation uses the number of calls to these routines and the
+ * frequency of these to determine when to transition from event to cyclic
+ * driven and vice-versa. These values are kept on a per CPU basis for
+ * scalability reasons and to prevent CPUs from constantly invalidating a single
+ * cache line when modifying a global variable. The transition from event to
+ * cyclic mode happens once the thresholds are crossed, and activity on any CPU
+ * can cause such transition.
+ *
+ * The lbolt_hybrid function pointer is called by ddi_get_lbolt() and
+ * ddi_get_lbolt64(), and will point to lbolt_event_driven() or
+ * lbolt_cyclic_driven() according to the current mode. When the thresholds
+ * are exceeded, lbolt_event_driven() will reprogram the lbolt cyclic to
+ * fire at a nsec_per_tick interval and increment an internal variable at
+ * each firing. lbolt_hybrid will then point to lbolt_cyclic_driven(), which
+ * will simply return the value of such variable. lbolt_cyclic() will attempt
+ * to shut itself off at each threshold interval (sampling period for calls
+ * to the DDI lbolt routines), and return to the event driven mode, but will
+ * be prevented from doing so if lbolt_cyclic_driven() is being heavily used.
+ *
+ * lbolt_bootstrap is used during boot to serve lbolt consumers who don't wait
+ * for the cyclic subsystem to be intialized.
+ *
+ */
+static int64_t lbolt_bootstrap(void);
+int64_t lbolt_event_driven(void);
+int64_t lbolt_cyclic_driven(void);
+int64_t (*lbolt_hybrid)(void) = lbolt_bootstrap;
+uint_t lbolt_ev_to_cyclic(caddr_t, caddr_t);
+
+/*
+ * lbolt's cyclic, installed by clock_init().
+ */
+static void lbolt_cyclic(void);
+
+/*
+ * Tunable to keep lbolt in cyclic driven mode. This will prevent the system
+ * from switching back to event driven, once it reaches cyclic mode.
+ */
+static boolean_t lbolt_cyc_only = B_FALSE;
+
+/*
+ * Cache aligned, per CPU structure with lbolt usage statistics.
+ */
+static lbolt_cpu_t *lb_cpu;
+
+/*
+ * Single, cache aligned, structure with all the information required by
+ * the lbolt implementation.
+ */
+lbolt_info_t *lb_info;
+
+
int one_sec = 1; /* turned on once every second */
static int fsflushcnt; /* counter for t_fsflushr */
int dosynctodr = 1; /* patchable; enable/disable sync to TOD chip */
@@ -322,6 +385,8 @@ void (*cpucaps_clock_callout)() = NULL;

extern clock_t clock_tick_proc_max;

+static int64_t deadman_counter = 0;
+
static void
clock(void)
{
@@ -338,6 +403,7 @@ clock(void)
	int s;
	int do_lgrp_load;
	int i;
+	clock_t now = LBOLT_NO_ACCOUNT; /* current tick */

	if (panicstr)
		return;
@@ -405,8 +471,10 @@ clock(void)
		do_lgrp_load = 1;
	}

-	if (one_sec)
+	if (one_sec) {
		loadavg_update();
+		deadman_counter++;
+	}

	/*
	 * First count the threads waiting on kpreempt queues in each
@@ -544,15 +612,6 @@ clock(void)
	clock_tick_schedule(one_sec);

	/*
-	 * bump time in ticks
-	 *
-	 * We rely on there being only one clock thread and hence
-	 * don't need a lock to protect lbolt.
-	 */
-	lbolt++;
-	atomic_add_64((uint64_t *)&lbolt64, (int64_t)1);
-
-	/*
	 * Check for a callout that needs be called from the clock
	 * thread to support the membership protocol in a clustered
	 * system. Copy the function pointer so that we can reset
@@ -753,7 +812,7 @@ clock(void)
			 * the clock; record that.
			 */
			clock_adj_hist[adj_hist_entry++ %
-			    CLOCK_ADJ_HIST_SIZE] = lbolt64;
+			    CLOCK_ADJ_HIST_SIZE] = now;
			s = hr_clock_lock();
			timedelta = (int64_t)drift*NANOSEC;
			hr_clock_unlock(s);
@@ -882,30 +941,84 @@
void
clock_init(void)
{
-	cyc_handler_t hdlr;
-	cyc_time_t when;
-
-	hdlr.cyh_func = (cyc_func_t)clock;
-	hdlr.cyh_level = CY_LOCK_LEVEL;
-	hdlr.cyh_arg = NULL;
+	cyc_handler_t clk_hdlr, timer_hdlr, lbolt_hdlr;
+	cyc_time_t clk_when, lbolt_when;
+	int i, sz;
+	intptr_t buf;

-	when.cyt_when = 0;
-	when.cyt_interval = nsec_per_tick;
+	/*
+	 * Setup handler and timer for the clock cyclic.
+	 */
+	clk_hdlr.cyh_func = (cyc_func_t)clock;
+	clk_hdlr.cyh_level = CY_LOCK_LEVEL;
+	clk_hdlr.cyh_arg = NULL;

-	mutex_enter(&cpu_lock);
-	clock_cyclic = cyclic_add(&hdlr, &when);
-	mutex_exit(&cpu_lock);
+	clk_when.cyt_when = 0;
+	clk_when.cyt_interval = nsec_per_tick;

	/*
	 * cyclic_timer is dedicated to the ddi interface, which
	 * uses the same clock resolution as the system one.
	 */
-	hdlr.cyh_func = (cyc_func_t)cyclic_timer;
-	hdlr.cyh_level = CY_LOCK_LEVEL;
-	hdlr.cyh_arg = NULL;
+	timer_hdlr.cyh_func = (cyc_func_t)cyclic_timer;
+	timer_hdlr.cyh_level = CY_LOCK_LEVEL;
+	timer_hdlr.cyh_arg = NULL;

+	/*
+	 * Setup the necessary structures for the lbolt cyclic and add the
+	 * soft interrupt which will switch from event to cyclic mode when
+	 * under high pil.
+	 */
+	lbolt_hdlr.cyh_func = (cyc_func_t)lbolt_cyclic;
+	lbolt_hdlr.cyh_level = CY_LOCK_LEVEL;
+	lbolt_hdlr.cyh_arg = NULL;
+
+	lbolt_when.cyt_interval = nsec_per_tick;
+
+	if (lbolt_cyc_only) {
+		lbolt_when.cyt_when = 0;
+		lbolt_hybrid = lbolt_cyclic_driven;
+	} else {
+		lbolt_when.cyt_when = CY_INFINITY;
+		lbolt_hybrid = lbolt_event_driven;
+	}
+
+	/*
+	 * Allocate cache line aligned space for the per CPU lbolt data and
+	 * lb_info structure. We also initialize these structures with their
+	 * default values and install the softint to change from event to
+	 * cyclic driven mode.
+	 */
+	sz = sizeof (lbolt_info_t) + CPU_CACHE_COHERENCE_SIZE;
+	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
+	lb_info = (lbolt_info_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
+
+	if (hz != HZ_DEFAULT)
+		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL *
+		    hz/HZ_DEFAULT;
+	else
+		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL;
+
+	lb_info->lbi_thresh_calls = LBOLT_THRESH_CALLS;
+
+	sz = (sizeof (lbolt_info_t) * max_ncpus) + CPU_CACHE_COHERENCE_SIZE;
+	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
+	lb_cpu = (lbolt_cpu_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
+
+	for (i = 0; i < max_ncpus; i++)
+		lb_cpu[i].lbc_counter = lb_info->lbi_thresh_calls;
+
+	lbolt_softint_add();
+
+	/*
+	 * Grab cpu_lock and install all three cyclics.
+	 */
	mutex_enter(&cpu_lock);
-	ddi_timer_cyclic = cyclic_add(&hdlr, &when);
+
+	clock_cyclic = cyclic_add(&clk_hdlr, &clk_when);
+	ddi_timer_cyclic = cyclic_add(&timer_hdlr, &clk_when);
+	lb_info->lbi_cyclic_id = cyclic_add(&lbolt_hdlr, &lbolt_when);
+
	mutex_exit(&cpu_lock);
}
@@ -1631,8 +1744,8 @@ delay_common(clock_t ticks)
		return;
	}

-	deadline = lbolt + ticks;
-	while ((timeleft = deadline - lbolt) > 0) {
+	deadline = ddi_get_lbolt() + ticks;
+	while ((timeleft = deadline - ddi_get_lbolt()) > 0) {
		mutex_enter(&t->t_delay_lock);
		id = timeout_default(delay_wakeup, t, timeleft);
		cv_wait(&t->t_delay_cv, &t->t_delay_lock);
@@ -1686,7 +1799,7 @@ delay_sig(clock_t ticks)
		return (0);
	}

-	deadline = lbolt + ticks;
+	deadline = ddi_get_lbolt() + ticks;
	mutex_enter(&t->t_delay_lock);
	do {
		rc = cv_timedwait_sig(&t->t_delay_cv,
@@ -1807,15 +1920,6 @@ deadman(void)
		if (CPU->cpu_id != panic_cpu.cpu_id)
			return;

-		/*
-		 * If we're panicking, the deadman cyclic continues to increase
-		 * lbolt in case the dump device driver relies on this for
-		 * timeouts. Note that we rely on deadman() being invoked once
-		 * per second, and credit lbolt and lbolt64 with hz ticks each.
-		 */
-		lbolt += hz;
-		lbolt64 += hz;
-
		if (!deadman_panic_timers)
			return; /* allow all timers to be manually disabled */
@@ -1840,8 +1944,8 @@ deadman(void)
		return;
	}

-	if (lbolt != CPU->cpu_deadman_lbolt) {
-		CPU->cpu_deadman_lbolt = lbolt;
+	if (deadman_counter != CPU->cpu_deadman_counter) {
+		CPU->cpu_deadman_counter = deadman_counter;
		CPU->cpu_deadman_countdown = deadman_seconds;
		return;
	}
@@ -1879,7 +1983,7 @@ deadman(void)
static void
deadman_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
{
-	cpu->cpu_deadman_lbolt = 0;
+	cpu->cpu_deadman_counter = 0;
	cpu->cpu_deadman_countdown = deadman_seconds;

	hdlr->cyh_func = (cyc_func_t)deadman;
@@ -1892,9 +1996,6 @@ deadman_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
	 * more likely that only one CPU will panic in case of a
	 * timeout. This is (strictly speaking) an aesthetic, not a
	 * technical consideration.
-	 *
-	 * The interval must be one second in accordance with the
-	 * code in deadman() above to increase lbolt during panic.
	 */
	when->cyt_when = cpu->cpu_id * (NANOSEC / NCPU);
	when->cyt_interval = NANOSEC;
@@ -2184,3 +2285,226 @@ calcloadavg(int nrun, uint64_t *hp_ave)
		hp_ave[i] += ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
	}
}
+
+/*
+ * lbolt_hybrid() is used by ddi_get_lbolt() and ddi_get_lbolt64() to
+ * calculate the value of lbolt according to the current mode. In the event
+ * driven mode (the default), lbolt is calculated by dividing the current hires
+ * time by the number of nanoseconds per clock tick. In the cyclic driven mode
+ * an internal variable is incremented at each firing of the lbolt cyclic
+ * and returned by lbolt_cyclic_driven().
+ *
+ * The system will transition from event to cyclic driven mode when the number
+ * of calls to lbolt_event_driven() exceeds the (per CPU) threshold within a
+ * window of time. It does so by reprograming lbolt_cyclic from CY_INFINITY to
+ * nsec_per_tick. The lbolt cyclic will remain ON while at least one CPU is
+ * causing enough activity to cross the thresholds.
+ */
+static int64_t
+lbolt_bootstrap(void)
+{
+	return (0);
+}
+
+/* ARGSUSED */
+uint_t
+lbolt_ev_to_cyclic(caddr_t arg1, caddr_t arg2)
+{
+	hrtime_t ts, exp;
+	int ret;
+
+	ASSERT(lbolt_hybrid != lbolt_cyclic_driven);
+
+	kpreempt_disable();
+
+	ts = gethrtime();
+	lb_info->lbi_internal = (ts/nsec_per_tick);
+
+	/*
+	 * Align the next expiration to a clock tick boundary.
+	 */
+	exp = ts + nsec_per_tick - 1;
+	exp = (exp/nsec_per_tick) * nsec_per_tick;
+
+	ret = cyclic_reprogram(lb_info->lbi_cyclic_id, exp);
+	ASSERT(ret);
+
+	lbolt_hybrid = lbolt_cyclic_driven;
+	lb_info->lbi_cyc_deactivate = B_FALSE;
+	lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
+
+	kpreempt_enable();
+
+	ret = atomic_dec_32_nv(&lb_info->lbi_token);
+	ASSERT(ret == 0);
+
+	return (1);
+}
+
+int64_t
+lbolt_event_driven(void)
+{
+	hrtime_t ts;
+	int64_t lb;
+	int ret, cpu = CPU->cpu_seqid;
+
+	ts = gethrtime();
+	ASSERT(ts > 0);
+
+	ASSERT(nsec_per_tick > 0);
+	lb = (ts/nsec_per_tick);
+
+	/*
+	 * Switch to cyclic mode if the number of calls to this routine
+	 * has reached the threshold within the interval.
+	 */
+	if ((lb - lb_cpu[cpu].lbc_cnt_start) < lb_info->lbi_thresh_interval) {
+
+		if (--lb_cpu[cpu].lbc_counter == 0) {
+			/*
+			 * Reached the threshold within the interval, reset
+			 * the usage statistics.
+			 */
+			lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
+			lb_cpu[cpu].lbc_cnt_start = lb;
+
+			/*
+			 * Make sure only one thread reprograms the
+			 * lbolt cyclic and changes the mode.
+			 */
+			if (panicstr == NULL &&
+			    atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
+
+				if (lbolt_hybrid == lbolt_cyclic_driven) {
+					ret = atomic_dec_32_nv(
+					    &lb_info->lbi_token);
+					ASSERT(ret == 0);
+					return (lb);
+				}
+
+				lbolt_softint_post();
+			}
+		}
+	} else {
+		/*
+		 * Exceeded the interval, reset the usage statistics.
+		 */
+		lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
+		lb_cpu[cpu].lbc_cnt_start = lb;
+	}

+	ASSERT(lb >= lb_info->lbi_debug_time);
+
+	return (lb - lb_info->lbi_debug_time);
+}
+
+int64_t
+lbolt_cyclic_driven(void)
+{
+	int64_t lb = lb_info->lbi_internal;
+	int cpu = CPU->cpu_seqid;
+
+	if ((lb - lb_cpu[cpu].lbc_cnt_start) < lb_info->lbi_thresh_interval) {
+
+		if (lb_cpu[cpu].lbc_counter == 0)
+			/*
+			 * Reached the threshold within the interval,
+			 * prevent the lbolt cyclic from turning itself
+			 * off.
+			 */
+			lb_info->lbi_cyc_deactivate = B_FALSE;
+		else
+			lb_cpu[cpu].lbc_counter--;
+	} else {
+		/*
+		 * Only reset the usage statistics when the interval has
+		 * exceeded.
+		 */
+		lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
+		lb_cpu[cpu].lbc_cnt_start = lb;
+	}
+
+	ASSERT(lb >= lb_info->lbi_debug_time);
+
+	return (lb - lb_info->lbi_debug_time);
+}
+
+/*
+ * The lbolt_cyclic() routine will fire at a nsec_per_tick rate to satisfy
+ * performance needs of ddi_get_lbolt() and ddi_get_lbolt64() consumers.
+ * It is inactive by default, and will be activated when switching from event
+ * to cyclic driven lbolt. The cyclic will turn itself off unless signaled
+ * by lbolt_cyclic_driven().
+ */
+static void
+lbolt_cyclic(void)
+{
+	int ret;
+
+	lb_info->lbi_internal++;
+
+	if (!lbolt_cyc_only) {
+
+		if (lb_info->lbi_cyc_deactivate) {
+			/*
+			 * Switching from cyclic to event driven mode.
+			 */
+			if (atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
+
+				if (lbolt_hybrid == lbolt_event_driven) {
+					ret = atomic_dec_32_nv(
+					    &lb_info->lbi_token);
+					ASSERT(ret == 0);
+					return;
+				}
+
+				kpreempt_disable();
+
+				lbolt_hybrid = lbolt_event_driven;
+				ret = cyclic_reprogram(lb_info->lbi_cyclic_id,
+				    CY_INFINITY);
+				ASSERT(ret);
+
+				kpreempt_enable();
+
+				ret = atomic_dec_32_nv(&lb_info->lbi_token);
+				ASSERT(ret == 0);
+			}
+		}
+
+		/*
+		 * The lbolt cyclic should not try to deactivate itself before
+		 * the sampling period has elapsed.
+		 */
+		if (lb_info->lbi_internal - lb_info->lbi_cyc_deac_start >=
+		    lb_info->lbi_thresh_interval) {
+			lb_info->lbi_cyc_deactivate = B_TRUE;
+			lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
+		}
+	}
+}
+
+/*
+ * Since the lbolt service was historically cyclic driven, it must be 'stopped'
+ * when the system drops into the kernel debugger. lbolt_debug_entry() is
+ * called by the KDI system claim callbacks to record a hires timestamp at
+ * debug enter time. lbolt_debug_return() is called by the sistem release
+ * callbacks to account for the time spent in the debugger. The value is then
+ * accumulated in the lb_info structure and used by lbolt_event_driven() and
+ * lbolt_cyclic_driven(), as well as the mdb_get_lbolt() routine.
+ */
+void
+lbolt_debug_entry(void)
+{
+	lb_info->lbi_debug_ts = gethrtime();
+}
+
+void
+lbolt_debug_return(void)
+{
+	if (nsec_per_tick > 0)
+		lb_info->lbi_debug_time +=
+		    ((gethrtime() - lb_info->lbi_debug_ts)/nsec_per_tick);
+
+	lb_info->lbi_debug_ts = 0;
+}
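Taken together, the new routines implement the hybrid scheme the block comment describes: ddi_get_lbolt() and ddi_get_lbolt64() call through the lbolt_hybrid pointer, which stays on lbolt_event_driven() (gethrtime() divided by nsec_per_tick) until per-CPU call counts cross a threshold within a sampling window, at which point a softint reprograms the lbolt cyclic and flips the pointer to lbolt_cyclic_driven(). Below is a deliberately simplified, single-threaded, user-level sketch of that dispatch; all names (my_lbolt, TICK_NSEC, THRESH_*) are illustrative, and the kernel's per-CPU counters, atomic token, softint and cyclic reprogramming are omitted:

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define	TICK_NSEC	10000000LL	/* 10 ms tick, i.e. nsec_per_tick at hz = 100 */
#define	THRESH_CALLS	100		/* calls within one window that force cyclic mode */
#define	THRESH_WINDOW	100		/* window length, in ticks */

static int64_t (*my_lbolt)(void);	/* analogue of lbolt_hybrid */
static int64_t cyclic_ticks;		/* analogue of lbi_internal */
static int64_t window_start;		/* start of the current sampling window */
static int calls_left = THRESH_CALLS;

static int64_t my_lbolt_cyclic(void);

/* Derive a tick count from a hires clock, as lbolt_event_driven() does. */
static int64_t
hires_ticks(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return (((int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec) / TICK_NSEC);
}

/* Event driven mode: compute the value on demand and count usage per window. */
static int64_t
my_lbolt_event(void)
{
	int64_t lb = hires_ticks();

	if (lb - window_start < THRESH_WINDOW) {
		if (--calls_left == 0) {
			/* Heavy use: hand future calls to the "cyclic" counter. */
			cyclic_ticks = lb;
			my_lbolt = my_lbolt_cyclic;
		}
	} else {
		/* Window expired: reset the usage statistics. */
		window_start = lb;
		calls_left = THRESH_CALLS;
	}
	return (lb);
}

/* Cyclic mode: just read the counter a periodic tick would increment. */
static int64_t
my_lbolt_cyclic(void)
{
	return (cyclic_ticks);
}

int
main(void)
{
	my_lbolt = my_lbolt_event;

	for (int i = 0; i < 200; i++)
		(void) my_lbolt();

	(void) printf("mode after 200 calls: %s\n",
	    my_lbolt == my_lbolt_cyclic ? "cyclic" : "event");
	return (0);
}
```

The trade-off this sketch mirrors is the one the comment spells out: computing the value on demand lets the tick stay off when nobody asks for lbolt, while switching heavy consumers to a cyclic-maintained counter keeps each call as cheap as the old read of the lbolt variable.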