diff options
author | Eric Saxe <Eric.Saxe@Sun.COM> | 2009-10-14 14:54:01 -0700 |
---|---|---|
committer | Eric Saxe <Eric.Saxe@Sun.COM> | 2009-10-14 14:54:01 -0700 |
commit | 113b131b48d0e653a91612bb4461ea90adbd849a (patch) | |
tree | bd491e683eba0ea5803e3c4c9e5c7076aa1ee211 /usr/src | |
parent | 628680125482a37a45c692030029fd62a600f914 (diff) | |
download | illumos-gate-113b131b48d0e653a91612bb4461ea90adbd849a.tar.gz |
6883663 CPUs observed not downclocking when system is otherwise idle
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/uts/common/os/cpu_pm.c | 227 | ||||
-rw-r--r-- | usr/src/uts/common/sys/cpu_pm.h | 14 | ||||
-rw-r--r-- | usr/src/uts/common/sys/time.h | 3 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/mp_machdep.c | 8 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/timestamp.c | 18 | ||||
-rw-r--r-- | usr/src/uts/intel/ia32/os/archdep.c | 11 | ||||
-rw-r--r-- | usr/src/uts/intel/sys/archsystm.h | 1 | ||||
-rw-r--r-- | usr/src/uts/sun4/io/cbe.c | 6 |
8 files changed, 131 insertions, 157 deletions
diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c index a439db0a7c..324e4168d7 100644 --- a/usr/src/uts/common/os/cpu_pm.c +++ b/usr/src/uts/common/os/cpu_pm.c @@ -25,6 +25,7 @@ #include <sys/cpu_pm.h> #include <sys/cmn_err.h> +#include <sys/time.h> #include <sys/sdt.h> /* @@ -69,7 +70,7 @@ * * Avoiding state thrashing in the presence of transient periods of utilization * and idleness while still being responsive to non-transient periods is key. - * The power manager implmeents several "governors" that are used to throttle + * The power manager implements a "governor" that is used to throttle * state transitions when a significant amount of transient idle or transient * work is detected. * @@ -81,6 +82,28 @@ * wait for an event elsewhere in the system. Where the idle period is short * enough, the overhead associated with making the state transition doesn't * justify the power savings. + * + * The following is the state machine for the governor implemented by + * cpupm_utilization_event(): + * + * ----->---tw---->----- + * / \ + * (I)-<-ti-<- -<-ntw-<(W) + * | \ / | + * \ \ / / + * >-nti/rm->(D)--->-tw->- + * Key: + * + * States + * - (D): Default (ungoverned) + * - (W): Transient work governed + * - (I): Transient idle governed + * State Transitions + * - tw: transient work + * - ti: transient idleness + * - ntw: non-transient work + * - nti: non-transient idleness + * - rm: thread remain event */ static cpupm_domain_t *cpupm_domains = NULL; @@ -109,39 +132,35 @@ hrtime_t cpupm_ti_predict_interval; /* * Number of mispredictions after which future transitions will be governed. */ -int cpupm_mispredict_thresh = 2; +int cpupm_mispredict_thresh = 4; /* * Likewise, the number of mispredicted governed transitions after which the * governor will be removed. */ -int cpupm_mispredict_gov_thresh = 10; +int cpupm_mispredict_gov_thresh = 4; /* - * The transient work and transient idle prediction intervals are initialized - * to be some multiple of the amount of time it takes to transition a power - * domain from the highest to the lowest power state, and back again, which - * is measured. - * - * The default values of those multiples are specified here. Tuning them higher - * will result in the transient work, and transient idle governors being used - * more aggresively, which limits the frequency of state transitions at the - * expense of performance and power savings, respectively. + * The transient work and transient idle prediction intervals are specified + * here. Tuning them higher will result in the transient work, and transient + * idle governors being used more aggresively, which limits the frequency of + * state transitions at the expense of performance and power savings, + * respectively. The intervals are specified in nanoseconds. */ -#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600 -#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25 - /* - * Number of high=>low=>high measurements performed, of which the average - * is taken. + * 400 usec + */ +#define CPUPM_DEFAULT_TI_INTERVAL 400000 +/* + * 400 usec */ -#define CPUPM_BENCHMARK_ITERS 5 +#define CPUPM_DEFAULT_TW_INTERVAL 400000 -int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE; -int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE; +hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL; +hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL; -static int cpupm_governor_initialize(void); +static void cpupm_governor_initialize(void); static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t); cpupm_policy_t @@ -201,23 +220,15 @@ cpupm_set_policy(cpupm_policy_t new_policy) break; } - pause_cpus(NULL); /* - * Attempt to initialize the governor parameters the first - * time through. + * Initialize the governor parameters the first time through. */ if (gov_init == 0) { - result = cpupm_governor_initialize(); - if (result == 0) { - gov_init = 1; - } else { - /* - * Failed to initialize the governor parameters - */ - start_cpus(); - break; - } + cpupm_governor_initialize(); + gov_init = 1; } + + pause_cpus(NULL); cpupm_policy = CPUPM_POLICY_ELASTIC; start_cpus(); @@ -398,7 +409,7 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * If the utilization has dropped to zero, then transition the * domain to its lowest power state. * - * Statistics are maintained to implement governors to reduce state + * Statistics are maintained to implement a governor to reduce state * transitions resulting from either transient work, or periods of * transient idleness on the domain. */ @@ -415,8 +426,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) { new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; - if (dom->cpd_tw_governed == B_TRUE) { - dom->cpd_tw_governed = B_FALSE; + if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { + dom->cpd_governor = CPUPM_GOV_DISENGAGED; dom->cpd_tw = 0; } } @@ -437,10 +448,17 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, /* * There's non-zero utilization, and the domain is * running in the lower power state. Before we - * consider raising power, perform some book keeping - * for the transient idle governor. + * consider raising power, check if the preceeding + * idle period was transient in duration. + * + * If the domain is already transient work governed, + * then we don't bother maintaining transient idle + * statistics, as the presence of enough transient work + * can also make the domain frequently transiently idle. + * In this case, we still want to remain transient work + * governed. */ - if (dom->cpd_ti_governed == B_FALSE) { + if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) { if ((now - last) < cpupm_ti_predict_interval) { /* * We're raising the domain power and @@ -448,18 +466,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * this a mispredicted power state * transition due to a transient * idle period. - * - * Note: The presence of enough - * transient work across the domain can - * result in frequent transient idle - * periods. We don't want the ti - * governor being installed as a side - * effect of transient work, so the ti - * governor is left alone if the tw - * governor is already installed. */ - if (dom->cpd_tw_governed == B_FALSE && - ++dom->cpd_ti >= + if (++dom->cpd_ti >= cpupm_mispredict_thresh) { /* * There's enough transient @@ -467,7 +475,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * justify governing future * lowering requests. */ - dom->cpd_ti_governed = B_TRUE; + dom->cpd_governor = + CPUPM_GOV_TRANS_IDLE; dom->cpd_ti = 0; DTRACE_PROBE1( cpupm__ti__governed, @@ -481,7 +490,7 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, dom->cpd_ti = 0; } } - if (dom->cpd_tw_governed == B_TRUE) { + if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { /* * Raise requests are governed due to * transient work. @@ -489,22 +498,6 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, DTRACE_PROBE1(cpupm__raise__governed, cpupm_domain_t *, dom); - /* - * It's likely that we'll be governed for a - * while. If the transient idle governor is - * also in place, examine the preceeding idle - * interval to see if that still makes sense. - */ - if (dom->cpd_ti_governed == B_TRUE && - ((now - last) >= - cpupm_ti_predict_interval)) { - if (++dom->cpd_ti >= - cpupm_mispredict_gov_thresh) { - dom->cpd_ti_governed = - B_FALSE; - dom->cpd_ti = 0; - } - } return; } /* @@ -521,7 +514,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * perform some book keeping if the last lowering * request was governed. */ - if (dom->cpd_ti_governed == B_TRUE) { + if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) { + if ((now - last) >= cpupm_ti_predict_interval) { /* * The domain is transient idle @@ -535,7 +529,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * idle periods to justify * removing the governor. */ - dom->cpd_ti_governed = B_FALSE; + dom->cpd_governor = + CPUPM_GOV_DISENGAGED; dom->cpd_ti = 0; DTRACE_PROBE1( cpupm__ti__ungoverned, @@ -570,7 +565,7 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * perform some book keeping for the transient work * governor. */ - if (dom->cpd_tw_governed == B_FALSE) { + if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) { if ((now - last) < cpupm_tw_predict_interval) { /* * We're lowering the domain power and @@ -581,12 +576,13 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, if (++dom->cpd_tw >= cpupm_mispredict_thresh) { /* - * There's enough transient idle + * There's enough transient work * transitions to justify - * governing future lowering + * governing future raise * requests. */ - dom->cpd_tw_governed = B_TRUE; + dom->cpd_governor = + CPUPM_GOV_TRANS_WORK; dom->cpd_tw = 0; DTRACE_PROBE1( cpupm__tw__governed, @@ -600,7 +596,7 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, dom->cpd_tw = 0; } } - if (dom->cpd_ti_governed == B_TRUE) { + if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) { /* * Lowering requests are governed due to * transient idleness. @@ -608,22 +604,6 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, DTRACE_PROBE1(cpupm__lowering__governed, cpupm_domain_t *, dom); - /* - * It's likely that we'll be governed for a - * while. If the transient work governor is - * also in place, examine the preceeding busy - * interval to see if that still makes sense. - */ - if (dom->cpd_tw_governed == B_TRUE && - ((now - last) >= - cpupm_tw_predict_interval)) { - if (++dom->cpd_tw >= - cpupm_mispredict_gov_thresh) { - dom->cpd_tw_governed = - B_FALSE; - dom->cpd_tw = 0; - } - } return; } @@ -642,7 +622,7 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * perform some book keeping if the last raising * request was governed. */ - if (dom->cpd_tw_governed == B_TRUE) { + if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) { if ((now - last) >= cpupm_tw_predict_interval) { /* * The domain is transient work @@ -656,7 +636,8 @@ cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, * work to justify removing * the governor. */ - dom->cpd_tw_governed = B_FALSE; + dom->cpd_governor = + CPUPM_GOV_DISENGAGED; dom->cpd_tw = 0; DTRACE_PROBE1( cpupm__tw__ungoverned, @@ -741,62 +722,18 @@ cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level) } /* - * Benchmark some power state transitions and use the transition latencies as - * a basis for initializing parameters for the transient idle and transient - * work governors. - * - * Returns 0 on success or -1 if the governor parameters could not be - * initialized. + * Initialize the parameters for the transience governor state machine */ -static int +static void cpupm_governor_initialize(void) { - cpu_t *cp = CPU; - cpupm_domain_t *dom; - cpupm_state_t *low, *high; - id_t did; - hrtime_t start, delta, deltas = 0; - int iterations; - - did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE); - if (did == CPUPM_NO_DOMAIN) - return (-1); - - dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE); - if (dom == NULL) - return (-1); - - low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER]; - high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; - - for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) { - - /* - * Measure the amount of time it takes to transition the - * domain down to the lowest, and back to the highest power - * state. - */ - start = gethrtime_unscaled(); - (void) cpupm_change_state(cp, dom, low); - (void) cpupm_change_state(cp, dom, high); - delta = gethrtime_unscaled() - start; - - DTRACE_PROBE1(cpupm__benchmark__latency, - hrtime_t, delta); - - deltas += delta; - } - /* - * Figure the average latency, and tune the transient work and - * transient idle prediction intervals accordingly. + * The default prediction intervals are specified in nanoseconds. + * Convert these to the equivalent in unscaled hrtime, which is the + * format of the timestamps passed to cpupm_utilization_event() */ - delta = deltas / iterations; - - cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple; - cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple; - - return (0); + cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval); + cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval); } /* diff --git a/usr/src/uts/common/sys/cpu_pm.h b/usr/src/uts/common/sys/cpu_pm.h index 3ec3bcd68d..124722f82c 100644 --- a/usr/src/uts/common/sys/cpu_pm.h +++ b/usr/src/uts/common/sys/cpu_pm.h @@ -65,6 +65,15 @@ typedef enum cpupm_state_name { } cpupm_state_name_t; /* + * Possible states for the domain's transience governor + */ +typedef enum cpupm_gov_state_t { + CPUPM_GOV_DISENGAGED, + CPUPM_GOV_TRANS_IDLE, /* Transient idleness, lowerings disabled */ + CPUPM_GOV_TRANS_WORK /* Transient work, raises disabled */ +} cpupm_gov_state_t; + +/* * Utilization events delivered by the dispatcher. */ typedef enum cpupm_util_event { @@ -95,10 +104,9 @@ typedef struct cpupm_domain { cpupm_state_t *cpd_named_states[CPUPM_STATE_NAMES]; hrtime_t cpd_last_raise; /* Last raise request time */ hrtime_t cpd_last_lower; /* last lower request time */ - int cpd_tw; /* transient work history */ int cpd_ti; /* transient idle history */ - boolean_t cpd_ti_governed; /* transient idle governor */ - boolean_t cpd_tw_governed; /* transient work governor */ + int cpd_tw; /* transient work history */ + cpupm_gov_state_t cpd_governor; /* transience governor */ struct cpupm_domain *cpd_next; } cpupm_domain_t; diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 99e4c6bf1f..860ec10383 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -9,7 +9,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -344,6 +344,7 @@ extern hrtime_t gethrtime_unscaled(void); extern hrtime_t gethrtime_max(void); extern hrtime_t gethrtime_waitfree(void); extern void scalehrtime(hrtime_t *); +extern uint64_t unscalehrtime(hrtime_t); extern void gethrestime(timespec_t *); extern time_t gethrestime_sec(void); extern void gethrestime_lasttick(timespec_t *); diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 7a454e1921..973d7ef13f 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -85,6 +85,7 @@ static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *, static void mach_notify_error(int level, char *errmsg); static hrtime_t dummy_hrtime(void); static void dummy_scalehrtime(hrtime_t *); +static uint64_t dummy_unscalehrtime(hrtime_t); void cpu_idle(void); static void cpu_wakeup(cpu_t *, int); #ifndef __xpv @@ -133,6 +134,7 @@ void (*psm_enable_intr)(int) = mp_enable_intr; hrtime_t (*gethrtimef)(void) = dummy_hrtime; hrtime_t (*gethrtimeunscaledf)(void) = dummy_hrtime; void (*scalehrtimef)(hrtime_t *) = dummy_scalehrtime; +uint64_t (*unscalehrtimef)(hrtime_t) = dummy_unscalehrtime; int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq; void (*gethrestimef)(timestruc_t *) = pc_gethrestime; void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL; @@ -372,6 +374,12 @@ static void dummy_scalehrtime(hrtime_t *ticks) {} +static uint64_t +dummy_unscalehrtime(hrtime_t nsecs) +{ + return ((uint64_t)nsecs); +} + /* * Supports Deep C-State power saving idle loop. */ diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c index 02b4cb529c..380ed6785f 100644 --- a/usr/src/uts/i86pc/os/timestamp.c +++ b/usr/src/uts/i86pc/os/timestamp.c @@ -96,6 +96,7 @@ #define NSEC_SHIFT 5 static uint_t nsec_scale; +static uint_t nsec_unscale; /* * These two variables used to be grouped together inside of a structure that @@ -341,6 +342,20 @@ tsc_gethrtimeunscaled(void) return (tsc); } +/* + * Convert a nanosecond based timestamp to tsc + */ +uint64_t +tsc_unscalehrtime(hrtime_t nsec) +{ + hrtime_t tsc; + + if (tsc_gethrtime_enable) { + TSC_CONVERT(nsec, tsc, nsec_unscale); + return (tsc); + } + return ((uint64_t)nsec); +} /* Convert a tsc timestamp to nanoseconds */ void @@ -603,6 +618,8 @@ tsc_hrtimeinit(uint64_t cpu_freq_hz) ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT)); nsec_scale = (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz); + nsec_unscale = + (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC); flags = clear_int_flag(); tsc = tsc_read(); @@ -612,6 +629,7 @@ tsc_hrtimeinit(uint64_t cpu_freq_hz) gethrtimef = tsc_gethrtime; gethrtimeunscaledf = tsc_gethrtimeunscaled; scalehrtimef = tsc_scalehrtime; + unscalehrtimef = tsc_unscalehrtime; hrtime_tick = tsc_tick; gethrtime_hires = 1; /* diff --git a/usr/src/uts/intel/ia32/os/archdep.c b/usr/src/uts/intel/ia32/os/archdep.c index f4bbd80420..2664a7deea 100644 --- a/usr/src/uts/intel/ia32/os/archdep.c +++ b/usr/src/uts/intel/ia32/os/archdep.c @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/types.h> #include <sys/vmparam.h> @@ -1353,6 +1350,12 @@ scalehrtime(hrtime_t *hrt) scalehrtimef(hrt); } +uint64_t +unscalehrtime(hrtime_t nsecs) +{ + return (unscalehrtimef(nsecs)); +} + void gethrestime(timespec_t *tp) { diff --git a/usr/src/uts/intel/sys/archsystm.h b/usr/src/uts/intel/sys/archsystm.h index 92438659c0..2670e2b509 100644 --- a/usr/src/uts/intel/sys/archsystm.h +++ b/usr/src/uts/intel/sys/archsystm.h @@ -169,6 +169,7 @@ extern void switch_sp_and_call(void *, void (*)(uint_t, uint_t), uint_t, extern hrtime_t (*gethrtimef)(void); extern hrtime_t (*gethrtimeunscaledf)(void); extern void (*scalehrtimef)(hrtime_t *); +extern uint64_t (*unscalehrtimef)(hrtime_t); extern void (*gethrestimef)(timestruc_t *); extern void av_dispatch_softvect(uint_t); diff --git a/usr/src/uts/sun4/io/cbe.c b/usr/src/uts/sun4/io/cbe.c index b08cc9a92d..220fbb76bb 100644 --- a/usr/src/uts/sun4/io/cbe.c +++ b/usr/src/uts/sun4/io/cbe.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/time.h> #include <sys/systm.h> @@ -56,7 +54,7 @@ hrtime2tick(hrtime_t ts) return (q * sys_tick_freq + ((r * sys_tick_freq) / NANOSEC)); } -static uint64_t +uint64_t unscalehrtime(hrtime_t ts) { uint64_t unscale = 0; |