author	Eric Saxe <Eric.Saxe@Sun.COM>	2009-02-25 21:04:18 -0800
committer	Eric Saxe <Eric.Saxe@Sun.COM>	2009-02-25 21:04:18 -0800
commit	0e7515250c8395f368aa45fb9acae7c4f8f8b786 (patch)
tree	5c3abde4ff53a950ad424ce362fd793369c06872 /usr/src/uts/common/os/cpu_pm.c
parent	9a5d73e03cd3312ddb571a748c40a63c58bd66e5 (diff)
download	illumos-gate-0e7515250c8395f368aa45fb9acae7c4f8f8b786.tar.gz
PSARC 2008/777 cpupm keyword mode extensions
PSARC 2008/663 CPU Deep Idle Keyword
6567156 bring CPU power awareness to the dispatcher
6700904 deeper C-State support required on follow-ons to Intel Penryn processor generation microarchitecture
6805661 cmt_root may contain duplicates on UMA systems
--HG--
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c => usr/src/uts/i86pc/io/cpudrv_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c => usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c => usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c => usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
rename : usr/src/uts/i86pc/os/cpupm.c => usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c => usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
rename : usr/src/uts/i86pc/io/cpudrv/pwrnow.c => usr/src/uts/i86pc/os/cpupm/pwrnow.c
rename : usr/src/uts/i86pc/io/cpudrv/speedstep.c => usr/src/uts/i86pc/os/cpupm/speedstep.c
rename : usr/src/uts/i86pc/sys/cpupm.h => usr/src/uts/i86pc/sys/cpupm_mach.h
rename : usr/src/uts/i86pc/sys/cpudrv_throttle.h => usr/src/uts/i86pc/sys/cpupm_throttle.h
Diffstat (limited to 'usr/src/uts/common/os/cpu_pm.c')
-rw-r--r--	usr/src/uts/common/os/cpu_pm.c	840
1 file changed, 840 insertions(+), 0 deletions(-)
diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c
new file mode 100644
index 0000000000..848907af1d
--- /dev/null
+++ b/usr/src/uts/common/os/cpu_pm.c
@@ -0,0 +1,840 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/cmn_err.h>
+#include <sys/sdt.h>
+
+/*
+ * Solaris Event Based CPU Power Manager
+ *
+ * This file implements platform independent event based CPU power management.
+ * When CPUs are configured into the system, the CMT scheduling subsystem will
+ * query the platform to determine if the CPU belongs to any power management
+ * domains: sets of CPUs that share power management states.
+ *
+ * Active Power Management domains represent groups of CPUs across which the
+ * Operating System can request speed changes (which may in turn result
+ * in voltage changes). This allows the operating system to trade off
+ * performance for power savings.
+ *
+ * Idle Power Management domains can enter power savings states when they are
+ * unutilized. These states allow the Operating System to trade off power
+ * for performance (in the form of latency to transition from the idle state
+ * to an active one).
+ *
+ * For each active and idle power domain the CMT subsystem instantiates, a
+ * cpupm_domain_t structure is created. As the dispatcher schedules threads
+ * to run on the system's CPUs, it will also track the utilization of the
+ * enumerated power domains. Significant changes in utilization will result
+ * in the dispatcher sending the power manager events that relate to the
+ * utilization of the power domain. The power manager receives the events,
+ * and in the context of the policy objectives in force, may decide to request
+ * the domain's power/performance state be changed.
+ *
+ * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
+ * manager will request the CPUs in the domain run at their fastest (and most
+ * power consuming) state. When the domain becomes idle (utilization at zero),
+ * the power manager will request that the CPUs run at a speed that saves the
+ * most power.
+ *
+ * The advantage of this scheme is that the CPU power manager, working with
+ * the dispatcher, can be extremely responsive to changes in utilization:
+ * optimizing for performance in the presence of utilization, and for power
+ * savings in the presence of idleness. Such close collaboration with the
+ * dispatcher has other benefits that will play out in the form of more
+ * sophisticated power / performance policy in the near future.
+ *
+ * Avoiding state thrashing in the presence of transient periods of utilization
+ * and idleness while still being responsive to non-transient periods is key.
+ * The power manager implements several "governors" that are used to throttle
+ * state transitions when a significant amount of transient idle or transient
+ * work is detected.
+ *
+ * Kernel background activity (e.g. taskq threads) is by far the most common
+ * form of transient utilization. Ungoverned in the face of this utilization,
+ * hundreds of state transitions per second would result on an idle system.
+ *
+ * Transient idleness is common when a thread briefly yields the CPU to
+ * wait for an event elsewhere in the system. Where the idle period is short
+ * enough, the overhead associated with making the state transition doesn't
+ * justify the power savings.
+ */
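+
+/*
+ * To make the flow concrete, a minimal sketch of the call sequence (the
+ * actual dispatcher hook points live elsewhere; this ordering is
+ * illustrative only):
+ *
+ *	dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
+ *	...
+ *	now = gethrtime_unscaled();
+ *	cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);
+ *		(may lower the domain's power state)
+ *	...
+ *	now = gethrtime_unscaled();
+ *	cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE);
+ *		(may raise the domain's power state)
+ */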
+
+static cpupm_domain_t *cpupm_domains = NULL;
+
+/*
+ * The initial, uninitialized state of CPU power management is disabled.
+ */
+cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
+
+/*
+ * Periods of utilization lasting less than this time interval are characterized
+ * as transient. State changes associated with transient work are considered
+ * to be mispredicted. That is, it's not worth raising and lowering power
+ * states when the utilization lasts for less than this interval.
+ */
+hrtime_t cpupm_tw_predict_interval;
+
+/*
+ * Periods of idleness lasting less than this time interval are characterized
+ * as transient. State changes associated with transient idle are considered
+ * to be mispredicted. That is, it's not worth lowering and raising power
+ * states where the idleness lasts for less than this interval.
+ */
+hrtime_t cpupm_ti_predict_interval;
+
+/*
+ * Number of mispredictions after which future transitions will be governed.
+ */
+int cpupm_mispredict_thresh = 2;
+
+/*
+ * Likewise, the number of mispredicted governed transitions after which the
+ * governor will be removed.
+ */
+int cpupm_mispredict_gov_thresh = 10;
+
+/*
+ * The transient work and transient idle prediction intervals are initialized
+ * to be some multiple of the measured amount of time it takes to transition
+ * a power domain from the highest to the lowest power state, and back again.
+ *
+ * The default values of those multiples are specified here. Tuning them
+ * higher will result in the transient work and transient idle governors
+ * being used more aggressively, which limits the frequency of state
+ * transitions at the expense of performance and power savings, respectively.
+ */
+#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
+#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
+
+/*
+ * Number of high=>low=>high measurements performed, the average of which
+ * is taken.
+ */
+#define CPUPM_BENCHMARK_ITERS 5
+
+int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
+int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
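+
+/*
+ * As a worked example (the latency figure is hypothetical), if the measured
+ * high=>low=>high transition latency averages the equivalent of 100us, the
+ * default multiples above yield:
+ *
+ *	cpupm_ti_predict_interval = 100us * 600 = 60ms
+ *	cpupm_tw_predict_interval = 100us * 25  = 2.5ms
+ *
+ * Idle periods shorter than 60ms and busy periods shorter than 2.5ms would
+ * then be considered transient.
+ */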
+
+
+static int cpupm_governor_initialize(void);
+static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
+
+cpupm_policy_t
+cpupm_get_policy(void)
+{
+ return (cpupm_policy);
+}
+
+int
+cpupm_set_policy(cpupm_policy_t new_policy)
+{
+ static int gov_init = 0;
+ int result = 0;
+
+ mutex_enter(&cpu_lock);
+ if (new_policy == cpupm_policy) {
+ mutex_exit(&cpu_lock);
+ return (result);
+ }
+
+ /*
+ * Pausing CPUs causes a high priority thread to be scheduled
+ * on all other CPUs (besides the current one). This locks out
+ * other CPUs from making CPUPM state transitions.
+ */
+ switch (new_policy) {
+ case CPUPM_POLICY_DISABLED:
+ pause_cpus(NULL);
+ cpupm_policy = CPUPM_POLICY_DISABLED;
+ start_cpus();
+
+ result = cmt_pad_disable(PGHW_POW_ACTIVE);
+
+ /*
+ * Once PAD has been enabled, it should always be possible
+ * to disable it.
+ */
+ ASSERT(result == 0);
+
+ /*
+ * Bring all the active power domains to the maximum
+ * performance state.
+ */
+ cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
+ CPUPM_STATE_MAX_PERF);
+
+ break;
+ case CPUPM_POLICY_ELASTIC:
+
+ result = cmt_pad_enable(PGHW_POW_ACTIVE);
+ if (result < 0) {
+ /*
+ * Failed to enable PAD across the active power
+ * domains, which may well be because none were
+ * enumerated.
+ */
+ break;
+ }
+
+ pause_cpus(NULL);
+ /*
+ * Attempt to initialize the governor parameters the first
+ * time through.
+ */
+ if (gov_init == 0) {
+ result = cpupm_governor_initialize();
+ if (result == 0) {
+ gov_init = 1;
+ } else {
+ /*
+ * Failed to initialize the governor parameters
+ */
+ start_cpus();
+ break;
+ }
+ }
+ cpupm_policy = CPUPM_POLICY_ELASTIC;
+ start_cpus();
+
+ break;
+ default:
+ cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
+ new_policy);
+ ASSERT(0);
+ break;
+ }
+ mutex_exit(&cpu_lock);
+
+ return (result);
+}
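+
+/*
+ * For example, a hypothetical administrative path might enable the elastic
+ * policy as follows (a sketch; the actual callers live elsewhere):
+ *
+ *	if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) != 0)
+ *		cmn_err(CE_NOTE, "CPUPM elastic policy unavailable");
+ */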
+
+/*
+ * Look for an existing power domain
+ */
+static cpupm_domain_t *
+cpupm_domain_find(id_t id, cpupm_dtype_t type)
+{
+	cpupm_domain_t *dom;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	dom = cpupm_domains;
+ while (dom != NULL) {
+ if (id == dom->cpd_id && type == dom->cpd_type)
+ return (dom);
+ dom = dom->cpd_next;
+ }
+ return (NULL);
+}
+
+/*
+ * Create a new domain
+ */
+static cpupm_domain_t *
+cpupm_domain_create(id_t id, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
+ dom->cpd_id = id;
+ dom->cpd_type = type;
+
+ /* Link into the known domain list */
+ dom->cpd_next = cpupm_domains;
+ cpupm_domains = dom;
+
+ return (dom);
+}
+
+static void
+cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
+{
+ /*
+	 * In the event we're enumerating because the domain's state
+ * configuration has changed, toss any existing states.
+ */
+ if (dom->cpd_nstates > 0) {
+ kmem_free(dom->cpd_states,
+ sizeof (cpupm_state_t) * dom->cpd_nstates);
+ dom->cpd_nstates = 0;
+ }
+
+ /*
+ * Query to determine the number of states, allocate storage
+ * large enough to hold the state information, and pass it back
+ * to the platform driver to complete the enumeration.
+ */
+ dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
+
+ if (dom->cpd_nstates == 0)
+ return;
+
+ dom->cpd_states =
+ kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
+ (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
+}
+
+/*
+ * Initialize the specified type of power domain on behalf of the CPU
+ */
+cpupm_domain_t *
+cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ /*
+ * Instantiate the domain if it doesn't already exist
+ * and enumerate its power states.
+ */
+ did = cpupm_domain_id(cp, type);
+ dom = cpupm_domain_find(did, type);
+ if (dom == NULL) {
+ dom = cpupm_domain_create(did, type);
+ cpupm_domain_state_enum(cp, dom);
+ }
+
+ /*
+ * Named state initialization
+ */
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+		 * For active power domains, the highest performance
+		 * state is defined as the first state returned from
+		 * the domain enumeration, and the lowest power state
+		 * as the last.
+ */
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[0];
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
+ &dom->cpd_states[dom->cpd_nstates - 1];
+
+ /*
+ * Begin by assuming CPU is running at the max perf state.
+ */
+ dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ }
+
+ return (dom);
+}
+
+/*
+ * Return the id associated with the given type of domain
+ * to which cp belongs
+ */
+id_t
+cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (cpupm_plat_domain_id(cp, type));
+}
+
+/*
+ * Initiate a state change for the specified domain on behalf of cp
+ */
+int
+cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
+{
+ if (cpupm_plat_change_state(cp, state) < 0)
+ return (-1);
+
+ DTRACE_PROBE2(cpupm__change__state,
+ cpupm_domain_t *, dom,
+ cpupm_state_t *, state);
+
+ dom->cpd_state = state;
+ return (0);
+}
+
+/*
+ * Interface into the CPU power manager to indicate a significant change
+ * in utilization of the specified active power domain
+ */
+void
+cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
+ cpupm_util_event_t event)
+{
+ cpupm_state_t *new_state = NULL;
+ hrtime_t last;
+
+ if (cpupm_policy == CPUPM_POLICY_DISABLED) {
+ return;
+ }
+
+ /*
+ * What follows is a simple elastic power state management policy.
+ *
+ * If the utilization has become non-zero, and the domain was
+	 * previously at its lowest power state, then transition it
+ * to the highest state in the spirit of "race to idle".
+ *
+ * If the utilization has dropped to zero, then transition the
+ * domain to its lowest power state.
+ *
+ * Statistics are maintained to implement governors to reduce state
+ * transitions resulting from either transient work, or periods of
+ * transient idleness on the domain.
+ */
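+	/*
+	 * In summary, the cases below implement (a simplified sketch;
+	 * governor bookkeeping is elided):
+	 *
+	 *	Event			At state	Action
+	 *	REMAIN_BUSY		low power	raise, drop tw governor
+	 *	BUSY_FROM_IDLE		low power	raise, unless tw governed
+	 *	IDLE_FROM_BUSY		max perf	lower, unless ti governed
+	 */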
+ switch (event) {
+ case CPUPM_DOM_REMAIN_BUSY:
+
+ /*
+ * We've received an event that the domain is running a thread
+	 * that's made it to the end of its time slice. If we are at
+ * low power, then raise it. If the transient work governor
+ * is engaged, then remove it.
+ */
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (dom->cpd_tw_governed == B_TRUE) {
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ break;
+
+ case CPUPM_DOM_BUSY_FROM_IDLE:
+ last = dom->cpd_last_lower;
+ dom->cpd_last_raise = now;
+
+ DTRACE_PROBE3(cpupm__raise__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * There's non-zero utilization, and the domain is
+ * running in the lower power state. Before we
+		 * consider raising power, perform some bookkeeping
+ * for the transient idle governor.
+ */
+ if (dom->cpd_ti_governed == B_FALSE) {
+ if ((now - last) < cpupm_ti_predict_interval) {
+ /*
+ * We're raising the domain power and
+ * we *just* lowered it. Consider
+ * this a mispredicted power state
+ * transition due to a transient
+ * idle period.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_thresh) {
+ /*
+ * There's enough transient
+ * idle transitions to
+ * justify governing future
+ * lowering requests.
+ */
+ dom->cpd_ti_governed = B_TRUE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * lowering.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ if (dom->cpd_tw_governed == B_TRUE) {
+ /*
+ * Raise requests are governed due to
+ * transient work.
+ */
+ DTRACE_PROBE1(cpupm__raise__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient idle governor is
+			 * also in place, examine the preceding idle
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_ti_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_ti_predict_interval)) {
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_ti_governed =
+ B_FALSE;
+ dom->cpd_ti = 0;
+ }
+ }
+ return;
+ }
+ /*
+ * Prepare to transition to the higher power state
+ */
+ new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * Utilization is non-zero, and we're already running
+ * in the higher power state. Take this opportunity to
+		 * perform some bookkeeping if the last lowering
+ * request was governed.
+ */
+ if (dom->cpd_ti_governed == B_TRUE) {
+ if ((now - last) >= cpupm_ti_predict_interval) {
+ /*
+ * The domain is transient idle
+ * governed, and we mispredicted
+ * governing the last lowering request.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * idle periods to justify
+ * removing the governor.
+ */
+ dom->cpd_ti_governed = B_FALSE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * Correctly predicted governing the
+ * last lowering request.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ }
+ break;
+
+ case CPUPM_DOM_IDLE_FROM_BUSY:
+ last = dom->cpd_last_raise;
+ dom->cpd_last_lower = now;
+
+ DTRACE_PROBE3(cpupm__lower__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * The domain is idle, and is running in the highest
+ * performance state. Before we consider lowering power,
+		 * perform some bookkeeping for the transient work
+ * governor.
+ */
+ if (dom->cpd_tw_governed == B_FALSE) {
+ if ((now - last) < cpupm_tw_predict_interval) {
+ /*
+ * We're lowering the domain power and
+ * we *just* raised it. Consider the
+ * last raise mispredicted due to
+ * transient work.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_thresh) {
+ /*
+					 * There's enough transient work
+					 * transitions to justify
+					 * governing future raising
+					 * requests.
+ */
+ dom->cpd_tw_governed = B_TRUE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+				 * We correctly predicted the last
+				 * raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ if (dom->cpd_ti_governed == B_TRUE) {
+ /*
+ * Lowering requests are governed due to
+ * transient idleness.
+ */
+ DTRACE_PROBE1(cpupm__lowering__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient work governor is
+			 * also in place, examine the preceding busy
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_tw_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_tw_predict_interval)) {
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_tw_governed =
+ B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ return;
+ }
+
+ /*
+ * Prepare to transition to a lower power state.
+ */
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * The domain is idle, and we're already running in
+ * the lower power state. Take this opportunity to
+		 * perform some bookkeeping if the last raising
+ * request was governed.
+ */
+ if (dom->cpd_tw_governed == B_TRUE) {
+ if ((now - last) >= cpupm_tw_predict_interval) {
+ /*
+ * The domain is transient work
+ * governed, and we mispredicted
+ * governing the last raising request.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * work to justify removing
+ * the governor.
+ */
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted governing
+ * the last raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ }
+ break;
+ }
+ /*
+	 * Change the power state.
+	 * Not much is currently done if this doesn't succeed.
+ */
+ if (new_state)
+ (void) cpupm_change_state(cp, dom, new_state);
+}
+
+
+/*
+ * Interface called by platforms to dynamically change the
+ * MAX performance cpupm state
+ */
+void
+cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+ cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
+ boolean_t change_state = B_FALSE;
+ cpupm_state_t *new_state = NULL;
+
+ did = cpupm_domain_id(cp, type);
+ mutex_enter(&cpu_lock);
+ dom = cpupm_domain_find(did, type);
+ mutex_exit(&cpu_lock);
+
+ /*
+	 * A lock could be used to prevent the power state of the CPU from
+	 * changing while CPUPM_STATE_MAX_PERF is being redefined. Since
+	 * events that change MAX_PERF occur infrequently, it may not be
+	 * worth the locking overhead. In the worst case, the power may not
+	 * be changed to the required level for one cycle.
+ */
+ if (dom != NULL) {
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+ change_state = B_TRUE;
+ }
+
+ /*
+ * If an out of range level is passed, use the lowest supported
+ * speed.
+ */
+ if (max_perf_level >= dom->cpd_nstates &&
+ dom->cpd_nstates > 1) {
+ max_perf_level = dom->cpd_nstates - 1;
+ }
+
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[max_perf_level];
+
+ /*
+ * If the current state is MAX_PERF, change the current state
+ * to the new MAX_PERF
+ */
+ if (change_state) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (new_state) {
+ (void) cpupm_change_state(cp, dom, new_state);
+ }
+ }
+ }
+}
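+
+/*
+ * For example, a platform layer responding to thermal conditions might cap
+ * an active domain at its second-highest speed with (a hypothetical caller;
+ * state index 0 is the highest performance state):
+ *
+ *	cpupm_redefine_max_activepwr_state(cp, 1);
+ */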
+
+/*
+ * Benchmark some power state transitions and use the transition latencies as
+ * a basis for initializing parameters for the transient idle and transient
+ * work governors.
+ *
+ * Returns 0 on success or -1 if the governor parameters could not be
+ * initialized.
+ */
+static int
+cpupm_governor_initialize(void)
+{
+ cpu_t *cp = CPU;
+ cpupm_domain_t *dom;
+ cpupm_state_t *low, *high;
+ id_t did;
+ hrtime_t start, delta, deltas = 0;
+ int iterations;
+
+ did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
+ if (did == CPUPM_NO_DOMAIN)
+ return (-1);
+
+ dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
+ if (dom == NULL)
+ return (-1);
+
+ low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+ high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
+
+ /*
+ * Measure the amount of time it takes to transition the
+ * domain down to the lowest, and back to the highest power
+ * state.
+ */
+ start = gethrtime_unscaled();
+ (void) cpupm_change_state(cp, dom, low);
+ (void) cpupm_change_state(cp, dom, high);
+ delta = gethrtime_unscaled() - start;
+
+ DTRACE_PROBE1(cpupm__benchmark__latency,
+ hrtime_t, delta);
+
+ deltas += delta;
+ }
+
+ /*
+ * Figure the average latency, and tune the transient work and
+ * transient idle prediction intervals accordingly.
+ */
+ delta = deltas / iterations;
+
+ cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
+ cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
+
+ return (0);
+}
+
+/*
+ * Initiate a state change in all CPUPM domain instances of the specified type
+ */
+static void
+cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
+{
+ cpu_t *cp;
+ pg_cmt_t *pwr_pg;
+ cpupm_domain_t *dom;
+ group_t *hwset;
+ group_iter_t giter;
+ pg_cpu_itr_t cpu_iter;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ switch (type) {
+ case CPUPM_DTYPE_ACTIVE:
+ hw = PGHW_POW_ACTIVE;
+ break;
+ default:
+ /*
+		 * Power domain types other than "active" are unsupported.
+ */
+ ASSERT(type == CPUPM_DTYPE_ACTIVE);
+ return;
+ }
+
+ if ((hwset = pghw_set_lookup(hw)) == NULL)
+ return;
+
+ /*
+ * Iterate over the power domains
+ */
+ group_iter_init(&giter);
+ while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
+
+ dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
+
+ /*
+ * Iterate over the CPUs in each domain
+ */
+ PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ (void) cpupm_change_state(cp, dom,
+ dom->cpd_named_states[state]);
+ }
+ }
+}