author	Eric Saxe <Eric.Saxe@Sun.COM>	2009-02-25 21:04:18 -0800
committer	Eric Saxe <Eric.Saxe@Sun.COM>	2009-02-25 21:04:18 -0800
commit	0e7515250c8395f368aa45fb9acae7c4f8f8b786 (patch)
tree	5c3abde4ff53a950ad424ce362fd793369c06872 /usr/src/uts/common/os/cpu_pm.c
parent	9a5d73e03cd3312ddb571a748c40a63c58bd66e5 (diff)
download	illumos-gate-0e7515250c8395f368aa45fb9acae7c4f8f8b786.tar.gz
PSARC 2008/777 cpupm keyword mode extensions
PSARC 2008/663 CPU Deep Idle Keyword
6567156 bring CPU power awareness to the dispatcher
6700904 deeper C-State support required on follow-ons to Intel Penryn processor generation microarchitecture
6805661 cmt_root may contain duplicates on UMA systems
--HG--
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c => usr/src/uts/i86pc/io/cpudrv_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c => usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c => usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c => usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
rename : usr/src/uts/i86pc/os/cpupm.c => usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c => usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
rename : usr/src/uts/i86pc/io/cpudrv/pwrnow.c => usr/src/uts/i86pc/os/cpupm/pwrnow.c
rename : usr/src/uts/i86pc/io/cpudrv/speedstep.c => usr/src/uts/i86pc/os/cpupm/speedstep.c
rename : usr/src/uts/i86pc/sys/cpupm.h => usr/src/uts/i86pc/sys/cpupm_mach.h
rename : usr/src/uts/i86pc/sys/cpudrv_throttle.h => usr/src/uts/i86pc/sys/cpupm_throttle.h
Diffstat (limited to 'usr/src/uts/common/os/cpu_pm.c')
-rw-r--r--	usr/src/uts/common/os/cpu_pm.c	840
1 file changed, 840 insertions(+), 0 deletions(-)
diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c
new file mode 100644
index 0000000000..848907af1d
--- /dev/null
+++ b/usr/src/uts/common/os/cpu_pm.c
@@ -0,0 +1,840 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/cmn_err.h>
+#include <sys/sdt.h>
+
+/*
+ * Solaris Event Based CPU Power Manager
+ *
+ * This file implements platform independent event based CPU power management.
+ * When CPUs are configured into the system, the CMT scheduling subsystem will
+ * query the platform to determine if the CPU belongs to any power management
+ * domains: sets of CPUs that share power management states.
+ *
+ * Active Power Management domains represent groups of CPUs across which the
+ * Operating System can request speed changes (which may in turn result
+ * in voltage changes). This allows the operating system to trade off
+ * performance for power savings.
+ *
+ * Idle Power Management domains can enter power savings states when they are
+ * unutilized. These states allow the Operating System to trade off power
+ * for performance (in the form of latency to transition from the idle state
+ * to an active one).
+ *
+ * For each active and idle power domain the CMT subsystem instantiates, a
+ * cpupm_domain_t structure is created. As the dispatcher schedules threads
+ * to run on the system's CPUs, it will also track the utilization of the
+ * enumerated power domains. Significant changes in utilization will result
+ * in the dispatcher sending the power manager events that relate to the
+ * utilization of the power domain. The power manager receives the events,
+ * and in the context of the policy objectives in force, may decide to request
+ * the domain's power/performance state be changed.
+ *
+ * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
+ * manager will request the CPUs in the domain run at their fastest (and most
+ * power consuming) state. When the domain becomes idle (utilization at zero),
+ * the power manager will request that the CPUs run at a speed that saves the
+ * most power.
+ *
+ * The advantage of this scheme is that the CPU power manager, working with
+ * the dispatcher, can be extremely responsive to changes in utilization:
+ * optimizing for performance in the presence of utilization, and for power
+ * savings in the presence of idleness. Such close collaboration with the
+ * dispatcher has other benefits that will play out in the form of more
+ * sophisticated power / performance policy in the near future.
+ *
+ * Avoiding state thrashing in the presence of transient periods of utilization
+ * and idleness while still being responsive to non-transient periods is key.
+ * The power manager implements several "governors" that are used to throttle
+ * state transitions when a significant amount of transient idle or transient
+ * work is detected.
+ *
+ * Kernel background activity (e.g. taskq threads) is by far the most common
+ * form of transient utilization. Ungoverned in the face of this utilization,
+ * hundreds of state transitions per second would result on an idle system.
+ *
+ * Transient idleness is common when a thread briefly yields the CPU to
+ * wait for an event elsewhere in the system. Where the idle period is short
+ * enough, the overhead associated with making the state transition doesn't
+ * justify the power savings.
+ */
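+
+/*
+ * To make the flow concrete, a minimal sketch of the call sequence (the
+ * actual dispatcher hook points live elsewhere; this ordering is
+ * illustrative only):
+ *
+ *	dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
+ *	...
+ *	now = gethrtime_unscaled();
+ *	cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);
+ *		(may lower the domain's power state)
+ *	...
+ *	now = gethrtime_unscaled();
+ *	cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE);
+ *		(may raise the domain's power state)
+ */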
+
+static cpupm_domain_t *cpupm_domains = NULL;
+
+/*
+ * The initial, uninitialized state of CPU power management is disabled.
+ */
+cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
+
+/*
+ * Periods of utilization lasting less than this time interval are characterized
+ * as transient. State changes associated with transient work are considered
+ * to be mispredicted. That is, it's not worth raising and lowering power
+ * states when the utilization lasts for less than this interval.
+ */
+hrtime_t cpupm_tw_predict_interval;
+
+/*
+ * Periods of idleness lasting less than this time interval are characterized
+ * as transient. State changes associated with transient idle are considered
+ * to be mispredicted. That is, it's not worth lowering and raising power
+ * states where the idleness lasts for less than this interval.
+ */
+hrtime_t cpupm_ti_predict_interval;
+
+/*
+ * Number of mispredictions after which future transitions will be governed.
+ */
+int cpupm_mispredict_thresh = 2;
+
+/*
+ * Likewise, the number of mispredicted governed transitions after which the
+ * governor will be removed.
+ */
+int cpupm_mispredict_gov_thresh = 10;
+
+/*
+ * The transient work and transient idle prediction intervals are initialized
+ * to be some multiple of the measured amount of time it takes to transition
+ * a power domain from the highest to the lowest power state, and back again.
+ *
+ * The default values of those multiples are specified here. Tuning them
+ * higher will result in the transient work and transient idle governors
+ * being used more aggressively, which limits the frequency of state
+ * transitions at the expense of performance and power savings, respectively.
+ */
+#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
+#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
+
+/*
+ * Number of high=>low=>high measurements performed, the average of which
+ * is taken.
+ */
+#define CPUPM_BENCHMARK_ITERS 5
+
+int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
+int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
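+
+/*
+ * As a worked example (the latency figure is hypothetical), if the measured
+ * high=>low=>high transition latency averages the equivalent of 100us, the
+ * default multiples above yield:
+ *
+ *	cpupm_ti_predict_interval = 100us * 600 = 60ms
+ *	cpupm_tw_predict_interval = 100us * 25  = 2.5ms
+ *
+ * Idle periods shorter than 60ms and busy periods shorter than 2.5ms would
+ * then be considered transient.
+ */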
+
+
+static int cpupm_governor_initialize(void);
+static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
+
+cpupm_policy_t
+cpupm_get_policy(void)
+{
+ return (cpupm_policy);
+}
+
+int
+cpupm_set_policy(cpupm_policy_t new_policy)
+{
+ static int gov_init = 0;
+ int result = 0;
+
+ mutex_enter(&cpu_lock);
+ if (new_policy == cpupm_policy) {
+ mutex_exit(&cpu_lock);
+ return (result);
+ }
+
+ /*
+ * Pausing CPUs causes a high priority thread to be scheduled
+ * on all other CPUs (besides the current one). This locks out
+ * other CPUs from making CPUPM state transitions.
+ */
+ switch (new_policy) {
+ case CPUPM_POLICY_DISABLED:
+ pause_cpus(NULL);
+ cpupm_policy = CPUPM_POLICY_DISABLED;
+ start_cpus();
+
+ result = cmt_pad_disable(PGHW_POW_ACTIVE);
+
+ /*
+ * Once PAD has been enabled, it should always be possible
+ * to disable it.
+ */
+ ASSERT(result == 0);
+
+ /*
+ * Bring all the active power domains to the maximum
+ * performance state.
+ */
+ cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
+ CPUPM_STATE_MAX_PERF);
+
+ break;
+ case CPUPM_POLICY_ELASTIC:
+
+ result = cmt_pad_enable(PGHW_POW_ACTIVE);
+ if (result < 0) {
+ /*
+ * Failed to enable PAD across the active power
+ * domains, which may well be because none were
+ * enumerated.
+ */
+ break;
+ }
+
+ pause_cpus(NULL);
+ /*
+ * Attempt to initialize the governor parameters the first
+ * time through.
+ */
+ if (gov_init == 0) {
+ result = cpupm_governor_initialize();
+ if (result == 0) {
+ gov_init = 1;
+ } else {
+ /*
+ * Failed to initialize the governor parameters
+ */
+ start_cpus();
+ break;
+ }
+ }
+ cpupm_policy = CPUPM_POLICY_ELASTIC;
+ start_cpus();
+
+ break;
+ default:
+ cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
+ new_policy);
+ ASSERT(0);
+ break;
+ }
+ mutex_exit(&cpu_lock);
+
+ return (result);
+}
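+
+/*
+ * For example, a hypothetical administrative path might enable the elastic
+ * policy as follows (a sketch; the actual callers live elsewhere):
+ *
+ *	if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) != 0)
+ *		cmn_err(CE_NOTE, "CPUPM elastic policy unavailable");
+ */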
+
+/*
+ * Look for an existing power domain
+ */
+static cpupm_domain_t *
+cpupm_domain_find(id_t id, cpupm_dtype_t type)
+{
+	cpupm_domain_t *dom;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	dom = cpupm_domains;
+ while (dom != NULL) {
+ if (id == dom->cpd_id && type == dom->cpd_type)
+ return (dom);
+ dom = dom->cpd_next;
+ }
+ return (NULL);
+}
+
+/*
+ * Create a new domain
+ */
+static cpupm_domain_t *
+cpupm_domain_create(id_t id, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
+ dom->cpd_id = id;
+ dom->cpd_type = type;
+
+ /* Link into the known domain list */
+ dom->cpd_next = cpupm_domains;
+ cpupm_domains = dom;
+
+ return (dom);
+}
+
+static void
+cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
+{
+ /*
+	 * In the event we're enumerating because the domain's state
+ * configuration has changed, toss any existing states.
+ */
+ if (dom->cpd_nstates > 0) {
+ kmem_free(dom->cpd_states,
+ sizeof (cpupm_state_t) * dom->cpd_nstates);
+ dom->cpd_nstates = 0;
+ }
+
+ /*
+ * Query to determine the number of states, allocate storage
+ * large enough to hold the state information, and pass it back
+ * to the platform driver to complete the enumeration.
+ */
+ dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
+
+ if (dom->cpd_nstates == 0)
+ return;
+
+ dom->cpd_states =
+ kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
+ (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
+}
+
+/*
+ * Initialize the specified type of power domain on behalf of the CPU
+ */
+cpupm_domain_t *
+cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ /*
+ * Instantiate the domain if it doesn't already exist
+ * and enumerate its power states.
+ */
+ did = cpupm_domain_id(cp, type);
+ dom = cpupm_domain_find(did, type);
+ if (dom == NULL) {
+ dom = cpupm_domain_create(did, type);
+ cpupm_domain_state_enum(cp, dom);
+ }
+
+ /*
+ * Named state initialization
+ */
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+		 * For active power domains, the highest performance
+		 * state is defined as the first state returned from
+		 * the domain enumeration, and the lowest power state
+		 * as the last.
+ */
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[0];
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
+ &dom->cpd_states[dom->cpd_nstates - 1];
+
+ /*
+ * Begin by assuming CPU is running at the max perf state.
+ */
+ dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ }
+
+ return (dom);
+}
+
+/*
+ * Return the id associated with the given type of domain
+ * to which cp belongs
+ */
+id_t
+cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (cpupm_plat_domain_id(cp, type));
+}
+
+/*
+ * Initiate a state change for the specified domain on behalf of cp
+ */
+int
+cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
+{
+ if (cpupm_plat_change_state(cp, state) < 0)
+ return (-1);
+
+ DTRACE_PROBE2(cpupm__change__state,
+ cpupm_domain_t *, dom,
+ cpupm_state_t *, state);
+
+ dom->cpd_state = state;
+ return (0);
+}
+
+/*
+ * Interface into the CPU power manager to indicate a significant change
+ * in utilization of the specified active power domain
+ */
+void
+cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
+ cpupm_util_event_t event)
+{
+ cpupm_state_t *new_state = NULL;
+ hrtime_t last;
+
+ if (cpupm_policy == CPUPM_POLICY_DISABLED) {
+ return;
+ }
+
+ /*
+ * What follows is a simple elastic power state management policy.
+ *
+ * If the utilization has become non-zero, and the domain was
+	 * previously at its lowest power state, then transition it
+ * to the highest state in the spirit of "race to idle".
+ *
+ * If the utilization has dropped to zero, then transition the
+ * domain to its lowest power state.
+ *
+ * Statistics are maintained to implement governors to reduce state
+ * transitions resulting from either transient work, or periods of
+ * transient idleness on the domain.
+ */
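+	/*
+	 * In summary, the cases below implement (a simplified sketch;
+	 * governor bookkeeping is elided):
+	 *
+	 *	Event			At state	Action
+	 *	REMAIN_BUSY		low power	raise, drop tw governor
+	 *	BUSY_FROM_IDLE		low power	raise, unless tw governed
+	 *	IDLE_FROM_BUSY		max perf	lower, unless ti governed
+	 */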
+ switch (event) {
+ case CPUPM_DOM_REMAIN_BUSY:
+
+ /*
+ * We've received an event that the domain is running a thread
+	 * that's made it to the end of its time slice. If we are at
+ * low power, then raise it. If the transient work governor
+ * is engaged, then remove it.
+ */
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (dom->cpd_tw_governed == B_TRUE) {
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ break;
+
+ case CPUPM_DOM_BUSY_FROM_IDLE:
+ last = dom->cpd_last_lower;
+ dom->cpd_last_raise = now;
+
+ DTRACE_PROBE3(cpupm__raise__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * There's non-zero utilization, and the domain is
+ * running in the lower power state. Before we
+		 * consider raising power, perform some bookkeeping
+ * for the transient idle governor.
+ */
+ if (dom->cpd_ti_governed == B_FALSE) {
+ if ((now - last) < cpupm_ti_predict_interval) {
+ /*
+ * We're raising the domain power and
+ * we *just* lowered it. Consider
+ * this a mispredicted power state
+ * transition due to a transient
+ * idle period.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_thresh) {
+ /*
+ * There's enough transient
+ * idle transitions to
+ * justify governing future
+ * lowering requests.
+ */
+ dom->cpd_ti_governed = B_TRUE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * lowering.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ if (dom->cpd_tw_governed == B_TRUE) {
+ /*
+ * Raise requests are governed due to
+ * transient work.
+ */
+ DTRACE_PROBE1(cpupm__raise__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient idle governor is
+			 * also in place, examine the preceding idle
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_ti_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_ti_predict_interval)) {
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_ti_governed =
+ B_FALSE;
+ dom->cpd_ti = 0;
+ }
+ }
+ return;
+ }
+ /*
+ * Prepare to transition to the higher power state
+ */
+ new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * Utilization is non-zero, and we're already running
+ * in the higher power state. Take this opportunity to
+		 * perform some bookkeeping if the last lowering
+ * request was governed.
+ */
+ if (dom->cpd_ti_governed == B_TRUE) {
+ if ((now - last) >= cpupm_ti_predict_interval) {
+ /*
+ * The domain is transient idle
+ * governed, and we mispredicted
+ * governing the last lowering request.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * idle periods to justify
+ * removing the governor.
+ */
+ dom->cpd_ti_governed = B_FALSE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * Correctly predicted governing the
+ * last lowering request.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ }
+ break;
+
+ case CPUPM_DOM_IDLE_FROM_BUSY:
+ last = dom->cpd_last_raise;
+ dom->cpd_last_lower = now;
+
+ DTRACE_PROBE3(cpupm__lower__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * The domain is idle, and is running in the highest
+ * performance state. Before we consider lowering power,
+		 * perform some bookkeeping for the transient work
+ * governor.
+ */
+ if (dom->cpd_tw_governed == B_FALSE) {
+ if ((now - last) < cpupm_tw_predict_interval) {
+ /*
+ * We're lowering the domain power and
+ * we *just* raised it. Consider the
+ * last raise mispredicted due to
+ * transient work.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_thresh) {
+ /*
+					 * There's enough transient work
+					 * transitions to justify
+					 * governing future raising
+					 * requests.
+ */
+ dom->cpd_tw_governed = B_TRUE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+				 * We correctly predicted the last
+				 * raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ if (dom->cpd_ti_governed == B_TRUE) {
+ /*
+ * Lowering requests are governed due to
+ * transient idleness.
+ */
+ DTRACE_PROBE1(cpupm__lowering__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient work governor is
+			 * also in place, examine the preceding busy
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_tw_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_tw_predict_interval)) {
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_tw_governed =
+ B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ return;
+ }
+
+ /*
+ * Prepare to transition to a lower power state.
+ */
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * The domain is idle, and we're already running in
+ * the lower power state. Take this opportunity to
+		 * perform some bookkeeping if the last raising
+ * request was governed.
+ */
+ if (dom->cpd_tw_governed == B_TRUE) {
+ if ((now - last) >= cpupm_tw_predict_interval) {
+ /*
+ * The domain is transient work
+ * governed, and we mispredicted
+ * governing the last raising request.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * work to justify removing
+ * the governor.
+ */
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted governing
+ * the last raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ }
+ break;
+ }
+ /*
+	 * Change the power state.
+	 * Not much is currently done if this doesn't succeed.
+ */
+ if (new_state)
+ (void) cpupm_change_state(cp, dom, new_state);
+}
+
+
+/*
+ * Interface called by platforms to dynamically change the
+ * MAX performance cpupm state
+ */
+void
+cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+ cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
+ boolean_t change_state = B_FALSE;
+ cpupm_state_t *new_state = NULL;
+
+ did = cpupm_domain_id(cp, type);
+ mutex_enter(&cpu_lock);
+ dom = cpupm_domain_find(did, type);
+ mutex_exit(&cpu_lock);
+
+ /*
+	 * A lock could be used to prevent the power state of the CPU from
+	 * changing while CPUPM_STATE_MAX_PERF is being redefined. Since
+	 * events that change MAX_PERF occur infrequently, it may not be
+	 * worth the locking overhead. In the worst case, the power may not
+	 * be changed to the required level for one cycle.
+ */
+ if (dom != NULL) {
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+ change_state = B_TRUE;
+ }
+
+ /*
+ * If an out of range level is passed, use the lowest supported
+ * speed.
+ */
+ if (max_perf_level >= dom->cpd_nstates &&
+ dom->cpd_nstates > 1) {
+ max_perf_level = dom->cpd_nstates - 1;
+ }
+
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[max_perf_level];
+
+ /*
+ * If the current state is MAX_PERF, change the current state
+ * to the new MAX_PERF
+ */
+ if (change_state) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (new_state) {
+ (void) cpupm_change_state(cp, dom, new_state);
+ }
+ }
+ }
+}
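+
+/*
+ * For example, a platform layer responding to thermal conditions might cap
+ * an active domain at its second-highest speed with (a hypothetical caller;
+ * state index 0 is the highest performance state):
+ *
+ *	cpupm_redefine_max_activepwr_state(cp, 1);
+ */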
+
+/*
+ * Benchmark some power state transitions and use the transition latencies as
+ * a basis for initializing parameters for the transient idle and transient
+ * work governors.
+ *
+ * Returns 0 on success or -1 if the governor parameters could not be
+ * initialized.
+ */
+static int
+cpupm_governor_initialize(void)
+{
+ cpu_t *cp = CPU;
+ cpupm_domain_t *dom;
+ cpupm_state_t *low, *high;
+ id_t did;
+ hrtime_t start, delta, deltas = 0;
+ int iterations;
+
+ did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
+ if (did == CPUPM_NO_DOMAIN)
+ return (-1);
+
+ dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
+ if (dom == NULL)
+ return (-1);
+
+ low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+ high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
+
+ /*
+ * Measure the amount of time it takes to transition the
+ * domain down to the lowest, and back to the highest power
+ * state.
+ */
+ start = gethrtime_unscaled();
+ (void) cpupm_change_state(cp, dom, low);
+ (void) cpupm_change_state(cp, dom, high);
+ delta = gethrtime_unscaled() - start;
+
+ DTRACE_PROBE1(cpupm__benchmark__latency,
+ hrtime_t, delta);
+
+ deltas += delta;
+ }
+
+ /*
+ * Figure the average latency, and tune the transient work and
+ * transient idle prediction intervals accordingly.
+ */
+ delta = deltas / iterations;
+
+ cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
+ cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
+
+ return (0);
+}
+
+/*
+ * Initiate a state change in all CPUPM domain instances of the specified type
+ */
+static void
+cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
+{
+ cpu_t *cp;
+ pg_cmt_t *pwr_pg;
+ cpupm_domain_t *dom;
+ group_t *hwset;
+ group_iter_t giter;
+ pg_cpu_itr_t cpu_iter;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ switch (type) {
+ case CPUPM_DTYPE_ACTIVE:
+ hw = PGHW_POW_ACTIVE;
+ break;
+ default:
+ /*
+		 * Power domain types other than "active" are unsupported.
+ */
+ ASSERT(type == CPUPM_DTYPE_ACTIVE);
+ return;
+ }
+
+ if ((hwset = pghw_set_lookup(hw)) == NULL)
+ return;
+
+ /*
+ * Iterate over the power domains
+ */
+ group_iter_init(&giter);
+ while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
+
+ dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
+
+ /*
+ * Iterate over the CPUs in each domain
+ */
+ PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ (void) cpupm_change_state(cp, dom,
+ dom->cpd_named_states[state]);
+ }
+ }
+}