author      Eric Saxe <Eric.Saxe@Sun.COM>    2009-02-25 21:04:18 -0800
committer   Eric Saxe <Eric.Saxe@Sun.COM>    2009-02-25 21:04:18 -0800
commit      0e7515250c8395f368aa45fb9acae7c4f8f8b786 (patch)
tree        5c3abde4ff53a950ad424ce362fd793369c06872 /usr/src
parent      9a5d73e03cd3312ddb571a748c40a63c58bd66e5 (diff)
download    illumos-joyent-0e7515250c8395f368aa45fb9acae7c4f8f8b786.tar.gz
PSARC 2008/777 cpupm keyword mode extensions
PSARC 2008/663 CPU Deep Idle Keyword
6567156 bring CPU power awareness to the dispatcher
6700904 deeper C-State support required on follow-ons to Intel Penryn processor generation microarchitecture
6805661 cmt_root may contain duplicates on UMA systems
--HG--
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c => usr/src/uts/i86pc/io/cpudrv_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c => usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c => usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c => usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
rename : usr/src/uts/i86pc/os/cpupm.c => usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c => usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
rename : usr/src/uts/i86pc/io/cpudrv/pwrnow.c => usr/src/uts/i86pc/os/cpupm/pwrnow.c
rename : usr/src/uts/i86pc/io/cpudrv/speedstep.c => usr/src/uts/i86pc/os/cpupm/speedstep.c
rename : usr/src/uts/i86pc/sys/cpupm.h => usr/src/uts/i86pc/sys/cpupm_mach.h
rename : usr/src/uts/i86pc/sys/cpudrv_throttle.h => usr/src/uts/i86pc/sys/cpupm_throttle.h
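As background for the cmd/power changes in this diff, the extended keyword forms accepted in /etc/power.conf would be written roughly as follows (illustrative lines, not taken from the commit; see the handlers.c and parse.c hunks below):

# /etc/power.conf (illustrative)
# cpupm now takes an optional mode: enable [poll-mode | event-mode], or disable
cpupm enable event-mode
# the new cpu_deep_idle keyword takes enable, disable, or default
cpu_deep_idle enable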
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/pg.c27
-rw-r--r--usr/src/cmd/power/handlers.c65
-rw-r--r--usr/src/cmd/power/parse.c7
-rw-r--r--usr/src/cmd/power/pmconfig.h5
-rw-r--r--usr/src/cmd/powertop/cpufreq.c12
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com1
-rw-r--r--usr/src/uts/common/Makefile.files2
-rw-r--r--usr/src/uts/common/conf/param.c6
-rw-r--r--usr/src/uts/common/disp/cmt.c1030
-rw-r--r--usr/src/uts/common/disp/cmt_policy.c229
-rw-r--r--usr/src/uts/common/disp/disp.c27
-rw-r--r--usr/src/uts/common/io/cpudrv.c382
-rw-r--r--usr/src/uts/common/io/pm.c130
-rw-r--r--usr/src/uts/common/os/cpu.c26
-rw-r--r--usr/src/uts/common/os/cpu_pm.c840
-rw-r--r--usr/src/uts/common/os/cpupm.c67
-rw-r--r--usr/src/uts/common/os/group.c37
-rw-r--r--usr/src/uts/common/os/pg.c120
-rw-r--r--usr/src/uts/common/os/pghw.c73
-rw-r--r--usr/src/uts/common/os/sunpm.c11
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/callb.h12
-rw-r--r--usr/src/uts/common/sys/cmt.h62
-rw-r--r--usr/src/uts/common/sys/cpu_pm.h139
-rw-r--r--usr/src/uts/common/sys/cpudrv.h62
-rw-r--r--usr/src/uts/common/sys/cpupm.h43
-rw-r--r--usr/src/uts/common/sys/cpuvar.h2
-rw-r--r--usr/src/uts/common/sys/epm.h29
-rw-r--r--usr/src/uts/common/sys/group.h16
-rw-r--r--usr/src/uts/common/sys/pg.h46
-rw-r--r--usr/src/uts/common/sys/pghw.h39
-rw-r--r--usr/src/uts/common/sys/pm.h11
-rw-r--r--usr/src/uts/i86pc/Makefile.files19
-rw-r--r--usr/src/uts/i86pc/Makefile.rules14
-rw-r--r--usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c516
-rw-r--r--usr/src/uts/i86pc/io/cpudrv_mach.c287
-rw-r--r--usr/src/uts/i86pc/io/hpet_acpi.c1388
-rw-r--r--usr/src/uts/i86pc/io/mp_platform_common.c56
-rw-r--r--usr/src/uts/i86pc/io/pcplusmp/apic.c38
-rw-r--r--usr/src/uts/i86pc/io/ppm_plat.c46
-rw-r--r--usr/src/uts/i86pc/os/cpuid.c48
-rw-r--r--usr/src/uts/i86pc/os/cpupm.c247
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpu_acpi.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c)264
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpu_idle.c877
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_amd.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c)13
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_intel.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c)62
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_mach.c928
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c)235
-rw-r--r--usr/src/uts/i86pc/os/cpupm/pwrnow.c (renamed from usr/src/uts/i86pc/io/cpudrv/pwrnow.c)99
-rw-r--r--usr/src/uts/i86pc/os/cpupm/speedstep.c (renamed from usr/src/uts/i86pc/io/cpudrv/speedstep.c)113
-rw-r--r--usr/src/uts/i86pc/os/mlsetup.c4
-rw-r--r--usr/src/uts/i86pc/os/mp_machdep.c166
-rw-r--r--usr/src/uts/i86pc/os/mp_startup.c42
-rw-r--r--usr/src/uts/i86pc/os/startup.c7
-rw-r--r--usr/src/uts/i86pc/sys/cpu_acpi.h68
-rw-r--r--usr/src/uts/i86pc/sys/cpu_idle.h72
-rw-r--r--usr/src/uts/i86pc/sys/cpudrv_mach.h141
-rw-r--r--usr/src/uts/i86pc/sys/cpupm.h89
-rw-r--r--usr/src/uts/i86pc/sys/cpupm_mach.h197
-rw-r--r--usr/src/uts/i86pc/sys/cpupm_throttle.h43
-rw-r--r--usr/src/uts/i86pc/sys/hpet.h80
-rw-r--r--usr/src/uts/i86pc/sys/hpet_acpi.h334
-rw-r--r--usr/src/uts/i86pc/sys/machcpuvar.h13
-rw-r--r--usr/src/uts/i86pc/sys/machsystm.h10
-rw-r--r--usr/src/uts/i86pc/sys/pwrnow.h8
-rw-r--r--usr/src/uts/i86pc/sys/speedstep.h8
-rw-r--r--usr/src/uts/i86xpv/Makefile.files3
-rw-r--r--usr/src/uts/intel/ia32/ml/modstubs.s20
-rw-r--r--usr/src/uts/intel/io/acpica/osl.c49
-rw-r--r--usr/src/uts/intel/sys/acpica.h6
-rw-r--r--usr/src/uts/intel/sys/x86_archext.h8
-rw-r--r--usr/src/uts/sun4/Makefile.files5
-rw-r--r--usr/src/uts/sun4/os/cpupm_mach.c51
-rw-r--r--usr/src/uts/sun4/os/mlsetup.c6
-rw-r--r--usr/src/uts/sun4/os/startup.c3
-rw-r--r--usr/src/uts/sun4/sys/cpupm_mach.h (renamed from usr/src/uts/i86pc/sys/cpudrv_throttle.h)15
-rw-r--r--usr/src/uts/sun4u/Makefile.files3
-rw-r--r--usr/src/uts/sun4u/cpu/spitfire.c3
-rw-r--r--usr/src/uts/sun4u/cpu/us3_cheetah.c4
-rw-r--r--usr/src/uts/sun4u/cpu/us3_cheetahplus.c4
-rw-r--r--usr/src/uts/sun4u/cpu/us3_jalapeno.c4
-rw-r--r--usr/src/uts/sun4u/io/cpudrv_mach.c40
-rw-r--r--usr/src/uts/sun4u/os/cmp.c54
-rw-r--r--usr/src/uts/sun4u/os/mach_startup.c4
-rw-r--r--usr/src/uts/sun4u/sys/cpudrv_mach.h65
-rw-r--r--usr/src/uts/sun4v/os/cmp.c60
-rw-r--r--usr/src/uts/sun4v/os/mach_startup.c4
87 files changed, 8342 insertions, 2160 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/pg.c b/usr/src/cmd/mdb/common/modules/genunix/pg.c
index 4e36430f04..60b4fba431 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/pg.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/pg.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Display processor group information
*/
@@ -34,6 +32,7 @@
#include <mdb/mdb_modapi.h>
#include <sys/pghw.h>
+#include <sys/cmt.h>
/*
* PG hardware types indexed by hardware ID
@@ -46,6 +45,8 @@ char *pg_hw_names[] = {
"mpipe",
"chip",
"memory",
+ "active_pwr",
+ "idle_pwr",
};
#define A_CNT(arr) (sizeof (arr) / sizeof (arr[0]))
@@ -70,8 +71,10 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
pg_t pg;
pghw_t pghw;
+ pg_cmt_t pg_cmt;
pg_class_t pg_class;
int opt_q = 0; /* display only address. */
+ int is_cmt = 0; /* This is CMT pg */
/* Should provide an address */
if (! (flags & DCMD_ADDRSPEC))
@@ -86,13 +89,14 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
opt_q = B_TRUE;
if (DCMD_HDRSPEC(flags) && !opt_q) {
- mdb_printf("%6s %?s %6s %7s %9s %5s\n",
+ mdb_printf("%6s %?s %6s %7s %11s %5s %5s\n",
"PGID",
"ADDR",
"PHYSID",
"CLASS",
"HARDWARE",
- "#CPUs");
+ "#CPUs",
+ "LOAD");
}
/*
@@ -111,6 +115,14 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
+ if (strcmp(pg_class.pgc_name, "cmt") == 0) {
+ if (mdb_vread(&pg_cmt, sizeof (pg_cmt_t), addr) == -1) {
+ mdb_warn("unable to read 'cmt pg' at %p", addr);
+ return (DCMD_ERR);
+ }
+ is_cmt = 1;
+ }
+
if (mdb_vread(&pg_class, sizeof (struct pg_class),
(uintptr_t)pg.pg_class) == -1) {
mdb_warn("unable to read 'pg_class' at %p", pg.pg_class);
@@ -125,10 +137,11 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
/*
* Display the physical PG info.
*/
- mdb_printf("%6d %?p %6d %7s %9s %5d\n",
+ mdb_printf("%6d %?p %6d %7s %11s %5d %5d\n",
pg.pg_id, addr, pghw.pghw_instance,
pg_class.pgc_name, pg_hw_name(pghw.pghw_hw),
- pg.pg_cpus.grp_size);
+ pg.pg_cpus.grp_size,
+ is_cmt ? pg_cmt.cmt_utilization : 0);
} else {
/*
* Display the basic PG info.
diff --git a/usr/src/cmd/power/handlers.c b/usr/src/cmd/power/handlers.c
index f5fa621c0c..5d2d51851c 100644
--- a/usr/src/cmd/power/handlers.c
+++ b/usr/src/cmd/power/handlers.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "pmconfig.h"
#include <sys/mkdev.h>
#include <sys/syslog.h>
@@ -120,14 +118,66 @@ do_ioctl(int ioctl_cmd, char *keyword, char *behavior, int suppress)
int
cpupm(void)
{
+ struct bmtoc {
+ char *behavior;
+ char *mode;
+ int cmd;
+ int Errno;
+ };
+
+ static struct bmtoc bmlist[] = {
+ "disable", "\0", PM_STOP_CPUPM, EINVAL,
+ "enable", "poll-mode", PM_START_CPUPM_POLL, EBUSY,
+ "enable", "event-mode", PM_START_CPUPM_EV, EBUSY,
+ "enable", "\0", PM_START_CPUPM, EBUSY,
+ NULL, 0, 0, 0
+ };
+ struct bmtoc *bp;
+ char *behavior;
+ char *mode;
+
+ behavior = LINEARG(1);
+ if ((mode = LINEARG(2)) == NULL)
+ mode = "\0";
+
+ for (bp = bmlist; bp->cmd; bp++) {
+ if (strcmp(behavior, bp->behavior) == 0 &&
+ strcmp(mode, bp->mode) == 0) {
+ break;
+ }
+ }
+ if (bp->cmd == 0) {
+ if (LINEARG(2) == NULL) {
+ mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior);
+ } else {
+ mesg(MERR, "invalid cpupm behavior \"%s %s\"\n",
+ behavior, mode);
+ }
+ return (NOUP);
+ }
+ if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) {
+ mesg(MERR, "cpupm %s failed, %s\n",
+ behavior, strerror(errno));
+ return (NOUP);
+ }
+ return (OKUP);
+}
+
+/*
+ * Check for valid cpu_deep_idle option and communicate it to the kernel.
+ */
+int
+cpuidle(void)
+{
struct btoc {
char *behavior;
int cmd;
int Errno;
};
static struct btoc blist[] = {
- "disable", PM_STOP_CPUPM, EINVAL,
- "enable", PM_START_CPUPM, EBUSY,
+ "disable", PM_DISABLE_CPU_DEEP_IDLE, EINVAL,
+ "enable", PM_ENABLE_CPU_DEEP_IDLE, EBUSY,
+ "default", PM_DEFAULT_CPU_DEEP_IDLE, EBUSY,
NULL, 0, 0
};
struct btoc *bp;
@@ -138,18 +188,17 @@ cpupm(void)
break;
}
if (bp->cmd == 0) {
- mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior);
+ mesg(MERR, "invalid cpu_deep_idle behavior \"%s\"\n", behavior);
return (NOUP);
}
if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) {
- mesg(MERR, "cpupm %s failed, %s\n",
+ mesg(MERR, "cpu_deep_idle %s failed, %s\n",
behavior, strerror(errno));
return (NOUP);
}
return (OKUP);
}
-
/*
* Two decisions are identical except for the list names and ioctl commands
* inputs: whitelist, blacklist, yes, no
diff --git a/usr/src/cmd/power/parse.c b/usr/src/cmd/power/parse.c
index e7adff4d18..5ea845653d 100644
--- a/usr/src/cmd/power/parse.c
+++ b/usr/src/cmd/power/parse.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "pmconfig.h"
#include <deflt.h>
#include <pwd.h>
@@ -58,7 +56,8 @@ static cinfo_t conftab[] = {
"autopm", autopm, &pm_status, NULL, 2, 0, 1,
"autoshutdown", autosd, &cpr_status, as_cmt, 5, 0, 1,
"cpu-threshold", cputhr, &pm_status, NULL, 2, 0, 1,
- "cpupm", cpupm, &pm_status, NULL, 2, 0, 1,
+ "cpu_deep_idle", cpuidle, &pm_status, NULL, 2, 0, 1,
+ "cpupm", cpupm, &pm_status, NULL, 2, 1, 1,
"device-dependency-property",
ddprop, &pm_status, NULL, 3, 1, 1,
"device-dependency", devdep, &pm_status, NULL, 3, 1, 1,
diff --git a/usr/src/cmd/power/pmconfig.h b/usr/src/cmd/power/pmconfig.h
index 33f26b63df..e03c434ac2 100644
--- a/usr/src/cmd/power/pmconfig.h
+++ b/usr/src/cmd/power/pmconfig.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PMCONFIG_H
#define _PMCONFIG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -126,6 +124,7 @@ extern int autoS3(void);
extern int autopm(void);
extern int autosd(void);
extern int cpupm(void);
+extern int cpuidle(void);
extern int cputhr(void);
extern int ddprop(void);
extern int devdep(void);
diff --git a/usr/src/cmd/powertop/cpufreq.c b/usr/src/cmd/powertop/cpufreq.c
index 18bd393665..9537ce6c65 100644
--- a/usr/src/cmd/powertop/cpufreq.c
+++ b/usr/src/cmd/powertop/cpufreq.c
@@ -71,18 +71,18 @@ static const char *pt_cpufreq_dtrace_prog =
"}"
""
":::cpu-change-speed"
-"/last[((cpudrv_devstate_t *)arg0)->cpu_id] != 0/"
+"/last[(processorid_t)arg0] != 0/"
"{"
-" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;"
-" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;"
+" this->cpu = (processorid_t)arg0;"
+" this->oldspeed = (uint32_t)(arg1/1000000);"
" @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
" last[this->cpu] = timestamp;"
"}"
":::cpu-change-speed"
-"/last[((cpudrv_devstate_t *)arg0)->cpu_id] == 0/"
+"/last[(processorid_t)arg0] == 0/"
"{"
-" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;"
-" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;"
+" this->cpu = (processorid_t)arg0;"
+" this->oldspeed = (uint32_t)(arg1/1000000);"
" @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
" last[this->cpu] = timestamp;"
"}";
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index 555f28921c..3cc32ddd3d 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -711,6 +711,7 @@ f none usr/include/sys/cpr.h 644 root bin
f none usr/include/sys/cpu.h 644 root bin
f none usr/include/sys/cpupart.h 644 root bin
f none usr/include/sys/cpuvar.h 644 root bin
+f none usr/include/sys/cpu_pm.h 644 root bin
f none usr/include/sys/crc32.h 644 root bin
f none usr/include/sys/cred.h 644 root bin
f none usr/include/sys/cred_impl.h 644 root bin
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 19f0512969..d123becc90 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -41,8 +41,10 @@ COMMON_CORE_OBJS += \
brand.o \
cpucaps.o \
cmt.o \
+ cmt_policy.o \
cpu.o \
cpu_intr.o \
+ cpu_pm.o \
cpupart.o \
disp.o \
group.o \
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index 5831545a33..ceecf32ee8 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -190,9 +190,6 @@ extern void deadman_init(void);
extern void clock_timer_init(void);
extern void clock_realtime_init(void);
extern void clock_highres_init(void);
-extern void pg_init(void);
-extern void pg_cmt_class_init(void);
-extern void pg_cpu0_init(void);
extern void clock_tick_mp_init(void);
extern void callout_mp_init(void);
extern void cpu_seq_tbl_init(void);
@@ -214,9 +211,6 @@ void (*init_tbl[])(void) = {
segvn_init,
flk_init,
cpu_seq_tbl_init,
- pg_init,
- pg_cmt_class_init,
- pg_cpu0_init,
schedctl_init,
fdb_init,
deadman_init,
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 0fef28ff15..06c349c9b2 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -39,6 +39,7 @@
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
+#include <sys/cpu_pm.h>
/*
* CMT scheduler / dispatcher support
@@ -58,11 +59,12 @@
*
* The scheduler/dispatcher leverages knowledge of the performance
* relevant CMT sharing relationships existing between cpus to implement
- * optimized affinity and load balancing policies.
+ * optimized affinity, load balancing, and coalescence policies.
*
* Load balancing policy seeks to improve performance by minimizing
- * contention over shared processor resources / facilities, while the
- * affinity policies seek to improve cache and TLB utilization.
+ * contention over shared processor resources / facilities. Affinity
+ * policies seek to improve cache and TLB utilization. Coalescence
+ * policies improve resource utilization and ultimately power efficiency.
*
* The CMT PGs created by this class are already arranged into a
* hierarchy (which is done in the pghw layer). To implement the top-down
@@ -79,25 +81,24 @@
* balancng across the CMT PGs within their respective (per lgroup) top level
* groups.
*/
-typedef struct cmt_lgrp {
- group_t cl_pgs; /* Top level group of active CMT PGs */
- int cl_npgs; /* # of top level PGs in the lgroup */
- lgrp_handle_t cl_hand; /* lgroup's platform handle */
- struct cmt_lgrp *cl_next; /* next cmt_lgrp */
-} cmt_lgrp_t;
-
static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */
static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */
/* used for null_proc_lpa */
-static cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
+cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
static int is_cpu0 = 1; /* true if this is boot CPU context */
/*
+ * Array of hardware sharing relationships that are blacklisted.
+ * PGs won't be instantiated for blacklisted hardware sharing relationships.
+ */
+static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
+
+/*
* Set this to non-zero to disable CMT scheduling
* This must be done via kmdb -d, as /etc/system will be too late
*/
-static int cmt_sched_disabled = 0;
+int cmt_sched_disabled = 0;
static pg_cid_t pg_cmt_class_id; /* PG class id */
@@ -109,11 +110,19 @@ static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
-static void pg_cmt_hier_pack(void **, int);
+static char *pg_cmt_policy_name(pg_t *);
+static void pg_cmt_hier_sort(pg_cmt_t **, int);
+static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
+static int pg_cmt_lineage_validate(pg_cmt_t **, int *);
+static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
+ kthread_t *, kthread_t *);
+static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
+ kthread_t *, kthread_t *);
+static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
/*
* Macro to test if PG is managed by the CMT PG class
@@ -121,6 +130,29 @@ static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
#define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
/*
+ * Status codes for CMT lineage validation
+ * See pg_cmt_lineage_validate() below
+ */
+typedef enum cmt_lineage_validation {
+ CMT_LINEAGE_VALID,
+ CMT_LINEAGE_NON_CONCENTRIC,
+ CMT_LINEAGE_REPAIRED,
+ CMT_LINEAGE_UNRECOVERABLE
+} cmt_lineage_validation_t;
+
+/*
+ * Status of the current lineage under construction.
+ * One must be holding cpu_lock to change this.
+ */
+static cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID;
+
+/*
+ * Power domain definitions (on x86) are defined by ACPI, and
+ * therefore may be subject to BIOS bugs.
+ */
+#define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw)
+
+/*
* CMT PG ops
*/
struct pg_ops pg_ops_cmt = {
@@ -134,6 +166,7 @@ struct pg_ops pg_ops_cmt = {
NULL, /* cpupart_out */
pg_cmt_cpupart_move,
pg_cmt_cpu_belongs,
+ pg_cmt_policy_name,
};
/*
@@ -156,25 +189,8 @@ pg_cmt_class_init(void)
void
pg_cmt_cpu_startup(cpu_t *cp)
{
- PG_NRUN_UPDATE(cp, 1);
-}
-
-/*
- * Adjust the CMT load in the CMT PGs in which the CPU belongs
- * Note that "n" can be positive in the case of increasing
- * load, or negative in the case of decreasing load.
- */
-void
-pg_cmt_load(cpu_t *cp, int n)
-{
- pg_cmt_t *pg;
-
- pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
- while (pg != NULL) {
- ASSERT(IS_CMT_PG(pg));
- atomic_add_32(&pg->cmt_nrunning, n);
- pg = pg->cmt_parent;
- }
+ pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
+ cp->cpu_thread);
}
/*
@@ -212,14 +228,219 @@ pg_cmt_free(pg_t *pg)
}
/*
- * Return 1 if CMT scheduling policies should be impelmented
- * for the specified hardware sharing relationship.
+ * Given a hardware sharing relationship, return which dispatcher
+ * policies should be implemented to optimize performance and efficiency
*/
-static int
-pg_cmt_hw(pghw_type_t hw)
+static pg_cmt_policy_t
+pg_cmt_policy(pghw_type_t hw)
+{
+ pg_cmt_policy_t p;
+
+ /*
+ * Give the platform a chance to override the default
+ */
+ if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
+ return (p);
+
+ switch (hw) {
+ case PGHW_IPIPE:
+ case PGHW_FPU:
+ case PGHW_CHIP:
+ return (CMT_BALANCE);
+ case PGHW_CACHE:
+ return (CMT_AFFINITY);
+ case PGHW_POW_ACTIVE:
+ case PGHW_POW_IDLE:
+ return (CMT_BALANCE);
+ default:
+ return (CMT_NO_POLICY);
+ }
+}
+
+/*
+ * Rank the importance of optimizing for the pg1 relationship vs.
+ * the pg2 relationship.
+ */
+static pg_cmt_t *
+pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
+{
+ pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
+ pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
+
+ /*
+ * A power domain is only important if CPUPM is enabled.
+ */
+ if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
+ if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
+ return (pg2);
+ if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
+ return (pg1);
+ }
+
+ /*
+ * Otherwise, ask the platform
+ */
+ if (pg_plat_hw_rank(hw1, hw2) == hw1)
+ return (pg1);
+ else
+ return (pg2);
+}
+
+/*
+ * Initialize CMT callbacks for the given PG
+ */
+static void
+cmt_callback_init(pg_t *pg)
{
- return (pg_plat_cmt_load_bal_hw(hw) ||
- pg_plat_cmt_affinity_hw(hw));
+ switch (((pghw_t *)pg)->pghw_hw) {
+ case PGHW_POW_ACTIVE:
+ pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
+ pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
+ break;
+ default:
+ pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
+
+ }
+}
+
+/*
+ * Promote PG above its current parent.
+ * This is only legal if PG has an equal or greater number of CPUs
+ * than its parent.
+ */
+static void
+cmt_hier_promote(pg_cmt_t *pg)
+{
+ pg_cmt_t *parent;
+ group_t *children;
+ cpu_t *cpu;
+ group_iter_t iter;
+ pg_cpu_itr_t cpu_iter;
+ int r;
+ int err;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ parent = pg->cmt_parent;
+ if (parent == NULL) {
+ /*
+ * Nothing to do
+ */
+ return;
+ }
+
+ ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
+
+ /*
+ * We're changing around the hierarchy, which is actively traversed
+ * by the dispatcher. Pause CPUs to ensure exclusivity.
+ */
+ pause_cpus(NULL);
+
+ /*
+ * If necessary, update the parent's sibling set, replacing parent
+ * with PG.
+ */
+ if (parent->cmt_siblings) {
+ if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
+ != -1) {
+ r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+ }
+
+ /*
+ * If the parent is at the top of the hierarchy, replace its entry
+ * in the root lgroup's group of top level PGs.
+ */
+ if (parent->cmt_parent == NULL &&
+ parent->cmt_siblings != &cmt_root->cl_pgs) {
+ if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
+ != -1) {
+ r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+ }
+
+ /*
+ * We assume (and therefore assert) that the PG being promoted is an
+ * only child of its parent. Update the parent's children set
+ * replacing PG's entry with the parent (since the parent is becoming
+ * the child). Then have PG and the parent swap children sets.
+ */
+ ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
+ if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
+ r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+
+ children = pg->cmt_children;
+ pg->cmt_children = parent->cmt_children;
+ parent->cmt_children = children;
+
+ /*
+ * Update the sibling references for PG and its parent
+ */
+ pg->cmt_siblings = parent->cmt_siblings;
+ parent->cmt_siblings = pg->cmt_children;
+
+ /*
+ * Update any cached lineages in the per CPU pg data.
+ */
+ PG_CPU_ITR_INIT(pg, cpu_iter);
+ while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
+ int idx;
+ group_t *pgs;
+ pg_cmt_t *cpu_pg;
+
+ /*
+ * Iterate over the CPU's PGs updating the children
+ * of the PG being promoted, since they have a new parent.
+ */
+ pgs = &cpu->cpu_pg->pgs;
+ group_iter_init(&iter);
+ while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
+ if (cpu_pg->cmt_parent == pg) {
+ cpu_pg->cmt_parent = parent;
+ }
+ }
+
+ /*
+ * Update the CMT load balancing lineage
+ */
+ pgs = &cpu->cpu_pg->cmt_pgs;
+ if ((idx = group_find(pgs, (void *)pg)) == -1) {
+ /*
+ * Unless this is the CPU whose lineage is being
+ * constructed, the PG being promoted should be
+ * in the lineage.
+ */
+ ASSERT(GROUP_SIZE(pgs) == 0);
+ continue;
+ }
+
+ ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
+ ASSERT(idx > 0);
+
+ /*
+ * Have the child and the parent swap places in the CPU's
+ * lineage
+ */
+ group_remove_at(pgs, idx);
+ group_remove_at(pgs, idx - 1);
+ err = group_add_at(pgs, parent, idx);
+ ASSERT(err == 0);
+ err = group_add_at(pgs, pg, idx - 1);
+ ASSERT(err == 0);
+ }
+
+ /*
+ * Update the parent references for PG and its parent
+ */
+ pg->cmt_parent = parent->cmt_parent;
+ parent->cmt_parent = pg;
+
+ start_cpus();
}
/*
@@ -230,7 +451,7 @@ pg_cmt_cpu_init(cpu_t *cp)
{
pg_cmt_t *pg;
group_t *cmt_pgs;
- int level, max_level, nlevels;
+ int levels, level;
pghw_type_t hw;
pg_t *pg_cache = NULL;
pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
@@ -239,26 +460,42 @@ pg_cmt_cpu_init(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
/*
* A new CPU is coming into the system.
* Interrogate the platform to see if the CPU
- * has any performance relevant CMT sharing
- * relationships
+ * has any performance or efficiency relevant
+ * sharing relationships
*/
cmt_pgs = &cp->cpu_pg->cmt_pgs;
cp->cpu_pg->cmt_lineage = NULL;
bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
- max_level = nlevels = 0;
+ levels = 0;
for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
+ pg_cmt_policy_t policy;
+
/*
- * We're only interested in CMT hw sharing relationships
+ * We're only interested in the hw sharing relationships
+ * for which we know how to optimize.
*/
- if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
+ policy = pg_cmt_policy(hw);
+ if (policy == CMT_NO_POLICY ||
+ pg_plat_hw_shared(cp, hw) == 0)
continue;
/*
+ * Continue if the hardware sharing relationship has been
+ * blacklisted.
+ */
+ if (cmt_hw_blacklisted[hw]) {
+ continue;
+ }
+
+ /*
* Find (or create) the PG associated with
* the hw sharing relationship in which cp
* belongs.
@@ -281,6 +518,11 @@ pg_cmt_cpu_init(cpu_t *cp)
* ... and CMT specific portions of the
* structure.
*/
+ pg->cmt_policy = policy;
+
+ /* CMT event callbacks */
+ cmt_callback_init((pg_t *)pg);
+
bitset_init(&pg->cmt_cpus_actv_set);
group_create(&pg->cmt_cpus_actv);
} else {
@@ -303,14 +545,10 @@ pg_cmt_cpu_init(cpu_t *cp)
}
/*
- * Build a lineage of CMT PGs for load balancing
+ * Build a lineage of CMT PGs for load balancing / coalescence
*/
- if (pg_plat_cmt_load_bal_hw(hw)) {
- level = pghw_level(hw);
- cpu_cmt_hier[level] = pg;
- if (level > max_level)
- max_level = level;
- nlevels++;
+ if (policy & (CMT_BALANCE | CMT_COALESCE)) {
+ cpu_cmt_hier[levels++] = pg;
}
/* Cache this for later */
@@ -318,44 +556,73 @@ pg_cmt_cpu_init(cpu_t *cp)
pg_cache = (pg_t *)pg;
}
- /*
- * Pack out any gaps in the constructed lineage,
- * then size it out.
- *
- * Gaps may exist where the architecture knows
- * about a hardware sharing relationship, but such a
- * relationship either isn't relevant for load
- * balancing or doesn't exist between CPUs on the system.
- */
- pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
- group_expand(cmt_pgs, nlevels);
-
+ group_expand(cmt_pgs, levels);
if (cmt_root == NULL)
cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
/*
- * Find the lgrp that encapsulates this CPU's CMT hierarchy.
- * and locate/create a suitable cmt_lgrp_t.
+ * Find the lgrp that encapsulates this CPU's CMT hierarchy
*/
lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
lgrp = pg_cmt_lgrp_create(lgrp_handle);
/*
+ * Ascendingly sort the PGs in the lineage by number of CPUs
+ */
+ pg_cmt_hier_sort(cpu_cmt_hier, levels);
+
+ /*
+ * Examine the lineage and validate it.
+ * This routine will also try to fix the lineage along with the
+ * rest of the PG hierarchy should it detect an issue.
+ *
+ * If it returns -1, an unrecoverable error has happened and we
+ * need to return.
+ */
+ if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
+ return;
+
+ /*
+ * For existing PGs in the lineage, verify that the parent is
+ * correct, as the generation in the lineage may have changed
+ * as a result of the sorting. Start the traversal at the top
+ * of the lineage, moving down.
+ */
+ for (level = levels - 1; level >= 0; ) {
+ int reorg;
+
+ reorg = 0;
+ pg = cpu_cmt_hier[level];
+
+ /*
+ * Promote PGs at an incorrect generation into place.
+ */
+ while (pg->cmt_parent &&
+ pg->cmt_parent != cpu_cmt_hier[level + 1]) {
+ cmt_hier_promote(pg);
+ reorg++;
+ }
+ if (reorg > 0)
+ level = levels - 1;
+ else
+ level--;
+ }
+
+ /*
* For each of the PGs in the CPU's lineage:
- * - Add an entry in the CPU's CMT PG group
- * which is used by the dispatcher to implement load balancing
- * policy.
+ * - Add an entry in the CPU sorted CMT PG group
+ * which is used for top down CMT load balancing
* - Tie the PG into the CMT hierarchy by connecting
* it to it's parent and siblings.
*/
- for (level = 0; level < nlevels; level++) {
+ for (level = 0; level < levels; level++) {
uint_t children;
int err;
pg = cpu_cmt_hier[level];
- err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
+ err = group_add_at(cmt_pgs, pg, levels - level - 1);
ASSERT(err == 0);
if (level == 0)
@@ -371,12 +638,13 @@ pg_cmt_cpu_init(cpu_t *cp)
continue;
}
- if ((level + 1) == nlevels) {
+ if ((level + 1) == levels) {
pg->cmt_parent = NULL;
pg->cmt_siblings = &lgrp->cl_pgs;
children = ++lgrp->cl_npgs;
- cmt_root->cl_npgs++;
+ if (cmt_root != lgrp)
+ cmt_root->cl_npgs++;
} else {
pg->cmt_parent = cpu_cmt_hier[level + 1];
@@ -436,6 +704,9 @@ pg_cmt_cpu_fini(cpu_t *cp)
lgrp_handle_t lgrp_handle;
cmt_lgrp_t *lgrp;
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
cmt_pgs = &cp->cpu_pg->cmt_pgs;
@@ -544,6 +815,9 @@ pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
/*
@@ -576,6 +850,9 @@ pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&pg_iter);
@@ -627,6 +904,9 @@ pg_cmt_cpu_active(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
@@ -648,15 +928,16 @@ pg_cmt_cpu_active(cpu_t *cp)
* for balancing with it's siblings.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
- pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+ (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
/*
* If this is a top level PG, add it as a balancing
- * candidate when balancing within the root lgroup
+ * candidate when balancing within the root lgroup.
*/
- if (pg->cmt_parent == NULL) {
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
err = group_add(&cmt_root->cl_pgs, pg,
GRP_NORESIZE);
ASSERT(err == 0);
@@ -691,6 +972,9 @@ pg_cmt_cpu_inactive(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
@@ -713,11 +997,12 @@ pg_cmt_cpu_inactive(cpu_t *cp)
* load was balanced, remove it as a balancing candidate.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
- pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+ (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
- if (pg->cmt_parent == NULL) {
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
err = group_remove(&cmt_root->cl_pgs, pg,
GRP_NORESIZE);
ASSERT(err == 0);
@@ -776,26 +1061,47 @@ pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
}
/*
- * Hierarchy packing utility routine. The hierarchy order is preserved.
+ * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
*/
static void
-pg_cmt_hier_pack(void *hier[], int sz)
+pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
- int i, j;
-
- for (i = 0; i < sz; i++) {
- if (hier[i] != NULL)
- continue;
+ int i, j, inc;
+ pg_t *tmp;
+ pg_t **h = (pg_t **)hier;
- for (j = i; j < sz; j++) {
- if (hier[j] != NULL) {
- hier[i] = hier[j];
- hier[j] = NULL;
- break;
+ /*
+ * First sort by number of CPUs
+ */
+ inc = size / 2;
+ while (inc > 0) {
+ for (i = inc; i < size; i++) {
+ j = i;
+ tmp = h[i];
+ while ((j >= inc) &&
+ (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
+ h[j] = h[j - inc];
+ j = j - inc;
}
+ h[j] = tmp;
+ }
+ if (inc == 2)
+ inc = 1;
+ else
+ inc = (inc * 5) / 11;
+ }
+
+ /*
+ * Break ties by asking the platform.
+ * Determine if h[i] outranks h[i + 1] and if so, swap them.
+ */
+ for (i = 0; i < size - 1; i++) {
+ if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
+ pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
+ tmp = h[i];
+ h[i] = h[i + 1];
+ h[i + 1] = tmp;
}
- if (j == sz)
- break;
}
}
@@ -840,134 +1146,492 @@ pg_cmt_lgrp_create(lgrp_handle_t hand)
}
/*
- * Perform multi-level CMT load balancing of running threads.
+ * Interfaces to enable and disable power aware dispatching
+ * The caller must be holding cpu_lock.
*
- * tp is the thread being enqueued.
- * cp is a hint CPU, against which CMT load balancing will be performed.
- *
- * Returns cp, or a CPU better than cp with respect to balancing
- * running thread load.
+ * Return 0 on success and -1 on failure.
*/
-cpu_t *
-cmt_balance(kthread_t *tp, cpu_t *cp)
+int
+cmt_pad_enable(pghw_type_t type)
{
- int hint, i, cpu, nsiblings;
- int self = 0;
- group_t *cmt_pgs, *siblings;
- pg_cmt_t *pg, *pg_tmp, *tpg = NULL;
- int pg_nrun, tpg_nrun;
- int level = 0;
- cpu_t *newcp;
+ group_t *hwset;
+ group_iter_t iter;
+ pg_cmt_t *pg;
- ASSERT(THREAD_LOCK_HELD(tp));
+ ASSERT(PGHW_IS_PM_DOMAIN(type));
+ ASSERT(MUTEX_HELD(&cpu_lock));
- cmt_pgs = &cp->cpu_pg->cmt_pgs;
+ if ((hwset = pghw_set_lookup(type)) == NULL ||
+ cmt_hw_blacklisted[type]) {
+ /*
+ * Unable to find any instances of the specified type
+ * of power domain, or the power domains have been blacklisted.
+ */
+ return (-1);
+ }
- if (GROUP_SIZE(cmt_pgs) == 0)
- return (cp); /* nothing to do */
+ /*
+ * Iterate over the power domains, setting the default dispatcher
+ * policy for power/performance optimization.
+ *
+ * Simply setting the policy isn't enough in the case where the power
+ * domain is an only child of another PG. Because the dispatcher walks
+ * the PG hierarchy in a top down fashion, the higher up PG's policy
+ * will dominate. So promote the power domain above its parent if both
+ * PG and its parent have the same CPUs to ensure its policy
+ * dominates.
+ */
+ group_iter_init(&iter);
+ while ((pg = group_iterate(hwset, &iter)) != NULL) {
+ /*
+ * If the power domain is an only child to a parent
+ * not implementing the same policy, promote the child
+ * above the parent to activate the policy.
+ */
+ pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
+ while ((pg->cmt_parent != NULL) &&
+ (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
+ (PG_NUM_CPUS((pg_t *)pg) ==
+ PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
+ cmt_hier_promote(pg);
+ }
+ }
+
+ return (0);
+}
+
+int
+cmt_pad_disable(pghw_type_t type)
+{
+ group_t *hwset;
+ group_iter_t iter;
+ pg_cmt_t *pg;
+ pg_cmt_t *child;
- if (tp == curthread)
- self = 1;
+ ASSERT(PGHW_IS_PM_DOMAIN(type));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ if ((hwset = pghw_set_lookup(type)) == NULL) {
+ /*
+ * Unable to find any instances of the specified type of
+ * power domain.
+ */
+ return (-1);
+ }
/*
- * Balance across siblings in the CPUs CMT lineage
- * If the thread is homed to the root lgroup, perform
- * top level balancing against other top level PGs
- * in the system. Otherwise, start with the default
- * top level siblings group, which is within the leaf lgroup
+ * Iterate over the power domains, setting the default dispatcher
+ * policy for performance optimization (load balancing).
*/
- pg = GROUP_ACCESS(cmt_pgs, level);
- if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
- siblings = &cmt_root->cl_pgs;
- else
- siblings = pg->cmt_siblings;
+ group_iter_init(&iter);
+ while ((pg = group_iterate(hwset, &iter)) != NULL) {
+
+ /*
+ * If the power domain has an only child that implements
+ * a policy other than load balancing, promote the child
+ * above the power domain to ensure its policy dominates.
+ */
+ if (GROUP_SIZE(pg->cmt_children) == 1) {
+ child = GROUP_ACCESS(pg->cmt_children, 0);
+ if ((child->cmt_policy & CMT_BALANCE) == 0) {
+ cmt_hier_promote(child);
+ }
+ }
+ pg->cmt_policy = CMT_BALANCE;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
+ kthread_t *new)
+{
+ pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;
+
+ if (old == cp->cpu_idle_thread) {
+ atomic_add_32(&cmt_pg->cmt_utilization, 1);
+ } else if (new == cp->cpu_idle_thread) {
+ atomic_add_32(&cmt_pg->cmt_utilization, -1);
+ }
+}
+
+/*
+ * Macro to test whether a thread is currently runnable on a CPU in a PG.
+ */
+#define THREAD_RUNNABLE_IN_PG(t, pg) \
+ ((t)->t_state == TS_RUN && \
+ (t)->t_disp_queue->disp_cpu && \
+ bitset_in_set(&(pg)->cmt_cpus_actv_set, \
+ (t)->t_disp_queue->disp_cpu->cpu_seqid))
+
+static void
+cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
+ kthread_t *new)
+{
+ pg_cmt_t *cmt = (pg_cmt_t *)pg;
+ cpupm_domain_t *dom;
+ uint32_t u;
+
+ if (old == cp->cpu_idle_thread) {
+ ASSERT(new != cp->cpu_idle_thread);
+ u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
+ if (u == 1) {
+ /*
+ * Notify the CPU power manager that the domain
+ * is non-idle.
+ */
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, now, dom,
+ CPUPM_DOM_BUSY_FROM_IDLE);
+ }
+ } else if (new == cp->cpu_idle_thread) {
+ ASSERT(old != cp->cpu_idle_thread);
+ u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
+ if (u == 0) {
+ /*
+ * The domain is idle, notify the CPU power
+ * manager.
+ *
+ * Avoid notifying if the thread is simply migrating
+ * between CPUs in the domain.
+ */
+ if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, now, dom,
+ CPUPM_DOM_IDLE_FROM_BUSY);
+ }
+ }
+ }
+}
+
+/* ARGSUSED */
+static void
+cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
+{
+ pg_cmt_t *cmt = (pg_cmt_t *)pg;
+ cpupm_domain_t *dom;
+
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
+}
+
+/*
+ * Return the name of the CMT scheduling policy
+ * being implemented across this PG
+ */
+static char *
+pg_cmt_policy_name(pg_t *pg)
+{
+ pg_cmt_policy_t policy;
+
+ policy = ((pg_cmt_t *)pg)->cmt_policy;
+
+ if (policy & CMT_AFFINITY) {
+ if (policy & CMT_BALANCE)
+ return ("Load Balancing & Affinity");
+ else if (policy & CMT_COALESCE)
+ return ("Load Coalescence & Affinity");
+ else
+ return ("Affinity");
+ } else {
+ if (policy & CMT_BALANCE)
+ return ("Load Balancing");
+ else if (policy & CMT_COALESCE)
+ return ("Load Coalescence");
+ else
+ return ("None");
+ }
+}
+
+/*
+ * Prune PG, and all other instances of PG's hardware sharing relationship
+ * from the PG hierarchy.
+ */
+static int
+pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
+{
+ group_t *hwset, *children;
+ int i, j, r, size = *sz;
+ group_iter_t hw_iter, child_iter;
+ pg_cpu_itr_t cpu_iter;
+ pg_cmt_t *pg, *child;
+ cpu_t *cpu;
+ int cap_needed;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ hw = ((pghw_t *)pg_bad)->pghw_hw;
+
+ if (hw == PGHW_POW_ACTIVE) {
+ cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
+ "Event Based CPUPM Unavailable");
+ } else if (hw == PGHW_POW_IDLE) {
+ cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
+ "Dispatcher assisted CPUPM disabled.");
+ }
/*
- * Traverse down the lineage until we find a level that needs
- * balancing, or we get to the end.
+ * Find and eliminate the PG from the lineage.
*/
- for (;;) {
- nsiblings = GROUP_SIZE(siblings); /* self inclusive */
- if (nsiblings == 1)
- goto next_level;
+ for (i = 0; i < size; i++) {
+ if (lineage[i] == pg_bad) {
+ for (j = i; j < size - 1; j++)
+ lineage[j] = lineage[j + 1];
+ *sz = size - 1;
+ break;
+ }
+ }
- pg_nrun = pg->cmt_nrunning;
- if (self &&
- bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
- pg_nrun--; /* Ignore curthread's effect */
+ /*
+ * We'll prune all instances of the hardware sharing relationship
+ * represented by pg. But before we do that (and pause CPUs) we need
+ * to ensure the hierarchy's groups are properly sized.
+ */
+ hwset = pghw_set_lookup(hw);
- hint = CPU_PSEUDO_RANDOM() % nsiblings;
+ /*
+ * Blacklist the hardware so that future groups won't be created.
+ */
+ cmt_hw_blacklisted[hw] = 1;
+ /*
+ * For each of the PGs being pruned, ensure sufficient capacity in
+ * the siblings set for the PG's children
+ */
+ group_iter_init(&hw_iter);
+ while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
/*
- * Find a balancing candidate from among our siblings
- * "hint" is a hint for where to start looking
+ * PG is being pruned, but if it is bringing up more than
+ * one child, ask for more capacity in the siblings group.
*/
- i = hint;
- do {
- ASSERT(i < nsiblings);
- pg_tmp = GROUP_ACCESS(siblings, i);
+ cap_needed = 0;
+ if (pg->cmt_children &&
+ GROUP_SIZE(pg->cmt_children) > 1) {
+ cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
+
+ group_expand(pg->cmt_siblings,
+ GROUP_SIZE(pg->cmt_siblings) + cap_needed);
/*
- * The candidate must not be us, and must
- * have some CPU resources in the thread's
- * partition
+ * If this is a top level group, also ensure the
+ * capacity in the root lgrp level CMT grouping.
*/
- if (pg_tmp != pg &&
- bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
- ((pg_t *)pg_tmp)->pg_id)) {
- tpg = pg_tmp;
- break;
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
+ group_expand(&cmt_root->cl_pgs,
+ GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
}
+ }
+ }
- if (++i >= nsiblings)
- i = 0;
- } while (i != hint);
+ /*
+ * We're operating on the PG hierarchy. Pause CPUs to ensure
+ * exclusivity with respect to the dispatcher.
+ */
+ pause_cpus(NULL);
- if (!tpg)
- goto next_level; /* no candidates at this level */
+ /*
+ * Prune all PG instances of the hardware sharing relationship
+ * represented by pg.
+ */
+ group_iter_init(&hw_iter);
+ while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
/*
- * Check if the balancing target is underloaded
- * Decide to balance if the target is running fewer
- * threads, or if it's running the same number of threads
- * with more online CPUs
+ * Remove PG from its group of siblings, if it's there.
*/
- tpg_nrun = tpg->cmt_nrunning;
- if (pg_nrun > tpg_nrun ||
- (pg_nrun == tpg_nrun &&
- (GROUP_SIZE(&tpg->cmt_cpus_actv) >
- GROUP_SIZE(&pg->cmt_cpus_actv)))) {
- break;
+ if (pg->cmt_siblings) {
+ (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
+ }
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
+ (void) group_remove(&cmt_root->cl_pgs, pg,
+ GRP_NORESIZE);
+ }
+ /*
+ * Add PG's children to its group of siblings.
+ */
+ if (pg->cmt_children != NULL) {
+ children = pg->cmt_children;
+
+ group_iter_init(&child_iter);
+ while ((child = group_iterate(children, &child_iter))
+ != NULL) {
+ /*
+ * Transplant child from its siblings set to
+ * PG's.
+ */
+ if (pg->cmt_siblings != NULL &&
+ child->cmt_siblings != NULL &&
+ group_remove(child->cmt_siblings, child,
+ GRP_NORESIZE) != -1) {
+ r = group_add(pg->cmt_siblings, child,
+ GRP_NORESIZE);
+ ASSERT(r == 0);
+ }
+ }
}
- tpg = NULL;
-next_level:
- if (++level == GROUP_SIZE(cmt_pgs))
- break;
+ /*
+ * Reset the callbacks to the defaults
+ */
+ pg_callback_set_defaults((pg_t *)pg);
+
+ /*
+ * Update all the CPU lineages in each of PG's CPUs
+ */
+ PG_CPU_ITR_INIT(pg, cpu_iter);
+ while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
+ group_t *pgs;
+ pg_cmt_t *cpu_pg;
+ group_iter_t liter; /* Iterator for the lineage */
- pg = GROUP_ACCESS(cmt_pgs, level);
- siblings = pg->cmt_siblings;
+ /*
+ * Iterate over the CPU's PGs updating the children
+ * of the PG being promoted, since they have a new
+ * parent and siblings set.
+ */
+ pgs = &cpu->cpu_pg->pgs;
+ group_iter_init(&liter);
+ while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
+ if (cpu_pg->cmt_parent == pg) {
+ cpu_pg->cmt_parent = pg->cmt_parent;
+ cpu_pg->cmt_siblings = pg->cmt_siblings;
+ }
+ }
+
+ /*
+ * Update the CPU's lineages
+ */
+ pgs = &cpu->cpu_pg->cmt_pgs;
+ (void) group_remove(pgs, pg, GRP_NORESIZE);
+ pgs = &cpu->cpu_pg->pgs;
+ (void) group_remove(pgs, pg, GRP_NORESIZE);
+ }
}
+ start_cpus();
+ return (0);
+}
+
+/*
+ * Disable CMT scheduling
+ */
+static void
+pg_cmt_disable(void)
+{
+ cpu_t *cpu;
+
+ pause_cpus(NULL);
+ cpu = cpu_list;
+
+ do {
+ if (cpu->cpu_pg)
+ group_empty(&cpu->cpu_pg->cmt_pgs);
+ } while ((cpu = cpu->cpu_next) != cpu_list);
+
+ cmt_sched_disabled = 1;
+ start_cpus();
+ cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
+}
+
+static int
+pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
+{
+ int i, size;
+ pg_cmt_t *pg, *parent, *pg_bad;
+ cpu_t *cp;
+ pg_cpu_itr_t cpu_iter;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+revalidate:
+ size = *sz;
+ pg_bad = NULL;
+ for (i = 0; i < size - 1; i++) {
- if (tpg) {
- uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
+ pg = lineage[i];
+ parent = lineage[i + 1];
/*
- * Select an idle CPU from the target
+ * We assume that the lineage has already been sorted
+ * by the number of CPUs. In fact, we depend on it.
*/
- hint = CPU_PSEUDO_RANDOM() % tgt_size;
- cpu = hint;
- do {
- newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
- if (newcp->cpu_part == tp->t_cpupart &&
- newcp->cpu_dispatch_pri == -1) {
- cp = newcp;
- break;
+ ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));
+
+ /*
+ * Walk each of the CPUs in the PGs group, and verify that
+ * the next larger PG contains at least the CPUs in this one.
+ */
+ PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
+ cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
+ goto handle_error;
}
- if (++cpu == tgt_size)
- cpu = 0;
- } while (cpu != hint);
+ }
}
- return (cp);
+handle_error:
+ switch (cmt_lineage_status) {
+ case CMT_LINEAGE_VALID:
+ case CMT_LINEAGE_REPAIRED:
+ break;
+ case CMT_LINEAGE_NON_CONCENTRIC:
+ /*
+ * We've detected a non-concentric PG lineage.
+ *
+ * This can happen when some of the CPU grouping information
+ * is derived from buggy sources (for example, incorrect ACPI
+ * tables on x86 systems).
+ *
+ * We attempt to recover from this by pruning out the
+ * illegal groupings from the PG hierarchy, which means that
+ * we won't optimize for those levels, but we will for the
+ * remaining ones.
+ *
+ * If a given level has CPUs not found in its parent, then
+ * we examine the PG and its parent to see if either grouping
+ * is enumerated from potentially buggy sources.
+ *
+ * If one has fewer CPUs than the other, and contains CPUs
+ * not found in the parent, and it is an untrusted enumeration,
+ * then prune it. If both have the same number of CPUs, then
+ * prune the one that is untrusted.
+ *
+ * This process repeats until we have a concentric lineage,
+ * or we would have to prune out a level derived from what we
+ * thought was a reliable source, in which case CMT scheduling
+ * is disabled altogether.
+ */
+ if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
+ (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
+ pg_bad = pg;
+ } else if (PG_NUM_CPUS((pg_t *)pg) ==
+ PG_NUM_CPUS((pg_t *)parent)) {
+ if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
+ pg_bad = parent;
+ } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
+ pg_bad = pg;
+ }
+ }
+ if (pg_bad) {
+ if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
+ cmt_lineage_status = CMT_LINEAGE_REPAIRED;
+ goto revalidate;
+ }
+ }
+ /*FALLTHROUGH*/
+ default:
+ /*
+ * If we're here, something has gone wrong in trying to
+ * recover from an illegal PG hierarchy, or we've encountered
+ * a validation error for which we don't know how to recover.
+ * In this case, disable CMT scheduling altogether.
+ */
+ pg_cmt_disable();
+ cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
+ return (-1);
+ }
+ return (0);
}
diff --git a/usr/src/uts/common/disp/cmt_policy.c b/usr/src/uts/common/disp/cmt_policy.c
new file mode 100644
index 0000000000..e3c00d2bc5
--- /dev/null
+++ b/usr/src/uts/common/disp/cmt_policy.c
@@ -0,0 +1,229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupart.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/group.h>
+#include <sys/bitset.h>
+#include <sys/lgrp.h>
+#include <sys/cmt.h>
+
+/*
+ * CMT dispatcher policies
+ *
+ * This file implements CMT dispatching policies using Processor Groups.
+ *
+ * The scheduler/dispatcher leverages knowledge of the performance
+ * relevant CMT sharing relationships existing between CPUs to implement
+ * load balancing and coalescence thread placement policies.
+ *
+ * Load balancing policy seeks to improve performance by minimizing
+ * contention over shared processor resources / facilities. Coalescence
+ * policies improve resource utilization and ultimately power efficiency.
+ *
+ * On NUMA systems, the dispatcher will generally perform load balancing and
+ * coalescence within (and not across) lgroups. This is because there isn't
+ * much sense in trying to correct an imbalance by sending a thread outside
+ * of its home, if it would attempt to return home a short while later.
+ * The dispatcher will implement CMT policy across lgroups however, if
+ * it can do so with a thread homed to the root lgroup, since root homed
+ * threads have no lgroup affinity.
+ */
+
+/*
+ * Return non-zero if, given the policy, we should migrate from running
+ * somewhere "here" to somewhere "there".
+ */
+static int
+cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
+ int self)
+{
+ uint32_t here_util, there_util;
+
+ here_util = here->cmt_utilization;
+ there_util = there->cmt_utilization;
+
+ /*
+ * This assumes that curthread's utilization is "1"
+ */
+ if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
+ here_util--; /* Ignore curthread's effect */
+
+ /*
+ * Load balancing and coalescence are conflicting policies
+ */
+ ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
+ (CMT_BALANCE|CMT_COALESCE));
+
+ if (policy & CMT_BALANCE) {
+ /*
+ * Balance utilization
+ *
+ * If the target is comparatively underutilized
+ * (either in an absolute sense, or scaled by capacity),
+ * then choose to balance.
+ */
+ if ((here_util > there_util) ||
+ (here_util == there_util &&
+ (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
+ return (1);
+ }
+ } else if (policy & CMT_COALESCE) {
+ /*
+ * Attempt to drive group utilization up to capacity
+ */
+ if (there_util > here_util &&
+ there_util < CMT_CAPACITY(there))
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Perform multi-level CMT load balancing of running threads.
+ *
+ * tp is the thread being enqueued.
+ * cp is a hint CPU, against which CMT load balancing will be performed.
+ *
+ * Returns cp, or a CPU better than cp with respect to balancing
+ * running thread load.
+ */
+cpu_t *
+cmt_balance(kthread_t *tp, cpu_t *cp)
+{
+ int hint, i, cpu, nsiblings;
+ int self = 0;
+ group_t *cmt_pgs, *siblings;
+ pg_cmt_t *pg, *pg_tmp, *tpg = NULL;
+ int level = 0;
+ cpu_t *newcp;
+ extern cmt_lgrp_t *cmt_root;
+
+ ASSERT(THREAD_LOCK_HELD(tp));
+
+ cmt_pgs = &cp->cpu_pg->cmt_pgs;
+
+ if (GROUP_SIZE(cmt_pgs) == 0)
+ return (cp); /* nothing to do */
+
+ if (tp == curthread)
+ self = 1;
+
+ /*
+ * Balance across siblings in the CPUs CMT lineage
+ * If the thread is homed to the root lgroup, perform
+ * top level balancing against other top level PGs
+ * in the system. Otherwise, start with the default
+ * top level siblings group, which is within the leaf lgroup
+ */
+ pg = GROUP_ACCESS(cmt_pgs, level);
+ if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
+ siblings = &cmt_root->cl_pgs;
+ else
+ siblings = pg->cmt_siblings;
+
+ /*
+ * Traverse down the lineage until we find a level that needs
+ * balancing, or we get to the end.
+ */
+ for (;;) {
+ nsiblings = GROUP_SIZE(siblings); /* self inclusive */
+ if (nsiblings == 1)
+ goto next_level;
+
+ hint = CPU_PSEUDO_RANDOM() % nsiblings;
+
+ /*
+ * Find a balancing candidate from among our siblings
+ * "hint" is a hint for where to start looking
+ */
+ i = hint;
+ do {
+ ASSERT(i < nsiblings);
+ pg_tmp = GROUP_ACCESS(siblings, i);
+
+ /*
+ * The candidate must not be us, and must
+ * have some CPU resources in the thread's
+ * partition
+ */
+ if (pg_tmp != pg &&
+ bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
+ ((pg_t *)pg_tmp)->pg_id)) {
+ tpg = pg_tmp;
+ break;
+ }
+
+ if (++i >= nsiblings)
+ i = 0;
+ } while (i != hint);
+
+ if (!tpg)
+ goto next_level; /* no candidates at this level */
+
+ /*
+ * Decide if we should migrate from the current PG to a
+ * target PG given a policy
+ */
+ if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
+ break;
+ tpg = NULL;
+
+next_level:
+ if (++level == GROUP_SIZE(cmt_pgs))
+ break;
+
+ pg = GROUP_ACCESS(cmt_pgs, level);
+ siblings = pg->cmt_siblings;
+ }
+
+ if (tpg) {
+ uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
+
+ /*
+ * Select an idle CPU from the target
+ */
+ hint = CPU_PSEUDO_RANDOM() % tgt_size;
+ cpu = hint;
+ do {
+ newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
+ if (newcp->cpu_part == tp->t_cpupart &&
+ newcp->cpu_dispatch_pri == -1) {
+ cp = newcp;
+ break;
+ }
+ if (++cpu == tgt_size)
+ cpu = 0;
+ } while (cpu != hint);
+ }
+
+ return (cp);
+}
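As a quick illustration of the policy encoded in cmt_should_migrate() above, here is a minimal user-level model. It is a sketch, not the kernel code: the utilization and capacity values are invented, and the capacity parameters stand in for CMT_CAPACITY(), assumed here to be the group's active-CPU count.

/*
 * Minimal user-level model of the migration decision made by
 * cmt_should_migrate() above (illustrative values only).
 */
#include <stdio.h>

#define	CMT_BALANCE	0x1
#define	CMT_COALESCE	0x2

static int
should_migrate(unsigned int here_util, unsigned int here_cap,
    unsigned int there_util, unsigned int there_cap, int policy)
{
	if (policy & CMT_BALANCE) {
		/*
		 * Balance: move if the target is less utilized, or equally
		 * utilized but has more capacity.
		 */
		return (here_util > there_util ||
		    (here_util == there_util && there_cap > here_cap));
	}
	if (policy & CMT_COALESCE) {
		/* Coalesce: pack onto the busier group while it has headroom. */
		return (there_util > here_util && there_util < there_cap);
	}
	return (0);
}

int
main(void)
{
	/* Three runnable threads here vs. one there: balancing migrates. */
	(void) printf("balance:  %d\n", should_migrate(3, 4, 1, 4, CMT_BALANCE));
	/* Coalescence moves toward the busier, not-yet-full group. */
	(void) printf("coalesce: %d\n", should_migrate(1, 4, 2, 4, CMT_COALESCE));
	return (0);
}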
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 458792c7f8..b3f6efeb2e 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -890,11 +890,10 @@ swtch()
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
if (next != t) {
- if (t == cp->cpu_idle_thread) {
- PG_NRUN_UPDATE(cp, 1);
- } else if (next == cp->cpu_idle_thread) {
- PG_NRUN_UPDATE(cp, -1);
- }
+ hrtime_t now;
+
+ now = gethrtime_unscaled();
+ pg_ev_thread_swtch(cp, now, t, next);
/*
* If t was previously in the TS_ONPROC state,
@@ -904,7 +903,7 @@ swtch()
* queue.
*/
if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
- t->t_waitrq = gethrtime_unscaled();
+ t->t_waitrq = now;
}
/*
@@ -929,6 +928,8 @@ swtch()
if (t->t_flag & T_INTR_THREAD)
cpu_intr_swtch_exit(t);
+ pg_ev_thread_remain(cp, t);
+
DTRACE_SCHED(remain__cpu);
TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
(void) spl0();
@@ -960,8 +961,7 @@ swtch_from_zombie()
ASSERT(next != curthread);
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
- if (next == cpu->cpu_idle_thread)
- PG_NRUN_UPDATE(cpu, -1);
+ pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
restore_mstate(next);
@@ -1055,6 +1055,7 @@ void
swtch_to(kthread_t *next)
{
cpu_t *cp = CPU;
+ hrtime_t now;
TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
@@ -1065,8 +1066,8 @@ swtch_to(kthread_t *next)
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
- if (curthread == cp->cpu_idle_thread)
- PG_NRUN_UPDATE(cp, 1);
+ now = gethrtime_unscaled();
+ pg_ev_thread_swtch(cp, now, curthread, next);
/* OK to steal anything left on run queue */
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
@@ -1081,7 +1082,7 @@ swtch_to(kthread_t *next)
* queue.
*/
if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
- curthread->t_waitrq = gethrtime_unscaled();
+ curthread->t_waitrq = now;
}
/* restore next thread to previously running microstate */
@@ -1098,8 +1099,6 @@ swtch_to(kthread_t *next)
*/
}
-
-
#define CPU_IDLING(pri) ((pri) == -1)
static void
diff --git a/usr/src/uts/common/io/cpudrv.c b/usr/src/uts/common/io/cpudrv.c
index 6f329fad4c..8314c5df43 100644
--- a/usr/src/uts/common/io/cpudrv.c
+++ b/usr/src/uts/common/io/cpudrv.c
@@ -43,7 +43,7 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
-
+#include <sys/epm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/cpudrv_mach.h>
@@ -110,23 +110,25 @@ static struct modlinkage modlinkage = {
/*
* Function prototypes
*/
-static int cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp);
-static void cpudrv_pm_free(cpudrv_devstate_t *cpudsp);
-static int cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp);
-static void cpudrv_pm_monitor_disp(void *arg);
-static void cpudrv_pm_monitor(void *arg);
+static int cpudrv_init(cpudrv_devstate_t *cpudsp);
+static void cpudrv_free(cpudrv_devstate_t *cpudsp);
+static int cpudrv_comp_create(cpudrv_devstate_t *cpudsp);
+static void cpudrv_monitor_disp(void *arg);
+static void cpudrv_monitor(void *arg);
/*
* Driver global variables
*/
uint_t cpudrv_debug = 0;
void *cpudrv_state;
-static uint_t cpudrv_pm_idle_hwm = CPUDRV_PM_IDLE_HWM;
-static uint_t cpudrv_pm_idle_lwm = CPUDRV_PM_IDLE_LWM;
-static uint_t cpudrv_pm_idle_buf_zone = CPUDRV_PM_IDLE_BUF_ZONE;
-static uint_t cpudrv_pm_idle_bhwm_cnt_max = CPUDRV_PM_IDLE_BHWM_CNT_MAX;
-static uint_t cpudrv_pm_idle_blwm_cnt_max = CPUDRV_PM_IDLE_BLWM_CNT_MAX;
-static uint_t cpudrv_pm_user_hwm = CPUDRV_PM_USER_HWM;
+static uint_t cpudrv_idle_hwm = CPUDRV_IDLE_HWM;
+static uint_t cpudrv_idle_lwm = CPUDRV_IDLE_LWM;
+static uint_t cpudrv_idle_buf_zone = CPUDRV_IDLE_BUF_ZONE;
+static uint_t cpudrv_idle_bhwm_cnt_max = CPUDRV_IDLE_BHWM_CNT_MAX;
+static uint_t cpudrv_idle_blwm_cnt_max = CPUDRV_IDLE_BLWM_CNT_MAX;
+static uint_t cpudrv_user_hwm = CPUDRV_USER_HWM;
+
+boolean_t cpudrv_enabled = B_TRUE;
/*
* cpudrv_direct_pm allows user applications to directly control the
@@ -154,13 +156,13 @@ int cpudrv_direct_pm = 0;
* Arranges for the handler function to be called at the interval suitable
* for current speed.
*/
-#define CPUDRV_PM_MONITOR_INIT(cpudsp) { \
- if (CPUDRV_PM_POWER_ENABLED(cpudsp)) { \
+#define CPUDRV_MONITOR_INIT(cpudsp) { \
+ if (cpudrv_is_enabled(cpudsp)) { \
ASSERT(mutex_owned(&(cpudsp)->lock)); \
(cpudsp)->cpudrv_pm.timeout_id = \
- timeout(cpudrv_pm_monitor_disp, \
+ timeout(cpudrv_monitor_disp, \
(cpudsp), (((cpudsp)->cpudrv_pm.cur_spd == NULL) ? \
- CPUDRV_PM_QUANT_CNT_OTHR : \
+ CPUDRV_QUANT_CNT_OTHR : \
(cpudsp)->cpudrv_pm.cur_spd->quant_cnt)); \
} \
}
@@ -168,7 +170,7 @@ int cpudrv_direct_pm = 0;
/*
* Arranges for the handler function not to be called back.
*/
-#define CPUDRV_PM_MONITOR_FINI(cpudsp) { \
+#define CPUDRV_MONITOR_FINI(cpudsp) { \
timeout_id_t tmp_tid; \
ASSERT(mutex_owned(&(cpudsp)->lock)); \
tmp_tid = (cpudsp)->cpudrv_pm.timeout_id; \
@@ -203,7 +205,7 @@ _init(void)
/*
* Callbacks used by the PPM driver.
*/
- CPUDRV_PM_SET_PPM_CALLBACKS();
+ CPUDRV_SET_PPM_CALLBACKS();
return (error);
}
@@ -242,13 +244,13 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
case DDI_ATTACH:
DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
"DDI_ATTACH called\n", instance));
- if (CPUDRV_PM_DISABLED())
+ if (!cpudrv_is_enabled(NULL))
return (DDI_FAILURE);
if (ddi_soft_state_zalloc(cpudrv_state, instance) !=
DDI_SUCCESS) {
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't allocate state", instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) ==
@@ -256,7 +258,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't get state", instance);
ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
cpudsp->dip = dip;
@@ -264,36 +266,36 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/*
* Find CPU number for this dev_info node.
*/
- if (!cpudrv_pm_get_cpu_id(dip, &(cpudsp->cpu_id))) {
+ if (!cpudrv_get_cpu_id(dip, &(cpudsp->cpu_id))) {
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't convert dip to cpu_id", instance);
ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
- if (!cpudrv_mach_pm_init(cpudsp)) {
- ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ if (!cpudrv_mach_init(cpudsp)) {
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
+
mutex_init(&cpudsp->lock, NULL, MUTEX_DRIVER, NULL);
- if (CPUDRV_PM_POWER_ENABLED(cpudsp)) {
- if (cpudrv_pm_init_power(cpudsp) != DDI_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ if (cpudrv_is_enabled(cpudsp)) {
+ if (cpudrv_init(cpudsp) != DDI_SUCCESS) {
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
- if (cpudrv_pm_comp_create(cpudsp) != DDI_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ if (cpudrv_comp_create(cpudsp) != DDI_SUCCESS) {
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
if (ddi_prop_update_string(DDI_DEV_T_NONE,
dip, "pm-class", "CPU") != DDI_PROP_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
@@ -303,10 +305,10 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* activities.
*/
cpudsp->cpudrv_pm.tq = taskq_create_instance(
- "cpudrv_pm_monitor",
- ddi_get_instance(dip), CPUDRV_PM_TASKQ_THREADS,
- (maxclsyspri - 1), CPUDRV_PM_TASKQ_MIN,
- CPUDRV_PM_TASKQ_MAX,
+ "cpudrv_monitor",
+ ddi_get_instance(dip), CPUDRV_TASKQ_THREADS,
+ (maxclsyspri - 1), CPUDRV_TASKQ_MIN,
+ CPUDRV_TASKQ_MAX,
TASKQ_PREPOPULATE|TASKQ_CPR_SAFE);
mutex_init(&cpudsp->cpudrv_pm.timeout_lock, NULL,
@@ -321,7 +323,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* is full speed for us.
*/
/*
- * We need to take the lock because cpudrv_pm_monitor()
+ * We need to take the lock because cpudrv_monitor()
* will start running in parallel with attach().
*/
mutex_enter(&cpudsp->lock);
@@ -335,12 +337,12 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* unknown speed and moves CPU to top speed when it
* has been initialized.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
}
- CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip);
+ CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp);
ddi_report_dev(dip);
return (DDI_SUCCESS);
@@ -355,7 +357,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/*
* Nothing to do for resume, if not doing active PM.
*/
- if (!CPUDRV_PM_POWER_ENABLED(cpudsp))
+ if (!cpudrv_is_enabled(cpudsp))
return (DDI_SUCCESS);
mutex_enter(&cpudsp->lock);
@@ -365,9 +367,9 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* that the needed speed is full speed for us.
*/
cpudsp->cpudrv_pm.cur_spd = NULL;
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
- CPUDRV_PM_REDEFINE_TOPSPEED(dip);
+ CPUDRV_REDEFINE_TOPSPEED(dip);
return (DDI_SUCCESS);
default:
@@ -409,7 +411,7 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
/*
* Nothing to do for suspend, if not doing active PM.
*/
- if (!CPUDRV_PM_POWER_ENABLED(cpudsp))
+ if (!cpudrv_is_enabled(cpudsp))
return (DDI_SUCCESS);
/*
@@ -427,18 +429,18 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: DDI_SUSPEND - "
"cur_spd %d, topspeed %d\n", instance,
cpupm->cur_spd->pm_level,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level));
+ CPUDRV_TOPSPEED(cpupm)->pm_level));
- CPUDRV_PM_MONITOR_FINI(cpudsp);
+ CPUDRV_MONITOR_FINI(cpudsp);
if (!cpudrv_direct_pm && (cpupm->cur_spd !=
- CPUDRV_PM_TOPSPEED(cpupm))) {
+ CPUDRV_TOPSPEED(cpupm))) {
if (cpupm->pm_busycnt < 1) {
- if ((pm_busy_component(dip, CPUDRV_PM_COMP_NUM)
+ if ((pm_busy_component(dip, CPUDRV_COMP_NUM)
== DDI_SUCCESS)) {
cpupm->pm_busycnt++;
} else {
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_detach: "
"instance %d: can't busy CPU "
@@ -447,16 +449,16 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
}
mutex_exit(&cpudsp->lock);
- if (pm_raise_power(dip, CPUDRV_PM_COMP_NUM,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level) !=
+ if (pm_raise_power(dip, CPUDRV_COMP_NUM,
+ CPUDRV_TOPSPEED(cpupm)->pm_level) !=
DDI_SUCCESS) {
mutex_enter(&cpudsp->lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_detach: instance %d: "
"can't raise CPU power level to %d",
instance,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level);
+ CPUDRV_TOPSPEED(cpupm)->pm_level);
return (DDI_FAILURE);
} else {
return (DDI_SUCCESS);
@@ -483,7 +485,7 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
{
int instance;
cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
+ cpudrv_pm_t *cpudrvpm;
cpudrv_pm_spd_t *new_spd;
boolean_t is_ready;
int ret;
@@ -492,14 +494,15 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
DPRINTF(D_POWER, ("cpudrv_power: instance %d: level %d\n",
instance, level));
+
if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == NULL) {
- cmn_err(CE_WARN, "cpudrv_power: instance %d: can't get state",
- instance);
+ cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
+ "get state", instance);
return (DDI_FAILURE);
}
mutex_enter(&cpudsp->lock);
- cpupm = &(cpudsp->cpudrv_pm);
+ cpudrvpm = &(cpudsp->cpudrv_pm);
/*
* In normal operation, we fail if we are busy and request is
@@ -507,21 +510,22 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
* is in special direct pm mode. On x86, we also let this through
* if the change is due to a request to govern the max speed.
*/
- if (!cpudrv_direct_pm && (cpupm->pm_busycnt >= 1) &&
- !cpudrv_pm_is_governor_thread(cpupm)) {
- if ((cpupm->cur_spd != NULL) &&
- (level < cpupm->cur_spd->pm_level)) {
+ if (!cpudrv_direct_pm && (cpudrvpm->pm_busycnt >= 1) &&
+ !cpudrv_is_governor_thread(cpudrvpm)) {
+ if ((cpudrvpm->cur_spd != NULL) &&
+ (level < cpudrvpm->cur_spd->pm_level)) {
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
}
- for (new_spd = cpupm->head_spd; new_spd; new_spd = new_spd->down_spd) {
+ for (new_spd = cpudrvpm->head_spd; new_spd; new_spd =
+ new_spd->down_spd) {
if (new_spd->pm_level == level)
break;
}
if (!new_spd) {
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_power: instance %d: "
"can't locate new CPU speed", instance);
@@ -538,105 +542,66 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
* That's because we don't know what the CPU domains look like
* until all instances have been initialized.
*/
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
+ is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
if (!is_ready) {
DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
"CPU not ready for x-calls\n", instance));
- } else if (!(is_ready = cpudrv_pm_power_ready())) {
+ } else if (!(is_ready = cpudrv_power_ready())) {
DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "waiting for all CPUs to be power manageable\n", instance));
+ "waiting for all CPUs to be power manageable\n",
+ instance));
}
if (!is_ready) {
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
/*
- * Execute CPU specific routine on the requested CPU to change its
- * speed to normal-speed/divisor.
+ * Execute CPU specific routine on the requested CPU to
+ * change its speed to normal-speed/divisor.
*/
- if ((ret = cpudrv_pm_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
- cmn_err(CE_WARN, "cpudrv_power: cpudrv_pm_change_speed() "
- "return = %d", ret);
+ if ((ret = cpudrv_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
+ cmn_err(CE_WARN, "cpudrv_power: "
+ "cpudrv_change_speed() return = %d", ret);
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
/*
- * DTrace probe point for CPU speed change transition
- */
- DTRACE_PROBE3(cpu__change__speed, cpudrv_devstate_t *, cpudsp,
- cpudrv_pm_t *, cpupm, cpudrv_pm_spd_t *, new_spd);
-
- /*
* Reset idle threshold time for the new power level.
*/
- if ((cpupm->cur_spd != NULL) && (level < cpupm->cur_spd->pm_level)) {
- if (pm_idle_component(dip, CPUDRV_PM_COMP_NUM) ==
+ if ((cpudrvpm->cur_spd != NULL) && (level <
+ cpudrvpm->cur_spd->pm_level)) {
+ if (pm_idle_component(dip, CPUDRV_COMP_NUM) ==
DDI_SUCCESS) {
- if (cpupm->pm_busycnt >= 1)
- cpupm->pm_busycnt--;
- } else
- cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
- "idle CPU component", ddi_get_instance(dip));
+ if (cpudrvpm->pm_busycnt >= 1)
+ cpudrvpm->pm_busycnt--;
+ } else {
+ cmn_err(CE_WARN, "cpudrv_power: instance %d: "
+ "can't idle CPU component",
+ ddi_get_instance(dip));
+ }
}
/*
* Reset various parameters because we are now running at new speed.
*/
- cpupm->lastquan_mstate[CMS_IDLE] = 0;
- cpupm->lastquan_mstate[CMS_SYSTEM] = 0;
- cpupm->lastquan_mstate[CMS_USER] = 0;
- cpupm->lastquan_ticks = 0;
- cpupm->cur_spd = new_spd;
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ cpudrvpm->lastquan_mstate[CMS_IDLE] = 0;
+ cpudrvpm->lastquan_mstate[CMS_SYSTEM] = 0;
+ cpudrvpm->lastquan_mstate[CMS_USER] = 0;
+ cpudrvpm->lastquan_ticks = 0;
+ cpudrvpm->cur_spd = new_spd;
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
return (DDI_SUCCESS);
}
/*
- * Initialize the field that will be used for reporting
- * the supported_frequencies_Hz cpu_info kstat.
- */
-static void
-set_supp_freqs(cpu_t *cp, cpudrv_pm_t *cpupm)
-{
- char *supp_freqs;
- char *sfptr;
- uint64_t *speeds;
- cpudrv_pm_spd_t *spd;
- int i;
-#define UINT64_MAX_STRING (sizeof ("18446744073709551615"))
-
- speeds = kmem_zalloc(cpupm->num_spd * sizeof (uint64_t), KM_SLEEP);
- for (i = cpupm->num_spd - 1, spd = cpupm->head_spd; spd;
- i--, spd = spd->down_spd) {
- speeds[i] =
- CPUDRV_PM_SPEED_HZ(cp->cpu_type_info.pi_clock, spd->speed);
- }
-
- supp_freqs = kmem_zalloc((UINT64_MAX_STRING * cpupm->num_spd),
- KM_SLEEP);
- sfptr = supp_freqs;
- for (i = 0; i < cpupm->num_spd; i++) {
- if (i == cpupm->num_spd - 1) {
- (void) sprintf(sfptr, "%"PRIu64, speeds[i]);
- } else {
- (void) sprintf(sfptr, "%"PRIu64":", speeds[i]);
- sfptr = supp_freqs + strlen(supp_freqs);
- }
- }
- cpu_set_supp_freqs(cp, supp_freqs);
- kmem_free(supp_freqs, (UINT64_MAX_STRING * cpupm->num_spd));
- kmem_free(speeds, cpupm->num_spd * sizeof (uint64_t));
-}
-
-/*
* Initialize power management data.
*/
static int
-cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
+cpudrv_init(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd;
@@ -647,10 +612,10 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
int user_cnt_percent;
int i;
- CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds);
+ CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
if (nspeeds < 2) {
/* Need at least two speeds to power manage */
- CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
return (DDI_FAILURE);
}
cpupm->num_spd = nspeeds;
@@ -685,15 +650,15 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->speed = speeds[i];
if (i == 0) { /* normal speed */
cpupm->head_spd = cur_spd;
- CPUDRV_PM_TOPSPEED(cpupm) = cur_spd;
- cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_NORMAL;
+ CPUDRV_TOPSPEED(cpupm) = cur_spd;
+ cur_spd->quant_cnt = CPUDRV_QUANT_CNT_NORMAL;
cur_spd->idle_hwm =
- (cpudrv_pm_idle_hwm * cur_spd->quant_cnt) / 100;
+ (cpudrv_idle_hwm * cur_spd->quant_cnt) / 100;
/* can't speed anymore */
cur_spd->idle_lwm = 0;
cur_spd->user_hwm = UINT_MAX;
} else {
- cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_OTHR;
+ cur_spd->quant_cnt = CPUDRV_QUANT_CNT_OTHR;
ASSERT(prev_spd != NULL);
prev_spd->down_spd = cur_spd;
cur_spd->up_spd = cpupm->head_spd;
@@ -711,14 +676,14 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
	 * that there is at least a buffer zone separation
* between the idle_lwm and idle_hwm values.
*/
- idle_cnt_percent = CPUDRV_PM_IDLE_CNT_PERCENT(
- cpudrv_pm_idle_hwm, speeds, i);
+ idle_cnt_percent = CPUDRV_IDLE_CNT_PERCENT(
+ cpudrv_idle_hwm, speeds, i);
idle_cnt_percent = max(idle_cnt_percent,
- (cpudrv_pm_idle_lwm + cpudrv_pm_idle_buf_zone));
+ (cpudrv_idle_lwm + cpudrv_idle_buf_zone));
cur_spd->idle_hwm =
(idle_cnt_percent * cur_spd->quant_cnt) / 100;
cur_spd->idle_lwm =
- (cpudrv_pm_idle_lwm * cur_spd->quant_cnt) / 100;
+ (cpudrv_idle_lwm * cur_spd->quant_cnt) / 100;
/*
* The lwm for user threads are determined such that
@@ -727,10 +692,10 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
* user_hwm in the new speed. This is to prevent
* the quick jump back up to higher speed.
*/
- cur_spd->user_hwm = (cpudrv_pm_user_hwm *
+ cur_spd->user_hwm = (cpudrv_user_hwm *
cur_spd->quant_cnt) / 100;
- user_cnt_percent = CPUDRV_PM_USER_CNT_PERCENT(
- cpudrv_pm_user_hwm, speeds, i);
+ user_cnt_percent = CPUDRV_USER_CNT_PERCENT(
+ cpudrv_user_hwm, speeds, i);
prev_spd->user_lwm =
(user_cnt_percent * prev_spd->quant_cnt) / 100;
}
@@ -740,11 +705,11 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->idle_hwm = UINT_MAX;
cur_spd->user_lwm = -1;
#ifdef DEBUG
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: head_spd spd %d, "
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: head_spd spd %d, "
"num_spd %d\n", ddi_get_instance(cpudsp->dip),
cpupm->head_spd->speed, cpupm->num_spd));
for (cur_spd = cpupm->head_spd; cur_spd; cur_spd = cur_spd->down_spd) {
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: speed %d, "
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: speed %d, "
"down_spd spd %d, idle_hwm %d, user_lwm %d, "
"up_spd spd %d, idle_lwm %d, user_hwm %d, "
"quant_cnt %d\n", ddi_get_instance(cpudsp->dip),
@@ -756,7 +721,7 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->quant_cnt));
}
#endif /* DEBUG */
- CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
return (DDI_SUCCESS);
}
@@ -764,7 +729,7 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
* Free CPU power management data.
*/
static void
-cpudrv_pm_free(cpudrv_devstate_t *cpudsp)
+cpudrv_free(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd, *next_spd;
@@ -776,14 +741,13 @@ cpudrv_pm_free(cpudrv_devstate_t *cpudsp)
cur_spd = next_spd;
}
bzero(cpupm, sizeof (cpudrv_pm_t));
- cpudrv_mach_pm_free(cpudsp);
}
/*
* Create pm-components property.
*/
static int
-cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
+cpudrv_comp_create(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd;
@@ -795,9 +759,9 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
int result = DDI_FAILURE;
pmc = kmem_zalloc((cpupm->num_spd + 1) * sizeof (char *), KM_SLEEP);
- size = CPUDRV_PM_COMP_SIZE();
- if (cpupm->num_spd > CPUDRV_PM_COMP_MAX_VAL) {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: "
+ size = CPUDRV_COMP_SIZE();
+ if (cpupm->num_spd > CPUDRV_COMP_MAX_VAL) {
+ cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
"number of speeds exceeded limits",
ddi_get_instance(cpudsp->dip));
kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
@@ -808,9 +772,9 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
i--, cur_spd = cur_spd->down_spd) {
cur_spd->pm_level = i;
pmc[i] = kmem_zalloc((size * sizeof (char)), KM_SLEEP);
- comp_spd = CPUDRV_PM_COMP_SPEED(cpupm, cur_spd);
- if (comp_spd > CPUDRV_PM_COMP_MAX_VAL) {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: "
+ comp_spd = CPUDRV_COMP_SPEED(cpupm, cur_spd);
+ if (comp_spd > CPUDRV_COMP_MAX_VAL) {
+ cmn_err(CE_WARN, "cpudrv_comp_create: "
"instance %d: speed exceeded limits",
ddi_get_instance(cpudsp->dip));
for (j = cpupm->num_spd; j >= i; j--) {
@@ -820,14 +784,14 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
sizeof (char *));
return (result);
}
- CPUDRV_PM_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
- DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: "
+ CPUDRV_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
+ DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: "
"instance %d: pm-components power level %d string '%s'\n",
ddi_get_instance(cpudsp->dip), i, pmc[i]));
}
pmc[0] = kmem_zalloc(sizeof (name), KM_SLEEP);
(void) strcat(pmc[0], name);
- DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: instance %d: "
+ DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: instance %d: "
"pm-components component name '%s'\n",
ddi_get_instance(cpudsp->dip), pmc[0]));
@@ -835,7 +799,7 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
"pm-components", pmc, cpupm->num_spd + 1) == DDI_PROP_SUCCESS) {
result = DDI_SUCCESS;
} else {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: "
+ cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
"can't create pm-components property",
ddi_get_instance(cpudsp->dip));
}
@@ -851,16 +815,16 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Mark a component idle.
*/
-#define CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
+#define CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
if ((cpupm)->pm_busycnt >= 1) { \
- if (pm_idle_component((dip), CPUDRV_PM_COMP_NUM) == \
+ if (pm_idle_component((dip), CPUDRV_COMP_NUM) == \
DDI_SUCCESS) { \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
"instance %d: pm_idle_component called\n", \
ddi_get_instance((dip)))); \
(cpupm)->pm_busycnt--; \
} else { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
"can't idle CPU component", \
ddi_get_instance((dip))); \
} \
@@ -870,16 +834,16 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Marks a component busy in both PM framework and driver state structure.
*/
-#define CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
+#define CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
if ((cpupm)->pm_busycnt < 1) { \
- if (pm_busy_component((dip), CPUDRV_PM_COMP_NUM) == \
+ if (pm_busy_component((dip), CPUDRV_COMP_NUM) == \
DDI_SUCCESS) { \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
"instance %d: pm_busy_component called\n", \
ddi_get_instance((dip)))); \
(cpupm)->pm_busycnt++; \
} else { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
"can't busy CPU component", \
ddi_get_instance((dip))); \
} \
@@ -889,19 +853,19 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Marks a component busy and calls pm_raise_power().
*/
-#define CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \
+#define CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \
/* \
* Mark driver and PM framework busy first so framework doesn't try \
* to bring CPU to lower speed when we need to be at higher speed. \
*/ \
- CPUDRV_PM_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
+ CPUDRV_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
mutex_exit(&(cpudsp)->lock); \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " \
"pm_raise_power called to %d\n", ddi_get_instance((dip)), \
(new_level))); \
- if (pm_raise_power((dip), CPUDRV_PM_COMP_NUM, (new_level)) != \
+ if (pm_raise_power((dip), CPUDRV_COMP_NUM, (new_level)) != \
DDI_SUCCESS) { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't " \
"raise CPU power level", ddi_get_instance((dip))); \
} \
mutex_enter(&(cpudsp)->lock); \
@@ -913,7 +877,7 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
* We dispatch a taskq to do that job.
*/
static void
-cpudrv_pm_monitor_disp(void *arg)
+cpudrv_monitor_disp(void *arg)
{
cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;
@@ -922,13 +886,13 @@ cpudrv_pm_monitor_disp(void *arg)
* The queue should be empty at this time.
*/
mutex_enter(&cpudsp->cpudrv_pm.timeout_lock);
- if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_pm_monitor, arg,
+ if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_monitor, arg,
TQ_NOSLEEP)) {
mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor_disp: failed to "
- "dispatch the cpudrv_pm_monitor taskq\n"));
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor_disp: failed to "
+ "dispatch the cpudrv_monitor taskq\n"));
mutex_enter(&cpudsp->lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
return;
}
@@ -940,17 +904,16 @@ cpudrv_pm_monitor_disp(void *arg)
* Monitors each CPU for the amount of time idle thread was running in the
* last quantum and arranges for the CPU to go to the lower or higher speed.
* Called at the time interval appropriate for the current speed. The
- * time interval for normal speed is CPUDRV_PM_QUANT_CNT_NORMAL. The time
+ * time interval for normal speed is CPUDRV_QUANT_CNT_NORMAL. The time
* interval for other speeds (including unknown speed) is
- * CPUDRV_PM_QUANT_CNT_OTHR.
+ * CPUDRV_QUANT_CNT_OTHR.
*/
static void
-cpudrv_pm_monitor(void *arg)
+cpudrv_monitor(void *arg)
{
cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;
cpudrv_pm_t *cpupm;
cpudrv_pm_spd_t *cur_spd, *new_spd;
- cpu_t *cp;
dev_info_t *dip;
uint_t idle_cnt, user_cnt, system_cnt;
clock_t ticks;
@@ -984,12 +947,12 @@ cpudrv_pm_monitor(void *arg)
* That's because we don't know what the CPU domains look like
* until all instances have been initialized.
*/
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
+ is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
if (!is_ready) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"CPU not ready for x-calls\n", ddi_get_instance(dip)));
- } else if (!(is_ready = cpudrv_pm_power_ready())) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ } else if (!(is_ready = cpudrv_power_ready())) {
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"waiting for all CPUs to be power manageable\n",
ddi_get_instance(dip)));
}
@@ -998,8 +961,8 @@ cpudrv_pm_monitor(void *arg)
* Make sure that we are busy so that framework doesn't
* try to bring us down in this situation.
*/
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
@@ -1008,35 +971,36 @@ cpudrv_pm_monitor(void *arg)
* Make sure that we are still not at unknown power level.
*/
if (cur_spd == NULL) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"cur_spd is unknown\n", ddi_get_instance(dip)));
- CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level);
+ CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
+ CPUDRV_TOPSPEED(cpupm)->pm_level);
/*
* We just changed the speed. Wait till at least next
* call to this routine before proceeding ahead.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
mutex_enter(&cpu_lock);
- if ((cp = cpu_get(cpudsp->cpu_id)) == NULL) {
+ if (cpudsp->cp == NULL &&
+ (cpudsp->cp = cpu_get(cpudsp->cpu_id)) == NULL) {
mutex_exit(&cpu_lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't get "
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't get "
"cpu_t", ddi_get_instance(dip));
goto do_return;
}
if (!cpupm->pm_started) {
cpupm->pm_started = B_TRUE;
- set_supp_freqs(cp, cpupm);
+ cpudrv_set_supp_freqs(cpudsp);
}
- get_cpu_mstate(cp, msnsecs);
+ get_cpu_mstate(cpudsp->cp, msnsecs);
GET_CPU_MSTATE_CNT(CMS_IDLE, idle_cnt);
GET_CPU_MSTATE_CNT(CMS_USER, user_cnt);
GET_CPU_MSTATE_CNT(CMS_SYSTEM, system_cnt);
@@ -1048,7 +1012,7 @@ cpudrv_pm_monitor(void *arg)
if (cpupm->lastquan_ticks == 0) {
cpupm->lastquan_ticks = NSEC_TO_TICK(gethrtime());
mutex_exit(&cpu_lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
@@ -1071,10 +1035,10 @@ cpudrv_pm_monitor(void *arg)
* Time taken between recording the current counts and
* arranging the next call of this routine is an error in our
* calculation. We minimize the error by calling
- * CPUDRV_PM_MONITOR_INIT() here instead of end of this routine.
+ * CPUDRV_MONITOR_INIT() here instead of end of this routine.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
- DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_pm_monitor: instance %d: "
+ CPUDRV_MONITOR_INIT(cpudsp);
+ DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_monitor: instance %d: "
"idle count %d, user count %d, system count %d, pm_level %d, "
"pm_busycnt %d\n", ddi_get_instance(dip), idle_cnt, user_cnt,
system_cnt, cur_spd->pm_level, cpupm->pm_busycnt));
@@ -1089,7 +1053,7 @@ cpudrv_pm_monitor(void *arg)
* DPRINTFs changes the timing.
*/
if (tick_cnt > cur_spd->quant_cnt) {
- DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_monitor: instance %d: "
"tick count %d > quantum_count %u\n",
ddi_get_instance(dip), tick_cnt, cur_spd->quant_cnt));
}
@@ -1102,7 +1066,7 @@ cpudrv_pm_monitor(void *arg)
user_cnt = (user_cnt * cur_spd->quant_cnt) / tick_cnt;
if ((user_cnt > cur_spd->user_hwm) || (idle_cnt < cur_spd->idle_lwm &&
- cur_spd->idle_blwm_cnt >= cpudrv_pm_idle_blwm_cnt_max)) {
+ cur_spd->idle_blwm_cnt >= cpudrv_idle_blwm_cnt_max)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt = 0;
/*
@@ -1111,21 +1075,21 @@ cpudrv_pm_monitor(void *arg)
* at the current speed.
*/
if (cur_spd == cur_spd->up_spd || cpudrv_direct_pm) {
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
} else {
new_spd = cur_spd->up_spd;
- CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
+ CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
new_spd->pm_level);
}
} else if ((user_cnt <= cur_spd->user_lwm) &&
- (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cp)) {
+ (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cpudsp->cp)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt = 0;
/*
* Arrange to go to next lower speed by informing our idle
* status to the power management framework.
*/
- CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm);
} else {
/*
* If we are between the idle water marks and have not
@@ -1134,7 +1098,7 @@ cpudrv_pm_monitor(void *arg)
*/
if ((idle_cnt < cur_spd->idle_hwm) &&
(idle_cnt >= cur_spd->idle_lwm) &&
- (cur_spd->idle_bhwm_cnt < cpudrv_pm_idle_bhwm_cnt_max)) {
+ (cur_spd->idle_bhwm_cnt < cpudrv_idle_bhwm_cnt_max)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt++;
mutex_exit(&cpudsp->lock);
@@ -1147,7 +1111,7 @@ cpudrv_pm_monitor(void *arg)
/*
* Arranges to stay at the current speed.
*/
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
}
mutex_exit(&cpudsp->lock);
do_return:
diff --git a/usr/src/uts/common/io/pm.c b/usr/src/uts/common/io/pm.c
index fba811e3c8..b9f146bf2a 100644
--- a/usr/src/uts/common/io/pm.c
+++ b/usr/src/uts/common/io/pm.c
@@ -19,11 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
/*
* pm This driver now only handles the ioctl interface. The scanning
* and policy stuff now lives in common/os/sunpm.c.
@@ -33,6 +32,7 @@
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/modctl.h>
+#include <sys/callb.h> /* callback registration for cpu_deep_idle */
#include <sys/conf.h> /* driver flags and functions */
#include <sys/open.h> /* OTYP_CHR definition */
#include <sys/stat.h> /* S_IFCHR definition */
@@ -53,6 +53,7 @@
#include <sys/note.h>
#include <sys/taskq.h>
#include <sys/policy.h>
+#include <sys/cpu_pm.h>
/*
* Minor number is instance<<8 + clone minor from range 1-254; (0 reserved
@@ -73,6 +74,7 @@ extern kmutex_t pm_scan_lock; /* protects autopm_enable, pm_scans_disabled */
extern kmutex_t pm_clone_lock; /* protects pm_clones array */
extern int autopm_enabled;
extern pm_cpupm_t cpupm;
+extern pm_cpupm_t cpupm_default_mode;
extern int pm_default_idle_threshold;
extern int pm_system_idle_threshold;
extern int pm_cpu_idle_threshold;
@@ -444,6 +446,10 @@ static struct pm_cmd_info pmci[] = {
{PM_ADD_DEPENDENT_PROPERTY, "PM_ADD_DEPENDENT_PROPERTY", 1, PM_REQ,
INWHO | INDATASTRING, NODIP, DEP, SU},
{PM_START_CPUPM, "PM_START_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU},
+ {PM_START_CPUPM_EV, "PM_START_CPUPM_EV", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_START_CPUPM_POLL, "PM_START_CPUPM_POLL", 1, NOSTRUCT, 0,
+ 0, 0, SU},
{PM_STOP_CPUPM, "PM_STOP_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU},
{PM_GET_CPU_THRESHOLD, "PM_GET_CPU_THRESHOLD", 1, NOSTRUCT},
{PM_SET_CPU_THRESHOLD, "PM_SET_CPU_THRESHOLD", 1, NOSTRUCT,
@@ -457,6 +463,12 @@ static struct pm_cmd_info pmci[] = {
{PM_SEARCH_LIST, "PM_SEARCH_LIST", 1, PM_SRCH, 0, 0, 0, SU},
{PM_GET_CMD_NAME, "PM_GET_CMD_NAME", 1, PM_REQ, INDATAOUT, NODIP,
NODEP, 0},
+ {PM_DISABLE_CPU_DEEP_IDLE, "PM_DISABLE_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_ENABLE_CPU_DEEP_IDLE, "PM_START_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_DEFAULT_CPU_DEEP_IDLE, "PM_DFLT_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
{0, NULL}
};
@@ -500,16 +512,17 @@ pm_start_pm_walk(dev_info_t *dip, void *arg)
switch (cmd) {
case PM_START_CPUPM:
+ case PM_START_CPUPM_POLL:
if (!PM_ISCPU(dip))
return (DDI_WALK_CONTINUE);
mutex_enter(&pm_scan_lock);
- if (!PM_CPUPM_DISABLED)
+ if (!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM)
pm_scan_init(dip);
mutex_exit(&pm_scan_lock);
break;
case PM_START_PM:
mutex_enter(&pm_scan_lock);
- if (PM_ISCPU(dip) && PM_CPUPM_DISABLED) {
+ if (PM_ISCPU(dip) && (PM_CPUPM_DISABLED || PM_EVENT_CPUPM)) {
mutex_exit(&pm_scan_lock);
return (DDI_WALK_CONTINUE);
}
@@ -552,7 +565,7 @@ pm_stop_pm_walk(dev_info_t *dip, void *arg)
* stop them as part of PM_STOP_PM. Only stop them as part of
* PM_STOP_CPUPM and PM_RESET_PM.
*/
- if (PM_ISCPU(dip) && PM_CPUPM_ENABLED)
+ if (PM_ISCPU(dip) && PM_POLLING_CPUPM)
return (DDI_WALK_CONTINUE);
break;
case PM_STOP_CPUPM:
@@ -2662,22 +2675,74 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
switch (cmd) {
case PM_START_PM:
case PM_START_CPUPM:
+ case PM_START_CPUPM_EV:
+ case PM_START_CPUPM_POLL:
{
+ pm_cpupm_t new_mode = PM_CPUPM_NOTSET;
+ pm_cpupm_t old_mode = PM_CPUPM_NOTSET;
+ int r;
+
mutex_enter(&pm_scan_lock);
if ((cmd == PM_START_PM && autopm_enabled) ||
- (cmd == PM_START_CPUPM && PM_CPUPM_ENABLED)) {
+ (cmd == PM_START_CPUPM && PM_DEFAULT_CPUPM) ||
+ (cmd == PM_START_CPUPM_EV && PM_EVENT_CPUPM) ||
+ (cmd == PM_START_CPUPM_POLL && PM_POLLING_CPUPM)) {
mutex_exit(&pm_scan_lock);
- PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n",
- cmdstr))
+ PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n", cmdstr))
ret = EBUSY;
break;
}
- if (cmd == PM_START_PM)
+
+ if (cmd == PM_START_PM) {
autopm_enabled = 1;
- else
- cpupm = PM_CPUPM_ENABLE;
+ } else if (cmd == PM_START_CPUPM) {
+ old_mode = cpupm;
+ new_mode = cpupm = cpupm_default_mode;
+ } else if (cmd == PM_START_CPUPM_EV) {
+ old_mode = cpupm;
+ new_mode = cpupm = PM_CPUPM_EVENT;
+ } else if (cmd == PM_START_CPUPM_POLL) {
+ old_mode = cpupm;
+ new_mode = cpupm = PM_CPUPM_POLLING;
+ }
+
mutex_exit(&pm_scan_lock);
- ddi_walk_devs(ddi_root_node(), pm_start_pm_walk, &cmd);
+
+ /*
+ * If we are changing CPUPM modes, and it is active,
+ * then stop it from operating in the old mode.
+ */
+ if (old_mode == PM_CPUPM_POLLING) {
+ int c = PM_STOP_CPUPM;
+ ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk,
+ &c);
+ } else if (old_mode == PM_CPUPM_EVENT) {
+ r = cpupm_set_policy(CPUPM_POLICY_DISABLED);
+
+ /*
+ * Disabling CPUPM policy should always
+ * succeed
+ */
+ ASSERT(r == 0);
+ }
+
+ /*
+ * If we are changing to event based CPUPM, enable it.
+ * In the event it's not supported, fall back to
+ * polling based CPUPM.
+ */
+ if (new_mode == PM_CPUPM_EVENT &&
+ cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0) {
+ mutex_enter(&pm_scan_lock);
+ new_mode = cpupm = PM_CPUPM_POLLING;
+ cmd = PM_START_CPUPM_POLL;
+ mutex_exit(&pm_scan_lock);
+ }
+ if (new_mode == PM_CPUPM_POLLING ||
+ cmd == PM_START_PM) {
+ ddi_walk_devs(ddi_root_node(), pm_start_pm_walk,
+ &cmd);
+ }
ret = 0;
break;
}
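
The new PM_START_CPUPM_EV and PM_START_CPUPM_POLL commands let a privileged
consumer select the CPUPM mode explicitly, while PM_START_CPUPM now picks up the
platform's default mode (cpupm_default_mode). A minimal userland sketch of driving
the new commands follows; the /dev/pm device path, privilege handling, and error
handling are illustrative assumptions, not part of this change:

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <sys/pm.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        /* The pm driver's node is assumed to be /dev/pm (as used by pmconfig). */
        int fd = open("/dev/pm", O_RDWR);

        if (fd < 0) {
            perror("open /dev/pm");
            return (1);
        }

        /*
         * Request event based CPUPM. Per the handler above, the kernel
         * falls back to polling mode if the elastic policy can't be enabled.
         */
        if (ioctl(fd, PM_START_CPUPM_EV) < 0)
            perror("PM_START_CPUPM_EV");

        /* The CPU deep idle keyword is controlled through the same interface. */
        if (ioctl(fd, PM_ENABLE_CPU_DEEP_IDLE) < 0)
            perror("PM_ENABLE_CPU_DEEP_IDLE");

        (void) close(fd);
        return (0);
    }

Both command groups carry the SU flag in pmci[] above, so the caller needs the
appropriate privileges, and PM_STOP_CPUPM reverts either CPUPM mode.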
@@ -2687,6 +2752,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
case PM_STOP_CPUPM:
{
extern void pm_discard_thresholds(void);
+ pm_cpupm_t old_mode = PM_CPUPM_NOTSET;
mutex_enter(&pm_scan_lock);
if ((cmd == PM_STOP_PM && !autopm_enabled) ||
@@ -2697,22 +2763,30 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
ret = EINVAL;
break;
}
+
if (cmd == PM_STOP_PM) {
autopm_enabled = 0;
pm_S3_enabled = 0;
autoS3_enabled = 0;
} else if (cmd == PM_STOP_CPUPM) {
+ old_mode = cpupm;
cpupm = PM_CPUPM_DISABLE;
} else {
autopm_enabled = 0;
autoS3_enabled = 0;
+ old_mode = cpupm;
cpupm = PM_CPUPM_NOTSET;
}
mutex_exit(&pm_scan_lock);
/*
* bring devices to full power level, stop scan
+ * If CPUPM was operating in event driven mode, disable
+ * that.
*/
+ if (old_mode == PM_CPUPM_EVENT) {
+ (void) cpupm_set_policy(CPUPM_POLICY_DISABLED);
+ }
ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk, &cmd);
ret = 0;
if (cmd == PM_STOP_PM || cmd == PM_STOP_CPUPM)
@@ -2796,7 +2870,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
case PM_GET_CPUPM_STATE:
{
- if (PM_CPUPM_ENABLED)
+ if (PM_POLLING_CPUPM || PM_EVENT_CPUPM)
*rval_p = PM_CPU_PM_ENABLED;
else if (PM_CPUPM_DISABLED)
*rval_p = PM_CPU_PM_DISABLED;
@@ -2881,6 +2955,34 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
break;
}
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_ENABLE_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EBUSY;
+ break;
+ }
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_DISABLE_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EINVAL;
+ break;
+ }
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_DEFAULT_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EBUSY;
+ break;
+ }
+
default:
/*
* Internal error, invalid ioctl description
@@ -2896,7 +2998,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
break;
}
- default:
+default:
/*
* Internal error, invalid ioctl description
* force debug entry even if pm_debug not set
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 904e507caf..8b8d0d08b5 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -56,6 +56,7 @@
#include <sys/msacct.h>
#include <sys/time.h>
#include <sys/archsystm.h>
+#include <sys/sdt.h>
#if defined(__x86) || defined(__amd64)
#include <sys/x86_archext.h>
#endif
@@ -2163,6 +2164,8 @@ static struct {
kstat_named_t ci_pkg_core_id;
kstat_named_t ci_ncpuperchip;
kstat_named_t ci_ncoreperchip;
+ kstat_named_t ci_max_cstates;
+ kstat_named_t ci_curr_cstate;
#endif
} cpu_info_template = {
{ "state", KSTAT_DATA_CHAR },
@@ -2189,6 +2192,8 @@ static struct {
{ "pkg_core_id", KSTAT_DATA_LONG },
{ "ncpu_per_chip", KSTAT_DATA_INT32 },
{ "ncore_per_chip", KSTAT_DATA_INT32 },
+ { "supported_max_cstates", KSTAT_DATA_INT32 },
+ { "current_cstate", KSTAT_DATA_INT32 },
#endif
};
@@ -2258,6 +2263,8 @@ cpu_info_kstat_update(kstat_t *ksp, int rw)
cpu_info_template.ci_ncoreperchip.value.l =
cpuid_get_ncore_per_chip(cp);
cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp);
+ cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates;
+ cpu_info_template.ci_curr_cstate.value.l = cp->cpu_m.curr_cstate;
#endif
return (0);
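
The two new cpu_info statistics expose the deepest C-state the processor supports
and the C-state the CPU most recently entered. Assuming the usual cpu_info kstat
naming, they can be read from userland with, for example,
kstat -p cpu_info:0:cpu_info0:current_cstate (instance 0 shown purely for
illustration).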
@@ -2960,6 +2967,25 @@ cpu_set_supp_freqs(cpu_t *cp, const char *freqs)
}
/*
+ * Indicate the current CPU's clock frequency (in Hz).
+ * The calling context must be such that CPU references are safe.
+ */
+void
+cpu_set_curr_clock(uint64_t new_clk)
+{
+ uint64_t old_clk;
+
+ old_clk = CPU->cpu_curr_clock;
+ CPU->cpu_curr_clock = new_clk;
+
+ /*
+ * The cpu-change-speed DTrace probe exports the frequency in Hz
+ */
+ DTRACE_PROBE3(cpu__change__speed, processorid_t, CPU->cpu_id,
+ uint64_t, old_clk, uint64_t, new_clk);
+}
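
The cpu-change-speed probe point moves here from cpudrv.c (the driver-private
DTRACE_PROBE3 removed above) with processor id and old/new frequency arguments.
Assuming the usual SDT name mapping (double underscores become hyphens), a
transition could be watched with a one-liner such as
dtrace -n 'sdt:::cpu-change-speed { printf("cpu %d: %d -> %d Hz", arg0, arg1, arg2); }';
the argument order follows the DTRACE_PROBE3() call above.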
+
+/*
* processor_info(2) and p_online(2) status support functions
* The constants returned by the cpu_get_state() and cpu_get_state_str() are
* for use in communicating processor state information to userland. Kernel
diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c
new file mode 100644
index 0000000000..848907af1d
--- /dev/null
+++ b/usr/src/uts/common/os/cpu_pm.c
@@ -0,0 +1,840 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/cmn_err.h>
+#include <sys/sdt.h>
+
+/*
+ * Solaris Event Based CPU Power Manager
+ *
+ * This file implements platform independent event based CPU power management.
+ * When CPUs are configured into the system, the CMT scheduling subsystem will
+ * query the platform to determine if the CPU belongs to any power management
+ * domains. That is, sets of CPUs that share power management states.
+ *
+ * Active Power Management domains represent a group of CPUs across which the
+ * Operating System can request speed changes (which may in turn result
+ * in voltage changes). This allows the operating system to trade off
+ * performance for power savings.
+ *
+ * Idle Power Management domains can enter power savings states when they are
+ * unutilized. These states allow the Operating System to trade off power
+ * for performance (in the form of latency to transition from the idle state
+ * to an active one).
+ *
+ * For each active and idle power domain the CMT subsystem instantiates, a
+ * cpupm_domain_t structure is created. As the dispatcher schedules threads
+ * to run on the system's CPUs, it will also track the utilization of the
+ * enumerated power domains. Significant changes in utilization will result
+ * in the dispatcher sending the power manager events that relate to the
+ * utilization of the power domain. The power manager receives the events,
+ * and in the context of the policy objectives in force, may decide to request
+ * the domain's power/performance state be changed.
+ *
+ * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
+ * manager will request the CPUs in the domain run at their fastest (and most
+ * power consuming) state. When the domain becomes idle (utilization at zero),
+ * the power manager will request that the CPUs run at a speed that saves the
+ * most power.
+ *
+ * The advantage of this scheme is that the CPU power manager, working with the
+ * dispatcher, can be extremely responsive to changes in utilization: optimizing
+ * for performance in the presence of utilization, and for power savings in the
+ * presence of idleness. Such close collaboration with the dispatcher has other
+ * benefits that will play out in the form of more sophisticated power /
+ * performance policy in the near future.
+ *
+ * Avoiding state thrashing in the presence of transient periods of utilization
+ * and idleness while still being responsive to non-transient periods is key.
+ * The power manager implements several "governors" that are used to throttle
+ * state transitions when a significant amount of transient idle or transient
+ * work is detected.
+ *
+ * Kernel background activity (e.g. taskq threads) is by far the most common
+ * form of transient utilization. Ungoverned in the face of this utilization,
+ * hundreds of state transitions per second would result on an idle system.
+ *
+ * Transient idleness is common when a thread briefly yields the CPU to
+ * wait for an event elsewhere in the system. Where the idle period is short
+ * enough, the overhead associated with making the state transition doesn't
+ * justify the power savings.
+ */
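
The interfaces that follow are consumed by the dispatcher and CMT code rather than
by leaf drivers. A minimal sketch of the intended flow is shown below; the wrapper
function and the calling sequence are hypothetical, and only the function
signatures, event names and lock requirements come from this file:

    /*
     * Hypothetical caller: enable the elastic policy, initialize the CPU's
     * active power domain, and notify the power manager as the domain's
     * utilization changes. In the kernel proper the events are generated
     * from the dispatcher / processor group callbacks.
     */
    static void
    cpupm_example(struct cpu *cp)
    {
        cpupm_domain_t *dom;
        hrtime_t now;

        if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0)
            return;     /* no active power domains enumerated */

        mutex_enter(&cpu_lock);
        dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
        mutex_exit(&cpu_lock);

        /* The domain just went idle: ask for the low power state */
        now = gethrtime_unscaled();
        cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);

        /* Work arrived again: ask for maximum performance */
        now = gethrtime_unscaled();
        cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE);
    }

Whether a request actually results in a state change is subject to the transient
work and transient idle governors described above.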
+
+static cpupm_domain_t *cpupm_domains = NULL;
+
+/*
+ * Uninitialized state of CPU power management is disabled
+ */
+cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
+
+/*
+ * Periods of utilization lasting less than this time interval are characterized
+ * as transient. State changes associated with transient work are considered
+ * to be mispredicted. That is, it's not worth raising and lowering power states
+ * where the utilization lasts for less than this interval.
+ */
+hrtime_t cpupm_tw_predict_interval;
+
+/*
+ * Periods of idleness lasting less than this time interval are characterized
+ * as transient. State changes associated with transient idle are considered
+ * to be mispredicted. That is, it's not worth lowering and raising power
+ * states where the idleness lasts for less than this interval.
+ */
+hrtime_t cpupm_ti_predict_interval;
+
+/*
+ * Number of mispredictions after which future transitions will be governed.
+ */
+int cpupm_mispredict_thresh = 2;
+
+/*
+ * Likewise, the number of mispredicted governed transitions after which the
+ * governor will be removed.
+ */
+int cpupm_mispredict_gov_thresh = 10;
+
+/*
+ * The transient work and transient idle prediction intervals are initialized
+ * to be some multiple of the amount of time it takes to transition a power
+ * domain from the highest to the lowest power state and back again; this
+ * round trip latency is measured when the governors are initialized.
+ *
+ * The default values of those multiples are specified here. Tuning them higher
+ * will result in the transient work and transient idle governors being used
+ * more aggressively, which limits the frequency of state transitions at the
+ * expense of performance and power savings, respectively.
+ */
+#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
+#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
+
+/*
+ * Number of high=>low=>high measurements performed, of which the average
+ * is taken.
+ */
+#define CPUPM_BENCHMARK_ITERS 5
+
+int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
+int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
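
As a rough worked example of the intent (the exact computation lives in
cpupm_governor_initialize(), declared below but not shown here, so the simple
multiplication is an assumption): if the measured high=>low=>high round trip
averages 50 microseconds, the transient idle prediction interval comes out to
roughly 600 * 50us = 30ms and the transient work interval to roughly
25 * 50us = 1.25ms. Idle periods shorter than the former and busy bursts shorter
than the latter are then counted as mispredictions by the respective governors.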
+
+
+static int cpupm_governor_initialize(void);
+static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
+
+cpupm_policy_t
+cpupm_get_policy(void)
+{
+ return (cpupm_policy);
+}
+
+int
+cpupm_set_policy(cpupm_policy_t new_policy)
+{
+ static int gov_init = 0;
+ int result = 0;
+
+ mutex_enter(&cpu_lock);
+ if (new_policy == cpupm_policy) {
+ mutex_exit(&cpu_lock);
+ return (result);
+ }
+
+ /*
+ * Pausing CPUs causes a high priority thread to be scheduled
+ * on all other CPUs (besides the current one). This locks out
+ * other CPUs from making CPUPM state transitions.
+ */
+ switch (new_policy) {
+ case CPUPM_POLICY_DISABLED:
+ pause_cpus(NULL);
+ cpupm_policy = CPUPM_POLICY_DISABLED;
+ start_cpus();
+
+ result = cmt_pad_disable(PGHW_POW_ACTIVE);
+
+ /*
+ * Once PAD has been enabled, it should always be possible
+ * to disable it.
+ */
+ ASSERT(result == 0);
+
+ /*
+ * Bring all the active power domains to the maximum
+ * performance state.
+ */
+ cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
+ CPUPM_STATE_MAX_PERF);
+
+ break;
+ case CPUPM_POLICY_ELASTIC:
+
+ result = cmt_pad_enable(PGHW_POW_ACTIVE);
+ if (result < 0) {
+ /*
+ * Failed to enable PAD across the active power
+ * domains, which may well be because none were
+ * enumerated.
+ */
+ break;
+ }
+
+ pause_cpus(NULL);
+ /*
+ * Attempt to initialize the governor parameters the first
+ * time through.
+ */
+ if (gov_init == 0) {
+ result = cpupm_governor_initialize();
+ if (result == 0) {
+ gov_init = 1;
+ } else {
+ /*
+ * Failed to initialize the governor parameters
+ */
+ start_cpus();
+ break;
+ }
+ }
+ cpupm_policy = CPUPM_POLICY_ELASTIC;
+ start_cpus();
+
+ break;
+ default:
+ cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
+ new_policy);
+ ASSERT(0);
+ break;
+ }
+ mutex_exit(&cpu_lock);
+
+ return (result);
+}
+
+/*
+ * Look for an existing power domain
+ */
+static cpupm_domain_t *
+cpupm_domain_find(id_t id, cpupm_dtype_t type)
+{
+	cpupm_domain_t *dom;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = cpupm_domains;
+ while (dom != NULL) {
+ if (id == dom->cpd_id && type == dom->cpd_type)
+ return (dom);
+ dom = dom->cpd_next;
+ }
+ return (NULL);
+}
+
+/*
+ * Create a new domain
+ */
+static cpupm_domain_t *
+cpupm_domain_create(id_t id, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
+ dom->cpd_id = id;
+ dom->cpd_type = type;
+
+ /* Link into the known domain list */
+ dom->cpd_next = cpupm_domains;
+ cpupm_domains = dom;
+
+ return (dom);
+}
+
+static void
+cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
+{
+ /*
+	 * In the event we're enumerating because the domain's state
+ * configuration has changed, toss any existing states.
+ */
+ if (dom->cpd_nstates > 0) {
+ kmem_free(dom->cpd_states,
+ sizeof (cpupm_state_t) * dom->cpd_nstates);
+ dom->cpd_nstates = 0;
+ }
+
+ /*
+ * Query to determine the number of states, allocate storage
+ * large enough to hold the state information, and pass it back
+ * to the platform driver to complete the enumeration.
+ */
+ dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
+
+ if (dom->cpd_nstates == 0)
+ return;
+
+ dom->cpd_states =
+ kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
+ (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
+}
+
+/*
+ * Initialize the specified type of power domain on behalf of the CPU
+ */
+cpupm_domain_t *
+cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ /*
+ * Instantiate the domain if it doesn't already exist
+ * and enumerate its power states.
+ */
+ did = cpupm_domain_id(cp, type);
+ dom = cpupm_domain_find(did, type);
+ if (dom == NULL) {
+ dom = cpupm_domain_create(did, type);
+ cpupm_domain_state_enum(cp, dom);
+ }
+
+ /*
+ * Named state initialization
+ */
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+ * For active power domains, the highest performance
+ * state is defined as first state returned from
+ * the domain enumeration.
+ */
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[0];
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
+ &dom->cpd_states[dom->cpd_nstates - 1];
+
+ /*
+ * Begin by assuming CPU is running at the max perf state.
+ */
+ dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ }
+
+ return (dom);
+}
+
+/*
+ * Return the id associated with the given type of domain
+ * to which cp belongs
+ */
+id_t
+cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (cpupm_plat_domain_id(cp, type));
+}
+
+/*
+ * Initiate a state change for the specified domain on behalf of cp
+ */
+int
+cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
+{
+ if (cpupm_plat_change_state(cp, state) < 0)
+ return (-1);
+
+ DTRACE_PROBE2(cpupm__change__state,
+ cpupm_domain_t *, dom,
+ cpupm_state_t *, state);
+
+ dom->cpd_state = state;
+ return (0);
+}
+
+/*
+ * Interface into the CPU power manager to indicate a significant change
+ * in utilization of the specified active power domain
+ */
+void
+cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
+ cpupm_util_event_t event)
+{
+ cpupm_state_t *new_state = NULL;
+ hrtime_t last;
+
+ if (cpupm_policy == CPUPM_POLICY_DISABLED) {
+ return;
+ }
+
+ /*
+ * What follows is a simple elastic power state management policy.
+ *
+ * If the utilization has become non-zero, and the domain was
+	 * previously at its lowest power state, then transition it
+ * to the highest state in the spirit of "race to idle".
+ *
+ * If the utilization has dropped to zero, then transition the
+ * domain to its lowest power state.
+ *
+ * Statistics are maintained to implement governors to reduce state
+ * transitions resulting from either transient work, or periods of
+ * transient idleness on the domain.
+ */
+ switch (event) {
+ case CPUPM_DOM_REMAIN_BUSY:
+
+ /*
+ * We've received an event that the domain is running a thread
+		 * that's made it to the end of its time slice. If we are at
+ * low power, then raise it. If the transient work governor
+ * is engaged, then remove it.
+ */
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (dom->cpd_tw_governed == B_TRUE) {
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ break;
+
+ case CPUPM_DOM_BUSY_FROM_IDLE:
+ last = dom->cpd_last_lower;
+ dom->cpd_last_raise = now;
+
+ DTRACE_PROBE3(cpupm__raise__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * There's non-zero utilization, and the domain is
+ * running in the lower power state. Before we
+			 * consider raising power, perform some bookkeeping
+ * for the transient idle governor.
+ */
+ if (dom->cpd_ti_governed == B_FALSE) {
+ if ((now - last) < cpupm_ti_predict_interval) {
+ /*
+ * We're raising the domain power and
+ * we *just* lowered it. Consider
+ * this a mispredicted power state
+ * transition due to a transient
+ * idle period.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_thresh) {
+ /*
+					 * There are enough transient
+ * idle transitions to
+ * justify governing future
+ * lowering requests.
+ */
+ dom->cpd_ti_governed = B_TRUE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * lowering.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ if (dom->cpd_tw_governed == B_TRUE) {
+ /*
+ * Raise requests are governed due to
+ * transient work.
+ */
+ DTRACE_PROBE1(cpupm__raise__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient idle governor is
+				 * also in place, examine the preceding idle
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_ti_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_ti_predict_interval)) {
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_ti_governed =
+ B_FALSE;
+ dom->cpd_ti = 0;
+ }
+ }
+ return;
+ }
+ /*
+ * Prepare to transition to the higher power state
+ */
+ new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * Utilization is non-zero, and we're already running
+ * in the higher power state. Take this opportunity to
+			 * perform some bookkeeping if the last lowering
+ * request was governed.
+ */
+ if (dom->cpd_ti_governed == B_TRUE) {
+ if ((now - last) >= cpupm_ti_predict_interval) {
+ /*
+ * The domain is transient idle
+ * governed, and we mispredicted
+ * governing the last lowering request.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+					 * There are enough non-transient
+ * idle periods to justify
+ * removing the governor.
+ */
+ dom->cpd_ti_governed = B_FALSE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * Correctly predicted governing the
+ * last lowering request.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ }
+ break;
+
+ case CPUPM_DOM_IDLE_FROM_BUSY:
+ last = dom->cpd_last_raise;
+ dom->cpd_last_lower = now;
+
+ DTRACE_PROBE3(cpupm__lower__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * The domain is idle, and is running in the highest
+ * performance state. Before we consider lowering power,
+ * perform some bookkeeping for the transient work
+ * governor.
+ */
+ if (dom->cpd_tw_governed == B_FALSE) {
+ if ((now - last) < cpupm_tw_predict_interval) {
+ /*
+ * We're lowering the domain power and
+ * we *just* raised it. Consider the
+ * last raise mispredicted due to
+ * transient work.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_thresh) {
+ /*
+ * There are enough transient work
+ * transitions to justify
+ * governing future raise
+ * requests.
+ */
+ dom->cpd_tw_governed = B_TRUE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ if (dom->cpd_ti_governed == B_TRUE) {
+ /*
+ * Lowering requests are governed due to
+ * transient idleness.
+ */
+ DTRACE_PROBE1(cpupm__lowering__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient work governor is
+ * also in place, examine the preceding busy
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_tw_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_tw_predict_interval)) {
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_tw_governed =
+ B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ return;
+ }
+
+ /*
+ * Prepare to transition to a lower power state.
+ */
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * The domain is idle, and we're already running in
+ * the lower power state. Take this opportunity to
+ * perform some bookkeeping if the last raising
+ * request was governed.
+ */
+ if (dom->cpd_tw_governed == B_TRUE) {
+ if ((now - last) >= cpupm_tw_predict_interval) {
+ /*
+ * The domain is transient work
+ * governed, and we mispredicted
+ * governing the last raising request.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * work to justify removing
+ * the governor.
+ */
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted governing
+ * the last raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ }
+ break;
+ }
+ /*
+ * Change the power state.
+ * Currently, nothing further is done if the change fails.
+ */
+ if (new_state)
+ (void) cpupm_change_state(cp, dom, new_state);
+}
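
The raise and lower paths above share a single idiom: when a request arrives "too soon" after the opposite request, a per-domain misprediction counter is bumped, and once it crosses a threshold the corresponding governor engages (symmetrically, enough well-spaced requests disengage it again). The standalone sketch below distills that hysteresis; the type, names, and constants are illustrative assumptions, not part of this change.

#include <stdio.h>

/* Hypothetical, simplified stand-in for one governor's bookkeeping. */
typedef struct {
	long long	last_opposite;	/* time of the last opposite request */
	int		mispredicts;	/* consecutive mispredictions seen */
	int		governed;	/* is the governor engaged? */
} gov_t;

#define	PREDICT_INTERVAL	100	/* stands in for cpupm_*_predict_interval */
#define	MISPREDICT_THRESH	4	/* stands in for cpupm_mispredict_thresh */

/*
 * A request arrives at time "now". If it follows the opposite request
 * within PREDICT_INTERVAL, count it as a misprediction; enough of those
 * in a row engage the governor, while a correctly predicted request
 * resets the count.
 */
static void
gov_event(gov_t *g, long long now)
{
	if ((now - g->last_opposite) < PREDICT_INTERVAL) {
		if (++g->mispredicts >= MISPREDICT_THRESH) {
			g->governed = 1;
			g->mispredicts = 0;
		}
	} else {
		g->mispredicts = 0;
	}
}

int
main(void)
{
	gov_t g = { 0, 0, 0 };
	long long t;

	/* Opposite requests keep landing 5 units earlier: transient behaviour. */
	for (t = 10; t <= 50; t += 10) {
		g.last_opposite = t - 5;
		gov_event(&g, t);
	}
	(void) printf("governed = %d\n", g.governed);	/* prints 1 */
	return (0);
}

The kernel code applies this pattern in both directions at once, which is why each case above also re-examines the opposite governor before returning.
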
+
+
+/*
+ * Interface called by platforms to dynamically change the
+ * MAX performance cpupm state
+ */
+void
+cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+ cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
+ boolean_t change_state = B_FALSE;
+ cpupm_state_t *new_state = NULL;
+
+ did = cpupm_domain_id(cp, type);
+ mutex_enter(&cpu_lock);
+ dom = cpupm_domain_find(did, type);
+ mutex_exit(&cpu_lock);
+
+ /*
+ * A lock could be used here to avoid changing the power state of the
+ * CPU while CPUPM_STATE_MAX_PERF is being redefined. Since MAX_PERF
+ * redefinitions are infrequent, the extra locking isn't considered
+ * worthwhile. In the worst case, the power won't be changed to the
+ * required level for one cycle.
+ */
+ if (dom != NULL) {
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+ change_state = B_TRUE;
+ }
+
+ /*
+ * If an out of range level is passed, use the lowest supported
+ * speed.
+ */
+ if (max_perf_level >= dom->cpd_nstates &&
+ dom->cpd_nstates > 1) {
+ max_perf_level = dom->cpd_nstates - 1;
+ }
+
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[max_perf_level];
+
+ /*
+ * If the current state is MAX_PERF, change the current state
+ * to the new MAX_PERF
+ */
+ if (change_state) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (new_state) {
+ (void) cpupm_change_state(cp, dom, new_state);
+ }
+ }
+ }
+}
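
A minimal sketch of how a platform might invoke this interface, assuming a hypothetical thermal-capping handler; the caller name and level are illustrative, and only cpupm_redefine_max_activepwr_state() itself comes from this change.

#include <sys/cpuvar.h>
#include <sys/cpu_pm.h>

/*
 * Hypothetical platform handler: cap the active power domain of "cp" so
 * that its "max perf" state becomes the state at index "capped_level".
 * Out-of-range values are clamped to the lowest supported speed by
 * cpupm_redefine_max_activepwr_state() itself.
 */
static void
plat_thermal_cap(struct cpu *cp, int capped_level)
{
	cpupm_redefine_max_activepwr_state(cp, capped_level);
}
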
+
+/*
+ * Benchmark some power state transitions and use the transition latencies as
+ * a basis for initializing parameters for the transient idle and transient
+ * work governors.
+ *
+ * Returns 0 on success or -1 if the governor parameters could not be
+ * initialized.
+ */
+static int
+cpupm_governor_initialize(void)
+{
+ cpu_t *cp = CPU;
+ cpupm_domain_t *dom;
+ cpupm_state_t *low, *high;
+ id_t did;
+ hrtime_t start, delta, deltas = 0;
+ int iterations;
+
+ did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
+ if (did == CPUPM_NO_DOMAIN)
+ return (-1);
+
+ dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
+ if (dom == NULL)
+ return (-1);
+
+ low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+ high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
+
+ /*
+ * Measure the amount of time it takes to transition the
+ * domain down to the lowest, and back to the highest power
+ * state.
+ */
+ start = gethrtime_unscaled();
+ (void) cpupm_change_state(cp, dom, low);
+ (void) cpupm_change_state(cp, dom, high);
+ delta = gethrtime_unscaled() - start;
+
+ DTRACE_PROBE1(cpupm__benchmark__latency,
+ hrtime_t, delta);
+
+ deltas += delta;
+ }
+
+ /*
+ * Figure the average latency, and tune the transient work and
+ * transient idle prediction intervals accordingly.
+ */
+ delta = deltas / iterations;
+
+ cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
+ cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
+
+ return (0);
+}
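
As a worked example of this derivation (the numbers are purely illustrative, not measured or default values): if the benchmarked down-and-up round trips average delta = 150,000 unscaled time units and both governor multiples happen to be 10, then

	cpupm_ti_predict_interval = 150000 * 10 = 1500000
	cpupm_tw_predict_interval = 150000 * 10 = 1500000

so a raise (or lower) request arriving within 1,500,000 units of the opposite request is treated as a candidate misprediction by the corresponding governor.
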
+
+/*
+ * Initiate a state change in all CPUPM domain instances of the specified type
+ */
+static void
+cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
+{
+ cpu_t *cp;
+ pg_cmt_t *pwr_pg;
+ cpupm_domain_t *dom;
+ group_t *hwset;
+ group_iter_t giter;
+ pg_cpu_itr_t cpu_iter;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ switch (type) {
+ case CPUPM_DTYPE_ACTIVE:
+ hw = PGHW_POW_ACTIVE;
+ break;
+ default:
+ /*
+ * Power domain types other than "active" unsupported.
+ */
+ ASSERT(type == CPUPM_DTYPE_ACTIVE);
+ return;
+ }
+
+ if ((hwset = pghw_set_lookup(hw)) == NULL)
+ return;
+
+ /*
+ * Iterate over the power domains
+ */
+ group_iter_init(&giter);
+ while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
+
+ dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
+
+ /*
+ * Iterate over the CPUs in each domain
+ */
+ PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ (void) cpupm_change_state(cp, dom,
+ dom->cpd_named_states[state]);
+ }
+ }
+}
diff --git a/usr/src/uts/common/os/cpupm.c b/usr/src/uts/common/os/cpupm.c
new file mode 100644
index 0000000000..1e1aa97bf5
--- /dev/null
+++ b/usr/src/uts/common/os/cpupm.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/sunddi.h>
+#include <sys/cpupm.h>
+
+/*
+ * Initialize the field that will be used for reporting
+ * the supported_frequencies_Hz cpu_info kstat.
+ */
+void
+cpupm_set_supp_freqs(cpu_t *cp, int *speeds, uint_t nspeeds)
+{
+ char *supp_freqs = NULL;
+ char *sfptr;
+ uint64_t *hzspeeds;
+ int i;
+ int j;
+#define UINT64_MAX_STRING (sizeof ("18446744073709551615"))
+
+ if (speeds == NULL) {
+ cpu_set_supp_freqs(cp, supp_freqs);
+ return;
+ }
+
+ hzspeeds = kmem_zalloc(nspeeds * sizeof (uint64_t), KM_SLEEP);
+ for (i = nspeeds - 1, j = 0; i >= 0; i--, j++) {
+ hzspeeds[i] = CPUPM_SPEED_HZ(cp->cpu_type_info.pi_clock,
+ speeds[j]);
+ }
+
+ supp_freqs = kmem_zalloc((UINT64_MAX_STRING * nspeeds), KM_SLEEP);
+ sfptr = supp_freqs;
+ for (i = 0; i < nspeeds; i++) {
+ if (i == nspeeds - 1) {
+ (void) sprintf(sfptr, "%"PRIu64, hzspeeds[i]);
+ } else {
+ (void) sprintf(sfptr, "%"PRIu64":", hzspeeds[i]);
+ sfptr = supp_freqs + strlen(supp_freqs);
+ }
+ }
+ cpu_set_supp_freqs(cp, supp_freqs);
+ kmem_free(supp_freqs, (UINT64_MAX_STRING * nspeeds));
+ kmem_free(hzspeeds, nspeeds * sizeof (uint64_t));
+}
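
To make the kstat string format concrete, here is a small, self-contained sketch of the same assembly loop. The frequencies are hypothetical and already expressed in Hz (converting from the platform speeds is CPUPM_SPEED_HZ()'s job, defined elsewhere), so only the "hz[:hz...]" layout is illustrated.

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

#define	UINT64_MAX_STRING	(sizeof ("18446744073709551615"))

int
main(void)
{
	/* Hypothetical per-state frequencies, already converted to Hz. */
	uint64_t hzspeeds[] = { 1600000000ULL, 2400000000ULL };
	int nspeeds = 2, i;
	char supp_freqs[UINT64_MAX_STRING * 2];
	char *sfptr = supp_freqs;

	/* Same assembly as cpupm_set_supp_freqs(): colon-separated values. */
	for (i = 0; i < nspeeds; i++) {
		if (i == nspeeds - 1) {
			(void) sprintf(sfptr, "%"PRIu64, hzspeeds[i]);
		} else {
			(void) sprintf(sfptr, "%"PRIu64":", hzspeeds[i]);
			sfptr = supp_freqs + strlen(supp_freqs);
		}
	}
	(void) printf("%s\n", supp_freqs);	/* prints 1600000000:2400000000 */
	return (0);
}
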
diff --git a/usr/src/uts/common/os/group.c b/usr/src/uts/common/os/group.c
index b15dff181f..8c1bc7e491 100644
--- a/usr/src/uts/common/os/group.c
+++ b/usr/src/uts/common/os/group.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/debug.h>
@@ -64,6 +62,21 @@ group_destroy(group_t *g)
}
/*
+ * Empty a group_t
+ * Capacity is preserved.
+ */
+void
+group_empty(group_t *g)
+{
+ int i;
+ int sz = g->grp_size;
+
+ g->grp_size = 0;
+ for (i = 0; i < sz; i++)
+ g->grp_set[i] = NULL;
+}
+
+/*
* Add element "e" to group "g"
*
* Returns -1 if addition would result in overcapacity, and
@@ -312,7 +325,7 @@ group_add_at(group_t *g, void *e, uint_t idx)
}
/*
- * Remove the entry at the specified index
+ * Remove the element at the specified index
*/
void
group_remove_at(group_t *g, uint_t idx)
@@ -320,3 +333,19 @@ group_remove_at(group_t *g, uint_t idx)
ASSERT(idx < g->grp_capacity);
g->grp_set[idx] = NULL;
}
+
+/*
+ * Find an element in the group, and return its index
+ * Returns -1 if the element could not be found.
+ */
+uint_t
+group_find(group_t *g, void *e)
+{
+ uint_t idx;
+
+ for (idx = 0; idx < g->grp_capacity; idx++) {
+ if (g->grp_set[idx] == e)
+ return (idx);
+ }
+ return ((uint_t)-1);
+}
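
The returned index is meant to be fed back into the *_at() routines (see the matching comment added to group.h later in this diff). A minimal sketch of removing an element by value, with a made-up helper name:

#include <sys/types.h>
#include <sys/group.h>

/*
 * Hedged sketch (helper name is hypothetical): remove element "e" from
 * group "g" by value, pairing group_find() with group_remove_at().
 */
static void
group_remove_element(group_t *g, void *e)
{
	uint_t idx;

	if ((idx = group_find(g, e)) != (uint_t)-1)
		group_remove_at(g, idx);
}
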
diff --git a/usr/src/uts/common/os/pg.c b/usr/src/uts/common/os/pg.c
index 9bd15af43b..82601cac77 100644
--- a/usr/src/uts/common/os/pg.c
+++ b/usr/src/uts/common/os/pg.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
@@ -99,6 +97,7 @@
static pg_t *pg_alloc_default(pg_class_t);
static void pg_free_default(pg_t *);
+static void pg_null_op();
/*
* Bootstrap CPU specific PG data
@@ -127,6 +126,12 @@ static struct pg_ops pg_ops_default = {
NULL, /* cpupart_out */
NULL, /* cpupart_move */
NULL, /* cpu_belongs */
+ NULL, /* policy_name */
+};
+
+static struct pg_cb_ops pg_cb_ops_default = {
+ pg_null_op, /* thread_swtch */
+ pg_null_op, /* thread_remain */
};
/*
@@ -144,6 +149,13 @@ static struct pg_ops pg_ops_default = {
/*
+ * Class specific PG policy name
+ */
+#define PG_POLICY_NAME(pg) \
+ ((pg)->pg_class->pgc_ops->policy_name ? \
+ (pg)->pg_class->pgc_ops->policy_name(pg) : NULL)
+
+/*
* Class specific membership test callback
*/
#define PG_CPU_BELONGS(pg, cp) \
@@ -206,13 +218,22 @@ static int pg_nclasses;
static pg_cid_t pg_default_cid;
/*
- * Initialze common PG subsystem. Perform CPU 0 initialization
+ * Initialize common PG subsystem.
*/
void
pg_init(void)
{
+ extern void pg_cmt_class_init();
+
pg_default_cid =
pg_class_register("default", &pg_ops_default, PGR_LOGICAL);
+
+ /*
+ * Initialize classes to allow them to register with the framework
+ */
+ pg_cmt_class_init();
+
+ pg_cpu0_init();
}
/*
@@ -282,7 +303,7 @@ pg_class_register(char *name, struct pg_ops *ops, pg_relation_t relation)
classes_old = pg_classes;
pg_classes =
kmem_zalloc(sizeof (pg_class_t) * (pg_nclasses + 1),
- KM_SLEEP);
+ KM_SLEEP);
(void) kcopy(classes_old, pg_classes,
sizeof (pg_class_t) * pg_nclasses);
kmem_free(classes_old, sizeof (pg_class_t) * pg_nclasses);
@@ -339,6 +360,27 @@ pg_cpu_next(pg_cpu_itr_t *itr)
}
/*
+ * Test if a given PG contains a given CPU
+ */
+boolean_t
+pg_cpu_find(pg_t *pg, cpu_t *cp)
+{
+ if (group_find(&pg->pg_cpus, cp) == (uint_t)-1)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Set the PGs callbacks to the default
+ */
+void
+pg_callback_set_defaults(pg_t *pg)
+{
+ bcopy(&pg_cb_ops_default, &pg->pg_cb, sizeof (struct pg_cb_ops));
+}
+
+/*
* Create a PG of a given class.
* This routine may block.
*/
@@ -374,6 +416,11 @@ pg_create(pg_cid_t cid)
*/
group_create(&pg->pg_cpus);
+ /*
+ * Initialize the events ops vector
+ */
+ pg_callback_set_defaults(pg);
+
return (pg);
}
@@ -620,6 +667,20 @@ pg_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
}
/*
+ * Return a class specific string describing a policy implemented
+ * across this PG
+ */
+char *
+pg_policy_name(pg_t *pg)
+{
+ char *str;
+ if ((str = PG_POLICY_NAME(pg)) != NULL)
+ return (str);
+
+ return ("N/A");
+}
+
+/*
* Provide the specified CPU a bootstrap pg
* This is needed to allow sane behaviour if any PG consuming
* code needs to deal with a partially initialized CPU
@@ -643,3 +704,52 @@ pg_free_default(struct pg *pg)
{
kmem_free(pg, sizeof (pg_t));
}
+
+static void
+pg_null_op()
+{
+}
+
+/*
+ * Invoke the "thread switch" callback for each of the CPU's PGs.
+ * This is invoked from the dispatcher swtch() routine, which is called
+ * when a thread running on a CPU should switch to another thread.
+ * "cp" is the CPU on which the thread switch is happening
+ * "now" is an unscaled hrtime_t timestamp taken in swtch()
+ * "old" and "new" are the outgoing and incoming threads, respectively.
+ */
+void
+pg_ev_thread_swtch(struct cpu *cp, hrtime_t now, kthread_t *old, kthread_t *new)
+{
+ int i, sz;
+ group_t *grp;
+ pg_t *pg;
+
+ grp = &cp->cpu_pg->pgs;
+ sz = GROUP_SIZE(grp);
+ for (i = 0; i < sz; i++) {
+ pg = GROUP_ACCESS(grp, i);
+ pg->pg_cb.thread_swtch(pg, cp, now, old, new);
+ }
+}
+
+/*
+ * Invoke the "thread remain" callback for each of the CPU's PGs.
+ * This is called from the dispatcher's swtch() routine when a thread
+ * running on the CPU "cp" is switching to itself, which can happen as an
+ * artifact of the thread's timeslice expiring.
+ */
+void
+pg_ev_thread_remain(struct cpu *cp, kthread_t *t)
+{
+ int i, sz;
+ group_t *grp;
+ pg_t *pg;
+
+ grp = &cp->cpu_pg->pgs;
+ sz = GROUP_SIZE(grp);
+ for (i = 0; i < sz; i++) {
+ pg = GROUP_ACCESS(grp, i);
+ pg->pg_cb.thread_remain(pg, cp, t);
+ }
+}
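
Both callbacks are driven from the dispatcher; the disp.c side of this change is not reproduced in this excerpt, so the following is only a hedged sketch of the shape of those call sites, with a made-up wrapper name.

#include <sys/types.h>
#include <sys/time.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/pg.h>

/*
 * Hypothetical wrapper: tell the CPU's PGs either that a new thread is
 * coming on CPU, or that the current thread keeps the CPU for another
 * timeslice.
 */
static void
dispatch_notify(struct cpu *cp, kthread_t *cur, kthread_t *next)
{
	hrtime_t now = gethrtime_unscaled();

	if (next != cur)
		pg_ev_thread_swtch(cp, now, cur, next);
	else
		pg_ev_thread_remain(cp, cur);
}
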
diff --git a/usr/src/uts/common/os/pghw.c b/usr/src/uts/common/os/pghw.c
index 8b98bb7e7c..ca59db8602 100644
--- a/usr/src/uts/common/os/pghw.c
+++ b/usr/src/uts/common/os/pghw.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
@@ -35,6 +33,7 @@
#include <sys/group.h>
#include <sys/pg.h>
#include <sys/pghw.h>
+#include <sys/cpu_pm.h>
/*
* Processor Groups: Hardware sharing relationship layer
@@ -99,7 +98,7 @@
* (the CPU's chip, cache, lgroup, etc.).
*
* The hwsets are created dynamically as new hardware sharing relationship types
- * are instantiated. They are never destroyed, as once a given relathionship
+ * are instantiated. They are never destroyed, as once a given relationship
* type appears in the system, it is quite likely that at least one instance of
* that relationship will always persist as long as the system is running.
*/
@@ -107,11 +106,6 @@
static group_t *pg_hw; /* top level pg hw group */
/*
- * Lookup table mapping hardware sharing relationships with hierarchy levels
- */
-static int pghw_level_table[PGHW_NUM_COMPONENTS];
-
-/*
* Physical PG kstats
*/
struct pghw_kstat {
@@ -120,12 +114,14 @@ struct pghw_kstat {
kstat_named_t pg_ncpus;
kstat_named_t pg_instance_id;
kstat_named_t pg_hw;
+ kstat_named_t pg_policy;
} pghw_kstat = {
{ "id", KSTAT_DATA_UINT64 },
{ "pg_class", KSTAT_DATA_STRING },
{ "ncpus", KSTAT_DATA_UINT64 },
{ "instance_id", KSTAT_DATA_UINT64 },
{ "hardware", KSTAT_DATA_STRING },
+ { "policy", KSTAT_DATA_STRING },
};
kmutex_t pghw_kstat_lock;
@@ -138,7 +134,7 @@ static void pghw_set_add(group_t *, pghw_t *);
static void pghw_set_remove(group_t *, pghw_t *);
/*
- * Initialize the physical portion of a physical PG
+ * Initialize the physical portion of a hardware PG
*/
void
pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw)
@@ -157,6 +153,22 @@ pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw)
pg->pghw_instance =
pg_plat_hw_instance_id(cp, hw);
pghw_kstat_create(pg);
+
+ /*
+ * Hardware sharing relationship specific initialization
+ */
+ switch (pg->pghw_hw) {
+ case PGHW_POW_ACTIVE:
+ pg->pghw_handle =
+ (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
+ break;
+ case PGHW_POW_IDLE:
+ pg->pghw_handle =
+ (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_IDLE);
+ break;
+ default:
+ pg->pghw_handle = (pghw_handle_t)NULL;
+ }
}
/*
@@ -262,16 +274,6 @@ pghw_physid_destroy(cpu_t *cp)
}
/*
- * Return a sequential level identifier for the specified
- * hardware sharing relationship
- */
-int
-pghw_level(pghw_type_t hw)
-{
- return (pg_plat_hw_level(hw));
-}
-
-/*
* Create a new, empty hwset.
* This routine may block, and must not be called from any
* paused CPU context.
@@ -303,13 +305,6 @@ pghw_set_create(pghw_type_t hw)
ret = group_add_at(pg_hw, g, (uint_t)hw);
ASSERT(ret == 0);
- /*
- * Update the table that maps hardware sharing relationships
- * to hierarchy levels
- */
- ASSERT(pghw_level_table[hw] == NULL);
- pghw_level_table[hw] = pg_plat_hw_level(hw);
-
return (g);
}
@@ -353,24 +348,26 @@ pghw_set_remove(group_t *hwset, pghw_t *pg)
/*
* Return a string name given a pg_hw sharing type
*/
-#define PGHW_TYPE_NAME_MAX 8
-
static char *
pghw_type_string(pghw_type_t hw)
{
switch (hw) {
case PGHW_IPIPE:
- return ("ipipe");
+ return ("Integer Pipeline");
case PGHW_CACHE:
- return ("cache");
+ return ("Cache");
case PGHW_FPU:
- return ("fpu");
+ return ("Floating Point Unit");
case PGHW_MPIPE:
- return ("mpipe");
+ return ("Data Pipe to memory");
case PGHW_CHIP:
- return ("chip");
+ return ("Socket");
case PGHW_MEMORY:
- return ("memory");
+ return ("Memory");
+ case PGHW_POW_ACTIVE:
+ return ("CPU PM Active Power Domain");
+ case PGHW_POW_IDLE:
+ return ("CPU PM Idle Power Domain");
default:
return ("unknown");
}
@@ -393,8 +390,10 @@ pghw_kstat_create(pghw_t *pg)
"pg", "pg", KSTAT_TYPE_NAMED,
sizeof (pghw_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
+ /* Class string, hw string, and policy string */
pg->pghw_kstat->ks_data_size += PG_CLASS_NAME_MAX;
- pg->pghw_kstat->ks_data_size += PGHW_TYPE_NAME_MAX;
+ pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX;
+ pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX;
pg->pghw_kstat->ks_lock = &pghw_kstat_lock;
pg->pghw_kstat->ks_data = &pghw_kstat;
pg->pghw_kstat->ks_update = pghw_kstat_update;
@@ -417,6 +416,6 @@ pghw_kstat_update(kstat_t *ksp, int rw)
pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance;
kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name);
kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw));
-
+ kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg));
return (0);
}
diff --git a/usr/src/uts/common/os/sunpm.c b/usr/src/uts/common/os/sunpm.c
index d7deefb099..84c0b9fbb6 100644
--- a/usr/src/uts/common/os/sunpm.c
+++ b/usr/src/uts/common/os/sunpm.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,6 +348,13 @@ int autopm_enabled;
pm_cpupm_t cpupm = PM_CPUPM_NOTSET;
/*
+ * Defines the default mode of operation for CPU power management,
+ * either the polling implementation or the event-based, dispatcher-driven
+ * implementation.
+ */
+pm_cpupm_t cpupm_default_mode = PM_CPUPM_EVENT;
+
+/*
* AutoS3 depends on autopm being enabled, and must be enabled by
* PM_START_AUTOS3 command.
*/
@@ -2568,7 +2575,7 @@ pm_lower_power(dev_info_t *dip, int comp, int level)
PMD(PMD_FAIL, ("%s: %s@%s(%s#%d) %s%s%s%s\n",
pmf, PM_DEVICE(dip),
!autopm_enabled ? "!autopm_enabled " : "",
- !PM_CPUPM_ENABLED ? "!cpupm_enabled " : "",
+ !PM_POLLING_CPUPM ? "!cpupm_polling " : "",
PM_CPUPM_DISABLED ? "cpupm_disabled " : "",
pm_noinvol(dip) ? "pm_noinvol()" : ""))
return (DDI_SUCCESS);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index bc7ebb334d..9cd4ae55b4 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -139,6 +139,7 @@ CHKHDRS= \
cpr.h \
cpupart.h \
cpuvar.h \
+ cpu_pm.h \
crc32.h \
cred.h \
cred_impl.h \
diff --git a/usr/src/uts/common/sys/callb.h b/usr/src/uts/common/sys/callb.h
index b548f4ca23..302f314b80 100644
--- a/usr/src/uts/common/sys/callb.h
+++ b/usr/src/uts/common/sys/callb.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_CALLB_H
#define _SYS_CALLB_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/t_lock.h>
#include <sys/thread.h>
@@ -69,7 +66,8 @@ extern "C" {
#define CB_CL_MDBOOT CB_CL_UADMIN
#define CB_CL_ENTER_DEBUGGER 14
#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
+#define CB_CL_CPU_DEEP_IDLE 16
+#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */
/*
* CB_CL_CPR_DAEMON class specific definitions are given below:
diff --git a/usr/src/uts/common/sys/cmt.h b/usr/src/uts/common/sys/cmt.h
index f1a95dc8c3..3ea49ded99 100644
--- a/usr/src/uts/common/sys/cmt.h
+++ b/usr/src/uts/common/sys/cmt.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,9 +37,20 @@ extern "C" {
#if (defined(_KERNEL) || defined(_KMEMUSER))
#include <sys/group.h>
#include <sys/pghw.h>
+#include <sys/lgrp.h>
#include <sys/types.h>
/*
+ * CMT related dispatcher policies
+ */
+#define CMT_NO_POLICY 0x0
+#define CMT_BALANCE 0x1
+#define CMT_COALESCE 0x2
+#define CMT_AFFINITY 0x4
+
+typedef uint_t pg_cmt_policy_t;
+
+/*
* CMT pg structure
*/
typedef struct pg_cmt {
@@ -47,26 +58,67 @@ typedef struct pg_cmt {
struct group *cmt_siblings; /* CMT PGs to balance with */
struct pg_cmt *cmt_parent; /* Parent CMT PG */
struct group *cmt_children; /* Active children CMT PGs */
+ pg_cmt_policy_t cmt_policy; /* Dispatcher policies to use */
+ uint32_t cmt_utilization; /* Group's utilization */
int cmt_nchildren; /* # of children CMT PGs */
- uint32_t cmt_nrunning; /* # of running threads */
+ int cmt_hint; /* hint for balancing */
struct group cmt_cpus_actv;
struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */
} pg_cmt_t;
/*
+ * CMT lgroup structure
+ */
+typedef struct cmt_lgrp {
+ group_t cl_pgs; /* Top level group of active CMT PGs */
+ int cl_npgs; /* # of top level PGs in the lgroup */
+ lgrp_handle_t cl_hand; /* lgroup's platform handle */
+ struct cmt_lgrp *cl_next; /* next cmt_lgrp */
+} cmt_lgrp_t;
+
+/*
* Change the number of running threads on the pg
*/
-#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n)))
+#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n)))
+
+/*
+ * Indicate that the given logical CPU is (or isn't) currently utilized
+ */
+#define CMT_CPU_UTILIZED(cp) (pg_cmt_load((cp), 1))
+#define CMT_CPU_NOT_UTILIZED(cp) (pg_cmt_load((cp), -1))
+
+/*
+ * CMT PG's capacity
+ *
+ * Currently, this is defined to be the number of active
+ * logical CPUs in the group.
+ *
+ * This will be used in conjunction with the utilization, which is defined
+ * to be the number of threads actively running on CPUs in the group.
+ */
+#define CMT_CAPACITY(pg) (GROUP_SIZE(&((pg_cmt_t *)pg)->cmt_cpus_actv))
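
Since utilization counts running threads and capacity counts active CPUs, a natural headroom check for a CMT PG might look like the sketch below; this is purely illustrative, and whether the dispatcher policies use exactly this comparison is not shown in this excerpt.

#include <sys/cmt.h>

/*
 * Illustrative only: does this CMT PG have fewer running threads than
 * active CPUs, i.e. spare capacity?
 */
static int
cmt_pg_has_headroom(pg_cmt_t *pg)
{
	return (pg->cmt_utilization < CMT_CAPACITY(pg));
}
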
void pg_cmt_load(cpu_t *, int);
void pg_cmt_cpu_startup(cpu_t *);
int pg_cmt_can_migrate(cpu_t *, cpu_t *);
-int pg_plat_cmt_load_bal_hw(pghw_type_t);
-int pg_plat_cmt_affinity_hw(pghw_type_t);
+/*
+ * CMT platform interfaces
+ */
+pg_cmt_policy_t pg_plat_cmt_policy(pghw_type_t);
+int pg_plat_cmt_rank(pg_cmt_t *, pg_cmt_t *);
+/*
+ * CMT dispatcher policy
+ */
cpu_t *cmt_balance(kthread_t *, cpu_t *);
+/*
+ * Power Aware Dispatcher Interfaces
+ */
+int cmt_pad_enable(pghw_type_t);
+int cmt_pad_disable(pghw_type_t);
+
#endif /* !_KERNEL && !_KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/cpu_pm.h b/usr/src/uts/common/sys/cpu_pm.h
new file mode 100644
index 0000000000..3ec3bcd68d
--- /dev/null
+++ b/usr/src/uts/common/sys/cpu_pm.h
@@ -0,0 +1,139 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPU_PM_H
+#define _CPU_PM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if (defined(_KERNEL) || defined(_KMEMUSER))
+#include <sys/cpuvar.h>
+#include <sys/processor.h>
+#include <sys/types.h>
+#include <sys/kstat.h>
+#include <sys/cmt.h>
+
+/*
+ * CPU Power Manager Policies
+ */
+typedef enum cpupm_policy {
+ CPUPM_POLICY_ELASTIC,
+ CPUPM_POLICY_DISABLED,
+ CPUPM_NUM_POLICIES
+} cpupm_policy_t;
+
+/*
+ * Power Managable CPU Domain Types
+ */
+typedef enum cpupm_dtype {
+ CPUPM_DTYPE_ACTIVE, /* Active Power Domain */
+ CPUPM_DTYPE_IDLE /* Idle Power Domain */
+} cpupm_dtype_t;
+
+/*
+ * CPUPM state names for policy implementation.
+ * The last element is used to size the enumeration.
+ */
+typedef enum cpupm_state_name {
+ CPUPM_STATE_LOW_POWER,
+ CPUPM_STATE_MAX_PERF,
+ CPUPM_STATE_NAMES
+} cpupm_state_name_t;
+
+/*
+ * Utilization events delivered by the dispatcher.
+ */
+typedef enum cpupm_util_event {
+ CPUPM_DOM_BUSY_FROM_IDLE,
+ CPUPM_DOM_IDLE_FROM_BUSY,
+ CPUPM_DOM_REMAIN_BUSY
+} cpupm_util_event_t;
+
+typedef uintptr_t cpupm_handle_t; /* Platform handle */
+
+/*
+ * CPU Power Domain State
+ */
+typedef struct cpupm_state {
+ uint32_t cps_speed;
+ cpupm_handle_t cps_handle;
+} cpupm_state_t;
+
+/*
+ * CPU Power Domain
+ */
+typedef struct cpupm_domain {
+ id_t cpd_id; /* Domain ID */
+ cpupm_dtype_t cpd_type; /* Active or Idle */
+ cpupm_state_t *cpd_states; /* Available Power States */
+ cpupm_state_t *cpd_state; /* Current State */
+ uint_t cpd_nstates; /* Number of States */
+ cpupm_state_t *cpd_named_states[CPUPM_STATE_NAMES];
+ hrtime_t cpd_last_raise; /* Last raise request time */
+ hrtime_t cpd_last_lower; /* last lower request time */
+ int cpd_tw; /* transient work history */
+ int cpd_ti; /* transient idle history */
+ boolean_t cpd_ti_governed; /* transient idle governor */
+ boolean_t cpd_tw_governed; /* transient work governor */
+ struct cpupm_domain *cpd_next;
+} cpupm_domain_t;
+
+#define CPUPM_NO_DOMAIN ((id_t)-1)
+
+/*
+ * CPU power manager domain management interfaces
+ */
+cpupm_domain_t *cpupm_domain_init(struct cpu *, cpupm_dtype_t);
+id_t cpupm_domain_id(struct cpu *, cpupm_dtype_t);
+int cpupm_change_state(struct cpu *, cpupm_domain_t *,
+ cpupm_state_t *);
+extern void cpupm_redefine_max_activepwr_state(struct cpu *, int);
+
+/*
+ * CPU power manager policy engine interfaces
+ */
+int cpupm_set_policy(cpupm_policy_t);
+cpupm_policy_t cpupm_get_policy(void);
+void cpupm_utilization_event(struct cpu *, hrtime_t,
+ cpupm_domain_t *, cpupm_util_event_t);
+
+/*
+ * CPU power platform driver interfaces
+ */
+id_t cpupm_plat_domain_id(struct cpu *, cpupm_dtype_t);
+uint_t cpupm_plat_state_enumerate(struct cpu *, cpupm_dtype_t,
+ cpupm_state_t *);
+int cpupm_plat_change_state(struct cpu *, cpupm_state_t *);
+
+
+#endif /* !_KERNEL && !_KMEMUSER */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPU_PM_H */
diff --git a/usr/src/uts/common/sys/cpudrv.h b/usr/src/uts/common/sys/cpudrv.h
index 782d8f509c..4cf4e4d1b6 100644
--- a/usr/src/uts/common/sys/cpudrv.h
+++ b/usr/src/uts/common/sys/cpudrv.h
@@ -76,10 +76,10 @@ typedef struct cpudrv_pm {
cpudrv_pm_spd_t *cur_spd; /* ptr to current speed */
uint_t num_spd; /* number of speeds */
hrtime_t lastquan_mstate[NCMSTATES]; /* last quantum's mstate */
- clock_t lastquan_ticks; /* last quantum's clock tick */
+ clock_t lastquan_ticks; /* last quantum's clock tick */
int pm_busycnt; /* pm_busy_component() count */
taskq_t *tq; /* taskq handler for CPU monitor */
- timeout_id_t timeout_id; /* cpudrv_pm_monitor()'s timeout_id */
+ timeout_id_t timeout_id; /* cpudrv_monitor()'s timeout_id */
int timeout_count; /* count dispatched timeouts */
kmutex_t timeout_lock; /* protect timeout_count */
kcondvar_t timeout_cv; /* wait on timeout_count change */
@@ -94,31 +94,31 @@ typedef struct cpudrv_pm {
* Idle & user threads water marks in percentage
*/
#if defined(__x86)
-#define CPUDRV_PM_IDLE_HWM 85 /* idle high water mark */
-#define CPUDRV_PM_IDLE_LWM 70 /* idle low water mark */
-#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */
-#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */
+#define CPUDRV_IDLE_HWM 85 /* idle high water mark */
+#define CPUDRV_IDLE_LWM 70 /* idle low water mark */
+#define CPUDRV_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */
+#define CPUDRV_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */
#else
-#define CPUDRV_PM_IDLE_HWM 98 /* idle high water mark */
-#define CPUDRV_PM_IDLE_LWM 8 /* idle low water mark */
-#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */
-#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */
+#define CPUDRV_IDLE_HWM 98 /* idle high water mark */
+#define CPUDRV_IDLE_LWM 8 /* idle low water mark */
+#define CPUDRV_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */
+#define CPUDRV_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */
#endif
-#define CPUDRV_PM_USER_HWM 20 /* user high water mark */
-#define CPUDRV_PM_IDLE_BUF_ZONE 4 /* buffer zone when going down */
+#define CPUDRV_USER_HWM 20 /* user high water mark */
+#define CPUDRV_IDLE_BUF_ZONE 4 /* buffer zone when going down */
/*
* Maximums for creating 'pm-components' property
*/
-#define CPUDRV_PM_COMP_MAX_DIG 4 /* max digits in power level */
+#define CPUDRV_COMP_MAX_DIG 4 /* max digits in power level */
/* or divisor */
-#define CPUDRV_PM_COMP_MAX_VAL 9999 /* max value in above digits */
+#define CPUDRV_COMP_MAX_VAL 9999 /* max value in above digits */
/*
* Component number for calls to PM framework
*/
-#define CPUDRV_PM_COMP_NUM 0 /* first component is 0 */
+#define CPUDRV_COMP_NUM 0 /* first component is 0 */
/*
* Quantum counts for normal and other clock speeds in terms of ticks.
@@ -132,26 +132,26 @@ typedef struct cpudrv_pm {
* that we monitor less frequently.
*
* We reach a tradeoff between these two requirements by monitoring
- * more frequently when we are in low speed mode (CPUDRV_PM_QUANT_CNT_OTHR)
+ * more frequently when we are in low speed mode (CPUDRV_QUANT_CNT_OTHR)
* so we can bring the CPU up without user noticing it. Moreover, at low
* speed we are not using CPU much so extra code execution should be fine.
* Since we are in no hurry to bring CPU down and at normal speed and we
* might really be using the CPU fully, we monitor less frequently
- * (CPUDRV_PM_QUANT_CNT_NORMAL).
+ * (CPUDRV_QUANT_CNT_NORMAL).
*/
#if defined(__x86)
-#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */
+#define CPUDRV_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */
#else
-#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */
+#define CPUDRV_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */
#endif
-#define CPUDRV_PM_QUANT_CNT_OTHR (hz * 1) /* 1 sec */
+#define CPUDRV_QUANT_CNT_OTHR (hz * 1) /* 1 sec */
/*
* Taskq parameters
*/
-#define CPUDRV_PM_TASKQ_THREADS 1 /* # threads to run CPU monitor */
-#define CPUDRV_PM_TASKQ_MIN 2 /* min # of taskq entries */
-#define CPUDRV_PM_TASKQ_MAX 2 /* max # of taskq entries */
+#define CPUDRV_TASKQ_THREADS 1 /* # threads to run CPU monitor */
+#define CPUDRV_TASKQ_MIN 2 /* min # of taskq entries */
+#define CPUDRV_TASKQ_MAX 2 /* max # of taskq entries */
/*
@@ -159,13 +159,14 @@ typedef struct cpudrv_pm {
*/
typedef struct cpudrv_devstate {
dev_info_t *dip; /* devinfo handle */
+ cpu_t *cp; /* CPU data for this node */
processorid_t cpu_id; /* CPU number for this node */
cpudrv_pm_t cpudrv_pm; /* power management data */
kmutex_t lock; /* protects state struct */
- void *mach_state; /* machine specific state */
} cpudrv_devstate_t;
extern void *cpudrv_state;
+extern boolean_t cpudrv_enabled;
/*
* Debugging definitions
@@ -191,12 +192,13 @@ extern uint_t cpudrv_debug;
#define DPRINTF(flag, args)
#endif /* DEBUG */
-extern int cpudrv_pm_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *);
-extern boolean_t cpudrv_pm_get_cpu_id(dev_info_t *, processorid_t *);
-extern boolean_t cpudrv_pm_power_ready(void);
-extern boolean_t cpudrv_pm_is_governor_thread(cpudrv_pm_t *);
-extern boolean_t cpudrv_mach_pm_init(cpudrv_devstate_t *);
-extern void cpudrv_mach_pm_free(cpudrv_devstate_t *);
+extern int cpudrv_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *);
+extern boolean_t cpudrv_get_cpu_id(dev_info_t *, processorid_t *);
+extern boolean_t cpudrv_is_governor_thread(cpudrv_pm_t *);
+extern boolean_t cpudrv_mach_init(cpudrv_devstate_t *);
+extern boolean_t cpudrv_power_ready(void);
+extern boolean_t cpudrv_is_enabled(cpudrv_devstate_t *);
+extern void cpudrv_set_supp_freqs(cpudrv_devstate_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/cpupm.h b/usr/src/uts/common/sys/cpupm.h
new file mode 100644
index 0000000000..2f74775450
--- /dev/null
+++ b/usr/src/uts/common/sys/cpupm.h
@@ -0,0 +1,43 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_H
+#define _CPUPM_H
+
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupm_mach.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cpupm_set_supp_freqs(cpu_t *, int *, uint_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_H */
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 2d056fa6ab..99829bbb03 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -366,7 +366,6 @@ extern cpu_core_t cpu_core[];
#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
-
#endif /* _KERNEL || _KMEMUSER */
#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
@@ -673,6 +672,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */
const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
+void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
/* frequencies */
diff --git a/usr/src/uts/common/sys/epm.h b/usr/src/uts/common/sys/epm.h
index 222fd59675..476b254d1a 100644
--- a/usr/src/uts/common/sys/epm.h
+++ b/usr/src/uts/common/sys/epm.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -227,7 +227,8 @@ typedef enum pm_canblock
typedef enum pm_cpupm
{
PM_CPUPM_NOTSET, /* no specific treatment of CPU devices */
- PM_CPUPM_ENABLE, /* power manage CPU devices */
+ PM_CPUPM_POLLING, /* CPUPM enabled: polling mode */
+ PM_CPUPM_EVENT, /* CPUPM enabled: event driven mode */
PM_CPUPM_DISABLE /* do not power manage CPU devices */
} pm_cpupm_t;
@@ -609,9 +610,19 @@ typedef struct pm_thresh_rec {
#define PM_ISCPU(dip) (DEVI(dip)->devi_pm_flags & PMC_CPU_DEVICE)
/*
- * Returns true if cpupm is enabled.
+ * Returns true if cpupm is enabled in event driven mode.
*/
-#define PM_CPUPM_ENABLED (cpupm == PM_CPUPM_ENABLE)
+#define PM_EVENT_CPUPM (cpupm == PM_CPUPM_EVENT)
+
+/*
+ * Returns true if cpupm is enabled in polling mode.
+ */
+#define PM_POLLING_CPUPM (cpupm == PM_CPUPM_POLLING)
+
+/*
+ * Returns true if cpupm is operating in the default mode.
+ */
+#define PM_DEFAULT_CPUPM (cpupm == cpupm_default_mode)
/*
* Returns true if is disabled.
@@ -619,12 +630,14 @@ typedef struct pm_thresh_rec {
#define PM_CPUPM_DISABLED (cpupm == PM_CPUPM_DISABLE)
/*
- * If (autopm is enabled and
- * (CPUs are not disabled, or it isn't a cpu)) OR
- * (CPUs are enabled and it is one)
+ * If ((autopm is enabled and
+ * (CPUPM is not disabled and we're not in event mode, or it isn't a cpu))
+ * OR
+ * (polling CPUPM is enabled and it is a cpu))
*/
#define PM_SCANABLE(dip) ((autopm_enabled && \
-(!PM_CPUPM_DISABLED || !PM_ISCPU(dip))) || (PM_CPUPM_ENABLED && PM_ISCPU(dip)))
+ ((!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM) || !PM_ISCPU(dip))) || \
+ (PM_POLLING_CPUPM && PM_ISCPU(dip)))
#define PM_NOT_ALL_LOWEST 0x0 /* not all components are at lowest */
#define PM_ALL_LOWEST 0x1 /* all components are at lowest lvl */
diff --git a/usr/src/uts/common/sys/group.h b/usr/src/uts/common/sys/group.h
index 89a5ca1f1a..bb5613bc35 100644
--- a/usr/src/uts/common/sys/group.h
+++ b/usr/src/uts/common/sys/group.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _GROUP_H
#define _GROUP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Group Abstraction
*/
@@ -79,13 +77,14 @@ void group_expand(group_t *, uint_t);
* Group element iteration
*/
void group_iter_init(group_iter_t *);
-void *group_iterate(group_t *, uint_t *);
+void *group_iterate(group_t *, group_iter_t *);
/*
- * Add / remove an element from the group
+ * Add / remove an element (or elements) from the group
*/
int group_add(group_t *, void *, int);
int group_remove(group_t *, void *, int);
+void group_empty(group_t *);
/*
* Add / remove / access an element at a specified index.
@@ -95,6 +94,13 @@ int group_remove(group_t *, void *, int);
int group_add_at(group_t *, void *, uint_t);
void group_remove_at(group_t *, uint_t);
+/*
+ * Search for an element in a group.
+ * Returns an index that may be used with the *_at()
+ * routines above to add or remove the element.
+ */
+uint_t group_find(group_t *, void *);
+
#endif /* !_KERNEL && !_KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/pg.h b/usr/src/uts/common/sys/pg.h
index 99c51ca09a..4ab31ffdd2 100644
--- a/usr/src/uts/common/sys/pg.h
+++ b/usr/src/uts/common/sys/pg.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PG_H
#define _PG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Processor Groups
*/
@@ -48,6 +46,8 @@ extern "C" {
typedef uint_t pgid_t; /* processor group id */
typedef uint_t pg_cid_t; /* processor group class id */
+struct pg;
+
/*
* Nature of CPU relationships
*/
@@ -57,13 +57,26 @@ typedef enum pg_relation {
} pg_relation_t;
/*
+ * Processor Group callbacks ops vector
+ * These provide a mechanism allowing per-PG routines to be invoked
+ * in response to events.
+ */
+typedef struct pg_cb_ops {
+ void (*thread_swtch)(struct pg *, struct cpu *, hrtime_t,
+ kthread_t *, kthread_t *);
+ void (*thread_remain)(struct pg *, struct cpu *,
+ kthread_t *);
+} pg_cb_ops_t;
+
+/*
* Processor group structure
*/
typedef struct pg {
- pgid_t pg_id; /* seq id */
- pg_relation_t pg_relation; /* grouping relationship */
- struct pg_class *pg_class; /* pg class */
- struct group pg_cpus; /* group of CPUs */
+ pgid_t pg_id; /* seq id */
+ pg_relation_t pg_relation; /* grouping relationship */
+ struct pg_class *pg_class; /* pg class */
+ struct group pg_cpus; /* group of CPUs */
+ pg_cb_ops_t pg_cb; /* pg events ops vector */
} pg_t;
/*
@@ -81,6 +94,7 @@ struct pg_ops {
void (*cpupart_move)(struct cpu *, struct cpupart *,
struct cpupart *);
int (*cpu_belongs)(struct pg *, struct cpu *);
+ char *(*policy_name)(struct pg *);
};
#define PG_CLASS_NAME_MAX 32
@@ -130,6 +144,12 @@ typedef struct pg_cpu_itr {
GROUP_ACCESS(&((pg_t *)pgrp)->pg_cpus, 0) : NULL)
/*
+ * Return the number of CPUs in a PG
+ */
+#define PG_NUM_CPUS(pgrp) \
+ (GROUP_SIZE(&(pgrp)->pg_cpus))
+
+/*
* Framework routines
*/
void pg_init(void);
@@ -162,7 +182,19 @@ void pg_cpu_add(pg_t *, cpu_t *);
void pg_cpu_delete(pg_t *, cpu_t *);
pg_t *pg_cpu_find_pg(cpu_t *, group_t *);
cpu_t *pg_cpu_next(pg_cpu_itr_t *);
+boolean_t pg_cpu_find(pg_t *, cpu_t *);
+
+/*
+ * PG Event callbacks
+ */
+void pg_callback_set_defaults(pg_t *);
+void pg_ev_thread_swtch(cpu_t *, hrtime_t, kthread_t *, kthread_t *);
+void pg_ev_thread_remain(cpu_t *, kthread_t *);
+/*
+ * PG Observability interfaces
+ */
+char *pg_policy_name(pg_t *);
#endif /* !_KERNEL && !_KMEMUSER */
diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h
index f22afc021b..0953bc19c9 100644
--- a/usr/src/uts/common/sys/pghw.h
+++ b/usr/src/uts/common/sys/pghw.h
@@ -19,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PGHW_H
#define _PGHW_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,27 +45,47 @@ extern "C" {
*/
typedef enum pghw_type {
PGHW_START,
- PGHW_IPIPE,
- PGHW_CACHE,
- PGHW_FPU,
- PGHW_MPIPE,
- PGHW_CHIP,
+ PGHW_IPIPE, /* Instruction Pipeline */
+ PGHW_CACHE, /* Cache (generally last level) */
+ PGHW_FPU, /* Floating Point Unit / Pipeline */
+ PGHW_MPIPE, /* Pipe to Memory */
+ PGHW_CHIP, /* Socket */
PGHW_MEMORY,
+ PGHW_POW_ACTIVE, /* Active Power Management Domain */
+ PGHW_POW_IDLE, /* Idle Power Management Domain */
PGHW_NUM_COMPONENTS
} pghw_type_t;
/*
+ * Returns true if the hardware is a type of power management domain
+ */
+#define PGHW_IS_PM_DOMAIN(hw) \
+ ((hw) == PGHW_POW_ACTIVE || (hw) == PGHW_POW_IDLE)
+
+/*
* Anonymous instance id
*/
#define PGHW_INSTANCE_ANON ((id_t)0xdecafbad)
/*
+ * Max length of PGHW kstat strings
+ */
+#define PGHW_KSTAT_STR_LEN_MAX 32
+
+
+/*
+ * Platform specific handle
+ */
+typedef uintptr_t pghw_handle_t;
+
+/*
* Processor Group (physical sharing relationship)
*/
typedef struct pghw {
pg_t pghw_pg; /* processor group */
pghw_type_t pghw_hw; /* HW sharing relationship */
id_t pghw_instance; /* sharing instance identifier */
+ pghw_handle_t pghw_handle; /* hw specific opaque handle */
kstat_t *pghw_kstat; /* physical kstats exported */
} pghw_t;
@@ -102,16 +119,14 @@ pghw_t *pghw_find_pg(cpu_t *, pghw_type_t);
pghw_t *pghw_find_by_instance(id_t, pghw_type_t);
group_t *pghw_set_lookup(pghw_type_t);
-int pghw_level(pghw_type_t);
-
void pghw_kstat_create(pghw_t *);
int pghw_kstat_update(kstat_t *, int);
/* Hardware sharing relationship platform interfaces */
int pg_plat_hw_shared(cpu_t *, pghw_type_t);
int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t);
-int pg_plat_hw_level(pghw_type_t);
id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t);
+pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t);
/*
* What comprises a "core" may vary across processor implementations,
diff --git a/usr/src/uts/common/sys/pm.h b/usr/src/uts/common/sys/pm.h
index 8be171fef1..f98bb79fcb 100644
--- a/usr/src/uts/common/sys/pm.h
+++ b/usr/src/uts/common/sys/pm.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_PM_H
#define _SYS_PM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -92,6 +90,8 @@ typedef enum {
PM_GET_DEFAULT_SYSTEM_THRESHOLD,
PM_ADD_DEPENDENT_PROPERTY,
PM_START_CPUPM,
+ PM_START_CPUPM_EV,
+ PM_START_CPUPM_POLL,
PM_STOP_CPUPM,
PM_GET_CPU_THRESHOLD,
PM_SET_CPU_THRESHOLD,
@@ -104,7 +104,10 @@ typedef enum {
PM_SEARCH_LIST, /* search S3 enable/disable list */
PM_GET_AUTOS3_STATE,
PM_GET_S3_SUPPORT_STATE,
- PM_GET_CMD_NAME
+ PM_GET_CMD_NAME,
+ PM_DISABLE_CPU_DEEP_IDLE,
+ PM_ENABLE_CPU_DEEP_IDLE,
+ PM_DEFAULT_CPU_DEEP_IDLE
} pm_cmds;
/*
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index f05dbc437a..1d690fe67f 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -40,9 +40,15 @@ CORE_OBJS += \
cmi_hw.o \
cms.o \
confunix.o \
+ cpu_idle.o \
cpuid.o \
cpuid_subr.o \
cpupm.o \
+ cpupm_mach.o \
+ cpupm_amd.o \
+ cpupm_intel.o \
+ cpupm_throttle.o \
+ cpu_acpi.o \
dis_tables.o \
ddi_impl.o \
dtrace_subr.o \
@@ -93,6 +99,8 @@ CORE_OBJS += \
pci_orion.o \
pmem.o \
ppage.o \
+ pwrnow.o \
+ speedstep.o \
startup.o \
timestamp.o \
todpc_subr.o \
@@ -169,19 +177,14 @@ PCI_E_MISC_OBJS += pcie.o pcie_fault.o
PCI_E_NEXUS_OBJS += npe.o npe_misc.o
PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o
PCINEXUS_OBJS += pci.o pci_common.o pci_kstats.o pci_tools.o
-PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o mp_platform_common.o
+PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \
+ mp_platform_common.o hpet_acpi.o
ACPI_DRV_OBJS += acpi_drv.o acpi_video.o
CPUDRV_OBJS += \
cpudrv.o \
- cpudrv_amd.o \
- cpudrv_intel.o \
- cpudrv_mach.o \
- cpudrv_throttle.o \
- cpu_acpi.o \
- speedstep.o \
- pwrnow.o
+ cpudrv_mach.o
PPM_OBJS += ppm_subr.o ppm.o ppm_plat.o
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index 1ec05b783e..dc40a16541 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -67,10 +67,6 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/acpi_drv/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
-$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/cpudrv/%.c
- $(COMPILE.c) -o $@ $<
- $(CTFCONVERT_O)
-
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -115,6 +111,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/os/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/os/cpupm/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/boot/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -270,9 +270,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi_drv/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
-$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/cpudrv/%.c
- @($(LHEAD) $(LINT.c) $< $(LTAIL))
-
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -309,6 +306,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/ml/%.s
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/cpupm/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/boot/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c b/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c
deleted file mode 100644
index d2be88c404..0000000000
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * CPU power management driver support for i86pc.
- */
-
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/cpupm.h>
-#include <sys/cpudrv_mach.h>
-#include <sys/machsystm.h>
-
-/*
- * Constants used by the Processor Device Notification handler
- * that identify what kind of change has occurred. We currently
- * only handle PPC_CHANGE_NOTIFICATION. The other two are
- * ignored.
- */
-#define PPC_CHANGE_NOTIFICATION 0x80
-#define CST_CHANGE_NOTIFICATION 0x81
-#define TPC_CHANGE_NOTIFICATION 0x82
-
-/*
- * Note that our driver numbers the power levels from lowest to
- * highest starting at 1 (i.e., the lowest power level is 1 and
- * the highest power level is cpupm->num_spd). The x86 modules get
- * their power levels from ACPI which numbers power levels from
- * highest to lowest starting at 0 (i.e., the lowest power level
- * is (cpupm->num_spd - 1) and the highest power level is 0). So to
- * map one of our driver power levels to one understood by ACPI we
- * simply subtract our driver power level from cpupm->num_spd. Likewise,
- * to map an ACPI power level to the proper driver power level, we
- * subtract the ACPI power level from cpupm->num_spd.
- */
-#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level)
-#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level)
-
-extern boolean_t cpudrv_intel_init(cpudrv_devstate_t *);
-extern boolean_t cpudrv_amd_init(cpudrv_devstate_t *);
-
-typedef struct cpudrv_mach_vendor {
- boolean_t (*cpuv_init)(cpudrv_devstate_t *);
-} cpudrv_mach_vendor_t;
-
-/*
- * Table of supported vendors.
- */
-static cpudrv_mach_vendor_t cpudrv_vendors[] = {
- cpudrv_intel_init,
- cpudrv_amd_init,
- NULL
-};
-
-uint_t
-cpudrv_pm_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- return (cpu_acpi_get_speeds(mach_state->acpi_handle, speeds));
-}
-
-void
-cpudrv_pm_free_speeds(int *speeds, uint_t nspeeds)
-{
- cpu_acpi_free_speeds(speeds, nspeeds);
-}
-
-/*
- * Change CPU speed using interface provided by module.
- */
-int
-cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpudrv_pm_t *cpupm;
- uint32_t plat_level;
- int ret;
-
- if (!(mach_state->caps & CPUDRV_P_STATES))
- return (DDI_FAILURE);
- ASSERT(mach_state->cpupm_pstate_ops != NULL);
- cpupm = &(cpudsp->cpudrv_pm);
- plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level);
- ret = mach_state->cpupm_pstate_ops->cpups_power(cpudsp, plat_level);
- if (ret != 0)
- return (DDI_FAILURE);
- return (DDI_SUCCESS);
-}
-
-/*
- * Determine the cpu_id for the CPU device.
- */
-boolean_t
-cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
-{
- return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
- DDI_PROP_DONTPASS, "reg", -1)) != -1);
-
-}
-
-/*
- * All CPU instances have been initialized successfully.
- */
-boolean_t
-cpudrv_pm_power_ready(void)
-{
- return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
-}
-
-/*
- * All CPU instances have been initialized successfully.
- */
-boolean_t
-cpudrv_pm_throttle_ready(void)
-{
- return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
-}
-
-/*
- * Is the current thread the thread that is handling the
- * PPC change notification?
- */
-boolean_t
-cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
-{
- return (curthread == cpupm->pm_governor_thread);
-}
-
-/*
- * Initialize the machine.
- * See if a module exists for managing power for this CPU.
- */
-boolean_t
-cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_vendor_t *vendors;
- cpudrv_mach_state_t *mach_state;
- int ret;
-
- mach_state = cpudsp->mach_state =
- kmem_zalloc(sizeof (cpudrv_mach_state_t), KM_SLEEP);
- mach_state->caps = CPUDRV_NO_STATES;
-
- mach_state->acpi_handle = cpu_acpi_init(cpudsp->dip);
- if (mach_state->acpi_handle == NULL) {
- cpudrv_mach_pm_free(cpudsp);
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d: "
- "unable to get ACPI handle",
- ddi_get_instance(cpudsp->dip));
- cmn_err(CE_NOTE, "!CPU power management will not function.");
- return (B_FALSE);
- }
-
- /*
- * Loop through the CPU management module table and see if
- * any of the modules implement CPU power management
- * for this CPU.
- */
- for (vendors = cpudrv_vendors; vendors->cpuv_init != NULL; vendors++) {
- if (vendors->cpuv_init(cpudsp))
- break;
- }
-
- /*
- * Nope, we can't power manage this CPU.
- */
- if (vendors == NULL) {
- cpudrv_mach_pm_free(cpudsp);
- return (B_FALSE);
- }
-
- /*
- * If P-state support exists for this system, then initialize it.
- */
- if (mach_state->cpupm_pstate_ops != NULL) {
- ret = mach_state->cpupm_pstate_ops->cpups_init(cpudsp);
- if (ret != 0) {
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:"
- " unable to initialize P-state support",
- ddi_get_instance(cpudsp->dip));
- mach_state->cpupm_pstate_ops = NULL;
- cpupm_disable(CPUPM_P_STATES);
- } else {
- mach_state->caps |= CPUDRV_P_STATES;
- }
- }
-
- if (mach_state->cpupm_tstate_ops != NULL) {
- ret = mach_state->cpupm_tstate_ops->cputs_init(cpudsp);
- if (ret != 0) {
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:"
- " unable to initialize T-state support",
- ddi_get_instance(cpudsp->dip));
- mach_state->cpupm_tstate_ops = NULL;
- cpupm_disable(CPUPM_T_STATES);
- } else {
- mach_state->caps |= CPUDRV_T_STATES;
- }
- }
-
- if (mach_state->caps == CPUDRV_NO_STATES) {
- cpudrv_mach_pm_free(cpudsp);
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-/*
- * Free any resources allocated by cpudrv_mach_pm_init().
- */
-void
-cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
-
- if (mach_state == NULL)
- return;
- if (mach_state->cpupm_pstate_ops != NULL) {
- mach_state->cpupm_pstate_ops->cpups_fini(cpudsp);
- mach_state->cpupm_pstate_ops = NULL;
- }
-
- if (mach_state->cpupm_tstate_ops != NULL) {
- mach_state->cpupm_tstate_ops->cputs_fini(cpudsp);
- mach_state->cpupm_tstate_ops = NULL;
- }
-
- if (mach_state->acpi_handle != NULL) {
- cpu_acpi_fini(mach_state->acpi_handle);
- mach_state->acpi_handle = NULL;
- }
-
- kmem_free(mach_state, sizeof (cpudrv_mach_state_t));
- cpudsp->mach_state = NULL;
-}
-
-/*
- * This routine changes the top speed to which the CPUs can transition by:
- *
- * - Resetting the up_spd for all speeds lower than the new top speed
- * to point to the new top speed.
- * - Updating the framework with a new "normal" (maximum power) for this
- * device.
- */
-void
-cpudrv_pm_set_topspeed(void *ctx, int plat_level)
-{
- cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
- cpudrv_pm_spd_t *spd;
- cpudrv_pm_spd_t *top_spd;
- dev_info_t *dip;
- int pm_level;
- int instance;
- int i;
-
- dip = ctx;
- instance = ddi_get_instance(dip);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
-
- mutex_enter(&cpudsp->lock);
- cpupm = &(cpudsp->cpudrv_pm);
- pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level);
- for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) {
- /*
- * Don't mess with speeds that are higher than the new
- * top speed. They should be out of range anyway.
- */
- if (spd->pm_level > pm_level)
- continue;
- /*
- * This is the new top speed.
- */
- if (spd->pm_level == pm_level)
- top_spd = spd;
-
- spd->up_spd = top_spd;
- }
- cpupm->top_spd = top_spd;
-
- cpupm->pm_governor_thread = curthread;
-
- mutex_exit(&cpudsp->lock);
-
- (void) pm_update_maxpower(dip, 0, top_spd->pm_level);
-}
-
-/*
- * This routine reads the ACPI _PPC object. It's accessed as a callback
- * by the ppm driver whenever a _PPC change notification is received.
- */
-int
-cpudrv_pm_get_topspeed(void *ctx)
-{
- cpudrv_mach_state_t *mach_state;
- cpu_acpi_handle_t handle;
- cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
- dev_info_t *dip;
- int instance;
- int plat_level;
- int max_level;
-
- dip = ctx;
- instance = ddi_get_instance(dip);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
- cpupm = &(cpudsp->cpudrv_pm);
- mach_state = cpudsp->mach_state;
- handle = mach_state->acpi_handle;
-
- cpu_acpi_cache_ppc(handle);
- plat_level = CPU_ACPI_PPC(handle);
- max_level = cpupm->num_spd - 1;
- if ((plat_level < 0) || (plat_level > max_level)) {
- cmn_err(CE_NOTE, "!cpudrv_pm_get_topspeed: instance %d: "
- "_PPC out of range %d", instance, plat_level);
-
- plat_level = 0;
- }
- return (plat_level);
-}
-
-/*
- * This routine reads the ACPI _TPC object. It's accessed as a callback
- * by the cpu driver whenever a _TPC change notification is received.
- */
-int
-cpudrv_pm_get_topthrottle(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state;
- cpu_acpi_handle_t handle;
- int throtl_level;
-
- mach_state = cpudsp->mach_state;
- handle = mach_state->acpi_handle;
-
- cpu_acpi_cache_tpc(handle);
- throtl_level = CPU_ACPI_TPC(handle);
- return (throtl_level);
-}
-
-/*
- * Take care of CPU throttling when _TPC notification arrives
- */
-void
-cpudrv_pm_throttle_instance(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state;
- uint32_t new_level;
- int ret;
-
- ASSERT(cpudsp != NULL);
- mach_state = cpudsp->mach_state;
- if (!(mach_state->caps & CPUDRV_T_STATES))
- return;
- ASSERT(mach_state->cpupm_tstate_ops != NULL);
-
- /*
- * Get the new T-State support level
- */
- new_level = cpudrv_pm_get_topthrottle(cpudsp);
-
- /*
- * Change the cpu throttling to the new level
- */
- ret = mach_state->cpupm_tstate_ops->cputs_throttle(cpudsp, new_level);
- if (ret != 0) {
- cmn_err(CE_WARN, "Cannot change the cpu throttling to the new"
- " level: %d, Instance: %d", new_level, cpudsp->cpu_id);
- }
-}
-
-/*
- * Take care of CPU throttling when _TPC notification arrives
- */
-void
-cpudrv_pm_manage_throttling(void *ctx)
-{
- cpudrv_devstate_t *cpudsp;
- cpudrv_mach_state_t *mach_state;
- cpudrv_tstate_domain_t *domain;
- cpudrv_tstate_domain_node_t *domain_node;
- int instance;
- boolean_t is_ready;
-
- instance = ddi_get_instance((dev_info_t *)ctx);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
-
- /*
- * We currently refuse to power manage if the CPU is not ready to
- * take cross calls (cross calls fail silently if CPU is not ready
- * for it).
- *
- * Additionally, for x86 platforms we cannot power manage
- * any one instance, until all instances have been initialized.
- * That's because we don't know what the CPU domains look like
- * until all instances have been initialized.
- */
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
- if (!is_ready) {
- DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "CPU not ready for x-calls\n", instance));
- } else if (!(is_ready = cpudrv_pm_throttle_ready())) {
- DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "waiting for all CPUs to be ready\n", instance));
- }
- if (!is_ready) {
- return;
- }
-
- mach_state = cpudsp->mach_state;
- domain_node = mach_state->tstate_domain_node;
- domain = domain_node->tdn_domain;
-
- switch (domain->td_type) {
- case CPU_ACPI_SW_ANY:
- /*
- * Just throttle the current instance; all other instances
- * under the same domain will get throttled to the same level
- */
- cpudrv_pm_throttle_instance(cpudsp);
- break;
- case CPU_ACPI_HW_ALL:
- case CPU_ACPI_SW_ALL:
- /*
- * Along with the current instance, throttle all the CPUs that
- * belong to the same domain
- */
- mutex_enter(&domain->td_lock);
- for (domain_node = domain->td_node; domain_node != NULL;
- domain_node = domain_node->tdn_next)
- cpudrv_pm_throttle_instance(domain_node->tdn_cpudsp);
- mutex_exit(&domain->td_lock);
- break;
-
- default:
- cmn_err(CE_WARN, "Not a valid coordination type (%x) to"
- " throttle cpu", domain->td_domain);
- break;
- }
-}
-
-/*
- * This notification handler is called whenever the ACPI _PPC
- * object changes. The _PPC is a sort of governor on power levels.
- * It sets an upper threshold on which _PSS-defined power levels
- * are usable. The _PPC value is dynamic and may change as properties
- * (e.g., thermal or AC source) of the system change.
- */
-/* ARGSUSED */
-static void
-cpudrv_pm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
-{
- /*
- * We only handle _PPC change notifications.
- */
- if (val == PPC_CHANGE_NOTIFICATION)
- cpudrv_pm_redefine_topspeed(ctx);
- else if (val == TPC_CHANGE_NOTIFICATION) {
- cpudrv_pm_manage_throttling(ctx);
- }
-}
-
-void
-cpudrv_pm_install_notify_handler(cpudrv_devstate_t *cpudsp, dev_info_t *dip)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_install_notify_handler(mach_state->acpi_handle,
- cpudrv_pm_notify_handler, dip);
-}
-
-void
-cpudrv_pm_redefine_topspeed(void *ctx)
-{
- /*
- * This should never happen, unless ppm does not get loaded.
- */
- if (cpupm_redefine_topspeed == NULL) {
- cmn_err(CE_WARN, "cpudrv_pm_redefine_topspeed: "
- "cpupm_redefine_topspeed has not been initialized - "
- "ignoring notification");
- return;
- }
-
- /*
- * ppm callback needs to handle redefinition for all CPUs in
- * the domain.
- */
- (*cpupm_redefine_topspeed)(ctx);
-}
diff --git a/usr/src/uts/i86pc/io/cpudrv_mach.c b/usr/src/uts/i86pc/io/cpudrv_mach.c
new file mode 100644
index 0000000000..56d2e4d6ac
--- /dev/null
+++ b/usr/src/uts/i86pc/io/cpudrv_mach.c
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * CPU power management driver support for i86pc.
+ */
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cpupm.h>
+#include <sys/cpudrv_mach.h>
+#include <sys/machsystm.h>
+#include <sys/cpu_pm.h>
+#include <sys/cpuvar.h>
+#include <sys/sdt.h>
+#include <sys/cpu_idle.h>
+
+/*
+ * Note that our driver numbers the power levels from lowest to
+ * highest starting at 1 (i.e., the lowest power level is 1 and
+ * the highest power level is cpupm->num_spd). The x86 modules get
+ * their power levels from ACPI which numbers power levels from
+ * highest to lowest starting at 0 (i.e., the lowest power level
+ * is (cpupm->num_spd - 1) and the highest power level is 0). So to
+ * map one of our driver power levels to one understood by ACPI we
+ * simply subtract our driver power level from cpupm->num_spd. Likewise,
+ * to map an ACPI power level to the proper driver power level, we
+ * subtract the ACPI power level from cpupm->num_spd.
+ */
+#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level)
+#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level)
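+
+/*
+ * Worked example (hypothetical values, for illustration only): with
+ * cpupm->num_spd == 4, driver level 4 (the fastest) maps to ACPI P-state
+ * PM_2_PLAT_LEVEL(cpupm, 4) == 0, and ACPI P-state 3 (the slowest) maps
+ * back to driver level PLAT_2_PM_LEVEL(cpupm, 3) == 1.
+ */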
+
+/*
+ * Change CPU speed using interface provided by module.
+ */
+int
+cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
+{
+ cpu_t *cp = cpudsp->cp;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpudrv_pm_t *cpupm;
+ cpuset_t set;
+ uint32_t plat_level;
+
+ if (!(mach_state->ms_caps & CPUPM_P_STATES))
+ return (DDI_FAILURE);
+ ASSERT(mach_state->ms_pstate.cma_ops != NULL);
+ cpupm = &(cpudsp->cpudrv_pm);
+ plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level);
+ CPUSET_ONLY(set, cp->cpu_id);
+ mach_state->ms_pstate.cma_ops->cpus_change(set, plat_level);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Determine the cpu_id for the CPU device.
+ */
+boolean_t
+cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
+{
+ return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "reg", -1)) != -1);
+
+}
+
+boolean_t
+cpudrv_is_enabled(cpudrv_devstate_t *cpudsp)
+{
+ cpupm_mach_state_t *mach_state;
+
+ if (!cpupm_is_enabled(CPUPM_P_STATES) || !cpudrv_enabled)
+ return (B_FALSE);
+
+ /*
+	 * Only check the instance-specific setting if it exists.
+ */
+ if (cpudsp != NULL && cpudsp->cp != NULL &&
+ cpudsp->cp->cpu_m.mcpu_pm_mach_state != NULL) {
+ mach_state =
+ (cpupm_mach_state_t *)cpudsp->cp->cpu_m.mcpu_pm_mach_state;
+ return (mach_state->ms_caps & CPUPM_P_STATES);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Is the current thread the thread that is handling the
+ * PPC change notification?
+ */
+boolean_t
+cpudrv_is_governor_thread(cpudrv_pm_t *cpupm)
+{
+ return (curthread == cpupm->pm_governor_thread);
+}
+
+/*
+ * This routine changes the top speed to which the CPUs can transition by:
+ *
+ * - Resetting the up_spd for all speeds lower than the new top speed
+ * to point to the new top speed.
+ * - Updating the framework with a new "normal" (maximum power) for this
+ * device.
+ */
+void
+cpudrv_set_topspeed(void *ctx, int plat_level)
+{
+ cpudrv_devstate_t *cpudsp;
+ cpudrv_pm_t *cpupm;
+ cpudrv_pm_spd_t *spd;
+ cpudrv_pm_spd_t *top_spd;
+ dev_info_t *dip;
+ int pm_level;
+ int instance;
+ int i;
+
+ dip = ctx;
+ instance = ddi_get_instance(dip);
+ cpudsp = ddi_get_soft_state(cpudrv_state, instance);
+ ASSERT(cpudsp != NULL);
+
+ mutex_enter(&cpudsp->lock);
+ cpupm = &(cpudsp->cpudrv_pm);
+ pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level);
+ for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) {
+ /*
+ * Don't mess with speeds that are higher than the new
+ * top speed. They should be out of range anyway.
+ */
+ if (spd->pm_level > pm_level)
+ continue;
+ /*
+ * This is the new top speed.
+ */
+ if (spd->pm_level == pm_level)
+ top_spd = spd;
+
+ spd->up_spd = top_spd;
+ }
+ cpupm->top_spd = top_spd;
+
+ cpupm->pm_governor_thread = curthread;
+
+ mutex_exit(&cpudsp->lock);
+
+ (void) pm_update_maxpower(dip, 0, top_spd->pm_level);
+}
+
+/*
+ * This routine reads the ACPI _PPC object. It's accessed as a callback
+ * by the ppm driver whenever a _PPC change notification is received.
+ */
+int
+cpudrv_get_topspeed(void *ctx)
+{
+ cpu_t *cp;
+ cpudrv_devstate_t *cpudsp;
+ dev_info_t *dip;
+ int instance;
+ int plat_level;
+
+ dip = ctx;
+ instance = ddi_get_instance(dip);
+ cpudsp = ddi_get_soft_state(cpudrv_state, instance);
+ ASSERT(cpudsp != NULL);
+ cp = cpudsp->cp;
+ plat_level = cpupm_get_top_speed(cp);
+
+ return (plat_level);
+}
+
+
+/*
+ * This notification handler is called whenever the ACPI _PPC
+ * object changes. The _PPC is a sort of governor on power levels.
+ * It sets an upper threshold on which _PSS-defined power levels
+ * are usable. The _PPC value is dynamic and may change as properties
+ * (e.g., thermal or AC source) of the system change.
+ */
+/* ARGSUSED */
+static void
+cpudrv_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+ extern pm_cpupm_t cpupm;
+
+ /*
+ * We only handle _PPC change notifications.
+ */
+ if (val == CPUPM_PPC_CHANGE_NOTIFICATION && !PM_EVENT_CPUPM)
+ cpudrv_redefine_topspeed(ctx);
+}
+
+void
+cpudrv_install_notify_handler(cpudrv_devstate_t *cpudsp)
+{
+ cpu_t *cp = cpudsp->cp;
+ cpupm_add_notify_handler(cp, cpudrv_notify_handler,
+ cpudsp->dip);
+}
+
+void
+cpudrv_redefine_topspeed(void *ctx)
+{
+ /*
+ * This should never happen, unless ppm does not get loaded.
+ */
+ if (cpupm_redefine_topspeed == NULL) {
+ cmn_err(CE_WARN, "cpudrv_redefine_topspeed: "
+ "cpupm_redefine_topspeed has not been initialized - "
+ "ignoring notification");
+ return;
+ }
+
+ /*
+ * ppm callback needs to handle redefinition for all CPUs in
+ * the domain.
+ */
+ (*cpupm_redefine_topspeed)(ctx);
+}
+
+boolean_t
+cpudrv_mach_init(cpudrv_devstate_t *cpudsp)
+{
+ cpupm_mach_state_t *mach_state;
+
+ mutex_enter(&cpu_lock);
+ cpudsp->cp = cpu_get(cpudsp->cpu_id);
+ mutex_exit(&cpu_lock);
+ if (cpudsp->cp == NULL) {
+		cmn_err(CE_WARN, "cpudrv_mach_init: instance %d: "
+ "can't get cpu_t", ddi_get_instance(cpudsp->dip));
+ return (B_FALSE);
+ }
+
+ mach_state = (cpupm_mach_state_t *)
+ (cpudsp->cp->cpu_m.mcpu_pm_mach_state);
+ mach_state->ms_dip = cpudsp->dip;
+ return (B_TRUE);
+}
+
+uint_t
+cpudrv_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds)
+{
+ return (cpupm_get_speeds(cpudsp->cp, speeds));
+}
+
+void
+cpudrv_free_speeds(int *speeds, uint_t nspeeds)
+{
+ cpupm_free_speeds(speeds, nspeeds);
+}
+
+boolean_t
+cpudrv_power_ready(void)
+{
+ return (cpupm_power_ready());
+}
+
+/* ARGSUSED */
+void
+cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp)
+{
+}
diff --git a/usr/src/uts/i86pc/io/hpet_acpi.c b/usr/src/uts/i86pc/io/hpet_acpi.c
new file mode 100644
index 0000000000..9f482f16fb
--- /dev/null
+++ b/usr/src/uts/i86pc/io/hpet_acpi.c
@@ -0,0 +1,1388 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/hpet_acpi.h>
+#include <sys/hpet.h>
+#include <sys/bitmap.h>
+#include <sys/inttypes.h>
+#include <sys/time.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/apic.h>
+#include <sys/callb.h>
+#include <sys/clock.h>
+#include <sys/archsystm.h>
+#include <sys/cpupart.h>
+
+/*
+ * hpet_state_lock is used to synchronize disabling/enabling deep c-states
+ * and to synchronize suspend/resume.
+ */
+static kmutex_t hpet_state_lock;
+static struct hpet_state {
+ boolean_t proxy_installed; /* CBE proxy interrupt setup */
+ boolean_t cpr; /* currently in CPR */
+ boolean_t cpu_deep_idle; /* user enable/disable */
+ boolean_t uni_cstate; /* disable if only one cstate */
+} hpet_state = { B_FALSE, B_FALSE, B_TRUE, B_TRUE};
+
+uint64_t hpet_spin_check = HPET_SPIN_CHECK;
+uint64_t hpet_spin_timeout = HPET_SPIN_TIMEOUT;
+uint64_t hpet_idle_spin_timeout = HPET_SPIN_TIMEOUT;
+uint64_t hpet_isr_spin_timeout = HPET_SPIN_TIMEOUT;
+
+static kmutex_t hpet_proxy_lock; /* lock for lAPIC proxy data */
+/*
+ * hpet_proxy_users is a per-cpu array.
+ */
+static hpet_proxy_t *hpet_proxy_users; /* one per CPU */
+
+
+ACPI_TABLE_HPET *hpet_table; /* ACPI HPET table */
+hpet_info_t hpet_info;			/* human-readable information */
+
+/*
+ * Provide HPET access from unix.so.
+ * Set up pointers to access symbols in pcplusmp.
+ */
+static void
+hpet_establish_hooks(void)
+{
+ hpet.install_proxy = &hpet_install_proxy;
+ hpet.callback = &hpet_callback;
+ hpet.use_hpet_timer = &hpet_use_hpet_timer;
+ hpet.use_lapic_timer = &hpet_use_lapic_timer;
+}
+
+/*
+ * Get the ACPI "HPET" table.
+ * acpi_probe() calls this function from mp_startup before drivers are loaded.
+ * acpi_probe() verified the system is using ACPI before calling this.
+ *
+ * There may be more than one ACPI HPET table (Itanium only?).
+ * Intel's HPET spec defines each timer block to have up to 32 counters and
+ * be 1024 bytes long. There can be more than one timer block of 32 counters.
+ * Each timer block would have an additional ACPI HPET table.
+ * Typical x86 systems today only have 1 HPET with 3 counters.
+ * On x86 we only consume HPET table "1" for now.
+ */
+int
+hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags)
+{
+ extern hrtime_t tsc_read(void);
+ extern int idle_cpu_no_deep_c;
+ extern int cpuid_deep_cstates_supported(void);
+ void *la;
+ uint64_t ret;
+ uint_t num_timers;
+ uint_t ti;
+
+ (void) memset(&hpet_info, 0, sizeof (hpet_info));
+ hpet.supported = HPET_NO_SUPPORT;
+
+ if (idle_cpu_no_deep_c)
+ return (DDI_FAILURE);
+
+ if (!cpuid_deep_cstates_supported())
+ return (DDI_FAILURE);
+
+ hpet_establish_hooks();
+
+ /*
+ * Get HPET ACPI table 1.
+ */
+ if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_HPET, HPET_TABLE_1,
+ (ACPI_TABLE_HEADER **)&hpet_table))) {
+ cmn_err(CE_NOTE, "!hpet_acpi: unable to get ACPI HPET table");
+ return (DDI_FAILURE);
+ }
+
+ if (hpet_validate_table(hpet_table) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_acpi: invalid HPET table");
+ return (DDI_FAILURE);
+ }
+
+ la = hpet_memory_map(hpet_table);
+ if (la == NULL) {
+ cmn_err(CE_NOTE, "!hpet_acpi: memory map HPET failed");
+ return (DDI_FAILURE);
+ }
+ hpet_info.logical_address = la;
+
+ ret = hpet_read_gen_cap(&hpet_info);
+ hpet_info.gen_cap.counter_clk_period = HPET_GCAP_CNTR_CLK_PERIOD(ret);
+ hpet_info.gen_cap.vendor_id = HPET_GCAP_VENDOR_ID(ret);
+ hpet_info.gen_cap.leg_route_cap = HPET_GCAP_LEG_ROUTE_CAP(ret);
+ hpet_info.gen_cap.count_size_cap = HPET_GCAP_CNT_SIZE_CAP(ret);
+ /*
+ * Hardware contains the last timer's number.
+ * Add 1 to get the number of timers.
+ */
+ hpet_info.gen_cap.num_tim_cap = HPET_GCAP_NUM_TIM_CAP(ret) + 1;
+ hpet_info.gen_cap.rev_id = HPET_GCAP_REV_ID(ret);
+
+ if (hpet_info.gen_cap.counter_clk_period > HPET_MAX_CLK_PERIOD) {
+ cmn_err(CE_NOTE, "!hpet_acpi: COUNTER_CLK_PERIOD 0x%lx > 0x%lx",
+ (long)hpet_info.gen_cap.counter_clk_period,
+ (long)HPET_MAX_CLK_PERIOD);
+ return (DDI_FAILURE);
+ }
+
+ num_timers = (uint_t)hpet_info.gen_cap.num_tim_cap;
+ if ((num_timers < 3) || (num_timers > 32)) {
+ cmn_err(CE_NOTE, "!hpet_acpi: invalid number of HPET timers "
+ "%lx", (long)num_timers);
+ return (DDI_FAILURE);
+ }
+ hpet_info.timer_n_config = (hpet_TN_conf_cap_t *)kmem_zalloc(
+ num_timers * sizeof (uint64_t), KM_SLEEP);
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ /*
+ * Solaris does not use the HPET Legacy Replacement Route capabilities.
+ * This feature has been off by default on test systems.
+ * The HPET spec does not specify if Legacy Replacement Route is
+	 * on or off by default, so we explicitly set it off here.
+ * It should not matter which mode the HPET is in since we use
+ * the first available non-legacy replacement timer: timer 2.
+ */
+ (void) hpet_set_leg_rt_cnf(&hpet_info, 0);
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ hpet_info.gen_intrpt_stat = hpet_read_gen_intrpt_stat(&hpet_info);
+ hpet_info.main_counter_value = hpet_read_main_counter_value(&hpet_info);
+
+ for (ti = 0; ti < num_timers; ++ti) {
+ ret = hpet_read_timer_N_config(&hpet_info, ti);
+ /*
+ * Make sure no timers are enabled (think fast reboot or
+ * virtual hardware).
+ */
+ if (ret & HPET_TIMER_N_INT_ENB_CNF_BIT) {
+ hpet_disable_timer(&hpet_info, ti);
+ ret &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
+ }
+
+ hpet_info.timer_n_config[ti] = hpet_convert_timer_N_config(ret);
+ }
+
+ /*
+ * Be aware the Main Counter may need to be initialized in the future
+ * if it is used for more than just Deep C-State support.
+	 * The HPET's Main Counter does not need to be initialized to a specific
+	 * value before it is started for waking CPUs from Deep C-States.
+ */
+ if (hpet_start_main_counter(&hpet_info) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_acpi: hpet_start_main_counter failed");
+ return (DDI_FAILURE);
+ }
+
+ hpet_info.period = hpet_info.gen_cap.counter_clk_period;
+ /*
+ * Read main counter twice to record HPET latency for debugging.
+ */
+ hpet_info.tsc[0] = tsc_read();
+ hpet_info.hpet_main_counter_reads[0] =
+ hpet_read_main_counter_value(&hpet_info);
+ hpet_info.tsc[1] = tsc_read();
+ hpet_info.hpet_main_counter_reads[1] =
+ hpet_read_main_counter_value(&hpet_info);
+ hpet_info.tsc[2] = tsc_read();
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ /*
+ * HPET main counter reads are supported now.
+ */
+ hpet.supported = HPET_TIMER_SUPPORT;
+
+ return (hpet_init_proxy(hpet_vect, hpet_flags));
+}
+
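+/*
+ * Tear down HPET state set up by hpet_acpi_init(). The comparisons below
+ * rely on the support levels being ordered HPET_NO_SUPPORT <
+ * HPET_TIMER_SUPPORT < HPET_FULL_SUPPORT.
+ */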
+void
+hpet_acpi_fini(void)
+{
+ if (hpet.supported == HPET_NO_SUPPORT)
+ return;
+ if (hpet.supported >= HPET_TIMER_SUPPORT)
+ hpet_stop_main_counter(&hpet_info);
+ if (hpet.supported > HPET_TIMER_SUPPORT)
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+}
+
+/*
+ * Do initial setup to use an HPET timer as a proxy for Deep C-state stalled
+ * LAPIC Timers. Get a free HPET timer that supports I/O APIC routed interrupt.
+ * Set up data to handle the timer's ISR, and add the timer's interrupt.
+ *
+ * The DDI cannot be used to allocate the HPET timer's interrupt.
+ * ioapic_init_intr() in mp_platform_common.c later sets up the I/O APIC
+ * to handle the HPET timer's interrupt.
+ *
+ * Note: FSB (MSI) interrupts are not currently supported by Intel HPETs as of
+ * ICH9. The HPET spec allows for MSI. In the future MSI may be preferred.
+ */
+static int
+hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags)
+{
+ if (hpet_get_IOAPIC_intr_capable_timer(&hpet_info) == -1) {
+ cmn_err(CE_WARN, "!hpet_acpi: get ioapic intr failed.");
+ return (DDI_FAILURE);
+ }
+
+ hpet_init_proxy_data();
+
+ if (hpet_install_interrupt_handler(&hpet_isr,
+ hpet_info.cstate_timer.intr) != AE_OK) {
+ cmn_err(CE_WARN, "!hpet_acpi: install interrupt failed.");
+ return (DDI_FAILURE);
+ }
+ *hpet_vect = hpet_info.cstate_timer.intr;
+ hpet_flags->intr_el = INTR_EL_LEVEL;
+ hpet_flags->intr_po = INTR_PO_ACTIVE_HIGH;
+ hpet_flags->bustype = BUS_PCI; /* we *do* conform to PCI */
+
+ /*
+	 * Avoid a possibly stuck interrupt by programming the HPET's timer here
+ * before the I/O APIC is programmed to handle this interrupt.
+ */
+ hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
+ hpet_info.cstate_timer.intr);
+
+ /*
+ * All HPET functionality is supported.
+ */
+ hpet.supported = HPET_FULL_SUPPORT;
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Called by kernel if it can support Deep C-States.
+ */
+static boolean_t
+hpet_install_proxy(void)
+{
+ if (hpet_state.proxy_installed == B_TRUE)
+ return (B_TRUE);
+
+ if (hpet.supported != HPET_FULL_SUPPORT)
+ return (B_FALSE);
+
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ hpet_state.proxy_installed = B_TRUE;
+
+ return (B_TRUE);
+}
+
+/*
+ * Remove the interrupt that was added with add_avintr() in
+ * hpet_install_interrupt_handler().
+ */
+static void
+hpet_uninstall_interrupt_handler(void)
+{
+ rem_avintr(NULL, CBE_HIGH_PIL, (avfunc)&hpet_isr,
+ hpet_info.cstate_timer.intr);
+}
+
+static int
+hpet_validate_table(ACPI_TABLE_HPET *hpet_table)
+{
+ ACPI_TABLE_HEADER *table_header = (ACPI_TABLE_HEADER *)hpet_table;
+
+ if (table_header->Length != sizeof (ACPI_TABLE_HPET)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Length %lx != sizeof ("
+ "ACPI_TABLE_HPET) %lx.",
+ (unsigned long)((ACPI_TABLE_HEADER *)hpet_table)->Length,
+ (unsigned long)sizeof (ACPI_TABLE_HPET));
+ return (AE_ERROR);
+ }
+
+ if (!ACPI_COMPARE_NAME(table_header->Signature, ACPI_SIG_HPET)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET table "
+ "signature");
+ return (AE_ERROR);
+ }
+
+ if (!hpet_checksum_table((unsigned char *)hpet_table,
+ (unsigned int)table_header->Length)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET checksum");
+ return (AE_ERROR);
+ }
+
+ /*
+ * Sequence should be table number - 1. We are using table 1.
+ */
+ if (hpet_table->Sequence != HPET_TABLE_1 - 1) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid Sequence %lx",
+ (long)hpet_table->Sequence);
+ return (AE_ERROR);
+ }
+
+ return (AE_OK);
+}
+
+static boolean_t
+hpet_checksum_table(unsigned char *table, unsigned int length)
+{
+ unsigned char checksum = 0;
+ int i;
+
+ for (i = 0; i < length; ++i, ++table)
+ checksum += *table;
+
+ return (checksum == 0);
+}
+
+static void *
+hpet_memory_map(ACPI_TABLE_HPET *hpet_table)
+{
+ return (AcpiOsMapMemory(hpet_table->Address.Address, HPET_SIZE));
+}
+
+static int
+hpet_start_main_counter(hpet_info_t *hip)
+{
+ uint64_t *gcr_ptr;
+ uint64_t gcr;
+
+ gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
+ gcr = *gcr_ptr;
+
+ gcr |= HPET_GCFR_ENABLE_CNF;
+ *gcr_ptr = gcr;
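+	/* Read the register back to confirm the enable bit actually latched. */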
+ gcr = *gcr_ptr;
+
+ return (gcr & HPET_GCFR_ENABLE_CNF ? AE_OK : ~AE_OK);
+}
+
+static int
+hpet_stop_main_counter(hpet_info_t *hip)
+{
+ uint64_t *gcr_ptr;
+ uint64_t gcr;
+
+ gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
+ gcr = *gcr_ptr;
+
+ gcr &= ~HPET_GCFR_ENABLE_CNF;
+ *gcr_ptr = gcr;
+ gcr = *gcr_ptr;
+
+ return (gcr & HPET_GCFR_ENABLE_CNF ? ~AE_OK : AE_OK);
+}
+
+/*
+ * Set the Legacy Replacement Route bit.
+ * This should be called before setting up timers.
+ * The HPET specification is silent regarding setting this after timers are
+ * programmed.
+ */
+static uint64_t
+hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value)
+{
+ uint64_t gen_conf = hpet_read_gen_config(hip);
+
+ switch (new_value) {
+ case 0:
+ gen_conf &= ~HPET_GCFR_LEG_RT_CNF;
+ break;
+
+ case HPET_GCFR_LEG_RT_CNF:
+ gen_conf |= HPET_GCFR_LEG_RT_CNF;
+ break;
+
+ default:
+ ASSERT(new_value == 0 || new_value == HPET_GCFR_LEG_RT_CNF);
+ break;
+ }
+ hpet_write_gen_config(hip, gen_conf);
+ return (gen_conf);
+}
+
+static uint64_t
+hpet_read_gen_cap(hpet_info_t *hip)
+{
+ return (*(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address));
+}
+
+static uint64_t
+hpet_read_gen_config(hpet_info_t *hip)
+{
+ return (*(uint64_t *)
+ HPET_GEN_CONFIG_ADDRESS(hip->logical_address));
+}
+
+static uint64_t
+hpet_read_gen_intrpt_stat(hpet_info_t *hip)
+{
+ hip->gen_intrpt_stat = *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(
+ hip->logical_address);
+ return (hip->gen_intrpt_stat);
+}
+
+static uint64_t
+hpet_read_timer_N_config(hpet_info_t *hip, uint_t n)
+{
+ uint64_t conf = *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n);
+ hip->timer_n_config[n] = hpet_convert_timer_N_config(conf);
+ return (conf);
+}
+
+static hpet_TN_conf_cap_t
+hpet_convert_timer_N_config(uint64_t conf)
+{
+ hpet_TN_conf_cap_t cc = { 0 };
+
+ cc.int_route_cap = HPET_TIMER_N_INT_ROUTE_CAP(conf);
+ cc.fsb_int_del_cap = HPET_TIMER_N_FSB_INT_DEL_CAP(conf);
+ cc.fsb_int_en_cnf = HPET_TIMER_N_FSB_EN_CNF(conf);
+ cc.int_route_cnf = HPET_TIMER_N_INT_ROUTE_CNF(conf);
+ cc.mode32_cnf = HPET_TIMER_N_MODE32_CNF(conf);
+ cc.val_set_cnf = HPET_TIMER_N_VAL_SET_CNF(conf);
+ cc.size_cap = HPET_TIMER_N_SIZE_CAP(conf);
+ cc.per_int_cap = HPET_TIMER_N_PER_INT_CAP(conf);
+ cc.type_cnf = HPET_TIMER_N_TYPE_CNF(conf);
+ cc.int_enb_cnf = HPET_TIMER_N_INT_ENB_CNF(conf);
+ cc.int_type_cnf = HPET_TIMER_N_INT_TYPE_CNF(conf);
+
+ return (cc);
+}
+
+static uint64_t
+hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n)
+{
+ if (hip->timer_n_config[n].size_cap == 1)
+ return (*(uint64_t *)
+ HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n));
+ else
+ return (*(uint32_t *)
+ HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n));
+}
+
+static uint64_t
+hpet_read_main_counter_value(hpet_info_t *hip)
+{
+ uint64_t value;
+ uint32_t *counter;
+ uint32_t high1, high2, low;
+
+ counter = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address);
+
+ /*
+ * 32-bit main counters
+ */
+ if (hip->gen_cap.count_size_cap == 0) {
+ value = (uint64_t)*counter;
+ hip->main_counter_value = value;
+ return (value);
+ }
+
+ /*
+ * HPET spec claims a 64-bit read can be split into two 32-bit reads
+ * by the hardware connection to the HPET.
+ */
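+	/*
+	 * Reading high, then low, then high again guards against a torn
+	 * read: if the low word wrapped between the 32-bit reads, the two
+	 * high words differ and the loop retries for a consistent value.
+	 */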
+ high2 = counter[1];
+ do {
+ high1 = high2;
+ low = counter[0];
+ high2 = counter[1];
+ } while (high2 != high1);
+
+ value = ((uint64_t)high1 << 32) | low;
+ hip->main_counter_value = value;
+ return (value);
+}
+
+static void
+hpet_write_gen_cap(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_gen_config(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l)
+{
+ if (hip->timer_n_config[n].size_cap == 1)
+ *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n) = l;
+ else
+ *(uint32_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n) = (uint32_t)(0xFFFFFFFF & l);
+}
+
+static void
+hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l)
+{
+ *(uint64_t *)HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n) = l;
+}
+
+static void
+hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n)
+{
+ uint64_t l;
+
+ l = hpet_read_timer_N_config(hip, timer_n);
+ l &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
+ hpet_write_timer_N_config(hip, timer_n, l);
+}
+
+static void
+hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n)
+{
+ uint64_t l;
+
+ l = hpet_read_timer_N_config(hip, timer_n);
+ l |= HPET_TIMER_N_INT_ENB_CNF_BIT;
+ hpet_write_timer_N_config(hip, timer_n, l);
+}
+
+static void
+hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l)
+{
+ uint32_t *address;
+
+ /*
+ * HPET spec 1.0a states main counter register should be halted before
+ * it is written to.
+ */
+ ASSERT(!(hpet_read_gen_config(hip) & HPET_GCFR_ENABLE_CNF));
+
+ if (hip->gen_cap.count_size_cap == 1) {
+ *(uint64_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address)
+ = l;
+ } else {
+ address = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(
+ hip->logical_address);
+
+ address[0] = (uint32_t)(l & 0xFFFFFFFF);
+ }
+}
+
+/*
+ * Add the interrupt handler for I/O APIC interrupt number (interrupt line).
+ *
+ * The I/O APIC line (vector) is programmed in ioapic_init_intr() called
+ * from apic_picinit() psm_ops apic_ops entry point after we return from
+ * apic_init() psm_ops entry point.
+ */
+static uint32_t
+hpet_install_interrupt_handler(uint_t (*func)(char *), int vector)
+{
+ uint32_t retval;
+
+ retval = add_avintr(NULL, CBE_HIGH_PIL, (avfunc)func, "HPET Timer",
+ vector, NULL, NULL, NULL, NULL);
+ if (retval == 0) {
+ cmn_err(CE_WARN, "!hpet_acpi: add_avintr() failed");
+ return (AE_BAD_PARAMETER);
+ }
+ return (AE_OK);
+}
+
+/*
+ * The HPET timers specify which I/O APIC interrupts they can be routed to.
+ * Find the first available non-legacy-replacement timer and its I/O APIC irq.
+ * Supported I/O APIC IRQs are specified in the int_route_cap bitmap in each
+ * timer's timer_n_config register.
+ */
+static int
+hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip)
+{
+ int timer;
+ int intr;
+
+ for (timer = HPET_FIRST_NON_LEGACY_TIMER;
+ timer < hip->gen_cap.num_tim_cap; ++timer) {
+
+ if (!hpet_timer_available(hip->allocated_timers, timer))
+ continue;
+
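+		/* lowbit() is 1-based; subtract 1 for the IRQ, or -1 if none */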
+ intr = lowbit(hip->timer_n_config[timer].int_route_cap) - 1;
+ if (intr >= 0) {
+ hpet_timer_alloc(&hip->allocated_timers, timer);
+ hip->cstate_timer.timer = timer;
+ hip->cstate_timer.intr = intr;
+ return (timer);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Mark this timer as used.
+ */
+static void
+hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n)
+{
+ *allocated_timers |= 1 << n;
+}
+
+/*
+ * Check if this timer is available.
+ * No mutual exclusion because only one thread uses this.
+ */
+static int
+hpet_timer_available(uint32_t allocated_timers, uint32_t n)
+{
+ return ((allocated_timers & (1 << n)) == 0);
+}
+
+/*
+ * Setup timer N to route its interrupt to I/O APIC.
+ */
+static void
+hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, uint32_t interrupt)
+{
+ uint64_t conf;
+
+ conf = hpet_read_timer_N_config(hip, timer_n);
+
+ /*
+ * Caller is required to verify this interrupt route is supported.
+ */
+ ASSERT(HPET_TIMER_N_INT_ROUTE_CAP(conf) & (1 << interrupt));
+
+ conf &= ~HPET_TIMER_N_FSB_EN_CNF_BIT; /* use IOAPIC */
+ conf |= HPET_TIMER_N_INT_ROUTE_SHIFT(interrupt);
+ conf &= ~HPET_TIMER_N_TYPE_CNF_BIT; /* non periodic */
+ conf &= ~HPET_TIMER_N_INT_ENB_CNF_BIT; /* disabled */
+ conf |= HPET_TIMER_N_INT_TYPE_CNF_BIT; /* Level Triggered */
+
+ hpet_write_timer_N_config(hip, timer_n, conf);
+}
+
+/*
+ * The HPET's Main Counter is not stopped before programming an HPET timer.
+ * This will allow the HPET to be used as a time source.
+ * The programmed timer interrupt may occur before this function returns.
+ * Callers must block interrupts before calling this function if they must
+ * guarantee the interrupt is handled after this function returns.
+ *
+ * Return 0 if main counter is less than timer after enabling timer.
+ * The interrupt was programmed, but it may fire before this returns.
+ * Return !0 if main counter is greater than timer after enabling timer.
+ * In other words: the timer will not fire, and we do not know if it did fire.
+ *
+ * delta is in HPET ticks.
+ *
+ * Writing a 64-bit value to a 32-bit register will "wrap around".
+ * A 32-bit HPET timer will wrap around in a little over 5 minutes.
+ */
+int
+hpet_timer_program(hpet_info_t *hip, uint32_t timer, uint64_t delta)
+{
+ uint64_t time, program;
+
+ program = hpet_read_main_counter_value(hip);
+ program += delta;
+ hpet_write_timer_N_comp(hip, timer, program);
+
+ time = hpet_read_main_counter_value(hip);
+ if (time < program)
+ return (AE_OK);
+
+ return (AE_TIME);
+}
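+
+/*
+ * Usage sketch (illustrative only): callers convert a relative hrtime
+ * delta into HPET ticks before programming, e.g.
+ *
+ *	delta = HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime());
+ *	if (hpet_timer_program(&hpet_info, proxy_timer, delta) != AE_OK)
+ *		wake the target CPU directly to avoid a lost wakeup;
+ *
+ * as done in hpet_guaranteed_schedule() below.
+ */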
+
+/*
+ * CPR and power policy-change callback entry point.
+ */
+boolean_t
+hpet_callback(int code)
+{
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ return (hpet_deep_idle_config(code));
+
+ case CB_CODE_CPR_RESUME:
+ /*FALLTHROUGH*/
+ case CB_CODE_CPR_CHKPT:
+ return (hpet_cpr(code));
+
+ case CST_EVENT_MULTIPLE_CSTATES:
+ hpet_cst_callback(CST_EVENT_MULTIPLE_CSTATES);
+ return (B_TRUE);
+
+ case CST_EVENT_ONE_CSTATE:
+ hpet_cst_callback(CST_EVENT_ONE_CSTATE);
+ return (B_TRUE);
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_callback: invalid code %d\n", code);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * According to the HPET spec 1.0a: the Operating System must save and restore
+ * HPET event timer hardware context through ACPI sleep state transitions.
+ * Timer registers (including the main counter) may not be preserved through
+ * ACPI S3, S4, or S5 sleep states. This code does not support S1 or S2.
+ *
+ * Current HPET state is already in hpet.supported and
+ * hpet_state.proxy_installed. hpet_info contains the proxy interrupt HPET
+ * Timer state.
+ *
+ * Future projects beware: the HPET Main Counter is undefined after ACPI S3 or
+ * S4, and it is not saved/restored here. Future projects cannot expect the
+ * Main Counter to be monotonically (or accurately) increasing across CPR.
+ *
+ * Note: the CPR Checkpoint path later calls pause_cpus() which ensures all
+ * CPUs are awake and in a spin loop before the system suspends. The HPET is
+ * not needed for Deep C-state wakeup when CPUs are in cpu_pause().
+ * It is safe to leave the HPET running as the system suspends; we just
+ * disable the timer from generating interrupts here.
+ */
+static boolean_t
+hpet_cpr(int code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+ boolean_t ret = B_TRUE;
+
+ mutex_enter(&hpet_state_lock);
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ if (hpet_state.proxy_installed == B_FALSE)
+ break;
+
+ hpet_state.cpr = B_TRUE;
+
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_state.cpr = B_FALSE;
+ mutex_exit(&hpet_state_lock);
+ cmn_err(CE_NOTE, "!hpet_cpr: deadman");
+ return (B_FALSE);
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ if (hpet_resume() == B_TRUE)
+ hpet_state.cpr = B_FALSE;
+ else
+ cmn_err(CE_NOTE, "!hpet_resume failed.");
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_cpr: invalid code %d\n", code);
+ ret = B_FALSE;
+ break;
+ }
+ mutex_exit(&hpet_state_lock);
+ return (ret);
+}
+
+/*
+ * Assume the HPET stopped in Suspend state and timer state was lost.
+ */
+static boolean_t
+hpet_resume(void)
+{
+ if (hpet.supported != HPET_TIMER_SUPPORT)
+ return (B_TRUE);
+
+ /*
+ * The HPET spec does not specify if Legacy Replacement Route is
+ * on or off by default, so we set it off here.
+ */
+ (void) hpet_set_leg_rt_cnf(&hpet_info, 0);
+
+ if (hpet_start_main_counter(&hpet_info) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_resume: start main counter failed");
+ hpet.supported = HPET_NO_SUPPORT;
+ if (hpet_state.proxy_installed == B_TRUE) {
+ hpet_state.proxy_installed = B_FALSE;
+ hpet_uninstall_interrupt_handler();
+ }
+ return (B_FALSE);
+ }
+
+ if (hpet_state.proxy_installed == B_FALSE)
+ return (B_TRUE);
+
+ hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
+ hpet_info.cstate_timer.intr);
+ if (hpet_state.cpu_deep_idle == B_TRUE)
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+
+ return (B_TRUE);
+}
+
+/*
+ * Callback to enable/disable Deep C-States based on power.conf setting.
+ */
+static boolean_t
+hpet_deep_idle_config(int code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+ boolean_t ret = B_TRUE;
+
+ mutex_enter(&hpet_state_lock);
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+
+ if (hpet_state.cpu_deep_idle == B_TRUE)
+ break;
+
+ if (hpet_state.proxy_installed == B_FALSE) {
+ ret = B_FALSE; /* Deep C-States not supported */
+ break;
+ }
+
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ hpet_state.cpu_deep_idle = B_TRUE;
+ break;
+
+ case PM_DISABLE_CPU_DEEP_IDLE:
+
+ if ((hpet_state.cpu_deep_idle == B_FALSE) ||
+ (hpet_state.proxy_installed == B_FALSE))
+ break;
+
+ /*
+ * The order of these operations is important to avoid
+ * lost wakeups: Set a flag to refuse all future LAPIC Timer
+ * proxy requests, then wake up all CPUs from deep C-state,
+ * and finally disable the HPET interrupt-generating timer.
+ */
+ hpet_state.cpu_deep_idle = B_FALSE;
+
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_state.cpu_deep_idle = B_TRUE;
+ mutex_exit(&hpet_state_lock);
+ cmn_err(CE_NOTE,
+ "!hpet_deep_idle_config: deadman");
+ return (B_FALSE);
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_deep_idle_config: invalid code %d\n",
+ code);
+ ret = B_FALSE;
+ break;
+ }
+ mutex_exit(&hpet_state_lock);
+
+ return (ret);
+}
+
+/*
+ * Callback for _CST c-state change notifications.
+ */
+static void
+hpet_cst_callback(uint32_t code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+
+ switch (code) {
+ case CST_EVENT_ONE_CSTATE:
+ hpet_state.uni_cstate = B_TRUE;
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_expire_all();
+ cmn_err(CE_NOTE,
+ "!hpet_cst_callback: deadman");
+ return;
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+ break;
+
+ case CST_EVENT_MULTIPLE_CSTATES:
+ hpet_state.uni_cstate = B_FALSE;
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_cst_callback: invalid code %d\n", code);
+ break;
+ }
+}
+
+/*
+ * Interrupt Service Routine for HPET I/O-APIC-generated interrupts.
+ * Used to wake up CPUs from Deep C-state when their Local APIC Timer stops.
+ * This ISR runs on one CPU which pokes other CPUs out of Deep C-state as
+ * needed.
+ */
+/* ARGSUSED */
+static uint_t
+hpet_isr(char *arg)
+{
+ uint64_t timer_status;
+ uint64_t timer_mask;
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_isr_spin_timeout;
+
+ timer_mask = HPET_INTR_STATUS_MASK(hpet_info.cstate_timer.timer);
+
+ /*
+ * We are using a level-triggered interrupt.
+ * HPET sets timer's General Interrupt Status Register bit N.
+ * ISR checks this bit to see if it needs servicing.
+ * ISR then clears this bit by writing 1 to that bit.
+ */
+ timer_status = hpet_read_gen_intrpt_stat(&hpet_info);
+ if (!(timer_status & timer_mask))
+ return (DDI_INTR_UNCLAIMED);
+ hpet_write_gen_intrpt_stat(&hpet_info, timer_mask);
+
+ /*
+ * Do not touch ISR data structures before checking the HPET's General
+ * Interrupt Status register. The General Interrupt Status register
+ * will not be set by hardware until after timer interrupt generation
+ * is enabled by software. Software allocates necessary data
+ * structures before enabling timer interrupts. ASSERT the software
+ * data structures required to handle this interrupt are initialized.
+ */
+ ASSERT(hpet_proxy_users != NULL);
+
+ /*
+ * CPUs in deep c-states do not enable interrupts until after
+ * performing idle cleanup which includes descheduling themselves from
+ * the HPET. The CPU running this ISR will NEVER find itself in the
+ * proxy list. A lost wakeup may occur if this is false.
+ */
+ ASSERT(hpet_proxy_users[CPU->cpu_id] == HPET_INFINITY);
+
+ /*
+ * Higher level interrupts may deadlock with CPUs going idle if this
+	 * ISR is preempted while holding hpet_proxy_lock.
+ */
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_expire_all();
+ return (DDI_INTR_CLAIMED);
+ }
+ }
+ intr = intr_clear();
+ }
+ (void) hpet_guaranteed_schedule(HPET_INFINITY);
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * Used when disabling the HPET Timer interrupt. CPUs in Deep C-state must be
+ * woken up because they can no longer rely on the HPET's Timer to wake them.
+ * We do not need to wait for CPUs to wake up.
+ */
+static void
+hpet_expire_all(void)
+{
+ processorid_t id;
+
+ for (id = 0; id < ncpus; ++id) {
+ if (hpet_proxy_users[id] != HPET_INFINITY) {
+ hpet_proxy_users[id] = HPET_INFINITY;
+ if (id != CPU->cpu_id)
+ poke_cpu(id);
+ }
+ }
+}
+
+/*
+ * To avoid missed wakeups this function must guarantee either the HPET timer
+ * was successfully programmed to the next expire time or there are no waiting
+ * CPUs.
+ *
+ * Callers cannot enter C2 or deeper if the HPET could not be programmed to
+ * generate its next interrupt to happen at required_wakeup_time or sooner.
+ * Returns B_TRUE if the HPET was programmed to interrupt by
+ * required_wakeup_time, B_FALSE if not.
+ */
+static boolean_t
+hpet_guaranteed_schedule(hrtime_t required_wakeup_time)
+{
+ hrtime_t now, next_proxy_time;
+ processorid_t id, next_proxy_id;
+ int proxy_timer = hpet_info.cstate_timer.timer;
+ boolean_t done = B_FALSE;
+
+ ASSERT(mutex_owned(&hpet_proxy_lock));
+
+ /*
+ * Loop until we successfully program the HPET,
+ * or no CPUs are scheduled to use the HPET as a proxy.
+ */
+ do {
+ /*
+ * Wake all CPUs that expired before now.
+ * Find the next CPU to wake up and next HPET program time.
+ */
+ now = gethrtime();
+ next_proxy_time = HPET_INFINITY;
+ next_proxy_id = CPU->cpu_id;
+ for (id = 0; id < ncpus; ++id) {
+ if (hpet_proxy_users[id] < now) {
+ hpet_proxy_users[id] = HPET_INFINITY;
+ if (id != CPU->cpu_id)
+ poke_cpu(id);
+ } else if (hpet_proxy_users[id] < next_proxy_time) {
+ next_proxy_time = hpet_proxy_users[id];
+ next_proxy_id = id;
+ }
+ }
+
+ if (next_proxy_time == HPET_INFINITY) {
+ done = B_TRUE;
+ /*
+ * There are currently no CPUs using the HPET's Timer
+ * as a proxy for their LAPIC Timer. The HPET's Timer
+ * does not need to be programmed.
+ *
+ * Letting the HPET timer wrap around to the current
+ * time is the longest possible timeout.
+ * A 64-bit timer will wrap around in ~ 2^44 seconds.
+ * A 32-bit timer will wrap around in ~ 2^12 seconds.
+ *
+ * Disabling the HPET's timer interrupt requires a
+ * (relatively expensive) write to the HPET.
+ * Instead we do nothing.
+ *
+ * We are gambling some CPU will attempt to enter a
+ * deep c-state before the timer wraps around.
+ * We assume one spurious interrupt in a little over an
+ * hour has less performance impact than writing to the
+ * HPET's timer disable bit every time all CPUs wakeup
+ * from deep c-state.
+ */
+
+ } else {
+ /*
+ * Idle CPUs disable interrupts before programming the
+ * HPET to prevent a lost wakeup if the HPET
+ * interrupts the idle cpu before it can enter a
+ * Deep C-State.
+ */
+ if (hpet_timer_program(&hpet_info, proxy_timer,
+ HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime()))
+ != AE_OK) {
+ /*
+ * We could not program the HPET to wakeup the
+ * next CPU. We must wake the CPU ourself to
+ * avoid a lost wakeup.
+ */
+ hpet_proxy_users[next_proxy_id] = HPET_INFINITY;
+ if (next_proxy_id != CPU->cpu_id)
+ poke_cpu(next_proxy_id);
+ } else {
+ done = B_TRUE;
+ }
+ }
+
+ } while (!done);
+
+ return (next_proxy_time <= required_wakeup_time);
+}
+
+/*
+ * Use an HPET timer to act as this CPU's proxy local APIC timer.
+ * Used in deep c-states C2 and above while the CPU's local APIC timer stalls.
+ * Called by the idle thread with interrupts enabled.
+ * Always returns with interrupts disabled.
+ *
+ * There are 3 possible outcomes from this function:
+ * 1. The Local APIC Timer was already disabled before this function was called.
+ * LAPIC TIMER : disabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_TRUE
+ * 2. Successfully programmed the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : disabled
+ * HPET : scheduled to wake this CPU
+ * *lapic_expire : hrtime_t when LAPIC timer would have expired
+ * Returns : B_TRUE
+ * 3. Failed to program the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : enabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_FALSE
+ *
+ * The idle thread cannot enter Deep C-State in case 3.
+ * The idle thread must re-enable & re-program the LAPIC_TIMER in case 2.
+ */
+static boolean_t
+hpet_use_hpet_timer(hrtime_t *lapic_expire)
+{
+ extern hrtime_t apic_timer_stop_count(void);
+ extern void apic_timer_restart(hrtime_t);
+ hrtime_t now, expire, dead;
+ uint64_t lapic_count, dead_count;
+ cpupart_t *cpu_part;
+ processorid_t cpu_sid;
+ processorid_t cpu_id = CPU->cpu_id;
+ processorid_t id;
+ boolean_t rslt;
+ boolean_t hset_update;
+
+ cpu_part = CPU->cpu_part;
+ cpu_sid = CPU->cpu_seqid;
+
+ ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
+ ASSERT(interrupts_enabled());
+
+ /*
+ * A critical section exists between when the HPET is programmed
+ * to interrupt the CPU and when this CPU enters an idle state.
+ * Interrupts must be blocked during that time to prevent lost
+ * CBE wakeup interrupts from either LAPIC or HPET.
+ *
+ * Must block interrupts before acquiring hpet_proxy_lock to prevent
+ * a deadlock with the ISR if the ISR runs on this CPU after the
+ * idle thread acquires the mutex but before it clears interrupts.
+ */
+ cli();
+
+ lapic_count = apic_timer_stop_count();
+ now = gethrtime();
+ dead = now + hpet_idle_spin_timeout;
+ *lapic_expire = expire = now + lapic_count;
+ if (lapic_count == (hrtime_t)-1) {
+ /*
+ * LAPIC timer is currently disabled.
+ * Will not use the HPET as a LAPIC Timer proxy.
+ */
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_TRUE);
+ }
+
+ /*
+ * Serialize hpet_proxy data structure manipulation.
+ */
+ dead_count = 0;
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ apic_timer_restart(expire);
+ sti();
+ cli();
+
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ hset_update = (((CPU->cpu_flags & CPU_OFFLINE) == 0) &&
+ (ncpus > 1));
+ if (hset_update &&
+ !bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+ }
+
+ lapic_count = apic_timer_stop_count();
+ now = gethrtime();
+ *lapic_expire = expire = now + lapic_count;
+ if (lapic_count == (hrtime_t)-1) {
+ /*
+ * LAPIC timer is currently disabled.
+ * Will not use the HPET as a LAPIC Timer proxy.
+ */
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_TRUE);
+ }
+ if (now > dead) {
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+ }
+
+ if ((hpet_state.cpr == B_TRUE) ||
+ (hpet_state.cpu_deep_idle == B_FALSE) ||
+ (hpet_state.proxy_installed == B_FALSE) ||
+ (hpet_state.uni_cstate == B_TRUE)) {
+ mutex_exit(&hpet_proxy_lock);
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+
+ hpet_proxy_users[cpu_id] = expire;
+
+ /*
+ * We are done if another cpu is scheduled on the HPET with an
+ * expire time before us. The next HPET interrupt has been programmed
+ * to fire before our expire time.
+ */
+ for (id = 0; id < ncpus; ++id) {
+ if ((hpet_proxy_users[id] <= expire) && (id != cpu_id)) {
+ mutex_exit(&hpet_proxy_lock);
+ return (B_TRUE);
+ }
+ }
+
+ /*
+ * We are the next lAPIC to expire.
+ * Program the HPET with our expire time.
+ */
+ rslt = hpet_guaranteed_schedule(expire);
+ mutex_exit(&hpet_proxy_lock);
+
+ if (rslt == B_FALSE) {
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ }
+
+ return (rslt);
+}
+
+/*
+ * Called by the idle thread when waking up from Deep C-state before enabling
+ * interrupts. With an array data structure it is faster to always remove
+ * ourselves from the array without checking whether the HPET ISR has
+ * already removed us.
+ *
+ * We use a lazy algorithm for removing CPUs from the HPET's schedule.
+ * We do not reprogram the HPET here because this CPU has real work to do.
+ * On an idle system the CPU was probably woken up by the HPET's ISR.
+ * On a heavily loaded system CPUs are not going into Deep C-state.
+ * On a moderately loaded system another CPU will usually enter Deep C-state
+ * and reprogram the HPET before the HPET fires with our wakeup.
+ */
+static void
+hpet_use_lapic_timer(hrtime_t expire)
+{
+ extern void apic_timer_restart(hrtime_t);
+ processorid_t cpu_id = CPU->cpu_id;
+
+ ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
+ ASSERT(!interrupts_enabled());
+
+ hpet_proxy_users[cpu_id] = HPET_INFINITY;
+
+ /*
+ * Do not enable a LAPIC Timer that was initially disabled.
+ */
+ if (expire != HPET_INFINITY)
+ apic_timer_restart(expire);
+
+ sti();
+}
+
+/*
+ * Initialize data structure to keep track of CPUs using HPET as a proxy for
+ * their stalled local APIC timer. For now this is just an array.
+ */
+static void
+hpet_init_proxy_data(void)
+{
+ processorid_t id;
+
+ /*
+ * Use apic_nproc because we are in boot before max_ncpus has been
+ * initialized.
+ */
+ hpet_proxy_users = kmem_zalloc(apic_nproc * sizeof (*hpet_proxy_users),
+ KM_SLEEP);
+
+ /*
+ * Unused entries always contain HPET_INFINITY.
+ */
+ for (id = 0; id < apic_nproc; ++id)
+ hpet_proxy_users[id] = HPET_INFINITY;
+}
diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c
index 123ece8286..77314f3697 100644
--- a/usr/src/uts/i86pc/io/mp_platform_common.c
+++ b/usr/src/uts/i86pc/io/mp_platform_common.c
@@ -62,7 +62,10 @@
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>
-
+#if !defined(__xpv)
+#include <sys/hpet.h>
+#include <sys/clock.h>
+#endif
/*
* Local Function Prototypes
@@ -103,6 +106,12 @@ int apic_debug_mps_id = 0; /* 1 - print MPS ID strings */
int apic_sci_vect = -1;
iflag_t apic_sci_flags;
+#if !defined(__xpv)
+/* ACPI HPET interrupt configuration; -1 if HPET not used */
+int apic_hpet_vect = -1;
+iflag_t apic_hpet_flags;
+#endif
+
/*
* psm name pointer
*/
@@ -892,6 +901,17 @@ acpi_probe(char *modname)
cmn_err(CE_CONT,
"?Using ACPI for CPU/IOAPIC information ONLY\n");
}
+
+#if !defined(__xpv)
+ /*
+ * probe ACPI for hpet information here which is used later
+ * in apic_picinit().
+ */
+ if (hpet_acpi_init(&apic_hpet_vect, &apic_hpet_flags) < 0) {
+ cmn_err(CE_NOTE, "!ACPI HPET table query failed\n");
+ }
+#endif
+
return (PSM_SUCCESS);
}
/* if setting APIC mode failed above, we fall through to cleanup */
@@ -1324,6 +1344,40 @@ ioapic_init_intr(int mask_apic)
irqptr->airq_share++;
}
+
+#if !defined(__xpv)
+ /*
+ * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
+ */
+ if (apic_hpet_vect > 0) {
+ /*
+ * hpet has already done add_avintr(); we just need
+		 * to finish the job by mimicking translate_irq()
+ *
+ * Fake up an intrspec and setup the tables
+ */
+ ispec.intrspec_vec = apic_hpet_vect;
+ ispec.intrspec_pri = CBE_HIGH_PIL;
+
+ if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
+ &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
+ cmn_err(CE_WARN, "!apic: HPET setup failed");
+ return;
+ }
+ irqptr = apic_irq_table[apic_hpet_vect];
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ /* Program I/O APIC */
+ (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);
+
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+
+ irqptr->airq_share++;
+ }
+#endif /* !defined(__xpv) */
}
/*
diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic.c b/usr/src/uts/i86pc/io/pcplusmp/apic.c
index d83e2c2209..793a48c360 100644
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.c
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c
@@ -68,6 +68,7 @@
#include <sys/sunddi.h>
#include <sys/x_call.h>
#include <sys/reboot.h>
+#include <sys/hpet.h>
/*
* Local Function Prototypes
@@ -1650,6 +1651,8 @@ apic_shutdown(int cmd, int fcn)
uchar_t byte;
ulong_t iflag;
+ hpet_acpi_fini();
+
/* Send NMI to all CPUs except self to do per processor shutdown */
iflag = intr_clear();
#ifdef DEBUG
@@ -2039,6 +2042,41 @@ apic_timer_disable(void)
(apic_clkvect + APIC_BASE_VECT) | AV_MASK);
}
+/*
+ * Set the timer far into the future and return the timer's
+ * current count in nanoseconds.
+ */
+hrtime_t
+apic_timer_stop_count(void)
+{
+ hrtime_t ns_val;
+ int enable_val, count_val;
+
+ /*
+ * Should be called with interrupts disabled.
+ */
+ ASSERT(!interrupts_enabled());
+
+ enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER);
+ if ((enable_val & AV_MASK) == AV_MASK)
+ return ((hrtime_t)-1); /* timer is disabled */
+
+ count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT);
+ ns_val = APIC_TICKS_TO_NSECS(count_val);
+
+ apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
+
+ return (ns_val);
+}
+
+/*
+ * Reprogram timer after Deep C-State.
+ */
+void
+apic_timer_restart(hrtime_t time)
+{
+ apic_timer_reprogram(time);
+}
ddi_periodic_t apic_periodic_id;
diff --git a/usr/src/uts/i86pc/io/ppm_plat.c b/usr/src/uts/i86pc/io/ppm_plat.c
index 4bc96639fe..0237676ade 100644
--- a/usr/src/uts/i86pc/io/ppm_plat.c
+++ b/usr/src/uts/i86pc/io/ppm_plat.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Platform Power Management master pseudo driver platform support.
*/
@@ -49,14 +47,17 @@ void
ppm_rebuild_cpu_domains(void)
{
char *str = "ppm_rebuild_cpu_domains";
- cpupm_cpu_dependency_t *dep;
- cpupm_cpu_dependency_t *dep_next;
- cpupm_cpu_node_t *cpu_next;
+ cpupm_state_domains_t *dep;
+ cpupm_state_domains_t *dep_next;
struct ppm_domit *domit_p;
ppm_domain_t *domp_old;
ppm_domain_t *domp;
ppm_dev_t *devp;
ppm_db_t *dbp;
+ uint_t cpu_id;
+ cpuset_t dom_cpu_set;
+ int result;
+ dev_info_t *cpu_dip;
/*
* Get the CPU domain data
@@ -100,7 +101,7 @@ ppm_rebuild_cpu_domains(void)
* leave the domain as it is (which is unmanageable since
* PPM_CPU_READY is off).
*/
- dep = cpupm_get_cpu_dependencies();
+ dep = cpupm_pstate_domains;
if (dep == NULL) {
PPMD(D_CPU, ("%s: No CPU dependency info!\n", str));
return;
@@ -112,11 +113,11 @@ ppm_rebuild_cpu_domains(void)
*/
mutex_enter(&domp_old->lock);
domp_old->dflags |= PPMD_OFFLINE;
- for (dep_next = dep; dep_next; dep_next = dep_next->cd_next) {
+ for (dep_next = dep; dep_next; dep_next = dep_next->pm_next) {
domp = kmem_zalloc(sizeof (*domp), KM_SLEEP);
domp->name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
(void) snprintf(domp->name, MAXNAMELEN, "acpi_cpu_domain_%d",
- dep_next->cd_dependency_id);
+ dep_next->pm_domain);
mutex_init(&domp->lock, NULL, MUTEX_DRIVER, NULL);
mutex_enter(&domp->lock);
domp->dflags = domit_p->dflags | PPMD_CPU_READY;
@@ -135,18 +136,27 @@ ppm_rebuild_cpu_domains(void)
* build the "conflist" for the domain. But conveniently, the
* "conflist" data is easily obtainable from the "devlist".
*/
- for (cpu_next = dep_next->cd_cpu; cpu_next;
- cpu_next = cpu_next->cn_next) {
- devp = PPM_GET_PRIVATE(cpu_next->cn_dip);
+ dom_cpu_set = dep_next->pm_cpus;
+ do {
+ CPUSET_FIND(dom_cpu_set, cpu_id);
+ if (cpu_id == CPUSET_NOTINSET)
+ break;
+
+ ASSERT(cpu_id < NCPU);
+ cpu_dip = ((cpupm_mach_state_t *)
+ (cpu[cpu_id]->cpu_m.mcpu_pm_mach_state))->ms_dip;
+ devp = PPM_GET_PRIVATE(cpu_dip);
ASSERT(devp && devp->domp == domp_old);
- devp = ppm_add_dev(cpu_next->cn_dip, domp);
+ devp = ppm_add_dev(cpu_dip, domp);
dbp = kmem_zalloc(sizeof (struct ppm_db), KM_SLEEP);
dbp->name = kmem_zalloc((strlen(devp->path) + 1),
KM_SLEEP);
(void) strcpy(dbp->name, devp->path);
dbp->next = domp->conflist;
domp->conflist = dbp;
- }
+
+ CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
+ } while (result == 0);
/*
* Note that we do not bother creating a "dc" list as there
@@ -165,7 +175,6 @@ ppm_rebuild_cpu_domains(void)
mutex_exit(&domp->lock);
}
mutex_exit(&domp_old->lock);
- cpupm_free_cpu_dependencies();
}
/*
@@ -176,7 +185,7 @@ void
ppm_set_topspeed(ppm_dev_t *cpup, int speed)
{
for (cpup = cpup->domp->devlist; cpup != NULL; cpup = cpup->next)
- (*cpupm_set_topspeed)(cpup->dip, speed);
+ (*cpupm_set_topspeed_callb)(cpup->dip, speed);
}
/*
@@ -197,7 +206,8 @@ ppm_redefine_topspeed(void *ctx)
cpup = PPM_GET_PRIVATE((dev_info_t *)ctx);
- if (cpupm_get_topspeed == NULL || cpupm_set_topspeed == NULL) {
+ if (cpupm_get_topspeed_callb == NULL ||
+ cpupm_set_topspeed_callb == NULL) {
cmn_err(CE_WARN, "%s: Cannot process request for instance %d "
"since cpupm interfaces are not initialized", str,
ddi_get_instance(cpup->dip));
@@ -215,7 +225,7 @@ ppm_redefine_topspeed(void *ctx)
* Process each CPU in the domain.
*/
for (ncpup = cpup->domp->devlist; ncpup != NULL; ncpup = ncpup->next) {
- topspeed = (*cpupm_get_topspeed)(ncpup->dip);
+ topspeed = (*cpupm_get_topspeed_callb)(ncpup->dip);
if (newspeed == -1 || topspeed < newspeed)
newspeed = topspeed;
}
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index c3f2cb5074..c47c52f37f 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1024,6 +1024,22 @@ cpuid_pass1(cpu_t *cpu)
cpi->cpi_ncore_per_chip = 1;
break;
}
+
+ /*
+ * Get CPUID data about TSC Invariance in Deep C-State.
+ */
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ if (cpi->cpi_maxeax >= 7) {
+ cp = &cpi->cpi_extd[7];
+ cp->cp_eax = 0x80000007;
+ cp->cp_ecx = 0;
+ (void) __cpuid_insn(cp);
+ }
+ break;
+ default:
+ break;
+ }
} else {
cpi->cpi_ncore_per_chip = 1;
}
@@ -3847,6 +3863,36 @@ patch_tsc_read(int flag)
}
}
+int
+cpuid_deep_cstates_supported(void)
+{
+ struct cpuid_info *cpi;
+ struct cpuid_regs regs;
+
+ ASSERT(cpuid_checkpass(CPU, 1));
+
+ cpi = CPU->cpu_m.mcpu_cpi;
+
+ if (!(x86_feature & X86_CPUID))
+ return (0);
+
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ if (cpi->cpi_xmaxeax < 0x80000007)
+ return (0);
+
+ /*
+		 * Does the TSC run at a constant rate in all ACPI C-states?
+ */
+ regs.cp_eax = 0x80000007;
+ (void) __cpuid_insn(&regs);
+ return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
+
+ default:
+ return (0);
+ }
+}
+
#if defined(__amd64) && !defined(__xpv)
/*
* Patch in versions of bcopy for high performance Intel Nhm processors
diff --git a/usr/src/uts/i86pc/os/cpupm.c b/usr/src/uts/i86pc/os/cpupm.c
deleted file mode 100644
index 6aad26948a..0000000000
--- a/usr/src/uts/i86pc/os/cpupm.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/cpupm.h>
-
-/*
- * This callback is used to build the PPM CPU domains once
- * all the CPU devices have been started. The callback is
- * initialized by the PPM driver to point to a routine that
- * will build the domains.
- */
-void (*cpupm_rebuild_cpu_domains)(void);
-
-/*
- * This callback is used to reset the topspeed for all the
- * CPU devices. The callback is initialized by the PPM driver to
- * point to a routine that will reinitialize all the CPU devices
- * once all the CPU devices have been started and the CPU domains
- * built.
- */
-void (*cpupm_init_topspeed)(void);
-
-/*
- * This callback is used to redefine the topspeed for a CPU device.
- * Since all CPUs in a domain should have identical properties, this
- * callback is initialized by the PPM driver to point to a routine
- * that will redefine the topspeed for all devices in a CPU domain.
- * This callback is exercised whenever an ACPI _PPC change notification
- * is received by the CPU driver.
- */
-void (*cpupm_redefine_topspeed)(void *);
-
-/*
- * This callback is used by the PPM driver to call into the CPU driver
- * to find a CPU's current topspeed (i.e., it's current ACPI _PPC value).
- */
-void (*cpupm_set_topspeed)(void *, int);
-
-/*
- * This callback is used by the PPM driver to call into the CPU driver
- * to set a new topspeed for a CPU.
- */
-int (*cpupm_get_topspeed)(void *);
-
-/*
- * Used to dynamically keep track of the CPU dependencies as CPU
- * devices attach. Will subsequently be used by the PPM driver
- * to build PPM CPU domains.
- */
-static cpupm_cpu_dependency_t *cpupm_cpu_dependencies = NULL;
-
-/*
- * If we are unable to correctly identify a dependency for any CPU, then
- * we punt and all CPUs are managed as one domain.
- */
-static boolean_t cpupm_dependencies_valid = B_TRUE;
-
-/*
- * If any CPU fails to attach, then cpupm is disabled for all CPUs.
- */
-static uint32_t cpupm_enabled = CPUPM_P_STATES | CPUPM_T_STATES;
-
-/*
- * Until all CPUs have succesfully attached, we do not allow
- * power management.
- */
-static boolean_t cpupm_ready = B_FALSE;
-
-/*
- * Print the CPU dependencies.
- */
-static void
-cpupm_print_cpu_dependencies()
-{
- cpupm_cpu_dependency_t *dptr;
- cpupm_cpu_node_t *nptr;
-
- for (dptr = cpupm_cpu_dependencies; dptr != NULL;
- dptr = dptr->cd_next) {
- for (nptr = dptr->cd_cpu; nptr != NULL; nptr = nptr->cn_next) {
- int instance = ddi_get_instance(nptr->cn_dip);
- cmn_err(CE_NOTE,
- "print_cpu_dependencies: dependency %d "
- "instance %d\n", dptr->cd_dependency_id, instance);
- }
- }
-}
-
-/*
- * Used to retrieve the dependencies built during CPUs attaching.
- */
-cpupm_cpu_dependency_t *
-cpupm_get_cpu_dependencies()
-{
- return (cpupm_cpu_dependencies);
-}
-
-/*
- * Build dependencies as CPUs attach. Note that we don't need to worry
- * about locking the dependency lists as concurrency is not an issue.
- * This routine relies on the fact that the CPU devices are attached
- * sequentially by a single thread.
- */
-void
-cpupm_add_cpu2dependency(dev_info_t *dip, int cpu_dependency)
-{
- cpupm_cpu_dependency_t *dptr;
- cpupm_cpu_node_t *nptr;
-
- if (!cpupm_dependencies_valid)
- return;
-
- if (cpu_dependency == -1) {
- cpupm_free_cpu_dependencies();
- return;
- }
-
- for (dptr = cpupm_cpu_dependencies; dptr != NULL;
- dptr = dptr->cd_next) {
- if (dptr->cd_dependency_id == cpu_dependency)
- break;
- }
-
- /* new dependency is created and linked at the head */
- if (dptr == NULL) {
- dptr = kmem_zalloc(sizeof (cpupm_cpu_dependency_t), KM_SLEEP);
- dptr->cd_dependency_id = cpu_dependency;
- dptr->cd_next = cpupm_cpu_dependencies;
- cpupm_cpu_dependencies = dptr;
- }
-
- /* new cpu is created and linked at head of dependency */
- nptr = kmem_zalloc(sizeof (cpupm_cpu_node_t), KM_SLEEP);
- nptr->cn_dip = dip;
- nptr->cn_next = dptr->cd_cpu;
- dptr->cd_cpu = nptr;
-}
-
-/*
- * Free the CPU dependencies.
- */
-void
-cpupm_free_cpu_dependencies()
-{
- cpupm_cpu_dependency_t *this_dependency, *next_dependency;
- cpupm_cpu_node_t *this_node, *next_node;
-
- cpupm_dependencies_valid = B_FALSE;
- this_dependency = cpupm_cpu_dependencies;
- while (this_dependency != NULL) {
- next_dependency = this_dependency->cd_next;
-
- /* discard CPU node chain */
- this_node = this_dependency->cd_cpu;
- while (this_node != NULL) {
- next_node = this_node->cn_next;
- kmem_free((void *)this_node,
- sizeof (cpupm_cpu_node_t));
- this_node = next_node;
- }
- kmem_free((void *)this_dependency,
- sizeof (cpupm_cpu_dependency_t));
- this_dependency = next_dependency;
- }
- cpupm_cpu_dependencies = NULL;
-}
-
-/*
- * If all CPUs have attached successfully, then the CPUs are
- * ready for power management.
- */
-boolean_t
-cpupm_is_ready()
-{
-#ifndef __xpv
- if (cpupm_enabled == CPUPM_NO_STATES)
- return (B_FALSE);
- return (cpupm_ready);
-#else
- return (B_FALSE);
-#endif
-}
-
-boolean_t
-cpupm_is_enabled(uint32_t state)
-{
- return ((cpupm_enabled & state) == state);
-}
-
-/*
- * By default, all states are enabled. But if there are any errors attaching
- * any of the CPU devices, then they are disabled.
- */
-void
-cpupm_disable(uint32_t state)
-{
- cpupm_enabled &= ~state;
- if (state & CPUPM_P_STATES)
- cpupm_free_cpu_dependencies();
-}
-
-/*
- * Once all CPUs have been started, the PPM driver should build CPU
- * domains and initialize the topspeed for all CPU devices.
- */
-void
-cpupm_post_startup()
-{
-#ifndef __xpv
- /*
- * The CPU domain built by the PPM during CPUs attaching
- * should be rebuilt with the information retrieved from
- * ACPI.
- */
- if (cpupm_rebuild_cpu_domains != NULL)
- (*cpupm_rebuild_cpu_domains)();
-
- /*
- * Only initialize the topspeed if P-states are enabled.
- */
- if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
- (*cpupm_init_topspeed)();
-#endif
- cpupm_ready = B_TRUE;
-}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
index 569ca2fc92..76e087a873 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,7 +37,8 @@ typedef enum cpu_acpi_obj {
PTC_OBJ,
TSS_OBJ,
TSD_OBJ,
- TPC_OBJ
+ TPC_OBJ,
+ CSD_OBJ,
} cpu_acpi_obj_t;
/*
@@ -61,7 +62,8 @@ static cpu_acpi_obj_attr_t cpu_acpi_obj_attrs[] = {
{"_PTC"},
{"_TSS"},
{"_TSD"},
- {"_TPC"}
+ {"_TPC"},
+ {"_CSD"}
};
/*
@@ -199,8 +201,14 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
{
ACPI_BUFFER abuf;
ACPI_OBJECT *pkg, *elements;
+ int number;
int ret = -1;
+ if (objtype == CSD_OBJ) {
+ number = 6;
+ } else {
+ number = 5;
+ }
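+	/*
+	 * Per the ACPI specification, a _CSD entry carries six elements
+	 * (the sixth being the C-state index, cached below as sd_index),
+	 * while _PSD and _TSD entries carry five.
+	 */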
/*
* Fetch the dependencies (if present) for the CPU node.
* Since they are optional, non-existence is not a failure
@@ -215,21 +223,29 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
}
pkg = abuf.Pointer;
- if (pkg->Package.Count != 1) {
+
+ if (((objtype != CSD_OBJ) && (pkg->Package.Count != 1)) ||
+ ((objtype == CSD_OBJ) && (pkg->Package.Count != 1) &&
+ (pkg->Package.Count != 2))) {
cmn_err(CE_NOTE, "!cpu_acpi: %s unsupported package "
"count %d.", cpu_acpi_obj_attrs[objtype].name,
pkg->Package.Count);
goto out;
}
+ /*
+	 * For the C-state domain, we assume C2 and C3 have the same
+	 * domain information.
+ */
if (pkg->Package.Elements[0].Type != ACPI_TYPE_PACKAGE ||
- pkg->Package.Elements[0].Package.Count != 5) {
+ pkg->Package.Elements[0].Package.Count != number) {
cmn_err(CE_NOTE, "!cpu_acpi: Unexpected data in %s package.",
cpu_acpi_obj_attrs[objtype].name);
goto out;
}
elements = pkg->Package.Elements[0].Package.Elements;
- if (elements[0].Integer.Value != 5 || elements[1].Integer.Value != 0) {
+ if (elements[0].Integer.Value != number ||
+ elements[1].Integer.Value != 0) {
cmn_err(CE_NOTE, "!cpu_acpi: Unexpected %s revision.",
cpu_acpi_obj_attrs[objtype].name);
goto out;
@@ -240,6 +256,9 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
sd->sd_domain = elements[2].Integer.Value;
sd->sd_type = elements[3].Integer.Value;
sd->sd_num = elements[4].Integer.Value;
+ if (objtype == CSD_OBJ) {
+ sd->sd_index = elements[5].Integer.Value;
+ }
ret = 0;
out:
@@ -285,6 +304,25 @@ cpu_acpi_cache_tsd(cpu_acpi_handle_t handle)
}
+/*
+ * Cache the ACPI _CSD data. The _CSD data defines C-state CPU dependencies
+ * (think CPU domains).
+ */
+static int
+cpu_acpi_cache_csd(cpu_acpi_handle_t handle)
+{
+ cpu_acpi_csd_t *csd;
+ int ret;
+
+ CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CSD_CACHED);
+ csd = &CPU_ACPI_CSD(handle);
+ ret = cpu_acpi_cache_state_dependencies(handle, CSD_OBJ, csd);
+ if (ret == 0)
+ CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CSD_CACHED);
+ return (ret);
+}
+
static void
cpu_acpi_cache_pstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt)
{
@@ -567,6 +605,126 @@ cpu_acpi_cache_tpc(cpu_acpi_handle_t handle)
CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TPC_CACHED);
}
+int
+cpu_acpi_verify_cstate(cpu_acpi_cstate_t *cstate)
+{
+ uint32_t addrspaceid = cstate->cs_addrspace_id;
+
+ if ((addrspaceid != ACPI_ADR_SPACE_FIXED_HARDWARE) &&
+ (addrspaceid != ACPI_ADR_SPACE_SYSTEM_IO)) {
+ cmn_err(CE_WARN, "!_CST: unsupported address space id"
+ ":C%d, type: %d\n", cstate->cs_type, addrspaceid);
+ return (1);
+ }
+ return (0);
+}
+
+int
+cpu_acpi_cache_cst(cpu_acpi_handle_t handle)
+{
+ ACPI_BUFFER abuf;
+ ACPI_OBJECT *obj;
+ ACPI_INTEGER cnt;
+ cpu_acpi_cstate_t *cstate, *p;
+ int i, count;
+
+ CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CST_CACHED);
+
+ abuf.Length = ACPI_ALLOCATE_BUFFER;
+ abuf.Pointer = NULL;
+
+ if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, "_CST",
+ NULL, &abuf))) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST evaluate failure");
+ return (-1);
+ }
+ obj = (ACPI_OBJECT *)abuf.Pointer;
+ if (obj->Package.Count < 2) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST package bad count %d.",
+ obj->Package.Count);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ /*
+ * Does the package look coherent?
+ */
+ cnt = obj->Package.Elements[0].Integer.Value;
+ if (cnt < 1 || cnt != obj->Package.Count - 1) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid element count %d != "
+ "Package count %d\n",
+ (int)cnt, (int)obj->Package.Count - 1);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)cnt;
+ CPU_ACPI_CSTATES(handle) = kmem_zalloc(CPU_ACPI_CSTATES_SIZE(cnt),
+ KM_SLEEP);
+ CPU_ACPI_BM_INFO(handle) = 0;
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ p = cstate;
+
+ for (i = 1, count = 1; i <= cnt; i++) {
+ ACPI_OBJECT *pkg;
+ AML_RESOURCE_GENERIC_REGISTER *reg;
+ ACPI_OBJECT *element;
+
+ pkg = &(obj->Package.Elements[i]);
+ reg = (AML_RESOURCE_GENERIC_REGISTER *)
+ pkg->Package.Elements[0].Buffer.Pointer;
+ cstate->cs_addrspace_id = reg->AddressSpaceId;
+ cstate->cs_address = reg->Address;
+ element = &(pkg->Package.Elements[1]);
+ cstate->cs_type = element->Integer.Value;
+ element = &(pkg->Package.Elements[2]);
+ cstate->cs_latency = element->Integer.Value;
+ element = &(pkg->Package.Elements[3]);
+ cstate->cs_power = element->Integer.Value;
+
+ if (cpu_acpi_verify_cstate(cstate)) {
+ /*
+ * ignore this entry if it's not valid
+ */
+ continue;
+ }
+ if (cstate == p) {
+ cstate++;
+ } else if (p->cs_type == cstate->cs_type) {
+ /*
+			 * If there are duplicate entries, keep the last
+			 * one. This handles:
+			 * 1) buggy BIOSes that report completely duplicate
+			 *    entries.
+			 * 2) the ACPI spec allowing the same C-state entry
+			 *    with different power and latency values; we
+			 *    use the one with the greater power savings.
+ */
+ (void) memcpy(p, cstate, sizeof (cpu_acpi_cstate_t));
+ } else {
+ /*
+			 * We got a valid entry; cache it in the
+			 * cstate structure.
+ */
+ p = cstate++;
+ count++;
+ }
+ }
+
+ if (count < 2) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid count %d < 2\n",
+ count);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ if (count != cnt)
+ CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)count;
+
+ AcpiOsFree(abuf.Pointer);
+ CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CST_CACHED);
+ return (0);
+}
+
/*
* Cache the _PCT, _PSS, _PSD and _PPC data.
*/
@@ -575,19 +733,19 @@ cpu_acpi_cache_pstate_data(cpu_acpi_handle_t handle)
{
if (cpu_acpi_cache_pct(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PCT for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_pstates(handle) != 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSS for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_psd(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSD for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
@@ -617,19 +775,19 @@ cpu_acpi_cache_tstate_data(cpu_acpi_handle_t handle)
{
if (cpu_acpi_cache_ptc(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PTC for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_tstates(handle) != 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSS for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_tsd(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSD for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
@@ -652,17 +810,63 @@ cpu_acpi_free_tstate_data(cpu_acpi_handle_t handle)
}
/*
+ * Cache the _CST data.
+ */
+int
+cpu_acpi_cache_cstate_data(cpu_acpi_handle_t handle)
+{
+ if (cpu_acpi_cache_cst(handle) < 0) {
+ cmn_err(CE_WARN, "!cpu_acpi: error parsing _CST for "
+ "CPU %d", handle->cs_id);
+ return (-1);
+ }
+
+ if (cpu_acpi_cache_csd(handle) < 0) {
+ cmn_err(CE_WARN, "!cpu_acpi: error parsing _CSD for "
+ "CPU %d", handle->cs_id);
+ return (-1);
+ }
+
+ return (0);
+}
+
+void
+cpu_acpi_free_cstate_data(cpu_acpi_handle_t handle)
+{
+ if (handle != NULL) {
+ if (CPU_ACPI_CSTATES(handle)) {
+ kmem_free(CPU_ACPI_CSTATES(handle),
+ CPU_ACPI_CSTATES_SIZE(
+ CPU_ACPI_CSTATES_COUNT(handle)));
+ CPU_ACPI_CSTATES(handle) = NULL;
+ }
+ }
+}
+
+/*
* Register a handler for processor change notifications.
*/
void
cpu_acpi_install_notify_handler(cpu_acpi_handle_t handle,
- ACPI_NOTIFY_HANDLER handler, dev_info_t *dip)
+ ACPI_NOTIFY_HANDLER handler, void *ctx)
{
- char path[MAXNAMELEN];
if (ACPI_FAILURE(AcpiInstallNotifyHandler(handle->cs_handle,
- ACPI_DEVICE_NOTIFY, handler, dip)))
+ ACPI_DEVICE_NOTIFY, handler, ctx)))
cmn_err(CE_NOTE, "!cpu_acpi: Unable to register "
- "notify handler for %s", ddi_pathname(dip, path));
+ "notify handler for CPU");
+}
+
+/*
+ * Remove a handler for processor change notifications.
+ */
+void
+cpu_acpi_remove_notify_handler(cpu_acpi_handle_t handle,
+ ACPI_NOTIFY_HANDLER handler)
+{
+ if (ACPI_FAILURE(AcpiRemoveNotifyHandler(handle->cs_handle,
+ ACPI_DEVICE_NOTIFY, handler)))
+ cmn_err(CE_NOTE, "!cpu_acpi: Unable to remove "
+ "notify handler for CPU");
}
/*
@@ -763,21 +967,43 @@ cpu_acpi_free_speeds(int *speeds, uint_t nspeeds)
kmem_free(speeds, nspeeds * sizeof (int));
}
+uint_t
+cpu_acpi_get_max_cstates(cpu_acpi_handle_t handle)
+{
+ if (CPU_ACPI_CSTATES(handle))
+ return (CPU_ACPI_CSTATES_COUNT(handle));
+ else
+ return (1);
+}
+
+void
+cpu_acpi_set_register(uint32_t bitreg, uint32_t value)
+{
+ AcpiSetRegister(bitreg, value);
+}
+
+void
+cpu_acpi_get_register(uint32_t bitreg, uint32_t *value)
+{
+ AcpiGetRegister(bitreg, value);
+}
+
/*
* Map the dip to an ACPI handle for the device.
*/
cpu_acpi_handle_t
-cpu_acpi_init(dev_info_t *dip)
+cpu_acpi_init(cpu_t *cp)
{
cpu_acpi_handle_t handle;
handle = kmem_zalloc(sizeof (cpu_acpi_state_t), KM_SLEEP);
- if (ACPI_FAILURE(acpica_get_handle(dip, &handle->cs_handle))) {
+ if (ACPI_FAILURE(acpica_get_handle_cpu(cp->cpu_id,
+ &handle->cs_handle))) {
kmem_free(handle, sizeof (cpu_acpi_state_t));
return (NULL);
}
- handle->cs_dip = dip;
+ handle->cs_id = cp->cpu_id;
return (handle);
}
diff --git a/usr/src/uts/i86pc/os/cpupm/cpu_idle.c b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c
new file mode 100644
index 0000000000..40b03ff38b
--- /dev/null
+++ b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c
@@ -0,0 +1,877 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/x86_archext.h>
+#include <sys/machsystm.h>
+#include <sys/x_call.h>
+#include <sys/stat.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/cpu_acpi.h>
+#include <sys/cpu_idle.h>
+#include <sys/cpupm.h>
+#include <sys/hpet.h>
+#include <sys/archsystm.h>
+#include <vm/hat_i86.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/callb.h>
+
+extern void cpu_idle_adaptive(void);
+
+static int cpu_idle_init(cpu_t *);
+static void cpu_idle_fini(cpu_t *);
+static boolean_t cpu_deep_idle_callb(void *arg, int code);
+static boolean_t cpu_idle_cpr_callb(void *arg, int code);
+static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
+static void cpuidle_set_cstate_latency(cpu_t *cp);
+
+/*
+ * Interfaces for modules implementing Intel's deep c-state.
+ */
+cpupm_state_ops_t cpu_idle_ops = {
+ "Generic ACPI C-state Support",
+ cpu_idle_init,
+ cpu_idle_fini,
+ NULL
+};
+
+static kmutex_t cpu_idle_callb_mutex;
+static callb_id_t cpu_deep_idle_callb_id;
+static callb_id_t cpu_idle_cpr_callb_id;
+static uint_t cpu_idle_cfg_state;
+
+static kmutex_t cpu_idle_mutex;
+
+cpu_idle_kstat_t cpu_idle_kstat = {
+ { "address_space_id", KSTAT_DATA_STRING },
+ { "latency", KSTAT_DATA_UINT32 },
+ { "power", KSTAT_DATA_UINT32 },
+};
+
+/*
+ * kstat update function of the c-state info
+ */
+static int
+cpu_idle_kstat_update(kstat_t *ksp, int flag)
+{
+ cpu_acpi_cstate_t *cstate = ksp->ks_private;
+
+ if (flag == KSTAT_WRITE) {
+ return (EACCES);
+ }
+
+ if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "FFixedHW");
+ } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "SystemIO");
+ } else {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "Unsupported");
+ }
+
+ cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
+ cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
+
+ return (0);
+}
+
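+/*
+ * Note that a single virtual kstat image (cpu_idle_kstat above) is shared
+ * by every C-state: ks_private selects which cpu_acpi_cstate_t is reported
+ * and cpu_idle_mutex, installed as ks_lock, serializes access to it.
+ */
+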
+/*
+ * c-state wakeup function.
+ * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
+ * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
+ */
+void
+cstate_wakeup(cpu_t *cp, int bound)
+{
+ struct machcpu *mcpu = &(cp->cpu_m);
+ volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
+ cpupart_t *cpu_part;
+ uint_t cpu_found;
+ processorid_t cpu_sid;
+
+ cpu_part = cp->cpu_part;
+ cpu_sid = cp->cpu_seqid;
+	/*
+	 * If the CPU is in the partition's halted set, it needs to be
+	 * woken up.
+	 */
+ if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
+ /*
+ * Clear the halted bit for that CPU since it will be
+ * poked in a moment.
+ */
+ bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
+
+ /*
+ * We may find the current CPU present in the halted cpuset
+ * if we're in the context of an interrupt that occurred
+ * before we had a chance to clear our bit in cpu_idle().
+ * Waking ourself is obviously unnecessary, since if
+ * we're here, we're not halted.
+ */
+ if (cp != CPU) {
+ /*
+ * Use correct wakeup mechanism
+ */
+ if ((mcpu_mwait != NULL) &&
+ (*mcpu_mwait == MWAIT_HALTED))
+ MWAIT_WAKEUP(cp);
+ else
+ poke_cpu(cp->cpu_id);
+ }
+ return;
+ } else {
+ /*
+ * This cpu isn't halted, but it's idle or undergoing a
+ * context switch. No need to awaken anyone else.
+ */
+ if (cp->cpu_thread == cp->cpu_idle_thread ||
+ cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
+ return;
+ }
+
+ /*
+ * No need to wake up other CPUs if the thread we just enqueued
+ * is bound.
+ */
+ if (bound)
+ return;
+
+
+ /*
+ * See if there's any other halted CPUs. If there are, then
+ * select one, and awaken it.
+ * It's possible that after we find a CPU, somebody else
+ * will awaken it before we get the chance.
+ * In that case, look again.
+ */
+ do {
+ cpu_found = bitset_find(&cpu_part->cp_haltset);
+ if (cpu_found == (uint_t)-1)
+ return;
+
+ } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
+ cpu_found) < 0);
+
+ /*
+ * Must use correct wakeup mechanism to avoid lost wakeup of
+ * alternate cpu.
+ */
+ if (cpu_found != CPU->cpu_seqid) {
+ mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
+ if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
+ MWAIT_WAKEUP(cpu_seq[cpu_found]);
+ else
+ poke_cpu(cpu_seq[cpu_found]->cpu_id);
+ }
+}
+
+/*
+ * enter deep c-state handler
+ */
+static void
+acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
+{
+ volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
+ cpu_t *cpup = CPU;
+ processorid_t cpu_sid = cpup->cpu_seqid;
+ cpupart_t *cp = cpup->cpu_part;
+ hrtime_t lapic_expire;
+ uint8_t type = cstate->cs_addrspace_id;
+ uint32_t cs_type = cstate->cs_type;
+ int hset_update = 1;
+ boolean_t using_hpet_timer;
+
+ /*
+ * Set our mcpu_mwait here, so we can tell if anyone tries to
+ * wake us between now and when we call mwait. No other cpu will
+ * attempt to set our mcpu_mwait until we add ourself to the haltset.
+ */
+ if (mcpu_mwait) {
+ if (type == ACPI_ADR_SPACE_SYSTEM_IO)
+ *mcpu_mwait = MWAIT_WAKEUP_IPI;
+ else
+ *mcpu_mwait = MWAIT_HALTED;
+ }
+
+ /*
+ * If this CPU is online, and there are multiple CPUs
+ * in the system, then we should note our halting
+ * by adding ourselves to the partition's halted CPU
+ * bitmap. This allows other CPUs to find/awaken us when
+ * work becomes available.
+ */
+ if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
+ hset_update = 0;
+
+ /*
+ * Add ourselves to the partition's halted CPUs bitmask
+ * and set our HALTED flag, if necessary.
+ *
+ * When a thread becomes runnable, it is placed on the queue
+ * and then the halted cpuset is checked to determine who
+ * (if anyone) should be awakened. We therefore need to first
+	 * add ourselves to the halted cpuset, and then check if there
+ * is any work available.
+ *
+ * Note that memory barriers after updating the HALTED flag
+ * are not necessary since an atomic operation (updating the bitmap)
+ * immediately follows. On x86 the atomic operation acts as a
+ * memory barrier for the update of cpu_disp_flags.
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags |= CPU_DISP_HALTED;
+ bitset_atomic_add(&cp->cp_haltset, cpu_sid);
+ }
+
+ /*
+ * Check to make sure there's really nothing to do.
+ * Work destined for this CPU may become available after
+ * this check. We'll be notified through the clearing of our
+ * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
+ *
+ * disp_anywork() checks disp_nrunnable, so we do not have to later.
+ */
+ if (disp_anywork()) {
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ /*
+ * We're on our way to being halted.
+ *
+ * The local APIC timer can stop in ACPI C2 and deeper c-states.
+ * Program the HPET hardware to substitute for this CPU's lAPIC timer.
+ * hpet.use_hpet_timer() disables the LAPIC Timer. Make sure to
+ * start the LAPIC Timer again before leaving this function.
+ *
+ * hpet.use_hpet_timer disables interrupts, so we will awaken
+ * immediately after halting if someone tries to poke us between now
+ * and the time we actually halt.
+ */
+ using_hpet_timer = hpet.use_hpet_timer(&lapic_expire);
+
+ /*
+ * We check for the presence of our bit after disabling interrupts.
+ * If it's cleared, we'll return. If the bit is cleared after
+ * we check then the cstate_wakeup() will pop us out of the halted
+ * state.
+ *
+ * This means that the ordering of the cstate_wakeup() and the clearing
+ * of the bit by cpu_wakeup is important.
+ * cpu_wakeup() must clear our mc_haltset bit, and then call
+ * cstate_wakeup().
+ * acpi_cpu_cstate() must disable interrupts, then check for the bit.
+ */
+ if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
+ hpet.use_lapic_timer(lapic_expire);
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ return;
+ }
+
+ /*
+ * The check for anything locally runnable is here for performance
+ * and isn't needed for correctness. disp_nrunnable ought to be
+ * in our cache still, so it's inexpensive to check, and if there
+ * is anything runnable we won't have to wait for the poke.
+ */
+ if (cpup->cpu_disp->disp_nrunnable != 0) {
+ hpet.use_lapic_timer(lapic_expire);
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ if (using_hpet_timer == B_FALSE) {
+
+ hpet.use_lapic_timer(lapic_expire);
+
+ /*
+ * We are currently unable to program the HPET to act as this
+ * CPU's proxy lAPIC timer. This CPU cannot enter C2 or deeper
+ * because no timer is set to wake it up while its lAPIC timer
+ * stalls in deep C-States.
+ * Enter C1 instead.
+ *
+ * cstate_wake_cpu() will wake this CPU with an IPI which
+ * works with MWAIT.
+ */
+ i86_monitor(mcpu_mwait, 0, 0);
+ if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
+ cpu_dtrace_idle_probe(CPU_ACPI_C1);
+
+ tlb_going_idle();
+ i86_mwait(0, 0);
+ tlb_service();
+
+ cpu_dtrace_idle_probe(CPU_ACPI_C0);
+ }
+
+ /*
+ * We're no longer halted
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ cpu_dtrace_idle_probe((uint_t)cs_type);
+
+ if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
+ /*
+ * We're on our way to being halted.
+ * To avoid a lost wakeup, arm the monitor before checking
+ * if another cpu wrote to mcpu_mwait to wake us up.
+ */
+ i86_monitor(mcpu_mwait, 0, 0);
+ if (*mcpu_mwait == MWAIT_HALTED) {
+ uint32_t eax = cstate->cs_address;
+ uint32_t ecx = 1;
+
+ tlb_going_idle();
+ i86_mwait(eax, ecx);
+ tlb_service();
+ }
+ } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
+ uint32_t value;
+ ACPI_TABLE_FADT *gbl_FADT;
+
+ if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
+ tlb_going_idle();
+ (void) cpu_acpi_read_port(cstate->cs_address,
+ &value, 8);
+ acpica_get_global_FADT(&gbl_FADT);
+ (void) cpu_acpi_read_port(
+ gbl_FADT->XPmTimerBlock.Address, &value, 32);
+ tlb_service();
+ }
+ } else {
+ cmn_err(CE_WARN, "!_CST: cs_type %lx bad asid type %lx\n",
+ (long)cs_type, (long)type);
+ }
+
+ /*
+ * The lAPIC timer may have stopped in deep c-state.
+ * Reprogram this CPU's lAPIC here before enabling interrupts.
+ */
+ hpet.use_lapic_timer(lapic_expire);
+
+ cpu_dtrace_idle_probe(CPU_ACPI_C0);
+
+ /*
+ * We're no longer halted
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+}
+
+/*
+ * indicate when bus masters are active
+ */
+static uint32_t
+cpu_acpi_bm_sts(void)
+{
+ uint32_t bm_sts = 0;
+
+ cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
+
+ if (bm_sts)
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+
+ return (bm_sts);
+}
+
+/*
+ * Idle the present CPU, deep c-state is supported
+ */
+void
+cpu_acpi_idle(void)
+{
+ cpu_t *cp = CPU;
+ uint16_t cs_type;
+ cpu_acpi_handle_t handle;
+ cma_c_state_t *cs_data;
+ cpu_acpi_cstate_t *cstate;
+ hrtime_t start, end;
+ int cpu_max_cstates;
+
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
+
+ cs_data = mach_state->ms_cstate.cma_state.cstate;
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ ASSERT(cstate != NULL);
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+ if (cpu_max_cstates > CPU_MAX_CSTATES)
+ cpu_max_cstates = CPU_MAX_CSTATES;
+
+ start = gethrtime_unscaled();
+
+ cs_type = cpupm_next_cstate(cs_data, start);
+
+ /*
+ * OSPM uses the BM_STS bit to determine the power state to enter
+ * when considering a transition to or from the C2/C3 power state.
+	 * If C3 is selected, bus master activity demotes the power state
+ * to C2.
+ */
+ if ((cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
+ cs_type = CPU_ACPI_C2;
+
+ /*
+ * BM_RLD determines if the Cx power state was exited as a result of
+ * bus master requests. Set this bit when using a C3 power state, and
+ * clear it when using a C1 or C2 power state.
+ */
+ if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+ CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
+ }
+
+ if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
+ (cs_type >= CPU_ACPI_C3)) {
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+ CPU_ACPI_BM_INFO(handle) |= BM_RLD;
+ }
+
+ cstate += cs_type - 1;
+
+ switch (cs_type) {
+ default:
+ /* FALLTHROUGH */
+ case CPU_ACPI_C1:
+ (*non_deep_idle_cpu)();
+ break;
+
+ case CPU_ACPI_C2:
+ acpi_cpu_cstate(cstate);
+ break;
+
+ case CPU_ACPI_C3:
+ /*
+		 * As recommended in the ACPI spec, use the hardware
+		 * mechanism to prevent bus masters from writing to
+		 * memory while in C3 (UP only).
+ */
+ if ((ncpus_online == 1) &&
+ (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
+ cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+ CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
+ /*
+			 * Today all Intel processors that support C3 keep
+			 * their caches coherent, so no explicit flush is
+			 * needed.
+ */
+ } else if (x86_vendor != X86_VENDOR_Intel) {
+ __acpi_wbinvd();
+ }
+ acpi_cpu_cstate(cstate);
+ if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
+ cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+ CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
+ }
+ break;
+ }
+
+ end = gethrtime_unscaled();
+
+ /*
+ * Update statistics
+ */
+ cpupm_wakeup_cstate_data(cs_data, end);
+}
+
+boolean_t
+cpu_deep_cstates_supported(void)
+{
+ extern int idle_cpu_no_deep_c;
+
+ if (idle_cpu_no_deep_c)
+ return (B_FALSE);
+
+ if (!cpuid_deep_cstates_supported())
+ return (B_FALSE);
+
+ if ((hpet.supported != HPET_FULL_SUPPORT) || !hpet.install_proxy())
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Validate that this processor supports deep cstate and if so,
+ * get the c-state data from ACPI and cache it.
+ */
+static int
+cpu_idle_init(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpu_acpi_cstate_t *cstate;
+ char name[KSTAT_STRLEN];
+ int cpu_max_cstates, i;
+ ACPI_TABLE_FADT *gbl_FADT;
+
+ /*
+ * Cache the C-state specific ACPI data.
+ */
+ if (cpu_acpi_cache_cstate_data(handle) != 0) {
+ cmn_err(CE_NOTE,
+ "!cpu_idle_init: Failed to cache ACPI C-state data\n");
+ cpu_idle_fini(cp);
+ return (-1);
+ }
+
+ /*
+ * Check the bus master arbitration control ability.
+ */
+ acpica_get_global_FADT(&gbl_FADT);
+ if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
+ CPU_ACPI_BM_INFO(handle) |= BM_CTL;
+
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+
+ for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
+ (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
+ /*
+ * Allocate, initialize and install cstate kstat
+ */
+ cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
+ name, "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (cstate->cs_ksp == NULL) {
+ cmn_err(CE_NOTE, "kstat_create(c_state) fail");
+ } else {
+ cstate->cs_ksp->ks_data = &cpu_idle_kstat;
+ cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
+ cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
+ cstate->cs_ksp->ks_data_size += MAXNAMELEN;
+ cstate->cs_ksp->ks_private = cstate;
+ kstat_install(cstate->cs_ksp);
+ cstate++;
+ }
+ }
+
+ cpupm_alloc_domains(cp, CPUPM_C_STATES);
+ cpupm_alloc_ms_cstate(cp);
+ cpuidle_set_cstate_latency(cp);
+
+ if (cpu_deep_cstates_supported()) {
+ mutex_enter(&cpu_idle_callb_mutex);
+ if (cpu_deep_idle_callb_id == (callb_id_t)0)
+ cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
+ (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
+ if (cpu_idle_cpr_callb_id == (callb_id_t)0)
+ cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
+ (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
+ mutex_exit(&cpu_idle_callb_mutex);
+ }
+
+ return (0);
+}
+
+/*
+ * Free resources allocated by cpu_idle_init().
+ */
+static void
+cpu_idle_fini(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpu_acpi_cstate_t *cstate;
+ uint_t cpu_max_cstates, i;
+
+ /*
+	 * Point the idle routine back at the generic one.
+ */
+ idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ if (cstate) {
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+
+ for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
+ if (cstate->cs_ksp != NULL)
+ kstat_delete(cstate->cs_ksp);
+ cstate++;
+ }
+ }
+
+ cpupm_free_ms_cstate(cp);
+ cpupm_free_domains(&cpupm_cstate_domains);
+ cpu_acpi_free_cstate_data(handle);
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ if (cpu_deep_idle_callb_id != (callb_id_t)0) {
+ (void) callb_delete(cpu_deep_idle_callb_id);
+ cpu_deep_idle_callb_id = (callb_id_t)0;
+ }
+ if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
+ (void) callb_delete(cpu_idle_cpr_callb_id);
+ cpu_idle_cpr_callb_id = (callb_id_t)0;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+}
+
+/*ARGSUSED*/
+static boolean_t
+cpu_deep_idle_callb(void *arg, int code)
+{
+ boolean_t rslt = B_TRUE;
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*
+ * Default policy is same as enable
+ */
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
+ break;
+
+ if (hpet.callback(PM_ENABLE_CPU_DEEP_IDLE)) {
+ disp_enq_thread = cstate_wakeup;
+ idle_cpu = cpu_idle_adaptive;
+ cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
+ } else {
+ rslt = B_FALSE;
+ }
+ break;
+
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
+ break;
+
+ idle_cpu = non_deep_idle_cpu;
+ if (hpet.callback(PM_DISABLE_CPU_DEEP_IDLE)) {
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
+ }
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
+ code);
+ break;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+ return (rslt);
+}
+
+/*ARGSUSED*/
+static boolean_t
+cpu_idle_cpr_callb(void *arg, int code)
+{
+ boolean_t rslt = B_TRUE;
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ switch (code) {
+ case CB_CODE_CPR_RESUME:
+ if (hpet.callback(CB_CODE_CPR_RESUME)) {
+ /*
+ * Do not enable dispatcher hooks if disabled by user.
+ */
+ if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
+ break;
+
+ disp_enq_thread = cstate_wakeup;
+ idle_cpu = cpu_idle_adaptive;
+ } else {
+ rslt = B_FALSE;
+ }
+ break;
+
+ case CB_CODE_CPR_CHKPT:
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ hpet.callback(CB_CODE_CPR_CHKPT);
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
+ break;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+ return (rslt);
+}
+
+/*
+ * Handle a _CST change notification.
+ */
+void
+cpuidle_cstate_instance(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle;
+ struct machcpu *mcpu;
+ cpuset_t dom_cpu_set;
+ kmutex_t *pm_lock;
+ int result = 0;
+ processorid_t cpu_id;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ ASSERT(mach_state->ms_cstate.cma_domain != NULL);
+ dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
+ pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
+
+ /*
+	 * Do this for all the CPUs in the domain.
+ */
+ mutex_enter(pm_lock);
+ do {
+ CPUSET_FIND(dom_cpu_set, cpu_id);
+ if (cpu_id == CPUSET_NOTINSET)
+ break;
+
+ ASSERT(cpu_id >= 0 && cpu_id < NCPU);
+ cp = cpu[cpu_id];
+ mach_state = (cpupm_mach_state_t *)
+ cp->cpu_m.mcpu_pm_mach_state;
+ if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
+ mutex_exit(pm_lock);
+ return;
+ }
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ /*
+ * re-evaluate cstate object
+ */
+ if (cpu_acpi_cache_cstate_data(handle) != 0) {
+ cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
+ " object Instance: %d", cpu_id);
+ }
+ mutex_enter(&cpu_lock);
+ mcpu = &(cp->cpu_m);
+ mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
+ if (mcpu->max_cstates > CPU_ACPI_C1) {
+ hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
+ disp_enq_thread = cstate_wakeup;
+ cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
+ cpuidle_set_cstate_latency(cp);
+ } else if (mcpu->max_cstates == CPU_ACPI_C1) {
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
+ hpet.callback(CST_EVENT_ONE_CSTATE);
+ }
+ mutex_exit(&cpu_lock);
+
+ CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
+ mutex_exit(pm_lock);
+ } while (result < 0);
+#endif
+}
+
+/*
+ * Handle a change in the number or type of available processor power states.
+ */
+void
+cpuidle_manage_cstates(void *ctx)
+{
+ cpu_t *cp = ctx;
+ processorid_t cpu_id = cp->cpu_id;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ boolean_t is_ready;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ /*
+ * We currently refuse to power manage if the CPU is not ready to
+ * take cross calls (cross calls fail silently if CPU is not ready
+ * for it).
+ *
+ * Additionally, for x86 platforms we cannot power manage
+ * any one instance, until all instances have been initialized.
+ * That's because we don't know what the CPU domains look like
+ * until all instances have been initialized.
+ */
+ is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready();
+ if (!is_ready)
+ return;
+
+ cpuidle_cstate_instance(cp);
+}
+
+static void
+cpuidle_set_cstate_latency(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle;
+ cpu_acpi_cstate_t *acpi_cstates;
+ cma_c_state_t *cpupm_cdata;
+ uint32_t i, cnt;
+
+ cpupm_cdata = mach_state->ms_cstate.cma_state.cstate;
+
+ ASSERT(cpupm_cdata != 0);
+ ASSERT(mach_state != NULL);
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ cnt = CPU_ACPI_CSTATES_COUNT(handle);
+ acpi_cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+
+ cpupm_cdata->cs_C2_latency = CPU_CSTATE_LATENCY_UNDEF;
+ cpupm_cdata->cs_C3_latency = CPU_CSTATE_LATENCY_UNDEF;
+
+ for (i = 1; i <= cnt; ++i, ++acpi_cstates) {
+ if ((cpupm_cdata->cs_C2_latency == CPU_CSTATE_LATENCY_UNDEF) &&
+ (acpi_cstates->cs_type == CPU_ACPI_C2))
+ cpupm_cdata->cs_C2_latency = acpi_cstates->cs_latency;
+
+ if ((cpupm_cdata->cs_C3_latency == CPU_CSTATE_LATENCY_UNDEF) &&
+ (acpi_cstates->cs_type == CPU_ACPI_C3))
+ cpupm_cdata->cs_C3_latency = acpi_cstates->cs_latency;
+ }
+}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c b/usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
index 21dd88980c..086d9a8fe6 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,14 +28,14 @@
*/
#include <sys/x86_archext.h>
-#include <sys/cpudrv_mach.h>
#include <sys/cpu_acpi.h>
#include <sys/pwrnow.h>
boolean_t
-cpudrv_amd_init(cpudrv_devstate_t *cpudsp)
+cpupm_amd_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
/* AMD? */
if (x86_vendor != X86_VENDOR_AMD)
@@ -43,9 +43,10 @@ cpudrv_amd_init(cpudrv_devstate_t *cpudsp)
/*
* If we support PowerNow! on this processor, then set the
- * correct pstate_ops for the processor.
+ * correct cma_ops for the processor.
*/
- mach_state->cpupm_pstate_ops = pwrnow_supported() ? &pwrnow_ops : NULL;
+ mach_state->ms_pstate.cma_ops = pwrnow_supported() ?
+ &pwrnow_ops : NULL;
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
index 8fed6f6a4e..dbd05d4198 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
@@ -28,31 +28,34 @@
*/
#include <sys/x86_archext.h>
-#include <sys/cpudrv_mach.h>
#include <sys/cpu_acpi.h>
#include <sys/speedstep.h>
-#include <sys/cpudrv_throttle.h>
+#include <sys/cpupm_throttle.h>
+#include <sys/cpu_idle.h>
/*
* The Intel Processor Driver Capabilities (_PDC).
* See Intel Processor Vendor-Specific ACPI Interface Specification
* for details.
*/
-#define CPUDRV_INTEL_PDC_REVISION 0x1
-#define CPUDRV_INTEL_PDC_PS_MSR 0x0001
-#define CPUDRV_INTEL_PDC_C1_HALT 0x0002
-#define CPUDRV_INTEL_PDC_TS_MSR 0x0004
-#define CPUDRV_INTEL_PDC_MP 0x0008
-#define CPUDRV_INTEL_PDC_SW_PSD 0x0020
-#define CPUDRV_INTEL_PDC_TSD 0x0080
-#define CPUDRV_INTEL_PDC_HW_PSD 0x0800
+#define CPUPM_INTEL_PDC_REVISION 0x1
+#define CPUPM_INTEL_PDC_PS_MSR 0x0001
+#define CPUPM_INTEL_PDC_C1_HALT 0x0002
+#define CPUPM_INTEL_PDC_TS_MSR 0x0004
+#define CPUPM_INTEL_PDC_MP 0x0008
+#define CPUPM_INTEL_PDC_C2C3_MP 0x0010
+#define CPUPM_INTEL_PDC_SW_PSD 0x0020
+#define CPUPM_INTEL_PDC_TSD 0x0080
+#define CPUPM_INTEL_PDC_C1_FFH 0x0100
+#define CPUPM_INTEL_PDC_HW_PSD 0x0800
-static uint32_t cpudrv_intel_pdccap = 0;
+static uint32_t cpupm_intel_pdccap = 0;
boolean_t
-cpudrv_intel_init(cpudrv_devstate_t *cpudsp)
+cpupm_intel_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
uint_t family;
uint_t model;
@@ -62,34 +65,45 @@ cpudrv_intel_init(cpudrv_devstate_t *cpudsp)
family = cpuid_getfamily(CPU);
model = cpuid_getmodel(CPU);
+ cpupm_intel_pdccap = CPUPM_INTEL_PDC_MP;
+
/*
* If we support SpeedStep on this processor, then set the
- * correct pstate_ops for the processor and enable appropriate
+ * correct cma_ops for the processor and enable appropriate
* _PDC bits.
*/
if (speedstep_supported(family, model)) {
- mach_state->cpupm_pstate_ops = &speedstep_ops;
- cpudrv_intel_pdccap = CPUDRV_INTEL_PDC_PS_MSR |
- CPUDRV_INTEL_PDC_C1_HALT | CPUDRV_INTEL_PDC_MP |
- CPUDRV_INTEL_PDC_SW_PSD | CPUDRV_INTEL_PDC_HW_PSD;
+ mach_state->ms_pstate.cma_ops = &speedstep_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_PS_MSR |
+ CPUPM_INTEL_PDC_C1_HALT | CPUPM_INTEL_PDC_SW_PSD |
+ CPUPM_INTEL_PDC_HW_PSD;
} else {
- mach_state->cpupm_pstate_ops = NULL;
+ mach_state->ms_pstate.cma_ops = NULL;
}
/*
* Set the correct tstate_ops for the processor and
* enable appropriate _PDC bits.
*/
- mach_state->cpupm_tstate_ops = &cpudrv_throttle_ops;
- cpudrv_intel_pdccap |= CPUDRV_INTEL_PDC_TS_MSR |
- CPUDRV_INTEL_PDC_TSD;
+ mach_state->ms_tstate.cma_ops = &cpupm_throttle_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_TS_MSR |
+ CPUPM_INTEL_PDC_TSD;
+
+ /*
+ * If we support deep cstates on this processor, then set the
+ * correct cstate_ops for the processor and enable appropriate
+ * _PDC bits.
+ */
+ mach_state->ms_cstate.cma_ops = &cpu_idle_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_C1_HALT |
+ CPUPM_INTEL_PDC_C2C3_MP | CPUPM_INTEL_PDC_C1_FFH;
/*
* _PDC support is optional and the driver should
* function even if the _PDC write fails.
*/
- (void) cpu_acpi_write_pdc(mach_state->acpi_handle,
- CPUDRV_INTEL_PDC_REVISION, 1, &cpudrv_intel_pdccap);
+ (void) cpu_acpi_write_pdc(mach_state->ms_acpi_handle,
+ CPUPM_INTEL_PDC_REVISION, 1, &cpupm_intel_pdccap);
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
new file mode 100644
index 0000000000..d7d9cb7221
--- /dev/null
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
@@ -0,0 +1,928 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/x86_archext.h>
+#include <sys/sdt.h>
+#include <sys/spl.h>
+#include <sys/machsystm.h>
+#include <sys/hpet.h>
+#include <sys/cpupm.h>
+#include <sys/cpu_idle.h>
+#include <sys/cpu_acpi.h>
+#include <sys/cpupm_throttle.h>
+
+/*
+ * This callback is used to build the PPM CPU domains once
+ * all the CPU devices have been started. The callback is
+ * initialized by the PPM driver to point to a routine that
+ * will build the domains.
+ */
+void (*cpupm_rebuild_cpu_domains)(void);
+
+/*
+ * This callback is used to reset the topspeed for all the
+ * CPU devices. The callback is initialized by the PPM driver to
+ * point to a routine that will reinitialize all the CPU devices
+ * once all the CPU devices have been started and the CPU domains
+ * built.
+ */
+void (*cpupm_init_topspeed)(void);
+
+/*
+ * This callback is used to redefine the topspeed for a CPU device.
+ * Since all CPUs in a domain should have identical properties, this
+ * callback is initialized by the PPM driver to point to a routine
+ * that will redefine the topspeed for all devices in a CPU domain.
+ * This callback is exercised whenever an ACPI _PPC change notification
+ * is received by the CPU driver.
+ */
+void (*cpupm_redefine_topspeed)(void *);
+
+/*
+ * This callback is used by the PPM driver to call into the CPU driver
+ * to set a new topspeed for a CPU.
+ */
+void (*cpupm_set_topspeed_callb)(void *, int);
+
+/*
+ * This callback is used by the PPM driver to call into the CPU driver
+ * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
+ */
+int (*cpupm_get_topspeed_callb)(void *);
+
+static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
+static void cpupm_free_notify_handlers(cpu_t *);
+
+/*
+ * Until proven otherwise, all power states are manageable.
+ */
+static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
+
+/*
+ * Until all CPUs have started, we do not allow
+ * power management.
+ */
+static boolean_t cpupm_ready = B_FALSE;
+
+cpupm_state_domains_t *cpupm_pstate_domains = NULL;
+cpupm_state_domains_t *cpupm_tstate_domains = NULL;
+cpupm_state_domains_t *cpupm_cstate_domains = NULL;
+
+/*
+ * c-state tunables
+ *
+ * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
+ * divided by time spent in the idle state transitions.
+ * A value of 10 means the CPU will not spend more than 1/10 of its time
+ * in idle latency. The worst case performance will be 90% of non Deep C-state
+ * kernel.
+ *
+ * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
+ * before it is worth going there. Expressed as a multiple of latency.
+ */
+uint32_t cpupm_cs_sample_tunable = 5; /* samples in decision period */
+uint32_t cpupm_cs_idle_cost_tunable = 10; /* work time / latency cost */
+uint32_t cpupm_cs_idle_save_tunable = 2; /* idle power savings */
+uint16_t cpupm_C2_idle_pct_tunable = 70;
+uint16_t cpupm_C3_idle_pct_tunable = 80;
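+
+/*
+ * Rough illustration of the defaults above: a C-state with a worst-case
+ * entry/exit latency of 100us only becomes attractive once the predicted
+ * idle period reaches 2 * 100us = 200us (cpupm_cs_idle_save_tunable), and
+ * it is avoided whenever idle-state transitions would consume more than
+ * 1/10 of the CPU's time (cpupm_cs_idle_cost_tunable).
+ */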
+
+#ifndef __xpv
+extern boolean_t cpupm_intel_init(cpu_t *);
+extern boolean_t cpupm_amd_init(cpu_t *);
+
+typedef struct cpupm_vendor {
+ boolean_t (*cpuv_init)(cpu_t *);
+} cpupm_vendor_t;
+
+/*
+ * Table of supported vendors.
+ */
+static cpupm_vendor_t cpupm_vendors[] = {
+ cpupm_intel_init,
+ cpupm_amd_init,
+ NULL
+};
+#endif
+
+/*
+ * Initialize the machine.
+ * See if a module exists for managing power for this CPU.
+ */
+/*ARGSUSED*/
+void
+cpupm_init(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_vendor_t *vendors;
+ cpupm_mach_state_t *mach_state;
+ struct machcpu *mcpu = &(cp->cpu_m);
+ int *speeds;
+ uint_t nspeeds;
+ int ret;
+
+ cpupm_set_supp_freqs(cp, NULL, 1);
+
+ mach_state = cp->cpu_m.mcpu_pm_mach_state =
+ kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
+ mach_state->ms_caps = CPUPM_NO_STATES;
+ mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
+
+ mach_state->ms_acpi_handle = cpu_acpi_init(cp);
+ if (mach_state->ms_acpi_handle == NULL) {
+ cpupm_free(cp);
+ cmn_err(CE_WARN, "!cpupm_init: processor %d: "
+ "unable to get ACPI handle", cp->cpu_id);
+ cmn_err(CE_NOTE, "!CPU power management will not function.");
+ CPUPM_DISABLE();
+ return;
+ }
+
+ /*
+ * Loop through the CPU management module table and see if
+ * any of the modules implement CPU power management
+ * for this CPU.
+ */
+ for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
+ if (vendors->cpuv_init(cp))
+ break;
+ }
+
+ /*
+ * Nope, we can't power manage this CPU.
+ */
+	if (vendors->cpuv_init == NULL) {
+ cpupm_free(cp);
+ CPUPM_DISABLE();
+ return;
+ }
+
+ /*
+ * If P-state support exists for this system, then initialize it.
+ */
+ if (mach_state->ms_pstate.cma_ops != NULL) {
+ ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize P-state support",
+ cp->cpu_id);
+ mach_state->ms_pstate.cma_ops = NULL;
+ cpupm_disable(CPUPM_P_STATES);
+ } else {
+ nspeeds = cpupm_get_speeds(cp, &speeds);
+ if (nspeeds == 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " no speeds to manage", cp->cpu_id);
+ } else {
+ cpupm_set_supp_freqs(cp, speeds, nspeeds);
+ cpupm_free_speeds(speeds, nspeeds);
+ mach_state->ms_caps |= CPUPM_P_STATES;
+ }
+ }
+ }
+
+ if (mach_state->ms_tstate.cma_ops != NULL) {
+ ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize T-state support",
+ cp->cpu_id);
+ mach_state->ms_tstate.cma_ops = NULL;
+ cpupm_disable(CPUPM_T_STATES);
+ } else {
+ mach_state->ms_caps |= CPUPM_T_STATES;
+ }
+ }
+
+ /*
+ * If C-states support exists for this system, then initialize it.
+ */
+ if (mach_state->ms_cstate.cma_ops != NULL) {
+ ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize C-state support",
+ cp->cpu_id);
+ mach_state->ms_cstate.cma_ops = NULL;
+ mcpu->max_cstates = CPU_ACPI_C1;
+ cpupm_disable(CPUPM_C_STATES);
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ } else if (cpu_deep_cstates_supported()) {
+ mcpu->max_cstates = cpu_acpi_get_max_cstates(
+ mach_state->ms_acpi_handle);
+ if (mcpu->max_cstates > CPU_ACPI_C1) {
+ hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
+ CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
+ mcpu->mcpu_idle_type = CPU_ACPI_C1;
+ disp_enq_thread = cstate_wakeup;
+ } else {
+ hpet.callback(CST_EVENT_ONE_CSTATE);
+ }
+ mach_state->ms_caps |= CPUPM_C_STATES;
+ } else {
+ mcpu->max_cstates = CPU_ACPI_C1;
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ }
+ }
+
+
+ if (mach_state->ms_caps == CPUPM_NO_STATES) {
+ cpupm_free(cp);
+ CPUPM_DISABLE();
+ return;
+ }
+
+ if ((mach_state->ms_caps & CPUPM_T_STATES) ||
+ (mach_state->ms_caps & CPUPM_P_STATES) ||
+ (mach_state->ms_caps & CPUPM_C_STATES))
+ cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
+#endif
+}
+
+/*
+ * Free any resources allocated by cpupm_init().
+ */
+/*ARGSUSED*/
+void
+cpupm_free(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+
+ if (mach_state == NULL)
+ return;
+ if (mach_state->ms_pstate.cma_ops != NULL) {
+ mach_state->ms_pstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_pstate.cma_ops = NULL;
+ }
+
+ if (mach_state->ms_tstate.cma_ops != NULL) {
+ mach_state->ms_tstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_tstate.cma_ops = NULL;
+ }
+
+ if (mach_state->ms_cstate.cma_ops != NULL) {
+ mach_state->ms_cstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_cstate.cma_ops = NULL;
+ }
+
+ cpupm_free_notify_handlers(cp);
+
+ if (mach_state->ms_acpi_handle != NULL) {
+ cpu_acpi_fini(mach_state->ms_acpi_handle);
+ mach_state->ms_acpi_handle = NULL;
+ }
+
+ mutex_destroy(&mach_state->ms_lock);
+ kmem_free(mach_state, sizeof (cpupm_mach_state_t));
+ cp->cpu_m.mcpu_pm_mach_state = NULL;
+#endif
+}
+
+/*
+ * If all CPUs have started and at least one power state is manageable,
+ * then the CPUs are ready for power management.
+ */
+boolean_t
+cpupm_is_ready()
+{
+#ifndef __xpv
+ if (cpupm_enabled == CPUPM_NO_STATES)
+ return (B_FALSE);
+ return (cpupm_ready);
+#else
+ return (B_FALSE);
+#endif
+
+}
+
+boolean_t
+cpupm_is_enabled(uint32_t state)
+{
+ return ((cpupm_enabled & state) == state);
+}
+
+/*
+ * By default, all states are enabled.
+ */
+void
+cpupm_disable(uint32_t state)
+{
+
+ if (state & CPUPM_P_STATES) {
+ cpupm_free_domains(&cpupm_pstate_domains);
+ }
+ if (state & CPUPM_T_STATES) {
+ cpupm_free_domains(&cpupm_tstate_domains);
+ }
+ if (state & CPUPM_C_STATES) {
+ cpupm_free_domains(&cpupm_cstate_domains);
+ }
+ cpupm_enabled &= ~state;
+}
+
+/*
+ * Once all CPUs have been started, the PPM driver should build CPU
+ * domains and initialize the topspeed for all CPU devices.
+ */
+void
+cpupm_post_startup()
+{
+#ifndef __xpv
+	/*
+	 * The CPU domains built by the PPM while the CPUs were attaching
+	 * should be rebuilt with the information retrieved from ACPI.
+	 */
+ if (cpupm_rebuild_cpu_domains != NULL)
+ (*cpupm_rebuild_cpu_domains)();
+
+ /*
+ * Only initialize the topspeed if P-states are enabled.
+ */
+ if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
+ (*cpupm_init_topspeed)();
+#endif
+ cpupm_ready = B_TRUE;
+}
+
+/*
+ * Allocate power domains for C-, P- and T-states
+ */
+void
+cpupm_alloc_domains(cpu_t *cp, int state)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpupm_state_domains_t **dom_ptr;
+ cpupm_state_domains_t *dptr;
+ cpupm_state_domains_t **mach_dom_state_ptr;
+ uint32_t domain;
+ uint32_t type;
+
+ switch (state) {
+ case CPUPM_P_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
+ domain = CPU_ACPI_PSD(handle).sd_domain;
+ type = CPU_ACPI_PSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_chipid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_pstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
+ break;
+ case CPUPM_T_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
+ domain = CPU_ACPI_TSD(handle).sd_domain;
+ type = CPU_ACPI_TSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_chipid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_tstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
+ break;
+ case CPUPM_C_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
+ domain = CPU_ACPI_CSD(handle).sd_domain;
+ type = CPU_ACPI_CSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_coreid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_cstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
+ break;
+ default:
+ return;
+ }
+
+ for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
+ if (dptr->pm_domain == domain)
+ break;
+ }
+
+ /* new domain is created and linked at the head */
+ if (dptr == NULL) {
+ dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
+ dptr->pm_domain = domain;
+ dptr->pm_type = type;
+ dptr->pm_next = *dom_ptr;
+ mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
+ (void *)ipltospl(DISP_LEVEL));
+ CPUSET_ZERO(dptr->pm_cpus);
+ *dom_ptr = dptr;
+ }
+ CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
+ *mach_dom_state_ptr = dptr;
+}
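A hedged illustration of the fallback path above (no cached _PSD object): two CPUs that report the same chip id land in the same P-state domain, so a later cpupm_state_change() on either one covers both. cpu0 and cpu1 are hypothetical cpu_t pointers:

	cpupm_alloc_domains(cpu0, CPUPM_P_STATES);	/* creates the domain, adds cpu0 */
	cpupm_alloc_domains(cpu1, CPUPM_P_STATES);	/* same chipid: reuses it, adds cpu1 */
	/* cpupm_pstate_domains now holds one node with both ids in pm_cpus */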
+
+/*
+ * Free C, P or T state power domains
+ */
+void
+cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
+{
+ cpupm_state_domains_t *this_domain, *next_domain;
+
+ this_domain = *dom_ptr;
+ while (this_domain != NULL) {
+ next_domain = this_domain->pm_next;
+ mutex_destroy(&this_domain->pm_lock);
+ kmem_free((void *)this_domain,
+ sizeof (cpupm_state_domains_t));
+ this_domain = next_domain;
+ }
+ *dom_ptr = NULL;
+}
+
+void
+cpupm_alloc_ms_cstate(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state;
+ cpupm_mach_acpi_state_t *ms_cstate;
+
+ mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ ms_cstate = &mach_state->ms_cstate;
+ ASSERT(ms_cstate->cma_state.cstate == NULL);
+ ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
+ KM_SLEEP);
+ ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
+}
+
+void
+cpupm_free_ms_cstate(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
+
+ if (ms_cstate->cma_state.cstate != NULL) {
+ kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
+ ms_cstate->cma_state.cstate = NULL;
+ }
+}
+
+void
+cpupm_state_change(cpu_t *cp, int level, int state)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_state_ops_t *state_ops;
+ cpupm_state_domains_t *state_domain;
+ cpuset_t set;
+
+ DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ switch (state) {
+ case CPUPM_P_STATES:
+ state_ops = mach_state->ms_pstate.cma_ops;
+ state_domain = mach_state->ms_pstate.cma_domain;
+ break;
+ case CPUPM_T_STATES:
+ state_ops = mach_state->ms_tstate.cma_ops;
+ state_domain = mach_state->ms_tstate.cma_domain;
+ break;
+	default:
+		return;
+ }
+
+ switch (state_domain->pm_type) {
+ case CPU_ACPI_SW_ANY:
+ /*
+ * A request on any CPU in the domain transitions the domain
+ */
+ CPUSET_ONLY(set, cp->cpu_id);
+ state_ops->cpus_change(set, level);
+ break;
+ case CPU_ACPI_SW_ALL:
+ /*
+ * All CPUs in the domain must request the transition
+ */
+ case CPU_ACPI_HW_ALL:
+		/*
+		 * P/T-state transitions are coordinated by the hardware.
+		 * For now, request the transition on all CPUs in the domain,
+		 * but looking ahead we can probably be smarter about this.
+		 */
+ mutex_enter(&state_domain->pm_lock);
+ state_ops->cpus_change(state_domain->pm_cpus, level);
+ mutex_exit(&state_domain->pm_lock);
+ break;
+ default:
+ cmn_err(CE_WARN, "Unknown domain coordination type: %d",
+ state_domain->pm_type);
+ }
+}
+
+/*
+ * CPU PM interfaces exposed to the CPU power manager
+ */
+/*ARGSUSED*/
+id_t
+cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+
+ if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
+ !cpupm_is_enabled(CPUPM_C_STATES))) {
+ return (CPUPM_NO_DOMAIN);
+ }
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+ * Return P-State domain for the specified CPU
+ */
+ if (mach_state->ms_pstate.cma_domain) {
+ return (mach_state->ms_pstate.cma_domain->pm_domain);
+ }
+ } else if (type == CPUPM_DTYPE_IDLE) {
+ /*
+ * Return C-State domain for the specified CPU
+ */
+ if (mach_state->ms_cstate.cma_domain) {
+ return (mach_state->ms_cstate.cma_domain->pm_domain);
+ }
+ }
+ return (CPUPM_NO_DOMAIN);
+}
+
+/*ARGSUSED*/
+uint_t
+cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
+ cpupm_state_t *states)
+{
+ int *speeds;
+ uint_t nspeeds, i;
+
+ /*
+ * Idle domain support unimplemented
+ */
+ if (type != CPUPM_DTYPE_ACTIVE) {
+ return (0);
+ }
+ nspeeds = cpupm_get_speeds(cp, &speeds);
+
+ /*
+ * If the caller passes NULL for states, just return the
+ * number of states.
+ */
+ if (states != NULL) {
+ for (i = 0; i < nspeeds; i++) {
+ states[i].cps_speed = speeds[i];
+ states[i].cps_handle = (cpupm_handle_t)i;
+ }
+ }
+ cpupm_free_speeds(speeds, nspeeds);
+ return (nspeeds);
+}
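A hedged sketch of the sizing pattern this interface supports (the wrapper name is hypothetical): call once with NULL to learn the count, allocate, then call again to fill the array:

	static uint_t
	enumerate_active_states(cpu_t *cp, cpupm_state_t **statesp)
	{
		uint_t n;

		n = cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, NULL);
		if (n == 0)
			return (0);
		*statesp = kmem_zalloc(n * sizeof (cpupm_state_t), KM_SLEEP);
		(void) cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, *statesp);
		return (n);
	}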
+
+/*ARGSUSED*/
+int
+cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
+{
+ if (!cpupm_is_ready())
+ return (-1);
+
+ cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+/*
+ * Note: It is the responsibility of the users of
+ * cpupm_get_speeds() to free the memory allocated
+ * for speeds using cpupm_free_speeds()
+ */
+uint_t
+cpupm_get_speeds(cpu_t *cp, int **speeds)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
+#else
+ return (0);
+#endif
+}
+
+/*ARGSUSED*/
+void
+cpupm_free_speeds(int *speeds, uint_t nspeeds)
+{
+#ifndef __xpv
+ cpu_acpi_free_speeds(speeds, nspeeds);
+#endif
+}
+
+/*
+ * P-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_power_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
+}
+
+/*
+ * T-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_throttle_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
+}
+
+/*
+ * C-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_cstate_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
+}
+
+void
+cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+ cpu_t *cp = ctx;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_notification_t *entry;
+
+ mutex_enter(&mach_state->ms_lock);
+ for (entry = mach_state->ms_handlers; entry != NULL;
+ entry = entry->nq_next) {
+ entry->nq_handler(obj, val, entry->nq_ctx);
+ }
+ mutex_exit(&mach_state->ms_lock);
+}
+
+/*ARGSUSED*/
+void
+cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpupm_notification_t *entry;
+
+ entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
+ entry->nq_handler = handler;
+ entry->nq_ctx = ctx;
+ mutex_enter(&mach_state->ms_lock);
+ if (mach_state->ms_handlers == NULL) {
+ entry->nq_next = NULL;
+ mach_state->ms_handlers = entry;
+ cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
+ cpupm_notify_handler, cp);
+
+ } else {
+ entry->nq_next = mach_state->ms_handlers;
+ mach_state->ms_handlers = entry;
+ }
+ mutex_exit(&mach_state->ms_lock);
+#endif
+}
+
+/*ARGSUSED*/
+static void
+cpupm_free_notify_handlers(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpupm_notification_t *entry;
+ cpupm_notification_t *next;
+
+ mutex_enter(&mach_state->ms_lock);
+ if (mach_state->ms_handlers == NULL) {
+ mutex_exit(&mach_state->ms_lock);
+ return;
+ }
+ if (mach_state->ms_acpi_handle != NULL) {
+ cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
+ cpupm_notify_handler);
+ }
+ entry = mach_state->ms_handlers;
+ while (entry != NULL) {
+ next = entry->nq_next;
+ kmem_free(entry, sizeof (cpupm_notification_t));
+ entry = next;
+ }
+ mach_state->ms_handlers = NULL;
+ mutex_exit(&mach_state->ms_lock);
+#endif
+}
+
+/*
+ * Get the current max speed from the ACPI _PPC object
+ */
+/*ARGSUSED*/
+int
+cpupm_get_top_speed(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state;
+ cpu_acpi_handle_t handle;
+ int plat_level;
+ uint_t nspeeds;
+ int max_level;
+
+ mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ handle = mach_state->ms_acpi_handle;
+
+ cpu_acpi_cache_ppc(handle);
+ plat_level = CPU_ACPI_PPC(handle);
+
+ nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
+
+ max_level = nspeeds - 1;
+ if ((plat_level < 0) || (plat_level > max_level)) {
+ cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
+ "_PPC out of range %d", cp->cpu_id, plat_level);
+ plat_level = 0;
+ }
+
+ return (plat_level);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * This notification handler is called whenever the ACPI _PPC
+ * object changes. The _PPC is a sort of governor on power levels.
+ * It sets an upper threshold on which _PSS-defined power levels
+ * are usable. The _PPC value is dynamic and may change as properties
+ * (e.g., thermal conditions or AC power source) of the system change.
+ */
+
+static void
+cpupm_power_manage_notifications(void *ctx)
+{
+ cpu_t *cp = ctx;
+ int top_speed;
+
+ top_speed = cpupm_get_top_speed(cp);
+ cpupm_redefine_max_activepwr_state(cp, top_speed);
+}
+
+/* ARGSUSED */
+static void
+cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+#ifndef __xpv
+ /*
+ * Currently, we handle _TPC,_CST and _PPC change notifications.
+ */
+ if (val == CPUPM_TPC_CHANGE_NOTIFICATION) {
+ cpupm_throttle_manage_notification(ctx);
+ } else if (val == CPUPM_CST_CHANGE_NOTIFICATION) {
+ cpuidle_manage_cstates(ctx);
+ } else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) {
+ cpupm_power_manage_notifications(ctx);
+ }
+#endif
+}
+
+/*
+ * Update cpupm cstate data each time CPU exits idle.
+ */
+void
+cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
+{
+ cs_data->cs_idle_exit = end;
+}
+
+/*
+ * Determine the next cstate based on cpupm data.
+ * Update cpupm cstate data each time the CPU goes idle.
+ * Do as much as possible in the idle-state bookkeeping function because the
+ * performance impact while idle is minimal compared to the wakeup function,
+ * where there is real work to do.
+ */
+uint32_t
+cpupm_next_cstate(cma_c_state_t *cs_data, hrtime_t start)
+{
+ hrtime_t duration;
+ hrtime_t ave_interval;
+ hrtime_t ave_idle_time;
+
+ duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
+ scalehrtime(&duration);
+ cs_data->cs_idle += duration;
+ cs_data->cs_idle_enter = start;
+
+ ++cs_data->cs_cnt;
+ if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
+ cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
+ scalehrtime(&cs_data->cs_smpl_len);
+ cs_data->cs_smpl_len |= 1; /* protect from DIV 0 */
+ cs_data->cs_smpl_idle = cs_data->cs_idle;
+ cs_data->cs_idle = 0;
+ cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
+ cs_data->cs_smpl_len);
+
+ cs_data->cs_smpl_start = start;
+ cs_data->cs_cnt = 0;
+
+ /*
+ * Strand level C-state policy
+ */
+ cs_data->cs_next_cstate = CPU_ACPI_C3;
+
+ /*
+ * Will CPU be idle long enough to save power?
+ */
+ ave_idle_time = (cs_data->cs_smpl_idle /
+ cpupm_cs_sample_tunable) / 1000;
+ if (ave_idle_time < (cs_data->cs_C2_latency *
+ cpupm_cs_idle_save_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 1);
+ return (cs_data->cs_next_cstate);
+ } else if (ave_idle_time < (cs_data->cs_C3_latency *
+ cpupm_cs_idle_save_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 2);
+ }
+
+ /*
+ * Wakeup often (even when non-idle time is very short)?
+ * Some producer/consumer type loads fall into this category.
+ */
+ ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
+ / 1000;
+ if (ave_interval <=
+ (cs_data->cs_C2_latency * cpupm_cs_idle_cost_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 3);
+ return (cs_data->cs_next_cstate);
+ } else if (ave_interval <=
+ (cs_data->cs_C3_latency * cpupm_cs_idle_cost_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 4);
+ }
+
+ /*
+ * Idle percent
+ */
+ if (cs_data->cs_smpl_idle_pct < cpupm_C2_idle_pct_tunable) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 5);
+ return (cs_data->cs_next_cstate);
+ } else if ((cs_data->cs_next_cstate > CPU_ACPI_C2) &&
+ (cs_data->cs_smpl_idle_pct < cpupm_C3_idle_pct_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 6);
+ }
+ }
+
+ return (cs_data->cs_next_cstate);
+}
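A hedged numeric walk-through of the policy above, using assumed latencies (real values come from ACPI _CST): five idle periods totalling 4 ms over a 5 ms sample window, with cs_C2_latency = 10 us and cs_C3_latency = 100 us:

	ave_idle_time = (4000000 / cpupm_cs_sample_tunable) / 1000;	/* 800 us */
	/* 800 >= 2 * 10 and 800 >= 2 * 100, so C3 survives the idle-save test */
	ave_interval = (5000000 / cpupm_cs_sample_tunable) / 1000;	/* 1000 us */
	/* 1000 > 10 * 10 but 1000 <= 10 * 100, so the policy demotes to C2 */
	/* idle percent is 100 * 4 / 5 = 80, which is >= 70, so C2 is returned */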
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c b/usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
index befa09433c..c1263a3bcd 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,19 +27,19 @@
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/cpu_acpi.h>
-#include <sys/cpudrv_throttle.h>
+#include <sys/cpupm_throttle.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int cpudrv_throttle_init(cpudrv_devstate_t *);
-static void cpudrv_throttle_fini(cpudrv_devstate_t *);
-static int cpudrv_throttle(cpudrv_devstate_t *, uint32_t);
+static int cpupm_throttle_init(cpu_t *);
+static void cpupm_throttle_fini(cpu_t *);
+static void cpupm_throttle(cpuset_t, uint32_t);
-cpudrv_tstate_ops_t cpudrv_throttle_ops = {
+cpupm_state_ops_t cpupm_throttle_ops = {
"Generic ACPI T-state Support",
- cpudrv_throttle_init,
- cpudrv_throttle_fini,
- cpudrv_throttle
+ cpupm_throttle_init,
+ cpupm_throttle_fini,
+ cpupm_throttle
};
/*
@@ -61,90 +61,12 @@ cpudrv_tstate_ops_t cpudrv_throttle_ops = {
* Debugging support
*/
#ifdef DEBUG
-volatile int cpudrv_throttle_debug = 0;
-#define CTDEBUG(arglist) if (cpudrv_throttle_debug) printf arglist;
+volatile int cpupm_throttle_debug = 0;
+#define CTDEBUG(arglist) if (cpupm_throttle_debug) printf arglist;
#else
#define CTDEBUG(arglist)
#endif
-cpudrv_tstate_domain_t *cpudrv_tstate_domains = NULL;
-
-/*
- * Allocate a new domain node.
- */
-static void
-cpudrv_alloc_tstate_domain(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
- cpudrv_tstate_domain_t *dptr;
- cpudrv_tstate_domain_node_t *nptr;
- uint32_t domain;
- uint32_t type;
- cpu_t *cp;
-
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
- domain = CPU_ACPI_TSD(handle).sd_domain;
- type = CPU_ACPI_TSD(handle).sd_type;
- } else {
- mutex_enter(&cpu_lock);
- cp = cpu[CPU->cpu_id];
- domain = cpuid_get_chipid(cp);
- mutex_exit(&cpu_lock);
- type = CPU_ACPI_SW_ALL;
- }
-
- for (dptr = cpudrv_tstate_domains; dptr != NULL;
- dptr = dptr->td_next) {
- if (dptr->td_domain == domain)
- break;
- }
-
- /* new domain is created and linked at the head */
- if (dptr == NULL) {
- dptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_t), KM_SLEEP);
- dptr->td_domain = domain;
- dptr->td_type = type;
- dptr->td_next = cpudrv_tstate_domains;
- mutex_init(&dptr->td_lock, NULL, MUTEX_DRIVER, NULL);
- cpudrv_tstate_domains = dptr;
- }
-
- /* new domain node is created and linked at the head of the domain */
- nptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_node_t), KM_SLEEP);
- nptr->tdn_cpudsp = cpudsp;
- nptr->tdn_domain = dptr;
- nptr->tdn_next = dptr->td_node;
- dptr->td_node = nptr;
- mach_state->tstate_domain_node = nptr;
-}
-
-static void
-cpudrv_free_tstate_domains()
-{
- cpudrv_tstate_domain_t *this_domain, *next_domain;
- cpudrv_tstate_domain_node_t *this_node, *next_node;
-
- this_domain = cpudrv_tstate_domains;
- while (this_domain != NULL) {
- next_domain = this_domain->td_next;
-
- /* discard CPU node chain */
- this_node = this_domain->td_node;
- while (this_node != NULL) {
- next_node = this_node->tdn_next;
- kmem_free((void *)this_node,
- sizeof (cpudrv_tstate_domain_node_t));
- this_node = next_node;
- }
- mutex_destroy(&this_domain->td_lock);
- kmem_free((void *)this_domain,
- sizeof (cpudrv_tstate_domain_t));
- this_domain = next_domain;
- }
- cpudrv_tstate_domains = NULL;
-}
-
/*
  * Write the _PTC ctrl register. How it is written depends upon the _PTC
  * ACPI object value.
@@ -230,11 +152,11 @@ read_status(cpu_acpi_handle_t handle, uint32_t *stat)
* Transition the current processor to the requested throttling state.
*/
static void
-cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+cpupm_tstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_tstate_t *req_tstate;
uint32_t ctrl;
uint32_t stat;
@@ -250,7 +172,6 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
*/
ctrl = CPU_ACPI_TSTATE_CTRL(req_tstate);
if (write_ctrl(handle, ctrl) != 0) {
- *ret = THROTTLE_RET_UNSUP_STATE;
return;
}
@@ -259,7 +180,6 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
* no status value comparison is required.
*/
if (CPU_ACPI_TSTATE_STAT(req_tstate) == 0) {
- *ret = THROTTLE_RET_SUCCESS;
return;
}
@@ -274,46 +194,40 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
if (CPU_ACPI_TSTATE_STAT(req_tstate) != stat) {
DTRACE_PROBE(throttle_transition_incomplete);
- *ret = THROTTLE_RET_TRANS_INCOMPLETE;
- } else {
- *ret = THROTTLE_RET_SUCCESS;
}
}
-static int
-cpudrv_throttle(cpudrv_devstate_t *cpudsp, uint32_t throtl_lvl)
+static void
+cpupm_throttle(cpuset_t set, uint32_t throtl_lvl)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- cpudrv_tstate_transition(&ret, cpudsp, throtl_lvl);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)throtl_lvl,
- X_CALL_HIPRI, cpus, (xc_func_t)cpudrv_tstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ cpupm_tstate_transition(throtl_lvl);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)throtl_lvl, NULL, NULL, X_CALL_HIPRI,
+ set, (xc_func_t)cpupm_tstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
static int
-cpudrv_throttle_init(cpudrv_devstate_t *cpudsp)
+cpupm_throttle_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_ptc_t *ptc_stat;
if (cpu_acpi_cache_tstate_data(handle) != 0) {
CTDEBUG(("Failed to cache T-state ACPI data\n"));
- cpudrv_throttle_fini(cpudsp);
+ cpupm_throttle_fini(cp);
return (THROTTLE_RET_INCOMPLETE_DATA);
}
@@ -334,17 +248,98 @@ cpudrv_throttle_init(cpudrv_devstate_t *cpudsp)
return (THROTTLE_RET_INCOMPLETE_DATA);
}
- cpudrv_alloc_tstate_domain(cpudsp);
+ cpupm_alloc_domains(cp, CPUPM_T_STATES);
return (THROTTLE_RET_SUCCESS);
}
static void
-cpudrv_throttle_fini(cpudrv_devstate_t *cpudsp)
+cpupm_throttle_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpudrv_free_tstate_domains();
+ cpupm_free_domains(&cpupm_tstate_domains);
cpu_acpi_free_tstate_data(handle);
}
+
+/*
+ * This routine reads the ACPI _TPC object. It is called whenever a _TPC
+ * change notification is handled for a CPU.
+ */
+static int
+cpupm_throttle_get_max(processorid_t cpu_id)
+{
+ cpu_t *cp = cpu[cpu_id];
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle;
+ int throtl_level;
+ int max_throttle_lvl;
+ uint_t num_throtl;
+
+ if (mach_state == NULL) {
+ return (-1);
+ }
+
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ cpu_acpi_cache_tpc(handle);
+ throtl_level = CPU_ACPI_TPC(handle);
+
+ num_throtl = CPU_ACPI_TSTATES_COUNT(handle);
+
+ max_throttle_lvl = num_throtl - 1;
+ if ((throtl_level < 0) || (throtl_level > max_throttle_lvl)) {
+ cmn_err(CE_NOTE, "!cpupm_throttle_get_max: CPU %d: "
+ "_TPC out of range %d", cp->cpu_id, throtl_level);
+ throtl_level = 0;
+ }
+
+ return (throtl_level);
+}
+
+/*
+ * Take care of CPU throttling when _TPC notification arrives
+ */
+void
+cpupm_throttle_manage_notification(void *ctx)
+{
+ cpu_t *cp = ctx;
+ processorid_t cpu_id = cp->cpu_id;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ boolean_t is_ready;
+ int new_level;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ /*
+	 * We currently refuse to power-manage if the CPU is not ready to
+	 * take cross calls (cross calls fail silently if the CPU is not
+	 * ready for them).
+	 *
+	 * Additionally, for x86 platforms we cannot power-manage any one
+	 * instance until all instances have been initialized. That's
+	 * because we don't know what the CPU domains look like until all
+	 * instances have been initialized.
+ */
+ is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_throttle_ready();
+ if (!is_ready)
+ return;
+
+ if (!(mach_state->ms_caps & CPUPM_T_STATES))
+ return;
+ ASSERT(mach_state->ms_tstate.cma_ops != NULL);
+
+ /*
+ * Get the new T-State support level
+ */
+ new_level = cpupm_throttle_get_max(cpu_id);
+
+ cpupm_state_change(cp, new_level, CPUPM_T_STATES);
+}
diff --git a/usr/src/uts/i86pc/io/cpudrv/pwrnow.c b/usr/src/uts/i86pc/os/cpupm/pwrnow.c
index 4c731ff9e2..65cc251fbb 100644
--- a/usr/src/uts/i86pc/io/cpudrv/pwrnow.c
+++ b/usr/src/uts/i86pc/os/cpupm/pwrnow.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,21 +28,20 @@
#include <sys/x_call.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
-#include <sys/cpudrv_mach.h>
#include <sys/pwrnow.h>
#include <sys/cpu_acpi.h>
#include <sys/cpupm.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int pwrnow_init(cpudrv_devstate_t *);
-static void pwrnow_fini(cpudrv_devstate_t *);
-static int pwrnow_power(cpudrv_devstate_t *, uint32_t);
+static int pwrnow_init(cpu_t *);
+static void pwrnow_fini(cpu_t *);
+static void pwrnow_power(cpuset_t, uint32_t);
/*
* Interfaces for modules implementing AMD's PowerNow!.
*/
-cpudrv_pstate_ops_t pwrnow_ops = {
+cpupm_state_ops_t pwrnow_ops = {
"PowerNow! Technology",
pwrnow_init,
pwrnow_fini,
@@ -81,12 +80,11 @@ volatile int pwrnow_debug = 0;
/*
* Write the ctrl register.
*/
-static int
+static void
write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
{
cpu_acpi_pct_t *pct_ctrl;
uint64_t reg;
- int ret = 0;
pct_ctrl = CPU_ACPI_PCT_CTRL(handle);
@@ -94,35 +92,32 @@ write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
case ACPI_ADR_SPACE_FIXED_HARDWARE:
reg = ctrl;
wrmsr(PWRNOW_PERF_CTL_MSR, reg);
- ret = 0;
break;
default:
DTRACE_PROBE1(pwrnow_ctrl_unsupported_type, uint8_t,
pct_ctrl->cr_addrspace_id);
- return (-1);
+ return;
}
DTRACE_PROBE1(pwrnow_ctrl_write, uint32_t, ctrl);
- DTRACE_PROBE1(pwrnow_ctrl_write_err, int, ret);
-
- return (ret);
}
/*
* Transition the current processor to the requested state.
*/
static void
-pwrnow_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+pwrnow_pstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pstate_t *req_pstate;
uint32_t ctrl;
req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle);
req_pstate += req_state;
+
DTRACE_PROBE1(pwrnow_transition_freq, uint32_t,
CPU_ACPI_FREQ(req_pstate));
@@ -130,40 +125,30 @@ pwrnow_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
* Initiate the processor p-state change.
*/
ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate);
- if (write_ctrl(handle, ctrl) != 0) {
- *ret = PWRNOW_RET_UNSUP_STATE;
- return;
- }
+ write_ctrl(handle, ctrl);
- mach_state->pstate = req_state;
- CPU->cpu_curr_clock = ((uint64_t)
- CPU_ACPI_FREQ(req_pstate) * 1000000);
-
- *ret = PWRNOW_RET_SUCCESS;
+ mach_state->ms_pstate.cma_state.pstate = req_state;
+ cpu_set_curr_clock((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000);
}
-static int
-pwrnow_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
+static void
+pwrnow_power(cpuset_t set, uint32_t req_state)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- pwrnow_pstate_transition(&ret, cpudsp, req_state);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state,
- X_CALL_HIPRI, cpus, (xc_func_t)pwrnow_pstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ pwrnow_pstate_transition(req_state);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI,
+ set, (xc_func_t)pwrnow_pstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
/*
@@ -171,23 +156,21 @@ pwrnow_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
* get the P-state data from ACPI and cache it.
*/
static int
-pwrnow_init(cpudrv_devstate_t *cpudsp)
+pwrnow_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pct_t *pct_stat;
- cpu_t *cp;
- int domain;
- PWRNOW_DEBUG(("pwrnow_init: instance %d\n",
- ddi_get_instance(cpudsp->dip)));
+ PWRNOW_DEBUG(("pwrnow_init: processor %d\n", cp->cpu_id));
/*
* Cache the P-state specific ACPI data.
*/
if (cpu_acpi_cache_pstate_data(handle) != 0) {
PWRNOW_DEBUG(("Failed to cache ACPI data\n"));
- pwrnow_fini(cpudsp);
+ pwrnow_fini(cp);
return (PWRNOW_RET_NO_PM);
}
@@ -200,20 +183,13 @@ pwrnow_init(cpudrv_devstate_t *cpudsp)
cmn_err(CE_WARN, "!_PCT configured for unsupported "
"addrspace = %d.", pct_stat->cr_addrspace_id);
cmn_err(CE_NOTE, "!CPU power management will not function.");
- pwrnow_fini(cpudsp);
+ pwrnow_fini(cp);
return (PWRNOW_RET_NO_PM);
}
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED))
- domain = CPU_ACPI_PSD(handle).sd_domain;
- else {
- cp = cpu[CPU->cpu_id];
- domain = cpuid_get_chipid(cp);
- }
- cpupm_add_cpu2dependency(cpudsp->dip, domain);
+ cpupm_alloc_domains(cp, CPUPM_P_STATES);
- PWRNOW_DEBUG(("Instance %d succeeded.\n",
- ddi_get_instance(cpudsp->dip)));
+ PWRNOW_DEBUG(("Processor %d succeeded.\n", cp->cpu_id))
return (PWRNOW_RET_SUCCESS);
}
@@ -221,12 +197,13 @@ pwrnow_init(cpudrv_devstate_t *cpudsp)
* Free resources allocated by pwrnow_init().
*/
static void
-pwrnow_fini(cpudrv_devstate_t *cpudsp)
+pwrnow_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpupm_free_cpu_dependencies();
+ cpupm_free_domains(&cpupm_pstate_domains);
cpu_acpi_free_pstate_data(handle);
}
diff --git a/usr/src/uts/i86pc/io/cpudrv/speedstep.c b/usr/src/uts/i86pc/os/cpupm/speedstep.c
index 764ca5c23a..e4886d0045 100644
--- a/usr/src/uts/i86pc/io/cpudrv/speedstep.c
+++ b/usr/src/uts/i86pc/os/cpupm/speedstep.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,21 +28,20 @@
#include <sys/x_call.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
-#include <sys/cpudrv_mach.h>
#include <sys/speedstep.h>
#include <sys/cpu_acpi.h>
#include <sys/cpupm.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int speedstep_init(cpudrv_devstate_t *);
-static void speedstep_fini(cpudrv_devstate_t *);
-static int speedstep_power(cpudrv_devstate_t *, uint32_t);
+static int speedstep_init(cpu_t *);
+static void speedstep_fini(cpu_t *);
+static void speedstep_power(cpuset_t, uint32_t);
/*
* Interfaces for modules implementing Intel's Enhanced SpeedStep.
*/
-cpudrv_pstate_ops_t speedstep_ops = {
+cpupm_state_ops_t speedstep_ops = {
"Enhanced SpeedStep Technology",
speedstep_init,
speedstep_fini,
@@ -80,12 +79,11 @@ volatile int ess_debug = 0;
 * Write the ctrl register. How it is written depends upon the _PCT
 * ACPI object value.
*/
-static int
+static void
write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
{
cpu_acpi_pct_t *pct_ctrl;
uint64_t reg;
- int ret = 0;
pct_ctrl = CPU_ACPI_PCT_CTRL(handle);
@@ -99,79 +97,67 @@ write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
reg &= ~((uint64_t)0xFFFF);
reg |= ctrl;
wrmsr(IA32_PERF_CTL_MSR, reg);
- ret = 0;
break;
case ACPI_ADR_SPACE_SYSTEM_IO:
- ret = cpu_acpi_write_port(pct_ctrl->cr_address, ctrl,
+ (void) cpu_acpi_write_port(pct_ctrl->cr_address, ctrl,
pct_ctrl->cr_width);
break;
default:
DTRACE_PROBE1(ess_ctrl_unsupported_type, uint8_t,
pct_ctrl->cr_addrspace_id);
- return (-1);
+ return;
}
DTRACE_PROBE1(ess_ctrl_write, uint32_t, ctrl);
- DTRACE_PROBE1(ess_ctrl_write_err, int, ret);
-
- return (ret);
}
/*
* Transition the current processor to the requested state.
*/
void
-speedstep_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+speedstep_pstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pstate_t *req_pstate;
uint32_t ctrl;
req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle);
req_pstate += req_state;
+
DTRACE_PROBE1(ess_transition, uint32_t, CPU_ACPI_FREQ(req_pstate));
/*
* Initiate the processor p-state change.
*/
ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate);
- if (write_ctrl(handle, ctrl) != 0) {
- *ret = ESS_RET_UNSUP_STATE;
- return;
- }
+ write_ctrl(handle, ctrl);
- mach_state->pstate = req_state;
- CPU->cpu_curr_clock =
- (((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000));
- *ret = ESS_RET_SUCCESS;
+ mach_state->ms_pstate.cma_state.pstate = req_state;
+ cpu_set_curr_clock(((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000));
}
-static int
-speedstep_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
+static void
+speedstep_power(cpuset_t set, uint32_t req_state)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- speedstep_pstate_transition(&ret, cpudsp, req_state);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state,
- X_CALL_HIPRI, cpus, (xc_func_t)speedstep_pstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ speedstep_pstate_transition(req_state);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI, set,
+ (xc_func_t)speedstep_pstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
/*
@@ -179,23 +165,21 @@ speedstep_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
* get the P-state data from ACPI and cache it.
*/
static int
-speedstep_init(cpudrv_devstate_t *cpudsp)
+speedstep_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pct_t *pct_stat;
- cpu_t *cp;
- int dependency;
- ESSDEBUG(("speedstep_init: instance %d\n",
- ddi_get_instance(cpudsp->dip)));
+ ESSDEBUG(("speedstep_init: processor %d\n", cp->cpu_id));
/*
* Cache the P-state specific ACPI data.
*/
if (cpu_acpi_cache_pstate_data(handle) != 0) {
ESSDEBUG(("Failed to cache ACPI data\n"));
- speedstep_fini(cpudsp);
+ speedstep_fini(cp);
return (ESS_RET_NO_PM);
}
@@ -211,21 +195,13 @@ speedstep_init(cpudrv_devstate_t *cpudsp)
 		cmn_err(CE_WARN, "!_PCT configured for unsupported "
"addrspace = %d.", pct_stat->cr_addrspace_id);
cmn_err(CE_NOTE, "!CPU power management will not function.");
- speedstep_fini(cpudsp);
+ speedstep_fini(cp);
return (ESS_RET_NO_PM);
}
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED))
- dependency = CPU_ACPI_PSD(handle).sd_domain;
- else {
- mutex_enter(&cpu_lock);
- cp = cpu[CPU->cpu_id];
- dependency = cpuid_get_chipid(cp);
- mutex_exit(&cpu_lock);
- }
- cpupm_add_cpu2dependency(cpudsp->dip, dependency);
+ cpupm_alloc_domains(cp, CPUPM_P_STATES);
- ESSDEBUG(("Instance %d succeeded.\n", ddi_get_instance(cpudsp->dip)));
+ ESSDEBUG(("Processor %d succeeded.\n", cp->cpu_id))
return (ESS_RET_SUCCESS);
}
@@ -233,12 +209,13 @@ speedstep_init(cpudrv_devstate_t *cpudsp)
* Free resources allocated by speedstep_init().
*/
static void
-speedstep_fini(cpudrv_devstate_t *cpudsp)
+speedstep_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpupm_free_cpu_dependencies();
+ cpupm_free_domains(&cpupm_pstate_domains);
cpu_acpi_free_pstate_data(handle);
}
@@ -246,7 +223,6 @@ boolean_t
speedstep_supported(uint_t family, uint_t model)
{
struct cpuid_regs cpu_regs;
- uint64_t reg;
/* Required features */
if (!(x86_feature & X86_CPUID) ||
@@ -272,16 +248,5 @@ speedstep_supported(uint_t family, uint_t model)
return (B_FALSE);
}
- /*
- * If Enhanced SpeedStep has not been enabled on the system,
- * then we probably should not override the BIOS setting.
- */
- reg = rdmsr(IA32_MISC_ENABLE_MSR);
- if (! (reg & IA32_MISC_ENABLE_EST)) {
- cmn_err(CE_NOTE, "!Enhanced Intel SpeedStep not enabled.");
- cmn_err(CE_NOTE, "!CPU power management will not function.");
- return (B_FALSE);
- }
-
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 94ee76b0b0..001bd0537f 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -295,6 +295,8 @@ mlsetup(struct regs *rp)
*/
cpu_list_init(CPU);
+ pg_cpu_bootstrap(CPU);
+
/*
* Now that we have taken over the GDT, IDT and have initialized
* active CPU list it's time to inform kmdb if present.
diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c
index e27b45d709..1954dfb81c 100644
--- a/usr/src/uts/i86pc/os/mp_machdep.c
+++ b/usr/src/uts/i86pc/os/mp_machdep.c
@@ -45,6 +45,7 @@
#include <sys/memlist.h>
#include <sys/param.h>
#include <sys/promif.h>
+#include <sys/cpu_pm.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#endif
@@ -52,6 +53,7 @@
#include <vm/hat_i86.h>
#include <sys/kdi_machimpl.h>
#include <sys/sdt.h>
+#include <sys/hpet.h>
#define OFFSETOF(s, m) (size_t)(&(((s *)0)->m))
@@ -76,10 +78,10 @@ static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
static void mach_notify_error(int level, char *errmsg);
static hrtime_t dummy_hrtime(void);
static void dummy_scalehrtime(hrtime_t *);
-static void cpu_idle(void);
+void cpu_idle(void);
static void cpu_wakeup(cpu_t *, int);
#ifndef __xpv
-static void cpu_idle_mwait(void);
+void cpu_idle_mwait(void);
static void cpu_wakeup_mwait(cpu_t *, int);
#endif
/*
@@ -184,7 +186,23 @@ int idle_cpu_prefer_mwait = 1;
*/
int idle_cpu_assert_cflush_monitor = 1;
-#endif
+/*
+ * If non-zero, idle cpus will not use the power saving Deep C-State idle loop.
+ */
+int idle_cpu_no_deep_c = 0;
+/*
+ * Non-power saving idle loop and wakeup pointers.
+ * Allows the user to toggle the Deep Idle power saving feature on/off.
+ */
+void (*non_deep_idle_cpu)() = cpu_idle;
+void (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
+
+/*
+ * Object for the kernel to access the HPET.
+ */
+hpet_t hpet;
+
+#endif /* ifndef __xpv */
/*ARGSUSED*/
int
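Hedged usage note: because idle_cpu_no_deep_c is a global tunable, an administrator who wants to keep the dispatcher on the non-deep-idle loop could set it from /etc/system, for example:

	set idle_cpu_no_deep_c = 1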
@@ -210,6 +228,16 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
return (1);
else
return (0);
+ case PGHW_POW_ACTIVE:
+ if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1)
+ return (1);
+ else
+ return (0);
+ case PGHW_POW_IDLE:
+ if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1)
+ return (1);
+ else
+ return (0);
default:
return (0);
}
@@ -247,58 +275,63 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
return (cpuid_get_last_lvl_cacheid(cpu));
case PGHW_CHIP:
return (cpuid_get_chipid(cpu));
+ case PGHW_POW_ACTIVE:
+ return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE));
+ case PGHW_POW_IDLE:
+ return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE));
default:
return (-1);
}
}
-int
-pg_plat_hw_level(pghw_type_t hw)
+/*
+ * Express a preference between the hw1 and hw2 sharing relationships,
+ * returning the one that optimization should favor.
+ */
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
- int i;
+ int i, rank1, rank2;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_CACHE,
PGHW_CHIP,
+ PGHW_POW_IDLE,
+ PGHW_POW_ACTIVE,
PGHW_NUM_COMPONENTS
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_CHIP ||
- hw == PGHW_CACHE)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+ /*
+ * For shared caches, also load balance across them to
+ * maximize aggregate cache capacity
+ */
+ switch (hw) {
+ case PGHW_CACHE:
+ return (CMT_BALANCE|CMT_AFFINITY);
+ default:
+ return (CMT_NO_POLICY);
+ }
}
id_t
@@ -329,9 +362,28 @@ dummy_scalehrtime(hrtime_t *ticks)
{}
/*
+ * Supports Deep C-State power saving idle loop.
+ */
+void
+cpu_idle_adaptive(void)
+{
+ (*CPU->cpu_m.mcpu_idle_cpu)();
+}
+
+void
+cpu_dtrace_idle_probe(uint_t cstate)
+{
+ cpu_t *cpup = CPU;
+ struct machcpu *mcpu = &(cpup->cpu_m);
+
+ mcpu->curr_cstate = cstate;
+ DTRACE_PROBE1(idle__state__transition, uint_t, cstate);
+}
+
+/*
* Idle the present CPU until awoken via an interrupt
*/
-static void
+void
cpu_idle(void)
{
cpu_t *cpup = CPU;
@@ -427,11 +479,11 @@ cpu_idle(void)
return;
}
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);
+ cpu_dtrace_idle_probe(IDLE_STATE_C1);
mach_cpu_idle();
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);
+ cpu_dtrace_idle_probe(IDLE_STATE_C0);
/*
* We're no longer halted
@@ -510,7 +562,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
/*
* Idle the present CPU until awoken via touching its monitored line
*/
-static void
+void
cpu_idle_mwait(void)
{
volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
@@ -520,7 +572,7 @@ cpu_idle_mwait(void)
int hset_update = 1;
/*
- * Set our mcpu_mwait here, so we can tell if anyone trys to
+ * Set our mcpu_mwait here, so we can tell if anyone tries to
* wake us between now and when we call mwait. No other cpu will
* attempt to set our mcpu_mwait until we add ourself to the halted
* CPU bitmap.
@@ -529,7 +581,7 @@ cpu_idle_mwait(void)
/*
* If this CPU is online, and there's multiple CPUs
- * in the system, then we should notate our halting
+ * in the system, then we should note our halting
* by adding ourselves to the partition's halted CPU
* bitmap. This allows other CPUs to find/awaken us when
* work becomes available.
@@ -543,7 +595,7 @@ cpu_idle_mwait(void)
*
* When a thread becomes runnable, it is placed on the queue
* and then the halted CPU bitmap is checked to determine who
- * (if anyone) should be awoken. We therefore need to first
+ * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the bitmap, and then check if there
* is any work available.
*
@@ -580,13 +632,13 @@ cpu_idle_mwait(void)
*/
i86_monitor(mcpu_mwait, 0, 0);
if (*mcpu_mwait == MWAIT_HALTED) {
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);
+ cpu_dtrace_idle_probe(IDLE_STATE_C1);
tlb_going_idle();
i86_mwait(0, 0);
tlb_service();
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);
+ cpu_dtrace_idle_probe(IDLE_STATE_C0);
}
/*
@@ -858,14 +910,23 @@ mach_init()
(*pops->psm_softinit)();
/*
- * Initialize the dispatcher's function hooks
- * to enable CPU halting when idle.
+ * Initialize the dispatcher's function hooks to enable CPU halting
+ * when idle. Set both the deep-idle and non-deep-idle hooks.
+ *
+ * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
+	 * Assume we can use the power saving deep-idle loop, cpu_idle_adaptive.
+	 * The platform deep-idle driver will reset our idle loop to
+	 * non_deep_idle_cpu if the power saving deep-idle feature is not
+	 * available.
* Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle)
* or idle_cpu_prefer_mwait is not set.
* Allocate monitor/mwait buffer for cpu0.
*/
+#ifndef __xpv
+ non_deep_idle_disp_enq_thread = disp_enq_thread;
+#endif
if (idle_cpu_use_hlt) {
- idle_cpu = cpu_idle;
+ idle_cpu = cpu_idle_adaptive;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
#ifndef __xpv
if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
@@ -878,12 +939,20 @@ mach_init()
"handle cpu 0 mwait size.");
#endif
idle_cpu_prefer_mwait = 0;
- idle_cpu = cpu_idle;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
} else {
- idle_cpu = cpu_idle_mwait;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
}
} else {
- idle_cpu = cpu_idle;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
+ }
+ non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu;
+
+ /*
+ * Disable power saving deep idle loop?
+ */
+ if (idle_cpu_no_deep_c) {
+ idle_cpu = non_deep_idle_cpu;
}
#endif
}
@@ -970,6 +1039,7 @@ mach_smpinit(void)
#ifndef __xpv
if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
disp_enq_thread = cpu_wakeup_mwait;
+ non_deep_idle_disp_enq_thread = disp_enq_thread;
#endif
}
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index 54eb2f4369..5027d7a182 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -120,11 +120,6 @@ init_cpu_info(struct cpu *cp)
*/
cp->cpu_curr_clock = cpu_freq_hz;
- /*
- * Supported frequencies.
- */
- cpu_set_supp_freqs(cp, NULL);
-
(void) strcpy(pi->pi_processor_type, "i386");
if (fpu_exists)
(void) strcpy(pi->pi_fputypes, "i387 compatible");
@@ -236,8 +231,10 @@ mp_startup_init(int cpun)
proc_t *procp;
#if !defined(__xpv)
extern int idle_cpu_prefer_mwait;
+ extern void cpu_idle_mwait();
#endif
extern void idle();
+ extern void cpu_idle();
#ifdef TRAPTRACE
trap_trace_ctl_t *ttc = &trap_trace_ctl[cpun];
@@ -247,9 +244,12 @@ mp_startup_init(int cpun)
cp = kmem_zalloc(sizeof (*cp), KM_SLEEP);
#if !defined(__xpv)
- if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
+ if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
+ cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
+ } else
#endif
+ cp->cpu_m.mcpu_idle_cpu = cpu_idle;
procp = curthread->t_procp;
@@ -1463,6 +1463,9 @@ mp_startup(void)
{
struct cpu *cp = CPU;
uint_t new_x86_feature;
+#ifndef __xpv
+ extern void cpupm_init(cpu_t *);
+#endif
/*
* We need to get TSC on this proc synced (i.e., any delta
@@ -1558,14 +1561,6 @@ mp_startup(void)
init_cpu_info(cp);
mutex_enter(&cpu_lock);
- /*
- * Processor group initialization for this CPU is dependent on the
- * cpuid probing, which must be done in the context of the current
- * CPU.
- */
- pghw_physid_create(cp);
- pg_cpu_init(cp);
- pg_cmt_cpu_startup(cp);
cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS;
@@ -1597,15 +1592,30 @@ mp_startup(void)
ASSERT(cp->cpu_base_spl == ipltospl(LOCK_LEVEL));
set_base_spl(); /* Restore the spl to its proper value */
+#ifndef __xpv
+ cpupm_init(cp);
+#endif
+ add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi);
+
+ /*
+ * Processor group initialization for this CPU is dependent on the
+ * cpuid probing, which must be done in the context of the current
+ * CPU, as well as the CPU's device node initialization (for ACPI).
+ */
+ mutex_enter(&cpu_lock);
+ pghw_physid_create(cp);
+ pg_cpu_init(cp);
+ pg_cmt_cpu_startup(cp);
+ mutex_exit(&cpu_lock);
+
/* Enable interrupts */
(void) spl0();
+
mutex_enter(&cpu_lock);
cpu_enable_intr(cp);
cpu_add_active(cp);
mutex_exit(&cpu_lock);
- add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi);
-
#ifndef __xpv
{
/*
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 58bc3416f1..533d90a2a0 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -137,6 +137,7 @@ extern void progressbar_init(void);
extern void progressbar_start(void);
extern void brand_init(void);
extern void pcf_init(void);
+extern void pg_init(void);
extern int size_pse_array(pgcnt_t, int);
@@ -2128,6 +2129,8 @@ ulong_t _bdhs34;
void
post_startup(void)
{
+ extern void cpupm_init(cpu_t *);
+
/*
* Set the system wide, processor-specific flags to be passed
* to userland via the aux vector for performance hints and
@@ -2186,7 +2189,11 @@ post_startup(void)
maxmem = freemem;
+ cpupm_init(CPU);
+
add_cpunode2devtree(CPU->cpu_id, CPU->cpu_m.mcpu_cpi);
+
+ pg_init();
}
static int
diff --git a/usr/src/uts/i86pc/sys/cpu_acpi.h b/usr/src/uts/i86pc/sys/cpu_acpi.h
index c0b750f447..1805cd4d22 100644
--- a/usr/src/uts/i86pc/sys/cpu_acpi.h
+++ b/usr/src/uts/i86pc/sys/cpu_acpi.h
@@ -19,13 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _CPU_ACPI_H
#define _CPU_ACPI_H
+#include <sys/cpuvar.h>
#include <sys/acpi/acpi.h>
#include <sys/acpi/acresrc.h>
#include <sys/acpi/acglobal.h>
@@ -66,15 +67,25 @@ extern "C" {
#define CPU_ACPI_TSTATE_CTRL(tstate) tstate->ts_ctrl
#define CPU_ACPI_TSTATE_STAT(tstate) tstate->ts_state
-#define CPU_ACPI_NONE_CACHED 0x00
-#define CPU_ACPI_PCT_CACHED 0x01
-#define CPU_ACPI_PSS_CACHED 0x02
-#define CPU_ACPI_PSD_CACHED 0x04
-#define CPU_ACPI_PPC_CACHED 0x08
-#define CPU_ACPI_PTC_CACHED 0x10
-#define CPU_ACPI_TSS_CACHED 0x20
-#define CPU_ACPI_TSD_CACHED 0x40
-#define CPU_ACPI_TPC_CACHED 0x80
+/*
+ * C-state related macros
+ */
+#define CPU_ACPI_CSD(sp) sp->cs_csd
+#define CPU_ACPI_BM_INFO(sp) sp->bm_info
+#define CPU_ACPI_CSTATES(sp) sp->cs_cstates.ss_states
+#define CPU_ACPI_CSTATES_COUNT(sp) sp->cs_cstates.ss_count
+
+#define CPU_ACPI_NONE_CACHED 0x0000
+#define CPU_ACPI_PCT_CACHED 0x0001
+#define CPU_ACPI_PSS_CACHED 0x0002
+#define CPU_ACPI_PSD_CACHED 0x0004
+#define CPU_ACPI_PPC_CACHED 0x0008
+#define CPU_ACPI_PTC_CACHED 0x0010
+#define CPU_ACPI_TSS_CACHED 0x0020
+#define CPU_ACPI_TSD_CACHED 0x0040
+#define CPU_ACPI_TPC_CACHED 0x0080
+#define CPU_ACPI_CST_CACHED 0x0100
+#define CPU_ACPI_CSD_CACHED 0x0200
#define CPU_ACPI_IS_OBJ_CACHED(sp, obj) (sp->cpu_acpi_cached & obj)
#define CPU_ACPI_OBJ_IS_CACHED(sp, obj) (sp->cpu_acpi_cached |= obj)
@@ -84,7 +95,8 @@ extern "C" {
#define CPU_ACPI_PSS_CNT (sizeof (cpu_acpi_pstate_t) / sizeof (uint32_t))
#define CPU_ACPI_TSTATES_SIZE(cnt) (cnt * sizeof (cpu_acpi_tstate_t))
#define CPU_ACPI_TSS_CNT (sizeof (cpu_acpi_tstate_t) / sizeof (uint32_t))
-
+#define CPU_ACPI_CSTATES_SIZE(cnt) (cnt * sizeof (cpu_acpi_cstate_t))
+#define CPU_ACPI_CST_CNT (sizeof (cpu_acpi_cstate_t) / sizeof (uint32_t))
/*
* CPU Domain Coordination Types
*/
@@ -102,10 +114,12 @@ typedef struct cpu_acpi_state_dependency
uint32_t sd_domain;
uint32_t sd_type;
uint32_t sd_num;
+ uint32_t sd_index;
} cpu_acpi_state_dependency_t;
typedef cpu_acpi_state_dependency_t cpu_acpi_psd_t;
typedef cpu_acpi_state_dependency_t cpu_acpi_tsd_t;
+typedef cpu_acpi_state_dependency_t cpu_acpi_csd_t;
/*
* Container for ACPI processor control register information
@@ -148,6 +162,21 @@ typedef struct cpu_acpi_tstate
} cpu_acpi_tstate_t;
+/*
+ * Container for _CST information
+ */
+typedef struct cpu_acpi_cstate
+{
+ uint32_t cs_addrspace_id;
+ uint32_t cs_address;
+ uint32_t cs_type;
+ uint32_t cs_latency;
+ uint32_t cs_power;
+ uint32_t promotion;
+ uint32_t demotion;
+ kstat_t *cs_ksp;
+} cpu_acpi_cstate_t;
+
typedef struct cpu_acpi_supported_states {
void *ss_states;
uint32_t ss_count;
@@ -155,6 +184,7 @@ typedef struct cpu_acpi_supported_states {
typedef cpu_acpi_supported_states_t cpu_acpi_pstates_t;
typedef cpu_acpi_supported_states_t cpu_acpi_tstates_t;
+typedef cpu_acpi_supported_states_t cpu_acpi_cstates_t;
typedef int cpu_acpi_present_capabilities_t;
typedef int cpu_acpi_ppc_t;
@@ -165,7 +195,7 @@ typedef int cpu_acpi_tpc_t;
*/
typedef struct cpu_acpi_state {
ACPI_HANDLE cs_handle;
- dev_info_t *cs_dip;
+ int cs_id;
uint_t cpu_acpi_cached;
cpu_acpi_pstates_t cs_pstates;
cpu_acpi_pct_t cs_pct[2];
@@ -175,6 +205,9 @@ typedef struct cpu_acpi_state {
cpu_acpi_ptc_t cs_ptc[2];
cpu_acpi_tsd_t cs_tsd;
cpu_acpi_tpc_t cs_tpc;
+ cpu_acpi_cstates_t cs_cstates;
+ cpu_acpi_csd_t cs_csd;
+ uint_t bm_info;
} cpu_acpi_state_t;
typedef cpu_acpi_state_t *cpu_acpi_handle_t;
@@ -185,15 +218,22 @@ extern int cpu_acpi_cache_pstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_free_pstate_data(cpu_acpi_handle_t);
extern int cpu_acpi_cache_tstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_free_tstate_data(cpu_acpi_handle_t);
+extern int cpu_acpi_cache_cstate_data(cpu_acpi_handle_t);
+extern void cpu_acpi_free_cstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_install_notify_handler(cpu_acpi_handle_t,
- ACPI_NOTIFY_HANDLER, dev_info_t *);
+ ACPI_NOTIFY_HANDLER, void *);
+extern void cpu_acpi_remove_notify_handler(cpu_acpi_handle_t,
+ ACPI_NOTIFY_HANDLER);
extern int cpu_acpi_write_pdc(cpu_acpi_handle_t, uint32_t, uint32_t,
uint32_t *);
extern int cpu_acpi_write_port(ACPI_IO_ADDRESS, uint32_t, uint32_t);
extern int cpu_acpi_read_port(ACPI_IO_ADDRESS, uint32_t *, uint32_t);
+extern void cpu_acpi_set_register(uint32_t, uint32_t);
+extern void cpu_acpi_get_register(uint32_t, uint32_t *);
extern uint_t cpu_acpi_get_speeds(cpu_acpi_handle_t, int **);
+extern uint_t cpu_acpi_get_max_cstates(cpu_acpi_handle_t);
extern void cpu_acpi_free_speeds(int *, uint_t);
-extern cpu_acpi_handle_t cpu_acpi_init(dev_info_t *);
+extern cpu_acpi_handle_t cpu_acpi_init(cpu_t *);
extern void cpu_acpi_fini(cpu_acpi_handle_t);
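A rough usage sketch of the C-state entry points above, which now key off a cpu_t rather than a dev_info_t. The enumerate_cstates() helper, the 0-on-success assumption for cpu_acpi_cache_cstate_data(), and the cmn_err() output are illustrative only:

static void
enumerate_cstates(cpu_t *cp)
{
	cpu_acpi_handle_t hdl;
	cpu_acpi_cstate_t *cstates;
	uint32_t i, cnt;

	if ((hdl = cpu_acpi_init(cp)) == NULL)
		return;

	if (cpu_acpi_cache_cstate_data(hdl) == 0) {
		cstates = CPU_ACPI_CSTATES(hdl);
		cnt = CPU_ACPI_CSTATES_COUNT(hdl);
		for (i = 0; i < cnt; i++)
			cmn_err(CE_CONT, "C%u: latency %u, power %u\n",
			    cstates[i].cs_type, cstates[i].cs_latency,
			    cstates[i].cs_power);
		cpu_acpi_free_cstate_data(hdl);
	}
	cpu_acpi_fini(hdl);
}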
#ifdef __cplusplus
diff --git a/usr/src/uts/i86pc/sys/cpu_idle.h b/usr/src/uts/i86pc/sys/cpu_idle.h
new file mode 100644
index 0000000000..6b38663c28
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpu_idle.h
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUIDLE_H
+#define _CPUIDLE_H
+
+#include <sys/cpupm.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define CPU_MAX_CSTATES 8
+
+#define CPU_ACPI_C0 0
+#define CPU_ACPI_C1 1
+#define CPU_ACPI_C2 2
+#define CPU_ACPI_C3 3
+
+#define BM_CTL 0x1
+#define BM_RLD 0x2
+#define BM_ARB_DIS 0x4
+
+#define CPUID_TSC_INVARIANCE 0x100
+
+#define CPU_IDLE_DEEP_CFG (0x1) /* Deep Idle disabled by user */
+#define CPU_IDLE_CPR_CFG (0x2) /* In CPR */
+
+#define CPU_CSTATE_LATENCY_UNDEF (1000000) /* ACPI info missing */
+
+typedef struct cpu_idle_kstat_s {
+ struct kstat_named addr_space_id; /* register address space id */
+ struct kstat_named cs_latency; /* worst latency */
+ struct kstat_named cs_power; /* average power consumption */
+} cpu_idle_kstat_t;
+
+extern cpupm_state_ops_t cpu_idle_ops;
+
+extern void cpu_acpi_idle(void);
+extern void cstate_wakeup(cpu_t *, int);
+extern boolean_t cpu_deep_cstates_supported(void);
+extern void cpu_wakeup(cpu_t *, int);
+extern void cpu_wakeup_mwait(cpu_t *, int);
+extern void cpu_dtrace_idle_probe(uint_t);
+extern void cpuidle_manage_cstates(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUIDLE_H */
diff --git a/usr/src/uts/i86pc/sys/cpudrv_mach.h b/usr/src/uts/i86pc/sys/cpudrv_mach.h
index 26b4ecb787..c26d93853f 100644
--- a/usr/src/uts/i86pc/sys/cpudrv_mach.h
+++ b/usr/src/uts/i86pc/sys/cpudrv_mach.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,31 +42,12 @@ extern "C" {
* for it).
*/
extern cpuset_t cpu_ready_set;
-#define CPUDRV_PM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
-
-/*
- * An error attaching any of the devices results in disabling
- * CPU power management.
- */
-#define CPUDRV_PM_DISABLE() cpupm_disable(CPUPM_ALL_STATES)
-
-/*
- * If no power management states are enabled, then CPU power
- * management is disabled.
- */
-#define CPUDRV_PM_DISABLED() \
- (!cpupm_is_enabled(CPUPM_P_STATES) && !cpupm_is_enabled(CPUPM_T_STATES))
-
-/*
- * Is P-state management enabled?
- */
-#define CPUDRV_PM_POWER_ENABLED(cpudsp) \
- (((cpudrv_mach_state_t *)cpudsp->mach_state)->caps & CPUDRV_P_STATES)
+#define CPUDRV_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
/*
* We're about to exit the _PPC thread so reset tag.
*/
-#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm) { \
+#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm) { \
if (curthread == cpupm->pm_governor_thread) \
cpupm->pm_governor_thread = NULL; \
}
@@ -74,50 +55,51 @@ extern cpuset_t cpu_ready_set;
/*
* The current top speed as defined by the _PPC.
*/
-#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->top_spd
+#define CPUDRV_TOPSPEED(cpupm) (cpupm)->top_spd
/*
* Install a _PPC/_TPC change notification handler.
*/
-#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip) \
- cpudrv_pm_install_notify_handler(cpudsp, dip);
+#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp) \
+ cpudrv_install_notify_handler(cpudsp);
/*
* Redefine the topspeed.
*/
-#define CPUDRV_PM_REDEFINE_TOPSPEED(dip) cpudrv_pm_redefine_topspeed(dip)
+#define CPUDRV_REDEFINE_TOPSPEED(dip) cpudrv_redefine_topspeed(dip)
/*
* Set callbacks so that PPM can callback into CPUDRV
*/
-#define CPUDRV_PM_SET_PPM_CALLBACKS() { \
- cpupm_get_topspeed = cpudrv_pm_get_topspeed; \
- cpupm_set_topspeed = cpudrv_pm_set_topspeed; \
+#define CPUDRV_SET_PPM_CALLBACKS() { \
+ cpupm_get_topspeed_callb = cpudrv_get_topspeed; \
+ cpupm_set_topspeed_callb = cpudrv_set_topspeed; \
}
/*
* ACPI provides the supported speeds.
*/
-#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) \
- nspeeds = cpudrv_pm_get_speeds(cpudsp, &speeds);
-#define CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds) \
- cpudrv_pm_free_speeds(speeds, nspeeds);
+#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) \
+ nspeeds = cpudrv_get_speeds(cpudsp, &speeds);
+#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) \
+ cpudrv_free_speeds(speeds, nspeeds);
/*
- * Convert speed to Hz.
+ * ACPI provides the supported C-states.
*/
-#define CPUDRV_PM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000)
+#define CPUDRV_GET_MAX_CSTATES(handle) \
+ cpu_acpi_get_max_cstates(handle);
/*
* Compute the idle cnt percentage for a given speed.
*/
-#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \
(100 - (((100 - hwm) * speeds[0]) / speeds[i]))
/*
* Compute the user cnt percentage for a given speed.
*/
-#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \
((hwm * speeds[i]) / speeds[i - 1]);
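As a worked example of the two thresholds above (the speed values are illustrative): with hwm = 98 and supported speeds {2400, 1600, 800} MHz, fastest first, CPUDRV_IDLE_CNT_PERCENT(98, speeds, 2) = 100 - ((100 - 98) * 2400) / 800 = 94, and CPUDRV_USER_CNT_PERCENT(98, speeds, 2) = (98 * 800) / 1600 = 49.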
/*
@@ -133,82 +115,21 @@ extern cpuset_t cpu_ready_set;
* The amount of memory needed for each string is:
* digits for power level + '=' + digits for freq + 'MHz' + '\0'
*/
-#define CPUDRV_PM_COMP_SIZE() \
- (CPUDRV_PM_COMP_MAX_DIG + 1 + CPUDRV_PM_COMP_MAX_DIG + 3 + 1);
-#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) cur_spd->speed;
-#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \
+#define CPUDRV_COMP_SIZE() \
+ (CPUDRV_COMP_MAX_DIG + 1 + CPUDRV_COMP_MAX_DIG + 3 + 1);
+#define CPUDRV_COMP_SPEED(cpupm, cur_spd) cur_spd->speed;
+#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \
(void) sprintf(pmc, "%d=%dMHz", cur_spd->pm_level, comp_spd);
-/*
- * T-State domain list
- */
-typedef struct cpudrv_tstate_domain_node {
- struct cpudrv_tstate_domain_node *tdn_next;
- struct cpudrv_tstate_domain *tdn_domain;
- cpudrv_devstate_t *tdn_cpudsp;
-} cpudrv_tstate_domain_node_t;
-
-typedef struct cpudrv_tstate_domain {
- struct cpudrv_tstate_domain *td_next;
- cpudrv_tstate_domain_node_t *td_node;
- uint32_t td_domain;
- uint32_t td_type;
- kmutex_t td_lock;
-} cpudrv_tstate_domain_t;
-
-extern cpudrv_tstate_domain_t *cpudrv_tstate_domains;
-
-/*
- * Different processor families have their own technologies for supporting
- * CPU power management (i.e., Intel has Enhanced SpeedStep for some of it's
- * processors and AMD has PowerNow! for some of it's processors). We support
- * these different technologies via modules that export the interfaces
- * described below.
- *
- * If a module implements the technology that should be used to manage
- * the current CPU device, then the cpups_init() module should return
- * succesfully (i.e., return code of 0) and perform any initialization
- * such that future power transistions can be performed by calling
- * the cpups_power() interface(). And the cpups_fini() interface can be
- * used to free any resources allocated by cpups_init().
- */
-typedef struct cpudrv_pstate_ops {
- char *cpups_label;
- int (*cpups_init)(cpudrv_devstate_t *);
- void (*cpups_fini)(cpudrv_devstate_t *);
- int (*cpups_power)(cpudrv_devstate_t *, uint32_t);
-} cpudrv_pstate_ops_t;
+extern void cpudrv_set_topspeed(void *, int);
+extern int cpudrv_get_topspeed(void *);
+extern int cpudrv_get_topthrottle(cpu_t *);
+extern void cpudrv_manage_throttling(void *);
+extern void cpudrv_install_notify_handler(cpudrv_devstate_t *);
+extern void cpudrv_redefine_topspeed(void *);
+extern uint_t cpudrv_get_speeds(cpudrv_devstate_t *, int **);
+extern void cpudrv_free_speeds(int *, uint_t);
-/*
- * T-state support.
- */
-typedef struct cpudrv_tstate_ops {
- char *cputs_label;
- int (*cputs_init)(cpudrv_devstate_t *);
- void (*cputs_fini)(cpudrv_devstate_t *);
- int (*cputs_throttle)(cpudrv_devstate_t *, uint32_t);
-} cpudrv_tstate_ops_t;
-
-typedef struct cpudrv_mach_state {
- void *acpi_handle;
- cpudrv_pstate_ops_t *cpupm_pstate_ops;
- cpudrv_tstate_ops_t *cpupm_tstate_ops;
- cpudrv_tstate_domain_node_t *tstate_domain_node;
- uint32_t pstate;
- uint32_t tstate;
- uint32_t caps;
-} cpudrv_mach_state_t;
-
-#define CPUDRV_NO_STATES 0x00
-#define CPUDRV_P_STATES 0x01
-#define CPUDRV_T_STATES 0x02
-
-extern uint_t cpudrv_pm_get_speeds(cpudrv_devstate_t *, int **);
-extern void cpudrv_pm_free_speeds(int *, uint_t);
-extern void cpudrv_pm_set_topspeed(void *, int);
-extern int cpudrv_pm_get_topspeed(void *);
-extern void cpudrv_pm_redefine_topspeed(void *);
-extern void cpudrv_pm_install_notify_handler(cpudrv_devstate_t *, dev_info_t *);
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/i86pc/sys/cpupm.h b/usr/src/uts/i86pc/sys/cpupm.h
deleted file mode 100644
index 2510a0fb60..0000000000
--- a/usr/src/uts/i86pc/sys/cpupm.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _CPUPM_H
-#define _CPUPM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-
-/*
- * Simple structures used to temporarily keep track of CPU
- * dependencies until the PPM driver can build PPM CPU domains.
- */
-typedef struct cpupm_cpu_node {
- struct cpupm_cpu_node *cn_next;
- dev_info_t *cn_dip;
-} cpupm_cpu_node_t;
-
-typedef struct cpupm_cpu_dependency {
- struct cpupm_cpu_dependency *cd_next;
- cpupm_cpu_node_t *cd_cpu;
- int cd_dependency_id;
-} cpupm_cpu_dependency_t;
-
-/*
- * If any states are added, then make sure to add them to
- * CPUPM_ALL_STATES.
- */
-#define CPUPM_NO_STATES 0x00
-#define CPUPM_P_STATES 0x01
-#define CPUPM_T_STATES 0x02
-#define CPUPM_ALL_STATES (CPUPM_P_STATES | CPUPM_T_STATES)
-
-/*
- * Callbacks used for CPU power management.
- */
-extern void (*cpupm_rebuild_cpu_domains)(void);
-extern void (*cpupm_init_topspeed)(void);
-extern void (*cpupm_redefine_topspeed)(void *);
-extern int (*cpupm_get_topspeed)(void *);
-extern void (*cpupm_set_topspeed)(void *, int);
-
-/*
- * Routines used to manage temporary CPU dependencies.
- */
-extern cpupm_cpu_dependency_t *cpupm_get_cpu_dependencies();
-extern void cpupm_add_cpu2dependency(dev_info_t *, int);
-extern void cpupm_free_cpu_dependencies();
-
-/*
- * Routines to track overall status of CPU power management readiness.
- *
- */
-extern boolean_t cpupm_is_ready();
-extern boolean_t cpupm_is_enabled(uint32_t);
-extern void cpupm_disable(uint32_t);
-extern void cpupm_post_startup();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _CPUPM_H */
diff --git a/usr/src/uts/i86pc/sys/cpupm_mach.h b/usr/src/uts/i86pc/sys/cpupm_mach.h
new file mode 100644
index 0000000000..fbb040f84b
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpupm_mach.h
@@ -0,0 +1,197 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_MACH_H
+#define _CPUPM_MACH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cpuvar.h>
+#include <sys/ksynch.h>
+#include <sys/cpu_pm.h>
+
+/*
+ * CPU power domains
+ */
+typedef struct cpupm_state_domains {
+ struct cpupm_state_domains *pm_next;
+ uint32_t pm_domain;
+ uint32_t pm_type;
+ cpuset_t pm_cpus;
+ kmutex_t pm_lock;
+} cpupm_state_domains_t;
+
+extern cpupm_state_domains_t *cpupm_pstate_domains;
+extern cpupm_state_domains_t *cpupm_tstate_domains;
+extern cpupm_state_domains_t *cpupm_cstate_domains;
+
+/*
+ * Different processor families have their own technologies for supporting
+ * CPU power management (i.e., Intel has Enhanced SpeedStep for some of its
+ * processors and AMD has PowerNow! for some of its processors). We support
+ * these different technologies via modules that export the interfaces
+ * described below.
+ *
+ * If a module implements the technology that should be used to manage
+ * the current CPU device, then its cpus_init() entry point should return
+ * successfully (i.e., a return code of 0) and perform any initialization
+ * such that future power transitions can be performed by calling
+ * the cpus_change() interface. The cpus_fini() interface can then be
+ * used to free any resources allocated by cpus_init().
+ */
+typedef struct cpupm_state_ops {
+ char *cpups_label;
+ int (*cpus_init)(cpu_t *);
+ void (*cpus_fini)(cpu_t *);
+ void (*cpus_change)(cpuset_t, uint32_t);
+} cpupm_state_ops_t;
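A minimal sketch of a technology module filling in cpupm_state_ops_t; the mytech_* names are hypothetical (the real instances in this change are speedstep_ops, pwrnow_ops and cpupm_throttle_ops):

/* Hypothetical technology module; names and bodies are illustrative only. */
static int
mytech_init(cpu_t *cp)
{
	/* Probe the hardware; return 0 only if this CPU is supported. */
	return (0);
}

static void
mytech_fini(cpu_t *cp)
{
	/* Release anything allocated by mytech_init(). */
}

static void
mytech_change(cpuset_t set, uint32_t req_state)
{
	/* Cross call the CPUs in 'set' and program the requested state. */
}

cpupm_state_ops_t mytech_ops = {
	"Hypothetical Technology",
	mytech_init,
	mytech_fini,
	mytech_change
};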
+
+/*
+ * Data kept for each C-state power-domain.
+ */
+typedef struct cma_c_state {
+ uint32_t cs_next_cstate; /* computed best C-state */
+
+ uint32_t cs_cnt; /* times accessed */
+ uint32_t cs_type; /* current ACPI idle type */
+
+ hrtime_t cs_idle_enter; /* entered idle */
+ hrtime_t cs_idle_exit; /* left idle */
+
+ hrtime_t cs_smpl_start; /* accounting sample began */
+ hrtime_t cs_idle; /* time idle */
+ hrtime_t cs_smpl_len; /* sample duration */
+ hrtime_t cs_smpl_idle; /* idle time in last sample */
+ uint64_t cs_smpl_idle_pct; /* % idle time in last smpl */
+
+ hrtime_t cs_C2_latency; /* C2 round trip latency */
+ hrtime_t cs_C3_latency; /* C3 round trip latency */
+} cma_c_state_t;
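A sketch of the relationship the sampling fields above encode; this is not the exact code in cpupm_next_cstate(), just the arithmetic the fields are sized for (the sample_idle_pct() helper is hypothetical):

static uint64_t
sample_idle_pct(cma_c_state_t *cs, hrtime_t now)
{
	hrtime_t len = now - cs->cs_smpl_start;		/* cs_smpl_len */
	hrtime_t idle = cs->cs_idle;			/* cs_smpl_idle */

	/* Percentage of the accounting sample spent idle (cs_smpl_idle_pct). */
	if (len <= 0)
		return (0);
	return ((uint64_t)(idle * 100 / len));
}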
+
+typedef union cma_state {
+ cma_c_state_t *cstate;
+ uint32_t pstate;
+} cma_state_t;
+
+typedef struct cpupm_mach_acpi_state {
+ cpupm_state_ops_t *cma_ops;
+ cpupm_state_domains_t *cma_domain;
+ cma_state_t cma_state;
+} cpupm_mach_acpi_state_t;
+
+typedef struct cpupm_mach_state {
+ void *ms_acpi_handle;
+ cpupm_mach_acpi_state_t ms_pstate;
+ cpupm_mach_acpi_state_t ms_cstate;
+ cpupm_mach_acpi_state_t ms_tstate;
+ uint32_t ms_caps;
+ dev_info_t *ms_dip;
+ kmutex_t ms_lock;
+ struct cpupm_notification *ms_handlers;
+} cpupm_mach_state_t;
+
+/*
+ * Constants used by the Processor Device Notification handler
+ * that identify what kind of change has occurred.
+ */
+#define CPUPM_PPC_CHANGE_NOTIFICATION 0x80
+#define CPUPM_CST_CHANGE_NOTIFICATION 0x81
+#define CPUPM_TPC_CHANGE_NOTIFICATION 0x82
+
+typedef void (*CPUPM_NOTIFY_HANDLER)(void *handle, uint32_t val,
+ void *ctx);
+
+typedef struct cpupm_notification {
+ struct cpupm_notification *nq_next;
+ CPUPM_NOTIFY_HANDLER nq_handler;
+ void *nq_ctx;
+} cpupm_notification_t;
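A sketch of a notification consumer; it assumes registration through cpupm_add_notify_handler(), declared further down in this header, and the handler name and its use of cpupm_get_top_speed() are illustrative:

static void
my_ppc_notify(void *handle, uint32_t val, void *ctx)
{
	cpu_t *cp = ctx;	/* context passed at registration time */

	if (val != CPUPM_PPC_CHANGE_NOTIFICATION)
		return;

	/* _PPC changed: re-evaluate the allowed top P-state for this CPU. */
	(void) cpupm_get_top_speed(cp);
}

/* Registered once per CPU, e.g.: cpupm_add_notify_handler(cp, my_ppc_notify, cp); */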
+
+/*
+ * If any states are added, then make sure to add them to
+ * CPUPM_ALL_STATES.
+ */
+#define CPUPM_NO_STATES 0x00
+#define CPUPM_P_STATES 0x01
+#define CPUPM_T_STATES 0x02
+#define CPUPM_C_STATES 0x04
+#define CPUPM_ALL_STATES (CPUPM_P_STATES \
+ | CPUPM_T_STATES \
+ | CPUPM_C_STATES)
+
+#define CPUPM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
+
+/*
+ * An error initializing any of the CPU PM state types results in disabling
+ * CPU power management.
+ */
+#define CPUPM_DISABLE() cpupm_disable(CPUPM_ALL_STATES)
+
+#define CPUPM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000)
+
+/*
+ * Callbacks used for CPU power management.
+ */
+extern void (*cpupm_rebuild_cpu_domains)(void);
+extern void (*cpupm_init_topspeed)(void);
+extern void (*cpupm_redefine_topspeed)(void *);
+extern int (*cpupm_get_topspeed_callb)(void *);
+extern void (*cpupm_set_topspeed_callb)(void *, int);
+
+extern void cpupm_init(cpu_t *);
+extern void cpupm_free(cpu_t *);
+extern boolean_t cpupm_is_ready();
+extern boolean_t cpupm_is_enabled(uint32_t);
+extern void cpupm_disable(uint32_t);
+extern void cpupm_post_startup();
+extern void cpupm_alloc_domains(cpu_t *, int);
+extern void cpupm_free_domains(cpupm_state_domains_t **);
+extern void cpupm_alloc_ms_cstate(cpu_t *cp);
+extern void cpupm_free_ms_cstate(cpu_t *cp);
+extern void cpupm_state_change(cpu_t *, int, int);
+extern id_t cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type);
+extern uint_t cpupm_plat_state_enumerate(cpu_t *, cpupm_dtype_t,
+ cpupm_state_t *);
+extern int cpupm_plat_change_state(cpu_t *, cpupm_state_t *);
+extern uint_t cpupm_get_speeds(cpu_t *, int **);
+extern void cpupm_free_speeds(int *, uint_t);
+extern boolean_t cpupm_power_ready(void);
+extern boolean_t cpupm_throttle_ready(void);
+extern boolean_t cpupm_cstate_ready(void);
+extern void cpupm_add_notify_handler(cpu_t *, CPUPM_NOTIFY_HANDLER, void *);
+extern int cpupm_get_top_speed(cpu_t *);
+extern uint32_t cpupm_next_cstate(cma_c_state_t *, hrtime_t);
+extern void cpupm_idle_cstate_data(cma_c_state_t *, int);
+extern void cpupm_wakeup_cstate_data(cma_c_state_t *, hrtime_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_MACH_H */
diff --git a/usr/src/uts/i86pc/sys/cpupm_throttle.h b/usr/src/uts/i86pc/sys/cpupm_throttle.h
new file mode 100644
index 0000000000..5a607158da
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpupm_throttle.h
@@ -0,0 +1,43 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_THROTTLE_H
+#define _CPUPM_THROTTLE_H
+
+#include <sys/cpupm.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+cpupm_state_ops_t cpupm_throttle_ops;
+
+extern void cpupm_throttle_manage_notification(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_THROTTLE_H */
diff --git a/usr/src/uts/i86pc/sys/hpet.h b/usr/src/uts/i86pc/sys/hpet.h
new file mode 100644
index 0000000000..1ee9910441
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/hpet.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _HPET_H
+#define _HPET_H
+
+#include <sys/hpet_acpi.h>
+
+/*
+ * Interface for HPET access.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * HPET_INFINITY is used for timers that will never expire.
+ */
+#define HPET_INFINITY (INT64_MAX)
+
+/*
+ * State of initialization.
+ */
+#define HPET_NO_SUPPORT (0)
+#define HPET_TIMER_SUPPORT (1) /* supports main counter reads */
+#define HPET_INTERRUPT_SUPPORT (2) /* supports interrupt/timer */
+#define HPET_FULL_SUPPORT (3) /* supports counter and timer intr */
+
+typedef struct hpet {
+ uint_t supported;
+ boolean_t (*install_proxy)(void);
+ boolean_t (*callback)(int);
+ /*
+ * Next two function pointers allow CPUs to use the HPET's timer
+ * as a proxy for their LAPIC timers which stop during Deep C-State.
+ */
+ boolean_t (*use_hpet_timer)(hrtime_t *);
+ void (*use_lapic_timer)(hrtime_t);
+} hpet_t;
+
+#define CST_EVENT_MULTIPLE_CSTATES (128) /* callbacks for _CST changes */
+#define CST_EVENT_ONE_CSTATE (129)
+
+/*
+ * unix access to the HPET is done through the hpet structure.
+ */
+extern hpet_t hpet;
+
+int hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags);
+void hpet_acpi_fini(void);
+uint32_t hpet_proxy_ipl(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _HPET_H */
diff --git a/usr/src/uts/i86pc/sys/hpet_acpi.h b/usr/src/uts/i86pc/sys/hpet_acpi.h
new file mode 100644
index 0000000000..c85707787e
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/hpet_acpi.h
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _HPET_ACPI_H
+#define _HPET_ACPI_H
+
+#if defined(_KERNEL)
+#include <sys/acpi/acpi.h>
+#include <sys/acpi/actbl1.h>
+#include <sys/acpica.h>
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Solaris uses an HPET Timer to generate interrupts for CPUs in Deep C-state
+ * with stalled LAPIC Timers. All CPUs use one HPET timer. The timer's
+ * interrupt targets one CPU (via the I/O APIC). The one CPU that receives
+ * the HPET's interrupt wakes up other CPUs as needed during the HPET Interrupt
+ * Service Routing. The HPET ISR uses poke_cpus to wake up other CPUs with an
+ * Inter Processor Interrupt.
+ *
+ * Please see the Intel Programmer's guides. Interrupts are disabled before
+ * a CPU Halts into Deep C-state. (This allows CPU-hardware-specific cleanup
+ * before servicing interrupts.) When a Deep C-state CPU wakes up (due to
+ * an externally generated interrupt), it resumes execution where it halted.
+ * The CPU returning from Deep C-state must enable interrupts before it will
+ * handle the pending interrupt that woke it from Deep C-state.
+ *
+ *
+ * HPET bits as defined in the Intel IA-PC HPET Specification Rev 1.0a.
+ *
+ * The physical address space layout of the memory mapped HPET looks like this:
+ *
+ * struct hpet {
+ * uint64_t gen_cap;
+ * uint64_t res1;
+ * uint64_t gen_config;
+ * uint64_t res2;
+ * uint64_t gen_inter_stat;
+ * uint64_t res3;
+ * uint64_t main_counter_value;
+ * uint64_t res4;
+ * struct hpet_timer {
+ * uint64_t config_and_capability;
+ * uint64_t comparator_value;
+ * uint64_t FSB_interrupt_route;
+ * uint64_t reserved;
+ * } timers[32];
+ * }
+ *
+ * There are 32 possible timers in an hpet. Only the first 3 timers are
+ * required. The other 29 timers are optional.
+ *
+ * HPETs can have 64-bit or 32-bit timers. Timers/compare registers can
+ * be 64-bit or 32-bit and can be a mixture of both.
+ * The first two timers are not used. The HPET spec intends the first two
+ * timers to be used as "legacy replacement" for the PIT and RTC timers.
+ *
+ * Solaris uses the first available non-legacy replacement timer as a proxy
+ * timer for processor Local APIC Timers that stop in deep idle C-states.
+ */
+
+/*
+ * We only use HPET table 1 on x86. Typical x86 systems only have 1 HPET.
+ * ACPI allows for multiple HPET tables to describe multiple HPETs.
+ */
+#define HPET_TABLE_1 (1)
+
+/*
+ * HPET Specification 1.0a defines the HPET to occupy 1024 bytes regardless of
+ * the number of counters (3 to 32) in this implementation.
+ */
+#define HPET_SIZE (1024)
+
+/*
+ * Offsets of hpet registers and macros to access them from HPET base address.
+ */
+#define HPET_GEN_CAP_OFFSET (0)
+#define HPET_GEN_CONFIG_OFFSET (0x10)
+#define HPET_GEN_INTR_STAT_OFFSET (0x20)
+#define HPET_MAIN_COUNTER_OFFSET (0xF0)
+#define HPET_TIMER_N_CONF_OFFSET(n) (0x100 + (n * 0x20))
+#define HPET_TIMER_N_COMP_OFFSET(n) (0x108 + (n * 0x20))
+
+#define OFFSET_ADDR(a, o) (((uintptr_t)(a)) + (o))
+#define HPET_GEN_CAP_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_CAP_OFFSET)
+#define HPET_GEN_CONFIG_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_CONFIG_OFFSET)
+#define HPET_GEN_INTR_STAT_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_INTR_STAT_OFFSET)
+#define HPET_MAIN_COUNTER_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_MAIN_COUNTER_OFFSET)
+#define HPET_TIMER_N_CONF_ADDRESS(la, n) \
+ OFFSET_ADDR(la, HPET_TIMER_N_CONF_OFFSET(n))
+#define HPET_TIMER_N_COMP_ADDRESS(la, n) \
+ OFFSET_ADDR(la, HPET_TIMER_N_COMP_OFFSET(n))
+
+/*
+ * HPET General Capabilities and ID Register
+ */
+typedef struct hpet_gen_cap {
+ uint32_t counter_clk_period; /* period in femtoseconds */
+ uint32_t vendor_id :16; /* vendor */
+ uint32_t leg_route_cap :1; /* 1=LegacyReplacement support */
+ uint32_t res1 :1; /* reserved */
+ uint32_t count_size_cap :1; /* 0=32bit, 1=64bit wide */
+ uint32_t num_tim_cap :5; /* number of timers -1 */
+ uint32_t rev_id :8; /* revision number */
+} hpet_gen_cap_t;
+
+/*
+ * Macros to parse fields of the hpet General Capabilities and ID Register.
+ */
+#define HPET_GCAP_CNTR_CLK_PERIOD(l) (l >> 32)
+#define HPET_GCAP_VENDOR_ID(l) BITX(l, 31, 16)
+#define HPET_GCAP_LEG_ROUTE_CAP(l) BITX(l, 15, 15)
+#define HPET_GCAP_CNT_SIZE_CAP(l) BITX(l, 13, 13)
+#define HPET_GCAP_NUM_TIM_CAP(l) BITX(l, 12, 8)
+#define HPET_GCAP_REV_ID(l) BITX(l, 7, 0)
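A short sketch of decoding a raw 64-bit read of this register with the macros above; BITX() is the generic bit-extraction macro assumed to be available, and the hpet_print_caps() helper and cmn_err() output are hypothetical:

static void
hpet_print_caps(uint64_t cap)
{
	uint32_t period = HPET_GCAP_CNTR_CLK_PERIOD(cap);	/* fs per tick */
	uint32_t ntimers = HPET_GCAP_NUM_TIM_CAP(cap) + 1;	/* field is N-1 */

	cmn_err(CE_CONT, "HPET rev %u: %u timers, period %u fs, %s counter\n",
	    (uint32_t)HPET_GCAP_REV_ID(cap), ntimers, period,
	    HPET_GCAP_CNT_SIZE_CAP(cap) ? "64-bit" : "32-bit");
}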
+
+/*
+ * From HPET spec "The value in this field must be less than or equal to":
+ */
+#define HPET_MAX_CLK_PERIOD (0x5F5E100)
+
+/*
+ * Femtoseconds in a nanosecond.
+ */
+#if defined(__i386)
+#define HPET_FEMTO_TO_NANO (1000000LL)
+#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period)
+#else
+#define HPET_FEMTO_TO_NANO (1000000L)
+#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period)
+#endif /* (__i386) */
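For example, with hpet_info.period of 69841279 femtoseconds (a 14.318 MHz main counter, chosen only for illustration), a wakeup 1 ms away converts as HRTIME_TO_HPET_TICKS(1000000) = (1000000 * 1000000) / 69841279, roughly 14318 main-counter ticks.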
+
+/*
+ * HPET General Configuration Register
+ */
+typedef struct hpet_gen_config_bitfield {
+ uint32_t leg_rt_cnf :1; /* legacy replacement route */
+ uint32_t enable_cnf :1; /* overall enable */
+} hpet_gen_conf_t;
+
+/*
+ * General Configuration Register fields.
+ */
+#define HPET_GCFR_LEG_RT_CNF (0x2) /* bit field value */
+#define HPET_GCFR_ENABLE_CNF (0x1) /* bit field value */
+#define HPET_GCFR_LEG_RT_CNF_BITX(l) BITX(l, 1, 1)
+#define HPET_GCFR_ENABLE_CNF_BITX(l) BITX(l, 0, 0)
+
+/*
+ * General Interrupt Status Register.
+ */
+#define HPET_GIS_T2_INT_STS(l) BITX(l, 2, 2)
+#define HPET_GIS_T1_INT_STS(l) BITX(l, 1, 1)
+#define HPET_GIS_T0_INT_STS(l) BITX(l, 0, 0)
+#define HPET_GIS_TN_INT_STS(l, n) BITX(l, n, n)
+
+#define HPET_INTR_STATUS_MASK(timer) ((uint64_t)1 << (timer))
+
+/*
+ * HPET Timer N Configuration and Capabilities Register
+ */
+typedef struct hpet_TN_conf_cap {
+ uint32_t int_route_cap; /* available I/O APIC interrupts */
+ uint32_t res1 :16; /* reserved */
+ uint32_t fsb_int_del_cap :1; /* FSB interrupt supported */
+ uint32_t fsb_int_en_cnf :1; /* Set FSB intr delivery */
+ uint32_t int_route_cnf :5; /* I/O APIC interrupt to use */
+ uint32_t mode32_cnf :1; /* Force 32-bit mode */
+ uint32_t res2 :1; /* reserved */
+ uint32_t val_set_cnf :1; /* Set periodic mode accumulator */
+ uint32_t size_cap :1; /* 1=64bit, 0=32bit timer */
+ uint32_t per_int_cap :1; /* 1=periodic mode supported */
+ uint32_t type_cnf :1; /* Enable periodic mode */
+ uint32_t int_enb_cnf :1; /* Enable interrupt generation */
+ uint32_t int_type_cnf :1; /* 0=edge, 1=level triggered */
+ uint32_t res3 :1; /* reserved */
+} hpet_TN_conf_cap_t;
+
+/*
+ * There are 3 to 32 timers on each HPET.
+ */
+#define HPET_TIMER_N_INT_ROUTE_CAP(l) (l >> 32)
+#define HPET_TIMER_N_INT_TYPE_CNF(l) BITX(l, 1, 1)
+#define HPET_TIMER_N_INT_ENB_CNF(l) BITX(l, 2, 2)
+#define HPET_TIMER_N_TYPE_CNF(l) BITX(l, 3, 3)
+#define HPET_TIMER_N_PER_INT_CAP(l) BITX(l, 4, 4)
+#define HPET_TIMER_N_SIZE_CAP(l) BITX(l, 5, 5)
+#define HPET_TIMER_N_VAL_SET_CNF(l) BITX(l, 6, 6)
+#define HPET_TIMER_N_MODE32_CNF(l) BITX(l, 8, 8)
+#define HPET_TIMER_N_INT_ROUTE_CNF(l) BITX(l, 13, 9)
+#define HPET_TIMER_N_FSB_EN_CNF(l) BITX(l, 14, 14)
+#define HPET_TIMER_N_FSB_INT_DEL_CAP(l) BITX(l, 15, 15)
+
+#define HPET_TIMER_N_INT_TYPE_CNF_BIT (1 << 1)
+#define HPET_TIMER_N_INT_ENB_CNF_BIT (1 << 2)
+#define HPET_TIMER_N_TYPE_CNF_BIT (1 << 3)
+#define HPET_TIMER_N_FSB_EN_CNF_BIT (1 << 14)
+#define HPET_TIMER_N_INT_ROUTE_SHIFT(i) (i << 9)
+
+/*
+ * HPET Spec reserves timers 0 and 1 for legacy timer replacement (PIT and RTC).
+ * Available timers for other use such as LAPIC proxy during Deep C-State
+ * start at timer 2.
+ */
+#define HPET_FIRST_NON_LEGACY_TIMER (2)
+
+/*
+ * HPET timer and interrupt used as LAPIC proxy during deep C-State.
+ */
+typedef struct cstate_timer {
+ int timer;
+ int intr;
+} cstate_timer_t;
+
+/*
+ * Data structure of useful HPET device information.
+ */
+typedef struct hpet_info {
+ hpet_gen_cap_t gen_cap;
+ hpet_gen_conf_t gen_config;
+ uint64_t gen_intrpt_stat;
+ uint64_t main_counter_value;
+ void *logical_address; /* HPET VA memory map */
+ hpet_TN_conf_cap_t *timer_n_config; /* N Timer config and cap */
+ uint32_t num_timers; /* number of timers */
+ uint32_t allocated_timers; /* bitmap of timers in use */
+ cstate_timer_t cstate_timer; /* HPET Timer used for LAPIC proxy */
+ uint64_t hpet_main_counter_reads[2];
+ hrtime_t tsc[3];
+ hrtime_t period; /* counter_clk_period in Femto Secs */
+} hpet_info_t;
+
+#if defined(_KERNEL)
+
+/*
+ * Spin mutexes are used in several places because idle threads cannot block.
+ * These defines provide a mechanism to break out of spin loops to prevent
+ * system hangs if a CPU can never get the lock (due to an unknown
+ * hardware/software bug). A timeout of 100 microseconds was chosen after extensive stress
+ * testing.
+ */
+#define HPET_SPIN_CHECK (1000)
+#define HPET_SPIN_TIMEOUT (100000)
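A sketch of the bounded spin these constants support: re-check the elapsed time every HPET_SPIN_CHECK iterations and give up once HPET_SPIN_TIMEOUT nanoseconds (100 microseconds) have passed. The hpet_spin_lock() helper is hypothetical and stands in for the real callers:

static boolean_t
hpet_spin_lock(kmutex_t *mp)
{
	hrtime_t start = gethrtime();
	uint_t spins = 0;

	while (!mutex_tryenter(mp)) {
		if (++spins < HPET_SPIN_CHECK)
			continue;
		spins = 0;
		/* Consult the clock only every HPET_SPIN_CHECK iterations. */
		if (gethrtime() - start > HPET_SPIN_TIMEOUT)
			return (B_FALSE);	/* bail out; never hang */
	}
	return (B_TRUE);
}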
+
+/*
+ * There is one of these per CPU using the HPET as a proxy for its stalled
+ * local APIC while in c-state >= C2.
+ */
+typedef hrtime_t hpet_proxy_t;
+
+extern ACPI_TABLE_HPET *hpet_table;
+extern hpet_info_t hpet_info;
+
+static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags);
+static boolean_t hpet_install_proxy(void);
+static boolean_t hpet_callback(int code);
+static boolean_t hpet_cpr(int code);
+static boolean_t hpet_resume(void);
+static void hpet_cst_callback(uint32_t code);
+static boolean_t hpet_deep_idle_config(int code);
+static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table);
+static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len);
+static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table);
+static int hpet_start_main_counter(hpet_info_t *hip);
+static int hpet_stop_main_counter(hpet_info_t *hip);
+static uint64_t hpet_read_main_counter_value(hpet_info_t *hip);
+static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value);
+static uint64_t hpet_read_gen_cap(hpet_info_t *hip);
+static uint64_t hpet_read_gen_config(hpet_info_t *hip);
+static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip);
+static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n);
+static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf);
+static uint64_t hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n);
+static void hpet_write_gen_cap(hpet_info_t *hip, uint64_t l);
+static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l);
+static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l);
+static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l);
+static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l);
+static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n);
+static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n);
+static void hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l);
+static int hpet_get_FSB_intr_capable_timer(hpet_info_t *hip, uint32_t mask);
+static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip);
+static int hpet_timer_available(uint32_t allocated_timers, uint32_t n);
+static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n);
+static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n,
+ uint32_t interrupt);
+static uint_t hpet_isr(char *arg);
+static uint32_t hpet_install_interrupt_handler(uint_t (*func)(char *),
+ int vector);
+static void hpet_uninstall_interrupt_handler(void);
+static void hpet_expire_all(void);
+static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time);
+static boolean_t hpet_use_hpet_timer(hrtime_t *expire);
+static void hpet_use_lapic_timer(hrtime_t expire);
+static void hpet_init_proxy_data(void);
+
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _HPET_ACPI_H */
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index 75654e16d7..415e71533e 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_MACHCPUVAR_H
#define _SYS_MACHCPUVAR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -128,14 +126,21 @@ struct machcpu {
struct xen_evt_data *mcpu_evt_pend; /* hypervisor: pending events */
volatile uint32_t *mcpu_mwait; /* MONITOR/MWAIT buffer */
+ void (*mcpu_idle_cpu)(void); /* idle function */
+ uint16_t mcpu_idle_type; /* CPU next idle type */
+ uint16_t max_cstates; /* supported max cstates */
+ uint32_t curr_cstate; /* current cstate */
struct cpu_ucode_info *mcpu_ucode_info;
+
+ void *mcpu_pm_mach_state;
};
#define NINTR_THREADS (LOCK_LEVEL-1) /* number of interrupt threads */
#define MWAIT_HALTED (1) /* mcpu_mwait set when halting */
#define MWAIT_RUNNING (0) /* mcpu_mwait set to wakeup */
-#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING);
+#define MWAIT_WAKEUP_IPI (2) /* need IPI to wakeup */
+#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING)
#endif /* _ASM */
diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h
index 0cd65e12e6..feebea3f6c 100644
--- a/usr/src/uts/i86pc/sys/machsystm.h
+++ b/usr/src/uts/i86pc/sys/machsystm.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -102,6 +102,14 @@ extern void trap(struct regs *, caddr_t, processorid_t);
extern void do_interrupt(struct regs *, trap_trace_rec_t *);
extern void memscrub_disable(void);
+/*
+ * Dispatcher hooks.
+ */
+void (*idle_cpu)();
+void (*non_deep_idle_cpu)();
+void (*disp_enq_thread)(cpu_t *, int);
+void (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
+
#ifndef __xpv
extern unsigned int microdata;
#endif
diff --git a/usr/src/uts/i86pc/sys/pwrnow.h b/usr/src/uts/i86pc/sys/pwrnow.h
index 1e3cc24e3f..b010964290 100644
--- a/usr/src/uts/i86pc/sys/pwrnow.h
+++ b/usr/src/uts/i86pc/sys/pwrnow.h
@@ -19,22 +19,22 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PWRNOW_H
#define _PWRNOW_H
-#include <sys/cpudrv_mach.h>
+#include <sys/cpupm.h>
#ifdef __cplusplus
extern "C" {
#endif
-boolean_t pwrnow_supported();
+extern boolean_t pwrnow_supported();
-cpudrv_pstate_ops_t pwrnow_ops;
+extern cpupm_state_ops_t pwrnow_ops;
#ifdef __cplusplus
}
diff --git a/usr/src/uts/i86pc/sys/speedstep.h b/usr/src/uts/i86pc/sys/speedstep.h
index f9debb2758..e2dfeba023 100644
--- a/usr/src/uts/i86pc/sys/speedstep.h
+++ b/usr/src/uts/i86pc/sys/speedstep.h
@@ -19,22 +19,22 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SPEEDSTEP_H
#define _SPEEDSTEP_H
-#include <sys/cpudrv_mach.h>
+#include <sys/cpupm.h>
#ifdef __cplusplus
extern "C" {
#endif
-boolean_t speedstep_supported(uint_t, uint_t);
+extern boolean_t speedstep_supported(uint_t, uint_t);
-cpudrv_pstate_ops_t speedstep_ops;
+extern cpupm_state_ops_t speedstep_ops;
#ifdef __cplusplus
}
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index 546549e603..9209bd604c 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -44,6 +44,7 @@ CORE_OBJS += \
cpuid.o \
cpuid_subr.o \
cpupm.o \
+ cpupm_mach.o \
dis_tables.o \
ddi_impl.o \
dtrace_subr.o \
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index 6d2fce1635..3b050716c0 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1315,6 +1315,26 @@ fcnname/**/_info: \
END_MODULE(dcopy);
#endif
+/*
+ * Stubs for acpica
+ */
+#ifndef ACPICA_MODULE
+ MODULE(acpica,misc);
+ NO_UNLOAD_STUB(acpica, AcpiOsReadPort, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiOsWritePort, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiInstallNotifyHandler, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiRemoveNotifyHandler, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiEvaluateObject, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiEvaluateObjectTyped, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiSetRegister, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiGetRegister, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiOsFree, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, acpica_get_handle_cpu, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, acpica_get_global_FADT, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, __acpi_wbinvd, nomod_minus_one) ;
+ END_MODULE(acpica);
+#endif
+
#ifndef IPNET_MODULE
MODULE(ipnet,drv);
STUB(ipnet, ipnet_if_getdev, nomod_zero);
diff --git a/usr/src/uts/intel/io/acpica/osl.c b/usr/src/uts/intel/io/acpica/osl.c
index 41f85c9bdc..45edf50026 100644
--- a/usr/src/uts/intel/io/acpica/osl.c
+++ b/usr/src/uts/intel/io/acpica/osl.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -474,8 +474,16 @@ ACPI_CPU_FLAGS
AcpiOsAcquireLock(ACPI_HANDLE Handle)
{
- mutex_enter((kmutex_t *)Handle);
- return (0);
+
+ if (Handle == NULL)
+ return (AE_BAD_PARAMETER);
+
+ if (curthread == CPU->cpu_idle_thread) {
+ while (!mutex_tryenter((kmutex_t *)Handle))
+ /* spin */;
+ } else
+ mutex_enter((kmutex_t *)Handle);
+ return (AE_OK);
}
void
@@ -1365,24 +1373,8 @@ acpica_add_processor_to_map(UINT32 acpi_id, ACPI_HANDLE obj)
* Return the ACPI device node matching the CPU dev_info node.
*/
ACPI_STATUS
-acpica_get_handle_cpu(dev_info_t *dip, ACPI_HANDLE *rh)
+acpica_get_handle_cpu(int cpu_id, ACPI_HANDLE *rh)
{
- char *device_type_prop;
- int cpu_id;
-
- /*
- * if "device_type" != "cpu", error
- */
- if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
- "device_type", &device_type_prop) != DDI_PROP_SUCCESS)
- return (AE_ERROR);
-
- if (strcmp("cpu", device_type_prop) != 0) {
- ddi_prop_free(device_type_prop);
- return (AE_ERROR);
- }
- ddi_prop_free(device_type_prop);
-
/*
* if cpu_map itself is NULL, we're a uppc system and
* acpica_build_processor_map() hasn't been called yet.
@@ -1394,19 +1386,10 @@ acpica_get_handle_cpu(dev_info_t *dip, ACPI_HANDLE *rh)
return (AE_ERROR);
}
- /*
- * get 'reg' and get obj from cpu_map
- */
- cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
- "reg", -1);
if ((cpu_id < 0) || (cpu_map[cpu_id] == NULL) ||
(cpu_map[cpu_id]->obj == NULL))
return (AE_ERROR);
- /*
- * tag devinfo and obj
- */
- (void) acpica_tag_devinfo(dip, cpu_map[cpu_id]->obj);
*rh = cpu_map[cpu_id]->obj;
return (AE_OK);
}
@@ -1689,7 +1672,7 @@ acpica_get_handle(dev_info_t *dip, ACPI_HANDLE *rh)
if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
"acpi-namespace", &acpiname) != DDI_PROP_SUCCESS) {
- return (acpica_get_handle_cpu(dip, rh));
+ return (AE_ERROR);
}
status = AcpiGetHandle(NULL, acpiname, rh);
@@ -1793,3 +1776,9 @@ acpica_build_processor_map()
ASSERT(status == AE_OK);
cpu_map_built = 1;
}
+
+void
+acpica_get_global_FADT(ACPI_TABLE_FADT **gbl_FADT)
+{
+ *gbl_FADT = &AcpiGbl_FADT;
+}
diff --git a/usr/src/uts/intel/sys/acpica.h b/usr/src/uts/intel/sys/acpica.h
index 8b3e1206c3..dddcc9bf78 100644
--- a/usr/src/uts/intel/sys/acpica.h
+++ b/usr/src/uts/intel/sys/acpica.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ACPICA_H
#define _SYS_ACPICA_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -125,11 +123,13 @@ extern ACPI_STATUS acpica_get_sci(int *, iflag_t *);
extern int acpica_get_bdf(dev_info_t *, int *, int *, int *);
extern ACPI_STATUS acpica_get_devinfo(ACPI_HANDLE, dev_info_t **);
extern ACPI_STATUS acpica_get_handle(dev_info_t *, ACPI_HANDLE *);
+extern ACPI_STATUS acpica_get_handle_cpu(int, ACPI_HANDLE *);
extern ACPI_STATUS acpica_eval_int(ACPI_HANDLE, char *, int *);
extern void acpica_map_cpu(processorid_t, UINT32);
extern void acpica_build_processor_map();
extern void acpica_ddi_save_resources(dev_info_t *);
extern void acpica_ddi_restore_resources(dev_info_t *);
+extern void acpica_get_global_FADT(ACPI_TABLE_FADT **);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index 369dff14db..c5a88a30f3 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -357,6 +357,11 @@ extern "C" {
"\10mmx\7cmov\6de\5pge\4mtrr\3msr\2tsc\1lgpg"
/*
+ * Intel Deep C-State invariant TSC in leaf 0x80000007.
+ */
+#define CPUID_TSC_CSTATE_INVARIANCE (0x100)
+
+/*
* x86_type is a legacy concept; this is supplanted
* for most purposes by x86_feature; modern CPUs
* should be X86_TYPE_OTHER
@@ -605,6 +610,7 @@ extern uint_t cpuid_get_dtlb_nent(struct cpu *, size_t);
#if !defined(__xpv)
extern uint32_t *cpuid_mwait_alloc(struct cpu *);
extern void cpuid_mwait_free(struct cpu *);
+extern int cpuid_deep_cstates_supported(void);
#endif
struct cpu_ucode_info;
diff --git a/usr/src/uts/sun4/Makefile.files b/usr/src/uts/sun4/Makefile.files
index 71e4dd6ee6..f532ad10ad 100644
--- a/usr/src/uts/sun4/Makefile.files
+++ b/usr/src/uts/sun4/Makefile.files
@@ -20,11 +20,9 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This Makefile defines all file modules for the directory uts/sun4
# and it's children. These are the source files which are common
# between sun4u and sun4r.
@@ -38,6 +36,7 @@ CORE_OBJS += bus_func.o
CORE_OBJS += cbe.o
CORE_OBJS += confunix.o
CORE_OBJS += copy.o
+CORE_OBJS += cpupm_mach.o
CORE_OBJS += cpu_states.o
CORE_OBJS += ddi_impl.o
CORE_OBJS += dmv.o
diff --git a/usr/src/uts/sun4/os/cpupm_mach.c b/usr/src/uts/sun4/os/cpupm_mach.c
new file mode 100644
index 0000000000..3d041c26ab
--- /dev/null
+++ b/usr/src/uts/sun4/os/cpupm_mach.c
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+
+/*
+ * CPU PM interfaces exposed to the CPU power manager
+ */
+/*ARGSUSED*/
+id_t
+cpupm_plat_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (CPUPM_NO_DOMAIN);
+}
+
+/*ARGSUSED*/
+uint_t
+cpupm_plat_state_enumerate(struct cpu *cp, cpupm_dtype_t type,
+ cpupm_state_t *states)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+cpupm_plat_change_state(struct cpu *cp, cpupm_state_t *state)
+{
+ return (-1);
+}
diff --git a/usr/src/uts/sun4/os/mlsetup.c b/usr/src/uts/sun4/os/mlsetup.c
index db8066c8ca..4d6b244bd2 100644
--- a/usr/src/uts/sun4/os/mlsetup.c
+++ b/usr/src/uts/sun4/os/mlsetup.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
@@ -216,6 +214,8 @@ mlsetup(struct regs *rp, kfpu_t *fp)
cpu_vm_data_init(CPU);
+ pg_cpu_bootstrap(CPU);
+
(void) prom_set_preprom(kern_splr_preprom);
(void) prom_set_postprom(kern_splx_postprom);
PRM_INFO("mlsetup: now ok to call prom_printf");
diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c
index 1459eb1ce4..fb1f5168b0 100644
--- a/usr/src/uts/sun4/os/startup.c
+++ b/usr/src/uts/sun4/os/startup.c
@@ -94,6 +94,7 @@ extern void memseg_remap_init(void);
extern void mach_kpm_init(void);
extern void pcf_init();
extern int size_pse_array(pgcnt_t, int);
+extern void pg_init();
/*
* External Data:
@@ -2222,6 +2223,8 @@ post_startup(void)
maxmem = freemem;
+ pg_init();
+
#ifdef PTL1_PANIC_DEBUG
init_ptl1_thread();
#endif /* PTL1_PANIC_DEBUG */
diff --git a/usr/src/uts/i86pc/sys/cpudrv_throttle.h b/usr/src/uts/sun4/sys/cpupm_mach.h
index ae4d352c14..4b7e6d01cc 100644
--- a/usr/src/uts/i86pc/sys/cpudrv_throttle.h
+++ b/usr/src/uts/sun4/sys/cpupm_mach.h
@@ -19,23 +19,24 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#ifndef _CPUDRV_THROTTLE_H
-#define _CPUDRV_THROTTLE_H
-
-#include <sys/cpudrv_mach.h>
+#ifndef _CPUPM_MACH_H
+#define _CPUPM_MACH_H
#ifdef __cplusplus
extern "C" {
#endif
-cpudrv_tstate_ops_t cpudrv_throttle_ops;
+/*
+ * Convert speed to Hz.
+ */
+#define CPUPM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
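For example, CPUPM_SPEED_HZ(1200, 2) = (1200 * 1000000) / 2 = 600000000 Hz, i.e. a 1200 MHz processor running at a clock divisor of 2 (numbers chosen for illustration).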
#ifdef __cplusplus
}
#endif
-#endif /* _CPUDRV_THROTTLE_H */
+#endif /* _CPUPM_MACH_H */
diff --git a/usr/src/uts/sun4u/Makefile.files b/usr/src/uts/sun4u/Makefile.files
index 2e05b61c1e..15f7e7d22a 100644
--- a/usr/src/uts/sun4u/Makefile.files
+++ b/usr/src/uts/sun4u/Makefile.files
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This Makefile defines all file modules for the directory uts/sun4u
@@ -36,6 +36,7 @@ CORE_OBJS += bootops.o
CORE_OBJS += cmp.o
CORE_OBJS += cpc_hwreg.o
CORE_OBJS += cpc_subr.o
+CORE_OBJS += cpupm.o
CORE_OBJS += mach_cpu_states.o
CORE_OBJS += mach_ddi_impl.o
CORE_OBJS += ecc.o
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c
index 00494d8cbc..9784a2338a 100644
--- a/usr/src/uts/sun4u/cpu/spitfire.c
+++ b/usr/src/uts/sun4u/cpu/spitfire.c
@@ -2904,8 +2904,7 @@ cpu_change_speed(uint64_t new_divisor, uint64_t arg2)
CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, new_divisor);
}
CPU->cpu_m.divisor = (uchar_t)new_divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / new_divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / new_divisor);
#endif
}
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetah.c b/usr/src/uts/sun4u/cpu/us3_cheetah.c
index eadaebc099..c8290750bf 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetah.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetah.c
@@ -570,8 +570,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
reg |= bceclk->mask;
set_safari_config(reg);
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
index 7cda4df713..b421e74b37 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
@@ -774,8 +774,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
reg |= bceclk->mask;
set_safari_config(reg);
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
index bb0cb0c961..9dd046086a 100644
--- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c
+++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
@@ -792,8 +792,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
(void) get_mcu_ctl_reg1();
}
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/io/cpudrv_mach.c b/usr/src/uts/sun4u/io/cpudrv_mach.c
index a9ca3debb4..c6129f64d8 100644
--- a/usr/src/uts/sun4u/io/cpudrv_mach.c
+++ b/usr/src/uts/sun4u/io/cpudrv_mach.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,16 +28,15 @@
*/
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <sys/cpupm.h>
#include <sys/cpudrv_mach.h>
#include <sys/machsystm.h>
-boolean_t cpudrv_enabled = B_TRUE;
-
/*
* Change CPU speed.
*/
int
-cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
+cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
{
xc_one(cpudsp->cpu_id, (xcfunc_t *)cpu_change_speed, \
(uint64_t)new_spd->speed, 0);
@@ -48,7 +47,7 @@ cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
* Determine the cpu_id for the CPU device.
*/
boolean_t
-cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
+cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
{
return (dip_to_cpu_id(dip, cpu_id) == DDI_SUCCESS);
}
@@ -57,7 +56,7 @@ cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
* A noop for this machine type.
*/
boolean_t
-cpudrv_pm_power_ready(void)
+cpudrv_power_ready(void)
{
return (B_TRUE);
}
@@ -67,7 +66,7 @@ cpudrv_pm_power_ready(void)
*/
/* ARGSUSED */
boolean_t
-cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
+cpudrv_is_governor_thread(cpudrv_pm_t *cpupm)
{
return (B_FALSE);
}
@@ -77,26 +76,31 @@ cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
*/
/*ARGSUSED*/
boolean_t
-cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp)
+cpudrv_mach_init(cpudrv_devstate_t *cpudsp)
{
return (B_TRUE);
}
/*
- * A noop for this machine type.
+ * On SPARC all instances support power management unless attach fails.
+ * In the case of attach failure, cpudrv_enabled will be false.
*/
/*ARGSUSED*/
-void
-cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp)
+boolean_t
+cpudrv_is_enabled(cpudrv_devstate_t *cpudsp)
{
+ return (cpudrv_enabled);
}
-/*
- * On SPARC all instances support power management unless attach fails.
- * In the case of attach failure, cpupm_enabled will be false.
- */
-boolean_t
-cpudrv_pm_enabled()
+void
+cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp)
{
- return (B_TRUE);
+ int *speeds;
+ uint_t nspeeds;
+
+ CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
+ if (nspeeds == 0)
+ return;
+ cpupm_set_supp_freqs(cpudsp->cp, speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
}
diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c
index c44d8067ee..8ba9aa3b6e 100644
--- a/usr/src/uts/sun4u/os/cmp.c
+++ b/usr/src/uts/sun4u/os/cmp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
@@ -224,10 +222,16 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
}
}
-int
-pg_plat_hw_level(pghw_type_t hw)
+/*
+ * Rank the relative importance of optimizing for hw1 or hw2
+ */
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
int i;
+ int rank1 = 0;
+ int rank2 = 0;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_CHIP,
@@ -236,40 +240,28 @@ pg_plat_hw_level(pghw_type_t hw)
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_CHIP)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+/* ARGSUSED */
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+	/* Accept the default policies */
+ return (CMT_NO_POLICY);
}
id_t
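
The net effect of replacing pg_plat_hw_level() with pg_plat_hw_rank() is that the platform, not the caller, decides which of two sharing relationships is more important: a larger index in hw_hier[] wins, and on a tie the second argument is returned. A sketch of the before/after calling pattern (illustrative only, not code from this commit; example_pick_hw() is an invented name):

#include <sys/pghw.h>

/*
 * Old pattern: callers compared integer levels themselves, e.g.
 *
 *	winner = (pg_plat_hw_level(hw1) > pg_plat_hw_level(hw2)) ?
 *	    hw1 : hw2;
 *
 * New pattern: the platform answers the question directly.
 */
static pghw_type_t
example_pick_hw(pghw_type_t hw1, pghw_type_t hw2)
{
	/* whichever relationship is more important to optimize for */
	return (pg_plat_hw_rank(hw1, hw2));
}
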
diff --git a/usr/src/uts/sun4u/os/mach_startup.c b/usr/src/uts/sun4u/os/mach_startup.c
index de59d089fc..0484b9b049 100644
--- a/usr/src/uts/sun4u/os/mach_startup.c
+++ b/usr/src/uts/sun4u/os/mach_startup.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -403,7 +403,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
}
void
-mach_cpu_halt_idle()
+mach_cpu_halt_idle(void)
{
if (enable_halt_idle_cpus) {
if (&cpu_halt_cpu) {
diff --git a/usr/src/uts/sun4u/sys/cpudrv_mach.h b/usr/src/uts/sun4u/sys/cpudrv_mach.h
index 617e35b290..f1714fc695 100644
--- a/usr/src/uts/sun4u/sys/cpudrv_mach.h
+++ b/usr/src/uts/sun4u/sys/cpudrv_mach.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,44 +38,32 @@ extern "C" {
* take cross calls (cross calls fail silently if CPU is not ready
* for it).
*/
-#define CPUDRV_PM_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid))
-
-/*
- * If a failure occurs during attach(), then CPU power management
- * is disabled.
- */
-extern boolean_t cpudrv_enabled;
-
-#define CPUDRV_PM_DISABLE() (cpudrv_enabled = B_FALSE)
-
-#define CPUDRV_PM_DISABLED() (!cpudrv_enabled)
-
-#define CPUDRV_PM_POWER_ENABLED(cpudsp) cpudrv_pm_enabled()
+#define CPUDRV_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid))
/*
 * Currently, there is no governor on sun4u.
*/
-#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm)
+#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm)
/*
* Currently, there is no need for a handler on sun4u.
*/
-#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip)
+#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpuid)
/*
* Topspeed is always the head speed.
*/
-#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->head_spd
+#define CPUDRV_TOPSPEED(cpupm) (cpupm)->head_spd
/*
* There is no notion of changing topspeed on sun4u.
*/
-#define CPUDRV_PM_REDEFINE_TOPSPEED(dip)
+#define CPUDRV_REDEFINE_TOPSPEED(dip)
/*
* There are no PPM callbacks for sun4u.
*/
-#define CPUDRV_PM_SET_PPM_CALLBACKS()
+#define CPUDRV_SET_PPM_CALLBACKS()
/*
* clock-divisors property tells the supported speeds
@@ -84,33 +72,36 @@ extern boolean_t cpudrv_enabled;
* property value of "1, 2, 32" represents full, 1/2 and 1/32
* speeds.
*/
-#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) { \
+#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) { \
if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, cpudsp->dip, \
DDI_PROP_DONTPASS, "clock-divisors", &speeds, \
&nspeeds) != DDI_PROP_SUCCESS) { \
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: " \
+ nspeeds = 0; \
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: " \
"clock-divisors property not defined\n", \
- ddi_get_instance(cpudsp->dip))); \
- return (DDI_FAILURE); \
+ ddi_get_instance(cpudsp->dip))); \
} \
}
-#define CPUDRV_PM_FREE_SPEEDS(speeds, unused) ddi_prop_free(speeds);
+#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) { \
+ if (nspeeds > 0) \
+ ddi_prop_free(speeds); \
+}
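
Behavioral note: a CPU node without a "clock-divisors" property no longer causes an early DDI_FAILURE; CPUDRV_GET_SPEEDS() now just reports nspeeds == 0 and CPUDRV_FREE_SPEEDS() frees the property buffer only if something was looked up. A sketch of the resulting calling pattern (cpudrv_example_set_freqs() is an invented name; the real caller is cpudrv_set_supp_freqs() shown earlier in this diff):

#include <sys/cpupm.h>
#include <sys/cpudrv.h>
#include <sys/cpudrv_mach.h>

static void
cpudrv_example_set_freqs(cpudrv_devstate_t *cpudsp)
{
	int	*speeds;
	uint_t	nspeeds;

	CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
	if (nspeeds == 0)
		return;		/* property absent: nothing to advertise */
	cpupm_set_supp_freqs(cpudsp->cp, speeds, nspeeds);
	CPUDRV_FREE_SPEEDS(speeds, nspeeds);
}
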
/*
* Convert speed to Hz.
*/
-#define CPUDRV_PM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
+#define CPUDRV_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
/*
* Compute the idle cnt percentage for a given speed.
*/
-#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \
(100 - ((100 - hwm) * speeds[i]))
/*
* Compute the user cnt percentage for a given speed.
*/
-#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \
((hwm * speeds[i - 1]) / speeds[i])
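
A worked example of the arithmetic (all numbers hypothetical, not taken from this change): for a 1200 MHz CPU with clock-divisors of 1, 2 and 32 and an idle high-water mark of 98, CPUDRV_SPEED_HZ(1200, 2) is 600000000, CPUDRV_IDLE_CNT_PERCENT(98, speeds, 1) is 96 and CPUDRV_USER_CNT_PERCENT(98, speeds, 1) is 49. The same arithmetic as a small standalone program:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Same expressions as the kernel macros, restated for illustration. */
#define	SPEED_HZ(mhz, divisor)		(((uint64_t)(mhz) * 1000000) / (divisor))
#define	IDLE_CNT_PERCENT(hwm, speeds, i)	(100 - ((100 - (hwm)) * (speeds)[i]))
#define	USER_CNT_PERCENT(hwm, speeds, i)	(((hwm) * (speeds)[i - 1]) / (speeds)[i])

int
main(void)
{
	int speeds[] = { 1, 2, 32 };	/* hypothetical clock-divisors */
	int hwm = 98;			/* hypothetical high-water mark */

	(void) printf("%" PRIu64 "\n", SPEED_HZ(1200, speeds[1]));	/* 600000000 */
	(void) printf("%d\n", IDLE_CNT_PERCENT(hwm, speeds, 1));	/* 96 */
	(void) printf("%d\n", USER_CNT_PERCENT(hwm, speeds, 1));	/* 49 */
	return (0);
}
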
/*
@@ -128,23 +119,21 @@ extern boolean_t cpudrv_enabled;
* digits for power level + '=' + '1/' + digits for speed +
* description text + '\0'
*/
-#define CPUDRV_PM_COMP_NORMAL "Normal"
-#define CPUDRV_PM_COMP_OTHER " of Normal"
-#define CPUDRV_PM_COMP_SIZE() \
- (CPUDRV_PM_COMP_MAX_DIG + 1 + 2 + CPUDRV_PM_COMP_MAX_DIG + \
- sizeof (CPUDRV_PM_COMP_OTHER) + 1);
-#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) \
+#define CPUDRV_COMP_NORMAL "Normal"
+#define CPUDRV_COMP_OTHER " of Normal"
+#define CPUDRV_COMP_SIZE() \
+ (CPUDRV_COMP_MAX_DIG + 1 + 2 + CPUDRV_COMP_MAX_DIG + \
+ sizeof (CPUDRV_COMP_OTHER) + 1);
+#define CPUDRV_COMP_SPEED(cpupm, cur_spd) \
((cur_spd == cpupm->head_spd) ? cur_spd->pm_level : cur_spd->speed)
-#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \
+#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \
if (cur_spd == cpupm->head_spd) \
- (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_PM_COMP_NORMAL);\
+ (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_COMP_NORMAL);\
else \
(void) sprintf(pmc, "%d=1/%d%s", cur_spd->pm_level, \
- comp_spd, CPUDRV_PM_COMP_OTHER); \
+ comp_spd, CPUDRV_COMP_OTHER); \
}
-extern boolean_t cpudrv_pm_enabled(void);
-
#ifdef __cplusplus
}
#endif
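
For context, these macros build the driver's pm-components strings. For a hypothetical three-speed CPU with clock-divisors 1, 2 and 32 and power levels 3, 2 and 1, the entries come out as "3=Normal", "2=1/2 of Normal" and "1=1/32 of Normal". The fragment below mimics CPUDRV_COMP_SPRINT() in userland to show the formatting (values invented for illustration):

#include <stdio.h>

int
main(void)
{
	char	buf[32];
	int	levels[] = { 3, 2, 1 };		/* hypothetical pm levels */
	int	divisors[] = { 1, 2, 32 };	/* hypothetical clock-divisors */
	int	i;

	for (i = 0; i < 3; i++) {
		if (i == 0)	/* head (full) speed */
			(void) sprintf(buf, "%d=%s", levels[i], "Normal");
		else
			(void) sprintf(buf, "%d=1/%d%s", levels[i],
			    divisors[i], " of Normal");
		(void) puts(buf);
	}
	return (0);
}
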
diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c
index 681afab583..4e80f06f32 100644
--- a/usr/src/uts/sun4v/os/cmp.c
+++ b/usr/src/uts/sun4v/os/cmp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/cmp.h>
@@ -132,16 +130,15 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
}
/*
- * Order the relevant hw sharing relationships
- * from least, to greatest physical scope.
- *
- * The hierarchy *must* be defined for all hw that
- * pg_plat_hw_shared() returns non-zero.
+ * Rank the relative importance of optimizing for hw1 or hw2
*/
-int
-pg_plat_hw_level(pghw_type_t hw)
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
int i;
+ int rank1 = 0;
+ int rank2 = 0;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_FPU,
@@ -150,40 +147,27 @@ pg_plat_hw_level(pghw_type_t hw)
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_MPIPE)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+/* ARGSUSED */
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+ /* Accept the default policies */
+ return (CMT_NO_POLICY);
}
id_t
@@ -213,7 +197,7 @@ pg_cmt_load_bal_hw(pghw_type_t hw)
return (0);
}
/*
- * Return 1 if thread affinity polices should be implemented
+ * Return 1 if thread affinity policies should be implemented
 * for instances of the specified hardware sharing relationship.
*/
int
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index ba9c4898d1..82069505a4 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -270,7 +270,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
}
void
-mach_cpu_halt_idle()
+mach_cpu_halt_idle(void)
{
if (enable_halt_idle_cpus) {
idle_cpu = cpu_halt;
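
The mach_cpu_halt_idle() change is only about the prototype: in C, an empty parameter list on a definition leaves the arguments unspecified, whereas (void) declares that the function takes none, so the compiler can reject accidental callers. A minimal illustration (not from this commit):

/*
 * Old style: unspecified parameters; a mismatched call is undefined
 * behavior but typically not diagnosed.
 */
void old_style() { }

/* New style: a real prototype; a mismatched call is a compile error. */
void new_style(void) { }

int
main(void)
{
	old_style();
	new_style();
	/* new_style(42); would now be rejected by the compiler */
	return (0);
}
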