author      Eric Saxe <Eric.Saxe@Sun.COM>    2009-02-25 21:04:18 -0800
committer   Eric Saxe <Eric.Saxe@Sun.COM>    2009-02-25 21:04:18 -0800
commit      0e7515250c8395f368aa45fb9acae7c4f8f8b786 (patch)
tree        5c3abde4ff53a950ad424ce362fd793369c06872 /usr/src
parent      9a5d73e03cd3312ddb571a748c40a63c58bd66e5 (diff)
download    illumos-joyent-0e7515250c8395f368aa45fb9acae7c4f8f8b786.tar.gz
PSARC 2008/777 cpupm keyword mode extensions
PSARC 2008/663 CPU Deep Idle Keyword
6567156 bring CPU power awareness to the dispatcher
6700904 deeper C-State support required on follow-ons to Intel Penryn processor generation microarchitecture
6805661 cmt_root may contain duplicates on UMA systems
--HG--
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c => usr/src/uts/i86pc/io/cpudrv_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c => usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c => usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c => usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
rename : usr/src/uts/i86pc/os/cpupm.c => usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
rename : usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c => usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
rename : usr/src/uts/i86pc/io/cpudrv/pwrnow.c => usr/src/uts/i86pc/os/cpupm/pwrnow.c
rename : usr/src/uts/i86pc/io/cpudrv/speedstep.c => usr/src/uts/i86pc/os/cpupm/speedstep.c
rename : usr/src/uts/i86pc/sys/cpupm.h => usr/src/uts/i86pc/sys/cpupm_mach.h
rename : usr/src/uts/i86pc/sys/cpudrv_throttle.h => usr/src/uts/i86pc/sys/cpupm_throttle.h
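As background for the cmd/power changes in this diff, the extended keyword forms accepted in /etc/power.conf would be written roughly as follows (illustrative lines, not taken from the commit; see the handlers.c and parse.c hunks below):

# /etc/power.conf (illustrative)
# cpupm now takes an optional mode: enable [poll-mode | event-mode], or disable
cpupm enable event-mode
# the new cpu_deep_idle keyword takes enable, disable, or default
cpu_deep_idle enable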
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/pg.c27
-rw-r--r--usr/src/cmd/power/handlers.c65
-rw-r--r--usr/src/cmd/power/parse.c7
-rw-r--r--usr/src/cmd/power/pmconfig.h5
-rw-r--r--usr/src/cmd/powertop/cpufreq.c12
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com1
-rw-r--r--usr/src/uts/common/Makefile.files2
-rw-r--r--usr/src/uts/common/conf/param.c6
-rw-r--r--usr/src/uts/common/disp/cmt.c1030
-rw-r--r--usr/src/uts/common/disp/cmt_policy.c229
-rw-r--r--usr/src/uts/common/disp/disp.c27
-rw-r--r--usr/src/uts/common/io/cpudrv.c382
-rw-r--r--usr/src/uts/common/io/pm.c130
-rw-r--r--usr/src/uts/common/os/cpu.c26
-rw-r--r--usr/src/uts/common/os/cpu_pm.c840
-rw-r--r--usr/src/uts/common/os/cpupm.c67
-rw-r--r--usr/src/uts/common/os/group.c37
-rw-r--r--usr/src/uts/common/os/pg.c120
-rw-r--r--usr/src/uts/common/os/pghw.c73
-rw-r--r--usr/src/uts/common/os/sunpm.c11
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/callb.h12
-rw-r--r--usr/src/uts/common/sys/cmt.h62
-rw-r--r--usr/src/uts/common/sys/cpu_pm.h139
-rw-r--r--usr/src/uts/common/sys/cpudrv.h62
-rw-r--r--usr/src/uts/common/sys/cpupm.h43
-rw-r--r--usr/src/uts/common/sys/cpuvar.h2
-rw-r--r--usr/src/uts/common/sys/epm.h29
-rw-r--r--usr/src/uts/common/sys/group.h16
-rw-r--r--usr/src/uts/common/sys/pg.h46
-rw-r--r--usr/src/uts/common/sys/pghw.h39
-rw-r--r--usr/src/uts/common/sys/pm.h11
-rw-r--r--usr/src/uts/i86pc/Makefile.files19
-rw-r--r--usr/src/uts/i86pc/Makefile.rules14
-rw-r--r--usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c516
-rw-r--r--usr/src/uts/i86pc/io/cpudrv_mach.c287
-rw-r--r--usr/src/uts/i86pc/io/hpet_acpi.c1388
-rw-r--r--usr/src/uts/i86pc/io/mp_platform_common.c56
-rw-r--r--usr/src/uts/i86pc/io/pcplusmp/apic.c38
-rw-r--r--usr/src/uts/i86pc/io/ppm_plat.c46
-rw-r--r--usr/src/uts/i86pc/os/cpuid.c48
-rw-r--r--usr/src/uts/i86pc/os/cpupm.c247
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpu_acpi.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c)264
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpu_idle.c877
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_amd.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c)13
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_intel.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c)62
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_mach.c928
-rw-r--r--usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c (renamed from usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c)235
-rw-r--r--usr/src/uts/i86pc/os/cpupm/pwrnow.c (renamed from usr/src/uts/i86pc/io/cpudrv/pwrnow.c)99
-rw-r--r--usr/src/uts/i86pc/os/cpupm/speedstep.c (renamed from usr/src/uts/i86pc/io/cpudrv/speedstep.c)113
-rw-r--r--usr/src/uts/i86pc/os/mlsetup.c4
-rw-r--r--usr/src/uts/i86pc/os/mp_machdep.c166
-rw-r--r--usr/src/uts/i86pc/os/mp_startup.c42
-rw-r--r--usr/src/uts/i86pc/os/startup.c7
-rw-r--r--usr/src/uts/i86pc/sys/cpu_acpi.h68
-rw-r--r--usr/src/uts/i86pc/sys/cpu_idle.h72
-rw-r--r--usr/src/uts/i86pc/sys/cpudrv_mach.h141
-rw-r--r--usr/src/uts/i86pc/sys/cpupm.h89
-rw-r--r--usr/src/uts/i86pc/sys/cpupm_mach.h197
-rw-r--r--usr/src/uts/i86pc/sys/cpupm_throttle.h43
-rw-r--r--usr/src/uts/i86pc/sys/hpet.h80
-rw-r--r--usr/src/uts/i86pc/sys/hpet_acpi.h334
-rw-r--r--usr/src/uts/i86pc/sys/machcpuvar.h13
-rw-r--r--usr/src/uts/i86pc/sys/machsystm.h10
-rw-r--r--usr/src/uts/i86pc/sys/pwrnow.h8
-rw-r--r--usr/src/uts/i86pc/sys/speedstep.h8
-rw-r--r--usr/src/uts/i86xpv/Makefile.files3
-rw-r--r--usr/src/uts/intel/ia32/ml/modstubs.s20
-rw-r--r--usr/src/uts/intel/io/acpica/osl.c49
-rw-r--r--usr/src/uts/intel/sys/acpica.h6
-rw-r--r--usr/src/uts/intel/sys/x86_archext.h8
-rw-r--r--usr/src/uts/sun4/Makefile.files5
-rw-r--r--usr/src/uts/sun4/os/cpupm_mach.c51
-rw-r--r--usr/src/uts/sun4/os/mlsetup.c6
-rw-r--r--usr/src/uts/sun4/os/startup.c3
-rw-r--r--usr/src/uts/sun4/sys/cpupm_mach.h (renamed from usr/src/uts/i86pc/sys/cpudrv_throttle.h)15
-rw-r--r--usr/src/uts/sun4u/Makefile.files3
-rw-r--r--usr/src/uts/sun4u/cpu/spitfire.c3
-rw-r--r--usr/src/uts/sun4u/cpu/us3_cheetah.c4
-rw-r--r--usr/src/uts/sun4u/cpu/us3_cheetahplus.c4
-rw-r--r--usr/src/uts/sun4u/cpu/us3_jalapeno.c4
-rw-r--r--usr/src/uts/sun4u/io/cpudrv_mach.c40
-rw-r--r--usr/src/uts/sun4u/os/cmp.c54
-rw-r--r--usr/src/uts/sun4u/os/mach_startup.c4
-rw-r--r--usr/src/uts/sun4u/sys/cpudrv_mach.h65
-rw-r--r--usr/src/uts/sun4v/os/cmp.c60
-rw-r--r--usr/src/uts/sun4v/os/mach_startup.c4
87 files changed, 8342 insertions, 2160 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/pg.c b/usr/src/cmd/mdb/common/modules/genunix/pg.c
index 4e36430f04..60b4fba431 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/pg.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/pg.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Display processor group information
*/
@@ -34,6 +32,7 @@
#include <mdb/mdb_modapi.h>
#include <sys/pghw.h>
+#include <sys/cmt.h>
/*
* PG hardware types indexed by hardware ID
@@ -46,6 +45,8 @@ char *pg_hw_names[] = {
"mpipe",
"chip",
"memory",
+ "active_pwr",
+ "idle_pwr",
};
#define A_CNT(arr) (sizeof (arr) / sizeof (arr[0]))
@@ -70,8 +71,10 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
pg_t pg;
pghw_t pghw;
+ pg_cmt_t pg_cmt;
pg_class_t pg_class;
int opt_q = 0; /* display only address. */
+ int is_cmt = 0; /* This is CMT pg */
/* Should provide an address */
if (! (flags & DCMD_ADDRSPEC))
@@ -86,13 +89,14 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
opt_q = B_TRUE;
if (DCMD_HDRSPEC(flags) && !opt_q) {
- mdb_printf("%6s %?s %6s %7s %9s %5s\n",
+ mdb_printf("%6s %?s %6s %7s %11s %5s %5s\n",
"PGID",
"ADDR",
"PHYSID",
"CLASS",
"HARDWARE",
- "#CPUs");
+ "#CPUs",
+ "LOAD");
}
/*
@@ -111,6 +115,14 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
+ if (strcmp(pg_class.pgc_name, "cmt") == 0) {
+ if (mdb_vread(&pg_cmt, sizeof (pg_cmt_t), addr) == -1) {
+ mdb_warn("unable to read 'cmt pg' at %p", addr);
+ return (DCMD_ERR);
+ }
+ is_cmt = 1;
+ }
+
if (mdb_vread(&pg_class, sizeof (struct pg_class),
(uintptr_t)pg.pg_class) == -1) {
mdb_warn("unable to read 'pg_class' at %p", pg.pg_class);
@@ -125,10 +137,11 @@ pg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
/*
* Display the physical PG info.
*/
- mdb_printf("%6d %?p %6d %7s %9s %5d\n",
+ mdb_printf("%6d %?p %6d %7s %11s %5d %5d\n",
pg.pg_id, addr, pghw.pghw_instance,
pg_class.pgc_name, pg_hw_name(pghw.pghw_hw),
- pg.pg_cpus.grp_size);
+ pg.pg_cpus.grp_size,
+ is_cmt ? pg_cmt.cmt_utilization : 0);
} else {
/*
* Display the basic PG info.
diff --git a/usr/src/cmd/power/handlers.c b/usr/src/cmd/power/handlers.c
index f5fa621c0c..5d2d51851c 100644
--- a/usr/src/cmd/power/handlers.c
+++ b/usr/src/cmd/power/handlers.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "pmconfig.h"
#include <sys/mkdev.h>
#include <sys/syslog.h>
@@ -120,14 +118,66 @@ do_ioctl(int ioctl_cmd, char *keyword, char *behavior, int suppress)
int
cpupm(void)
{
+ struct bmtoc {
+ char *behavior;
+ char *mode;
+ int cmd;
+ int Errno;
+ };
+
+ static struct bmtoc bmlist[] = {
+ "disable", "\0", PM_STOP_CPUPM, EINVAL,
+ "enable", "poll-mode", PM_START_CPUPM_POLL, EBUSY,
+ "enable", "event-mode", PM_START_CPUPM_EV, EBUSY,
+ "enable", "\0", PM_START_CPUPM, EBUSY,
+ NULL, 0, 0, 0
+ };
+ struct bmtoc *bp;
+ char *behavior;
+ char *mode;
+
+ behavior = LINEARG(1);
+ if ((mode = LINEARG(2)) == NULL)
+ mode = "\0";
+
+ for (bp = bmlist; bp->cmd; bp++) {
+ if (strcmp(behavior, bp->behavior) == 0 &&
+ strcmp(mode, bp->mode) == 0) {
+ break;
+ }
+ }
+ if (bp->cmd == 0) {
+ if (LINEARG(2) == NULL) {
+ mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior);
+ } else {
+ mesg(MERR, "invalid cpupm behavior \"%s %s\"\n",
+ behavior, mode);
+ }
+ return (NOUP);
+ }
+ if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) {
+ mesg(MERR, "cpupm %s failed, %s\n",
+ behavior, strerror(errno));
+ return (NOUP);
+ }
+ return (OKUP);
+}
+
+/*
+ * Check for valid cpu_deep_idle option and communicate it to the kernel.
+ */
+int
+cpuidle(void)
+{
struct btoc {
char *behavior;
int cmd;
int Errno;
};
static struct btoc blist[] = {
- "disable", PM_STOP_CPUPM, EINVAL,
- "enable", PM_START_CPUPM, EBUSY,
+ "disable", PM_DISABLE_CPU_DEEP_IDLE, EINVAL,
+ "enable", PM_ENABLE_CPU_DEEP_IDLE, EBUSY,
+ "default", PM_DEFAULT_CPU_DEEP_IDLE, EBUSY,
NULL, 0, 0
};
struct btoc *bp;
@@ -138,18 +188,17 @@ cpupm(void)
break;
}
if (bp->cmd == 0) {
- mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior);
+ mesg(MERR, "invalid cpu_deep_idle behavior \"%s\"\n", behavior);
return (NOUP);
}
if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) {
- mesg(MERR, "cpupm %s failed, %s\n",
+ mesg(MERR, "cpu_deep_idle %s failed, %s\n",
behavior, strerror(errno));
return (NOUP);
}
return (OKUP);
}
-
/*
* Two decisions are identical except for the list names and ioctl commands
* inputs: whitelist, blacklist, yes, no
diff --git a/usr/src/cmd/power/parse.c b/usr/src/cmd/power/parse.c
index e7adff4d18..5ea845653d 100644
--- a/usr/src/cmd/power/parse.c
+++ b/usr/src/cmd/power/parse.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "pmconfig.h"
#include <deflt.h>
#include <pwd.h>
@@ -58,7 +56,8 @@ static cinfo_t conftab[] = {
"autopm", autopm, &pm_status, NULL, 2, 0, 1,
"autoshutdown", autosd, &cpr_status, as_cmt, 5, 0, 1,
"cpu-threshold", cputhr, &pm_status, NULL, 2, 0, 1,
- "cpupm", cpupm, &pm_status, NULL, 2, 0, 1,
+ "cpu_deep_idle", cpuidle, &pm_status, NULL, 2, 0, 1,
+ "cpupm", cpupm, &pm_status, NULL, 2, 1, 1,
"device-dependency-property",
ddprop, &pm_status, NULL, 3, 1, 1,
"device-dependency", devdep, &pm_status, NULL, 3, 1, 1,
diff --git a/usr/src/cmd/power/pmconfig.h b/usr/src/cmd/power/pmconfig.h
index 33f26b63df..e03c434ac2 100644
--- a/usr/src/cmd/power/pmconfig.h
+++ b/usr/src/cmd/power/pmconfig.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PMCONFIG_H
#define _PMCONFIG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -126,6 +124,7 @@ extern int autoS3(void);
extern int autopm(void);
extern int autosd(void);
extern int cpupm(void);
+extern int cpuidle(void);
extern int cputhr(void);
extern int ddprop(void);
extern int devdep(void);
diff --git a/usr/src/cmd/powertop/cpufreq.c b/usr/src/cmd/powertop/cpufreq.c
index 18bd393665..9537ce6c65 100644
--- a/usr/src/cmd/powertop/cpufreq.c
+++ b/usr/src/cmd/powertop/cpufreq.c
@@ -71,18 +71,18 @@ static const char *pt_cpufreq_dtrace_prog =
"}"
""
":::cpu-change-speed"
-"/last[((cpudrv_devstate_t *)arg0)->cpu_id] != 0/"
+"/last[(processorid_t)arg0] != 0/"
"{"
-" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;"
-" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;"
+" this->cpu = (processorid_t)arg0;"
+" this->oldspeed = (uint32_t)(arg1/1000000);"
" @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
" last[this->cpu] = timestamp;"
"}"
":::cpu-change-speed"
-"/last[((cpudrv_devstate_t *)arg0)->cpu_id] == 0/"
+"/last[(processorid_t)arg0] == 0/"
"{"
-" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;"
-" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;"
+" this->cpu = (processorid_t)arg0;"
+" this->oldspeed = (uint32_t)(arg1/1000000);"
" @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
" last[this->cpu] = timestamp;"
"}";
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index 555f28921c..3cc32ddd3d 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -711,6 +711,7 @@ f none usr/include/sys/cpr.h 644 root bin
f none usr/include/sys/cpu.h 644 root bin
f none usr/include/sys/cpupart.h 644 root bin
f none usr/include/sys/cpuvar.h 644 root bin
+f none usr/include/sys/cpu_pm.h 644 root bin
f none usr/include/sys/crc32.h 644 root bin
f none usr/include/sys/cred.h 644 root bin
f none usr/include/sys/cred_impl.h 644 root bin
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 19f0512969..d123becc90 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -41,8 +41,10 @@ COMMON_CORE_OBJS += \
brand.o \
cpucaps.o \
cmt.o \
+ cmt_policy.o \
cpu.o \
cpu_intr.o \
+ cpu_pm.o \
cpupart.o \
disp.o \
group.o \
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index 5831545a33..ceecf32ee8 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -190,9 +190,6 @@ extern void deadman_init(void);
extern void clock_timer_init(void);
extern void clock_realtime_init(void);
extern void clock_highres_init(void);
-extern void pg_init(void);
-extern void pg_cmt_class_init(void);
-extern void pg_cpu0_init(void);
extern void clock_tick_mp_init(void);
extern void callout_mp_init(void);
extern void cpu_seq_tbl_init(void);
@@ -214,9 +211,6 @@ void (*init_tbl[])(void) = {
segvn_init,
flk_init,
cpu_seq_tbl_init,
- pg_init,
- pg_cmt_class_init,
- pg_cpu0_init,
schedctl_init,
fdb_init,
deadman_init,
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 0fef28ff15..06c349c9b2 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -39,6 +39,7 @@
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
+#include <sys/cpu_pm.h>
/*
* CMT scheduler / dispatcher support
@@ -58,11 +59,12 @@
*
* The scheduler/dispatcher leverages knowledge of the performance
* relevant CMT sharing relationships existing between cpus to implement
- * optimized affinity and load balancing policies.
+ * optimized affinity, load balancing, and coalescence policies.
*
* Load balancing policy seeks to improve performance by minimizing
- * contention over shared processor resources / facilities, while the
- * affinity policies seek to improve cache and TLB utilization.
+ * contention over shared processor resources / facilities. Affinity
+ * policies seek to improve cache and TLB utilization. Coalescence
+ * policies improve resource utilization and ultimately power efficiency.
*
* The CMT PGs created by this class are already arranged into a
* hierarchy (which is done in the pghw layer). To implement the top-down
@@ -79,25 +81,24 @@
* balancng across the CMT PGs within their respective (per lgroup) top level
* groups.
*/
-typedef struct cmt_lgrp {
- group_t cl_pgs; /* Top level group of active CMT PGs */
- int cl_npgs; /* # of top level PGs in the lgroup */
- lgrp_handle_t cl_hand; /* lgroup's platform handle */
- struct cmt_lgrp *cl_next; /* next cmt_lgrp */
-} cmt_lgrp_t;
-
static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */
static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */
/* used for null_proc_lpa */
-static cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
+cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
static int is_cpu0 = 1; /* true if this is boot CPU context */
/*
+ * Array of hardware sharing relationships that are blacklisted.
+ * PGs won't be instantiated for blacklisted hardware sharing relationships.
+ */
+static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
+
+/*
* Set this to non-zero to disable CMT scheduling
* This must be done via kmdb -d, as /etc/system will be too late
*/
-static int cmt_sched_disabled = 0;
+int cmt_sched_disabled = 0;
static pg_cid_t pg_cmt_class_id; /* PG class id */
@@ -109,11 +110,19 @@ static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
-static void pg_cmt_hier_pack(void **, int);
+static char *pg_cmt_policy_name(pg_t *);
+static void pg_cmt_hier_sort(pg_cmt_t **, int);
+static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
+static int pg_cmt_lineage_validate(pg_cmt_t **, int *);
+static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
+ kthread_t *, kthread_t *);
+static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
+ kthread_t *, kthread_t *);
+static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
/*
* Macro to test if PG is managed by the CMT PG class
@@ -121,6 +130,29 @@ static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
#define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
/*
+ * Status codes for CMT lineage validation
+ * See pg_cmt_lineage_validate() below
+ */
+typedef enum cmt_lineage_validation {
+ CMT_LINEAGE_VALID,
+ CMT_LINEAGE_NON_CONCENTRIC,
+ CMT_LINEAGE_REPAIRED,
+ CMT_LINEAGE_UNRECOVERABLE
+} cmt_lineage_validation_t;
+
+/*
+ * Status of the current lineage under construction.
+ * One must be holding cpu_lock to change this.
+ */
+static cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID;
+
+/*
+ * Power domain definitions (on x86) are defined by ACPI, and
+ * therefore may be subject to BIOS bugs.
+ */
+#define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw)
+
+/*
* CMT PG ops
*/
struct pg_ops pg_ops_cmt = {
@@ -134,6 +166,7 @@ struct pg_ops pg_ops_cmt = {
NULL, /* cpupart_out */
pg_cmt_cpupart_move,
pg_cmt_cpu_belongs,
+ pg_cmt_policy_name,
};
/*
@@ -156,25 +189,8 @@ pg_cmt_class_init(void)
void
pg_cmt_cpu_startup(cpu_t *cp)
{
- PG_NRUN_UPDATE(cp, 1);
-}
-
-/*
- * Adjust the CMT load in the CMT PGs in which the CPU belongs
- * Note that "n" can be positive in the case of increasing
- * load, or negative in the case of decreasing load.
- */
-void
-pg_cmt_load(cpu_t *cp, int n)
-{
- pg_cmt_t *pg;
-
- pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
- while (pg != NULL) {
- ASSERT(IS_CMT_PG(pg));
- atomic_add_32(&pg->cmt_nrunning, n);
- pg = pg->cmt_parent;
- }
+ pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
+ cp->cpu_thread);
}
/*
@@ -212,14 +228,219 @@ pg_cmt_free(pg_t *pg)
}
/*
- * Return 1 if CMT scheduling policies should be impelmented
- * for the specified hardware sharing relationship.
+ * Given a hardware sharing relationship, return which dispatcher
+ * policies should be implemented to optimize performance and efficiency
*/
-static int
-pg_cmt_hw(pghw_type_t hw)
+static pg_cmt_policy_t
+pg_cmt_policy(pghw_type_t hw)
+{
+ pg_cmt_policy_t p;
+
+ /*
+ * Give the platform a chance to override the default
+ */
+ if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
+ return (p);
+
+ switch (hw) {
+ case PGHW_IPIPE:
+ case PGHW_FPU:
+ case PGHW_CHIP:
+ return (CMT_BALANCE);
+ case PGHW_CACHE:
+ return (CMT_AFFINITY);
+ case PGHW_POW_ACTIVE:
+ case PGHW_POW_IDLE:
+ return (CMT_BALANCE);
+ default:
+ return (CMT_NO_POLICY);
+ }
+}
+
+/*
+ * Rank the importance of optimizing for the pg1 relationship vs.
+ * the pg2 relationship.
+ */
+static pg_cmt_t *
+pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
+{
+ pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
+ pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
+
+ /*
+ * A power domain is only important if CPUPM is enabled.
+ */
+ if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
+ if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
+ return (pg2);
+ if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
+ return (pg1);
+ }
+
+ /*
+ * Otherwise, ask the platform
+ */
+ if (pg_plat_hw_rank(hw1, hw2) == hw1)
+ return (pg1);
+ else
+ return (pg2);
+}
+
+/*
+ * Initialize CMT callbacks for the given PG
+ */
+static void
+cmt_callback_init(pg_t *pg)
{
- return (pg_plat_cmt_load_bal_hw(hw) ||
- pg_plat_cmt_affinity_hw(hw));
+ switch (((pghw_t *)pg)->pghw_hw) {
+ case PGHW_POW_ACTIVE:
+ pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
+ pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
+ break;
+ default:
+ pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
+
+ }
+}
+
+/*
+ * Promote PG above its current parent.
+ * This is only legal if PG has an equal or greater number of CPUs
+ * than its parent.
+ */
+static void
+cmt_hier_promote(pg_cmt_t *pg)
+{
+ pg_cmt_t *parent;
+ group_t *children;
+ cpu_t *cpu;
+ group_iter_t iter;
+ pg_cpu_itr_t cpu_iter;
+ int r;
+ int err;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ parent = pg->cmt_parent;
+ if (parent == NULL) {
+ /*
+ * Nothing to do
+ */
+ return;
+ }
+
+ ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
+
+ /*
+ * We're changing around the hierarchy, which is actively traversed
+ * by the dispatcher. Pause CPUs to ensure exclusivity.
+ */
+ pause_cpus(NULL);
+
+ /*
+ * If necessary, update the parent's sibling set, replacing parent
+ * with PG.
+ */
+ if (parent->cmt_siblings) {
+ if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
+ != -1) {
+ r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+ }
+
+ /*
+ * If the parent is at the top of the hierarchy, replace its entry
+ * in the root lgroup's group of top level PGs.
+ */
+ if (parent->cmt_parent == NULL &&
+ parent->cmt_siblings != &cmt_root->cl_pgs) {
+ if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
+ != -1) {
+ r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+ }
+
+ /*
+ * We assume (and therefore assert) that the PG being promoted is an
+ * only child of its parent. Update the parent's children set
+ * replacing PG's entry with the parent (since the parent is becoming
+ * the child). Then have PG and the parent swap children sets.
+ */
+ ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
+ if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
+ r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
+ ASSERT(r != -1);
+ }
+
+ children = pg->cmt_children;
+ pg->cmt_children = parent->cmt_children;
+ parent->cmt_children = children;
+
+ /*
+ * Update the sibling references for PG and its parent
+ */
+ pg->cmt_siblings = parent->cmt_siblings;
+ parent->cmt_siblings = pg->cmt_children;
+
+ /*
+ * Update any cached lineages in the per CPU pg data.
+ */
+ PG_CPU_ITR_INIT(pg, cpu_iter);
+ while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
+ int idx;
+ group_t *pgs;
+ pg_cmt_t *cpu_pg;
+
+ /*
+ * Iterate over the CPU's PGs updating the children
+ * of the PG being promoted, since they have a new parent.
+ */
+ pgs = &cpu->cpu_pg->pgs;
+ group_iter_init(&iter);
+ while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
+ if (cpu_pg->cmt_parent == pg) {
+ cpu_pg->cmt_parent = parent;
+ }
+ }
+
+ /*
+ * Update the CMT load balancing lineage
+ */
+ pgs = &cpu->cpu_pg->cmt_pgs;
+ if ((idx = group_find(pgs, (void *)pg)) == -1) {
+ /*
+ * Unless this is the CPU whose lineage is being
+ * constructed, the PG being promoted should be
+ * in the lineage.
+ */
+ ASSERT(GROUP_SIZE(pgs) == 0);
+ continue;
+ }
+
+ ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
+ ASSERT(idx > 0);
+
+ /*
+ * Have the child and the parent swap places in the CPU's
+ * lineage
+ */
+ group_remove_at(pgs, idx);
+ group_remove_at(pgs, idx - 1);
+ err = group_add_at(pgs, parent, idx);
+ ASSERT(err == 0);
+ err = group_add_at(pgs, pg, idx - 1);
+ ASSERT(err == 0);
+ }
+
+ /*
+ * Update the parent references for PG and its parent
+ */
+ pg->cmt_parent = parent->cmt_parent;
+ parent->cmt_parent = pg;
+
+ start_cpus();
}
/*
@@ -230,7 +451,7 @@ pg_cmt_cpu_init(cpu_t *cp)
{
pg_cmt_t *pg;
group_t *cmt_pgs;
- int level, max_level, nlevels;
+ int levels, level;
pghw_type_t hw;
pg_t *pg_cache = NULL;
pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
@@ -239,26 +460,42 @@ pg_cmt_cpu_init(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
/*
* A new CPU is coming into the system.
* Interrogate the platform to see if the CPU
- * has any performance relevant CMT sharing
- * relationships
+ * has any performance or efficiency relevant
+ * sharing relationships
*/
cmt_pgs = &cp->cpu_pg->cmt_pgs;
cp->cpu_pg->cmt_lineage = NULL;
bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
- max_level = nlevels = 0;
+ levels = 0;
for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
+ pg_cmt_policy_t policy;
+
/*
- * We're only interested in CMT hw sharing relationships
+ * We're only interested in the hw sharing relationships
+ * for which we know how to optimize.
*/
- if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
+ policy = pg_cmt_policy(hw);
+ if (policy == CMT_NO_POLICY ||
+ pg_plat_hw_shared(cp, hw) == 0)
continue;
/*
+ * Continue if the hardware sharing relationship has been
+ * blacklisted.
+ */
+ if (cmt_hw_blacklisted[hw]) {
+ continue;
+ }
+
+ /*
* Find (or create) the PG associated with
* the hw sharing relationship in which cp
* belongs.
@@ -281,6 +518,11 @@ pg_cmt_cpu_init(cpu_t *cp)
* ... and CMT specific portions of the
* structure.
*/
+ pg->cmt_policy = policy;
+
+ /* CMT event callbacks */
+ cmt_callback_init((pg_t *)pg);
+
bitset_init(&pg->cmt_cpus_actv_set);
group_create(&pg->cmt_cpus_actv);
} else {
@@ -303,14 +545,10 @@ pg_cmt_cpu_init(cpu_t *cp)
}
/*
- * Build a lineage of CMT PGs for load balancing
+ * Build a lineage of CMT PGs for load balancing / coalescence
*/
- if (pg_plat_cmt_load_bal_hw(hw)) {
- level = pghw_level(hw);
- cpu_cmt_hier[level] = pg;
- if (level > max_level)
- max_level = level;
- nlevels++;
+ if (policy & (CMT_BALANCE | CMT_COALESCE)) {
+ cpu_cmt_hier[levels++] = pg;
}
/* Cache this for later */
@@ -318,44 +556,73 @@ pg_cmt_cpu_init(cpu_t *cp)
pg_cache = (pg_t *)pg;
}
- /*
- * Pack out any gaps in the constructed lineage,
- * then size it out.
- *
- * Gaps may exist where the architecture knows
- * about a hardware sharing relationship, but such a
- * relationship either isn't relevant for load
- * balancing or doesn't exist between CPUs on the system.
- */
- pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
- group_expand(cmt_pgs, nlevels);
-
+ group_expand(cmt_pgs, levels);
if (cmt_root == NULL)
cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
/*
- * Find the lgrp that encapsulates this CPU's CMT hierarchy.
- * and locate/create a suitable cmt_lgrp_t.
+ * Find the lgrp that encapsulates this CPU's CMT hierarchy
*/
lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
lgrp = pg_cmt_lgrp_create(lgrp_handle);
/*
+ * Ascendingly sort the PGs in the lineage by number of CPUs
+ */
+ pg_cmt_hier_sort(cpu_cmt_hier, levels);
+
+ /*
+ * Examine the lineage and validate it.
+ * This routine will also try to fix the lineage along with the
+ * rest of the PG hierarchy should it detect an issue.
+ *
+ * If it returns -1, an unrecoverable error has happened and we
+ * need to return.
+ */
+ if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
+ return;
+
+ /*
+ * For existing PGs in the lineage, verify that the parent is
+ * correct, as the generation in the lineage may have changed
+ * as a result of the sorting. Start the traversal at the top
+ * of the lineage, moving down.
+ */
+ for (level = levels - 1; level >= 0; ) {
+ int reorg;
+
+ reorg = 0;
+ pg = cpu_cmt_hier[level];
+
+ /*
+ * Promote PGs at an incorrect generation into place.
+ */
+ while (pg->cmt_parent &&
+ pg->cmt_parent != cpu_cmt_hier[level + 1]) {
+ cmt_hier_promote(pg);
+ reorg++;
+ }
+ if (reorg > 0)
+ level = levels - 1;
+ else
+ level--;
+ }
+
+ /*
* For each of the PGs in the CPU's lineage:
- * - Add an entry in the CPU's CMT PG group
- * which is used by the dispatcher to implement load balancing
- * policy.
+ * - Add an entry in the CPU sorted CMT PG group
+ * which is used for top down CMT load balancing
* - Tie the PG into the CMT hierarchy by connecting
* it to it's parent and siblings.
*/
- for (level = 0; level < nlevels; level++) {
+ for (level = 0; level < levels; level++) {
uint_t children;
int err;
pg = cpu_cmt_hier[level];
- err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
+ err = group_add_at(cmt_pgs, pg, levels - level - 1);
ASSERT(err == 0);
if (level == 0)
@@ -371,12 +638,13 @@ pg_cmt_cpu_init(cpu_t *cp)
continue;
}
- if ((level + 1) == nlevels) {
+ if ((level + 1) == levels) {
pg->cmt_parent = NULL;
pg->cmt_siblings = &lgrp->cl_pgs;
children = ++lgrp->cl_npgs;
- cmt_root->cl_npgs++;
+ if (cmt_root != lgrp)
+ cmt_root->cl_npgs++;
} else {
pg->cmt_parent = cpu_cmt_hier[level + 1];
@@ -436,6 +704,9 @@ pg_cmt_cpu_fini(cpu_t *cp)
lgrp_handle_t lgrp_handle;
cmt_lgrp_t *lgrp;
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
cmt_pgs = &cp->cpu_pg->cmt_pgs;
@@ -544,6 +815,9 @@ pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
/*
@@ -576,6 +850,9 @@ pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&pg_iter);
@@ -627,6 +904,9 @@ pg_cmt_cpu_active(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
@@ -648,15 +928,16 @@ pg_cmt_cpu_active(cpu_t *cp)
* for balancing with it's siblings.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
- pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+ (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
/*
* If this is a top level PG, add it as a balancing
- * candidate when balancing within the root lgroup
+ * candidate when balancing within the root lgroup.
*/
- if (pg->cmt_parent == NULL) {
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
err = group_add(&cmt_root->cl_pgs, pg,
GRP_NORESIZE);
ASSERT(err == 0);
@@ -691,6 +972,9 @@ pg_cmt_cpu_inactive(cpu_t *cp)
ASSERT(MUTEX_HELD(&cpu_lock));
+ if (cmt_sched_disabled)
+ return;
+
pgs = &cp->cpu_pg->pgs;
group_iter_init(&i);
@@ -713,11 +997,12 @@ pg_cmt_cpu_inactive(cpu_t *cp)
* load was balanced, remove it as a balancing candidate.
*/
if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
- pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+ (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
ASSERT(err == 0);
- if (pg->cmt_parent == NULL) {
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
err = group_remove(&cmt_root->cl_pgs, pg,
GRP_NORESIZE);
ASSERT(err == 0);
@@ -776,26 +1061,47 @@ pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
}
/*
- * Hierarchy packing utility routine. The hierarchy order is preserved.
+ * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
*/
static void
-pg_cmt_hier_pack(void *hier[], int sz)
+pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
- int i, j;
-
- for (i = 0; i < sz; i++) {
- if (hier[i] != NULL)
- continue;
+ int i, j, inc;
+ pg_t *tmp;
+ pg_t **h = (pg_t **)hier;
- for (j = i; j < sz; j++) {
- if (hier[j] != NULL) {
- hier[i] = hier[j];
- hier[j] = NULL;
- break;
+ /*
+ * First sort by number of CPUs
+ */
+ inc = size / 2;
+ while (inc > 0) {
+ for (i = inc; i < size; i++) {
+ j = i;
+ tmp = h[i];
+ while ((j >= inc) &&
+ (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
+ h[j] = h[j - inc];
+ j = j - inc;
}
+ h[j] = tmp;
+ }
+ if (inc == 2)
+ inc = 1;
+ else
+ inc = (inc * 5) / 11;
+ }
+
+ /*
+ * Break ties by asking the platform.
+ * Determine if h[i] outranks h[i + 1] and if so, swap them.
+ */
+ for (i = 0; i < size - 1; i++) {
+ if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
+ pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
+ tmp = h[i];
+ h[i] = h[i + 1];
+ h[i + 1] = tmp;
}
- if (j == sz)
- break;
}
}
@@ -840,134 +1146,492 @@ pg_cmt_lgrp_create(lgrp_handle_t hand)
}
/*
- * Perform multi-level CMT load balancing of running threads.
+ * Interfaces to enable and disable power aware dispatching
+ * The caller must be holding cpu_lock.
*
- * tp is the thread being enqueued.
- * cp is a hint CPU, against which CMT load balancing will be performed.
- *
- * Returns cp, or a CPU better than cp with respect to balancing
- * running thread load.
+ * Return 0 on success and -1 on failure.
*/
-cpu_t *
-cmt_balance(kthread_t *tp, cpu_t *cp)
+int
+cmt_pad_enable(pghw_type_t type)
{
- int hint, i, cpu, nsiblings;
- int self = 0;
- group_t *cmt_pgs, *siblings;
- pg_cmt_t *pg, *pg_tmp, *tpg = NULL;
- int pg_nrun, tpg_nrun;
- int level = 0;
- cpu_t *newcp;
+ group_t *hwset;
+ group_iter_t iter;
+ pg_cmt_t *pg;
- ASSERT(THREAD_LOCK_HELD(tp));
+ ASSERT(PGHW_IS_PM_DOMAIN(type));
+ ASSERT(MUTEX_HELD(&cpu_lock));
- cmt_pgs = &cp->cpu_pg->cmt_pgs;
+ if ((hwset = pghw_set_lookup(type)) == NULL ||
+ cmt_hw_blacklisted[type]) {
+ /*
+ * Unable to find any instances of the specified type
+ * of power domain, or the power domains have been blacklisted.
+ */
+ return (-1);
+ }
- if (GROUP_SIZE(cmt_pgs) == 0)
- return (cp); /* nothing to do */
+ /*
+ * Iterate over the power domains, setting the default dispatcher
+ * policy for power/performance optimization.
+ *
+ * Simply setting the policy isn't enough in the case where the power
+ * domain is an only child of another PG. Because the dispatcher walks
+ * the PG hierarchy in a top down fashion, the higher up PG's policy
+ * will dominate. So promote the power domain above its parent if both
+ * PG and its parent have the same CPUs to ensure its policy
+ * dominates.
+ */
+ group_iter_init(&iter);
+ while ((pg = group_iterate(hwset, &iter)) != NULL) {
+ /*
+ * If the power domain is an only child to a parent
+ * not implementing the same policy, promote the child
+ * above the parent to activate the policy.
+ */
+ pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
+ while ((pg->cmt_parent != NULL) &&
+ (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
+ (PG_NUM_CPUS((pg_t *)pg) ==
+ PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
+ cmt_hier_promote(pg);
+ }
+ }
+
+ return (0);
+}
+
+int
+cmt_pad_disable(pghw_type_t type)
+{
+ group_t *hwset;
+ group_iter_t iter;
+ pg_cmt_t *pg;
+ pg_cmt_t *child;
- if (tp == curthread)
- self = 1;
+ ASSERT(PGHW_IS_PM_DOMAIN(type));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ if ((hwset = pghw_set_lookup(type)) == NULL) {
+ /*
+ * Unable to find any instances of the specified type of
+ * power domain.
+ */
+ return (-1);
+ }
/*
- * Balance across siblings in the CPUs CMT lineage
- * If the thread is homed to the root lgroup, perform
- * top level balancing against other top level PGs
- * in the system. Otherwise, start with the default
- * top level siblings group, which is within the leaf lgroup
+ * Iterate over the power domains, setting the default dispatcher
+ * policy for performance optimization (load balancing).
*/
- pg = GROUP_ACCESS(cmt_pgs, level);
- if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
- siblings = &cmt_root->cl_pgs;
- else
- siblings = pg->cmt_siblings;
+ group_iter_init(&iter);
+ while ((pg = group_iterate(hwset, &iter)) != NULL) {
+
+ /*
+ * If the power domain has an only child that implements
+ * a policy other than load balancing, promote the child
+ * above the power domain to ensure its policy dominates.
+ */
+ if (GROUP_SIZE(pg->cmt_children) == 1) {
+ child = GROUP_ACCESS(pg->cmt_children, 0);
+ if ((child->cmt_policy & CMT_BALANCE) == 0) {
+ cmt_hier_promote(child);
+ }
+ }
+ pg->cmt_policy = CMT_BALANCE;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
+ kthread_t *new)
+{
+ pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;
+
+ if (old == cp->cpu_idle_thread) {
+ atomic_add_32(&cmt_pg->cmt_utilization, 1);
+ } else if (new == cp->cpu_idle_thread) {
+ atomic_add_32(&cmt_pg->cmt_utilization, -1);
+ }
+}
+
+/*
+ * Macro to test whether a thread is currently runnable on a CPU in a PG.
+ */
+#define THREAD_RUNNABLE_IN_PG(t, pg) \
+ ((t)->t_state == TS_RUN && \
+ (t)->t_disp_queue->disp_cpu && \
+ bitset_in_set(&(pg)->cmt_cpus_actv_set, \
+ (t)->t_disp_queue->disp_cpu->cpu_seqid))
+
+static void
+cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
+ kthread_t *new)
+{
+ pg_cmt_t *cmt = (pg_cmt_t *)pg;
+ cpupm_domain_t *dom;
+ uint32_t u;
+
+ if (old == cp->cpu_idle_thread) {
+ ASSERT(new != cp->cpu_idle_thread);
+ u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
+ if (u == 1) {
+ /*
+ * Notify the CPU power manager that the domain
+ * is non-idle.
+ */
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, now, dom,
+ CPUPM_DOM_BUSY_FROM_IDLE);
+ }
+ } else if (new == cp->cpu_idle_thread) {
+ ASSERT(old != cp->cpu_idle_thread);
+ u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
+ if (u == 0) {
+ /*
+ * The domain is idle, notify the CPU power
+ * manager.
+ *
+ * Avoid notifying if the thread is simply migrating
+ * between CPUs in the domain.
+ */
+ if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, now, dom,
+ CPUPM_DOM_IDLE_FROM_BUSY);
+ }
+ }
+ }
+}
+
+/* ARGSUSED */
+static void
+cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
+{
+ pg_cmt_t *cmt = (pg_cmt_t *)pg;
+ cpupm_domain_t *dom;
+
+ dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
+ cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
+}
+
+/*
+ * Return the name of the CMT scheduling policy
+ * being implemented across this PG
+ */
+static char *
+pg_cmt_policy_name(pg_t *pg)
+{
+ pg_cmt_policy_t policy;
+
+ policy = ((pg_cmt_t *)pg)->cmt_policy;
+
+ if (policy & CMT_AFFINITY) {
+ if (policy & CMT_BALANCE)
+ return ("Load Balancing & Affinity");
+ else if (policy & CMT_COALESCE)
+ return ("Load Coalescence & Affinity");
+ else
+ return ("Affinity");
+ } else {
+ if (policy & CMT_BALANCE)
+ return ("Load Balancing");
+ else if (policy & CMT_COALESCE)
+ return ("Load Coalescence");
+ else
+ return ("None");
+ }
+}
+
+/*
+ * Prune PG, and all other instances of PG's hardware sharing relationship
+ * from the PG hierarchy.
+ */
+static int
+pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
+{
+ group_t *hwset, *children;
+ int i, j, r, size = *sz;
+ group_iter_t hw_iter, child_iter;
+ pg_cpu_itr_t cpu_iter;
+ pg_cmt_t *pg, *child;
+ cpu_t *cpu;
+ int cap_needed;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ hw = ((pghw_t *)pg_bad)->pghw_hw;
+
+ if (hw == PGHW_POW_ACTIVE) {
+ cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
+ "Event Based CPUPM Unavailable");
+ } else if (hw == PGHW_POW_IDLE) {
+ cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
+ "Dispatcher assisted CPUPM disabled.");
+ }
/*
- * Traverse down the lineage until we find a level that needs
- * balancing, or we get to the end.
+ * Find and eliminate the PG from the lineage.
*/
- for (;;) {
- nsiblings = GROUP_SIZE(siblings); /* self inclusive */
- if (nsiblings == 1)
- goto next_level;
+ for (i = 0; i < size; i++) {
+ if (lineage[i] == pg_bad) {
+ for (j = i; j < size - 1; j++)
+ lineage[j] = lineage[j + 1];
+ *sz = size - 1;
+ break;
+ }
+ }
- pg_nrun = pg->cmt_nrunning;
- if (self &&
- bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
- pg_nrun--; /* Ignore curthread's effect */
+ /*
+ * We'll prune all instances of the hardware sharing relationship
+ * represented by pg. But before we do that (and pause CPUs) we need
+ * to ensure the hierarchy's groups are properly sized.
+ */
+ hwset = pghw_set_lookup(hw);
- hint = CPU_PSEUDO_RANDOM() % nsiblings;
+ /*
+ * Blacklist the hardware so that future groups won't be created.
+ */
+ cmt_hw_blacklisted[hw] = 1;
+ /*
+ * For each of the PGs being pruned, ensure sufficient capacity in
+ * the siblings set for the PG's children
+ */
+ group_iter_init(&hw_iter);
+ while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
/*
- * Find a balancing candidate from among our siblings
- * "hint" is a hint for where to start looking
+ * PG is being pruned, but if it is bringing up more than
+ * one child, ask for more capacity in the siblings group.
*/
- i = hint;
- do {
- ASSERT(i < nsiblings);
- pg_tmp = GROUP_ACCESS(siblings, i);
+ cap_needed = 0;
+ if (pg->cmt_children &&
+ GROUP_SIZE(pg->cmt_children) > 1) {
+ cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
+
+ group_expand(pg->cmt_siblings,
+ GROUP_SIZE(pg->cmt_siblings) + cap_needed);
/*
- * The candidate must not be us, and must
- * have some CPU resources in the thread's
- * partition
+ * If this is a top level group, also ensure the
+ * capacity in the root lgrp level CMT grouping.
*/
- if (pg_tmp != pg &&
- bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
- ((pg_t *)pg_tmp)->pg_id)) {
- tpg = pg_tmp;
- break;
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
+ group_expand(&cmt_root->cl_pgs,
+ GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
}
+ }
+ }
- if (++i >= nsiblings)
- i = 0;
- } while (i != hint);
+ /*
+ * We're operating on the PG hierarchy. Pause CPUs to ensure
+ * exclusivity with respect to the dispatcher.
+ */
+ pause_cpus(NULL);
- if (!tpg)
- goto next_level; /* no candidates at this level */
+ /*
+ * Prune all PG instances of the hardware sharing relationship
+ * represented by pg.
+ */
+ group_iter_init(&hw_iter);
+ while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
/*
- * Check if the balancing target is underloaded
- * Decide to balance if the target is running fewer
- * threads, or if it's running the same number of threads
- * with more online CPUs
+ * Remove PG from its group of siblings, if it's there.
*/
- tpg_nrun = tpg->cmt_nrunning;
- if (pg_nrun > tpg_nrun ||
- (pg_nrun == tpg_nrun &&
- (GROUP_SIZE(&tpg->cmt_cpus_actv) >
- GROUP_SIZE(&pg->cmt_cpus_actv)))) {
- break;
+ if (pg->cmt_siblings) {
+ (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
+ }
+ if (pg->cmt_parent == NULL &&
+ pg->cmt_siblings != &cmt_root->cl_pgs) {
+ (void) group_remove(&cmt_root->cl_pgs, pg,
+ GRP_NORESIZE);
+ }
+ /*
+ * Add PG's children to its group of siblings.
+ */
+ if (pg->cmt_children != NULL) {
+ children = pg->cmt_children;
+
+ group_iter_init(&child_iter);
+ while ((child = group_iterate(children, &child_iter))
+ != NULL) {
+ /*
+ * Transplant child from its siblings set to
+ * PG's.
+ */
+ if (pg->cmt_siblings != NULL &&
+ child->cmt_siblings != NULL &&
+ group_remove(child->cmt_siblings, child,
+ GRP_NORESIZE) != -1) {
+ r = group_add(pg->cmt_siblings, child,
+ GRP_NORESIZE);
+ ASSERT(r == 0);
+ }
+ }
}
- tpg = NULL;
-next_level:
- if (++level == GROUP_SIZE(cmt_pgs))
- break;
+ /*
+ * Reset the callbacks to the defaults
+ */
+ pg_callback_set_defaults((pg_t *)pg);
+
+ /*
+ * Update all the CPU lineages in each of PG's CPUs
+ */
+ PG_CPU_ITR_INIT(pg, cpu_iter);
+ while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
+ group_t *pgs;
+ pg_cmt_t *cpu_pg;
+ group_iter_t liter; /* Iterator for the lineage */
- pg = GROUP_ACCESS(cmt_pgs, level);
- siblings = pg->cmt_siblings;
+ /*
+ * Iterate over the CPU's PGs updating the children
+ * of the PG being promoted, since they have a new
+ * parent and siblings set.
+ */
+ pgs = &cpu->cpu_pg->pgs;
+ group_iter_init(&liter);
+ while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
+ if (cpu_pg->cmt_parent == pg) {
+ cpu_pg->cmt_parent = pg->cmt_parent;
+ cpu_pg->cmt_siblings = pg->cmt_siblings;
+ }
+ }
+
+ /*
+ * Update the CPU's lineages
+ */
+ pgs = &cpu->cpu_pg->cmt_pgs;
+ (void) group_remove(pgs, pg, GRP_NORESIZE);
+ pgs = &cpu->cpu_pg->pgs;
+ (void) group_remove(pgs, pg, GRP_NORESIZE);
+ }
}
+ start_cpus();
+ return (0);
+}
+
+/*
+ * Disable CMT scheduling
+ */
+static void
+pg_cmt_disable(void)
+{
+ cpu_t *cpu;
+
+ pause_cpus(NULL);
+ cpu = cpu_list;
+
+ do {
+ if (cpu->cpu_pg)
+ group_empty(&cpu->cpu_pg->cmt_pgs);
+ } while ((cpu = cpu->cpu_next) != cpu_list);
+
+ cmt_sched_disabled = 1;
+ start_cpus();
+ cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
+}
+
+static int
+pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
+{
+ int i, size;
+ pg_cmt_t *pg, *parent, *pg_bad;
+ cpu_t *cp;
+ pg_cpu_itr_t cpu_iter;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+revalidate:
+ size = *sz;
+ pg_bad = NULL;
+ for (i = 0; i < size - 1; i++) {
- if (tpg) {
- uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
+ pg = lineage[i];
+ parent = lineage[i + 1];
/*
- * Select an idle CPU from the target
+ * We assume that the lineage has already been sorted
+ * by the number of CPUs. In fact, we depend on it.
*/
- hint = CPU_PSEUDO_RANDOM() % tgt_size;
- cpu = hint;
- do {
- newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
- if (newcp->cpu_part == tp->t_cpupart &&
- newcp->cpu_dispatch_pri == -1) {
- cp = newcp;
- break;
+ ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));
+
+ /*
+ * Walk each of the CPUs in the PGs group, and verify that
+ * the next larger PG contains at least the CPUs in this one.
+ */
+ PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
+ cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
+ goto handle_error;
}
- if (++cpu == tgt_size)
- cpu = 0;
- } while (cpu != hint);
+ }
}
- return (cp);
+handle_error:
+ switch (cmt_lineage_status) {
+ case CMT_LINEAGE_VALID:
+ case CMT_LINEAGE_REPAIRED:
+ break;
+ case CMT_LINEAGE_NON_CONCENTRIC:
+ /*
+ * We've detected a non-concentric PG lineage.
+ *
+ * This can happen when some of the CPU grouping information
+ * is derived from buggy sources (for example, incorrect ACPI
+ * tables on x86 systems).
+ *
+ * We attempt to recover from this by pruning out the
+ * illegal groupings from the PG hierarchy, which means that
+ * we won't optimize for those levels, but we will for the
+ * remaining ones.
+ *
+ * If a given level has CPUs not found in its parent, then
+ * we examine the PG and its parent to see if either grouping
+ * is enumerated from potentially buggy sources.
+ *
+ * If one has fewer CPUs than the other, and contains CPUs
+ * not found in the parent, and it is an untrusted enumeration,
+ * then prune it. If both have the same number of CPUs, then
+ * prune the one that is untrusted.
+ *
+ * This process repeats until we have a concentric lineage,
+ * or we would have to prune out a level derived from what we
+ * thought was a reliable source, in which case CMT scheduling
+ * is disabled altogether.
+ */
+ if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
+ (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
+ pg_bad = pg;
+ } else if (PG_NUM_CPUS((pg_t *)pg) ==
+ PG_NUM_CPUS((pg_t *)parent)) {
+ if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
+ pg_bad = parent;
+ } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
+ pg_bad = pg;
+ }
+ }
+ if (pg_bad) {
+ if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
+ cmt_lineage_status = CMT_LINEAGE_REPAIRED;
+ goto revalidate;
+ }
+ }
+ /*FALLTHROUGH*/
+ default:
+ /*
+ * If we're here, something has gone wrong in trying to
+ * recover from an illegal PG hierarchy, or we've encountered
+ * a validation error for which we don't know how to recover.
+ * In this case, disable CMT scheduling altogether.
+ */
+ pg_cmt_disable();
+ cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
+ return (-1);
+ }
+ return (0);
}
diff --git a/usr/src/uts/common/disp/cmt_policy.c b/usr/src/uts/common/disp/cmt_policy.c
new file mode 100644
index 0000000000..e3c00d2bc5
--- /dev/null
+++ b/usr/src/uts/common/disp/cmt_policy.c
@@ -0,0 +1,229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupart.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/group.h>
+#include <sys/bitset.h>
+#include <sys/lgrp.h>
+#include <sys/cmt.h>
+
+/*
+ * CMT dispatcher policies
+ *
+ * This file implements CMT dispatching policies using Processor Groups.
+ *
+ * The scheduler/dispatcher leverages knowledge of the performance
+ * relevant CMT sharing relationships existing between CPUs to implement
+ * load balancing and coalescence thread placement policies.
+ *
+ * Load balancing policy seeks to improve performance by minimizing
+ * contention over shared processor resources / facilities. Coalescence
+ * policies improve resource utilization and ultimately power efficiency.
+ *
+ * On NUMA systems, the dispatcher will generally perform load balancing and
+ * coalescence within (and not across) lgroups. This is because there isn't
+ * much sense in trying to correct an imbalance by sending a thread outside
+ * of its home, if it would attempt to return home a short while later.
+ * The dispatcher will implement CMT policy across lgroups however, if
+ * it can do so with a thread homed to the root lgroup, since root homed
+ * threads have no lgroup affinity.
+ */
+
+/*
+ * Return non-zero if, given the policy, we should migrate from running
+ * somewhere "here" to somewhere "there".
+ */
+static int
+cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
+ int self)
+{
+ uint32_t here_util, there_util;
+
+ here_util = here->cmt_utilization;
+ there_util = there->cmt_utilization;
+
+ /*
+ * This assumes that curthread's utilization is "1"
+ */
+ if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
+ here_util--; /* Ignore curthread's effect */
+
+ /*
+ * Load balancing and coalescence are conflicting policies
+ */
+ ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
+ (CMT_BALANCE|CMT_COALESCE));
+
+ if (policy & CMT_BALANCE) {
+ /*
+ * Balance utilization
+ *
+ * If the target is comparatively underutilized
+ * (either in an absolute sense, or scaled by capacity),
+ * then choose to balance.
+ */
+ if ((here_util > there_util) ||
+ (here_util == there_util &&
+ (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
+ return (1);
+ }
+ } else if (policy & CMT_COALESCE) {
+ /*
+ * Attempt to drive group utilization up to capacity
+ */
+ if (there_util > here_util &&
+ there_util < CMT_CAPACITY(there))
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Perform multi-level CMT load balancing of running threads.
+ *
+ * tp is the thread being enqueued.
+ * cp is a hint CPU, against which CMT load balancing will be performed.
+ *
+ * Returns cp, or a CPU better than cp with respect to balancing
+ * running thread load.
+ */
+cpu_t *
+cmt_balance(kthread_t *tp, cpu_t *cp)
+{
+ int hint, i, cpu, nsiblings;
+ int self = 0;
+ group_t *cmt_pgs, *siblings;
+ pg_cmt_t *pg, *pg_tmp, *tpg = NULL;
+ int level = 0;
+ cpu_t *newcp;
+ extern cmt_lgrp_t *cmt_root;
+
+ ASSERT(THREAD_LOCK_HELD(tp));
+
+ cmt_pgs = &cp->cpu_pg->cmt_pgs;
+
+ if (GROUP_SIZE(cmt_pgs) == 0)
+ return (cp); /* nothing to do */
+
+ if (tp == curthread)
+ self = 1;
+
+ /*
+ * Balance across siblings in the CPUs CMT lineage
+ * If the thread is homed to the root lgroup, perform
+ * top level balancing against other top level PGs
+ * in the system. Otherwise, start with the default
+ * top level siblings group, which is within the leaf lgroup
+ */
+ pg = GROUP_ACCESS(cmt_pgs, level);
+ if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
+ siblings = &cmt_root->cl_pgs;
+ else
+ siblings = pg->cmt_siblings;
+
+ /*
+ * Traverse down the lineage until we find a level that needs
+ * balancing, or we get to the end.
+ */
+ for (;;) {
+ nsiblings = GROUP_SIZE(siblings); /* self inclusive */
+ if (nsiblings == 1)
+ goto next_level;
+
+ hint = CPU_PSEUDO_RANDOM() % nsiblings;
+
+ /*
+ * Find a balancing candidate from among our siblings
+ * "hint" is a hint for where to start looking
+ */
+ i = hint;
+ do {
+ ASSERT(i < nsiblings);
+ pg_tmp = GROUP_ACCESS(siblings, i);
+
+ /*
+ * The candidate must not be us, and must
+ * have some CPU resources in the thread's
+ * partition
+ */
+ if (pg_tmp != pg &&
+ bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
+ ((pg_t *)pg_tmp)->pg_id)) {
+ tpg = pg_tmp;
+ break;
+ }
+
+ if (++i >= nsiblings)
+ i = 0;
+ } while (i != hint);
+
+ if (!tpg)
+ goto next_level; /* no candidates at this level */
+
+ /*
+ * Decide if we should migrate from the current PG to a
+ * target PG given a policy
+ */
+ if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
+ break;
+ tpg = NULL;
+
+next_level:
+ if (++level == GROUP_SIZE(cmt_pgs))
+ break;
+
+ pg = GROUP_ACCESS(cmt_pgs, level);
+ siblings = pg->cmt_siblings;
+ }
+
+ if (tpg) {
+ uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
+
+ /*
+ * Select an idle CPU from the target
+ */
+ hint = CPU_PSEUDO_RANDOM() % tgt_size;
+ cpu = hint;
+ do {
+ newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
+ if (newcp->cpu_part == tp->t_cpupart &&
+ newcp->cpu_dispatch_pri == -1) {
+ cp = newcp;
+ break;
+ }
+ if (++cpu == tgt_size)
+ cpu = 0;
+ } while (cpu != hint);
+ }
+
+ return (cp);
+}
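As a quick illustration of the policy encoded in cmt_should_migrate() above, here is a minimal user-level model. It is a sketch, not the kernel code: the utilization and capacity values are invented, and the capacity parameters stand in for CMT_CAPACITY(), assumed here to be the group's active-CPU count.

/*
 * Minimal user-level model of the migration decision made by
 * cmt_should_migrate() above (illustrative values only).
 */
#include <stdio.h>

#define	CMT_BALANCE	0x1
#define	CMT_COALESCE	0x2

static int
should_migrate(unsigned int here_util, unsigned int here_cap,
    unsigned int there_util, unsigned int there_cap, int policy)
{
	if (policy & CMT_BALANCE) {
		/*
		 * Balance: move if the target is less utilized, or equally
		 * utilized but has more capacity.
		 */
		return (here_util > there_util ||
		    (here_util == there_util && there_cap > here_cap));
	}
	if (policy & CMT_COALESCE) {
		/* Coalesce: pack onto the busier group while it has headroom. */
		return (there_util > here_util && there_util < there_cap);
	}
	return (0);
}

int
main(void)
{
	/* Three runnable threads here vs. one there: balancing migrates. */
	(void) printf("balance:  %d\n", should_migrate(3, 4, 1, 4, CMT_BALANCE));
	/* Coalescence moves toward the busier, not-yet-full group. */
	(void) printf("coalesce: %d\n", should_migrate(1, 4, 2, 4, CMT_COALESCE));
	return (0);
}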
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 458792c7f8..b3f6efeb2e 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -890,11 +890,10 @@ swtch()
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
if (next != t) {
- if (t == cp->cpu_idle_thread) {
- PG_NRUN_UPDATE(cp, 1);
- } else if (next == cp->cpu_idle_thread) {
- PG_NRUN_UPDATE(cp, -1);
- }
+ hrtime_t now;
+
+ now = gethrtime_unscaled();
+ pg_ev_thread_swtch(cp, now, t, next);
/*
* If t was previously in the TS_ONPROC state,
@@ -904,7 +903,7 @@ swtch()
* queue.
*/
if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
- t->t_waitrq = gethrtime_unscaled();
+ t->t_waitrq = now;
}
/*
@@ -929,6 +928,8 @@ swtch()
if (t->t_flag & T_INTR_THREAD)
cpu_intr_swtch_exit(t);
+ pg_ev_thread_remain(cp, t);
+
DTRACE_SCHED(remain__cpu);
TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
(void) spl0();
@@ -960,8 +961,7 @@ swtch_from_zombie()
ASSERT(next != curthread);
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
- if (next == cpu->cpu_idle_thread)
- PG_NRUN_UPDATE(cpu, -1);
+ pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
restore_mstate(next);
@@ -1055,6 +1055,7 @@ void
swtch_to(kthread_t *next)
{
cpu_t *cp = CPU;
+ hrtime_t now;
TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
@@ -1065,8 +1066,8 @@ swtch_to(kthread_t *next)
TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
- if (curthread == cp->cpu_idle_thread)
- PG_NRUN_UPDATE(cp, 1);
+ now = gethrtime_unscaled();
+ pg_ev_thread_swtch(cp, now, curthread, next);
/* OK to steal anything left on run queue */
cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
@@ -1081,7 +1082,7 @@ swtch_to(kthread_t *next)
* queue.
*/
if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
- curthread->t_waitrq = gethrtime_unscaled();
+ curthread->t_waitrq = now;
}
/* restore next thread to previously running microstate */
@@ -1098,8 +1099,6 @@ swtch_to(kthread_t *next)
*/
}
-
-
#define CPU_IDLING(pri) ((pri) == -1)
static void
diff --git a/usr/src/uts/common/io/cpudrv.c b/usr/src/uts/common/io/cpudrv.c
index 6f329fad4c..8314c5df43 100644
--- a/usr/src/uts/common/io/cpudrv.c
+++ b/usr/src/uts/common/io/cpudrv.c
@@ -43,7 +43,7 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
-
+#include <sys/epm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/cpudrv_mach.h>
@@ -110,23 +110,25 @@ static struct modlinkage modlinkage = {
/*
* Function prototypes
*/
-static int cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp);
-static void cpudrv_pm_free(cpudrv_devstate_t *cpudsp);
-static int cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp);
-static void cpudrv_pm_monitor_disp(void *arg);
-static void cpudrv_pm_monitor(void *arg);
+static int cpudrv_init(cpudrv_devstate_t *cpudsp);
+static void cpudrv_free(cpudrv_devstate_t *cpudsp);
+static int cpudrv_comp_create(cpudrv_devstate_t *cpudsp);
+static void cpudrv_monitor_disp(void *arg);
+static void cpudrv_monitor(void *arg);
/*
* Driver global variables
*/
uint_t cpudrv_debug = 0;
void *cpudrv_state;
-static uint_t cpudrv_pm_idle_hwm = CPUDRV_PM_IDLE_HWM;
-static uint_t cpudrv_pm_idle_lwm = CPUDRV_PM_IDLE_LWM;
-static uint_t cpudrv_pm_idle_buf_zone = CPUDRV_PM_IDLE_BUF_ZONE;
-static uint_t cpudrv_pm_idle_bhwm_cnt_max = CPUDRV_PM_IDLE_BHWM_CNT_MAX;
-static uint_t cpudrv_pm_idle_blwm_cnt_max = CPUDRV_PM_IDLE_BLWM_CNT_MAX;
-static uint_t cpudrv_pm_user_hwm = CPUDRV_PM_USER_HWM;
+static uint_t cpudrv_idle_hwm = CPUDRV_IDLE_HWM;
+static uint_t cpudrv_idle_lwm = CPUDRV_IDLE_LWM;
+static uint_t cpudrv_idle_buf_zone = CPUDRV_IDLE_BUF_ZONE;
+static uint_t cpudrv_idle_bhwm_cnt_max = CPUDRV_IDLE_BHWM_CNT_MAX;
+static uint_t cpudrv_idle_blwm_cnt_max = CPUDRV_IDLE_BLWM_CNT_MAX;
+static uint_t cpudrv_user_hwm = CPUDRV_USER_HWM;
+
+boolean_t cpudrv_enabled = B_TRUE;
/*
* cpudrv_direct_pm allows user applications to directly control the
@@ -154,13 +156,13 @@ int cpudrv_direct_pm = 0;
* Arranges for the handler function to be called at the interval suitable
* for current speed.
*/
-#define CPUDRV_PM_MONITOR_INIT(cpudsp) { \
- if (CPUDRV_PM_POWER_ENABLED(cpudsp)) { \
+#define CPUDRV_MONITOR_INIT(cpudsp) { \
+ if (cpudrv_is_enabled(cpudsp)) { \
ASSERT(mutex_owned(&(cpudsp)->lock)); \
(cpudsp)->cpudrv_pm.timeout_id = \
- timeout(cpudrv_pm_monitor_disp, \
+ timeout(cpudrv_monitor_disp, \
(cpudsp), (((cpudsp)->cpudrv_pm.cur_spd == NULL) ? \
- CPUDRV_PM_QUANT_CNT_OTHR : \
+ CPUDRV_QUANT_CNT_OTHR : \
(cpudsp)->cpudrv_pm.cur_spd->quant_cnt)); \
} \
}
@@ -168,7 +170,7 @@ int cpudrv_direct_pm = 0;
/*
* Arranges for the handler function not to be called back.
*/
-#define CPUDRV_PM_MONITOR_FINI(cpudsp) { \
+#define CPUDRV_MONITOR_FINI(cpudsp) { \
timeout_id_t tmp_tid; \
ASSERT(mutex_owned(&(cpudsp)->lock)); \
tmp_tid = (cpudsp)->cpudrv_pm.timeout_id; \
@@ -203,7 +205,7 @@ _init(void)
/*
* Callbacks used by the PPM driver.
*/
- CPUDRV_PM_SET_PPM_CALLBACKS();
+ CPUDRV_SET_PPM_CALLBACKS();
return (error);
}
@@ -242,13 +244,13 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
case DDI_ATTACH:
DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
"DDI_ATTACH called\n", instance));
- if (CPUDRV_PM_DISABLED())
+ if (!cpudrv_is_enabled(NULL))
return (DDI_FAILURE);
if (ddi_soft_state_zalloc(cpudrv_state, instance) !=
DDI_SUCCESS) {
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't allocate state", instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) ==
@@ -256,7 +258,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't get state", instance);
ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
cpudsp->dip = dip;
@@ -264,36 +266,36 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/*
* Find CPU number for this dev_info node.
*/
- if (!cpudrv_pm_get_cpu_id(dip, &(cpudsp->cpu_id))) {
+ if (!cpudrv_get_cpu_id(dip, &(cpudsp->cpu_id))) {
cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
"can't convert dip to cpu_id", instance);
ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
- if (!cpudrv_mach_pm_init(cpudsp)) {
- ddi_soft_state_free(cpudrv_state, instance);
- CPUDRV_PM_DISABLE();
+ if (!cpudrv_mach_init(cpudsp)) {
+ cpudrv_enabled = B_FALSE;
return (DDI_FAILURE);
}
+
mutex_init(&cpudsp->lock, NULL, MUTEX_DRIVER, NULL);
- if (CPUDRV_PM_POWER_ENABLED(cpudsp)) {
- if (cpudrv_pm_init_power(cpudsp) != DDI_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ if (cpudrv_is_enabled(cpudsp)) {
+ if (cpudrv_init(cpudsp) != DDI_SUCCESS) {
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
- if (cpudrv_pm_comp_create(cpudsp) != DDI_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ if (cpudrv_comp_create(cpudsp) != DDI_SUCCESS) {
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
if (ddi_prop_update_string(DDI_DEV_T_NONE,
dip, "pm-class", "CPU") != DDI_PROP_SUCCESS) {
- CPUDRV_PM_DISABLE();
- cpudrv_pm_free(cpudsp);
+ cpudrv_enabled = B_FALSE;
+ cpudrv_free(cpudsp);
ddi_soft_state_free(cpudrv_state, instance);
return (DDI_FAILURE);
}
@@ -303,10 +305,10 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* activities.
*/
cpudsp->cpudrv_pm.tq = taskq_create_instance(
- "cpudrv_pm_monitor",
- ddi_get_instance(dip), CPUDRV_PM_TASKQ_THREADS,
- (maxclsyspri - 1), CPUDRV_PM_TASKQ_MIN,
- CPUDRV_PM_TASKQ_MAX,
+ "cpudrv_monitor",
+ ddi_get_instance(dip), CPUDRV_TASKQ_THREADS,
+ (maxclsyspri - 1), CPUDRV_TASKQ_MIN,
+ CPUDRV_TASKQ_MAX,
TASKQ_PREPOPULATE|TASKQ_CPR_SAFE);
mutex_init(&cpudsp->cpudrv_pm.timeout_lock, NULL,
@@ -321,7 +323,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* is full speed for us.
*/
/*
- * We need to take the lock because cpudrv_pm_monitor()
+ * We need to take the lock because cpudrv_monitor()
* will start running in parallel with attach().
*/
mutex_enter(&cpudsp->lock);
@@ -335,12 +337,12 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* unknown speed and moves CPU to top speed when it
* has been initialized.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
}
- CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip);
+ CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp);
ddi_report_dev(dip);
return (DDI_SUCCESS);
@@ -355,7 +357,7 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/*
* Nothing to do for resume, if not doing active PM.
*/
- if (!CPUDRV_PM_POWER_ENABLED(cpudsp))
+ if (!cpudrv_is_enabled(cpudsp))
return (DDI_SUCCESS);
mutex_enter(&cpudsp->lock);
@@ -365,9 +367,9 @@ cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* that the needed speed is full speed for us.
*/
cpudsp->cpudrv_pm.cur_spd = NULL;
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
- CPUDRV_PM_REDEFINE_TOPSPEED(dip);
+ CPUDRV_REDEFINE_TOPSPEED(dip);
return (DDI_SUCCESS);
default:
@@ -409,7 +411,7 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
/*
* Nothing to do for suspend, if not doing active PM.
*/
- if (!CPUDRV_PM_POWER_ENABLED(cpudsp))
+ if (!cpudrv_is_enabled(cpudsp))
return (DDI_SUCCESS);
/*
@@ -427,18 +429,18 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: DDI_SUSPEND - "
"cur_spd %d, topspeed %d\n", instance,
cpupm->cur_spd->pm_level,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level));
+ CPUDRV_TOPSPEED(cpupm)->pm_level));
- CPUDRV_PM_MONITOR_FINI(cpudsp);
+ CPUDRV_MONITOR_FINI(cpudsp);
if (!cpudrv_direct_pm && (cpupm->cur_spd !=
- CPUDRV_PM_TOPSPEED(cpupm))) {
+ CPUDRV_TOPSPEED(cpupm))) {
if (cpupm->pm_busycnt < 1) {
- if ((pm_busy_component(dip, CPUDRV_PM_COMP_NUM)
+ if ((pm_busy_component(dip, CPUDRV_COMP_NUM)
== DDI_SUCCESS)) {
cpupm->pm_busycnt++;
} else {
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_detach: "
"instance %d: can't busy CPU "
@@ -447,16 +449,16 @@ cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
}
mutex_exit(&cpudsp->lock);
- if (pm_raise_power(dip, CPUDRV_PM_COMP_NUM,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level) !=
+ if (pm_raise_power(dip, CPUDRV_COMP_NUM,
+ CPUDRV_TOPSPEED(cpupm)->pm_level) !=
DDI_SUCCESS) {
mutex_enter(&cpudsp->lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_detach: instance %d: "
"can't raise CPU power level to %d",
instance,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level);
+ CPUDRV_TOPSPEED(cpupm)->pm_level);
return (DDI_FAILURE);
} else {
return (DDI_SUCCESS);
@@ -483,7 +485,7 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
{
int instance;
cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
+ cpudrv_pm_t *cpudrvpm;
cpudrv_pm_spd_t *new_spd;
boolean_t is_ready;
int ret;
@@ -492,14 +494,15 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
DPRINTF(D_POWER, ("cpudrv_power: instance %d: level %d\n",
instance, level));
+
if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == NULL) {
- cmn_err(CE_WARN, "cpudrv_power: instance %d: can't get state",
- instance);
+ cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
+ "get state", instance);
return (DDI_FAILURE);
}
mutex_enter(&cpudsp->lock);
- cpupm = &(cpudsp->cpudrv_pm);
+ cpudrvpm = &(cpudsp->cpudrv_pm);
/*
* In normal operation, we fail if we are busy and request is
@@ -507,21 +510,22 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
* is in special direct pm mode. On x86, we also let this through
* if the change is due to a request to govern the max speed.
*/
- if (!cpudrv_direct_pm && (cpupm->pm_busycnt >= 1) &&
- !cpudrv_pm_is_governor_thread(cpupm)) {
- if ((cpupm->cur_spd != NULL) &&
- (level < cpupm->cur_spd->pm_level)) {
+ if (!cpudrv_direct_pm && (cpudrvpm->pm_busycnt >= 1) &&
+ !cpudrv_is_governor_thread(cpudrvpm)) {
+ if ((cpudrvpm->cur_spd != NULL) &&
+ (level < cpudrvpm->cur_spd->pm_level)) {
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
}
- for (new_spd = cpupm->head_spd; new_spd; new_spd = new_spd->down_spd) {
+ for (new_spd = cpudrvpm->head_spd; new_spd; new_spd =
+ new_spd->down_spd) {
if (new_spd->pm_level == level)
break;
}
if (!new_spd) {
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
cmn_err(CE_WARN, "cpudrv_power: instance %d: "
"can't locate new CPU speed", instance);
@@ -538,105 +542,66 @@ cpudrv_power(dev_info_t *dip, int comp, int level)
* That's because we don't know what the CPU domains look like
* until all instances have been initialized.
*/
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
+ is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
if (!is_ready) {
DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
"CPU not ready for x-calls\n", instance));
- } else if (!(is_ready = cpudrv_pm_power_ready())) {
+ } else if (!(is_ready = cpudrv_power_ready())) {
DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "waiting for all CPUs to be power manageable\n", instance));
+ "waiting for all CPUs to be power manageable\n",
+ instance));
}
if (!is_ready) {
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
/*
- * Execute CPU specific routine on the requested CPU to change its
- * speed to normal-speed/divisor.
+ * Execute CPU specific routine on the requested CPU to
+ * change its speed to normal-speed/divisor.
*/
- if ((ret = cpudrv_pm_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
- cmn_err(CE_WARN, "cpudrv_power: cpudrv_pm_change_speed() "
- "return = %d", ret);
+ if ((ret = cpudrv_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
+ cmn_err(CE_WARN, "cpudrv_power: "
+ "cpudrv_change_speed() return = %d", ret);
mutex_exit(&cpudsp->lock);
return (DDI_FAILURE);
}
/*
- * DTrace probe point for CPU speed change transition
- */
- DTRACE_PROBE3(cpu__change__speed, cpudrv_devstate_t *, cpudsp,
- cpudrv_pm_t *, cpupm, cpudrv_pm_spd_t *, new_spd);
-
- /*
* Reset idle threshold time for the new power level.
*/
- if ((cpupm->cur_spd != NULL) && (level < cpupm->cur_spd->pm_level)) {
- if (pm_idle_component(dip, CPUDRV_PM_COMP_NUM) ==
+ if ((cpudrvpm->cur_spd != NULL) && (level <
+ cpudrvpm->cur_spd->pm_level)) {
+ if (pm_idle_component(dip, CPUDRV_COMP_NUM) ==
DDI_SUCCESS) {
- if (cpupm->pm_busycnt >= 1)
- cpupm->pm_busycnt--;
- } else
- cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
- "idle CPU component", ddi_get_instance(dip));
+ if (cpudrvpm->pm_busycnt >= 1)
+ cpudrvpm->pm_busycnt--;
+ } else {
+ cmn_err(CE_WARN, "cpudrv_power: instance %d: "
+ "can't idle CPU component",
+ ddi_get_instance(dip));
+ }
}
/*
* Reset various parameters because we are now running at new speed.
*/
- cpupm->lastquan_mstate[CMS_IDLE] = 0;
- cpupm->lastquan_mstate[CMS_SYSTEM] = 0;
- cpupm->lastquan_mstate[CMS_USER] = 0;
- cpupm->lastquan_ticks = 0;
- cpupm->cur_spd = new_spd;
- CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm);
+ cpudrvpm->lastquan_mstate[CMS_IDLE] = 0;
+ cpudrvpm->lastquan_mstate[CMS_SYSTEM] = 0;
+ cpudrvpm->lastquan_mstate[CMS_USER] = 0;
+ cpudrvpm->lastquan_ticks = 0;
+ cpudrvpm->cur_spd = new_spd;
+ CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
mutex_exit(&cpudsp->lock);
return (DDI_SUCCESS);
}
/*
- * Initialize the field that will be used for reporting
- * the supported_frequencies_Hz cpu_info kstat.
- */
-static void
-set_supp_freqs(cpu_t *cp, cpudrv_pm_t *cpupm)
-{
- char *supp_freqs;
- char *sfptr;
- uint64_t *speeds;
- cpudrv_pm_spd_t *spd;
- int i;
-#define UINT64_MAX_STRING (sizeof ("18446744073709551615"))
-
- speeds = kmem_zalloc(cpupm->num_spd * sizeof (uint64_t), KM_SLEEP);
- for (i = cpupm->num_spd - 1, spd = cpupm->head_spd; spd;
- i--, spd = spd->down_spd) {
- speeds[i] =
- CPUDRV_PM_SPEED_HZ(cp->cpu_type_info.pi_clock, spd->speed);
- }
-
- supp_freqs = kmem_zalloc((UINT64_MAX_STRING * cpupm->num_spd),
- KM_SLEEP);
- sfptr = supp_freqs;
- for (i = 0; i < cpupm->num_spd; i++) {
- if (i == cpupm->num_spd - 1) {
- (void) sprintf(sfptr, "%"PRIu64, speeds[i]);
- } else {
- (void) sprintf(sfptr, "%"PRIu64":", speeds[i]);
- sfptr = supp_freqs + strlen(supp_freqs);
- }
- }
- cpu_set_supp_freqs(cp, supp_freqs);
- kmem_free(supp_freqs, (UINT64_MAX_STRING * cpupm->num_spd));
- kmem_free(speeds, cpupm->num_spd * sizeof (uint64_t));
-}
-
-/*
* Initialize power management data.
*/
static int
-cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
+cpudrv_init(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd;
@@ -647,10 +612,10 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
int user_cnt_percent;
int i;
- CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds);
+ CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
if (nspeeds < 2) {
/* Need at least two speeds to power manage */
- CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
return (DDI_FAILURE);
}
cpupm->num_spd = nspeeds;
@@ -685,15 +650,15 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->speed = speeds[i];
if (i == 0) { /* normal speed */
cpupm->head_spd = cur_spd;
- CPUDRV_PM_TOPSPEED(cpupm) = cur_spd;
- cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_NORMAL;
+ CPUDRV_TOPSPEED(cpupm) = cur_spd;
+ cur_spd->quant_cnt = CPUDRV_QUANT_CNT_NORMAL;
cur_spd->idle_hwm =
- (cpudrv_pm_idle_hwm * cur_spd->quant_cnt) / 100;
+ (cpudrv_idle_hwm * cur_spd->quant_cnt) / 100;
/* can't speed anymore */
cur_spd->idle_lwm = 0;
cur_spd->user_hwm = UINT_MAX;
} else {
- cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_OTHR;
+ cur_spd->quant_cnt = CPUDRV_QUANT_CNT_OTHR;
ASSERT(prev_spd != NULL);
prev_spd->down_spd = cur_spd;
cur_spd->up_spd = cpupm->head_spd;
@@ -711,14 +676,14 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
	 * that there is at least a buffer zone separation
* between the idle_lwm and idle_hwm values.
*/
- idle_cnt_percent = CPUDRV_PM_IDLE_CNT_PERCENT(
- cpudrv_pm_idle_hwm, speeds, i);
+ idle_cnt_percent = CPUDRV_IDLE_CNT_PERCENT(
+ cpudrv_idle_hwm, speeds, i);
idle_cnt_percent = max(idle_cnt_percent,
- (cpudrv_pm_idle_lwm + cpudrv_pm_idle_buf_zone));
+ (cpudrv_idle_lwm + cpudrv_idle_buf_zone));
cur_spd->idle_hwm =
(idle_cnt_percent * cur_spd->quant_cnt) / 100;
cur_spd->idle_lwm =
- (cpudrv_pm_idle_lwm * cur_spd->quant_cnt) / 100;
+ (cpudrv_idle_lwm * cur_spd->quant_cnt) / 100;
/*
* The lwm for user threads are determined such that
@@ -727,10 +692,10 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
* user_hwm in the new speed. This is to prevent
* the quick jump back up to higher speed.
*/
- cur_spd->user_hwm = (cpudrv_pm_user_hwm *
+ cur_spd->user_hwm = (cpudrv_user_hwm *
cur_spd->quant_cnt) / 100;
- user_cnt_percent = CPUDRV_PM_USER_CNT_PERCENT(
- cpudrv_pm_user_hwm, speeds, i);
+ user_cnt_percent = CPUDRV_USER_CNT_PERCENT(
+ cpudrv_user_hwm, speeds, i);
prev_spd->user_lwm =
(user_cnt_percent * prev_spd->quant_cnt) / 100;
}
@@ -740,11 +705,11 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->idle_hwm = UINT_MAX;
cur_spd->user_lwm = -1;
#ifdef DEBUG
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: head_spd spd %d, "
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: head_spd spd %d, "
"num_spd %d\n", ddi_get_instance(cpudsp->dip),
cpupm->head_spd->speed, cpupm->num_spd));
for (cur_spd = cpupm->head_spd; cur_spd; cur_spd = cur_spd->down_spd) {
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: speed %d, "
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: speed %d, "
"down_spd spd %d, idle_hwm %d, user_lwm %d, "
"up_spd spd %d, idle_lwm %d, user_hwm %d, "
"quant_cnt %d\n", ddi_get_instance(cpudsp->dip),
@@ -756,7 +721,7 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
cur_spd->quant_cnt));
}
#endif /* DEBUG */
- CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
return (DDI_SUCCESS);
}
@@ -764,7 +729,7 @@ cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp)
* Free CPU power management data.
*/
static void
-cpudrv_pm_free(cpudrv_devstate_t *cpudsp)
+cpudrv_free(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd, *next_spd;
@@ -776,14 +741,13 @@ cpudrv_pm_free(cpudrv_devstate_t *cpudsp)
cur_spd = next_spd;
}
bzero(cpupm, sizeof (cpudrv_pm_t));
- cpudrv_mach_pm_free(cpudsp);
}
/*
* Create pm-components property.
*/
static int
-cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
+cpudrv_comp_create(cpudrv_devstate_t *cpudsp)
{
cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm);
cpudrv_pm_spd_t *cur_spd;
@@ -795,9 +759,9 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
int result = DDI_FAILURE;
pmc = kmem_zalloc((cpupm->num_spd + 1) * sizeof (char *), KM_SLEEP);
- size = CPUDRV_PM_COMP_SIZE();
- if (cpupm->num_spd > CPUDRV_PM_COMP_MAX_VAL) {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: "
+ size = CPUDRV_COMP_SIZE();
+ if (cpupm->num_spd > CPUDRV_COMP_MAX_VAL) {
+ cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
"number of speeds exceeded limits",
ddi_get_instance(cpudsp->dip));
kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
@@ -808,9 +772,9 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
i--, cur_spd = cur_spd->down_spd) {
cur_spd->pm_level = i;
pmc[i] = kmem_zalloc((size * sizeof (char)), KM_SLEEP);
- comp_spd = CPUDRV_PM_COMP_SPEED(cpupm, cur_spd);
- if (comp_spd > CPUDRV_PM_COMP_MAX_VAL) {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: "
+ comp_spd = CPUDRV_COMP_SPEED(cpupm, cur_spd);
+ if (comp_spd > CPUDRV_COMP_MAX_VAL) {
+ cmn_err(CE_WARN, "cpudrv_comp_create: "
"instance %d: speed exceeded limits",
ddi_get_instance(cpudsp->dip));
for (j = cpupm->num_spd; j >= i; j--) {
@@ -820,14 +784,14 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
sizeof (char *));
return (result);
}
- CPUDRV_PM_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
- DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: "
+ CPUDRV_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
+ DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: "
"instance %d: pm-components power level %d string '%s'\n",
ddi_get_instance(cpudsp->dip), i, pmc[i]));
}
pmc[0] = kmem_zalloc(sizeof (name), KM_SLEEP);
(void) strcat(pmc[0], name);
- DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: instance %d: "
+ DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: instance %d: "
"pm-components component name '%s'\n",
ddi_get_instance(cpudsp->dip), pmc[0]));
@@ -835,7 +799,7 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
"pm-components", pmc, cpupm->num_spd + 1) == DDI_PROP_SUCCESS) {
result = DDI_SUCCESS;
} else {
- cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: "
+ cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
"can't create pm-components property",
ddi_get_instance(cpudsp->dip));
}
@@ -851,16 +815,16 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Mark a component idle.
*/
-#define CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
+#define CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
if ((cpupm)->pm_busycnt >= 1) { \
- if (pm_idle_component((dip), CPUDRV_PM_COMP_NUM) == \
+ if (pm_idle_component((dip), CPUDRV_COMP_NUM) == \
DDI_SUCCESS) { \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
"instance %d: pm_idle_component called\n", \
ddi_get_instance((dip)))); \
(cpupm)->pm_busycnt--; \
} else { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
"can't idle CPU component", \
ddi_get_instance((dip))); \
} \
@@ -870,16 +834,16 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Marks a component busy in both PM framework and driver state structure.
*/
-#define CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
+#define CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
if ((cpupm)->pm_busycnt < 1) { \
- if (pm_busy_component((dip), CPUDRV_PM_COMP_NUM) == \
+ if (pm_busy_component((dip), CPUDRV_COMP_NUM) == \
DDI_SUCCESS) { \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
"instance %d: pm_busy_component called\n", \
ddi_get_instance((dip)))); \
(cpupm)->pm_busycnt++; \
} else { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
"can't busy CPU component", \
ddi_get_instance((dip))); \
} \
@@ -889,19 +853,19 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
/*
* Marks a component busy and calls pm_raise_power().
*/
-#define CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \
+#define CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \
/* \
* Mark driver and PM framework busy first so framework doesn't try \
* to bring CPU to lower speed when we need to be at higher speed. \
*/ \
- CPUDRV_PM_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
+ CPUDRV_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
mutex_exit(&(cpudsp)->lock); \
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " \
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " \
"pm_raise_power called to %d\n", ddi_get_instance((dip)), \
(new_level))); \
- if (pm_raise_power((dip), CPUDRV_PM_COMP_NUM, (new_level)) != \
+ if (pm_raise_power((dip), CPUDRV_COMP_NUM, (new_level)) != \
DDI_SUCCESS) { \
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't " \
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't " \
"raise CPU power level", ddi_get_instance((dip))); \
} \
mutex_enter(&(cpudsp)->lock); \
@@ -913,7 +877,7 @@ cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp)
* We dispatch a taskq to do that job.
*/
static void
-cpudrv_pm_monitor_disp(void *arg)
+cpudrv_monitor_disp(void *arg)
{
cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;
@@ -922,13 +886,13 @@ cpudrv_pm_monitor_disp(void *arg)
* The queue should be empty at this time.
*/
mutex_enter(&cpudsp->cpudrv_pm.timeout_lock);
- if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_pm_monitor, arg,
+ if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_monitor, arg,
TQ_NOSLEEP)) {
mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor_disp: failed to "
- "dispatch the cpudrv_pm_monitor taskq\n"));
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor_disp: failed to "
+ "dispatch the cpudrv_monitor taskq\n"));
mutex_enter(&cpudsp->lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
return;
}
@@ -940,17 +904,16 @@ cpudrv_pm_monitor_disp(void *arg)
* Monitors each CPU for the amount of time idle thread was running in the
* last quantum and arranges for the CPU to go to the lower or higher speed.
* Called at the time interval appropriate for the current speed. The
- * time interval for normal speed is CPUDRV_PM_QUANT_CNT_NORMAL. The time
+ * time interval for normal speed is CPUDRV_QUANT_CNT_NORMAL. The time
* interval for other speeds (including unknown speed) is
- * CPUDRV_PM_QUANT_CNT_OTHR.
+ * CPUDRV_QUANT_CNT_OTHR.
*/
static void
-cpudrv_pm_monitor(void *arg)
+cpudrv_monitor(void *arg)
{
cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg;
cpudrv_pm_t *cpupm;
cpudrv_pm_spd_t *cur_spd, *new_spd;
- cpu_t *cp;
dev_info_t *dip;
uint_t idle_cnt, user_cnt, system_cnt;
clock_t ticks;
@@ -984,12 +947,12 @@ cpudrv_pm_monitor(void *arg)
* That's because we don't know what the CPU domains look like
* until all instances have been initialized.
*/
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
+ is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
if (!is_ready) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"CPU not ready for x-calls\n", ddi_get_instance(dip)));
- } else if (!(is_ready = cpudrv_pm_power_ready())) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ } else if (!(is_ready = cpudrv_power_ready())) {
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"waiting for all CPUs to be power manageable\n",
ddi_get_instance(dip)));
}
@@ -998,8 +961,8 @@ cpudrv_pm_monitor(void *arg)
* Make sure that we are busy so that framework doesn't
* try to bring us down in this situation.
*/
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
@@ -1008,35 +971,36 @@ cpudrv_pm_monitor(void *arg)
* Make sure that we are still not at unknown power level.
*/
if (cur_spd == NULL) {
- DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
"cur_spd is unknown\n", ddi_get_instance(dip)));
- CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
- CPUDRV_PM_TOPSPEED(cpupm)->pm_level);
+ CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
+ CPUDRV_TOPSPEED(cpupm)->pm_level);
/*
* We just changed the speed. Wait till at least next
* call to this routine before proceeding ahead.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
mutex_enter(&cpu_lock);
- if ((cp = cpu_get(cpudsp->cpu_id)) == NULL) {
+ if (cpudsp->cp == NULL &&
+ (cpudsp->cp = cpu_get(cpudsp->cpu_id)) == NULL) {
mutex_exit(&cpu_lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
- cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't get "
+ cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't get "
"cpu_t", ddi_get_instance(dip));
goto do_return;
}
if (!cpupm->pm_started) {
cpupm->pm_started = B_TRUE;
- set_supp_freqs(cp, cpupm);
+ cpudrv_set_supp_freqs(cpudsp);
}
- get_cpu_mstate(cp, msnsecs);
+ get_cpu_mstate(cpudsp->cp, msnsecs);
GET_CPU_MSTATE_CNT(CMS_IDLE, idle_cnt);
GET_CPU_MSTATE_CNT(CMS_USER, user_cnt);
GET_CPU_MSTATE_CNT(CMS_SYSTEM, system_cnt);
@@ -1048,7 +1012,7 @@ cpudrv_pm_monitor(void *arg)
if (cpupm->lastquan_ticks == 0) {
cpupm->lastquan_ticks = NSEC_TO_TICK(gethrtime());
mutex_exit(&cpu_lock);
- CPUDRV_PM_MONITOR_INIT(cpudsp);
+ CPUDRV_MONITOR_INIT(cpudsp);
mutex_exit(&cpudsp->lock);
goto do_return;
}
@@ -1071,10 +1035,10 @@ cpudrv_pm_monitor(void *arg)
* Time taken between recording the current counts and
* arranging the next call of this routine is an error in our
* calculation. We minimize the error by calling
- * CPUDRV_PM_MONITOR_INIT() here instead of end of this routine.
+ * CPUDRV_MONITOR_INIT() here instead of end of this routine.
*/
- CPUDRV_PM_MONITOR_INIT(cpudsp);
- DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_pm_monitor: instance %d: "
+ CPUDRV_MONITOR_INIT(cpudsp);
+ DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_monitor: instance %d: "
"idle count %d, user count %d, system count %d, pm_level %d, "
"pm_busycnt %d\n", ddi_get_instance(dip), idle_cnt, user_cnt,
system_cnt, cur_spd->pm_level, cpupm->pm_busycnt));
@@ -1089,7 +1053,7 @@ cpudrv_pm_monitor(void *arg)
* DPRINTFs changes the timing.
*/
if (tick_cnt > cur_spd->quant_cnt) {
- DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_pm_monitor: instance %d: "
+ DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_monitor: instance %d: "
"tick count %d > quantum_count %u\n",
ddi_get_instance(dip), tick_cnt, cur_spd->quant_cnt));
}
@@ -1102,7 +1066,7 @@ cpudrv_pm_monitor(void *arg)
user_cnt = (user_cnt * cur_spd->quant_cnt) / tick_cnt;
if ((user_cnt > cur_spd->user_hwm) || (idle_cnt < cur_spd->idle_lwm &&
- cur_spd->idle_blwm_cnt >= cpudrv_pm_idle_blwm_cnt_max)) {
+ cur_spd->idle_blwm_cnt >= cpudrv_idle_blwm_cnt_max)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt = 0;
/*
@@ -1111,21 +1075,21 @@ cpudrv_pm_monitor(void *arg)
* at the current speed.
*/
if (cur_spd == cur_spd->up_spd || cpudrv_direct_pm) {
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
} else {
new_spd = cur_spd->up_spd;
- CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
+ CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
new_spd->pm_level);
}
} else if ((user_cnt <= cur_spd->user_lwm) &&
- (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cp)) {
+ (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cpudsp->cp)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt = 0;
/*
* Arrange to go to next lower speed by informing our idle
* status to the power management framework.
*/
- CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm);
} else {
/*
* If we are between the idle water marks and have not
@@ -1134,7 +1098,7 @@ cpudrv_pm_monitor(void *arg)
*/
if ((idle_cnt < cur_spd->idle_hwm) &&
(idle_cnt >= cur_spd->idle_lwm) &&
- (cur_spd->idle_bhwm_cnt < cpudrv_pm_idle_bhwm_cnt_max)) {
+ (cur_spd->idle_bhwm_cnt < cpudrv_idle_bhwm_cnt_max)) {
cur_spd->idle_blwm_cnt = 0;
cur_spd->idle_bhwm_cnt++;
mutex_exit(&cpudsp->lock);
@@ -1147,7 +1111,7 @@ cpudrv_pm_monitor(void *arg)
/*
* Arranges to stay at the current speed.
*/
- CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm);
+ CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
}
mutex_exit(&cpudsp->lock);
do_return:
diff --git a/usr/src/uts/common/io/pm.c b/usr/src/uts/common/io/pm.c
index fba811e3c8..b9f146bf2a 100644
--- a/usr/src/uts/common/io/pm.c
+++ b/usr/src/uts/common/io/pm.c
@@ -19,11 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
/*
* pm This driver now only handles the ioctl interface. The scanning
* and policy stuff now lives in common/os/sunpm.c.
@@ -33,6 +32,7 @@
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/modctl.h>
+#include <sys/callb.h> /* callback registration for cpu_deep_idle */
#include <sys/conf.h> /* driver flags and functions */
#include <sys/open.h> /* OTYP_CHR definition */
#include <sys/stat.h> /* S_IFCHR definition */
@@ -53,6 +53,7 @@
#include <sys/note.h>
#include <sys/taskq.h>
#include <sys/policy.h>
+#include <sys/cpu_pm.h>
/*
* Minor number is instance<<8 + clone minor from range 1-254; (0 reserved
@@ -73,6 +74,7 @@ extern kmutex_t pm_scan_lock; /* protects autopm_enable, pm_scans_disabled */
extern kmutex_t pm_clone_lock; /* protects pm_clones array */
extern int autopm_enabled;
extern pm_cpupm_t cpupm;
+extern pm_cpupm_t cpupm_default_mode;
extern int pm_default_idle_threshold;
extern int pm_system_idle_threshold;
extern int pm_cpu_idle_threshold;
@@ -444,6 +446,10 @@ static struct pm_cmd_info pmci[] = {
{PM_ADD_DEPENDENT_PROPERTY, "PM_ADD_DEPENDENT_PROPERTY", 1, PM_REQ,
INWHO | INDATASTRING, NODIP, DEP, SU},
{PM_START_CPUPM, "PM_START_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU},
+ {PM_START_CPUPM_EV, "PM_START_CPUPM_EV", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_START_CPUPM_POLL, "PM_START_CPUPM_POLL", 1, NOSTRUCT, 0,
+ 0, 0, SU},
{PM_STOP_CPUPM, "PM_STOP_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU},
{PM_GET_CPU_THRESHOLD, "PM_GET_CPU_THRESHOLD", 1, NOSTRUCT},
{PM_SET_CPU_THRESHOLD, "PM_SET_CPU_THRESHOLD", 1, NOSTRUCT,
@@ -457,6 +463,12 @@ static struct pm_cmd_info pmci[] = {
{PM_SEARCH_LIST, "PM_SEARCH_LIST", 1, PM_SRCH, 0, 0, 0, SU},
{PM_GET_CMD_NAME, "PM_GET_CMD_NAME", 1, PM_REQ, INDATAOUT, NODIP,
NODEP, 0},
+ {PM_DISABLE_CPU_DEEP_IDLE, "PM_DISABLE_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_ENABLE_CPU_DEEP_IDLE, "PM_START_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
+ {PM_DEFAULT_CPU_DEEP_IDLE, "PM_DFLT_CPU_DEEP_IDLE", 1, NOSTRUCT, 0,
+ 0, 0, SU},
{0, NULL}
};
@@ -500,16 +512,17 @@ pm_start_pm_walk(dev_info_t *dip, void *arg)
switch (cmd) {
case PM_START_CPUPM:
+ case PM_START_CPUPM_POLL:
if (!PM_ISCPU(dip))
return (DDI_WALK_CONTINUE);
mutex_enter(&pm_scan_lock);
- if (!PM_CPUPM_DISABLED)
+ if (!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM)
pm_scan_init(dip);
mutex_exit(&pm_scan_lock);
break;
case PM_START_PM:
mutex_enter(&pm_scan_lock);
- if (PM_ISCPU(dip) && PM_CPUPM_DISABLED) {
+ if (PM_ISCPU(dip) && (PM_CPUPM_DISABLED || PM_EVENT_CPUPM)) {
mutex_exit(&pm_scan_lock);
return (DDI_WALK_CONTINUE);
}
@@ -552,7 +565,7 @@ pm_stop_pm_walk(dev_info_t *dip, void *arg)
* stop them as part of PM_STOP_PM. Only stop them as part of
* PM_STOP_CPUPM and PM_RESET_PM.
*/
- if (PM_ISCPU(dip) && PM_CPUPM_ENABLED)
+ if (PM_ISCPU(dip) && PM_POLLING_CPUPM)
return (DDI_WALK_CONTINUE);
break;
case PM_STOP_CPUPM:
@@ -2662,22 +2675,74 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
switch (cmd) {
case PM_START_PM:
case PM_START_CPUPM:
+ case PM_START_CPUPM_EV:
+ case PM_START_CPUPM_POLL:
{
+ pm_cpupm_t new_mode = PM_CPUPM_NOTSET;
+ pm_cpupm_t old_mode = PM_CPUPM_NOTSET;
+ int r;
+
mutex_enter(&pm_scan_lock);
if ((cmd == PM_START_PM && autopm_enabled) ||
- (cmd == PM_START_CPUPM && PM_CPUPM_ENABLED)) {
+ (cmd == PM_START_CPUPM && PM_DEFAULT_CPUPM) ||
+ (cmd == PM_START_CPUPM_EV && PM_EVENT_CPUPM) ||
+ (cmd == PM_START_CPUPM_POLL && PM_POLLING_CPUPM)) {
mutex_exit(&pm_scan_lock);
- PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n",
- cmdstr))
+ PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n", cmdstr))
ret = EBUSY;
break;
}
- if (cmd == PM_START_PM)
+
+ if (cmd == PM_START_PM) {
autopm_enabled = 1;
- else
- cpupm = PM_CPUPM_ENABLE;
+ } else if (cmd == PM_START_CPUPM) {
+ old_mode = cpupm;
+ new_mode = cpupm = cpupm_default_mode;
+ } else if (cmd == PM_START_CPUPM_EV) {
+ old_mode = cpupm;
+ new_mode = cpupm = PM_CPUPM_EVENT;
+ } else if (cmd == PM_START_CPUPM_POLL) {
+ old_mode = cpupm;
+ new_mode = cpupm = PM_CPUPM_POLLING;
+ }
+
mutex_exit(&pm_scan_lock);
- ddi_walk_devs(ddi_root_node(), pm_start_pm_walk, &cmd);
+
+ /*
+ * If we are changing CPUPM modes, and it is active,
+ * then stop it from operating in the old mode.
+ */
+ if (old_mode == PM_CPUPM_POLLING) {
+ int c = PM_STOP_CPUPM;
+ ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk,
+ &c);
+ } else if (old_mode == PM_CPUPM_EVENT) {
+ r = cpupm_set_policy(CPUPM_POLICY_DISABLED);
+
+ /*
+ * Disabling CPUPM policy should always
+ * succeed
+ */
+ ASSERT(r == 0);
+ }
+
+ /*
+ * If we are changing to event based CPUPM, enable it.
+ * In the event it's not supported, fall back to
+ * polling based CPUPM.
+ */
+ if (new_mode == PM_CPUPM_EVENT &&
+ cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0) {
+ mutex_enter(&pm_scan_lock);
+ new_mode = cpupm = PM_CPUPM_POLLING;
+ cmd = PM_START_CPUPM_POLL;
+ mutex_exit(&pm_scan_lock);
+ }
+ if (new_mode == PM_CPUPM_POLLING ||
+ cmd == PM_START_PM) {
+ ddi_walk_devs(ddi_root_node(), pm_start_pm_walk,
+ &cmd);
+ }
ret = 0;
break;
}
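
The new PM_START_CPUPM_EV and PM_START_CPUPM_POLL commands let a privileged
consumer select the CPUPM mode explicitly, while PM_START_CPUPM now picks up the
platform's default mode (cpupm_default_mode). A minimal userland sketch of driving
the new commands follows; the /dev/pm device path, privilege handling, and error
handling are illustrative assumptions, not part of this change:

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <sys/pm.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        /* The pm driver's node is assumed to be /dev/pm (as used by pmconfig). */
        int fd = open("/dev/pm", O_RDWR);

        if (fd < 0) {
            perror("open /dev/pm");
            return (1);
        }

        /*
         * Request event based CPUPM. Per the handler above, the kernel
         * falls back to polling mode if the elastic policy can't be enabled.
         */
        if (ioctl(fd, PM_START_CPUPM_EV) < 0)
            perror("PM_START_CPUPM_EV");

        /* The CPU deep idle keyword is controlled through the same interface. */
        if (ioctl(fd, PM_ENABLE_CPU_DEEP_IDLE) < 0)
            perror("PM_ENABLE_CPU_DEEP_IDLE");

        (void) close(fd);
        return (0);
    }

Both command groups carry the SU flag in pmci[] above, so the caller needs the
appropriate privileges, and PM_STOP_CPUPM reverts either CPUPM mode.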
@@ -2687,6 +2752,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
case PM_STOP_CPUPM:
{
extern void pm_discard_thresholds(void);
+ pm_cpupm_t old_mode = PM_CPUPM_NOTSET;
mutex_enter(&pm_scan_lock);
if ((cmd == PM_STOP_PM && !autopm_enabled) ||
@@ -2697,22 +2763,30 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
ret = EINVAL;
break;
}
+
if (cmd == PM_STOP_PM) {
autopm_enabled = 0;
pm_S3_enabled = 0;
autoS3_enabled = 0;
} else if (cmd == PM_STOP_CPUPM) {
+ old_mode = cpupm;
cpupm = PM_CPUPM_DISABLE;
} else {
autopm_enabled = 0;
autoS3_enabled = 0;
+ old_mode = cpupm;
cpupm = PM_CPUPM_NOTSET;
}
mutex_exit(&pm_scan_lock);
/*
* bring devices to full power level, stop scan
+ * If CPUPM was operating in event driven mode, disable
+ * that.
*/
+ if (old_mode == PM_CPUPM_EVENT) {
+ (void) cpupm_set_policy(CPUPM_POLICY_DISABLED);
+ }
ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk, &cmd);
ret = 0;
if (cmd == PM_STOP_PM || cmd == PM_STOP_CPUPM)
@@ -2796,7 +2870,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
case PM_GET_CPUPM_STATE:
{
- if (PM_CPUPM_ENABLED)
+ if (PM_POLLING_CPUPM || PM_EVENT_CPUPM)
*rval_p = PM_CPU_PM_ENABLED;
else if (PM_CPUPM_DISABLED)
*rval_p = PM_CPU_PM_DISABLED;
@@ -2881,6 +2955,34 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
break;
}
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_ENABLE_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EBUSY;
+ break;
+ }
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_DISABLE_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EINVAL;
+ break;
+ }
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ {
+ if (callb_execute_class(CB_CL_CPU_DEEP_IDLE,
+ PM_DEFAULT_CPU_DEEP_IDLE) == NULL)
+ ret = 0;
+ else
+ ret = EBUSY;
+ break;
+ }
+
default:
/*
* Internal error, invalid ioctl description
@@ -2896,7 +2998,7 @@ pm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, int *rval_p)
break;
}
- default:
+default:
/*
* Internal error, invalid ioctl description
* force debug entry even if pm_debug not set
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 904e507caf..8b8d0d08b5 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -56,6 +56,7 @@
#include <sys/msacct.h>
#include <sys/time.h>
#include <sys/archsystm.h>
+#include <sys/sdt.h>
#if defined(__x86) || defined(__amd64)
#include <sys/x86_archext.h>
#endif
@@ -2163,6 +2164,8 @@ static struct {
kstat_named_t ci_pkg_core_id;
kstat_named_t ci_ncpuperchip;
kstat_named_t ci_ncoreperchip;
+ kstat_named_t ci_max_cstates;
+ kstat_named_t ci_curr_cstate;
#endif
} cpu_info_template = {
{ "state", KSTAT_DATA_CHAR },
@@ -2189,6 +2192,8 @@ static struct {
{ "pkg_core_id", KSTAT_DATA_LONG },
{ "ncpu_per_chip", KSTAT_DATA_INT32 },
{ "ncore_per_chip", KSTAT_DATA_INT32 },
+ { "supported_max_cstates", KSTAT_DATA_INT32 },
+ { "current_cstate", KSTAT_DATA_INT32 },
#endif
};
@@ -2258,6 +2263,8 @@ cpu_info_kstat_update(kstat_t *ksp, int rw)
cpu_info_template.ci_ncoreperchip.value.l =
cpuid_get_ncore_per_chip(cp);
cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp);
+ cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates;
+ cpu_info_template.ci_curr_cstate.value.l = cp->cpu_m.curr_cstate;
#endif
return (0);
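
The two new cpu_info statistics expose the deepest C-state the processor supports
and the C-state the CPU most recently entered. Assuming the usual cpu_info kstat
naming, they can be read from userland with, for example,
kstat -p cpu_info:0:cpu_info0:current_cstate (instance 0 shown purely for
illustration).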
@@ -2960,6 +2967,25 @@ cpu_set_supp_freqs(cpu_t *cp, const char *freqs)
}
/*
+ * Indicate the current CPU's clock frequency (in Hz).
+ * The calling context must be such that CPU references are safe.
+ */
+void
+cpu_set_curr_clock(uint64_t new_clk)
+{
+ uint64_t old_clk;
+
+ old_clk = CPU->cpu_curr_clock;
+ CPU->cpu_curr_clock = new_clk;
+
+ /*
+ * The cpu-change-speed DTrace probe exports the frequency in Hz
+ */
+ DTRACE_PROBE3(cpu__change__speed, processorid_t, CPU->cpu_id,
+ uint64_t, old_clk, uint64_t, new_clk);
+}
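
The cpu-change-speed probe point moves here from cpudrv.c (the driver-private
DTRACE_PROBE3 removed above) with processor id and old/new frequency arguments.
Assuming the usual SDT name mapping (double underscores become hyphens), a
transition could be watched with a one-liner such as
dtrace -n 'sdt:::cpu-change-speed { printf("cpu %d: %d -> %d Hz", arg0, arg1, arg2); }';
the argument order follows the DTRACE_PROBE3() call above.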
+
+/*
* processor_info(2) and p_online(2) status support functions
* The constants returned by the cpu_get_state() and cpu_get_state_str() are
* for use in communicating processor state information to userland. Kernel
diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c
new file mode 100644
index 0000000000..848907af1d
--- /dev/null
+++ b/usr/src/uts/common/os/cpu_pm.c
@@ -0,0 +1,840 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/cmn_err.h>
+#include <sys/sdt.h>
+
+/*
+ * Solaris Event Based CPU Power Manager
+ *
+ * This file implements platform independent event based CPU power management.
+ * When CPUs are configured into the system, the CMT scheduling subsystem will
+ * query the platform to determine if the CPU belongs to any power management
+ * domains. That is, sets of CPUs that share power management states.
+ *
+ * Active Power Management domains represent a group of CPUs across which the
+ * Operating System can request speed changes (which may in turn result
+ * in voltage changes). This allows the operating system to trade off
+ * performance for power savings.
+ *
+ * Idle Power Management domains can enter power savings states when they are
+ * unutilized. These states allow the Operating System to trade off power
+ * for performance (in the form of latency to transition from the idle state
+ * to an active one).
+ *
+ * For each active and idle power domain the CMT subsystem instantiates, a
+ * cpupm_domain_t structure is created. As the dispatcher schedules threads
+ * to run on the system's CPUs, it will also track the utilization of the
+ * enumerated power domains. Significant changes in utilization will result
+ * in the dispatcher sending the power manager events that relate to the
+ * utilization of the power domain. The power manager receives the events,
+ * and in the context of the policy objectives in force, may decide to request
+ * the domain's power/performance state be changed.
+ *
+ * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
+ * manager will request the CPUs in the domain run at their fastest (and most
+ * power consuming) state. When the domain becomes idle (utilization at zero),
+ * the power manager will request that the CPUs run at a speed that saves the
+ * most power.
+ *
+ * The advantage of this scheme is that the CPU power manager, working with the
+ * dispatcher, can be extremely responsive to changes in utilization: optimizing
+ * for performance in the presence of utilization, and for power savings in the
+ * presence of idleness. Such close collaboration with the dispatcher has other
+ * benefits that will play out in the form of more sophisticated power /
+ * performance policy in the near future.
+ *
+ * Avoiding state thrashing in the presence of transient periods of utilization
+ * and idleness while still being responsive to non-transient periods is key.
+ * The power manager implements several "governors" that are used to throttle
+ * state transitions when a significant amount of transient idle or transient
+ * work is detected.
+ *
+ * Kernel background activity (e.g. taskq threads) is by far the most common
+ * form of transient utilization. Ungoverned in the face of this utilization,
+ * hundreds of state transitions per second would result on an idle system.
+ *
+ * Transient idleness is common when a thread briefly yields the CPU to
+ * wait for an event elsewhere in the system. Where the idle period is short
+ * enough, the overhead associated with making the state transition doesn't
+ * justify the power savings.
+ */
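
The interfaces that follow are consumed by the dispatcher and CMT code rather than
by leaf drivers. A minimal sketch of the intended flow is shown below; the wrapper
function and the calling sequence are hypothetical, and only the function
signatures, event names and lock requirements come from this file:

    /*
     * Hypothetical caller: enable the elastic policy, initialize the CPU's
     * active power domain, and notify the power manager as the domain's
     * utilization changes. In the kernel proper the events are generated
     * from the dispatcher / processor group callbacks.
     */
    static void
    cpupm_example(struct cpu *cp)
    {
        cpupm_domain_t *dom;
        hrtime_t now;

        if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0)
            return;     /* no active power domains enumerated */

        mutex_enter(&cpu_lock);
        dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
        mutex_exit(&cpu_lock);

        /* The domain just went idle: ask for the low power state */
        now = gethrtime_unscaled();
        cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);

        /* Work arrived again: ask for maximum performance */
        now = gethrtime_unscaled();
        cpupm_utilization_event(cp, now, dom, CPUPM_DOM_BUSY_FROM_IDLE);
    }

Whether a request actually results in a state change is subject to the transient
work and transient idle governors described above.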
+
+static cpupm_domain_t *cpupm_domains = NULL;
+
+/*
+ * Uninitialized state of CPU power management is disabled
+ */
+cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
+
+/*
+ * Periods of utilization lasting less than this time interval are characterized
+ * as transient. State changes associated with transient work are considered
+ * to be mispredicted. That is, it's not worth raising and lowering power states
+ * where the utilization lasts for less than this interval.
+ */
+hrtime_t cpupm_tw_predict_interval;
+
+/*
+ * Periods of idleness lasting less than this time interval are characterized
+ * as transient. State changes associated with transient idle are considered
+ * to be mispredicted. That is, it's not worth lowering and raising power
+ * states where the idleness lasts for less than this interval.
+ */
+hrtime_t cpupm_ti_predict_interval;
+
+/*
+ * Number of mispredictions after which future transitions will be governed.
+ */
+int cpupm_mispredict_thresh = 2;
+
+/*
+ * Likewise, the number of mispredicted governed transitions after which the
+ * governor will be removed.
+ */
+int cpupm_mispredict_gov_thresh = 10;
+
+/*
+ * The transient work and transient idle prediction intervals are initialized
+ * to be some multiple of the amount of time it takes to transition a power
+ * domain from the highest to the lowest power state and back again; this
+ * round trip latency is measured when the governors are initialized.
+ *
+ * The default values of those multiples are specified here. Tuning them higher
+ * will result in the transient work and transient idle governors being used
+ * more aggressively, which limits the frequency of state transitions at the
+ * expense of performance and power savings, respectively.
+ */
+#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
+#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
+
+/*
+ * Number of high=>low=>high measurements performed, of which the average
+ * is taken.
+ */
+#define CPUPM_BENCHMARK_ITERS 5
+
+int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
+int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
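
As a rough worked example of the intent (the exact computation lives in
cpupm_governor_initialize(), declared below but not shown here, so the simple
multiplication is an assumption): if the measured high=>low=>high round trip
averages 50 microseconds, the transient idle prediction interval comes out to
roughly 600 * 50us = 30ms and the transient work interval to roughly
25 * 50us = 1.25ms. Idle periods shorter than the former and busy bursts shorter
than the latter are then counted as mispredictions by the respective governors.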
+
+
+static int cpupm_governor_initialize(void);
+static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
+
+cpupm_policy_t
+cpupm_get_policy(void)
+{
+ return (cpupm_policy);
+}
+
+int
+cpupm_set_policy(cpupm_policy_t new_policy)
+{
+ static int gov_init = 0;
+ int result = 0;
+
+ mutex_enter(&cpu_lock);
+ if (new_policy == cpupm_policy) {
+ mutex_exit(&cpu_lock);
+ return (result);
+ }
+
+ /*
+ * Pausing CPUs causes a high priority thread to be scheduled
+ * on all other CPUs (besides the current one). This locks out
+ * other CPUs from making CPUPM state transitions.
+ */
+ switch (new_policy) {
+ case CPUPM_POLICY_DISABLED:
+ pause_cpus(NULL);
+ cpupm_policy = CPUPM_POLICY_DISABLED;
+ start_cpus();
+
+ result = cmt_pad_disable(PGHW_POW_ACTIVE);
+
+ /*
+ * Once PAD has been enabled, it should always be possible
+ * to disable it.
+ */
+ ASSERT(result == 0);
+
+ /*
+ * Bring all the active power domains to the maximum
+ * performance state.
+ */
+ cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
+ CPUPM_STATE_MAX_PERF);
+
+ break;
+ case CPUPM_POLICY_ELASTIC:
+
+ result = cmt_pad_enable(PGHW_POW_ACTIVE);
+ if (result < 0) {
+ /*
+ * Failed to enable PAD across the active power
+ * domains, which may well be because none were
+ * enumerated.
+ */
+ break;
+ }
+
+ pause_cpus(NULL);
+ /*
+ * Attempt to initialize the governor parameters the first
+ * time through.
+ */
+ if (gov_init == 0) {
+ result = cpupm_governor_initialize();
+ if (result == 0) {
+ gov_init = 1;
+ } else {
+ /*
+ * Failed to initialize the governor parameters
+ */
+ start_cpus();
+ break;
+ }
+ }
+ cpupm_policy = CPUPM_POLICY_ELASTIC;
+ start_cpus();
+
+ break;
+ default:
+ cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
+ new_policy);
+ ASSERT(0);
+ break;
+ }
+ mutex_exit(&cpu_lock);
+
+ return (result);
+}
+
+/*
+ * Look for an existing power domain
+ */
+static cpupm_domain_t *
+cpupm_domain_find(id_t id, cpupm_dtype_t type)
+{
+	cpupm_domain_t *dom;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = cpupm_domains;
+ while (dom != NULL) {
+ if (id == dom->cpd_id && type == dom->cpd_type)
+ return (dom);
+ dom = dom->cpd_next;
+ }
+ return (NULL);
+}
+
+/*
+ * Create a new domain
+ */
+static cpupm_domain_t *
+cpupm_domain_create(id_t id, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
+ dom->cpd_id = id;
+ dom->cpd_type = type;
+
+ /* Link into the known domain list */
+ dom->cpd_next = cpupm_domains;
+ cpupm_domains = dom;
+
+ return (dom);
+}
+
+static void
+cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
+{
+ /*
+	 * In the event we're enumerating because the domain's state
+ * configuration has changed, toss any existing states.
+ */
+ if (dom->cpd_nstates > 0) {
+ kmem_free(dom->cpd_states,
+ sizeof (cpupm_state_t) * dom->cpd_nstates);
+ dom->cpd_nstates = 0;
+ }
+
+ /*
+ * Query to determine the number of states, allocate storage
+ * large enough to hold the state information, and pass it back
+ * to the platform driver to complete the enumeration.
+ */
+ dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
+
+ if (dom->cpd_nstates == 0)
+ return;
+
+ dom->cpd_states =
+ kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
+ (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
+}
+
+/*
+ * Initialize the specified type of power domain on behalf of the CPU
+ */
+cpupm_domain_t *
+cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ /*
+ * Instantiate the domain if it doesn't already exist
+ * and enumerate its power states.
+ */
+ did = cpupm_domain_id(cp, type);
+ dom = cpupm_domain_find(did, type);
+ if (dom == NULL) {
+ dom = cpupm_domain_create(did, type);
+ cpupm_domain_state_enum(cp, dom);
+ }
+
+ /*
+ * Named state initialization
+ */
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+ * For active power domains, the highest performance
+ * state is defined as first state returned from
+ * the domain enumeration.
+ */
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[0];
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
+ &dom->cpd_states[dom->cpd_nstates - 1];
+
+ /*
+ * Begin by assuming CPU is running at the max perf state.
+ */
+ dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ }
+
+ return (dom);
+}
+
+/*
+ * Return the id associated with the given type of domain
+ * to which cp belongs
+ */
+id_t
+cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (cpupm_plat_domain_id(cp, type));
+}
+
+/*
+ * Initiate a state change for the specified domain on behalf of cp
+ */
+int
+cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
+{
+ if (cpupm_plat_change_state(cp, state) < 0)
+ return (-1);
+
+ DTRACE_PROBE2(cpupm__change__state,
+ cpupm_domain_t *, dom,
+ cpupm_state_t *, state);
+
+ dom->cpd_state = state;
+ return (0);
+}
+
+/*
+ * Interface into the CPU power manager to indicate a significant change
+ * in utilization of the specified active power domain
+ */
+void
+cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
+ cpupm_util_event_t event)
+{
+ cpupm_state_t *new_state = NULL;
+ hrtime_t last;
+
+ if (cpupm_policy == CPUPM_POLICY_DISABLED) {
+ return;
+ }
+
+ /*
+ * What follows is a simple elastic power state management policy.
+ *
+ * If the utilization has become non-zero, and the domain was
+	 * previously at its lowest power state, then transition it
+ * to the highest state in the spirit of "race to idle".
+ *
+ * If the utilization has dropped to zero, then transition the
+ * domain to its lowest power state.
+ *
+ * Statistics are maintained to implement governors to reduce state
+ * transitions resulting from either transient work, or periods of
+ * transient idleness on the domain.
+ */
+ switch (event) {
+ case CPUPM_DOM_REMAIN_BUSY:
+
+ /*
+ * We've received an event that the domain is running a thread
+		 * that's made it to the end of its time slice. If we are at
+ * low power, then raise it. If the transient work governor
+ * is engaged, then remove it.
+ */
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (dom->cpd_tw_governed == B_TRUE) {
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ break;
+
+ case CPUPM_DOM_BUSY_FROM_IDLE:
+ last = dom->cpd_last_lower;
+ dom->cpd_last_raise = now;
+
+ DTRACE_PROBE3(cpupm__raise__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * There's non-zero utilization, and the domain is
+ * running in the lower power state. Before we
+			 * consider raising power, perform some bookkeeping
+ * for the transient idle governor.
+ */
+ if (dom->cpd_ti_governed == B_FALSE) {
+ if ((now - last) < cpupm_ti_predict_interval) {
+ /*
+ * We're raising the domain power and
+ * we *just* lowered it. Consider
+ * this a mispredicted power state
+ * transition due to a transient
+ * idle period.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_thresh) {
+ /*
+					 * There are enough transient
+ * idle transitions to
+ * justify governing future
+ * lowering requests.
+ */
+ dom->cpd_ti_governed = B_TRUE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * lowering.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ if (dom->cpd_tw_governed == B_TRUE) {
+ /*
+ * Raise requests are governed due to
+ * transient work.
+ */
+ DTRACE_PROBE1(cpupm__raise__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient idle governor is
+				 * also in place, examine the preceding idle
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_ti_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_ti_predict_interval)) {
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_ti_governed =
+ B_FALSE;
+ dom->cpd_ti = 0;
+ }
+ }
+ return;
+ }
+ /*
+ * Prepare to transition to the higher power state
+ */
+ new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * Utilization is non-zero, and we're already running
+ * in the higher power state. Take this opportunity to
+			 * perform some bookkeeping if the last lowering
+ * request was governed.
+ */
+ if (dom->cpd_ti_governed == B_TRUE) {
+ if ((now - last) >= cpupm_ti_predict_interval) {
+ /*
+ * The domain is transient idle
+ * governed, and we mispredicted
+ * governing the last lowering request.
+ */
+ if (++dom->cpd_ti >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+					 * There are enough non-transient
+ * idle periods to justify
+ * removing the governor.
+ */
+ dom->cpd_ti_governed = B_FALSE;
+ dom->cpd_ti = 0;
+ DTRACE_PROBE1(
+ cpupm__ti__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * Correctly predicted governing the
+ * last lowering request.
+ */
+ dom->cpd_ti = 0;
+ }
+ }
+ }
+ break;
+
+ case CPUPM_DOM_IDLE_FROM_BUSY:
+ last = dom->cpd_last_raise;
+ dom->cpd_last_lower = now;
+
+ DTRACE_PROBE3(cpupm__lower__req,
+ cpupm_domain_t *, dom,
+ hrtime_t, last,
+ hrtime_t, now);
+
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+
+ /*
+ * The domain is idle, and is running in the highest
+ * performance state. Before we consider lowering power,
+ * perform some bookkeeping for the transient work
+ * governor.
+ */
+ if (dom->cpd_tw_governed == B_FALSE) {
+ if ((now - last) < cpupm_tw_predict_interval) {
+ /*
+ * We're lowering the domain power and
+ * we *just* raised it. Consider the
+ * last raise mispredicted due to
+ * transient work.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_thresh) {
+ /*
+ * There are enough transient work
+ * transitions to justify
+ * governing future raise
+ * requests.
+ */
+ dom->cpd_tw_governed = B_TRUE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__governed,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted the last
+ * raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ if (dom->cpd_ti_governed == B_TRUE) {
+ /*
+ * Lowering requests are governed due to
+ * transient idleness.
+ */
+ DTRACE_PROBE1(cpupm__lowering__governed,
+ cpupm_domain_t *, dom);
+
+ /*
+ * It's likely that we'll be governed for a
+ * while. If the transient work governor is
+ * also in place, examine the preceding busy
+ * interval to see if that still makes sense.
+ */
+ if (dom->cpd_tw_governed == B_TRUE &&
+ ((now - last) >=
+ cpupm_tw_predict_interval)) {
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ dom->cpd_tw_governed =
+ B_FALSE;
+ dom->cpd_tw = 0;
+ }
+ }
+ return;
+ }
+
+ /*
+ * Prepare to transition to a lower power state.
+ */
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+
+ } else if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+ /*
+ * The domain is idle, and we're already running in
+ * the lower power state. Take this opportunity to
+ * perform some bookkeeping if the last raising
+ * request was governed.
+ */
+ if (dom->cpd_tw_governed == B_TRUE) {
+ if ((now - last) >= cpupm_tw_predict_interval) {
+ /*
+ * The domain is transient work
+ * governed, and we mispredicted
+ * governing the last raising request.
+ */
+ if (++dom->cpd_tw >=
+ cpupm_mispredict_gov_thresh) {
+ /*
+ * There's enough non-transient
+ * work to justify removing
+ * the governor.
+ */
+ dom->cpd_tw_governed = B_FALSE;
+ dom->cpd_tw = 0;
+ DTRACE_PROBE1(
+ cpupm__tw__ungoverned,
+ cpupm_domain_t *, dom);
+ }
+ } else {
+ /*
+ * We correctly predicted governing
+ * the last raise.
+ */
+ dom->cpd_tw = 0;
+ }
+ }
+ }
+ break;
+ }
+ /*
+ * Change the power state.
+ * Currently, nothing further is done if the change fails.
+ */
+ if (new_state)
+ (void) cpupm_change_state(cp, dom, new_state);
+}
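
The raise and lower paths above share a single idiom: when a request arrives "too soon" after the opposite request, a per-domain misprediction counter is bumped, and once it crosses a threshold the corresponding governor engages (symmetrically, enough well-spaced requests disengage it again). The standalone sketch below distills that hysteresis; the type, names, and constants are illustrative assumptions, not part of this change.

#include <stdio.h>

/* Hypothetical, simplified stand-in for one governor's bookkeeping. */
typedef struct {
	long long	last_opposite;	/* time of the last opposite request */
	int		mispredicts;	/* consecutive mispredictions seen */
	int		governed;	/* is the governor engaged? */
} gov_t;

#define	PREDICT_INTERVAL	100	/* stands in for cpupm_*_predict_interval */
#define	MISPREDICT_THRESH	4	/* stands in for cpupm_mispredict_thresh */

/*
 * A request arrives at time "now". If it follows the opposite request
 * within PREDICT_INTERVAL, count it as a misprediction; enough of those
 * in a row engage the governor, while a correctly predicted request
 * resets the count.
 */
static void
gov_event(gov_t *g, long long now)
{
	if ((now - g->last_opposite) < PREDICT_INTERVAL) {
		if (++g->mispredicts >= MISPREDICT_THRESH) {
			g->governed = 1;
			g->mispredicts = 0;
		}
	} else {
		g->mispredicts = 0;
	}
}

int
main(void)
{
	gov_t g = { 0, 0, 0 };
	long long t;

	/* Opposite requests keep landing 5 units earlier: transient behaviour. */
	for (t = 10; t <= 50; t += 10) {
		g.last_opposite = t - 5;
		gov_event(&g, t);
	}
	(void) printf("governed = %d\n", g.governed);	/* prints 1 */
	return (0);
}

The kernel code applies this pattern in both directions at once, which is why each case above also re-examines the opposite governor before returning.
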
+
+
+/*
+ * Interface called by platforms to dynamically change the
+ * MAX performance cpupm state
+ */
+void
+cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
+{
+ cpupm_domain_t *dom;
+ id_t did;
+ cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
+ boolean_t change_state = B_FALSE;
+ cpupm_state_t *new_state = NULL;
+
+ did = cpupm_domain_id(cp, type);
+ mutex_enter(&cpu_lock);
+ dom = cpupm_domain_find(did, type);
+ mutex_exit(&cpu_lock);
+
+ /*
+ * A lock could be used here to avoid changing the power state of the
+ * CPU while CPUPM_STATE_MAX_PERF is being redefined. Since MAX_PERF
+ * redefinitions are infrequent, the extra locking isn't considered
+ * worthwhile. In the worst case, the power won't be changed to the
+ * required level for one cycle.
+ */
+ if (dom != NULL) {
+ if (dom->cpd_state ==
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+ change_state = B_TRUE;
+ }
+
+ /*
+ * If an out of range level is passed, use the lowest supported
+ * speed.
+ */
+ if (max_perf_level >= dom->cpd_nstates &&
+ dom->cpd_nstates > 1) {
+ max_perf_level = dom->cpd_nstates - 1;
+ }
+
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+ &dom->cpd_states[max_perf_level];
+
+ /*
+ * If the current state is MAX_PERF, change the current state
+ * to the new MAX_PERF
+ */
+ if (change_state) {
+ new_state =
+ dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+ if (new_state) {
+ (void) cpupm_change_state(cp, dom, new_state);
+ }
+ }
+ }
+}
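
A minimal sketch of how a platform might invoke this interface, assuming a hypothetical thermal-capping handler; the caller name and level are illustrative, and only cpupm_redefine_max_activepwr_state() itself comes from this change.

#include <sys/cpuvar.h>
#include <sys/cpu_pm.h>

/*
 * Hypothetical platform handler: cap the active power domain of "cp" so
 * that its "max perf" state becomes the state at index "capped_level".
 * Out-of-range values are clamped to the lowest supported speed by
 * cpupm_redefine_max_activepwr_state() itself.
 */
static void
plat_thermal_cap(struct cpu *cp, int capped_level)
{
	cpupm_redefine_max_activepwr_state(cp, capped_level);
}
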
+
+/*
+ * Benchmark some power state transitions and use the transition latencies as
+ * a basis for initializing parameters for the transient idle and transient
+ * work governors.
+ *
+ * Returns 0 on success or -1 if the governor parameters could not be
+ * initialized.
+ */
+static int
+cpupm_governor_initialize(void)
+{
+ cpu_t *cp = CPU;
+ cpupm_domain_t *dom;
+ cpupm_state_t *low, *high;
+ id_t did;
+ hrtime_t start, delta, deltas = 0;
+ int iterations;
+
+ did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
+ if (did == CPUPM_NO_DOMAIN)
+ return (-1);
+
+ dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
+ if (dom == NULL)
+ return (-1);
+
+ low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+ high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+
+ for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
+
+ /*
+ * Measure the amount of time it takes to transition the
+ * domain down to the lowest, and back to the highest power
+ * state.
+ */
+ start = gethrtime_unscaled();
+ (void) cpupm_change_state(cp, dom, low);
+ (void) cpupm_change_state(cp, dom, high);
+ delta = gethrtime_unscaled() - start;
+
+ DTRACE_PROBE1(cpupm__benchmark__latency,
+ hrtime_t, delta);
+
+ deltas += delta;
+ }
+
+ /*
+ * Figure the average latency, and tune the transient work and
+ * transient idle prediction intervals accordingly.
+ */
+ delta = deltas / iterations;
+
+ cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
+ cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
+
+ return (0);
+}
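
As a worked example of this derivation (the numbers are purely illustrative, not measured or default values): if the benchmarked down-and-up round trips average delta = 150,000 unscaled time units and both governor multiples happen to be 10, then

	cpupm_ti_predict_interval = 150000 * 10 = 1500000
	cpupm_tw_predict_interval = 150000 * 10 = 1500000

so a raise (or lower) request arriving within 1,500,000 units of the opposite request is treated as a candidate misprediction by the corresponding governor.
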
+
+/*
+ * Initiate a state change in all CPUPM domain instances of the specified type
+ */
+static void
+cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
+{
+ cpu_t *cp;
+ pg_cmt_t *pwr_pg;
+ cpupm_domain_t *dom;
+ group_t *hwset;
+ group_iter_t giter;
+ pg_cpu_itr_t cpu_iter;
+ pghw_type_t hw;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ switch (type) {
+ case CPUPM_DTYPE_ACTIVE:
+ hw = PGHW_POW_ACTIVE;
+ break;
+ default:
+ /*
+ * Power domain types other than "active" unsupported.
+ */
+ ASSERT(type == CPUPM_DTYPE_ACTIVE);
+ return;
+ }
+
+ if ((hwset = pghw_set_lookup(hw)) == NULL)
+ return;
+
+ /*
+ * Iterate over the power domains
+ */
+ group_iter_init(&giter);
+ while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
+
+ dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
+
+ /*
+ * Iterate over the CPUs in each domain
+ */
+ PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
+ while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
+ (void) cpupm_change_state(cp, dom,
+ dom->cpd_named_states[state]);
+ }
+ }
+}
diff --git a/usr/src/uts/common/os/cpupm.c b/usr/src/uts/common/os/cpupm.c
new file mode 100644
index 0000000000..1e1aa97bf5
--- /dev/null
+++ b/usr/src/uts/common/os/cpupm.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/sunddi.h>
+#include <sys/cpupm.h>
+
+/*
+ * Initialize the field that will be used for reporting
+ * the supported_frequencies_Hz cpu_info kstat.
+ */
+void
+cpupm_set_supp_freqs(cpu_t *cp, int *speeds, uint_t nspeeds)
+{
+ char *supp_freqs = NULL;
+ char *sfptr;
+ uint64_t *hzspeeds;
+ int i;
+ int j;
+#define UINT64_MAX_STRING (sizeof ("18446744073709551615"))
+
+ if (speeds == NULL) {
+ cpu_set_supp_freqs(cp, supp_freqs);
+ return;
+ }
+
+ hzspeeds = kmem_zalloc(nspeeds * sizeof (uint64_t), KM_SLEEP);
+ for (i = nspeeds - 1, j = 0; i >= 0; i--, j++) {
+ hzspeeds[i] = CPUPM_SPEED_HZ(cp->cpu_type_info.pi_clock,
+ speeds[j]);
+ }
+
+ supp_freqs = kmem_zalloc((UINT64_MAX_STRING * nspeeds), KM_SLEEP);
+ sfptr = supp_freqs;
+ for (i = 0; i < nspeeds; i++) {
+ if (i == nspeeds - 1) {
+ (void) sprintf(sfptr, "%"PRIu64, hzspeeds[i]);
+ } else {
+ (void) sprintf(sfptr, "%"PRIu64":", hzspeeds[i]);
+ sfptr = supp_freqs + strlen(supp_freqs);
+ }
+ }
+ cpu_set_supp_freqs(cp, supp_freqs);
+ kmem_free(supp_freqs, (UINT64_MAX_STRING * nspeeds));
+ kmem_free(hzspeeds, nspeeds * sizeof (uint64_t));
+}
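
To make the kstat string format concrete, here is a small, self-contained sketch of the same assembly loop. The frequencies are hypothetical and already expressed in Hz (converting from the platform speeds is CPUPM_SPEED_HZ()'s job, defined elsewhere), so only the "hz[:hz...]" layout is illustrated.

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

#define	UINT64_MAX_STRING	(sizeof ("18446744073709551615"))

int
main(void)
{
	/* Hypothetical per-state frequencies, already converted to Hz. */
	uint64_t hzspeeds[] = { 1600000000ULL, 2400000000ULL };
	int nspeeds = 2, i;
	char supp_freqs[UINT64_MAX_STRING * 2];
	char *sfptr = supp_freqs;

	/* Same assembly as cpupm_set_supp_freqs(): colon-separated values. */
	for (i = 0; i < nspeeds; i++) {
		if (i == nspeeds - 1) {
			(void) sprintf(sfptr, "%"PRIu64, hzspeeds[i]);
		} else {
			(void) sprintf(sfptr, "%"PRIu64":", hzspeeds[i]);
			sfptr = supp_freqs + strlen(supp_freqs);
		}
	}
	(void) printf("%s\n", supp_freqs);	/* prints 1600000000:2400000000 */
	return (0);
}
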
diff --git a/usr/src/uts/common/os/group.c b/usr/src/uts/common/os/group.c
index b15dff181f..8c1bc7e491 100644
--- a/usr/src/uts/common/os/group.c
+++ b/usr/src/uts/common/os/group.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/debug.h>
@@ -64,6 +62,21 @@ group_destroy(group_t *g)
}
/*
+ * Empty a group_t
+ * Capacity is preserved.
+ */
+void
+group_empty(group_t *g)
+{
+ int i;
+ int sz = g->grp_size;
+
+ g->grp_size = 0;
+ for (i = 0; i < sz; i++)
+ g->grp_set[i] = NULL;
+}
+
+/*
* Add element "e" to group "g"
*
* Returns -1 if addition would result in overcapacity, and
@@ -312,7 +325,7 @@ group_add_at(group_t *g, void *e, uint_t idx)
}
/*
- * Remove the entry at the specified index
+ * Remove the element at the specified index
*/
void
group_remove_at(group_t *g, uint_t idx)
@@ -320,3 +333,19 @@ group_remove_at(group_t *g, uint_t idx)
ASSERT(idx < g->grp_capacity);
g->grp_set[idx] = NULL;
}
+
+/*
+ * Find an element in the group, and return its index
+ * Returns -1 if the element could not be found.
+ */
+uint_t
+group_find(group_t *g, void *e)
+{
+ uint_t idx;
+
+ for (idx = 0; idx < g->grp_capacity; idx++) {
+ if (g->grp_set[idx] == e)
+ return (idx);
+ }
+ return ((uint_t)-1);
+}
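
The returned index is meant to be fed back into the *_at() routines (see the matching comment added to group.h later in this diff). A minimal sketch of removing an element by value, with a made-up helper name:

#include <sys/types.h>
#include <sys/group.h>

/*
 * Hedged sketch (helper name is hypothetical): remove element "e" from
 * group "g" by value, pairing group_find() with group_remove_at().
 */
static void
group_remove_element(group_t *g, void *e)
{
	uint_t idx;

	if ((idx = group_find(g, e)) != (uint_t)-1)
		group_remove_at(g, idx);
}
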
diff --git a/usr/src/uts/common/os/pg.c b/usr/src/uts/common/os/pg.c
index 9bd15af43b..82601cac77 100644
--- a/usr/src/uts/common/os/pg.c
+++ b/usr/src/uts/common/os/pg.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
@@ -99,6 +97,7 @@
static pg_t *pg_alloc_default(pg_class_t);
static void pg_free_default(pg_t *);
+static void pg_null_op();
/*
* Bootstrap CPU specific PG data
@@ -127,6 +126,12 @@ static struct pg_ops pg_ops_default = {
NULL, /* cpupart_out */
NULL, /* cpupart_move */
NULL, /* cpu_belongs */
+ NULL, /* policy_name */
+};
+
+static struct pg_cb_ops pg_cb_ops_default = {
+ pg_null_op, /* thread_swtch */
+ pg_null_op, /* thread_remain */
};
/*
@@ -144,6 +149,13 @@ static struct pg_ops pg_ops_default = {
/*
+ * Class specific PG policy name
+ */
+#define PG_POLICY_NAME(pg) \
+ ((pg)->pg_class->pgc_ops->policy_name ? \
+ (pg)->pg_class->pgc_ops->policy_name(pg) : NULL)
+
+/*
* Class specific membership test callback
*/
#define PG_CPU_BELONGS(pg, cp) \
@@ -206,13 +218,22 @@ static int pg_nclasses;
static pg_cid_t pg_default_cid;
/*
- * Initialze common PG subsystem. Perform CPU 0 initialization
+ * Initialize common PG subsystem.
*/
void
pg_init(void)
{
+ extern void pg_cmt_class_init();
+
pg_default_cid =
pg_class_register("default", &pg_ops_default, PGR_LOGICAL);
+
+ /*
+ * Initialize classes to allow them to register with the framework
+ */
+ pg_cmt_class_init();
+
+ pg_cpu0_init();
}
/*
@@ -282,7 +303,7 @@ pg_class_register(char *name, struct pg_ops *ops, pg_relation_t relation)
classes_old = pg_classes;
pg_classes =
kmem_zalloc(sizeof (pg_class_t) * (pg_nclasses + 1),
- KM_SLEEP);
+ KM_SLEEP);
(void) kcopy(classes_old, pg_classes,
sizeof (pg_class_t) * pg_nclasses);
kmem_free(classes_old, sizeof (pg_class_t) * pg_nclasses);
@@ -339,6 +360,27 @@ pg_cpu_next(pg_cpu_itr_t *itr)
}
/*
+ * Test if a given PG contains a given CPU
+ */
+boolean_t
+pg_cpu_find(pg_t *pg, cpu_t *cp)
+{
+ if (group_find(&pg->pg_cpus, cp) == (uint_t)-1)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Set the PGs callbacks to the default
+ */
+void
+pg_callback_set_defaults(pg_t *pg)
+{
+ bcopy(&pg_cb_ops_default, &pg->pg_cb, sizeof (struct pg_cb_ops));
+}
+
+/*
* Create a PG of a given class.
* This routine may block.
*/
@@ -374,6 +416,11 @@ pg_create(pg_cid_t cid)
*/
group_create(&pg->pg_cpus);
+ /*
+ * Initialize the events ops vector
+ */
+ pg_callback_set_defaults(pg);
+
return (pg);
}
@@ -620,6 +667,20 @@ pg_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
}
/*
+ * Return a class specific string describing a policy implemented
+ * across this PG
+ */
+char *
+pg_policy_name(pg_t *pg)
+{
+ char *str;
+ if ((str = PG_POLICY_NAME(pg)) != NULL)
+ return (str);
+
+ return ("N/A");
+}
+
+/*
* Provide the specified CPU a bootstrap pg
* This is needed to allow sane behaviour if any PG consuming
* code needs to deal with a partially initialized CPU
@@ -643,3 +704,52 @@ pg_free_default(struct pg *pg)
{
kmem_free(pg, sizeof (pg_t));
}
+
+static void
+pg_null_op()
+{
+}
+
+/*
+ * Invoke the "thread switch" callback for each of the CPU's PGs.
+ * This is invoked from the dispatcher swtch() routine, which is called
+ * when a thread running on a CPU should switch to another thread.
+ * "cp" is the CPU on which the thread switch is happening
+ * "now" is an unscaled hrtime_t timestamp taken in swtch()
+ * "old" and "new" are the outgoing and incoming threads, respectively.
+ */
+void
+pg_ev_thread_swtch(struct cpu *cp, hrtime_t now, kthread_t *old, kthread_t *new)
+{
+ int i, sz;
+ group_t *grp;
+ pg_t *pg;
+
+ grp = &cp->cpu_pg->pgs;
+ sz = GROUP_SIZE(grp);
+ for (i = 0; i < sz; i++) {
+ pg = GROUP_ACCESS(grp, i);
+ pg->pg_cb.thread_swtch(pg, cp, now, old, new);
+ }
+}
+
+/*
+ * Invoke the "thread remain" callback for each of the CPU's PGs.
+ * This is called from the dispatcher's swtch() routine when a thread
+ * running on the CPU "cp" is switching to itself, which can happen as an
+ * artifact of the thread's timeslice expiring.
+ */
+void
+pg_ev_thread_remain(struct cpu *cp, kthread_t *t)
+{
+ int i, sz;
+ group_t *grp;
+ pg_t *pg;
+
+ grp = &cp->cpu_pg->pgs;
+ sz = GROUP_SIZE(grp);
+ for (i = 0; i < sz; i++) {
+ pg = GROUP_ACCESS(grp, i);
+ pg->pg_cb.thread_remain(pg, cp, t);
+ }
+}
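
Both callbacks are driven from the dispatcher; the disp.c side of this change is not reproduced in this excerpt, so the following is only a hedged sketch of the shape of those call sites, with a made-up wrapper name.

#include <sys/types.h>
#include <sys/time.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/pg.h>

/*
 * Hypothetical wrapper: tell the CPU's PGs either that a new thread is
 * coming on CPU, or that the current thread keeps the CPU for another
 * timeslice.
 */
static void
dispatch_notify(struct cpu *cp, kthread_t *cur, kthread_t *next)
{
	hrtime_t now = gethrtime_unscaled();

	if (next != cur)
		pg_ev_thread_swtch(cp, now, cur, next);
	else
		pg_ev_thread_remain(cp, cur);
}
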
diff --git a/usr/src/uts/common/os/pghw.c b/usr/src/uts/common/os/pghw.c
index 8b98bb7e7c..ca59db8602 100644
--- a/usr/src/uts/common/os/pghw.c
+++ b/usr/src/uts/common/os/pghw.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
@@ -35,6 +33,7 @@
#include <sys/group.h>
#include <sys/pg.h>
#include <sys/pghw.h>
+#include <sys/cpu_pm.h>
/*
* Processor Groups: Hardware sharing relationship layer
@@ -99,7 +98,7 @@
* (the CPU's chip, cache, lgroup, etc.).
*
* The hwsets are created dynamically as new hardware sharing relationship types
- * are instantiated. They are never destroyed, as once a given relathionship
+ * are instantiated. They are never destroyed, as once a given relationship
* type appears in the system, it is quite likely that at least one instance of
* that relationship will always persist as long as the system is running.
*/
@@ -107,11 +106,6 @@
static group_t *pg_hw; /* top level pg hw group */
/*
- * Lookup table mapping hardware sharing relationships with hierarchy levels
- */
-static int pghw_level_table[PGHW_NUM_COMPONENTS];
-
-/*
* Physical PG kstats
*/
struct pghw_kstat {
@@ -120,12 +114,14 @@ struct pghw_kstat {
kstat_named_t pg_ncpus;
kstat_named_t pg_instance_id;
kstat_named_t pg_hw;
+ kstat_named_t pg_policy;
} pghw_kstat = {
{ "id", KSTAT_DATA_UINT64 },
{ "pg_class", KSTAT_DATA_STRING },
{ "ncpus", KSTAT_DATA_UINT64 },
{ "instance_id", KSTAT_DATA_UINT64 },
{ "hardware", KSTAT_DATA_STRING },
+ { "policy", KSTAT_DATA_STRING },
};
kmutex_t pghw_kstat_lock;
@@ -138,7 +134,7 @@ static void pghw_set_add(group_t *, pghw_t *);
static void pghw_set_remove(group_t *, pghw_t *);
/*
- * Initialize the physical portion of a physical PG
+ * Initialize the physical portion of a hardware PG
*/
void
pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw)
@@ -157,6 +153,22 @@ pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw)
pg->pghw_instance =
pg_plat_hw_instance_id(cp, hw);
pghw_kstat_create(pg);
+
+ /*
+ * Hardware sharing relationship specific initialization
+ */
+ switch (pg->pghw_hw) {
+ case PGHW_POW_ACTIVE:
+ pg->pghw_handle =
+ (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
+ break;
+ case PGHW_POW_IDLE:
+ pg->pghw_handle =
+ (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_IDLE);
+ break;
+ default:
+ pg->pghw_handle = (pghw_handle_t)NULL;
+ }
}
/*
@@ -262,16 +274,6 @@ pghw_physid_destroy(cpu_t *cp)
}
/*
- * Return a sequential level identifier for the specified
- * hardware sharing relationship
- */
-int
-pghw_level(pghw_type_t hw)
-{
- return (pg_plat_hw_level(hw));
-}
-
-/*
* Create a new, empty hwset.
* This routine may block, and must not be called from any
* paused CPU context.
@@ -303,13 +305,6 @@ pghw_set_create(pghw_type_t hw)
ret = group_add_at(pg_hw, g, (uint_t)hw);
ASSERT(ret == 0);
- /*
- * Update the table that maps hardware sharing relationships
- * to hierarchy levels
- */
- ASSERT(pghw_level_table[hw] == NULL);
- pghw_level_table[hw] = pg_plat_hw_level(hw);
-
return (g);
}
@@ -353,24 +348,26 @@ pghw_set_remove(group_t *hwset, pghw_t *pg)
/*
* Return a string name given a pg_hw sharing type
*/
-#define PGHW_TYPE_NAME_MAX 8
-
static char *
pghw_type_string(pghw_type_t hw)
{
switch (hw) {
case PGHW_IPIPE:
- return ("ipipe");
+ return ("Integer Pipeline");
case PGHW_CACHE:
- return ("cache");
+ return ("Cache");
case PGHW_FPU:
- return ("fpu");
+ return ("Floating Point Unit");
case PGHW_MPIPE:
- return ("mpipe");
+ return ("Data Pipe to memory");
case PGHW_CHIP:
- return ("chip");
+ return ("Socket");
case PGHW_MEMORY:
- return ("memory");
+ return ("Memory");
+ case PGHW_POW_ACTIVE:
+ return ("CPU PM Active Power Domain");
+ case PGHW_POW_IDLE:
+ return ("CPU PM Idle Power Domain");
default:
return ("unknown");
}
@@ -393,8 +390,10 @@ pghw_kstat_create(pghw_t *pg)
"pg", "pg", KSTAT_TYPE_NAMED,
sizeof (pghw_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
+ /* Class string, hw string, and policy string */
pg->pghw_kstat->ks_data_size += PG_CLASS_NAME_MAX;
- pg->pghw_kstat->ks_data_size += PGHW_TYPE_NAME_MAX;
+ pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX;
+ pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX;
pg->pghw_kstat->ks_lock = &pghw_kstat_lock;
pg->pghw_kstat->ks_data = &pghw_kstat;
pg->pghw_kstat->ks_update = pghw_kstat_update;
@@ -417,6 +416,6 @@ pghw_kstat_update(kstat_t *ksp, int rw)
pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance;
kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name);
kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw));
-
+ kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg));
return (0);
}
diff --git a/usr/src/uts/common/os/sunpm.c b/usr/src/uts/common/os/sunpm.c
index d7deefb099..84c0b9fbb6 100644
--- a/usr/src/uts/common/os/sunpm.c
+++ b/usr/src/uts/common/os/sunpm.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,6 +348,13 @@ int autopm_enabled;
pm_cpupm_t cpupm = PM_CPUPM_NOTSET;
/*
+ * Defines the default mode of operation for CPU power management,
+ * either the polling implementation or the event-based, dispatcher-driven
+ * implementation.
+ */
+pm_cpupm_t cpupm_default_mode = PM_CPUPM_EVENT;
+
+/*
* AutoS3 depends on autopm being enabled, and must be enabled by
* PM_START_AUTOS3 command.
*/
@@ -2568,7 +2575,7 @@ pm_lower_power(dev_info_t *dip, int comp, int level)
PMD(PMD_FAIL, ("%s: %s@%s(%s#%d) %s%s%s%s\n",
pmf, PM_DEVICE(dip),
!autopm_enabled ? "!autopm_enabled " : "",
- !PM_CPUPM_ENABLED ? "!cpupm_enabled " : "",
+ !PM_POLLING_CPUPM ? "!cpupm_polling " : "",
PM_CPUPM_DISABLED ? "cpupm_disabled " : "",
pm_noinvol(dip) ? "pm_noinvol()" : ""))
return (DDI_SUCCESS);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index bc7ebb334d..9cd4ae55b4 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -139,6 +139,7 @@ CHKHDRS= \
cpr.h \
cpupart.h \
cpuvar.h \
+ cpu_pm.h \
crc32.h \
cred.h \
cred_impl.h \
diff --git a/usr/src/uts/common/sys/callb.h b/usr/src/uts/common/sys/callb.h
index b548f4ca23..302f314b80 100644
--- a/usr/src/uts/common/sys/callb.h
+++ b/usr/src/uts/common/sys/callb.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_CALLB_H
#define _SYS_CALLB_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/t_lock.h>
#include <sys/thread.h>
@@ -69,7 +66,8 @@ extern "C" {
#define CB_CL_MDBOOT CB_CL_UADMIN
#define CB_CL_ENTER_DEBUGGER 14
#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
+#define CB_CL_CPU_DEEP_IDLE 16
+#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */
/*
* CB_CL_CPR_DAEMON class specific definitions are given below:
diff --git a/usr/src/uts/common/sys/cmt.h b/usr/src/uts/common/sys/cmt.h
index f1a95dc8c3..3ea49ded99 100644
--- a/usr/src/uts/common/sys/cmt.h
+++ b/usr/src/uts/common/sys/cmt.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,9 +37,20 @@ extern "C" {
#if (defined(_KERNEL) || defined(_KMEMUSER))
#include <sys/group.h>
#include <sys/pghw.h>
+#include <sys/lgrp.h>
#include <sys/types.h>
/*
+ * CMT related dispatcher policies
+ */
+#define CMT_NO_POLICY 0x0
+#define CMT_BALANCE 0x1
+#define CMT_COALESCE 0x2
+#define CMT_AFFINITY 0x4
+
+typedef uint_t pg_cmt_policy_t;
+
+/*
* CMT pg structure
*/
typedef struct pg_cmt {
@@ -47,26 +58,67 @@ typedef struct pg_cmt {
struct group *cmt_siblings; /* CMT PGs to balance with */
struct pg_cmt *cmt_parent; /* Parent CMT PG */
struct group *cmt_children; /* Active children CMT PGs */
+ pg_cmt_policy_t cmt_policy; /* Dispatcher policies to use */
+ uint32_t cmt_utilization; /* Group's utilization */
int cmt_nchildren; /* # of children CMT PGs */
- uint32_t cmt_nrunning; /* # of running threads */
+ int cmt_hint; /* hint for balancing */
struct group cmt_cpus_actv;
struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */
} pg_cmt_t;
/*
+ * CMT lgroup structure
+ */
+typedef struct cmt_lgrp {
+ group_t cl_pgs; /* Top level group of active CMT PGs */
+ int cl_npgs; /* # of top level PGs in the lgroup */
+ lgrp_handle_t cl_hand; /* lgroup's platform handle */
+ struct cmt_lgrp *cl_next; /* next cmt_lgrp */
+} cmt_lgrp_t;
+
+/*
* Change the number of running threads on the pg
*/
-#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n)))
+#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n)))
+
+/*
+ * Indicate that the given logical CPU is (or isn't) currently utilized
+ */
+#define CMT_CPU_UTILIZED(cp) (pg_cmt_load((cp), 1))
+#define CMT_CPU_NOT_UTILIZED(cp) (pg_cmt_load((cp), -1))
+
+/*
+ * CMT PG's capacity
+ *
+ * Currently, this is defined to be the number of active
+ * logical CPUs in the group.
+ *
+ * This will be used in conjunction with the utilization, which is defined
+ * to be the number of threads actively running on CPUs in the group.
+ */
+#define CMT_CAPACITY(pg) (GROUP_SIZE(&((pg_cmt_t *)pg)->cmt_cpus_actv))
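
Since utilization counts running threads and capacity counts active CPUs, a natural headroom check for a CMT PG might look like the sketch below; this is purely illustrative, and whether the dispatcher policies use exactly this comparison is not shown in this excerpt.

#include <sys/cmt.h>

/*
 * Illustrative only: does this CMT PG have fewer running threads than
 * active CPUs, i.e. spare capacity?
 */
static int
cmt_pg_has_headroom(pg_cmt_t *pg)
{
	return (pg->cmt_utilization < CMT_CAPACITY(pg));
}
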
void pg_cmt_load(cpu_t *, int);
void pg_cmt_cpu_startup(cpu_t *);
int pg_cmt_can_migrate(cpu_t *, cpu_t *);
-int pg_plat_cmt_load_bal_hw(pghw_type_t);
-int pg_plat_cmt_affinity_hw(pghw_type_t);
+/*
+ * CMT platform interfaces
+ */
+pg_cmt_policy_t pg_plat_cmt_policy(pghw_type_t);
+int pg_plat_cmt_rank(pg_cmt_t *, pg_cmt_t *);
+/*
+ * CMT dispatcher policy
+ */
cpu_t *cmt_balance(kthread_t *, cpu_t *);
+/*
+ * Power Aware Dispatcher Interfaces
+ */
+int cmt_pad_enable(pghw_type_t);
+int cmt_pad_disable(pghw_type_t);
+
#endif /* !_KERNEL && !_KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/cpu_pm.h b/usr/src/uts/common/sys/cpu_pm.h
new file mode 100644
index 0000000000..3ec3bcd68d
--- /dev/null
+++ b/usr/src/uts/common/sys/cpu_pm.h
@@ -0,0 +1,139 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPU_PM_H
+#define _CPU_PM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if (defined(_KERNEL) || defined(_KMEMUSER))
+#include <sys/cpuvar.h>
+#include <sys/processor.h>
+#include <sys/types.h>
+#include <sys/kstat.h>
+#include <sys/cmt.h>
+
+/*
+ * CPU Power Manager Policies
+ */
+typedef enum cpupm_policy {
+ CPUPM_POLICY_ELASTIC,
+ CPUPM_POLICY_DISABLED,
+ CPUPM_NUM_POLICIES
+} cpupm_policy_t;
+
+/*
+ * Power Managable CPU Domain Types
+ */
+typedef enum cpupm_dtype {
+ CPUPM_DTYPE_ACTIVE, /* Active Power Domain */
+ CPUPM_DTYPE_IDLE /* Idle Power Domain */
+} cpupm_dtype_t;
+
+/*
+ * CPUPM state names for policy implementation.
+ * The last element is used to size the enumeration.
+ */
+typedef enum cpupm_state_name {
+ CPUPM_STATE_LOW_POWER,
+ CPUPM_STATE_MAX_PERF,
+ CPUPM_STATE_NAMES
+} cpupm_state_name_t;
+
+/*
+ * Utilization events delivered by the dispatcher.
+ */
+typedef enum cpupm_util_event {
+ CPUPM_DOM_BUSY_FROM_IDLE,
+ CPUPM_DOM_IDLE_FROM_BUSY,
+ CPUPM_DOM_REMAIN_BUSY
+} cpupm_util_event_t;
+
+typedef uintptr_t cpupm_handle_t; /* Platform handle */
+
+/*
+ * CPU Power Domain State
+ */
+typedef struct cpupm_state {
+ uint32_t cps_speed;
+ cpupm_handle_t cps_handle;
+} cpupm_state_t;
+
+/*
+ * CPU Power Domain
+ */
+typedef struct cpupm_domain {
+ id_t cpd_id; /* Domain ID */
+ cpupm_dtype_t cpd_type; /* Active or Idle */
+ cpupm_state_t *cpd_states; /* Available Power States */
+ cpupm_state_t *cpd_state; /* Current State */
+ uint_t cpd_nstates; /* Number of States */
+ cpupm_state_t *cpd_named_states[CPUPM_STATE_NAMES];
+ hrtime_t cpd_last_raise; /* Last raise request time */
+ hrtime_t cpd_last_lower; /* last lower request time */
+ int cpd_tw; /* transient work history */
+ int cpd_ti; /* transient idle history */
+ boolean_t cpd_ti_governed; /* transient idle governor */
+ boolean_t cpd_tw_governed; /* transient work governor */
+ struct cpupm_domain *cpd_next;
+} cpupm_domain_t;
+
+#define CPUPM_NO_DOMAIN ((id_t)-1)
+
+/*
+ * CPU power manager domain management interfaces
+ */
+cpupm_domain_t *cpupm_domain_init(struct cpu *, cpupm_dtype_t);
+id_t cpupm_domain_id(struct cpu *, cpupm_dtype_t);
+int cpupm_change_state(struct cpu *, cpupm_domain_t *,
+ cpupm_state_t *);
+extern void cpupm_redefine_max_activepwr_state(struct cpu *, int);
+
+/*
+ * CPU power manager policy engine interfaces
+ */
+int cpupm_set_policy(cpupm_policy_t);
+cpupm_policy_t cpupm_get_policy(void);
+void cpupm_utilization_event(struct cpu *, hrtime_t,
+ cpupm_domain_t *, cpupm_util_event_t);
+
+/*
+ * CPU power platform driver interfaces
+ */
+id_t cpupm_plat_domain_id(struct cpu *, cpupm_dtype_t);
+uint_t cpupm_plat_state_enumerate(struct cpu *, cpupm_dtype_t,
+ cpupm_state_t *);
+int cpupm_plat_change_state(struct cpu *, cpupm_state_t *);
+
+
+#endif /* !_KERNEL && !_KMEMUSER */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPU_PM_H */
diff --git a/usr/src/uts/common/sys/cpudrv.h b/usr/src/uts/common/sys/cpudrv.h
index 782d8f509c..4cf4e4d1b6 100644
--- a/usr/src/uts/common/sys/cpudrv.h
+++ b/usr/src/uts/common/sys/cpudrv.h
@@ -76,10 +76,10 @@ typedef struct cpudrv_pm {
cpudrv_pm_spd_t *cur_spd; /* ptr to current speed */
uint_t num_spd; /* number of speeds */
hrtime_t lastquan_mstate[NCMSTATES]; /* last quantum's mstate */
- clock_t lastquan_ticks; /* last quantum's clock tick */
+ clock_t lastquan_ticks; /* last quantum's clock tick */
int pm_busycnt; /* pm_busy_component() count */
taskq_t *tq; /* taskq handler for CPU monitor */
- timeout_id_t timeout_id; /* cpudrv_pm_monitor()'s timeout_id */
+ timeout_id_t timeout_id; /* cpudrv_monitor()'s timeout_id */
int timeout_count; /* count dispatched timeouts */
kmutex_t timeout_lock; /* protect timeout_count */
kcondvar_t timeout_cv; /* wait on timeout_count change */
@@ -94,31 +94,31 @@ typedef struct cpudrv_pm {
* Idle & user threads water marks in percentage
*/
#if defined(__x86)
-#define CPUDRV_PM_IDLE_HWM 85 /* idle high water mark */
-#define CPUDRV_PM_IDLE_LWM 70 /* idle low water mark */
-#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */
-#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */
+#define CPUDRV_IDLE_HWM 85 /* idle high water mark */
+#define CPUDRV_IDLE_LWM 70 /* idle low water mark */
+#define CPUDRV_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */
+#define CPUDRV_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */
#else
-#define CPUDRV_PM_IDLE_HWM 98 /* idle high water mark */
-#define CPUDRV_PM_IDLE_LWM 8 /* idle low water mark */
-#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */
-#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */
+#define CPUDRV_IDLE_HWM 98 /* idle high water mark */
+#define CPUDRV_IDLE_LWM 8 /* idle low water mark */
+#define CPUDRV_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */
+#define CPUDRV_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */
#endif
-#define CPUDRV_PM_USER_HWM 20 /* user high water mark */
-#define CPUDRV_PM_IDLE_BUF_ZONE 4 /* buffer zone when going down */
+#define CPUDRV_USER_HWM 20 /* user high water mark */
+#define CPUDRV_IDLE_BUF_ZONE 4 /* buffer zone when going down */
/*
* Maximums for creating 'pm-components' property
*/
-#define CPUDRV_PM_COMP_MAX_DIG 4 /* max digits in power level */
+#define CPUDRV_COMP_MAX_DIG 4 /* max digits in power level */
/* or divisor */
-#define CPUDRV_PM_COMP_MAX_VAL 9999 /* max value in above digits */
+#define CPUDRV_COMP_MAX_VAL 9999 /* max value in above digits */
/*
* Component number for calls to PM framework
*/
-#define CPUDRV_PM_COMP_NUM 0 /* first component is 0 */
+#define CPUDRV_COMP_NUM 0 /* first component is 0 */
/*
* Quantum counts for normal and other clock speeds in terms of ticks.
@@ -132,26 +132,26 @@ typedef struct cpudrv_pm {
* that we monitor less frequently.
*
* We reach a tradeoff between these two requirements by monitoring
- * more frequently when we are in low speed mode (CPUDRV_PM_QUANT_CNT_OTHR)
+ * more frequently when we are in low speed mode (CPUDRV_QUANT_CNT_OTHR)
* so we can bring the CPU up without user noticing it. Moreover, at low
* speed we are not using CPU much so extra code execution should be fine.
* Since we are in no hurry to bring CPU down and at normal speed and we
* might really be using the CPU fully, we monitor less frequently
- * (CPUDRV_PM_QUANT_CNT_NORMAL).
+ * (CPUDRV_QUANT_CNT_NORMAL).
*/
#if defined(__x86)
-#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */
+#define CPUDRV_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */
#else
-#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */
+#define CPUDRV_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */
#endif
-#define CPUDRV_PM_QUANT_CNT_OTHR (hz * 1) /* 1 sec */
+#define CPUDRV_QUANT_CNT_OTHR (hz * 1) /* 1 sec */
/*
* Taskq parameters
*/
-#define CPUDRV_PM_TASKQ_THREADS 1 /* # threads to run CPU monitor */
-#define CPUDRV_PM_TASKQ_MIN 2 /* min # of taskq entries */
-#define CPUDRV_PM_TASKQ_MAX 2 /* max # of taskq entries */
+#define CPUDRV_TASKQ_THREADS 1 /* # threads to run CPU monitor */
+#define CPUDRV_TASKQ_MIN 2 /* min # of taskq entries */
+#define CPUDRV_TASKQ_MAX 2 /* max # of taskq entries */
/*
@@ -159,13 +159,14 @@ typedef struct cpudrv_pm {
*/
typedef struct cpudrv_devstate {
dev_info_t *dip; /* devinfo handle */
+ cpu_t *cp; /* CPU data for this node */
processorid_t cpu_id; /* CPU number for this node */
cpudrv_pm_t cpudrv_pm; /* power management data */
kmutex_t lock; /* protects state struct */
- void *mach_state; /* machine specific state */
} cpudrv_devstate_t;
extern void *cpudrv_state;
+extern boolean_t cpudrv_enabled;
/*
* Debugging definitions
@@ -191,12 +192,13 @@ extern uint_t cpudrv_debug;
#define DPRINTF(flag, args)
#endif /* DEBUG */
-extern int cpudrv_pm_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *);
-extern boolean_t cpudrv_pm_get_cpu_id(dev_info_t *, processorid_t *);
-extern boolean_t cpudrv_pm_power_ready(void);
-extern boolean_t cpudrv_pm_is_governor_thread(cpudrv_pm_t *);
-extern boolean_t cpudrv_mach_pm_init(cpudrv_devstate_t *);
-extern void cpudrv_mach_pm_free(cpudrv_devstate_t *);
+extern int cpudrv_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *);
+extern boolean_t cpudrv_get_cpu_id(dev_info_t *, processorid_t *);
+extern boolean_t cpudrv_is_governor_thread(cpudrv_pm_t *);
+extern boolean_t cpudrv_mach_init(cpudrv_devstate_t *);
+extern boolean_t cpudrv_power_ready(void);
+extern boolean_t cpudrv_is_enabled(cpudrv_devstate_t *);
+extern void cpudrv_set_supp_freqs(cpudrv_devstate_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/cpupm.h b/usr/src/uts/common/sys/cpupm.h
new file mode 100644
index 0000000000..2f74775450
--- /dev/null
+++ b/usr/src/uts/common/sys/cpupm.h
@@ -0,0 +1,43 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_H
+#define _CPUPM_H
+
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupm_mach.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cpupm_set_supp_freqs(cpu_t *, int *, uint_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_H */
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 2d056fa6ab..99829bbb03 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -366,7 +366,6 @@ extern cpu_core_t cpu_core[];
#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */
#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */
-
#endif /* _KERNEL || _KMEMUSER */
#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
@@ -673,6 +672,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */
const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */
+void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */
void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
/* frequencies */
diff --git a/usr/src/uts/common/sys/epm.h b/usr/src/uts/common/sys/epm.h
index 222fd59675..476b254d1a 100644
--- a/usr/src/uts/common/sys/epm.h
+++ b/usr/src/uts/common/sys/epm.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -227,7 +227,8 @@ typedef enum pm_canblock
typedef enum pm_cpupm
{
PM_CPUPM_NOTSET, /* no specific treatment of CPU devices */
- PM_CPUPM_ENABLE, /* power manage CPU devices */
+ PM_CPUPM_POLLING, /* CPUPM enabled: polling mode */
+ PM_CPUPM_EVENT, /* CPUPM enabled: event driven mode */
PM_CPUPM_DISABLE /* do not power manage CPU devices */
} pm_cpupm_t;
@@ -609,9 +610,19 @@ typedef struct pm_thresh_rec {
#define PM_ISCPU(dip) (DEVI(dip)->devi_pm_flags & PMC_CPU_DEVICE)
/*
- * Returns true if cpupm is enabled.
+ * Returns true if cpupm is enabled in event driven mode.
*/
-#define PM_CPUPM_ENABLED (cpupm == PM_CPUPM_ENABLE)
+#define PM_EVENT_CPUPM (cpupm == PM_CPUPM_EVENT)
+
+/*
+ * Returns true if cpupm is enabled in polling mode.
+ */
+#define PM_POLLING_CPUPM (cpupm == PM_CPUPM_POLLING)
+
+/*
+ * Returns true if cpupm is operating in the default mode.
+ */
+#define PM_DEFAULT_CPUPM (cpupm == cpupm_default_mode)
/*
* Returns true if is disabled.
@@ -619,12 +630,14 @@ typedef struct pm_thresh_rec {
#define PM_CPUPM_DISABLED (cpupm == PM_CPUPM_DISABLE)
/*
- * If (autopm is enabled and
- * (CPUs are not disabled, or it isn't a cpu)) OR
- * (CPUs are enabled and it is one)
+ * If ((autopm is enabled and
+ * (CPUPM is not disabled and we're not in event mode, or it isn't a cpu))
+ * OR
+ * (polling CPUPM is enabled and it is a cpu))
*/
#define PM_SCANABLE(dip) ((autopm_enabled && \
-(!PM_CPUPM_DISABLED || !PM_ISCPU(dip))) || (PM_CPUPM_ENABLED && PM_ISCPU(dip)))
+ ((!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM) || !PM_ISCPU(dip))) || \
+ (PM_POLLING_CPUPM && PM_ISCPU(dip)))
#define PM_NOT_ALL_LOWEST 0x0 /* not all components are at lowest */
#define PM_ALL_LOWEST 0x1 /* all components are at lowest lvl */
diff --git a/usr/src/uts/common/sys/group.h b/usr/src/uts/common/sys/group.h
index 89a5ca1f1a..bb5613bc35 100644
--- a/usr/src/uts/common/sys/group.h
+++ b/usr/src/uts/common/sys/group.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _GROUP_H
#define _GROUP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Group Abstraction
*/
@@ -79,13 +77,14 @@ void group_expand(group_t *, uint_t);
* Group element iteration
*/
void group_iter_init(group_iter_t *);
-void *group_iterate(group_t *, uint_t *);
+void *group_iterate(group_t *, group_iter_t *);
/*
- * Add / remove an element from the group
+ * Add / remove an element (or elements) from the group
*/
int group_add(group_t *, void *, int);
int group_remove(group_t *, void *, int);
+void group_empty(group_t *);
/*
* Add / remove / access an element at a specified index.
@@ -95,6 +94,13 @@ int group_remove(group_t *, void *, int);
int group_add_at(group_t *, void *, uint_t);
void group_remove_at(group_t *, uint_t);
+/*
+ * Search for an element in a group.
+ * Returns an index that may be used with the *_at()
+ * routines above to add or remove the element.
+ */
+uint_t group_find(group_t *, void *);
+
#endif /* !_KERNEL && !_KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/pg.h b/usr/src/uts/common/sys/pg.h
index 99c51ca09a..4ab31ffdd2 100644
--- a/usr/src/uts/common/sys/pg.h
+++ b/usr/src/uts/common/sys/pg.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PG_H
#define _PG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Processor Groups
*/
@@ -48,6 +46,8 @@ extern "C" {
typedef uint_t pgid_t; /* processor group id */
typedef uint_t pg_cid_t; /* processor group class id */
+struct pg;
+
/*
* Nature of CPU relationships
*/
@@ -57,13 +57,26 @@ typedef enum pg_relation {
} pg_relation_t;
/*
+ * Processor Group callbacks ops vector
+ * These provide a mechanism allowing per-PG routines to be invoked
+ * in response to events.
+ */
+typedef struct pg_cb_ops {
+ void (*thread_swtch)(struct pg *, struct cpu *, hrtime_t,
+ kthread_t *, kthread_t *);
+ void (*thread_remain)(struct pg *, struct cpu *,
+ kthread_t *);
+} pg_cb_ops_t;
+
+/*
* Processor group structure
*/
typedef struct pg {
- pgid_t pg_id; /* seq id */
- pg_relation_t pg_relation; /* grouping relationship */
- struct pg_class *pg_class; /* pg class */
- struct group pg_cpus; /* group of CPUs */
+ pgid_t pg_id; /* seq id */
+ pg_relation_t pg_relation; /* grouping relationship */
+ struct pg_class *pg_class; /* pg class */
+ struct group pg_cpus; /* group of CPUs */
+ pg_cb_ops_t pg_cb; /* pg events ops vector */
} pg_t;
/*
@@ -81,6 +94,7 @@ struct pg_ops {
void (*cpupart_move)(struct cpu *, struct cpupart *,
struct cpupart *);
int (*cpu_belongs)(struct pg *, struct cpu *);
+ char *(*policy_name)(struct pg *);
};
#define PG_CLASS_NAME_MAX 32
@@ -130,6 +144,12 @@ typedef struct pg_cpu_itr {
GROUP_ACCESS(&((pg_t *)pgrp)->pg_cpus, 0) : NULL)
/*
+ * Return the number of CPUs in a PG
+ */
+#define PG_NUM_CPUS(pgrp) \
+ (GROUP_SIZE(&(pgrp)->pg_cpus))
+
+/*
* Framework routines
*/
void pg_init(void);
@@ -162,7 +182,19 @@ void pg_cpu_add(pg_t *, cpu_t *);
void pg_cpu_delete(pg_t *, cpu_t *);
pg_t *pg_cpu_find_pg(cpu_t *, group_t *);
cpu_t *pg_cpu_next(pg_cpu_itr_t *);
+boolean_t pg_cpu_find(pg_t *, cpu_t *);
+
+/*
+ * PG Event callbacks
+ */
+void pg_callback_set_defaults(pg_t *);
+void pg_ev_thread_swtch(cpu_t *, hrtime_t, kthread_t *, kthread_t *);
+void pg_ev_thread_remain(cpu_t *, kthread_t *);
+/*
+ * PG Observability interfaces
+ */
+char *pg_policy_name(pg_t *);
#endif /* !_KERNEL && !_KMEMUSER */
diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h
index f22afc021b..0953bc19c9 100644
--- a/usr/src/uts/common/sys/pghw.h
+++ b/usr/src/uts/common/sys/pghw.h
@@ -19,16 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PGHW_H
#define _PGHW_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,27 +45,47 @@ extern "C" {
*/
typedef enum pghw_type {
PGHW_START,
- PGHW_IPIPE,
- PGHW_CACHE,
- PGHW_FPU,
- PGHW_MPIPE,
- PGHW_CHIP,
+ PGHW_IPIPE, /* Instruction Pipeline */
+ PGHW_CACHE, /* Cache (generally last level) */
+ PGHW_FPU, /* Floating Point Unit / Pipeline */
+ PGHW_MPIPE, /* Pipe to Memory */
+ PGHW_CHIP, /* Socket */
PGHW_MEMORY,
+ PGHW_POW_ACTIVE, /* Active Power Management Domain */
+ PGHW_POW_IDLE, /* Idle Power Management Domain */
PGHW_NUM_COMPONENTS
} pghw_type_t;
/*
+ * Returns true if the hardware is a type of power management domain
+ */
+#define PGHW_IS_PM_DOMAIN(hw) \
+ ((hw) == PGHW_POW_ACTIVE || (hw) == PGHW_POW_IDLE)
+
+/*
* Anonymous instance id
*/
#define PGHW_INSTANCE_ANON ((id_t)0xdecafbad)
/*
+ * Max length of PGHW kstat strings
+ */
+#define PGHW_KSTAT_STR_LEN_MAX 32
+
+
+/*
+ * Platform specific handle
+ */
+typedef uintptr_t pghw_handle_t;
+
+/*
* Processor Group (physical sharing relationship)
*/
typedef struct pghw {
pg_t pghw_pg; /* processor group */
pghw_type_t pghw_hw; /* HW sharing relationship */
id_t pghw_instance; /* sharing instance identifier */
+ pghw_handle_t pghw_handle; /* hw specific opaque handle */
kstat_t *pghw_kstat; /* physical kstats exported */
} pghw_t;
@@ -102,16 +119,14 @@ pghw_t *pghw_find_pg(cpu_t *, pghw_type_t);
pghw_t *pghw_find_by_instance(id_t, pghw_type_t);
group_t *pghw_set_lookup(pghw_type_t);
-int pghw_level(pghw_type_t);
-
void pghw_kstat_create(pghw_t *);
int pghw_kstat_update(kstat_t *, int);
/* Hardware sharing relationship platform interfaces */
int pg_plat_hw_shared(cpu_t *, pghw_type_t);
int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t);
-int pg_plat_hw_level(pghw_type_t);
id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t);
+pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t);
/*
* What comprises a "core" may vary across processor implementations,
diff --git a/usr/src/uts/common/sys/pm.h b/usr/src/uts/common/sys/pm.h
index 8be171fef1..f98bb79fcb 100644
--- a/usr/src/uts/common/sys/pm.h
+++ b/usr/src/uts/common/sys/pm.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_PM_H
#define _SYS_PM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -92,6 +90,8 @@ typedef enum {
PM_GET_DEFAULT_SYSTEM_THRESHOLD,
PM_ADD_DEPENDENT_PROPERTY,
PM_START_CPUPM,
+ PM_START_CPUPM_EV,
+ PM_START_CPUPM_POLL,
PM_STOP_CPUPM,
PM_GET_CPU_THRESHOLD,
PM_SET_CPU_THRESHOLD,
@@ -104,7 +104,10 @@ typedef enum {
PM_SEARCH_LIST, /* search S3 enable/disable list */
PM_GET_AUTOS3_STATE,
PM_GET_S3_SUPPORT_STATE,
- PM_GET_CMD_NAME
+ PM_GET_CMD_NAME,
+ PM_DISABLE_CPU_DEEP_IDLE,
+ PM_ENABLE_CPU_DEEP_IDLE,
+ PM_DEFAULT_CPU_DEEP_IDLE
} pm_cmds;
/*
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index f05dbc437a..1d690fe67f 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -40,9 +40,15 @@ CORE_OBJS += \
cmi_hw.o \
cms.o \
confunix.o \
+ cpu_idle.o \
cpuid.o \
cpuid_subr.o \
cpupm.o \
+ cpupm_mach.o \
+ cpupm_amd.o \
+ cpupm_intel.o \
+ cpupm_throttle.o \
+ cpu_acpi.o \
dis_tables.o \
ddi_impl.o \
dtrace_subr.o \
@@ -93,6 +99,8 @@ CORE_OBJS += \
pci_orion.o \
pmem.o \
ppage.o \
+ pwrnow.o \
+ speedstep.o \
startup.o \
timestamp.o \
todpc_subr.o \
@@ -169,19 +177,14 @@ PCI_E_MISC_OBJS += pcie.o pcie_fault.o
PCI_E_NEXUS_OBJS += npe.o npe_misc.o
PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o
PCINEXUS_OBJS += pci.o pci_common.o pci_kstats.o pci_tools.o
-PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o mp_platform_common.o
+PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \
+ mp_platform_common.o hpet_acpi.o
ACPI_DRV_OBJS += acpi_drv.o acpi_video.o
CPUDRV_OBJS += \
cpudrv.o \
- cpudrv_amd.o \
- cpudrv_intel.o \
- cpudrv_mach.o \
- cpudrv_throttle.o \
- cpu_acpi.o \
- speedstep.o \
- pwrnow.o
+ cpudrv_mach.o
PPM_OBJS += ppm_subr.o ppm.o ppm_plat.o
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index 1ec05b783e..dc40a16541 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -67,10 +67,6 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/acpi_drv/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
-$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/cpudrv/%.c
- $(COMPILE.c) -o $@ $<
- $(CTFCONVERT_O)
-
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -115,6 +111,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/os/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/os/cpupm/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/boot/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -270,9 +270,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi_drv/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
-$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/cpudrv/%.c
- @($(LHEAD) $(LINT.c) $< $(LTAIL))
-
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -309,6 +306,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/ml/%.s
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/cpupm/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/boot/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c b/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c
deleted file mode 100644
index d2be88c404..0000000000
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * CPU power management driver support for i86pc.
- */
-
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/cpupm.h>
-#include <sys/cpudrv_mach.h>
-#include <sys/machsystm.h>
-
-/*
- * Constants used by the Processor Device Notification handler
- * that identify what kind of change has occurred. We currently
- * only handle PPC_CHANGE_NOTIFICATION. The other two are
- * ignored.
- */
-#define PPC_CHANGE_NOTIFICATION 0x80
-#define CST_CHANGE_NOTIFICATION 0x81
-#define TPC_CHANGE_NOTIFICATION 0x82
-
-/*
- * Note that our driver numbers the power levels from lowest to
- * highest starting at 1 (i.e., the lowest power level is 1 and
- * the highest power level is cpupm->num_spd). The x86 modules get
- * their power levels from ACPI which numbers power levels from
- * highest to lowest starting at 0 (i.e., the lowest power level
- * is (cpupm->num_spd - 1) and the highest power level is 0). So to
- * map one of our driver power levels to one understood by ACPI we
- * simply subtract our driver power level from cpupm->num_spd. Likewise,
- * to map an ACPI power level to the proper driver power level, we
- * subtract the ACPI power level from cpupm->num_spd.
- */
-#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level)
-#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level)
-
-extern boolean_t cpudrv_intel_init(cpudrv_devstate_t *);
-extern boolean_t cpudrv_amd_init(cpudrv_devstate_t *);
-
-typedef struct cpudrv_mach_vendor {
- boolean_t (*cpuv_init)(cpudrv_devstate_t *);
-} cpudrv_mach_vendor_t;
-
-/*
- * Table of supported vendors.
- */
-static cpudrv_mach_vendor_t cpudrv_vendors[] = {
- cpudrv_intel_init,
- cpudrv_amd_init,
- NULL
-};
-
-uint_t
-cpudrv_pm_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- return (cpu_acpi_get_speeds(mach_state->acpi_handle, speeds));
-}
-
-void
-cpudrv_pm_free_speeds(int *speeds, uint_t nspeeds)
-{
- cpu_acpi_free_speeds(speeds, nspeeds);
-}
-
-/*
- * Change CPU speed using interface provided by module.
- */
-int
-cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpudrv_pm_t *cpupm;
- uint32_t plat_level;
- int ret;
-
- if (!(mach_state->caps & CPUDRV_P_STATES))
- return (DDI_FAILURE);
- ASSERT(mach_state->cpupm_pstate_ops != NULL);
- cpupm = &(cpudsp->cpudrv_pm);
- plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level);
- ret = mach_state->cpupm_pstate_ops->cpups_power(cpudsp, plat_level);
- if (ret != 0)
- return (DDI_FAILURE);
- return (DDI_SUCCESS);
-}
-
-/*
- * Determine the cpu_id for the CPU device.
- */
-boolean_t
-cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
-{
- return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
- DDI_PROP_DONTPASS, "reg", -1)) != -1);
-
-}
-
-/*
- * All CPU instances have been initialized successfully.
- */
-boolean_t
-cpudrv_pm_power_ready(void)
-{
- return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
-}
-
-/*
- * All CPU instances have been initialized successfully.
- */
-boolean_t
-cpudrv_pm_throttle_ready(void)
-{
- return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
-}
-
-/*
- * Is the current thread the thread that is handling the
- * PPC change notification?
- */
-boolean_t
-cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
-{
- return (curthread == cpupm->pm_governor_thread);
-}
-
-/*
- * Initialize the machine.
- * See if a module exists for managing power for this CPU.
- */
-boolean_t
-cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_vendor_t *vendors;
- cpudrv_mach_state_t *mach_state;
- int ret;
-
- mach_state = cpudsp->mach_state =
- kmem_zalloc(sizeof (cpudrv_mach_state_t), KM_SLEEP);
- mach_state->caps = CPUDRV_NO_STATES;
-
- mach_state->acpi_handle = cpu_acpi_init(cpudsp->dip);
- if (mach_state->acpi_handle == NULL) {
- cpudrv_mach_pm_free(cpudsp);
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d: "
- "unable to get ACPI handle",
- ddi_get_instance(cpudsp->dip));
- cmn_err(CE_NOTE, "!CPU power management will not function.");
- return (B_FALSE);
- }
-
- /*
- * Loop through the CPU management module table and see if
- * any of the modules implement CPU power management
- * for this CPU.
- */
- for (vendors = cpudrv_vendors; vendors->cpuv_init != NULL; vendors++) {
- if (vendors->cpuv_init(cpudsp))
- break;
- }
-
- /*
- * Nope, we can't power manage this CPU.
- */
- if (vendors == NULL) {
- cpudrv_mach_pm_free(cpudsp);
- return (B_FALSE);
- }
-
- /*
- * If P-state support exists for this system, then initialize it.
- */
- if (mach_state->cpupm_pstate_ops != NULL) {
- ret = mach_state->cpupm_pstate_ops->cpups_init(cpudsp);
- if (ret != 0) {
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:"
- " unable to initialize P-state support",
- ddi_get_instance(cpudsp->dip));
- mach_state->cpupm_pstate_ops = NULL;
- cpupm_disable(CPUPM_P_STATES);
- } else {
- mach_state->caps |= CPUDRV_P_STATES;
- }
- }
-
- if (mach_state->cpupm_tstate_ops != NULL) {
- ret = mach_state->cpupm_tstate_ops->cputs_init(cpudsp);
- if (ret != 0) {
- cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:"
- " unable to initialize T-state support",
- ddi_get_instance(cpudsp->dip));
- mach_state->cpupm_tstate_ops = NULL;
- cpupm_disable(CPUPM_T_STATES);
- } else {
- mach_state->caps |= CPUDRV_T_STATES;
- }
- }
-
- if (mach_state->caps == CPUDRV_NO_STATES) {
- cpudrv_mach_pm_free(cpudsp);
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-/*
- * Free any resources allocated by cpudrv_mach_pm_init().
- */
-void
-cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
-
- if (mach_state == NULL)
- return;
- if (mach_state->cpupm_pstate_ops != NULL) {
- mach_state->cpupm_pstate_ops->cpups_fini(cpudsp);
- mach_state->cpupm_pstate_ops = NULL;
- }
-
- if (mach_state->cpupm_tstate_ops != NULL) {
- mach_state->cpupm_tstate_ops->cputs_fini(cpudsp);
- mach_state->cpupm_tstate_ops = NULL;
- }
-
- if (mach_state->acpi_handle != NULL) {
- cpu_acpi_fini(mach_state->acpi_handle);
- mach_state->acpi_handle = NULL;
- }
-
- kmem_free(mach_state, sizeof (cpudrv_mach_state_t));
- cpudsp->mach_state = NULL;
-}
-
-/*
- * This routine changes the top speed to which the CPUs can transition by:
- *
- * - Resetting the up_spd for all speeds lower than the new top speed
- * to point to the new top speed.
- * - Updating the framework with a new "normal" (maximum power) for this
- * device.
- */
-void
-cpudrv_pm_set_topspeed(void *ctx, int plat_level)
-{
- cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
- cpudrv_pm_spd_t *spd;
- cpudrv_pm_spd_t *top_spd;
- dev_info_t *dip;
- int pm_level;
- int instance;
- int i;
-
- dip = ctx;
- instance = ddi_get_instance(dip);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
-
- mutex_enter(&cpudsp->lock);
- cpupm = &(cpudsp->cpudrv_pm);
- pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level);
- for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) {
- /*
- * Don't mess with speeds that are higher than the new
- * top speed. They should be out of range anyway.
- */
- if (spd->pm_level > pm_level)
- continue;
- /*
- * This is the new top speed.
- */
- if (spd->pm_level == pm_level)
- top_spd = spd;
-
- spd->up_spd = top_spd;
- }
- cpupm->top_spd = top_spd;
-
- cpupm->pm_governor_thread = curthread;
-
- mutex_exit(&cpudsp->lock);
-
- (void) pm_update_maxpower(dip, 0, top_spd->pm_level);
-}
-
-/*
- * This routine reads the ACPI _PPC object. It's accessed as a callback
- * by the ppm driver whenever a _PPC change notification is received.
- */
-int
-cpudrv_pm_get_topspeed(void *ctx)
-{
- cpudrv_mach_state_t *mach_state;
- cpu_acpi_handle_t handle;
- cpudrv_devstate_t *cpudsp;
- cpudrv_pm_t *cpupm;
- dev_info_t *dip;
- int instance;
- int plat_level;
- int max_level;
-
- dip = ctx;
- instance = ddi_get_instance(dip);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
- cpupm = &(cpudsp->cpudrv_pm);
- mach_state = cpudsp->mach_state;
- handle = mach_state->acpi_handle;
-
- cpu_acpi_cache_ppc(handle);
- plat_level = CPU_ACPI_PPC(handle);
- max_level = cpupm->num_spd - 1;
- if ((plat_level < 0) || (plat_level > max_level)) {
- cmn_err(CE_NOTE, "!cpudrv_pm_get_topspeed: instance %d: "
- "_PPC out of range %d", instance, plat_level);
-
- plat_level = 0;
- }
- return (plat_level);
-}
-
-/*
- * This routine reads the ACPI _TPC object. It's accessed as a callback
- * by the cpu driver whenever a _TPC change notification is received.
- */
-int
-cpudrv_pm_get_topthrottle(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state;
- cpu_acpi_handle_t handle;
- int throtl_level;
-
- mach_state = cpudsp->mach_state;
- handle = mach_state->acpi_handle;
-
- cpu_acpi_cache_tpc(handle);
- throtl_level = CPU_ACPI_TPC(handle);
- return (throtl_level);
-}
-
-/*
- * Take care of CPU throttling when _TPC notification arrives
- */
-void
-cpudrv_pm_throttle_instance(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state;
- uint32_t new_level;
- int ret;
-
- ASSERT(cpudsp != NULL);
- mach_state = cpudsp->mach_state;
- if (!(mach_state->caps & CPUDRV_T_STATES))
- return;
- ASSERT(mach_state->cpupm_tstate_ops != NULL);
-
- /*
- * Get the new T-State support level
- */
- new_level = cpudrv_pm_get_topthrottle(cpudsp);
-
- /*
- * Change the cpu throttling to the new level
- */
- ret = mach_state->cpupm_tstate_ops->cputs_throttle(cpudsp, new_level);
- if (ret != 0) {
- cmn_err(CE_WARN, "Cannot change the cpu throttling to the new"
- " level: %d, Instance: %d", new_level, cpudsp->cpu_id);
- }
-}
-
-/*
- * Take care of CPU throttling when _TPC notification arrives
- */
-void
-cpudrv_pm_manage_throttling(void *ctx)
-{
- cpudrv_devstate_t *cpudsp;
- cpudrv_mach_state_t *mach_state;
- cpudrv_tstate_domain_t *domain;
- cpudrv_tstate_domain_node_t *domain_node;
- int instance;
- boolean_t is_ready;
-
- instance = ddi_get_instance((dev_info_t *)ctx);
- cpudsp = ddi_get_soft_state(cpudrv_state, instance);
- ASSERT(cpudsp != NULL);
-
- /*
- * We currently refuse to power manage if the CPU is not ready to
- * take cross calls (cross calls fail silently if CPU is not ready
- * for it).
- *
- * Additionally, for x86 platforms we cannot power manage
- * any one instance, until all instances have been initialized.
- * That's because we don't know what the CPU domains look like
- * until all instances have been initialized.
- */
- is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id);
- if (!is_ready) {
- DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "CPU not ready for x-calls\n", instance));
- } else if (!(is_ready = cpudrv_pm_throttle_ready())) {
- DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
- "waiting for all CPUs to be ready\n", instance));
- }
- if (!is_ready) {
- return;
- }
-
- mach_state = cpudsp->mach_state;
- domain_node = mach_state->tstate_domain_node;
- domain = domain_node->tdn_domain;
-
- switch (domain->td_type) {
- case CPU_ACPI_SW_ANY:
- /*
- * Just throttle the current instance; all other instances
- * under the same domain will get throttled to the same level
- */
- cpudrv_pm_throttle_instance(cpudsp);
- break;
- case CPU_ACPI_HW_ALL:
- case CPU_ACPI_SW_ALL:
- /*
- * Along with the current instance, throttle all the CPUs that
- * belong to the same domain
- */
- mutex_enter(&domain->td_lock);
- for (domain_node = domain->td_node; domain_node != NULL;
- domain_node = domain_node->tdn_next)
- cpudrv_pm_throttle_instance(domain_node->tdn_cpudsp);
- mutex_exit(&domain->td_lock);
- break;
-
- default:
- cmn_err(CE_WARN, "Not a valid coordination type (%x) to"
- " throttle cpu", domain->td_domain);
- break;
- }
-}
-
-/*
- * This notification handler is called whenever the ACPI _PPC
- * object changes. The _PPC is a sort of governor on power levels.
- * It sets an upper threshold on which _PSS-defined power levels
- * are usable. The _PPC value is dynamic and may change as properties
- * (e.g., thermal or AC source) of the system change.
- */
-/* ARGSUSED */
-static void
-cpudrv_pm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
-{
- /*
- * We only handle _PPC change notifications.
- */
- if (val == PPC_CHANGE_NOTIFICATION)
- cpudrv_pm_redefine_topspeed(ctx);
- else if (val == TPC_CHANGE_NOTIFICATION) {
- cpudrv_pm_manage_throttling(ctx);
- }
-}
-
-void
-cpudrv_pm_install_notify_handler(cpudrv_devstate_t *cpudsp, dev_info_t *dip)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_install_notify_handler(mach_state->acpi_handle,
- cpudrv_pm_notify_handler, dip);
-}
-
-void
-cpudrv_pm_redefine_topspeed(void *ctx)
-{
- /*
- * This should never happen, unless ppm does not get loaded.
- */
- if (cpupm_redefine_topspeed == NULL) {
- cmn_err(CE_WARN, "cpudrv_pm_redefine_topspeed: "
- "cpupm_redefine_topspeed has not been initialized - "
- "ignoring notification");
- return;
- }
-
- /*
- * ppm callback needs to handle redefinition for all CPUs in
- * the domain.
- */
- (*cpupm_redefine_topspeed)(ctx);
-}
diff --git a/usr/src/uts/i86pc/io/cpudrv_mach.c b/usr/src/uts/i86pc/io/cpudrv_mach.c
new file mode 100644
index 0000000000..56d2e4d6ac
--- /dev/null
+++ b/usr/src/uts/i86pc/io/cpudrv_mach.c
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * CPU power management driver support for i86pc.
+ */
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cpupm.h>
+#include <sys/cpudrv_mach.h>
+#include <sys/machsystm.h>
+#include <sys/cpu_pm.h>
+#include <sys/cpuvar.h>
+#include <sys/sdt.h>
+#include <sys/cpu_idle.h>
+
+/*
+ * Note that our driver numbers the power levels from lowest to
+ * highest starting at 1 (i.e., the lowest power level is 1 and
+ * the highest power level is cpupm->num_spd). The x86 modules get
+ * their power levels from ACPI which numbers power levels from
+ * highest to lowest starting at 0 (i.e., the lowest power level
+ * is (cpupm->num_spd - 1) and the highest power level is 0). So to
+ * map one of our driver power levels to one understood by ACPI we
+ * simply subtract our driver power level from cpupm->num_spd. Likewise,
+ * to map an ACPI power level to the proper driver power level, we
+ * subtract the ACPI power level from cpupm->num_spd.
+ */
+#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level)
+#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level)
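+
+/*
+ * Worked example (hypothetical values, for illustration only): with
+ * cpupm->num_spd == 4, driver level 4 (the fastest) maps to ACPI P-state
+ * PM_2_PLAT_LEVEL(cpupm, 4) == 0, and ACPI P-state 3 (the slowest) maps
+ * back to driver level PLAT_2_PM_LEVEL(cpupm, 3) == 1.
+ */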
+
+/*
+ * Change CPU speed using interface provided by module.
+ */
+int
+cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
+{
+ cpu_t *cp = cpudsp->cp;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpudrv_pm_t *cpupm;
+ cpuset_t set;
+ uint32_t plat_level;
+
+ if (!(mach_state->ms_caps & CPUPM_P_STATES))
+ return (DDI_FAILURE);
+ ASSERT(mach_state->ms_pstate.cma_ops != NULL);
+ cpupm = &(cpudsp->cpudrv_pm);
+ plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level);
+ CPUSET_ONLY(set, cp->cpu_id);
+ mach_state->ms_pstate.cma_ops->cpus_change(set, plat_level);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Determine the cpu_id for the CPU device.
+ */
+boolean_t
+cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
+{
+ return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "reg", -1)) != -1);
+
+}
+
+boolean_t
+cpudrv_is_enabled(cpudrv_devstate_t *cpudsp)
+{
+ cpupm_mach_state_t *mach_state;
+
+ if (!cpupm_is_enabled(CPUPM_P_STATES) || !cpudrv_enabled)
+ return (B_FALSE);
+
+ /*
+	 * Only check the instance-specific setting if it exists.
+ */
+ if (cpudsp != NULL && cpudsp->cp != NULL &&
+ cpudsp->cp->cpu_m.mcpu_pm_mach_state != NULL) {
+ mach_state =
+ (cpupm_mach_state_t *)cpudsp->cp->cpu_m.mcpu_pm_mach_state;
+ return (mach_state->ms_caps & CPUPM_P_STATES);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Is the current thread the thread that is handling the
+ * PPC change notification?
+ */
+boolean_t
+cpudrv_is_governor_thread(cpudrv_pm_t *cpupm)
+{
+ return (curthread == cpupm->pm_governor_thread);
+}
+
+/*
+ * This routine changes the top speed to which the CPUs can transition by:
+ *
+ * - Resetting the up_spd for all speeds lower than the new top speed
+ * to point to the new top speed.
+ * - Updating the framework with a new "normal" (maximum power) for this
+ * device.
+ */
+void
+cpudrv_set_topspeed(void *ctx, int plat_level)
+{
+ cpudrv_devstate_t *cpudsp;
+ cpudrv_pm_t *cpupm;
+ cpudrv_pm_spd_t *spd;
+ cpudrv_pm_spd_t *top_spd;
+ dev_info_t *dip;
+ int pm_level;
+ int instance;
+ int i;
+
+ dip = ctx;
+ instance = ddi_get_instance(dip);
+ cpudsp = ddi_get_soft_state(cpudrv_state, instance);
+ ASSERT(cpudsp != NULL);
+
+ mutex_enter(&cpudsp->lock);
+ cpupm = &(cpudsp->cpudrv_pm);
+ pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level);
+ for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) {
+ /*
+ * Don't mess with speeds that are higher than the new
+ * top speed. They should be out of range anyway.
+ */
+ if (spd->pm_level > pm_level)
+ continue;
+ /*
+ * This is the new top speed.
+ */
+ if (spd->pm_level == pm_level)
+ top_spd = spd;
+
+ spd->up_spd = top_spd;
+ }
+ cpupm->top_spd = top_spd;
+
+ cpupm->pm_governor_thread = curthread;
+
+ mutex_exit(&cpudsp->lock);
+
+ (void) pm_update_maxpower(dip, 0, top_spd->pm_level);
+}
+
+/*
+ * This routine reads the ACPI _PPC object. It's accessed as a callback
+ * by the ppm driver whenever a _PPC change notification is received.
+ */
+int
+cpudrv_get_topspeed(void *ctx)
+{
+ cpu_t *cp;
+ cpudrv_devstate_t *cpudsp;
+ dev_info_t *dip;
+ int instance;
+ int plat_level;
+
+ dip = ctx;
+ instance = ddi_get_instance(dip);
+ cpudsp = ddi_get_soft_state(cpudrv_state, instance);
+ ASSERT(cpudsp != NULL);
+ cp = cpudsp->cp;
+ plat_level = cpupm_get_top_speed(cp);
+
+ return (plat_level);
+}
+
+
+/*
+ * This notification handler is called whenever the ACPI _PPC
+ * object changes. The _PPC is a sort of governor on power levels.
+ * It sets an upper threshold on which _PSS-defined power levels
+ * are usable. The _PPC value is dynamic and may change as properties
+ * (e.g., thermal or AC source) of the system change.
+ */
+/* ARGSUSED */
+static void
+cpudrv_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+ extern pm_cpupm_t cpupm;
+
+ /*
+ * We only handle _PPC change notifications.
+ */
+ if (val == CPUPM_PPC_CHANGE_NOTIFICATION && !PM_EVENT_CPUPM)
+ cpudrv_redefine_topspeed(ctx);
+}
+
+void
+cpudrv_install_notify_handler(cpudrv_devstate_t *cpudsp)
+{
+ cpu_t *cp = cpudsp->cp;
+ cpupm_add_notify_handler(cp, cpudrv_notify_handler,
+ cpudsp->dip);
+}
+
+void
+cpudrv_redefine_topspeed(void *ctx)
+{
+ /*
+ * This should never happen, unless ppm does not get loaded.
+ */
+ if (cpupm_redefine_topspeed == NULL) {
+ cmn_err(CE_WARN, "cpudrv_redefine_topspeed: "
+ "cpupm_redefine_topspeed has not been initialized - "
+ "ignoring notification");
+ return;
+ }
+
+ /*
+ * ppm callback needs to handle redefinition for all CPUs in
+ * the domain.
+ */
+ (*cpupm_redefine_topspeed)(ctx);
+}
+
+boolean_t
+cpudrv_mach_init(cpudrv_devstate_t *cpudsp)
+{
+ cpupm_mach_state_t *mach_state;
+
+ mutex_enter(&cpu_lock);
+ cpudsp->cp = cpu_get(cpudsp->cpu_id);
+ mutex_exit(&cpu_lock);
+ if (cpudsp->cp == NULL) {
+		cmn_err(CE_WARN, "cpudrv_mach_init: instance %d: "
+ "can't get cpu_t", ddi_get_instance(cpudsp->dip));
+ return (B_FALSE);
+ }
+
+ mach_state = (cpupm_mach_state_t *)
+ (cpudsp->cp->cpu_m.mcpu_pm_mach_state);
+ mach_state->ms_dip = cpudsp->dip;
+ return (B_TRUE);
+}
+
+uint_t
+cpudrv_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds)
+{
+ return (cpupm_get_speeds(cpudsp->cp, speeds));
+}
+
+void
+cpudrv_free_speeds(int *speeds, uint_t nspeeds)
+{
+ cpupm_free_speeds(speeds, nspeeds);
+}
+
+boolean_t
+cpudrv_power_ready(void)
+{
+ return (cpupm_power_ready());
+}
+
+/* ARGSUSED */
+void
+cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp)
+{
+}
diff --git a/usr/src/uts/i86pc/io/hpet_acpi.c b/usr/src/uts/i86pc/io/hpet_acpi.c
new file mode 100644
index 0000000000..9f482f16fb
--- /dev/null
+++ b/usr/src/uts/i86pc/io/hpet_acpi.c
@@ -0,0 +1,1388 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/hpet_acpi.h>
+#include <sys/hpet.h>
+#include <sys/bitmap.h>
+#include <sys/inttypes.h>
+#include <sys/time.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/apic.h>
+#include <sys/callb.h>
+#include <sys/clock.h>
+#include <sys/archsystm.h>
+#include <sys/cpupart.h>
+
+/*
+ * hpet_state_lock is used to synchronize disabling/enabling deep c-states
+ * and to synchronize suspend/resume.
+ */
+static kmutex_t hpet_state_lock;
+static struct hpet_state {
+ boolean_t proxy_installed; /* CBE proxy interrupt setup */
+ boolean_t cpr; /* currently in CPR */
+ boolean_t cpu_deep_idle; /* user enable/disable */
+ boolean_t uni_cstate; /* disable if only one cstate */
+} hpet_state = { B_FALSE, B_FALSE, B_TRUE, B_TRUE};
+
+uint64_t hpet_spin_check = HPET_SPIN_CHECK;
+uint64_t hpet_spin_timeout = HPET_SPIN_TIMEOUT;
+uint64_t hpet_idle_spin_timeout = HPET_SPIN_TIMEOUT;
+uint64_t hpet_isr_spin_timeout = HPET_SPIN_TIMEOUT;
+
+static kmutex_t hpet_proxy_lock; /* lock for lAPIC proxy data */
+/*
+ * hpet_proxy_users is a per-cpu array.
+ */
+static hpet_proxy_t *hpet_proxy_users; /* one per CPU */
+
+
+ACPI_TABLE_HPET *hpet_table; /* ACPI HPET table */
+hpet_info_t hpet_info;			/* human-readable information */
+
+/*
+ * Provide HPET access from unix.so.
+ * Set up pointers to access symbols in pcplusmp.
+ */
+static void
+hpet_establish_hooks(void)
+{
+ hpet.install_proxy = &hpet_install_proxy;
+ hpet.callback = &hpet_callback;
+ hpet.use_hpet_timer = &hpet_use_hpet_timer;
+ hpet.use_lapic_timer = &hpet_use_lapic_timer;
+}
+
+/*
+ * Get the ACPI "HPET" table.
+ * acpi_probe() calls this function from mp_startup before drivers are loaded.
+ * acpi_probe() verified the system is using ACPI before calling this.
+ *
+ * There may be more than one ACPI HPET table (Itanium only?).
+ * Intel's HPET spec defines each timer block to have up to 32 counters and
+ * be 1024 bytes long. There can be more than one timer block of 32 counters.
+ * Each timer block would have an additional ACPI HPET table.
+ * Typical x86 systems today only have 1 HPET with 3 counters.
+ * On x86 we only consume HPET table "1" for now.
+ */
+int
+hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags)
+{
+ extern hrtime_t tsc_read(void);
+ extern int idle_cpu_no_deep_c;
+ extern int cpuid_deep_cstates_supported(void);
+ void *la;
+ uint64_t ret;
+ uint_t num_timers;
+ uint_t ti;
+
+ (void) memset(&hpet_info, 0, sizeof (hpet_info));
+ hpet.supported = HPET_NO_SUPPORT;
+
+ if (idle_cpu_no_deep_c)
+ return (DDI_FAILURE);
+
+ if (!cpuid_deep_cstates_supported())
+ return (DDI_FAILURE);
+
+ hpet_establish_hooks();
+
+ /*
+ * Get HPET ACPI table 1.
+ */
+ if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_HPET, HPET_TABLE_1,
+ (ACPI_TABLE_HEADER **)&hpet_table))) {
+ cmn_err(CE_NOTE, "!hpet_acpi: unable to get ACPI HPET table");
+ return (DDI_FAILURE);
+ }
+
+ if (hpet_validate_table(hpet_table) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_acpi: invalid HPET table");
+ return (DDI_FAILURE);
+ }
+
+ la = hpet_memory_map(hpet_table);
+ if (la == NULL) {
+ cmn_err(CE_NOTE, "!hpet_acpi: memory map HPET failed");
+ return (DDI_FAILURE);
+ }
+ hpet_info.logical_address = la;
+
+ ret = hpet_read_gen_cap(&hpet_info);
+ hpet_info.gen_cap.counter_clk_period = HPET_GCAP_CNTR_CLK_PERIOD(ret);
+ hpet_info.gen_cap.vendor_id = HPET_GCAP_VENDOR_ID(ret);
+ hpet_info.gen_cap.leg_route_cap = HPET_GCAP_LEG_ROUTE_CAP(ret);
+ hpet_info.gen_cap.count_size_cap = HPET_GCAP_CNT_SIZE_CAP(ret);
+ /*
+ * Hardware contains the last timer's number.
+ * Add 1 to get the number of timers.
+ */
+ hpet_info.gen_cap.num_tim_cap = HPET_GCAP_NUM_TIM_CAP(ret) + 1;
+ hpet_info.gen_cap.rev_id = HPET_GCAP_REV_ID(ret);
+
+ if (hpet_info.gen_cap.counter_clk_period > HPET_MAX_CLK_PERIOD) {
+ cmn_err(CE_NOTE, "!hpet_acpi: COUNTER_CLK_PERIOD 0x%lx > 0x%lx",
+ (long)hpet_info.gen_cap.counter_clk_period,
+ (long)HPET_MAX_CLK_PERIOD);
+ return (DDI_FAILURE);
+ }
+
+ num_timers = (uint_t)hpet_info.gen_cap.num_tim_cap;
+ if ((num_timers < 3) || (num_timers > 32)) {
+ cmn_err(CE_NOTE, "!hpet_acpi: invalid number of HPET timers "
+ "%lx", (long)num_timers);
+ return (DDI_FAILURE);
+ }
+ hpet_info.timer_n_config = (hpet_TN_conf_cap_t *)kmem_zalloc(
+ num_timers * sizeof (uint64_t), KM_SLEEP);
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ /*
+ * Solaris does not use the HPET Legacy Replacement Route capabilities.
+ * This feature has been off by default on test systems.
+ * The HPET spec does not specify if Legacy Replacement Route is
+	 * on or off by default, so we explicitly set it off here.
+ * It should not matter which mode the HPET is in since we use
+ * the first available non-legacy replacement timer: timer 2.
+ */
+ (void) hpet_set_leg_rt_cnf(&hpet_info, 0);
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ hpet_info.gen_intrpt_stat = hpet_read_gen_intrpt_stat(&hpet_info);
+ hpet_info.main_counter_value = hpet_read_main_counter_value(&hpet_info);
+
+ for (ti = 0; ti < num_timers; ++ti) {
+ ret = hpet_read_timer_N_config(&hpet_info, ti);
+ /*
+ * Make sure no timers are enabled (think fast reboot or
+ * virtual hardware).
+ */
+ if (ret & HPET_TIMER_N_INT_ENB_CNF_BIT) {
+ hpet_disable_timer(&hpet_info, ti);
+ ret &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
+ }
+
+ hpet_info.timer_n_config[ti] = hpet_convert_timer_N_config(ret);
+ }
+
+ /*
+ * Be aware the Main Counter may need to be initialized in the future
+ * if it is used for more than just Deep C-State support.
+	 * The HPET's Main Counter does not need to be initialized to a specific
+	 * value before it is started for waking CPUs from Deep C-States.
+ */
+ if (hpet_start_main_counter(&hpet_info) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_acpi: hpet_start_main_counter failed");
+ return (DDI_FAILURE);
+ }
+
+ hpet_info.period = hpet_info.gen_cap.counter_clk_period;
+ /*
+ * Read main counter twice to record HPET latency for debugging.
+ */
+ hpet_info.tsc[0] = tsc_read();
+ hpet_info.hpet_main_counter_reads[0] =
+ hpet_read_main_counter_value(&hpet_info);
+ hpet_info.tsc[1] = tsc_read();
+ hpet_info.hpet_main_counter_reads[1] =
+ hpet_read_main_counter_value(&hpet_info);
+ hpet_info.tsc[2] = tsc_read();
+
+ ret = hpet_read_gen_config(&hpet_info);
+ hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
+ hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
+
+ /*
+ * HPET main counter reads are supported now.
+ */
+ hpet.supported = HPET_TIMER_SUPPORT;
+
+ return (hpet_init_proxy(hpet_vect, hpet_flags));
+}
+
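+/*
+ * Tear down HPET state set up by hpet_acpi_init(). The comparisons below
+ * rely on the support levels being ordered HPET_NO_SUPPORT <
+ * HPET_TIMER_SUPPORT < HPET_FULL_SUPPORT.
+ */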
+void
+hpet_acpi_fini(void)
+{
+ if (hpet.supported == HPET_NO_SUPPORT)
+ return;
+ if (hpet.supported >= HPET_TIMER_SUPPORT)
+ hpet_stop_main_counter(&hpet_info);
+ if (hpet.supported > HPET_TIMER_SUPPORT)
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+}
+
+/*
+ * Do initial setup to use an HPET timer as a proxy for Deep C-state stalled
+ * LAPIC Timers. Get a free HPET timer that supports I/O APIC routed interrupt.
+ * Set up data to handle the timer's ISR, and add the timer's interrupt.
+ *
+ * The DDI cannot be used to allocate the HPET timer's interrupt.
+ * ioapic_init_intr() in mp_platform_common.c later sets up the I/O APIC
+ * to handle the HPET timer's interrupt.
+ *
+ * Note: FSB (MSI) interrupts are not currently supported by Intel HPETs as of
+ * ICH9. The HPET spec allows for MSI. In the future MSI may be preferred.
+ */
+static int
+hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags)
+{
+ if (hpet_get_IOAPIC_intr_capable_timer(&hpet_info) == -1) {
+ cmn_err(CE_WARN, "!hpet_acpi: get ioapic intr failed.");
+ return (DDI_FAILURE);
+ }
+
+ hpet_init_proxy_data();
+
+ if (hpet_install_interrupt_handler(&hpet_isr,
+ hpet_info.cstate_timer.intr) != AE_OK) {
+ cmn_err(CE_WARN, "!hpet_acpi: install interrupt failed.");
+ return (DDI_FAILURE);
+ }
+ *hpet_vect = hpet_info.cstate_timer.intr;
+ hpet_flags->intr_el = INTR_EL_LEVEL;
+ hpet_flags->intr_po = INTR_PO_ACTIVE_HIGH;
+ hpet_flags->bustype = BUS_PCI; /* we *do* conform to PCI */
+
+ /*
+	 * Avoid a possibly stuck interrupt by programming the HPET's timer here
+ * before the I/O APIC is programmed to handle this interrupt.
+ */
+ hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
+ hpet_info.cstate_timer.intr);
+
+ /*
+ * All HPET functionality is supported.
+ */
+ hpet.supported = HPET_FULL_SUPPORT;
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Called by kernel if it can support Deep C-States.
+ */
+static boolean_t
+hpet_install_proxy(void)
+{
+ if (hpet_state.proxy_installed == B_TRUE)
+ return (B_TRUE);
+
+ if (hpet.supported != HPET_FULL_SUPPORT)
+ return (B_FALSE);
+
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ hpet_state.proxy_installed = B_TRUE;
+
+ return (B_TRUE);
+}
+
+/*
+ * Remove the interrupt that was added with add_avintr() in
+ * hpet_install_interrupt_handler().
+ */
+static void
+hpet_uninstall_interrupt_handler(void)
+{
+ rem_avintr(NULL, CBE_HIGH_PIL, (avfunc)&hpet_isr,
+ hpet_info.cstate_timer.intr);
+}
+
+static int
+hpet_validate_table(ACPI_TABLE_HPET *hpet_table)
+{
+ ACPI_TABLE_HEADER *table_header = (ACPI_TABLE_HEADER *)hpet_table;
+
+ if (table_header->Length != sizeof (ACPI_TABLE_HPET)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Length %lx != sizeof ("
+ "ACPI_TABLE_HPET) %lx.",
+ (unsigned long)((ACPI_TABLE_HEADER *)hpet_table)->Length,
+ (unsigned long)sizeof (ACPI_TABLE_HPET));
+ return (AE_ERROR);
+ }
+
+ if (!ACPI_COMPARE_NAME(table_header->Signature, ACPI_SIG_HPET)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET table "
+ "signature");
+ return (AE_ERROR);
+ }
+
+ if (!hpet_checksum_table((unsigned char *)hpet_table,
+ (unsigned int)table_header->Length)) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET checksum");
+ return (AE_ERROR);
+ }
+
+ /*
+ * Sequence should be table number - 1. We are using table 1.
+ */
+ if (hpet_table->Sequence != HPET_TABLE_1 - 1) {
+ cmn_err(CE_WARN, "!hpet_validate_table: Invalid Sequence %lx",
+ (long)hpet_table->Sequence);
+ return (AE_ERROR);
+ }
+
+ return (AE_OK);
+}
+
+static boolean_t
+hpet_checksum_table(unsigned char *table, unsigned int length)
+{
+ unsigned char checksum = 0;
+ int i;
+
+ for (i = 0; i < length; ++i, ++table)
+ checksum += *table;
+
+ return (checksum == 0);
+}
+
+static void *
+hpet_memory_map(ACPI_TABLE_HPET *hpet_table)
+{
+ return (AcpiOsMapMemory(hpet_table->Address.Address, HPET_SIZE));
+}
+
+static int
+hpet_start_main_counter(hpet_info_t *hip)
+{
+ uint64_t *gcr_ptr;
+ uint64_t gcr;
+
+ gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
+ gcr = *gcr_ptr;
+
+ gcr |= HPET_GCFR_ENABLE_CNF;
+ *gcr_ptr = gcr;
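+	/* Read the register back to confirm the enable bit actually latched. */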
+ gcr = *gcr_ptr;
+
+ return (gcr & HPET_GCFR_ENABLE_CNF ? AE_OK : ~AE_OK);
+}
+
+static int
+hpet_stop_main_counter(hpet_info_t *hip)
+{
+ uint64_t *gcr_ptr;
+ uint64_t gcr;
+
+ gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
+ gcr = *gcr_ptr;
+
+ gcr &= ~HPET_GCFR_ENABLE_CNF;
+ *gcr_ptr = gcr;
+ gcr = *gcr_ptr;
+
+ return (gcr & HPET_GCFR_ENABLE_CNF ? ~AE_OK : AE_OK);
+}
+
+/*
+ * Set the Legacy Replacement Route bit.
+ * This should be called before setting up timers.
+ * The HPET specification is silent regarding setting this after timers are
+ * programmed.
+ */
+static uint64_t
+hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value)
+{
+ uint64_t gen_conf = hpet_read_gen_config(hip);
+
+ switch (new_value) {
+ case 0:
+ gen_conf &= ~HPET_GCFR_LEG_RT_CNF;
+ break;
+
+ case HPET_GCFR_LEG_RT_CNF:
+ gen_conf |= HPET_GCFR_LEG_RT_CNF;
+ break;
+
+ default:
+ ASSERT(new_value == 0 || new_value == HPET_GCFR_LEG_RT_CNF);
+ break;
+ }
+ hpet_write_gen_config(hip, gen_conf);
+ return (gen_conf);
+}
+
+static uint64_t
+hpet_read_gen_cap(hpet_info_t *hip)
+{
+ return (*(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address));
+}
+
+static uint64_t
+hpet_read_gen_config(hpet_info_t *hip)
+{
+ return (*(uint64_t *)
+ HPET_GEN_CONFIG_ADDRESS(hip->logical_address));
+}
+
+static uint64_t
+hpet_read_gen_intrpt_stat(hpet_info_t *hip)
+{
+ hip->gen_intrpt_stat = *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(
+ hip->logical_address);
+ return (hip->gen_intrpt_stat);
+}
+
+static uint64_t
+hpet_read_timer_N_config(hpet_info_t *hip, uint_t n)
+{
+ uint64_t conf = *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n);
+ hip->timer_n_config[n] = hpet_convert_timer_N_config(conf);
+ return (conf);
+}
+
+static hpet_TN_conf_cap_t
+hpet_convert_timer_N_config(uint64_t conf)
+{
+ hpet_TN_conf_cap_t cc = { 0 };
+
+ cc.int_route_cap = HPET_TIMER_N_INT_ROUTE_CAP(conf);
+ cc.fsb_int_del_cap = HPET_TIMER_N_FSB_INT_DEL_CAP(conf);
+ cc.fsb_int_en_cnf = HPET_TIMER_N_FSB_EN_CNF(conf);
+ cc.int_route_cnf = HPET_TIMER_N_INT_ROUTE_CNF(conf);
+ cc.mode32_cnf = HPET_TIMER_N_MODE32_CNF(conf);
+ cc.val_set_cnf = HPET_TIMER_N_VAL_SET_CNF(conf);
+ cc.size_cap = HPET_TIMER_N_SIZE_CAP(conf);
+ cc.per_int_cap = HPET_TIMER_N_PER_INT_CAP(conf);
+ cc.type_cnf = HPET_TIMER_N_TYPE_CNF(conf);
+ cc.int_enb_cnf = HPET_TIMER_N_INT_ENB_CNF(conf);
+ cc.int_type_cnf = HPET_TIMER_N_INT_TYPE_CNF(conf);
+
+ return (cc);
+}
+
+static uint64_t
+hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n)
+{
+ if (hip->timer_n_config[n].size_cap == 1)
+ return (*(uint64_t *)
+ HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n));
+ else
+ return (*(uint32_t *)
+ HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n));
+}
+
+static uint64_t
+hpet_read_main_counter_value(hpet_info_t *hip)
+{
+ uint64_t value;
+ uint32_t *counter;
+ uint32_t high1, high2, low;
+
+ counter = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address);
+
+ /*
+ * 32-bit main counters
+ */
+ if (hip->gen_cap.count_size_cap == 0) {
+ value = (uint64_t)*counter;
+ hip->main_counter_value = value;
+ return (value);
+ }
+
+ /*
+ * HPET spec claims a 64-bit read can be split into two 32-bit reads
+ * by the hardware connection to the HPET.
+ */
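+	/*
+	 * Reading high, then low, then high again guards against a torn
+	 * read: if the low word wrapped between the 32-bit reads, the two
+	 * high words differ and the loop retries for a consistent value.
+	 */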
+ high2 = counter[1];
+ do {
+ high1 = high2;
+ low = counter[0];
+ high2 = counter[1];
+ } while (high2 != high1);
+
+ value = ((uint64_t)high1 << 32) | low;
+ hip->main_counter_value = value;
+ return (value);
+}
+
+static void
+hpet_write_gen_cap(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_gen_config(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l)
+{
+ *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(hip->logical_address) = l;
+}
+
+static void
+hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l)
+{
+ if (hip->timer_n_config[n].size_cap == 1)
+ *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n) = l;
+ else
+ *(uint32_t *)HPET_TIMER_N_CONF_ADDRESS(
+ hip->logical_address, n) = (uint32_t)(0xFFFFFFFF & l);
+}
+
+static void
+hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l)
+{
+ *(uint64_t *)HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n) = l;
+}
+
+static void
+hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n)
+{
+ uint64_t l;
+
+ l = hpet_read_timer_N_config(hip, timer_n);
+ l &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
+ hpet_write_timer_N_config(hip, timer_n, l);
+}
+
+static void
+hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n)
+{
+ uint64_t l;
+
+ l = hpet_read_timer_N_config(hip, timer_n);
+ l |= HPET_TIMER_N_INT_ENB_CNF_BIT;
+ hpet_write_timer_N_config(hip, timer_n, l);
+}
+
+static void
+hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l)
+{
+ uint32_t *address;
+
+ /*
+ * HPET spec 1.0a states main counter register should be halted before
+ * it is written to.
+ */
+ ASSERT(!(hpet_read_gen_config(hip) & HPET_GCFR_ENABLE_CNF));
+
+ if (hip->gen_cap.count_size_cap == 1) {
+ *(uint64_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address)
+ = l;
+ } else {
+ address = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(
+ hip->logical_address);
+
+ address[0] = (uint32_t)(l & 0xFFFFFFFF);
+ }
+}
+
+/*
+ * Add the interrupt handler for I/O APIC interrupt number (interrupt line).
+ *
+ * The I/O APIC line (vector) is programmed in ioapic_init_intr() called
+ * from apic_picinit() psm_ops apic_ops entry point after we return from
+ * apic_init() psm_ops entry point.
+ */
+static uint32_t
+hpet_install_interrupt_handler(uint_t (*func)(char *), int vector)
+{
+ uint32_t retval;
+
+ retval = add_avintr(NULL, CBE_HIGH_PIL, (avfunc)func, "HPET Timer",
+ vector, NULL, NULL, NULL, NULL);
+ if (retval == 0) {
+ cmn_err(CE_WARN, "!hpet_acpi: add_avintr() failed");
+ return (AE_BAD_PARAMETER);
+ }
+ return (AE_OK);
+}
+
+/*
+ * The HPET timers specify which I/O APIC interrupts they can be routed to.
+ * Find the first available non-legacy-replacement timer and its I/O APIC irq.
+ * Supported I/O APIC IRQs are specified in the int_route_cap bitmap in each
+ * timer's timer_n_config register.
+ */
+static int
+hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip)
+{
+ int timer;
+ int intr;
+
+ for (timer = HPET_FIRST_NON_LEGACY_TIMER;
+ timer < hip->gen_cap.num_tim_cap; ++timer) {
+
+ if (!hpet_timer_available(hip->allocated_timers, timer))
+ continue;
+
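+		/* lowbit() is 1-based; subtract 1 for the IRQ, or -1 if none */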
+ intr = lowbit(hip->timer_n_config[timer].int_route_cap) - 1;
+ if (intr >= 0) {
+ hpet_timer_alloc(&hip->allocated_timers, timer);
+ hip->cstate_timer.timer = timer;
+ hip->cstate_timer.intr = intr;
+ return (timer);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Mark this timer as used.
+ */
+static void
+hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n)
+{
+ *allocated_timers |= 1 << n;
+}
+
+/*
+ * Check if this timer is available.
+ * No mutual exclusion because only one thread uses this.
+ */
+static int
+hpet_timer_available(uint32_t allocated_timers, uint32_t n)
+{
+ return ((allocated_timers & (1 << n)) == 0);
+}
+
+/*
+ * Setup timer N to route its interrupt to I/O APIC.
+ */
+static void
+hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, uint32_t interrupt)
+{
+ uint64_t conf;
+
+ conf = hpet_read_timer_N_config(hip, timer_n);
+
+ /*
+ * Caller is required to verify this interrupt route is supported.
+ */
+ ASSERT(HPET_TIMER_N_INT_ROUTE_CAP(conf) & (1 << interrupt));
+
+ conf &= ~HPET_TIMER_N_FSB_EN_CNF_BIT; /* use IOAPIC */
+ conf |= HPET_TIMER_N_INT_ROUTE_SHIFT(interrupt);
+ conf &= ~HPET_TIMER_N_TYPE_CNF_BIT; /* non periodic */
+ conf &= ~HPET_TIMER_N_INT_ENB_CNF_BIT; /* disabled */
+ conf |= HPET_TIMER_N_INT_TYPE_CNF_BIT; /* Level Triggered */
+
+ hpet_write_timer_N_config(hip, timer_n, conf);
+}
+
+/*
+ * The HPET's Main Counter is not stopped before programming an HPET timer.
+ * This will allow the HPET to be used as a time source.
+ * The programmed timer interrupt may occur before this function returns.
+ * Callers must block interrupts before calling this function if they must
+ * guarantee the interrupt is handled after this function returns.
+ *
+ * Return 0 if main counter is less than timer after enabling timer.
+ * The interrupt was programmed, but it may fire before this returns.
+ * Return !0 if main counter is greater than timer after enabling timer.
+ * In other words: the timer will not fire, and we do not know if it did fire.
+ *
+ * delta is in HPET ticks.
+ *
+ * Writing a 64-bit value to a 32-bit register will "wrap around".
+ * A 32-bit HPET timer will wrap around in a little over 5 minutes.
+ */
+int
+hpet_timer_program(hpet_info_t *hip, uint32_t timer, uint64_t delta)
+{
+ uint64_t time, program;
+
+ program = hpet_read_main_counter_value(hip);
+ program += delta;
+ hpet_write_timer_N_comp(hip, timer, program);
+
+ time = hpet_read_main_counter_value(hip);
+ if (time < program)
+ return (AE_OK);
+
+ return (AE_TIME);
+}
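+
+/*
+ * Usage sketch (illustrative only): callers convert a relative hrtime
+ * delta into HPET ticks before programming, e.g.
+ *
+ *	delta = HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime());
+ *	if (hpet_timer_program(&hpet_info, proxy_timer, delta) != AE_OK)
+ *		wake the target CPU directly to avoid a lost wakeup;
+ *
+ * as done in hpet_guaranteed_schedule() below.
+ */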
+
+/*
+ * CPR and power policy-change callback entry point.
+ */
+boolean_t
+hpet_callback(int code)
+{
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ return (hpet_deep_idle_config(code));
+
+ case CB_CODE_CPR_RESUME:
+ /*FALLTHROUGH*/
+ case CB_CODE_CPR_CHKPT:
+ return (hpet_cpr(code));
+
+ case CST_EVENT_MULTIPLE_CSTATES:
+ hpet_cst_callback(CST_EVENT_MULTIPLE_CSTATES);
+ return (B_TRUE);
+
+ case CST_EVENT_ONE_CSTATE:
+ hpet_cst_callback(CST_EVENT_ONE_CSTATE);
+ return (B_TRUE);
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_callback: invalid code %d\n", code);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * According to the HPET spec 1.0a: the Operating System must save and restore
+ * HPET event timer hardware context through ACPI sleep state transitions.
+ * Timer registers (including the main counter) may not be preserved through
+ * ACPI S3, S4, or S5 sleep states. This code does not support S1 or S2.
+ *
+ * Current HPET state is already in hpet.supported and
+ * hpet_state.proxy_installed. hpet_info contains the proxy interrupt HPET
+ * Timer state.
+ *
+ * Future projects beware: the HPET Main Counter is undefined after ACPI S3 or
+ * S4, and it is not saved/restored here. Future projects cannot expect the
+ * Main Counter to be monotonically (or accurately) increasing across CPR.
+ *
+ * Note: the CPR Checkpoint path later calls pause_cpus() which ensures all
+ * CPUs are awake and in a spin loop before the system suspends. The HPET is
+ * not needed for Deep C-state wakeup when CPUs are in cpu_pause().
+ * It is safe to leave the HPET running as the system suspends; we just
+ * disable the timer from generating interrupts here.
+ */
+static boolean_t
+hpet_cpr(int code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+ boolean_t ret = B_TRUE;
+
+ mutex_enter(&hpet_state_lock);
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ if (hpet_state.proxy_installed == B_FALSE)
+ break;
+
+ hpet_state.cpr = B_TRUE;
+
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_state.cpr = B_FALSE;
+ mutex_exit(&hpet_state_lock);
+ cmn_err(CE_NOTE, "!hpet_cpr: deadman");
+ return (B_FALSE);
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ if (hpet_resume() == B_TRUE)
+ hpet_state.cpr = B_FALSE;
+ else
+ cmn_err(CE_NOTE, "!hpet_resume failed.");
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_cpr: invalid code %d\n", code);
+ ret = B_FALSE;
+ break;
+ }
+ mutex_exit(&hpet_state_lock);
+ return (ret);
+}
+
+/*
+ * Assume the HPET stopped in Suspend state and timer state was lost.
+ */
+static boolean_t
+hpet_resume(void)
+{
+ if (hpet.supported != HPET_TIMER_SUPPORT)
+ return (B_TRUE);
+
+ /*
+ * The HPET spec does not specify if Legacy Replacement Route is
+ * on or off by default, so we set it off here.
+ */
+ (void) hpet_set_leg_rt_cnf(&hpet_info, 0);
+
+ if (hpet_start_main_counter(&hpet_info) != AE_OK) {
+ cmn_err(CE_NOTE, "!hpet_resume: start main counter failed");
+ hpet.supported = HPET_NO_SUPPORT;
+ if (hpet_state.proxy_installed == B_TRUE) {
+ hpet_state.proxy_installed = B_FALSE;
+ hpet_uninstall_interrupt_handler();
+ }
+ return (B_FALSE);
+ }
+
+ if (hpet_state.proxy_installed == B_FALSE)
+ return (B_TRUE);
+
+ hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
+ hpet_info.cstate_timer.intr);
+ if (hpet_state.cpu_deep_idle == B_TRUE)
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+
+ return (B_TRUE);
+}
+
+/*
+ * Callback to enable/disable Deep C-States based on power.conf setting.
+ */
+static boolean_t
+hpet_deep_idle_config(int code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+ boolean_t ret = B_TRUE;
+
+ mutex_enter(&hpet_state_lock);
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+
+ if (hpet_state.cpu_deep_idle == B_TRUE)
+ break;
+
+ if (hpet_state.proxy_installed == B_FALSE) {
+ ret = B_FALSE; /* Deep C-States not supported */
+ break;
+ }
+
+ hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ hpet_state.cpu_deep_idle = B_TRUE;
+ break;
+
+ case PM_DISABLE_CPU_DEEP_IDLE:
+
+ if ((hpet_state.cpu_deep_idle == B_FALSE) ||
+ (hpet_state.proxy_installed == B_FALSE))
+ break;
+
+ /*
+ * The order of these operations is important to avoid
+ * lost wakeups: Set a flag to refuse all future LAPIC Timer
+ * proxy requests, then wake up all CPUs from deep C-state,
+ * and finally disable the HPET interrupt-generating timer.
+ */
+ hpet_state.cpu_deep_idle = B_FALSE;
+
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_state.cpu_deep_idle = B_TRUE;
+ mutex_exit(&hpet_state_lock);
+ cmn_err(CE_NOTE,
+ "!hpet_deep_idle_config: deadman");
+ return (B_FALSE);
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_deep_idle_config: invalid code %d\n",
+ code);
+ ret = B_FALSE;
+ break;
+ }
+ mutex_exit(&hpet_state_lock);
+
+ return (ret);
+}
+
+/*
+ * Callback for _CST c-state change notifications.
+ */
+static void
+hpet_cst_callback(uint32_t code)
+{
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_spin_timeout;
+
+ switch (code) {
+ case CST_EVENT_ONE_CSTATE:
+ hpet_state.uni_cstate = B_TRUE;
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_expire_all();
+ cmn_err(CE_NOTE,
+ "!hpet_cst_callback: deadman");
+ return;
+ }
+ }
+ intr = intr_clear();
+ }
+ hpet_expire_all();
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+ break;
+
+ case CST_EVENT_MULTIPLE_CSTATES:
+ hpet_state.uni_cstate = B_FALSE;
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_cst_callback: invalid code %d\n", code);
+ break;
+ }
+}
+
+/*
+ * Interrupt Service Routine for HPET I/O-APIC-generated interrupts.
+ * Used to wake up CPUs from Deep C-state when their Local APIC Timer stops.
+ * This ISR runs on one CPU which pokes other CPUs out of Deep C-state as
+ * needed.
+ */
+/* ARGSUSED */
+static uint_t
+hpet_isr(char *arg)
+{
+ uint64_t timer_status;
+ uint64_t timer_mask;
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_isr_spin_timeout;
+
+ timer_mask = HPET_INTR_STATUS_MASK(hpet_info.cstate_timer.timer);
+
+ /*
+ * We are using a level-triggered interrupt.
+ * HPET sets timer's General Interrupt Status Register bit N.
+ * ISR checks this bit to see if it needs servicing.
+ * ISR then clears this bit by writing 1 to that bit.
+ */
+ timer_status = hpet_read_gen_intrpt_stat(&hpet_info);
+ if (!(timer_status & timer_mask))
+ return (DDI_INTR_UNCLAIMED);
+ hpet_write_gen_intrpt_stat(&hpet_info, timer_mask);
+
+ /*
+ * Do not touch ISR data structures before checking the HPET's General
+ * Interrupt Status register. The General Interrupt Status register
+ * will not be set by hardware until after timer interrupt generation
+ * is enabled by software. Software allocates necessary data
+ * structures before enabling timer interrupts. ASSERT the software
+ * data structures required to handle this interrupt are initialized.
+ */
+ ASSERT(hpet_proxy_users != NULL);
+
+ /*
+ * CPUs in deep c-states do not enable interrupts until after
+ * performing idle cleanup which includes descheduling themselves from
+ * the HPET. The CPU running this ISR will NEVER find itself in the
+ * proxy list. A lost wakeup may occur if this is false.
+ */
+ ASSERT(hpet_proxy_users[CPU->cpu_id] == HPET_INFINITY);
+
+ /*
+ * Higher level interrupts may deadlock with CPUs going idle if this
+	 * ISR is preempted while holding hpet_proxy_lock.
+ */
+ intr = intr_clear();
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ intr_restore(intr);
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ if (gethrtime() > dead) {
+ hpet_expire_all();
+ return (DDI_INTR_CLAIMED);
+ }
+ }
+ intr = intr_clear();
+ }
+ (void) hpet_guaranteed_schedule(HPET_INFINITY);
+ mutex_exit(&hpet_proxy_lock);
+ intr_restore(intr);
+
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * Used when disabling the HPET Timer interrupt. CPUs in Deep C-state must be
+ * woken up because they can no longer rely on the HPET's Timer to wake them.
+ * We do not need to wait for CPUs to wake up.
+ */
+static void
+hpet_expire_all(void)
+{
+ processorid_t id;
+
+ for (id = 0; id < ncpus; ++id) {
+ if (hpet_proxy_users[id] != HPET_INFINITY) {
+ hpet_proxy_users[id] = HPET_INFINITY;
+ if (id != CPU->cpu_id)
+ poke_cpu(id);
+ }
+ }
+}
+
+/*
+ * To avoid missed wakeups this function must guarantee either the HPET timer
+ * was successfully programmed to the next expire time or there are no waiting
+ * CPUs.
+ *
+ * Callers cannot enter C2 or deeper if the HPET could not be programmed to
+ * generate its next interrupt to happen at required_wakeup_time or sooner.
+ * Returns B_TRUE if the HPET was programmed to interrupt by
+ * required_wakeup_time, B_FALSE if not.
+ */
+static boolean_t
+hpet_guaranteed_schedule(hrtime_t required_wakeup_time)
+{
+ hrtime_t now, next_proxy_time;
+ processorid_t id, next_proxy_id;
+ int proxy_timer = hpet_info.cstate_timer.timer;
+ boolean_t done = B_FALSE;
+
+ ASSERT(mutex_owned(&hpet_proxy_lock));
+
+ /*
+ * Loop until we successfully program the HPET,
+ * or no CPUs are scheduled to use the HPET as a proxy.
+ */
+ do {
+ /*
+ * Wake all CPUs that expired before now.
+ * Find the next CPU to wake up and next HPET program time.
+ */
+ now = gethrtime();
+ next_proxy_time = HPET_INFINITY;
+ next_proxy_id = CPU->cpu_id;
+ for (id = 0; id < ncpus; ++id) {
+ if (hpet_proxy_users[id] < now) {
+ hpet_proxy_users[id] = HPET_INFINITY;
+ if (id != CPU->cpu_id)
+ poke_cpu(id);
+ } else if (hpet_proxy_users[id] < next_proxy_time) {
+ next_proxy_time = hpet_proxy_users[id];
+ next_proxy_id = id;
+ }
+ }
+
+ if (next_proxy_time == HPET_INFINITY) {
+ done = B_TRUE;
+ /*
+ * There are currently no CPUs using the HPET's Timer
+ * as a proxy for their LAPIC Timer. The HPET's Timer
+ * does not need to be programmed.
+ *
+ * Letting the HPET timer wrap around to the current
+ * time is the longest possible timeout.
+ * A 64-bit timer will wrap around in ~ 2^44 seconds.
+ * A 32-bit timer will wrap around in ~ 2^12 seconds.
+ *
+ * Disabling the HPET's timer interrupt requires a
+ * (relatively expensive) write to the HPET.
+ * Instead we do nothing.
+ *
+ * We are gambling some CPU will attempt to enter a
+ * deep c-state before the timer wraps around.
+ * We assume one spurious interrupt in a little over an
+ * hour has less performance impact than writing to the
+ * HPET's timer disable bit every time all CPUs wakeup
+ * from deep c-state.
+ */
+
+ } else {
+ /*
+ * Idle CPUs disable interrupts before programming the
+ * HPET to prevent a lost wakeup if the HPET
+ * interrupts the idle cpu before it can enter a
+ * Deep C-State.
+ */
+ if (hpet_timer_program(&hpet_info, proxy_timer,
+ HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime()))
+ != AE_OK) {
+ /*
+ * We could not program the HPET to wakeup the
+ * next CPU. We must wake the CPU ourself to
+ * avoid a lost wakeup.
+ */
+ hpet_proxy_users[next_proxy_id] = HPET_INFINITY;
+ if (next_proxy_id != CPU->cpu_id)
+ poke_cpu(next_proxy_id);
+ } else {
+ done = B_TRUE;
+ }
+ }
+
+ } while (!done);
+
+ return (next_proxy_time <= required_wakeup_time);
+}
+
+/*
+ * Use an HPET timer to act as this CPU's proxy local APIC timer.
+ * Used in deep c-states C2 and above while the CPU's local APIC timer stalls.
+ * Called by the idle thread with interrupts enabled.
+ * Always returns with interrupts disabled.
+ *
+ * There are 3 possible outcomes from this function:
+ * 1. The Local APIC Timer was already disabled before this function was called.
+ * LAPIC TIMER : disabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_TRUE
+ * 2. Successfully programmed the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : disabled
+ * HPET : scheduled to wake this CPU
+ * *lapic_expire : hrtime_t when LAPIC timer would have expired
+ * Returns : B_TRUE
+ * 3. Failed to program the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : enabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_FALSE
+ *
+ * The idle thread cannot enter Deep C-State in case 3.
+ * The idle thread must re-enable & re-program the LAPIC_TIMER in case 2.
+ */
+static boolean_t
+hpet_use_hpet_timer(hrtime_t *lapic_expire)
+{
+ extern hrtime_t apic_timer_stop_count(void);
+ extern void apic_timer_restart(hrtime_t);
+ hrtime_t now, expire, dead;
+ uint64_t lapic_count, dead_count;
+ cpupart_t *cpu_part;
+ processorid_t cpu_sid;
+ processorid_t cpu_id = CPU->cpu_id;
+ processorid_t id;
+ boolean_t rslt;
+ boolean_t hset_update;
+
+ cpu_part = CPU->cpu_part;
+ cpu_sid = CPU->cpu_seqid;
+
+ ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
+ ASSERT(interrupts_enabled());
+
+ /*
+ * A critical section exists between when the HPET is programmed
+ * to interrupt the CPU and when this CPU enters an idle state.
+ * Interrupts must be blocked during that time to prevent lost
+ * CBE wakeup interrupts from either LAPIC or HPET.
+ *
+ * Must block interrupts before acquiring hpet_proxy_lock to prevent
+ * a deadlock with the ISR if the ISR runs on this CPU after the
+ * idle thread acquires the mutex but before it clears interrupts.
+ */
+ cli();
+
+ lapic_count = apic_timer_stop_count();
+ now = gethrtime();
+ dead = now + hpet_idle_spin_timeout;
+ *lapic_expire = expire = now + lapic_count;
+ if (lapic_count == (hrtime_t)-1) {
+ /*
+ * LAPIC timer is currently disabled.
+ * Will not use the HPET as a LAPIC Timer proxy.
+ */
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_TRUE);
+ }
+
+ /*
+ * Serialize hpet_proxy data structure manipulation.
+ */
+ dead_count = 0;
+ while (!mutex_tryenter(&hpet_proxy_lock)) {
+ /*
+ * spin
+ */
+ apic_timer_restart(expire);
+ sti();
+ cli();
+
+ if (dead_count++ > hpet_spin_check) {
+ dead_count = 0;
+ hset_update = (((CPU->cpu_flags & CPU_OFFLINE) == 0) &&
+ (ncpus > 1));
+ if (hset_update &&
+ !bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+ }
+
+ lapic_count = apic_timer_stop_count();
+ now = gethrtime();
+ *lapic_expire = expire = now + lapic_count;
+ if (lapic_count == (hrtime_t)-1) {
+ /*
+ * LAPIC timer is currently disabled.
+ * Will not use the HPET as a LAPIC Timer proxy.
+ */
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_TRUE);
+ }
+ if (now > dead) {
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+ }
+
+ if ((hpet_state.cpr == B_TRUE) ||
+ (hpet_state.cpu_deep_idle == B_FALSE) ||
+ (hpet_state.proxy_installed == B_FALSE) ||
+ (hpet_state.uni_cstate == B_TRUE)) {
+ mutex_exit(&hpet_proxy_lock);
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ return (B_FALSE);
+ }
+
+ hpet_proxy_users[cpu_id] = expire;
+
+ /*
+ * We are done if another cpu is scheduled on the HPET with an
+ * expire time before us. The next HPET interrupt has been programmed
+ * to fire before our expire time.
+ */
+ for (id = 0; id < ncpus; ++id) {
+ if ((hpet_proxy_users[id] <= expire) && (id != cpu_id)) {
+ mutex_exit(&hpet_proxy_lock);
+ return (B_TRUE);
+ }
+ }
+
+ /*
+ * We are the next lAPIC to expire.
+ * Program the HPET with our expire time.
+ */
+ rslt = hpet_guaranteed_schedule(expire);
+ mutex_exit(&hpet_proxy_lock);
+
+ if (rslt == B_FALSE) {
+ apic_timer_restart(expire);
+ *lapic_expire = (hrtime_t)HPET_INFINITY;
+ }
+
+ return (rslt);
+}
+
+/*
+ * Called by the idle thread when waking up from Deep C-state before enabling
+ * interrupts. With an array data structure it is faster to always remove
+ * ourselves from the array without checking whether the HPET ISR has
+ * already removed us.
+ *
+ * We use a lazy algorithm for removing CPUs from the HPET's schedule.
+ * We do not reprogram the HPET here because this CPU has real work to do.
+ * On an idle system the CPU was probably woken up by the HPET's ISR.
+ * On a heavily loaded system CPUs are not going into Deep C-state.
+ * On a moderately loaded system another CPU will usually enter Deep C-state
+ * and reprogram the HPET before the HPET fires with our wakeup.
+ */
+static void
+hpet_use_lapic_timer(hrtime_t expire)
+{
+ extern void apic_timer_restart(hrtime_t);
+ processorid_t cpu_id = CPU->cpu_id;
+
+ ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
+ ASSERT(!interrupts_enabled());
+
+ hpet_proxy_users[cpu_id] = HPET_INFINITY;
+
+ /*
+ * Do not enable a LAPIC Timer that was initially disabled.
+ */
+ if (expire != HPET_INFINITY)
+ apic_timer_restart(expire);
+
+ sti();
+}
+
+/*
+ * Initialize data structure to keep track of CPUs using HPET as a proxy for
+ * their stalled local APIC timer. For now this is just an array.
+ */
+static void
+hpet_init_proxy_data(void)
+{
+ processorid_t id;
+
+ /*
+ * Use apic_nproc because we are in boot before max_ncpus has been
+ * initialized.
+ */
+ hpet_proxy_users = kmem_zalloc(apic_nproc * sizeof (*hpet_proxy_users),
+ KM_SLEEP);
+
+ /*
+ * Unused entries always contain HPET_INFINITY.
+ */
+ for (id = 0; id < apic_nproc; ++id)
+ hpet_proxy_users[id] = HPET_INFINITY;
+}
diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c
index 123ece8286..77314f3697 100644
--- a/usr/src/uts/i86pc/io/mp_platform_common.c
+++ b/usr/src/uts/i86pc/io/mp_platform_common.c
@@ -62,7 +62,10 @@
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>
-
+#if !defined(__xpv)
+#include <sys/hpet.h>
+#include <sys/clock.h>
+#endif
/*
* Local Function Prototypes
@@ -103,6 +106,12 @@ int apic_debug_mps_id = 0; /* 1 - print MPS ID strings */
int apic_sci_vect = -1;
iflag_t apic_sci_flags;
+#if !defined(__xpv)
+/* ACPI HPET interrupt configuration; -1 if HPET not used */
+int apic_hpet_vect = -1;
+iflag_t apic_hpet_flags;
+#endif
+
/*
* psm name pointer
*/
@@ -892,6 +901,17 @@ acpi_probe(char *modname)
cmn_err(CE_CONT,
"?Using ACPI for CPU/IOAPIC information ONLY\n");
}
+
+#if !defined(__xpv)
+ /*
+ * probe ACPI for hpet information here which is used later
+ * in apic_picinit().
+ */
+ if (hpet_acpi_init(&apic_hpet_vect, &apic_hpet_flags) < 0) {
+ cmn_err(CE_NOTE, "!ACPI HPET table query failed\n");
+ }
+#endif
+
return (PSM_SUCCESS);
}
/* if setting APIC mode failed above, we fall through to cleanup */
@@ -1324,6 +1344,40 @@ ioapic_init_intr(int mask_apic)
irqptr->airq_share++;
}
+
+#if !defined(__xpv)
+ /*
+ * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
+ */
+ if (apic_hpet_vect > 0) {
+ /*
+ * hpet has already done add_avintr(); we just need
+		 * to finish the job by mimicking translate_irq()
+ *
+ * Fake up an intrspec and setup the tables
+ */
+ ispec.intrspec_vec = apic_hpet_vect;
+ ispec.intrspec_pri = CBE_HIGH_PIL;
+
+ if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
+ &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
+ cmn_err(CE_WARN, "!apic: HPET setup failed");
+ return;
+ }
+ irqptr = apic_irq_table[apic_hpet_vect];
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ /* Program I/O APIC */
+ (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);
+
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+
+ irqptr->airq_share++;
+ }
+#endif /* !defined(__xpv) */
}
/*
diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic.c b/usr/src/uts/i86pc/io/pcplusmp/apic.c
index d83e2c2209..793a48c360 100644
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.c
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c
@@ -68,6 +68,7 @@
#include <sys/sunddi.h>
#include <sys/x_call.h>
#include <sys/reboot.h>
+#include <sys/hpet.h>
/*
* Local Function Prototypes
@@ -1650,6 +1651,8 @@ apic_shutdown(int cmd, int fcn)
uchar_t byte;
ulong_t iflag;
+ hpet_acpi_fini();
+
/* Send NMI to all CPUs except self to do per processor shutdown */
iflag = intr_clear();
#ifdef DEBUG
@@ -2039,6 +2042,41 @@ apic_timer_disable(void)
(apic_clkvect + APIC_BASE_VECT) | AV_MASK);
}
+/*
+ * Set the timer far into the future and return the timer's
+ * current count in nanoseconds.
+ */
+hrtime_t
+apic_timer_stop_count(void)
+{
+ hrtime_t ns_val;
+ int enable_val, count_val;
+
+ /*
+ * Should be called with interrupts disabled.
+ */
+ ASSERT(!interrupts_enabled());
+
+ enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER);
+ if ((enable_val & AV_MASK) == AV_MASK)
+ return ((hrtime_t)-1); /* timer is disabled */
+
+ count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT);
+ ns_val = APIC_TICKS_TO_NSECS(count_val);
+
+ apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
+
+ return (ns_val);
+}
+
+/*
+ * Reprogram timer after Deep C-State.
+ */
+void
+apic_timer_restart(hrtime_t time)
+{
+ apic_timer_reprogram(time);
+}
ddi_periodic_t apic_periodic_id;
diff --git a/usr/src/uts/i86pc/io/ppm_plat.c b/usr/src/uts/i86pc/io/ppm_plat.c
index 4bc96639fe..0237676ade 100644
--- a/usr/src/uts/i86pc/io/ppm_plat.c
+++ b/usr/src/uts/i86pc/io/ppm_plat.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Platform Power Management master pseudo driver platform support.
*/
@@ -49,14 +47,17 @@ void
ppm_rebuild_cpu_domains(void)
{
char *str = "ppm_rebuild_cpu_domains";
- cpupm_cpu_dependency_t *dep;
- cpupm_cpu_dependency_t *dep_next;
- cpupm_cpu_node_t *cpu_next;
+ cpupm_state_domains_t *dep;
+ cpupm_state_domains_t *dep_next;
struct ppm_domit *domit_p;
ppm_domain_t *domp_old;
ppm_domain_t *domp;
ppm_dev_t *devp;
ppm_db_t *dbp;
+ uint_t cpu_id;
+ cpuset_t dom_cpu_set;
+ int result;
+ dev_info_t *cpu_dip;
/*
* Get the CPU domain data
@@ -100,7 +101,7 @@ ppm_rebuild_cpu_domains(void)
* leave the domain as it is (which is unmanageable since
* PPM_CPU_READY is off).
*/
- dep = cpupm_get_cpu_dependencies();
+ dep = cpupm_pstate_domains;
if (dep == NULL) {
PPMD(D_CPU, ("%s: No CPU dependency info!\n", str));
return;
@@ -112,11 +113,11 @@ ppm_rebuild_cpu_domains(void)
*/
mutex_enter(&domp_old->lock);
domp_old->dflags |= PPMD_OFFLINE;
- for (dep_next = dep; dep_next; dep_next = dep_next->cd_next) {
+ for (dep_next = dep; dep_next; dep_next = dep_next->pm_next) {
domp = kmem_zalloc(sizeof (*domp), KM_SLEEP);
domp->name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
(void) snprintf(domp->name, MAXNAMELEN, "acpi_cpu_domain_%d",
- dep_next->cd_dependency_id);
+ dep_next->pm_domain);
mutex_init(&domp->lock, NULL, MUTEX_DRIVER, NULL);
mutex_enter(&domp->lock);
domp->dflags = domit_p->dflags | PPMD_CPU_READY;
@@ -135,18 +136,27 @@ ppm_rebuild_cpu_domains(void)
* build the "conflist" for the domain. But conveniently, the
* "conflist" data is easily obtainable from the "devlist".
*/
- for (cpu_next = dep_next->cd_cpu; cpu_next;
- cpu_next = cpu_next->cn_next) {
- devp = PPM_GET_PRIVATE(cpu_next->cn_dip);
+ dom_cpu_set = dep_next->pm_cpus;
+ do {
+ CPUSET_FIND(dom_cpu_set, cpu_id);
+ if (cpu_id == CPUSET_NOTINSET)
+ break;
+
+ ASSERT(cpu_id < NCPU);
+ cpu_dip = ((cpupm_mach_state_t *)
+ (cpu[cpu_id]->cpu_m.mcpu_pm_mach_state))->ms_dip;
+ devp = PPM_GET_PRIVATE(cpu_dip);
ASSERT(devp && devp->domp == domp_old);
- devp = ppm_add_dev(cpu_next->cn_dip, domp);
+ devp = ppm_add_dev(cpu_dip, domp);
dbp = kmem_zalloc(sizeof (struct ppm_db), KM_SLEEP);
dbp->name = kmem_zalloc((strlen(devp->path) + 1),
KM_SLEEP);
(void) strcpy(dbp->name, devp->path);
dbp->next = domp->conflist;
domp->conflist = dbp;
- }
+
+ CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
+ } while (result == 0);
/*
* Note that we do not bother creating a "dc" list as there
@@ -165,7 +175,6 @@ ppm_rebuild_cpu_domains(void)
mutex_exit(&domp->lock);
}
mutex_exit(&domp_old->lock);
- cpupm_free_cpu_dependencies();
}
/*
@@ -176,7 +185,7 @@ void
ppm_set_topspeed(ppm_dev_t *cpup, int speed)
{
for (cpup = cpup->domp->devlist; cpup != NULL; cpup = cpup->next)
- (*cpupm_set_topspeed)(cpup->dip, speed);
+ (*cpupm_set_topspeed_callb)(cpup->dip, speed);
}
/*
@@ -197,7 +206,8 @@ ppm_redefine_topspeed(void *ctx)
cpup = PPM_GET_PRIVATE((dev_info_t *)ctx);
- if (cpupm_get_topspeed == NULL || cpupm_set_topspeed == NULL) {
+ if (cpupm_get_topspeed_callb == NULL ||
+ cpupm_set_topspeed_callb == NULL) {
cmn_err(CE_WARN, "%s: Cannot process request for instance %d "
"since cpupm interfaces are not initialized", str,
ddi_get_instance(cpup->dip));
@@ -215,7 +225,7 @@ ppm_redefine_topspeed(void *ctx)
* Process each CPU in the domain.
*/
for (ncpup = cpup->domp->devlist; ncpup != NULL; ncpup = ncpup->next) {
- topspeed = (*cpupm_get_topspeed)(ncpup->dip);
+ topspeed = (*cpupm_get_topspeed_callb)(ncpup->dip);
if (newspeed == -1 || topspeed < newspeed)
newspeed = topspeed;
}
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index c3f2cb5074..c47c52f37f 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1024,6 +1024,22 @@ cpuid_pass1(cpu_t *cpu)
cpi->cpi_ncore_per_chip = 1;
break;
}
+
+ /*
+ * Get CPUID data about TSC Invariance in Deep C-State.
+ */
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ if (cpi->cpi_maxeax >= 7) {
+ cp = &cpi->cpi_extd[7];
+ cp->cp_eax = 0x80000007;
+ cp->cp_ecx = 0;
+ (void) __cpuid_insn(cp);
+ }
+ break;
+ default:
+ break;
+ }
} else {
cpi->cpi_ncore_per_chip = 1;
}
@@ -3847,6 +3863,36 @@ patch_tsc_read(int flag)
}
}
+int
+cpuid_deep_cstates_supported(void)
+{
+ struct cpuid_info *cpi;
+ struct cpuid_regs regs;
+
+ ASSERT(cpuid_checkpass(CPU, 1));
+
+ cpi = CPU->cpu_m.mcpu_cpi;
+
+ if (!(x86_feature & X86_CPUID))
+ return (0);
+
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ if (cpi->cpi_xmaxeax < 0x80000007)
+ return (0);
+
+ /*
+		 * Does the TSC run at a constant rate in all ACPI C-states?
+ */
+ regs.cp_eax = 0x80000007;
+ (void) __cpuid_insn(&regs);
+ return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
+
+ default:
+ return (0);
+ }
+}
+
#if defined(__amd64) && !defined(__xpv)
/*
* Patch in versions of bcopy for high performance Intel Nhm processors
diff --git a/usr/src/uts/i86pc/os/cpupm.c b/usr/src/uts/i86pc/os/cpupm.c
deleted file mode 100644
index 6aad26948a..0000000000
--- a/usr/src/uts/i86pc/os/cpupm.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/cpupm.h>
-
-/*
- * This callback is used to build the PPM CPU domains once
- * all the CPU devices have been started. The callback is
- * initialized by the PPM driver to point to a routine that
- * will build the domains.
- */
-void (*cpupm_rebuild_cpu_domains)(void);
-
-/*
- * This callback is used to reset the topspeed for all the
- * CPU devices. The callback is initialized by the PPM driver to
- * point to a routine that will reinitialize all the CPU devices
- * once all the CPU devices have been started and the CPU domains
- * built.
- */
-void (*cpupm_init_topspeed)(void);
-
-/*
- * This callback is used to redefine the topspeed for a CPU device.
- * Since all CPUs in a domain should have identical properties, this
- * callback is initialized by the PPM driver to point to a routine
- * that will redefine the topspeed for all devices in a CPU domain.
- * This callback is exercised whenever an ACPI _PPC change notification
- * is received by the CPU driver.
- */
-void (*cpupm_redefine_topspeed)(void *);
-
-/*
- * This callback is used by the PPM driver to call into the CPU driver
- * to find a CPU's current topspeed (i.e., it's current ACPI _PPC value).
- */
-void (*cpupm_set_topspeed)(void *, int);
-
-/*
- * This callback is used by the PPM driver to call into the CPU driver
- * to set a new topspeed for a CPU.
- */
-int (*cpupm_get_topspeed)(void *);
-
-/*
- * Used to dynamically keep track of the CPU dependencies as CPU
- * devices attach. Will subsequently be used by the PPM driver
- * to build PPM CPU domains.
- */
-static cpupm_cpu_dependency_t *cpupm_cpu_dependencies = NULL;
-
-/*
- * If we are unable to correctly identify a dependency for any CPU, then
- * we punt and all CPUs are managed as one domain.
- */
-static boolean_t cpupm_dependencies_valid = B_TRUE;
-
-/*
- * If any CPU fails to attach, then cpupm is disabled for all CPUs.
- */
-static uint32_t cpupm_enabled = CPUPM_P_STATES | CPUPM_T_STATES;
-
-/*
- * Until all CPUs have succesfully attached, we do not allow
- * power management.
- */
-static boolean_t cpupm_ready = B_FALSE;
-
-/*
- * Print the CPU dependencies.
- */
-static void
-cpupm_print_cpu_dependencies()
-{
- cpupm_cpu_dependency_t *dptr;
- cpupm_cpu_node_t *nptr;
-
- for (dptr = cpupm_cpu_dependencies; dptr != NULL;
- dptr = dptr->cd_next) {
- for (nptr = dptr->cd_cpu; nptr != NULL; nptr = nptr->cn_next) {
- int instance = ddi_get_instance(nptr->cn_dip);
- cmn_err(CE_NOTE,
- "print_cpu_dependencies: dependency %d "
- "instance %d\n", dptr->cd_dependency_id, instance);
- }
- }
-}
-
-/*
- * Used to retrieve the dependencies built during CPUs attaching.
- */
-cpupm_cpu_dependency_t *
-cpupm_get_cpu_dependencies()
-{
- return (cpupm_cpu_dependencies);
-}
-
-/*
- * Build dependencies as CPUs attach. Note that we don't need to worry
- * about locking the dependency lists as concurrency is not an issue.
- * This routine relies on the fact that the CPU devices are attached
- * sequentially by a single thread.
- */
-void
-cpupm_add_cpu2dependency(dev_info_t *dip, int cpu_dependency)
-{
- cpupm_cpu_dependency_t *dptr;
- cpupm_cpu_node_t *nptr;
-
- if (!cpupm_dependencies_valid)
- return;
-
- if (cpu_dependency == -1) {
- cpupm_free_cpu_dependencies();
- return;
- }
-
- for (dptr = cpupm_cpu_dependencies; dptr != NULL;
- dptr = dptr->cd_next) {
- if (dptr->cd_dependency_id == cpu_dependency)
- break;
- }
-
- /* new dependency is created and linked at the head */
- if (dptr == NULL) {
- dptr = kmem_zalloc(sizeof (cpupm_cpu_dependency_t), KM_SLEEP);
- dptr->cd_dependency_id = cpu_dependency;
- dptr->cd_next = cpupm_cpu_dependencies;
- cpupm_cpu_dependencies = dptr;
- }
-
- /* new cpu is created and linked at head of dependency */
- nptr = kmem_zalloc(sizeof (cpupm_cpu_node_t), KM_SLEEP);
- nptr->cn_dip = dip;
- nptr->cn_next = dptr->cd_cpu;
- dptr->cd_cpu = nptr;
-}
-
-/*
- * Free the CPU dependencies.
- */
-void
-cpupm_free_cpu_dependencies()
-{
- cpupm_cpu_dependency_t *this_dependency, *next_dependency;
- cpupm_cpu_node_t *this_node, *next_node;
-
- cpupm_dependencies_valid = B_FALSE;
- this_dependency = cpupm_cpu_dependencies;
- while (this_dependency != NULL) {
- next_dependency = this_dependency->cd_next;
-
- /* discard CPU node chain */
- this_node = this_dependency->cd_cpu;
- while (this_node != NULL) {
- next_node = this_node->cn_next;
- kmem_free((void *)this_node,
- sizeof (cpupm_cpu_node_t));
- this_node = next_node;
- }
- kmem_free((void *)this_dependency,
- sizeof (cpupm_cpu_dependency_t));
- this_dependency = next_dependency;
- }
- cpupm_cpu_dependencies = NULL;
-}
-
-/*
- * If all CPUs have attached successfully, then the CPUs are
- * ready for power management.
- */
-boolean_t
-cpupm_is_ready()
-{
-#ifndef __xpv
- if (cpupm_enabled == CPUPM_NO_STATES)
- return (B_FALSE);
- return (cpupm_ready);
-#else
- return (B_FALSE);
-#endif
-}
-
-boolean_t
-cpupm_is_enabled(uint32_t state)
-{
- return ((cpupm_enabled & state) == state);
-}
-
-/*
- * By default, all states are enabled. But if there are any errors attaching
- * any of the CPU devices, then they are disabled.
- */
-void
-cpupm_disable(uint32_t state)
-{
- cpupm_enabled &= ~state;
- if (state & CPUPM_P_STATES)
- cpupm_free_cpu_dependencies();
-}
-
-/*
- * Once all CPUs have been started, the PPM driver should build CPU
- * domains and initialize the topspeed for all CPU devices.
- */
-void
-cpupm_post_startup()
-{
-#ifndef __xpv
- /*
- * The CPU domain built by the PPM during CPUs attaching
- * should be rebuilt with the information retrieved from
- * ACPI.
- */
- if (cpupm_rebuild_cpu_domains != NULL)
- (*cpupm_rebuild_cpu_domains)();
-
- /*
- * Only initialize the topspeed if P-states are enabled.
- */
- if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
- (*cpupm_init_topspeed)();
-#endif
- cpupm_ready = B_TRUE;
-}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
index 569ca2fc92..76e087a873 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,7 +37,8 @@ typedef enum cpu_acpi_obj {
PTC_OBJ,
TSS_OBJ,
TSD_OBJ,
- TPC_OBJ
+ TPC_OBJ,
+ CSD_OBJ,
} cpu_acpi_obj_t;
/*
@@ -61,7 +62,8 @@ static cpu_acpi_obj_attr_t cpu_acpi_obj_attrs[] = {
{"_PTC"},
{"_TSS"},
{"_TSD"},
- {"_TPC"}
+ {"_TPC"},
+ {"_CSD"}
};
/*
@@ -199,8 +201,14 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
{
ACPI_BUFFER abuf;
ACPI_OBJECT *pkg, *elements;
+ int number;
int ret = -1;
+ if (objtype == CSD_OBJ) {
+ number = 6;
+ } else {
+ number = 5;
+ }
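+	/*
+	 * Per the ACPI specification, a _CSD entry carries six elements
+	 * (the sixth being the C-state index, cached below as sd_index),
+	 * while _PSD and _TSD entries carry five.
+	 */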
/*
* Fetch the dependencies (if present) for the CPU node.
* Since they are optional, non-existence is not a failure
@@ -215,21 +223,29 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
}
pkg = abuf.Pointer;
- if (pkg->Package.Count != 1) {
+
+ if (((objtype != CSD_OBJ) && (pkg->Package.Count != 1)) ||
+ ((objtype == CSD_OBJ) && (pkg->Package.Count != 1) &&
+ (pkg->Package.Count != 2))) {
cmn_err(CE_NOTE, "!cpu_acpi: %s unsupported package "
"count %d.", cpu_acpi_obj_attrs[objtype].name,
pkg->Package.Count);
goto out;
}
+ /*
+	 * For the C-state domain, we assume C2 and C3 have the same
+	 * domain information.
+ */
if (pkg->Package.Elements[0].Type != ACPI_TYPE_PACKAGE ||
- pkg->Package.Elements[0].Package.Count != 5) {
+ pkg->Package.Elements[0].Package.Count != number) {
cmn_err(CE_NOTE, "!cpu_acpi: Unexpected data in %s package.",
cpu_acpi_obj_attrs[objtype].name);
goto out;
}
elements = pkg->Package.Elements[0].Package.Elements;
- if (elements[0].Integer.Value != 5 || elements[1].Integer.Value != 0) {
+ if (elements[0].Integer.Value != number ||
+ elements[1].Integer.Value != 0) {
cmn_err(CE_NOTE, "!cpu_acpi: Unexpected %s revision.",
cpu_acpi_obj_attrs[objtype].name);
goto out;
@@ -240,6 +256,9 @@ cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle,
sd->sd_domain = elements[2].Integer.Value;
sd->sd_type = elements[3].Integer.Value;
sd->sd_num = elements[4].Integer.Value;
+ if (objtype == CSD_OBJ) {
+ sd->sd_index = elements[5].Integer.Value;
+ }
ret = 0;
out:
@@ -285,6 +304,25 @@ cpu_acpi_cache_tsd(cpu_acpi_handle_t handle)
}
+/*
+ * Cache the ACPI _CSD data. The _CSD data defines C-state CPU dependencies
+ * (think CPU domains).
+ */
+static int
+cpu_acpi_cache_csd(cpu_acpi_handle_t handle)
+{
+ cpu_acpi_csd_t *csd;
+ int ret;
+
+ CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CSD_CACHED);
+ csd = &CPU_ACPI_CSD(handle);
+ ret = cpu_acpi_cache_state_dependencies(handle, CSD_OBJ, csd);
+ if (ret == 0)
+ CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CSD_CACHED);
+ return (ret);
+}
+
static void
cpu_acpi_cache_pstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt)
{
@@ -567,6 +605,126 @@ cpu_acpi_cache_tpc(cpu_acpi_handle_t handle)
CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TPC_CACHED);
}
+int
+cpu_acpi_verify_cstate(cpu_acpi_cstate_t *cstate)
+{
+ uint32_t addrspaceid = cstate->cs_addrspace_id;
+
+ if ((addrspaceid != ACPI_ADR_SPACE_FIXED_HARDWARE) &&
+ (addrspaceid != ACPI_ADR_SPACE_SYSTEM_IO)) {
+ cmn_err(CE_WARN, "!_CST: unsupported address space id"
+ ":C%d, type: %d\n", cstate->cs_type, addrspaceid);
+ return (1);
+ }
+ return (0);
+}
+
+int
+cpu_acpi_cache_cst(cpu_acpi_handle_t handle)
+{
+ ACPI_BUFFER abuf;
+ ACPI_OBJECT *obj;
+ ACPI_INTEGER cnt;
+ cpu_acpi_cstate_t *cstate, *p;
+ int i, count;
+
+ CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CST_CACHED);
+
+ abuf.Length = ACPI_ALLOCATE_BUFFER;
+ abuf.Pointer = NULL;
+
+ if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, "_CST",
+ NULL, &abuf))) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST evaluate failure");
+ return (-1);
+ }
+ obj = (ACPI_OBJECT *)abuf.Pointer;
+ if (obj->Package.Count < 2) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST package bad count %d.",
+ obj->Package.Count);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ /*
+ * Does the package look coherent?
+ */
+ cnt = obj->Package.Elements[0].Integer.Value;
+ if (cnt < 1 || cnt != obj->Package.Count - 1) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid element count %d != "
+ "Package count %d\n",
+ (int)cnt, (int)obj->Package.Count - 1);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)cnt;
+ CPU_ACPI_CSTATES(handle) = kmem_zalloc(CPU_ACPI_CSTATES_SIZE(cnt),
+ KM_SLEEP);
+ CPU_ACPI_BM_INFO(handle) = 0;
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ p = cstate;
+
+ for (i = 1, count = 1; i <= cnt; i++) {
+ ACPI_OBJECT *pkg;
+ AML_RESOURCE_GENERIC_REGISTER *reg;
+ ACPI_OBJECT *element;
+
+ pkg = &(obj->Package.Elements[i]);
+ reg = (AML_RESOURCE_GENERIC_REGISTER *)
+ pkg->Package.Elements[0].Buffer.Pointer;
+ cstate->cs_addrspace_id = reg->AddressSpaceId;
+ cstate->cs_address = reg->Address;
+ element = &(pkg->Package.Elements[1]);
+ cstate->cs_type = element->Integer.Value;
+ element = &(pkg->Package.Elements[2]);
+ cstate->cs_latency = element->Integer.Value;
+ element = &(pkg->Package.Elements[3]);
+ cstate->cs_power = element->Integer.Value;
+
+ if (cpu_acpi_verify_cstate(cstate)) {
+ /*
+ * ignore this entry if it's not valid
+ */
+ continue;
+ }
+ if (cstate == p) {
+ cstate++;
+ } else if (p->cs_type == cstate->cs_type) {
+ /*
+			 * If there are duplicate entries, keep the last
+			 * one. This handles:
+			 * 1) buggy BIOSes that report completely duplicate
+			 *    entries.
+			 * 2) the ACPI spec allowing the same C-state entry
+			 *    with different power and latency values; we
+			 *    use the one with the greater power savings.
+ */
+ (void) memcpy(p, cstate, sizeof (cpu_acpi_cstate_t));
+ } else {
+ /*
+			 * We got a valid entry; cache it in the
+			 * cstate structure.
+ */
+ p = cstate++;
+ count++;
+ }
+ }
+
+ if (count < 2) {
+ cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid count %d < 2\n",
+ count);
+ AcpiOsFree(abuf.Pointer);
+ return (-1);
+ }
+
+ if (count != cnt)
+ CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)count;
+
+ AcpiOsFree(abuf.Pointer);
+ CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CST_CACHED);
+ return (0);
+}
+
/*
* Cache the _PCT, _PSS, _PSD and _PPC data.
*/
@@ -575,19 +733,19 @@ cpu_acpi_cache_pstate_data(cpu_acpi_handle_t handle)
{
if (cpu_acpi_cache_pct(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PCT for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_pstates(handle) != 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSS for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_psd(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSD for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
@@ -617,19 +775,19 @@ cpu_acpi_cache_tstate_data(cpu_acpi_handle_t handle)
{
if (cpu_acpi_cache_ptc(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _PTC for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_tstates(handle) != 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSS for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
if (cpu_acpi_cache_tsd(handle) < 0) {
cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSD for "
- "CPU instance %d", ddi_get_instance(handle->cs_dip));
+ "CPU %d", handle->cs_id);
return (-1);
}
@@ -652,17 +810,63 @@ cpu_acpi_free_tstate_data(cpu_acpi_handle_t handle)
}
/*
+ * Cache the _CST data.
+ */
+int
+cpu_acpi_cache_cstate_data(cpu_acpi_handle_t handle)
+{
+ if (cpu_acpi_cache_cst(handle) < 0) {
+ cmn_err(CE_WARN, "!cpu_acpi: error parsing _CST for "
+ "CPU %d", handle->cs_id);
+ return (-1);
+ }
+
+ if (cpu_acpi_cache_csd(handle) < 0) {
+ cmn_err(CE_WARN, "!cpu_acpi: error parsing _CSD for "
+ "CPU %d", handle->cs_id);
+ return (-1);
+ }
+
+ return (0);
+}
+
+void
+cpu_acpi_free_cstate_data(cpu_acpi_handle_t handle)
+{
+ if (handle != NULL) {
+ if (CPU_ACPI_CSTATES(handle)) {
+ kmem_free(CPU_ACPI_CSTATES(handle),
+ CPU_ACPI_CSTATES_SIZE(
+ CPU_ACPI_CSTATES_COUNT(handle)));
+ CPU_ACPI_CSTATES(handle) = NULL;
+ }
+ }
+}
+
+/*
* Register a handler for processor change notifications.
*/
void
cpu_acpi_install_notify_handler(cpu_acpi_handle_t handle,
- ACPI_NOTIFY_HANDLER handler, dev_info_t *dip)
+ ACPI_NOTIFY_HANDLER handler, void *ctx)
{
- char path[MAXNAMELEN];
if (ACPI_FAILURE(AcpiInstallNotifyHandler(handle->cs_handle,
- ACPI_DEVICE_NOTIFY, handler, dip)))
+ ACPI_DEVICE_NOTIFY, handler, ctx)))
cmn_err(CE_NOTE, "!cpu_acpi: Unable to register "
- "notify handler for %s", ddi_pathname(dip, path));
+ "notify handler for CPU");
+}
+
+/*
+ * Remove a handler for processor change notifications.
+ */
+void
+cpu_acpi_remove_notify_handler(cpu_acpi_handle_t handle,
+ ACPI_NOTIFY_HANDLER handler)
+{
+ if (ACPI_FAILURE(AcpiRemoveNotifyHandler(handle->cs_handle,
+ ACPI_DEVICE_NOTIFY, handler)))
+ cmn_err(CE_NOTE, "!cpu_acpi: Unable to remove "
+ "notify handler for CPU");
}
/*
@@ -763,21 +967,43 @@ cpu_acpi_free_speeds(int *speeds, uint_t nspeeds)
kmem_free(speeds, nspeeds * sizeof (int));
}
+uint_t
+cpu_acpi_get_max_cstates(cpu_acpi_handle_t handle)
+{
+ if (CPU_ACPI_CSTATES(handle))
+ return (CPU_ACPI_CSTATES_COUNT(handle));
+ else
+ return (1);
+}
+
+void
+cpu_acpi_set_register(uint32_t bitreg, uint32_t value)
+{
+ AcpiSetRegister(bitreg, value);
+}
+
+void
+cpu_acpi_get_register(uint32_t bitreg, uint32_t *value)
+{
+ AcpiGetRegister(bitreg, value);
+}
+
/*
* Map the dip to an ACPI handle for the device.
*/
cpu_acpi_handle_t
-cpu_acpi_init(dev_info_t *dip)
+cpu_acpi_init(cpu_t *cp)
{
cpu_acpi_handle_t handle;
handle = kmem_zalloc(sizeof (cpu_acpi_state_t), KM_SLEEP);
- if (ACPI_FAILURE(acpica_get_handle(dip, &handle->cs_handle))) {
+ if (ACPI_FAILURE(acpica_get_handle_cpu(cp->cpu_id,
+ &handle->cs_handle))) {
kmem_free(handle, sizeof (cpu_acpi_state_t));
return (NULL);
}
- handle->cs_dip = dip;
+ handle->cs_id = cp->cpu_id;
return (handle);
}
diff --git a/usr/src/uts/i86pc/os/cpupm/cpu_idle.c b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c
new file mode 100644
index 0000000000..40b03ff38b
--- /dev/null
+++ b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c
@@ -0,0 +1,877 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/x86_archext.h>
+#include <sys/machsystm.h>
+#include <sys/x_call.h>
+#include <sys/stat.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/cpu_acpi.h>
+#include <sys/cpu_idle.h>
+#include <sys/cpupm.h>
+#include <sys/hpet.h>
+#include <sys/archsystm.h>
+#include <vm/hat_i86.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/callb.h>
+
+extern void cpu_idle_adaptive(void);
+
+static int cpu_idle_init(cpu_t *);
+static void cpu_idle_fini(cpu_t *);
+static boolean_t cpu_deep_idle_callb(void *arg, int code);
+static boolean_t cpu_idle_cpr_callb(void *arg, int code);
+static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
+static void cpuidle_set_cstate_latency(cpu_t *cp);
+
+/*
+ * Interfaces for modules implementing Intel's deep c-state.
+ */
+cpupm_state_ops_t cpu_idle_ops = {
+ "Generic ACPI C-state Support",
+ cpu_idle_init,
+ cpu_idle_fini,
+ NULL
+};
+
+static kmutex_t cpu_idle_callb_mutex;
+static callb_id_t cpu_deep_idle_callb_id;
+static callb_id_t cpu_idle_cpr_callb_id;
+static uint_t cpu_idle_cfg_state;
+
+static kmutex_t cpu_idle_mutex;
+
+cpu_idle_kstat_t cpu_idle_kstat = {
+ { "address_space_id", KSTAT_DATA_STRING },
+ { "latency", KSTAT_DATA_UINT32 },
+ { "power", KSTAT_DATA_UINT32 },
+};
+
+/*
+ * kstat update function of the c-state info
+ */
+static int
+cpu_idle_kstat_update(kstat_t *ksp, int flag)
+{
+ cpu_acpi_cstate_t *cstate = ksp->ks_private;
+
+ if (flag == KSTAT_WRITE) {
+ return (EACCES);
+ }
+
+ if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "FFixedHW");
+ } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "SystemIO");
+ } else {
+ kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
+ "Unsupported");
+ }
+
+ cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
+ cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
+
+ return (0);
+}
+
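+/*
+ * Note that a single virtual kstat image (cpu_idle_kstat above) is shared
+ * by every C-state: ks_private selects which cpu_acpi_cstate_t is reported
+ * and cpu_idle_mutex, installed as ks_lock, serializes access to it.
+ */
+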
+/*
+ * c-state wakeup function.
+ * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
+ * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
+ */
+void
+cstate_wakeup(cpu_t *cp, int bound)
+{
+ struct machcpu *mcpu = &(cp->cpu_m);
+ volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
+ cpupart_t *cpu_part;
+ uint_t cpu_found;
+ processorid_t cpu_sid;
+
+ cpu_part = cp->cpu_part;
+ cpu_sid = cp->cpu_seqid;
+	/*
+	 * If the CPU is in the partition's halted set, it needs to be
+	 * woken up.
+	 */
+ if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
+ /*
+ * Clear the halted bit for that CPU since it will be
+ * poked in a moment.
+ */
+ bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
+
+ /*
+ * We may find the current CPU present in the halted cpuset
+ * if we're in the context of an interrupt that occurred
+ * before we had a chance to clear our bit in cpu_idle().
+ * Waking ourself is obviously unnecessary, since if
+ * we're here, we're not halted.
+ */
+ if (cp != CPU) {
+ /*
+ * Use correct wakeup mechanism
+ */
+ if ((mcpu_mwait != NULL) &&
+ (*mcpu_mwait == MWAIT_HALTED))
+ MWAIT_WAKEUP(cp);
+ else
+ poke_cpu(cp->cpu_id);
+ }
+ return;
+ } else {
+ /*
+ * This cpu isn't halted, but it's idle or undergoing a
+ * context switch. No need to awaken anyone else.
+ */
+ if (cp->cpu_thread == cp->cpu_idle_thread ||
+ cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
+ return;
+ }
+
+ /*
+ * No need to wake up other CPUs if the thread we just enqueued
+ * is bound.
+ */
+ if (bound)
+ return;
+
+
+ /*
+ * See if there's any other halted CPUs. If there are, then
+ * select one, and awaken it.
+ * It's possible that after we find a CPU, somebody else
+ * will awaken it before we get the chance.
+ * In that case, look again.
+ */
+ do {
+ cpu_found = bitset_find(&cpu_part->cp_haltset);
+ if (cpu_found == (uint_t)-1)
+ return;
+
+ } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
+ cpu_found) < 0);
+
+ /*
+ * Must use correct wakeup mechanism to avoid lost wakeup of
+ * alternate cpu.
+ */
+ if (cpu_found != CPU->cpu_seqid) {
+ mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
+ if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
+ MWAIT_WAKEUP(cpu_seq[cpu_found]);
+ else
+ poke_cpu(cpu_seq[cpu_found]->cpu_id);
+ }
+}
+
+/*
+ * enter deep c-state handler
+ */
+static void
+acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
+{
+ volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
+ cpu_t *cpup = CPU;
+ processorid_t cpu_sid = cpup->cpu_seqid;
+ cpupart_t *cp = cpup->cpu_part;
+ hrtime_t lapic_expire;
+ uint8_t type = cstate->cs_addrspace_id;
+ uint32_t cs_type = cstate->cs_type;
+ int hset_update = 1;
+ boolean_t using_hpet_timer;
+
+ /*
+ * Set our mcpu_mwait here, so we can tell if anyone tries to
+ * wake us between now and when we call mwait. No other cpu will
+ * attempt to set our mcpu_mwait until we add ourself to the haltset.
+ */
+ if (mcpu_mwait) {
+ if (type == ACPI_ADR_SPACE_SYSTEM_IO)
+ *mcpu_mwait = MWAIT_WAKEUP_IPI;
+ else
+ *mcpu_mwait = MWAIT_HALTED;
+ }
+
+ /*
+ * If this CPU is online, and there are multiple CPUs
+ * in the system, then we should note our halting
+ * by adding ourselves to the partition's halted CPU
+ * bitmap. This allows other CPUs to find/awaken us when
+ * work becomes available.
+ */
+ if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
+ hset_update = 0;
+
+ /*
+ * Add ourselves to the partition's halted CPUs bitmask
+ * and set our HALTED flag, if necessary.
+ *
+ * When a thread becomes runnable, it is placed on the queue
+ * and then the halted cpuset is checked to determine who
+ * (if anyone) should be awakened. We therefore need to first
+	 * add ourselves to the halted cpuset, and then check if there
+ * is any work available.
+ *
+ * Note that memory barriers after updating the HALTED flag
+ * are not necessary since an atomic operation (updating the bitmap)
+ * immediately follows. On x86 the atomic operation acts as a
+ * memory barrier for the update of cpu_disp_flags.
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags |= CPU_DISP_HALTED;
+ bitset_atomic_add(&cp->cp_haltset, cpu_sid);
+ }
+
+ /*
+ * Check to make sure there's really nothing to do.
+ * Work destined for this CPU may become available after
+ * this check. We'll be notified through the clearing of our
+ * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
+ *
+ * disp_anywork() checks disp_nrunnable, so we do not have to later.
+ */
+ if (disp_anywork()) {
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ /*
+ * We're on our way to being halted.
+ *
+ * The local APIC timer can stop in ACPI C2 and deeper c-states.
+ * Program the HPET hardware to substitute for this CPU's lAPIC timer.
+ * hpet.use_hpet_timer() disables the LAPIC Timer. Make sure to
+ * start the LAPIC Timer again before leaving this function.
+ *
+ * hpet.use_hpet_timer disables interrupts, so we will awaken
+ * immediately after halting if someone tries to poke us between now
+ * and the time we actually halt.
+ */
+ using_hpet_timer = hpet.use_hpet_timer(&lapic_expire);
+
+ /*
+ * We check for the presence of our bit after disabling interrupts.
+ * If it's cleared, we'll return. If the bit is cleared after
+ * we check then the cstate_wakeup() will pop us out of the halted
+ * state.
+ *
+ * This means that the ordering of the cstate_wakeup() and the clearing
+ * of the bit by cpu_wakeup is important.
+ * cpu_wakeup() must clear our mc_haltset bit, and then call
+ * cstate_wakeup().
+ * acpi_cpu_cstate() must disable interrupts, then check for the bit.
+ */
+ if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
+ hpet.use_lapic_timer(lapic_expire);
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ return;
+ }
+
+ /*
+ * The check for anything locally runnable is here for performance
+ * and isn't needed for correctness. disp_nrunnable ought to be
+ * in our cache still, so it's inexpensive to check, and if there
+ * is anything runnable we won't have to wait for the poke.
+ */
+ if (cpup->cpu_disp->disp_nrunnable != 0) {
+ hpet.use_lapic_timer(lapic_expire);
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ if (using_hpet_timer == B_FALSE) {
+
+ hpet.use_lapic_timer(lapic_expire);
+
+ /*
+ * We are currently unable to program the HPET to act as this
+ * CPU's proxy lAPIC timer. This CPU cannot enter C2 or deeper
+ * because no timer is set to wake it up while its lAPIC timer
+ * stalls in deep C-States.
+ * Enter C1 instead.
+ *
+ * cstate_wake_cpu() will wake this CPU with an IPI which
+ * works with MWAIT.
+ */
+ i86_monitor(mcpu_mwait, 0, 0);
+ if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
+ cpu_dtrace_idle_probe(CPU_ACPI_C1);
+
+ tlb_going_idle();
+ i86_mwait(0, 0);
+ tlb_service();
+
+ cpu_dtrace_idle_probe(CPU_ACPI_C0);
+ }
+
+ /*
+ * We're no longer halted
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+ return;
+ }
+
+ cpu_dtrace_idle_probe((uint_t)cs_type);
+
+ if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
+ /*
+ * We're on our way to being halted.
+ * To avoid a lost wakeup, arm the monitor before checking
+ * if another cpu wrote to mcpu_mwait to wake us up.
+ */
+ i86_monitor(mcpu_mwait, 0, 0);
+ if (*mcpu_mwait == MWAIT_HALTED) {
+ uint32_t eax = cstate->cs_address;
+ uint32_t ecx = 1;
+
+ tlb_going_idle();
+ i86_mwait(eax, ecx);
+ tlb_service();
+ }
+ } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
+ uint32_t value;
+ ACPI_TABLE_FADT *gbl_FADT;
+
+ if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
+ tlb_going_idle();
+ (void) cpu_acpi_read_port(cstate->cs_address,
+ &value, 8);
+ acpica_get_global_FADT(&gbl_FADT);
+ (void) cpu_acpi_read_port(
+ gbl_FADT->XPmTimerBlock.Address, &value, 32);
+ tlb_service();
+ }
+ } else {
+ cmn_err(CE_WARN, "!_CST: cs_type %lx bad asid type %lx\n",
+ (long)cs_type, (long)type);
+ }
+
+ /*
+ * The lAPIC timer may have stopped in deep c-state.
+ * Reprogram this CPU's lAPIC here before enabling interrupts.
+ */
+ hpet.use_lapic_timer(lapic_expire);
+
+ cpu_dtrace_idle_probe(CPU_ACPI_C0);
+
+ /*
+ * We're no longer halted
+ */
+ if (hset_update) {
+ cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
+ bitset_atomic_del(&cp->cp_haltset, cpu_sid);
+ }
+}
+
+/*
+ * indicate when bus masters are active
+ */
+static uint32_t
+cpu_acpi_bm_sts(void)
+{
+ uint32_t bm_sts = 0;
+
+ cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
+
+ if (bm_sts)
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+
+ return (bm_sts);
+}
+
+/*
+ * Idle the present CPU, deep c-state is supported
+ */
+void
+cpu_acpi_idle(void)
+{
+ cpu_t *cp = CPU;
+ uint16_t cs_type;
+ cpu_acpi_handle_t handle;
+ cma_c_state_t *cs_data;
+ cpu_acpi_cstate_t *cstate;
+ hrtime_t start, end;
+ int cpu_max_cstates;
+
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
+
+ cs_data = mach_state->ms_cstate.cma_state.cstate;
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ ASSERT(cstate != NULL);
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+ if (cpu_max_cstates > CPU_MAX_CSTATES)
+ cpu_max_cstates = CPU_MAX_CSTATES;
+
+ start = gethrtime_unscaled();
+
+ cs_type = cpupm_next_cstate(cs_data, start);
+
+ /*
+ * OSPM uses the BM_STS bit to determine the power state to enter
+ * when considering a transition to or from the C2/C3 power state.
+	 * If C3 is selected, bus master activity demotes the power state
+ * to C2.
+ */
+ if ((cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
+ cs_type = CPU_ACPI_C2;
+
+ /*
+ * BM_RLD determines if the Cx power state was exited as a result of
+ * bus master requests. Set this bit when using a C3 power state, and
+ * clear it when using a C1 or C2 power state.
+ */
+ if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+ CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
+ }
+
+ if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
+ (cs_type >= CPU_ACPI_C3)) {
+ cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+ CPU_ACPI_BM_INFO(handle) |= BM_RLD;
+ }
+
+ cstate += cs_type - 1;
+
+ switch (cs_type) {
+ default:
+ /* FALLTHROUGH */
+ case CPU_ACPI_C1:
+ (*non_deep_idle_cpu)();
+ break;
+
+ case CPU_ACPI_C2:
+ acpi_cpu_cstate(cstate);
+ break;
+
+ case CPU_ACPI_C3:
+ /*
+		 * As recommended in the ACPI spec, use the hardware
+		 * mechanism to prevent bus masters from writing to
+		 * memory while in C3 (UP only).
+ */
+ if ((ncpus_online == 1) &&
+ (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
+ cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+ CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
+ /*
+			 * Today all Intel processors that support C3 keep
+			 * their caches coherent, so no explicit flush is
+			 * needed.
+ */
+ } else if (x86_vendor != X86_VENDOR_Intel) {
+ __acpi_wbinvd();
+ }
+ acpi_cpu_cstate(cstate);
+ if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
+ cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+ CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
+ }
+ break;
+ }
+
+ end = gethrtime_unscaled();
+
+ /*
+ * Update statistics
+ */
+ cpupm_wakeup_cstate_data(cs_data, end);
+}
+
+boolean_t
+cpu_deep_cstates_supported(void)
+{
+ extern int idle_cpu_no_deep_c;
+
+ if (idle_cpu_no_deep_c)
+ return (B_FALSE);
+
+ if (!cpuid_deep_cstates_supported())
+ return (B_FALSE);
+
+ if ((hpet.supported != HPET_FULL_SUPPORT) || !hpet.install_proxy())
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Validate that this processor supports deep cstate and if so,
+ * get the c-state data from ACPI and cache it.
+ */
+static int
+cpu_idle_init(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpu_acpi_cstate_t *cstate;
+ char name[KSTAT_STRLEN];
+ int cpu_max_cstates, i;
+ ACPI_TABLE_FADT *gbl_FADT;
+
+ /*
+ * Cache the C-state specific ACPI data.
+ */
+ if (cpu_acpi_cache_cstate_data(handle) != 0) {
+ cmn_err(CE_NOTE,
+ "!cpu_idle_init: Failed to cache ACPI C-state data\n");
+ cpu_idle_fini(cp);
+ return (-1);
+ }
+
+ /*
+ * Check the bus master arbitration control ability.
+ */
+ acpica_get_global_FADT(&gbl_FADT);
+ if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
+ CPU_ACPI_BM_INFO(handle) |= BM_CTL;
+
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+
+ for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
+ (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
+ /*
+ * Allocate, initialize and install cstate kstat
+ */
+ cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
+ name, "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (cstate->cs_ksp == NULL) {
+ cmn_err(CE_NOTE, "kstat_create(c_state) fail");
+ } else {
+ cstate->cs_ksp->ks_data = &cpu_idle_kstat;
+ cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
+ cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
+ cstate->cs_ksp->ks_data_size += MAXNAMELEN;
+ cstate->cs_ksp->ks_private = cstate;
+ kstat_install(cstate->cs_ksp);
+ cstate++;
+ }
+ }
+
+ cpupm_alloc_domains(cp, CPUPM_C_STATES);
+ cpupm_alloc_ms_cstate(cp);
+ cpuidle_set_cstate_latency(cp);
+
+ if (cpu_deep_cstates_supported()) {
+ mutex_enter(&cpu_idle_callb_mutex);
+ if (cpu_deep_idle_callb_id == (callb_id_t)0)
+ cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
+ (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
+ if (cpu_idle_cpr_callb_id == (callb_id_t)0)
+ cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
+ (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
+ mutex_exit(&cpu_idle_callb_mutex);
+ }
+
+ return (0);
+}
+
+/*
+ * Free resources allocated by cpu_idle_init().
+ */
+static void
+cpu_idle_fini(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpu_acpi_cstate_t *cstate;
+ uint_t cpu_max_cstates, i;
+
+ /*
+	 * Point the idle routine back at the generic one.
+ */
+ idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+
+ cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+ if (cstate) {
+ cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
+
+ for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
+ if (cstate->cs_ksp != NULL)
+ kstat_delete(cstate->cs_ksp);
+ cstate++;
+ }
+ }
+
+ cpupm_free_ms_cstate(cp);
+ cpupm_free_domains(&cpupm_cstate_domains);
+ cpu_acpi_free_cstate_data(handle);
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ if (cpu_deep_idle_callb_id != (callb_id_t)0) {
+ (void) callb_delete(cpu_deep_idle_callb_id);
+ cpu_deep_idle_callb_id = (callb_id_t)0;
+ }
+ if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
+ (void) callb_delete(cpu_idle_cpr_callb_id);
+ cpu_idle_cpr_callb_id = (callb_id_t)0;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+}
+
+/*ARGSUSED*/
+static boolean_t
+cpu_deep_idle_callb(void *arg, int code)
+{
+ boolean_t rslt = B_TRUE;
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*
+ * Default policy is same as enable
+ */
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
+ break;
+
+ if (hpet.callback(PM_ENABLE_CPU_DEEP_IDLE)) {
+ disp_enq_thread = cstate_wakeup;
+ idle_cpu = cpu_idle_adaptive;
+ cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
+ } else {
+ rslt = B_FALSE;
+ }
+ break;
+
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
+ break;
+
+ idle_cpu = non_deep_idle_cpu;
+ if (hpet.callback(PM_DISABLE_CPU_DEEP_IDLE)) {
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
+ }
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
+ code);
+ break;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+ return (rslt);
+}
+
+/*ARGSUSED*/
+static boolean_t
+cpu_idle_cpr_callb(void *arg, int code)
+{
+ boolean_t rslt = B_TRUE;
+
+ mutex_enter(&cpu_idle_callb_mutex);
+ switch (code) {
+ case CB_CODE_CPR_RESUME:
+ if (hpet.callback(CB_CODE_CPR_RESUME)) {
+ /*
+ * Do not enable dispatcher hooks if disabled by user.
+ */
+ if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
+ break;
+
+ disp_enq_thread = cstate_wakeup;
+ idle_cpu = cpu_idle_adaptive;
+ } else {
+ rslt = B_FALSE;
+ }
+ break;
+
+ case CB_CODE_CPR_CHKPT:
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ hpet.callback(CB_CODE_CPR_CHKPT);
+ break;
+
+ default:
+ cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
+ break;
+ }
+ mutex_exit(&cpu_idle_callb_mutex);
+ return (rslt);
+}
+
+/*
+ * Handle a _CST change notification.
+ */
+void
+cpuidle_cstate_instance(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle;
+ struct machcpu *mcpu;
+ cpuset_t dom_cpu_set;
+ kmutex_t *pm_lock;
+ int result = 0;
+ processorid_t cpu_id;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ ASSERT(mach_state->ms_cstate.cma_domain != NULL);
+ dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
+ pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
+
+ /*
+	 * Do this for all the CPUs in the domain.
+ */
+ mutex_enter(pm_lock);
+ do {
+ CPUSET_FIND(dom_cpu_set, cpu_id);
+ if (cpu_id == CPUSET_NOTINSET)
+ break;
+
+ ASSERT(cpu_id >= 0 && cpu_id < NCPU);
+ cp = cpu[cpu_id];
+ mach_state = (cpupm_mach_state_t *)
+ cp->cpu_m.mcpu_pm_mach_state;
+ if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
+ mutex_exit(pm_lock);
+ return;
+ }
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ /*
+ * re-evaluate cstate object
+ */
+ if (cpu_acpi_cache_cstate_data(handle) != 0) {
+ cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
+ " object Instance: %d", cpu_id);
+ }
+ mutex_enter(&cpu_lock);
+ mcpu = &(cp->cpu_m);
+ mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
+ if (mcpu->max_cstates > CPU_ACPI_C1) {
+ hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
+ disp_enq_thread = cstate_wakeup;
+ cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
+ cpuidle_set_cstate_latency(cp);
+ } else if (mcpu->max_cstates == CPU_ACPI_C1) {
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
+ hpet.callback(CST_EVENT_ONE_CSTATE);
+ }
+ mutex_exit(&cpu_lock);
+
+ CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
+ mutex_exit(pm_lock);
+ } while (result < 0);
+#endif
+}
+
+/*
+ * Handle a change in the number or type of available processor power states.
+ */
+void
+cpuidle_manage_cstates(void *ctx)
+{
+ cpu_t *cp = ctx;
+ processorid_t cpu_id = cp->cpu_id;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ boolean_t is_ready;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ /*
+ * We currently refuse to power manage if the CPU is not ready to
+ * take cross calls (cross calls fail silently if CPU is not ready
+ * for it).
+ *
+ * Additionally, for x86 platforms we cannot power manage
+ * any one instance, until all instances have been initialized.
+ * That's because we don't know what the CPU domains look like
+ * until all instances have been initialized.
+ */
+ is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready();
+ if (!is_ready)
+ return;
+
+ cpuidle_cstate_instance(cp);
+}
+
+static void
+cpuidle_set_cstate_latency(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle;
+ cpu_acpi_cstate_t *acpi_cstates;
+ cma_c_state_t *cpupm_cdata;
+ uint32_t i, cnt;
+
+ cpupm_cdata = mach_state->ms_cstate.cma_state.cstate;
+
+ ASSERT(cpupm_cdata != 0);
+ ASSERT(mach_state != NULL);
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ cnt = CPU_ACPI_CSTATES_COUNT(handle);
+ acpi_cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
+
+ cpupm_cdata->cs_C2_latency = CPU_CSTATE_LATENCY_UNDEF;
+ cpupm_cdata->cs_C3_latency = CPU_CSTATE_LATENCY_UNDEF;
+
+ for (i = 1; i <= cnt; ++i, ++acpi_cstates) {
+ if ((cpupm_cdata->cs_C2_latency == CPU_CSTATE_LATENCY_UNDEF) &&
+ (acpi_cstates->cs_type == CPU_ACPI_C2))
+ cpupm_cdata->cs_C2_latency = acpi_cstates->cs_latency;
+
+ if ((cpupm_cdata->cs_C3_latency == CPU_CSTATE_LATENCY_UNDEF) &&
+ (acpi_cstates->cs_type == CPU_ACPI_C3))
+ cpupm_cdata->cs_C3_latency = acpi_cstates->cs_latency;
+ }
+}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c b/usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
index 21dd88980c..086d9a8fe6 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_amd.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,14 +28,14 @@
*/
#include <sys/x86_archext.h>
-#include <sys/cpudrv_mach.h>
#include <sys/cpu_acpi.h>
#include <sys/pwrnow.h>
boolean_t
-cpudrv_amd_init(cpudrv_devstate_t *cpudsp)
+cpupm_amd_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
/* AMD? */
if (x86_vendor != X86_VENDOR_AMD)
@@ -43,9 +43,10 @@ cpudrv_amd_init(cpudrv_devstate_t *cpudsp)
/*
* If we support PowerNow! on this processor, then set the
- * correct pstate_ops for the processor.
+ * correct cma_ops for the processor.
*/
- mach_state->cpupm_pstate_ops = pwrnow_supported() ? &pwrnow_ops : NULL;
+ mach_state->ms_pstate.cma_ops = pwrnow_supported() ?
+ &pwrnow_ops : NULL;
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
index 8fed6f6a4e..dbd05d4198 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c
@@ -28,31 +28,34 @@
*/
#include <sys/x86_archext.h>
-#include <sys/cpudrv_mach.h>
#include <sys/cpu_acpi.h>
#include <sys/speedstep.h>
-#include <sys/cpudrv_throttle.h>
+#include <sys/cpupm_throttle.h>
+#include <sys/cpu_idle.h>
/*
* The Intel Processor Driver Capabilities (_PDC).
* See Intel Processor Vendor-Specific ACPI Interface Specification
* for details.
*/
-#define CPUDRV_INTEL_PDC_REVISION 0x1
-#define CPUDRV_INTEL_PDC_PS_MSR 0x0001
-#define CPUDRV_INTEL_PDC_C1_HALT 0x0002
-#define CPUDRV_INTEL_PDC_TS_MSR 0x0004
-#define CPUDRV_INTEL_PDC_MP 0x0008
-#define CPUDRV_INTEL_PDC_SW_PSD 0x0020
-#define CPUDRV_INTEL_PDC_TSD 0x0080
-#define CPUDRV_INTEL_PDC_HW_PSD 0x0800
+#define CPUPM_INTEL_PDC_REVISION 0x1
+#define CPUPM_INTEL_PDC_PS_MSR 0x0001
+#define CPUPM_INTEL_PDC_C1_HALT 0x0002
+#define CPUPM_INTEL_PDC_TS_MSR 0x0004
+#define CPUPM_INTEL_PDC_MP 0x0008
+#define CPUPM_INTEL_PDC_C2C3_MP 0x0010
+#define CPUPM_INTEL_PDC_SW_PSD 0x0020
+#define CPUPM_INTEL_PDC_TSD 0x0080
+#define CPUPM_INTEL_PDC_C1_FFH 0x0100
+#define CPUPM_INTEL_PDC_HW_PSD 0x0800
-static uint32_t cpudrv_intel_pdccap = 0;
+static uint32_t cpupm_intel_pdccap = 0;
boolean_t
-cpudrv_intel_init(cpudrv_devstate_t *cpudsp)
+cpupm_intel_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
uint_t family;
uint_t model;
@@ -62,34 +65,45 @@ cpudrv_intel_init(cpudrv_devstate_t *cpudsp)
family = cpuid_getfamily(CPU);
model = cpuid_getmodel(CPU);
+ cpupm_intel_pdccap = CPUPM_INTEL_PDC_MP;
+
/*
* If we support SpeedStep on this processor, then set the
- * correct pstate_ops for the processor and enable appropriate
+ * correct cma_ops for the processor and enable appropriate
* _PDC bits.
*/
if (speedstep_supported(family, model)) {
- mach_state->cpupm_pstate_ops = &speedstep_ops;
- cpudrv_intel_pdccap = CPUDRV_INTEL_PDC_PS_MSR |
- CPUDRV_INTEL_PDC_C1_HALT | CPUDRV_INTEL_PDC_MP |
- CPUDRV_INTEL_PDC_SW_PSD | CPUDRV_INTEL_PDC_HW_PSD;
+ mach_state->ms_pstate.cma_ops = &speedstep_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_PS_MSR |
+ CPUPM_INTEL_PDC_C1_HALT | CPUPM_INTEL_PDC_SW_PSD |
+ CPUPM_INTEL_PDC_HW_PSD;
} else {
- mach_state->cpupm_pstate_ops = NULL;
+ mach_state->ms_pstate.cma_ops = NULL;
}
/*
* Set the correct tstate_ops for the processor and
* enable appropriate _PDC bits.
*/
- mach_state->cpupm_tstate_ops = &cpudrv_throttle_ops;
- cpudrv_intel_pdccap |= CPUDRV_INTEL_PDC_TS_MSR |
- CPUDRV_INTEL_PDC_TSD;
+ mach_state->ms_tstate.cma_ops = &cpupm_throttle_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_TS_MSR |
+ CPUPM_INTEL_PDC_TSD;
+
+ /*
+ * If we support deep cstates on this processor, then set the
+ * correct cstate_ops for the processor and enable appropriate
+ * _PDC bits.
+ */
+ mach_state->ms_cstate.cma_ops = &cpu_idle_ops;
+ cpupm_intel_pdccap |= CPUPM_INTEL_PDC_C1_HALT |
+ CPUPM_INTEL_PDC_C2C3_MP | CPUPM_INTEL_PDC_C1_FFH;
/*
* _PDC support is optional and the driver should
* function even if the _PDC write fails.
*/
- (void) cpu_acpi_write_pdc(mach_state->acpi_handle,
- CPUDRV_INTEL_PDC_REVISION, 1, &cpudrv_intel_pdccap);
+ (void) cpu_acpi_write_pdc(mach_state->ms_acpi_handle,
+ CPUPM_INTEL_PDC_REVISION, 1, &cpupm_intel_pdccap);
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
new file mode 100644
index 0000000000..d7d9cb7221
--- /dev/null
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c
@@ -0,0 +1,928 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+#include <sys/x86_archext.h>
+#include <sys/sdt.h>
+#include <sys/spl.h>
+#include <sys/machsystm.h>
+#include <sys/hpet.h>
+#include <sys/cpupm.h>
+#include <sys/cpu_idle.h>
+#include <sys/cpu_acpi.h>
+#include <sys/cpupm_throttle.h>
+
+/*
+ * This callback is used to build the PPM CPU domains once
+ * all the CPU devices have been started. The callback is
+ * initialized by the PPM driver to point to a routine that
+ * will build the domains.
+ */
+void (*cpupm_rebuild_cpu_domains)(void);
+
+/*
+ * This callback is used to reset the topspeed for all the
+ * CPU devices. The callback is initialized by the PPM driver to
+ * point to a routine that will reinitialize all the CPU devices
+ * once all the CPU devices have been started and the CPU domains
+ * built.
+ */
+void (*cpupm_init_topspeed)(void);
+
+/*
+ * This callback is used to redefine the topspeed for a CPU device.
+ * Since all CPUs in a domain should have identical properties, this
+ * callback is initialized by the PPM driver to point to a routine
+ * that will redefine the topspeed for all devices in a CPU domain.
+ * This callback is exercised whenever an ACPI _PPC change notification
+ * is received by the CPU driver.
+ */
+void (*cpupm_redefine_topspeed)(void *);
+
+/*
+ * This callback is used by the PPM driver to call into the CPU driver
+ * to set a new topspeed for a CPU.
+ */
+void (*cpupm_set_topspeed_callb)(void *, int);
+
+/*
+ * This callback is used by the PPM driver to call into the CPU driver
+ * to find a CPU's current topspeed (i.e., its current ACPI _PPC value).
+ */
+int (*cpupm_get_topspeed_callb)(void *);
+
+static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *);
+static void cpupm_free_notify_handlers(cpu_t *);
+
+/*
+ * Until proven otherwise, all power states are manageable.
+ */
+static uint32_t cpupm_enabled = CPUPM_ALL_STATES;
+
+/*
+ * Until all CPUs have started, we do not allow
+ * power management.
+ */
+static boolean_t cpupm_ready = B_FALSE;
+
+cpupm_state_domains_t *cpupm_pstate_domains = NULL;
+cpupm_state_domains_t *cpupm_tstate_domains = NULL;
+cpupm_state_domains_t *cpupm_cstate_domains = NULL;
+
+/*
+ * c-state tunables
+ *
+ * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle
+ * divided by time spent in the idle state transitions.
+ * A value of 10 means the CPU will not spend more than 1/10 of its time
+ * in idle latency. The worst case performance will be 90% of non Deep C-state
+ * kernel.
+ *
+ * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state
+ * before it is worth going there. Expressed as a multiple of latency.
+ */
+uint32_t cpupm_cs_sample_tunable = 5; /* samples in decision period */
+uint32_t cpupm_cs_idle_cost_tunable = 10; /* work time / latency cost */
+uint32_t cpupm_cs_idle_save_tunable = 2; /* idle power savings */
+uint16_t cpupm_C2_idle_pct_tunable = 70;
+uint16_t cpupm_C3_idle_pct_tunable = 80;
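+
+/*
+ * Rough illustration of the defaults above: a C-state with a worst-case
+ * entry/exit latency of 100us only becomes attractive once the predicted
+ * idle period reaches 2 * 100us = 200us (cpupm_cs_idle_save_tunable), and
+ * it is avoided whenever idle-state transitions would consume more than
+ * 1/10 of the CPU's time (cpupm_cs_idle_cost_tunable).
+ */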
+
+#ifndef __xpv
+extern boolean_t cpupm_intel_init(cpu_t *);
+extern boolean_t cpupm_amd_init(cpu_t *);
+
+typedef struct cpupm_vendor {
+ boolean_t (*cpuv_init)(cpu_t *);
+} cpupm_vendor_t;
+
+/*
+ * Table of supported vendors.
+ */
+static cpupm_vendor_t cpupm_vendors[] = {
+ cpupm_intel_init,
+ cpupm_amd_init,
+ NULL
+};
+#endif
+
+/*
+ * Initialize the machine.
+ * See if a module exists for managing power for this CPU.
+ */
+/*ARGSUSED*/
+void
+cpupm_init(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_vendor_t *vendors;
+ cpupm_mach_state_t *mach_state;
+ struct machcpu *mcpu = &(cp->cpu_m);
+ int *speeds;
+ uint_t nspeeds;
+ int ret;
+
+ cpupm_set_supp_freqs(cp, NULL, 1);
+
+ mach_state = cp->cpu_m.mcpu_pm_mach_state =
+ kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP);
+ mach_state->ms_caps = CPUPM_NO_STATES;
+ mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL);
+
+ mach_state->ms_acpi_handle = cpu_acpi_init(cp);
+ if (mach_state->ms_acpi_handle == NULL) {
+ cpupm_free(cp);
+ cmn_err(CE_WARN, "!cpupm_init: processor %d: "
+ "unable to get ACPI handle", cp->cpu_id);
+ cmn_err(CE_NOTE, "!CPU power management will not function.");
+ CPUPM_DISABLE();
+ return;
+ }
+
+ /*
+ * Loop through the CPU management module table and see if
+ * any of the modules implement CPU power management
+ * for this CPU.
+ */
+ for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) {
+ if (vendors->cpuv_init(cp))
+ break;
+ }
+
+ /*
+ * Nope, we can't power manage this CPU.
+ */
+	if (vendors->cpuv_init == NULL) {
+ cpupm_free(cp);
+ CPUPM_DISABLE();
+ return;
+ }
+
+ /*
+ * If P-state support exists for this system, then initialize it.
+ */
+ if (mach_state->ms_pstate.cma_ops != NULL) {
+ ret = mach_state->ms_pstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize P-state support",
+ cp->cpu_id);
+ mach_state->ms_pstate.cma_ops = NULL;
+ cpupm_disable(CPUPM_P_STATES);
+ } else {
+ nspeeds = cpupm_get_speeds(cp, &speeds);
+ if (nspeeds == 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " no speeds to manage", cp->cpu_id);
+ } else {
+ cpupm_set_supp_freqs(cp, speeds, nspeeds);
+ cpupm_free_speeds(speeds, nspeeds);
+ mach_state->ms_caps |= CPUPM_P_STATES;
+ }
+ }
+ }
+
+ if (mach_state->ms_tstate.cma_ops != NULL) {
+ ret = mach_state->ms_tstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize T-state support",
+ cp->cpu_id);
+ mach_state->ms_tstate.cma_ops = NULL;
+ cpupm_disable(CPUPM_T_STATES);
+ } else {
+ mach_state->ms_caps |= CPUPM_T_STATES;
+ }
+ }
+
+ /*
+ * If C-states support exists for this system, then initialize it.
+ */
+ if (mach_state->ms_cstate.cma_ops != NULL) {
+ ret = mach_state->ms_cstate.cma_ops->cpus_init(cp);
+ if (ret != 0) {
+ cmn_err(CE_WARN, "!cpupm_init: processor %d:"
+ " unable to initialize C-state support",
+ cp->cpu_id);
+ mach_state->ms_cstate.cma_ops = NULL;
+ mcpu->max_cstates = CPU_ACPI_C1;
+ cpupm_disable(CPUPM_C_STATES);
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ } else if (cpu_deep_cstates_supported()) {
+ mcpu->max_cstates = cpu_acpi_get_max_cstates(
+ mach_state->ms_acpi_handle);
+ if (mcpu->max_cstates > CPU_ACPI_C1) {
+ hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
+ CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
+ mcpu->mcpu_idle_type = CPU_ACPI_C1;
+ disp_enq_thread = cstate_wakeup;
+ } else {
+ hpet.callback(CST_EVENT_ONE_CSTATE);
+ }
+ mach_state->ms_caps |= CPUPM_C_STATES;
+ } else {
+ mcpu->max_cstates = CPU_ACPI_C1;
+ idle_cpu = non_deep_idle_cpu;
+ disp_enq_thread = non_deep_idle_disp_enq_thread;
+ }
+ }
+
+
+ if (mach_state->ms_caps == CPUPM_NO_STATES) {
+ cpupm_free(cp);
+ CPUPM_DISABLE();
+ return;
+ }
+
+ if ((mach_state->ms_caps & CPUPM_T_STATES) ||
+ (mach_state->ms_caps & CPUPM_P_STATES) ||
+ (mach_state->ms_caps & CPUPM_C_STATES))
+ cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp);
+#endif
+}
+
+/*
+ * Free any resources allocated by cpupm_init().
+ */
+/*ARGSUSED*/
+void
+cpupm_free(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+
+ if (mach_state == NULL)
+ return;
+ if (mach_state->ms_pstate.cma_ops != NULL) {
+ mach_state->ms_pstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_pstate.cma_ops = NULL;
+ }
+
+ if (mach_state->ms_tstate.cma_ops != NULL) {
+ mach_state->ms_tstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_tstate.cma_ops = NULL;
+ }
+
+ if (mach_state->ms_cstate.cma_ops != NULL) {
+ mach_state->ms_cstate.cma_ops->cpus_fini(cp);
+ mach_state->ms_cstate.cma_ops = NULL;
+ }
+
+ cpupm_free_notify_handlers(cp);
+
+ if (mach_state->ms_acpi_handle != NULL) {
+ cpu_acpi_fini(mach_state->ms_acpi_handle);
+ mach_state->ms_acpi_handle = NULL;
+ }
+
+ mutex_destroy(&mach_state->ms_lock);
+ kmem_free(mach_state, sizeof (cpupm_mach_state_t));
+ cp->cpu_m.mcpu_pm_mach_state = NULL;
+#endif
+}
+
+/*
+ * If all CPUs have started and at least one power state is manageable,
+ * then the CPUs are ready for power management.
+ */
+boolean_t
+cpupm_is_ready()
+{
+#ifndef __xpv
+ if (cpupm_enabled == CPUPM_NO_STATES)
+ return (B_FALSE);
+ return (cpupm_ready);
+#else
+ return (B_FALSE);
+#endif
+
+}
+
+boolean_t
+cpupm_is_enabled(uint32_t state)
+{
+ return ((cpupm_enabled & state) == state);
+}
+
+/*
+ * By default, all states are enabled.
+ */
+void
+cpupm_disable(uint32_t state)
+{
+
+ if (state & CPUPM_P_STATES) {
+ cpupm_free_domains(&cpupm_pstate_domains);
+ }
+ if (state & CPUPM_T_STATES) {
+ cpupm_free_domains(&cpupm_tstate_domains);
+ }
+ if (state & CPUPM_C_STATES) {
+ cpupm_free_domains(&cpupm_cstate_domains);
+ }
+ cpupm_enabled &= ~state;
+}
+
+/*
+ * Once all CPUs have been started, the PPM driver should build CPU
+ * domains and initialize the topspeed for all CPU devices.
+ */
+void
+cpupm_post_startup()
+{
+#ifndef __xpv
+	/*
+	 * The CPU domains built by the PPM while the CPUs were attaching
+	 * should be rebuilt with the information retrieved from ACPI.
+	 */
+ if (cpupm_rebuild_cpu_domains != NULL)
+ (*cpupm_rebuild_cpu_domains)();
+
+ /*
+ * Only initialize the topspeed if P-states are enabled.
+ */
+ if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL)
+ (*cpupm_init_topspeed)();
+#endif
+ cpupm_ready = B_TRUE;
+}
+
+/*
+ * Allocate power domains for C-, P- and T-states
+ */
+void
+cpupm_alloc_domains(cpu_t *cp, int state)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
+ cpupm_state_domains_t **dom_ptr;
+ cpupm_state_domains_t *dptr;
+ cpupm_state_domains_t **mach_dom_state_ptr;
+ uint32_t domain;
+ uint32_t type;
+
+ switch (state) {
+ case CPUPM_P_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) {
+ domain = CPU_ACPI_PSD(handle).sd_domain;
+ type = CPU_ACPI_PSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_chipid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_pstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain;
+ break;
+ case CPUPM_T_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
+ domain = CPU_ACPI_TSD(handle).sd_domain;
+ type = CPU_ACPI_TSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_chipid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_tstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain;
+ break;
+ case CPUPM_C_STATES:
+ if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) {
+ domain = CPU_ACPI_CSD(handle).sd_domain;
+ type = CPU_ACPI_CSD(handle).sd_type;
+ } else {
+ mutex_enter(&cpu_lock);
+ domain = cpuid_get_coreid(cp);
+ mutex_exit(&cpu_lock);
+ type = CPU_ACPI_HW_ALL;
+ }
+ dom_ptr = &cpupm_cstate_domains;
+ mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain;
+ break;
+ default:
+ return;
+ }
+
+ for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) {
+ if (dptr->pm_domain == domain)
+ break;
+ }
+
+ /* new domain is created and linked at the head */
+ if (dptr == NULL) {
+ dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP);
+ dptr->pm_domain = domain;
+ dptr->pm_type = type;
+ dptr->pm_next = *dom_ptr;
+ mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN,
+ (void *)ipltospl(DISP_LEVEL));
+ CPUSET_ZERO(dptr->pm_cpus);
+ *dom_ptr = dptr;
+ }
+ CPUSET_ADD(dptr->pm_cpus, cp->cpu_id);
+ *mach_dom_state_ptr = dptr;
+}
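A hedged illustration of the fallback path above (no cached _PSD object): two CPUs that report the same chip id land in the same P-state domain, so a later cpupm_state_change() on either one covers both. cpu0 and cpu1 are hypothetical cpu_t pointers:

	cpupm_alloc_domains(cpu0, CPUPM_P_STATES);	/* creates the domain, adds cpu0 */
	cpupm_alloc_domains(cpu1, CPUPM_P_STATES);	/* same chipid: reuses it, adds cpu1 */
	/* cpupm_pstate_domains now holds one node with both ids in pm_cpus */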
+
+/*
+ * Free C, P or T state power domains
+ */
+void
+cpupm_free_domains(cpupm_state_domains_t **dom_ptr)
+{
+ cpupm_state_domains_t *this_domain, *next_domain;
+
+ this_domain = *dom_ptr;
+ while (this_domain != NULL) {
+ next_domain = this_domain->pm_next;
+ mutex_destroy(&this_domain->pm_lock);
+ kmem_free((void *)this_domain,
+ sizeof (cpupm_state_domains_t));
+ this_domain = next_domain;
+ }
+ *dom_ptr = NULL;
+}
+
+void
+cpupm_alloc_ms_cstate(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state;
+ cpupm_mach_acpi_state_t *ms_cstate;
+
+ mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ ms_cstate = &mach_state->ms_cstate;
+ ASSERT(ms_cstate->cma_state.cstate == NULL);
+ ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t),
+ KM_SLEEP);
+ ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1;
+}
+
+void
+cpupm_free_ms_cstate(cpu_t *cp)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate;
+
+ if (ms_cstate->cma_state.cstate != NULL) {
+ kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t));
+ ms_cstate->cma_state.cstate = NULL;
+ }
+}
+
+void
+cpupm_state_change(cpu_t *cp, int level, int state)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_state_ops_t *state_ops;
+ cpupm_state_domains_t *state_domain;
+ cpuset_t set;
+
+ DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level);
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ switch (state) {
+ case CPUPM_P_STATES:
+ state_ops = mach_state->ms_pstate.cma_ops;
+ state_domain = mach_state->ms_pstate.cma_domain;
+ break;
+ case CPUPM_T_STATES:
+ state_ops = mach_state->ms_tstate.cma_ops;
+ state_domain = mach_state->ms_tstate.cma_domain;
+ break;
+	default:
+		return;
+ }
+
+ switch (state_domain->pm_type) {
+ case CPU_ACPI_SW_ANY:
+ /*
+ * A request on any CPU in the domain transitions the domain
+ */
+ CPUSET_ONLY(set, cp->cpu_id);
+ state_ops->cpus_change(set, level);
+ break;
+ case CPU_ACPI_SW_ALL:
+ /*
+ * All CPUs in the domain must request the transition
+ */
+ case CPU_ACPI_HW_ALL:
+		/*
+		 * P/T-state transitions are coordinated by the hardware.
+		 * For now, request the transition on all CPUs in the domain,
+		 * but looking ahead we can probably be smarter about this.
+		 */
+ mutex_enter(&state_domain->pm_lock);
+ state_ops->cpus_change(state_domain->pm_cpus, level);
+ mutex_exit(&state_domain->pm_lock);
+ break;
+ default:
+ cmn_err(CE_WARN, "Unknown domain coordination type: %d",
+ state_domain->pm_type);
+ }
+}
+
+/*
+ * CPU PM interfaces exposed to the CPU power manager
+ */
+/*ARGSUSED*/
+id_t
+cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type)
+{
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+
+ if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) &&
+ !cpupm_is_enabled(CPUPM_C_STATES))) {
+ return (CPUPM_NO_DOMAIN);
+ }
+ if (type == CPUPM_DTYPE_ACTIVE) {
+ /*
+ * Return P-State domain for the specified CPU
+ */
+ if (mach_state->ms_pstate.cma_domain) {
+ return (mach_state->ms_pstate.cma_domain->pm_domain);
+ }
+ } else if (type == CPUPM_DTYPE_IDLE) {
+ /*
+ * Return C-State domain for the specified CPU
+ */
+ if (mach_state->ms_cstate.cma_domain) {
+ return (mach_state->ms_cstate.cma_domain->pm_domain);
+ }
+ }
+ return (CPUPM_NO_DOMAIN);
+}
+
+/*ARGSUSED*/
+uint_t
+cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type,
+ cpupm_state_t *states)
+{
+ int *speeds;
+ uint_t nspeeds, i;
+
+ /*
+ * Idle domain support unimplemented
+ */
+ if (type != CPUPM_DTYPE_ACTIVE) {
+ return (0);
+ }
+ nspeeds = cpupm_get_speeds(cp, &speeds);
+
+ /*
+ * If the caller passes NULL for states, just return the
+ * number of states.
+ */
+ if (states != NULL) {
+ for (i = 0; i < nspeeds; i++) {
+ states[i].cps_speed = speeds[i];
+ states[i].cps_handle = (cpupm_handle_t)i;
+ }
+ }
+ cpupm_free_speeds(speeds, nspeeds);
+ return (nspeeds);
+}
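A hedged sketch of the sizing pattern this interface supports (the wrapper name is hypothetical): call once with NULL to learn the count, allocate, then call again to fill the array:

	static uint_t
	enumerate_active_states(cpu_t *cp, cpupm_state_t **statesp)
	{
		uint_t n;

		n = cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, NULL);
		if (n == 0)
			return (0);
		*statesp = kmem_zalloc(n * sizeof (cpupm_state_t), KM_SLEEP);
		(void) cpupm_plat_state_enumerate(cp, CPUPM_DTYPE_ACTIVE, *statesp);
		return (n);
	}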
+
+/*ARGSUSED*/
+int
+cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state)
+{
+ if (!cpupm_is_ready())
+ return (-1);
+
+ cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+/*
+ * Note: It is the responsibility of the users of
+ * cpupm_get_speeds() to free the memory allocated
+ * for speeds using cpupm_free_speeds()
+ */
+uint_t
+cpupm_get_speeds(cpu_t *cp, int **speeds)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds));
+#else
+ return (0);
+#endif
+}
+
+/*ARGSUSED*/
+void
+cpupm_free_speeds(int *speeds, uint_t nspeeds)
+{
+#ifndef __xpv
+ cpu_acpi_free_speeds(speeds, nspeeds);
+#endif
+}
+
+/*
+ * P-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_power_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready());
+}
+
+/*
+ * T-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_throttle_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready());
+}
+
+/*
+ * C-states are enabled and all CPU instances have been initialized
+ * successfully.
+ */
+boolean_t
+cpupm_cstate_ready(void)
+{
+ return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready());
+}
+
+void
+cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+ cpu_t *cp = ctx;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpupm_notification_t *entry;
+
+ mutex_enter(&mach_state->ms_lock);
+ for (entry = mach_state->ms_handlers; entry != NULL;
+ entry = entry->nq_next) {
+ entry->nq_handler(obj, val, entry->nq_ctx);
+ }
+ mutex_exit(&mach_state->ms_lock);
+}
+
+/*ARGSUSED*/
+void
+cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpupm_notification_t *entry;
+
+ entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP);
+ entry->nq_handler = handler;
+ entry->nq_ctx = ctx;
+ mutex_enter(&mach_state->ms_lock);
+ if (mach_state->ms_handlers == NULL) {
+ entry->nq_next = NULL;
+ mach_state->ms_handlers = entry;
+ cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle,
+ cpupm_notify_handler, cp);
+
+ } else {
+ entry->nq_next = mach_state->ms_handlers;
+ mach_state->ms_handlers = entry;
+ }
+ mutex_exit(&mach_state->ms_lock);
+#endif
+}
+
+/*ARGSUSED*/
+static void
+cpupm_free_notify_handlers(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpupm_notification_t *entry;
+ cpupm_notification_t *next;
+
+ mutex_enter(&mach_state->ms_lock);
+ if (mach_state->ms_handlers == NULL) {
+ mutex_exit(&mach_state->ms_lock);
+ return;
+ }
+ if (mach_state->ms_acpi_handle != NULL) {
+ cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle,
+ cpupm_notify_handler);
+ }
+ entry = mach_state->ms_handlers;
+ while (entry != NULL) {
+ next = entry->nq_next;
+ kmem_free(entry, sizeof (cpupm_notification_t));
+ entry = next;
+ }
+ mach_state->ms_handlers = NULL;
+ mutex_exit(&mach_state->ms_lock);
+#endif
+}
+
+/*
+ * Get the current max speed from the ACPI _PPC object
+ */
+/*ARGSUSED*/
+int
+cpupm_get_top_speed(cpu_t *cp)
+{
+#ifndef __xpv
+ cpupm_mach_state_t *mach_state;
+ cpu_acpi_handle_t handle;
+ int plat_level;
+ uint_t nspeeds;
+ int max_level;
+
+ mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ handle = mach_state->ms_acpi_handle;
+
+ cpu_acpi_cache_ppc(handle);
+ plat_level = CPU_ACPI_PPC(handle);
+
+ nspeeds = CPU_ACPI_PSTATES_COUNT(handle);
+
+ max_level = nspeeds - 1;
+ if ((plat_level < 0) || (plat_level > max_level)) {
+ cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: "
+ "_PPC out of range %d", cp->cpu_id, plat_level);
+ plat_level = 0;
+ }
+
+ return (plat_level);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * This notification handler is called whenever the ACPI _PPC
+ * object changes. The _PPC is a sort of governor on power levels.
+ * It sets an upper threshold on which _PSS-defined power levels
+ * are usable. The _PPC value is dynamic and may change as properties
+ * (e.g., thermal conditions or AC power source) of the system change.
+ */
+
+static void
+cpupm_power_manage_notifications(void *ctx)
+{
+ cpu_t *cp = ctx;
+ int top_speed;
+
+ top_speed = cpupm_get_top_speed(cp);
+ cpupm_redefine_max_activepwr_state(cp, top_speed);
+}
+
+/* ARGSUSED */
+static void
+cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx)
+{
+#ifndef __xpv
+ /*
+ * Currently, we handle _TPC,_CST and _PPC change notifications.
+ */
+ if (val == CPUPM_TPC_CHANGE_NOTIFICATION) {
+ cpupm_throttle_manage_notification(ctx);
+ } else if (val == CPUPM_CST_CHANGE_NOTIFICATION) {
+ cpuidle_manage_cstates(ctx);
+ } else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) {
+ cpupm_power_manage_notifications(ctx);
+ }
+#endif
+}
+
+/*
+ * Update cpupm cstate data each time CPU exits idle.
+ */
+void
+cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end)
+{
+ cs_data->cs_idle_exit = end;
+}
+
+/*
+ * Determine the next cstate based on cpupm data.
+ * Update cpupm cstate data each time the CPU goes idle.
+ * Do as much as possible in the idle-state bookkeeping function because the
+ * performance impact while idle is minimal compared to the wakeup function,
+ * where there is real work to do.
+ */
+uint32_t
+cpupm_next_cstate(cma_c_state_t *cs_data, hrtime_t start)
+{
+ hrtime_t duration;
+ hrtime_t ave_interval;
+ hrtime_t ave_idle_time;
+
+ duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter;
+ scalehrtime(&duration);
+ cs_data->cs_idle += duration;
+ cs_data->cs_idle_enter = start;
+
+ ++cs_data->cs_cnt;
+ if (cs_data->cs_cnt > cpupm_cs_sample_tunable) {
+ cs_data->cs_smpl_len = start - cs_data->cs_smpl_start;
+ scalehrtime(&cs_data->cs_smpl_len);
+ cs_data->cs_smpl_len |= 1; /* protect from DIV 0 */
+ cs_data->cs_smpl_idle = cs_data->cs_idle;
+ cs_data->cs_idle = 0;
+ cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) /
+ cs_data->cs_smpl_len);
+
+ cs_data->cs_smpl_start = start;
+ cs_data->cs_cnt = 0;
+
+ /*
+ * Strand level C-state policy
+ */
+ cs_data->cs_next_cstate = CPU_ACPI_C3;
+
+ /*
+ * Will CPU be idle long enough to save power?
+ */
+ ave_idle_time = (cs_data->cs_smpl_idle /
+ cpupm_cs_sample_tunable) / 1000;
+ if (ave_idle_time < (cs_data->cs_C2_latency *
+ cpupm_cs_idle_save_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 1);
+ return (cs_data->cs_next_cstate);
+ } else if (ave_idle_time < (cs_data->cs_C3_latency *
+ cpupm_cs_idle_save_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 2);
+ }
+
+ /*
+ * Wakeup often (even when non-idle time is very short)?
+ * Some producer/consumer type loads fall into this category.
+ */
+ ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable)
+ / 1000;
+ if (ave_interval <=
+ (cs_data->cs_C2_latency * cpupm_cs_idle_cost_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 3);
+ return (cs_data->cs_next_cstate);
+ } else if (ave_interval <=
+ (cs_data->cs_C3_latency * cpupm_cs_idle_cost_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 4);
+ }
+
+ /*
+ * Idle percent
+ */
+ if (cs_data->cs_smpl_idle_pct < cpupm_C2_idle_pct_tunable) {
+ cs_data->cs_next_cstate = CPU_ACPI_C1;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 5);
+ return (cs_data->cs_next_cstate);
+ } else if ((cs_data->cs_next_cstate > CPU_ACPI_C2) &&
+ (cs_data->cs_smpl_idle_pct < cpupm_C3_idle_pct_tunable)) {
+ cs_data->cs_next_cstate = CPU_ACPI_C2;
+ DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU,
+ int, 6);
+ }
+ }
+
+ return (cs_data->cs_next_cstate);
+}
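A hedged numeric walk-through of the policy above, using assumed latencies (real values come from ACPI _CST): five idle periods totalling 4 ms over a 5 ms sample window, with cs_C2_latency = 10 us and cs_C3_latency = 100 us:

	ave_idle_time = (4000000 / cpupm_cs_sample_tunable) / 1000;	/* 800 us */
	/* 800 >= 2 * 10 and 800 >= 2 * 100, so C3 survives the idle-save test */
	ave_interval = (5000000 / cpupm_cs_sample_tunable) / 1000;	/* 1000 us */
	/* 1000 > 10 * 10 but 1000 <= 10 * 100, so the policy demotes to C2 */
	/* idle percent is 100 * 4 / 5 = 80, which is >= 70, so C2 is returned */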
diff --git a/usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c b/usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
index befa09433c..c1263a3bcd 100644
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c
+++ b/usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,19 +27,19 @@
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/cpu_acpi.h>
-#include <sys/cpudrv_throttle.h>
+#include <sys/cpupm_throttle.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int cpudrv_throttle_init(cpudrv_devstate_t *);
-static void cpudrv_throttle_fini(cpudrv_devstate_t *);
-static int cpudrv_throttle(cpudrv_devstate_t *, uint32_t);
+static int cpupm_throttle_init(cpu_t *);
+static void cpupm_throttle_fini(cpu_t *);
+static void cpupm_throttle(cpuset_t, uint32_t);
-cpudrv_tstate_ops_t cpudrv_throttle_ops = {
+cpupm_state_ops_t cpupm_throttle_ops = {
"Generic ACPI T-state Support",
- cpudrv_throttle_init,
- cpudrv_throttle_fini,
- cpudrv_throttle
+ cpupm_throttle_init,
+ cpupm_throttle_fini,
+ cpupm_throttle
};
/*
@@ -61,90 +61,12 @@ cpudrv_tstate_ops_t cpudrv_throttle_ops = {
* Debugging support
*/
#ifdef DEBUG
-volatile int cpudrv_throttle_debug = 0;
-#define CTDEBUG(arglist) if (cpudrv_throttle_debug) printf arglist;
+volatile int cpupm_throttle_debug = 0;
+#define CTDEBUG(arglist) if (cpupm_throttle_debug) printf arglist;
#else
#define CTDEBUG(arglist)
#endif
-cpudrv_tstate_domain_t *cpudrv_tstate_domains = NULL;
-
-/*
- * Allocate a new domain node.
- */
-static void
-cpudrv_alloc_tstate_domain(cpudrv_devstate_t *cpudsp)
-{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
- cpudrv_tstate_domain_t *dptr;
- cpudrv_tstate_domain_node_t *nptr;
- uint32_t domain;
- uint32_t type;
- cpu_t *cp;
-
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) {
- domain = CPU_ACPI_TSD(handle).sd_domain;
- type = CPU_ACPI_TSD(handle).sd_type;
- } else {
- mutex_enter(&cpu_lock);
- cp = cpu[CPU->cpu_id];
- domain = cpuid_get_chipid(cp);
- mutex_exit(&cpu_lock);
- type = CPU_ACPI_SW_ALL;
- }
-
- for (dptr = cpudrv_tstate_domains; dptr != NULL;
- dptr = dptr->td_next) {
- if (dptr->td_domain == domain)
- break;
- }
-
- /* new domain is created and linked at the head */
- if (dptr == NULL) {
- dptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_t), KM_SLEEP);
- dptr->td_domain = domain;
- dptr->td_type = type;
- dptr->td_next = cpudrv_tstate_domains;
- mutex_init(&dptr->td_lock, NULL, MUTEX_DRIVER, NULL);
- cpudrv_tstate_domains = dptr;
- }
-
- /* new domain node is created and linked at the head of the domain */
- nptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_node_t), KM_SLEEP);
- nptr->tdn_cpudsp = cpudsp;
- nptr->tdn_domain = dptr;
- nptr->tdn_next = dptr->td_node;
- dptr->td_node = nptr;
- mach_state->tstate_domain_node = nptr;
-}
-
-static void
-cpudrv_free_tstate_domains()
-{
- cpudrv_tstate_domain_t *this_domain, *next_domain;
- cpudrv_tstate_domain_node_t *this_node, *next_node;
-
- this_domain = cpudrv_tstate_domains;
- while (this_domain != NULL) {
- next_domain = this_domain->td_next;
-
- /* discard CPU node chain */
- this_node = this_domain->td_node;
- while (this_node != NULL) {
- next_node = this_node->tdn_next;
- kmem_free((void *)this_node,
- sizeof (cpudrv_tstate_domain_node_t));
- this_node = next_node;
- }
- mutex_destroy(&this_domain->td_lock);
- kmem_free((void *)this_domain,
- sizeof (cpudrv_tstate_domain_t));
- this_domain = next_domain;
- }
- cpudrv_tstate_domains = NULL;
-}
-
/*
  * Write the _PTC ctrl register. How it is written depends upon the _PTC
  * ACPI object value.
@@ -230,11 +152,11 @@ read_status(cpu_acpi_handle_t handle, uint32_t *stat)
* Transition the current processor to the requested throttling state.
*/
static void
-cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+cpupm_tstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_tstate_t *req_tstate;
uint32_t ctrl;
uint32_t stat;
@@ -250,7 +172,6 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
*/
ctrl = CPU_ACPI_TSTATE_CTRL(req_tstate);
if (write_ctrl(handle, ctrl) != 0) {
- *ret = THROTTLE_RET_UNSUP_STATE;
return;
}
@@ -259,7 +180,6 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
* no status value comparison is required.
*/
if (CPU_ACPI_TSTATE_STAT(req_tstate) == 0) {
- *ret = THROTTLE_RET_SUCCESS;
return;
}
@@ -274,46 +194,40 @@ cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
if (CPU_ACPI_TSTATE_STAT(req_tstate) != stat) {
DTRACE_PROBE(throttle_transition_incomplete);
- *ret = THROTTLE_RET_TRANS_INCOMPLETE;
- } else {
- *ret = THROTTLE_RET_SUCCESS;
}
}
-static int
-cpudrv_throttle(cpudrv_devstate_t *cpudsp, uint32_t throtl_lvl)
+static void
+cpupm_throttle(cpuset_t set, uint32_t throtl_lvl)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- cpudrv_tstate_transition(&ret, cpudsp, throtl_lvl);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)throtl_lvl,
- X_CALL_HIPRI, cpus, (xc_func_t)cpudrv_tstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ cpupm_tstate_transition(throtl_lvl);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)throtl_lvl, NULL, NULL, X_CALL_HIPRI,
+ set, (xc_func_t)cpupm_tstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
static int
-cpudrv_throttle_init(cpudrv_devstate_t *cpudsp)
+cpupm_throttle_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_ptc_t *ptc_stat;
if (cpu_acpi_cache_tstate_data(handle) != 0) {
CTDEBUG(("Failed to cache T-state ACPI data\n"));
- cpudrv_throttle_fini(cpudsp);
+ cpupm_throttle_fini(cp);
return (THROTTLE_RET_INCOMPLETE_DATA);
}
@@ -334,17 +248,98 @@ cpudrv_throttle_init(cpudrv_devstate_t *cpudsp)
return (THROTTLE_RET_INCOMPLETE_DATA);
}
- cpudrv_alloc_tstate_domain(cpudsp);
+ cpupm_alloc_domains(cp, CPUPM_T_STATES);
return (THROTTLE_RET_SUCCESS);
}
static void
-cpudrv_throttle_fini(cpudrv_devstate_t *cpudsp)
+cpupm_throttle_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpudrv_free_tstate_domains();
+ cpupm_free_domains(&cpupm_tstate_domains);
cpu_acpi_free_tstate_data(handle);
}
+
+/*
+ * This routine reads the ACPI _TPC object. It is called whenever a _TPC
+ * change notification is handled for a CPU.
+ */
+static int
+cpupm_throttle_get_max(processorid_t cpu_id)
+{
+ cpu_t *cp = cpu[cpu_id];
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle;
+ int throtl_level;
+ int max_throttle_lvl;
+ uint_t num_throtl;
+
+ if (mach_state == NULL) {
+ return (-1);
+ }
+
+ handle = mach_state->ms_acpi_handle;
+ ASSERT(handle != NULL);
+
+ cpu_acpi_cache_tpc(handle);
+ throtl_level = CPU_ACPI_TPC(handle);
+
+ num_throtl = CPU_ACPI_TSTATES_COUNT(handle);
+
+ max_throttle_lvl = num_throtl - 1;
+ if ((throtl_level < 0) || (throtl_level > max_throttle_lvl)) {
+ cmn_err(CE_NOTE, "!cpupm_throttle_get_max: CPU %d: "
+ "_TPC out of range %d", cp->cpu_id, throtl_level);
+ throtl_level = 0;
+ }
+
+ return (throtl_level);
+}
+
+/*
+ * Take care of CPU throttling when _TPC notification arrives
+ */
+void
+cpupm_throttle_manage_notification(void *ctx)
+{
+ cpu_t *cp = ctx;
+ processorid_t cpu_id = cp->cpu_id;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ boolean_t is_ready;
+ int new_level;
+
+ if (mach_state == NULL) {
+ return;
+ }
+
+ /*
+	 * We currently refuse to power-manage if the CPU is not ready to
+	 * take cross calls (cross calls fail silently if the CPU is not
+	 * ready for them).
+	 *
+	 * Additionally, for x86 platforms we cannot power-manage any one
+	 * instance until all instances have been initialized. That's
+	 * because we don't know what the CPU domains look like until all
+	 * instances have been initialized.
+ */
+ is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_throttle_ready();
+ if (!is_ready)
+ return;
+
+ if (!(mach_state->ms_caps & CPUPM_T_STATES))
+ return;
+ ASSERT(mach_state->ms_tstate.cma_ops != NULL);
+
+ /*
+ * Get the new T-State support level
+ */
+ new_level = cpupm_throttle_get_max(cpu_id);
+
+ cpupm_state_change(cp, new_level, CPUPM_T_STATES);
+}
diff --git a/usr/src/uts/i86pc/io/cpudrv/pwrnow.c b/usr/src/uts/i86pc/os/cpupm/pwrnow.c
index 4c731ff9e2..65cc251fbb 100644
--- a/usr/src/uts/i86pc/io/cpudrv/pwrnow.c
+++ b/usr/src/uts/i86pc/os/cpupm/pwrnow.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,21 +28,20 @@
#include <sys/x_call.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
-#include <sys/cpudrv_mach.h>
#include <sys/pwrnow.h>
#include <sys/cpu_acpi.h>
#include <sys/cpupm.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int pwrnow_init(cpudrv_devstate_t *);
-static void pwrnow_fini(cpudrv_devstate_t *);
-static int pwrnow_power(cpudrv_devstate_t *, uint32_t);
+static int pwrnow_init(cpu_t *);
+static void pwrnow_fini(cpu_t *);
+static void pwrnow_power(cpuset_t, uint32_t);
/*
* Interfaces for modules implementing AMD's PowerNow!.
*/
-cpudrv_pstate_ops_t pwrnow_ops = {
+cpupm_state_ops_t pwrnow_ops = {
"PowerNow! Technology",
pwrnow_init,
pwrnow_fini,
@@ -81,12 +80,11 @@ volatile int pwrnow_debug = 0;
/*
* Write the ctrl register.
*/
-static int
+static void
write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
{
cpu_acpi_pct_t *pct_ctrl;
uint64_t reg;
- int ret = 0;
pct_ctrl = CPU_ACPI_PCT_CTRL(handle);
@@ -94,35 +92,32 @@ write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
case ACPI_ADR_SPACE_FIXED_HARDWARE:
reg = ctrl;
wrmsr(PWRNOW_PERF_CTL_MSR, reg);
- ret = 0;
break;
default:
DTRACE_PROBE1(pwrnow_ctrl_unsupported_type, uint8_t,
pct_ctrl->cr_addrspace_id);
- return (-1);
+ return;
}
DTRACE_PROBE1(pwrnow_ctrl_write, uint32_t, ctrl);
- DTRACE_PROBE1(pwrnow_ctrl_write_err, int, ret);
-
- return (ret);
}
/*
* Transition the current processor to the requested state.
*/
static void
-pwrnow_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+pwrnow_pstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pstate_t *req_pstate;
uint32_t ctrl;
req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle);
req_pstate += req_state;
+
DTRACE_PROBE1(pwrnow_transition_freq, uint32_t,
CPU_ACPI_FREQ(req_pstate));
@@ -130,40 +125,30 @@ pwrnow_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
* Initiate the processor p-state change.
*/
ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate);
- if (write_ctrl(handle, ctrl) != 0) {
- *ret = PWRNOW_RET_UNSUP_STATE;
- return;
- }
+ write_ctrl(handle, ctrl);
- mach_state->pstate = req_state;
- CPU->cpu_curr_clock = ((uint64_t)
- CPU_ACPI_FREQ(req_pstate) * 1000000);
-
- *ret = PWRNOW_RET_SUCCESS;
+ mach_state->ms_pstate.cma_state.pstate = req_state;
+ cpu_set_curr_clock((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000);
}
-static int
-pwrnow_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
+static void
+pwrnow_power(cpuset_t set, uint32_t req_state)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- pwrnow_pstate_transition(&ret, cpudsp, req_state);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state,
- X_CALL_HIPRI, cpus, (xc_func_t)pwrnow_pstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ pwrnow_pstate_transition(req_state);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI,
+ set, (xc_func_t)pwrnow_pstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
/*
@@ -171,23 +156,21 @@ pwrnow_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
* get the P-state data from ACPI and cache it.
*/
static int
-pwrnow_init(cpudrv_devstate_t *cpudsp)
+pwrnow_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pct_t *pct_stat;
- cpu_t *cp;
- int domain;
- PWRNOW_DEBUG(("pwrnow_init: instance %d\n",
- ddi_get_instance(cpudsp->dip)));
+ PWRNOW_DEBUG(("pwrnow_init: processor %d\n", cp->cpu_id));
/*
* Cache the P-state specific ACPI data.
*/
if (cpu_acpi_cache_pstate_data(handle) != 0) {
PWRNOW_DEBUG(("Failed to cache ACPI data\n"));
- pwrnow_fini(cpudsp);
+ pwrnow_fini(cp);
return (PWRNOW_RET_NO_PM);
}
@@ -200,20 +183,13 @@ pwrnow_init(cpudrv_devstate_t *cpudsp)
cmn_err(CE_WARN, "!_PCT configured for unsupported "
"addrspace = %d.", pct_stat->cr_addrspace_id);
cmn_err(CE_NOTE, "!CPU power management will not function.");
- pwrnow_fini(cpudsp);
+ pwrnow_fini(cp);
return (PWRNOW_RET_NO_PM);
}
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED))
- domain = CPU_ACPI_PSD(handle).sd_domain;
- else {
- cp = cpu[CPU->cpu_id];
- domain = cpuid_get_chipid(cp);
- }
- cpupm_add_cpu2dependency(cpudsp->dip, domain);
+ cpupm_alloc_domains(cp, CPUPM_P_STATES);
- PWRNOW_DEBUG(("Instance %d succeeded.\n",
- ddi_get_instance(cpudsp->dip)));
+ PWRNOW_DEBUG(("Processor %d succeeded.\n", cp->cpu_id))
return (PWRNOW_RET_SUCCESS);
}
@@ -221,12 +197,13 @@ pwrnow_init(cpudrv_devstate_t *cpudsp)
* Free resources allocated by pwrnow_init().
*/
static void
-pwrnow_fini(cpudrv_devstate_t *cpudsp)
+pwrnow_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpupm_free_cpu_dependencies();
+ cpupm_free_domains(&cpupm_pstate_domains);
cpu_acpi_free_pstate_data(handle);
}
diff --git a/usr/src/uts/i86pc/io/cpudrv/speedstep.c b/usr/src/uts/i86pc/os/cpupm/speedstep.c
index 764ca5c23a..e4886d0045 100644
--- a/usr/src/uts/i86pc/io/cpudrv/speedstep.c
+++ b/usr/src/uts/i86pc/os/cpupm/speedstep.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,21 +28,20 @@
#include <sys/x_call.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
-#include <sys/cpudrv_mach.h>
#include <sys/speedstep.h>
#include <sys/cpu_acpi.h>
#include <sys/cpupm.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
-static int speedstep_init(cpudrv_devstate_t *);
-static void speedstep_fini(cpudrv_devstate_t *);
-static int speedstep_power(cpudrv_devstate_t *, uint32_t);
+static int speedstep_init(cpu_t *);
+static void speedstep_fini(cpu_t *);
+static void speedstep_power(cpuset_t, uint32_t);
/*
* Interfaces for modules implementing Intel's Enhanced SpeedStep.
*/
-cpudrv_pstate_ops_t speedstep_ops = {
+cpupm_state_ops_t speedstep_ops = {
"Enhanced SpeedStep Technology",
speedstep_init,
speedstep_fini,
@@ -80,12 +79,11 @@ volatile int ess_debug = 0;
 * Write the ctrl register. How it is written depends upon the _PCT
 * ACPI object value.
*/
-static int
+static void
write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
{
cpu_acpi_pct_t *pct_ctrl;
uint64_t reg;
- int ret = 0;
pct_ctrl = CPU_ACPI_PCT_CTRL(handle);
@@ -99,79 +97,67 @@ write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl)
reg &= ~((uint64_t)0xFFFF);
reg |= ctrl;
wrmsr(IA32_PERF_CTL_MSR, reg);
- ret = 0;
break;
case ACPI_ADR_SPACE_SYSTEM_IO:
- ret = cpu_acpi_write_port(pct_ctrl->cr_address, ctrl,
+ (void) cpu_acpi_write_port(pct_ctrl->cr_address, ctrl,
pct_ctrl->cr_width);
break;
default:
DTRACE_PROBE1(ess_ctrl_unsupported_type, uint8_t,
pct_ctrl->cr_addrspace_id);
- return (-1);
+ return;
}
DTRACE_PROBE1(ess_ctrl_write, uint32_t, ctrl);
- DTRACE_PROBE1(ess_ctrl_write_err, int, ret);
-
- return (ret);
}
/*
* Transition the current processor to the requested state.
*/
void
-speedstep_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp,
- uint32_t req_state)
+speedstep_pstate_transition(uint32_t req_state)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pstate_t *req_pstate;
uint32_t ctrl;
req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle);
req_pstate += req_state;
+
DTRACE_PROBE1(ess_transition, uint32_t, CPU_ACPI_FREQ(req_pstate));
/*
* Initiate the processor p-state change.
*/
ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate);
- if (write_ctrl(handle, ctrl) != 0) {
- *ret = ESS_RET_UNSUP_STATE;
- return;
- }
+ write_ctrl(handle, ctrl);
- mach_state->pstate = req_state;
- CPU->cpu_curr_clock =
- (((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000));
- *ret = ESS_RET_SUCCESS;
+ mach_state->ms_pstate.cma_state.pstate = req_state;
+ cpu_set_curr_clock(((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000));
}
-static int
-speedstep_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
+static void
+speedstep_power(cpuset_t set, uint32_t req_state)
{
- cpuset_t cpus;
- int ret;
-
/*
* If thread is already running on target CPU then just
* make the transition request. Otherwise, we'll need to
* make a cross-call.
*/
kpreempt_disable();
- if (cpudsp->cpu_id == CPU->cpu_id) {
- speedstep_pstate_transition(&ret, cpudsp, req_state);
- } else {
- CPUSET_ONLY(cpus, cpudsp->cpu_id);
- xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state,
- X_CALL_HIPRI, cpus, (xc_func_t)speedstep_pstate_transition);
+ if (CPU_IN_SET(set, CPU->cpu_id)) {
+ speedstep_pstate_transition(req_state);
+ CPUSET_DEL(set, CPU->cpu_id);
+ }
+ if (!CPUSET_ISNULL(set)) {
+ xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI, set,
+ (xc_func_t)speedstep_pstate_transition);
}
kpreempt_enable();
-
- return (ret);
}
/*
@@ -179,23 +165,21 @@ speedstep_power(cpudrv_devstate_t *cpudsp, uint32_t req_state)
* get the P-state data from ACPI and cache it.
*/
static int
-speedstep_init(cpudrv_devstate_t *cpudsp)
+speedstep_init(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
cpu_acpi_pct_t *pct_stat;
- cpu_t *cp;
- int dependency;
- ESSDEBUG(("speedstep_init: instance %d\n",
- ddi_get_instance(cpudsp->dip)));
+ ESSDEBUG(("speedstep_init: processor %d\n", cp->cpu_id));
/*
* Cache the P-state specific ACPI data.
*/
if (cpu_acpi_cache_pstate_data(handle) != 0) {
ESSDEBUG(("Failed to cache ACPI data\n"));
- speedstep_fini(cpudsp);
+ speedstep_fini(cp);
return (ESS_RET_NO_PM);
}
@@ -211,21 +195,13 @@ speedstep_init(cpudrv_devstate_t *cpudsp)
 		cmn_err(CE_WARN, "!_PCT configured for unsupported "
"addrspace = %d.", pct_stat->cr_addrspace_id);
cmn_err(CE_NOTE, "!CPU power management will not function.");
- speedstep_fini(cpudsp);
+ speedstep_fini(cp);
return (ESS_RET_NO_PM);
}
- if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED))
- dependency = CPU_ACPI_PSD(handle).sd_domain;
- else {
- mutex_enter(&cpu_lock);
- cp = cpu[CPU->cpu_id];
- dependency = cpuid_get_chipid(cp);
- mutex_exit(&cpu_lock);
- }
- cpupm_add_cpu2dependency(cpudsp->dip, dependency);
+ cpupm_alloc_domains(cp, CPUPM_P_STATES);
- ESSDEBUG(("Instance %d succeeded.\n", ddi_get_instance(cpudsp->dip)));
+ ESSDEBUG(("Processor %d succeeded.\n", cp->cpu_id))
return (ESS_RET_SUCCESS);
}
@@ -233,12 +209,13 @@ speedstep_init(cpudrv_devstate_t *cpudsp)
* Free resources allocated by speedstep_init().
*/
static void
-speedstep_fini(cpudrv_devstate_t *cpudsp)
+speedstep_fini(cpu_t *cp)
{
- cpudrv_mach_state_t *mach_state = cpudsp->mach_state;
- cpu_acpi_handle_t handle = mach_state->acpi_handle;
+ cpupm_mach_state_t *mach_state =
+ (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
+ cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
- cpupm_free_cpu_dependencies();
+ cpupm_free_domains(&cpupm_pstate_domains);
cpu_acpi_free_pstate_data(handle);
}
@@ -246,7 +223,6 @@ boolean_t
speedstep_supported(uint_t family, uint_t model)
{
struct cpuid_regs cpu_regs;
- uint64_t reg;
/* Required features */
if (!(x86_feature & X86_CPUID) ||
@@ -272,16 +248,5 @@ speedstep_supported(uint_t family, uint_t model)
return (B_FALSE);
}
- /*
- * If Enhanced SpeedStep has not been enabled on the system,
- * then we probably should not override the BIOS setting.
- */
- reg = rdmsr(IA32_MISC_ENABLE_MSR);
- if (! (reg & IA32_MISC_ENABLE_EST)) {
- cmn_err(CE_NOTE, "!Enhanced Intel SpeedStep not enabled.");
- cmn_err(CE_NOTE, "!CPU power management will not function.");
- return (B_FALSE);
- }
-
return (B_TRUE);
}
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 94ee76b0b0..001bd0537f 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -295,6 +295,8 @@ mlsetup(struct regs *rp)
*/
cpu_list_init(CPU);
+ pg_cpu_bootstrap(CPU);
+
/*
* Now that we have taken over the GDT, IDT and have initialized
* active CPU list it's time to inform kmdb if present.
diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c
index e27b45d709..1954dfb81c 100644
--- a/usr/src/uts/i86pc/os/mp_machdep.c
+++ b/usr/src/uts/i86pc/os/mp_machdep.c
@@ -45,6 +45,7 @@
#include <sys/memlist.h>
#include <sys/param.h>
#include <sys/promif.h>
+#include <sys/cpu_pm.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#endif
@@ -52,6 +53,7 @@
#include <vm/hat_i86.h>
#include <sys/kdi_machimpl.h>
#include <sys/sdt.h>
+#include <sys/hpet.h>
#define OFFSETOF(s, m) (size_t)(&(((s *)0)->m))
@@ -76,10 +78,10 @@ static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
static void mach_notify_error(int level, char *errmsg);
static hrtime_t dummy_hrtime(void);
static void dummy_scalehrtime(hrtime_t *);
-static void cpu_idle(void);
+void cpu_idle(void);
static void cpu_wakeup(cpu_t *, int);
#ifndef __xpv
-static void cpu_idle_mwait(void);
+void cpu_idle_mwait(void);
static void cpu_wakeup_mwait(cpu_t *, int);
#endif
/*
@@ -184,7 +186,23 @@ int idle_cpu_prefer_mwait = 1;
*/
int idle_cpu_assert_cflush_monitor = 1;
-#endif
+/*
+ * If non-zero, idle cpus will not use the power saving Deep C-State idle loop.
+ */
+int idle_cpu_no_deep_c = 0;
+/*
+ * Non-power saving idle loop and wakeup pointers.
+ * Allows the user to toggle the Deep Idle power saving feature on/off.
+ */
+void (*non_deep_idle_cpu)() = cpu_idle;
+void (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
+
+/*
+ * Object for the kernel to access the HPET.
+ */
+hpet_t hpet;
+
+#endif /* ifndef __xpv */
/*ARGSUSED*/
int
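Hedged usage note: because idle_cpu_no_deep_c is a global tunable, an administrator who wants to keep the dispatcher on the non-deep-idle loop could set it from /etc/system, for example:

	set idle_cpu_no_deep_c = 1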
@@ -210,6 +228,16 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
return (1);
else
return (0);
+ case PGHW_POW_ACTIVE:
+ if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1)
+ return (1);
+ else
+ return (0);
+ case PGHW_POW_IDLE:
+ if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1)
+ return (1);
+ else
+ return (0);
default:
return (0);
}
@@ -247,58 +275,63 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
return (cpuid_get_last_lvl_cacheid(cpu));
case PGHW_CHIP:
return (cpuid_get_chipid(cpu));
+ case PGHW_POW_ACTIVE:
+ return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE));
+ case PGHW_POW_IDLE:
+ return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE));
default:
return (-1);
}
}
-int
-pg_plat_hw_level(pghw_type_t hw)
+/*
+ * Express a preference between the hw1 and hw2 sharing relationships,
+ * returning the one that optimization should favor.
+ */
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
- int i;
+ int i, rank1, rank2;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_CACHE,
PGHW_CHIP,
+ PGHW_POW_IDLE,
+ PGHW_POW_ACTIVE,
PGHW_NUM_COMPONENTS
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_CHIP ||
- hw == PGHW_CACHE)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+ /*
+ * For shared caches, also load balance across them to
+ * maximize aggregate cache capacity
+ */
+ switch (hw) {
+ case PGHW_CACHE:
+ return (CMT_BALANCE|CMT_AFFINITY);
+ default:
+ return (CMT_NO_POLICY);
+ }
}
id_t
@@ -329,9 +362,28 @@ dummy_scalehrtime(hrtime_t *ticks)
{}
/*
+ * Supports Deep C-State power saving idle loop.
+ */
+void
+cpu_idle_adaptive(void)
+{
+ (*CPU->cpu_m.mcpu_idle_cpu)();
+}
+
+void
+cpu_dtrace_idle_probe(uint_t cstate)
+{
+ cpu_t *cpup = CPU;
+ struct machcpu *mcpu = &(cpup->cpu_m);
+
+ mcpu->curr_cstate = cstate;
+ DTRACE_PROBE1(idle__state__transition, uint_t, cstate);
+}
+
+/*
* Idle the present CPU until awoken via an interrupt
*/
-static void
+void
cpu_idle(void)
{
cpu_t *cpup = CPU;
@@ -427,11 +479,11 @@ cpu_idle(void)
return;
}
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);
+ cpu_dtrace_idle_probe(IDLE_STATE_C1);
mach_cpu_idle();
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);
+ cpu_dtrace_idle_probe(IDLE_STATE_C0);
/*
* We're no longer halted
@@ -510,7 +562,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
/*
* Idle the present CPU until awoken via touching its monitored line
*/
-static void
+void
cpu_idle_mwait(void)
{
volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
@@ -520,7 +572,7 @@ cpu_idle_mwait(void)
int hset_update = 1;
/*
- * Set our mcpu_mwait here, so we can tell if anyone trys to
+ * Set our mcpu_mwait here, so we can tell if anyone tries to
* wake us between now and when we call mwait. No other cpu will
* attempt to set our mcpu_mwait until we add ourself to the halted
* CPU bitmap.
@@ -529,7 +581,7 @@ cpu_idle_mwait(void)
/*
* If this CPU is online, and there's multiple CPUs
- * in the system, then we should notate our halting
+ * in the system, then we should note our halting
* by adding ourselves to the partition's halted CPU
* bitmap. This allows other CPUs to find/awaken us when
* work becomes available.
@@ -543,7 +595,7 @@ cpu_idle_mwait(void)
*
* When a thread becomes runnable, it is placed on the queue
* and then the halted CPU bitmap is checked to determine who
- * (if anyone) should be awoken. We therefore need to first
+ * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the bitmap, and then check if there
* is any work available.
*
@@ -580,13 +632,13 @@ cpu_idle_mwait(void)
*/
i86_monitor(mcpu_mwait, 0, 0);
if (*mcpu_mwait == MWAIT_HALTED) {
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);
+ cpu_dtrace_idle_probe(IDLE_STATE_C1);
tlb_going_idle();
i86_mwait(0, 0);
tlb_service();
- DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);
+ cpu_dtrace_idle_probe(IDLE_STATE_C0);
}
/*
@@ -858,14 +910,23 @@ mach_init()
(*pops->psm_softinit)();
/*
- * Initialize the dispatcher's function hooks
- * to enable CPU halting when idle.
+ * Initialize the dispatcher's function hooks to enable CPU halting
+ * when idle. Set both the deep-idle and non-deep-idle hooks.
+ *
+ * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
+	 * Assume we can use the power saving deep-idle loop, cpu_idle_adaptive.
+	 * The platform deep-idle driver will reset our idle loop to
+	 * non_deep_idle_cpu if the power saving deep-idle feature is not
+	 * available.
* Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle)
* or idle_cpu_prefer_mwait is not set.
* Allocate monitor/mwait buffer for cpu0.
*/
+#ifndef __xpv
+ non_deep_idle_disp_enq_thread = disp_enq_thread;
+#endif
if (idle_cpu_use_hlt) {
- idle_cpu = cpu_idle;
+ idle_cpu = cpu_idle_adaptive;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
#ifndef __xpv
if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
@@ -878,12 +939,20 @@ mach_init()
"handle cpu 0 mwait size.");
#endif
idle_cpu_prefer_mwait = 0;
- idle_cpu = cpu_idle;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
} else {
- idle_cpu = cpu_idle_mwait;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
}
} else {
- idle_cpu = cpu_idle;
+ CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
+ }
+ non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu;
+
+ /*
+ * Disable power saving deep idle loop?
+ */
+ if (idle_cpu_no_deep_c) {
+ idle_cpu = non_deep_idle_cpu;
}
#endif
}
@@ -970,6 +1039,7 @@ mach_smpinit(void)
#ifndef __xpv
if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
disp_enq_thread = cpu_wakeup_mwait;
+ non_deep_idle_disp_enq_thread = disp_enq_thread;
#endif
}
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index 54eb2f4369..5027d7a182 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -120,11 +120,6 @@ init_cpu_info(struct cpu *cp)
*/
cp->cpu_curr_clock = cpu_freq_hz;
- /*
- * Supported frequencies.
- */
- cpu_set_supp_freqs(cp, NULL);
-
(void) strcpy(pi->pi_processor_type, "i386");
if (fpu_exists)
(void) strcpy(pi->pi_fputypes, "i387 compatible");
@@ -236,8 +231,10 @@ mp_startup_init(int cpun)
proc_t *procp;
#if !defined(__xpv)
extern int idle_cpu_prefer_mwait;
+ extern void cpu_idle_mwait();
#endif
extern void idle();
+ extern void cpu_idle();
#ifdef TRAPTRACE
trap_trace_ctl_t *ttc = &trap_trace_ctl[cpun];
@@ -247,9 +244,12 @@ mp_startup_init(int cpun)
cp = kmem_zalloc(sizeof (*cp), KM_SLEEP);
#if !defined(__xpv)
- if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
+ if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
+ cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
+ } else
#endif
+ cp->cpu_m.mcpu_idle_cpu = cpu_idle;
procp = curthread->t_procp;
@@ -1463,6 +1463,9 @@ mp_startup(void)
{
struct cpu *cp = CPU;
uint_t new_x86_feature;
+#ifndef __xpv
+ extern void cpupm_init(cpu_t *);
+#endif
/*
* We need to get TSC on this proc synced (i.e., any delta
@@ -1558,14 +1561,6 @@ mp_startup(void)
init_cpu_info(cp);
mutex_enter(&cpu_lock);
- /*
- * Processor group initialization for this CPU is dependent on the
- * cpuid probing, which must be done in the context of the current
- * CPU.
- */
- pghw_physid_create(cp);
- pg_cpu_init(cp);
- pg_cmt_cpu_startup(cp);
cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS;
@@ -1597,15 +1592,30 @@ mp_startup(void)
ASSERT(cp->cpu_base_spl == ipltospl(LOCK_LEVEL));
set_base_spl(); /* Restore the spl to its proper value */
+#ifndef __xpv
+ cpupm_init(cp);
+#endif
+ add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi);
+
+ /*
+ * Processor group initialization for this CPU is dependent on the
+ * cpuid probing, which must be done in the context of the current
+ * CPU, as well as the CPU's device node initialization (for ACPI).
+ */
+ mutex_enter(&cpu_lock);
+ pghw_physid_create(cp);
+ pg_cpu_init(cp);
+ pg_cmt_cpu_startup(cp);
+ mutex_exit(&cpu_lock);
+
/* Enable interrupts */
(void) spl0();
+
mutex_enter(&cpu_lock);
cpu_enable_intr(cp);
cpu_add_active(cp);
mutex_exit(&cpu_lock);
- add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi);
-
#ifndef __xpv
{
/*
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 58bc3416f1..533d90a2a0 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -137,6 +137,7 @@ extern void progressbar_init(void);
extern void progressbar_start(void);
extern void brand_init(void);
extern void pcf_init(void);
+extern void pg_init(void);
extern int size_pse_array(pgcnt_t, int);
@@ -2128,6 +2129,8 @@ ulong_t _bdhs34;
void
post_startup(void)
{
+ extern void cpupm_init(cpu_t *);
+
/*
* Set the system wide, processor-specific flags to be passed
* to userland via the aux vector for performance hints and
@@ -2186,7 +2189,11 @@ post_startup(void)
maxmem = freemem;
+ cpupm_init(CPU);
+
add_cpunode2devtree(CPU->cpu_id, CPU->cpu_m.mcpu_cpi);
+
+ pg_init();
}
static int
diff --git a/usr/src/uts/i86pc/sys/cpu_acpi.h b/usr/src/uts/i86pc/sys/cpu_acpi.h
index c0b750f447..1805cd4d22 100644
--- a/usr/src/uts/i86pc/sys/cpu_acpi.h
+++ b/usr/src/uts/i86pc/sys/cpu_acpi.h
@@ -19,13 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _CPU_ACPI_H
#define _CPU_ACPI_H
+#include <sys/cpuvar.h>
#include <sys/acpi/acpi.h>
#include <sys/acpi/acresrc.h>
#include <sys/acpi/acglobal.h>
@@ -66,15 +67,25 @@ extern "C" {
#define CPU_ACPI_TSTATE_CTRL(tstate) tstate->ts_ctrl
#define CPU_ACPI_TSTATE_STAT(tstate) tstate->ts_state
-#define CPU_ACPI_NONE_CACHED 0x00
-#define CPU_ACPI_PCT_CACHED 0x01
-#define CPU_ACPI_PSS_CACHED 0x02
-#define CPU_ACPI_PSD_CACHED 0x04
-#define CPU_ACPI_PPC_CACHED 0x08
-#define CPU_ACPI_PTC_CACHED 0x10
-#define CPU_ACPI_TSS_CACHED 0x20
-#define CPU_ACPI_TSD_CACHED 0x40
-#define CPU_ACPI_TPC_CACHED 0x80
+/*
+ * C-state related macros
+ */
+#define CPU_ACPI_CSD(sp) sp->cs_csd
+#define CPU_ACPI_BM_INFO(sp) sp->bm_info
+#define CPU_ACPI_CSTATES(sp) sp->cs_cstates.ss_states
+#define CPU_ACPI_CSTATES_COUNT(sp) sp->cs_cstates.ss_count
+
+#define CPU_ACPI_NONE_CACHED 0x0000
+#define CPU_ACPI_PCT_CACHED 0x0001
+#define CPU_ACPI_PSS_CACHED 0x0002
+#define CPU_ACPI_PSD_CACHED 0x0004
+#define CPU_ACPI_PPC_CACHED 0x0008
+#define CPU_ACPI_PTC_CACHED 0x0010
+#define CPU_ACPI_TSS_CACHED 0x0020
+#define CPU_ACPI_TSD_CACHED 0x0040
+#define CPU_ACPI_TPC_CACHED 0x0080
+#define CPU_ACPI_CST_CACHED 0x0100
+#define CPU_ACPI_CSD_CACHED 0x0200
#define CPU_ACPI_IS_OBJ_CACHED(sp, obj) (sp->cpu_acpi_cached & obj)
#define CPU_ACPI_OBJ_IS_CACHED(sp, obj) (sp->cpu_acpi_cached |= obj)
@@ -84,7 +95,8 @@ extern "C" {
#define CPU_ACPI_PSS_CNT (sizeof (cpu_acpi_pstate_t) / sizeof (uint32_t))
#define CPU_ACPI_TSTATES_SIZE(cnt) (cnt * sizeof (cpu_acpi_tstate_t))
#define CPU_ACPI_TSS_CNT (sizeof (cpu_acpi_tstate_t) / sizeof (uint32_t))
-
+#define CPU_ACPI_CSTATES_SIZE(cnt) (cnt * sizeof (cpu_acpi_cstate_t))
+#define CPU_ACPI_CST_CNT (sizeof (cpu_acpi_cstate_t) / sizeof (uint32_t))
/*
* CPU Domain Coordination Types
*/
@@ -102,10 +114,12 @@ typedef struct cpu_acpi_state_dependency
uint32_t sd_domain;
uint32_t sd_type;
uint32_t sd_num;
+ uint32_t sd_index;
} cpu_acpi_state_dependency_t;
typedef cpu_acpi_state_dependency_t cpu_acpi_psd_t;
typedef cpu_acpi_state_dependency_t cpu_acpi_tsd_t;
+typedef cpu_acpi_state_dependency_t cpu_acpi_csd_t;
/*
* Container for ACPI processor control register information
@@ -148,6 +162,21 @@ typedef struct cpu_acpi_tstate
} cpu_acpi_tstate_t;
+/*
+ * Container for _CST information
+ */
+typedef struct cpu_acpi_cstate
+{
+ uint32_t cs_addrspace_id;
+ uint32_t cs_address;
+ uint32_t cs_type;
+ uint32_t cs_latency;
+ uint32_t cs_power;
+ uint32_t promotion;
+ uint32_t demotion;
+ kstat_t *cs_ksp;
+} cpu_acpi_cstate_t;
+
typedef struct cpu_acpi_supported_states {
void *ss_states;
uint32_t ss_count;
@@ -155,6 +184,7 @@ typedef struct cpu_acpi_supported_states {
typedef cpu_acpi_supported_states_t cpu_acpi_pstates_t;
typedef cpu_acpi_supported_states_t cpu_acpi_tstates_t;
+typedef cpu_acpi_supported_states_t cpu_acpi_cstates_t;
typedef int cpu_acpi_present_capabilities_t;
typedef int cpu_acpi_ppc_t;
@@ -165,7 +195,7 @@ typedef int cpu_acpi_tpc_t;
*/
typedef struct cpu_acpi_state {
ACPI_HANDLE cs_handle;
- dev_info_t *cs_dip;
+ int cs_id;
uint_t cpu_acpi_cached;
cpu_acpi_pstates_t cs_pstates;
cpu_acpi_pct_t cs_pct[2];
@@ -175,6 +205,9 @@ typedef struct cpu_acpi_state {
cpu_acpi_ptc_t cs_ptc[2];
cpu_acpi_tsd_t cs_tsd;
cpu_acpi_tpc_t cs_tpc;
+ cpu_acpi_cstates_t cs_cstates;
+ cpu_acpi_csd_t cs_csd;
+ uint_t bm_info;
} cpu_acpi_state_t;
typedef cpu_acpi_state_t *cpu_acpi_handle_t;
@@ -185,15 +218,22 @@ extern int cpu_acpi_cache_pstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_free_pstate_data(cpu_acpi_handle_t);
extern int cpu_acpi_cache_tstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_free_tstate_data(cpu_acpi_handle_t);
+extern int cpu_acpi_cache_cstate_data(cpu_acpi_handle_t);
+extern void cpu_acpi_free_cstate_data(cpu_acpi_handle_t);
extern void cpu_acpi_install_notify_handler(cpu_acpi_handle_t,
- ACPI_NOTIFY_HANDLER, dev_info_t *);
+ ACPI_NOTIFY_HANDLER, void *);
+extern void cpu_acpi_remove_notify_handler(cpu_acpi_handle_t,
+ ACPI_NOTIFY_HANDLER);
extern int cpu_acpi_write_pdc(cpu_acpi_handle_t, uint32_t, uint32_t,
uint32_t *);
extern int cpu_acpi_write_port(ACPI_IO_ADDRESS, uint32_t, uint32_t);
extern int cpu_acpi_read_port(ACPI_IO_ADDRESS, uint32_t *, uint32_t);
+extern void cpu_acpi_set_register(uint32_t, uint32_t);
+extern void cpu_acpi_get_register(uint32_t, uint32_t *);
extern uint_t cpu_acpi_get_speeds(cpu_acpi_handle_t, int **);
+extern uint_t cpu_acpi_get_max_cstates(cpu_acpi_handle_t);
extern void cpu_acpi_free_speeds(int *, uint_t);
-extern cpu_acpi_handle_t cpu_acpi_init(dev_info_t *);
+extern cpu_acpi_handle_t cpu_acpi_init(cpu_t *);
extern void cpu_acpi_fini(cpu_acpi_handle_t);
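A rough usage sketch of the C-state entry points above, which now key off a cpu_t rather than a dev_info_t. The enumerate_cstates() helper, the 0-on-success assumption for cpu_acpi_cache_cstate_data(), and the cmn_err() output are illustrative only:

static void
enumerate_cstates(cpu_t *cp)
{
	cpu_acpi_handle_t hdl;
	cpu_acpi_cstate_t *cstates;
	uint32_t i, cnt;

	if ((hdl = cpu_acpi_init(cp)) == NULL)
		return;

	if (cpu_acpi_cache_cstate_data(hdl) == 0) {
		cstates = CPU_ACPI_CSTATES(hdl);
		cnt = CPU_ACPI_CSTATES_COUNT(hdl);
		for (i = 0; i < cnt; i++)
			cmn_err(CE_CONT, "C%u: latency %u, power %u\n",
			    cstates[i].cs_type, cstates[i].cs_latency,
			    cstates[i].cs_power);
		cpu_acpi_free_cstate_data(hdl);
	}
	cpu_acpi_fini(hdl);
}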
#ifdef __cplusplus
diff --git a/usr/src/uts/i86pc/sys/cpu_idle.h b/usr/src/uts/i86pc/sys/cpu_idle.h
new file mode 100644
index 0000000000..6b38663c28
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpu_idle.h
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUIDLE_H
+#define _CPUIDLE_H
+
+#include <sys/cpupm.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define CPU_MAX_CSTATES 8
+
+#define CPU_ACPI_C0 0
+#define CPU_ACPI_C1 1
+#define CPU_ACPI_C2 2
+#define CPU_ACPI_C3 3
+
+#define BM_CTL 0x1
+#define BM_RLD 0x2
+#define BM_ARB_DIS 0x4
+
+#define CPUID_TSC_INVARIANCE 0x100
+
+#define CPU_IDLE_DEEP_CFG (0x1) /* Deep Idle disabled by user */
+#define CPU_IDLE_CPR_CFG (0x2) /* In CPR */
+
+#define CPU_CSTATE_LATENCY_UNDEF (1000000) /* ACPI info missing */
+
+typedef struct cpu_idle_kstat_s {
+ struct kstat_named addr_space_id; /* register address space id */
+ struct kstat_named cs_latency; /* worst latency */
+ struct kstat_named cs_power; /* average power consumption */
+} cpu_idle_kstat_t;
+
+extern cpupm_state_ops_t cpu_idle_ops;
+
+extern void cpu_acpi_idle(void);
+extern void cstate_wakeup(cpu_t *, int);
+extern boolean_t cpu_deep_cstates_supported(void);
+extern void cpu_wakeup(cpu_t *, int);
+extern void cpu_wakeup_mwait(cpu_t *, int);
+extern void cpu_dtrace_idle_probe(uint_t);
+extern void cpuidle_manage_cstates(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUIDLE_H */
diff --git a/usr/src/uts/i86pc/sys/cpudrv_mach.h b/usr/src/uts/i86pc/sys/cpudrv_mach.h
index 26b4ecb787..c26d93853f 100644
--- a/usr/src/uts/i86pc/sys/cpudrv_mach.h
+++ b/usr/src/uts/i86pc/sys/cpudrv_mach.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,31 +42,12 @@ extern "C" {
* for it).
*/
extern cpuset_t cpu_ready_set;
-#define CPUDRV_PM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
-
-/*
- * An error attaching any of the devices results in disabling
- * CPU power management.
- */
-#define CPUDRV_PM_DISABLE() cpupm_disable(CPUPM_ALL_STATES)
-
-/*
- * If no power management states are enabled, then CPU power
- * management is disabled.
- */
-#define CPUDRV_PM_DISABLED() \
- (!cpupm_is_enabled(CPUPM_P_STATES) && !cpupm_is_enabled(CPUPM_T_STATES))
-
-/*
- * Is P-state management enabled?
- */
-#define CPUDRV_PM_POWER_ENABLED(cpudsp) \
- (((cpudrv_mach_state_t *)cpudsp->mach_state)->caps & CPUDRV_P_STATES)
+#define CPUDRV_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
/*
* We're about to exit the _PPC thread so reset tag.
*/
-#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm) { \
+#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm) { \
if (curthread == cpupm->pm_governor_thread) \
cpupm->pm_governor_thread = NULL; \
}
@@ -74,50 +55,51 @@ extern cpuset_t cpu_ready_set;
/*
* The current top speed as defined by the _PPC.
*/
-#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->top_spd
+#define CPUDRV_TOPSPEED(cpupm) (cpupm)->top_spd
/*
* Install a _PPC/_TPC change notification handler.
*/
-#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip) \
- cpudrv_pm_install_notify_handler(cpudsp, dip);
+#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp) \
+ cpudrv_install_notify_handler(cpudsp);
/*
* Redefine the topspeed.
*/
-#define CPUDRV_PM_REDEFINE_TOPSPEED(dip) cpudrv_pm_redefine_topspeed(dip)
+#define CPUDRV_REDEFINE_TOPSPEED(dip) cpudrv_redefine_topspeed(dip)
/*
* Set callbacks so that PPM can callback into CPUDRV
*/
-#define CPUDRV_PM_SET_PPM_CALLBACKS() { \
- cpupm_get_topspeed = cpudrv_pm_get_topspeed; \
- cpupm_set_topspeed = cpudrv_pm_set_topspeed; \
+#define CPUDRV_SET_PPM_CALLBACKS() { \
+ cpupm_get_topspeed_callb = cpudrv_get_topspeed; \
+ cpupm_set_topspeed_callb = cpudrv_set_topspeed; \
}
/*
* ACPI provides the supported speeds.
*/
-#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) \
- nspeeds = cpudrv_pm_get_speeds(cpudsp, &speeds);
-#define CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds) \
- cpudrv_pm_free_speeds(speeds, nspeeds);
+#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) \
+ nspeeds = cpudrv_get_speeds(cpudsp, &speeds);
+#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) \
+ cpudrv_free_speeds(speeds, nspeeds);
/*
- * Convert speed to Hz.
+ * ACPI provides the supported C-states.
*/
-#define CPUDRV_PM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000)
+#define CPUDRV_GET_MAX_CSTATES(handle) \
+ cpu_acpi_get_max_cstates(handle);
/*
* Compute the idle cnt percentage for a given speed.
*/
-#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \
(100 - (((100 - hwm) * speeds[0]) / speeds[i]))
/*
* Compute the user cnt percentage for a given speed.
*/
-#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \
((hwm * speeds[i]) / speeds[i - 1]);
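As a worked example of the two thresholds above (the speed values are illustrative): with hwm = 98 and supported speeds {2400, 1600, 800} MHz, fastest first, CPUDRV_IDLE_CNT_PERCENT(98, speeds, 2) = 100 - ((100 - 98) * 2400) / 800 = 94, and CPUDRV_USER_CNT_PERCENT(98, speeds, 2) = (98 * 800) / 1600 = 49.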
/*
@@ -133,82 +115,21 @@ extern cpuset_t cpu_ready_set;
* The amount of memory needed for each string is:
* digits for power level + '=' + digits for freq + 'MHz' + '\0'
*/
-#define CPUDRV_PM_COMP_SIZE() \
- (CPUDRV_PM_COMP_MAX_DIG + 1 + CPUDRV_PM_COMP_MAX_DIG + 3 + 1);
-#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) cur_spd->speed;
-#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \
+#define CPUDRV_COMP_SIZE() \
+ (CPUDRV_COMP_MAX_DIG + 1 + CPUDRV_COMP_MAX_DIG + 3 + 1);
+#define CPUDRV_COMP_SPEED(cpupm, cur_spd) cur_spd->speed;
+#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \
(void) sprintf(pmc, "%d=%dMHz", cur_spd->pm_level, comp_spd);
-/*
- * T-State domain list
- */
-typedef struct cpudrv_tstate_domain_node {
- struct cpudrv_tstate_domain_node *tdn_next;
- struct cpudrv_tstate_domain *tdn_domain;
- cpudrv_devstate_t *tdn_cpudsp;
-} cpudrv_tstate_domain_node_t;
-
-typedef struct cpudrv_tstate_domain {
- struct cpudrv_tstate_domain *td_next;
- cpudrv_tstate_domain_node_t *td_node;
- uint32_t td_domain;
- uint32_t td_type;
- kmutex_t td_lock;
-} cpudrv_tstate_domain_t;
-
-extern cpudrv_tstate_domain_t *cpudrv_tstate_domains;
-
-/*
- * Different processor families have their own technologies for supporting
- * CPU power management (i.e., Intel has Enhanced SpeedStep for some of it's
- * processors and AMD has PowerNow! for some of it's processors). We support
- * these different technologies via modules that export the interfaces
- * described below.
- *
- * If a module implements the technology that should be used to manage
- * the current CPU device, then the cpups_init() module should return
- * succesfully (i.e., return code of 0) and perform any initialization
- * such that future power transistions can be performed by calling
- * the cpups_power() interface(). And the cpups_fini() interface can be
- * used to free any resources allocated by cpups_init().
- */
-typedef struct cpudrv_pstate_ops {
- char *cpups_label;
- int (*cpups_init)(cpudrv_devstate_t *);
- void (*cpups_fini)(cpudrv_devstate_t *);
- int (*cpups_power)(cpudrv_devstate_t *, uint32_t);
-} cpudrv_pstate_ops_t;
+extern void cpudrv_set_topspeed(void *, int);
+extern int cpudrv_get_topspeed(void *);
+extern int cpudrv_get_topthrottle(cpu_t *);
+extern void cpudrv_manage_throttling(void *);
+extern void cpudrv_install_notify_handler(cpudrv_devstate_t *);
+extern void cpudrv_redefine_topspeed(void *);
+extern uint_t cpudrv_get_speeds(cpudrv_devstate_t *, int **);
+extern void cpudrv_free_speeds(int *, uint_t);
-/*
- * T-state support.
- */
-typedef struct cpudrv_tstate_ops {
- char *cputs_label;
- int (*cputs_init)(cpudrv_devstate_t *);
- void (*cputs_fini)(cpudrv_devstate_t *);
- int (*cputs_throttle)(cpudrv_devstate_t *, uint32_t);
-} cpudrv_tstate_ops_t;
-
-typedef struct cpudrv_mach_state {
- void *acpi_handle;
- cpudrv_pstate_ops_t *cpupm_pstate_ops;
- cpudrv_tstate_ops_t *cpupm_tstate_ops;
- cpudrv_tstate_domain_node_t *tstate_domain_node;
- uint32_t pstate;
- uint32_t tstate;
- uint32_t caps;
-} cpudrv_mach_state_t;
-
-#define CPUDRV_NO_STATES 0x00
-#define CPUDRV_P_STATES 0x01
-#define CPUDRV_T_STATES 0x02
-
-extern uint_t cpudrv_pm_get_speeds(cpudrv_devstate_t *, int **);
-extern void cpudrv_pm_free_speeds(int *, uint_t);
-extern void cpudrv_pm_set_topspeed(void *, int);
-extern int cpudrv_pm_get_topspeed(void *);
-extern void cpudrv_pm_redefine_topspeed(void *);
-extern void cpudrv_pm_install_notify_handler(cpudrv_devstate_t *, dev_info_t *);
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/i86pc/sys/cpupm.h b/usr/src/uts/i86pc/sys/cpupm.h
deleted file mode 100644
index 2510a0fb60..0000000000
--- a/usr/src/uts/i86pc/sys/cpupm.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _CPUPM_H
-#define _CPUPM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-
-/*
- * Simple structures used to temporarily keep track of CPU
- * dependencies until the PPM driver can build PPM CPU domains.
- */
-typedef struct cpupm_cpu_node {
- struct cpupm_cpu_node *cn_next;
- dev_info_t *cn_dip;
-} cpupm_cpu_node_t;
-
-typedef struct cpupm_cpu_dependency {
- struct cpupm_cpu_dependency *cd_next;
- cpupm_cpu_node_t *cd_cpu;
- int cd_dependency_id;
-} cpupm_cpu_dependency_t;
-
-/*
- * If any states are added, then make sure to add them to
- * CPUPM_ALL_STATES.
- */
-#define CPUPM_NO_STATES 0x00
-#define CPUPM_P_STATES 0x01
-#define CPUPM_T_STATES 0x02
-#define CPUPM_ALL_STATES (CPUPM_P_STATES | CPUPM_T_STATES)
-
-/*
- * Callbacks used for CPU power management.
- */
-extern void (*cpupm_rebuild_cpu_domains)(void);
-extern void (*cpupm_init_topspeed)(void);
-extern void (*cpupm_redefine_topspeed)(void *);
-extern int (*cpupm_get_topspeed)(void *);
-extern void (*cpupm_set_topspeed)(void *, int);
-
-/*
- * Routines used to manage temporary CPU dependencies.
- */
-extern cpupm_cpu_dependency_t *cpupm_get_cpu_dependencies();
-extern void cpupm_add_cpu2dependency(dev_info_t *, int);
-extern void cpupm_free_cpu_dependencies();
-
-/*
- * Routines to track overall status of CPU power management readiness.
- *
- */
-extern boolean_t cpupm_is_ready();
-extern boolean_t cpupm_is_enabled(uint32_t);
-extern void cpupm_disable(uint32_t);
-extern void cpupm_post_startup();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _CPUPM_H */
diff --git a/usr/src/uts/i86pc/sys/cpupm_mach.h b/usr/src/uts/i86pc/sys/cpupm_mach.h
new file mode 100644
index 0000000000..fbb040f84b
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpupm_mach.h
@@ -0,0 +1,197 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_MACH_H
+#define _CPUPM_MACH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cpuvar.h>
+#include <sys/ksynch.h>
+#include <sys/cpu_pm.h>
+
+/*
+ * CPU power domains
+ */
+typedef struct cpupm_state_domains {
+ struct cpupm_state_domains *pm_next;
+ uint32_t pm_domain;
+ uint32_t pm_type;
+ cpuset_t pm_cpus;
+ kmutex_t pm_lock;
+} cpupm_state_domains_t;
+
+extern cpupm_state_domains_t *cpupm_pstate_domains;
+extern cpupm_state_domains_t *cpupm_tstate_domains;
+extern cpupm_state_domains_t *cpupm_cstate_domains;
+
+/*
+ * Different processor families have their own technologies for supporting
+ * CPU power management (i.e., Intel has Enhanced SpeedStep for some of its
+ * processors and AMD has PowerNow! for some of its processors). We support
+ * these different technologies via modules that export the interfaces
+ * described below.
+ *
+ * If a module implements the technology that should be used to manage
+ * the current CPU device, then its cpus_init() entry point should return
+ * successfully (i.e., a return code of 0) and perform any initialization
+ * such that future power transitions can be performed by calling
+ * the cpus_change() interface. The cpus_fini() interface can then be
+ * used to free any resources allocated by cpus_init().
+ */
+typedef struct cpupm_state_ops {
+ char *cpups_label;
+ int (*cpus_init)(cpu_t *);
+ void (*cpus_fini)(cpu_t *);
+ void (*cpus_change)(cpuset_t, uint32_t);
+} cpupm_state_ops_t;
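A minimal sketch of a technology module filling in cpupm_state_ops_t; the mytech_* names are hypothetical (the real instances in this change are speedstep_ops, pwrnow_ops and cpupm_throttle_ops):

/* Hypothetical technology module; names and bodies are illustrative only. */
static int
mytech_init(cpu_t *cp)
{
	/* Probe the hardware; return 0 only if this CPU is supported. */
	return (0);
}

static void
mytech_fini(cpu_t *cp)
{
	/* Release anything allocated by mytech_init(). */
}

static void
mytech_change(cpuset_t set, uint32_t req_state)
{
	/* Cross call the CPUs in 'set' and program the requested state. */
}

cpupm_state_ops_t mytech_ops = {
	"Hypothetical Technology",
	mytech_init,
	mytech_fini,
	mytech_change
};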
+
+/*
+ * Data kept for each C-state power-domain.
+ */
+typedef struct cma_c_state {
+ uint32_t cs_next_cstate; /* computed best C-state */
+
+ uint32_t cs_cnt; /* times accessed */
+ uint32_t cs_type; /* current ACPI idle type */
+
+ hrtime_t cs_idle_enter; /* entered idle */
+ hrtime_t cs_idle_exit; /* left idle */
+
+ hrtime_t cs_smpl_start; /* accounting sample began */
+ hrtime_t cs_idle; /* time idle */
+ hrtime_t cs_smpl_len; /* sample duration */
+ hrtime_t cs_smpl_idle; /* idle time in last sample */
+ uint64_t cs_smpl_idle_pct; /* % idle time in last smpl */
+
+ hrtime_t cs_C2_latency; /* C2 round trip latency */
+ hrtime_t cs_C3_latency; /* C3 round trip latency */
+} cma_c_state_t;
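A sketch of the relationship the sampling fields above encode; this is not the exact code in cpupm_next_cstate(), just the arithmetic the fields are sized for (the sample_idle_pct() helper is hypothetical):

static uint64_t
sample_idle_pct(cma_c_state_t *cs, hrtime_t now)
{
	hrtime_t len = now - cs->cs_smpl_start;		/* cs_smpl_len */
	hrtime_t idle = cs->cs_idle;			/* cs_smpl_idle */

	/* Percentage of the accounting sample spent idle (cs_smpl_idle_pct). */
	if (len <= 0)
		return (0);
	return ((uint64_t)(idle * 100 / len));
}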
+
+typedef union cma_state {
+ cma_c_state_t *cstate;
+ uint32_t pstate;
+} cma_state_t;
+
+typedef struct cpupm_mach_acpi_state {
+ cpupm_state_ops_t *cma_ops;
+ cpupm_state_domains_t *cma_domain;
+ cma_state_t cma_state;
+} cpupm_mach_acpi_state_t;
+
+typedef struct cpupm_mach_state {
+ void *ms_acpi_handle;
+ cpupm_mach_acpi_state_t ms_pstate;
+ cpupm_mach_acpi_state_t ms_cstate;
+ cpupm_mach_acpi_state_t ms_tstate;
+ uint32_t ms_caps;
+ dev_info_t *ms_dip;
+ kmutex_t ms_lock;
+ struct cpupm_notification *ms_handlers;
+} cpupm_mach_state_t;
+
+/*
+ * Constants used by the Processor Device Notification handler
+ * that identify what kind of change has occurred.
+ */
+#define CPUPM_PPC_CHANGE_NOTIFICATION 0x80
+#define CPUPM_CST_CHANGE_NOTIFICATION 0x81
+#define CPUPM_TPC_CHANGE_NOTIFICATION 0x82
+
+typedef void (*CPUPM_NOTIFY_HANDLER)(void *handle, uint32_t val,
+ void *ctx);
+
+typedef struct cpupm_notification {
+ struct cpupm_notification *nq_next;
+ CPUPM_NOTIFY_HANDLER nq_handler;
+ void *nq_ctx;
+} cpupm_notification_t;
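A sketch of a notification consumer; it assumes registration through cpupm_add_notify_handler(), declared further down in this header, and the handler name and its use of cpupm_get_top_speed() are illustrative:

static void
my_ppc_notify(void *handle, uint32_t val, void *ctx)
{
	cpu_t *cp = ctx;	/* context passed at registration time */

	if (val != CPUPM_PPC_CHANGE_NOTIFICATION)
		return;

	/* _PPC changed: re-evaluate the allowed top P-state for this CPU. */
	(void) cpupm_get_top_speed(cp);
}

/* Registered once per CPU, e.g.: cpupm_add_notify_handler(cp, my_ppc_notify, cp); */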
+
+/*
+ * If any states are added, then make sure to add them to
+ * CPUPM_ALL_STATES.
+ */
+#define CPUPM_NO_STATES 0x00
+#define CPUPM_P_STATES 0x01
+#define CPUPM_T_STATES 0x02
+#define CPUPM_C_STATES 0x04
+#define CPUPM_ALL_STATES (CPUPM_P_STATES \
+ | CPUPM_T_STATES \
+ | CPUPM_C_STATES)
+
+#define CPUPM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid))
+
+/*
+ * An error initializing any of the CPU PM state types results in disabling
+ * CPU power management.
+ */
+#define CPUPM_DISABLE() cpupm_disable(CPUPM_ALL_STATES)
+
+#define CPUPM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000)
+
+/*
+ * Callbacks used for CPU power management.
+ */
+extern void (*cpupm_rebuild_cpu_domains)(void);
+extern void (*cpupm_init_topspeed)(void);
+extern void (*cpupm_redefine_topspeed)(void *);
+extern int (*cpupm_get_topspeed_callb)(void *);
+extern void (*cpupm_set_topspeed_callb)(void *, int);
+
+extern void cpupm_init(cpu_t *);
+extern void cpupm_free(cpu_t *);
+extern boolean_t cpupm_is_ready();
+extern boolean_t cpupm_is_enabled(uint32_t);
+extern void cpupm_disable(uint32_t);
+extern void cpupm_post_startup();
+extern void cpupm_alloc_domains(cpu_t *, int);
+extern void cpupm_free_domains(cpupm_state_domains_t **);
+extern void cpupm_alloc_ms_cstate(cpu_t *cp);
+extern void cpupm_free_ms_cstate(cpu_t *cp);
+extern void cpupm_state_change(cpu_t *, int, int);
+extern id_t cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type);
+extern uint_t cpupm_plat_state_enumerate(cpu_t *, cpupm_dtype_t,
+ cpupm_state_t *);
+extern int cpupm_plat_change_state(cpu_t *, cpupm_state_t *);
+extern uint_t cpupm_get_speeds(cpu_t *, int **);
+extern void cpupm_free_speeds(int *, uint_t);
+extern boolean_t cpupm_power_ready(void);
+extern boolean_t cpupm_throttle_ready(void);
+extern boolean_t cpupm_cstate_ready(void);
+extern void cpupm_add_notify_handler(cpu_t *, CPUPM_NOTIFY_HANDLER, void *);
+extern int cpupm_get_top_speed(cpu_t *);
+extern uint32_t cpupm_next_cstate(cma_c_state_t *, hrtime_t);
+extern void cpupm_idle_cstate_data(cma_c_state_t *, int);
+extern void cpupm_wakeup_cstate_data(cma_c_state_t *, hrtime_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_MACH_H */
diff --git a/usr/src/uts/i86pc/sys/cpupm_throttle.h b/usr/src/uts/i86pc/sys/cpupm_throttle.h
new file mode 100644
index 0000000000..5a607158da
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/cpupm_throttle.h
@@ -0,0 +1,43 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CPUPM_THROTTLE_H
+#define _CPUPM_THROTTLE_H
+
+#include <sys/cpupm.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+cpupm_state_ops_t cpupm_throttle_ops;
+
+extern void cpupm_throttle_manage_notification(void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CPUPM_THROTTLE_H */
diff --git a/usr/src/uts/i86pc/sys/hpet.h b/usr/src/uts/i86pc/sys/hpet.h
new file mode 100644
index 0000000000..1ee9910441
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/hpet.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _HPET_H
+#define _HPET_H
+
+#include <sys/hpet_acpi.h>
+
+/*
+ * Interface for HPET access.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * HPET_INFINITY is used for timers that will never expire.
+ */
+#define HPET_INFINITY (INT64_MAX)
+
+/*
+ * State of initialization.
+ */
+#define HPET_NO_SUPPORT (0)
+#define HPET_TIMER_SUPPORT (1) /* supports main counter reads */
+#define HPET_INTERRUPT_SUPPORT (2) /* supports interrupt/timer */
+#define HPET_FULL_SUPPORT (3) /* supports counter and timer intr */
+
+typedef struct hpet {
+ uint_t supported;
+ boolean_t (*install_proxy)(void);
+ boolean_t (*callback)(int);
+ /*
+ * Next two function pointers allow CPUs to use the HPET's timer
+ * as a proxy for their LAPIC timers which stop during Deep C-State.
+ */
+ boolean_t (*use_hpet_timer)(hrtime_t *);
+ void (*use_lapic_timer)(hrtime_t);
+} hpet_t;
+
+#define CST_EVENT_MULTIPLE_CSTATES (128) /* callbacks for _CST changes */
+#define CST_EVENT_ONE_CSTATE (129)
+
+/*
+ * unix access to the HPET is done through the hpet structure.
+ */
+extern hpet_t hpet;
+
+int hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags);
+void hpet_acpi_fini(void);
+uint32_t hpet_proxy_ipl(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _HPET_H */
diff --git a/usr/src/uts/i86pc/sys/hpet_acpi.h b/usr/src/uts/i86pc/sys/hpet_acpi.h
new file mode 100644
index 0000000000..c85707787e
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/hpet_acpi.h
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _HPET_ACPI_H
+#define _HPET_ACPI_H
+
+#if defined(_KERNEL)
+#include <sys/acpi/acpi.h>
+#include <sys/acpi/actbl1.h>
+#include <sys/acpica.h>
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Solaris uses an HPET Timer to generate interrupts for CPUs in Deep C-state
+ * with stalled LAPIC Timers. All CPUs use one HPET timer. The timer's
+ * interrupt targets one CPU (via the I/O APIC). The one CPU that receives
+ * the HPET's interrupt wakes up other CPUs as needed during the HPET Interrupt
+ * Service Routing. The HPET ISR uses poke_cpus to wake up other CPUs with an
+ * Inter Processor Interrupt.
+ *
+ * Please see the Intel Programmer's guides. Interrupts are disabled before
+ * a CPU Halts into Deep C-state. (This allows CPU-hardware-specific cleanup
+ * before servicing interrupts.) When a Deep C-state CPU wakes up (due to
+ * an externally generated interrupt), it resumes execution where it halted.
+ * The CPU returning from Deep C-state must enable interrupts before it will
+ * handle the pending interrupt that woke it from Deep C-state.
+ *
+ *
+ * HPET bits as defined in the Intel IA-PC HPET Specification Rev 1.0a.
+ *
+ * The physical address space layout of the memory mapped HPET looks like this:
+ *
+ * struct hpet {
+ * uint64_t gen_cap;
+ * uint64_t res1;
+ * uint64_t gen_config;
+ * uint64_t res2;
+ * uint64_t gen_inter_stat;
+ * uint64_t res3;
+ * uint64_t main_counter_value;
+ * uint64_t res4;
+ * struct hpet_timer {
+ * uint64_t config_and_capability;
+ * uint64_t comparator_value;
+ * uint64_t FSB_interrupt_route;
+ * uint64_t reserved;
+ * } timers[32];
+ * }
+ *
+ * There are 32 possible timers in an hpet. Only the first 3 timers are
+ * required. The other 29 timers are optional.
+ *
+ * HPETs can have 64-bit or 32-bit timers. Timers/compare registers can
+ * be 64-bit or 32-bit and can be a mixture of both.
+ * The first two timers are not used. The HPET spec intends the first two
+ * timers to be used as "legacy replacement" for the PIT and RTC timers.
+ *
+ * Solaris uses the first available non-legacy replacement timer as a proxy
+ * timer for processor Local APIC Timers that stop in deep idle C-states.
+ */
+
+/*
+ * We only use HPET table 1 on x86. Typical x86 systems only have 1 HPET.
+ * ACPI allows for multiple HPET tables to describe multiple HPETs.
+ */
+#define HPET_TABLE_1 (1)
+
+/*
+ * HPET Specification 1.0a defines the HPET to occupy 1024 bytes regardless of
+ * the number of counters (3 to 32) in this implementation.
+ */
+#define HPET_SIZE (1024)
+
+/*
+ * Offsets of hpet registers and macros to access them from HPET base address.
+ */
+#define HPET_GEN_CAP_OFFSET (0)
+#define HPET_GEN_CONFIG_OFFSET (0x10)
+#define HPET_GEN_INTR_STAT_OFFSET (0x20)
+#define HPET_MAIN_COUNTER_OFFSET (0xF0)
+#define HPET_TIMER_N_CONF_OFFSET(n) (0x100 + (n * 0x20))
+#define HPET_TIMER_N_COMP_OFFSET(n) (0x108 + (n * 0x20))
+
+#define OFFSET_ADDR(a, o) (((uintptr_t)(a)) + (o))
+#define HPET_GEN_CAP_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_CAP_OFFSET)
+#define HPET_GEN_CONFIG_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_CONFIG_OFFSET)
+#define HPET_GEN_INTR_STAT_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_GEN_INTR_STAT_OFFSET)
+#define HPET_MAIN_COUNTER_ADDRESS(la) \
+ OFFSET_ADDR(la, HPET_MAIN_COUNTER_OFFSET)
+#define HPET_TIMER_N_CONF_ADDRESS(la, n) \
+ OFFSET_ADDR(la, HPET_TIMER_N_CONF_OFFSET(n))
+#define HPET_TIMER_N_COMP_ADDRESS(la, n) \
+ OFFSET_ADDR(la, HPET_TIMER_N_COMP_OFFSET(n))
+
+/*
+ * HPET General Capabilities and ID Register
+ */
+typedef struct hpet_gen_cap {
+ uint32_t counter_clk_period; /* period in femtoseconds */
+ uint32_t vendor_id :16; /* vendor */
+ uint32_t leg_route_cap :1; /* 1=LegacyReplacement support */
+ uint32_t res1 :1; /* reserved */
+ uint32_t count_size_cap :1; /* 0=32bit, 1=64bit wide */
+ uint32_t num_tim_cap :5; /* number of timers -1 */
+ uint32_t rev_id :8; /* revision number */
+} hpet_gen_cap_t;
+
+/*
+ * Macros to parse fields of the hpet General Capabilities and ID Register.
+ */
+#define HPET_GCAP_CNTR_CLK_PERIOD(l) (l >> 32)
+#define HPET_GCAP_VENDOR_ID(l) BITX(l, 31, 16)
+#define HPET_GCAP_LEG_ROUTE_CAP(l) BITX(l, 15, 15)
+#define HPET_GCAP_CNT_SIZE_CAP(l) BITX(l, 13, 13)
+#define HPET_GCAP_NUM_TIM_CAP(l) BITX(l, 12, 8)
+#define HPET_GCAP_REV_ID(l) BITX(l, 7, 0)
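A short sketch of decoding a raw 64-bit read of this register with the macros above; BITX() is the generic bit-extraction macro assumed to be available, and the hpet_print_caps() helper and cmn_err() output are hypothetical:

static void
hpet_print_caps(uint64_t cap)
{
	uint32_t period = HPET_GCAP_CNTR_CLK_PERIOD(cap);	/* fs per tick */
	uint32_t ntimers = HPET_GCAP_NUM_TIM_CAP(cap) + 1;	/* field is N-1 */

	cmn_err(CE_CONT, "HPET rev %u: %u timers, period %u fs, %s counter\n",
	    (uint32_t)HPET_GCAP_REV_ID(cap), ntimers, period,
	    HPET_GCAP_CNT_SIZE_CAP(cap) ? "64-bit" : "32-bit");
}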
+
+/*
+ * From HPET spec "The value in this field must be less than or equal to":
+ */
+#define HPET_MAX_CLK_PERIOD (0x5F5E100)
+
+/*
+ * Femtoseconds in a nanosecond.
+ */
+#if defined(__i386)
+#define HPET_FEMTO_TO_NANO (1000000LL)
+#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period)
+#else
+#define HPET_FEMTO_TO_NANO (1000000L)
+#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period)
+#endif /* (__i386) */
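For example, with hpet_info.period of 69841279 femtoseconds (a 14.318 MHz main counter, chosen only for illustration), a wakeup 1 ms away converts as HRTIME_TO_HPET_TICKS(1000000) = (1000000 * 1000000) / 69841279, roughly 14318 main-counter ticks.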
+
+/*
+ * HPET General Configuration Register
+ */
+typedef struct hpet_gen_config_bitfield {
+ uint32_t leg_rt_cnf :1; /* legacy replacement route */
+ uint32_t enable_cnf :1; /* overall enable */
+} hpet_gen_conf_t;
+
+/*
+ * General Configuration Register fields.
+ */
+#define HPET_GCFR_LEG_RT_CNF (0x2) /* bit field value */
+#define HPET_GCFR_ENABLE_CNF (0x1) /* bit field value */
+#define HPET_GCFR_LEG_RT_CNF_BITX(l) BITX(l, 1, 1)
+#define HPET_GCFR_ENABLE_CNF_BITX(l) BITX(l, 0, 0)
+
+/*
+ * General Interrupt Status Register.
+ */
+#define HPET_GIS_T2_INT_STS(l) BITX(l, 2, 2)
+#define HPET_GIS_T1_INT_STS(l) BITX(l, 1, 1)
+#define HPET_GIS_T0_INT_STS(l) BITX(l, 0, 0)
+#define HPET_GIS_TN_INT_STS(l, n) BITX(l, n, n)
+
+#define HPET_INTR_STATUS_MASK(timer) ((uint64_t)1 << (timer))
+
+/*
+ * HPET Timer N Configuration and Capabilities Register
+ */
+typedef struct hpet_TN_conf_cap {
+ uint32_t int_route_cap; /* available I/O APIC interrupts */
+ uint32_t res1 :16; /* reserved */
+ uint32_t fsb_int_del_cap :1; /* FSB interrupt supported */
+ uint32_t fsb_int_en_cnf :1; /* Set FSB intr delivery */
+ uint32_t int_route_cnf :5; /* I/O APIC interrupt to use */
+ uint32_t mode32_cnf :1; /* Force 32-bit mode */
+ uint32_t res2 :1; /* reserved */
+ uint32_t val_set_cnf :1; /* Set periodic mode accumulator */
+ uint32_t size_cap :1; /* 1=64bit, 0=32bit timer */
+ uint32_t per_int_cap :1; /* 1=periodic mode supported */
+ uint32_t type_cnf :1; /* Enable periodic mode */
+ uint32_t int_enb_cnf :1; /* Enable interrupt generation */
+ uint32_t int_type_cnf :1; /* 0=edge, 1=level triggered */
+ uint32_t res3 :1; /* reserved */
+} hpet_TN_conf_cap_t;
+
+/*
+ * There are 3 to 32 timers on each HPET.
+ */
+#define HPET_TIMER_N_INT_ROUTE_CAP(l) (l >> 32)
+#define HPET_TIMER_N_INT_TYPE_CNF(l) BITX(l, 1, 1)
+#define HPET_TIMER_N_INT_ENB_CNF(l) BITX(l, 2, 2)
+#define HPET_TIMER_N_TYPE_CNF(l) BITX(l, 3, 3)
+#define HPET_TIMER_N_PER_INT_CAP(l) BITX(l, 4, 4)
+#define HPET_TIMER_N_SIZE_CAP(l) BITX(l, 5, 5)
+#define HPET_TIMER_N_VAL_SET_CNF(l) BITX(l, 6, 6)
+#define HPET_TIMER_N_MODE32_CNF(l) BITX(l, 8, 8)
+#define HPET_TIMER_N_INT_ROUTE_CNF(l) BITX(l, 13, 9)
+#define HPET_TIMER_N_FSB_EN_CNF(l) BITX(l, 14, 14)
+#define HPET_TIMER_N_FSB_INT_DEL_CAP(l) BITX(l, 15, 15)
+
+#define HPET_TIMER_N_INT_TYPE_CNF_BIT (1 << 1)
+#define HPET_TIMER_N_INT_ENB_CNF_BIT (1 << 2)
+#define HPET_TIMER_N_TYPE_CNF_BIT (1 << 3)
+#define HPET_TIMER_N_FSB_EN_CNF_BIT (1 << 14)
+#define HPET_TIMER_N_INT_ROUTE_SHIFT(i) (i << 9)
+
+/*
+ * HPET Spec reserves timers 0 and 1 for legacy timer replacement (PIT and RTC).
+ * Available timers for other use such as LAPIC proxy during Deep C-State
+ * start at timer 2.
+ */
+#define HPET_FIRST_NON_LEGACY_TIMER (2)
+
+/*
+ * HPET timer and interrupt used as LAPIC proxy during deep C-State.
+ */
+typedef struct cstate_timer {
+ int timer;
+ int intr;
+} cstate_timer_t;
+
+/*
+ * Data structure of useful HPET device information.
+ */
+typedef struct hpet_info {
+ hpet_gen_cap_t gen_cap;
+ hpet_gen_conf_t gen_config;
+ uint64_t gen_intrpt_stat;
+ uint64_t main_counter_value;
+ void *logical_address; /* HPET VA memory map */
+ hpet_TN_conf_cap_t *timer_n_config; /* N Timer config and cap */
+ uint32_t num_timers; /* number of timers */
+ uint32_t allocated_timers; /* bitmap of timers in use */
+ cstate_timer_t cstate_timer; /* HPET Timer used for LAPIC proxy */
+ uint64_t hpet_main_counter_reads[2];
+ hrtime_t tsc[3];
+ hrtime_t period; /* counter_clk_period in Femto Secs */
+} hpet_info_t;
+
+#if defined(_KERNEL)
+
+/*
+ * Spin mutexes are used in several places because idle threads cannot block.
+ * These defines provide a mechanism to break out of spin loops to prevent
+ * system hangs if a CPU can never get the lock (due to an unknown
+ * hardware/software bug). A timeout of 100 microseconds was chosen after extensive stress
+ * testing.
+ */
+#define HPET_SPIN_CHECK (1000)
+#define HPET_SPIN_TIMEOUT (100000)
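A sketch of the bounded spin these constants support: re-check the elapsed time every HPET_SPIN_CHECK iterations and give up once HPET_SPIN_TIMEOUT nanoseconds (100 microseconds) have passed. The hpet_spin_lock() helper is hypothetical and stands in for the real callers:

static boolean_t
hpet_spin_lock(kmutex_t *mp)
{
	hrtime_t start = gethrtime();
	uint_t spins = 0;

	while (!mutex_tryenter(mp)) {
		if (++spins < HPET_SPIN_CHECK)
			continue;
		spins = 0;
		/* Consult the clock only every HPET_SPIN_CHECK iterations. */
		if (gethrtime() - start > HPET_SPIN_TIMEOUT)
			return (B_FALSE);	/* bail out; never hang */
	}
	return (B_TRUE);
}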
+
+/*
+ * There is one of these per CPU using the HPET as a proxy for its stalled
+ * local APIC while in c-state >= C2.
+ */
+typedef hrtime_t hpet_proxy_t;
+
+extern ACPI_TABLE_HPET *hpet_table;
+extern hpet_info_t hpet_info;
+
+static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags);
+static boolean_t hpet_install_proxy(void);
+static boolean_t hpet_callback(int code);
+static boolean_t hpet_cpr(int code);
+static boolean_t hpet_resume(void);
+static void hpet_cst_callback(uint32_t code);
+static boolean_t hpet_deep_idle_config(int code);
+static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table);
+static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len);
+static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table);
+static int hpet_start_main_counter(hpet_info_t *hip);
+static int hpet_stop_main_counter(hpet_info_t *hip);
+static uint64_t hpet_read_main_counter_value(hpet_info_t *hip);
+static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value);
+static uint64_t hpet_read_gen_cap(hpet_info_t *hip);
+static uint64_t hpet_read_gen_config(hpet_info_t *hip);
+static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip);
+static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n);
+static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf);
+static uint64_t hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n);
+static void hpet_write_gen_cap(hpet_info_t *hip, uint64_t l);
+static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l);
+static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l);
+static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l);
+static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l);
+static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n);
+static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n);
+static void hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l);
+static int hpet_get_FSB_intr_capable_timer(hpet_info_t *hip, uint32_t mask);
+static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip);
+static int hpet_timer_available(uint32_t allocated_timers, uint32_t n);
+static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n);
+static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n,
+ uint32_t interrupt);
+static uint_t hpet_isr(char *arg);
+static uint32_t hpet_install_interrupt_handler(uint_t (*func)(char *),
+ int vector);
+static void hpet_uninstall_interrupt_handler(void);
+static void hpet_expire_all(void);
+static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time);
+static boolean_t hpet_use_hpet_timer(hrtime_t *expire);
+static void hpet_use_lapic_timer(hrtime_t expire);
+static void hpet_init_proxy_data(void);
+
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _HPET_ACPI_H */
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index 75654e16d7..415e71533e 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_MACHCPUVAR_H
#define _SYS_MACHCPUVAR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -128,14 +126,21 @@ struct machcpu {
struct xen_evt_data *mcpu_evt_pend; /* hypervisor: pending events */
volatile uint32_t *mcpu_mwait; /* MONITOR/MWAIT buffer */
+ void (*mcpu_idle_cpu)(void); /* idle function */
+ uint16_t mcpu_idle_type; /* CPU next idle type */
+ uint16_t max_cstates; /* supported max cstates */
+ uint32_t curr_cstate; /* current cstate */
struct cpu_ucode_info *mcpu_ucode_info;
+
+ void *mcpu_pm_mach_state;
};
#define NINTR_THREADS (LOCK_LEVEL-1) /* number of interrupt threads */
#define MWAIT_HALTED (1) /* mcpu_mwait set when halting */
#define MWAIT_RUNNING (0) /* mcpu_mwait set to wakeup */
-#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING);
+#define MWAIT_WAKEUP_IPI (2) /* need IPI to wakeup */
+#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING)
#endif /* _ASM */
diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h
index 0cd65e12e6..feebea3f6c 100644
--- a/usr/src/uts/i86pc/sys/machsystm.h
+++ b/usr/src/uts/i86pc/sys/machsystm.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -102,6 +102,14 @@ extern void trap(struct regs *, caddr_t, processorid_t);
extern void do_interrupt(struct regs *, trap_trace_rec_t *);
extern void memscrub_disable(void);
+/*
+ * Dispatcher hooks.
+ */
+void (*idle_cpu)();
+void (*non_deep_idle_cpu)();
+void (*disp_enq_thread)(cpu_t *, int);
+void (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
+
#ifndef __xpv
extern unsigned int microdata;
#endif
diff --git a/usr/src/uts/i86pc/sys/pwrnow.h b/usr/src/uts/i86pc/sys/pwrnow.h
index 1e3cc24e3f..b010964290 100644
--- a/usr/src/uts/i86pc/sys/pwrnow.h
+++ b/usr/src/uts/i86pc/sys/pwrnow.h
@@ -19,22 +19,22 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _PWRNOW_H
#define _PWRNOW_H
-#include <sys/cpudrv_mach.h>
+#include <sys/cpupm.h>
#ifdef __cplusplus
extern "C" {
#endif
-boolean_t pwrnow_supported();
+extern boolean_t pwrnow_supported();
-cpudrv_pstate_ops_t pwrnow_ops;
+extern cpupm_state_ops_t pwrnow_ops;
#ifdef __cplusplus
}
diff --git a/usr/src/uts/i86pc/sys/speedstep.h b/usr/src/uts/i86pc/sys/speedstep.h
index f9debb2758..e2dfeba023 100644
--- a/usr/src/uts/i86pc/sys/speedstep.h
+++ b/usr/src/uts/i86pc/sys/speedstep.h
@@ -19,22 +19,22 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SPEEDSTEP_H
#define _SPEEDSTEP_H
-#include <sys/cpudrv_mach.h>
+#include <sys/cpupm.h>
#ifdef __cplusplus
extern "C" {
#endif
-boolean_t speedstep_supported(uint_t, uint_t);
+extern boolean_t speedstep_supported(uint_t, uint_t);
-cpudrv_pstate_ops_t speedstep_ops;
+extern cpupm_state_ops_t speedstep_ops;
#ifdef __cplusplus
}
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index 546549e603..9209bd604c 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -44,6 +44,7 @@ CORE_OBJS += \
cpuid.o \
cpuid_subr.o \
cpupm.o \
+ cpupm_mach.o \
dis_tables.o \
ddi_impl.o \
dtrace_subr.o \
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index 6d2fce1635..3b050716c0 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1315,6 +1315,26 @@ fcnname/**/_info: \
END_MODULE(dcopy);
#endif
+/*
+ * Stubs for acpica
+ */
+#ifndef ACPICA_MODULE
+ MODULE(acpica,misc);
+ NO_UNLOAD_STUB(acpica, AcpiOsReadPort, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiOsWritePort, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiInstallNotifyHandler, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiRemoveNotifyHandler, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiEvaluateObject, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiEvaluateObjectTyped, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiSetRegister, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiGetRegister, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, AcpiOsFree, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, acpica_get_handle_cpu, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, acpica_get_global_FADT, nomod_minus_one) ;
+ NO_UNLOAD_STUB(acpica, __acpi_wbinvd, nomod_minus_one) ;
+ END_MODULE(acpica);
+#endif
+
#ifndef IPNET_MODULE
MODULE(ipnet,drv);
STUB(ipnet, ipnet_if_getdev, nomod_zero);
diff --git a/usr/src/uts/intel/io/acpica/osl.c b/usr/src/uts/intel/io/acpica/osl.c
index 41f85c9bdc..45edf50026 100644
--- a/usr/src/uts/intel/io/acpica/osl.c
+++ b/usr/src/uts/intel/io/acpica/osl.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -474,8 +474,16 @@ ACPI_CPU_FLAGS
AcpiOsAcquireLock(ACPI_HANDLE Handle)
{
- mutex_enter((kmutex_t *)Handle);
- return (0);
+
+ if (Handle == NULL)
+ return (AE_BAD_PARAMETER);
+
+ if (curthread == CPU->cpu_idle_thread) {
+ while (!mutex_tryenter((kmutex_t *)Handle))
+ /* spin */;
+ } else
+ mutex_enter((kmutex_t *)Handle);
+ return (AE_OK);
}
void
@@ -1365,24 +1373,8 @@ acpica_add_processor_to_map(UINT32 acpi_id, ACPI_HANDLE obj)
* Return the ACPI device node matching the CPU dev_info node.
*/
ACPI_STATUS
-acpica_get_handle_cpu(dev_info_t *dip, ACPI_HANDLE *rh)
+acpica_get_handle_cpu(int cpu_id, ACPI_HANDLE *rh)
{
- char *device_type_prop;
- int cpu_id;
-
- /*
- * if "device_type" != "cpu", error
- */
- if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
- "device_type", &device_type_prop) != DDI_PROP_SUCCESS)
- return (AE_ERROR);
-
- if (strcmp("cpu", device_type_prop) != 0) {
- ddi_prop_free(device_type_prop);
- return (AE_ERROR);
- }
- ddi_prop_free(device_type_prop);
-
/*
* if cpu_map itself is NULL, we're a uppc system and
* acpica_build_processor_map() hasn't been called yet.
@@ -1394,19 +1386,10 @@ acpica_get_handle_cpu(dev_info_t *dip, ACPI_HANDLE *rh)
return (AE_ERROR);
}
- /*
- * get 'reg' and get obj from cpu_map
- */
- cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
- "reg", -1);
if ((cpu_id < 0) || (cpu_map[cpu_id] == NULL) ||
(cpu_map[cpu_id]->obj == NULL))
return (AE_ERROR);
- /*
- * tag devinfo and obj
- */
- (void) acpica_tag_devinfo(dip, cpu_map[cpu_id]->obj);
*rh = cpu_map[cpu_id]->obj;
return (AE_OK);
}
@@ -1689,7 +1672,7 @@ acpica_get_handle(dev_info_t *dip, ACPI_HANDLE *rh)
if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
"acpi-namespace", &acpiname) != DDI_PROP_SUCCESS) {
- return (acpica_get_handle_cpu(dip, rh));
+ return (AE_ERROR);
}
status = AcpiGetHandle(NULL, acpiname, rh);
@@ -1793,3 +1776,9 @@ acpica_build_processor_map()
ASSERT(status == AE_OK);
cpu_map_built = 1;
}
+
+void
+acpica_get_global_FADT(ACPI_TABLE_FADT **gbl_FADT)
+{
+ *gbl_FADT = &AcpiGbl_FADT;
+}
diff --git a/usr/src/uts/intel/sys/acpica.h b/usr/src/uts/intel/sys/acpica.h
index 8b3e1206c3..dddcc9bf78 100644
--- a/usr/src/uts/intel/sys/acpica.h
+++ b/usr/src/uts/intel/sys/acpica.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ACPICA_H
#define _SYS_ACPICA_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -125,11 +123,13 @@ extern ACPI_STATUS acpica_get_sci(int *, iflag_t *);
extern int acpica_get_bdf(dev_info_t *, int *, int *, int *);
extern ACPI_STATUS acpica_get_devinfo(ACPI_HANDLE, dev_info_t **);
extern ACPI_STATUS acpica_get_handle(dev_info_t *, ACPI_HANDLE *);
+extern ACPI_STATUS acpica_get_handle_cpu(int, ACPI_HANDLE *);
extern ACPI_STATUS acpica_eval_int(ACPI_HANDLE, char *, int *);
extern void acpica_map_cpu(processorid_t, UINT32);
extern void acpica_build_processor_map();
extern void acpica_ddi_save_resources(dev_info_t *);
extern void acpica_ddi_restore_resources(dev_info_t *);
+extern void acpica_get_global_FADT(ACPI_TABLE_FADT **);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index 369dff14db..c5a88a30f3 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -357,6 +357,11 @@ extern "C" {
"\10mmx\7cmov\6de\5pge\4mtrr\3msr\2tsc\1lgpg"
/*
+ * Intel Deep C-State invariant TSC in leaf 0x80000007.
+ */
+#define CPUID_TSC_CSTATE_INVARIANCE (0x100)
+
+/*
* x86_type is a legacy concept; this is supplanted
* for most purposes by x86_feature; modern CPUs
* should be X86_TYPE_OTHER
@@ -605,6 +610,7 @@ extern uint_t cpuid_get_dtlb_nent(struct cpu *, size_t);
#if !defined(__xpv)
extern uint32_t *cpuid_mwait_alloc(struct cpu *);
extern void cpuid_mwait_free(struct cpu *);
+extern int cpuid_deep_cstates_supported(void);
#endif
struct cpu_ucode_info;
diff --git a/usr/src/uts/sun4/Makefile.files b/usr/src/uts/sun4/Makefile.files
index 71e4dd6ee6..f532ad10ad 100644
--- a/usr/src/uts/sun4/Makefile.files
+++ b/usr/src/uts/sun4/Makefile.files
@@ -20,11 +20,9 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This Makefile defines all file modules for the directory uts/sun4
# and it's children. These are the source files which are common
# between sun4u and sun4r.
@@ -38,6 +36,7 @@ CORE_OBJS += bus_func.o
CORE_OBJS += cbe.o
CORE_OBJS += confunix.o
CORE_OBJS += copy.o
+CORE_OBJS += cpupm_mach.o
CORE_OBJS += cpu_states.o
CORE_OBJS += ddi_impl.o
CORE_OBJS += dmv.o
diff --git a/usr/src/uts/sun4/os/cpupm_mach.c b/usr/src/uts/sun4/os/cpupm_mach.c
new file mode 100644
index 0000000000..3d041c26ab
--- /dev/null
+++ b/usr/src/uts/sun4/os/cpupm_mach.c
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cpu_pm.h>
+
+/*
+ * CPU PM interfaces exposed to the CPU power manager
+ */
+/*ARGSUSED*/
+id_t
+cpupm_plat_domain_id(struct cpu *cp, cpupm_dtype_t type)
+{
+ return (CPUPM_NO_DOMAIN);
+}
+
+/*ARGSUSED*/
+uint_t
+cpupm_plat_state_enumerate(struct cpu *cp, cpupm_dtype_t type,
+ cpupm_state_t *states)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+cpupm_plat_change_state(struct cpu *cp, cpupm_state_t *state)
+{
+ return (-1);
+}
diff --git a/usr/src/uts/sun4/os/mlsetup.c b/usr/src/uts/sun4/os/mlsetup.c
index db8066c8ca..4d6b244bd2 100644
--- a/usr/src/uts/sun4/os/mlsetup.c
+++ b/usr/src/uts/sun4/os/mlsetup.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
@@ -216,6 +214,8 @@ mlsetup(struct regs *rp, kfpu_t *fp)
cpu_vm_data_init(CPU);
+ pg_cpu_bootstrap(CPU);
+
(void) prom_set_preprom(kern_splr_preprom);
(void) prom_set_postprom(kern_splx_postprom);
PRM_INFO("mlsetup: now ok to call prom_printf");
diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c
index 1459eb1ce4..fb1f5168b0 100644
--- a/usr/src/uts/sun4/os/startup.c
+++ b/usr/src/uts/sun4/os/startup.c
@@ -94,6 +94,7 @@ extern void memseg_remap_init(void);
extern void mach_kpm_init(void);
extern void pcf_init();
extern int size_pse_array(pgcnt_t, int);
+extern void pg_init();
/*
* External Data:
@@ -2222,6 +2223,8 @@ post_startup(void)
maxmem = freemem;
+ pg_init();
+
#ifdef PTL1_PANIC_DEBUG
init_ptl1_thread();
#endif /* PTL1_PANIC_DEBUG */
diff --git a/usr/src/uts/i86pc/sys/cpudrv_throttle.h b/usr/src/uts/sun4/sys/cpupm_mach.h
index ae4d352c14..4b7e6d01cc 100644
--- a/usr/src/uts/i86pc/sys/cpudrv_throttle.h
+++ b/usr/src/uts/sun4/sys/cpupm_mach.h
@@ -19,23 +19,24 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#ifndef _CPUDRV_THROTTLE_H
-#define _CPUDRV_THROTTLE_H
-
-#include <sys/cpudrv_mach.h>
+#ifndef _CPUPM_MACH_H
+#define _CPUPM_MACH_H
#ifdef __cplusplus
extern "C" {
#endif
-cpudrv_tstate_ops_t cpudrv_throttle_ops;
+/*
+ * Convert speed to Hz.
+ */
+#define CPUPM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
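For example, CPUPM_SPEED_HZ(1200, 2) = (1200 * 1000000) / 2 = 600000000 Hz, i.e. a 1200 MHz processor running at a clock divisor of 2 (numbers chosen for illustration).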
#ifdef __cplusplus
}
#endif
-#endif /* _CPUDRV_THROTTLE_H */
+#endif /* _CPUPM_MACH_H */
diff --git a/usr/src/uts/sun4u/Makefile.files b/usr/src/uts/sun4u/Makefile.files
index 2e05b61c1e..15f7e7d22a 100644
--- a/usr/src/uts/sun4u/Makefile.files
+++ b/usr/src/uts/sun4u/Makefile.files
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# This Makefile defines all file modules for the directory uts/sun4u
@@ -36,6 +36,7 @@ CORE_OBJS += bootops.o
CORE_OBJS += cmp.o
CORE_OBJS += cpc_hwreg.o
CORE_OBJS += cpc_subr.o
+CORE_OBJS += cpupm.o
CORE_OBJS += mach_cpu_states.o
CORE_OBJS += mach_ddi_impl.o
CORE_OBJS += ecc.o
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c
index 00494d8cbc..9784a2338a 100644
--- a/usr/src/uts/sun4u/cpu/spitfire.c
+++ b/usr/src/uts/sun4u/cpu/spitfire.c
@@ -2904,8 +2904,7 @@ cpu_change_speed(uint64_t new_divisor, uint64_t arg2)
CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, new_divisor);
}
CPU->cpu_m.divisor = (uchar_t)new_divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / new_divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / new_divisor);
#endif
}
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetah.c b/usr/src/uts/sun4u/cpu/us3_cheetah.c
index eadaebc099..c8290750bf 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetah.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetah.c
@@ -570,8 +570,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
reg |= bceclk->mask;
set_safari_config(reg);
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
index 7cda4df713..b421e74b37 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
@@ -774,8 +774,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
reg |= bceclk->mask;
set_safari_config(reg);
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
index bb0cb0c961..9dd046086a 100644
--- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c
+++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
@@ -792,8 +792,8 @@ cpu_change_speed(uint64_t divisor, uint64_t arg2)
(void) get_mcu_ctl_reg1();
}
CPU->cpu_m.divisor = (uchar_t)divisor;
- CPU->cpu_curr_clock =
- (((uint64_t)pi->pi_clock * 1000000) / divisor);
+ cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) /
+ divisor);
return;
}
/*
diff --git a/usr/src/uts/sun4u/io/cpudrv_mach.c b/usr/src/uts/sun4u/io/cpudrv_mach.c
index a9ca3debb4..c6129f64d8 100644
--- a/usr/src/uts/sun4u/io/cpudrv_mach.c
+++ b/usr/src/uts/sun4u/io/cpudrv_mach.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -28,16 +28,15 @@
*/
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <sys/cpupm.h>
#include <sys/cpudrv_mach.h>
#include <sys/machsystm.h>
-boolean_t cpudrv_enabled = B_TRUE;
-
/*
* Change CPU speed.
*/
int
-cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
+cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
{
xc_one(cpudsp->cpu_id, (xcfunc_t *)cpu_change_speed, \
(uint64_t)new_spd->speed, 0);
@@ -48,7 +47,7 @@ cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd)
* Determine the cpu_id for the CPU device.
*/
boolean_t
-cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
+cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
{
return (dip_to_cpu_id(dip, cpu_id) == DDI_SUCCESS);
}
@@ -57,7 +56,7 @@ cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
* A noop for this machine type.
*/
boolean_t
-cpudrv_pm_power_ready(void)
+cpudrv_power_ready(void)
{
return (B_TRUE);
}
@@ -67,7 +66,7 @@ cpudrv_pm_power_ready(void)
*/
/* ARGSUSED */
boolean_t
-cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
+cpudrv_is_governor_thread(cpudrv_pm_t *cpupm)
{
return (B_FALSE);
}
@@ -77,26 +76,31 @@ cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm)
*/
/*ARGSUSED*/
boolean_t
-cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp)
+cpudrv_mach_init(cpudrv_devstate_t *cpudsp)
{
return (B_TRUE);
}
/*
- * A noop for this machine type.
+ * On SPARC all instances support power management unless attach fails.
+ * In the case of attach failure, cpudrv_enabled will be false.
*/
/*ARGSUSED*/
-void
-cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp)
+boolean_t
+cpudrv_is_enabled(cpudrv_devstate_t *cpudsp)
{
+ return (cpudrv_enabled);
}
-/*
- * On SPARC all instances support power management unless attach fails.
- * In the case of attach failure, cpupm_enabled will be false.
- */
-boolean_t
-cpudrv_pm_enabled()
+void
+cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp)
{
- return (B_TRUE);
+ int *speeds;
+ uint_t nspeeds;
+
+ CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
+ if (nspeeds == 0)
+ return;
+ cpupm_set_supp_freqs(cpudsp->cp, speeds, nspeeds);
+ CPUDRV_FREE_SPEEDS(speeds, nspeeds);
}
diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c
index c44d8067ee..8ba9aa3b6e 100644
--- a/usr/src/uts/sun4u/os/cmp.c
+++ b/usr/src/uts/sun4u/os/cmp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
@@ -224,10 +222,16 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
}
}
-int
-pg_plat_hw_level(pghw_type_t hw)
+/*
+ * Rank the relative importance of optimizing for hw1 or hw2
+ */
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
int i;
+ int rank1 = 0;
+ int rank2 = 0;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_CHIP,
@@ -236,40 +240,28 @@ pg_plat_hw_level(pghw_type_t hw)
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_CHIP)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+/* ARGSUSED */
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+	/* Accept the default policies */
+ return (CMT_NO_POLICY);
}
id_t
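
The net effect of replacing pg_plat_hw_level() with pg_plat_hw_rank() is that the platform, not the caller, decides which of two sharing relationships is more important: a larger index in hw_hier[] wins, and on a tie the second argument is returned. A sketch of the before/after calling pattern (illustrative only, not code from this commit; example_pick_hw() is an invented name):

#include <sys/pghw.h>

/*
 * Old pattern: callers compared integer levels themselves, e.g.
 *
 *	winner = (pg_plat_hw_level(hw1) > pg_plat_hw_level(hw2)) ?
 *	    hw1 : hw2;
 *
 * New pattern: the platform answers the question directly.
 */
static pghw_type_t
example_pick_hw(pghw_type_t hw1, pghw_type_t hw2)
{
	/* whichever relationship is more important to optimize for */
	return (pg_plat_hw_rank(hw1, hw2));
}
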
diff --git a/usr/src/uts/sun4u/os/mach_startup.c b/usr/src/uts/sun4u/os/mach_startup.c
index de59d089fc..0484b9b049 100644
--- a/usr/src/uts/sun4u/os/mach_startup.c
+++ b/usr/src/uts/sun4u/os/mach_startup.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -403,7 +403,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
}
void
-mach_cpu_halt_idle()
+mach_cpu_halt_idle(void)
{
if (enable_halt_idle_cpus) {
if (&cpu_halt_cpu) {
diff --git a/usr/src/uts/sun4u/sys/cpudrv_mach.h b/usr/src/uts/sun4u/sys/cpudrv_mach.h
index 617e35b290..f1714fc695 100644
--- a/usr/src/uts/sun4u/sys/cpudrv_mach.h
+++ b/usr/src/uts/sun4u/sys/cpudrv_mach.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,44 +38,32 @@ extern "C" {
* take cross calls (cross calls fail silently if CPU is not ready
* for it).
*/
-#define CPUDRV_PM_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid))
-
-/*
- * If a failure occurs during attach(), then CPU power management
- * is disabled.
- */
-extern boolean_t cpudrv_enabled;
-
-#define CPUDRV_PM_DISABLE() (cpudrv_enabled = B_FALSE)
-
-#define CPUDRV_PM_DISABLED() (!cpudrv_enabled)
-
-#define CPUDRV_PM_POWER_ENABLED(cpudsp) cpudrv_pm_enabled()
+#define CPUDRV_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid))
/*
 * Currently, there is no governor on sun4u.
*/
-#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm)
+#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm)
/*
* Currently, there is no need for a handler on sun4u.
*/
-#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip)
+#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpuid)
/*
* Topspeed is always the head speed.
*/
-#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->head_spd
+#define CPUDRV_TOPSPEED(cpupm) (cpupm)->head_spd
/*
* There is no notion of changing topspeed on sun4u.
*/
-#define CPUDRV_PM_REDEFINE_TOPSPEED(dip)
+#define CPUDRV_REDEFINE_TOPSPEED(dip)
/*
* There are no PPM callbacks for sun4u.
*/
-#define CPUDRV_PM_SET_PPM_CALLBACKS()
+#define CPUDRV_SET_PPM_CALLBACKS()
/*
* clock-divisors property tells the supported speeds
@@ -84,33 +72,36 @@ extern boolean_t cpudrv_enabled;
* property value of "1, 2, 32" represents full, 1/2 and 1/32
* speeds.
*/
-#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) { \
+#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) { \
if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, cpudsp->dip, \
DDI_PROP_DONTPASS, "clock-divisors", &speeds, \
&nspeeds) != DDI_PROP_SUCCESS) { \
- DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: " \
+ nspeeds = 0; \
+ DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: " \
"clock-divisors property not defined\n", \
- ddi_get_instance(cpudsp->dip))); \
- return (DDI_FAILURE); \
+ ddi_get_instance(cpudsp->dip))); \
} \
}
-#define CPUDRV_PM_FREE_SPEEDS(speeds, unused) ddi_prop_free(speeds);
+#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) { \
+ if (nspeeds > 0) \
+ ddi_prop_free(speeds); \
+}
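
Behavioral note: a CPU node without a "clock-divisors" property no longer causes an early DDI_FAILURE; CPUDRV_GET_SPEEDS() now just reports nspeeds == 0 and CPUDRV_FREE_SPEEDS() frees the property buffer only if something was looked up. A sketch of the resulting calling pattern (cpudrv_example_set_freqs() is an invented name; the real caller is cpudrv_set_supp_freqs() shown earlier in this diff):

#include <sys/cpupm.h>
#include <sys/cpudrv.h>
#include <sys/cpudrv_mach.h>

static void
cpudrv_example_set_freqs(cpudrv_devstate_t *cpudsp)
{
	int	*speeds;
	uint_t	nspeeds;

	CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
	if (nspeeds == 0)
		return;		/* property absent: nothing to advertise */
	cpupm_set_supp_freqs(cpudsp->cp, speeds, nspeeds);
	CPUDRV_FREE_SPEEDS(speeds, nspeeds);
}
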
/*
* Convert speed to Hz.
*/
-#define CPUDRV_PM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
+#define CPUDRV_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor)
/*
* Compute the idle cnt percentage for a given speed.
*/
-#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \
(100 - ((100 - hwm) * speeds[i]))
/*
* Compute the user cnt percentage for a given speed.
*/
-#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \
+#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \
((hwm * speeds[i - 1]) / speeds[i])
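
A worked example of the arithmetic (all numbers hypothetical, not taken from this change): for a 1200 MHz CPU with clock-divisors of 1, 2 and 32 and an idle high-water mark of 98, CPUDRV_SPEED_HZ(1200, 2) is 600000000, CPUDRV_IDLE_CNT_PERCENT(98, speeds, 1) is 96 and CPUDRV_USER_CNT_PERCENT(98, speeds, 1) is 49. The same arithmetic as a small standalone program:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Same expressions as the kernel macros, restated for illustration. */
#define	SPEED_HZ(mhz, divisor)		(((uint64_t)(mhz) * 1000000) / (divisor))
#define	IDLE_CNT_PERCENT(hwm, speeds, i)	(100 - ((100 - (hwm)) * (speeds)[i]))
#define	USER_CNT_PERCENT(hwm, speeds, i)	(((hwm) * (speeds)[i - 1]) / (speeds)[i])

int
main(void)
{
	int speeds[] = { 1, 2, 32 };	/* hypothetical clock-divisors */
	int hwm = 98;			/* hypothetical high-water mark */

	(void) printf("%" PRIu64 "\n", SPEED_HZ(1200, speeds[1]));	/* 600000000 */
	(void) printf("%d\n", IDLE_CNT_PERCENT(hwm, speeds, 1));	/* 96 */
	(void) printf("%d\n", USER_CNT_PERCENT(hwm, speeds, 1));	/* 49 */
	return (0);
}
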
/*
@@ -128,23 +119,21 @@ extern boolean_t cpudrv_enabled;
* digits for power level + '=' + '1/' + digits for speed +
* description text + '\0'
*/
-#define CPUDRV_PM_COMP_NORMAL "Normal"
-#define CPUDRV_PM_COMP_OTHER " of Normal"
-#define CPUDRV_PM_COMP_SIZE() \
- (CPUDRV_PM_COMP_MAX_DIG + 1 + 2 + CPUDRV_PM_COMP_MAX_DIG + \
- sizeof (CPUDRV_PM_COMP_OTHER) + 1);
-#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) \
+#define CPUDRV_COMP_NORMAL "Normal"
+#define CPUDRV_COMP_OTHER " of Normal"
+#define CPUDRV_COMP_SIZE() \
+ (CPUDRV_COMP_MAX_DIG + 1 + 2 + CPUDRV_COMP_MAX_DIG + \
+ sizeof (CPUDRV_COMP_OTHER) + 1);
+#define CPUDRV_COMP_SPEED(cpupm, cur_spd) \
((cur_spd == cpupm->head_spd) ? cur_spd->pm_level : cur_spd->speed)
-#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \
+#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \
if (cur_spd == cpupm->head_spd) \
- (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_PM_COMP_NORMAL);\
+ (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_COMP_NORMAL);\
else \
(void) sprintf(pmc, "%d=1/%d%s", cur_spd->pm_level, \
- comp_spd, CPUDRV_PM_COMP_OTHER); \
+ comp_spd, CPUDRV_COMP_OTHER); \
}
-extern boolean_t cpudrv_pm_enabled(void);
-
#ifdef __cplusplus
}
#endif
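
For context, these macros build the driver's pm-components strings. For a hypothetical three-speed CPU with clock-divisors 1, 2 and 32 and power levels 3, 2 and 1, the entries come out as "3=Normal", "2=1/2 of Normal" and "1=1/32 of Normal". The fragment below mimics CPUDRV_COMP_SPRINT() in userland to show the formatting (values invented for illustration):

#include <stdio.h>

int
main(void)
{
	char	buf[32];
	int	levels[] = { 3, 2, 1 };		/* hypothetical pm levels */
	int	divisors[] = { 1, 2, 32 };	/* hypothetical clock-divisors */
	int	i;

	for (i = 0; i < 3; i++) {
		if (i == 0)	/* head (full) speed */
			(void) sprintf(buf, "%d=%s", levels[i], "Normal");
		else
			(void) sprintf(buf, "%d=1/%d%s", levels[i],
			    divisors[i], " of Normal");
		(void) puts(buf);
	}
	return (0);
}
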
diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c
index 681afab583..4e80f06f32 100644
--- a/usr/src/uts/sun4v/os/cmp.c
+++ b/usr/src/uts/sun4v/os/cmp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/cmp.h>
@@ -132,16 +130,15 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
}
/*
- * Order the relevant hw sharing relationships
- * from least, to greatest physical scope.
- *
- * The hierarchy *must* be defined for all hw that
- * pg_plat_hw_shared() returns non-zero.
+ * Rank the relative importance of optimizing for hw1 or hw2
*/
-int
-pg_plat_hw_level(pghw_type_t hw)
+pghw_type_t
+pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
{
int i;
+ int rank1 = 0;
+ int rank2 = 0;
+
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_FPU,
@@ -150,40 +147,27 @@ pg_plat_hw_level(pghw_type_t hw)
};
for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
- if (hw_hier[i] == hw)
- return (i);
+ if (hw_hier[i] == hw1)
+ rank1 = i;
+ if (hw_hier[i] == hw2)
+ rank2 = i;
}
- return (-1);
-}
-
-/*
- * Return 1 if CMT load balancing policies should be
- * implemented across instances of the specified hardware
- * sharing relationship.
- */
-int
-pg_plat_cmt_load_bal_hw(pghw_type_t hw)
-{
- if (hw == PGHW_IPIPE ||
- hw == PGHW_FPU ||
- hw == PGHW_MPIPE)
- return (1);
+ if (rank1 > rank2)
+ return (hw1);
else
- return (0);
+ return (hw2);
}
-
/*
- * Return 1 if thread affinity polices should be implemented
- * for instances of the specifed hardware sharing relationship.
+ * Override the default CMT dispatcher policy for the specified
+ * hardware sharing relationship
*/
-int
-pg_plat_cmt_affinity_hw(pghw_type_t hw)
+/* ARGSUSED */
+pg_cmt_policy_t
+pg_plat_cmt_policy(pghw_type_t hw)
{
- if (hw == PGHW_CACHE)
- return (1);
- else
- return (0);
+ /* Accept the default policies */
+ return (CMT_NO_POLICY);
}
id_t
@@ -213,7 +197,7 @@ pg_cmt_load_bal_hw(pghw_type_t hw)
return (0);
}
/*
- * Return 1 if thread affinity polices should be implemented
+ * Return 1 if thread affinity policies should be implemented
 * for instances of the specified hardware sharing relationship.
*/
int
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index ba9c4898d1..82069505a4 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -270,7 +270,7 @@ cpu_wakeup(cpu_t *cpu, int bound)
}
void
-mach_cpu_halt_idle()
+mach_cpu_halt_idle(void)
{
if (enable_halt_idle_cpus) {
idle_cpu = cpu_halt;
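
The mach_cpu_halt_idle() change is only about the prototype: in C, an empty parameter list on a definition leaves the arguments unspecified, whereas (void) declares that the function takes none, so the compiler can reject accidental callers. A minimal illustration (not from this commit):

/*
 * Old style: unspecified parameters; a mismatched call is undefined
 * behavior but typically not diagnosed.
 */
void old_style() { }

/* New style: a real prototype; a mismatched call is a compile error. */
void new_style(void) { }

int
main(void)
{
	old_style();
	new_style();
	/* new_style(42); would now be rejected by the compiler */
	return (0);
}
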