Diffstat (limited to 'usr/src')
48 files changed, 3759 insertions, 374 deletions
diff --git a/usr/src/cmd/cpc/common/cputrack.c b/usr/src/cmd/cpc/common/cputrack.c index 22ad2673e2..41034aef6e 100644 --- a/usr/src/cmd/cpc/common/cputrack.c +++ b/usr/src/cmd/cpc/common/cputrack.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -62,6 +62,12 @@ static const struct options *opts = (const struct options *)&__options; static cpc_t *cpc; +/* + * How many signals caught from terminal + * We bail out as soon as possible when interrupt is set + */ +static int interrupt = 0; + /*ARGSUSED*/ static void cputrack_errfn(const char *fn, int subcode, const char *fmt, va_list ap) @@ -79,6 +85,8 @@ cputrack_pctx_errfn(const char *fn, const char *fmt, va_list ap) } static int cputrack(int argc, char *argv[], int optind); +static void intr(int); + #if defined(__i386) static void p4_ht_error(void); #endif @@ -220,6 +228,19 @@ main(int argc, char *argv[]) exit(2); } + /* + * Catch signals from terminal, so they can be handled asynchronously + * when we're ready instead of when we're not (;-) + */ + if (sigset(SIGHUP, SIG_IGN) == SIG_DFL) + (void) sigset(SIGHUP, intr); + if (sigset(SIGINT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGINT, intr); + if (sigset(SIGQUIT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGQUIT, intr); + (void) sigset(SIGPIPE, intr); + (void) sigset(SIGTERM, intr); + cpc_setgrp_reset(opts->master); (void) setvbuf(opts->log, NULL, _IOLBF, 0); ret = cputrack(argc, argv, optind); @@ -310,6 +331,9 @@ pinit_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreq; + if (interrupt) + return (0); + if (state->maxlwpid < lwpid) { state->sgrps = realloc(state->sgrps, lwpid * sizeof (state->sgrps)); @@ -373,6 +397,9 @@ pfini_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + set = cpc_setgrp_getset(sgrp); nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (cpc_set_sample(cpc, set, *scratch) == 0) { @@ -424,6 +451,9 @@ plwp_create(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); print_sample(pid, lwpid, "lwp_create", @@ -442,6 +472,9 @@ plwp_exit(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return (0); + start = cpc_setgrp_getset(sgrp); do { nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); @@ -465,6 +498,9 @@ pexec(pctx_t *pctx, pid_t pid, id_t lwpid, char *name, void *arg) cpc_buf_t **data1, **data2, **scratch; hrtime_t hrt; + if (interrupt) + return (0); + /* * Print the accumulated results from the previous program image */ @@ -505,6 +541,9 @@ pexit(pctx_t *pctx, pid_t pid, id_t lwpid, int status, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return; + cpc_setgrp_reset(state->accum); start = cpc_setgrp_getset(state->accum); do { @@ -539,6 +578,9 @@ ptick(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreqs; + if (interrupt) + return (0); + nreqs = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (opts->nsets == 1) { @@ -704,7 +746,6 @@ cputrack(int argc, char *argv[], int optind) state->accum = NULL; } } - pctx_release(pctx); return (err != 0 ? 
1 : 0); } @@ -834,3 +875,12 @@ p4_ht_error(void) } #endif /* defined(__i386) */ + +/*ARGSUSED*/ +static void +intr(int sig) +{ + interrupt++; + if (cpc != NULL) + cpc_terminate(cpc); +} diff --git a/usr/src/lib/libcpc/common/libcpc.c b/usr/src/lib/libcpc/common/libcpc.c index 5bdba39fda..9f4f6ac848 100644 --- a/usr/src/lib/libcpc/common/libcpc.c +++ b/usr/src/lib/libcpc/common/libcpc.c @@ -168,6 +168,23 @@ cpc_close(cpc_t *cpc) return (0); } +/* + * Terminate everything that runs in pctx_run + */ +void +cpc_terminate(cpc_t *cpc) +{ + cpc_set_t *csp; + int sigblocked; + + sigblocked = cpc_lock(cpc); + for (csp = cpc->cpc_sets; csp != NULL; csp = csp->cs_next) { + if (csp->cs_pctx != NULL) + pctx_terminate(csp->cs_pctx); + } + cpc_unlock(cpc, sigblocked); +} + cpc_set_t * cpc_set_create(cpc_t *cpc) { @@ -224,6 +241,14 @@ cpc_set_destroy(cpc_t *cpc, cpc_set_t *set) if (csp->cs_state != CS_UNBOUND) (void) cpc_unbind(cpc, csp); + /* + * Detach from the process + */ + if (csp->cs_pctx != NULL) { + pctx_release(csp->cs_pctx); + csp->cs_pctx = NULL; + } + for (req = csp->cs_request; req != NULL; req = next) { next = req->cr_next; diff --git a/usr/src/lib/libcpc/common/libcpc.h b/usr/src/lib/libcpc/common/libcpc.h index 384474a76c..73627345a0 100644 --- a/usr/src/lib/libcpc/common/libcpc.h +++ b/usr/src/lib/libcpc/common/libcpc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -163,6 +163,8 @@ extern void cpc_walk_attrs(cpc_t *cpc, void *arg, extern int cpc_enable(cpc_t *cpc); extern int cpc_disable(cpc_t *cpc); +extern void cpc_terminate(cpc_t *); + #if defined(__sparc) || defined(__i386) /* diff --git a/usr/src/lib/libcpc/common/mapfile-vers b/usr/src/lib/libcpc/common/mapfile-vers index 91f1689c9f..e577fc7c5e 100644 --- a/usr/src/lib/libcpc/common/mapfile-vers +++ b/usr/src/lib/libcpc/common/mapfile-vers @@ -83,6 +83,7 @@ SUNW_1.2 { SUNWprivate_1.1 { global: SUNWprivate_1.1; + cpc_terminate; local: *; }; diff --git a/usr/src/lib/libpctx/common/libpctx.c b/usr/src/lib/libpctx/common/libpctx.c index 9c28fb9b9b..f17e238322 100644 --- a/usr/src/lib/libpctx/common/libpctx.c +++ b/usr/src/lib/libpctx/common/libpctx.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains a set of generic routines for periodically * sampling the state of another process, or tree of processes. @@ -66,6 +64,7 @@ struct __pctx { int verbose; int created; int sigblocked; + int terminate; sigset_t savedset; cpc_t *cpc; }; @@ -108,6 +107,7 @@ pctx_create( pctx = calloc(1, sizeof (*pctx)); pctx->uarg = arg; pctx->verbose = verbose; + pctx->terminate = 0; pctx->errfn = errfn ? errfn : pctx_default_errfn; if ((pctx->Pr = Pcreate(filename, argv, &err, 0, 0)) == NULL) { @@ -487,6 +487,7 @@ pctx_release(pctx_t *pctx) Prelease(pctx->Pr, PRELEASE_CLEAR); pctx->Pr = NULL; } + pctx_free(pctx); bzero(pctx, sizeof (*pctx)); free(pctx); @@ -577,7 +578,7 @@ pctx_run( * exited successfully or the number of time samples has expired. * Otherwise, if an error has occurred, running becomes -1. 
*/ - while (running == 1) { + while (running == 1 && !pctx->terminate) { if (Psetrun(pctx->Pr, 0, 0) != 0) { if (pctx->verbose) @@ -609,10 +610,13 @@ pctx_run( if (nsamples != 1) nsamples--; } - } while (mswait == 0); + } while (mswait == 0 && !pctx->terminate); } - (void) Pwait(pctx->Pr, mswait); + if (pctx->terminate) + goto bailout; + else + (void) Pwait(pctx->Pr, mswait); checkstate: switch (pstate = Pstate(pctx->Pr)) { @@ -854,6 +858,9 @@ checkstate: bailout: (void) signal(SIGCHLD, sigsaved); + if (pctx->terminate) + return (0); + switch (running) { case 0: return (0); @@ -885,6 +892,7 @@ __pctx_cpc(pctx_t *pctx, cpc_t *cpc, * We store the last cpc_t used by libpctx, so that when this pctx is * destroyed, libpctx can notify libcpc. */ + if (pctx->cpc != NULL && pctx->cpc != cpc && pctx_cpc_callback != NULL) (*pctx_cpc_callback)(pctx->cpc, pctx); pctx->cpc = cpc; @@ -993,3 +1001,12 @@ __pctx_cpc_register_callback(void (*arg)(struct __cpc *, struct __pctx *)) { pctx_cpc_callback = arg; } + +/* + * Tell pctx_run to bail out immediately + */ +void +pctx_terminate(struct __pctx *pctx) +{ + pctx->terminate = 1; +} diff --git a/usr/src/lib/libpctx/common/libpctx.h b/usr/src/lib/libpctx/common/libpctx.h index 10d0fb7c7e..7cd9ffff91 100644 --- a/usr/src/lib/libpctx/common/libpctx.h +++ b/usr/src/lib/libpctx/common/libpctx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _LIBPCTX_H #define _LIBPCTX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <fcntl.h> #include <stdarg.h> @@ -67,6 +64,8 @@ typedef int pctx_init_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_fini_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_sysc_lwp_exitfn_t(pctx_t *, pid_t, id_t, void *); +extern void pctx_terminate(pctx_t *); + typedef enum { PCTX_NULL_EVENT = 0, PCTX_SYSC_EXEC_EVENT, diff --git a/usr/src/lib/libpctx/common/mapfile-vers b/usr/src/lib/libpctx/common/mapfile-vers index 1b296817d4..e316020c8b 100644 --- a/usr/src/lib/libpctx/common/mapfile-vers +++ b/usr/src/lib/libpctx/common/mapfile-vers @@ -50,6 +50,7 @@ SUNWprivate_1.1 { global: __pctx_cpc; __pctx_cpc_register_callback; + pctx_terminate; local: *; }; diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 8ad553b07c..88ab8b3f20 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -682,6 +682,7 @@ f none usr/include/sys/bustypes.h 644 root bin f none usr/include/sys/byteorder.h 644 root bin f none usr/include/sys/callb.h 644 root bin f none usr/include/sys/callo.h 644 root bin +f none usr/include/sys/cap_util.h 644 root bin f none usr/include/sys/cpucaps.h 644 root bin f none usr/include/sys/cpucaps_impl.h 644 root bin f none usr/include/sys/ccompile.h 644 root bin diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 83b7bf34c6..974cec5d3f 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -47,6 +47,7 @@ COMMON_CORE_OBJS += \ cpu_intr.o \ cpu_pm.o \ cpupart.o \ + cap_util.o \ disp.o \ group.o \ kstat_fr.o \ diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index e6d77020a6..09e529b934 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -212,6 +212,7 @@ extern void clock_timer_init(void); extern void clock_realtime_init(void); extern void clock_highres_init(void); extern void clock_tick_mp_init(void); +extern void cu_init(void); extern void callout_mp_init(void); extern void cpu_seq_tbl_init(void); @@ -257,6 +258,7 @@ void (*mp_init_tbl[])(void) = { siron_mp_init, #endif clock_tick_mp_init, + cu_init, callout_mp_init, 0 }; diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index b2f219472d..a5f1a52e34 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -159,7 +159,6 @@ static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *, cpu_pg_t *); - /* * CMT PG ops */ @@ -583,6 +582,8 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) ASSERT(IS_CMT_PG(pg)); } + ((pghw_t *)pg)->pghw_generation++; + /* Add the CPU to the PG */ pg_cpu_add((pg_t *)pg, cp, pgdata); @@ -762,7 +763,7 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) * * cp->cpu_pg is used by the dispatcher to access the CPU's PG data * references a "bootstrap" structure across this function's invocation. - * pg_cmt_cpu_init() and the routines it calls must be careful to operate only + * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only * on the "pgdata" argument, and not cp->cpu_pg. 
*/ static void @@ -818,6 +819,8 @@ pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata) pg = (pg_cmt_t *)pgdata->cmt_lineage; while (pg != NULL) { + ((pghw_t *)pg)->pghw_generation++; + /* * Remove the PG from the CPU's load balancing lineage */ @@ -990,6 +993,11 @@ pg_cmt_cpu_active(cpu_t *cp) if (IS_CMT_PG(pg) == 0) continue; + /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); @@ -1056,6 +1064,11 @@ pg_cmt_cpu_inactive(cpu_t *cp) continue; /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + + /* * Remove the CPU from the CMT PGs active CPU group * bitmap */ diff --git a/usr/src/uts/common/dtrace/dcpc.c b/usr/src/uts/common/dtrace/dcpc.c index e780d1e620..c410e65eaa 100644 --- a/usr/src/uts/common/dtrace/dcpc.c +++ b/usr/src/uts/common/dtrace/dcpc.c @@ -35,6 +35,7 @@ #include <sys/conf.h> #include <sys/kmem.h> #include <sys/kcpc.h> +#include <sys/cap_util.h> #include <sys/cpc_pcbe.h> #include <sys/cpc_impl.h> #include <sys/dtrace_impl.h> @@ -463,8 +464,7 @@ dcpc_program_cpu_event(cpu_t *c) set = dcpc_create_set(c); - octx = NULL; - set->ks_ctx = ctx = kcpc_ctx_alloc(); + set->ks_ctx = ctx = kcpc_ctx_alloc(KM_SLEEP); ctx->kc_set = set; ctx->kc_cpuid = c->cpu_id; @@ -489,11 +489,9 @@ dcpc_program_cpu_event(cpu_t *c) * If we already have an active enabling then save the current cpc * context away. */ - if (c->cpu_cpc_ctx != NULL) - octx = c->cpu_cpc_ctx; + octx = c->cpu_cpc_ctx; - c->cpu_cpc_ctx = ctx; - kcpc_remote_program(c); + kcpc_cpu_program(c, ctx); if (octx != NULL) { kcpc_set_t *oset = octx->kc_set; @@ -528,9 +526,14 @@ dcpc_disable_cpu(cpu_t *c) if (c->cpu_flags & CPU_OFFLINE) return; - kcpc_remote_stop(c); - + /* + * Grab CPUs CPC context before kcpc_cpu_stop() stops counters and + * changes it. + */ ctx = c->cpu_cpc_ctx; + + kcpc_cpu_stop(c, B_FALSE); + set = ctx->kc_set; kcpc_free_configs(set); @@ -538,7 +541,6 @@ dcpc_disable_cpu(cpu_t *c) kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t)); kcpc_free_set(set); kcpc_ctx_free(ctx); - c->cpu_cpc_ctx = NULL; } /* @@ -615,8 +617,21 @@ dcpc_program_event(dcpc_probe_t *pp) if (c->cpu_flags & CPU_OFFLINE) continue; + /* + * Stop counters but preserve existing DTrace CPC context + * if there is one. + * + * If we come here when the first event is programmed for a CPU, + * there should be no DTrace CPC context installed. In this + * case, kcpc_cpu_stop() will ensure that there is no other + * context on the CPU. + * + * If we add new enabling to the original one, the CPU should + * have the old DTrace CPC context which we need to keep around + * since dcpc_program_event() will add to it. + */ if (c->cpu_cpc_ctx != NULL) - kcpc_remote_stop(c); + kcpc_cpu_stop(c, B_TRUE); } while ((c = c->cpu_next) != cpu_list); dcpc_release_interrupts(); @@ -708,6 +723,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) ASSERT(pp->dcpc_actv_req_idx >= 0); /* + * DTrace is taking over CPC contexts, so stop collecting + * capacity/utilization data for all CPUs. 
+ */ + if (dtrace_cpc_in_use == 1) + cu_disable(); + + /* * The following must hold true if we are to (attempt to) enable * this request: * @@ -758,7 +780,7 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) if (c->cpu_flags & CPU_OFFLINE) continue; - kcpc_remote_program(c); + kcpc_cpu_program(c, c->cpu_cpc_ctx); } while ((c = c->cpu_next) != cpu_list); } @@ -766,6 +788,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL; pp->dcpc_actv_req_idx = pp->dcpc_picno = -1; + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU. + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); + return (-1); } @@ -841,6 +870,13 @@ dcpc_disable(void *arg, dtrace_id_t id, void *parg) dtrace_cpc_in_use--; pp->dcpc_enabled = 0; pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1; + + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); } /*ARGSUSED*/ @@ -891,7 +927,6 @@ dcpc_cpu_setup(cpu_setup_t what, processorid_t cpu, void *arg) */ if (dtrace_cpc_in_use) { c = cpu_get(cpu); - (void) dcpc_program_cpu_event(c); } break; diff --git a/usr/src/uts/common/io/cpc.c b/usr/src/uts/common/io/cpc.c index 6881380251..0b003c3ee1 100644 --- a/usr/src/uts/common/io/cpc.c +++ b/usr/src/uts/common/io/cpc.c @@ -942,49 +942,19 @@ static struct modlinkage modl = { #endif }; -static void -kcpc_init(void) -{ - long hash; - - rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_init(&kcpc_ctx_llock[hash], - NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); -} - -static void -kcpc_fini(void) -{ - long hash; - - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_destroy(&kcpc_ctx_llock[hash]); - rw_destroy(&kcpc_cpuctx_lock); -} - int _init(void) { - int ret; - - if (kcpc_hw_load_pcbe() != 0) + if (kcpc_init() != 0) return (ENOTSUP); - kcpc_init(); - if ((ret = mod_install(&modl)) != 0) - kcpc_fini(); - return (ret); + return (mod_install(&modl)); } int _fini(void) { - int ret; - - if ((ret = mod_remove(&modl)) == 0) - kcpc_fini(); - return (ret); + return (mod_remove(&modl)); } int diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c new file mode 100644 index 0000000000..16ff7f45fd --- /dev/null +++ b/usr/src/uts/common/os/cap_util.c @@ -0,0 +1,1652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Support for determining capacity and utilization of performance relevant + * hardware components in a computer + * + * THEORY + * ------ + * The capacity and utilization of the performance relevant hardware components + * is needed to be able to optimize performance while minimizing the amount of + * power used on a system. The idea is to use hardware performance counters + * and potentially other means to determine the capacity and utilization of + * performance relevant hardware components (eg. execution pipeline, cache, + * memory, etc.) and attribute the utilization to the responsible CPU and the + * thread running there. + * + * This will help characterize the utilization of performance relevant + * components and how much is used by each CPU and each thread. With + * that data, the utilization can be aggregated to all the CPUs sharing each + * performance relevant hardware component to calculate the total utilization + * of each component and compare that with the component's capacity to + * essentially determine the actual hardware load of the component. The + * hardware utilization attributed to each running thread can also be + * aggregated to determine the total hardware utilization of each component to + * a workload. + * + * Once that is done, one can determine how much of each performance relevant + * hardware component is needed by a given thread or set of threads (eg. a + * workload) and size up exactly what hardware is needed by the threads and how + * much. With this info, we can better place threads among CPUs to match their + * exact hardware resource needs and potentially lower or raise the power based + * on their utilization or pack threads onto the fewest hardware components + * needed and power off any remaining unused components to minimize power + * without sacrificing performance. + * + * IMPLEMENTATION + * -------------- + * The code has been designed and implemented to make (un)programming and + * reading the counters for a given CPU as lightweight and fast as possible. + * This is very important because we need to read and potentially (un)program + * the counters very often and in performance sensitive code. Specifically, + * the counters may need to be (un)programmed during context switch and/or a + * cyclic handler when there are more counter events to count than existing + * counters. + * + * Consequently, the code has been split up to allow allocating and + * initializing everything needed to program and read the counters on a given + * CPU once and make (un)programming and reading the counters for a given CPU + * not have to allocate/free memory or grab any locks. To do this, all the + * state needed to (un)program and read the counters on a CPU is kept per CPU + * and is made lock free by forcing any code that reads or manipulates the + * counters or the state needed to (un)program or read the counters to run on + * the target CPU and disable preemption while running on the target CPU to + * protect any critical sections. All counter manipulation on the target CPU is + * happening either from a cross-call to the target CPU or at the same PIL as + * used by the cross-call subsystem. This guarantees that counter manipulation + * is not interrupted by cross-calls from other CPUs. 
+ * + * The synchronization has been made lock free or as simple as possible for + * performance and to avoid getting the locking all tangled up when we interpose + * on the CPC routines that (un)program the counters to manage the counters + * between the kernel and user on each CPU. When the user starts using the + * counters on a given CPU, the kernel will unprogram the counters that it is + * using on that CPU just before they are programmed for the user. Then the + * kernel will program the counters on a given CPU for its own use when the user + * stops using them. + * + * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc + * enables any probe, it requests to disable and unprogram all counters used for + * capacity and utilizations. These counters are never re-programmed back until + * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU + * framework and it re-programs the counters. + * + * When a CPU is going offline, its CU counters are unprogrammed and disabled, + * so that they would not be re-programmed again by some other activity on the + * CPU that is going offline. + * + * The counters are programmed during boot. However, a flag is available to + * disable this if necessary (see cu_flag below). A handler is provided to + * (un)program the counters during CPU on/offline. Basic routines are provided + * to initialize and tear down this module, initialize and tear down any state + * needed for a given CPU, and (un)program the counters for a given CPU. + * Lastly, a handler is provided to read the counters and attribute the + * utilization to the responsible CPU. + */ +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/ddi.h> +#include <sys/disp.h> +#include <sys/sdt.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/pghw.h> +#include <sys/cmt.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> + +#include <sys/archsystm.h> +#include <sys/promif.h> + +#if defined(__x86) +#include <sys/xc_levels.h> +#endif + + +/* + * Default CPU hardware performance counter flags to use for measuring capacity + * and utilization + */ +#define CU_CPC_FLAGS_DEFAULT \ + (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT) + +/* + * Possible Flags for controlling this module. + */ +#define CU_FLAG_ENABLE 1 /* Enable module */ +#define CU_FLAG_READY 2 /* Ready to setup module */ +#define CU_FLAG_ON 4 /* Module is on */ + +/* + * pg_cpu kstats calculate utilization rate and maximum utilization rate for + * some CPUs. The rate is calculated based on data from two subsequent + * snapshots. When the time between such two snapshots is too small, the + * resulting rate may have low accuracy, so we only consider snapshots which + * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not + * update the rate if the interval is smaller than that. + * + * Use one tenth of a second as the minimum interval for utilization rate + * calculation. + * + * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in + * the CU_RATE() macro below to guarantee that we never divide by zero. + * + * Rate is the number of events per second. The rate is the number of events + * divided by time and multiplied by the number of nanoseconds in a second. We + * do not want time to be too small since it will cause large errors in + * division. + * + * We do not want to multiply two large numbers (the instruction count and + * NANOSEC) either since it may cause integer overflow. 
So we divide both the + * numerator and the denominator by the same value. + * + * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN + * above to guarantee that time divided by this value is always non-zero. + */ +#define CU_RATE(val, time) \ + (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE)) + +#define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10) + +#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000) + +/* + * When the time between two kstat reads for the same CPU is less than + * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values + * for the CPU. This helps reduce cross-calls when kstat consumers read data + * very often or when they read PG utilization data and then CPU utilization + * data quickly after that. + */ +#define CU_UPDATE_THRESHOLD (NANOSEC / 10) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +typedef void (*cu_cpu_func_t)(uintptr_t, int *); + + +/* + * Flags to use for programming CPU hardware performance counters to measure + * capacity and utilization + */ +int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT; + +/* + * Initial value used for programming hardware counters + */ +uint64_t cu_cpc_preset_value = 0; + +/* + * List of CPC event requests for capacity and utilization. + */ +static kcpc_request_list_t *cu_cpc_reqs = NULL; + +/* + * When a CPU is a member of PG with a sharing relationship that is supported + * by the capacity/utilization framework, a kstat is created for that CPU and + * sharing relationship. + * + * These kstats are updated one at a time, so we can have a single scratch + * space to fill the data. + * + * CPU counter kstats fields: + * + * cu_cpu_id CPU ID for this kstat + * + * cu_generation Generation value that increases whenever any CPU goes + * offline or online. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation. + * + * cu_pg_id PG ID for the relationship described by this kstat + * + * cu_cpu_util Running value of CPU utilization for the sharing + * relationship + * + * cu_cpu_time_running Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * cu_cpu_time_stopped Total time the CU counters were stopped. + * + * cu_cpu_rate Utilization rate, expressed in operations per second. + * + * cu_cpu_rate_max Maximum observed value of utilization rate. 
+ */ +struct cu_cpu_kstat { + kstat_named_t cu_cpu_id; + kstat_named_t cu_generation; + kstat_named_t cu_pg_id; + kstat_named_t cu_cpu_util; + kstat_named_t cu_cpu_time_running; + kstat_named_t cu_cpu_time_stopped; + kstat_named_t cu_cpu_rate; + kstat_named_t cu_cpu_rate_max; +} cu_cpu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "pg_id", KSTAT_DATA_LONG }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, +}; + +/* + * Flags for controlling this module + */ +uint_t cu_flags = CU_FLAG_ENABLE; + +/* + * Error return value for cu_init() since it can't return anything to be called + * from mp_init_tbl[] (:-( + */ +static int cu_init_error = 0; + +hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN; + +hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD; + +static kmutex_t pg_cpu_kstat_lock; + + +/* + * Forward declaration of interface routines + */ +void cu_disable(void); +void cu_enable(void); +void cu_init(void); +void cu_cpc_program(cpu_t *cp, int *err); +void cu_cpc_unprogram(cpu_t *cp, int *err); +int cu_cpu_update(struct cpu *cp, boolean_t move_to); +void cu_pg_update(pghw_t *pg); + + +/* + * Forward declaration of private routines + */ +static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs); +static void cu_cpc_program_xcall(uintptr_t arg, int *err); +static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, + int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents); +static int cu_cpu_callback(cpu_setup_t what, int id, void *arg); +static void cu_cpu_disable(cpu_t *cp); +static void cu_cpu_enable(cpu_t *cp); +static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs); +static int cu_cpu_fini(cpu_t *cp); +static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info); +static int cu_cpu_kstat_update(kstat_t *ksp, int rw); +static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg); +static int cu_cpu_update_stats(cu_cntr_stats_t *stats, + uint64_t cntr_value); +static void cu_cpu_info_detach_xcall(void); + +/* + * Disable or enable Capacity Utilization counters on all CPUs. 
+ */ +void +cu_disable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_disable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +void +cu_enable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_enable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +/* + * Setup capacity and utilization support + */ +void +cu_init(void) +{ + cpu_t *cp; + + cu_init_error = 0; + if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) { + cu_init_error = -1; + return; + } + + if (kcpc_init() != 0) { + cu_init_error = -2; + return; + } + + /* + * Can't measure hardware capacity and utilization without CPU + * hardware performance counters + */ + if (cpc_ncounters <= 0) { + cu_init_error = -3; + return; + } + + /* + * Setup CPC event request queue + */ + cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP); + + mutex_enter(&cpu_lock); + + /* + * Mark flags to say that module is ready to be setup + */ + cu_flags |= CU_FLAG_READY; + + cp = cpu_active; + do { + /* + * Allocate and setup state needed to measure capacity and + * utilization + */ + if (cu_cpu_init(cp, cu_cpc_reqs) != 0) + cu_init_error = -5; + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Mark flags to say that module is on now and counters are ready to be + * programmed on all active CPUs + */ + cu_flags |= CU_FLAG_ON; + + /* + * Program counters on currently active CPUs + */ + cp = cpu_active; + do { + if (cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_FALSE) != 0) + cu_init_error = -6; + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Register callback for CPU state changes to enable and disable + * CPC counters as CPUs come on and offline + */ + register_cpu_setup_func(cu_cpu_callback, NULL); + + mutex_exit(&cpu_lock); +} + + +/* + * Return number of counter events needed to measure capacity and utilization + * for specified CPU and fill in list of CPC requests with each counter event + * needed if list where to add CPC requests is given + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated if any memory + * allocation fails + */ +static int +cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + group_t *cmt_pgs; + cu_cntr_info_t **cntr_info_array; + cpu_pg_t *cpu_pgs; + cu_cpu_info_t *cu_cpu_info; + pg_cmt_t *pg_cmt; + pghw_t *pg_hw; + cu_cntr_stats_t *stats; + int nevents; + pghw_type_t pg_hw_type; + group_iter_t iter; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * There has to be a target CPU for this + */ + if (cp == NULL) + return (-1); + + /* + * Return 0 when CPU doesn't belong to any group + */ + cpu_pgs = cp->cpu_pg; + if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1) + return (0); + + cmt_pgs = &cpu_pgs->cmt_pgs; + cu_cpu_info = cp->cpu_cu_info; + + /* + * Grab counter statistics and info + */ + if (reqs == NULL) { + stats = NULL; + cntr_info_array = NULL; + } else { + if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL) + return (-2); + + stats = cu_cpu_info->cu_cntr_stats; + cntr_info_array = cu_cpu_info->cu_cntr_info; + } + + /* + * See whether platform (or processor) specific code knows which CPC + * events to 
request, etc. are needed to measure hardware capacity and + * utilization on this machine + */ + nevents = cu_plat_cpc_init(cp, reqs, nreqs); + if (nevents >= 0) + return (nevents); + + /* + * Let common code decide which CPC events to request, etc. to measure + * capacity and utilization since platform (or processor) specific does + * not know.... + * + * Walk CPU's PG lineage and do following: + * + * - Setup CPC request, counter info, and stats needed for each counter + * event to measure capacity and and utilization for each of CPU's PG + * hardware sharing relationships + * + * - Create PG CPU kstats to export capacity and utilization for each PG + */ + nevents = 0; + group_iter_init(&iter); + while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) { + cu_cntr_info_t *cntr_info; + int nevents_save; + int nstats; + + pg_hw = (pghw_t *)pg_cmt; + pg_hw_type = pg_hw->pghw_hw; + nevents_save = nevents; + nstats = 0; + + switch (pg_hw_type) { + case PGHW_IPIPE: + if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + case PGHW_FPU: + if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + default: + /* + * Don't measure capacity and utilization for this kind + * of PG hardware relationship so skip to next PG in + * CPU's PG lineage + */ + continue; + } + + cntr_info = cntr_info_array[pg_hw_type]; + + /* + * Nothing to measure for this hardware sharing relationship + */ + if (nevents - nevents_save == 0) { + if (cntr_info != NULL) + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + cntr_info_array[pg_hw_type] = NULL; + continue; + } + + /* + * Fill in counter info for this PG hardware relationship + */ + if (cntr_info == NULL) { + cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t), + KM_NOSLEEP); + if (cntr_info == NULL) + continue; + cntr_info_array[pg_hw_type] = cntr_info; + } + cntr_info->ci_cpu = cp; + cntr_info->ci_pg = pg_hw; + cntr_info->ci_stats = &stats[nevents_save]; + cntr_info->ci_nstats = nstats; + + /* + * Create PG CPU kstats for this hardware relationship + */ + cu_cpu_kstat_create(pg_hw, cntr_info); + } + + return (nevents); +} + + +/* + * Program counters for capacity and utilization on given CPU + * + * If any of the following conditions is true, the counters are not programmed: + * + * - CU framework is disabled + * - The cpu_cu_info field of the cpu structure is NULL + * - DTrace is active + * - Counters are programmed already + * - Counters are disabled (by calls to cu_cpu_disable()) + */ +void +cu_cpc_program(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU. We disable preemption to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not ready to program counters + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + if (cp == NULL) { + *err = -2; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * If DTrace CPC is active or counters turned on already or are + * disabled, just return. 
+ */ + if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) || + cu_cpu_info->cu_disabled) { + *err = 1; + kpreempt_enable(); + return; + } + + if ((CPU->cpu_cpc_ctx != NULL) && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -4; + kpreempt_enable(); + return; + } + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + ASSERT(cpu_ctx != NULL); + ASSERT(cpu_ctx->nctx >= 0); + + ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0); + ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz); + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + + /* + * Increment index in CPU's CPC context info to point at next context + * to program + * + * NOTE: Do this now instead of after programming counters to ensure + * that index will always point at *current* context so we will + * always be able to unprogram *current* context if necessary + */ + cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx; + + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC + * context before programming counters + * + * Context is marked with KCPC_CTX_INVALID_STOPPED when context is + * unprogrammed and may be marked with KCPC_CTX_INVALID when + * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to + * invalidate all CPC contexts before they take over all the counters. + * + * This isn't necessary since these flags are only used for thread bound + * CPC contexts not CPU bound CPC contexts like ones used for capacity + * and utilization. + * + * There is no need to protect the flag update since no one is using + * this context now. + */ + ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); + + /* + * Program counters on this CPU + */ + kcpc_program(ctx, B_FALSE, B_FALSE); + + cp->cpu_cpc_ctx = ctx; + + /* + * Set state in CPU structure to say that CPU's counters are programmed + * for capacity and utilization now and that they are transitioning from + * off to on state. This will cause cu_cpu_update to update stop times + * for all programmed counters. + */ + cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON; + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Cross call wrapper routine for cu_cpc_program() + * + * Checks to make sure that counters on CPU aren't being used by someone else + * before calling cu_cpc_program() since cu_cpc_program() needs to assert that + * nobody else is using the counters to catch and prevent any broken code. + * Also, this check needs to happen on the target CPU since the CPU's CPC + * context can only be changed while running on the CPU. + * + * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is + * no valid thread bound cpc context. This is important to check to prevent + * re-programming thread counters with CU counters when CPU is coming on-line. 
+ */ +static void +cu_cpc_program_xcall(uintptr_t arg, int *err) +{ + boolean_t avoid_thread_context = (boolean_t)arg; + + kpreempt_disable(); + + if (CPU->cpu_cpc_ctx != NULL && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -100; + kpreempt_enable(); + return; + } + + if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) && + !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -200; + kpreempt_enable(); + return; + } + + cu_cpc_program(CPU, err); + kpreempt_enable(); +} + + +/* + * Unprogram counters for capacity and utilization on given CPU + * This function should be always executed on the target CPU at high PIL + */ +void +cu_cpc_unprogram(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU with preemption disabled to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not on + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * Counters turned off already + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) { + *err = 1; + kpreempt_enable(); + return; + } + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * CPU's CPC context should be current capacity and utilization CPC + * context + */ + ASSERT(cp->cpu_cpc_ctx == ctx); + if (cp->cpu_cpc_ctx != ctx) { + *err = -6; + kpreempt_enable(); + return; + } + + /* + * Unprogram counters on CPU. 
+ */ + kcpc_unprogram(ctx, B_FALSE); + + ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); + + /* + * Unset state in CPU structure saying that CPU's counters are + * programmed + */ + cp->cpu_cpc_ctx = NULL; + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Add given counter event to list of CPC requests + */ +static int +cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs, + cu_cntr_stats_t *stats, int kmem_flags, int *nevents) +{ + int n; + int retval; + uint_t flags; + + /* + * Return error when no counter event specified, counter event not + * supported by CPC's PCBE, or number of events not given + */ + if (event == NULL || kcpc_event_supported(event) == B_FALSE || + nevents == NULL) + return (-1); + + n = *nevents; + + /* + * Only count number of counter events needed if list + * where to add CPC requests not given + */ + if (reqs == NULL) { + n++; + *nevents = n; + return (-3); + } + + /* + * Return error when stats not given or not enough room on list of CPC + * requests for more counter events + */ + if (stats == NULL || (nreqs <= 0 && n >= nreqs)) + return (-4); + + /* + * Use flags in cu_cpc_flags to program counters and enable overflow + * interrupts/traps (unless PCBE can't handle overflow interrupts) so + * PCBE can catch counters before they wrap to hopefully give us an + * accurate (64-bit) virtualized counter + */ + flags = cu_cpc_flags; + if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0) + flags &= ~CPC_OVF_NOTIFY_EMT; + + /* + * Add CPC request to list + */ + retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value, + flags, 0, NULL, &stats[n], kmem_flags); + + if (retval != 0) + return (-5); + + n++; + *nevents = n; + return (0); +} + +static void +cu_cpu_info_detach_xcall(void) +{ + ASSERT(IS_HIPIL()); + + CPU->cpu_cu_info = NULL; +} + + +/* + * Enable or disable collection of capacity/utilization data for a current CPU. + * Counters are enabled if 'on' argument is True and disabled if it is False. + * This function should be always executed at high PIL + */ +static void +cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2) +{ + cpu_t *cp = (cpu_t *)arg1; + boolean_t on = (boolean_t)arg2; + int error; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + kpreempt_disable(); + ASSERT(cp == CPU); + + if (!(cu_flags & CU_FLAG_ON)) { + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + kpreempt_enable(); + return; + } + + ASSERT(!cu_cpu_info->cu_disabled || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + + if (on) { + /* + * Decrement the cu_disabled counter. + * Once it drops to zero, call cu_cpc_program. + */ + if (cu_cpu_info->cu_disabled > 0) + cu_cpu_info->cu_disabled--; + if (cu_cpu_info->cu_disabled == 0) + cu_cpc_program(CPU, &error); + } else if (cu_cpu_info->cu_disabled++ == 0) { + /* + * This is the first attempt to disable CU, so turn it off + */ + cu_cpc_unprogram(cp, &error); + ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + } + + kpreempt_enable(); +} + + +/* + * Callback for changes in CPU states + * Used to enable or disable hardware performance counters on CPUs that are + * turned on or off + * + * NOTE: cpc should be programmed/unprogrammed while running on the target CPU. + * We have to use thread_affinity_set to hop to the right CPU because these + * routines expect cpu_lock held, so we can't cross-call other CPUs while + * holding CPU lock. 
+ */ +static int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_cpu_callback(cpu_setup_t what, int id, void *arg) +{ + cpu_t *cp; + int retval = 0; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!(cu_flags & CU_FLAG_ON)) + return (-1); + + cp = cpu_get(id); + if (cp == NULL) + return (-2); + + switch (what) { + case CPU_ON: + /* + * Setup counters on CPU being turned on + */ + retval = cu_cpu_init(cp, cu_cpc_reqs); + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + break; + case CPU_INTR_ON: + /* + * Setup counters on CPU being turned on. + */ + retval = cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_TRUE); + break; + case CPU_OFF: + /* + * Disable counters on CPU being turned off. Counters will not + * be re-enabled on this CPU until it comes back online. + */ + cu_cpu_disable(cp); + ASSERT(!CU_CPC_ON(cp)); + retval = cu_cpu_fini(cp); + break; + default: + break; + } + return (retval); +} + + +/* + * Disable or enable Capacity Utilization counters on a given CPU. This function + * can be called from any CPU to disable counters on the given CPU. + */ +static void +cu_cpu_disable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE); +} + + +static void +cu_cpu_enable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE); +} + + +/* + * Setup capacity and utilization support for given CPU + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated including cpu_cu_info + * if any memory allocation fails + */ +static int +cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs) +{ + kcpc_ctx_t **ctx_ptr_array; + size_t ctx_ptr_array_sz; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int n; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_fini}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Return if not ready to setup counters yet + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + if (cp->cpu_cu_info == NULL) { + cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t), + KM_NOSLEEP); + if (cp->cpu_cu_info == NULL) + return (-2); + } + + /* + * Get capacity and utilization CPC context for CPU and check to see + * whether it has been setup already + */ + cu_cpu_info = cp->cpu_cu_info; + cu_cpu_info->cu_cpu = cp; + cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0; + + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + return (1); + } + + /* + * Should have no contexts since it hasn't been setup already + */ + ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL && + cpu_ctx->ctx_ptr_array_sz == 0); + + /* + * Determine how many CPC events needed to measure capacity and + * utilization for this CPU, allocate space for counter statistics for + * each event, and fill in list of CPC event requests with corresponding + * counter stats for each request to make attributing counter data + * easier later.... 
+ */ + n = cu_cpc_init(cp, NULL, 0); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-3); + } + + cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t), + KM_NOSLEEP); + if (cu_cpu_info->cu_cntr_stats == NULL) { + (void) cu_cpu_fini(cp); + return (-4); + } + + cu_cpu_info->cu_ncntr_stats = n; + + n = cu_cpc_init(cp, reqs, n); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-5); + } + + /* + * Create CPC context with given requests + */ + ctx_ptr_array = NULL; + ctx_ptr_array_sz = 0; + n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array, + &ctx_ptr_array_sz); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-6); + } + + /* + * Should have contexts + */ + ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0); + if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) { + (void) cu_cpu_fini(cp); + return (-7); + } + + /* + * Fill in CPC context info for CPU needed for capacity and utilization + */ + cpu_ctx->cur_index = 0; + cpu_ctx->nctx = n; + cpu_ctx->ctx_ptr_array = ctx_ptr_array; + cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz; + return (0); +} + +/* + * Tear down capacity and utilization support for given CPU + */ +static int +cu_cpu_fini(cpu_t *cp) +{ + kcpc_ctx_t *ctx; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int i; + pghw_type_t pg_hw_type; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_init}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Have to at least be ready to setup counters to have allocated + * anything that needs to be deallocated now + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + /* + * Nothing to do if CPU's capacity and utilization info doesn't exist + */ + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) + return (1); + + /* + * Tear down any existing kstats and counter info for each hardware + * sharing relationship + */ + for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS; + pg_hw_type++) { + cu_cntr_info_t *cntr_info; + + cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type]; + if (cntr_info == NULL) + continue; + + if (cntr_info->ci_kstat != NULL) { + kstat_delete(cntr_info->ci_kstat); + cntr_info->ci_kstat = NULL; + } + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + } + + /* + * Free counter statistics for CPU + */ + ASSERT(cu_cpu_info->cu_cntr_stats == NULL || + cu_cpu_info->cu_ncntr_stats > 0); + if (cu_cpu_info->cu_cntr_stats != NULL && + cu_cpu_info->cu_ncntr_stats > 0) { + kmem_free(cu_cpu_info->cu_cntr_stats, + cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t)); + cu_cpu_info->cu_cntr_stats = NULL; + cu_cpu_info->cu_ncntr_stats = 0; + } + + /* + * Get capacity and utilization CPC contexts for given CPU and check to + * see whether they have been freed already + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + /* + * Free CPC contexts for given CPU + */ + for (i = 0; i < cpu_ctx->nctx; i++) { + ctx = cpu_ctx->ctx_ptr_array[i]; + if (ctx == NULL) + continue; + kcpc_free(ctx, 0); + } + + /* + * Free CPC context pointer array + */ + kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz); + + /* + * Zero CPC info for CPU + */ + bzero(cpu_ctx, sizeof (cu_cpc_ctx_t)); + } + + /* + * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure + * that no one is going to access the cpu_cu_info whicch we are going to + * free. 
+ */ + if (cpu_is_online(cp)) + cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0); + else + cp->cpu_cu_info = NULL; + + /* + * Free CPU's capacity and utilization info + */ + kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t)); + + return (0); +} + +/* + * Create capacity & utilization kstats for given PG CPU hardware sharing + * relationship + */ +static void +cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info) +{ + char *class, *sh_name; + kstat_t *ks; + + /* + * Just return when no counter info or CPU + */ + if (cntr_info == NULL || cntr_info->ci_cpu == NULL) + return; + + /* + * Get the class name from the leaf PG that this CPU belongs to. + * If there are no PGs, just use the default class "cpu". + */ + class = pg ? pghw_type_string(pg->pghw_hw) : "cpu"; + sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu"; + + if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id, + sh_name, class, KSTAT_TYPE_NAMED, + sizeof (cu_cpu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL) + return; + + ks->ks_lock = &pg_cpu_kstat_lock; + ks->ks_data = &cu_cpu_kstat; + ks->ks_update = cu_cpu_kstat_update; + + ks->ks_private = cntr_info; + cntr_info->ci_kstat = ks; + kstat_install(cntr_info->ci_kstat); +} + + +/* + * Propagate values from CPU capacity & utilization stats to kstats + */ +static int +cu_cpu_kstat_update(kstat_t *ksp, int rw) +{ + cpu_t *cp; + cu_cntr_info_t *cntr_info = ksp->ks_private; + struct cu_cpu_kstat *kstat = &cu_cpu_kstat; + pghw_t *pg; + cu_cntr_stats_t *stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpreempt_disable(); + + /* + * Update capacity and utilization statistics needed for CPU's PG (CPU) + * kstats + */ + cp = cntr_info->ci_cpu; + (void) cu_cpu_update(cp, B_TRUE); + + pg = cntr_info->ci_pg; + stats = cntr_info->ci_stats; + kstat->cu_cpu_id.value.ui32 = cp->cpu_id; + kstat->cu_generation.value.ui32 = cp->cpu_generation; + if (pg == NULL) + kstat->cu_pg_id.value.l = -1; + else + kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id; + + kstat->cu_cpu_util.value.ui64 = stats->cs_value_total; + kstat->cu_cpu_rate.value.ui64 = stats->cs_rate; + kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max; + kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running; + kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped; + /* + * Counters are stopped now, so the cs_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + kstat->cu_cpu_time_stopped.value.ui64 += + gethrtime() - stats->cs_time_start; + + kpreempt_enable(); + + return (0); +} + +/* + * Run specified function with specified argument on a given CPU and return + * whatever the function returns + */ +static int +cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg) +{ + int error = 0; + + /* + * cpu_call() will call func on the CPU specified with given argument + * and return func's return value in last argument + */ + cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error); + return (error); +} + + +/* + * Update counter statistics on a given CPU. + * + * If move_to argument is True, execute the function on the CPU specified + * Otherwise, assume that it is already runninng on the right CPU + * + * If move_to is specified, the caller should hold cpu_lock or have preemption + * disabled. Otherwise it is up to the caller to guarantee that things do not + * change in the process. 
+ */ +int +cu_cpu_update(struct cpu *cp, boolean_t move_to) +{ + int retval; + cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info; + hrtime_t time_snap; + + ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0); + + /* + * Nothing to do if counters are not programmed + */ + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL) || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + return (0); + + /* + * Don't update CPU statistics if it was updated recently + * and provide old results instead + */ + time_snap = gethrtime(); + if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) { + DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp); + return (0); + } + + cu_cpu_info->cu_sample_time = time_snap; + + /* + * CPC counter should be read on the CPU that is running the counter. We + * either have to move ourselves to the target CPU or insure that we + * already run there. + * + * We use cross-call to the target CPU to execute kcpc_read() and + * cu_cpu_update_stats() there. + */ + retval = 0; + if (move_to) + (void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read, + (uintptr_t)cu_cpu_update_stats); + else { + retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats); + /* + * Offset negative return value by -10 so we can distinguish it + * from error return values of this routine vs kcpc_read() + */ + if (retval < 0) + retval -= 10; + } + + return (retval); +} + + +/* + * Update CPU counter statistics for current CPU. + * This function may be called from a cross-call + */ +static int +cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value) +{ + cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info; + uint_t flags; + uint64_t delta; + hrtime_t time_delta; + hrtime_t time_snap; + + if (stats == NULL) + return (-1); + + /* + * Nothing to do if counters are not programmed. This should not happen, + * but we check just in case. + */ + ASSERT(cu_flags & CU_FLAG_ON); + ASSERT(cu_cpu_info != NULL); + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL)) + return (-2); + + flags = cu_cpu_info->cu_flag; + ASSERT(flags & CU_CPU_CNTRS_ON); + if (!(flags & CU_CPU_CNTRS_ON)) + return (-2); + + /* + * Take snapshot of high resolution timer + */ + time_snap = gethrtime(); + + /* + * CU counters have just been programmed. We cannot assume that the new + * cntr_value continues from where we left off, so use the cntr_value as + * the new initial value. + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_value_start = cntr_value; + + /* + * Calculate delta in counter values between start of sampling period + * and now + */ + delta = cntr_value - stats->cs_value_start; + + /* + * Calculate time between start of sampling period and now + */ + time_delta = stats->cs_time_start ? + time_snap - stats->cs_time_start : + 0; + stats->cs_time_start = time_snap; + stats->cs_value_start = cntr_value; + + if (time_delta > 0) { /* wrap shouldn't happen */ + /* + * Update either running or stopped time based on the transition + * state + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_time_stopped += time_delta; + else + stats->cs_time_running += time_delta; + } + + /* + * Update rest of counter statistics if counter value didn't wrap + */ + if (delta > 0) { + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > cu_sample_interval_min) + stats->cs_rate = CU_RATE(delta, time_delta); + if (stats->cs_rate_max < stats->cs_rate) + stats->cs_rate_max = stats->cs_rate; + + stats->cs_value_last = delta; + stats->cs_value_total += delta; + } + + return (0); +} + +/* + * Update CMT PG utilization data. + * + * This routine computes the running total utilization and times for the + * specified PG by adding up the total utilization and counter running and + * stopped times of all CPUs in the PG and calculates the utilization rate and + * maximum rate for all CPUs in the PG. + */ +void +cu_pg_update(pghw_t *pg) +{ + pg_cpu_itr_t cpu_iter; + pghw_type_t pg_hwtype; + cpu_t *cpu; + pghw_util_t *hw_util = &pg->pghw_stats; + uint64_t old_utilization = hw_util->pghw_util; + hrtime_t now; + hrtime_t time_delta; + uint64_t utilization_delta; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + now = gethrtime(); + + pg_hwtype = pg->pghw_hw; + + /* + * Initialize running total utilization and times for PG to 0 + */ + hw_util->pghw_util = 0; + hw_util->pghw_time_running = 0; + hw_util->pghw_time_stopped = 0; + + /* + * Iterate over all CPUs in the PG and aggregate utilization, running + * time and stopped time. + */ + PG_CPU_ITR_INIT(pg, cpu_iter); + while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { + cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info; + cu_cntr_info_t *cntr_info; + cu_cntr_stats_t *stats; + + if (cu_cpu_info == NULL) + continue; + + /* + * Update utilization data for the CPU and then + * aggregate per CPU running totals for PG + */ + (void) cu_cpu_update(cpu, B_TRUE); + cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype]; + + if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL) + continue; + + hw_util->pghw_util += stats->cs_value_total; + hw_util->pghw_time_running += stats->cs_time_running; + hw_util->pghw_time_stopped += stats->cs_time_stopped; + + /* + * If counters are stopped now, the pg_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + hw_util->pghw_time_stopped += + now - stats->cs_time_start; + } + + /* + * Compute per PG instruction rate and maximum rate + */ + time_delta = now - hw_util->pghw_time_stamp; + hw_util->pghw_time_stamp = now; + + if (old_utilization == 0) + return; + + /* + * Calculate change in utilization over sampling period and set this to + * 0 if the delta would be 0 or negative which may happen if any CPUs go + * offline during the sampling period + */ + if (hw_util->pghw_util > old_utilization) + utilization_delta = hw_util->pghw_util - old_utilization; + else + utilization_delta = 0; + + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > CU_SAMPLE_INTERVAL_MIN) + hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta); + + /* + * Update the maximum observed rate + */ + if (hw_util->pghw_rate_max < hw_util->pghw_rate) + hw_util->pghw_rate_max = hw_util->pghw_rate; +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 009598f03f..62e8eeb2fe 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -1203,12 +1203,14 @@ cpu_online(cpu_t *cp) } cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN | CPU_SPARE); + CPU_NEW_GENERATION(cp); start_cpus(); cpu_stats_kstat_create(cp); cpu_create_intrstat(cp); lgrp_kstat_create(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); cpu_intr_enable(cp); /* arch-dep hook */ + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); cpu_set_state(cp); cyclic_online(cp); /* @@ -1284,6 +1286,7 @@ cpu_offline(cpu_t *cp, int flags) /* * Tell interested parties that this CPU is going offline. */ + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_OFF); /* @@ -1557,8 +1560,11 @@ out: /* * If we failed, we need to notify everyone that this CPU is back on. */ - if (error != 0) + if (error != 0) { + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); + } return (error); } @@ -2152,6 +2158,7 @@ static struct { kstat_named_t ci_core_id; kstat_named_t ci_curr_clock_Hz; kstat_named_t ci_supp_freq_Hz; + kstat_named_t ci_pg_id; #if defined(__sparcv9) kstat_named_t ci_device_ID; kstat_named_t ci_cpu_fru; @@ -2167,6 +2174,7 @@ static struct { kstat_named_t ci_ncoreperchip; kstat_named_t ci_max_cstates; kstat_named_t ci_curr_cstate; + kstat_named_t ci_cacheid; kstat_named_t ci_sktstr; #endif } cpu_info_template = { @@ -2181,6 +2189,7 @@ static struct { { "core_id", KSTAT_DATA_LONG }, { "current_clock_Hz", KSTAT_DATA_UINT64 }, { "supported_frequencies_Hz", KSTAT_DATA_STRING }, + { "pg_id", KSTAT_DATA_LONG }, #if defined(__sparcv9) { "device_ID", KSTAT_DATA_UINT64 }, { "cpu_fru", KSTAT_DATA_STRING }, @@ -2196,6 +2205,7 @@ static struct { { "ncore_per_chip", KSTAT_DATA_INT32 }, { "supported_max_cstates", KSTAT_DATA_INT32 }, { "current_cstate", KSTAT_DATA_INT32 }, + { "cache_id", KSTAT_DATA_INT32 }, { "socket_type", KSTAT_DATA_STRING }, #endif }; @@ -2253,6 +2263,9 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_core_id.value.l = pg_plat_get_core_id(cp); cpu_info_template.ci_curr_clock_Hz.value.ui64 = cp->cpu_curr_clock; + cpu_info_template.ci_pg_id.value.l = + cp->cpu_pg && cp->cpu_pg->cmt_lineage ? 
+ cp->cpu_pg->cmt_lineage->pg_id : -1; kstat_named_setstr(&cpu_info_template.ci_supp_freq_Hz, cp->cpu_supp_freqs); #if defined(__sparcv9) @@ -2273,6 +2286,7 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp); cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates; cpu_info_template.ci_curr_cstate.value.l = cpu_idle_get_cpu_state(cp); + cpu_info_template.ci_cacheid.value.i32 = cpuid_get_cacheid(cp); kstat_named_setstr(&cpu_info_template.ci_sktstr, cpuid_getsocketstr(cp)); #endif diff --git a/usr/src/uts/common/os/group.c b/usr/src/uts/common/os/group.c index 01e3f1ebdd..e46e7f600c 100644 --- a/usr/src/uts/common/os/group.c +++ b/usr/src/uts/common/os/group.c @@ -28,6 +28,7 @@ #include <sys/debug.h> #include <sys/kmem.h> #include <sys/group.h> +#include <sys/cmn_err.h> #define GRP_SET_SIZE_DEFAULT 2 @@ -352,3 +353,102 @@ group_find(group_t *g, void *e) } return ((uint_t)-1); } + +/* + * Return a string in a given buffer with list of integer entries in a group. + * The string concatenates consecutive integer ranges ax x-y. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + */ +char * +group2intlist(group_t *group, char *buffer, size_t len, int (convert)(void*)) +{ + char *ptr = buffer; + void *v; + group_iter_t iter; + boolean_t first_iteration = B_TRUE; + boolean_t first_value = B_TRUE; + int start = 0, end = 0; + + /* + * Allow for the terminating NULL-byte + */ + len = len -1; + + group_iter_init(&iter); + while ((v = group_iterate(group, &iter)) != NULL && len > 0) { + int id = convert(v); + int nbytes = 0; + + if (first_iteration) { + start = end = id; + first_iteration = B_FALSE; + } else if (end + 1 == id) { + /* + * Got consecutive ID, so extend end of range without + * doing anything since the range may extend further + */ + end = id; + } else { + if (first_value) { + first_value = B_FALSE; + } else { + *ptr++ = ','; + len--; + } + + if (len == 0) + break; + + /* + * Next ID is not consecutive, so dump IDs gotten so + * far. + */ + if (end > start + 1) /* range */ + nbytes = snprintf(ptr, len, "%d-%d", + start, end); + else if (end > start) /* different values */ + nbytes = snprintf(ptr, len, "%d,%d", + start, end); + else /* same value */ + nbytes = snprintf(ptr, len, "%d", start); + + if (nbytes <= 0) { + len = 0; + break; + } + + /* + * Advance position in the string + */ + ptr += nbytes; + len -= nbytes; + + /* + * Try finding consecutive range starting from current + * ID. 
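 *
 * (Editorial worked example, not part of this patch: for the
 * IDs 1,2,3,4,5,8 this loop emits "1-5" when it reaches 8, and
 * the code after the loop appends ",8", giving "1-5,8".)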
+ */ + start = end = id; + } + } + + if (!first_value) { + *ptr++ = ','; + len--; + } + /* + * Print last ID(s) + */ + if (len > 0) { + if (end > start + 1) { + (void) snprintf(ptr, len, "%d-%d", start, end); + } else if (end != start) { + (void) snprintf(ptr, len, "%d,%d", start, end); + } else { + (void) snprintf(ptr, len, "%d", start); + } + } + + return (buffer); +} diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c index e5cab151b8..50a999dcc5 100644 --- a/usr/src/uts/common/os/kcpc.c +++ b/usr/src/uts/common/os/kcpc.c @@ -39,12 +39,17 @@ #include <sys/sunddi.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/archsystm.h> +#include <sys/promif.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> #if defined(__x86) #include <asm/clock.h> +#include <sys/xc_levels.h> #endif -kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ -kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ +static kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ +static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ @@ -73,10 +78,75 @@ static int kcpc_nullctx_panic = 0; static void kcpc_lwp_create(kthread_t *t, kthread_t *ct); static void kcpc_restore(kcpc_ctx_t *ctx); static void kcpc_save(kcpc_ctx_t *ctx); -static void kcpc_free(kcpc_ctx_t *ctx, int isexec); static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx); static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch); static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set); +static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs, + int set_flags, int kmem_flags); + +/* + * Macros to manipulate context flags. All flag updates should use one of these + * two macros + * + * Flags should be always be updated atomically since some of the updates are + * not protected by locks. + */ +#define KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag)) +#define KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag)) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +extern int kcpc_hw_load_pcbe(void); + +/* + * Return value from kcpc_hw_load_pcbe() + */ +static int kcpc_pcbe_error = 0; + +/* + * Perform one-time initialization of kcpc framework. + * This function performs the initialization only the first time it is called. + * It is safe to call it multiple times. + */ +int +kcpc_init(void) +{ + long hash; + static uint32_t kcpc_initialized = 0; + + /* + * We already tried loading platform pcbe module and failed + */ + if (kcpc_pcbe_error != 0) + return (-1); + + /* + * The kcpc framework should be initialized at most once + */ + if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0) + return (0); + + rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); + for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) + mutex_init(&kcpc_ctx_llock[hash], + NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); + + /* + * Load platform-specific pcbe module + */ + kcpc_pcbe_error = kcpc_hw_load_pcbe(); + + return (kcpc_pcbe_error == 0 ? 
0 : -1); +} void kcpc_register_pcbe(pcbe_ops_t *ops) @@ -103,8 +173,9 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) cpu_t *cp; kcpc_ctx_t *ctx; int error; + int save_spl; - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); if (kcpc_assign_reqs(set, ctx) != 0) { kcpc_ctx_free(ctx); @@ -141,28 +212,34 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) goto unbound; mutex_enter(&cp->cpu_cpc_ctxlock); + kpreempt_disable(); + save_spl = spl_xcall(); - if (cp->cpu_cpc_ctx != NULL) { + /* + * Check to see whether counters for CPU already being used by someone + * other than kernel for capacity and utilization (since kernel will + * let go of counters for user in kcpc_program() below) + */ + if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) { /* * If this CPU already has a bound set, return an error. */ + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } if (curthread->t_bind_cpu != cpuid) { + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } - cp->cpu_cpc_ctx = ctx; - /* - * Kernel preemption must be disabled while fiddling with the hardware - * registers to prevent partial updates. - */ - kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kcpc_program(ctx, B_FALSE, B_TRUE); + + splx(save_spl); kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); @@ -197,14 +274,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) if (t->t_cpc_ctx != NULL) return (EEXIST); - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); /* * The context must begin life frozen until it has been properly * programmed onto the hardware. This prevents the context ops from * worrying about it until we're ready. */ - ctx->kc_flags |= KCPC_CTX_FREEZE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); ctx->kc_hrtime = gethrtime(); if (kcpc_assign_reqs(set, ctx) != 0) { @@ -215,13 +292,13 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) ctx->kc_cpuid = -1; if (set->ks_flags & CPC_BIND_LWP_INHERIT) - ctx->kc_flags |= KCPC_CTX_LWPINHERIT; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT); ctx->kc_thread = t; t->t_cpc_ctx = ctx; /* * Permit threads to look at their own hardware counters from userland. */ - ctx->kc_flags |= KCPC_CTX_NONPRIV; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV); /* * Create the data store for this set. @@ -248,12 +325,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * Ask the backend to program the hardware. */ if (t == curthread) { + int save_spl; + kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); - pcbe_ops->pcbe_program(ctx); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); kpreempt_enable(); - } else + } else { /* * Since we are the agent LWP, we know the victim LWP is stopped * until we're done here; no need to worry about preemption or @@ -262,7 +341,8 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * still be accessed from, for instance, another CPU doing a * kcpc_invalidate_all(). */ - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + } mutex_enter(&set->ks_lock); set->ks_state |= KCPC_SET_BOUND; @@ -304,7 +384,7 @@ kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode) * notification, we flag the context as being one that * cares about overflow. 
*/ - ctx->kc_flags |= KCPC_CTX_SIGOVF; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF); } rp->kr_config = NULL; @@ -349,7 +429,7 @@ int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) { kcpc_ctx_t *ctx = set->ks_ctx; - uint64_t curtick = KCPC_GET_TICK(); + int save_spl; mutex_enter(&set->ks_lock); if ((set->ks_state & KCPC_SET_BOUND) == 0) { @@ -358,41 +438,53 @@ kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) } mutex_exit(&set->ks_lock); - if (ctx->kc_flags & KCPC_CTX_INVALID) + /* + * Kernel preemption must be disabled while reading the hardware regs, + * and if this is a CPU-bound context, while checking the CPU binding of + * the current thread. + */ + kpreempt_disable(); + save_spl = spl_xcall(); + + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) { - /* - * Kernel preemption must be disabled while reading the - * hardware regs, and if this is a CPU-bound context, while - * checking the CPU binding of the current thread. - */ - kpreempt_disable(); - if (ctx->kc_cpuid != -1) { if (curthread->t_bind_cpu != ctx->kc_cpuid) { + splx(save_spl); kpreempt_enable(); return (EAGAIN); } } if (ctx->kc_thread == curthread) { - ctx->kc_hrtime = gethrtime(); + uint64_t curtick = KCPC_GET_TICK(); + + ctx->kc_hrtime = gethrtime_waitfree(); pcbe_ops->pcbe_sample(ctx); ctx->kc_vtick += curtick - ctx->kc_rawtick; ctx->kc_rawtick = curtick; } - kpreempt_enable(); - /* * The config may have been invalidated by * the pcbe_sample op. */ - if (ctx->kc_flags & KCPC_CTX_INVALID) + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } + } + splx(save_spl); + kpreempt_enable(); + if (copyout(set->ks_data, buf, set->ks_nreqs * sizeof (uint64_t)) == -1) return (EFAULT); @@ -412,20 +504,17 @@ kcpc_stop_hw(kcpc_ctx_t *ctx) { cpu_t *cp; - ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) - == KCPC_CTX_INVALID); - kpreempt_disable(); - cp = cpu_get(ctx->kc_cpuid); - ASSERT(cp != NULL); + if (ctx->kc_cpuid == CPU->cpu_id) { + cp = CPU; + } else { + cp = cpu_get(ctx->kc_cpuid); + } + + ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx); + kcpc_cpu_stop(cp, B_FALSE); - if (cp == CPU) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - } else - kcpc_remote_stop(cp); kpreempt_enable(); } @@ -451,7 +540,7 @@ kcpc_unbind(kcpc_set_t *set) * Use kc_lock to synchronize with kcpc_restore(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (ctx->kc_cpuid == -1) { @@ -461,12 +550,14 @@ kcpc_unbind(kcpc_set_t *set) * context. It will be freed via removectx() calling * freectx() calling kcpc_free(). 
*/ - if (t == curthread && - (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { + if (t == curthread) { + int save_spl; + kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); + save_spl = spl_xcall(); + if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); kpreempt_enable(); } #ifdef DEBUG @@ -503,7 +594,6 @@ kcpc_unbind(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) kcpc_stop_hw(ctx); ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); @@ -543,12 +633,20 @@ kcpc_restart(kcpc_set_t *set) { kcpc_ctx_t *ctx = set->ks_ctx; int i; + int save_spl; ASSERT(set->ks_state & KCPC_SET_BOUND); ASSERT(ctx->kc_thread == curthread); ASSERT(ctx->kc_cpuid == -1); + for (i = 0; i < set->ks_nreqs; i++) { + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + kpreempt_disable(); + save_spl = spl_xcall(); /* * If the user is doing this on a running set, make sure the counters @@ -557,18 +655,13 @@ kcpc_restart(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) pcbe_ops->pcbe_allstop(); - for (i = 0; i < set->ks_nreqs; i++) { - *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; - pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, - 0, 0, NULL, &set->ks_req[i].kr_config, NULL); - } - /* * Ask the backend to program the hardware. */ ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); + splx(save_spl); kpreempt_enable(); return (0); @@ -604,7 +697,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) return (EINVAL); kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); kcpc_restore(ctx); kpreempt_enable(); } else if (cmd == CPC_DISABLE) { @@ -612,7 +705,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) return (EINVAL); kpreempt_disable(); kcpc_save(ctx); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); kpreempt_enable(); } else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) { /* @@ -624,10 +717,11 @@ kcpc_enable(kthread_t *t, int cmd, int enable) CPC_COUNT_USER: CPC_COUNT_SYSTEM; kpreempt_disable(); - atomic_or_uint(&ctx->kc_flags, + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); pcbe_ops->pcbe_allstop(); kpreempt_enable(); + for (i = 0; i < set->ks_nreqs; i++) { set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data); if (enable) @@ -715,12 +809,14 @@ kcpc_next_config(void *token, void *current, uint64_t **data) kcpc_ctx_t * -kcpc_ctx_alloc(void) +kcpc_ctx_alloc(int kmem_flags) { kcpc_ctx_t *ctx; long hash; - ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP); + ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags); + if (ctx == NULL) + return (NULL); hash = CPC_HASH_CTX(ctx); mutex_enter(&kcpc_ctx_llock[hash]); @@ -909,9 +1005,10 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) */ if (kcpc_nullctx_panic) panic("null cpc context, thread %p", (void *)t); - - cmn_err(CE_WARN, +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context found in overflow handler!\n"); +#endif atomic_add_32(&kcpc_nullctx_count, 1); } else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) { /* @@ -935,13 
+1032,20 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) * so freeze the context. The interrupt handler * has already stopped the counter hardware. */ - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); atomic_or_uint(&ctx->kc_pics[i].kp_flags, KCPC_PIC_OVERFLOWED); } } aston(t); + } else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + /* + * Thread context is no longer valid, but here may be a valid + * CPU context. + */ + return (curthread->t_cpu->cpu_cpc_ctx); } + return (NULL); } @@ -956,6 +1060,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) kcpc_ctx_t *ctx; uint64_t bitmap; uint8_t *state; + int save_spl; if (pcbe_ops == NULL || (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0) @@ -985,6 +1090,13 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) (*dtrace_cpc_fire)(bitmap); ctx = curthread->t_cpu->cpu_cpc_ctx; + if (ctx == NULL) { +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context in" + "hardware overflow handler!\n"); +#endif + return (DDI_INTR_CLAIMED); + } /* Reset any counters that have overflowed */ for (i = 0; i < ctx->kc_set->ks_nreqs; i++) { @@ -1025,7 +1137,12 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) * the middle of updating it, no AST has been posted, and so we * should sample the counters here, and restart them with no * further fuss. + * + * The CPU's CPC context may disappear as a result of cross-call which + * has higher PIL on x86, so protect the context by raising PIL to the + * cross-call level. */ + save_spl = spl_xcall(); if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) { uint64_t curtick = KCPC_GET_TICK(); @@ -1035,6 +1152,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) pcbe_ops->pcbe_sample(ctx); pcbe_ops->pcbe_program(ctx); } + splx(save_spl); return (DDI_INTR_CLAIMED); } @@ -1087,7 +1205,7 @@ kcpc_overflow_ast() * Otherwise, re-enable the counters and continue life as before. */ kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); kpreempt_enable(); return (0); @@ -1099,43 +1217,68 @@ kcpc_overflow_ast() static void kcpc_save(kcpc_ctx_t *ctx) { + int err; + int save_spl; + + kpreempt_disable(); + save_spl = spl_xcall(); + if (ctx->kc_flags & KCPC_CTX_INVALID) { - if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) + if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + splx(save_spl); + kpreempt_enable(); return; + } /* * This context has been invalidated but the counters have not * been stopped. Stop them here and mark the context stopped. */ - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); + kpreempt_enable(); return; } pcbe_ops->pcbe_allstop(); - if (ctx->kc_flags & KCPC_CTX_FREEZE) + if (ctx->kc_flags & KCPC_CTX_FREEZE) { + splx(save_spl); + kpreempt_enable(); return; + } /* * Need to sample for all reqs into each req's current mpic. 
*/ - ctx->kc_hrtime = gethrtime(); + ctx->kc_hrtime = gethrtime_waitfree(); ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick; pcbe_ops->pcbe_sample(ctx); + + /* + * Program counter for measuring capacity and utilization since user + * thread isn't using counter anymore + */ + ASSERT(ctx->kc_cpuid == -1); + cu_cpc_program(CPU, &err); + splx(save_spl); + kpreempt_enable(); } static void kcpc_restore(kcpc_ctx_t *ctx) { + int save_spl; + mutex_enter(&ctx->kc_lock); + if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) == - KCPC_CTX_INVALID) + KCPC_CTX_INVALID) { /* * The context is invalidated but has not been marked stopped. * We mark it as such here because we will not start the * counters during this context switch. */ - ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED; - + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + } if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) { mutex_exit(&ctx->kc_lock); @@ -1151,7 +1294,7 @@ kcpc_restore(kcpc_ctx_t *ctx) * doing this, we're asking kcpc_free() to cv_wait() until * kcpc_restore() has completed. */ - ctx->kc_flags |= KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE); mutex_exit(&ctx->kc_lock); /* @@ -1159,14 +1302,17 @@ kcpc_restore(kcpc_ctx_t *ctx) * don't do an explicit pcbe_allstop() here because they should have * been stopped already by the last consumer. */ - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kpreempt_disable(); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); + kpreempt_enable(); /* * Wake the agent thread if it's waiting in kcpc_free(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags &= ~KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE); cv_signal(&ctx->kc_condv); mutex_exit(&ctx->kc_lock); } @@ -1177,7 +1323,6 @@ kcpc_restore(kcpc_ctx_t *ctx) * counters when the idle thread is switched on, and they start them again when * it is switched off. */ - /*ARGSUSED*/ void kcpc_idle_save(struct cpu *cp) @@ -1242,7 +1387,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) rw_exit(&kcpc_cpuctx_lock); return; } - cctx = kcpc_ctx_alloc(); + cctx = kcpc_ctx_alloc(KM_SLEEP); kcpc_ctx_clone(ctx, cctx); rw_exit(&kcpc_cpuctx_lock); @@ -1250,7 +1395,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * Copy the parent context's kc_flags field, but don't overwrite * the child's in case it was modified during kcpc_ctx_clone. */ - cctx->kc_flags |= ctx->kc_flags; + KCPC_CTX_FLAG_SET(cctx, ctx->kc_flags); cctx->kc_thread = ct; cctx->kc_cpuid = -1; ct->t_cpc_set = cctx->kc_set; @@ -1265,13 +1410,14 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * set to UINT64_MAX, and their pic's overflow flag turned on * so that our trap() processing knows to send a signal. 
*/ - atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); for (i = 0; i < ks->ks_nreqs; i++) { kcpc_request_t *kr = &ks->ks_req[i]; if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) { *(kr->kr_data) = UINT64_MAX; - kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED; + atomic_or_uint(&kr->kr_picp->kp_flags, + KCPC_PIC_OVERFLOWED); } } ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW; @@ -1315,7 +1461,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) */ /*ARGSUSED*/ -static void +void kcpc_free(kcpc_ctx_t *ctx, int isexec) { int i; @@ -1329,7 +1475,7 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) mutex_enter(&ctx->kc_lock); while (ctx->kc_flags & KCPC_CTX_RESTORE) cv_wait(&ctx->kc_condv, &ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (isexec) { @@ -1356,21 +1502,22 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) if (cp != NULL) { mutex_enter(&cp->cpu_cpc_ctxlock); kcpc_stop_hw(ctx); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); ASSERT(curthread->t_cpc_ctx == NULL); } else { + int save_spl; + /* * Thread-bound context; stop _this_ CPU's counters. */ kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - kpreempt_enable(); + save_spl = spl_xcall(); + kcpc_unprogram(ctx, B_TRUE); curthread->t_cpc_ctx = NULL; + splx(save_spl); + kpreempt_enable(); } /* @@ -1435,7 +1582,7 @@ kcpc_invalidate_all(void) for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) { mutex_enter(&kcpc_ctx_llock[hash]); for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&kcpc_ctx_llock[hash]); } } @@ -1451,7 +1598,7 @@ kcpc_invalidate_config(void *token) ASSERT(ctx != NULL); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1462,18 +1609,11 @@ kcpc_passivate(void) { kcpc_ctx_t *ctx = curthread->t_cpc_ctx; kcpc_set_t *set = curthread->t_cpc_set; + int save_spl; if (set == NULL) return; - /* - * We're cleaning up after this thread; ensure there are no dangling - * CPC pointers left behind. The context and set will be freed by - * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in - * the case of a CPU-bound set. - */ - curthread->t_cpc_ctx = NULL; - if (ctx == NULL) { /* * This thread has a set but no context; it must be a CPU-bound @@ -1491,6 +1631,8 @@ kcpc_passivate(void) return; } + kpreempt_disable(); + save_spl = spl_xcall(); curthread->t_cpc_set = NULL; /* @@ -1500,13 +1642,20 @@ kcpc_passivate(void) * INVALID_STOPPED flag here and kcpc_restore() setting the flag during * a context switch. */ - - kpreempt_disable(); if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, + kcpc_unprogram(ctx, B_TRUE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); } + + /* + * We're cleaning up after this thread; ensure there are no dangling + * CPC pointers left behind. The context and set will be freed by + * freectx(). 
+ */ + curthread->t_cpc_ctx = NULL; + + splx(save_spl); kpreempt_enable(); } @@ -1667,7 +1816,7 @@ kcpc_invalidate(kthread_t *t) kcpc_ctx_t *ctx = t->t_cpc_ctx; if (ctx != NULL) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1691,6 +1840,648 @@ kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third) "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0); } +/* + * Create one or more CPC context for given CPU with specified counter event + * requests + * + * If number of requested counter events is less than or equal number of + * hardware counters on a CPU and can all be assigned to the counters on a CPU + * at the same time, then make one CPC context. + * + * Otherwise, multiple CPC contexts are created to allow multiplexing more + * counter events than existing counters onto the counters by iterating through + * all of the CPC contexts, programming the counters with each CPC context one + * at a time and measuring the resulting counter values. Each of the resulting + * CPC contexts contains some number of requested counter events less than or + * equal the number of counters on a CPU depending on whether all the counter + * events can be programmed on all the counters at the same time or not. + * + * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying + * whether memory allocation should be non-blocking or not. The code will try + * to allocate *whole* CPC contexts if possible. If there is any memory + * allocation failure during the allocations needed for a given CPC context, it + * will skip allocating that CPC context because it cannot allocate the whole + * thing. Thus, the only time that it will end up allocating none (ie. no CPC + * contexts whatsoever) is when it cannot even allocate *one* whole CPC context + * without a memory allocation failure occurring. + */ +int +kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags, + kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz) +{ + kcpc_ctx_t **ctx_ptrs; + int nctx; + int nctx_ptrs; + int nreqs; + kcpc_request_t *reqs; + + if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL || + req_list == NULL || req_list->krl_cnt < 1) + return (-1); + + /* + * Allocate number of sets assuming that each set contains one and only + * one counter event request for each counter on a CPU + */ + nreqs = req_list->krl_cnt; + nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters; + ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags); + if (ctx_ptrs == NULL) + return (-2); + + /* + * Fill in sets of requests + */ + nctx = 0; + reqs = req_list->krl_list; + while (nreqs > 0) { + kcpc_ctx_t *ctx; + kcpc_set_t *set; + int subcode; + + /* + * Allocate CPC context and set for requested counter events + */ + ctx = kcpc_ctx_alloc(kmem_flags); + set = kcpc_set_create(reqs, nreqs, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + + /* + * Determine assignment of requested counter events to specific + * counters + */ + if (kcpc_assign_reqs(set, ctx) != 0) { + /* + * May not be able to assign requested counter events + * to all counters since all counters may not be able + * to do all events, so only do one counter event in + * set of counter requests when this happens since at + * least one of the counters must be able to do the + * event. 
+ */ + kcpc_free_set(set); + set = kcpc_set_create(reqs, 1, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + if (kcpc_assign_reqs(set, ctx) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't " + "assign counter event %s!\n", + set->ks_req->kr_event); +#endif + kcpc_free_set(set); + kcpc_ctx_free(ctx); + reqs++; + nreqs--; + continue; + } + } + + /* + * Allocate memory needed to hold requested counter event data + */ + set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), + kmem_flags); + if (set->ks_data == NULL) { + kcpc_free_set(set); + kcpc_ctx_free(ctx); + break; + } + + /* + * Configure requested counter events + */ + if (kcpc_configure_reqs(ctx, set, &subcode) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, + "!kcpc_cpu_ctx_create: can't configure " + "set of counter event requests!\n"); +#endif + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + kmem_free(set->ks_data, + set->ks_nreqs * sizeof (uint64_t)); + kcpc_free_set(set); + kcpc_ctx_free(ctx); + continue; + } + + /* + * Point set of counter event requests at this context and fill + * in CPC context + */ + set->ks_ctx = ctx; + ctx->kc_set = set; + ctx->kc_cpuid = cp->cpu_id; + ctx->kc_thread = curthread; + + ctx_ptrs[nctx] = ctx; + + /* + * Update requests and how many are left to be assigned to sets + */ + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + + /* + * Increment number of CPC contexts and allocate bigger array + * for context pointers as needed + */ + nctx++; + if (nctx >= nctx_ptrs) { + kcpc_ctx_t **new; + int new_cnt; + + /* + * Allocate more CPC contexts based on how many + * contexts allocated so far and how many counter + * requests left to assign + */ + new_cnt = nctx_ptrs + + ((nreqs + cpc_ncounters - 1) / cpc_ncounters); + new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *), + kmem_flags); + if (new == NULL) + break; + + /* + * Copy contents of old sets into new ones + */ + bcopy(ctx_ptrs, new, + nctx_ptrs * sizeof (kcpc_ctx_t *)); + + /* + * Free old array of context pointers and use newly + * allocated one instead now + */ + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + ctx_ptrs = new; + nctx_ptrs = new_cnt; + } + } + + /* + * Return NULL if no CPC contexts filled in + */ + if (nctx == 0) { + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + *ctx_ptr_array = NULL; + *ctx_ptr_array_sz = 0; + return (-2); + } + + *ctx_ptr_array = ctx_ptrs; + *ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *); + return (nctx); +} + +/* + * Return whether PCBE supports given counter event + */ +boolean_t +kcpc_event_supported(char *event) +{ + if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Program counters on current CPU with given CPC context + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then unprogram counters for kernel *before* programming them + * with specified CPC context. + * + * kcpc_{program,unprogram}() may be called either directly by a thread running + * on the target CPU or from a cross-call from another CPU. To protect + * programming and unprogramming from being interrupted by cross-calls, callers + * who execute kcpc_{program,unprogram} should raise PIL to the level used by + * cross-calls. 
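 *
 * Editorial illustration (not itself part of this change): a caller is
 * expected to follow the same pattern that kcpc_bind_thread() and
 * kcpc_restore() use above, i.e.
 *
 *	kpreempt_disable();
 *	save_spl = spl_xcall();
 *	kcpc_program(ctx, B_TRUE, B_TRUE);
 *	splx(save_spl);
 *	kpreempt_enable();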
+ */ +void +kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1) + return; + + /* + * Unprogram counters for kernel measuring hardware capacity and + * utilization + */ + if (cu_interpose == B_TRUE) { + cu_cpc_unprogram(CPU, &error); + } else { + kcpc_set_t *set = ctx->kc_set; + int i; + + ASSERT(set != NULL); + + /* + * Since cu_interpose is false, we are programming CU context. + * In general, PCBE can continue from the state saved in the + * set, but it is not very reliable, so we start again from the + * preset value. + */ + for (i = 0; i < set->ks_nreqs; i++) { + /* + * Reset the virtual counter value to the preset value. + */ + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + + /* + * Reset PCBE to the preset value. + */ + pcbe_ops->pcbe_configure(0, NULL, + set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + } + + /* + * Program counters with specified CPC context + */ + ctx->kc_rawtick = KCPC_GET_TICK(); + pcbe_ops->pcbe_program(ctx); + + /* + * Denote that counters programmed for thread or CPU CPC context + * differently + */ + if (for_thread == B_TRUE) + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + else + CPU->cpu_cpc_ctx = ctx; +} + +/* + * Unprogram counters with given CPC context on current CPU + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then program counters for the kernel capacity and utilization + * *after* unprogramming them for given CPC context. + * + * See the comment for kcpc_program regarding the synchronization with + * cross-calls. 
+ */ +void +kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1 || + (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) { + return; + } + + /* + * Specified CPC context to be unprogrammed should be bound to current + * CPU or thread + */ + ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx); + + /* + * Stop counters + */ + pcbe_ops->pcbe_allstop(); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + + /* + * Allow kernel to interpose on counters and program them for its own + * use to measure hardware capacity and utilization if cu_interpose + * argument is true + */ + if (cu_interpose == B_TRUE) + cu_cpc_program(CPU, &error); +} + +/* + * Read CPU Performance Counter (CPC) on current CPU and call specified update + * routine with data for each counter event currently programmed on CPU + */ +int +kcpc_read(kcpc_update_func_t update_func) +{ + kcpc_ctx_t *ctx; + int i; + kcpc_request_t *req; + int retval; + kcpc_set_t *set; + + ASSERT(IS_HIPIL()); + + /* + * Can't grab locks or block because may be called inside dispatcher + */ + kpreempt_disable(); + + ctx = CPU->cpu_cpc_ctx; + if (ctx == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Read counter data from current CPU + */ + pcbe_ops->pcbe_sample(ctx); + + set = ctx->kc_set; + if (set == NULL || set->ks_req == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Call update function with preset pointer and data for each CPC event + * request currently programmed on current CPU + */ + req = set->ks_req; + retval = 0; + for (i = 0; i < set->ks_nreqs; i++) { + int ret; + + if (req[i].kr_data == NULL) + break; + + ret = update_func(req[i].kr_ptr, *req[i].kr_data); + if (ret < 0) + retval = ret; + } + + kpreempt_enable(); + + return (retval); +} + +/* + * Initialize list of counter event requests + */ +kcpc_request_list_t * +kcpc_reqs_init(int nreqs, int kmem_flags) +{ + kcpc_request_list_t *req_list; + kcpc_request_t *reqs; + + if (nreqs < 1) + return (NULL); + + req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags); + if (req_list == NULL) + return (NULL); + + reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags); + if (reqs == NULL) { + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (NULL); + } + + req_list->krl_list = reqs; + req_list->krl_cnt = 0; + req_list->krl_max = nreqs; + return (req_list); +} + + +/* + * Add counter event request to given list of counter event requests + */ +int +kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset, + uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags) +{ + kcpc_request_t *req; + + ASSERT(req_list->krl_max != 0); + if (req_list == NULL || req_list->krl_list == NULL) + return (-1); + + /* + * Allocate more space (if needed) + */ + if (req_list->krl_cnt > req_list->krl_max) { + kcpc_request_t *new; + kcpc_request_t *old; + + old = req_list->krl_list; + new = kmem_zalloc((req_list->krl_max + + cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags); + if (new == NULL) + return (-2); + + req_list->krl_list = new; + bcopy(old, req_list->krl_list, + 
req_list->krl_cnt * sizeof (kcpc_request_t)); + kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + req_list->krl_max += cpc_ncounters; + } + + /* + * Fill in request as much as possible now, but some fields will need + * to be set when request is assigned to a set. + */ + req = &req_list->krl_list[req_list->krl_cnt]; + req->kr_config = NULL; + req->kr_picnum = -1; /* have CPC pick this */ + req->kr_index = -1; /* set when assigning request to set */ + req->kr_data = NULL; /* set when configuring request */ + (void) strcpy(req->kr_event, event); + req->kr_preset = preset; + req->kr_flags = flags; + req->kr_nattrs = nattrs; + req->kr_attr = attr; + /* + * Keep pointer given by caller to give to update function when this + * counter event is sampled/read + */ + req->kr_ptr = ptr; + + req_list->krl_cnt++; + + return (0); +} + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +int +kcpc_reqs_reset(kcpc_request_list_t *req_list) +{ + /* + * Return when pointer to request list structure or request is NULL or + * when max requests is less than or equal to 0 + */ + if (req_list == NULL || req_list->krl_list == NULL || + req_list->krl_max <= 0) + return (-1); + + /* + * Zero out requests and number of requests used + */ + bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + return (0); +} + +/* + * Free given list of counter event requests + */ +int +kcpc_reqs_fini(kcpc_request_list_t *req_list) +{ + kmem_free(req_list->krl_list, + req_list->krl_max * sizeof (kcpc_request_t)); + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (0); +} + +/* + * Create set of given counter event requests + */ +static kcpc_set_t * +kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags) +{ + int i; + kcpc_set_t *set; + + /* + * Allocate set and assign number of requests in set and flags + */ + set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags); + if (set == NULL) + return (NULL); + + if (nreqs < cpc_ncounters) + set->ks_nreqs = nreqs; + else + set->ks_nreqs = cpc_ncounters; + + set->ks_flags = set_flags; + + /* + * Allocate requests needed, copy requests into set, and set index into + * data for each request (which may change when we assign requested + * counter events to counters) + */ + set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) * + set->ks_nreqs, kmem_flags); + if (set->ks_req == NULL) { + kmem_free(set, sizeof (kcpc_set_t)); + return (NULL); + } + + bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs); + + for (i = 0; i < set->ks_nreqs; i++) + set->ks_req[i].kr_index = i; + + return (set); +} + + +/* + * Stop counters on current CPU. + * + * If preserve_context is true, the caller is interested in the CPU's CPC + * context and wants it to be preserved. + * + * If preserve_context is false, the caller does not need the CPU's CPC context + * to be preserved, so it is set to NULL. + */ +static void +kcpc_cpustop_func(boolean_t preserve_context) +{ + kpreempt_disable(); + + /* + * Someone already stopped this context before us, so there is nothing + * to do. + */ + if (CPU->cpu_cpc_ctx == NULL) { + kpreempt_enable(); + return; + } + + kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE); + /* + * If CU does not use counters, then clear the CPU's CPC context + * If the caller requested to preserve context it should disable CU + * first, so there should be no CU context now. 
+ */ + ASSERT(!preserve_context || !CU_CPC_ON(CPU)); + if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU)) + CPU->cpu_cpc_ctx = NULL; + + kpreempt_enable(); +} + +/* + * Stop counters on given CPU and set its CPC context to NULL unless + * preserve_context is true. + */ +void +kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_cpustop_func, + preserve_context, 0); +} + +/* + * Program the context on the current CPU + */ +static void +kcpc_remoteprogram_func(kcpc_ctx_t *ctx, uintptr_t arg) +{ + boolean_t for_thread = (boolean_t)arg; + + ASSERT(ctx != NULL); + + kpreempt_disable(); + kcpc_program(ctx, for_thread, B_TRUE); + kpreempt_enable(); +} + +/* + * Program counters on given CPU + */ +void +kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_remoteprogram_func, (uintptr_t)ctx, + (uintptr_t)B_FALSE); +} + char * kcpc_list_attrs(void) { diff --git a/usr/src/uts/common/os/pg.c b/usr/src/uts/common/os/pg.c index 067670dbbb..835ae3d322 100644 --- a/usr/src/uts/common/os/pg.c +++ b/usr/src/uts/common/os/pg.c @@ -110,7 +110,11 @@ static cpu_pg_t bootstrap_pg_data; * and the next free id in the set. */ static bitset_t pg_id_set; -static pgid_t pg_id_next = 0; + +/* + * ID space starts from 1 to assume that root has ID 0; + */ +static pgid_t pg_id_next = 1; /* * Default and externed PG ops vectors diff --git a/usr/src/uts/common/os/pghw.c b/usr/src/uts/common/os/pghw.c index ca59db8602..534cb2c540 100644 --- a/usr/src/uts/common/os/pghw.c +++ b/usr/src/uts/common/os/pghw.c @@ -34,6 +34,7 @@ #include <sys/pg.h> #include <sys/pghw.h> #include <sys/cpu_pm.h> +#include <sys/cap_util.h> /* * Processor Groups: Hardware sharing relationship layer @@ -116,10 +117,10 @@ struct pghw_kstat { kstat_named_t pg_hw; kstat_named_t pg_policy; } pghw_kstat = { - { "id", KSTAT_DATA_UINT64 }, + { "id", KSTAT_DATA_UINT32 }, { "pg_class", KSTAT_DATA_STRING }, - { "ncpus", KSTAT_DATA_UINT64 }, - { "instance_id", KSTAT_DATA_UINT64 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "instance_id", KSTAT_DATA_UINT32 }, { "hardware", KSTAT_DATA_STRING }, { "policy", KSTAT_DATA_STRING }, }; @@ -127,12 +128,92 @@ struct pghw_kstat { kmutex_t pghw_kstat_lock; /* + * Capacity and Utilization PG kstats + * + * These kstats are updated one at a time, so we can have a single scratch space + * to fill the data. + * + * kstat fields: + * + * pgid PG ID for PG described by this kstat + * + * pg_ncpus Number of CPUs within this PG + * + * pg_cpus String describing CPUs within this PG + * + * pg_sharing Name of sharing relationship for this PG + * + * pg_generation Generation value that increases whenever any CPU leaves + * or joins PG. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation + * + * pg_hw_util Running value of PG utilization for the sharing + * relationship + * + * pg_hw_util_time_running + * Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * pg_hw_util_time_stopped Total time the CU counters were stopped. + * + * pg_hw_util_rate Utilization rate, expressed in operations per second. + * + * pg_hw_util_rate_max Maximum observed value of utilization rate. 
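 *
 * Editorial sketch (not part of this patch): from userland these kstats can
 * be read with libkstat, using the module/instance/name chosen by the
 * kstat_create() call in pghw_kstat_create() below:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "pg", pg_id, "hardware");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "hw_util_rate");
 *		...
 *	}
 *	(void) kstat_close(kc);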
+ */ +struct pghw_cu_kstat { + kstat_named_t pg_id; + kstat_named_t pg_ncpus; + kstat_named_t pg_generation; + kstat_named_t pg_hw_util; + kstat_named_t pg_hw_util_time_running; + kstat_named_t pg_hw_util_time_stopped; + kstat_named_t pg_hw_util_rate; + kstat_named_t pg_hw_util_rate_max; + kstat_named_t pg_cpus; + kstat_named_t pg_sharing; +} pghw_cu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, + { "cpus", KSTAT_DATA_STRING }, + { "sharing_relation", KSTAT_DATA_STRING }, +}; + +/* + * Calculate the string size to represent NCPUS. Allow 5 digits for each CPU ID + * plus one space per CPU plus NUL byte in the end. This is only an estimate, + * since we try to compress CPU ranges as x-y. In the worst case the string + * representation of CPUs may be truncated. + */ +#define CPUSTR_LEN(ncpus) ((ncpus) * 6) + +/* + * Maximum length of the string that represents list of CPUs + */ +static int pg_cpulist_maxlen = 0; + +static void pghw_kstat_create(pghw_t *); +static int pghw_kstat_update(kstat_t *, int); +static int pghw_cu_kstat_update(kstat_t *, int); +static int cpu2id(void *); + +/* * hwset operations */ static group_t *pghw_set_create(pghw_type_t); static void pghw_set_add(group_t *, pghw_t *); static void pghw_set_remove(group_t *, pghw_t *); +static void pghw_cpulist_alloc(pghw_t *); +static int cpu2id(void *); + /* * Initialize the physical portion of a hardware PG */ @@ -150,6 +231,7 @@ pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw) pghw_set_add(hwset, pg); pg->pghw_hw = hw; + pg->pghw_generation = 0; pg->pghw_instance = pg_plat_hw_instance_id(cp, hw); pghw_kstat_create(pg); @@ -186,8 +268,20 @@ pghw_fini(pghw_t *pg) pg->pghw_instance = (id_t)PGHW_INSTANCE_ANON; pg->pghw_hw = (pghw_type_t)-1; - if (pg->pghw_kstat) + if (pg->pghw_kstat != NULL) kstat_delete(pg->pghw_kstat); + + /* + * Destroy string representation of CPUs + */ + if (pg->pghw_cpulist != NULL) { + kmem_free(pg->pghw_cpulist, + pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + } + + if (pg->pghw_cu_kstat != NULL) + kstat_delete(pg->pghw_cu_kstat); } /* @@ -344,11 +438,10 @@ pghw_set_remove(group_t *hwset, pghw_t *pg) ASSERT(result == 0); } - /* * Return a string name given a pg_hw sharing type */ -static char * +char * pghw_type_string(pghw_type_t hw) { switch (hw) { @@ -374,6 +467,34 @@ pghw_type_string(pghw_type_t hw) } /* + * Return a short string name given a pg_hw sharing type + */ +char * +pghw_type_shortstring(pghw_type_t hw) +{ + switch (hw) { + case PGHW_IPIPE: + return ("instr_pipeline"); + case PGHW_CACHE: + return ("Cache"); + case PGHW_FPU: + return ("FPU"); + case PGHW_MPIPE: + return ("memory_pipeline"); + case PGHW_CHIP: + return ("Socket"); + case PGHW_MEMORY: + return ("Memory"); + case PGHW_POW_ACTIVE: + return ("CPU_PM_Active"); + case PGHW_POW_IDLE: + return ("CPU_PM_Idle"); + default: + return ("unknown"); + } +} + +/* * Create / Update routines for PG hw kstats * * It is the intention of these kstats to provide some level @@ -383,11 +504,14 @@ pghw_type_string(pghw_type_t hw) void pghw_kstat_create(pghw_t *pg) { + char *class = pghw_type_string(pg->pghw_hw); + /* * Create a physical pg kstat */ if ((pg->pghw_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, - "pg", "pg", KSTAT_TYPE_NAMED, + "pg", "pg", + 
KSTAT_TYPE_NAMED, sizeof (pghw_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { /* Class string, hw string, and policy string */ @@ -400,6 +524,28 @@ pghw_kstat_create(pghw_t *pg) pg->pghw_kstat->ks_private = pg; kstat_install(pg->pghw_kstat); } + + if (pg_cpulist_maxlen == 0) + pg_cpulist_maxlen = CPUSTR_LEN(max_ncpus); + + /* + * Create a physical pg kstat + */ + if ((pg->pghw_cu_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, + "hardware", class, + KSTAT_TYPE_NAMED, + sizeof (pghw_cu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + pg->pghw_cu_kstat->ks_lock = &pghw_kstat_lock; + pg->pghw_cu_kstat->ks_data = &pghw_cu_kstat; + pg->pghw_cu_kstat->ks_update = pghw_cu_kstat_update; + pg->pghw_cu_kstat->ks_private = pg; + pg->pghw_cu_kstat->ks_data_size += strlen(class) + 1; + /* Allow space for CPU strings */ + pg->pghw_cu_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX; + pg->pghw_cu_kstat->ks_data_size += pg_cpulist_maxlen; + kstat_install(pg->pghw_cu_kstat); + } } int @@ -411,11 +557,147 @@ pghw_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); - pgsp->pg_id.value.ui64 = ((pg_t *)pg)->pg_id; - pgsp->pg_ncpus.value.ui64 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); - pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance; + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + pgsp->pg_instance_id.value.ui32 = pg->pghw_instance; kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name); kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw)); kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg)); return (0); } + +int +pghw_cu_kstat_update(kstat_t *ksp, int rw) +{ + struct pghw_cu_kstat *pgsp = &pghw_cu_kstat; + pghw_t *pg = ksp->ks_private; + pghw_util_t *hw_util = &pg->pghw_stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + + /* + * Allocate memory for the string representing the list of CPUs in PG. + * This memory should persist past the call to pghw_cu_kstat_update() + * since the kstat snapshot routine will reference this memory. + */ + pghw_cpulist_alloc(pg); + + if (pg->pghw_kstat_gen != pg->pghw_generation) { + /* + * PG kstat generation number is out of sync with PG's + * generation mumber. It means that some CPUs could have joined + * or left PG and it is not possible to compare the numbers + * obtained before and after the generation change. + * + * Reset the maximum utilization rate and start computing it + * from scratch. + */ + hw_util->pghw_util = 0; + hw_util->pghw_rate_max = 0; + pg->pghw_kstat_gen = pg->pghw_generation; + } + + /* + * We can't block on CPU lock because when PG is destroyed (under + * cpu_lock) it tries to delete this kstat and it will wait for us to + * complete which will never happen since we are waiting for cpu_lock to + * drop. Deadlocks are fun! 
+ */ + if (mutex_tryenter(&cpu_lock)) { + if (pg->pghw_cpulist != NULL && + *(pg->pghw_cpulist) == '\0') { + (void) group2intlist(&(((pg_t *)pg)->pg_cpus), + pg->pghw_cpulist, pg->pghw_cpulist_len, cpu2id); + } + cu_pg_update(pg); + mutex_exit(&cpu_lock); + } + + pgsp->pg_generation.value.ui32 = pg->pghw_kstat_gen; + pgsp->pg_hw_util.value.ui64 = hw_util->pghw_util; + pgsp->pg_hw_util_time_running.value.ui64 = hw_util->pghw_time_running; + pgsp->pg_hw_util_time_stopped.value.ui64 = hw_util->pghw_time_stopped; + pgsp->pg_hw_util_rate.value.ui64 = hw_util->pghw_rate; + pgsp->pg_hw_util_rate_max.value.ui64 = hw_util->pghw_rate_max; + if (pg->pghw_cpulist != NULL) + kstat_named_setstr(&pgsp->pg_cpus, pg->pghw_cpulist); + else + kstat_named_setstr(&pgsp->pg_cpus, ""); + + kstat_named_setstr(&pgsp->pg_sharing, pghw_type_string(pg->pghw_hw)); + + return (0); +} + +/* + * Update the string representation of CPUs in PG (pg->pghw_cpulist). + * The string representation is used for kstats. + * + * The string is allocated if it has not already been or if it is already + * allocated and PG has more CPUs now. If PG has smaller or equal number of + * CPUs, but the actual CPUs may have changed, the string is reset to the empty + * string causes the string representation to be recreated. The pghw_generation + * field is used to detect whether CPUs within the pg may have changed. + */ +static void +pghw_cpulist_alloc(pghw_t *pg) +{ + uint_t ncpus = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + size_t len = CPUSTR_LEN(ncpus); + + /* + * If the pghw_cpulist string is already allocated we need to make sure + * that it has sufficient length. Also if the set of CPUs may have + * changed, we need to re-generate the string. + */ + if (pg->pghw_cpulist != NULL && + pg->pghw_kstat_gen != pg->pghw_generation) { + if (len <= pg->pghw_cpulist_len) { + /* + * There is sufficient space in the pghw_cpulist for + * the new set of CPUs. Just clear the string to trigger + * re-generation of list of CPUs + */ + *(pg->pghw_cpulist) = '\0'; + } else { + /* + * There is, potentially, insufficient space in + * pghw_cpulist, so reallocate the string. + */ + ASSERT(strlen(pg->pghw_cpulist) < pg->pghw_cpulist_len); + kmem_free(pg->pghw_cpulist, pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + pg->pghw_cpulist_len = 0; + } + } + + if (pg->pghw_cpulist == NULL) { + /* + * Allocate space to hold cpulist. + * + * Length can not be bigger that the maximum space we have + * allowed for the kstat buffer + */ + if (len > pg_cpulist_maxlen) + len = pg_cpulist_maxlen; + if (len > 0) { + pg->pghw_cpulist = kmem_zalloc(len, KM_NOSLEEP); + if (pg->pghw_cpulist != NULL) + pg->pghw_cpulist_len = len; + } + } +} + +static int +cpu2id(void *v) +{ + cpu_t *cp = (cpu_t *)v; + + ASSERT(v != NULL); + + return (cp->cpu_id); +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 9006be10f4..5133e80e69 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -98,6 +98,7 @@ CHKHDRS= \ byteorder.h \ callb.h \ callo.h \ + cap_util.h \ cpucaps.h \ cpucaps_impl.h \ ccompile.h \ diff --git a/usr/src/uts/common/sys/cap_util.h b/usr/src/uts/common/sys/cap_util.h new file mode 100644 index 0000000000..7e25ba6697 --- /dev/null +++ b/usr/src/uts/common/sys/cap_util.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CAP_UTIL_H +#define _SYS_CAP_UTIL_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/kcpc.h> +#include <sys/cpc_impl.h> +#include <sys/pghw.h> +#include <sys/cmt.h> + +#ifdef _KERNEL + +/* + * Capacity and utilization flags for each CPU + */ +#define CU_CPU_CNTRS_ON 1 /* CPU performance counters are on */ +#define CU_CPU_CNTRS_OFF_ON 2 /* Off -> on transition */ + +/* + * Macro that returns whether CPU performance counters turned on for given CPU + */ +#define CU_CPC_ON(cp) \ + ((cp) != NULL && (cp)->cpu_cu_info != NULL && \ + ((cp)->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + + +/* + * Per counter statistics + */ +typedef struct cu_cntr_stats { + hrtime_t cs_time_running; /* running total of time counting */ + hrtime_t cs_time_stopped; /* ... time not counting */ + hrtime_t cs_time_start; /* start time of current sample */ + uint64_t cs_value_start; /* starting value for next sample */ + uint64_t cs_value_last; /* last value */ + uint64_t cs_value_total; /* running total */ + uint64_t cs_rate; /* observed rate since last */ + uint64_t cs_rate_max; /* maximum rate */ + kcpc_request_t *cs_cpc_req; /* corresponding CPC request */ + struct cpu *cs_cpu_start; /* CPU where starting value gotten */ +} cu_cntr_stats_t; + + +/* + * Counter info for a PG hardware sharing relationship + */ +typedef struct cu_cntr_info { + cpu_t *ci_cpu; /* CPU being measured */ + pghw_t *ci_pg; /* hardware PG being measured */ + kstat_t *ci_kstat; /* kstats being exported */ + cu_cntr_stats_t *ci_stats; /* counter statistics */ + uint_t ci_nstats; /* number of statistics */ +} cu_cntr_info_t; + + +/* + * Each CPU can have one or more CPC contexts for measuring capacity and + * utilization + * + * One CPC context is needed per CPU if the counter events needed to measure + * capacity and utilization on each CPU can be programmed onto all the counters + * on a CPU at the same time and there are fewer or same number of desired + * counter events as counters on each CPU. Otherwise, the desired counter + * events are assigned across multiple CPC contexts, so the contexts and their + * counter events can be multiplexed onto the counters over time to get the + * data for all of the counter events. 
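The context comment above boils down to simple bookkeeping: if more counter events are wanted than there are hardware counters on a CPU, ceil(nevents / ncounters) CPC contexts are needed, and a current index rotates through them so each context gets its turn on the hardware. A small self-contained sketch of that arithmetic is shown below; the helper names are illustrative, not from the source.

#include <stdio.h>

static int
contexts_needed(int nevents, int ncounters)
{
	return ((nevents + ncounters - 1) / ncounters);	/* ceiling */
}

static int
next_context(int cur_index, int nctx)
{
	return ((cur_index + 1) % nctx);	/* rotate to the next set */
}

int
main(void)
{
	int nctx = contexts_needed(5, 2);	/* 5 events, 2 counters */
	int idx = 0;

	(void) printf("%d contexts\n", nctx);	/* prints 3 */
	idx = next_context(idx, nctx);
	(void) printf("next %d\n", idx);	/* prints 1 */
	return (0);
}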
+ */ +typedef struct cu_cpc_ctx { + int cur_index; /* index for current context */ + int nctx; /* number of CPC contexts */ + kcpc_ctx_t **ctx_ptr_array; /* array of context pointers */ + size_t ctx_ptr_array_sz; /* size of array */ +} cu_cpc_ctx_t; + +/* + * Per CPU capacity and utilization info + */ +typedef struct cu_cpu_info { + struct cpu *cu_cpu; /* CPU for the statistics */ + uint_t cu_flag; /* capacity & utilization flag */ + hrtime_t cu_sample_time; /* when last sample taken */ + cu_cpc_ctx_t cu_cpc_ctx; /* performance counter contexts */ + cu_cntr_stats_t *cu_cntr_stats; /* counter statistics array */ + uint_t cu_ncntr_stats; /* number of counter statistics */ + uint_t cu_disabled; /* count of disable requests */ + /* + * Per PG hardware sharing relationship counter info + */ + cu_cntr_info_t *cu_cntr_info[PGHW_NUM_COMPONENTS]; +} cu_cpu_info_t; + +/* + * COMMON INTERFACE ROUTINES + */ + +/* + * Setup capacity and utilization support + */ +extern void cu_init(void); + +/* + * Tear down capacity and utilization support + */ +extern int cu_fini(void); + +/* + * Program CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_program(struct cpu *, int *); + +/* + * Unprogram CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_unprogram(struct cpu *, int *); + +/* + * Update counter statistics on a given CPU + */ +extern int cu_cpu_update(struct cpu *, boolean_t); + +/* + * Update utilization and capacity data for CMT PG + */ +extern void cu_pg_update(pghw_t *); + +/* + * Disable or enable capacity and utilization on all CPUs + */ +extern void cu_disable(void); +extern void cu_enable(void); + +/* + * PLATFORM SPECIFIC INTERFACE ROUTINES + */ +extern int cu_plat_cpc_init(cpu_t *, kcpc_request_list_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CAP_UTIL_H */ diff --git a/usr/src/uts/common/sys/cmt.h b/usr/src/uts/common/sys/cmt.h index 4e7ed28656..afdb6730a6 100644 --- a/usr/src/uts/common/sys/cmt.h +++ b/usr/src/uts/common/sys/cmt.h @@ -63,6 +63,7 @@ typedef struct pg_cmt { int cmt_nchildren; /* # of children CMT PGs */ struct group cmt_cpus_actv; struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */ + kstat_t *cmt_kstat; /* cmt kstats exported */ } pg_cmt_t; /* diff --git a/usr/src/uts/common/sys/cpc_impl.h b/usr/src/uts/common/sys/cpc_impl.h index 1b57c76c10..ae89c90508 100644 --- a/usr/src/uts/common/sys/cpc_impl.h +++ b/usr/src/uts/common/sys/cpc_impl.h @@ -131,7 +131,7 @@ typedef struct _kcpc_ctx kcpc_ctx_t; struct _kcpc_ctx { struct _kcpc_set *kc_set; /* linked list of all bound sets */ - uint32_t kc_flags; + volatile uint_t kc_flags; kcpc_pic_t *kc_pics; /* pointer to array of per-pic data */ hrtime_t kc_hrtime; /* gethrtime() at last sample */ uint64_t kc_vtick; /* virtualized %tick */ @@ -214,20 +214,18 @@ extern hrtime_t tsc_read(void); struct cpu; extern uint_t cpc_ncounters; -extern kmutex_t kcpc_ctx_llock[]; /* protects ctx_list */ -extern kcpc_ctx_t *kcpc_ctx_list[]; /* head of list */ extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ extern void kcpc_invalidate_all(void); extern void kcpc_passivate(void); -extern void kcpc_remote_stop(struct cpu *cp); +extern void kcpc_cpu_stop(struct cpu *, boolean_t); extern int kcpc_pcbe_tryload(const char *, uint_t, uint_t, uint_t); -extern void kcpc_remote_program(struct cpu *cp); +extern void kcpc_cpu_program(struct cpu *, kcpc_ctx_t *); extern void kcpc_register_dcpc(void 
(*func)(uint64_t)); extern void kcpc_unregister_dcpc(void); -extern kcpc_ctx_t *kcpc_ctx_alloc(void); +extern kcpc_ctx_t *kcpc_ctx_alloc(int); extern int kcpc_assign_reqs(struct _kcpc_set *, kcpc_ctx_t *); extern void kcpc_ctx_free(kcpc_ctx_t *); extern int kcpc_configure_reqs(kcpc_ctx_t *, struct _kcpc_set *, int *); diff --git a/usr/src/uts/common/sys/cpc_pcbe.h b/usr/src/uts/common/sys/cpc_pcbe.h index 7522a9bf82..eb168fcf2c 100644 --- a/usr/src/uts/common/sys/cpc_pcbe.h +++ b/usr/src/uts/common/sys/cpc_pcbe.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +36,6 @@ #ifndef _SYS_CPC_PCBE_H #define _SYS_CPC_PCBE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/cpc_impl.h> @@ -51,6 +48,8 @@ extern "C" { */ #define PCBE_VER_1 1 +#define PCBE_IMPL_NAME_P4HT "Pentium 4 with HyperThreading" + typedef struct __pcbe_ops { uint_t pcbe_ver; uint_t pcbe_caps; diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index aece259a35..b52192b419 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -222,6 +222,16 @@ typedef struct cpu { uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can @@ -597,6 +607,13 @@ extern struct cpu *curcpup(void); #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. + */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + #endif /* _KERNEL || _KMEMUSER */ /* @@ -726,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ +/* + * CPU state change events + * + * Various subsystems need to know when CPUs change their state. They get this + * information by registering CPU state change callbacks using + * register_cpu_setup_func(). Whenever any CPU changes its state, the callback + * function is called. The callback function is passed three arguments: + * + * Event, described by cpu_setup_t + * CPU ID + * Transparent pointer passed when registering the callback + * + * The callback function is called with cpu_lock held. The return value from the + * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG + * events. 
For these two events, a non-zero return value indicates a failure and + * prevents successful completion of the operation. + * + * New events may be added in the future. Callback functions should ignore any + * events that they do not understand. + * + * The following events provide notification callbacks: + * + * CPU_INIT A new CPU is started and added to the list of active CPUs + * This event is only used during boot + * + * CPU_CONFIG A newly inserted CPU is prepared to start running code + * This event is called by DR code + * + * CPU_UNCONFIG CPU has been powered off and needs cleanup + * This event is called by DR code + * + * CPU_ON CPU is enabled but does not run anything yet + * + * CPU_INTR_ON CPU is enabled and has interrupts enabled + * + * CPU_OFF CPU is going offline but can still run threads + * + * CPU_CPUPART_OUT CPU is going to move out of its partition + * + * CPU_CPUPART_IN CPU is going to move to a new partition + * + * CPU_SETUP CPU is set up during boot and can run threads + */ typedef enum { CPU_INIT, CPU_CONFIG, @@ -734,7 +794,8 @@ typedef enum { CPU_OFF, CPU_CPUPART_IN, CPU_CPUPART_OUT, - CPU_SETUP + CPU_SETUP, + CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); @@ -748,6 +809,13 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); /* + * Call specified function on the given CPU + */ +typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); +extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); + + +/* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. */ diff --git a/usr/src/uts/common/sys/group.h b/usr/src/uts/common/sys/group.h index bb5613bc35..2db1ac01bb 100644 --- a/usr/src/uts/common/sys/group.h +++ b/usr/src/uts/common/sys/group.h @@ -101,6 +101,17 @@ void group_remove_at(group_t *, uint_t); */ uint_t group_find(group_t *, void *); +/* + * Convert a group to a string with a list of integers. + * + * The consecutive integer values are represented using x-y notation. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + * The output buffer and its length are specified in the arguments. + */ +extern char *group2intlist(group_t *, char *, size_t, int (convert)(void*)); + #endif /* !_KERNEL && !_KMEMUSER */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/kcpc.h b/usr/src/uts/common/sys/kcpc.h index f30e093f78..d90b1c1d29 100644 --- a/usr/src/uts/common/sys/kcpc.h +++ b/usr/src/uts/common/sys/kcpc.h @@ -28,11 +28,13 @@ #include <sys/cpc_impl.h> #include <sys/ksynch.h> +#include <sys/types.h> #ifdef __cplusplus extern "C" { #endif + /* * Kernel clients need this file in order to know what a request is and how to * program one. */ @@ -74,8 +76,33 @@ struct _kcpc_request { uint_t kr_flags; uint_t kr_nattrs; kcpc_attr_t *kr_attr; + void *kr_ptr; /* Ptr assigned by requester */ }; +typedef struct _kcpc_request_list { + kcpc_request_t *krl_list; /* counter event requests */ + int krl_cnt; /* how many requests */ + int krl_max; /* max request entries */ +} kcpc_request_list_t; + +/* + * Type of update function to be called when reading counters on current CPU in + * kcpc_read() + */ +typedef int (*kcpc_update_func_t)(void *, uint64_t); + +/* + * Type of read function to be called when reading counters on current CPU + * (ie.
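The callback contract spelled out above is easy to get wrong, so a compact model helps: a callback must tolerate events it does not recognize, and only its return value for the CONFIG/UNCONFIG pair can veto anything. The sketch below is a stand-alone user-space model of that contract, not kernel code; the enum values and names are illustrative.

#include <stdio.h>

typedef enum { EV_CONFIG, EV_UNCONFIG, EV_ON, EV_OFF, EV_FUTURE } ev_t;

/*
 * Model of a cpu_setup_func_t-style callback: handle the events we know,
 * silently ignore the rest, and only let CONFIG/UNCONFIG fail the operation.
 */
static int
example_cpu_setup_cb(ev_t what, int id, void *arg)
{
	(void) arg;
	switch (what) {
	case EV_CONFIG:
		(void) printf("cpu %d: allocate per-CPU state\n", id);
		return (0);	/* non-zero here would fail the DR operation */
	case EV_UNCONFIG:
		(void) printf("cpu %d: free per-CPU state\n", id);
		return (0);
	case EV_ON:
	case EV_OFF:
		(void) printf("cpu %d: adjust accounting\n", id);
		return (0);
	default:
		return (0);	/* ignore events we do not understand */
	}
}

int
main(void)
{
	(void) example_cpu_setup_cb(EV_FUTURE, 0, NULL);	/* harmless */
	return (0);
}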
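group2intlist(), declared above, is what produces the compact CPU list string ("0,2-5,8" style) stored in pghw_cpulist. A self-contained approximation of that formatting over a sorted array of IDs is sketched below; it models only the documented output format, not the kernel's group walking, and the helper name is the editor's own.

#include <stdio.h>
#include <string.h>

/*
 * Sketch: format sorted IDs as "x,y-z"-style intervals, approximating what
 * group2intlist() produces for the CPU list kstat.
 */
static void
ids_to_intlist(const int *ids, int n, char *buf, size_t len)
{
	size_t off = 0;
	int i = 0;

	buf[0] = '\0';
	while (i < n && off < len) {
		int start = ids[i];
		int end = start;

		/* Extend the run while IDs stay consecutive */
		while (i + 1 < n && ids[i + 1] == end + 1)
			end = ids[++i];
		if (start == end)
			off += snprintf(buf + off, len - off, "%s%d",
			    off ? "," : "", start);
		else
			off += snprintf(buf + off, len - off, "%s%d-%d",
			    off ? "," : "", start, end);
		i++;
	}
}

int
main(void)
{
	int cpus[] = { 0, 2, 3, 4, 5, 8 };
	char buf[64];

	ids_to_intlist(cpus, 6, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints 0,2-5,8 */
	return (0);
}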
should be same type signature as kcpc_read()) + */ +typedef int (*kcpc_read_func_t)(kcpc_update_func_t); + + +/* + * Initialize the kcpc framework + */ +extern int kcpc_init(void); + /* * Bind the set to the indicated thread. * Returns 0 on success, or an errno in case of error. If EINVAL is returned, @@ -96,6 +123,56 @@ extern int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick); /* + * Create CPC context containing specified list of requested counter events + */ +extern int kcpc_cpu_ctx_create(struct cpu *cp, kcpc_request_list_t *req_list, + int kmem_flags, kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz); + +/* + * Returns whether specified counter event is supported + */ +extern boolean_t kcpc_event_supported(char *event); + +/* + * Initialize list of CPC event requests + */ +extern kcpc_request_list_t *kcpc_reqs_init(int nreqs, int kmem_flags); + +/* + * Add counter event request to given list of counter event requests + */ +extern int kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, + uint64_t preset, uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, + int kmem_flags); + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +extern int kcpc_reqs_reset(kcpc_request_list_t *req_list); + +/* + * Free given list of counter event requests + */ +extern int kcpc_reqs_fini(kcpc_request_list_t *req_list); + +/* + * Read CPC data for given event on current CPU + */ +extern int kcpc_read(kcpc_update_func_t); + +/* + * Program current CPU with given CPC context + */ +extern void kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, + boolean_t cu_interpose); + +/* + * Unprogram CPC counters on current CPU + */ +extern void kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose); + +/* * Unbind a request and release the associated resources. */ extern int kcpc_unbind(kcpc_set_t *set); @@ -128,6 +205,8 @@ extern void kcpc_idle_restore(struct cpu *cp); extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ +extern void kcpc_free(kcpc_ctx_t *ctx, int isexec); + /* * 'dtrace_cpc_in_use' contains the number of currently active cpc provider * based enablings. See the block comment in uts/common/os/dtrace_subr.c for diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h index ab8b0a9bbe..f0550dba7e 100644 --- a/usr/src/uts/common/sys/pghw.h +++ b/usr/src/uts/common/sys/pghw.h @@ -89,6 +89,27 @@ typedef enum pghw_type { typedef uintptr_t pghw_handle_t; /* + * Representation of PG hardware utilization NOTE: All the sums listed below are + * the sums of running total of each item for each CPU in the PG (eg. 
sum(utilization) is the sum of the running total utilization of each CPU in the PG) + */ +typedef struct pghw_util { + uint64_t pghw_util; /* sum(utilization) */ + uint64_t pghw_rate; /* Last observed utilization rate */ + uint64_t pghw_rate_max; /* Max observed rate (in units/sec) */ + hrtime_t pghw_time_stamp; /* Timestamp of last snapshot */ + /* + * sum(time utilization counters on) + */ + hrtime_t pghw_time_running; + /* + * sum(time utilization counters off) + */ + hrtime_t pghw_time_stopped; +} pghw_util_t; + + +/* * Processor Group (physical sharing relationship) */ typedef struct pghw { @@ -97,6 +118,23 @@ typedef struct pghw { id_t pghw_instance; /* sharing instance identifier */ pghw_handle_t pghw_handle; /* hw specific opaque handle */ kstat_t *pghw_kstat; /* physical kstats exported */ + kstat_t *pghw_cu_kstat; /* for capacity and utilization */ + /* + * pghw_generation should be updated by superclasses whenever PG changes + * significantly (e.g. new CPUs join or leave PG). + */ + uint_t pghw_generation; /* generation number */ + + /* + * The following fields are used by PGHW cu kstats + */ + char *pghw_cpulist; /* list of CPUs */ + size_t pghw_cpulist_len; /* length of the list */ + /* + * Generation number at kstat update time + */ + uint_t pghw_kstat_gen; + pghw_util_t pghw_stats; /* Utilization data */ } pghw_t; /* @@ -111,32 +149,35 @@ typedef struct cpu_physid { /* * Physical PG initialization / CPU service hooks */ -void pghw_init(pghw_t *, cpu_t *, pghw_type_t); -void pghw_fini(pghw_t *); -void pghw_cpu_add(pghw_t *, cpu_t *); -pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); +extern void pghw_init(pghw_t *, cpu_t *, pghw_type_t); +extern void pghw_fini(pghw_t *); +extern void pghw_cpu_add(pghw_t *, cpu_t *); +extern pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); /* * Physical ID cache creation / destruction */ -void pghw_physid_create(cpu_t *); -void pghw_physid_destroy(cpu_t *); +extern void pghw_physid_create(cpu_t *); +extern void pghw_physid_destroy(cpu_t *); /* * CPU / PG hardware related search operations */ -pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); -pghw_t *pghw_find_by_instance(id_t, pghw_type_t); -group_t *pghw_set_lookup(pghw_type_t); - -void pghw_kstat_create(pghw_t *); -int pghw_kstat_update(kstat_t *, int); +extern pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); +extern pghw_t *pghw_find_by_instance(id_t, pghw_type_t); +extern group_t *pghw_set_lookup(pghw_type_t); /* Hardware sharing relationship platform interfaces */ -int pg_plat_hw_shared(cpu_t *, pghw_type_t); -int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); -id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); -pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); +extern int pg_plat_hw_shared(cpu_t *, pghw_type_t); +extern int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); +extern id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); +extern pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); + +/* + * String representation of the hardware type + */ +extern char *pghw_type_string(pghw_type_t); +extern char *pghw_type_shortstring(pghw_type_t); /* * What comprises a "core" may vary across processor implementations, @@ -144,7 +185,7 @@ pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); * is no PGHW_CORE type, but we provide an interface here to allow platforms * to express cpu <=> core mappings.
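pghw_util_t above exports running totals plus a derived rate and a high-water mark. The exact update code is not in the hunks shown, but the intended arithmetic (units accumulated divided by elapsed time, with the maximum retained) can be sketched as follows; the type and helper names here are assumptions, not the kernel's.

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC	1000000000LL

/*
 * Sketch: derive an observed rate (units/sec) from two snapshots of a
 * running total, and track the maximum rate seen so far.
 */
typedef struct util_snap {
	uint64_t total;		/* running total, e.g. sum(utilization) */
	int64_t timestamp;	/* nanoseconds, like hrtime_t */
} util_snap_t;

static uint64_t
observed_rate(const util_snap_t *prev, const util_snap_t *now, uint64_t *max)
{
	int64_t elapsed = now->timestamp - prev->timestamp;
	uint64_t rate;

	if (elapsed <= 0)
		return (0);
	rate = (now->total - prev->total) * NANOSEC / elapsed;
	if (rate > *max)
		*max = rate;
	return (rate);
}

int
main(void)
{
	util_snap_t a = { 1000, 0 };
	util_snap_t b = { 6000, 2 * NANOSEC };	/* 5000 units in 2 seconds */
	uint64_t max = 0;

	(void) printf("%llu units/sec\n",
	    (unsigned long long)observed_rate(&a, &b, &max));	/* 2500 */
	return (0);
}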
*/ -id_t pg_plat_get_core_id(cpu_t *); +extern id_t pg_plat_get_core_id(cpu_t *); #endif /* !_KERNEL && !_KMEMUSER */ diff --git a/usr/src/uts/common/sys/systm.h b/usr/src/uts/common/sys/systm.h index 84ccfb9991..4c3dc7f886 100644 --- a/usr/src/uts/common/sys/systm.h +++ b/usr/src/uts/common/sys/systm.h @@ -270,6 +270,7 @@ int spl8(void); void splx(int); void set_base_spl(void); int __ipltospl(int); +int spl_xcall(void); void softcall_init(void); void softcall(void (*)(void *), void *); diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 8e532685c7..8621e3ef55 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -2669,6 +2669,13 @@ cpuid_get_clogid(cpu_t *cpu) return (cpu->cpu_m.mcpu_cpi->cpi_clogid); } +int +cpuid_get_cacheid(cpu_t *cpu) +{ + ASSERT(cpuid_checkpass(cpu, 1)); + return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); +} + uint_t cpuid_get_procnodeid(cpu_t *cpu) { diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index 786cd29e8f..fc0ef9e260 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -1179,6 +1179,12 @@ getpil(void) } int +spl_xcall(void) +{ + return (splr(ipltospl(XCALL_PIL))); +} + +int interrupts_enabled(void) { ulong_t flag; diff --git a/usr/src/uts/i86pc/os/mp_call.c b/usr/src/uts/i86pc/os/mp_call.c index 5725b18d85..df18f16588 100644 --- a/usr/src/uts/i86pc/os/mp_call.c +++ b/usr/src/uts/i86pc/os/mp_call.c @@ -32,6 +32,8 @@ #include <sys/systm.h> #include <sys/promif.h> #include <sys/xc_levels.h> +#include <sys/spl.h> +#include <sys/bitmap.h> /* * Interrupt another CPU. @@ -54,3 +56,38 @@ poke_cpu(int cpun) */ send_dirint(cpun, XC_CPUPOKE_PIL); } + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + cpuset_t set; + + if (panicstr) + return; + + /* + * Prevent CPU from going off-line + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. 
+ */ + if (CPU == cp) { + int save_spl = splr(ipltospl(XC_HI_PIL)); + + (*func)(arg1, arg2); + splx(save_spl); + } else { + CPUSET_ONLY(set, cp->cpu_id); + xc_call((xc_arg_t)arg1, (xc_arg_t)arg2, 0, CPUSET2BV(set), + (xc_func_t)func); + } + kpreempt_enable(); +} diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 7470a1ef38..80e371850b 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -1,3 +1,4 @@ + /* * CDDL HEADER START * @@ -61,6 +62,7 @@ #include <sys/hpet.h> #include <sys/sunddi.h> #include <sys/sunndi.h> +#include <sys/cpc_pcbe.h> #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) @@ -1680,3 +1682,37 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU as needed + * + * May return 0 when platform or processor specific code knows that no CPC + * events should be programmed on this CPU or -1 when platform or processor + * specific code doesn't know which counter events are best to use and common + * code should decide for itself + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + const char *impl_name; + + /* + * Return error if pcbe_ops not set + */ + if (pcbe_ops == NULL) + return (-1); + + /* + * Return that no CPC events should be programmed on hyperthreaded + * Pentium 4 and return error for all other x86 processors to tell + * common code to decide what counter events to program on those CPUs + * for measuring hardware capacity and utilization + */ + impl_name = pcbe_ops->pcbe_impl_name(); + if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0) + return (0); + else + return (-1); +} diff --git a/usr/src/uts/i86pc/sys/xc_levels.h b/usr/src/uts/i86pc/sys/xc_levels.h index 31ba6441fa..0492e48a1d 100644 --- a/usr/src/uts/i86pc/sys/xc_levels.h +++ b/usr/src/uts/i86pc/sys/xc_levels.h @@ -35,6 +35,7 @@ extern "C" { #define XC_CPUPOKE_PIL 11 /* poke to cause wakeup, no service function */ #define XC_SYS_PIL 13 /* should be defined elsewhere */ #define XC_HI_PIL 15 /* cross call with service function */ +#define XCALL_PIL XC_HI_PIL /* alias for XC_HI_PIL */ #ifdef __cplusplus } diff --git a/usr/src/uts/intel/genunix/Makefile b/usr/src/uts/intel/genunix/Makefile index db7b60ff14..ab0073268f 100644 --- a/usr/src/uts/intel/genunix/Makefile +++ b/usr/src/uts/intel/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -84,6 +84,8 @@ $(PATCH_BUILD)IPCTF_TARGET = CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +CPPFLAGS += -I$(UTSBASE)/i86pc + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/intel/ia32/os/cpc_subr.c b/usr/src/uts/intel/ia32/os/cpc_subr.c index 1a71c1c431..1e3049a399 100644 --- a/usr/src/uts/intel/ia32/os/cpc_subr.c +++ b/usr/src/uts/intel/ia32/os/cpc_subr.c @@ -188,33 +188,6 @@ kcpc_hw_load_pcbe(void) cpuid_getmodel(CPU), cpuid_getstep(CPU))); } -static int -kcpc_remotestop_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); - - return (0); -} - -/* - * Ensure the counters are stopped on the given processor. 
- * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remotestop_func); -} - /* * Called by the generic framework to check if it's OK to bind a set to a CPU. */ @@ -292,28 +265,3 @@ kcpc_hw_lwp_hook(void) mutex_exit(&cpu_lock); return (0); } - -static int -kcpc_remoteprogram_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); - - return (0); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remoteprogram_func); -} diff --git a/usr/src/uts/intel/pcbe/opteron_pcbe.c b/usr/src/uts/intel/pcbe/opteron_pcbe.c index 18a309eca6..cb97d21b78 100644 --- a/usr/src/uts/intel/pcbe/opteron_pcbe.c +++ b/usr/src/uts/intel/pcbe/opteron_pcbe.c @@ -563,26 +563,6 @@ opt_pcbe_list_attrs(void) return ("edge,pc,inv,cmask,umask"); } -/*ARGSUSED*/ -static uint64_t -opt_pcbe_event_coverage(char *event) -{ - /* - * Fortunately, all counters can count all events. - */ - return (0xF); -} - -static uint64_t -opt_pcbe_overflow_bitmap(void) -{ - /* - * Unfortunately, this chip cannot detect which counter overflowed, so - * we must act as if they all did. - */ - return (0xF); -} - static amd_generic_event_t * find_generic_event(char *name) { @@ -608,6 +588,32 @@ find_event(char *name) } /*ARGSUSED*/ +static uint64_t +opt_pcbe_event_coverage(char *event) +{ + /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* + * Fortunately, all counters can count all events. + */ + return (0xF); +} + +static uint64_t +opt_pcbe_overflow_bitmap(void) +{ + /* + * Unfortunately, this chip cannot detect which counter overflowed, so + * we must act as if they all did. 
+ */ + return (0xF); +} + +/*ARGSUSED*/ static int opt_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token) diff --git a/usr/src/uts/intel/pcbe/p4_pcbe.c b/usr/src/uts/intel/pcbe/p4_pcbe.c index 0fffcd2961..8c05c599a3 100644 --- a/usr/src/uts/intel/pcbe/p4_pcbe.c +++ b/usr/src/uts/intel/pcbe/p4_pcbe.c @@ -522,7 +522,7 @@ static const char * p4_pcbe_impl_name(void) { if (p4_htt) - return ("Pentium 4 with HyperThreading"); + return (PCBE_IMPL_NAME_P4HT); return ("Pentium 4"); } diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index e5f1cababc..0bb28d4d49 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -626,6 +626,7 @@ extern int cpuid_get_chipid(struct cpu *); extern id_t cpuid_get_coreid(struct cpu *); extern int cpuid_get_pkgcoreid(struct cpu *); extern int cpuid_get_clogid(struct cpu *); +extern int cpuid_get_cacheid(struct cpu *); extern uint32_t cpuid_get_apicid(struct cpu *); extern uint_t cpuid_get_procnodeid(struct cpu *cpu); extern uint_t cpuid_get_procnodes_per_pkg(struct cpu *cpu); diff --git a/usr/src/uts/sun4/os/mp_call.c b/usr/src/uts/sun4/os/mp_call.c index f881a23755..f7ee31a276 100644 --- a/usr/src/uts/sun4/os/mp_call.c +++ b/usr/src/uts/sun4/os/mp_call.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Facilities for cross-processor subroutine calls using "mailbox" interrupts. */ @@ -37,6 +34,7 @@ #include <sys/systm.h> #include <sys/machsystm.h> #include <sys/intr.h> +#include <sys/xc_impl.h> /* * Interrupt another CPU. @@ -64,3 +62,40 @@ poke_cpu(int cpun) xt_one(cpun, setsoftint_tl1, poke_cpu_inum, 0); } + +extern int xc_spl_enter[]; + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + if (panicstr) + return; + + /* + * Prevent CPU from going offline + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. + */ + if (CPU != cp) { + xc_one(cp->cpu_id, (xcfunc_t *)func, (uint64_t)arg1, + (uint64_t)arg2); + } else { + int lcx; + int opl; + + XC_SPL_ENTER(lcx, opl); + func(arg1, arg2); + XC_SPL_EXIT(lcx, opl); + } + + kpreempt_enable(); +} diff --git a/usr/src/uts/sun4/os/x_call.c b/usr/src/uts/sun4/os/x_call.c index 0c5c06c36a..521f740c82 100644 --- a/usr/src/uts/sun4/os/x_call.c +++ b/usr/src/uts/sun4/os/x_call.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/archsystm.h> #include <sys/machsystm.h> @@ -226,6 +224,15 @@ xc_init(void) */ /* + * spl_xcall - set PIL to xcall level + */ +int +spl_xcall(void) +{ + return (splr(XCALL_PIL)); +} + +/* * xt_one - send a "x-trap" to a cpu */ void diff --git a/usr/src/uts/sun4u/genunix/Makefile b/usr/src/uts/sun4u/genunix/Makefile index 8d7c87f065..1a77e4c916 100644 --- a/usr/src/uts/sun4u/genunix/Makefile +++ b/usr/src/uts/sun4u/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -96,6 +96,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c index 8ba9aa3b6e..8a0fa0e6dc 100644 --- a/usr/src/uts/sun4u/os/cmp.c +++ b/usr/src/uts/sun4u/os/cmp.c @@ -303,3 +303,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4u/os/cpc_subr.c b/usr/src/uts/sun4u/os/cpc_subr.c index a9c64681fd..cfe1fd283d 100644 --- a/usr/src/uts/sun4u/os/cpc_subr.c +++ b/usr/src/uts/sun4u/os/cpc_subr.c @@ -45,6 +45,7 @@ #include <sys/cpc_pcbe.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/kcpc.h> uint64_t cpc_level15_inum; /* used in interrupt.s */ int cpc_has_overflow_intr; /* set in cheetah.c */ @@ -111,26 +112,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -142,21 +123,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/genunix/Makefile b/usr/src/uts/sun4v/genunix/Makefile index e629630fb5..28d4f2aeeb 100644 --- a/usr/src/uts/sun4v/genunix/Makefile +++ b/usr/src/uts/sun4v/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # @@ -104,6 +104,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c index 4e80f06f32..8eedd1a69d 100644 --- a/usr/src/uts/sun4v/os/cmp.c +++ b/usr/src/uts/sun4v/os/cmp.c @@ -208,3 +208,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4v/os/cpc_subr.c b/usr/src/uts/sun4v/os/cpc_subr.c index 8e58d85513..089c582541 100644 --- a/usr/src/uts/sun4v/os/cpc_subr.c +++ b/usr/src/uts/sun4v/os/cpc_subr.c @@ -130,26 +130,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -161,21 +141,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c index 13c428130e..d4b69e5de4 100644 --- a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c +++ b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c @@ -399,6 +399,12 @@ static uint64_t ni2_pcbe_event_coverage(char *event) { /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* * Fortunately, both pic0 and pic1 can count all events. */ return (0x3); |
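Both PCBE changes above now return 0 from pcbe_event_coverage() for an unknown event instead of pretending every counter can count it. The return value is a bitmap with one bit per hardware counter that can count the event (0xF covers the four Opteron counters, 0x3 the two Niagara-2 PICs), so a caller can reject an unsupported event or count usable counters. A small sketch of that interpretation follows; the helper is illustrative only.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch: interpret a pcbe_event_coverage()-style bitmap, where bit i set
 * means hardware counter i can count the event and a value of 0 means the
 * event is not supported at all.
 */
static int
usable_counters(uint64_t coverage)
{
	int n = 0;

	while (coverage != 0) {
		n += (int)(coverage & 1);
		coverage >>= 1;
	}
	return (n);
}

int
main(void)
{
	(void) printf("%d\n", usable_counters(0xF));	/* 4: Opteron PCBE */
	(void) printf("%d\n", usable_counters(0x3));	/* 2: Niagara-2 PCBE */
	(void) printf("%d\n", usable_counters(0));	/* 0: unsupported event */
	return (0);
}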