| author | Alexander Kolbasov <Alexander.Kolbasov@Sun.COM> | 2009-12-22 21:52:00 -0800 |
|---|---|---|
| committer | Alexander Kolbasov <Alexander.Kolbasov@Sun.COM> | 2009-12-22 21:52:00 -0800 |
| commit | b885580b43755ee4ea1e280b85428893d2ba9291 (patch) | |
| tree | f7f7848d3eef390282bab6fc859d7a5275053ecf /usr/src | |
| parent | e7437094ebbbd4d60375f3927c017ff00cbab1de (diff) | |
| download | illumos-joyent-b885580b43755ee4ea1e280b85428893d2ba9291.tar.gz | |
6764832 Provide user-level processor groups observability
6831680 cputrack(1) leaves its victim with unneeded cpc context
6901343 cpc context flag updates are not always atomic
6908152 Dormant thread CPC context affects cpu CPC consumers
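Bug 6764832 is addressed by the new usr/src/uts/common/os/cap_util.c, which exports per-CPU capacity/utilization counters as "pg_cpu" kstats (named values such as hw_util, hw_util_rate, pg_id, and generation in the cu_cpu_kstat definition further down in this diff). The following is a minimal user-level sketch, not part of this commit, of how those kstats could be read through the standard libkstat API; the program itself and its output format are illustrative assumptions.

```c
/*
 * Minimal sketch (not part of this commit): walk the kstat chain and print
 * the per-CPU "pg_cpu" capacity/utilization kstats that cap_util.c creates
 * with kstat_create_zone("pg_cpu", ...). Field names match cu_cpu_kstat.
 */
#include <sys/types.h>
#include <kstat.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;
	kstat_named_t *util, *rate, *pgid;

	if (kc == NULL) {
		perror("kstat_open");
		return (1);
	}
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if (strcmp(ksp->ks_module, "pg_cpu") != 0)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;
		util = kstat_data_lookup(ksp, "hw_util");
		rate = kstat_data_lookup(ksp, "hw_util_rate");
		pgid = kstat_data_lookup(ksp, "pg_id");
		if (util == NULL || rate == NULL || pgid == NULL)
			continue;
		/* ks_instance is the CPU id, ks_name the sharing relationship */
		(void) printf("cpu %d %s: pg %ld util %llu rate %llu\n",
		    ksp->ks_instance, ksp->ks_name, pgid->value.l,
		    (u_longlong_t)util->value.ui64,
		    (u_longlong_t)rate->value.ui64);
	}
	(void) kstat_close(kc);
	return (0);
}
```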
Diffstat (limited to 'usr/src')
48 files changed, 3759 insertions, 374 deletions
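Bugs 6831680 and 6908152 are handled by a new termination path that runs from cputrack(1) down through libcpc and libpctx: cputrack's signal handler calls the new private cpc_terminate(), which walks the open CPC sets and calls pctx_terminate() on each attached pctx, and pctx_run() now checks that terminate flag so it can bail out and release the victim cleanly instead of leaving CPC context behind. The sketch below condenses the consumer-side pattern from the cputrack.c hunks in this diff; the catch_signals() helper name is mine, the rest mirrors the diff.

```c
/*
 * Condensed sketch of the shutdown path this commit adds to cputrack(1).
 * A terminal signal bumps a flag and calls cpc_terminate() (a private
 * SUNWprivate_1.1 entry added by this commit), which in turn makes
 * libpctx's pctx_run() loop bail out.
 */
#include <libcpc.h>
#include <signal.h>

static cpc_t *cpc;			/* handle from cpc_open() */
static volatile sig_atomic_t interrupt;	/* set once a signal arrives */

/*ARGSUSED*/
static void
intr(int sig)
{
	interrupt++;
	if (cpc != NULL)
		cpc_terminate(cpc);	/* stops pctx_run() in libpctx */
}

/* Install the handler only where the caller has not already ignored it. */
static void
catch_signals(void)
{
	if (sigset(SIGHUP, SIG_IGN) == SIG_DFL)
		(void) sigset(SIGHUP, intr);
	if (sigset(SIGINT, SIG_IGN) == SIG_DFL)
		(void) sigset(SIGINT, intr);
	if (sigset(SIGQUIT, SIG_IGN) == SIG_DFL)
		(void) sigset(SIGQUIT, intr);
	(void) sigset(SIGPIPE, intr);
	(void) sigset(SIGTERM, intr);
}
```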
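For bug 6764832, cap_util.c computes utilization rates with a scaled division (the CU_RATE() macro further down in this diff) so that the event count is never multiplied by a full NANOSEC and the time divisor never collapses to zero for intervals above CU_SAMPLE_INTERVAL_MIN. The small standalone program below is an illustration rather than kernel code; it copies those constants from the diff to show the arithmetic.

```c
/*
 * Standalone check of the CU_RATE() scaling used in cap_util.c: both the
 * numerator and denominator are divided by CU_SCALE, so the counter delta
 * is multiplied by only NANOSEC/CU_SCALE (avoiding 64-bit overflow) and
 * time/CU_SCALE stays non-zero for intervals above CU_SAMPLE_INTERVAL_MIN.
 * Constants are copied from the diff; NANOSEC is redefined locally here.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	NANOSEC			1000000000ULL
#define	CU_SAMPLE_INTERVAL_MIN	(NANOSEC / 10)
#define	CU_SCALE		(CU_SAMPLE_INTERVAL_MIN / 10000)

#define	CU_RATE(val, time) \
	(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

int
main(void)
{
	uint64_t events = 3000000000ULL;	/* counter delta */
	uint64_t delta_ns = 2 * NANOSEC;	/* 2-second sampling interval */

	/* Prints ~1500000000 events/sec without forming events * NANOSEC */
	(void) printf("rate = %" PRIu64 " events/sec\n",
	    CU_RATE(events, delta_ns));
	return (0);
}
```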
diff --git a/usr/src/cmd/cpc/common/cputrack.c b/usr/src/cmd/cpc/common/cputrack.c index 22ad2673e2..41034aef6e 100644 --- a/usr/src/cmd/cpc/common/cputrack.c +++ b/usr/src/cmd/cpc/common/cputrack.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -62,6 +62,12 @@ static const struct options *opts = (const struct options *)&__options; static cpc_t *cpc; +/* + * How many signals caught from terminal + * We bail out as soon as possible when interrupt is set + */ +static int interrupt = 0; + /*ARGSUSED*/ static void cputrack_errfn(const char *fn, int subcode, const char *fmt, va_list ap) @@ -79,6 +85,8 @@ cputrack_pctx_errfn(const char *fn, const char *fmt, va_list ap) } static int cputrack(int argc, char *argv[], int optind); +static void intr(int); + #if defined(__i386) static void p4_ht_error(void); #endif @@ -220,6 +228,19 @@ main(int argc, char *argv[]) exit(2); } + /* + * Catch signals from terminal, so they can be handled asynchronously + * when we're ready instead of when we're not (;-) + */ + if (sigset(SIGHUP, SIG_IGN) == SIG_DFL) + (void) sigset(SIGHUP, intr); + if (sigset(SIGINT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGINT, intr); + if (sigset(SIGQUIT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGQUIT, intr); + (void) sigset(SIGPIPE, intr); + (void) sigset(SIGTERM, intr); + cpc_setgrp_reset(opts->master); (void) setvbuf(opts->log, NULL, _IOLBF, 0); ret = cputrack(argc, argv, optind); @@ -310,6 +331,9 @@ pinit_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreq; + if (interrupt) + return (0); + if (state->maxlwpid < lwpid) { state->sgrps = realloc(state->sgrps, lwpid * sizeof (state->sgrps)); @@ -373,6 +397,9 @@ pfini_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + set = cpc_setgrp_getset(sgrp); nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (cpc_set_sample(cpc, set, *scratch) == 0) { @@ -424,6 +451,9 @@ plwp_create(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); print_sample(pid, lwpid, "lwp_create", @@ -442,6 +472,9 @@ plwp_exit(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return (0); + start = cpc_setgrp_getset(sgrp); do { nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); @@ -465,6 +498,9 @@ pexec(pctx_t *pctx, pid_t pid, id_t lwpid, char *name, void *arg) cpc_buf_t **data1, **data2, **scratch; hrtime_t hrt; + if (interrupt) + return (0); + /* * Print the accumulated results from the previous program image */ @@ -505,6 +541,9 @@ pexit(pctx_t *pctx, pid_t pid, id_t lwpid, int status, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return; + cpc_setgrp_reset(state->accum); start = cpc_setgrp_getset(state->accum); do { @@ -539,6 +578,9 @@ ptick(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreqs; + if (interrupt) + return (0); + nreqs = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (opts->nsets == 1) { @@ -704,7 +746,6 @@ cputrack(int argc, char *argv[], int optind) state->accum = NULL; } } - pctx_release(pctx); return (err != 0 ? 
1 : 0); } @@ -834,3 +875,12 @@ p4_ht_error(void) } #endif /* defined(__i386) */ + +/*ARGSUSED*/ +static void +intr(int sig) +{ + interrupt++; + if (cpc != NULL) + cpc_terminate(cpc); +} diff --git a/usr/src/lib/libcpc/common/libcpc.c b/usr/src/lib/libcpc/common/libcpc.c index 5bdba39fda..9f4f6ac848 100644 --- a/usr/src/lib/libcpc/common/libcpc.c +++ b/usr/src/lib/libcpc/common/libcpc.c @@ -168,6 +168,23 @@ cpc_close(cpc_t *cpc) return (0); } +/* + * Terminate everything that runs in pctx_run + */ +void +cpc_terminate(cpc_t *cpc) +{ + cpc_set_t *csp; + int sigblocked; + + sigblocked = cpc_lock(cpc); + for (csp = cpc->cpc_sets; csp != NULL; csp = csp->cs_next) { + if (csp->cs_pctx != NULL) + pctx_terminate(csp->cs_pctx); + } + cpc_unlock(cpc, sigblocked); +} + cpc_set_t * cpc_set_create(cpc_t *cpc) { @@ -224,6 +241,14 @@ cpc_set_destroy(cpc_t *cpc, cpc_set_t *set) if (csp->cs_state != CS_UNBOUND) (void) cpc_unbind(cpc, csp); + /* + * Detach from the process + */ + if (csp->cs_pctx != NULL) { + pctx_release(csp->cs_pctx); + csp->cs_pctx = NULL; + } + for (req = csp->cs_request; req != NULL; req = next) { next = req->cr_next; diff --git a/usr/src/lib/libcpc/common/libcpc.h b/usr/src/lib/libcpc/common/libcpc.h index 384474a76c..73627345a0 100644 --- a/usr/src/lib/libcpc/common/libcpc.h +++ b/usr/src/lib/libcpc/common/libcpc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -163,6 +163,8 @@ extern void cpc_walk_attrs(cpc_t *cpc, void *arg, extern int cpc_enable(cpc_t *cpc); extern int cpc_disable(cpc_t *cpc); +extern void cpc_terminate(cpc_t *); + #if defined(__sparc) || defined(__i386) /* diff --git a/usr/src/lib/libcpc/common/mapfile-vers b/usr/src/lib/libcpc/common/mapfile-vers index 91f1689c9f..e577fc7c5e 100644 --- a/usr/src/lib/libcpc/common/mapfile-vers +++ b/usr/src/lib/libcpc/common/mapfile-vers @@ -83,6 +83,7 @@ SUNW_1.2 { SUNWprivate_1.1 { global: SUNWprivate_1.1; + cpc_terminate; local: *; }; diff --git a/usr/src/lib/libpctx/common/libpctx.c b/usr/src/lib/libpctx/common/libpctx.c index 9c28fb9b9b..f17e238322 100644 --- a/usr/src/lib/libpctx/common/libpctx.c +++ b/usr/src/lib/libpctx/common/libpctx.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains a set of generic routines for periodically * sampling the state of another process, or tree of processes. @@ -66,6 +64,7 @@ struct __pctx { int verbose; int created; int sigblocked; + int terminate; sigset_t savedset; cpc_t *cpc; }; @@ -108,6 +107,7 @@ pctx_create( pctx = calloc(1, sizeof (*pctx)); pctx->uarg = arg; pctx->verbose = verbose; + pctx->terminate = 0; pctx->errfn = errfn ? errfn : pctx_default_errfn; if ((pctx->Pr = Pcreate(filename, argv, &err, 0, 0)) == NULL) { @@ -487,6 +487,7 @@ pctx_release(pctx_t *pctx) Prelease(pctx->Pr, PRELEASE_CLEAR); pctx->Pr = NULL; } + pctx_free(pctx); bzero(pctx, sizeof (*pctx)); free(pctx); @@ -577,7 +578,7 @@ pctx_run( * exited successfully or the number of time samples has expired. * Otherwise, if an error has occurred, running becomes -1. 
*/ - while (running == 1) { + while (running == 1 && !pctx->terminate) { if (Psetrun(pctx->Pr, 0, 0) != 0) { if (pctx->verbose) @@ -609,10 +610,13 @@ pctx_run( if (nsamples != 1) nsamples--; } - } while (mswait == 0); + } while (mswait == 0 && !pctx->terminate); } - (void) Pwait(pctx->Pr, mswait); + if (pctx->terminate) + goto bailout; + else + (void) Pwait(pctx->Pr, mswait); checkstate: switch (pstate = Pstate(pctx->Pr)) { @@ -854,6 +858,9 @@ checkstate: bailout: (void) signal(SIGCHLD, sigsaved); + if (pctx->terminate) + return (0); + switch (running) { case 0: return (0); @@ -885,6 +892,7 @@ __pctx_cpc(pctx_t *pctx, cpc_t *cpc, * We store the last cpc_t used by libpctx, so that when this pctx is * destroyed, libpctx can notify libcpc. */ + if (pctx->cpc != NULL && pctx->cpc != cpc && pctx_cpc_callback != NULL) (*pctx_cpc_callback)(pctx->cpc, pctx); pctx->cpc = cpc; @@ -993,3 +1001,12 @@ __pctx_cpc_register_callback(void (*arg)(struct __cpc *, struct __pctx *)) { pctx_cpc_callback = arg; } + +/* + * Tell pctx_run to bail out immediately + */ +void +pctx_terminate(struct __pctx *pctx) +{ + pctx->terminate = 1; +} diff --git a/usr/src/lib/libpctx/common/libpctx.h b/usr/src/lib/libpctx/common/libpctx.h index 10d0fb7c7e..7cd9ffff91 100644 --- a/usr/src/lib/libpctx/common/libpctx.h +++ b/usr/src/lib/libpctx/common/libpctx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _LIBPCTX_H #define _LIBPCTX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <fcntl.h> #include <stdarg.h> @@ -67,6 +64,8 @@ typedef int pctx_init_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_fini_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_sysc_lwp_exitfn_t(pctx_t *, pid_t, id_t, void *); +extern void pctx_terminate(pctx_t *); + typedef enum { PCTX_NULL_EVENT = 0, PCTX_SYSC_EXEC_EVENT, diff --git a/usr/src/lib/libpctx/common/mapfile-vers b/usr/src/lib/libpctx/common/mapfile-vers index 1b296817d4..e316020c8b 100644 --- a/usr/src/lib/libpctx/common/mapfile-vers +++ b/usr/src/lib/libpctx/common/mapfile-vers @@ -50,6 +50,7 @@ SUNWprivate_1.1 { global: __pctx_cpc; __pctx_cpc_register_callback; + pctx_terminate; local: *; }; diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 8ad553b07c..88ab8b3f20 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -682,6 +682,7 @@ f none usr/include/sys/bustypes.h 644 root bin f none usr/include/sys/byteorder.h 644 root bin f none usr/include/sys/callb.h 644 root bin f none usr/include/sys/callo.h 644 root bin +f none usr/include/sys/cap_util.h 644 root bin f none usr/include/sys/cpucaps.h 644 root bin f none usr/include/sys/cpucaps_impl.h 644 root bin f none usr/include/sys/ccompile.h 644 root bin diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 83b7bf34c6..974cec5d3f 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -47,6 +47,7 @@ COMMON_CORE_OBJS += \ cpu_intr.o \ cpu_pm.o \ cpupart.o \ + cap_util.o \ disp.o \ group.o \ kstat_fr.o \ diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index e6d77020a6..09e529b934 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -212,6 +212,7 @@ extern void clock_timer_init(void); extern void clock_realtime_init(void); extern void clock_highres_init(void); extern void clock_tick_mp_init(void); +extern void cu_init(void); extern void callout_mp_init(void); extern void cpu_seq_tbl_init(void); @@ -257,6 +258,7 @@ void (*mp_init_tbl[])(void) = { siron_mp_init, #endif clock_tick_mp_init, + cu_init, callout_mp_init, 0 }; diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index b2f219472d..a5f1a52e34 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -159,7 +159,6 @@ static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *, cpu_pg_t *); - /* * CMT PG ops */ @@ -583,6 +582,8 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) ASSERT(IS_CMT_PG(pg)); } + ((pghw_t *)pg)->pghw_generation++; + /* Add the CPU to the PG */ pg_cpu_add((pg_t *)pg, cp, pgdata); @@ -762,7 +763,7 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) * * cp->cpu_pg is used by the dispatcher to access the CPU's PG data * references a "bootstrap" structure across this function's invocation. - * pg_cmt_cpu_init() and the routines it calls must be careful to operate only + * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only * on the "pgdata" argument, and not cp->cpu_pg. 
*/ static void @@ -818,6 +819,8 @@ pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata) pg = (pg_cmt_t *)pgdata->cmt_lineage; while (pg != NULL) { + ((pghw_t *)pg)->pghw_generation++; + /* * Remove the PG from the CPU's load balancing lineage */ @@ -990,6 +993,11 @@ pg_cmt_cpu_active(cpu_t *cp) if (IS_CMT_PG(pg) == 0) continue; + /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); @@ -1056,6 +1064,11 @@ pg_cmt_cpu_inactive(cpu_t *cp) continue; /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + + /* * Remove the CPU from the CMT PGs active CPU group * bitmap */ diff --git a/usr/src/uts/common/dtrace/dcpc.c b/usr/src/uts/common/dtrace/dcpc.c index e780d1e620..c410e65eaa 100644 --- a/usr/src/uts/common/dtrace/dcpc.c +++ b/usr/src/uts/common/dtrace/dcpc.c @@ -35,6 +35,7 @@ #include <sys/conf.h> #include <sys/kmem.h> #include <sys/kcpc.h> +#include <sys/cap_util.h> #include <sys/cpc_pcbe.h> #include <sys/cpc_impl.h> #include <sys/dtrace_impl.h> @@ -463,8 +464,7 @@ dcpc_program_cpu_event(cpu_t *c) set = dcpc_create_set(c); - octx = NULL; - set->ks_ctx = ctx = kcpc_ctx_alloc(); + set->ks_ctx = ctx = kcpc_ctx_alloc(KM_SLEEP); ctx->kc_set = set; ctx->kc_cpuid = c->cpu_id; @@ -489,11 +489,9 @@ dcpc_program_cpu_event(cpu_t *c) * If we already have an active enabling then save the current cpc * context away. */ - if (c->cpu_cpc_ctx != NULL) - octx = c->cpu_cpc_ctx; + octx = c->cpu_cpc_ctx; - c->cpu_cpc_ctx = ctx; - kcpc_remote_program(c); + kcpc_cpu_program(c, ctx); if (octx != NULL) { kcpc_set_t *oset = octx->kc_set; @@ -528,9 +526,14 @@ dcpc_disable_cpu(cpu_t *c) if (c->cpu_flags & CPU_OFFLINE) return; - kcpc_remote_stop(c); - + /* + * Grab CPUs CPC context before kcpc_cpu_stop() stops counters and + * changes it. + */ ctx = c->cpu_cpc_ctx; + + kcpc_cpu_stop(c, B_FALSE); + set = ctx->kc_set; kcpc_free_configs(set); @@ -538,7 +541,6 @@ dcpc_disable_cpu(cpu_t *c) kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t)); kcpc_free_set(set); kcpc_ctx_free(ctx); - c->cpu_cpc_ctx = NULL; } /* @@ -615,8 +617,21 @@ dcpc_program_event(dcpc_probe_t *pp) if (c->cpu_flags & CPU_OFFLINE) continue; + /* + * Stop counters but preserve existing DTrace CPC context + * if there is one. + * + * If we come here when the first event is programmed for a CPU, + * there should be no DTrace CPC context installed. In this + * case, kcpc_cpu_stop() will ensure that there is no other + * context on the CPU. + * + * If we add new enabling to the original one, the CPU should + * have the old DTrace CPC context which we need to keep around + * since dcpc_program_event() will add to it. + */ if (c->cpu_cpc_ctx != NULL) - kcpc_remote_stop(c); + kcpc_cpu_stop(c, B_TRUE); } while ((c = c->cpu_next) != cpu_list); dcpc_release_interrupts(); @@ -708,6 +723,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) ASSERT(pp->dcpc_actv_req_idx >= 0); /* + * DTrace is taking over CPC contexts, so stop collecting + * capacity/utilization data for all CPUs. 
+ */ + if (dtrace_cpc_in_use == 1) + cu_disable(); + + /* * The following must hold true if we are to (attempt to) enable * this request: * @@ -758,7 +780,7 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) if (c->cpu_flags & CPU_OFFLINE) continue; - kcpc_remote_program(c); + kcpc_cpu_program(c, c->cpu_cpc_ctx); } while ((c = c->cpu_next) != cpu_list); } @@ -766,6 +788,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL; pp->dcpc_actv_req_idx = pp->dcpc_picno = -1; + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU. + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); + return (-1); } @@ -841,6 +870,13 @@ dcpc_disable(void *arg, dtrace_id_t id, void *parg) dtrace_cpc_in_use--; pp->dcpc_enabled = 0; pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1; + + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); } /*ARGSUSED*/ @@ -891,7 +927,6 @@ dcpc_cpu_setup(cpu_setup_t what, processorid_t cpu, void *arg) */ if (dtrace_cpc_in_use) { c = cpu_get(cpu); - (void) dcpc_program_cpu_event(c); } break; diff --git a/usr/src/uts/common/io/cpc.c b/usr/src/uts/common/io/cpc.c index 6881380251..0b003c3ee1 100644 --- a/usr/src/uts/common/io/cpc.c +++ b/usr/src/uts/common/io/cpc.c @@ -942,49 +942,19 @@ static struct modlinkage modl = { #endif }; -static void -kcpc_init(void) -{ - long hash; - - rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_init(&kcpc_ctx_llock[hash], - NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); -} - -static void -kcpc_fini(void) -{ - long hash; - - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_destroy(&kcpc_ctx_llock[hash]); - rw_destroy(&kcpc_cpuctx_lock); -} - int _init(void) { - int ret; - - if (kcpc_hw_load_pcbe() != 0) + if (kcpc_init() != 0) return (ENOTSUP); - kcpc_init(); - if ((ret = mod_install(&modl)) != 0) - kcpc_fini(); - return (ret); + return (mod_install(&modl)); } int _fini(void) { - int ret; - - if ((ret = mod_remove(&modl)) == 0) - kcpc_fini(); - return (ret); + return (mod_remove(&modl)); } int diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c new file mode 100644 index 0000000000..16ff7f45fd --- /dev/null +++ b/usr/src/uts/common/os/cap_util.c @@ -0,0 +1,1652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Support for determining capacity and utilization of performance relevant + * hardware components in a computer + * + * THEORY + * ------ + * The capacity and utilization of the performance relevant hardware components + * is needed to be able to optimize performance while minimizing the amount of + * power used on a system. The idea is to use hardware performance counters + * and potentially other means to determine the capacity and utilization of + * performance relevant hardware components (eg. execution pipeline, cache, + * memory, etc.) and attribute the utilization to the responsible CPU and the + * thread running there. + * + * This will help characterize the utilization of performance relevant + * components and how much is used by each CPU and each thread. With + * that data, the utilization can be aggregated to all the CPUs sharing each + * performance relevant hardware component to calculate the total utilization + * of each component and compare that with the component's capacity to + * essentially determine the actual hardware load of the component. The + * hardware utilization attributed to each running thread can also be + * aggregated to determine the total hardware utilization of each component to + * a workload. + * + * Once that is done, one can determine how much of each performance relevant + * hardware component is needed by a given thread or set of threads (eg. a + * workload) and size up exactly what hardware is needed by the threads and how + * much. With this info, we can better place threads among CPUs to match their + * exact hardware resource needs and potentially lower or raise the power based + * on their utilization or pack threads onto the fewest hardware components + * needed and power off any remaining unused components to minimize power + * without sacrificing performance. + * + * IMPLEMENTATION + * -------------- + * The code has been designed and implemented to make (un)programming and + * reading the counters for a given CPU as lightweight and fast as possible. + * This is very important because we need to read and potentially (un)program + * the counters very often and in performance sensitive code. Specifically, + * the counters may need to be (un)programmed during context switch and/or a + * cyclic handler when there are more counter events to count than existing + * counters. + * + * Consequently, the code has been split up to allow allocating and + * initializing everything needed to program and read the counters on a given + * CPU once and make (un)programming and reading the counters for a given CPU + * not have to allocate/free memory or grab any locks. To do this, all the + * state needed to (un)program and read the counters on a CPU is kept per CPU + * and is made lock free by forcing any code that reads or manipulates the + * counters or the state needed to (un)program or read the counters to run on + * the target CPU and disable preemption while running on the target CPU to + * protect any critical sections. All counter manipulation on the target CPU is + * happening either from a cross-call to the target CPU or at the same PIL as + * used by the cross-call subsystem. This guarantees that counter manipulation + * is not interrupted by cross-calls from other CPUs. 
+ * + * The synchronization has been made lock free or as simple as possible for + * performance and to avoid getting the locking all tangled up when we interpose + * on the CPC routines that (un)program the counters to manage the counters + * between the kernel and user on each CPU. When the user starts using the + * counters on a given CPU, the kernel will unprogram the counters that it is + * using on that CPU just before they are programmed for the user. Then the + * kernel will program the counters on a given CPU for its own use when the user + * stops using them. + * + * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc + * enables any probe, it requests to disable and unprogram all counters used for + * capacity and utilizations. These counters are never re-programmed back until + * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU + * framework and it re-programs the counters. + * + * When a CPU is going offline, its CU counters are unprogrammed and disabled, + * so that they would not be re-programmed again by some other activity on the + * CPU that is going offline. + * + * The counters are programmed during boot. However, a flag is available to + * disable this if necessary (see cu_flag below). A handler is provided to + * (un)program the counters during CPU on/offline. Basic routines are provided + * to initialize and tear down this module, initialize and tear down any state + * needed for a given CPU, and (un)program the counters for a given CPU. + * Lastly, a handler is provided to read the counters and attribute the + * utilization to the responsible CPU. + */ +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/ddi.h> +#include <sys/disp.h> +#include <sys/sdt.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/pghw.h> +#include <sys/cmt.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> + +#include <sys/archsystm.h> +#include <sys/promif.h> + +#if defined(__x86) +#include <sys/xc_levels.h> +#endif + + +/* + * Default CPU hardware performance counter flags to use for measuring capacity + * and utilization + */ +#define CU_CPC_FLAGS_DEFAULT \ + (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT) + +/* + * Possible Flags for controlling this module. + */ +#define CU_FLAG_ENABLE 1 /* Enable module */ +#define CU_FLAG_READY 2 /* Ready to setup module */ +#define CU_FLAG_ON 4 /* Module is on */ + +/* + * pg_cpu kstats calculate utilization rate and maximum utilization rate for + * some CPUs. The rate is calculated based on data from two subsequent + * snapshots. When the time between such two snapshots is too small, the + * resulting rate may have low accuracy, so we only consider snapshots which + * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not + * update the rate if the interval is smaller than that. + * + * Use one tenth of a second as the minimum interval for utilization rate + * calculation. + * + * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in + * the CU_RATE() macro below to guarantee that we never divide by zero. + * + * Rate is the number of events per second. The rate is the number of events + * divided by time and multiplied by the number of nanoseconds in a second. We + * do not want time to be too small since it will cause large errors in + * division. + * + * We do not want to multiply two large numbers (the instruction count and + * NANOSEC) either since it may cause integer overflow. 
So we divide both the + * numerator and the denominator by the same value. + * + * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN + * above to guarantee that time divided by this value is always non-zero. + */ +#define CU_RATE(val, time) \ + (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE)) + +#define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10) + +#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000) + +/* + * When the time between two kstat reads for the same CPU is less than + * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values + * for the CPU. This helps reduce cross-calls when kstat consumers read data + * very often or when they read PG utilization data and then CPU utilization + * data quickly after that. + */ +#define CU_UPDATE_THRESHOLD (NANOSEC / 10) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +typedef void (*cu_cpu_func_t)(uintptr_t, int *); + + +/* + * Flags to use for programming CPU hardware performance counters to measure + * capacity and utilization + */ +int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT; + +/* + * Initial value used for programming hardware counters + */ +uint64_t cu_cpc_preset_value = 0; + +/* + * List of CPC event requests for capacity and utilization. + */ +static kcpc_request_list_t *cu_cpc_reqs = NULL; + +/* + * When a CPU is a member of PG with a sharing relationship that is supported + * by the capacity/utilization framework, a kstat is created for that CPU and + * sharing relationship. + * + * These kstats are updated one at a time, so we can have a single scratch + * space to fill the data. + * + * CPU counter kstats fields: + * + * cu_cpu_id CPU ID for this kstat + * + * cu_generation Generation value that increases whenever any CPU goes + * offline or online. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation. + * + * cu_pg_id PG ID for the relationship described by this kstat + * + * cu_cpu_util Running value of CPU utilization for the sharing + * relationship + * + * cu_cpu_time_running Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * cu_cpu_time_stopped Total time the CU counters were stopped. + * + * cu_cpu_rate Utilization rate, expressed in operations per second. + * + * cu_cpu_rate_max Maximum observed value of utilization rate. 
+ */ +struct cu_cpu_kstat { + kstat_named_t cu_cpu_id; + kstat_named_t cu_generation; + kstat_named_t cu_pg_id; + kstat_named_t cu_cpu_util; + kstat_named_t cu_cpu_time_running; + kstat_named_t cu_cpu_time_stopped; + kstat_named_t cu_cpu_rate; + kstat_named_t cu_cpu_rate_max; +} cu_cpu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "pg_id", KSTAT_DATA_LONG }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, +}; + +/* + * Flags for controlling this module + */ +uint_t cu_flags = CU_FLAG_ENABLE; + +/* + * Error return value for cu_init() since it can't return anything to be called + * from mp_init_tbl[] (:-( + */ +static int cu_init_error = 0; + +hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN; + +hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD; + +static kmutex_t pg_cpu_kstat_lock; + + +/* + * Forward declaration of interface routines + */ +void cu_disable(void); +void cu_enable(void); +void cu_init(void); +void cu_cpc_program(cpu_t *cp, int *err); +void cu_cpc_unprogram(cpu_t *cp, int *err); +int cu_cpu_update(struct cpu *cp, boolean_t move_to); +void cu_pg_update(pghw_t *pg); + + +/* + * Forward declaration of private routines + */ +static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs); +static void cu_cpc_program_xcall(uintptr_t arg, int *err); +static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, + int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents); +static int cu_cpu_callback(cpu_setup_t what, int id, void *arg); +static void cu_cpu_disable(cpu_t *cp); +static void cu_cpu_enable(cpu_t *cp); +static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs); +static int cu_cpu_fini(cpu_t *cp); +static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info); +static int cu_cpu_kstat_update(kstat_t *ksp, int rw); +static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg); +static int cu_cpu_update_stats(cu_cntr_stats_t *stats, + uint64_t cntr_value); +static void cu_cpu_info_detach_xcall(void); + +/* + * Disable or enable Capacity Utilization counters on all CPUs. 
+ */ +void +cu_disable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_disable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +void +cu_enable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_enable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +/* + * Setup capacity and utilization support + */ +void +cu_init(void) +{ + cpu_t *cp; + + cu_init_error = 0; + if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) { + cu_init_error = -1; + return; + } + + if (kcpc_init() != 0) { + cu_init_error = -2; + return; + } + + /* + * Can't measure hardware capacity and utilization without CPU + * hardware performance counters + */ + if (cpc_ncounters <= 0) { + cu_init_error = -3; + return; + } + + /* + * Setup CPC event request queue + */ + cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP); + + mutex_enter(&cpu_lock); + + /* + * Mark flags to say that module is ready to be setup + */ + cu_flags |= CU_FLAG_READY; + + cp = cpu_active; + do { + /* + * Allocate and setup state needed to measure capacity and + * utilization + */ + if (cu_cpu_init(cp, cu_cpc_reqs) != 0) + cu_init_error = -5; + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Mark flags to say that module is on now and counters are ready to be + * programmed on all active CPUs + */ + cu_flags |= CU_FLAG_ON; + + /* + * Program counters on currently active CPUs + */ + cp = cpu_active; + do { + if (cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_FALSE) != 0) + cu_init_error = -6; + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Register callback for CPU state changes to enable and disable + * CPC counters as CPUs come on and offline + */ + register_cpu_setup_func(cu_cpu_callback, NULL); + + mutex_exit(&cpu_lock); +} + + +/* + * Return number of counter events needed to measure capacity and utilization + * for specified CPU and fill in list of CPC requests with each counter event + * needed if list where to add CPC requests is given + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated if any memory + * allocation fails + */ +static int +cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + group_t *cmt_pgs; + cu_cntr_info_t **cntr_info_array; + cpu_pg_t *cpu_pgs; + cu_cpu_info_t *cu_cpu_info; + pg_cmt_t *pg_cmt; + pghw_t *pg_hw; + cu_cntr_stats_t *stats; + int nevents; + pghw_type_t pg_hw_type; + group_iter_t iter; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * There has to be a target CPU for this + */ + if (cp == NULL) + return (-1); + + /* + * Return 0 when CPU doesn't belong to any group + */ + cpu_pgs = cp->cpu_pg; + if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1) + return (0); + + cmt_pgs = &cpu_pgs->cmt_pgs; + cu_cpu_info = cp->cpu_cu_info; + + /* + * Grab counter statistics and info + */ + if (reqs == NULL) { + stats = NULL; + cntr_info_array = NULL; + } else { + if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL) + return (-2); + + stats = cu_cpu_info->cu_cntr_stats; + cntr_info_array = cu_cpu_info->cu_cntr_info; + } + + /* + * See whether platform (or processor) specific code knows which CPC + * events to 
request, etc. are needed to measure hardware capacity and + * utilization on this machine + */ + nevents = cu_plat_cpc_init(cp, reqs, nreqs); + if (nevents >= 0) + return (nevents); + + /* + * Let common code decide which CPC events to request, etc. to measure + * capacity and utilization since platform (or processor) specific does + * not know.... + * + * Walk CPU's PG lineage and do following: + * + * - Setup CPC request, counter info, and stats needed for each counter + * event to measure capacity and and utilization for each of CPU's PG + * hardware sharing relationships + * + * - Create PG CPU kstats to export capacity and utilization for each PG + */ + nevents = 0; + group_iter_init(&iter); + while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) { + cu_cntr_info_t *cntr_info; + int nevents_save; + int nstats; + + pg_hw = (pghw_t *)pg_cmt; + pg_hw_type = pg_hw->pghw_hw; + nevents_save = nevents; + nstats = 0; + + switch (pg_hw_type) { + case PGHW_IPIPE: + if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + case PGHW_FPU: + if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + default: + /* + * Don't measure capacity and utilization for this kind + * of PG hardware relationship so skip to next PG in + * CPU's PG lineage + */ + continue; + } + + cntr_info = cntr_info_array[pg_hw_type]; + + /* + * Nothing to measure for this hardware sharing relationship + */ + if (nevents - nevents_save == 0) { + if (cntr_info != NULL) + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + cntr_info_array[pg_hw_type] = NULL; + continue; + } + + /* + * Fill in counter info for this PG hardware relationship + */ + if (cntr_info == NULL) { + cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t), + KM_NOSLEEP); + if (cntr_info == NULL) + continue; + cntr_info_array[pg_hw_type] = cntr_info; + } + cntr_info->ci_cpu = cp; + cntr_info->ci_pg = pg_hw; + cntr_info->ci_stats = &stats[nevents_save]; + cntr_info->ci_nstats = nstats; + + /* + * Create PG CPU kstats for this hardware relationship + */ + cu_cpu_kstat_create(pg_hw, cntr_info); + } + + return (nevents); +} + + +/* + * Program counters for capacity and utilization on given CPU + * + * If any of the following conditions is true, the counters are not programmed: + * + * - CU framework is disabled + * - The cpu_cu_info field of the cpu structure is NULL + * - DTrace is active + * - Counters are programmed already + * - Counters are disabled (by calls to cu_cpu_disable()) + */ +void +cu_cpc_program(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU. We disable preemption to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not ready to program counters + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + if (cp == NULL) { + *err = -2; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * If DTrace CPC is active or counters turned on already or are + * disabled, just return. 
+ */ + if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) || + cu_cpu_info->cu_disabled) { + *err = 1; + kpreempt_enable(); + return; + } + + if ((CPU->cpu_cpc_ctx != NULL) && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -4; + kpreempt_enable(); + return; + } + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + ASSERT(cpu_ctx != NULL); + ASSERT(cpu_ctx->nctx >= 0); + + ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0); + ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz); + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + + /* + * Increment index in CPU's CPC context info to point at next context + * to program + * + * NOTE: Do this now instead of after programming counters to ensure + * that index will always point at *current* context so we will + * always be able to unprogram *current* context if necessary + */ + cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx; + + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC + * context before programming counters + * + * Context is marked with KCPC_CTX_INVALID_STOPPED when context is + * unprogrammed and may be marked with KCPC_CTX_INVALID when + * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to + * invalidate all CPC contexts before they take over all the counters. + * + * This isn't necessary since these flags are only used for thread bound + * CPC contexts not CPU bound CPC contexts like ones used for capacity + * and utilization. + * + * There is no need to protect the flag update since no one is using + * this context now. + */ + ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); + + /* + * Program counters on this CPU + */ + kcpc_program(ctx, B_FALSE, B_FALSE); + + cp->cpu_cpc_ctx = ctx; + + /* + * Set state in CPU structure to say that CPU's counters are programmed + * for capacity and utilization now and that they are transitioning from + * off to on state. This will cause cu_cpu_update to update stop times + * for all programmed counters. + */ + cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON; + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Cross call wrapper routine for cu_cpc_program() + * + * Checks to make sure that counters on CPU aren't being used by someone else + * before calling cu_cpc_program() since cu_cpc_program() needs to assert that + * nobody else is using the counters to catch and prevent any broken code. + * Also, this check needs to happen on the target CPU since the CPU's CPC + * context can only be changed while running on the CPU. + * + * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is + * no valid thread bound cpc context. This is important to check to prevent + * re-programming thread counters with CU counters when CPU is coming on-line. 
+ */ +static void +cu_cpc_program_xcall(uintptr_t arg, int *err) +{ + boolean_t avoid_thread_context = (boolean_t)arg; + + kpreempt_disable(); + + if (CPU->cpu_cpc_ctx != NULL && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -100; + kpreempt_enable(); + return; + } + + if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) && + !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -200; + kpreempt_enable(); + return; + } + + cu_cpc_program(CPU, err); + kpreempt_enable(); +} + + +/* + * Unprogram counters for capacity and utilization on given CPU + * This function should be always executed on the target CPU at high PIL + */ +void +cu_cpc_unprogram(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU with preemption disabled to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not on + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * Counters turned off already + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) { + *err = 1; + kpreempt_enable(); + return; + } + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * CPU's CPC context should be current capacity and utilization CPC + * context + */ + ASSERT(cp->cpu_cpc_ctx == ctx); + if (cp->cpu_cpc_ctx != ctx) { + *err = -6; + kpreempt_enable(); + return; + } + + /* + * Unprogram counters on CPU. 
+ */ + kcpc_unprogram(ctx, B_FALSE); + + ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); + + /* + * Unset state in CPU structure saying that CPU's counters are + * programmed + */ + cp->cpu_cpc_ctx = NULL; + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Add given counter event to list of CPC requests + */ +static int +cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs, + cu_cntr_stats_t *stats, int kmem_flags, int *nevents) +{ + int n; + int retval; + uint_t flags; + + /* + * Return error when no counter event specified, counter event not + * supported by CPC's PCBE, or number of events not given + */ + if (event == NULL || kcpc_event_supported(event) == B_FALSE || + nevents == NULL) + return (-1); + + n = *nevents; + + /* + * Only count number of counter events needed if list + * where to add CPC requests not given + */ + if (reqs == NULL) { + n++; + *nevents = n; + return (-3); + } + + /* + * Return error when stats not given or not enough room on list of CPC + * requests for more counter events + */ + if (stats == NULL || (nreqs <= 0 && n >= nreqs)) + return (-4); + + /* + * Use flags in cu_cpc_flags to program counters and enable overflow + * interrupts/traps (unless PCBE can't handle overflow interrupts) so + * PCBE can catch counters before they wrap to hopefully give us an + * accurate (64-bit) virtualized counter + */ + flags = cu_cpc_flags; + if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0) + flags &= ~CPC_OVF_NOTIFY_EMT; + + /* + * Add CPC request to list + */ + retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value, + flags, 0, NULL, &stats[n], kmem_flags); + + if (retval != 0) + return (-5); + + n++; + *nevents = n; + return (0); +} + +static void +cu_cpu_info_detach_xcall(void) +{ + ASSERT(IS_HIPIL()); + + CPU->cpu_cu_info = NULL; +} + + +/* + * Enable or disable collection of capacity/utilization data for a current CPU. + * Counters are enabled if 'on' argument is True and disabled if it is False. + * This function should be always executed at high PIL + */ +static void +cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2) +{ + cpu_t *cp = (cpu_t *)arg1; + boolean_t on = (boolean_t)arg2; + int error; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + kpreempt_disable(); + ASSERT(cp == CPU); + + if (!(cu_flags & CU_FLAG_ON)) { + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + kpreempt_enable(); + return; + } + + ASSERT(!cu_cpu_info->cu_disabled || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + + if (on) { + /* + * Decrement the cu_disabled counter. + * Once it drops to zero, call cu_cpc_program. + */ + if (cu_cpu_info->cu_disabled > 0) + cu_cpu_info->cu_disabled--; + if (cu_cpu_info->cu_disabled == 0) + cu_cpc_program(CPU, &error); + } else if (cu_cpu_info->cu_disabled++ == 0) { + /* + * This is the first attempt to disable CU, so turn it off + */ + cu_cpc_unprogram(cp, &error); + ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + } + + kpreempt_enable(); +} + + +/* + * Callback for changes in CPU states + * Used to enable or disable hardware performance counters on CPUs that are + * turned on or off + * + * NOTE: cpc should be programmed/unprogrammed while running on the target CPU. + * We have to use thread_affinity_set to hop to the right CPU because these + * routines expect cpu_lock held, so we can't cross-call other CPUs while + * holding CPU lock. 
+ */ +static int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_cpu_callback(cpu_setup_t what, int id, void *arg) +{ + cpu_t *cp; + int retval = 0; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!(cu_flags & CU_FLAG_ON)) + return (-1); + + cp = cpu_get(id); + if (cp == NULL) + return (-2); + + switch (what) { + case CPU_ON: + /* + * Setup counters on CPU being turned on + */ + retval = cu_cpu_init(cp, cu_cpc_reqs); + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + break; + case CPU_INTR_ON: + /* + * Setup counters on CPU being turned on. + */ + retval = cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_TRUE); + break; + case CPU_OFF: + /* + * Disable counters on CPU being turned off. Counters will not + * be re-enabled on this CPU until it comes back online. + */ + cu_cpu_disable(cp); + ASSERT(!CU_CPC_ON(cp)); + retval = cu_cpu_fini(cp); + break; + default: + break; + } + return (retval); +} + + +/* + * Disable or enable Capacity Utilization counters on a given CPU. This function + * can be called from any CPU to disable counters on the given CPU. + */ +static void +cu_cpu_disable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE); +} + + +static void +cu_cpu_enable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE); +} + + +/* + * Setup capacity and utilization support for given CPU + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated including cpu_cu_info + * if any memory allocation fails + */ +static int +cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs) +{ + kcpc_ctx_t **ctx_ptr_array; + size_t ctx_ptr_array_sz; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int n; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_fini}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Return if not ready to setup counters yet + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + if (cp->cpu_cu_info == NULL) { + cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t), + KM_NOSLEEP); + if (cp->cpu_cu_info == NULL) + return (-2); + } + + /* + * Get capacity and utilization CPC context for CPU and check to see + * whether it has been setup already + */ + cu_cpu_info = cp->cpu_cu_info; + cu_cpu_info->cu_cpu = cp; + cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0; + + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + return (1); + } + + /* + * Should have no contexts since it hasn't been setup already + */ + ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL && + cpu_ctx->ctx_ptr_array_sz == 0); + + /* + * Determine how many CPC events needed to measure capacity and + * utilization for this CPU, allocate space for counter statistics for + * each event, and fill in list of CPC event requests with corresponding + * counter stats for each request to make attributing counter data + * easier later.... 
+ */ + n = cu_cpc_init(cp, NULL, 0); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-3); + } + + cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t), + KM_NOSLEEP); + if (cu_cpu_info->cu_cntr_stats == NULL) { + (void) cu_cpu_fini(cp); + return (-4); + } + + cu_cpu_info->cu_ncntr_stats = n; + + n = cu_cpc_init(cp, reqs, n); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-5); + } + + /* + * Create CPC context with given requests + */ + ctx_ptr_array = NULL; + ctx_ptr_array_sz = 0; + n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array, + &ctx_ptr_array_sz); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-6); + } + + /* + * Should have contexts + */ + ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0); + if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) { + (void) cu_cpu_fini(cp); + return (-7); + } + + /* + * Fill in CPC context info for CPU needed for capacity and utilization + */ + cpu_ctx->cur_index = 0; + cpu_ctx->nctx = n; + cpu_ctx->ctx_ptr_array = ctx_ptr_array; + cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz; + return (0); +} + +/* + * Tear down capacity and utilization support for given CPU + */ +static int +cu_cpu_fini(cpu_t *cp) +{ + kcpc_ctx_t *ctx; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int i; + pghw_type_t pg_hw_type; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_init}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Have to at least be ready to setup counters to have allocated + * anything that needs to be deallocated now + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + /* + * Nothing to do if CPU's capacity and utilization info doesn't exist + */ + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) + return (1); + + /* + * Tear down any existing kstats and counter info for each hardware + * sharing relationship + */ + for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS; + pg_hw_type++) { + cu_cntr_info_t *cntr_info; + + cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type]; + if (cntr_info == NULL) + continue; + + if (cntr_info->ci_kstat != NULL) { + kstat_delete(cntr_info->ci_kstat); + cntr_info->ci_kstat = NULL; + } + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + } + + /* + * Free counter statistics for CPU + */ + ASSERT(cu_cpu_info->cu_cntr_stats == NULL || + cu_cpu_info->cu_ncntr_stats > 0); + if (cu_cpu_info->cu_cntr_stats != NULL && + cu_cpu_info->cu_ncntr_stats > 0) { + kmem_free(cu_cpu_info->cu_cntr_stats, + cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t)); + cu_cpu_info->cu_cntr_stats = NULL; + cu_cpu_info->cu_ncntr_stats = 0; + } + + /* + * Get capacity and utilization CPC contexts for given CPU and check to + * see whether they have been freed already + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + /* + * Free CPC contexts for given CPU + */ + for (i = 0; i < cpu_ctx->nctx; i++) { + ctx = cpu_ctx->ctx_ptr_array[i]; + if (ctx == NULL) + continue; + kcpc_free(ctx, 0); + } + + /* + * Free CPC context pointer array + */ + kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz); + + /* + * Zero CPC info for CPU + */ + bzero(cpu_ctx, sizeof (cu_cpc_ctx_t)); + } + + /* + * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure + * that no one is going to access the cpu_cu_info whicch we are going to + * free. 
+ */ + if (cpu_is_online(cp)) + cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0); + else + cp->cpu_cu_info = NULL; + + /* + * Free CPU's capacity and utilization info + */ + kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t)); + + return (0); +} + +/* + * Create capacity & utilization kstats for given PG CPU hardware sharing + * relationship + */ +static void +cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info) +{ + char *class, *sh_name; + kstat_t *ks; + + /* + * Just return when no counter info or CPU + */ + if (cntr_info == NULL || cntr_info->ci_cpu == NULL) + return; + + /* + * Get the class name from the leaf PG that this CPU belongs to. + * If there are no PGs, just use the default class "cpu". + */ + class = pg ? pghw_type_string(pg->pghw_hw) : "cpu"; + sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu"; + + if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id, + sh_name, class, KSTAT_TYPE_NAMED, + sizeof (cu_cpu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL) + return; + + ks->ks_lock = &pg_cpu_kstat_lock; + ks->ks_data = &cu_cpu_kstat; + ks->ks_update = cu_cpu_kstat_update; + + ks->ks_private = cntr_info; + cntr_info->ci_kstat = ks; + kstat_install(cntr_info->ci_kstat); +} + + +/* + * Propagate values from CPU capacity & utilization stats to kstats + */ +static int +cu_cpu_kstat_update(kstat_t *ksp, int rw) +{ + cpu_t *cp; + cu_cntr_info_t *cntr_info = ksp->ks_private; + struct cu_cpu_kstat *kstat = &cu_cpu_kstat; + pghw_t *pg; + cu_cntr_stats_t *stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpreempt_disable(); + + /* + * Update capacity and utilization statistics needed for CPU's PG (CPU) + * kstats + */ + cp = cntr_info->ci_cpu; + (void) cu_cpu_update(cp, B_TRUE); + + pg = cntr_info->ci_pg; + stats = cntr_info->ci_stats; + kstat->cu_cpu_id.value.ui32 = cp->cpu_id; + kstat->cu_generation.value.ui32 = cp->cpu_generation; + if (pg == NULL) + kstat->cu_pg_id.value.l = -1; + else + kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id; + + kstat->cu_cpu_util.value.ui64 = stats->cs_value_total; + kstat->cu_cpu_rate.value.ui64 = stats->cs_rate; + kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max; + kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running; + kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped; + /* + * Counters are stopped now, so the cs_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + kstat->cu_cpu_time_stopped.value.ui64 += + gethrtime() - stats->cs_time_start; + + kpreempt_enable(); + + return (0); +} + +/* + * Run specified function with specified argument on a given CPU and return + * whatever the function returns + */ +static int +cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg) +{ + int error = 0; + + /* + * cpu_call() will call func on the CPU specified with given argument + * and return func's return value in last argument + */ + cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error); + return (error); +} + + +/* + * Update counter statistics on a given CPU. + * + * If move_to argument is True, execute the function on the CPU specified + * Otherwise, assume that it is already runninng on the right CPU + * + * If move_to is specified, the caller should hold cpu_lock or have preemption + * disabled. Otherwise it is up to the caller to guarantee that things do not + * change in the process. 
+ */ +int +cu_cpu_update(struct cpu *cp, boolean_t move_to) +{ + int retval; + cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info; + hrtime_t time_snap; + + ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0); + + /* + * Nothing to do if counters are not programmed + */ + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL) || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + return (0); + + /* + * Don't update CPU statistics if it was updated recently + * and provide old results instead + */ + time_snap = gethrtime(); + if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) { + DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp); + return (0); + } + + cu_cpu_info->cu_sample_time = time_snap; + + /* + * CPC counter should be read on the CPU that is running the counter. We + * either have to move ourselves to the target CPU or insure that we + * already run there. + * + * We use cross-call to the target CPU to execute kcpc_read() and + * cu_cpu_update_stats() there. + */ + retval = 0; + if (move_to) + (void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read, + (uintptr_t)cu_cpu_update_stats); + else { + retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats); + /* + * Offset negative return value by -10 so we can distinguish it + * from error return values of this routine vs kcpc_read() + */ + if (retval < 0) + retval -= 10; + } + + return (retval); +} + + +/* + * Update CPU counter statistics for current CPU. + * This function may be called from a cross-call + */ +static int +cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value) +{ + cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info; + uint_t flags; + uint64_t delta; + hrtime_t time_delta; + hrtime_t time_snap; + + if (stats == NULL) + return (-1); + + /* + * Nothing to do if counters are not programmed. This should not happen, + * but we check just in case. + */ + ASSERT(cu_flags & CU_FLAG_ON); + ASSERT(cu_cpu_info != NULL); + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL)) + return (-2); + + flags = cu_cpu_info->cu_flag; + ASSERT(flags & CU_CPU_CNTRS_ON); + if (!(flags & CU_CPU_CNTRS_ON)) + return (-2); + + /* + * Take snapshot of high resolution timer + */ + time_snap = gethrtime(); + + /* + * CU counters have just been programmed. We cannot assume that the new + * cntr_value continues from where we left off, so use the cntr_value as + * the new initial value. + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_value_start = cntr_value; + + /* + * Calculate delta in counter values between start of sampling period + * and now + */ + delta = cntr_value - stats->cs_value_start; + + /* + * Calculate time between start of sampling period and now + */ + time_delta = stats->cs_time_start ? + time_snap - stats->cs_time_start : + 0; + stats->cs_time_start = time_snap; + stats->cs_value_start = cntr_value; + + if (time_delta > 0) { /* wrap shouldn't happen */ + /* + * Update either running or stopped time based on the transition + * state + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_time_stopped += time_delta; + else + stats->cs_time_running += time_delta; + } + + /* + * Update rest of counter statistics if counter value didn't wrap + */ + if (delta > 0) { + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > cu_sample_interval_min) + stats->cs_rate = CU_RATE(delta, time_delta); + if (stats->cs_rate_max < stats->cs_rate) + stats->cs_rate_max = stats->cs_rate; + + stats->cs_value_last = delta; + stats->cs_value_total += delta; + } + + return (0); +} + +/* + * Update CMT PG utilization data. + * + * This routine computes the running total utilization and times for the + * specified PG by adding up the total utilization and counter running and + * stopped times of all CPUs in the PG and calculates the utilization rate and + * maximum rate for all CPUs in the PG. + */ +void +cu_pg_update(pghw_t *pg) +{ + pg_cpu_itr_t cpu_iter; + pghw_type_t pg_hwtype; + cpu_t *cpu; + pghw_util_t *hw_util = &pg->pghw_stats; + uint64_t old_utilization = hw_util->pghw_util; + hrtime_t now; + hrtime_t time_delta; + uint64_t utilization_delta; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + now = gethrtime(); + + pg_hwtype = pg->pghw_hw; + + /* + * Initialize running total utilization and times for PG to 0 + */ + hw_util->pghw_util = 0; + hw_util->pghw_time_running = 0; + hw_util->pghw_time_stopped = 0; + + /* + * Iterate over all CPUs in the PG and aggregate utilization, running + * time and stopped time. + */ + PG_CPU_ITR_INIT(pg, cpu_iter); + while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { + cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info; + cu_cntr_info_t *cntr_info; + cu_cntr_stats_t *stats; + + if (cu_cpu_info == NULL) + continue; + + /* + * Update utilization data for the CPU and then + * aggregate per CPU running totals for PG + */ + (void) cu_cpu_update(cpu, B_TRUE); + cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype]; + + if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL) + continue; + + hw_util->pghw_util += stats->cs_value_total; + hw_util->pghw_time_running += stats->cs_time_running; + hw_util->pghw_time_stopped += stats->cs_time_stopped; + + /* + * If counters are stopped now, the pg_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + hw_util->pghw_time_stopped += + now - stats->cs_time_start; + } + + /* + * Compute per PG instruction rate and maximum rate + */ + time_delta = now - hw_util->pghw_time_stamp; + hw_util->pghw_time_stamp = now; + + if (old_utilization == 0) + return; + + /* + * Calculate change in utilization over sampling period and set this to + * 0 if the delta would be 0 or negative which may happen if any CPUs go + * offline during the sampling period + */ + if (hw_util->pghw_util > old_utilization) + utilization_delta = hw_util->pghw_util - old_utilization; + else + utilization_delta = 0; + + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > CU_SAMPLE_INTERVAL_MIN) + hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta); + + /* + * Update the maximum observed rate + */ + if (hw_util->pghw_rate_max < hw_util->pghw_rate) + hw_util->pghw_rate_max = hw_util->pghw_rate; +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 009598f03f..62e8eeb2fe 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -1203,12 +1203,14 @@ cpu_online(cpu_t *cp) } cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN | CPU_SPARE); + CPU_NEW_GENERATION(cp); start_cpus(); cpu_stats_kstat_create(cp); cpu_create_intrstat(cp); lgrp_kstat_create(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); cpu_intr_enable(cp); /* arch-dep hook */ + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); cpu_set_state(cp); cyclic_online(cp); /* @@ -1284,6 +1286,7 @@ cpu_offline(cpu_t *cp, int flags) /* * Tell interested parties that this CPU is going offline. */ + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_OFF); /* @@ -1557,8 +1560,11 @@ out: /* * If we failed, we need to notify everyone that this CPU is back on. */ - if (error != 0) + if (error != 0) { + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); + } return (error); } @@ -2152,6 +2158,7 @@ static struct { kstat_named_t ci_core_id; kstat_named_t ci_curr_clock_Hz; kstat_named_t ci_supp_freq_Hz; + kstat_named_t ci_pg_id; #if defined(__sparcv9) kstat_named_t ci_device_ID; kstat_named_t ci_cpu_fru; @@ -2167,6 +2174,7 @@ static struct { kstat_named_t ci_ncoreperchip; kstat_named_t ci_max_cstates; kstat_named_t ci_curr_cstate; + kstat_named_t ci_cacheid; kstat_named_t ci_sktstr; #endif } cpu_info_template = { @@ -2181,6 +2189,7 @@ static struct { { "core_id", KSTAT_DATA_LONG }, { "current_clock_Hz", KSTAT_DATA_UINT64 }, { "supported_frequencies_Hz", KSTAT_DATA_STRING }, + { "pg_id", KSTAT_DATA_LONG }, #if defined(__sparcv9) { "device_ID", KSTAT_DATA_UINT64 }, { "cpu_fru", KSTAT_DATA_STRING }, @@ -2196,6 +2205,7 @@ static struct { { "ncore_per_chip", KSTAT_DATA_INT32 }, { "supported_max_cstates", KSTAT_DATA_INT32 }, { "current_cstate", KSTAT_DATA_INT32 }, + { "cache_id", KSTAT_DATA_INT32 }, { "socket_type", KSTAT_DATA_STRING }, #endif }; @@ -2253,6 +2263,9 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_core_id.value.l = pg_plat_get_core_id(cp); cpu_info_template.ci_curr_clock_Hz.value.ui64 = cp->cpu_curr_clock; + cpu_info_template.ci_pg_id.value.l = + cp->cpu_pg && cp->cpu_pg->cmt_lineage ? 
+ cp->cpu_pg->cmt_lineage->pg_id : -1; kstat_named_setstr(&cpu_info_template.ci_supp_freq_Hz, cp->cpu_supp_freqs); #if defined(__sparcv9) @@ -2273,6 +2286,7 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp); cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates; cpu_info_template.ci_curr_cstate.value.l = cpu_idle_get_cpu_state(cp); + cpu_info_template.ci_cacheid.value.i32 = cpuid_get_cacheid(cp); kstat_named_setstr(&cpu_info_template.ci_sktstr, cpuid_getsocketstr(cp)); #endif diff --git a/usr/src/uts/common/os/group.c b/usr/src/uts/common/os/group.c index 01e3f1ebdd..e46e7f600c 100644 --- a/usr/src/uts/common/os/group.c +++ b/usr/src/uts/common/os/group.c @@ -28,6 +28,7 @@ #include <sys/debug.h> #include <sys/kmem.h> #include <sys/group.h> +#include <sys/cmn_err.h> #define GRP_SET_SIZE_DEFAULT 2 @@ -352,3 +353,102 @@ group_find(group_t *g, void *e) } return ((uint_t)-1); } + +/* + * Return a string in a given buffer with list of integer entries in a group. + * The string concatenates consecutive integer ranges ax x-y. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + */ +char * +group2intlist(group_t *group, char *buffer, size_t len, int (convert)(void*)) +{ + char *ptr = buffer; + void *v; + group_iter_t iter; + boolean_t first_iteration = B_TRUE; + boolean_t first_value = B_TRUE; + int start = 0, end = 0; + + /* + * Allow for the terminating NULL-byte + */ + len = len -1; + + group_iter_init(&iter); + while ((v = group_iterate(group, &iter)) != NULL && len > 0) { + int id = convert(v); + int nbytes = 0; + + if (first_iteration) { + start = end = id; + first_iteration = B_FALSE; + } else if (end + 1 == id) { + /* + * Got consecutive ID, so extend end of range without + * doing anything since the range may extend further + */ + end = id; + } else { + if (first_value) { + first_value = B_FALSE; + } else { + *ptr++ = ','; + len--; + } + + if (len == 0) + break; + + /* + * Next ID is not consecutive, so dump IDs gotten so + * far. + */ + if (end > start + 1) /* range */ + nbytes = snprintf(ptr, len, "%d-%d", + start, end); + else if (end > start) /* different values */ + nbytes = snprintf(ptr, len, "%d,%d", + start, end); + else /* same value */ + nbytes = snprintf(ptr, len, "%d", start); + + if (nbytes <= 0) { + len = 0; + break; + } + + /* + * Advance position in the string + */ + ptr += nbytes; + len -= nbytes; + + /* + * Try finding consecutive range starting from current + * ID. 
+ */ + start = end = id; + } + } + + if (!first_value) { + *ptr++ = ','; + len--; + } + /* + * Print last ID(s) + */ + if (len > 0) { + if (end > start + 1) { + (void) snprintf(ptr, len, "%d-%d", start, end); + } else if (end != start) { + (void) snprintf(ptr, len, "%d,%d", start, end); + } else { + (void) snprintf(ptr, len, "%d", start); + } + } + + return (buffer); +} diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c index e5cab151b8..50a999dcc5 100644 --- a/usr/src/uts/common/os/kcpc.c +++ b/usr/src/uts/common/os/kcpc.c @@ -39,12 +39,17 @@ #include <sys/sunddi.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/archsystm.h> +#include <sys/promif.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> #if defined(__x86) #include <asm/clock.h> +#include <sys/xc_levels.h> #endif -kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ -kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ +static kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ +static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ @@ -73,10 +78,75 @@ static int kcpc_nullctx_panic = 0; static void kcpc_lwp_create(kthread_t *t, kthread_t *ct); static void kcpc_restore(kcpc_ctx_t *ctx); static void kcpc_save(kcpc_ctx_t *ctx); -static void kcpc_free(kcpc_ctx_t *ctx, int isexec); static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx); static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch); static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set); +static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs, + int set_flags, int kmem_flags); + +/* + * Macros to manipulate context flags. All flag updates should use one of these + * two macros + * + * Flags should be always be updated atomically since some of the updates are + * not protected by locks. + */ +#define KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag)) +#define KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag)) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +extern int kcpc_hw_load_pcbe(void); + +/* + * Return value from kcpc_hw_load_pcbe() + */ +static int kcpc_pcbe_error = 0; + +/* + * Perform one-time initialization of kcpc framework. + * This function performs the initialization only the first time it is called. + * It is safe to call it multiple times. + */ +int +kcpc_init(void) +{ + long hash; + static uint32_t kcpc_initialized = 0; + + /* + * We already tried loading platform pcbe module and failed + */ + if (kcpc_pcbe_error != 0) + return (-1); + + /* + * The kcpc framework should be initialized at most once + */ + if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0) + return (0); + + rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); + for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) + mutex_init(&kcpc_ctx_llock[hash], + NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); + + /* + * Load platform-specific pcbe module + */ + kcpc_pcbe_error = kcpc_hw_load_pcbe(); + + return (kcpc_pcbe_error == 0 ? 
0 : -1); +} void kcpc_register_pcbe(pcbe_ops_t *ops) @@ -103,8 +173,9 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) cpu_t *cp; kcpc_ctx_t *ctx; int error; + int save_spl; - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); if (kcpc_assign_reqs(set, ctx) != 0) { kcpc_ctx_free(ctx); @@ -141,28 +212,34 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) goto unbound; mutex_enter(&cp->cpu_cpc_ctxlock); + kpreempt_disable(); + save_spl = spl_xcall(); - if (cp->cpu_cpc_ctx != NULL) { + /* + * Check to see whether counters for CPU already being used by someone + * other than kernel for capacity and utilization (since kernel will + * let go of counters for user in kcpc_program() below) + */ + if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) { /* * If this CPU already has a bound set, return an error. */ + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } if (curthread->t_bind_cpu != cpuid) { + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } - cp->cpu_cpc_ctx = ctx; - /* - * Kernel preemption must be disabled while fiddling with the hardware - * registers to prevent partial updates. - */ - kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kcpc_program(ctx, B_FALSE, B_TRUE); + + splx(save_spl); kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); @@ -197,14 +274,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) if (t->t_cpc_ctx != NULL) return (EEXIST); - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); /* * The context must begin life frozen until it has been properly * programmed onto the hardware. This prevents the context ops from * worrying about it until we're ready. */ - ctx->kc_flags |= KCPC_CTX_FREEZE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); ctx->kc_hrtime = gethrtime(); if (kcpc_assign_reqs(set, ctx) != 0) { @@ -215,13 +292,13 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) ctx->kc_cpuid = -1; if (set->ks_flags & CPC_BIND_LWP_INHERIT) - ctx->kc_flags |= KCPC_CTX_LWPINHERIT; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT); ctx->kc_thread = t; t->t_cpc_ctx = ctx; /* * Permit threads to look at their own hardware counters from userland. */ - ctx->kc_flags |= KCPC_CTX_NONPRIV; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV); /* * Create the data store for this set. @@ -248,12 +325,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * Ask the backend to program the hardware. */ if (t == curthread) { + int save_spl; + kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); - pcbe_ops->pcbe_program(ctx); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); kpreempt_enable(); - } else + } else { /* * Since we are the agent LWP, we know the victim LWP is stopped * until we're done here; no need to worry about preemption or @@ -262,7 +341,8 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * still be accessed from, for instance, another CPU doing a * kcpc_invalidate_all(). */ - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + } mutex_enter(&set->ks_lock); set->ks_state |= KCPC_SET_BOUND; @@ -304,7 +384,7 @@ kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode) * notification, we flag the context as being one that * cares about overflow. 
*/ - ctx->kc_flags |= KCPC_CTX_SIGOVF; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF); } rp->kr_config = NULL; @@ -349,7 +429,7 @@ int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) { kcpc_ctx_t *ctx = set->ks_ctx; - uint64_t curtick = KCPC_GET_TICK(); + int save_spl; mutex_enter(&set->ks_lock); if ((set->ks_state & KCPC_SET_BOUND) == 0) { @@ -358,41 +438,53 @@ kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) } mutex_exit(&set->ks_lock); - if (ctx->kc_flags & KCPC_CTX_INVALID) + /* + * Kernel preemption must be disabled while reading the hardware regs, + * and if this is a CPU-bound context, while checking the CPU binding of + * the current thread. + */ + kpreempt_disable(); + save_spl = spl_xcall(); + + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) { - /* - * Kernel preemption must be disabled while reading the - * hardware regs, and if this is a CPU-bound context, while - * checking the CPU binding of the current thread. - */ - kpreempt_disable(); - if (ctx->kc_cpuid != -1) { if (curthread->t_bind_cpu != ctx->kc_cpuid) { + splx(save_spl); kpreempt_enable(); return (EAGAIN); } } if (ctx->kc_thread == curthread) { - ctx->kc_hrtime = gethrtime(); + uint64_t curtick = KCPC_GET_TICK(); + + ctx->kc_hrtime = gethrtime_waitfree(); pcbe_ops->pcbe_sample(ctx); ctx->kc_vtick += curtick - ctx->kc_rawtick; ctx->kc_rawtick = curtick; } - kpreempt_enable(); - /* * The config may have been invalidated by * the pcbe_sample op. */ - if (ctx->kc_flags & KCPC_CTX_INVALID) + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } + } + splx(save_spl); + kpreempt_enable(); + if (copyout(set->ks_data, buf, set->ks_nreqs * sizeof (uint64_t)) == -1) return (EFAULT); @@ -412,20 +504,17 @@ kcpc_stop_hw(kcpc_ctx_t *ctx) { cpu_t *cp; - ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) - == KCPC_CTX_INVALID); - kpreempt_disable(); - cp = cpu_get(ctx->kc_cpuid); - ASSERT(cp != NULL); + if (ctx->kc_cpuid == CPU->cpu_id) { + cp = CPU; + } else { + cp = cpu_get(ctx->kc_cpuid); + } + + ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx); + kcpc_cpu_stop(cp, B_FALSE); - if (cp == CPU) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - } else - kcpc_remote_stop(cp); kpreempt_enable(); } @@ -451,7 +540,7 @@ kcpc_unbind(kcpc_set_t *set) * Use kc_lock to synchronize with kcpc_restore(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (ctx->kc_cpuid == -1) { @@ -461,12 +550,14 @@ kcpc_unbind(kcpc_set_t *set) * context. It will be freed via removectx() calling * freectx() calling kcpc_free(). 
*/ - if (t == curthread && - (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { + if (t == curthread) { + int save_spl; + kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); + save_spl = spl_xcall(); + if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); kpreempt_enable(); } #ifdef DEBUG @@ -503,7 +594,6 @@ kcpc_unbind(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) kcpc_stop_hw(ctx); ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); @@ -543,12 +633,20 @@ kcpc_restart(kcpc_set_t *set) { kcpc_ctx_t *ctx = set->ks_ctx; int i; + int save_spl; ASSERT(set->ks_state & KCPC_SET_BOUND); ASSERT(ctx->kc_thread == curthread); ASSERT(ctx->kc_cpuid == -1); + for (i = 0; i < set->ks_nreqs; i++) { + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + kpreempt_disable(); + save_spl = spl_xcall(); /* * If the user is doing this on a running set, make sure the counters @@ -557,18 +655,13 @@ kcpc_restart(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) pcbe_ops->pcbe_allstop(); - for (i = 0; i < set->ks_nreqs; i++) { - *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; - pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, - 0, 0, NULL, &set->ks_req[i].kr_config, NULL); - } - /* * Ask the backend to program the hardware. */ ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); + splx(save_spl); kpreempt_enable(); return (0); @@ -604,7 +697,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) return (EINVAL); kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); kcpc_restore(ctx); kpreempt_enable(); } else if (cmd == CPC_DISABLE) { @@ -612,7 +705,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) return (EINVAL); kpreempt_disable(); kcpc_save(ctx); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); kpreempt_enable(); } else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) { /* @@ -624,10 +717,11 @@ kcpc_enable(kthread_t *t, int cmd, int enable) CPC_COUNT_USER: CPC_COUNT_SYSTEM; kpreempt_disable(); - atomic_or_uint(&ctx->kc_flags, + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); pcbe_ops->pcbe_allstop(); kpreempt_enable(); + for (i = 0; i < set->ks_nreqs; i++) { set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data); if (enable) @@ -715,12 +809,14 @@ kcpc_next_config(void *token, void *current, uint64_t **data) kcpc_ctx_t * -kcpc_ctx_alloc(void) +kcpc_ctx_alloc(int kmem_flags) { kcpc_ctx_t *ctx; long hash; - ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP); + ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags); + if (ctx == NULL) + return (NULL); hash = CPC_HASH_CTX(ctx); mutex_enter(&kcpc_ctx_llock[hash]); @@ -909,9 +1005,10 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) */ if (kcpc_nullctx_panic) panic("null cpc context, thread %p", (void *)t); - - cmn_err(CE_WARN, +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context found in overflow handler!\n"); +#endif atomic_add_32(&kcpc_nullctx_count, 1); } else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) { /* @@ -935,13 
+1032,20 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) * so freeze the context. The interrupt handler * has already stopped the counter hardware. */ - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); atomic_or_uint(&ctx->kc_pics[i].kp_flags, KCPC_PIC_OVERFLOWED); } } aston(t); + } else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + /* + * Thread context is no longer valid, but here may be a valid + * CPU context. + */ + return (curthread->t_cpu->cpu_cpc_ctx); } + return (NULL); } @@ -956,6 +1060,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) kcpc_ctx_t *ctx; uint64_t bitmap; uint8_t *state; + int save_spl; if (pcbe_ops == NULL || (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0) @@ -985,6 +1090,13 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) (*dtrace_cpc_fire)(bitmap); ctx = curthread->t_cpu->cpu_cpc_ctx; + if (ctx == NULL) { +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context in" + "hardware overflow handler!\n"); +#endif + return (DDI_INTR_CLAIMED); + } /* Reset any counters that have overflowed */ for (i = 0; i < ctx->kc_set->ks_nreqs; i++) { @@ -1025,7 +1137,12 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) * the middle of updating it, no AST has been posted, and so we * should sample the counters here, and restart them with no * further fuss. + * + * The CPU's CPC context may disappear as a result of cross-call which + * has higher PIL on x86, so protect the context by raising PIL to the + * cross-call level. */ + save_spl = spl_xcall(); if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) { uint64_t curtick = KCPC_GET_TICK(); @@ -1035,6 +1152,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) pcbe_ops->pcbe_sample(ctx); pcbe_ops->pcbe_program(ctx); } + splx(save_spl); return (DDI_INTR_CLAIMED); } @@ -1087,7 +1205,7 @@ kcpc_overflow_ast() * Otherwise, re-enable the counters and continue life as before. */ kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); kpreempt_enable(); return (0); @@ -1099,43 +1217,68 @@ kcpc_overflow_ast() static void kcpc_save(kcpc_ctx_t *ctx) { + int err; + int save_spl; + + kpreempt_disable(); + save_spl = spl_xcall(); + if (ctx->kc_flags & KCPC_CTX_INVALID) { - if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) + if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + splx(save_spl); + kpreempt_enable(); return; + } /* * This context has been invalidated but the counters have not * been stopped. Stop them here and mark the context stopped. */ - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); + kpreempt_enable(); return; } pcbe_ops->pcbe_allstop(); - if (ctx->kc_flags & KCPC_CTX_FREEZE) + if (ctx->kc_flags & KCPC_CTX_FREEZE) { + splx(save_spl); + kpreempt_enable(); return; + } /* * Need to sample for all reqs into each req's current mpic. 
*/ - ctx->kc_hrtime = gethrtime(); + ctx->kc_hrtime = gethrtime_waitfree(); ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick; pcbe_ops->pcbe_sample(ctx); + + /* + * Program counter for measuring capacity and utilization since user + * thread isn't using counter anymore + */ + ASSERT(ctx->kc_cpuid == -1); + cu_cpc_program(CPU, &err); + splx(save_spl); + kpreempt_enable(); } static void kcpc_restore(kcpc_ctx_t *ctx) { + int save_spl; + mutex_enter(&ctx->kc_lock); + if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) == - KCPC_CTX_INVALID) + KCPC_CTX_INVALID) { /* * The context is invalidated but has not been marked stopped. * We mark it as such here because we will not start the * counters during this context switch. */ - ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED; - + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + } if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) { mutex_exit(&ctx->kc_lock); @@ -1151,7 +1294,7 @@ kcpc_restore(kcpc_ctx_t *ctx) * doing this, we're asking kcpc_free() to cv_wait() until * kcpc_restore() has completed. */ - ctx->kc_flags |= KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE); mutex_exit(&ctx->kc_lock); /* @@ -1159,14 +1302,17 @@ kcpc_restore(kcpc_ctx_t *ctx) * don't do an explicit pcbe_allstop() here because they should have * been stopped already by the last consumer. */ - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kpreempt_disable(); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); + kpreempt_enable(); /* * Wake the agent thread if it's waiting in kcpc_free(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags &= ~KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE); cv_signal(&ctx->kc_condv); mutex_exit(&ctx->kc_lock); } @@ -1177,7 +1323,6 @@ kcpc_restore(kcpc_ctx_t *ctx) * counters when the idle thread is switched on, and they start them again when * it is switched off. */ - /*ARGSUSED*/ void kcpc_idle_save(struct cpu *cp) @@ -1242,7 +1387,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) rw_exit(&kcpc_cpuctx_lock); return; } - cctx = kcpc_ctx_alloc(); + cctx = kcpc_ctx_alloc(KM_SLEEP); kcpc_ctx_clone(ctx, cctx); rw_exit(&kcpc_cpuctx_lock); @@ -1250,7 +1395,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * Copy the parent context's kc_flags field, but don't overwrite * the child's in case it was modified during kcpc_ctx_clone. */ - cctx->kc_flags |= ctx->kc_flags; + KCPC_CTX_FLAG_SET(cctx, ctx->kc_flags); cctx->kc_thread = ct; cctx->kc_cpuid = -1; ct->t_cpc_set = cctx->kc_set; @@ -1265,13 +1410,14 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * set to UINT64_MAX, and their pic's overflow flag turned on * so that our trap() processing knows to send a signal. 
*/ - atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); for (i = 0; i < ks->ks_nreqs; i++) { kcpc_request_t *kr = &ks->ks_req[i]; if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) { *(kr->kr_data) = UINT64_MAX; - kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED; + atomic_or_uint(&kr->kr_picp->kp_flags, + KCPC_PIC_OVERFLOWED); } } ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW; @@ -1315,7 +1461,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) */ /*ARGSUSED*/ -static void +void kcpc_free(kcpc_ctx_t *ctx, int isexec) { int i; @@ -1329,7 +1475,7 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) mutex_enter(&ctx->kc_lock); while (ctx->kc_flags & KCPC_CTX_RESTORE) cv_wait(&ctx->kc_condv, &ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (isexec) { @@ -1356,21 +1502,22 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) if (cp != NULL) { mutex_enter(&cp->cpu_cpc_ctxlock); kcpc_stop_hw(ctx); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); ASSERT(curthread->t_cpc_ctx == NULL); } else { + int save_spl; + /* * Thread-bound context; stop _this_ CPU's counters. */ kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - kpreempt_enable(); + save_spl = spl_xcall(); + kcpc_unprogram(ctx, B_TRUE); curthread->t_cpc_ctx = NULL; + splx(save_spl); + kpreempt_enable(); } /* @@ -1435,7 +1582,7 @@ kcpc_invalidate_all(void) for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) { mutex_enter(&kcpc_ctx_llock[hash]); for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&kcpc_ctx_llock[hash]); } } @@ -1451,7 +1598,7 @@ kcpc_invalidate_config(void *token) ASSERT(ctx != NULL); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1462,18 +1609,11 @@ kcpc_passivate(void) { kcpc_ctx_t *ctx = curthread->t_cpc_ctx; kcpc_set_t *set = curthread->t_cpc_set; + int save_spl; if (set == NULL) return; - /* - * We're cleaning up after this thread; ensure there are no dangling - * CPC pointers left behind. The context and set will be freed by - * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in - * the case of a CPU-bound set. - */ - curthread->t_cpc_ctx = NULL; - if (ctx == NULL) { /* * This thread has a set but no context; it must be a CPU-bound @@ -1491,6 +1631,8 @@ kcpc_passivate(void) return; } + kpreempt_disable(); + save_spl = spl_xcall(); curthread->t_cpc_set = NULL; /* @@ -1500,13 +1642,20 @@ kcpc_passivate(void) * INVALID_STOPPED flag here and kcpc_restore() setting the flag during * a context switch. */ - - kpreempt_disable(); if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, + kcpc_unprogram(ctx, B_TRUE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); } + + /* + * We're cleaning up after this thread; ensure there are no dangling + * CPC pointers left behind. The context and set will be freed by + * freectx(). 
+ */ + curthread->t_cpc_ctx = NULL; + + splx(save_spl); kpreempt_enable(); } @@ -1667,7 +1816,7 @@ kcpc_invalidate(kthread_t *t) kcpc_ctx_t *ctx = t->t_cpc_ctx; if (ctx != NULL) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1691,6 +1840,648 @@ kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third) "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0); } +/* + * Create one or more CPC context for given CPU with specified counter event + * requests + * + * If number of requested counter events is less than or equal number of + * hardware counters on a CPU and can all be assigned to the counters on a CPU + * at the same time, then make one CPC context. + * + * Otherwise, multiple CPC contexts are created to allow multiplexing more + * counter events than existing counters onto the counters by iterating through + * all of the CPC contexts, programming the counters with each CPC context one + * at a time and measuring the resulting counter values. Each of the resulting + * CPC contexts contains some number of requested counter events less than or + * equal the number of counters on a CPU depending on whether all the counter + * events can be programmed on all the counters at the same time or not. + * + * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying + * whether memory allocation should be non-blocking or not. The code will try + * to allocate *whole* CPC contexts if possible. If there is any memory + * allocation failure during the allocations needed for a given CPC context, it + * will skip allocating that CPC context because it cannot allocate the whole + * thing. Thus, the only time that it will end up allocating none (ie. no CPC + * contexts whatsoever) is when it cannot even allocate *one* whole CPC context + * without a memory allocation failure occurring. + */ +int +kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags, + kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz) +{ + kcpc_ctx_t **ctx_ptrs; + int nctx; + int nctx_ptrs; + int nreqs; + kcpc_request_t *reqs; + + if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL || + req_list == NULL || req_list->krl_cnt < 1) + return (-1); + + /* + * Allocate number of sets assuming that each set contains one and only + * one counter event request for each counter on a CPU + */ + nreqs = req_list->krl_cnt; + nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters; + ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags); + if (ctx_ptrs == NULL) + return (-2); + + /* + * Fill in sets of requests + */ + nctx = 0; + reqs = req_list->krl_list; + while (nreqs > 0) { + kcpc_ctx_t *ctx; + kcpc_set_t *set; + int subcode; + + /* + * Allocate CPC context and set for requested counter events + */ + ctx = kcpc_ctx_alloc(kmem_flags); + set = kcpc_set_create(reqs, nreqs, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + + /* + * Determine assignment of requested counter events to specific + * counters + */ + if (kcpc_assign_reqs(set, ctx) != 0) { + /* + * May not be able to assign requested counter events + * to all counters since all counters may not be able + * to do all events, so only do one counter event in + * set of counter requests when this happens since at + * least one of the counters must be able to do the + * event. 
+ */ + kcpc_free_set(set); + set = kcpc_set_create(reqs, 1, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + if (kcpc_assign_reqs(set, ctx) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't " + "assign counter event %s!\n", + set->ks_req->kr_event); +#endif + kcpc_free_set(set); + kcpc_ctx_free(ctx); + reqs++; + nreqs--; + continue; + } + } + + /* + * Allocate memory needed to hold requested counter event data + */ + set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), + kmem_flags); + if (set->ks_data == NULL) { + kcpc_free_set(set); + kcpc_ctx_free(ctx); + break; + } + + /* + * Configure requested counter events + */ + if (kcpc_configure_reqs(ctx, set, &subcode) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, + "!kcpc_cpu_ctx_create: can't configure " + "set of counter event requests!\n"); +#endif + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + kmem_free(set->ks_data, + set->ks_nreqs * sizeof (uint64_t)); + kcpc_free_set(set); + kcpc_ctx_free(ctx); + continue; + } + + /* + * Point set of counter event requests at this context and fill + * in CPC context + */ + set->ks_ctx = ctx; + ctx->kc_set = set; + ctx->kc_cpuid = cp->cpu_id; + ctx->kc_thread = curthread; + + ctx_ptrs[nctx] = ctx; + + /* + * Update requests and how many are left to be assigned to sets + */ + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + + /* + * Increment number of CPC contexts and allocate bigger array + * for context pointers as needed + */ + nctx++; + if (nctx >= nctx_ptrs) { + kcpc_ctx_t **new; + int new_cnt; + + /* + * Allocate more CPC contexts based on how many + * contexts allocated so far and how many counter + * requests left to assign + */ + new_cnt = nctx_ptrs + + ((nreqs + cpc_ncounters - 1) / cpc_ncounters); + new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *), + kmem_flags); + if (new == NULL) + break; + + /* + * Copy contents of old sets into new ones + */ + bcopy(ctx_ptrs, new, + nctx_ptrs * sizeof (kcpc_ctx_t *)); + + /* + * Free old array of context pointers and use newly + * allocated one instead now + */ + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + ctx_ptrs = new; + nctx_ptrs = new_cnt; + } + } + + /* + * Return NULL if no CPC contexts filled in + */ + if (nctx == 0) { + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + *ctx_ptr_array = NULL; + *ctx_ptr_array_sz = 0; + return (-2); + } + + *ctx_ptr_array = ctx_ptrs; + *ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *); + return (nctx); +} + +/* + * Return whether PCBE supports given counter event + */ +boolean_t +kcpc_event_supported(char *event) +{ + if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Program counters on current CPU with given CPC context + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then unprogram counters for kernel *before* programming them + * with specified CPC context. + * + * kcpc_{program,unprogram}() may be called either directly by a thread running + * on the target CPU or from a cross-call from another CPU. To protect + * programming and unprogramming from being interrupted by cross-calls, callers + * who execute kcpc_{program,unprogram} should raise PIL to the level used by + * cross-calls. 
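+ *
+ * For example, a thread programming its own context (as in kcpc_bind_thread()
+ * and kcpc_restore() above) does roughly:
+ *
+ *	kpreempt_disable();
+ *	save_spl = spl_xcall();
+ *	kcpc_program(ctx, B_TRUE, B_TRUE);
+ *	splx(save_spl);
+ *	kpreempt_enable();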
+ */ +void +kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1) + return; + + /* + * Unprogram counters for kernel measuring hardware capacity and + * utilization + */ + if (cu_interpose == B_TRUE) { + cu_cpc_unprogram(CPU, &error); + } else { + kcpc_set_t *set = ctx->kc_set; + int i; + + ASSERT(set != NULL); + + /* + * Since cu_interpose is false, we are programming CU context. + * In general, PCBE can continue from the state saved in the + * set, but it is not very reliable, so we start again from the + * preset value. + */ + for (i = 0; i < set->ks_nreqs; i++) { + /* + * Reset the virtual counter value to the preset value. + */ + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + + /* + * Reset PCBE to the preset value. + */ + pcbe_ops->pcbe_configure(0, NULL, + set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + } + + /* + * Program counters with specified CPC context + */ + ctx->kc_rawtick = KCPC_GET_TICK(); + pcbe_ops->pcbe_program(ctx); + + /* + * Denote that counters programmed for thread or CPU CPC context + * differently + */ + if (for_thread == B_TRUE) + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + else + CPU->cpu_cpc_ctx = ctx; +} + +/* + * Unprogram counters with given CPC context on current CPU + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then program counters for the kernel capacity and utilization + * *after* unprogramming them for given CPC context. + * + * See the comment for kcpc_program regarding the synchronization with + * cross-calls. 
+ */ +void +kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1 || + (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) { + return; + } + + /* + * Specified CPC context to be unprogrammed should be bound to current + * CPU or thread + */ + ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx); + + /* + * Stop counters + */ + pcbe_ops->pcbe_allstop(); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + + /* + * Allow kernel to interpose on counters and program them for its own + * use to measure hardware capacity and utilization if cu_interpose + * argument is true + */ + if (cu_interpose == B_TRUE) + cu_cpc_program(CPU, &error); +} + +/* + * Read CPU Performance Counter (CPC) on current CPU and call specified update + * routine with data for each counter event currently programmed on CPU + */ +int +kcpc_read(kcpc_update_func_t update_func) +{ + kcpc_ctx_t *ctx; + int i; + kcpc_request_t *req; + int retval; + kcpc_set_t *set; + + ASSERT(IS_HIPIL()); + + /* + * Can't grab locks or block because may be called inside dispatcher + */ + kpreempt_disable(); + + ctx = CPU->cpu_cpc_ctx; + if (ctx == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Read counter data from current CPU + */ + pcbe_ops->pcbe_sample(ctx); + + set = ctx->kc_set; + if (set == NULL || set->ks_req == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Call update function with preset pointer and data for each CPC event + * request currently programmed on current CPU + */ + req = set->ks_req; + retval = 0; + for (i = 0; i < set->ks_nreqs; i++) { + int ret; + + if (req[i].kr_data == NULL) + break; + + ret = update_func(req[i].kr_ptr, *req[i].kr_data); + if (ret < 0) + retval = ret; + } + + kpreempt_enable(); + + return (retval); +} + +/* + * Initialize list of counter event requests + */ +kcpc_request_list_t * +kcpc_reqs_init(int nreqs, int kmem_flags) +{ + kcpc_request_list_t *req_list; + kcpc_request_t *reqs; + + if (nreqs < 1) + return (NULL); + + req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags); + if (req_list == NULL) + return (NULL); + + reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags); + if (reqs == NULL) { + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (NULL); + } + + req_list->krl_list = reqs; + req_list->krl_cnt = 0; + req_list->krl_max = nreqs; + return (req_list); +} + + +/* + * Add counter event request to given list of counter event requests + */ +int +kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset, + uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags) +{ + kcpc_request_t *req; + + ASSERT(req_list->krl_max != 0); + if (req_list == NULL || req_list->krl_list == NULL) + return (-1); + + /* + * Allocate more space (if needed) + */ + if (req_list->krl_cnt > req_list->krl_max) { + kcpc_request_t *new; + kcpc_request_t *old; + + old = req_list->krl_list; + new = kmem_zalloc((req_list->krl_max + + cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags); + if (new == NULL) + return (-2); + + req_list->krl_list = new; + bcopy(old, req_list->krl_list, + 
req_list->krl_cnt * sizeof (kcpc_request_t)); + kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + req_list->krl_max += cpc_ncounters; + } + + /* + * Fill in request as much as possible now, but some fields will need + * to be set when request is assigned to a set. + */ + req = &req_list->krl_list[req_list->krl_cnt]; + req->kr_config = NULL; + req->kr_picnum = -1; /* have CPC pick this */ + req->kr_index = -1; /* set when assigning request to set */ + req->kr_data = NULL; /* set when configuring request */ + (void) strcpy(req->kr_event, event); + req->kr_preset = preset; + req->kr_flags = flags; + req->kr_nattrs = nattrs; + req->kr_attr = attr; + /* + * Keep pointer given by caller to give to update function when this + * counter event is sampled/read + */ + req->kr_ptr = ptr; + + req_list->krl_cnt++; + + return (0); +} + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +int +kcpc_reqs_reset(kcpc_request_list_t *req_list) +{ + /* + * Return when pointer to request list structure or request is NULL or + * when max requests is less than or equal to 0 + */ + if (req_list == NULL || req_list->krl_list == NULL || + req_list->krl_max <= 0) + return (-1); + + /* + * Zero out requests and number of requests used + */ + bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + return (0); +} + +/* + * Free given list of counter event requests + */ +int +kcpc_reqs_fini(kcpc_request_list_t *req_list) +{ + kmem_free(req_list->krl_list, + req_list->krl_max * sizeof (kcpc_request_t)); + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (0); +} + +/* + * Create set of given counter event requests + */ +static kcpc_set_t * +kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags) +{ + int i; + kcpc_set_t *set; + + /* + * Allocate set and assign number of requests in set and flags + */ + set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags); + if (set == NULL) + return (NULL); + + if (nreqs < cpc_ncounters) + set->ks_nreqs = nreqs; + else + set->ks_nreqs = cpc_ncounters; + + set->ks_flags = set_flags; + + /* + * Allocate requests needed, copy requests into set, and set index into + * data for each request (which may change when we assign requested + * counter events to counters) + */ + set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) * + set->ks_nreqs, kmem_flags); + if (set->ks_req == NULL) { + kmem_free(set, sizeof (kcpc_set_t)); + return (NULL); + } + + bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs); + + for (i = 0; i < set->ks_nreqs; i++) + set->ks_req[i].kr_index = i; + + return (set); +} + + +/* + * Stop counters on current CPU. + * + * If preserve_context is true, the caller is interested in the CPU's CPC + * context and wants it to be preserved. + * + * If preserve_context is false, the caller does not need the CPU's CPC context + * to be preserved, so it is set to NULL. + */ +static void +kcpc_cpustop_func(boolean_t preserve_context) +{ + kpreempt_disable(); + + /* + * Someone already stopped this context before us, so there is nothing + * to do. + */ + if (CPU->cpu_cpc_ctx == NULL) { + kpreempt_enable(); + return; + } + + kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE); + /* + * If CU does not use counters, then clear the CPU's CPC context + * If the caller requested to preserve context it should disable CU + * first, so there should be no CU context now. 
+ */ + ASSERT(!preserve_context || !CU_CPC_ON(CPU)); + if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU)) + CPU->cpu_cpc_ctx = NULL; + + kpreempt_enable(); +} + +/* + * Stop counters on given CPU and set its CPC context to NULL unless + * preserve_context is true. + */ +void +kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_cpustop_func, + preserve_context, 0); +} + +/* + * Program the context on the current CPU + */ +static void +kcpc_remoteprogram_func(kcpc_ctx_t *ctx, uintptr_t arg) +{ + boolean_t for_thread = (boolean_t)arg; + + ASSERT(ctx != NULL); + + kpreempt_disable(); + kcpc_program(ctx, for_thread, B_TRUE); + kpreempt_enable(); +} + +/* + * Program counters on given CPU + */ +void +kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_remoteprogram_func, (uintptr_t)ctx, + (uintptr_t)B_FALSE); +} + char * kcpc_list_attrs(void) { diff --git a/usr/src/uts/common/os/pg.c b/usr/src/uts/common/os/pg.c index 067670dbbb..835ae3d322 100644 --- a/usr/src/uts/common/os/pg.c +++ b/usr/src/uts/common/os/pg.c @@ -110,7 +110,11 @@ static cpu_pg_t bootstrap_pg_data; * and the next free id in the set. */ static bitset_t pg_id_set; -static pgid_t pg_id_next = 0; + +/* + * ID space starts from 1 to assume that root has ID 0; + */ +static pgid_t pg_id_next = 1; /* * Default and externed PG ops vectors diff --git a/usr/src/uts/common/os/pghw.c b/usr/src/uts/common/os/pghw.c index ca59db8602..534cb2c540 100644 --- a/usr/src/uts/common/os/pghw.c +++ b/usr/src/uts/common/os/pghw.c @@ -34,6 +34,7 @@ #include <sys/pg.h> #include <sys/pghw.h> #include <sys/cpu_pm.h> +#include <sys/cap_util.h> /* * Processor Groups: Hardware sharing relationship layer @@ -116,10 +117,10 @@ struct pghw_kstat { kstat_named_t pg_hw; kstat_named_t pg_policy; } pghw_kstat = { - { "id", KSTAT_DATA_UINT64 }, + { "id", KSTAT_DATA_UINT32 }, { "pg_class", KSTAT_DATA_STRING }, - { "ncpus", KSTAT_DATA_UINT64 }, - { "instance_id", KSTAT_DATA_UINT64 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "instance_id", KSTAT_DATA_UINT32 }, { "hardware", KSTAT_DATA_STRING }, { "policy", KSTAT_DATA_STRING }, }; @@ -127,12 +128,92 @@ struct pghw_kstat { kmutex_t pghw_kstat_lock; /* + * Capacity and Utilization PG kstats + * + * These kstats are updated one at a time, so we can have a single scratch space + * to fill the data. + * + * kstat fields: + * + * pgid PG ID for PG described by this kstat + * + * pg_ncpus Number of CPUs within this PG + * + * pg_cpus String describing CPUs within this PG + * + * pg_sharing Name of sharing relationship for this PG + * + * pg_generation Generation value that increases whenever any CPU leaves + * or joins PG. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation + * + * pg_hw_util Running value of PG utilization for the sharing + * relationship + * + * pg_hw_util_time_running + * Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * pg_hw_util_time_stopped Total time the CU counters were stopped. + * + * pg_hw_util_rate Utilization rate, expressed in operations per second. + * + * pg_hw_util_rate_max Maximum observed value of utilization rate. 
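+ *
+ * These kstats are created with module "pg" and name "hardware" (see
+ * pghw_kstat_create() below), so from userland they can be examined with,
+ * e.g., kstat -m pg -n hardware.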
+ */ +struct pghw_cu_kstat { + kstat_named_t pg_id; + kstat_named_t pg_ncpus; + kstat_named_t pg_generation; + kstat_named_t pg_hw_util; + kstat_named_t pg_hw_util_time_running; + kstat_named_t pg_hw_util_time_stopped; + kstat_named_t pg_hw_util_rate; + kstat_named_t pg_hw_util_rate_max; + kstat_named_t pg_cpus; + kstat_named_t pg_sharing; +} pghw_cu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, + { "cpus", KSTAT_DATA_STRING }, + { "sharing_relation", KSTAT_DATA_STRING }, +}; + +/* + * Calculate the string size to represent NCPUS. Allow 5 digits for each CPU ID + * plus one space per CPU plus NUL byte in the end. This is only an estimate, + * since we try to compress CPU ranges as x-y. In the worst case the string + * representation of CPUs may be truncated. + */ +#define CPUSTR_LEN(ncpus) ((ncpus) * 6) + +/* + * Maximum length of the string that represents list of CPUs + */ +static int pg_cpulist_maxlen = 0; + +static void pghw_kstat_create(pghw_t *); +static int pghw_kstat_update(kstat_t *, int); +static int pghw_cu_kstat_update(kstat_t *, int); +static int cpu2id(void *); + +/* * hwset operations */ static group_t *pghw_set_create(pghw_type_t); static void pghw_set_add(group_t *, pghw_t *); static void pghw_set_remove(group_t *, pghw_t *); +static void pghw_cpulist_alloc(pghw_t *); +static int cpu2id(void *); + /* * Initialize the physical portion of a hardware PG */ @@ -150,6 +231,7 @@ pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw) pghw_set_add(hwset, pg); pg->pghw_hw = hw; + pg->pghw_generation = 0; pg->pghw_instance = pg_plat_hw_instance_id(cp, hw); pghw_kstat_create(pg); @@ -186,8 +268,20 @@ pghw_fini(pghw_t *pg) pg->pghw_instance = (id_t)PGHW_INSTANCE_ANON; pg->pghw_hw = (pghw_type_t)-1; - if (pg->pghw_kstat) + if (pg->pghw_kstat != NULL) kstat_delete(pg->pghw_kstat); + + /* + * Destroy string representation of CPUs + */ + if (pg->pghw_cpulist != NULL) { + kmem_free(pg->pghw_cpulist, + pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + } + + if (pg->pghw_cu_kstat != NULL) + kstat_delete(pg->pghw_cu_kstat); } /* @@ -344,11 +438,10 @@ pghw_set_remove(group_t *hwset, pghw_t *pg) ASSERT(result == 0); } - /* * Return a string name given a pg_hw sharing type */ -static char * +char * pghw_type_string(pghw_type_t hw) { switch (hw) { @@ -374,6 +467,34 @@ pghw_type_string(pghw_type_t hw) } /* + * Return a short string name given a pg_hw sharing type + */ +char * +pghw_type_shortstring(pghw_type_t hw) +{ + switch (hw) { + case PGHW_IPIPE: + return ("instr_pipeline"); + case PGHW_CACHE: + return ("Cache"); + case PGHW_FPU: + return ("FPU"); + case PGHW_MPIPE: + return ("memory_pipeline"); + case PGHW_CHIP: + return ("Socket"); + case PGHW_MEMORY: + return ("Memory"); + case PGHW_POW_ACTIVE: + return ("CPU_PM_Active"); + case PGHW_POW_IDLE: + return ("CPU_PM_Idle"); + default: + return ("unknown"); + } +} + +/* * Create / Update routines for PG hw kstats * * It is the intention of these kstats to provide some level @@ -383,11 +504,14 @@ pghw_type_string(pghw_type_t hw) void pghw_kstat_create(pghw_t *pg) { + char *class = pghw_type_string(pg->pghw_hw); + /* * Create a physical pg kstat */ if ((pg->pghw_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, - "pg", "pg", KSTAT_TYPE_NAMED, + "pg", "pg", + 
KSTAT_TYPE_NAMED, sizeof (pghw_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { /* Class string, hw string, and policy string */ @@ -400,6 +524,28 @@ pghw_kstat_create(pghw_t *pg) pg->pghw_kstat->ks_private = pg; kstat_install(pg->pghw_kstat); } + + if (pg_cpulist_maxlen == 0) + pg_cpulist_maxlen = CPUSTR_LEN(max_ncpus); + + /* + * Create a physical pg kstat + */ + if ((pg->pghw_cu_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, + "hardware", class, + KSTAT_TYPE_NAMED, + sizeof (pghw_cu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + pg->pghw_cu_kstat->ks_lock = &pghw_kstat_lock; + pg->pghw_cu_kstat->ks_data = &pghw_cu_kstat; + pg->pghw_cu_kstat->ks_update = pghw_cu_kstat_update; + pg->pghw_cu_kstat->ks_private = pg; + pg->pghw_cu_kstat->ks_data_size += strlen(class) + 1; + /* Allow space for CPU strings */ + pg->pghw_cu_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX; + pg->pghw_cu_kstat->ks_data_size += pg_cpulist_maxlen; + kstat_install(pg->pghw_cu_kstat); + } } int @@ -411,11 +557,147 @@ pghw_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); - pgsp->pg_id.value.ui64 = ((pg_t *)pg)->pg_id; - pgsp->pg_ncpus.value.ui64 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); - pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance; + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + pgsp->pg_instance_id.value.ui32 = pg->pghw_instance; kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name); kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw)); kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg)); return (0); } + +int +pghw_cu_kstat_update(kstat_t *ksp, int rw) +{ + struct pghw_cu_kstat *pgsp = &pghw_cu_kstat; + pghw_t *pg = ksp->ks_private; + pghw_util_t *hw_util = &pg->pghw_stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + + /* + * Allocate memory for the string representing the list of CPUs in PG. + * This memory should persist past the call to pghw_cu_kstat_update() + * since the kstat snapshot routine will reference this memory. + */ + pghw_cpulist_alloc(pg); + + if (pg->pghw_kstat_gen != pg->pghw_generation) { + /* + * PG kstat generation number is out of sync with PG's + * generation mumber. It means that some CPUs could have joined + * or left PG and it is not possible to compare the numbers + * obtained before and after the generation change. + * + * Reset the maximum utilization rate and start computing it + * from scratch. + */ + hw_util->pghw_util = 0; + hw_util->pghw_rate_max = 0; + pg->pghw_kstat_gen = pg->pghw_generation; + } + + /* + * We can't block on CPU lock because when PG is destroyed (under + * cpu_lock) it tries to delete this kstat and it will wait for us to + * complete which will never happen since we are waiting for cpu_lock to + * drop. Deadlocks are fun! 
+ */ + if (mutex_tryenter(&cpu_lock)) { + if (pg->pghw_cpulist != NULL && + *(pg->pghw_cpulist) == '\0') { + (void) group2intlist(&(((pg_t *)pg)->pg_cpus), + pg->pghw_cpulist, pg->pghw_cpulist_len, cpu2id); + } + cu_pg_update(pg); + mutex_exit(&cpu_lock); + } + + pgsp->pg_generation.value.ui32 = pg->pghw_kstat_gen; + pgsp->pg_hw_util.value.ui64 = hw_util->pghw_util; + pgsp->pg_hw_util_time_running.value.ui64 = hw_util->pghw_time_running; + pgsp->pg_hw_util_time_stopped.value.ui64 = hw_util->pghw_time_stopped; + pgsp->pg_hw_util_rate.value.ui64 = hw_util->pghw_rate; + pgsp->pg_hw_util_rate_max.value.ui64 = hw_util->pghw_rate_max; + if (pg->pghw_cpulist != NULL) + kstat_named_setstr(&pgsp->pg_cpus, pg->pghw_cpulist); + else + kstat_named_setstr(&pgsp->pg_cpus, ""); + + kstat_named_setstr(&pgsp->pg_sharing, pghw_type_string(pg->pghw_hw)); + + return (0); +} + +/* + * Update the string representation of CPUs in PG (pg->pghw_cpulist). + * The string representation is used for kstats. + * + * The string is allocated if it has not already been or if it is already + * allocated and PG has more CPUs now. If PG has smaller or equal number of + * CPUs, but the actual CPUs may have changed, the string is reset to the empty + * string causes the string representation to be recreated. The pghw_generation + * field is used to detect whether CPUs within the pg may have changed. + */ +static void +pghw_cpulist_alloc(pghw_t *pg) +{ + uint_t ncpus = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + size_t len = CPUSTR_LEN(ncpus); + + /* + * If the pghw_cpulist string is already allocated we need to make sure + * that it has sufficient length. Also if the set of CPUs may have + * changed, we need to re-generate the string. + */ + if (pg->pghw_cpulist != NULL && + pg->pghw_kstat_gen != pg->pghw_generation) { + if (len <= pg->pghw_cpulist_len) { + /* + * There is sufficient space in the pghw_cpulist for + * the new set of CPUs. Just clear the string to trigger + * re-generation of list of CPUs + */ + *(pg->pghw_cpulist) = '\0'; + } else { + /* + * There is, potentially, insufficient space in + * pghw_cpulist, so reallocate the string. + */ + ASSERT(strlen(pg->pghw_cpulist) < pg->pghw_cpulist_len); + kmem_free(pg->pghw_cpulist, pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + pg->pghw_cpulist_len = 0; + } + } + + if (pg->pghw_cpulist == NULL) { + /* + * Allocate space to hold cpulist. + * + * Length can not be bigger that the maximum space we have + * allowed for the kstat buffer + */ + if (len > pg_cpulist_maxlen) + len = pg_cpulist_maxlen; + if (len > 0) { + pg->pghw_cpulist = kmem_zalloc(len, KM_NOSLEEP); + if (pg->pghw_cpulist != NULL) + pg->pghw_cpulist_len = len; + } + } +} + +static int +cpu2id(void *v) +{ + cpu_t *cp = (cpu_t *)v; + + ASSERT(v != NULL); + + return (cp->cpu_id); +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 9006be10f4..5133e80e69 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -98,6 +98,7 @@ CHKHDRS= \ byteorder.h \ callb.h \ callo.h \ + cap_util.h \ cpucaps.h \ cpucaps_impl.h \ ccompile.h \ diff --git a/usr/src/uts/common/sys/cap_util.h b/usr/src/uts/common/sys/cap_util.h new file mode 100644 index 0000000000..7e25ba6697 --- /dev/null +++ b/usr/src/uts/common/sys/cap_util.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CAP_UTIL_H +#define _SYS_CAP_UTIL_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/kcpc.h> +#include <sys/cpc_impl.h> +#include <sys/pghw.h> +#include <sys/cmt.h> + +#ifdef _KERNEL + +/* + * Capacity and utilization flags for each CPU + */ +#define CU_CPU_CNTRS_ON 1 /* CPU performance counters are on */ +#define CU_CPU_CNTRS_OFF_ON 2 /* Off -> on transition */ + +/* + * Macro that returns whether CPU performance counters turned on for given CPU + */ +#define CU_CPC_ON(cp) \ + ((cp) != NULL && (cp)->cpu_cu_info != NULL && \ + ((cp)->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + + +/* + * Per counter statistics + */ +typedef struct cu_cntr_stats { + hrtime_t cs_time_running; /* running total of time counting */ + hrtime_t cs_time_stopped; /* ... time not counting */ + hrtime_t cs_time_start; /* start time of current sample */ + uint64_t cs_value_start; /* starting value for next sample */ + uint64_t cs_value_last; /* last value */ + uint64_t cs_value_total; /* running total */ + uint64_t cs_rate; /* observed rate since last */ + uint64_t cs_rate_max; /* maximum rate */ + kcpc_request_t *cs_cpc_req; /* corresponding CPC request */ + struct cpu *cs_cpu_start; /* CPU where starting value gotten */ +} cu_cntr_stats_t; + + +/* + * Counter info for a PG hardware sharing relationship + */ +typedef struct cu_cntr_info { + cpu_t *ci_cpu; /* CPU being measured */ + pghw_t *ci_pg; /* hardware PG being measured */ + kstat_t *ci_kstat; /* kstats being exported */ + cu_cntr_stats_t *ci_stats; /* counter statistics */ + uint_t ci_nstats; /* number of statistics */ +} cu_cntr_info_t; + + +/* + * Each CPU can have one or more CPC contexts for measuring capacity and + * utilization + * + * One CPC context is needed per CPU if the counter events needed to measure + * capacity and utilization on each CPU can be programmed onto all the counters + * on a CPU at the same time and there are fewer or same number of desired + * counter events as counters on each CPU. Otherwise, the desired counter + * events are assigned across multiple CPC contexts, so the contexts and their + * counter events can be multiplexed onto the counters over time to get the + * data for all of the counter events. 
+ */ +typedef struct cu_cpc_ctx { + int cur_index; /* index for current context */ + int nctx; /* number of CPC contexts */ + kcpc_ctx_t **ctx_ptr_array; /* array of context pointers */ + size_t ctx_ptr_array_sz; /* size of array */ +} cu_cpc_ctx_t; + +/* + * Per CPU capacity and utilization info + */ +typedef struct cu_cpu_info { + struct cpu *cu_cpu; /* CPU for the statistics */ + uint_t cu_flag; /* capacity & utilization flag */ + hrtime_t cu_sample_time; /* when last sample taken */ + cu_cpc_ctx_t cu_cpc_ctx; /* performance counter contexts */ + cu_cntr_stats_t *cu_cntr_stats; /* counter statistics array */ + uint_t cu_ncntr_stats; /* number of counter statistics */ + uint_t cu_disabled; /* count of disable requests */ + /* + * Per PG hardware sharing relationship counter info + */ + cu_cntr_info_t *cu_cntr_info[PGHW_NUM_COMPONENTS]; +} cu_cpu_info_t; + +/* + * COMMON INTERFACE ROUTINES + */ + +/* + * Setup capacity and utilization support + */ +extern void cu_init(void); + +/* + * Tear down capacity and utilization support + */ +extern int cu_fini(void); + +/* + * Program CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_program(struct cpu *, int *); + +/* + * Unprogram CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_unprogram(struct cpu *, int *); + +/* + * Update counter statistics on a given CPU + */ +extern int cu_cpu_update(struct cpu *, boolean_t); + +/* + * Update utilization and capacity data for CMT PG + */ +extern void cu_pg_update(pghw_t *); + +/* + * Disable or enable capacity and utilization on all CPUs + */ +extern void cu_disable(void); +extern void cu_enable(void); + +/* + * PLATFORM SPECIFIC INTERFACE ROUTINES + */ +extern int cu_plat_cpc_init(cpu_t *, kcpc_request_list_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CAP_UTIL_H */ diff --git a/usr/src/uts/common/sys/cmt.h b/usr/src/uts/common/sys/cmt.h index 4e7ed28656..afdb6730a6 100644 --- a/usr/src/uts/common/sys/cmt.h +++ b/usr/src/uts/common/sys/cmt.h @@ -63,6 +63,7 @@ typedef struct pg_cmt { int cmt_nchildren; /* # of children CMT PGs */ struct group cmt_cpus_actv; struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */ + kstat_t *cmt_kstat; /* cmt kstats exported */ } pg_cmt_t; /* diff --git a/usr/src/uts/common/sys/cpc_impl.h b/usr/src/uts/common/sys/cpc_impl.h index 1b57c76c10..ae89c90508 100644 --- a/usr/src/uts/common/sys/cpc_impl.h +++ b/usr/src/uts/common/sys/cpc_impl.h @@ -131,7 +131,7 @@ typedef struct _kcpc_ctx kcpc_ctx_t; struct _kcpc_ctx { struct _kcpc_set *kc_set; /* linked list of all bound sets */ - uint32_t kc_flags; + volatile uint_t kc_flags; kcpc_pic_t *kc_pics; /* pointer to array of per-pic data */ hrtime_t kc_hrtime; /* gethrtime() at last sample */ uint64_t kc_vtick; /* virtualized %tick */ @@ -214,20 +214,18 @@ extern hrtime_t tsc_read(void); struct cpu; extern uint_t cpc_ncounters; -extern kmutex_t kcpc_ctx_llock[]; /* protects ctx_list */ -extern kcpc_ctx_t *kcpc_ctx_list[]; /* head of list */ extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ extern void kcpc_invalidate_all(void); extern void kcpc_passivate(void); -extern void kcpc_remote_stop(struct cpu *cp); +extern void kcpc_cpu_stop(struct cpu *, boolean_t); extern int kcpc_pcbe_tryload(const char *, uint_t, uint_t, uint_t); -extern void kcpc_remote_program(struct cpu *cp); +extern void kcpc_cpu_program(struct cpu *, kcpc_ctx_t *); extern void kcpc_register_dcpc(void 
(*func)(uint64_t)); extern void kcpc_unregister_dcpc(void); -extern kcpc_ctx_t *kcpc_ctx_alloc(void); +extern kcpc_ctx_t *kcpc_ctx_alloc(int); extern int kcpc_assign_reqs(struct _kcpc_set *, kcpc_ctx_t *); extern void kcpc_ctx_free(kcpc_ctx_t *); extern int kcpc_configure_reqs(kcpc_ctx_t *, struct _kcpc_set *, int *); diff --git a/usr/src/uts/common/sys/cpc_pcbe.h b/usr/src/uts/common/sys/cpc_pcbe.h index 7522a9bf82..eb168fcf2c 100644 --- a/usr/src/uts/common/sys/cpc_pcbe.h +++ b/usr/src/uts/common/sys/cpc_pcbe.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +36,6 @@ #ifndef _SYS_CPC_PCBE_H #define _SYS_CPC_PCBE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/cpc_impl.h> @@ -51,6 +48,8 @@ extern "C" { */ #define PCBE_VER_1 1 +#define PCBE_IMPL_NAME_P4HT "Pentium 4 with HyperThreading" + typedef struct __pcbe_ops { uint_t pcbe_ver; uint_t pcbe_caps; diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index aece259a35..b52192b419 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -222,6 +222,16 @@ typedef struct cpu { uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can @@ -597,6 +607,13 @@ extern struct cpu *curcpup(void); #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. + */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + #endif /* _KERNEL || _KMEMUSER */ /* @@ -726,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ +/* + * CPU state change events + * + * Various subsystems need to know when CPUs change their state. They get this + * information by registering CPU state change callbacks using + * register_cpu_setup_func(). Whenever any CPU changes its state, the callback + * function is called. The callback function is passed three arguments: + * + * Event, described by cpu_setup_t + * CPU ID + * Transparent pointer passed when registering the callback + * + * The callback function is called with cpu_lock held. The return value from the + * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG + * events. 
For these two events, non-zero return value indicates a failure and + * prevents successful completion of the operation. + * + * New events may be added in the future. Callback functions should ignore any + * events that they do not understand. + * + * The following events provide notification callbacks: + * + * CPU_INIT A new CPU is started and added to the list of active CPUs + * This event is only used during boot + * + * CPU_CONFIG A newly inserted CPU is prepared for starting running code + * This event is called by DR code + * + * CPU_UNCONFIG CPU has been powered off and needs cleanup + * This event is called by DR code + * + * CPU_ON CPU is enabled but does not run anything yet + * + * CPU_INTR_ON CPU is enabled and has interrupts enabled + * + * CPU_OFF CPU is going offline but can still run threads + * + * CPU_CPUPART_OUT CPU is going to move out of its partition + * + * CPU_CPUPART_IN CPU is going to move to a new partition + * + * CPU_SETUP CPU is set up during boot and can run threads + */ typedef enum { CPU_INIT, CPU_CONFIG, @@ -734,7 +794,8 @@ typedef enum { CPU_OFF, CPU_CPUPART_IN, CPU_CPUPART_OUT, - CPU_SETUP + CPU_SETUP, + CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); @@ -748,6 +809,13 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); /* + * Call specified function on the given CPU + */ +typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); +extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); + + +/* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. */ diff --git a/usr/src/uts/common/sys/group.h b/usr/src/uts/common/sys/group.h index bb5613bc35..2db1ac01bb 100644 --- a/usr/src/uts/common/sys/group.h +++ b/usr/src/uts/common/sys/group.h @@ -101,6 +101,17 @@ void group_remove_at(group_t *, uint_t); */ uint_t group_find(group_t *, void *); +/* + * Convert a group to a string with list of integers. + * + * The consecutive integer values are represented using x-y notation. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + * The output buffer and its length are specfied in the arguments. + */ +extern char *group2intlist(group_t *, char *, size_t, int (convert)(void*)); + #endif /* !_KERNEL && !_KMEMUSER */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/kcpc.h b/usr/src/uts/common/sys/kcpc.h index f30e093f78..d90b1c1d29 100644 --- a/usr/src/uts/common/sys/kcpc.h +++ b/usr/src/uts/common/sys/kcpc.h @@ -28,11 +28,13 @@ #include <sys/cpc_impl.h> #include <sys/ksynch.h> +#include <sys/types.h> #ifdef __cplusplus extern "C" { #endif + /* * Kernel clients need this file in order to know what a request is and how to * program one. @@ -74,8 +76,33 @@ struct _kcpc_request { uint_t kr_flags; uint_t kr_nattrs; kcpc_attr_t *kr_attr; + void *kr_ptr; /* Ptr assigned by requester */ }; +typedef struct _kcpc_request_list { + kcpc_request_t *krl_list; /* counter event requests */ + int krl_cnt; /* how many requests */ + int krl_max; /* max request entries */ +} kcpc_request_list_t; + +/* + * Type of update function to be called when reading counters on current CPU in + * kcpc_read() + */ +typedef int (*kcpc_update_func_t)(void *, uint64_t); + +/* + * Type of read function to be called when reading counters on current CPU + * (ie. 
should be same type signature as kcpc_read()) + */ +typedef int (*kcpc_read_func_t)(kcpc_update_func_t); + + +/* + * Initialize the kcpc framework + */ +extern int kcpc_init(void); + /* * Bind the set to the indicated thread. * Returns 0 on success, or an errno in case of error. If EINVAL is returned, @@ -96,6 +123,56 @@ extern int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick); /* + * Create CPC context containing specified list of requested counter events + */ +extern int kcpc_cpu_ctx_create(struct cpu *cp, kcpc_request_list_t *req_list, + int kmem_flags, kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz); + +/* + * Returns whether specified counter event is supported + */ +extern boolean_t kcpc_event_supported(char *event); + +/* + * Initialize list of CPC event requests + */ +extern kcpc_request_list_t *kcpc_reqs_init(int nreqs, int kmem_flags); + +/* + * Add counter event request to given list of counter event requests + */ +extern int kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, + uint64_t preset, uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, + int kmem_flags); + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +extern int kcpc_reqs_reset(kcpc_request_list_t *req_list); + +/* + * Free given list of counter event requests + */ +extern int kcpc_reqs_fini(kcpc_request_list_t *req_list); + +/* + * Read CPC data for given event on current CPU + */ +extern int kcpc_read(kcpc_update_func_t); + +/* + * Program current CPU with given CPC context + */ +extern void kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, + boolean_t cu_interpose); + +/* + * Unprogram CPC counters on current CPU + */ +extern void kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose); + +/* * Unbind a request and release the associated resources. */ extern int kcpc_unbind(kcpc_set_t *set); @@ -128,6 +205,8 @@ extern void kcpc_idle_restore(struct cpu *cp); extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ +extern void kcpc_free(kcpc_ctx_t *ctx, int isexec); + /* * 'dtrace_cpc_in_use' contains the number of currently active cpc provider * based enablings. See the block comment in uts/common/os/dtrace_subr.c for diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h index ab8b0a9bbe..f0550dba7e 100644 --- a/usr/src/uts/common/sys/pghw.h +++ b/usr/src/uts/common/sys/pghw.h @@ -89,6 +89,27 @@ typedef enum pghw_type { typedef uintptr_t pghw_handle_t; /* + * Representation of PG hardware utilization NOTE: All the sums listed below are + * the sums of running total of each item for each CPU in the PG (eg. 
+ * sum(utilization) is sum of running total utilization of each CPU in PG) + */ +typedef struct pghw_util { + uint64_t pghw_util; /* sum(utilization) */ + uint64_t pghw_rate; /* Last observed utilization rate */ + uint64_t pghw_rate_max; /* Max observed rate (in units/sec) */ + hrtime_t pghw_time_stamp; /* Timestamp of last snapshot */ + /* + * sum(time utilization counters on) + */ + hrtime_t pghw_time_running; + /* + * sum(time utilization counters off) + */ + hrtime_t pghw_time_stopped; +} pghw_util_t; + + +/* * Processor Group (physical sharing relationship) */ typedef struct pghw { @@ -97,6 +118,23 @@ typedef struct pghw { id_t pghw_instance; /* sharing instance identifier */ pghw_handle_t pghw_handle; /* hw specific opaque handle */ kstat_t *pghw_kstat; /* physical kstats exported */ + kstat_t *pghw_cu_kstat; /* for capacity and utilization */ + /* + * pghw_generation should be updated by superclasses whenever PG changes + * significanly (e.g. new CPUs join or leave PG). + */ + uint_t pghw_generation; /* generation number */ + + /* + * The following fields are used by PGHW cu kstats + */ + char *pghw_cpulist; /* list of CPUs */ + size_t pghw_cpulist_len; /* length of the list */ + /* + * Generation number at kstat update time + */ + uint_t pghw_kstat_gen; + pghw_util_t pghw_stats; /* Utilization data */ } pghw_t; /* @@ -111,32 +149,35 @@ typedef struct cpu_physid { /* * Physical PG initialization / CPU service hooks */ -void pghw_init(pghw_t *, cpu_t *, pghw_type_t); -void pghw_fini(pghw_t *); -void pghw_cpu_add(pghw_t *, cpu_t *); -pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); +extern void pghw_init(pghw_t *, cpu_t *, pghw_type_t); +extern void pghw_fini(pghw_t *); +extern void pghw_cpu_add(pghw_t *, cpu_t *); +extern pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); /* * Physical ID cache creation / destruction */ -void pghw_physid_create(cpu_t *); -void pghw_physid_destroy(cpu_t *); +extern void pghw_physid_create(cpu_t *); +extern void pghw_physid_destroy(cpu_t *); /* * CPU / PG hardware related seach operations */ -pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); -pghw_t *pghw_find_by_instance(id_t, pghw_type_t); -group_t *pghw_set_lookup(pghw_type_t); - -void pghw_kstat_create(pghw_t *); -int pghw_kstat_update(kstat_t *, int); +extern pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); +extern pghw_t *pghw_find_by_instance(id_t, pghw_type_t); +extern group_t *pghw_set_lookup(pghw_type_t); /* Hardware sharing relationship platform interfaces */ -int pg_plat_hw_shared(cpu_t *, pghw_type_t); -int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); -id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); -pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); +extern int pg_plat_hw_shared(cpu_t *, pghw_type_t); +extern int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); +extern id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); +extern pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); + +/* + * String representation of the hardware type + */ +extern char *pghw_type_string(pghw_type_t); +extern char *pghw_type_shortstring(pghw_type_t); /* * What comprises a "core" may vary across processor implementations, @@ -144,7 +185,7 @@ pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); * is no PGHW_CORE type, but we provide an interface here to allow platforms * to express cpu <=> core mappings. 
*/ -id_t pg_plat_get_core_id(cpu_t *); +extern id_t pg_plat_get_core_id(cpu_t *); #endif /* !_KERNEL && !_KMEMUSER */ diff --git a/usr/src/uts/common/sys/systm.h b/usr/src/uts/common/sys/systm.h index 84ccfb9991..4c3dc7f886 100644 --- a/usr/src/uts/common/sys/systm.h +++ b/usr/src/uts/common/sys/systm.h @@ -270,6 +270,7 @@ int spl8(void); void splx(int); void set_base_spl(void); int __ipltospl(int); +int spl_xcall(void); void softcall_init(void); void softcall(void (*)(void *), void *); diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 8e532685c7..8621e3ef55 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -2669,6 +2669,13 @@ cpuid_get_clogid(cpu_t *cpu) return (cpu->cpu_m.mcpu_cpi->cpi_clogid); } +int +cpuid_get_cacheid(cpu_t *cpu) +{ + ASSERT(cpuid_checkpass(cpu, 1)); + return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); +} + uint_t cpuid_get_procnodeid(cpu_t *cpu) { diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index 786cd29e8f..fc0ef9e260 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -1179,6 +1179,12 @@ getpil(void) } int +spl_xcall(void) +{ + return (splr(ipltospl(XCALL_PIL))); +} + +int interrupts_enabled(void) { ulong_t flag; diff --git a/usr/src/uts/i86pc/os/mp_call.c b/usr/src/uts/i86pc/os/mp_call.c index 5725b18d85..df18f16588 100644 --- a/usr/src/uts/i86pc/os/mp_call.c +++ b/usr/src/uts/i86pc/os/mp_call.c @@ -32,6 +32,8 @@ #include <sys/systm.h> #include <sys/promif.h> #include <sys/xc_levels.h> +#include <sys/spl.h> +#include <sys/bitmap.h> /* * Interrupt another CPU. @@ -54,3 +56,38 @@ poke_cpu(int cpun) */ send_dirint(cpun, XC_CPUPOKE_PIL); } + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + cpuset_t set; + + if (panicstr) + return; + + /* + * Prevent CPU from going off-line + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. 
+ */ + if (CPU == cp) { + int save_spl = splr(ipltospl(XC_HI_PIL)); + + (*func)(arg1, arg2); + splx(save_spl); + } else { + CPUSET_ONLY(set, cp->cpu_id); + xc_call((xc_arg_t)arg1, (xc_arg_t)arg2, 0, CPUSET2BV(set), + (xc_func_t)func); + } + kpreempt_enable(); +} diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 7470a1ef38..80e371850b 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -1,3 +1,4 @@ + /* * CDDL HEADER START * @@ -61,6 +62,7 @@ #include <sys/hpet.h> #include <sys/sunddi.h> #include <sys/sunndi.h> +#include <sys/cpc_pcbe.h> #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) @@ -1680,3 +1682,37 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU as needed + * + * May return 0 when platform or processor specific code knows that no CPC + * events should be programmed on this CPU or -1 when platform or processor + * specific code doesn't know which counter events are best to use and common + * code should decide for itself + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + const char *impl_name; + + /* + * Return error if pcbe_ops not set + */ + if (pcbe_ops == NULL) + return (-1); + + /* + * Return that no CPC events should be programmed on hyperthreaded + * Pentium 4 and return error for all other x86 processors to tell + * common code to decide what counter events to program on those CPUs + * for measuring hardware capacity and utilization + */ + impl_name = pcbe_ops->pcbe_impl_name(); + if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0) + return (0); + else + return (-1); +} diff --git a/usr/src/uts/i86pc/sys/xc_levels.h b/usr/src/uts/i86pc/sys/xc_levels.h index 31ba6441fa..0492e48a1d 100644 --- a/usr/src/uts/i86pc/sys/xc_levels.h +++ b/usr/src/uts/i86pc/sys/xc_levels.h @@ -35,6 +35,7 @@ extern "C" { #define XC_CPUPOKE_PIL 11 /* poke to cause wakeup, no service function */ #define XC_SYS_PIL 13 /* should be defined elsewhere */ #define XC_HI_PIL 15 /* cross call with service function */ +#define XCALL_PIL XC_HI_PIL /* alias for XC_HI_PIL */ #ifdef __cplusplus } diff --git a/usr/src/uts/intel/genunix/Makefile b/usr/src/uts/intel/genunix/Makefile index db7b60ff14..ab0073268f 100644 --- a/usr/src/uts/intel/genunix/Makefile +++ b/usr/src/uts/intel/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -84,6 +84,8 @@ $(PATCH_BUILD)IPCTF_TARGET = CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +CPPFLAGS += -I$(UTSBASE)/i86pc + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/intel/ia32/os/cpc_subr.c b/usr/src/uts/intel/ia32/os/cpc_subr.c index 1a71c1c431..1e3049a399 100644 --- a/usr/src/uts/intel/ia32/os/cpc_subr.c +++ b/usr/src/uts/intel/ia32/os/cpc_subr.c @@ -188,33 +188,6 @@ kcpc_hw_load_pcbe(void) cpuid_getmodel(CPU), cpuid_getstep(CPU))); } -static int -kcpc_remotestop_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); - - return (0); -} - -/* - * Ensure the counters are stopped on the given processor. 
- * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remotestop_func); -} - /* * Called by the generic framework to check if it's OK to bind a set to a CPU. */ @@ -292,28 +265,3 @@ kcpc_hw_lwp_hook(void) mutex_exit(&cpu_lock); return (0); } - -static int -kcpc_remoteprogram_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); - - return (0); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remoteprogram_func); -} diff --git a/usr/src/uts/intel/pcbe/opteron_pcbe.c b/usr/src/uts/intel/pcbe/opteron_pcbe.c index 18a309eca6..cb97d21b78 100644 --- a/usr/src/uts/intel/pcbe/opteron_pcbe.c +++ b/usr/src/uts/intel/pcbe/opteron_pcbe.c @@ -563,26 +563,6 @@ opt_pcbe_list_attrs(void) return ("edge,pc,inv,cmask,umask"); } -/*ARGSUSED*/ -static uint64_t -opt_pcbe_event_coverage(char *event) -{ - /* - * Fortunately, all counters can count all events. - */ - return (0xF); -} - -static uint64_t -opt_pcbe_overflow_bitmap(void) -{ - /* - * Unfortunately, this chip cannot detect which counter overflowed, so - * we must act as if they all did. - */ - return (0xF); -} - static amd_generic_event_t * find_generic_event(char *name) { @@ -608,6 +588,32 @@ find_event(char *name) } /*ARGSUSED*/ +static uint64_t +opt_pcbe_event_coverage(char *event) +{ + /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* + * Fortunately, all counters can count all events. + */ + return (0xF); +} + +static uint64_t +opt_pcbe_overflow_bitmap(void) +{ + /* + * Unfortunately, this chip cannot detect which counter overflowed, so + * we must act as if they all did. 
+ */ + return (0xF); +} + +/*ARGSUSED*/ static int opt_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token) diff --git a/usr/src/uts/intel/pcbe/p4_pcbe.c b/usr/src/uts/intel/pcbe/p4_pcbe.c index 0fffcd2961..8c05c599a3 100644 --- a/usr/src/uts/intel/pcbe/p4_pcbe.c +++ b/usr/src/uts/intel/pcbe/p4_pcbe.c @@ -522,7 +522,7 @@ static const char * p4_pcbe_impl_name(void) { if (p4_htt) - return ("Pentium 4 with HyperThreading"); + return (PCBE_IMPL_NAME_P4HT); return ("Pentium 4"); } diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index e5f1cababc..0bb28d4d49 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -626,6 +626,7 @@ extern int cpuid_get_chipid(struct cpu *); extern id_t cpuid_get_coreid(struct cpu *); extern int cpuid_get_pkgcoreid(struct cpu *); extern int cpuid_get_clogid(struct cpu *); +extern int cpuid_get_cacheid(struct cpu *); extern uint32_t cpuid_get_apicid(struct cpu *); extern uint_t cpuid_get_procnodeid(struct cpu *cpu); extern uint_t cpuid_get_procnodes_per_pkg(struct cpu *cpu); diff --git a/usr/src/uts/sun4/os/mp_call.c b/usr/src/uts/sun4/os/mp_call.c index f881a23755..f7ee31a276 100644 --- a/usr/src/uts/sun4/os/mp_call.c +++ b/usr/src/uts/sun4/os/mp_call.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Facilities for cross-processor subroutine calls using "mailbox" interrupts. */ @@ -37,6 +34,7 @@ #include <sys/systm.h> #include <sys/machsystm.h> #include <sys/intr.h> +#include <sys/xc_impl.h> /* * Interrupt another CPU. @@ -64,3 +62,40 @@ poke_cpu(int cpun) xt_one(cpun, setsoftint_tl1, poke_cpu_inum, 0); } + +extern int xc_spl_enter[]; + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + if (panicstr) + return; + + /* + * Prevent CPU from going offline + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. + */ + if (CPU != cp) { + xc_one(cp->cpu_id, (xcfunc_t *)func, (uint64_t)arg1, + (uint64_t)arg2); + } else { + int lcx; + int opl; + + XC_SPL_ENTER(lcx, opl); + func(arg1, arg2); + XC_SPL_EXIT(lcx, opl); + } + + kpreempt_enable(); +} diff --git a/usr/src/uts/sun4/os/x_call.c b/usr/src/uts/sun4/os/x_call.c index 0c5c06c36a..521f740c82 100644 --- a/usr/src/uts/sun4/os/x_call.c +++ b/usr/src/uts/sun4/os/x_call.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/archsystm.h> #include <sys/machsystm.h> @@ -226,6 +224,15 @@ xc_init(void) */ /* + * spl_xcall - set PIL to xcall level + */ +int +spl_xcall(void) +{ + return (splr(XCALL_PIL)); +} + +/* * xt_one - send a "x-trap" to a cpu */ void diff --git a/usr/src/uts/sun4u/genunix/Makefile b/usr/src/uts/sun4u/genunix/Makefile index 8d7c87f065..1a77e4c916 100644 --- a/usr/src/uts/sun4u/genunix/Makefile +++ b/usr/src/uts/sun4u/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -96,6 +96,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c index 8ba9aa3b6e..8a0fa0e6dc 100644 --- a/usr/src/uts/sun4u/os/cmp.c +++ b/usr/src/uts/sun4u/os/cmp.c @@ -303,3 +303,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4u/os/cpc_subr.c b/usr/src/uts/sun4u/os/cpc_subr.c index a9c64681fd..cfe1fd283d 100644 --- a/usr/src/uts/sun4u/os/cpc_subr.c +++ b/usr/src/uts/sun4u/os/cpc_subr.c @@ -45,6 +45,7 @@ #include <sys/cpc_pcbe.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/kcpc.h> uint64_t cpc_level15_inum; /* used in interrupt.s */ int cpc_has_overflow_intr; /* set in cheetah.c */ @@ -111,26 +112,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -142,21 +123,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/genunix/Makefile b/usr/src/uts/sun4v/genunix/Makefile index e629630fb5..28d4f2aeeb 100644 --- a/usr/src/uts/sun4v/genunix/Makefile +++ b/usr/src/uts/sun4v/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # @@ -104,6 +104,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c index 4e80f06f32..8eedd1a69d 100644 --- a/usr/src/uts/sun4v/os/cmp.c +++ b/usr/src/uts/sun4v/os/cmp.c @@ -208,3 +208,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4v/os/cpc_subr.c b/usr/src/uts/sun4v/os/cpc_subr.c index 8e58d85513..089c582541 100644 --- a/usr/src/uts/sun4v/os/cpc_subr.c +++ b/usr/src/uts/sun4v/os/cpc_subr.c @@ -130,26 +130,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -161,21 +141,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c index 13c428130e..d4b69e5de4 100644 --- a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c +++ b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c @@ -399,6 +399,12 @@ static uint64_t ni2_pcbe_event_coverage(char *event) { /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* * Fortunately, both pic0 and pic1 can count all events. */ return (0x3); |
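The capacity and utilization kstats created in pghw_kstat_create() above are exported under module "pg" with name "hardware", one instance per hardware-sharing PG. A minimal user-level reader based on libkstat might look like the sketch below; it walks the kstat chain and dumps every named statistic generically, since the exact statistic names live elsewhere in pghw.c and are not shown in this diff. This is only an illustration of how the new observability data could be consumed, not code from the commit. Compile with -lkstat.

#include <kstat.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/* Print a single named statistic, handling the types used by the PG kstats */
static void
print_named(kstat_named_t *knp)
{
	switch (knp->data_type) {
	case KSTAT_DATA_UINT32:
		(void) printf("\t%-28s %" PRIu32 "\n", knp->name,
		    knp->value.ui32);
		break;
	case KSTAT_DATA_UINT64:
		(void) printf("\t%-28s %" PRIu64 "\n", knp->name,
		    knp->value.ui64);
		break;
	case KSTAT_DATA_STRING:
		(void) printf("\t%-28s %s\n", knp->name,
		    KSTAT_NAMED_STR_PTR(knp));
		break;
	default:
		(void) printf("\t%-28s (unhandled type %d)\n", knp->name,
		    knp->data_type);
	}
}

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *knp;
	unsigned int i;

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}

	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if (strcmp(ksp->ks_module, "pg") != 0 ||
		    strcmp(ksp->ks_name, "hardware") != 0 ||
		    ksp->ks_type != KSTAT_TYPE_NAMED)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;

		(void) printf("pg %d (class %s):\n", ksp->ks_instance,
		    ksp->ks_class);
		knp = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < ksp->ks_ndata; i++)
			print_named(&knp[i]);
	}

	(void) kstat_close(kc);
	return (0);
}

On a CMT system this should print one block per hardware-sharing PG, including the CPU list produced by group2intlist() and the utilization fields filled in by pghw_cu_kstat_update().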
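The group2intlist() declaration added to sys/group.h documents an x-y notation for the CPU list, e.g. "1,2-5,8". The stand-alone program below only illustrates that output format; it is not the kernel implementation, and the helper name and sample input are made up.

#include <stdio.h>

/*
 * Compress a sorted array of ids into the x-y list notation used by the PG
 * kstats ("0,2-5,8"): consecutive runs collapse into "start-end".
 */
static void
intlist_print(const int *ids, int n)
{
	int i = 0;

	while (i < n) {
		int start = ids[i];
		int end = start;

		/* Extend the run while the next id is consecutive */
		while (i + 1 < n && ids[i + 1] == end + 1)
			end = ids[++i];

		if (start == end)
			(void) printf("%d", start);
		else
			(void) printf("%d-%d", start, end);

		if (++i < n)
			(void) printf(",");
	}
	(void) printf("\n");
}

int
main(void)
{
	int cpus[] = { 0, 2, 3, 4, 5, 8 };

	intlist_print(cpus, 6);		/* prints "0,2-5,8" */
	return (0);
}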
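The expanded comment in sys/cpuvar.h describes the CPU state change notification mechanism used by subsystems such as the new capacity/utilization code. Below is a hedged kernel-side sketch of a subscriber: the callback body and names are hypothetical; only the cpu_setup_func_t type and register_cpu_setup_func() interface from cpuvar.h are assumed, and registration is shown under cpu_lock, matching how the callbacks themselves are documented to run with cpu_lock held.

#include <sys/cpuvar.h>
#include <sys/cmn_err.h>

/*
 * Hypothetical callback: log CPUs as they come on line and go off line.
 * Called with cpu_lock held, so it must not block for long or try to
 * reacquire cpu_lock.
 */
static int
example_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
{
	switch (what) {
	case CPU_ON:
	case CPU_INTR_ON:
		cmn_err(CE_CONT, "?cpu %d coming on line\n", cpuid);
		break;
	case CPU_OFF:
		cmn_err(CE_CONT, "?cpu %d going off line\n", cpuid);
		break;
	default:
		break;		/* ignore events we do not understand */
	}

	/* Return value only matters for CPU_CONFIG/CPU_UNCONFIG */
	return (0);
}

static void
example_register(void)
{
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(example_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}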
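sys/cpuvar.h also gains cpu_call(), implemented for i86pc and sun4 in mp_call.c above: it runs a function on a chosen CPU at cross-call priority, calling it directly at raised PIL when already on that CPU. The caller sketch below is hypothetical; it assumes the target function has completed by the time cpu_call() returns, which is how the capacity/utilization code in this commit appears to use the interface.

#include <sys/cpuvar.h>
#include <sys/types.h>

/*
 * Hypothetical cross-call target: store the target CPU's id through arg1.
 * Runs at high PIL on the target CPU, so it must not block.
 */
/*ARGSUSED*/
static void
example_read_cpuid(uintptr_t arg1, uintptr_t arg2)
{
	*(processorid_t *)arg1 = CPU->cpu_id;
}

/*
 * Run example_read_cpuid() on the given CPU; the caller is expected to keep
 * 'cp' from disappearing (for example by holding cpu_lock).
 */
static processorid_t
example_query_cpu(cpu_t *cp)
{
	processorid_t id = -1;

	cpu_call(cp, example_read_cpuid, (uintptr_t)&id, 0);
	return (id);
}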
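sys/kcpc.h grows a small request-list API (kcpc_reqs_init/add/fini) plus kcpc_cpu_ctx_create() for building CPC contexts that can later be handed to kcpc_program()/kcpc_unprogram(). The sketch below strings these together for a single counter event on one CPU. It is a sketch only: the event name, the treatment of non-zero returns as errors, and the interpretation of kcpc_cpu_ctx_create()'s return value as a context count are assumptions, since only the prototypes appear in this diff.

#include <sys/kcpc.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>

/*
 * Hedged sketch: build a CPC request list holding one counter event and
 * create the CPC context(s) needed to program it on the given CPU.
 */
static int
example_cu_setup(cpu_t *cp, char *event)
{
	kcpc_request_list_t *reqs;
	kcpc_ctx_t **ctx_array = NULL;
	size_t ctx_array_sz = 0;
	int nctx;

	if (!kcpc_event_supported(event))
		return (-1);

	/* Room for a single counter event request */
	if ((reqs = kcpc_reqs_init(1, KM_NOSLEEP)) == NULL)
		return (-1);

	/* No preset, flags, attributes, or requester-private pointer */
	if (kcpc_reqs_add(reqs, event, 0, 0, 0, NULL, NULL, KM_NOSLEEP) != 0) {
		(void) kcpc_reqs_fini(reqs);
		return (-1);
	}

	/*
	 * Assumed here to return the number of contexts created; more than
	 * one context would mean the events must be multiplexed onto the
	 * CPU's counters over time.
	 */
	nctx = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_array,
	    &ctx_array_sz);

	(void) kcpc_reqs_fini(reqs);

	/*
	 * The contexts in ctx_array would subsequently be passed to
	 * kcpc_program()/kcpc_unprogram() on the target CPU; teardown of the
	 * array itself is omitted from this sketch.
	 */
	return (nctx > 0 ? 0 : -1);
}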
