Diffstat (limited to 'usr/src')
48 files changed, 3759 insertions, 374 deletions
diff --git a/usr/src/cmd/cpc/common/cputrack.c b/usr/src/cmd/cpc/common/cputrack.c index 22ad2673e2..41034aef6e 100644 --- a/usr/src/cmd/cpc/common/cputrack.c +++ b/usr/src/cmd/cpc/common/cputrack.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -62,6 +62,12 @@ static const struct options *opts = (const struct options *)&__options; static cpc_t *cpc; +/* + * How many signals caught from terminal + * We bail out as soon as possible when interrupt is set + */ +static int interrupt = 0; + /*ARGSUSED*/ static void cputrack_errfn(const char *fn, int subcode, const char *fmt, va_list ap) @@ -79,6 +85,8 @@ cputrack_pctx_errfn(const char *fn, const char *fmt, va_list ap) } static int cputrack(int argc, char *argv[], int optind); +static void intr(int); + #if defined(__i386) static void p4_ht_error(void); #endif @@ -220,6 +228,19 @@ main(int argc, char *argv[]) exit(2); } + /* + * Catch signals from terminal, so they can be handled asynchronously + * when we're ready instead of when we're not (;-) + */ + if (sigset(SIGHUP, SIG_IGN) == SIG_DFL) + (void) sigset(SIGHUP, intr); + if (sigset(SIGINT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGINT, intr); + if (sigset(SIGQUIT, SIG_IGN) == SIG_DFL) + (void) sigset(SIGQUIT, intr); + (void) sigset(SIGPIPE, intr); + (void) sigset(SIGTERM, intr); + cpc_setgrp_reset(opts->master); (void) setvbuf(opts->log, NULL, _IOLBF, 0); ret = cputrack(argc, argv, optind); @@ -310,6 +331,9 @@ pinit_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreq; + if (interrupt) + return (0); + if (state->maxlwpid < lwpid) { state->sgrps = realloc(state->sgrps, lwpid * sizeof (state->sgrps)); @@ -373,6 +397,9 @@ pfini_lwp(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + set = cpc_setgrp_getset(sgrp); nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (cpc_set_sample(cpc, set, *scratch) == 0) { @@ -424,6 +451,9 @@ plwp_create(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) cpc_buf_t **data1, **data2, **scratch; int nreq; + if (interrupt) + return (0); + nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); print_sample(pid, lwpid, "lwp_create", @@ -442,6 +472,9 @@ plwp_exit(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return (0); + start = cpc_setgrp_getset(sgrp); do { nreq = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); @@ -465,6 +498,9 @@ pexec(pctx_t *pctx, pid_t pid, id_t lwpid, char *name, void *arg) cpc_buf_t **data1, **data2, **scratch; hrtime_t hrt; + if (interrupt) + return (0); + /* * Print the accumulated results from the previous program image */ @@ -505,6 +541,9 @@ pexit(pctx_t *pctx, pid_t pid, id_t lwpid, int status, void *arg) int nreq; cpc_buf_t **data1, **data2, **scratch; + if (interrupt) + return; + cpc_setgrp_reset(state->accum); start = cpc_setgrp_getset(state->accum); do { @@ -539,6 +578,9 @@ ptick(pctx_t *pctx, pid_t pid, id_t lwpid, void *arg) char *errstr; int nreqs; + if (interrupt) + return (0); + nreqs = cpc_setgrp_getbufs(sgrp, &data1, &data2, &scratch); if (opts->nsets == 1) { @@ -704,7 +746,6 @@ cputrack(int argc, char *argv[], int optind) state->accum = NULL; } } - pctx_release(pctx); return (err != 0 ? 
1 : 0); } @@ -834,3 +875,12 @@ p4_ht_error(void) } #endif /* defined(__i386) */ + +/*ARGSUSED*/ +static void +intr(int sig) +{ + interrupt++; + if (cpc != NULL) + cpc_terminate(cpc); +} diff --git a/usr/src/lib/libcpc/common/libcpc.c b/usr/src/lib/libcpc/common/libcpc.c index 5bdba39fda..9f4f6ac848 100644 --- a/usr/src/lib/libcpc/common/libcpc.c +++ b/usr/src/lib/libcpc/common/libcpc.c @@ -168,6 +168,23 @@ cpc_close(cpc_t *cpc) return (0); } +/* + * Terminate everything that runs in pctx_run + */ +void +cpc_terminate(cpc_t *cpc) +{ + cpc_set_t *csp; + int sigblocked; + + sigblocked = cpc_lock(cpc); + for (csp = cpc->cpc_sets; csp != NULL; csp = csp->cs_next) { + if (csp->cs_pctx != NULL) + pctx_terminate(csp->cs_pctx); + } + cpc_unlock(cpc, sigblocked); +} + cpc_set_t * cpc_set_create(cpc_t *cpc) { @@ -224,6 +241,14 @@ cpc_set_destroy(cpc_t *cpc, cpc_set_t *set) if (csp->cs_state != CS_UNBOUND) (void) cpc_unbind(cpc, csp); + /* + * Detach from the process + */ + if (csp->cs_pctx != NULL) { + pctx_release(csp->cs_pctx); + csp->cs_pctx = NULL; + } + for (req = csp->cs_request; req != NULL; req = next) { next = req->cr_next; diff --git a/usr/src/lib/libcpc/common/libcpc.h b/usr/src/lib/libcpc/common/libcpc.h index 384474a76c..73627345a0 100644 --- a/usr/src/lib/libcpc/common/libcpc.h +++ b/usr/src/lib/libcpc/common/libcpc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -163,6 +163,8 @@ extern void cpc_walk_attrs(cpc_t *cpc, void *arg, extern int cpc_enable(cpc_t *cpc); extern int cpc_disable(cpc_t *cpc); +extern void cpc_terminate(cpc_t *); + #if defined(__sparc) || defined(__i386) /* diff --git a/usr/src/lib/libcpc/common/mapfile-vers b/usr/src/lib/libcpc/common/mapfile-vers index 91f1689c9f..e577fc7c5e 100644 --- a/usr/src/lib/libcpc/common/mapfile-vers +++ b/usr/src/lib/libcpc/common/mapfile-vers @@ -83,6 +83,7 @@ SUNW_1.2 { SUNWprivate_1.1 { global: SUNWprivate_1.1; + cpc_terminate; local: *; }; diff --git a/usr/src/lib/libpctx/common/libpctx.c b/usr/src/lib/libpctx/common/libpctx.c index 9c28fb9b9b..f17e238322 100644 --- a/usr/src/lib/libpctx/common/libpctx.c +++ b/usr/src/lib/libpctx/common/libpctx.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains a set of generic routines for periodically * sampling the state of another process, or tree of processes. @@ -66,6 +64,7 @@ struct __pctx { int verbose; int created; int sigblocked; + int terminate; sigset_t savedset; cpc_t *cpc; }; @@ -108,6 +107,7 @@ pctx_create( pctx = calloc(1, sizeof (*pctx)); pctx->uarg = arg; pctx->verbose = verbose; + pctx->terminate = 0; pctx->errfn = errfn ? errfn : pctx_default_errfn; if ((pctx->Pr = Pcreate(filename, argv, &err, 0, 0)) == NULL) { @@ -487,6 +487,7 @@ pctx_release(pctx_t *pctx) Prelease(pctx->Pr, PRELEASE_CLEAR); pctx->Pr = NULL; } + pctx_free(pctx); bzero(pctx, sizeof (*pctx)); free(pctx); @@ -577,7 +578,7 @@ pctx_run( * exited successfully or the number of time samples has expired. * Otherwise, if an error has occurred, running becomes -1. 
*/ - while (running == 1) { + while (running == 1 && !pctx->terminate) { if (Psetrun(pctx->Pr, 0, 0) != 0) { if (pctx->verbose) @@ -609,10 +610,13 @@ pctx_run( if (nsamples != 1) nsamples--; } - } while (mswait == 0); + } while (mswait == 0 && !pctx->terminate); } - (void) Pwait(pctx->Pr, mswait); + if (pctx->terminate) + goto bailout; + else + (void) Pwait(pctx->Pr, mswait); checkstate: switch (pstate = Pstate(pctx->Pr)) { @@ -854,6 +858,9 @@ checkstate: bailout: (void) signal(SIGCHLD, sigsaved); + if (pctx->terminate) + return (0); + switch (running) { case 0: return (0); @@ -885,6 +892,7 @@ __pctx_cpc(pctx_t *pctx, cpc_t *cpc, * We store the last cpc_t used by libpctx, so that when this pctx is * destroyed, libpctx can notify libcpc. */ + if (pctx->cpc != NULL && pctx->cpc != cpc && pctx_cpc_callback != NULL) (*pctx_cpc_callback)(pctx->cpc, pctx); pctx->cpc = cpc; @@ -993,3 +1001,12 @@ __pctx_cpc_register_callback(void (*arg)(struct __cpc *, struct __pctx *)) { pctx_cpc_callback = arg; } + +/* + * Tell pctx_run to bail out immediately + */ +void +pctx_terminate(struct __pctx *pctx) +{ + pctx->terminate = 1; +} diff --git a/usr/src/lib/libpctx/common/libpctx.h b/usr/src/lib/libpctx/common/libpctx.h index 10d0fb7c7e..7cd9ffff91 100644 --- a/usr/src/lib/libpctx/common/libpctx.h +++ b/usr/src/lib/libpctx/common/libpctx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _LIBPCTX_H #define _LIBPCTX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <fcntl.h> #include <stdarg.h> @@ -67,6 +64,8 @@ typedef int pctx_init_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_fini_lwpfn_t(pctx_t *, pid_t, id_t, void *); typedef int pctx_sysc_lwp_exitfn_t(pctx_t *, pid_t, id_t, void *); +extern void pctx_terminate(pctx_t *); + typedef enum { PCTX_NULL_EVENT = 0, PCTX_SYSC_EXEC_EVENT, diff --git a/usr/src/lib/libpctx/common/mapfile-vers b/usr/src/lib/libpctx/common/mapfile-vers index 1b296817d4..e316020c8b 100644 --- a/usr/src/lib/libpctx/common/mapfile-vers +++ b/usr/src/lib/libpctx/common/mapfile-vers @@ -50,6 +50,7 @@ SUNWprivate_1.1 { global: __pctx_cpc; __pctx_cpc_register_callback; + pctx_terminate; local: *; }; diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 8ad553b07c..88ab8b3f20 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -682,6 +682,7 @@ f none usr/include/sys/bustypes.h 644 root bin f none usr/include/sys/byteorder.h 644 root bin f none usr/include/sys/callb.h 644 root bin f none usr/include/sys/callo.h 644 root bin +f none usr/include/sys/cap_util.h 644 root bin f none usr/include/sys/cpucaps.h 644 root bin f none usr/include/sys/cpucaps_impl.h 644 root bin f none usr/include/sys/ccompile.h 644 root bin diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 83b7bf34c6..974cec5d3f 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -47,6 +47,7 @@ COMMON_CORE_OBJS += \ cpu_intr.o \ cpu_pm.o \ cpupart.o \ + cap_util.o \ disp.o \ group.o \ kstat_fr.o \ diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index e6d77020a6..09e529b934 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -212,6 +212,7 @@ extern void clock_timer_init(void); extern void clock_realtime_init(void); extern void clock_highres_init(void); extern void clock_tick_mp_init(void); +extern void cu_init(void); extern void callout_mp_init(void); extern void cpu_seq_tbl_init(void); @@ -257,6 +258,7 @@ void (*mp_init_tbl[])(void) = { siron_mp_init, #endif clock_tick_mp_init, + cu_init, callout_mp_init, 0 }; diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index b2f219472d..a5f1a52e34 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -159,7 +159,6 @@ static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *, cpu_pg_t *); - /* * CMT PG ops */ @@ -583,6 +582,8 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) ASSERT(IS_CMT_PG(pg)); } + ((pghw_t *)pg)->pghw_generation++; + /* Add the CPU to the PG */ pg_cpu_add((pg_t *)pg, cp, pgdata); @@ -762,7 +763,7 @@ pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) * * cp->cpu_pg is used by the dispatcher to access the CPU's PG data * references a "bootstrap" structure across this function's invocation. - * pg_cmt_cpu_init() and the routines it calls must be careful to operate only + * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only * on the "pgdata" argument, and not cp->cpu_pg. 
*/ static void @@ -818,6 +819,8 @@ pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata) pg = (pg_cmt_t *)pgdata->cmt_lineage; while (pg != NULL) { + ((pghw_t *)pg)->pghw_generation++; + /* * Remove the PG from the CPU's load balancing lineage */ @@ -990,6 +993,11 @@ pg_cmt_cpu_active(cpu_t *cp) if (IS_CMT_PG(pg) == 0) continue; + /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); ASSERT(err == 0); @@ -1056,6 +1064,11 @@ pg_cmt_cpu_inactive(cpu_t *cp) continue; /* + * Move to the next generation since topology is changing + */ + ((pghw_t *)pg)->pghw_generation++; + + /* * Remove the CPU from the CMT PGs active CPU group * bitmap */ diff --git a/usr/src/uts/common/dtrace/dcpc.c b/usr/src/uts/common/dtrace/dcpc.c index e780d1e620..c410e65eaa 100644 --- a/usr/src/uts/common/dtrace/dcpc.c +++ b/usr/src/uts/common/dtrace/dcpc.c @@ -35,6 +35,7 @@ #include <sys/conf.h> #include <sys/kmem.h> #include <sys/kcpc.h> +#include <sys/cap_util.h> #include <sys/cpc_pcbe.h> #include <sys/cpc_impl.h> #include <sys/dtrace_impl.h> @@ -463,8 +464,7 @@ dcpc_program_cpu_event(cpu_t *c) set = dcpc_create_set(c); - octx = NULL; - set->ks_ctx = ctx = kcpc_ctx_alloc(); + set->ks_ctx = ctx = kcpc_ctx_alloc(KM_SLEEP); ctx->kc_set = set; ctx->kc_cpuid = c->cpu_id; @@ -489,11 +489,9 @@ dcpc_program_cpu_event(cpu_t *c) * If we already have an active enabling then save the current cpc * context away. */ - if (c->cpu_cpc_ctx != NULL) - octx = c->cpu_cpc_ctx; + octx = c->cpu_cpc_ctx; - c->cpu_cpc_ctx = ctx; - kcpc_remote_program(c); + kcpc_cpu_program(c, ctx); if (octx != NULL) { kcpc_set_t *oset = octx->kc_set; @@ -528,9 +526,14 @@ dcpc_disable_cpu(cpu_t *c) if (c->cpu_flags & CPU_OFFLINE) return; - kcpc_remote_stop(c); - + /* + * Grab CPUs CPC context before kcpc_cpu_stop() stops counters and + * changes it. + */ ctx = c->cpu_cpc_ctx; + + kcpc_cpu_stop(c, B_FALSE); + set = ctx->kc_set; kcpc_free_configs(set); @@ -538,7 +541,6 @@ dcpc_disable_cpu(cpu_t *c) kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t)); kcpc_free_set(set); kcpc_ctx_free(ctx); - c->cpu_cpc_ctx = NULL; } /* @@ -615,8 +617,21 @@ dcpc_program_event(dcpc_probe_t *pp) if (c->cpu_flags & CPU_OFFLINE) continue; + /* + * Stop counters but preserve existing DTrace CPC context + * if there is one. + * + * If we come here when the first event is programmed for a CPU, + * there should be no DTrace CPC context installed. In this + * case, kcpc_cpu_stop() will ensure that there is no other + * context on the CPU. + * + * If we add new enabling to the original one, the CPU should + * have the old DTrace CPC context which we need to keep around + * since dcpc_program_event() will add to it. + */ if (c->cpu_cpc_ctx != NULL) - kcpc_remote_stop(c); + kcpc_cpu_stop(c, B_TRUE); } while ((c = c->cpu_next) != cpu_list); dcpc_release_interrupts(); @@ -708,6 +723,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) ASSERT(pp->dcpc_actv_req_idx >= 0); /* + * DTrace is taking over CPC contexts, so stop collecting + * capacity/utilization data for all CPUs. 
+ */ + if (dtrace_cpc_in_use == 1) + cu_disable(); + + /* * The following must hold true if we are to (attempt to) enable * this request: * @@ -758,7 +780,7 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) if (c->cpu_flags & CPU_OFFLINE) continue; - kcpc_remote_program(c); + kcpc_cpu_program(c, c->cpu_cpc_ctx); } while ((c = c->cpu_next) != cpu_list); } @@ -766,6 +788,13 @@ dcpc_enable(void *arg, dtrace_id_t id, void *parg) dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL; pp->dcpc_actv_req_idx = pp->dcpc_picno = -1; + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU. + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); + return (-1); } @@ -841,6 +870,13 @@ dcpc_disable(void *arg, dtrace_id_t id, void *parg) dtrace_cpc_in_use--; pp->dcpc_enabled = 0; pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1; + + /* + * If all probes are removed, enable capacity/utilization data + * collection for every CPU + */ + if (dtrace_cpc_in_use == 0) + cu_enable(); } /*ARGSUSED*/ @@ -891,7 +927,6 @@ dcpc_cpu_setup(cpu_setup_t what, processorid_t cpu, void *arg) */ if (dtrace_cpc_in_use) { c = cpu_get(cpu); - (void) dcpc_program_cpu_event(c); } break; diff --git a/usr/src/uts/common/io/cpc.c b/usr/src/uts/common/io/cpc.c index 6881380251..0b003c3ee1 100644 --- a/usr/src/uts/common/io/cpc.c +++ b/usr/src/uts/common/io/cpc.c @@ -942,49 +942,19 @@ static struct modlinkage modl = { #endif }; -static void -kcpc_init(void) -{ - long hash; - - rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_init(&kcpc_ctx_llock[hash], - NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); -} - -static void -kcpc_fini(void) -{ - long hash; - - for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) - mutex_destroy(&kcpc_ctx_llock[hash]); - rw_destroy(&kcpc_cpuctx_lock); -} - int _init(void) { - int ret; - - if (kcpc_hw_load_pcbe() != 0) + if (kcpc_init() != 0) return (ENOTSUP); - kcpc_init(); - if ((ret = mod_install(&modl)) != 0) - kcpc_fini(); - return (ret); + return (mod_install(&modl)); } int _fini(void) { - int ret; - - if ((ret = mod_remove(&modl)) == 0) - kcpc_fini(); - return (ret); + return (mod_remove(&modl)); } int diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c new file mode 100644 index 0000000000..16ff7f45fd --- /dev/null +++ b/usr/src/uts/common/os/cap_util.c @@ -0,0 +1,1652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Support for determining capacity and utilization of performance relevant + * hardware components in a computer + * + * THEORY + * ------ + * The capacity and utilization of the performance relevant hardware components + * is needed to be able to optimize performance while minimizing the amount of + * power used on a system. The idea is to use hardware performance counters + * and potentially other means to determine the capacity and utilization of + * performance relevant hardware components (eg. execution pipeline, cache, + * memory, etc.) and attribute the utilization to the responsible CPU and the + * thread running there. + * + * This will help characterize the utilization of performance relevant + * components and how much is used by each CPU and each thread. With + * that data, the utilization can be aggregated to all the CPUs sharing each + * performance relevant hardware component to calculate the total utilization + * of each component and compare that with the component's capacity to + * essentially determine the actual hardware load of the component. The + * hardware utilization attributed to each running thread can also be + * aggregated to determine the total hardware utilization of each component to + * a workload. + * + * Once that is done, one can determine how much of each performance relevant + * hardware component is needed by a given thread or set of threads (eg. a + * workload) and size up exactly what hardware is needed by the threads and how + * much. With this info, we can better place threads among CPUs to match their + * exact hardware resource needs and potentially lower or raise the power based + * on their utilization or pack threads onto the fewest hardware components + * needed and power off any remaining unused components to minimize power + * without sacrificing performance. + * + * IMPLEMENTATION + * -------------- + * The code has been designed and implemented to make (un)programming and + * reading the counters for a given CPU as lightweight and fast as possible. + * This is very important because we need to read and potentially (un)program + * the counters very often and in performance sensitive code. Specifically, + * the counters may need to be (un)programmed during context switch and/or a + * cyclic handler when there are more counter events to count than existing + * counters. + * + * Consequently, the code has been split up to allow allocating and + * initializing everything needed to program and read the counters on a given + * CPU once and make (un)programming and reading the counters for a given CPU + * not have to allocate/free memory or grab any locks. To do this, all the + * state needed to (un)program and read the counters on a CPU is kept per CPU + * and is made lock free by forcing any code that reads or manipulates the + * counters or the state needed to (un)program or read the counters to run on + * the target CPU and disable preemption while running on the target CPU to + * protect any critical sections. All counter manipulation on the target CPU is + * happening either from a cross-call to the target CPU or at the same PIL as + * used by the cross-call subsystem. This guarantees that counter manipulation + * is not interrupted by cross-calls from other CPUs. 
+ * + * The synchronization has been made lock free or as simple as possible for + * performance and to avoid getting the locking all tangled up when we interpose + * on the CPC routines that (un)program the counters to manage the counters + * between the kernel and user on each CPU. When the user starts using the + * counters on a given CPU, the kernel will unprogram the counters that it is + * using on that CPU just before they are programmed for the user. Then the + * kernel will program the counters on a given CPU for its own use when the user + * stops using them. + * + * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc + * enables any probe, it requests to disable and unprogram all counters used for + * capacity and utilizations. These counters are never re-programmed back until + * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU + * framework and it re-programs the counters. + * + * When a CPU is going offline, its CU counters are unprogrammed and disabled, + * so that they would not be re-programmed again by some other activity on the + * CPU that is going offline. + * + * The counters are programmed during boot. However, a flag is available to + * disable this if necessary (see cu_flag below). A handler is provided to + * (un)program the counters during CPU on/offline. Basic routines are provided + * to initialize and tear down this module, initialize and tear down any state + * needed for a given CPU, and (un)program the counters for a given CPU. + * Lastly, a handler is provided to read the counters and attribute the + * utilization to the responsible CPU. + */ +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/ddi.h> +#include <sys/disp.h> +#include <sys/sdt.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/pghw.h> +#include <sys/cmt.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> + +#include <sys/archsystm.h> +#include <sys/promif.h> + +#if defined(__x86) +#include <sys/xc_levels.h> +#endif + + +/* + * Default CPU hardware performance counter flags to use for measuring capacity + * and utilization + */ +#define CU_CPC_FLAGS_DEFAULT \ + (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT) + +/* + * Possible Flags for controlling this module. + */ +#define CU_FLAG_ENABLE 1 /* Enable module */ +#define CU_FLAG_READY 2 /* Ready to setup module */ +#define CU_FLAG_ON 4 /* Module is on */ + +/* + * pg_cpu kstats calculate utilization rate and maximum utilization rate for + * some CPUs. The rate is calculated based on data from two subsequent + * snapshots. When the time between such two snapshots is too small, the + * resulting rate may have low accuracy, so we only consider snapshots which + * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not + * update the rate if the interval is smaller than that. + * + * Use one tenth of a second as the minimum interval for utilization rate + * calculation. + * + * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in + * the CU_RATE() macro below to guarantee that we never divide by zero. + * + * Rate is the number of events per second. The rate is the number of events + * divided by time and multiplied by the number of nanoseconds in a second. We + * do not want time to be too small since it will cause large errors in + * division. + * + * We do not want to multiply two large numbers (the instruction count and + * NANOSEC) either since it may cause integer overflow. 
So we divide both the + * numerator and the denominator by the same value. + * + * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN + * above to guarantee that time divided by this value is always non-zero. + */ +#define CU_RATE(val, time) \ + (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE)) + +#define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10) + +#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000) + +/* + * When the time between two kstat reads for the same CPU is less than + * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values + * for the CPU. This helps reduce cross-calls when kstat consumers read data + * very often or when they read PG utilization data and then CPU utilization + * data quickly after that. + */ +#define CU_UPDATE_THRESHOLD (NANOSEC / 10) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +typedef void (*cu_cpu_func_t)(uintptr_t, int *); + + +/* + * Flags to use for programming CPU hardware performance counters to measure + * capacity and utilization + */ +int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT; + +/* + * Initial value used for programming hardware counters + */ +uint64_t cu_cpc_preset_value = 0; + +/* + * List of CPC event requests for capacity and utilization. + */ +static kcpc_request_list_t *cu_cpc_reqs = NULL; + +/* + * When a CPU is a member of PG with a sharing relationship that is supported + * by the capacity/utilization framework, a kstat is created for that CPU and + * sharing relationship. + * + * These kstats are updated one at a time, so we can have a single scratch + * space to fill the data. + * + * CPU counter kstats fields: + * + * cu_cpu_id CPU ID for this kstat + * + * cu_generation Generation value that increases whenever any CPU goes + * offline or online. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation. + * + * cu_pg_id PG ID for the relationship described by this kstat + * + * cu_cpu_util Running value of CPU utilization for the sharing + * relationship + * + * cu_cpu_time_running Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * cu_cpu_time_stopped Total time the CU counters were stopped. + * + * cu_cpu_rate Utilization rate, expressed in operations per second. + * + * cu_cpu_rate_max Maximum observed value of utilization rate. 
+ */ +struct cu_cpu_kstat { + kstat_named_t cu_cpu_id; + kstat_named_t cu_generation; + kstat_named_t cu_pg_id; + kstat_named_t cu_cpu_util; + kstat_named_t cu_cpu_time_running; + kstat_named_t cu_cpu_time_stopped; + kstat_named_t cu_cpu_rate; + kstat_named_t cu_cpu_rate_max; +} cu_cpu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "pg_id", KSTAT_DATA_LONG }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, +}; + +/* + * Flags for controlling this module + */ +uint_t cu_flags = CU_FLAG_ENABLE; + +/* + * Error return value for cu_init() since it can't return anything to be called + * from mp_init_tbl[] (:-( + */ +static int cu_init_error = 0; + +hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN; + +hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD; + +static kmutex_t pg_cpu_kstat_lock; + + +/* + * Forward declaration of interface routines + */ +void cu_disable(void); +void cu_enable(void); +void cu_init(void); +void cu_cpc_program(cpu_t *cp, int *err); +void cu_cpc_unprogram(cpu_t *cp, int *err); +int cu_cpu_update(struct cpu *cp, boolean_t move_to); +void cu_pg_update(pghw_t *pg); + + +/* + * Forward declaration of private routines + */ +static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs); +static void cu_cpc_program_xcall(uintptr_t arg, int *err); +static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, + int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents); +static int cu_cpu_callback(cpu_setup_t what, int id, void *arg); +static void cu_cpu_disable(cpu_t *cp); +static void cu_cpu_enable(cpu_t *cp); +static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs); +static int cu_cpu_fini(cpu_t *cp); +static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info); +static int cu_cpu_kstat_update(kstat_t *ksp, int rw); +static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg); +static int cu_cpu_update_stats(cu_cntr_stats_t *stats, + uint64_t cntr_value); +static void cu_cpu_info_detach_xcall(void); + +/* + * Disable or enable Capacity Utilization counters on all CPUs. 
+ */ +void +cu_disable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_disable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +void +cu_enable(void) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp = cpu_active; + do { + if (!(cp->cpu_flags & CPU_OFFLINE)) + cu_cpu_enable(cp); + } while ((cp = cp->cpu_next_onln) != cpu_active); +} + + +/* + * Setup capacity and utilization support + */ +void +cu_init(void) +{ + cpu_t *cp; + + cu_init_error = 0; + if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) { + cu_init_error = -1; + return; + } + + if (kcpc_init() != 0) { + cu_init_error = -2; + return; + } + + /* + * Can't measure hardware capacity and utilization without CPU + * hardware performance counters + */ + if (cpc_ncounters <= 0) { + cu_init_error = -3; + return; + } + + /* + * Setup CPC event request queue + */ + cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP); + + mutex_enter(&cpu_lock); + + /* + * Mark flags to say that module is ready to be setup + */ + cu_flags |= CU_FLAG_READY; + + cp = cpu_active; + do { + /* + * Allocate and setup state needed to measure capacity and + * utilization + */ + if (cu_cpu_init(cp, cu_cpc_reqs) != 0) + cu_init_error = -5; + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Mark flags to say that module is on now and counters are ready to be + * programmed on all active CPUs + */ + cu_flags |= CU_FLAG_ON; + + /* + * Program counters on currently active CPUs + */ + cp = cpu_active; + do { + if (cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_FALSE) != 0) + cu_init_error = -6; + + cp = cp->cpu_next_onln; + } while (cp != cpu_active); + + /* + * Register callback for CPU state changes to enable and disable + * CPC counters as CPUs come on and offline + */ + register_cpu_setup_func(cu_cpu_callback, NULL); + + mutex_exit(&cpu_lock); +} + + +/* + * Return number of counter events needed to measure capacity and utilization + * for specified CPU and fill in list of CPC requests with each counter event + * needed if list where to add CPC requests is given + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated if any memory + * allocation fails + */ +static int +cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + group_t *cmt_pgs; + cu_cntr_info_t **cntr_info_array; + cpu_pg_t *cpu_pgs; + cu_cpu_info_t *cu_cpu_info; + pg_cmt_t *pg_cmt; + pghw_t *pg_hw; + cu_cntr_stats_t *stats; + int nevents; + pghw_type_t pg_hw_type; + group_iter_t iter; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * There has to be a target CPU for this + */ + if (cp == NULL) + return (-1); + + /* + * Return 0 when CPU doesn't belong to any group + */ + cpu_pgs = cp->cpu_pg; + if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1) + return (0); + + cmt_pgs = &cpu_pgs->cmt_pgs; + cu_cpu_info = cp->cpu_cu_info; + + /* + * Grab counter statistics and info + */ + if (reqs == NULL) { + stats = NULL; + cntr_info_array = NULL; + } else { + if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL) + return (-2); + + stats = cu_cpu_info->cu_cntr_stats; + cntr_info_array = cu_cpu_info->cu_cntr_info; + } + + /* + * See whether platform (or processor) specific code knows which CPC + * events to 
request, etc. are needed to measure hardware capacity and + * utilization on this machine + */ + nevents = cu_plat_cpc_init(cp, reqs, nreqs); + if (nevents >= 0) + return (nevents); + + /* + * Let common code decide which CPC events to request, etc. to measure + * capacity and utilization since platform (or processor) specific does + * not know.... + * + * Walk CPU's PG lineage and do following: + * + * - Setup CPC request, counter info, and stats needed for each counter + * event to measure capacity and and utilization for each of CPU's PG + * hardware sharing relationships + * + * - Create PG CPU kstats to export capacity and utilization for each PG + */ + nevents = 0; + group_iter_init(&iter); + while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) { + cu_cntr_info_t *cntr_info; + int nevents_save; + int nstats; + + pg_hw = (pghw_t *)pg_cmt; + pg_hw_type = pg_hw->pghw_hw; + nevents_save = nevents; + nstats = 0; + + switch (pg_hw_type) { + case PGHW_IPIPE: + if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + case PGHW_FPU: + if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats, + KM_NOSLEEP, &nevents) != 0) + continue; + nstats = 1; + break; + + default: + /* + * Don't measure capacity and utilization for this kind + * of PG hardware relationship so skip to next PG in + * CPU's PG lineage + */ + continue; + } + + cntr_info = cntr_info_array[pg_hw_type]; + + /* + * Nothing to measure for this hardware sharing relationship + */ + if (nevents - nevents_save == 0) { + if (cntr_info != NULL) + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + cntr_info_array[pg_hw_type] = NULL; + continue; + } + + /* + * Fill in counter info for this PG hardware relationship + */ + if (cntr_info == NULL) { + cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t), + KM_NOSLEEP); + if (cntr_info == NULL) + continue; + cntr_info_array[pg_hw_type] = cntr_info; + } + cntr_info->ci_cpu = cp; + cntr_info->ci_pg = pg_hw; + cntr_info->ci_stats = &stats[nevents_save]; + cntr_info->ci_nstats = nstats; + + /* + * Create PG CPU kstats for this hardware relationship + */ + cu_cpu_kstat_create(pg_hw, cntr_info); + } + + return (nevents); +} + + +/* + * Program counters for capacity and utilization on given CPU + * + * If any of the following conditions is true, the counters are not programmed: + * + * - CU framework is disabled + * - The cpu_cu_info field of the cpu structure is NULL + * - DTrace is active + * - Counters are programmed already + * - Counters are disabled (by calls to cu_cpu_disable()) + */ +void +cu_cpc_program(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU. We disable preemption to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not ready to program counters + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + if (cp == NULL) { + *err = -2; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * If DTrace CPC is active or counters turned on already or are + * disabled, just return. 
+ */ + if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) || + cu_cpu_info->cu_disabled) { + *err = 1; + kpreempt_enable(); + return; + } + + if ((CPU->cpu_cpc_ctx != NULL) && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -4; + kpreempt_enable(); + return; + } + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + ASSERT(cpu_ctx != NULL); + ASSERT(cpu_ctx->nctx >= 0); + + ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0); + ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz); + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + + /* + * Increment index in CPU's CPC context info to point at next context + * to program + * + * NOTE: Do this now instead of after programming counters to ensure + * that index will always point at *current* context so we will + * always be able to unprogram *current* context if necessary + */ + cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx; + + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC + * context before programming counters + * + * Context is marked with KCPC_CTX_INVALID_STOPPED when context is + * unprogrammed and may be marked with KCPC_CTX_INVALID when + * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to + * invalidate all CPC contexts before they take over all the counters. + * + * This isn't necessary since these flags are only used for thread bound + * CPC contexts not CPU bound CPC contexts like ones used for capacity + * and utilization. + * + * There is no need to protect the flag update since no one is using + * this context now. + */ + ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); + + /* + * Program counters on this CPU + */ + kcpc_program(ctx, B_FALSE, B_FALSE); + + cp->cpu_cpc_ctx = ctx; + + /* + * Set state in CPU structure to say that CPU's counters are programmed + * for capacity and utilization now and that they are transitioning from + * off to on state. This will cause cu_cpu_update to update stop times + * for all programmed counters. + */ + cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON; + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Cross call wrapper routine for cu_cpc_program() + * + * Checks to make sure that counters on CPU aren't being used by someone else + * before calling cu_cpc_program() since cu_cpc_program() needs to assert that + * nobody else is using the counters to catch and prevent any broken code. + * Also, this check needs to happen on the target CPU since the CPU's CPC + * context can only be changed while running on the CPU. + * + * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is + * no valid thread bound cpc context. This is important to check to prevent + * re-programming thread counters with CU counters when CPU is coming on-line. 
+ */ +static void +cu_cpc_program_xcall(uintptr_t arg, int *err) +{ + boolean_t avoid_thread_context = (boolean_t)arg; + + kpreempt_disable(); + + if (CPU->cpu_cpc_ctx != NULL && + !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -100; + kpreempt_enable(); + return; + } + + if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) && + !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { + *err = -200; + kpreempt_enable(); + return; + } + + cu_cpc_program(CPU, err); + kpreempt_enable(); +} + + +/* + * Unprogram counters for capacity and utilization on given CPU + * This function should be always executed on the target CPU at high PIL + */ +void +cu_cpc_unprogram(cpu_t *cp, int *err) +{ + cu_cpc_ctx_t *cpu_ctx; + kcpc_ctx_t *ctx; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + /* + * Should be running on given CPU with preemption disabled to keep CPU + * from disappearing and make sure flags and CPC context don't change + * from underneath us + */ + kpreempt_disable(); + ASSERT(cp == CPU); + + /* + * Module not on + */ + if (!(cu_flags & CU_FLAG_ON)) { + *err = -1; + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + *err = -3; + kpreempt_enable(); + return; + } + + /* + * Counters turned off already + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) { + *err = 1; + kpreempt_enable(); + return; + } + + /* + * Update counter statistics + */ + (void) cu_cpu_update(cp, B_FALSE); + + /* + * Get CPU's CPC context needed for capacity and utilization + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || + cpu_ctx->ctx_ptr_array_sz <= 0) { + *err = -5; + kpreempt_enable(); + return; + } + ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; + + /* + * CPU's CPC context should be current capacity and utilization CPC + * context + */ + ASSERT(cp->cpu_cpc_ctx == ctx); + if (cp->cpu_cpc_ctx != ctx) { + *err = -6; + kpreempt_enable(); + return; + } + + /* + * Unprogram counters on CPU. 
+ */ + kcpc_unprogram(ctx, B_FALSE); + + ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); + + /* + * Unset state in CPU structure saying that CPU's counters are + * programmed + */ + cp->cpu_cpc_ctx = NULL; + cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON; + + *err = 0; + kpreempt_enable(); +} + + +/* + * Add given counter event to list of CPC requests + */ +static int +cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs, + cu_cntr_stats_t *stats, int kmem_flags, int *nevents) +{ + int n; + int retval; + uint_t flags; + + /* + * Return error when no counter event specified, counter event not + * supported by CPC's PCBE, or number of events not given + */ + if (event == NULL || kcpc_event_supported(event) == B_FALSE || + nevents == NULL) + return (-1); + + n = *nevents; + + /* + * Only count number of counter events needed if list + * where to add CPC requests not given + */ + if (reqs == NULL) { + n++; + *nevents = n; + return (-3); + } + + /* + * Return error when stats not given or not enough room on list of CPC + * requests for more counter events + */ + if (stats == NULL || (nreqs <= 0 && n >= nreqs)) + return (-4); + + /* + * Use flags in cu_cpc_flags to program counters and enable overflow + * interrupts/traps (unless PCBE can't handle overflow interrupts) so + * PCBE can catch counters before they wrap to hopefully give us an + * accurate (64-bit) virtualized counter + */ + flags = cu_cpc_flags; + if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0) + flags &= ~CPC_OVF_NOTIFY_EMT; + + /* + * Add CPC request to list + */ + retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value, + flags, 0, NULL, &stats[n], kmem_flags); + + if (retval != 0) + return (-5); + + n++; + *nevents = n; + return (0); +} + +static void +cu_cpu_info_detach_xcall(void) +{ + ASSERT(IS_HIPIL()); + + CPU->cpu_cu_info = NULL; +} + + +/* + * Enable or disable collection of capacity/utilization data for a current CPU. + * Counters are enabled if 'on' argument is True and disabled if it is False. + * This function should be always executed at high PIL + */ +static void +cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2) +{ + cpu_t *cp = (cpu_t *)arg1; + boolean_t on = (boolean_t)arg2; + int error; + cu_cpu_info_t *cu_cpu_info; + + ASSERT(IS_HIPIL()); + kpreempt_disable(); + ASSERT(cp == CPU); + + if (!(cu_flags & CU_FLAG_ON)) { + kpreempt_enable(); + return; + } + + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) { + kpreempt_enable(); + return; + } + + ASSERT(!cu_cpu_info->cu_disabled || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + + if (on) { + /* + * Decrement the cu_disabled counter. + * Once it drops to zero, call cu_cpc_program. + */ + if (cu_cpu_info->cu_disabled > 0) + cu_cpu_info->cu_disabled--; + if (cu_cpu_info->cu_disabled == 0) + cu_cpc_program(CPU, &error); + } else if (cu_cpu_info->cu_disabled++ == 0) { + /* + * This is the first attempt to disable CU, so turn it off + */ + cu_cpc_unprogram(cp, &error); + ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); + } + + kpreempt_enable(); +} + + +/* + * Callback for changes in CPU states + * Used to enable or disable hardware performance counters on CPUs that are + * turned on or off + * + * NOTE: cpc should be programmed/unprogrammed while running on the target CPU. + * We have to use thread_affinity_set to hop to the right CPU because these + * routines expect cpu_lock held, so we can't cross-call other CPUs while + * holding CPU lock. 
+ */ +static int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_cpu_callback(cpu_setup_t what, int id, void *arg) +{ + cpu_t *cp; + int retval = 0; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!(cu_flags & CU_FLAG_ON)) + return (-1); + + cp = cpu_get(id); + if (cp == NULL) + return (-2); + + switch (what) { + case CPU_ON: + /* + * Setup counters on CPU being turned on + */ + retval = cu_cpu_init(cp, cu_cpc_reqs); + + /* + * Reset list of counter event requests so its space can be + * reused for a different set of requests for next CPU + */ + (void) kcpc_reqs_reset(cu_cpc_reqs); + break; + case CPU_INTR_ON: + /* + * Setup counters on CPU being turned on. + */ + retval = cu_cpu_run(cp, cu_cpc_program_xcall, + (uintptr_t)B_TRUE); + break; + case CPU_OFF: + /* + * Disable counters on CPU being turned off. Counters will not + * be re-enabled on this CPU until it comes back online. + */ + cu_cpu_disable(cp); + ASSERT(!CU_CPC_ON(cp)); + retval = cu_cpu_fini(cp); + break; + default: + break; + } + return (retval); +} + + +/* + * Disable or enable Capacity Utilization counters on a given CPU. This function + * can be called from any CPU to disable counters on the given CPU. + */ +static void +cu_cpu_disable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE); +} + + +static void +cu_cpu_enable(cpu_t *cp) +{ + cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE); +} + + +/* + * Setup capacity and utilization support for given CPU + * + * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free + * everything that has been successfully allocated including cpu_cu_info + * if any memory allocation fails + */ +static int +cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs) +{ + kcpc_ctx_t **ctx_ptr_array; + size_t ctx_ptr_array_sz; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int n; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_fini}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Return if not ready to setup counters yet + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + if (cp->cpu_cu_info == NULL) { + cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t), + KM_NOSLEEP); + if (cp->cpu_cu_info == NULL) + return (-2); + } + + /* + * Get capacity and utilization CPC context for CPU and check to see + * whether it has been setup already + */ + cu_cpu_info = cp->cpu_cu_info; + cu_cpu_info->cu_cpu = cp; + cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0; + + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + return (1); + } + + /* + * Should have no contexts since it hasn't been setup already + */ + ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL && + cpu_ctx->ctx_ptr_array_sz == 0); + + /* + * Determine how many CPC events needed to measure capacity and + * utilization for this CPU, allocate space for counter statistics for + * each event, and fill in list of CPC event requests with corresponding + * counter stats for each request to make attributing counter data + * easier later.... 
+ */ + n = cu_cpc_init(cp, NULL, 0); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-3); + } + + cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t), + KM_NOSLEEP); + if (cu_cpu_info->cu_cntr_stats == NULL) { + (void) cu_cpu_fini(cp); + return (-4); + } + + cu_cpu_info->cu_ncntr_stats = n; + + n = cu_cpc_init(cp, reqs, n); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-5); + } + + /* + * Create CPC context with given requests + */ + ctx_ptr_array = NULL; + ctx_ptr_array_sz = 0; + n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array, + &ctx_ptr_array_sz); + if (n <= 0) { + (void) cu_cpu_fini(cp); + return (-6); + } + + /* + * Should have contexts + */ + ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0); + if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) { + (void) cu_cpu_fini(cp); + return (-7); + } + + /* + * Fill in CPC context info for CPU needed for capacity and utilization + */ + cpu_ctx->cur_index = 0; + cpu_ctx->nctx = n; + cpu_ctx->ctx_ptr_array = ctx_ptr_array; + cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz; + return (0); +} + +/* + * Tear down capacity and utilization support for given CPU + */ +static int +cu_cpu_fini(cpu_t *cp) +{ + kcpc_ctx_t *ctx; + cu_cpc_ctx_t *cpu_ctx; + cu_cpu_info_t *cu_cpu_info; + int i; + pghw_type_t pg_hw_type; + + /* + * cpu_lock should be held and protect against CPU going away and races + * with cu_{init,fini,cpu_init}() + */ + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Have to at least be ready to setup counters to have allocated + * anything that needs to be deallocated now + */ + if (!(cu_flags & CU_FLAG_READY)) + return (-1); + + /* + * Nothing to do if CPU's capacity and utilization info doesn't exist + */ + cu_cpu_info = cp->cpu_cu_info; + if (cu_cpu_info == NULL) + return (1); + + /* + * Tear down any existing kstats and counter info for each hardware + * sharing relationship + */ + for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS; + pg_hw_type++) { + cu_cntr_info_t *cntr_info; + + cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type]; + if (cntr_info == NULL) + continue; + + if (cntr_info->ci_kstat != NULL) { + kstat_delete(cntr_info->ci_kstat); + cntr_info->ci_kstat = NULL; + } + kmem_free(cntr_info, sizeof (cu_cntr_info_t)); + } + + /* + * Free counter statistics for CPU + */ + ASSERT(cu_cpu_info->cu_cntr_stats == NULL || + cu_cpu_info->cu_ncntr_stats > 0); + if (cu_cpu_info->cu_cntr_stats != NULL && + cu_cpu_info->cu_ncntr_stats > 0) { + kmem_free(cu_cpu_info->cu_cntr_stats, + cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t)); + cu_cpu_info->cu_cntr_stats = NULL; + cu_cpu_info->cu_ncntr_stats = 0; + } + + /* + * Get capacity and utilization CPC contexts for given CPU and check to + * see whether they have been freed already + */ + cpu_ctx = &cu_cpu_info->cu_cpc_ctx; + if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL && + cpu_ctx->ctx_ptr_array_sz > 0) { + /* + * Free CPC contexts for given CPU + */ + for (i = 0; i < cpu_ctx->nctx; i++) { + ctx = cpu_ctx->ctx_ptr_array[i]; + if (ctx == NULL) + continue; + kcpc_free(ctx, 0); + } + + /* + * Free CPC context pointer array + */ + kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz); + + /* + * Zero CPC info for CPU + */ + bzero(cpu_ctx, sizeof (cu_cpc_ctx_t)); + } + + /* + * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure + * that no one is going to access the cpu_cu_info whicch we are going to + * free. 
+ */ + if (cpu_is_online(cp)) + cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0); + else + cp->cpu_cu_info = NULL; + + /* + * Free CPU's capacity and utilization info + */ + kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t)); + + return (0); +} + +/* + * Create capacity & utilization kstats for given PG CPU hardware sharing + * relationship + */ +static void +cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info) +{ + char *class, *sh_name; + kstat_t *ks; + + /* + * Just return when no counter info or CPU + */ + if (cntr_info == NULL || cntr_info->ci_cpu == NULL) + return; + + /* + * Get the class name from the leaf PG that this CPU belongs to. + * If there are no PGs, just use the default class "cpu". + */ + class = pg ? pghw_type_string(pg->pghw_hw) : "cpu"; + sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu"; + + if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id, + sh_name, class, KSTAT_TYPE_NAMED, + sizeof (cu_cpu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL) + return; + + ks->ks_lock = &pg_cpu_kstat_lock; + ks->ks_data = &cu_cpu_kstat; + ks->ks_update = cu_cpu_kstat_update; + + ks->ks_private = cntr_info; + cntr_info->ci_kstat = ks; + kstat_install(cntr_info->ci_kstat); +} + + +/* + * Propagate values from CPU capacity & utilization stats to kstats + */ +static int +cu_cpu_kstat_update(kstat_t *ksp, int rw) +{ + cpu_t *cp; + cu_cntr_info_t *cntr_info = ksp->ks_private; + struct cu_cpu_kstat *kstat = &cu_cpu_kstat; + pghw_t *pg; + cu_cntr_stats_t *stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpreempt_disable(); + + /* + * Update capacity and utilization statistics needed for CPU's PG (CPU) + * kstats + */ + cp = cntr_info->ci_cpu; + (void) cu_cpu_update(cp, B_TRUE); + + pg = cntr_info->ci_pg; + stats = cntr_info->ci_stats; + kstat->cu_cpu_id.value.ui32 = cp->cpu_id; + kstat->cu_generation.value.ui32 = cp->cpu_generation; + if (pg == NULL) + kstat->cu_pg_id.value.l = -1; + else + kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id; + + kstat->cu_cpu_util.value.ui64 = stats->cs_value_total; + kstat->cu_cpu_rate.value.ui64 = stats->cs_rate; + kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max; + kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running; + kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped; + /* + * Counters are stopped now, so the cs_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + kstat->cu_cpu_time_stopped.value.ui64 += + gethrtime() - stats->cs_time_start; + + kpreempt_enable(); + + return (0); +} + +/* + * Run specified function with specified argument on a given CPU and return + * whatever the function returns + */ +static int +cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg) +{ + int error = 0; + + /* + * cpu_call() will call func on the CPU specified with given argument + * and return func's return value in last argument + */ + cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error); + return (error); +} + + +/* + * Update counter statistics on a given CPU. + * + * If move_to argument is True, execute the function on the CPU specified + * Otherwise, assume that it is already runninng on the right CPU + * + * If move_to is specified, the caller should hold cpu_lock or have preemption + * disabled. Otherwise it is up to the caller to guarantee that things do not + * change in the process. 
+ */ +int +cu_cpu_update(struct cpu *cp, boolean_t move_to) +{ + int retval; + cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info; + hrtime_t time_snap; + + ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0); + + /* + * Nothing to do if counters are not programmed + */ + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL) || + !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + return (0); + + /* + * Don't update CPU statistics if it was updated recently + * and provide old results instead + */ + time_snap = gethrtime(); + if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) { + DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp); + return (0); + } + + cu_cpu_info->cu_sample_time = time_snap; + + /* + * CPC counter should be read on the CPU that is running the counter. We + * either have to move ourselves to the target CPU or insure that we + * already run there. + * + * We use cross-call to the target CPU to execute kcpc_read() and + * cu_cpu_update_stats() there. + */ + retval = 0; + if (move_to) + (void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read, + (uintptr_t)cu_cpu_update_stats); + else { + retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats); + /* + * Offset negative return value by -10 so we can distinguish it + * from error return values of this routine vs kcpc_read() + */ + if (retval < 0) + retval -= 10; + } + + return (retval); +} + + +/* + * Update CPU counter statistics for current CPU. + * This function may be called from a cross-call + */ +static int +cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value) +{ + cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info; + uint_t flags; + uint64_t delta; + hrtime_t time_delta; + hrtime_t time_snap; + + if (stats == NULL) + return (-1); + + /* + * Nothing to do if counters are not programmed. This should not happen, + * but we check just in case. + */ + ASSERT(cu_flags & CU_FLAG_ON); + ASSERT(cu_cpu_info != NULL); + if (!(cu_flags & CU_FLAG_ON) || + (cu_cpu_info == NULL)) + return (-2); + + flags = cu_cpu_info->cu_flag; + ASSERT(flags & CU_CPU_CNTRS_ON); + if (!(flags & CU_CPU_CNTRS_ON)) + return (-2); + + /* + * Take snapshot of high resolution timer + */ + time_snap = gethrtime(); + + /* + * CU counters have just been programmed. We cannot assume that the new + * cntr_value continues from where we left off, so use the cntr_value as + * the new initial value. + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_value_start = cntr_value; + + /* + * Calculate delta in counter values between start of sampling period + * and now + */ + delta = cntr_value - stats->cs_value_start; + + /* + * Calculate time between start of sampling period and now + */ + time_delta = stats->cs_time_start ? + time_snap - stats->cs_time_start : + 0; + stats->cs_time_start = time_snap; + stats->cs_value_start = cntr_value; + + if (time_delta > 0) { /* wrap shouldn't happen */ + /* + * Update either running or stopped time based on the transition + * state + */ + if (flags & CU_CPU_CNTRS_OFF_ON) + stats->cs_time_stopped += time_delta; + else + stats->cs_time_running += time_delta; + } + + /* + * Update rest of counter statistics if counter value didn't wrap + */ + if (delta > 0) { + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > cu_sample_interval_min) + stats->cs_rate = CU_RATE(delta, time_delta); + if (stats->cs_rate_max < stats->cs_rate) + stats->cs_rate_max = stats->cs_rate; + + stats->cs_value_last = delta; + stats->cs_value_total += delta; + } + + return (0); +} + +/* + * Update CMT PG utilization data. + * + * This routine computes the running total utilization and times for the + * specified PG by adding up the total utilization and counter running and + * stopped times of all CPUs in the PG and calculates the utilization rate and + * maximum rate for all CPUs in the PG. + */ +void +cu_pg_update(pghw_t *pg) +{ + pg_cpu_itr_t cpu_iter; + pghw_type_t pg_hwtype; + cpu_t *cpu; + pghw_util_t *hw_util = &pg->pghw_stats; + uint64_t old_utilization = hw_util->pghw_util; + hrtime_t now; + hrtime_t time_delta; + uint64_t utilization_delta; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + now = gethrtime(); + + pg_hwtype = pg->pghw_hw; + + /* + * Initialize running total utilization and times for PG to 0 + */ + hw_util->pghw_util = 0; + hw_util->pghw_time_running = 0; + hw_util->pghw_time_stopped = 0; + + /* + * Iterate over all CPUs in the PG and aggregate utilization, running + * time and stopped time. + */ + PG_CPU_ITR_INIT(pg, cpu_iter); + while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { + cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info; + cu_cntr_info_t *cntr_info; + cu_cntr_stats_t *stats; + + if (cu_cpu_info == NULL) + continue; + + /* + * Update utilization data for the CPU and then + * aggregate per CPU running totals for PG + */ + (void) cu_cpu_update(cpu, B_TRUE); + cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype]; + + if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL) + continue; + + hw_util->pghw_util += stats->cs_value_total; + hw_util->pghw_time_running += stats->cs_time_running; + hw_util->pghw_time_stopped += stats->cs_time_stopped; + + /* + * If counters are stopped now, the pg_time_stopped was last + * updated at cs_time_start time. Add the time passed since then + * to the stopped time. + */ + if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) + hw_util->pghw_time_stopped += + now - stats->cs_time_start; + } + + /* + * Compute per PG instruction rate and maximum rate + */ + time_delta = now - hw_util->pghw_time_stamp; + hw_util->pghw_time_stamp = now; + + if (old_utilization == 0) + return; + + /* + * Calculate change in utilization over sampling period and set this to + * 0 if the delta would be 0 or negative which may happen if any CPUs go + * offline during the sampling period + */ + if (hw_util->pghw_util > old_utilization) + utilization_delta = hw_util->pghw_util - old_utilization; + else + utilization_delta = 0; + + /* + * Update utilization rate if the interval between samples is + * sufficient. 
+ */ + ASSERT(cu_sample_interval_min > CU_SCALE); + if (time_delta > CU_SAMPLE_INTERVAL_MIN) + hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta); + + /* + * Update the maximum observed rate + */ + if (hw_util->pghw_rate_max < hw_util->pghw_rate) + hw_util->pghw_rate_max = hw_util->pghw_rate; +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 009598f03f..62e8eeb2fe 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -1203,12 +1203,14 @@ cpu_online(cpu_t *cp) } cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN | CPU_SPARE); + CPU_NEW_GENERATION(cp); start_cpus(); cpu_stats_kstat_create(cp); cpu_create_intrstat(cp); lgrp_kstat_create(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); cpu_intr_enable(cp); /* arch-dep hook */ + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); cpu_set_state(cp); cyclic_online(cp); /* @@ -1284,6 +1286,7 @@ cpu_offline(cpu_t *cp, int flags) /* * Tell interested parties that this CPU is going offline. */ + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_OFF); /* @@ -1557,8 +1560,11 @@ out: /* * If we failed, we need to notify everyone that this CPU is back on. */ - if (error != 0) + if (error != 0) { + CPU_NEW_GENERATION(cp); cpu_state_change_notify(cp->cpu_id, CPU_ON); + cpu_state_change_notify(cp->cpu_id, CPU_INTR_ON); + } return (error); } @@ -2152,6 +2158,7 @@ static struct { kstat_named_t ci_core_id; kstat_named_t ci_curr_clock_Hz; kstat_named_t ci_supp_freq_Hz; + kstat_named_t ci_pg_id; #if defined(__sparcv9) kstat_named_t ci_device_ID; kstat_named_t ci_cpu_fru; @@ -2167,6 +2174,7 @@ static struct { kstat_named_t ci_ncoreperchip; kstat_named_t ci_max_cstates; kstat_named_t ci_curr_cstate; + kstat_named_t ci_cacheid; kstat_named_t ci_sktstr; #endif } cpu_info_template = { @@ -2181,6 +2189,7 @@ static struct { { "core_id", KSTAT_DATA_LONG }, { "current_clock_Hz", KSTAT_DATA_UINT64 }, { "supported_frequencies_Hz", KSTAT_DATA_STRING }, + { "pg_id", KSTAT_DATA_LONG }, #if defined(__sparcv9) { "device_ID", KSTAT_DATA_UINT64 }, { "cpu_fru", KSTAT_DATA_STRING }, @@ -2196,6 +2205,7 @@ static struct { { "ncore_per_chip", KSTAT_DATA_INT32 }, { "supported_max_cstates", KSTAT_DATA_INT32 }, { "current_cstate", KSTAT_DATA_INT32 }, + { "cache_id", KSTAT_DATA_INT32 }, { "socket_type", KSTAT_DATA_STRING }, #endif }; @@ -2253,6 +2263,9 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_core_id.value.l = pg_plat_get_core_id(cp); cpu_info_template.ci_curr_clock_Hz.value.ui64 = cp->cpu_curr_clock; + cpu_info_template.ci_pg_id.value.l = + cp->cpu_pg && cp->cpu_pg->cmt_lineage ? 
+ cp->cpu_pg->cmt_lineage->pg_id : -1; kstat_named_setstr(&cpu_info_template.ci_supp_freq_Hz, cp->cpu_supp_freqs); #if defined(__sparcv9) @@ -2273,6 +2286,7 @@ cpu_info_kstat_update(kstat_t *ksp, int rw) cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp); cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates; cpu_info_template.ci_curr_cstate.value.l = cpu_idle_get_cpu_state(cp); + cpu_info_template.ci_cacheid.value.i32 = cpuid_get_cacheid(cp); kstat_named_setstr(&cpu_info_template.ci_sktstr, cpuid_getsocketstr(cp)); #endif diff --git a/usr/src/uts/common/os/group.c b/usr/src/uts/common/os/group.c index 01e3f1ebdd..e46e7f600c 100644 --- a/usr/src/uts/common/os/group.c +++ b/usr/src/uts/common/os/group.c @@ -28,6 +28,7 @@ #include <sys/debug.h> #include <sys/kmem.h> #include <sys/group.h> +#include <sys/cmn_err.h> #define GRP_SET_SIZE_DEFAULT 2 @@ -352,3 +353,102 @@ group_find(group_t *g, void *e) } return ((uint_t)-1); } + +/* + * Return a string in a given buffer with list of integer entries in a group. + * The string concatenates consecutive integer ranges ax x-y. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + */ +char * +group2intlist(group_t *group, char *buffer, size_t len, int (convert)(void*)) +{ + char *ptr = buffer; + void *v; + group_iter_t iter; + boolean_t first_iteration = B_TRUE; + boolean_t first_value = B_TRUE; + int start = 0, end = 0; + + /* + * Allow for the terminating NULL-byte + */ + len = len -1; + + group_iter_init(&iter); + while ((v = group_iterate(group, &iter)) != NULL && len > 0) { + int id = convert(v); + int nbytes = 0; + + if (first_iteration) { + start = end = id; + first_iteration = B_FALSE; + } else if (end + 1 == id) { + /* + * Got consecutive ID, so extend end of range without + * doing anything since the range may extend further + */ + end = id; + } else { + if (first_value) { + first_value = B_FALSE; + } else { + *ptr++ = ','; + len--; + } + + if (len == 0) + break; + + /* + * Next ID is not consecutive, so dump IDs gotten so + * far. + */ + if (end > start + 1) /* range */ + nbytes = snprintf(ptr, len, "%d-%d", + start, end); + else if (end > start) /* different values */ + nbytes = snprintf(ptr, len, "%d,%d", + start, end); + else /* same value */ + nbytes = snprintf(ptr, len, "%d", start); + + if (nbytes <= 0) { + len = 0; + break; + } + + /* + * Advance position in the string + */ + ptr += nbytes; + len -= nbytes; + + /* + * Try finding consecutive range starting from current + * ID. 
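 *
 * (Editorial worked example, not part of this patch: for the
 * IDs 1,2,3,4,5,8 this loop emits "1-5" when it reaches 8, and
 * the code after the loop appends ",8", giving "1-5,8".)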
+ */ + start = end = id; + } + } + + if (!first_value) { + *ptr++ = ','; + len--; + } + /* + * Print last ID(s) + */ + if (len > 0) { + if (end > start + 1) { + (void) snprintf(ptr, len, "%d-%d", start, end); + } else if (end != start) { + (void) snprintf(ptr, len, "%d,%d", start, end); + } else { + (void) snprintf(ptr, len, "%d", start); + } + } + + return (buffer); +} diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c index e5cab151b8..50a999dcc5 100644 --- a/usr/src/uts/common/os/kcpc.c +++ b/usr/src/uts/common/os/kcpc.c @@ -39,12 +39,17 @@ #include <sys/sunddi.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/archsystm.h> +#include <sys/promif.h> +#include <sys/x_call.h> +#include <sys/cap_util.h> #if defined(__x86) #include <asm/clock.h> +#include <sys/xc_levels.h> #endif -kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ -kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ +static kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS]; /* protects ctx_list */ +static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS]; /* head of list */ krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ @@ -73,10 +78,75 @@ static int kcpc_nullctx_panic = 0; static void kcpc_lwp_create(kthread_t *t, kthread_t *ct); static void kcpc_restore(kcpc_ctx_t *ctx); static void kcpc_save(kcpc_ctx_t *ctx); -static void kcpc_free(kcpc_ctx_t *ctx, int isexec); static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx); static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch); static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set); +static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs, + int set_flags, int kmem_flags); + +/* + * Macros to manipulate context flags. All flag updates should use one of these + * two macros + * + * Flags should be always be updated atomically since some of the updates are + * not protected by locks. + */ +#define KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag)) +#define KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag)) + +/* + * The IS_HIPIL() macro verifies that the code is executed either from a + * cross-call or from high-PIL interrupt + */ +#ifdef DEBUG +#define IS_HIPIL() (getpil() >= XCALL_PIL) +#else +#define IS_HIPIL() +#endif /* DEBUG */ + + +extern int kcpc_hw_load_pcbe(void); + +/* + * Return value from kcpc_hw_load_pcbe() + */ +static int kcpc_pcbe_error = 0; + +/* + * Perform one-time initialization of kcpc framework. + * This function performs the initialization only the first time it is called. + * It is safe to call it multiple times. + */ +int +kcpc_init(void) +{ + long hash; + static uint32_t kcpc_initialized = 0; + + /* + * We already tried loading platform pcbe module and failed + */ + if (kcpc_pcbe_error != 0) + return (-1); + + /* + * The kcpc framework should be initialized at most once + */ + if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0) + return (0); + + rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL); + for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) + mutex_init(&kcpc_ctx_llock[hash], + NULL, MUTEX_DRIVER, (void *)(uintptr_t)15); + + /* + * Load platform-specific pcbe module + */ + kcpc_pcbe_error = kcpc_hw_load_pcbe(); + + return (kcpc_pcbe_error == 0 ? 
0 : -1); +} void kcpc_register_pcbe(pcbe_ops_t *ops) @@ -103,8 +173,9 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) cpu_t *cp; kcpc_ctx_t *ctx; int error; + int save_spl; - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); if (kcpc_assign_reqs(set, ctx) != 0) { kcpc_ctx_free(ctx); @@ -141,28 +212,34 @@ kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode) goto unbound; mutex_enter(&cp->cpu_cpc_ctxlock); + kpreempt_disable(); + save_spl = spl_xcall(); - if (cp->cpu_cpc_ctx != NULL) { + /* + * Check to see whether counters for CPU already being used by someone + * other than kernel for capacity and utilization (since kernel will + * let go of counters for user in kcpc_program() below) + */ + if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) { /* * If this CPU already has a bound set, return an error. */ + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } if (curthread->t_bind_cpu != cpuid) { + splx(save_spl); + kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); goto unbound; } - cp->cpu_cpc_ctx = ctx; - /* - * Kernel preemption must be disabled while fiddling with the hardware - * registers to prevent partial updates. - */ - kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kcpc_program(ctx, B_FALSE, B_TRUE); + + splx(save_spl); kpreempt_enable(); mutex_exit(&cp->cpu_cpc_ctxlock); @@ -197,14 +274,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) if (t->t_cpc_ctx != NULL) return (EEXIST); - ctx = kcpc_ctx_alloc(); + ctx = kcpc_ctx_alloc(KM_SLEEP); /* * The context must begin life frozen until it has been properly * programmed onto the hardware. This prevents the context ops from * worrying about it until we're ready. */ - ctx->kc_flags |= KCPC_CTX_FREEZE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); ctx->kc_hrtime = gethrtime(); if (kcpc_assign_reqs(set, ctx) != 0) { @@ -215,13 +292,13 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) ctx->kc_cpuid = -1; if (set->ks_flags & CPC_BIND_LWP_INHERIT) - ctx->kc_flags |= KCPC_CTX_LWPINHERIT; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT); ctx->kc_thread = t; t->t_cpc_ctx = ctx; /* * Permit threads to look at their own hardware counters from userland. */ - ctx->kc_flags |= KCPC_CTX_NONPRIV; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV); /* * Create the data store for this set. @@ -248,12 +325,14 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * Ask the backend to program the hardware. */ if (t == curthread) { + int save_spl; + kpreempt_disable(); - ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); - pcbe_ops->pcbe_program(ctx); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); kpreempt_enable(); - } else + } else { /* * Since we are the agent LWP, we know the victim LWP is stopped * until we're done here; no need to worry about preemption or @@ -262,7 +341,8 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) * still be accessed from, for instance, another CPU doing a * kcpc_invalidate_all(). */ - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + } mutex_enter(&set->ks_lock); set->ks_state |= KCPC_SET_BOUND; @@ -304,7 +384,7 @@ kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode) * notification, we flag the context as being one that * cares about overflow. 
*/ - ctx->kc_flags |= KCPC_CTX_SIGOVF; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF); } rp->kr_config = NULL; @@ -349,7 +429,7 @@ int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) { kcpc_ctx_t *ctx = set->ks_ctx; - uint64_t curtick = KCPC_GET_TICK(); + int save_spl; mutex_enter(&set->ks_lock); if ((set->ks_state & KCPC_SET_BOUND) == 0) { @@ -358,41 +438,53 @@ kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick) } mutex_exit(&set->ks_lock); - if (ctx->kc_flags & KCPC_CTX_INVALID) + /* + * Kernel preemption must be disabled while reading the hardware regs, + * and if this is a CPU-bound context, while checking the CPU binding of + * the current thread. + */ + kpreempt_disable(); + save_spl = spl_xcall(); + + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) { - /* - * Kernel preemption must be disabled while reading the - * hardware regs, and if this is a CPU-bound context, while - * checking the CPU binding of the current thread. - */ - kpreempt_disable(); - if (ctx->kc_cpuid != -1) { if (curthread->t_bind_cpu != ctx->kc_cpuid) { + splx(save_spl); kpreempt_enable(); return (EAGAIN); } } if (ctx->kc_thread == curthread) { - ctx->kc_hrtime = gethrtime(); + uint64_t curtick = KCPC_GET_TICK(); + + ctx->kc_hrtime = gethrtime_waitfree(); pcbe_ops->pcbe_sample(ctx); ctx->kc_vtick += curtick - ctx->kc_rawtick; ctx->kc_rawtick = curtick; } - kpreempt_enable(); - /* * The config may have been invalidated by * the pcbe_sample op. */ - if (ctx->kc_flags & KCPC_CTX_INVALID) + if (ctx->kc_flags & KCPC_CTX_INVALID) { + splx(save_spl); + kpreempt_enable(); return (EAGAIN); + } + } + splx(save_spl); + kpreempt_enable(); + if (copyout(set->ks_data, buf, set->ks_nreqs * sizeof (uint64_t)) == -1) return (EFAULT); @@ -412,20 +504,17 @@ kcpc_stop_hw(kcpc_ctx_t *ctx) { cpu_t *cp; - ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) - == KCPC_CTX_INVALID); - kpreempt_disable(); - cp = cpu_get(ctx->kc_cpuid); - ASSERT(cp != NULL); + if (ctx->kc_cpuid == CPU->cpu_id) { + cp = CPU; + } else { + cp = cpu_get(ctx->kc_cpuid); + } + + ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx); + kcpc_cpu_stop(cp, B_FALSE); - if (cp == CPU) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - } else - kcpc_remote_stop(cp); kpreempt_enable(); } @@ -451,7 +540,7 @@ kcpc_unbind(kcpc_set_t *set) * Use kc_lock to synchronize with kcpc_restore(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (ctx->kc_cpuid == -1) { @@ -461,12 +550,14 @@ kcpc_unbind(kcpc_set_t *set) * context. It will be freed via removectx() calling * freectx() calling kcpc_free(). 
*/ - if (t == curthread && - (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { + if (t == curthread) { + int save_spl; + kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); + save_spl = spl_xcall(); + if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); kpreempt_enable(); } #ifdef DEBUG @@ -503,7 +594,6 @@ kcpc_unbind(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) kcpc_stop_hw(ctx); ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); @@ -543,12 +633,20 @@ kcpc_restart(kcpc_set_t *set) { kcpc_ctx_t *ctx = set->ks_ctx; int i; + int save_spl; ASSERT(set->ks_state & KCPC_SET_BOUND); ASSERT(ctx->kc_thread == curthread); ASSERT(ctx->kc_cpuid == -1); + for (i = 0; i < set->ks_nreqs; i++) { + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + kpreempt_disable(); + save_spl = spl_xcall(); /* * If the user is doing this on a running set, make sure the counters @@ -557,18 +655,13 @@ kcpc_restart(kcpc_set_t *set) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) pcbe_ops->pcbe_allstop(); - for (i = 0; i < set->ks_nreqs; i++) { - *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; - pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset, - 0, 0, NULL, &set->ks_req[i].kr_config, NULL); - } - /* * Ask the backend to program the hardware. */ ctx->kc_rawtick = KCPC_GET_TICK(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); + splx(save_spl); kpreempt_enable(); return (0); @@ -604,7 +697,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) return (EINVAL); kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); kcpc_restore(ctx); kpreempt_enable(); } else if (cmd == CPC_DISABLE) { @@ -612,7 +705,7 @@ kcpc_enable(kthread_t *t, int cmd, int enable) return (EINVAL); kpreempt_disable(); kcpc_save(ctx); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); kpreempt_enable(); } else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) { /* @@ -624,10 +717,11 @@ kcpc_enable(kthread_t *t, int cmd, int enable) CPC_COUNT_USER: CPC_COUNT_SYSTEM; kpreempt_disable(); - atomic_or_uint(&ctx->kc_flags, + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); pcbe_ops->pcbe_allstop(); kpreempt_enable(); + for (i = 0; i < set->ks_nreqs; i++) { set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data); if (enable) @@ -715,12 +809,14 @@ kcpc_next_config(void *token, void *current, uint64_t **data) kcpc_ctx_t * -kcpc_ctx_alloc(void) +kcpc_ctx_alloc(int kmem_flags) { kcpc_ctx_t *ctx; long hash; - ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP); + ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags); + if (ctx == NULL) + return (NULL); hash = CPC_HASH_CTX(ctx); mutex_enter(&kcpc_ctx_llock[hash]); @@ -909,9 +1005,10 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) */ if (kcpc_nullctx_panic) panic("null cpc context, thread %p", (void *)t); - - cmn_err(CE_WARN, +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context found in overflow handler!\n"); +#endif atomic_add_32(&kcpc_nullctx_count, 1); } else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) { /* @@ -935,13 
+1032,20 @@ kcpc_overflow_intr(caddr_t arg, uint64_t bitmap) * so freeze the context. The interrupt handler * has already stopped the counter hardware. */ - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); atomic_or_uint(&ctx->kc_pics[i].kp_flags, KCPC_PIC_OVERFLOWED); } } aston(t); + } else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + /* + * Thread context is no longer valid, but here may be a valid + * CPU context. + */ + return (curthread->t_cpu->cpu_cpc_ctx); } + return (NULL); } @@ -956,6 +1060,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) kcpc_ctx_t *ctx; uint64_t bitmap; uint8_t *state; + int save_spl; if (pcbe_ops == NULL || (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0) @@ -985,6 +1090,13 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) (*dtrace_cpc_fire)(bitmap); ctx = curthread->t_cpu->cpu_cpc_ctx; + if (ctx == NULL) { +#ifdef DEBUG + cmn_err(CE_NOTE, "null cpc context in" + "hardware overflow handler!\n"); +#endif + return (DDI_INTR_CLAIMED); + } /* Reset any counters that have overflowed */ for (i = 0; i < ctx->kc_set->ks_nreqs; i++) { @@ -1025,7 +1137,12 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) * the middle of updating it, no AST has been posted, and so we * should sample the counters here, and restart them with no * further fuss. + * + * The CPU's CPC context may disappear as a result of cross-call which + * has higher PIL on x86, so protect the context by raising PIL to the + * cross-call level. */ + save_spl = spl_xcall(); if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) { uint64_t curtick = KCPC_GET_TICK(); @@ -1035,6 +1152,7 @@ kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2) pcbe_ops->pcbe_sample(ctx); pcbe_ops->pcbe_program(ctx); } + splx(save_spl); return (DDI_INTR_CLAIMED); } @@ -1087,7 +1205,7 @@ kcpc_overflow_ast() * Otherwise, re-enable the counters and continue life as before. */ kpreempt_disable(); - atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); pcbe_ops->pcbe_program(ctx); kpreempt_enable(); return (0); @@ -1099,43 +1217,68 @@ kcpc_overflow_ast() static void kcpc_save(kcpc_ctx_t *ctx) { + int err; + int save_spl; + + kpreempt_disable(); + save_spl = spl_xcall(); + if (ctx->kc_flags & KCPC_CTX_INVALID) { - if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) + if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) { + splx(save_spl); + kpreempt_enable(); return; + } /* * This context has been invalidated but the counters have not * been stopped. Stop them here and mark the context stopped. */ - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); + kcpc_unprogram(ctx, B_TRUE); + splx(save_spl); + kpreempt_enable(); return; } pcbe_ops->pcbe_allstop(); - if (ctx->kc_flags & KCPC_CTX_FREEZE) + if (ctx->kc_flags & KCPC_CTX_FREEZE) { + splx(save_spl); + kpreempt_enable(); return; + } /* * Need to sample for all reqs into each req's current mpic. 
*/ - ctx->kc_hrtime = gethrtime(); + ctx->kc_hrtime = gethrtime_waitfree(); ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick; pcbe_ops->pcbe_sample(ctx); + + /* + * Program counter for measuring capacity and utilization since user + * thread isn't using counter anymore + */ + ASSERT(ctx->kc_cpuid == -1); + cu_cpc_program(CPU, &err); + splx(save_spl); + kpreempt_enable(); } static void kcpc_restore(kcpc_ctx_t *ctx) { + int save_spl; + mutex_enter(&ctx->kc_lock); + if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) == - KCPC_CTX_INVALID) + KCPC_CTX_INVALID) { /* * The context is invalidated but has not been marked stopped. * We mark it as such here because we will not start the * counters during this context switch. */ - ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED; - + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + } if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) { mutex_exit(&ctx->kc_lock); @@ -1151,7 +1294,7 @@ kcpc_restore(kcpc_ctx_t *ctx) * doing this, we're asking kcpc_free() to cv_wait() until * kcpc_restore() has completed. */ - ctx->kc_flags |= KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE); mutex_exit(&ctx->kc_lock); /* @@ -1159,14 +1302,17 @@ kcpc_restore(kcpc_ctx_t *ctx) * don't do an explicit pcbe_allstop() here because they should have * been stopped already by the last consumer. */ - ctx->kc_rawtick = KCPC_GET_TICK(); - pcbe_ops->pcbe_program(ctx); + kpreempt_disable(); + save_spl = spl_xcall(); + kcpc_program(ctx, B_TRUE, B_TRUE); + splx(save_spl); + kpreempt_enable(); /* * Wake the agent thread if it's waiting in kcpc_free(). */ mutex_enter(&ctx->kc_lock); - ctx->kc_flags &= ~KCPC_CTX_RESTORE; + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE); cv_signal(&ctx->kc_condv); mutex_exit(&ctx->kc_lock); } @@ -1177,7 +1323,6 @@ kcpc_restore(kcpc_ctx_t *ctx) * counters when the idle thread is switched on, and they start them again when * it is switched off. */ - /*ARGSUSED*/ void kcpc_idle_save(struct cpu *cp) @@ -1242,7 +1387,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) rw_exit(&kcpc_cpuctx_lock); return; } - cctx = kcpc_ctx_alloc(); + cctx = kcpc_ctx_alloc(KM_SLEEP); kcpc_ctx_clone(ctx, cctx); rw_exit(&kcpc_cpuctx_lock); @@ -1250,7 +1395,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * Copy the parent context's kc_flags field, but don't overwrite * the child's in case it was modified during kcpc_ctx_clone. */ - cctx->kc_flags |= ctx->kc_flags; + KCPC_CTX_FLAG_SET(cctx, ctx->kc_flags); cctx->kc_thread = ct; cctx->kc_cpuid = -1; ct->t_cpc_set = cctx->kc_set; @@ -1265,13 +1410,14 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) * set to UINT64_MAX, and their pic's overflow flag turned on * so that our trap() processing knows to send a signal. 
*/ - atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE); for (i = 0; i < ks->ks_nreqs; i++) { kcpc_request_t *kr = &ks->ks_req[i]; if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) { *(kr->kr_data) = UINT64_MAX; - kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED; + atomic_or_uint(&kr->kr_picp->kp_flags, + KCPC_PIC_OVERFLOWED); } } ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW; @@ -1315,7 +1461,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) */ /*ARGSUSED*/ -static void +void kcpc_free(kcpc_ctx_t *ctx, int isexec) { int i; @@ -1329,7 +1475,7 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) mutex_enter(&ctx->kc_lock); while (ctx->kc_flags & KCPC_CTX_RESTORE) cv_wait(&ctx->kc_condv, &ctx->kc_lock); - ctx->kc_flags |= KCPC_CTX_INVALID; + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&ctx->kc_lock); if (isexec) { @@ -1356,21 +1502,22 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) if (cp != NULL) { mutex_enter(&cp->cpu_cpc_ctxlock); kcpc_stop_hw(ctx); - cp->cpu_cpc_ctx = NULL; mutex_exit(&cp->cpu_cpc_ctxlock); } mutex_exit(&cpu_lock); ASSERT(curthread->t_cpc_ctx == NULL); } else { + int save_spl; + /* * Thread-bound context; stop _this_ CPU's counters. */ kpreempt_disable(); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, - KCPC_CTX_INVALID_STOPPED); - kpreempt_enable(); + save_spl = spl_xcall(); + kcpc_unprogram(ctx, B_TRUE); curthread->t_cpc_ctx = NULL; + splx(save_spl); + kpreempt_enable(); } /* @@ -1435,7 +1582,7 @@ kcpc_invalidate_all(void) for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) { mutex_enter(&kcpc_ctx_llock[hash]); for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); mutex_exit(&kcpc_ctx_llock[hash]); } } @@ -1451,7 +1598,7 @@ kcpc_invalidate_config(void *token) ASSERT(ctx != NULL); - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1462,18 +1609,11 @@ kcpc_passivate(void) { kcpc_ctx_t *ctx = curthread->t_cpc_ctx; kcpc_set_t *set = curthread->t_cpc_set; + int save_spl; if (set == NULL) return; - /* - * We're cleaning up after this thread; ensure there are no dangling - * CPC pointers left behind. The context and set will be freed by - * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in - * the case of a CPU-bound set. - */ - curthread->t_cpc_ctx = NULL; - if (ctx == NULL) { /* * This thread has a set but no context; it must be a CPU-bound @@ -1491,6 +1631,8 @@ kcpc_passivate(void) return; } + kpreempt_disable(); + save_spl = spl_xcall(); curthread->t_cpc_set = NULL; /* @@ -1500,13 +1642,20 @@ kcpc_passivate(void) * INVALID_STOPPED flag here and kcpc_restore() setting the flag during * a context switch. */ - - kpreempt_disable(); if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) { - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&ctx->kc_flags, + kcpc_unprogram(ctx, B_TRUE); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); } + + /* + * We're cleaning up after this thread; ensure there are no dangling + * CPC pointers left behind. The context and set will be freed by + * freectx(). 
+ */ + curthread->t_cpc_ctx = NULL; + + splx(save_spl); kpreempt_enable(); } @@ -1667,7 +1816,7 @@ kcpc_invalidate(kthread_t *t) kcpc_ctx_t *ctx = t->t_cpc_ctx; if (ctx != NULL) - atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID); } /* @@ -1691,6 +1840,648 @@ kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third) "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0); } +/* + * Create one or more CPC context for given CPU with specified counter event + * requests + * + * If number of requested counter events is less than or equal number of + * hardware counters on a CPU and can all be assigned to the counters on a CPU + * at the same time, then make one CPC context. + * + * Otherwise, multiple CPC contexts are created to allow multiplexing more + * counter events than existing counters onto the counters by iterating through + * all of the CPC contexts, programming the counters with each CPC context one + * at a time and measuring the resulting counter values. Each of the resulting + * CPC contexts contains some number of requested counter events less than or + * equal the number of counters on a CPU depending on whether all the counter + * events can be programmed on all the counters at the same time or not. + * + * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying + * whether memory allocation should be non-blocking or not. The code will try + * to allocate *whole* CPC contexts if possible. If there is any memory + * allocation failure during the allocations needed for a given CPC context, it + * will skip allocating that CPC context because it cannot allocate the whole + * thing. Thus, the only time that it will end up allocating none (ie. no CPC + * contexts whatsoever) is when it cannot even allocate *one* whole CPC context + * without a memory allocation failure occurring. + */ +int +kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags, + kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz) +{ + kcpc_ctx_t **ctx_ptrs; + int nctx; + int nctx_ptrs; + int nreqs; + kcpc_request_t *reqs; + + if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL || + req_list == NULL || req_list->krl_cnt < 1) + return (-1); + + /* + * Allocate number of sets assuming that each set contains one and only + * one counter event request for each counter on a CPU + */ + nreqs = req_list->krl_cnt; + nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters; + ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags); + if (ctx_ptrs == NULL) + return (-2); + + /* + * Fill in sets of requests + */ + nctx = 0; + reqs = req_list->krl_list; + while (nreqs > 0) { + kcpc_ctx_t *ctx; + kcpc_set_t *set; + int subcode; + + /* + * Allocate CPC context and set for requested counter events + */ + ctx = kcpc_ctx_alloc(kmem_flags); + set = kcpc_set_create(reqs, nreqs, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + + /* + * Determine assignment of requested counter events to specific + * counters + */ + if (kcpc_assign_reqs(set, ctx) != 0) { + /* + * May not be able to assign requested counter events + * to all counters since all counters may not be able + * to do all events, so only do one counter event in + * set of counter requests when this happens since at + * least one of the counters must be able to do the + * event. 
+ */ + kcpc_free_set(set); + set = kcpc_set_create(reqs, 1, 0, kmem_flags); + if (set == NULL) { + kcpc_ctx_free(ctx); + break; + } + if (kcpc_assign_reqs(set, ctx) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't " + "assign counter event %s!\n", + set->ks_req->kr_event); +#endif + kcpc_free_set(set); + kcpc_ctx_free(ctx); + reqs++; + nreqs--; + continue; + } + } + + /* + * Allocate memory needed to hold requested counter event data + */ + set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), + kmem_flags); + if (set->ks_data == NULL) { + kcpc_free_set(set); + kcpc_ctx_free(ctx); + break; + } + + /* + * Configure requested counter events + */ + if (kcpc_configure_reqs(ctx, set, &subcode) != 0) { +#ifdef DEBUG + cmn_err(CE_NOTE, + "!kcpc_cpu_ctx_create: can't configure " + "set of counter event requests!\n"); +#endif + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + kmem_free(set->ks_data, + set->ks_nreqs * sizeof (uint64_t)); + kcpc_free_set(set); + kcpc_ctx_free(ctx); + continue; + } + + /* + * Point set of counter event requests at this context and fill + * in CPC context + */ + set->ks_ctx = ctx; + ctx->kc_set = set; + ctx->kc_cpuid = cp->cpu_id; + ctx->kc_thread = curthread; + + ctx_ptrs[nctx] = ctx; + + /* + * Update requests and how many are left to be assigned to sets + */ + reqs += set->ks_nreqs; + nreqs -= set->ks_nreqs; + + /* + * Increment number of CPC contexts and allocate bigger array + * for context pointers as needed + */ + nctx++; + if (nctx >= nctx_ptrs) { + kcpc_ctx_t **new; + int new_cnt; + + /* + * Allocate more CPC contexts based on how many + * contexts allocated so far and how many counter + * requests left to assign + */ + new_cnt = nctx_ptrs + + ((nreqs + cpc_ncounters - 1) / cpc_ncounters); + new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *), + kmem_flags); + if (new == NULL) + break; + + /* + * Copy contents of old sets into new ones + */ + bcopy(ctx_ptrs, new, + nctx_ptrs * sizeof (kcpc_ctx_t *)); + + /* + * Free old array of context pointers and use newly + * allocated one instead now + */ + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + ctx_ptrs = new; + nctx_ptrs = new_cnt; + } + } + + /* + * Return NULL if no CPC contexts filled in + */ + if (nctx == 0) { + kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *)); + *ctx_ptr_array = NULL; + *ctx_ptr_array_sz = 0; + return (-2); + } + + *ctx_ptr_array = ctx_ptrs; + *ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *); + return (nctx); +} + +/* + * Return whether PCBE supports given counter event + */ +boolean_t +kcpc_event_supported(char *event) +{ + if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Program counters on current CPU with given CPC context + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then unprogram counters for kernel *before* programming them + * with specified CPC context. + * + * kcpc_{program,unprogram}() may be called either directly by a thread running + * on the target CPU or from a cross-call from another CPU. To protect + * programming and unprogramming from being interrupted by cross-calls, callers + * who execute kcpc_{program,unprogram} should raise PIL to the level used by + * cross-calls. 
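 *
 * Editorial illustration (not itself part of this change): a caller is
 * expected to follow the same pattern that kcpc_bind_thread() and
 * kcpc_restore() use above, i.e.
 *
 *	kpreempt_disable();
 *	save_spl = spl_xcall();
 *	kcpc_program(ctx, B_TRUE, B_TRUE);
 *	splx(save_spl);
 *	kpreempt_enable();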
+ */ +void +kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1) + return; + + /* + * Unprogram counters for kernel measuring hardware capacity and + * utilization + */ + if (cu_interpose == B_TRUE) { + cu_cpc_unprogram(CPU, &error); + } else { + kcpc_set_t *set = ctx->kc_set; + int i; + + ASSERT(set != NULL); + + /* + * Since cu_interpose is false, we are programming CU context. + * In general, PCBE can continue from the state saved in the + * set, but it is not very reliable, so we start again from the + * preset value. + */ + for (i = 0; i < set->ks_nreqs; i++) { + /* + * Reset the virtual counter value to the preset value. + */ + *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset; + + /* + * Reset PCBE to the preset value. + */ + pcbe_ops->pcbe_configure(0, NULL, + set->ks_req[i].kr_preset, + 0, 0, NULL, &set->ks_req[i].kr_config, NULL); + } + } + + /* + * Program counters with specified CPC context + */ + ctx->kc_rawtick = KCPC_GET_TICK(); + pcbe_ops->pcbe_program(ctx); + + /* + * Denote that counters programmed for thread or CPU CPC context + * differently + */ + if (for_thread == B_TRUE) + KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE); + else + CPU->cpu_cpc_ctx = ctx; +} + +/* + * Unprogram counters with given CPC context on current CPU + * + * If kernel is interposing on counters to measure hardware capacity and + * utilization, then program counters for the kernel capacity and utilization + * *after* unprogramming them for given CPC context. + * + * See the comment for kcpc_program regarding the synchronization with + * cross-calls. 
+ */ +void +kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose) +{ + int error; + + ASSERT(IS_HIPIL()); + + /* + * CPC context shouldn't be NULL, its CPU field should specify current + * CPU or be -1 to specify any CPU when the context is bound to a + * thread, and preemption should be disabled + */ + ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id || + ctx->kc_cpuid == -1) && curthread->t_preempt > 0); + + if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id && + ctx->kc_cpuid != -1) || curthread->t_preempt < 1 || + (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) { + return; + } + + /* + * Specified CPC context to be unprogrammed should be bound to current + * CPU or thread + */ + ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx); + + /* + * Stop counters + */ + pcbe_ops->pcbe_allstop(); + KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED); + + /* + * Allow kernel to interpose on counters and program them for its own + * use to measure hardware capacity and utilization if cu_interpose + * argument is true + */ + if (cu_interpose == B_TRUE) + cu_cpc_program(CPU, &error); +} + +/* + * Read CPU Performance Counter (CPC) on current CPU and call specified update + * routine with data for each counter event currently programmed on CPU + */ +int +kcpc_read(kcpc_update_func_t update_func) +{ + kcpc_ctx_t *ctx; + int i; + kcpc_request_t *req; + int retval; + kcpc_set_t *set; + + ASSERT(IS_HIPIL()); + + /* + * Can't grab locks or block because may be called inside dispatcher + */ + kpreempt_disable(); + + ctx = CPU->cpu_cpc_ctx; + if (ctx == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Read counter data from current CPU + */ + pcbe_ops->pcbe_sample(ctx); + + set = ctx->kc_set; + if (set == NULL || set->ks_req == NULL) { + kpreempt_enable(); + return (0); + } + + /* + * Call update function with preset pointer and data for each CPC event + * request currently programmed on current CPU + */ + req = set->ks_req; + retval = 0; + for (i = 0; i < set->ks_nreqs; i++) { + int ret; + + if (req[i].kr_data == NULL) + break; + + ret = update_func(req[i].kr_ptr, *req[i].kr_data); + if (ret < 0) + retval = ret; + } + + kpreempt_enable(); + + return (retval); +} + +/* + * Initialize list of counter event requests + */ +kcpc_request_list_t * +kcpc_reqs_init(int nreqs, int kmem_flags) +{ + kcpc_request_list_t *req_list; + kcpc_request_t *reqs; + + if (nreqs < 1) + return (NULL); + + req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags); + if (req_list == NULL) + return (NULL); + + reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags); + if (reqs == NULL) { + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (NULL); + } + + req_list->krl_list = reqs; + req_list->krl_cnt = 0; + req_list->krl_max = nreqs; + return (req_list); +} + + +/* + * Add counter event request to given list of counter event requests + */ +int +kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset, + uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags) +{ + kcpc_request_t *req; + + ASSERT(req_list->krl_max != 0); + if (req_list == NULL || req_list->krl_list == NULL) + return (-1); + + /* + * Allocate more space (if needed) + */ + if (req_list->krl_cnt > req_list->krl_max) { + kcpc_request_t *new; + kcpc_request_t *old; + + old = req_list->krl_list; + new = kmem_zalloc((req_list->krl_max + + cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags); + if (new == NULL) + return (-2); + + req_list->krl_list = new; + bcopy(old, req_list->krl_list, + 
req_list->krl_cnt * sizeof (kcpc_request_t)); + kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + req_list->krl_max += cpc_ncounters; + } + + /* + * Fill in request as much as possible now, but some fields will need + * to be set when request is assigned to a set. + */ + req = &req_list->krl_list[req_list->krl_cnt]; + req->kr_config = NULL; + req->kr_picnum = -1; /* have CPC pick this */ + req->kr_index = -1; /* set when assigning request to set */ + req->kr_data = NULL; /* set when configuring request */ + (void) strcpy(req->kr_event, event); + req->kr_preset = preset; + req->kr_flags = flags; + req->kr_nattrs = nattrs; + req->kr_attr = attr; + /* + * Keep pointer given by caller to give to update function when this + * counter event is sampled/read + */ + req->kr_ptr = ptr; + + req_list->krl_cnt++; + + return (0); +} + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +int +kcpc_reqs_reset(kcpc_request_list_t *req_list) +{ + /* + * Return when pointer to request list structure or request is NULL or + * when max requests is less than or equal to 0 + */ + if (req_list == NULL || req_list->krl_list == NULL || + req_list->krl_max <= 0) + return (-1); + + /* + * Zero out requests and number of requests used + */ + bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t)); + req_list->krl_cnt = 0; + return (0); +} + +/* + * Free given list of counter event requests + */ +int +kcpc_reqs_fini(kcpc_request_list_t *req_list) +{ + kmem_free(req_list->krl_list, + req_list->krl_max * sizeof (kcpc_request_t)); + kmem_free(req_list, sizeof (kcpc_request_list_t)); + return (0); +} + +/* + * Create set of given counter event requests + */ +static kcpc_set_t * +kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags) +{ + int i; + kcpc_set_t *set; + + /* + * Allocate set and assign number of requests in set and flags + */ + set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags); + if (set == NULL) + return (NULL); + + if (nreqs < cpc_ncounters) + set->ks_nreqs = nreqs; + else + set->ks_nreqs = cpc_ncounters; + + set->ks_flags = set_flags; + + /* + * Allocate requests needed, copy requests into set, and set index into + * data for each request (which may change when we assign requested + * counter events to counters) + */ + set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) * + set->ks_nreqs, kmem_flags); + if (set->ks_req == NULL) { + kmem_free(set, sizeof (kcpc_set_t)); + return (NULL); + } + + bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs); + + for (i = 0; i < set->ks_nreqs; i++) + set->ks_req[i].kr_index = i; + + return (set); +} + + +/* + * Stop counters on current CPU. + * + * If preserve_context is true, the caller is interested in the CPU's CPC + * context and wants it to be preserved. + * + * If preserve_context is false, the caller does not need the CPU's CPC context + * to be preserved, so it is set to NULL. + */ +static void +kcpc_cpustop_func(boolean_t preserve_context) +{ + kpreempt_disable(); + + /* + * Someone already stopped this context before us, so there is nothing + * to do. + */ + if (CPU->cpu_cpc_ctx == NULL) { + kpreempt_enable(); + return; + } + + kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE); + /* + * If CU does not use counters, then clear the CPU's CPC context + * If the caller requested to preserve context it should disable CU + * first, so there should be no CU context now. 
+ */ + ASSERT(!preserve_context || !CU_CPC_ON(CPU)); + if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU)) + CPU->cpu_cpc_ctx = NULL; + + kpreempt_enable(); +} + +/* + * Stop counters on given CPU and set its CPC context to NULL unless + * preserve_context is true. + */ +void +kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_cpustop_func, + preserve_context, 0); +} + +/* + * Program the context on the current CPU + */ +static void +kcpc_remoteprogram_func(kcpc_ctx_t *ctx, uintptr_t arg) +{ + boolean_t for_thread = (boolean_t)arg; + + ASSERT(ctx != NULL); + + kpreempt_disable(); + kcpc_program(ctx, for_thread, B_TRUE); + kpreempt_enable(); +} + +/* + * Program counters on given CPU + */ +void +kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx) +{ + cpu_call(cp, (cpu_call_func_t)kcpc_remoteprogram_func, (uintptr_t)ctx, + (uintptr_t)B_FALSE); +} + char * kcpc_list_attrs(void) { diff --git a/usr/src/uts/common/os/pg.c b/usr/src/uts/common/os/pg.c index 067670dbbb..835ae3d322 100644 --- a/usr/src/uts/common/os/pg.c +++ b/usr/src/uts/common/os/pg.c @@ -110,7 +110,11 @@ static cpu_pg_t bootstrap_pg_data; * and the next free id in the set. */ static bitset_t pg_id_set; -static pgid_t pg_id_next = 0; + +/* + * ID space starts from 1 to assume that root has ID 0; + */ +static pgid_t pg_id_next = 1; /* * Default and externed PG ops vectors diff --git a/usr/src/uts/common/os/pghw.c b/usr/src/uts/common/os/pghw.c index ca59db8602..534cb2c540 100644 --- a/usr/src/uts/common/os/pghw.c +++ b/usr/src/uts/common/os/pghw.c @@ -34,6 +34,7 @@ #include <sys/pg.h> #include <sys/pghw.h> #include <sys/cpu_pm.h> +#include <sys/cap_util.h> /* * Processor Groups: Hardware sharing relationship layer @@ -116,10 +117,10 @@ struct pghw_kstat { kstat_named_t pg_hw; kstat_named_t pg_policy; } pghw_kstat = { - { "id", KSTAT_DATA_UINT64 }, + { "id", KSTAT_DATA_UINT32 }, { "pg_class", KSTAT_DATA_STRING }, - { "ncpus", KSTAT_DATA_UINT64 }, - { "instance_id", KSTAT_DATA_UINT64 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "instance_id", KSTAT_DATA_UINT32 }, { "hardware", KSTAT_DATA_STRING }, { "policy", KSTAT_DATA_STRING }, }; @@ -127,12 +128,92 @@ struct pghw_kstat { kmutex_t pghw_kstat_lock; /* + * Capacity and Utilization PG kstats + * + * These kstats are updated one at a time, so we can have a single scratch space + * to fill the data. + * + * kstat fields: + * + * pgid PG ID for PG described by this kstat + * + * pg_ncpus Number of CPUs within this PG + * + * pg_cpus String describing CPUs within this PG + * + * pg_sharing Name of sharing relationship for this PG + * + * pg_generation Generation value that increases whenever any CPU leaves + * or joins PG. Two kstat snapshots for the same + * CPU may only be compared if they have the same + * generation + * + * pg_hw_util Running value of PG utilization for the sharing + * relationship + * + * pg_hw_util_time_running + * Total time spent collecting CU data. The time may be + * less than wall time if CU counters were stopped for + * some time. + * + * pg_hw_util_time_stopped Total time the CU counters were stopped. + * + * pg_hw_util_rate Utilization rate, expressed in operations per second. + * + * pg_hw_util_rate_max Maximum observed value of utilization rate. 
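 *
 * Editorial sketch (not part of this patch): from userland these kstats can
 * be read with libkstat, using the module/instance/name chosen by the
 * kstat_create() call in pghw_kstat_create() below:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "pg", pg_id, "hardware");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "hw_util_rate");
 *		...
 *	}
 *	(void) kstat_close(kc);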
+ */ +struct pghw_cu_kstat { + kstat_named_t pg_id; + kstat_named_t pg_ncpus; + kstat_named_t pg_generation; + kstat_named_t pg_hw_util; + kstat_named_t pg_hw_util_time_running; + kstat_named_t pg_hw_util_time_stopped; + kstat_named_t pg_hw_util_rate; + kstat_named_t pg_hw_util_rate_max; + kstat_named_t pg_cpus; + kstat_named_t pg_sharing; +} pghw_cu_kstat = { + { "id", KSTAT_DATA_UINT32 }, + { "ncpus", KSTAT_DATA_UINT32 }, + { "generation", KSTAT_DATA_UINT32 }, + { "hw_util", KSTAT_DATA_UINT64 }, + { "hw_util_time_running", KSTAT_DATA_UINT64 }, + { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, + { "hw_util_rate", KSTAT_DATA_UINT64 }, + { "hw_util_rate_max", KSTAT_DATA_UINT64 }, + { "cpus", KSTAT_DATA_STRING }, + { "sharing_relation", KSTAT_DATA_STRING }, +}; + +/* + * Calculate the string size to represent NCPUS. Allow 5 digits for each CPU ID + * plus one space per CPU plus NUL byte in the end. This is only an estimate, + * since we try to compress CPU ranges as x-y. In the worst case the string + * representation of CPUs may be truncated. + */ +#define CPUSTR_LEN(ncpus) ((ncpus) * 6) + +/* + * Maximum length of the string that represents list of CPUs + */ +static int pg_cpulist_maxlen = 0; + +static void pghw_kstat_create(pghw_t *); +static int pghw_kstat_update(kstat_t *, int); +static int pghw_cu_kstat_update(kstat_t *, int); +static int cpu2id(void *); + +/* * hwset operations */ static group_t *pghw_set_create(pghw_type_t); static void pghw_set_add(group_t *, pghw_t *); static void pghw_set_remove(group_t *, pghw_t *); +static void pghw_cpulist_alloc(pghw_t *); +static int cpu2id(void *); + /* * Initialize the physical portion of a hardware PG */ @@ -150,6 +231,7 @@ pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw) pghw_set_add(hwset, pg); pg->pghw_hw = hw; + pg->pghw_generation = 0; pg->pghw_instance = pg_plat_hw_instance_id(cp, hw); pghw_kstat_create(pg); @@ -186,8 +268,20 @@ pghw_fini(pghw_t *pg) pg->pghw_instance = (id_t)PGHW_INSTANCE_ANON; pg->pghw_hw = (pghw_type_t)-1; - if (pg->pghw_kstat) + if (pg->pghw_kstat != NULL) kstat_delete(pg->pghw_kstat); + + /* + * Destroy string representation of CPUs + */ + if (pg->pghw_cpulist != NULL) { + kmem_free(pg->pghw_cpulist, + pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + } + + if (pg->pghw_cu_kstat != NULL) + kstat_delete(pg->pghw_cu_kstat); } /* @@ -344,11 +438,10 @@ pghw_set_remove(group_t *hwset, pghw_t *pg) ASSERT(result == 0); } - /* * Return a string name given a pg_hw sharing type */ -static char * +char * pghw_type_string(pghw_type_t hw) { switch (hw) { @@ -374,6 +467,34 @@ pghw_type_string(pghw_type_t hw) } /* + * Return a short string name given a pg_hw sharing type + */ +char * +pghw_type_shortstring(pghw_type_t hw) +{ + switch (hw) { + case PGHW_IPIPE: + return ("instr_pipeline"); + case PGHW_CACHE: + return ("Cache"); + case PGHW_FPU: + return ("FPU"); + case PGHW_MPIPE: + return ("memory_pipeline"); + case PGHW_CHIP: + return ("Socket"); + case PGHW_MEMORY: + return ("Memory"); + case PGHW_POW_ACTIVE: + return ("CPU_PM_Active"); + case PGHW_POW_IDLE: + return ("CPU_PM_Idle"); + default: + return ("unknown"); + } +} + +/* * Create / Update routines for PG hw kstats * * It is the intention of these kstats to provide some level @@ -383,11 +504,14 @@ pghw_type_string(pghw_type_t hw) void pghw_kstat_create(pghw_t *pg) { + char *class = pghw_type_string(pg->pghw_hw); + /* * Create a physical pg kstat */ if ((pg->pghw_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, - "pg", "pg", KSTAT_TYPE_NAMED, + "pg", "pg", + 
KSTAT_TYPE_NAMED, sizeof (pghw_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { /* Class string, hw string, and policy string */ @@ -400,6 +524,28 @@ pghw_kstat_create(pghw_t *pg) pg->pghw_kstat->ks_private = pg; kstat_install(pg->pghw_kstat); } + + if (pg_cpulist_maxlen == 0) + pg_cpulist_maxlen = CPUSTR_LEN(max_ncpus); + + /* + * Create a physical pg kstat + */ + if ((pg->pghw_cu_kstat = kstat_create("pg", ((pg_t *)pg)->pg_id, + "hardware", class, + KSTAT_TYPE_NAMED, + sizeof (pghw_cu_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + pg->pghw_cu_kstat->ks_lock = &pghw_kstat_lock; + pg->pghw_cu_kstat->ks_data = &pghw_cu_kstat; + pg->pghw_cu_kstat->ks_update = pghw_cu_kstat_update; + pg->pghw_cu_kstat->ks_private = pg; + pg->pghw_cu_kstat->ks_data_size += strlen(class) + 1; + /* Allow space for CPU strings */ + pg->pghw_cu_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX; + pg->pghw_cu_kstat->ks_data_size += pg_cpulist_maxlen; + kstat_install(pg->pghw_cu_kstat); + } } int @@ -411,11 +557,147 @@ pghw_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); - pgsp->pg_id.value.ui64 = ((pg_t *)pg)->pg_id; - pgsp->pg_ncpus.value.ui64 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); - pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance; + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + pgsp->pg_instance_id.value.ui32 = pg->pghw_instance; kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name); kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw)); kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg)); return (0); } + +int +pghw_cu_kstat_update(kstat_t *ksp, int rw) +{ + struct pghw_cu_kstat *pgsp = &pghw_cu_kstat; + pghw_t *pg = ksp->ks_private; + pghw_util_t *hw_util = &pg->pghw_stats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + pgsp->pg_id.value.ui32 = ((pg_t *)pg)->pg_id; + pgsp->pg_ncpus.value.ui32 = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + + /* + * Allocate memory for the string representing the list of CPUs in PG. + * This memory should persist past the call to pghw_cu_kstat_update() + * since the kstat snapshot routine will reference this memory. + */ + pghw_cpulist_alloc(pg); + + if (pg->pghw_kstat_gen != pg->pghw_generation) { + /* + * PG kstat generation number is out of sync with PG's + * generation mumber. It means that some CPUs could have joined + * or left PG and it is not possible to compare the numbers + * obtained before and after the generation change. + * + * Reset the maximum utilization rate and start computing it + * from scratch. + */ + hw_util->pghw_util = 0; + hw_util->pghw_rate_max = 0; + pg->pghw_kstat_gen = pg->pghw_generation; + } + + /* + * We can't block on CPU lock because when PG is destroyed (under + * cpu_lock) it tries to delete this kstat and it will wait for us to + * complete which will never happen since we are waiting for cpu_lock to + * drop. Deadlocks are fun! 
+ */ + if (mutex_tryenter(&cpu_lock)) { + if (pg->pghw_cpulist != NULL && + *(pg->pghw_cpulist) == '\0') { + (void) group2intlist(&(((pg_t *)pg)->pg_cpus), + pg->pghw_cpulist, pg->pghw_cpulist_len, cpu2id); + } + cu_pg_update(pg); + mutex_exit(&cpu_lock); + } + + pgsp->pg_generation.value.ui32 = pg->pghw_kstat_gen; + pgsp->pg_hw_util.value.ui64 = hw_util->pghw_util; + pgsp->pg_hw_util_time_running.value.ui64 = hw_util->pghw_time_running; + pgsp->pg_hw_util_time_stopped.value.ui64 = hw_util->pghw_time_stopped; + pgsp->pg_hw_util_rate.value.ui64 = hw_util->pghw_rate; + pgsp->pg_hw_util_rate_max.value.ui64 = hw_util->pghw_rate_max; + if (pg->pghw_cpulist != NULL) + kstat_named_setstr(&pgsp->pg_cpus, pg->pghw_cpulist); + else + kstat_named_setstr(&pgsp->pg_cpus, ""); + + kstat_named_setstr(&pgsp->pg_sharing, pghw_type_string(pg->pghw_hw)); + + return (0); +} + +/* + * Update the string representation of CPUs in PG (pg->pghw_cpulist). + * The string representation is used for kstats. + * + * The string is allocated if it has not already been or if it is already + * allocated and PG has more CPUs now. If PG has smaller or equal number of + * CPUs, but the actual CPUs may have changed, the string is reset to the empty + * string causes the string representation to be recreated. The pghw_generation + * field is used to detect whether CPUs within the pg may have changed. + */ +static void +pghw_cpulist_alloc(pghw_t *pg) +{ + uint_t ncpus = GROUP_SIZE(&((pg_t *)pg)->pg_cpus); + size_t len = CPUSTR_LEN(ncpus); + + /* + * If the pghw_cpulist string is already allocated we need to make sure + * that it has sufficient length. Also if the set of CPUs may have + * changed, we need to re-generate the string. + */ + if (pg->pghw_cpulist != NULL && + pg->pghw_kstat_gen != pg->pghw_generation) { + if (len <= pg->pghw_cpulist_len) { + /* + * There is sufficient space in the pghw_cpulist for + * the new set of CPUs. Just clear the string to trigger + * re-generation of list of CPUs + */ + *(pg->pghw_cpulist) = '\0'; + } else { + /* + * There is, potentially, insufficient space in + * pghw_cpulist, so reallocate the string. + */ + ASSERT(strlen(pg->pghw_cpulist) < pg->pghw_cpulist_len); + kmem_free(pg->pghw_cpulist, pg->pghw_cpulist_len); + pg->pghw_cpulist = NULL; + pg->pghw_cpulist_len = 0; + } + } + + if (pg->pghw_cpulist == NULL) { + /* + * Allocate space to hold cpulist. + * + * Length can not be bigger that the maximum space we have + * allowed for the kstat buffer + */ + if (len > pg_cpulist_maxlen) + len = pg_cpulist_maxlen; + if (len > 0) { + pg->pghw_cpulist = kmem_zalloc(len, KM_NOSLEEP); + if (pg->pghw_cpulist != NULL) + pg->pghw_cpulist_len = len; + } + } +} + +static int +cpu2id(void *v) +{ + cpu_t *cp = (cpu_t *)v; + + ASSERT(v != NULL); + + return (cp->cpu_id); +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 9006be10f4..5133e80e69 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -98,6 +98,7 @@ CHKHDRS= \ byteorder.h \ callb.h \ callo.h \ + cap_util.h \ cpucaps.h \ cpucaps_impl.h \ ccompile.h \ diff --git a/usr/src/uts/common/sys/cap_util.h b/usr/src/uts/common/sys/cap_util.h new file mode 100644 index 0000000000..7e25ba6697 --- /dev/null +++ b/usr/src/uts/common/sys/cap_util.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CAP_UTIL_H +#define _SYS_CAP_UTIL_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/kcpc.h> +#include <sys/cpc_impl.h> +#include <sys/pghw.h> +#include <sys/cmt.h> + +#ifdef _KERNEL + +/* + * Capacity and utilization flags for each CPU + */ +#define CU_CPU_CNTRS_ON 1 /* CPU performance counters are on */ +#define CU_CPU_CNTRS_OFF_ON 2 /* Off -> on transition */ + +/* + * Macro that returns whether CPU performance counters turned on for given CPU + */ +#define CU_CPC_ON(cp) \ + ((cp) != NULL && (cp)->cpu_cu_info != NULL && \ + ((cp)->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) + + +/* + * Per counter statistics + */ +typedef struct cu_cntr_stats { + hrtime_t cs_time_running; /* running total of time counting */ + hrtime_t cs_time_stopped; /* ... time not counting */ + hrtime_t cs_time_start; /* start time of current sample */ + uint64_t cs_value_start; /* starting value for next sample */ + uint64_t cs_value_last; /* last value */ + uint64_t cs_value_total; /* running total */ + uint64_t cs_rate; /* observed rate since last */ + uint64_t cs_rate_max; /* maximum rate */ + kcpc_request_t *cs_cpc_req; /* corresponding CPC request */ + struct cpu *cs_cpu_start; /* CPU where starting value gotten */ +} cu_cntr_stats_t; + + +/* + * Counter info for a PG hardware sharing relationship + */ +typedef struct cu_cntr_info { + cpu_t *ci_cpu; /* CPU being measured */ + pghw_t *ci_pg; /* hardware PG being measured */ + kstat_t *ci_kstat; /* kstats being exported */ + cu_cntr_stats_t *ci_stats; /* counter statistics */ + uint_t ci_nstats; /* number of statistics */ +} cu_cntr_info_t; + + +/* + * Each CPU can have one or more CPC contexts for measuring capacity and + * utilization + * + * One CPC context is needed per CPU if the counter events needed to measure + * capacity and utilization on each CPU can be programmed onto all the counters + * on a CPU at the same time and there are fewer or same number of desired + * counter events as counters on each CPU. Otherwise, the desired counter + * events are assigned across multiple CPC contexts, so the contexts and their + * counter events can be multiplexed onto the counters over time to get the + * data for all of the counter events. 
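The context comment above boils down to simple bookkeeping: if more counter events are wanted than there are hardware counters on a CPU, ceil(nevents / ncounters) CPC contexts are needed, and a current index rotates through them so each context gets its turn on the hardware. A small self-contained sketch of that arithmetic is shown below; the helper names are illustrative, not from the source.

#include <stdio.h>

static int
contexts_needed(int nevents, int ncounters)
{
	return ((nevents + ncounters - 1) / ncounters);	/* ceiling */
}

static int
next_context(int cur_index, int nctx)
{
	return ((cur_index + 1) % nctx);	/* rotate to the next set */
}

int
main(void)
{
	int nctx = contexts_needed(5, 2);	/* 5 events, 2 counters */
	int idx = 0;

	(void) printf("%d contexts\n", nctx);	/* prints 3 */
	idx = next_context(idx, nctx);
	(void) printf("next %d\n", idx);	/* prints 1 */
	return (0);
}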
+ */ +typedef struct cu_cpc_ctx { + int cur_index; /* index for current context */ + int nctx; /* number of CPC contexts */ + kcpc_ctx_t **ctx_ptr_array; /* array of context pointers */ + size_t ctx_ptr_array_sz; /* size of array */ +} cu_cpc_ctx_t; + +/* + * Per CPU capacity and utilization info + */ +typedef struct cu_cpu_info { + struct cpu *cu_cpu; /* CPU for the statistics */ + uint_t cu_flag; /* capacity & utilization flag */ + hrtime_t cu_sample_time; /* when last sample taken */ + cu_cpc_ctx_t cu_cpc_ctx; /* performance counter contexts */ + cu_cntr_stats_t *cu_cntr_stats; /* counter statistics array */ + uint_t cu_ncntr_stats; /* number of counter statistics */ + uint_t cu_disabled; /* count of disable requests */ + /* + * Per PG hardware sharing relationship counter info + */ + cu_cntr_info_t *cu_cntr_info[PGHW_NUM_COMPONENTS]; +} cu_cpu_info_t; + +/* + * COMMON INTERFACE ROUTINES + */ + +/* + * Setup capacity and utilization support + */ +extern void cu_init(void); + +/* + * Tear down capacity and utilization support + */ +extern int cu_fini(void); + +/* + * Program CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_program(struct cpu *, int *); + +/* + * Unprogram CPC for capacity and utilization on given CPU + */ +extern void cu_cpc_unprogram(struct cpu *, int *); + +/* + * Update counter statistics on a given CPU + */ +extern int cu_cpu_update(struct cpu *, boolean_t); + +/* + * Update utilization and capacity data for CMT PG + */ +extern void cu_pg_update(pghw_t *); + +/* + * Disable or enable capacity and utilization on all CPUs + */ +extern void cu_disable(void); +extern void cu_enable(void); + +/* + * PLATFORM SPECIFIC INTERFACE ROUTINES + */ +extern int cu_plat_cpc_init(cpu_t *, kcpc_request_list_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CAP_UTIL_H */ diff --git a/usr/src/uts/common/sys/cmt.h b/usr/src/uts/common/sys/cmt.h index 4e7ed28656..afdb6730a6 100644 --- a/usr/src/uts/common/sys/cmt.h +++ b/usr/src/uts/common/sys/cmt.h @@ -63,6 +63,7 @@ typedef struct pg_cmt { int cmt_nchildren; /* # of children CMT PGs */ struct group cmt_cpus_actv; struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */ + kstat_t *cmt_kstat; /* cmt kstats exported */ } pg_cmt_t; /* diff --git a/usr/src/uts/common/sys/cpc_impl.h b/usr/src/uts/common/sys/cpc_impl.h index 1b57c76c10..ae89c90508 100644 --- a/usr/src/uts/common/sys/cpc_impl.h +++ b/usr/src/uts/common/sys/cpc_impl.h @@ -131,7 +131,7 @@ typedef struct _kcpc_ctx kcpc_ctx_t; struct _kcpc_ctx { struct _kcpc_set *kc_set; /* linked list of all bound sets */ - uint32_t kc_flags; + volatile uint_t kc_flags; kcpc_pic_t *kc_pics; /* pointer to array of per-pic data */ hrtime_t kc_hrtime; /* gethrtime() at last sample */ uint64_t kc_vtick; /* virtualized %tick */ @@ -214,20 +214,18 @@ extern hrtime_t tsc_read(void); struct cpu; extern uint_t cpc_ncounters; -extern kmutex_t kcpc_ctx_llock[]; /* protects ctx_list */ -extern kcpc_ctx_t *kcpc_ctx_list[]; /* head of list */ extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ extern void kcpc_invalidate_all(void); extern void kcpc_passivate(void); -extern void kcpc_remote_stop(struct cpu *cp); +extern void kcpc_cpu_stop(struct cpu *, boolean_t); extern int kcpc_pcbe_tryload(const char *, uint_t, uint_t, uint_t); -extern void kcpc_remote_program(struct cpu *cp); +extern void kcpc_cpu_program(struct cpu *, kcpc_ctx_t *); extern void kcpc_register_dcpc(void 
(*func)(uint64_t)); extern void kcpc_unregister_dcpc(void); -extern kcpc_ctx_t *kcpc_ctx_alloc(void); +extern kcpc_ctx_t *kcpc_ctx_alloc(int); extern int kcpc_assign_reqs(struct _kcpc_set *, kcpc_ctx_t *); extern void kcpc_ctx_free(kcpc_ctx_t *); extern int kcpc_configure_reqs(kcpc_ctx_t *, struct _kcpc_set *, int *); diff --git a/usr/src/uts/common/sys/cpc_pcbe.h b/usr/src/uts/common/sys/cpc_pcbe.h index 7522a9bf82..eb168fcf2c 100644 --- a/usr/src/uts/common/sys/cpc_pcbe.h +++ b/usr/src/uts/common/sys/cpc_pcbe.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +36,6 @@ #ifndef _SYS_CPC_PCBE_H #define _SYS_CPC_PCBE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/cpc_impl.h> @@ -51,6 +48,8 @@ extern "C" { */ #define PCBE_VER_1 1 +#define PCBE_IMPL_NAME_P4HT "Pentium 4 with HyperThreading" + typedef struct __pcbe_ops { uint_t pcbe_ver; uint_t pcbe_caps; diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index aece259a35..b52192b419 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -222,6 +222,16 @@ typedef struct cpu { uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can @@ -597,6 +607,13 @@ extern struct cpu *curcpup(void); #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. + */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + #endif /* _KERNEL || _KMEMUSER */ /* @@ -726,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ +/* + * CPU state change events + * + * Various subsystems need to know when CPUs change their state. They get this + * information by registering CPU state change callbacks using + * register_cpu_setup_func(). Whenever any CPU changes its state, the callback + * function is called. The callback function is passed three arguments: + * + * Event, described by cpu_setup_t + * CPU ID + * Transparent pointer passed when registering the callback + * + * The callback function is called with cpu_lock held. The return value from the + * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG + * events. 
For these two events, a non-zero return value indicates a failure and + * prevents successful completion of the operation. + * + * New events may be added in the future. Callback functions should ignore any + * events that they do not understand. + * + * The following events provide notification callbacks: + * + * CPU_INIT A new CPU is started and added to the list of active CPUs + * This event is only used during boot + * + * CPU_CONFIG A newly inserted CPU is prepared to start running code + * This event is called by DR code + * + * CPU_UNCONFIG CPU has been powered off and needs cleanup + * This event is called by DR code + * + * CPU_ON CPU is enabled but does not run anything yet + * + * CPU_INTR_ON CPU is enabled and has interrupts enabled + * + * CPU_OFF CPU is going offline but can still run threads + * + * CPU_CPUPART_OUT CPU is going to move out of its partition + * + * CPU_CPUPART_IN CPU is going to move to a new partition + * + * CPU_SETUP CPU is set up during boot and can run threads + */ typedef enum { CPU_INIT, CPU_CONFIG, @@ -734,7 +794,8 @@ typedef enum { CPU_OFF, CPU_CPUPART_IN, CPU_CPUPART_OUT, - CPU_SETUP + CPU_SETUP, + CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); @@ -748,6 +809,13 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); /* + * Call specified function on the given CPU + */ +typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); +extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); + + +/* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. */ diff --git a/usr/src/uts/common/sys/group.h b/usr/src/uts/common/sys/group.h index bb5613bc35..2db1ac01bb 100644 --- a/usr/src/uts/common/sys/group.h +++ b/usr/src/uts/common/sys/group.h @@ -101,6 +101,17 @@ void group_remove_at(group_t *, uint_t); */ uint_t group_find(group_t *, void *); +/* + * Convert a group to a string with a list of integers. + * + * The consecutive integer values are represented using x-y notation. + * The resulting string looks like "1,2-5,8" + * + * The convert argument is used to map group elements to integer IDs. + * The output buffer and its length are specified in the arguments. + */ +extern char *group2intlist(group_t *, char *, size_t, int (convert)(void*)); + #endif /* !_KERNEL && !_KMEMUSER */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/kcpc.h b/usr/src/uts/common/sys/kcpc.h index f30e093f78..d90b1c1d29 100644 --- a/usr/src/uts/common/sys/kcpc.h +++ b/usr/src/uts/common/sys/kcpc.h @@ -28,11 +28,13 @@ #include <sys/cpc_impl.h> #include <sys/ksynch.h> +#include <sys/types.h> #ifdef __cplusplus extern "C" { #endif + /* * Kernel clients need this file in order to know what a request is and how to * program one. */ @@ -74,8 +76,33 @@ struct _kcpc_request { uint_t kr_flags; uint_t kr_nattrs; kcpc_attr_t *kr_attr; + void *kr_ptr; /* Ptr assigned by requester */ }; +typedef struct _kcpc_request_list { + kcpc_request_t *krl_list; /* counter event requests */ + int krl_cnt; /* how many requests */ + int krl_max; /* max request entries */ +} kcpc_request_list_t; + +/* + * Type of update function to be called when reading counters on current CPU in + * kcpc_read() + */ +typedef int (*kcpc_update_func_t)(void *, uint64_t); + +/* + * Type of read function to be called when reading counters on current CPU + * (ie.
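The callback contract spelled out above is easy to get wrong, so a compact model helps: a callback must tolerate events it does not recognize, and only its return value for the CONFIG/UNCONFIG pair can veto anything. The sketch below is a stand-alone user-space model of that contract, not kernel code; the enum values and names are illustrative.

#include <stdio.h>

typedef enum { EV_CONFIG, EV_UNCONFIG, EV_ON, EV_OFF, EV_FUTURE } ev_t;

/*
 * Model of a cpu_setup_func_t-style callback: handle the events we know,
 * silently ignore the rest, and only let CONFIG/UNCONFIG fail the operation.
 */
static int
example_cpu_setup_cb(ev_t what, int id, void *arg)
{
	(void) arg;
	switch (what) {
	case EV_CONFIG:
		(void) printf("cpu %d: allocate per-CPU state\n", id);
		return (0);	/* non-zero here would fail the DR operation */
	case EV_UNCONFIG:
		(void) printf("cpu %d: free per-CPU state\n", id);
		return (0);
	case EV_ON:
	case EV_OFF:
		(void) printf("cpu %d: adjust accounting\n", id);
		return (0);
	default:
		return (0);	/* ignore events we do not understand */
	}
}

int
main(void)
{
	(void) example_cpu_setup_cb(EV_FUTURE, 0, NULL);	/* harmless */
	return (0);
}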
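group2intlist(), declared above, is what produces the compact CPU list string ("0,2-5,8" style) stored in pghw_cpulist. A self-contained approximation of that formatting over a sorted array of IDs is sketched below; it models only the documented output format, not the kernel's group walking, and the helper name is the editor's own.

#include <stdio.h>
#include <string.h>

/*
 * Sketch: format sorted IDs as "x,y-z"-style intervals, approximating what
 * group2intlist() produces for the CPU list kstat.
 */
static void
ids_to_intlist(const int *ids, int n, char *buf, size_t len)
{
	size_t off = 0;
	int i = 0;

	buf[0] = '\0';
	while (i < n && off < len) {
		int start = ids[i];
		int end = start;

		/* Extend the run while IDs stay consecutive */
		while (i + 1 < n && ids[i + 1] == end + 1)
			end = ids[++i];
		if (start == end)
			off += snprintf(buf + off, len - off, "%s%d",
			    off ? "," : "", start);
		else
			off += snprintf(buf + off, len - off, "%s%d-%d",
			    off ? "," : "", start, end);
		i++;
	}
}

int
main(void)
{
	int cpus[] = { 0, 2, 3, 4, 5, 8 };
	char buf[64];

	ids_to_intlist(cpus, 6, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints 0,2-5,8 */
	return (0);
}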
should be same type signature as kcpc_read()) + */ +typedef int (*kcpc_read_func_t)(kcpc_update_func_t); + + +/* + * Initialize the kcpc framework + */ +extern int kcpc_init(void); + /* * Bind the set to the indicated thread. * Returns 0 on success, or an errno in case of error. If EINVAL is returned, @@ -96,6 +123,56 @@ extern int kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick); /* + * Create CPC context containing specified list of requested counter events + */ +extern int kcpc_cpu_ctx_create(struct cpu *cp, kcpc_request_list_t *req_list, + int kmem_flags, kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz); + +/* + * Returns whether specified counter event is supported + */ +extern boolean_t kcpc_event_supported(char *event); + +/* + * Initialize list of CPC event requests + */ +extern kcpc_request_list_t *kcpc_reqs_init(int nreqs, int kmem_flags); + +/* + * Add counter event request to given list of counter event requests + */ +extern int kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, + uint64_t preset, uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, + int kmem_flags); + +/* + * Reset list of CPC event requests so its space can be used for another set + * of requests + */ +extern int kcpc_reqs_reset(kcpc_request_list_t *req_list); + +/* + * Free given list of counter event requests + */ +extern int kcpc_reqs_fini(kcpc_request_list_t *req_list); + +/* + * Read CPC data for given event on current CPU + */ +extern int kcpc_read(kcpc_update_func_t); + +/* + * Program current CPU with given CPC context + */ +extern void kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, + boolean_t cu_interpose); + +/* + * Unprogram CPC counters on current CPU + */ +extern void kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose); + +/* * Unbind a request and release the associated resources. */ extern int kcpc_unbind(kcpc_set_t *set); @@ -128,6 +205,8 @@ extern void kcpc_idle_restore(struct cpu *cp); extern krwlock_t kcpc_cpuctx_lock; /* lock for 'kcpc_cpuctx' below */ extern int kcpc_cpuctx; /* number of cpu-specific contexts */ +extern void kcpc_free(kcpc_ctx_t *ctx, int isexec); + /* * 'dtrace_cpc_in_use' contains the number of currently active cpc provider * based enablings. See the block comment in uts/common/os/dtrace_subr.c for diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h index ab8b0a9bbe..f0550dba7e 100644 --- a/usr/src/uts/common/sys/pghw.h +++ b/usr/src/uts/common/sys/pghw.h @@ -89,6 +89,27 @@ typedef enum pghw_type { typedef uintptr_t pghw_handle_t; /* + * Representation of PG hardware utilization NOTE: All the sums listed below are + * the sums of running total of each item for each CPU in the PG (eg. 
sum(utilization) is the sum of the running total utilization of each CPU in the PG) + */ +typedef struct pghw_util { + uint64_t pghw_util; /* sum(utilization) */ + uint64_t pghw_rate; /* Last observed utilization rate */ + uint64_t pghw_rate_max; /* Max observed rate (in units/sec) */ + hrtime_t pghw_time_stamp; /* Timestamp of last snapshot */ + /* + * sum(time utilization counters on) + */ + hrtime_t pghw_time_running; + /* + * sum(time utilization counters off) + */ + hrtime_t pghw_time_stopped; +} pghw_util_t; + + +/* * Processor Group (physical sharing relationship) */ typedef struct pghw { @@ -97,6 +118,23 @@ typedef struct pghw { id_t pghw_instance; /* sharing instance identifier */ pghw_handle_t pghw_handle; /* hw specific opaque handle */ kstat_t *pghw_kstat; /* physical kstats exported */ + kstat_t *pghw_cu_kstat; /* for capacity and utilization */ + /* + * pghw_generation should be updated by superclasses whenever PG changes + * significantly (e.g. new CPUs join or leave PG). + */ + uint_t pghw_generation; /* generation number */ + + /* + * The following fields are used by PGHW cu kstats + */ + char *pghw_cpulist; /* list of CPUs */ + size_t pghw_cpulist_len; /* length of the list */ + /* + * Generation number at kstat update time + */ + uint_t pghw_kstat_gen; + pghw_util_t pghw_stats; /* Utilization data */ } pghw_t; /* @@ -111,32 +149,35 @@ typedef struct cpu_physid { /* * Physical PG initialization / CPU service hooks */ -void pghw_init(pghw_t *, cpu_t *, pghw_type_t); -void pghw_fini(pghw_t *); -void pghw_cpu_add(pghw_t *, cpu_t *); -pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); +extern void pghw_init(pghw_t *, cpu_t *, pghw_type_t); +extern void pghw_fini(pghw_t *); +extern void pghw_cpu_add(pghw_t *, cpu_t *); +extern pghw_t *pghw_place_cpu(cpu_t *, pghw_type_t); /* * Physical ID cache creation / destruction */ -void pghw_physid_create(cpu_t *); -void pghw_physid_destroy(cpu_t *); +extern void pghw_physid_create(cpu_t *); +extern void pghw_physid_destroy(cpu_t *); /* * CPU / PG hardware related search operations */ -pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); -pghw_t *pghw_find_by_instance(id_t, pghw_type_t); -group_t *pghw_set_lookup(pghw_type_t); - -void pghw_kstat_create(pghw_t *); -int pghw_kstat_update(kstat_t *, int); +extern pghw_t *pghw_find_pg(cpu_t *, pghw_type_t); +extern pghw_t *pghw_find_by_instance(id_t, pghw_type_t); +extern group_t *pghw_set_lookup(pghw_type_t); /* Hardware sharing relationship platform interfaces */ -int pg_plat_hw_shared(cpu_t *, pghw_type_t); -int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); -id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); -pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); +extern int pg_plat_hw_shared(cpu_t *, pghw_type_t); +extern int pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); +extern id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); +extern pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); + +/* + * String representation of the hardware type + */ +extern char *pghw_type_string(pghw_type_t); +extern char *pghw_type_shortstring(pghw_type_t); /* * What comprises a "core" may vary across processor implementations, @@ -144,7 +185,7 @@ pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); * is no PGHW_CORE type, but we provide an interface here to allow platforms * to express cpu <=> core mappings.
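pghw_util_t above exports running totals plus a derived rate and a high-water mark. The exact update code is not in the hunks shown, but the intended arithmetic (units accumulated divided by elapsed time, with the maximum retained) can be sketched as follows; the type and helper names here are assumptions, not the kernel's.

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC	1000000000LL

/*
 * Sketch: derive an observed rate (units/sec) from two snapshots of a
 * running total, and track the maximum rate seen so far.
 */
typedef struct util_snap {
	uint64_t total;		/* running total, e.g. sum(utilization) */
	int64_t timestamp;	/* nanoseconds, like hrtime_t */
} util_snap_t;

static uint64_t
observed_rate(const util_snap_t *prev, const util_snap_t *now, uint64_t *max)
{
	int64_t elapsed = now->timestamp - prev->timestamp;
	uint64_t rate;

	if (elapsed <= 0)
		return (0);
	rate = (now->total - prev->total) * NANOSEC / elapsed;
	if (rate > *max)
		*max = rate;
	return (rate);
}

int
main(void)
{
	util_snap_t a = { 1000, 0 };
	util_snap_t b = { 6000, 2 * NANOSEC };	/* 5000 units in 2 seconds */
	uint64_t max = 0;

	(void) printf("%llu units/sec\n",
	    (unsigned long long)observed_rate(&a, &b, &max));	/* 2500 */
	return (0);
}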
*/ -id_t pg_plat_get_core_id(cpu_t *); +extern id_t pg_plat_get_core_id(cpu_t *); #endif /* !_KERNEL && !_KMEMUSER */ diff --git a/usr/src/uts/common/sys/systm.h b/usr/src/uts/common/sys/systm.h index 84ccfb9991..4c3dc7f886 100644 --- a/usr/src/uts/common/sys/systm.h +++ b/usr/src/uts/common/sys/systm.h @@ -270,6 +270,7 @@ int spl8(void); void splx(int); void set_base_spl(void); int __ipltospl(int); +int spl_xcall(void); void softcall_init(void); void softcall(void (*)(void *), void *); diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 8e532685c7..8621e3ef55 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -2669,6 +2669,13 @@ cpuid_get_clogid(cpu_t *cpu) return (cpu->cpu_m.mcpu_cpi->cpi_clogid); } +int +cpuid_get_cacheid(cpu_t *cpu) +{ + ASSERT(cpuid_checkpass(cpu, 1)); + return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); +} + uint_t cpuid_get_procnodeid(cpu_t *cpu) { diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index 786cd29e8f..fc0ef9e260 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -1179,6 +1179,12 @@ getpil(void) } int +spl_xcall(void) +{ + return (splr(ipltospl(XCALL_PIL))); +} + +int interrupts_enabled(void) { ulong_t flag; diff --git a/usr/src/uts/i86pc/os/mp_call.c b/usr/src/uts/i86pc/os/mp_call.c index 5725b18d85..df18f16588 100644 --- a/usr/src/uts/i86pc/os/mp_call.c +++ b/usr/src/uts/i86pc/os/mp_call.c @@ -32,6 +32,8 @@ #include <sys/systm.h> #include <sys/promif.h> #include <sys/xc_levels.h> +#include <sys/spl.h> +#include <sys/bitmap.h> /* * Interrupt another CPU. @@ -54,3 +56,38 @@ poke_cpu(int cpun) */ send_dirint(cpun, XC_CPUPOKE_PIL); } + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + cpuset_t set; + + if (panicstr) + return; + + /* + * Prevent CPU from going off-line + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. 
+ */ + if (CPU == cp) { + int save_spl = splr(ipltospl(XC_HI_PIL)); + + (*func)(arg1, arg2); + splx(save_spl); + } else { + CPUSET_ONLY(set, cp->cpu_id); + xc_call((xc_arg_t)arg1, (xc_arg_t)arg2, 0, CPUSET2BV(set), + (xc_func_t)func); + } + kpreempt_enable(); +} diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 7470a1ef38..80e371850b 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -1,3 +1,4 @@ + /* * CDDL HEADER START * @@ -61,6 +62,7 @@ #include <sys/hpet.h> #include <sys/sunddi.h> #include <sys/sunndi.h> +#include <sys/cpc_pcbe.h> #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) @@ -1680,3 +1682,37 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU as needed + * + * May return 0 when platform or processor specific code knows that no CPC + * events should be programmed on this CPU or -1 when platform or processor + * specific code doesn't know which counter events are best to use and common + * code should decide for itself + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + const char *impl_name; + + /* + * Return error if pcbe_ops not set + */ + if (pcbe_ops == NULL) + return (-1); + + /* + * Return that no CPC events should be programmed on hyperthreaded + * Pentium 4 and return error for all other x86 processors to tell + * common code to decide what counter events to program on those CPUs + * for measuring hardware capacity and utilization + */ + impl_name = pcbe_ops->pcbe_impl_name(); + if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0) + return (0); + else + return (-1); +} diff --git a/usr/src/uts/i86pc/sys/xc_levels.h b/usr/src/uts/i86pc/sys/xc_levels.h index 31ba6441fa..0492e48a1d 100644 --- a/usr/src/uts/i86pc/sys/xc_levels.h +++ b/usr/src/uts/i86pc/sys/xc_levels.h @@ -35,6 +35,7 @@ extern "C" { #define XC_CPUPOKE_PIL 11 /* poke to cause wakeup, no service function */ #define XC_SYS_PIL 13 /* should be defined elsewhere */ #define XC_HI_PIL 15 /* cross call with service function */ +#define XCALL_PIL XC_HI_PIL /* alias for XC_HI_PIL */ #ifdef __cplusplus } diff --git a/usr/src/uts/intel/genunix/Makefile b/usr/src/uts/intel/genunix/Makefile index db7b60ff14..ab0073268f 100644 --- a/usr/src/uts/intel/genunix/Makefile +++ b/usr/src/uts/intel/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -84,6 +84,8 @@ $(PATCH_BUILD)IPCTF_TARGET = CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +CPPFLAGS += -I$(UTSBASE)/i86pc + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/intel/ia32/os/cpc_subr.c b/usr/src/uts/intel/ia32/os/cpc_subr.c index 1a71c1c431..1e3049a399 100644 --- a/usr/src/uts/intel/ia32/os/cpc_subr.c +++ b/usr/src/uts/intel/ia32/os/cpc_subr.c @@ -188,33 +188,6 @@ kcpc_hw_load_pcbe(void) cpuid_getmodel(CPU), cpuid_getstep(CPU))); } -static int -kcpc_remotestop_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); - - return (0); -} - -/* - * Ensure the counters are stopped on the given processor. 
- * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remotestop_func); -} - /* * Called by the generic framework to check if it's OK to bind a set to a CPU. */ @@ -292,28 +265,3 @@ kcpc_hw_lwp_hook(void) mutex_exit(&cpu_lock); return (0); } - -static int -kcpc_remoteprogram_func(void) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); - - return (0); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - cpuset_t set; - - CPUSET_ZERO(set); - - CPUSET_ADD(set, cp->cpu_id); - - xc_sync(0, 0, 0, CPUSET2BV(set), (xc_func_t)kcpc_remoteprogram_func); -} diff --git a/usr/src/uts/intel/pcbe/opteron_pcbe.c b/usr/src/uts/intel/pcbe/opteron_pcbe.c index 18a309eca6..cb97d21b78 100644 --- a/usr/src/uts/intel/pcbe/opteron_pcbe.c +++ b/usr/src/uts/intel/pcbe/opteron_pcbe.c @@ -563,26 +563,6 @@ opt_pcbe_list_attrs(void) return ("edge,pc,inv,cmask,umask"); } -/*ARGSUSED*/ -static uint64_t -opt_pcbe_event_coverage(char *event) -{ - /* - * Fortunately, all counters can count all events. - */ - return (0xF); -} - -static uint64_t -opt_pcbe_overflow_bitmap(void) -{ - /* - * Unfortunately, this chip cannot detect which counter overflowed, so - * we must act as if they all did. - */ - return (0xF); -} - static amd_generic_event_t * find_generic_event(char *name) { @@ -608,6 +588,32 @@ find_event(char *name) } /*ARGSUSED*/ +static uint64_t +opt_pcbe_event_coverage(char *event) +{ + /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* + * Fortunately, all counters can count all events. + */ + return (0xF); +} + +static uint64_t +opt_pcbe_overflow_bitmap(void) +{ + /* + * Unfortunately, this chip cannot detect which counter overflowed, so + * we must act as if they all did. 
+ */ + return (0xF); +} + +/*ARGSUSED*/ static int opt_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token) diff --git a/usr/src/uts/intel/pcbe/p4_pcbe.c b/usr/src/uts/intel/pcbe/p4_pcbe.c index 0fffcd2961..8c05c599a3 100644 --- a/usr/src/uts/intel/pcbe/p4_pcbe.c +++ b/usr/src/uts/intel/pcbe/p4_pcbe.c @@ -522,7 +522,7 @@ static const char * p4_pcbe_impl_name(void) { if (p4_htt) - return ("Pentium 4 with HyperThreading"); + return (PCBE_IMPL_NAME_P4HT); return ("Pentium 4"); } diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index e5f1cababc..0bb28d4d49 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -626,6 +626,7 @@ extern int cpuid_get_chipid(struct cpu *); extern id_t cpuid_get_coreid(struct cpu *); extern int cpuid_get_pkgcoreid(struct cpu *); extern int cpuid_get_clogid(struct cpu *); +extern int cpuid_get_cacheid(struct cpu *); extern uint32_t cpuid_get_apicid(struct cpu *); extern uint_t cpuid_get_procnodeid(struct cpu *cpu); extern uint_t cpuid_get_procnodes_per_pkg(struct cpu *cpu); diff --git a/usr/src/uts/sun4/os/mp_call.c b/usr/src/uts/sun4/os/mp_call.c index f881a23755..f7ee31a276 100644 --- a/usr/src/uts/sun4/os/mp_call.c +++ b/usr/src/uts/sun4/os/mp_call.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Facilities for cross-processor subroutine calls using "mailbox" interrupts. */ @@ -37,6 +34,7 @@ #include <sys/systm.h> #include <sys/machsystm.h> #include <sys/intr.h> +#include <sys/xc_impl.h> /* * Interrupt another CPU. @@ -64,3 +62,40 @@ poke_cpu(int cpun) xt_one(cpun, setsoftint_tl1, poke_cpu_inum, 0); } + +extern int xc_spl_enter[]; + +/* + * Call a function on a target CPU + */ +void +cpu_call(cpu_t *cp, cpu_call_func_t func, uintptr_t arg1, uintptr_t arg2) +{ + if (panicstr) + return; + + /* + * Prevent CPU from going offline + */ + kpreempt_disable(); + + /* + * If we are on the target CPU, call the function directly, but raise + * the PIL to XC_PIL. + * This guarantees that functions called via cpu_call() can not ever + * interrupt each other. + */ + if (CPU != cp) { + xc_one(cp->cpu_id, (xcfunc_t *)func, (uint64_t)arg1, + (uint64_t)arg2); + } else { + int lcx; + int opl; + + XC_SPL_ENTER(lcx, opl); + func(arg1, arg2); + XC_SPL_EXIT(lcx, opl); + } + + kpreempt_enable(); +} diff --git a/usr/src/uts/sun4/os/x_call.c b/usr/src/uts/sun4/os/x_call.c index 0c5c06c36a..521f740c82 100644 --- a/usr/src/uts/sun4/os/x_call.c +++ b/usr/src/uts/sun4/os/x_call.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/archsystm.h> #include <sys/machsystm.h> @@ -226,6 +224,15 @@ xc_init(void) */ /* + * spl_xcall - set PIL to xcall level + */ +int +spl_xcall(void) +{ + return (splr(XCALL_PIL)); +} + +/* * xt_one - send a "x-trap" to a cpu */ void diff --git a/usr/src/uts/sun4u/genunix/Makefile b/usr/src/uts/sun4u/genunix/Makefile index 8d7c87f065..1a77e4c916 100644 --- a/usr/src/uts/sun4u/genunix/Makefile +++ b/usr/src/uts/sun4u/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -96,6 +96,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c index 8ba9aa3b6e..8a0fa0e6dc 100644 --- a/usr/src/uts/sun4u/os/cmp.c +++ b/usr/src/uts/sun4u/os/cmp.c @@ -303,3 +303,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4u/os/cpc_subr.c b/usr/src/uts/sun4u/os/cpc_subr.c index a9c64681fd..cfe1fd283d 100644 --- a/usr/src/uts/sun4u/os/cpc_subr.c +++ b/usr/src/uts/sun4u/os/cpc_subr.c @@ -45,6 +45,7 @@ #include <sys/cpc_pcbe.h> #include <sys/modctl.h> #include <sys/sdt.h> +#include <sys/kcpc.h> uint64_t cpc_level15_inum; /* used in interrupt.s */ int cpc_has_overflow_intr; /* set in cheetah.c */ @@ -111,26 +112,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -142,21 +123,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/genunix/Makefile b/usr/src/uts/sun4v/genunix/Makefile index e629630fb5..28d4f2aeeb 100644 --- a/usr/src/uts/sun4v/genunix/Makefile +++ b/usr/src/uts/sun4v/genunix/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # @@ -104,6 +104,8 @@ CFLAGS += $(CCVERBOSE) CPPFLAGS += -I$(SRC)/common CPPFLAGS += -I$(SRC)/uts/common/fs/zfs +INC_PATH += -I$(UTSBASE)/sun4 + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c index 4e80f06f32..8eedd1a69d 100644 --- a/usr/src/uts/sun4v/os/cmp.c +++ b/usr/src/uts/sun4v/os/cmp.c @@ -208,3 +208,19 @@ pg_cmt_affinity_hw(pghw_type_t hw) else return (0); } + +/* + * Return number of counter events requested to measure hardware capacity and + * utilization and setup CPC requests for specified CPU if list where to add + * CPC requests is given + */ +int +/* LINTED E_FUNC_ARG_UNUSED */ +cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) +{ + /* + * Return error to tell common code to decide what counter events to + * program on this CPU for measuring hardware capacity and utilization + */ + return (-1); +} diff --git a/usr/src/uts/sun4v/os/cpc_subr.c b/usr/src/uts/sun4v/os/cpc_subr.c index 8e58d85513..089c582541 100644 --- a/usr/src/uts/sun4v/os/cpc_subr.c +++ b/usr/src/uts/sun4v/os/cpc_subr.c @@ -130,26 +130,6 @@ kcpc_hw_load_pcbe(void) } /*ARGSUSED*/ -static void -kcpc_remotestop_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - pcbe_ops->pcbe_allstop(); - atomic_or_uint(&CPU->cpu_cpc_ctx->kc_flags, KCPC_CTX_INVALID_STOPPED); -} - -/* - * Ensure the counters are stopped on the given processor. - * - * Callers must ensure kernel preemption is disabled. - */ -void -kcpc_remote_stop(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remotestop_func, 0, 0); -} - -/*ARGSUSED*/ int kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) { @@ -161,21 +141,3 @@ kcpc_hw_lwp_hook(void) { return (0); } - -/*ARGSUSED*/ -static void -kcpc_remoteprogram_func(uint64_t arg1, uint64_t arg2) -{ - ASSERT(CPU->cpu_cpc_ctx != NULL); - - pcbe_ops->pcbe_program(CPU->cpu_cpc_ctx); -} - -/* - * Ensure counters are enabled on the given processor. - */ -void -kcpc_remote_program(cpu_t *cp) -{ - xc_one(cp->cpu_id, kcpc_remoteprogram_func, 0, 0); -} diff --git a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c index 13c428130e..d4b69e5de4 100644 --- a/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c +++ b/usr/src/uts/sun4v/pcbe/niagara2_pcbe.c @@ -399,6 +399,12 @@ static uint64_t ni2_pcbe_event_coverage(char *event) { /* + * Check whether counter event is supported + */ + if (find_event(event) == NULL && find_generic_event(event) == NULL) + return (0); + + /* * Fortunately, both pic0 and pic1 can count all events. */ return (0x3); |
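Both PCBE changes above now return 0 from pcbe_event_coverage() for an unknown event instead of pretending every counter can count it. The return value is a bitmap with one bit per hardware counter that can count the event (0xF covers the four Opteron counters, 0x3 the two Niagara-2 PICs), so a caller can reject an unsupported event or count usable counters. A small sketch of that interpretation follows; the helper is illustrative only.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch: interpret a pcbe_event_coverage()-style bitmap, where bit i set
 * means hardware counter i can count the event and a value of 0 means the
 * event is not supported at all.
 */
static int
usable_counters(uint64_t coverage)
{
	int n = 0;

	while (coverage != 0) {
		n += (int)(coverage & 1);
		coverage >>= 1;
	}
	return (n);
}

int
main(void)
{
	(void) printf("%d\n", usable_counters(0xF));	/* 4: Opteron PCBE */
	(void) printf("%d\n", usable_counters(0x3));	/* 2: Niagara-2 PCBE */
	(void) printf("%d\n", usable_counters(0));	/* 0: unsupported event */
	return (0);
}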