Diffstat (limited to 'usr/src/uts/common/disp/cmt.c')
-rw-r--r-- | usr/src/uts/common/disp/cmt.c | 804 |
1 files changed, 804 insertions, 0 deletions
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
new file mode 100644
index 0000000000..1bf0704346
--- /dev/null
+++ b/usr/src/uts/common/disp/cmt.c
@@ -0,0 +1,804 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupart.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/kstat.h>
+#include <sys/processor.h>
+#include <sys/disp.h>
+#include <sys/group.h>
+#include <sys/pghw.h>
+#include <sys/bitset.h>
+#include <sys/lgrp.h>
+#include <sys/cmt.h>
+
+/*
+ * CMT scheduler / dispatcher support
+ *
+ * This file implements CMT scheduler support using Processor Groups.
+ * The CMT processor group class creates and maintains the CMT class
+ * specific processor group pg_cmt_t.
+ *
+ * ---------------------------- <-- pg_cmt_t *
+ * | pghw_t                   |
+ * ----------------------------
+ * | CMT class specific data  |
+ * | - hierarchy linkage      |
+ * | - CMT load balancing data|
+ * | - active CPU group/bitset|
+ * ----------------------------
+ *
+ * The scheduler/dispatcher leverages knowledge of the performance
+ * relevant CMT sharing relationships existing between CPUs to implement
+ * optimized affinity and load balancing policies.
+ *
+ * Load balancing policy seeks to improve performance by minimizing
+ * contention over shared processor resources / facilities, while the
+ * affinity policies seek to improve cache and TLB utilization.
+ *
+ * The CMT PGs created by this class are already arranged into a
+ * hierarchy (which is done in the pghw layer). To implement the top-down
+ * CMT load balancing algorithm, the CMT PGs additionally maintain
+ * parent, child and sibling hierarchy relationships.
+ * Parent PGs always contain a superset of their children's resources,
+ * each PG can have at most one parent, and siblings are the group of PGs
+ * sharing the same parent.
+ *
+ * On NUMA systems, the CMT load balancing algorithm balances across the
+ * CMT PGs within their respective lgroups. On UMA based systems, there
+ * exists a top level group of PGs to balance across. On NUMA systems multiple
+ * top level groups are instantiated, where the top level balancing begins by
+ * balancing across the CMT PGs within their respective (per lgroup) top level
+ * groups.
+ */
+
+typedef struct cmt_lgrp {
+	group_t		cl_pgs;		/* Top level group of active CMT PGs */
+	int		cl_npgs;	/* # of top level PGs in the lgroup */
+	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
+	struct cmt_lgrp	*cl_next;	/* next cmt_lgrp */
+} cmt_lgrp_t;
+
+static cmt_lgrp_t	*cmt_lgrps = NULL;
+
+static int		is_cpu0 = 1;
+static int		cmt_sched_disabled = 0;
+
+static pg_cid_t		pg_cmt_class_id;	/* PG class id */
+
+static pg_t		*pg_cmt_alloc();
+static void		pg_cmt_free(pg_t *);
+static void		pg_cmt_cpu_init(cpu_t *);
+static void		pg_cmt_cpu_fini(cpu_t *);
+static void		pg_cmt_cpu_active(cpu_t *);
+static void		pg_cmt_cpu_inactive(cpu_t *);
+static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
+static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
+static void		pg_cmt_hier_pack(pg_cmt_t **, int);
+static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
+static int		pg_cmt_hw(pghw_type_t);
+static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
+
+/*
+ * Macro to test if PG is managed by the CMT PG class
+ */
+#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
+
+/*
+ * CMT PG ops
+ */
+struct pg_ops pg_ops_cmt = {
+	pg_cmt_alloc,
+	pg_cmt_free,
+	pg_cmt_cpu_init,
+	pg_cmt_cpu_fini,
+	pg_cmt_cpu_active,
+	pg_cmt_cpu_inactive,
+	pg_cmt_cpupart_in,
+	NULL,			/* cpupart_out */
+	pg_cmt_cpupart_move,
+	pg_cmt_cpu_belongs,
+};
+
+/*
+ * Initialize the CMT PG class
+ */
+void
+pg_cmt_class_init(void)
+{
+	if (cmt_sched_disabled)
+		return;
+
+	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
+}
+
+/*
+ * Called to indicate a new CPU has started up so
+ * that either t0 or the slave startup thread can
+ * be accounted for.
+ */
+void
+pg_cmt_cpu_startup(cpu_t *cp)
+{
+	PG_NRUN_UPDATE(cp, 1);
+}
+
+/*
+ * Adjust the CMT load in the CMT PGs in which the CPU belongs
+ * Note that "n" can be positive in the case of increasing
+ * load, or negative in the case of decreasing load.
+ */
+void
+pg_cmt_load(cpu_t *cp, int n)
+{
+	pg_cmt_t	*pg;
+
+	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
+	while (pg != NULL) {
+		ASSERT(IS_CMT_PG(pg));
+		atomic_add_32(&pg->cmt_nrunning, n);
+		pg = pg->cmt_parent;
+	}
+}
+
+/*
+ * Return non-zero if thread can migrate between "from" and "to"
+ * without a performance penalty
+ */
+int
+pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
+{
+	if (from->cpu_physid->cpu_cacheid ==
+	    to->cpu_physid->cpu_cacheid)
+		return (1);
+	return (0);
+}
+
+/*
+ * CMT class specific PG allocation
+ */
+static pg_t *
+pg_cmt_alloc(void)
+{
+	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
+}
+
+/*
+ * Class specific PG de-allocation
+ */
+static void
+pg_cmt_free(pg_t *pg)
+{
+	ASSERT(pg != NULL);
+	ASSERT(IS_CMT_PG(pg));
+
+	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
+}
+
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+static int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+	if (hw == PGHW_IPIPE ||
+	    hw == PGHW_FPU ||
+	    hw == PGHW_CHIP)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+static int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+	if (hw == PGHW_CACHE)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Return 1 if CMT scheduling policies should be implemented
+ * for the specified hardware sharing relationship.
+ */
+static int
+pg_cmt_hw(pghw_type_t hw)
+{
+	return (pg_cmt_load_bal_hw(hw) ||
+	    pg_cmt_affinity_hw(hw));
+}
+
+/*
+ * CMT class callback for a new CPU entering the system
+ */
+static void
+pg_cmt_cpu_init(cpu_t *cp)
+{
+	pg_cmt_t	*pg;
+	group_t		*cmt_pgs;
+	int		level, max_level, nlevels;
+	pghw_type_t	hw;
+	pg_t		*pg_cache = NULL;
+	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
+	lgrp_handle_t	lgrp_handle;
+	cmt_lgrp_t	*lgrp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	/*
+	 * A new CPU is coming into the system.
+	 * Interrogate the platform to see if the CPU
+	 * has any performance relevant CMT sharing
+	 * relationships
+	 */
+	cmt_pgs = &cp->cpu_pg->cmt_pgs;
+	cp->cpu_pg->cmt_lineage = NULL;
+
+	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
+	max_level = nlevels = 0;
+	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
+
+		/*
+		 * We're only interested in CMT hw sharing relationships
+		 */
+		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
+			continue;
+
+		/*
+		 * Find (or create) the PG associated with
+		 * the hw sharing relationship in which cp
+		 * belongs.
+		 *
+		 * Determine if a suitable PG already
+		 * exists, or if one needs to be created.
+		 */
+		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
+		if (pg == NULL) {
+			/*
+			 * Create a new one.
+			 * Initialize the common...
+			 */
+			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
+
+			/* ... physical ... */
+			pghw_init((pghw_t *)pg, cp, hw);
+
+			/*
+			 * ... and CMT specific portions of the
+			 * structure.
+			 */
+			bitset_init(&pg->cmt_cpus_actv_set);
+			group_create(&pg->cmt_cpus_actv);
+		} else {
+			ASSERT(IS_CMT_PG(pg));
+		}
+
+		/* Add the CPU to the PG */
+		pg_cpu_add((pg_t *)pg, cp);
+
+		/*
+		 * Ensure capacity of the active CPUs group/bitset
+		 */
+		group_expand(&pg->cmt_cpus_actv,
+		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
+
+		if (cp->cpu_seqid >=
+		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
+			bitset_resize(&pg->cmt_cpus_actv_set,
+			    cp->cpu_seqid + 1);
+		}
+
+		/*
+		 * Build a lineage of CMT PGs for load balancing
+		 */
+		if (pg_cmt_load_bal_hw(hw)) {
+			level = pghw_level(hw);
+			cpu_cmt_hier[level] = pg;
+			if (level > max_level)
+				max_level = level;
+			nlevels++;
+		}
+
+		/* Cache this for later */
+		if (hw == PGHW_CACHE)
+			pg_cache = (pg_t *)pg;
+	}
+
+	/*
+	 * Pack out any gaps in the constructed lineage.
+	 * Gaps may exist where the architecture knows
+	 * about a hardware sharing relationship, but such a
+	 * relationship either isn't relevant for load
+	 * balancing or doesn't exist between CPUs on the system.
+	 */
+	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);
+
+	/*
+	 * For each of the PGs in the CPU's lineage:
+	 *	- Add an entry in the CPU sorted CMT PG group
+	 *	  which is used for top down CMT load balancing
+	 *	- Tie the PG into the CMT hierarchy by connecting
+	 *	  it to its parent and siblings.
+	 */
+	group_expand(cmt_pgs, nlevels);
+
+	/*
+	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
+	 */
+	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
+	lgrp = pg_cmt_find_lgrp(lgrp_handle);
+
+	for (level = 0; level < nlevels; level++) {
+		uint_t		children;
+		int		err;
+
+		pg = cpu_cmt_hier[level];
+		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
+		ASSERT(err == 0);
+
+		if (level == 0)
+			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
+
+		if (pg->cmt_siblings != NULL) {
+			/* Already initialized */
+			ASSERT(pg->cmt_parent == NULL ||
+			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
+			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
+			    pg->cmt_siblings == pg->cmt_parent->cmt_children);
+			continue;
+		}
+
+		if ((level + 1) == nlevels) {
+			pg->cmt_parent = NULL;
+			pg->cmt_siblings = &lgrp->cl_pgs;
+			children = ++lgrp->cl_npgs;
+		} else {
+			pg->cmt_parent = cpu_cmt_hier[level + 1];
+
+			/*
+			 * A good parent keeps track of their children.
+			 * The parent's children group is also the PG's
+			 * siblings.
+			 */
+			if (pg->cmt_parent->cmt_children == NULL) {
+				pg->cmt_parent->cmt_children =
+				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
+				group_create(pg->cmt_parent->cmt_children);
+			}
+			pg->cmt_siblings = pg->cmt_parent->cmt_children;
+			children = ++pg->cmt_parent->cmt_nchildren;
+		}
+		pg->cmt_hint = 0;
+		group_expand(pg->cmt_siblings, children);
+	}
+
+	/*
+	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
+	 * for fast lookups later.
+	 */
+	if (cp->cpu_physid) {
+		cp->cpu_physid->cpu_chipid =
+		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
+		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
+
+		/*
+		 * If this cpu has a PG representing shared cache, then set
+		 * cpu_cacheid to that PG's logical id
+		 */
+		if (pg_cache)
+			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
+	}
+
+	/* CPU0 only initialization */
+	if (is_cpu0) {
+		pg_cmt_cpu_startup(cp);
+		is_cpu0 = 0;
+	}
+
+}
+
+/*
+ * Class callback when a CPU is leaving the system (deletion)
+ */
+static void
+pg_cmt_cpu_fini(cpu_t *cp)
+{
+	group_iter_t	i;
+	pg_cmt_t	*pg;
+	group_t		*pgs, *cmt_pgs;
+	lgrp_handle_t	lgrp_handle;
+	cmt_lgrp_t	*lgrp;
+
+	pgs = &cp->cpu_pg->pgs;
+	cmt_pgs = &cp->cpu_pg->cmt_pgs;
+
+	/*
+	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
+	 */
+	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
+	lgrp = pg_cmt_find_lgrp(lgrp_handle);
+
+	/*
+	 * First, clean up anything load balancing specific for each of
+	 * the CPU's PGs that participated in CMT load balancing
+	 */
+	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
+	while (pg != NULL) {
+
+		/*
+		 * Remove the PG from the CPU's load balancing lineage
+		 */
+		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
+
+		/*
+		 * If it's about to become empty, destroy its children
+		 * group, and remove its reference from its siblings.
+		 * This is done here (rather than below) to avoid removing
+		 * our reference from a PG that we just eliminated.
+		 */
+		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
+			if (pg->cmt_children != NULL)
+				group_destroy(pg->cmt_children);
+			if (pg->cmt_siblings != NULL) {
+				if (pg->cmt_siblings == &lgrp->cl_pgs)
+					lgrp->cl_npgs--;
+				else
+					pg->cmt_parent->cmt_nchildren--;
+			}
+		}
+		pg = pg->cmt_parent;
+	}
+
+	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
+
+	/*
+	 * Now that the load balancing lineage updates have happened,
+	 * remove the CPU from all its PGs (destroying any that become
+	 * empty).
+	 */
+	group_iter_init(&i);
+	while ((pg = group_iterate(pgs, &i)) != NULL) {
+		if (IS_CMT_PG(pg) == 0)
+			continue;
+
+		pg_cpu_delete((pg_t *)pg, cp);
+		/*
+		 * Deleting the CPU from the PG changes the CPU's
+		 * PG group over which we are actively iterating.
+		 * Re-initialize the iteration
+		 */
+		group_iter_init(&i);
+
+		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
+
+			/*
+			 * The PG has become zero sized, so destroy it.
+			 */
+			group_destroy(&pg->cmt_cpus_actv);
+			bitset_fini(&pg->cmt_cpus_actv_set);
+			pghw_fini((pghw_t *)pg);
+
+			pg_destroy((pg_t *)pg);
+		}
+	}
+}
+
+/*
+ * Class callback when a CPU is entering a cpu partition
+ */
+static void
+pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
+{
+	group_t		*pgs;
+	pg_t		*pg;
+	group_iter_t	i;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	pgs = &cp->cpu_pg->pgs;
+
+	/*
+	 * Ensure that the new partition's PG bitset
+	 * is large enough for all CMT PGs to which cp
+	 * belongs
+	 */
+	group_iter_init(&i);
+	while ((pg = group_iterate(pgs, &i)) != NULL) {
+		if (IS_CMT_PG(pg) == 0)
+			continue;
+
+		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
+			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
+	}
+}
+
+/*
+ * Class callback when a CPU is actually moving partitions
+ */
+static void
+pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
+{
+	cpu_t		*cpp;
+	group_t		*pgs;
+	pg_t		*pg;
+	group_iter_t	pg_iter;
+	pg_cpu_itr_t	cpu_iter;
+	boolean_t	found;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	pgs = &cp->cpu_pg->pgs;
+	group_iter_init(&pg_iter);
+
+	/*
+	 * Iterate over the CPU's CMT PGs
+	 */
+	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
+
+		if (IS_CMT_PG(pg) == 0)
+			continue;
+
+		/*
+		 * Add the PG to the bitset in the new partition.
+		 */
+		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
+
+		/*
+		 * Remove the PG from the bitset in the old partition
+		 * if the last of the PG's CPUs have left.
+		 */
+		found = B_FALSE;
+		PG_CPU_ITR_INIT(pg, cpu_iter);
+		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
+			if (cpp == cp)
+				continue;
+			if (cpp->cpu_part->cp_id == oldpp->cp_id) {
+				found = B_TRUE;
+				break;
+			}
+		}
+		if (!found)
+			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
+	}
+}
+
+/*
+ * Class callback when a CPU becomes active (online)
+ *
+ * This is called in a context where CPUs are paused
+ */
+static void
+pg_cmt_cpu_active(cpu_t *cp)
+{
+	int		err;
+	group_iter_t	i;
+	pg_cmt_t	*pg;
+	group_t		*pgs;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	pgs = &cp->cpu_pg->pgs;
+	group_iter_init(&i);
+
+	/*
+	 * Iterate over the CPU's PGs
+	 */
+	while ((pg = group_iterate(pgs, &i)) != NULL) {
+
+		if (IS_CMT_PG(pg) == 0)
+			continue;
+
+		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
+		ASSERT(err == 0);
+
+		/*
+		 * If this is the first active CPU in the PG, and it
+		 * represents a hardware sharing relationship over which
+		 * CMT load balancing is performed, add it as a candidate
+		 * for balancing with its siblings.
+		 */
+		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
+		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
+			ASSERT(err == 0);
+		}
+
+		/*
+		 * Notate the CPU in the PG's active CPU bitset.
+		 * Also notate the PG as being active in its associated
+		 * partition
+		 */
+		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
+		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
+	}
+}
+
+/*
+ * Class callback when a CPU goes inactive (offline)
+ *
+ * This is called in a context where CPUs are paused
+ */
+static void
+pg_cmt_cpu_inactive(cpu_t *cp)
+{
+	int		err;
+	group_t		*pgs;
+	pg_cmt_t	*pg;
+	cpu_t		*cpp;
+	group_iter_t	i;
+	pg_cpu_itr_t	cpu_itr;
+	boolean_t	found;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	pgs = &cp->cpu_pg->pgs;
+	group_iter_init(&i);
+
+	while ((pg = group_iterate(pgs, &i)) != NULL) {
+
+		if (IS_CMT_PG(pg) == 0)
+			continue;
+
+		/*
+		 * Remove the CPU from the CMT PG's active CPU group
+		 * bitmap
+		 */
+		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
+		ASSERT(err == 0);
+
+		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
+
+		/*
+		 * If there are no more active CPUs in this PG over which
+		 * load was balanced, remove it as a balancing candidate.
+		 */
+		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
+		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
+			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
+			ASSERT(err == 0);
+		}
+
+		/*
+		 * Assert the number of active CPUs does not exceed
+		 * the total number of CPUs in the PG
+		 */
+		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
+		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
+
+		/*
+		 * Update the PG bitset in the CPU's old partition
+		 */
+		found = B_FALSE;
+		PG_CPU_ITR_INIT(pg, cpu_itr);
+		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
+			if (cpp == cp)
+				continue;
+			if (cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
+				found = B_TRUE;
+				break;
+			}
+		}
+		if (!found) {
+			bitset_del(&cp->cpu_part->cp_cmt_pgs,
+			    ((pg_t *)pg)->pg_id);
+		}
+	}
+}
+
+/*
+ * Return non-zero if the CPU belongs in the given PG
+ */
+static int
+pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
+{
+	cpu_t	*pg_cpu;
+
+	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
+
+	ASSERT(pg_cpu != NULL);
+
+	/*
+	 * The CPU belongs if, given the nature of the hardware sharing
+	 * relationship represented by the PG, the CPU has that
+	 * relationship with some other CPU already in the PG
+	 */
+	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Pack the CPU's CMT hierarchy
+ * The hierarchy order is preserved
+ */
+static void
+pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
+{
+	int	i, j;
+
+	for (i = 0; i < sz; i++) {
+		if (hier[i] != NULL)
+			continue;
+
+		for (j = i; j < sz; j++) {
+			if (hier[j] != NULL) {
+				hier[i] = hier[j];
+				hier[j] = NULL;
+				break;
+			}
+		}
+		if (j == sz)
+			break;
+	}
+}
+
+/*
+ * Return a cmt_lgrp_t * given an lgroup handle.
+ * If the right one doesn't yet exist, create one
+ * by growing the cmt_lgrps list
+ */
+static cmt_lgrp_t *
+pg_cmt_find_lgrp(lgrp_handle_t hand)
+{
+	cmt_lgrp_t	*lgrp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	lgrp = cmt_lgrps;
+	while (lgrp != NULL) {
+		if (lgrp->cl_hand == hand)
+			return (lgrp);
+		lgrp = lgrp->cl_next;
+	}
+
+	/*
+	 * Haven't seen this lgrp yet
+	 */
+	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
+
+	lgrp->cl_hand = hand;
+	lgrp->cl_npgs = 0;
+	lgrp->cl_next = cmt_lgrps;
+	cmt_lgrps = lgrp;
+	group_create(&lgrp->cl_pgs);
+
+	return (lgrp);
+}
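
The load accounting above is a simple lineage walk: pg_cmt_load() starts at the leaf PG recorded in cp->cpu_pg->cmt_lineage and follows cmt_parent pointers toward the root, adjusting each PG's cmt_nrunning counter atomically. Because every level is updated on each transition, the dispatcher can compare sibling PGs at any level without re-walking the CPUs below them. A minimal user-level sketch of that walk, using simplified stand-in types rather than the kernel's pg_cmt_t, might look like this:

#include <stdio.h>

/*
 * Simplified stand-in for a CMT PG: only the parent linkage and the
 * running-thread counter from the lineage walk are modeled here.
 */
struct cmt_pg {
	const char	*name;		/* e.g. "core", "chip" */
	int		nrunning;	/* software load on this PG */
	struct cmt_pg	*parent;	/* next level up the hierarchy */
};

/* Adjust the load on every PG from the leaf up to the root. */
static void
cmt_load(struct cmt_pg *leaf, int n)
{
	struct cmt_pg *pg;

	for (pg = leaf; pg != NULL; pg = pg->parent)
		pg->nrunning += n;	/* kernel code uses atomic_add_32() */
}

int
main(void)
{
	struct cmt_pg chip = { "chip", 0, NULL };
	struct cmt_pg core = { "core", 0, &chip };

	cmt_load(&core, 1);		/* a thread starts running */
	cmt_load(&core, -1);		/* ... and later stops */
	printf("%s=%d %s=%d\n", core.name, core.nrunning,
	    chip.name, chip.nrunning);
	return (0);
}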
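pg_cmt_hier_pack() compacts the per-CPU lineage array in place: NULL gaps left by hardware levels that are not load-balancing relevant are squeezed out while the relative order of the remaining PGs is preserved. A standalone sketch of the same compaction (with void * standing in for pg_cmt_t *):

#include <stdio.h>

/*
 * Stable compaction of a sparse array: NULL gaps are squeezed out and
 * the relative order of the remaining entries is preserved.
 */
static void
hier_pack(void *hier[], int sz)
{
	int i, j;

	for (i = 0; i < sz; i++) {
		if (hier[i] != NULL)
			continue;
		for (j = i; j < sz; j++) {
			if (hier[j] != NULL) {
				hier[i] = hier[j];
				hier[j] = NULL;
				break;
			}
		}
		if (j == sz)
			break;		/* nothing left to move down */
	}
}

int
main(void)
{
	int a = 1, b = 2;
	void *hier[4] = { NULL, &a, NULL, &b };

	hier_pack(hier, 4);
	/* hier is now { &a, &b, NULL, NULL } */
	printf("%d %d\n", *(int *)hier[0], *(int *)hier[1]);
	return (0);
}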
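pg_cmt_find_lgrp() follows a find-or-create pattern over a singly linked list keyed by the lgroup handle, with cpu_lock serializing both the lookup and the insertion. The same shape, sketched in user-level C with an illustrative long handle and calloc() in place of kmem_zalloc(..., KM_SLEEP):

#include <stdlib.h>

/* Illustrative per-lgroup entry; not the kernel's cmt_lgrp_t. */
struct lgrp_ent {
	long		hand;		/* lookup key */
	int		npgs;		/* payload */
	struct lgrp_ent	*next;		/* singly linked list */
};

static struct lgrp_ent *lgrp_list = NULL;

static struct lgrp_ent *
find_lgrp(long hand)
{
	struct lgrp_ent *e;

	for (e = lgrp_list; e != NULL; e = e->next) {
		if (e->hand == hand)
			return (e);	/* already known */
	}

	/* Not seen yet: allocate a zeroed entry and push it on the list. */
	e = calloc(1, sizeof (*e));
	if (e == NULL)
		return (NULL);		/* kernel code sleeps instead of failing */
	e->hand = hand;
	e->next = lgrp_list;
	lgrp_list = e;
	return (e);
}

int
main(void)
{
	/* The second lookup returns the entry created by the first. */
	return (find_lgrp(7) == find_lgrp(7) ? 0 : 1);
}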