Diffstat (limited to 'usr/src/uts/sun4v/os/mpo.c')
-rw-r--r-- | usr/src/uts/sun4v/os/mpo.c | 1264
1 files changed, 1264 insertions, 0 deletions
diff --git a/usr/src/uts/sun4v/os/mpo.c b/usr/src/uts/sun4v/os/mpo.c new file mode 100644 index 0000000000..d98ce96438 --- /dev/null +++ b/usr/src/uts/sun4v/os/mpo.c @@ -0,0 +1,1264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/machparam.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/mach_descrip.h> +#include <sys/memnode.h> +#include <sys/mdesc.h> +#include <sys/mpo.h> +#include <vm/vm_dep.h> + +/* + * MPO and the sun4v memory representation + * --------------------------------------- + * + * Latency groups are defined in the sun4v architecture by memory-latency-group + * nodes in the Machine Description, as specified in FWARC/2007/260. These + * tie together cpu nodes and mblock nodes, and contain mask and match + * properties that identify the portion of an mblock that belongs to the + * lgroup. Mask and match are defined in the Physical Address (PA) space, + * but an mblock defines Real Addresses (RA). To translate, the mblock + * includes the property address-congruence-offset, hereafter referred to as + * ra_to_pa. A real address ra is a member of an lgroup if + * + * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match + * + * The MD is traversed, and information on all mblocks is kept in the array + * mpo_mblock[]. Information on all CPUs, including which lgroup they map + * to, is kept in the array mpo_cpu[]. + * + * This implementation makes (and verifies) the simplifying assumption that + * the mask bits are the same for all defined lgroups, and that all 1 bits in + * the mask are contiguous. Thus the number of lgroups is bounded by the + * number of possible mask values, and the lgrp_handle_t is defined as the + * mask value, shifted right to eliminate the 0 bit positions in mask. The + * masks and values are also referred to as "home bits" in the code. + * + * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup + * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock + * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the + * home bits. This yields the mem_node. + * + * Interfaces + * ---------- + * + * This file exports the following entry points: + * + * plat_lgrp_init() + * plat_build_mem_nodes() + * plat_lgrp_cpu_to_hand() + * plat_lgrp_latency() + * plat_pfn_to_mem_node() + * These implement the usual platform lgroup interfaces. + * + * plat_rapfn_to_papfn() + * Recover the PA page coloring bits from an RA.
+ * + * plat_mem_node_iterator_init() + * Initialize an iterator to efficiently step through pages in a mem_node. + * + * plat_mem_node_intersect_range() + * Find the intersection with a mem_node. + */ + +int sun4v_mpo_enable = 1; +int sun4v_mpo_debug = 0; +char sun4v_mpo_status[256] = ""; + +/* Save CPU info from the MD and associate CPUs with lgroups */ +static struct cpu_md mpo_cpu[NCPU]; + +/* Save lgroup info from the MD */ +#define MAX_MD_LGROUPS 32 +static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS]; +static int n_lgrpnodes = 0; +static int n_locality_groups = 0; +static int max_locality_groups = 0; + +/* Save mblocks from the MD */ +static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS]; +static int n_mblocks = 0; + +/* Save mem_node stripes calculated from mblocks and lgroups. */ +static mem_stripe_t mem_stripes[MAX_MEM_STRIPES]; +static int n_mem_stripes = 0; +static pfn_t mnode_stride; /* distance between stripes, start to start */ +static int stripe_shift; /* stride/stripes expressed as a shift */ +static pfn_t mnode_pages; /* mem_node stripe width */ + +/* Save home mask and shift used to calculate lgrp_handle_t values */ +static uint64_t home_mask = 0; +static pfn_t home_mask_pfn = 0; +static int home_mask_shift = 0; +static uint_t home_mask_pfn_shift = 0; + +/* Save lowest and highest latencies found across all lgroups */ +static int lower_latency = 0; +static int higher_latency = 0; + +static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */ + +static int valid_pages(md_t *md, mde_cookie_t cpu0); +static int unique_home_mem_lg_count(uint64_t mem_lg_homeset); +static int fix_interleave(void); + +/* Debug support */ +#if defined(DEBUG) && !defined(lint) +#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args) +#else +#define MPO_DEBUG(...) +#endif /* DEBUG */ + +/* Record status message, viewable from mdb */ +#define MPO_STATUS(args...) { \ + (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \ + MPO_DEBUG(sun4v_mpo_status); \ +} + +/* + * Routine to read a uint64_t from a given md + */ +static int64_t +get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val) +{ + int err = md_get_prop_val(md, node, propname, val); + return (err); +} + +static int +mblock_cmp(const void *a, const void *b) +{ + struct mblock_md *m1 = (struct mblock_md *)a; + struct mblock_md *m2 = (struct mblock_md *)b; + + if (m1->base < m2->base) + return (-1); + else if (m1->base == m2->base) + return (0); + else + return (1); +} + +static void +mblock_sort(struct mblock_md *mblocks, int n) +{ + extern void qsort(void *, size_t, size_t, + int (*)(const void *, const void *)); + + qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp); +} + +/* + * + * Traverse the MD to determine: + * + * Number of CPU nodes, lgrp_nodes, and mblocks + * Then for each lgrp_node, obtain the appropriate data. + * For each CPU, determine its home locality and store it. + * For each mblock, retrieve its data and store it.
+ */ +static int +lgrp_traverse(md_t *md) +{ + mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes; + uint64_t i, j, k, o, n_nodes; + uint64_t n_lgroups = 0; + uint64_t mem_lg_homeset = 0; + int ret_val = 0; + int result = 0; + int n_cpunodes = 0; + int sub_page_fix; + + n_nodes = md_node_count(md); + + if (n_nodes <= 0) { + MPO_STATUS("lgrp_traverse: No nodes in node count\n"); + ret_val = -1; + goto fail; + } + + root = md_root_node(md); + + if (root == MDE_INVAL_ELEM_COOKIE) { + MPO_STATUS("lgrp_traverse: Root node is missing\n"); + ret_val = -1; + goto fail; + } + + /* + * Build the Memory Nodes. Do this before any possibility of + * bailing from this routine so we obtain ra_to_pa (needed for page + * coloring) even when there are no lgroups defined. + */ + + n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, + "fwd", &mblocknodes); + + if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) { + MPO_STATUS("lgrp_traverse: No mblock " + "nodes detected in Machine Descriptor\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + for (i = 0; i < n_mblocks; i++) { + mpo_mblock[i].node = mblocknodes[i]; + + /* Without a base or size value we will fail */ + result = get_int(md, mblocknodes[i], PROP_LG_BASE, + &mpo_mblock[i].base); + if (result < 0) { + MPO_STATUS("lgrp_traverse: " + "PROP_LG_BASE is missing\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + result = get_int(md, mblocknodes[i], PROP_LG_SIZE, + &mpo_mblock[i].size); + if (result < 0) { + MPO_STATUS("lgrp_traverse: " + "PROP_LG_SIZE is missing\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + result = get_int(md, mblocknodes[i], + PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa); + + /* If we don't have an ra_pa_offset, just set it to 0 */ + if (result < 0) + mpo_mblock[i].ra_to_pa = 0; + + MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, " + "ra_to_pa = %lx\n", i, + mpo_mblock[i].base, + mpo_mblock[i].size, + mpo_mblock[i].ra_to_pa); + } + + /* Must sort mblocks by address for mem_node_iterator_init() */ + mblock_sort(mpo_mblock, n_mblocks); + + base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa); + + /* Page coloring hook is required so we can iterate through mnodes */ + if (&page_next_pfn_for_color_cpu == NULL) { + MPO_STATUS("lgrp_traverse: No page coloring support\n"); + ret_val = -1; + goto fail; + } + + /* Global enable for mpo */ + if (sun4v_mpo_enable == 0) { + MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n"); + ret_val = -1; + goto fail; + } + + n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG, + "fwd", &lgrpnodes); + + if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) { + MPO_STATUS("lgrp_traverse: No Lgroups\n"); + ret_val = -1; + goto fail; + } + + n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes); + + if (n_cpunodes <= 0 || n_cpunodes > NCPU) { + MPO_STATUS("lgrp_traverse: No CPU nodes detected " + "in MD\n"); + ret_val = -1; + goto fail; + } + + MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes); + MPO_DEBUG("lgrp_traverse: md: %p\n", md); + MPO_DEBUG("lgrp_traverse: root: %lx\n", root); + MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes); + MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes); + MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks); + + for (i = 0; i < n_lgrpnodes; i++) { + mpo_lgroup[i].node = lgrpnodes[i]; + mpo_lgroup[i].id = i; + mpo_lgroup[i].ncpu = 0; + result = get_int(md, lgrpnodes[i], PROP_LG_MASK, + &mpo_lgroup[i].addr_mask); + result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH, + &mpo_lgroup[i].addr_match); + + 
/* + * If either the mask or match properties are missing, set to 0 + */ + if (result < 0) { + mpo_lgroup[i].addr_mask = 0; + mpo_lgroup[i].addr_match = 0; + } + + /* Set latency to 0 if property not present */ + + result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY, + &mpo_lgroup[i].latency); + if (result < 0) + mpo_lgroup[i].latency = 0; + } + + /* + * Sub-page level interleave is not yet supported. Check for it, + * and remove sub-page interleaved lgroups from mpo_lgroup and + * n_lgrpnodes. If no lgroups are left, return. + */ + + sub_page_fix = fix_interleave(); + if (n_lgrpnodes == 0) { + ret_val = -1; + goto fail; + } + + /* Ensure that all of the addr_mask values are the same */ + + for (i = 0; i < n_lgrpnodes; i++) { + if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) { + MPO_STATUS("lgrp_traverse: " + "addr_mask values are not the same\n"); + ret_val = -1; + goto fail; + } + } + + /* + * Ensure that all lgrp nodes see all the mblocks. However, if + * sub-page interleave is being fixed, they do not, so skip + * the check. + */ + + if (sub_page_fix == 0) { + for (i = 0; i < n_lgrpnodes; i++) { + j = md_alloc_scan_dag(md, mpo_lgroup[i].node, + PROP_LG_MBLOCK, "fwd", &nodes); + md_free_scan_dag(md, &nodes); + if (j != n_mblocks) { + MPO_STATUS("lgrp_traverse: " + "lgroup does not see all mblocks\n"); + ret_val = -1; + goto fail; + } + } + } + + /* + * Use the address mask from the first lgroup node + * to establish our home_mask. + */ + home_mask = mpo_lgroup[0].addr_mask; + home_mask_pfn = btop(home_mask); + home_mask_shift = lowbit(home_mask) - 1; + home_mask_pfn_shift = home_mask_shift - PAGESHIFT; + mnode_pages = btop(1ULL << home_mask_shift); + + /* + * How many values are possible in home mask? Assume the mask + * bits are contiguous. + */ + max_locality_groups = + 1 << highbit(home_mask_pfn >> home_mask_pfn_shift); + + /* Now verify the home mask bits are contiguous */ + + if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) { + MPO_STATUS("lgrp_traverse: " + "home mask bits are not contiguous\n"); + ret_val = -1; + goto fail; + } + + /* Record all of the home bits */ + + for (i = 0; i < n_lgrpnodes; i++) { + HOMESET_ADD(mem_lg_homeset, + mpo_lgroup[i].addr_match >> home_mask_shift); + } + + /* Count the number of different "home" mem_lg's we've discovered */ + + n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset); + + /* If we have only 1 locality group then we can exit */ + if (n_locality_groups == 1) { + MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n"); + ret_val = -1; + goto fail; + } + + /* + * Set the latencies. A CPU's lgroup is defined by the lowest + * latency found. All other memory is considered remote, and the + * remote latency is represented by the highest latency found. + * Thus hierarchical lgroups, if any, are approximated by a + * two level scheme. + * + * The Solaris MPO framework by convention wants to see latencies + * in units of nano-sec/10. In the MD, the units are defined to be + * pico-seconds.
+ */ + + lower_latency = mpo_lgroup[0].latency; + higher_latency = mpo_lgroup[0].latency; + + for (i = 1; i < n_lgrpnodes; i++) { + if (mpo_lgroup[i].latency < lower_latency) { + lower_latency = mpo_lgroup[i].latency; + } + if (mpo_lgroup[i].latency > higher_latency) { + higher_latency = mpo_lgroup[i].latency; + } + } + lower_latency /= 10000; + higher_latency /= 10000; + + /* Clear our CPU data */ + + for (i = 0; i < NCPU; i++) { + mpo_cpu[i].home = 0; + mpo_cpu[i].latency = (uint_t)(-1); + } + + /* Build the CPU nodes */ + for (i = 0; i < n_cpunodes; i++) { + + /* Read in the lgroup nodes */ + + result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k); + if (result < 0) { + MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n"); + ret_val = -1; + goto fail; + } + + n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG, + "fwd", &nodes); + if (n_lgroups <= 0) { + MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing"); + ret_val = -1; + goto fail; + } + + /* + * Find the lgroup this cpu belongs to with the lowest latency. + * Check all the lgrp nodes connected to this CPU to determine + * which has the smallest latency. + */ + + for (j = 0; j < n_lgroups; j++) { + for (o = 0; o < n_lgrpnodes; o++) { + if (nodes[j] == mpo_lgroup[o].node) { + if (mpo_lgroup[o].latency < + mpo_cpu[k].latency) { + mpo_cpu[k].home = + mpo_lgroup[o].addr_match + >> home_mask_shift; + mpo_cpu[k].latency = + mpo_lgroup[o].latency; + mpo_lgroup[o].ncpu++; + } + } + } + } + md_free_scan_dag(md, &nodes); + } + + /* Validate that no large pages cross mnode boundaries. */ + if (valid_pages(md, cpunodes[0]) == 0) { + ret_val = -1; + goto fail; + } + +fail: + /* MD cookies are no longer valid; ensure they are not used again. */ + for (i = 0; i < n_mblocks; i++) + mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE; + for (i = 0; i < n_lgrpnodes; i++) + mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE; + + if (n_cpunodes > 0) + md_free_scan_dag(md, &cpunodes); + if (n_lgrpnodes > 0) + md_free_scan_dag(md, &lgrpnodes); + if (n_mblocks > 0) + md_free_scan_dag(md, &mblocknodes); + else + panic("lgrp_traverse: No memory blocks found"); + + if (ret_val == 0) + MPO_STATUS("MPO feature is enabled.\n"); + + return (ret_val); +} + +/* + * Determine the number of unique mem_lg's present in our system + */ +static int +unique_home_mem_lg_count(uint64_t mem_lg_homeset) +{ + int homeid; + int count = 0; + + /* + * Scan the "home" bits of the mem_lgs, count + * the number that are unique. + */ + + for (homeid = 0; homeid < NLGRPS_MAX; homeid++) { + if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) { + count++; + } + } + + MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n", + mem_lg_homeset); + MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count); + + /* Default must be at least one */ + if (count == 0) + count = 1; + + return (count); +} + +/* + * Platform specific lgroup initialization + */ +void +plat_lgrp_init(void) +{ + md_t *md; + int i, rc, ncpu_min; + + /* Get the Machine Descriptor handle */ + + md = md_get_handle(); + + /* If not, we cannot continue */ + + if (md == NULL) { + panic("cannot access machine descriptor\n"); + } else { + rc = lgrp_traverse(md); + (void) md_fini_handle(md); + } + + /* + * If we can't process the MD for lgroups then at least let the + * system try to boot. Assume we have one lgroup so that + * when plat_build_mem_nodes is called, it will attempt to init + * an mnode based on the supplied memory segment. 
+ */ + + if (rc == -1) { + home_mask_pfn = 0; + max_locality_groups = 1; + n_locality_groups = 1; + return; + } + + mem_node_pfn_shift = 0; + mem_node_physalign = 0; + + /* Use lgroup-aware TSB allocations */ + tsb_lgrp_affinity = 1; + + /* + * lgrp_expand_proc_thresh is the minimum load on the lgroups + * this process is currently running on before considering + * expanding threads to another lgroup. + * + * lgrp_expand_proc_diff determines how much less the remote lgroup + * must be loaded before expanding to it. + * + * On sun4v CMT processors, threads share a core pipeline, and + * at less than 100% utilization, best throughput is obtained by + * spreading threads across more cores, even if some are in a + * different lgroup. Spread threads to a new lgroup if the + * current group is more than 50% loaded. Because of virtualization, + * lgroups may have different numbers of CPUs, but the tunables + * apply to all lgroups, so find the smallest lgroup and compute + * 50% loading. + */ + + ncpu_min = NCPU; + for (i = 0; i < n_lgrpnodes; i++) { + int ncpu = mpo_lgroup[i].ncpu; + if (ncpu != 0 && ncpu < ncpu_min) + ncpu_min = ncpu; + } + lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2; + + /* new home may only be half as loaded as the existing home to use it */ + lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2; + + lgrp_loadavg_tolerance = lgrp_loadavg_max_effect; + + /* Require that a home lgroup have some memory to be chosen */ + lgrp_mem_free_thresh = 1; + + /* Standard home-on-next-touch policy */ + lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT; + + /* Disable option to choose root lgroup if all leaf lgroups are busy */ + lgrp_load_thresh = UINT32_MAX; +} + +/* + * Helper routine for debugging calls to mem_node_add_slice() + */ +static void +mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn) +{ +#if defined(DEBUG) && !defined(lint) + static int slice_count = 0; + + slice_count++; + MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n", + slice_count, basepfn, endpfn); +#endif + mem_node_add_slice(basepfn, endpfn); +} + +/* + * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node() + */ +static void +mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode) +{ + MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld," + "mnode index: %d\n", plathand, mnode); + plat_assign_lgrphand_to_mem_node(plathand, mnode); +} + +/* + * plat_build_mem_nodes() + * + * Define the mem_nodes based on the modified boot memory list, + * or based on info read from the MD in plat_lgrp_init(). + * + * When the home mask lies in the middle of the address bits (as it does on + * Victoria Falls), then the memory in one mem_node is no longer contiguous; + * it is striped across an mblock in a repeating pattern of contiguous memory + * followed by a gap. The stripe width is the size of the contiguous piece. + * The stride is the distance from the start of one contiguous piece to the + * start of the next. The gap is thus stride - stripe_width. + * + * The stripe of an mnode that falls within an mblock is described by the type + * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The + * mem_stripe_t's are kept in a global array mem_stripes[]. The index into + * this array is predetermined. 
The mem_stripe_t that describes mnode m + * within mpo_mblock[i] is stored at + * mem_stripes[ m + i * max_locality_groups ] + * + * max_locality_groups is the total number of possible locality groups, + * as defined by the size of the home mask, even if the memory assigned + * to the domain is small and does not cover all the lgroups. Thus some + * mem_stripe_t's may be empty. + * + * The members of mem_stripe_t are: + * physbase: First valid page in mem_node in the corresponding mblock + * physmax: Last valid page in mem_node in mblock + * offset: The full stripe width starts at physbase - offset. + * Thus if offset is non-zero, this mem_node starts in the middle + * of a stripe width, and the second full stripe starts at + * physbase - offset + stride. (even though physmax may fall in the + * middle of a stripe width, we do not save the ending fragment size + * in this data structure.) + * exists: Set to 1 if the mblock has memory in this mem_node stripe. + * + * The stripe width is kept in the global mnode_pages. + * The stride is kept in the global mnode_stride. + * All the above use pfn's as the unit. + * + * As an example, the memory layout for a domain with 2 mblocks and 4 + * mem_nodes 0,1,2,3 could look like this: + * + * 123012301230 ... 012301230123 ... + * mblock 0 mblock 1 + */ + +void +plat_build_mem_nodes(u_longlong_t *list, size_t nelems) +{ + lgrp_handle_t lgrphand, lgrp_start; + int i, mnode, elem; + uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride; + uint64_t stripe, frag, remove; + mem_stripe_t *ms; + + /* Check for non-MPO sun4v platforms */ + + if (n_locality_groups <= 1) { + mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0); + for (elem = 0; elem < nelems; elem += 2) { + base = list[elem]; + len = list[elem+1]; + + mpo_mem_node_add_slice(btop(base), + btop(base + len - 1)); + } + mem_node_pfn_shift = 0; + mem_node_physalign = 0; + n_mem_stripes = 0; + return; + } + + /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */ + max_mem_nodes = max_locality_groups; + bzero(mem_stripes, sizeof (mem_stripes)); + stripe = ptob(mnode_pages); + stride = max_locality_groups * stripe; + + /* Save commonly used values in globals */ + mnode_stride = btop(stride); + n_mem_stripes = max_locality_groups * n_mblocks; + stripe_shift = highbit(max_locality_groups) - 1; + + for (i = 0; i < n_mblocks; i++) { + + base = mpo_mblock[i].base; + end = mpo_mblock[i].base + mpo_mblock[i].size; + ra_to_pa = mpo_mblock[i].ra_to_pa; + mpo_mblock[i].base_pfn = btop(base); + mpo_mblock[i].end_pfn = btop(end - 1); + + /* Find the offset from the prev stripe boundary in PA space. */ + offset = (base + ra_to_pa) & (stripe - 1); + + /* Set the next stripe boundary. */ + stripe_end = base - offset + stripe; + + lgrp_start = (((base + ra_to_pa) & home_mask) >> + home_mask_shift); + lgrphand = lgrp_start; + + /* + * Loop over all lgroups covered by the mblock, creating a + * stripe for each. Stop when lgrp_start is visited again. + */ + do { + /* mblock may not span all lgroups */ + if (base >= end) + break; + + mnode = lgrphand; + ASSERT(mnode < max_mem_nodes); + + /* + * Calculate the size of the fragment that does not + * belong to the mnode in the last partial stride. 
+ */ + frag = (end - (base - offset)) & (stride - 1); + if (frag == 0) { + /* remove the gap */ + remove = stride - stripe; + } else if (frag < stripe) { + /* fragment fits in stripe; keep it all */ + remove = 0; + } else { + /* fragment is large; trim after whole stripe */ + remove = frag - stripe; + } + + ms = &mem_stripes[i * max_locality_groups + mnode]; + ms->physbase = btop(base); + ms->physmax = btop(end - 1 - remove); + ms->offset = btop(offset); + ms->exists = 1; + + mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode); + mpo_mem_node_add_slice(ms->physbase, ms->physmax); + + base = stripe_end; + stripe_end += stripe; + offset = 0; + lgrphand = (((base + ra_to_pa) & home_mask) >> + home_mask_shift); + } while (lgrphand != lgrp_start); + } + + /* + * Indicate to vm_pagelist that the hpm_counters array + * should be shared because the ranges overlap. + */ + if (max_mem_nodes > 1) { + interleaved_mnodes = 1; + } +} + +/* + * Return the locality group value for the supplied processor + */ +lgrp_handle_t +plat_lgrp_cpu_to_hand(processorid_t id) +{ + if (n_locality_groups > 1) { + return ((lgrp_handle_t)mpo_cpu[(int)id].home); + } else { + return ((lgrp_handle_t)0); /* Default */ + } +} + +int +plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to) +{ + /* + * Return min remote latency when there are more than two lgroups + * (root and child) and getting latency between two different lgroups + * or root is involved. + */ + if (lgrp_optimizations() && (from != to || + from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) { + return ((int)higher_latency); + } else { + return ((int)lower_latency); + } +} + +int +plat_pfn_to_mem_node(pfn_t pfn) +{ + int i, mnode; + pfn_t ra_to_pa_pfn; + struct mblock_md *mb; + + if (n_locality_groups <= 1) + return (0); + + /* + * The mnode is defined to be 1:1 with the lgroup handle, which + * is taken from the home bits. Find the mblock in which + * the pfn falls to get the ra_to_pa adjustment, and extract + * the home bits. + */ + mb = &mpo_mblock[0]; + for (i = 0; i < n_mblocks; i++) { + if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) { + ra_to_pa_pfn = btop(mb->ra_to_pa); + mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >> + home_mask_pfn_shift); + ASSERT(mnode < max_mem_nodes); + return (mnode); + } + mb++; + } + + panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn); + return (pfn); +} + +/* + * plat_rapfn_to_papfn + * + * Convert a pfn in RA space to a pfn in PA space, in which the page coloring + * and home mask bits are correct. The upper bits do not necessarily + * match the actual PA, however. + */ +pfn_t +plat_rapfn_to_papfn(pfn_t pfn) +{ + int i; + pfn_t ra_to_pa_pfn; + struct mblock_md *mb; + + ASSERT(n_mblocks > 0); + if (n_mblocks == 1) + return (pfn + base_ra_to_pa_pfn); + + /* + * Find the mblock in which the pfn falls + * in order to get the ra_to_pa adjustment. + */ + for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) { + if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) { + ra_to_pa_pfn = btop(mb->ra_to_pa); + return (pfn + ra_to_pa_pfn); + } + } + + panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn); + return (pfn); +} + +/* + * plat_mem_node_iterator_init() + * Initialize cookie to iterate over pfn's in an mnode. There is + * no additional iterator function. The caller uses the info from + * the iterator structure directly. + * + * pfn: starting pfn. + * mnode: desired mnode.
+ * init: set to 1 for full init, 0 for continuation + * + * Returns the appropriate starting pfn for the iteration, + * which is the same as the input pfn if it falls in an mblock. + * Returns the (pfn_t)-1 value if the input pfn lies past + * the last valid mnode pfn. + */ +pfn_t +plat_mem_node_iterator_init(pfn_t pfn, int mnode, + mem_node_iterator_t *it, int init) +{ + int i; + struct mblock_md *mblock; + pfn_t base, end; + + ASSERT(it != NULL); + ASSERT(mnode >= 0 && mnode < max_mem_nodes); + ASSERT(n_mblocks > 0); + + if (init) { + it->mi_last_mblock = 0; + it->mi_init = 1; + } + + /* Check if mpo is not enabled and we only have one mblock */ + if (n_locality_groups == 1 && n_mblocks == 1) { + it->mi_mnode = mnode; + it->mi_ra_to_pa = base_ra_to_pa_pfn; + it->mi_mnode_pfn_mask = 0; + it->mi_mnode_pfn_shift = 0; + it->mi_mnode_mask = 0; + it->mi_mblock_base = mem_node_config[mnode].physbase; + it->mi_mblock_end = mem_node_config[mnode].physmax; + if (pfn < it->mi_mblock_base) + pfn = it->mi_mblock_base; + else if (pfn > it->mi_mblock_end) + pfn = (pfn_t)-1; + return (pfn); + } + + /* + * Find mblock that contains pfn, or first mblock after pfn, + * else pfn is out of bounds, so use the last mblock. + * mblocks are sorted in ascending address order. + */ + ASSERT(it->mi_last_mblock < n_mblocks); + ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn); + i = init ? 0 : it->mi_last_mblock + 1; + if (i == n_mblocks) + return ((pfn_t)-1); + + for (; i < n_mblocks; i++) { + if (pfn <= mpo_mblock[i].end_pfn) + break; + } + if (i == n_mblocks) { + it->mi_last_mblock = i - 1; + return ((pfn_t)-1); + } + it->mi_last_mblock = i; + + /* + * Memory stripes are defined if there is more than one locality + * group, so use the stripe bounds. Otherwise use mblock bounds. + */ + mblock = &mpo_mblock[i]; + if (n_mem_stripes > 0) { + mem_stripe_t *ms = + &mem_stripes[i * max_locality_groups + mnode]; + base = ms->physbase; + end = ms->physmax; + } else { + ASSERT(mnode == 0); + base = mblock->base_pfn; + end = mblock->end_pfn; + } + + it->mi_mnode = mnode; + it->mi_ra_to_pa = btop(mblock->ra_to_pa); + it->mi_mblock_base = base; + it->mi_mblock_end = end; + it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */ + it->mi_mnode_pfn_shift = home_mask_pfn_shift; + it->mi_mnode_mask = max_locality_groups - 1; + if (pfn < base) + pfn = base; + else if (pfn > end) + pfn = (pfn_t)-1; + return (pfn); +} + +/* + * plat_mem_node_intersect_range() + * + * Find the intersection between a memnode and a range of pfn's. + */ +void +plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len, + int mnode, pgcnt_t *npages_out) +{ + pfn_t offset, len, hole, base, end, test_end, frag; + pfn_t nearest; + mem_stripe_t *ms; + int i, npages; + + *npages_out = 0; + + if (!mem_node_config[mnode].exists || test_len == 0) + return; + + base = mem_node_config[mnode].physbase; + end = mem_node_config[mnode].physmax; + + test_end = test_base + test_len - 1; + if (end < test_base || base > test_end) + return; + + if (n_locality_groups == 1) { + *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1; + return; + } + + hole = mnode_stride - mnode_pages; + npages = 0; + + /* + * Iterate over all the stripes for this mnode (one per mblock), + * find the intersection with each, and accumulate the intersections. + * + * Determining the intersection with a stripe is tricky. If base or end + * fall outside the mem_node bounds, round them to physbase/physmax of + * mem_node.
If base or end fall in a gap, round them to start of + * nearest stripe. If they fall within a stripe, keep base or end, + * but calculate the fragment size that should be excluded from the + * stripe. Calculate how many strides fall in the adjusted range, + * multiply by stripe width, and add the start and end fragments. + */ + + for (i = mnode; i < n_mem_stripes; i += max_locality_groups) { + ms = &mem_stripes[i]; + if (ms->exists && + test_base <= (end = ms->physmax) && + test_end >= (base = ms->physbase)) { + + offset = ms->offset; + + if (test_base > base) { + /* Round test_base to next multiple of stride */ + len = P2ROUNDUP(test_base - (base - offset), + mnode_stride); + nearest = base - offset + len; + /* + * Compute distance from test_base to the + * stride boundary to see if test_base falls + * in the stripe or in the hole. + */ + if (nearest - test_base > hole) { + /* + * test_base lies in stripe, + * and offset should be excluded. + */ + offset = test_base - + (nearest - mnode_stride); + base = test_base; + } else { + /* round up to next stripe start */ + offset = 0; + base = nearest; + if (base > end) + continue; + } + + } + + if (test_end < end) + end = test_end; + end++; /* adjust to an exclusive bound */ + + /* Round end to next multiple of stride */ + len = P2ROUNDUP(end - (base - offset), mnode_stride); + nearest = (base - offset) + len; + if (nearest - end <= hole) { + /* end falls in hole, use entire last stripe */ + frag = 0; + } else { + /* end falls in stripe, compute fragment */ + frag = nearest - hole - end; + } + + len = (len >> stripe_shift) - offset - frag; + npages += len; + } + } + + *npages_out = npages; +} + +/* + * valid_pages() + * + * Return 1 if pages are valid and do not cross mnode boundaries + * (which would break page free list assumptions), and 0 otherwise. + */ + +#define MNODE(pa) \ + ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift) + +static int +valid_pages(md_t *md, mde_cookie_t cpu0) +{ + int i, max_szc; + uint64_t last_page_base, szc_mask; + uint64_t max_page_len, max_coalesce_len; + struct mblock_md *mb = mpo_mblock; + + /* + * Find the smaller of the largest page possible and supported. + * mmu_exported_pagesize_mask is not yet initialized, so read + * it from the MD. Apply minimal fixups in case of broken MDs + * to get a sane mask. + */ + + if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask)) + szc_mask = 0; + szc_mask |= (1 << TTE4M); /* largest in sun4v default support */ + max_szc = highbit(szc_mask) - 1; + if (max_szc > TTE256M) + max_szc = TTE256M; + max_page_len = TTEBYTES(max_szc); + + /* + * Page coalescing code coalesces all sizes up to 256M on sun4v, even + * if mmu-page-size-list does not contain it, so 256M pages must fall + * within one mnode to use MPO. + */ + max_coalesce_len = TTEBYTES(TTE256M); + ASSERT(max_coalesce_len >= max_page_len); + + if (ptob(mnode_pages) < max_coalesce_len) { + MPO_STATUS("Page too large; MPO disabled: page = %lx, " + "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages)); + return (0); + } + + for (i = 0; i < n_mblocks; i++) { + uint64_t base = mb->base; + uint64_t end = mb->base + mb->size - 1; + uint64_t ra_to_pa = mb->ra_to_pa; + + /* + * If mblock is smaller than the max page size, then + * RA = PA mod MAXPAGE is not guaranteed, but it must + * not span mnodes. 
+ */ + if (mb->size < max_page_len) { + if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) { + MPO_STATUS("Small mblock spans mnodes; " + "MPO disabled: base = %lx, end = %lx, " + "ra2pa = %lx\n", base, end, ra_to_pa); + return (0); + } + } else { + /* Verify RA = PA mod MAXPAGE, using coalesce size */ + uint64_t pa_base = base + ra_to_pa; + if ((base & (max_coalesce_len - 1)) != + (pa_base & (max_coalesce_len - 1))) { + MPO_STATUS("bad page alignment; MPO disabled: " + "ra = %lx, pa = %lx, pagelen = %lx\n", + base, pa_base, max_coalesce_len); + return (0); + } + } + + /* + * Find start of last large page in mblock in RA space. + * If page extends into the next mblock, verify the + * mnode does not change. + */ + last_page_base = P2ALIGN(end, max_coalesce_len); + if (i + 1 < n_mblocks && + last_page_base + max_coalesce_len > mb[1].base && + MNODE(last_page_base + ra_to_pa) != + MNODE(mb[1].base + mb[1].ra_to_pa)) { + MPO_STATUS("Large page spans mblocks; MPO disabled: " + "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, " + "pagelen = %lx\n", end, ra_to_pa, mb[1].base, + mb[1].ra_to_pa, max_coalesce_len); + return (0); + } + + mb++; + } + return (1); +} + + +/* + * fix_interleave() - Find lgroups with sub-page sized memory interleave, + * if any, and remove them. This yields a config where the "coarse + * grained" lgroups cover all of memory, even though part of that memory + * is fine grain interleaved and does not deliver a purely local memory + * latency. + * + * This function reads and modifies the globals: + * mpo_lgroup[], n_lgrpnodes + * + * Returns 1 if lgroup nodes were removed, 0 otherwise. + */ + +static int +fix_interleave(void) +{ + int i, j; + uint64_t mask = 0; + + j = 0; + for (i = 0; i < n_lgrpnodes; i++) { + if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) { + /* remove this lgroup */ + mask = mpo_lgroup[i].addr_mask; + } else { + mpo_lgroup[j++] = mpo_lgroup[i]; + } + } + n_lgrpnodes = j; + + if (mask != 0) + MPO_STATUS("sub-page interleave %lx found; " + "removing lgroup.\n", mask); + + return (mask != 0);
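The lgroup membership formula in the file's header comment reduces to one add, one mask, and one shift. A minimal user-space sketch of the RA-to-handle computation, assuming a hypothetical home mask at PA bits 29-30 and a hypothetical ra_to_pa offset (real values come from the MD's memory-latency-group and mblock nodes; __builtin_ctzll stands in for the kernel's lowbit()):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values, for illustration only; real ones come from the MD. */
static const uint64_t home_mask = 0x60000000ULL;  /* PA bits 29-30 */
static const uint64_t ra_to_pa = 0x200000000ULL;  /* address-congruence-offset */

/*
 * Translate RA to PA, mask off the home bits, and shift right by
 * lowbit(mask) - 1 to drop the zero bit positions: the lgrp_handle_t.
 */
static uint64_t
ra_to_lgrp_handle(uint64_t ra)
{
    uint64_t pa = ra + ra_to_pa;
    int shift = __builtin_ctzll(home_mask);   /* == lowbit(mask) - 1 */

    return ((pa & home_mask) >> shift);
}

int
main(void)
{
    uint64_t ra = 0x165432000ULL;             /* arbitrary sample RA */

    (void) printf("ra %llx -> lgrp handle %llu\n",
        (unsigned long long)ra,
        (unsigned long long)ra_to_lgrp_handle(ra));
    return (0);
}

plat_pfn_to_mem_node() performs the same computation in pfn units, using home_mask_pfn and home_mask_pfn_shift, after locating the mblock that supplies the ra_to_pa adjustment.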
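The globals that lgrp_traverse() derives from the first lgroup's address mask follow mechanically, as does the bit-contiguity check. A sketch of that derivation under the same hypothetical mask, with sun4v's 8K base page size; __builtin_ctzll/__builtin_clzll stand in for the kernel's lowbit()/highbit():

#include <stdint.h>
#include <stdio.h>

#define PAGESHIFT 13    /* 8K base pages on sun4v */

int
main(void)
{
    uint64_t home_mask = 0x60000000ULL;             /* hypothetical, PA bits 29-30 */
    uint64_t home_mask_pfn = home_mask >> PAGESHIFT;        /* btop() */
    int home_mask_shift = __builtin_ctzll(home_mask);       /* lowbit() - 1 */
    int home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
    uint64_t mnode_pages = 1ULL << home_mask_pfn_shift;     /* stripe width */
    uint64_t homes = home_mask_pfn >> home_mask_pfn_shift;
    int max_locality_groups = 1 << (64 - __builtin_clzll(homes));

    /* highbit() counterpart: verify the home mask bits are contiguous */
    if ((uint64_t)(max_locality_groups - 1) != homes) {
        (void) printf("home mask bits are not contiguous\n");
        return (1);
    }
    (void) printf("%d locality groups, stripe width %llu pages (%llu MB)\n",
        max_locality_groups, (unsigned long long)mnode_pages,
        (unsigned long long)((mnode_pages << PAGESHIFT) >> 20));
    return (0);
}

For this mask the sketch reports 4 locality groups and a 512MB stripe width, which is why the number of mem_nodes is bounded by the number of possible home-bit values.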
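The latency normalization is plain integer division: picoseconds divided by 10000 yields the nsec/10 units the lgrp framework expects. A sketch of the min/max scan in lgrp_traverse(), with made-up MD latencies of 150 and 420 nanoseconds:

#include <stdio.h>

int
main(void)
{
    /* Hypothetical per-lgroup latencies from the MD, in pico-seconds */
    int latency[] = { 150000, 420000, 150000, 420000 };
    int n = 4, i;
    int lower = latency[0], higher = latency[0];

    for (i = 1; i < n; i++) {
        if (latency[i] < lower)
            lower = latency[i];
        if (latency[i] > higher)
            higher = latency[i];
    }

    /* pico-seconds to the nano-sec/10 units the lgrp framework expects */
    (void) printf("local %d, remote %d (nsec/10)\n",
        lower / 10000, higher / 10000);     /* prints 15 and 42 */
    return (0);
}

The lowest value becomes the local latency of every CPU's home lgroup and the highest becomes the single remote latency, which is the two-level approximation the comment describes.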
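The 50%-loading threshold that plat_lgrp_init() computes can be made concrete with numbers. A sketch with hypothetical values (the lgrp_loadavg_max_effect value below is made up; the real default comes from the lgrp framework):

#include <stdio.h>

int
main(void)
{
    /*
     * Hypothetical values: the load-average contribution of one
     * running thread, and the smallest lgroup's CPU count.
     */
    int lgrp_loadavg_max_effect = 65536;
    int ncpu_min = 16;

    /* spread threads once the smallest lgroup is 50% loaded ... */
    int lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

    /* ... and only to a target at most half as loaded as home */
    int lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

    (void) printf("thresh %d, diff %d\n",
        lgrp_expand_proc_thresh, lgrp_expand_proc_diff);
    return (0);
}

Because virtualization can give lgroups different CPU counts while the tunables are global, the code sizes the threshold to the smallest lgroup, erring toward spreading threads across cores sooner.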
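The striping walk in plat_build_mem_nodes() is self-contained enough to run in user space. A sketch under hypothetical geometry (4 locality groups, 64MB stripe, 256MB stride, home bits at PA bits 26-27) that mirrors the do/while loop, including the end-fragment trimming; it prints byte addresses where the kernel stores pfns:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical geometry: 4 locality groups, 64MB stripe, 256MB stride. */
#define NLG        4
#define STRIPE     (64ULL << 20)
#define STRIDE     (NLG * STRIPE)
#define HOME_MASK  (STRIDE - STRIPE)   /* PA bits 26-27 */
#define HOME_SHIFT 26

int
main(void)
{
    uint64_t base = 0x140000000ULL;            /* sample mblock base */
    uint64_t end = base + 0x30000000ULL;       /* 768MB mblock */
    uint64_t ra_to_pa = 0;                     /* keep RA == PA here */
    uint64_t offset, stripe_end, frag, remove;
    int lgrp, lgrp_start;

    offset = (base + ra_to_pa) & (STRIPE - 1);
    stripe_end = base - offset + STRIPE;
    lgrp_start = (int)(((base + ra_to_pa) & HOME_MASK) >> HOME_SHIFT);
    lgrp = lgrp_start;

    do {
        if (base >= end)
            break;

        /* Fragment of the last partial stride beyond a whole stripe */
        frag = (end - (base - offset)) & (STRIDE - 1);
        if (frag == 0)
            remove = STRIDE - STRIPE;      /* remove the gap */
        else if (frag < STRIPE)
            remove = 0;                    /* fragment is all stripe */
        else
            remove = frag - STRIPE;        /* trim past whole stripe */

        (void) printf("mnode %d: physbase %llx physmax %llx "
            "offset %llx\n", lgrp, (unsigned long long)base,
            (unsigned long long)(end - 1 - remove),
            (unsigned long long)offset);

        base = stripe_end;
        stripe_end += STRIPE;
        offset = 0;
        lgrp = (int)(((base + ra_to_pa) & HOME_MASK) >> HOME_SHIFT);
    } while (lgrp != lgrp_start);

    return (0);
}

For this 768MB mblock the sketch prints one stripe record per mnode, with each physmax landing at the end of that mnode's last whole stripe in the block; the kernel stores the record at mem_stripes[i * max_locality_groups + mnode].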
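The pivotal step in plat_mem_node_iterator_init() is locating the first sorted mblock whose end_pfn is at or past the requested pfn. A sketch of just that search, over hypothetical pfn ranges:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pfn_t;

/* Hypothetical mblocks, sorted by base as mblock_sort() guarantees */
static struct {
    pfn_t base_pfn, end_pfn;
} mblk[] = {
    { 0x080000, 0x0fffff },
    { 0x200000, 0x27ffff },
    { 0x400000, 0x47ffff },
};
static int n_mblocks = 3;

/* Index of the mblock containing pfn, or the next one after it */
static int
find_mblock(pfn_t pfn)
{
    int i;

    for (i = 0; i < n_mblocks; i++) {
        if (pfn <= mblk[i].end_pfn)
            return (i);
    }
    return (-1);    /* past the last mblock: iteration is done */
}

int
main(void)
{
    (void) printf("%d %d %d\n",
        find_mblock(0x090000),      /* 0: inside the first mblock */
        find_mblock(0x100000),      /* 1: in a gap, rounds up */
        find_mblock(0x500000));     /* -1: past all mblocks */
    return (0);
}

When striping is in effect, the kernel then clips the returned pfn to the stripe bounds for the requested mnode, mem_stripes[i * max_locality_groups + mnode], rather than to the raw mblock bounds.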
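The closed-form stride arithmetic in plat_mem_node_intersect_range() is easiest to sanity-check against a brute-force count. A reference sketch under hypothetical striping (4 mnodes, 8-page stripe width, 32-page stride, pattern anchored at pfn 0):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pfn_t;

#define NLG    4               /* hypothetical locality groups */
#define WIDTH  8ULL            /* mnode_pages: stripe width in pages */
#define STRIDE (NLG * WIDTH)   /* mnode_stride */

/* Owner of a pfn for a stripe pattern that starts at pfn 0 */
static int
mnode_of(pfn_t pfn)
{
    return ((int)((pfn / WIDTH) % NLG));
}

int
main(void)
{
    pfn_t test_base = 5, test_len = 100, pfn;
    int mnode = 2, npages = 0;

    for (pfn = test_base; pfn < test_base + test_len; pfn++) {
        if (mnode_of(pfn) == mnode)
            npages++;
    }
    /* mnode 2 owns pfns 16-23, 48-55, 80-87 in [5, 104]: 24 pages */
    (void) printf("mnode %d owns %d of %llu pages\n",
        mnode, npages, (unsigned long long)test_len);
    return (0);
}

The kernel arrives at the same count with P2ROUNDUP() rounding and one multiply per stripe, never touching individual pages; the brute-force loop is only a way to check the fragment handling at both ends of the range.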
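The RA = PA mod MAXPAGE requirement that valid_pages() enforces is a mask comparison against the 256MB coalesce size (TTEBYTES(TTE256M)). A sketch with one aligned and one misaligned hypothetical ra_to_pa offset:

#include <stdint.h>
#include <stdio.h>

#define MAX_COALESCE_LEN (256ULL << 20)   /* TTEBYTES(TTE256M) */

/* Nonzero if base and base + ra_to_pa agree modulo the coalesce size */
static int
ra_pa_aligned(uint64_t base, uint64_t ra_to_pa)
{
    uint64_t pa_base = base + ra_to_pa;

    return ((base & (MAX_COALESCE_LEN - 1)) ==
        (pa_base & (MAX_COALESCE_LEN - 1)));
}

int
main(void)
{
    /* Hypothetical offsets: a 256MB multiple passes, 128MB does not */
    (void) printf("%d %d\n",
        ra_pa_aligned(0x100000000ULL, 0x30000000ULL),   /* 768MB offset */
        ra_pa_aligned(0x100000000ULL, 0x08000000ULL));  /* 128MB offset */
    return (0);
}

The first offset is a multiple of 256MB, so RA and PA agree in the low bits and large-page coalescing cannot straddle an mnode boundary; the second would break the congruence, which is one of the conditions under which the code disables MPO.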