Diffstat (limited to 'usr/src/uts/sun4v/os/mpo.c')
-rw-r--r--  usr/src/uts/sun4v/os/mpo.c  1264
1 files changed, 1264 insertions, 0 deletions
diff --git a/usr/src/uts/sun4v/os/mpo.c b/usr/src/uts/sun4v/os/mpo.c
new file mode 100644
index 0000000000..d98ce96438
--- /dev/null
+++ b/usr/src/uts/sun4v/os/mpo.c
@@ -0,0 +1,1264 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/machparam.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/mach_descrip.h>
+#include <sys/memnode.h>
+#include <sys/mdesc.h>
+#include <sys/mpo.h>
+#include <vm/vm_dep.h>
+
+/*
+ * MPO and the sun4v memory representation
+ * ---------------------------------------
+ *
+ * Latency groups are defined in the sun4v architecture by memory-latency-group
+ * nodes in the Machine Description, as specified in FWARC/2007/260. These
+ * tie together cpu nodes and mblock nodes, and contain mask and match
+ * properties that identify the portion of an mblock that belongs to the
+ * lgroup. Mask and match are defined in the Physical Address (PA) space,
+ * but an mblock defines Real Addresses (RA). To translate, the mblock
+ * includes the property address-congruence-offset, hereafter referred to as
+ * ra_to_pa. A real address ra is a member of an lgroup if
+ *
+ * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
+ *
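+ * For example (hypothetical values): an lgroup with mask 0x600000000
+ * and match 0x200000000, in an mblock with ra_to_pa 0x200000000,
+ * contains ra 0x80000000, since
+ *
+ * (0x80000000 + 0x200000000) & 0x600000000 == 0x200000000
+ *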
+ * The MD is traversed, and information on all mblocks is kept in the array
+ * mpo_mblock[]. Information on all CPUs, including which lgroup they map
+ * to, is kept in the array mpo_cpu[].
+ *
+ * This implementation makes (and verifies) the simplifying assumption that
+ * the mask bits are the same for all defined lgroups, and that all 1 bits in
+ * the mask are contiguous. Thus the number of lgroups is bounded by the
+ * number of possible mask values, and the lgrp_handle_t is defined as the
+ * mask value, shifted right to eliminate the 0 bit positions in mask. The
+ * masks and values are also referred to as "home bits" in the code.
+ *
+ * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
+ * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
+ * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
+ * home bits. This yields the mem_node.
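+ *
+ * Equivalently, in pfn terms as computed in plat_pfn_to_mem_node():
+ *
+ * mem_node = ((pfn + btop(ra_to_pa)) & home_mask_pfn)
+ * >> home_mask_pfn_shift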
+ *
+ * Interfaces
+ * ----------
+ *
+ * This file exports the following entry points:
+ *
+ * plat_lgrp_init()
+ * plat_build_mem_nodes()
+ * plat_lgrp_cpu_to_hand()
+ * plat_lgrp_latency()
+ * plat_pfn_to_mem_node()
+ * These implement the usual platform lgroup interfaces.
+ *
+ * plat_rapfn_to_papfn()
+ * Recover the PA page coloring bits from an RA.
+ *
+ * plat_mem_node_iterator_init()
+ * Initialize an iterator to efficiently step through pages in a mem_node.
+ *
+ * plat_mem_node_intersect_range()
+ * Find the intersection with a mem_node.
+ */
+
+int sun4v_mpo_enable = 1;
+int sun4v_mpo_debug = 0;
+char sun4v_mpo_status[256] = "";
+
+/* Save CPU info from the MD and associate CPUs with lgroups */
+static struct cpu_md mpo_cpu[NCPU];
+
+/* Save lgroup info from the MD */
+#define MAX_MD_LGROUPS 32
+static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
+static int n_lgrpnodes = 0;
+static int n_locality_groups = 0;
+static int max_locality_groups = 0;
+
+/* Save mblocks from the MD */
+static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
+static int n_mblocks = 0;
+
+/* Save mem_node stripes calculated from mblocks and lgroups. */
+static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
+static int n_mem_stripes = 0;
+static pfn_t mnode_stride; /* distance between stripes, start to start */
+static int stripe_shift; /* stride/stripes expressed as a shift */
+static pfn_t mnode_pages; /* mem_node stripe width */
+
+/* Save home mask and shift used to calculate lgrp_handle_t values */
+static uint64_t home_mask = 0;
+static pfn_t home_mask_pfn = 0;
+static int home_mask_shift = 0;
+static uint_t home_mask_pfn_shift = 0;
+
+/* Save lowest and highest latencies found across all lgroups */
+static int lower_latency = 0;
+static int higher_latency = 0;
+
+static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
+
+static int valid_pages(md_t *md, mde_cookie_t cpu0);
+static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
+static int fix_interleave(void);
+
+/* Debug support */
+#if defined(DEBUG) && !defined(lint)
+#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
+#else
+#define MPO_DEBUG(...)
+#endif /* DEBUG */
+
+/* Record status message, viewable from mdb */
+#define MPO_STATUS(args...) { \
+ (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
+ MPO_DEBUG(sun4v_mpo_status); \
+}
+
+/*
+ * Routine to read a uint64_t from a given md
+ */
+static int64_t
+get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
+{
+ int err = md_get_prop_val(md, node, propname, val);
+ return (err);
+}
+
+static int
+mblock_cmp(const void *a, const void *b)
+{
+ struct mblock_md *m1 = (struct mblock_md *)a;
+ struct mblock_md *m2 = (struct mblock_md *)b;
+
+ if (m1->base < m2->base)
+ return (-1);
+ else if (m1->base == m2->base)
+ return (0);
+ else
+ return (1);
+}
+
+static void
+mblock_sort(struct mblock_md *mblocks, int n)
+{
+ extern void qsort(void *, size_t, size_t,
+ int (*)(const void *, const void *));
+
+ qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
+}
+
+/*
+ *
+ * Traverse the MD to determine:
+ *
+ * Number of CPU nodes, lgrp_nodes, and mblocks
+ * Then for each lgrp_node, obtain the appropriate data.
+ * For each CPU, determine its home locality and store it.
+ * For each mblock, retrieve its data and store it.
+ */
+static int
+lgrp_traverse(md_t *md)
+{
+ mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
+ uint64_t i, j, k, o, n_nodes;
+ uint64_t n_lgroups = 0;
+ uint64_t mem_lg_homeset = 0;
+ int ret_val = 0;
+ int result = 0;
+ int n_cpunodes = 0;
+ int sub_page_fix;
+
+ n_nodes = md_node_count(md);
+
+ if (n_nodes <= 0) {
+ MPO_STATUS("lgrp_traverse: No nodes in node count\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ root = md_root_node(md);
+
+ if (root == MDE_INVAL_ELEM_COOKIE) {
+ MPO_STATUS("lgrp_traverse: Root node is missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Build the Memory Nodes. Do this before any possibility of
+ * bailing from this routine so we obtain ra_to_pa (needed for page
+ * coloring) even when there are no lgroups defined.
+ */
+
+ n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
+ "fwd", &mblocknodes);
+
+ if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
+ MPO_STATUS("lgrp_traverse: No mblock "
+ "nodes detected in Machine Descriptor\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ mpo_mblock[i].node = mblocknodes[i];
+
+ /* Without a base or size value we will fail */
+ result = get_int(md, mblocknodes[i], PROP_LG_BASE,
+ &mpo_mblock[i].base);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_BASE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
+ &mpo_mblock[i].size);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_SIZE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i],
+ PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
+
+ /* If we don't have an ra_pa_offset, just set it to 0 */
+ if (result < 0)
+ mpo_mblock[i].ra_to_pa = 0;
+
+ MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
+ "ra_to_pa = %lx\n", i,
+ mpo_mblock[i].base,
+ mpo_mblock[i].size,
+ mpo_mblock[i].ra_to_pa);
+ }
+
+ /* Must sort mblocks by address for mem_node_iterator_init() */
+ mblock_sort(mpo_mblock, n_mblocks);
+
+ base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
+
+ /* Page coloring hook is required so we can iterate through mnodes */
+ if (&page_next_pfn_for_color_cpu == NULL) {
+ MPO_STATUS("lgrp_traverse: No page coloring support\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Global enable for mpo */
+ if (sun4v_mpo_enable == 0) {
+ MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
+ "fwd", &lgrpnodes);
+
+ if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
+ MPO_STATUS("lgrp_traverse: No Lgroups\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
+
+ if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
+ MPO_STATUS("lgrp_traverse: No CPU nodes detected "
+ "in MD\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
+ MPO_DEBUG("lgrp_traverse: md: %p\n", md);
+ MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
+ MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
+ MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
+ MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ mpo_lgroup[i].node = lgrpnodes[i];
+ mpo_lgroup[i].id = i;
+ mpo_lgroup[i].ncpu = 0;
+ result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
+ &mpo_lgroup[i].addr_mask);
+ result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
+ &mpo_lgroup[i].addr_match);
+
+ /*
+ * If either the mask or match properties are missing, set to 0
+ */
+ if (result < 0) {
+ mpo_lgroup[i].addr_mask = 0;
+ mpo_lgroup[i].addr_match = 0;
+ }
+
+ /* Set latency to 0 if property not present */
+
+ result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
+ &mpo_lgroup[i].latency);
+ if (result < 0)
+ mpo_lgroup[i].latency = 0;
+ }
+
+ /*
+ * Sub-page level interleave is not yet supported. Check for it,
+ * and remove sub-page interleaved lgroups from mpo_lgroup and
+ * n_lgrpnodes. If no lgroups are left, return.
+ */
+
+ sub_page_fix = fix_interleave();
+ if (n_lgrpnodes == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Ensure that all of the addr_mask values are the same */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
+ MPO_STATUS("lgrp_traverse: "
+ "addr_mask values are not the same\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+
+ /*
+ * Ensure that all lgrp nodes see all the mblocks. However, if
+ * sub-page interleave is being fixed, they do not, so skip
+ * the check.
+ */
+
+ if (sub_page_fix == 0) {
+ for (i = 0; i < n_lgrpnodes; i++) {
+ j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
+ PROP_LG_MBLOCK, "fwd", &nodes);
+ md_free_scan_dag(md, &nodes);
+ if (j != n_mblocks) {
+ MPO_STATUS("lgrp_traverse: "
+ "sub-page interleave is being fixed\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+ }
+
+ /*
+ * Use the address mask from the first lgroup node
+ * to establish our home_mask.
+ */
+ home_mask = mpo_lgroup[0].addr_mask;
+ home_mask_pfn = btop(home_mask);
+ home_mask_shift = lowbit(home_mask) - 1;
+ home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
+ mnode_pages = btop(1ULL << home_mask_shift);
+
+ /*
+ * How many values are possible in home mask? Assume the mask
+ * bits are contiguous.
+ */
+ max_locality_groups =
+ 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
+
+ /* Now verify the home mask bits are contiguous */
+
+ if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
+ MPO_STATUS("lgrp_traverse: "
+ "home mask bits are not contiguous\n");
+ ret_val = -1;
+ goto fail;
+ }
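+
+ /*
+ * Worked example (hypothetical values): home_mask = 0x600000000,
+ * i.e. bits 33-34. Then home_mask_shift = 33, mnode_pages =
+ * btop(1ULL << 33) (an 8 GB stripe width), home_mask_pfn >>
+ * home_mask_pfn_shift = 3, and max_locality_groups =
+ * 1 << highbit(3) = 4, so the contiguity check above
+ * (4 - 1 == 3) passes.
+ */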
+
+ /* Record all of the home bits */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ HOMESET_ADD(mem_lg_homeset,
+ mpo_lgroup[i].addr_match >> home_mask_shift);
+ }
+
+ /* Count the number of different "home" mem_lg's we've discovered */
+
+ n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
+
+ /* If we have only 1 locality group then we can exit */
+ if (n_locality_groups == 1) {
+ MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Set the latencies. A CPU's lgroup is defined by the lowest
+ * latency found. All other memory is considered remote, and the
+ * remote latency is represented by the highest latency found.
+ * Thus hierarchical lgroups, if any, are approximated by a
+ * two level scheme.
+ *
+ * The Solaris MPO framework by convention wants to see latencies
+ * in units of nano-sec/10. In the MD, the units are defined to be
+ * pico-seconds.
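+ *
+ * For example, an MD latency of 120000 ps is 120 ns, which the
+ * division by 10000 below records as 12 (nano-sec/10 units).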
+ */
+
+ lower_latency = mpo_lgroup[0].latency;
+ higher_latency = mpo_lgroup[0].latency;
+
+ for (i = 1; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[i].latency < lower_latency) {
+ lower_latency = mpo_lgroup[i].latency;
+ }
+ if (mpo_lgroup[i].latency > higher_latency) {
+ higher_latency = mpo_lgroup[i].latency;
+ }
+ }
+ lower_latency /= 10000;
+ higher_latency /= 10000;
+
+ /* Clear our CPU data */
+
+ for (i = 0; i < NCPU; i++) {
+ mpo_cpu[i].home = 0;
+ mpo_cpu[i].latency = (uint_t)(-1);
+ }
+
+ /* Build the CPU nodes */
+ for (i = 0; i < n_cpunodes; i++) {
+
+ /* Read in the lgroup nodes */
+
+ result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
+ "fwd", &nodes);
+ if (n_lgroups <= 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Find the lowest-latency lgroup this cpu belongs to.
+ * Check all the lgrp nodes connected to this CPU to determine
+ * which has the smallest latency.
+ */
+
+ for (j = 0; j < n_lgroups; j++) {
+ for (o = 0; o < n_lgrpnodes; o++) {
+ if (nodes[j] == mpo_lgroup[o].node) {
+ if (mpo_lgroup[o].latency <
+ mpo_cpu[k].latency) {
+ mpo_cpu[k].home =
+ mpo_lgroup[o].addr_match
+ >> home_mask_shift;
+ mpo_cpu[k].latency =
+ mpo_lgroup[o].latency;
+ mpo_lgroup[o].ncpu++;
+ }
+ }
+ }
+ }
+ md_free_scan_dag(md, &nodes);
+ }
+
+ /* Validate that no large pages cross mnode boundaries. */
+ if (valid_pages(md, cpunodes[0]) == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+fail:
+ /* MD cookies are no longer valid; ensure they are not used again. */
+ for (i = 0; i < n_mblocks; i++)
+ mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
+ for (i = 0; i < n_lgrpnodes; i++)
+ mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
+
+ if (n_cpunodes > 0)
+ md_free_scan_dag(md, &cpunodes);
+ if (n_lgrpnodes > 0)
+ md_free_scan_dag(md, &lgrpnodes);
+ if (n_mblocks > 0)
+ md_free_scan_dag(md, &mblocknodes);
+ else
+ panic("lgrp_traverse: No memory blocks found");
+
+ if (ret_val == 0)
+ MPO_STATUS("MPO feature is enabled.\n");
+
+ return (ret_val);
+}
+
+/*
+ * Determine the number of unique mem_lg's present in our system
+ */
+static int
+unique_home_mem_lg_count(uint64_t mem_lg_homeset)
+{
+ int homeid;
+ int count = 0;
+
+ /*
+ * Scan the "home" bits of the mem_lgs, count
+ * the number that are unique.
+ */
+
+ for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
+ if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
+ count++;
+ }
+ }
+
+ MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
+ mem_lg_homeset);
+ MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
+
+ /* Default must be at least one */
+ if (count == 0)
+ count = 1;
+
+ return (count);
+}
+
+/*
+ * Platform specific lgroup initialization
+ */
+void
+plat_lgrp_init(void)
+{
+ md_t *md;
+ int i, rc, ncpu_min;
+
+ /* Get the Machine Descriptor handle */
+
+ md = md_get_handle();
+
+ /* If we cannot access the MD, we cannot continue */
+
+ if (md == NULL) {
+ panic("cannot access machine descriptor\n");
+ } else {
+ rc = lgrp_traverse(md);
+ (void) md_fini_handle(md);
+ }
+
+ /*
+ * If we can't process the MD for lgroups then at least let the
+ * system try to boot. Assume we have one lgroup so that
+ * when plat_build_mem_nodes is called, it will attempt to init
+ * an mnode based on the supplied memory segment.
+ */
+
+ if (rc == -1) {
+ home_mask_pfn = 0;
+ max_locality_groups = 1;
+ n_locality_groups = 1;
+ return;
+ }
+
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+
+ /* Use lgroup-aware TSB allocations */
+ tsb_lgrp_affinity = 1;
+
+ /*
+ * lgrp_expand_proc_thresh is the minimum load on the lgroups
+ * this process is currently running on before considering
+ * expanding threads to another lgroup.
+ *
+ * lgrp_expand_proc_diff determines how much less the remote lgroup
+ * must be loaded before expanding to it.
+ *
+ * On sun4v CMT processors, threads share a core pipeline, and
+ * at less than 100% utilization, best throughput is obtained by
+ * spreading threads across more cores, even if some are in a
+ * different lgroup. Spread threads to a new lgroup if the
+ * current group is more than 50% loaded. Because of virtualization,
+ * lgroups may have different numbers of CPUs, but the tunables
+ * apply to all lgroups, so find the smallest lgroup and compute
+ * 50% loading.
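+ *
+ * For example, if the smallest lgroup has 8 CPUs, the threshold
+ * below is 8 * lgrp_loadavg_max_effect / 2, i.e. the load
+ * equivalent of 4 fully loaded CPUs.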
+ */
+
+ ncpu_min = NCPU;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ int ncpu = mpo_lgroup[i].ncpu;
+ if (ncpu != 0 && ncpu < ncpu_min)
+ ncpu_min = ncpu;
+ }
+ lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
+
+ /* A new home must be at most half as loaded as the existing home */
+ lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
+
+ lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
+
+ /* Require that a home lgroup have some memory to be chosen */
+ lgrp_mem_free_thresh = 1;
+
+ /* Standard home-on-next-touch policy */
+ lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
+
+ /* Disable option to choose root lgroup if all leaf lgroups are busy */
+ lgrp_load_thresh = UINT32_MAX;
+}
+
+/*
+ * Helper routine for debugging calls to mem_node_add_slice()
+ */
+static void
+mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
+{
+#if defined(DEBUG) && !defined(lint)
+ static int slice_count = 0;
+
+ slice_count++;
+ MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
+ slice_count, basepfn, endpfn);
+#endif
+ mem_node_add_slice(basepfn, endpfn);
+}
+
+/*
+ * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
+ */
+static void
+mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
+{
+ MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
+ "mnode index: %d\n", plathand, mnode);
+ plat_assign_lgrphand_to_mem_node(plathand, mnode);
+}
+
+/*
+ * plat_build_mem_nodes()
+ *
+ * Define the mem_nodes based on the modified boot memory list,
+ * or based on info read from the MD in plat_lgrp_init().
+ *
+ * When the home mask lies in the middle of the address bits (as it does on
+ * Victoria Falls), then the memory in one mem_node is no longer contiguous;
+ * it is striped across an mblock in a repeating pattern of contiguous memory
+ * followed by a gap. The stripe width is the size of the contiguous piece.
+ * The stride is the distance from the start of one contiguous piece to the
+ * start of the next. The gap is thus stride - stripe_width.
+ *
+ * The stripe of an mnode that falls within an mblock is described by the type
+ * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
+ * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
+ * this array is predetermined. The mem_stripe_t that describes mnode m
+ * within mpo_mblock[i] is stored at
+ * mem_stripes[ m + i * max_locality_groups ]
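+ *
+ * For example, with max_locality_groups == 4, the stripe for
+ * mnode 2 within mpo_mblock[1] is mem_stripes[2 + 1 * 4], that is,
+ * mem_stripes[6].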
+ *
+ * max_locality_groups is the total number of possible locality groups,
+ * as defined by the size of the home mask, even if the memory assigned
+ * to the domain is small and does not cover all the lgroups. Thus some
+ * mem_stripe_t's may be empty.
+ *
+ * The members of mem_stripe_t are:
+ * physbase: First valid page in mem_node in the corresponding mblock
+ * physmax: Last valid page in mem_node in mblock
+ * offset: The full stripe width starts at physbase - offset.
+ * Thus if offset is non-zero, this mem_node starts in the middle
+ * of a stripe width, and the second full stripe starts at
+ * physbase - offset + stride. (even though physmax may fall in the
+ * middle of a stripe width, we do not save the ending fragment size
+ * in this data structure.)
+ * exists: Set to 1 if the mblock has memory in this mem_node stripe.
+ *
+ * The stripe width is kept in the global mnode_pages.
+ * The stride is kept in the global mnode_stride.
+ * All the above use pfn's as the unit.
+ *
+ * As an example, the memory layout for a domain with 2 mblocks and 4
+ * mem_nodes 0,1,2,3 could look like this:
+ *
+ * 123012301230 ... 012301230123 ...
+ * mblock 0 mblock 1
+ */
+
+void
+plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
+{
+ lgrp_handle_t lgrphand, lgrp_start;
+ int i, mnode, elem;
+ uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
+ uint64_t stripe, frag, remove;
+ mem_stripe_t *ms;
+
+ /* Check for non-MPO sun4v platforms */
+
+ if (n_locality_groups <= 1) {
+ mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
+ for (elem = 0; elem < nelems; elem += 2) {
+ base = list[elem];
+ len = list[elem+1];
+
+ mpo_mem_node_add_slice(btop(base),
+ btop(base + len - 1));
+ }
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+ n_mem_stripes = 0;
+ return;
+ }
+
+ /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
+ max_mem_nodes = max_locality_groups;
+ bzero(mem_stripes, sizeof (mem_stripes));
+ stripe = ptob(mnode_pages);
+ stride = max_locality_groups * stripe;
+
+ /* Save commonly used values in globals */
+ mnode_stride = btop(stride);
+ n_mem_stripes = max_locality_groups * n_mblocks;
+ stripe_shift = highbit(max_locality_groups) - 1;
+
+ for (i = 0; i < n_mblocks; i++) {
+
+ base = mpo_mblock[i].base;
+ end = mpo_mblock[i].base + mpo_mblock[i].size;
+ ra_to_pa = mpo_mblock[i].ra_to_pa;
+ mpo_mblock[i].base_pfn = btop(base);
+ mpo_mblock[i].end_pfn = btop(end - 1);
+
+ /* Find the offset from the prev stripe boundary in PA space. */
+ offset = (base + ra_to_pa) & (stripe - 1);
+
+ /* Set the next stripe boundary. */
+ stripe_end = base - offset + stripe;
+
+ lgrp_start = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ lgrphand = lgrp_start;
+
+ /*
+ * Loop over all lgroups covered by the mblock, creating a
+ * stripe for each. Stop when lgrp_start is visited again.
+ */
+ do {
+ /* mblock may not span all lgroups */
+ if (base >= end)
+ break;
+
+ mnode = lgrphand;
+ ASSERT(mnode < max_mem_nodes);
+
+ /*
+ * Calculate the size of the fragment that does not
+ * belong to the mnode in the last partial stride.
+ */
+ frag = (end - (base - offset)) & (stride - 1);
+ if (frag == 0) {
+ /* remove the gap */
+ remove = stride - stripe;
+ } else if (frag < stripe) {
+ /* fragment fits in stripe; keep it all */
+ remove = 0;
+ } else {
+ /* fragment is large; trim after whole stripe */
+ remove = frag - stripe;
+ }
+
+ ms = &mem_stripes[i * max_locality_groups + mnode];
+ ms->physbase = btop(base);
+ ms->physmax = btop(end - 1 - remove);
+ ms->offset = btop(offset);
+ ms->exists = 1;
+
+ mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
+ mpo_mem_node_add_slice(ms->physbase, ms->physmax);
+
+ base = stripe_end;
+ stripe_end += stripe;
+ offset = 0;
+ lgrphand = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ } while (lgrphand != lgrp_start);
+ }
+
+ /*
+ * Indicate to vm_pagelist that the hpm_counters array
+ * should be shared because the ranges overlap.
+ */
+ if (max_mem_nodes > 1) {
+ interleaved_mnodes = 1;
+ }
+}
+
+/*
+ * Return the locality group value for the supplied processor
+ */
+lgrp_handle_t
+plat_lgrp_cpu_to_hand(processorid_t id)
+{
+ if (n_locality_groups > 1) {
+ return ((lgrp_handle_t)mpo_cpu[(int)id].home);
+ } else {
+ return ((lgrp_handle_t)0); /* Default */
+ }
+}
+
+int
+plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
+{
+ /*
+ * Return min remote latency when there are more than two lgroups
+ * (root and child) and getting latency between two different lgroups
+ * or root is involved.
+ */
+ if (lgrp_optimizations() && (from != to ||
+ from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
+ return ((int)higher_latency);
+ } else {
+ return ((int)lower_latency);
+ }
+}
+
+int
+plat_pfn_to_mem_node(pfn_t pfn)
+{
+ int i, mnode;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ if (n_locality_groups <= 1)
+ return (0);
+
+ /*
+ * The mnode is defined to be 1:1 with the lgroup handle, which
+ * is taken from the home bits. Find the mblock in which
+ * the pfn falls to get the ra_to_pa adjustment, and extract
+ * the home bits.
+ */
+ mb = &mpo_mblock[0];
+ for (i = 0; i < n_mblocks; i++) {
+ if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
+ home_mask_pfn_shift);
+ ASSERT(mnode < max_mem_nodes);
+ return (mnode);
+ }
+ mb++;
+ }
+
+ panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_rapfn_to_papfn
+ *
+ * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
+ * and home mask bits are correct. The upper bits do not necessarily
+ * match the actual PA, however.
+ */
+pfn_t
+plat_rapfn_to_papfn(pfn_t pfn)
+{
+ int i;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ ASSERT(n_mblocks > 0);
+ if (n_mblocks == 1)
+ return (pfn + base_ra_to_pa_pfn);
+
+ /*
+ * Find the mblock in which the pfn falls
+ * in order to get the ra_to_pa adjustment.
+ */
+ for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
+ if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ return (pfn + ra_to_pa_pfn);
+ }
+ }
+
+ panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_mem_node_iterator_init()
+ * Initialize cookie to iterate over pfn's in an mnode. There is
+ * no additional iterator function. The caller uses the info from
+ * the iterator structure directly.
+ *
+ * pfn: starting pfn.
+ * mnode: desired mnode.
+ * init: set to 1 for full init, 0 for continuation
+ *
+ * Returns the appropriate starting pfn for the iteration
+ * the same as the input pfn if it falls in an mblock.
+ * Returns the (pfn_t)-1 value if the input pfn lies past
+ * the last valid mnode pfn.
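+ *
+ * A caller might drive it roughly as follows (a sketch under
+ * assumptions, not the verbatim page-list consumer):
+ *
+ * pfn = plat_mem_node_iterator_init(pfn, mnode, &it, 1);
+ * while (pfn != (pfn_t)-1) {
+ * walk [pfn, it.mi_mblock_end], using mi_mnode_pfn_mask
+ * and mi_ra_to_pa to skip pages in other mnodes;
+ * advance pfn past the current mblock, then
+ * pfn = plat_mem_node_iterator_init(pfn, mnode, &it, 0);
+ * }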
+ */
+pfn_t
+plat_mem_node_iterator_init(pfn_t pfn, int mnode,
+ mem_node_iterator_t *it, int init)
+{
+ int i;
+ struct mblock_md *mblock;
+ pfn_t base, end;
+
+ ASSERT(it != NULL);
+ ASSERT(mnode >= 0 && mnode < max_mem_nodes);
+ ASSERT(n_mblocks > 0);
+
+ if (init) {
+ it->mi_last_mblock = 0;
+ it->mi_init = 1;
+ }
+
+ /* Fast path for the non-MPO case: one locality group and one mblock */
+ if (n_locality_groups == 1 && n_mblocks == 1) {
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = base_ra_to_pa_pfn;
+ it->mi_mnode_pfn_mask = 0;
+ it->mi_mnode_pfn_shift = 0;
+ it->mi_mnode_mask = 0;
+ it->mi_mblock_base = mem_node_config[mnode].physbase;
+ it->mi_mblock_end = mem_node_config[mnode].physmax;
+ if (pfn < it->mi_mblock_base)
+ pfn = it->mi_mblock_base;
+ else if (pfn > it->mi_mblock_end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+ }
+
+ /*
+ * Find mblock that contains pfn, or first mblock after pfn,
+ * else pfn is out of bounds, so use the last mblock.
+ * mblocks are sorted in ascending address order.
+ */
+ ASSERT(it->mi_last_mblock < n_mblocks);
+ ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
+ i = init ? 0 : it->mi_last_mblock + 1;
+ if (i == n_mblocks)
+ return ((pfn_t)-1);
+
+ for (; i < n_mblocks; i++) {
+ if (pfn <= mpo_mblock[i].end_pfn)
+ break;
+ }
+ if (i == n_mblocks) {
+ it->mi_last_mblock = i - 1;
+ return ((pfn_t)-1);
+ }
+ it->mi_last_mblock = i;
+
+ /*
+ * Memory stripes are defined if there is more than one locality
+ * group, so use the stripe bounds. Otherwise use mblock bounds.
+ */
+ mblock = &mpo_mblock[i];
+ if (n_mem_stripes > 0) {
+ mem_stripe_t *ms =
+ &mem_stripes[i * max_locality_groups + mnode];
+ base = ms->physbase;
+ end = ms->physmax;
+ } else {
+ ASSERT(mnode == 0);
+ base = mblock->base_pfn;
+ end = mblock->end_pfn;
+ }
+
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = btop(mblock->ra_to_pa);
+ it->mi_mblock_base = base;
+ it->mi_mblock_end = end;
+ it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
+ it->mi_mnode_pfn_shift = home_mask_pfn_shift;
+ it->mi_mnode_mask = max_locality_groups - 1;
+ if (pfn < base)
+ pfn = base;
+ else if (pfn > end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+}
+
+/*
+ * plat_mem_node_intersect_range()
+ *
+ * Find the intersection between a memnode and a range of pfn's.
+ */
+void
+plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
+ int mnode, pgcnt_t *npages_out)
+{
+ pfn_t offset, len, hole, base, end, test_end, frag;
+ pfn_t nearest;
+ mem_stripe_t *ms;
+ int i, npages;
+
+ *npages_out = 0;
+
+ if (!mem_node_config[mnode].exists || test_len == 0)
+ return;
+
+ base = mem_node_config[mnode].physbase;
+ end = mem_node_config[mnode].physmax;
+
+ test_end = test_base + test_len - 1;
+ if (end < test_base || base > test_end)
+ return;
+
+ if (n_locality_groups == 1) {
+ *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
+ return;
+ }
+
+ hole = mnode_stride - mnode_pages;
+ npages = 0;
+
+ /*
+ * Iterate over all the stripes for this mnode (one per mblock),
+ * find the intersection with each, and accumulate the intersections.
+ *
+ * Determining the intersection with a stripe is tricky. If base or end
+ * fall outside the mem_node bounds, round them to physbase/physmax of
+ * mem_node. If base or end fall in a gap, round them to start of
+ * nearest stripe. If they fall within a stripe, keep base or end,
+ * but calculate the fragment size that should be excluded from the
+ * stripe. Calculate how many strides fall in the adjusted range,
+ * multiply by stripe width, and add the start and end fragments.
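+ *
+ * Worked example (hypothetical values): stripe = 0x100000 pages,
+ * stride = 0x400000 (so hole = 0x300000 and stripe_shift = 2), and
+ * a stripe with physbase = 0x400000, physmax = 0x8fffff, offset = 0
+ * lying fully inside the test range. Then end becomes 0x900000,
+ * len = P2ROUNDUP(0x500000, 0x400000) = 0x800000, end falls in the
+ * hole so frag = 0, and the stripe contributes 0x800000 >> 2 =
+ * 0x200000 pages: two full stripe widths.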
+ */
+
+ for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
+ ms = &mem_stripes[i];
+ if (ms->exists &&
+ test_base <= (end = ms->physmax) &&
+ test_end >= (base = ms->physbase)) {
+
+ offset = ms->offset;
+
+ if (test_base > base) {
+ /* Round test_base to next multiple of stride */
+ len = P2ROUNDUP(test_base - (base - offset),
+ mnode_stride);
+ nearest = base - offset + len;
+ /*
+ * Compute distance from test_base to the
+ * stride boundary to see if test_base falls
+ * in the stripe or in the hole.
+ */
+ if (nearest - test_base > hole) {
+ /*
+ * test_base lies in stripe,
+ * and offset should be excluded.
+ */
+ offset = test_base -
+ (nearest - mnode_stride);
+ base = test_base;
+ } else {
+ /* round up to next stripe start */
+ offset = 0;
+ base = nearest;
+ if (base > end)
+ continue;
+ }
+
+ }
+
+ if (test_end < end)
+ end = test_end;
+ end++; /* adjust to an exclusive bound */
+
+ /* Round end to next multiple of stride */
+ len = P2ROUNDUP(end - (base - offset), mnode_stride);
+ nearest = (base - offset) + len;
+ if (nearest - end <= hole) {
+ /* end falls in hole, use entire last stripe */
+ frag = 0;
+ } else {
+ /* end falls in stripe, compute fragment */
+ frag = nearest - hole - end;
+ }
+
+ len = (len >> stripe_shift) - offset - frag;
+ npages += len;
+ }
+ }
+
+ *npages_out = npages;
+}
+
+/*
+ * valid_pages()
+ *
+ * Return 1 if pages are valid and do not cross mnode boundaries
+ * (which would break page free list assumptions), and 0 otherwise.
+ */
+
+#define MNODE(pa) \
+ ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
+
+static int
+valid_pages(md_t *md, mde_cookie_t cpu0)
+{
+ int i, max_szc;
+ uint64_t last_page_base, szc_mask;
+ uint64_t max_page_len, max_coalesce_len;
+ struct mblock_md *mb = mpo_mblock;
+
+ /*
+ * Find the smaller of the largest page possible and supported.
+ * mmu_exported_pagesize_mask is not yet initialized, so read
+ * it from the MD. Apply minimal fixups in case of broken MDs
+ * to get a sane mask.
+ */
+
+ if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
+ szc_mask = 0;
+ szc_mask |= (1 << TTE4M); /* largest in sun4v default support */
+ max_szc = highbit(szc_mask) - 1;
+ if (max_szc > TTE256M)
+ max_szc = TTE256M;
+ max_page_len = TTEBYTES(max_szc);
+
+ /*
+ * Page coalescing code coalesces all sizes up to 256M on sun4v, even
+ * if mmu-page-size-list does not contain it, so 256M pages must fall
+ * within one mnode to use MPO.
+ */
+ max_coalesce_len = TTEBYTES(TTE256M);
+ ASSERT(max_coalesce_len >= max_page_len);
+
+ if (ptob(mnode_pages) < max_coalesce_len) {
+ MPO_STATUS("Page too large; MPO disabled: page = %lx, "
+ "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
+ return (0);
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ uint64_t base = mb->base;
+ uint64_t end = mb->base + mb->size - 1;
+ uint64_t ra_to_pa = mb->ra_to_pa;
+
+ /*
+ * If mblock is smaller than the max page size, then
+ * RA = PA mod MAXPAGE is not guaranteed, but it must
+ * not span mnodes.
+ */
+ if (mb->size < max_page_len) {
+ if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
+ MPO_STATUS("Small mblock spans mnodes; "
+ "MPO disabled: base = %lx, end = %lx, "
+ "ra2pa = %lx\n", base, end, ra_to_pa);
+ return (0);
+ }
+ } else {
+ /* Verify RA = PA mod MAXPAGE, using coalesce size */
+ uint64_t pa_base = base + ra_to_pa;
+ if ((base & (max_coalesce_len - 1)) !=
+ (pa_base & (max_coalesce_len - 1))) {
+ MPO_STATUS("bad page alignment; MPO disabled: "
+ "ra = %lx, pa = %lx, pagelen = %lx\n",
+ base, pa_base, max_coalesce_len);
+ return (0);
+ }
+ }
+
+ /*
+ * Find start of last large page in mblock in RA space.
+ * If page extends into the next mblock, verify the
+ * mnode does not change.
+ */
+ last_page_base = P2ALIGN(end, max_coalesce_len);
+ if (i + 1 < n_mblocks &&
+ last_page_base + max_coalesce_len > mb[1].base &&
+ MNODE(last_page_base + ra_to_pa) !=
+ MNODE(mb[1].base + mb[1].ra_to_pa)) {
+ MPO_STATUS("Large page spans mblocks; MPO disabled: "
+ "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
+ "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
+ mb[1].ra_to_pa, max_coalesce_len);
+ return (0);
+ }
+
+ mb++;
+ }
+ return (1);
+}
+
+
+/*
+ * fix_interleave() - Find lgroups with sub-page sized memory interleave,
+ * if any, and remove them. This yields a config where the "coarse
+ * grained" lgroups cover all of memory, even though part of that memory
+ * is fine grain interleaved and does not deliver a purely local memory
+ * latency.
+ *
+ * This function reads and modifies the globals:
+ * mpo_lgroup[], n_lgrpnodes
+ *
+ * Returns 1 if lgroup nodes were removed, 0 otherwise.
+ */
+
+static int
+fix_interleave(void)
+{
+ int i, j;
+ uint64_t mask = 0;
+
+ j = 0;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
+ /* remove this lgroup */
+ mask = mpo_lgroup[i].addr_mask;
+ } else {
+ mpo_lgroup[j++] = mpo_lgroup[i];
+ }
+ }
+ n_lgrpnodes = j;
+
+ if (mask != 0)
+ MPO_STATUS("sub-page interleave %lx found; "
+ "removing lgroup.\n", mask);
+
+ return (mask != 0);
+}