author     dp78419 <none@none>  2007-07-31 16:27:12 -0700
committer  dp78419 <none@none>  2007-07-31 16:27:12 -0700
commit     ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2 (patch)
tree       772441639680866ab4a841bbef119c6a813e6c09
parent     79777a7dd0179283917bda2ba98999c382d31c2c (diff)
download   illumos-joyent-ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2.tar.gz
PSARC 2006/675 MPO for Victoria Falls/Maramba project
6539930 MPO for sun4v platforms
-rw-r--r--  usr/src/uts/common/os/mem_cage.c          24
-rw-r--r--  usr/src/uts/common/sys/lgrp.h              9
-rw-r--r--  usr/src/uts/common/sys/pghw.h              7
-rw-r--r--  usr/src/uts/common/vm/vm_pagelist.c      345
-rw-r--r--  usr/src/uts/i86pc/os/memnode.c            13
-rw-r--r--  usr/src/uts/i86pc/os/mp_machdep.c         27
-rw-r--r--  usr/src/uts/i86pc/sys/memnode.h           11
-rw-r--r--  usr/src/uts/i86pc/vm/vm_dep.h             40
-rw-r--r--  usr/src/uts/sun4/os/memnode.c             70
-rw-r--r--  usr/src/uts/sun4/sys/memnode.h            17
-rw-r--r--  usr/src/uts/sun4/vm/vm_dep.h             153
-rw-r--r--  usr/src/uts/sun4u/os/cmp.c                27
-rw-r--r--  usr/src/uts/sun4v/Makefile.files           1
-rw-r--r--  usr/src/uts/sun4v/Makefile.sun4v.shared    1
-rw-r--r--  usr/src/uts/sun4v/cpu/generic.c            5
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara.c            3
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara2.c         284
-rw-r--r--  usr/src/uts/sun4v/os/cmp.c                37
-rw-r--r--  usr/src/uts/sun4v/os/mpo.c              1264
-rw-r--r--  usr/src/uts/sun4v/sys/cpu_module.h         3
-rw-r--r--  usr/src/uts/sun4v/sys/machcpuvar.h         2
-rw-r--r--  usr/src/uts/sun4v/sys/mpo.h              112
22 files changed, 2197 insertions(+), 258 deletions(-)
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c
index beb2fe3cbe..820fe555dd 100644
--- a/usr/src/uts/common/os/mem_cage.c
+++ b/usr/src/uts/common/os/mem_cage.c
@@ -335,7 +335,7 @@ kcage_next_range(int incage, pfn_t lo, pfn_t hi,
rw_enter(&kcage_range_rwlock, RW_READER);
for (lp = incage ? kcage_glist : kcage_current_glist;
- lp != NULL; lp = lp->next) {
+ lp != NULL; lp = lp->next) {
pfn_t klo, khi;
@@ -886,7 +886,7 @@ kcage_recalc_preferred_size(pgcnt_t preferred_size)
segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
}
kcage_kmemlp_mincage = MIN(lpmincage,
- (segkmem_kmemlp_max / PAGESIZE));
+ (segkmem_kmemlp_max / PAGESIZE));
preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
}
return (preferred_size);
@@ -1006,11 +1006,7 @@ kcage_init(pgcnt_t preferred_size)
*/
if (SEGKMEM_USE_LARGEPAGES) {
extern void page_freelist_coalesce_all(int mnode);
- extern int max_mem_nodes;
- int mnode, max_mnodes = max_mem_nodes;
- for (mnode = 0; mnode < max_mnodes; mnode++) {
- page_freelist_coalesce_all(mnode);
- }
+ page_freelist_coalesce_all(-1); /* do all mnodes */
}
ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
@@ -1288,7 +1284,7 @@ kcage_freemem_add(pgcnt_t npages)
wakeup_pcgs(); /* wakeup threads in pcgs() */
if (kcage_needfree != 0 &&
- kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
+ kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
mutex_enter(&kcage_throttle_mutex);
cv_broadcast(&kcage_throttle_cv);
@@ -1467,7 +1463,7 @@ kcage_expand()
* have enough free pages to page_relocate() even a single page.
*/
wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
- - kcage_freemem;
+ - kcage_freemem;
if (wanted <= 0)
return (0);
else if (freemem < pageout_reserve + 1) {
@@ -1683,7 +1679,7 @@ kcage_cageout()
#endif
CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
- callb_generic_cpr, "cageout");
+ callb_generic_cpr, "cageout");
mutex_enter(&kcage_cageout_mutex);
kcage_cageout_thread = curthread;
@@ -1724,7 +1720,7 @@ again:
pages_skipped = 0;
shared_skipped = 0;
while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
- (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
+ (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
if (start_pfn == PFN_INVALID)
start_pfn = pfn;
@@ -1820,7 +1816,7 @@ again:
* In pass {0, 1, 2}, skip page if mod bit is set.
*/
prm = hat_pagesync(pp,
- HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
+ HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
/* On first pass ignore ref'd pages */
if (pass <= 1 && (prm & P_REF)) {
@@ -1833,7 +1829,7 @@ again:
/* On pass 2, page_destroy if mod bit is not set */
if (pass <= 2) {
if (pp->p_szc != 0 || (prm & P_MOD) ||
- pp->p_lckcnt || pp->p_cowcnt) {
+ pp->p_lckcnt || pp->p_cowcnt) {
pages_skipped = 1;
page_unlock(pp);
} else {
@@ -1843,7 +1839,7 @@ again:
* checking if mod bit is set
*/
(void) hat_pageunload(pp,
- HAT_FORCE_PGUNLOAD);
+ HAT_FORCE_PGUNLOAD);
/*
* skip this page if modified
diff --git a/usr/src/uts/common/sys/lgrp.h b/usr/src/uts/common/sys/lgrp.h
index c0ed75d981..48ad8e8757 100644
--- a/usr/src/uts/common/sys/lgrp.h
+++ b/usr/src/uts/common/sys/lgrp.h
@@ -598,6 +598,15 @@ int lgrp_plat_latency(lgrp_handle_t, lgrp_handle_t);
lgrp_handle_t lgrp_plat_root_hand(void);
void lgrp_plat_probe(void);
+extern uint32_t lgrp_expand_proc_thresh;
+extern uint32_t lgrp_expand_proc_diff;
+extern pgcnt_t lgrp_mem_free_thresh;
+extern uint32_t lgrp_loadavg_tolerance;
+extern uint32_t lgrp_loadavg_max_effect;
+extern uint32_t lgrp_load_thresh;
+extern lgrp_mem_policy_t lgrp_mem_policy_root;
+extern int tsb_lgrp_affinity;
+
#endif /* _KERNEL && _KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h
index e78be92032..f22afc021b 100644
--- a/usr/src/uts/common/sys/pghw.h
+++ b/usr/src/uts/common/sys/pghw.h
@@ -52,17 +52,12 @@ typedef enum pghw_type {
PGHW_CACHE,
PGHW_FPU,
PGHW_MPIPE,
+ PGHW_CHIP,
PGHW_MEMORY,
PGHW_NUM_COMPONENTS
} pghw_type_t;
/*
- * Consider the physical processor sharing relationship
- * equivalant to a shared pipe to memory.
- */
-#define PGHW_CHIP PGHW_MPIPE
-
-/*
* Anonymous instance id
*/
#define PGHW_INSTANCE_ANON ((id_t)0xdecafbad)
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
index cef95452bf..d45b8cd0fe 100644
--- a/usr/src/uts/common/vm/vm_pagelist.c
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -497,21 +497,37 @@ page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
color &= ceq_mask;
- ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc);
+ ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
return (color | (ncolor & ~ceq_mask));
}
/*
+ * The interleaved_mnodes flag is set when mnodes overlap in
+ * the physbase..physmax range, but have disjoint slices.
+ * In this case hpm_counters is shared by all mnodes.
+ * This flag is set dynamically by the platform.
+ */
+int interleaved_mnodes = 0;
+
+/*
* Called by startup().
* Size up the per page size free list counters based on physmax
* of each node and max_mem_nodes.
+ *
+ * If interleaved_mnodes is set we need to find the first mnode that
+ * exists. hpm_counters for the first mnode will then be shared by
+ * all other mnodes. If interleaved_mnodes is not set, just set
+ * first=mnode each time. That means there will be no sharing.
*/
size_t
page_ctrs_sz(void)
{
int r; /* region size */
int mnode;
+ int firstmn; /* first mnode that exists */
int nranges;
+ pfn_t physbase;
+ pfn_t physmax;
uint_t ctrs_sz = 0;
int i;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
@@ -525,7 +541,7 @@ page_ctrs_sz(void)
colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
}
- for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
pgcnt_t r_pgcnt;
pfn_t r_base;
@@ -534,6 +550,7 @@ page_ctrs_sz(void)
if (mem_node_config[mnode].exists == 0)
continue;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
nranges = MNODE_RANGE_CNT(mnode);
mnode_nranges[mnode] = nranges;
mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
@@ -543,22 +560,25 @@ page_ctrs_sz(void)
* base aligned to large page size.
*/
for (r = 1; r < mmu_page_sizes; r++) {
+ /* add in space for hpm_color_current */
+ ctrs_sz += sizeof (size_t) *
+ colors_per_szc[r] * nranges;
+
+ if (firstmn != mnode)
+ continue;
+
/* add in space for hpm_counters */
r_align = page_get_pagecnt(r);
- r_base = mem_node_config[mnode].physbase;
+ r_base = physbase;
r_base &= ~(r_align - 1);
- r_pgcnt = howmany(mem_node_config[mnode].physmax -
- r_base + 1, r_align);
+ r_pgcnt = howmany(physmax - r_base + 1, r_align);
+
/*
* Round up to always allocate on pointer sized
* boundaries.
*/
ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
sizeof (hpmctr_t *));
-
- /* add in space for hpm_color_current */
- ctrs_sz += sizeof (size_t) *
- colors_per_szc[r] * nranges;
}
}
@@ -605,6 +625,9 @@ page_ctrs_alloc(caddr_t alloc_base)
int mrange, nranges;
int r; /* region size */
int i;
+ int firstmn; /* first mnode that exists */
+ pfn_t physbase;
+ pfn_t physmax;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
/*
@@ -660,7 +683,7 @@ page_ctrs_alloc(caddr_t alloc_base)
/* initialize page list counts */
PLCNT_INIT(alloc_base);
- for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
pgcnt_t r_pgcnt;
pfn_t r_base;
@@ -671,6 +694,8 @@ page_ctrs_alloc(caddr_t alloc_base)
if (mem_node_config[mnode].exists == 0)
continue;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
+
for (r = 1; r < mmu_page_sizes; r++) {
/*
* the page_counters base has to be aligned to the
@@ -678,11 +703,10 @@ page_ctrs_alloc(caddr_t alloc_base)
* will cross large page boundaries.
*/
r_align = page_get_pagecnt(r);
- r_base = mem_node_config[mnode].physbase;
+ r_base = physbase;
/* base needs to be aligned - lower to aligned value */
r_base &= ~(r_align - 1);
- r_pgcnt = howmany(mem_node_config[mnode].physmax -
- r_base + 1, r_align);
+ r_pgcnt = howmany(physmax - r_base + 1, r_align);
r_shift = PAGE_BSZS_SHIFT(r);
PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
@@ -699,9 +723,12 @@ page_ctrs_alloc(caddr_t alloc_base)
pfn_t pfnum = r_base;
size_t idx;
int mrange;
+ MEM_NODE_ITERATOR_DECL(it);
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ ASSERT(pfnum != (pfn_t)-1);
PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
- color_mask, color_mask);
+ color_mask, color_mask, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
idx = (idx >= r_pgcnt) ? 0 : idx;
for (mrange = 0; mrange < nranges; mrange++) {
@@ -709,14 +736,18 @@ page_ctrs_alloc(caddr_t alloc_base)
r, i, mrange) = idx;
}
}
- PAGE_COUNTERS_COUNTERS(mnode, r) =
- (hpmctr_t *)alloc_base;
- /*
- * Round up to make alloc_base always be aligned on
- * a pointer boundary.
- */
- alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
- sizeof (hpmctr_t *));
+
+ /* hpm_counters may be shared by all mnodes */
+ if (firstmn == mnode) {
+ PAGE_COUNTERS_COUNTERS(mnode, r) =
+ (hpmctr_t *)alloc_base;
+ alloc_base +=
+ P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
+ sizeof (hpmctr_t *));
+ } else {
+ PAGE_COUNTERS_COUNTERS(mnode, r) =
+ PAGE_COUNTERS_COUNTERS(firstmn, r);
+ }
/*
* Verify that PNUM_TO_IDX and IDX_TO_PNUM
@@ -735,7 +766,7 @@ page_ctrs_alloc(caddr_t alloc_base)
* page_ctrs_sz() has added some slop for these roundups.
*/
alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
- L2CACHE_ALIGN);
+ L2CACHE_ALIGN);
}
/* Initialize other page counter specific data structures. */
@@ -894,6 +925,7 @@ page_ctrs_adjust(int mnode)
size_t pcsz, old_csz;
hpmctr_t *new_ctr, *old_ctr;
pfn_t oldbase, newbase;
+ pfn_t physbase, physmax;
size_t old_npgs;
hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
size_t size_cache[MMU_PAGE_SIZES];
@@ -908,15 +940,17 @@ page_ctrs_adjust(int mnode)
int old_maxmrange, new_maxmrange;
int rc = 0;
- newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
- npgs = roundup(mem_node_config[mnode].physmax,
- PC_BASE_ALIGN) - newbase;
-
cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
MMU_PAGE_SIZES, KM_NOSLEEP);
if (cands_cache == NULL)
return (ENOMEM);
+ i = -1;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
+
+ newbase = physbase & ~PC_BASE_ALIGN_MASK;
+ npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
+
/* prepare to free non-null pointers on the way out */
cands_cache_nranges = nranges;
bzero(ctr_cache, sizeof (ctr_cache));
@@ -997,8 +1031,7 @@ page_ctrs_adjust(int mnode)
* Grab the write lock to prevent others from walking these arrays
* while we are modifying them.
*/
- rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
- page_freelist_lock(mnode);
+ PAGE_CTRS_WRITE_LOCK(mnode);
old_nranges = mnode_nranges[mnode];
cands_cache_nranges = old_nranges;
@@ -1016,7 +1049,7 @@ page_ctrs_adjust(int mnode)
for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
old_color_array[mrange] =
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
- r, mrange);
+ r, mrange);
}
pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
@@ -1048,6 +1081,21 @@ page_ctrs_adjust(int mnode)
PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
PAGE_COUNTERS_BASE(mnode, r) = newbase;
+
+ /* update shared hpm_counters in other mnodes */
+ if (interleaved_mnodes) {
+ for (i = 0; i < max_mem_nodes; i++) {
+ if (i == mnode)
+ continue;
+ if (mem_node_config[i].exists == 0)
+ continue;
+ ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
+ PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
+ PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
+ PAGE_COUNTERS_BASE(i, r) = newbase;
+ }
+ }
+
for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
color_cache[r][mrange];
@@ -1059,16 +1107,27 @@ page_ctrs_adjust(int mnode)
*/
for (i = 0; i < colors_per_szc[r]; i++) {
uint_t color_mask = colors_per_szc[r] - 1;
+ int mlo = interleaved_mnodes ? 0 : mnode;
+ int mhi = interleaved_mnodes ? max_mem_nodes :
+ (mnode + 1);
+ int m;
pfn_t pfnum = newbase;
size_t idx;
+ MEM_NODE_ITERATOR_DECL(it);
- PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
- color_mask);
- idx = PNUM_TO_IDX(mnode, r, pfnum);
- idx = (idx < pcsz) ? idx : 0;
- for (mrange = 0; mrange < nranges; mrange++) {
- PAGE_COUNTERS_CURRENT_COLOR(mnode,
- r, i, mrange) = idx;
+ for (m = mlo; m < mhi; m++) {
+ if (mem_node_config[m].exists == 0)
+ continue;
+ MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
+ ASSERT(pfnum != (pfn_t)-1);
+ PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
+ color_mask, &it);
+ idx = PNUM_TO_IDX(m, r, pfnum);
+ idx = (idx < pcsz) ? idx : 0;
+ for (mrange = 0; mrange < nranges; mrange++) {
+ PAGE_COUNTERS_CURRENT_COLOR(m,
+ r, i, mrange) = idx;
+ }
}
}
@@ -1129,8 +1188,7 @@ page_ctrs_adjust(int mnode)
}
}
}
- page_freelist_unlock(mnode);
- rw_exit(&page_ctrs_rwlock[mnode]);
+ PAGE_CTRS_WRITE_UNLOCK(mnode);
/*
* Now that we have dropped the write lock, it is safe to free all
@@ -2130,6 +2188,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
size_t len, idx, idx0;
pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
page_t *ret_pp;
+ MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
pfn_t pfnum0, nlo, nhi;
#endif
@@ -2169,11 +2228,15 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
/* round to szcpgcnt boundaries */
lo = P2ROUNDUP(lo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
+ ASSERT(lo != (pfn_t)-1);
hi = hi & ~(szcpgcnt - 1);
/* set lo to the closest pfn of the right color */
- if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) {
- PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask);
+ if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
+ (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
+ PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
+ &it);
}
if (hi <= lo) {
@@ -2208,11 +2271,22 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
pfnum = IDX_TO_PNUM(mnode, r, idx0);
if (pfnum < lo || pfnum >= hi) {
pfnum = lo;
- } else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) {
- /* pfnum has invalid color get the closest correct pfn */
- PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
- color_mask);
- pfnum = (pfnum >= hi) ? lo : pfnum;
+ } else {
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ if (pfnum == (pfn_t)-1) {
+ pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ ASSERT(pfnum != (pfn_t)-1);
+ } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
+ (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
+ /* invalid color, get the closest correct pfn */
+ PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
+ color_mask, &it);
+ if (pfnum >= hi) {
+ pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ }
+ }
}
/* set starting index */
@@ -2239,12 +2313,16 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
/* jump to the next page in the range */
if (pfnum < nlo) {
pfnum = P2ROUNDUP(nlo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
if (idx >= len || pfnum >= hi)
goto wrapit;
- if ((PFN_2_COLOR(pfnum, szc) ^ color) &
+ if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
ceq_mask)
goto next;
+ if (interleaved_mnodes &&
+ PFN_2_MEM_NODE(pfnum) != mnode)
+ goto next;
}
}
#endif
@@ -2264,7 +2342,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
if (ret_pp != NULL) {
VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
- PFN_2_COLOR(pfnum, szc), mrange) = idx;
+ PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
page_freelist_unlock(mnode);
rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
@@ -2299,11 +2377,12 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
}
next:
PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
- color_mask);
+ color_mask, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
if (idx >= len || pfnum >= hi) {
wrapit:
pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
wrap++;
#if defined(__sparc)
@@ -2319,14 +2398,17 @@ wrapit:
/*
* For the given mnode, promote as many small pages to large pages as possible.
+ * mnode can be -1, which means do them all
*/
void
page_freelist_coalesce_all(int mnode)
{
int r; /* region size */
int idx, full;
- pfn_t pfnum;
size_t len;
+ int doall = interleaved_mnodes || mnode < 0;
+ int mlo = doall ? 0 : mnode;
+ int mhi = doall ? max_mem_nodes : (mnode + 1);
VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
@@ -2340,39 +2422,54 @@ page_freelist_coalesce_all(int mnode)
* Always promote to the largest page possible
* first to reduce the number of page promotions.
*/
- rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
- page_freelist_lock(mnode);
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ page_freelist_lock(mnode);
+ }
for (r = mmu_page_sizes - 1; r > 0; r--) {
- pgcnt_t cands = 0;
- int mrange, nranges = mnode_nranges[mnode];
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ pgcnt_t cands = 0;
+ int mrange, nranges = mnode_nranges[mnode];
- for (mrange = 0; mrange < nranges; mrange++) {
- PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
- if (cands != 0)
- break;
- }
- if (cands == 0) {
- VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
- continue;
- }
+ for (mrange = 0; mrange < nranges; mrange++) {
+ PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
+ if (cands != 0)
+ break;
+ }
+ if (cands == 0) {
+ VM_STAT_ADD(vmm_vmstats.
+ page_ctrs_cands_skip_all);
+ continue;
+ }
- full = FULL_REGION_CNT(r);
- len = PAGE_COUNTERS_ENTRIES(mnode, r);
-
- for (idx = 0; idx < len; idx++) {
- if (PAGE_COUNTERS(mnode, r, idx) == full) {
- pfnum = IDX_TO_PNUM(mnode, r, idx);
- ASSERT(pfnum >=
- mem_node_config[mnode].physbase &&
- pfnum <
- mem_node_config[mnode].physmax);
- (void) page_promote(mnode,
- pfnum, r, PC_FREE, PC_MTYPE_ANY);
+ full = FULL_REGION_CNT(r);
+ len = PAGE_COUNTERS_ENTRIES(mnode, r);
+
+ for (idx = 0; idx < len; idx++) {
+ if (PAGE_COUNTERS(mnode, r, idx) == full) {
+ pfn_t pfnum =
+ IDX_TO_PNUM(mnode, r, idx);
+ int tmnode = interleaved_mnodes ?
+ PFN_2_MEM_NODE(pfnum) : mnode;
+
+ ASSERT(pfnum >=
+ mem_node_config[tmnode].physbase &&
+ pfnum <
+ mem_node_config[tmnode].physmax);
+
+ (void) page_promote(tmnode,
+ pfnum, r, PC_FREE, PC_MTYPE_ANY);
+ }
}
+ /* shared hpm_counters covers all mnodes, so we quit */
+ if (interleaved_mnodes)
+ break;
}
}
- page_freelist_unlock(mnode);
- rw_exit(&page_ctrs_rwlock[mnode]);
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ page_freelist_unlock(mnode);
+ rw_exit(&page_ctrs_rwlock[mnode]);
+ }
}
/*
@@ -2601,22 +2698,22 @@ page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
/* we can split pages in the freelist, but not the cachelist */
if (can_split) {
- plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
+ plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
- /* calculate next sizes color masks and number of free list bins */
- for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
- plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
- plw->plw_ceq_mask[szc]);
- plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
- }
- plw->plw_ceq_mask[nszc] = INVALID_MASK;
- plw->plw_bins[nszc] = 0;
+ /* set next szc color masks and number of free list bins */
+ for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
+ plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
+ plw->plw_ceq_mask[szc]);
+ plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
+ }
+ plw->plw_ceq_mask[nszc] = INVALID_MASK;
+ plw->plw_bins[nszc] = 0;
} else {
- ASSERT(szc == 0);
- plw->plw_do_split = 0;
- plw->plw_bins[1] = 0;
- plw->plw_ceq_mask[1] = INVALID_MASK;
+ ASSERT(szc == 0);
+ plw->plw_do_split = 0;
+ plw->plw_bins[1] = 0;
+ plw->plw_ceq_mask[1] = INVALID_MASK;
}
}
@@ -2664,7 +2761,7 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
plw->plw_bin_marker =
nbin = INC_MASKED(nbin, neq_mask,
- plw->plw_color_mask);
+ plw->plw_color_mask);
plw->plw_bin_split_prev = plw->plw_bin0;
/*
* large pages all have the same vac color
@@ -2710,10 +2807,10 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
}
if (plw->plw_bins[nszc] != 0) {
- nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
- if (!((plw->plw_split_next ^ nbin_nsz) &
- plw->plw_ceq_mask[nszc]))
- plw->plw_do_split = 1;
+ nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
+ if (!((plw->plw_split_next ^ nbin_nsz) &
+ plw->plw_ceq_mask[nszc]))
+ plw->plw_do_split = 1;
}
return (nbin);
@@ -2864,8 +2961,8 @@ bin_empty_1:
*/
if (plw.plw_do_split &&
(pp = page_freelist_split(szc, bin, mnode,
- mtype, PFNNULL, &plw)) != NULL)
- return (pp);
+ mtype, PFNNULL, &plw)) != NULL)
+ return (pp);
if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
@@ -3229,6 +3326,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
uint_t color_mask;
pfn_t hi, lo;
uint_t skip;
+ MEM_NODE_ITERATOR_DECL(it);
ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
@@ -3308,6 +3406,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
/* round to szcpgcnt boundaries */
lo = P2ROUNDUP(lo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
hi = hi & ~(szcpgcnt - 1);
if (hi <= lo)
@@ -3318,10 +3417,14 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
* page sizes may only have a single page color
*/
skip = szcpgcnt;
- if (ceq_mask > 0) {
+ if (ceq_mask > 0 || interleaved_mnodes) {
/* set lo to point at appropriate color */
- PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
- color_mask);
+ if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
+ (interleaved_mnodes &&
+ PFN_2_MEM_NODE(lo) != mnode)) {
+ PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
+ color_mask, &it);
+ }
if (hi <= lo)
/* mseg cannot satisfy color request */
continue;
@@ -3331,10 +3434,15 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
randpfn = (pfn_t)GETTICK();
randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
- if (ceq_mask) {
- PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask,
- color_mask);
- randpfn = (randpfn >= hi) ? lo : randpfn;
+ MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
+ if (ceq_mask || interleaved_mnodes) {
+ if (randpfn != (pfn_t)-1)
+ PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
+ ceq_mask, color_mask, &it);
+ if (randpfn >= hi) {
+ randpfn = lo;
+ MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
+ }
}
randpp = mseg->pages + (randpfn - mseg->pages_base);
@@ -3357,17 +3465,23 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
}
}
- if (ceq_mask == 0) {
+ if (ceq_mask == 0 && !interleaved_mnodes) {
pp += skip;
} else {
pfn_t pfn = pp->p_pagenum;
PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
- ceq_mask, color_mask);
- pp = mseg->pages + (pfn - mseg->pages_base);
+ ceq_mask, color_mask, &it);
+ if (pfn == (pfn_t)-1) {
+ pp = endpp;
+ } else {
+ pp = mseg->pages +
+ (pfn - mseg->pages_base);
+ }
}
if (pp >= endpp) {
/* start from the beginning */
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
pp = mseg->pages + (lo - mseg->pages_base);
ASSERT(pp->p_pagenum == lo);
ASSERT(pp + szcpgcnt <= endpp);
@@ -3947,9 +4061,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
- pplist = page_get_mnode_freelist(
- mnode, bin, mtype, szc,
- flags);
+ pplist =
+ page_get_mnode_freelist(mnode, bin,
+ mtype, szc, flags);
}
/*
@@ -3968,8 +4082,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
- pplist = page_get_mnode_cachelist(
- bin, flags, mnode, mtype);
+ pplist =
+ page_get_mnode_cachelist(bin, flags,
+ mnode, mtype);
}
if (pplist != NULL) {
page_hashout(pplist, NULL);
@@ -4079,11 +4194,11 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode =
- lgrp_memnode_choose(&lgrp_cookie))
+ lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
pplist = page_get_contig_pages(
- mnode, bin, mtype, szc,
- flags | PGI_PGCPHIPRI);
+ mnode, bin, mtype, szc,
+ flags | PGI_PGCPHIPRI);
}
break;
}
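[Illustrative sketch, not part of this changeset: how the new iterator-aware
color macros fit together. It mirrors the walk done by page_freelist_coalesce()
above; lo, hi, mnode, szc, color, ceq_mask and color_mask are assumed to be
already in scope, and color is a subset of ceq_mask.]

    MEM_NODE_ITERATOR_DECL(it);
    pfn_t pfn = lo;

    /* bind the iterator to this mnode; on sun4v this may yield (pfn_t)-1 */
    MEM_NODE_ITERATOR_INIT(pfn, mnode, &it);

    /* align the starting pfn to the requested color, as the code above does */
    if (pfn != (pfn_t)-1 &&
        ((PFN_2_COLOR(pfn, szc, &it) ^ color) & ceq_mask)) {
            PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
                color_mask, &it);
    }

    while (pfn != (pfn_t)-1 && pfn < hi) {
            /* ... examine the candidate page at pfn ... */
            PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
                color_mask, &it);
    }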
diff --git a/usr/src/uts/i86pc/os/memnode.c b/usr/src/uts/i86pc/os/memnode.c
index e64fd2b0c6..9440ad17f6 100644
--- a/usr/src/uts/i86pc/os/memnode.c
+++ b/usr/src/uts/i86pc/os/memnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -152,7 +151,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
if (!cancelled) {
delta_pgcnt = end - start;
node_size = mem_node_config[mnode].physmax -
- mem_node_config[mnode].physbase;
+ mem_node_config[mnode].physbase;
if (node_size > delta_pgcnt) {
/*
@@ -232,7 +231,7 @@ mem_node_alloc()
*/
for (mnode = 0; mnode < max_mem_nodes; mnode++)
if (cas32((uint32_t *)&mem_node_config[mnode].exists,
- 0, 1) == 0)
+ 0, 1) == 0)
break;
if (mnode >= max_mem_nodes)
@@ -273,7 +272,7 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist)
for (pmem = mlist; pmem; pmem = pmem->next) {
cur_base = btop(pmem->address);
cur_end = cur_base + btop(pmem->size) - 1;
- if (end <= cur_base || base >= cur_end)
+ if (end < cur_base || base > cur_end)
continue;
npgs = npgs + (MIN(cur_end, end) -
MAX(cur_base, base)) + 1;
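[Illustrative sketch, not part of this changeset: why the intersection test
changes from <= / >= to < / >. Both the mnode bounds and each memlist entry
are inclusive [base, end] pfn ranges, so ranges that merely touch at an
endpoint still share one page and must not be skipped.]

    /* inclusive ranges overlap iff neither ends before the other starts */
    static int
    pfn_ranges_overlap(pfn_t base, pfn_t end, pfn_t cur_base, pfn_t cur_end)
    {
            return (!(end < cur_base || base > cur_end));
    }

    /* e.g. [0x100, 0x1ff] and [0x1ff, 0x2ff] share pfn 0x1ff: they overlap */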
diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c
index 4605f6e517..a44c266f27 100644
--- a/usr/src/uts/i86pc/os/mp_machdep.c
+++ b/usr/src/uts/i86pc/os/mp_machdep.c
@@ -1343,3 +1343,30 @@ mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
}
return (PSM_SUCCESS);
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_CHIP)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/i86pc/sys/memnode.h b/usr/src/uts/i86pc/sys/memnode.h
index c76f90216e..21a059ac44 100644
--- a/usr/src/uts/i86pc/sys/memnode.h
+++ b/usr/src/uts/i86pc/sys/memnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -51,9 +50,6 @@ extern "C" {
#define PFN_2_MEM_NODE(pfn) \
((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0)
-#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \
- ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0)
-
#define MEM_NODE_2_LGRPHAND(mnode) \
((max_mem_nodes > 1) ? plat_mem_node_to_lgrphand(mnode) : \
LGRP_DEFAULT_HANDLE)
@@ -90,7 +86,6 @@ extern void mem_node_post_del_slice(pfn_t, pfn_t, int);
extern int mem_node_alloc(void);
extern pgcnt_t mem_node_memlist_pages(int, struct memlist *);
-
extern struct mem_node_conf mem_node_config[];
extern uint64_t mem_node_physalign;
extern int mem_node_pfn_shift;
diff --git a/usr/src/uts/i86pc/vm/vm_dep.h b/usr/src/uts/i86pc/vm/vm_dep.h
index b95f6b8e17..49e9386d81 100644
--- a/usr/src/uts/i86pc/vm/vm_dep.h
+++ b/usr/src/uts/i86pc/vm/vm_dep.h
@@ -39,6 +39,7 @@ extern "C" {
#include <sys/clock.h>
#include <vm/hat_pte.h>
#include <sys/param.h>
+#include <sys/memnode.h>
/*
* WARNING: vm_dep.h is included by files in common. As such, macros
@@ -285,10 +286,41 @@ extern kmutex_t *cpc_mutex[NPC_MUTEX];
extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
-#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
+/* mem node iterator is not used on x86 */
+#define MEM_NODE_ITERATOR_DECL(it)
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it)
+
+/*
+ * interleaved_mnodes mode is never set on x86, therefore,
+ * simply return the limits of the given mnode, which then
+ * determines the length of hpm_counters array for the mnode.
+ */
+#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \
+ { \
+ (physbase) = mem_node_config[(mnode)].physbase; \
+ (physmax) = mem_node_config[(mnode)].physmax; \
+ (first) = (mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_LOCK(mnode) \
+ { \
+ rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);\
+ page_freelist_lock(mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_UNLOCK(mnode) \
+ { \
+ page_freelist_unlock(mnode); \
+ rw_exit(&page_ctrs_rwlock[(mnode)]); \
+ }
+
+#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
(hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift)
-#define PFN_2_COLOR(pfn, szc) \
+#define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \
+ ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))
+
+#define PFN_2_COLOR(pfn, szc, it) \
(((pfn) & page_colors_mask) >> \
(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
@@ -305,7 +337,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
* This macro calculates the next sequential pfn with the specified
* color using color equivalency mask
*/
-#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \
+#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \
ASSERT(((color) & ~(ceq_mask)) == 0); \
{ \
uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \
@@ -329,7 +361,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
/* Find the bin for the given page if it was of size szc */
-#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc))
+#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, NULL))
#define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc))
diff --git a/usr/src/uts/sun4/os/memnode.c b/usr/src/uts/sun4/os/memnode.c
index 849bec22c8..cb21287ebd 100644
--- a/usr/src/uts/sun4/os/memnode.c
+++ b/usr/src/uts/sun4/os/memnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -83,7 +82,7 @@ mem_node_add_slice(pfn_t start, pfn_t end)
end = roundup(end, btop(mem_node_physalign)) - 1;
}
- if (&plat_slice_add)
+ if (&plat_slice_add != NULL)
plat_slice_add(start, end);
mnode = PFN_2_MEM_NODE(start);
@@ -148,7 +147,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
if (!cancelled) {
delta_pgcnt = end - start;
node_size = mem_node_config[mnode].physmax -
- mem_node_config[mnode].physbase;
+ mem_node_config[mnode].physbase;
if (node_size > delta_pgcnt) {
/*
@@ -180,7 +179,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
mem_node_config[mnode].exists = 0;
}
- if (&plat_slice_del)
+ if (&plat_slice_del != NULL)
plat_slice_del(start, end);
}
}
@@ -195,7 +194,7 @@ startup_build_mem_nodes(u_longlong_t *list, size_t nelems)
/* LINTED: ASSERT will always true or false */
ASSERT(NBBY * sizeof (mnodeset_t) >= max_mem_nodes);
- if (&plat_build_mem_nodes) {
+ if (&plat_build_mem_nodes != NULL) {
plat_build_mem_nodes(list, nelems);
} else {
/*
@@ -226,7 +225,7 @@ mem_node_alloc()
*/
for (mnode = 0; mnode < max_mem_nodes; mnode++)
if (cas32((uint32_t *)&mem_node_config[mnode].exists,
- 0, 1) == 0)
+ 0, 1) == 0)
break;
if (mnode >= max_mem_nodes)
@@ -247,27 +246,39 @@ mem_node_alloc()
* Find the intersection between a memnode and a memlist
* and returns the number of pages that overlap.
*
- * Assumes the list is protected from DR operations by
- * the memlist lock.
+ * Grab the memlist lock to protect the list from DR operations.
*/
pgcnt_t
mem_node_memlist_pages(int mnode, struct memlist *mlist)
{
pfn_t base, end;
pfn_t cur_base, cur_end;
- pgcnt_t npgs;
+ pgcnt_t npgs = 0;
+ pgcnt_t pages;
struct memlist *pmem;
+ if (&plat_mem_node_intersect_range != NULL) {
+ memlist_read_lock();
+
+ for (pmem = mlist; pmem; pmem = pmem->next) {
+ plat_mem_node_intersect_range(btop(pmem->address),
+ btop(pmem->size), mnode, &pages);
+ npgs += pages;
+ }
+
+ memlist_read_unlock();
+ return (npgs);
+ }
+
base = mem_node_config[mnode].physbase;
end = mem_node_config[mnode].physmax;
- npgs = 0;
memlist_read_lock();
for (pmem = mlist; pmem; pmem = pmem->next) {
cur_base = btop(pmem->address);
cur_end = cur_base + btop(pmem->size) - 1;
- if (end <= cur_base || base >= cur_end)
+ if (end < cur_base || base > cur_end)
continue;
npgs = npgs + (MIN(cur_end, end) -
MAX(cur_base, base)) + 1;
@@ -277,3 +288,34 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist)
return (npgs);
}
+
+/*
+ * Find MIN(physbase) and MAX(physmax) over all mnodes
+ *
+ * Called during startup and DR to find hpm_counters limits when
+ * interleaved_mnodes is set.
+ * NOTE: there is a race condition with DR if it tries to change more than
+ * one mnode in parallel. Sizing shared hpm_counters depends on finding the
+ * min(physbase) and max(physmax) across all mnodes. Therefore, the caller of
+ * page_ctrs_adjust must ensure that mem_node_config does not change while it
+ * is running.
+ */
+void
+mem_node_max_range(pfn_t *basep, pfn_t *maxp)
+{
+ int mnode;
+ pfn_t max = 0;
+ pfn_t base = (pfn_t)-1;
+
+ for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ if (mem_node_config[mnode].exists == 0)
+ continue;
+ if (max < mem_node_config[mnode].physmax)
+ max = mem_node_config[mnode].physmax;
+ if (base > mem_node_config[mnode].physbase)
+ base = mem_node_config[mnode].physbase;
+ }
+ ASSERT(base != (pfn_t)-1 && max != 0);
+ *basep = base;
+ *maxp = max;
+}
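[Illustrative sketch, not part of this changeset: how mem_node_max_range()
is consumed. The sun4 HPM_COUNTERS_LIMITS macro (sun4/vm/vm_dep.h below)
uses it to size one hpm_counters array spanning every mnode when
interleaved_mnodes is set; mnode, physbase and physmax are assumed in scope.]

    if (interleaved_mnodes) {
            /* one shared counters array covers min(physbase)..max(physmax) */
            mem_node_max_range(&physbase, &physmax);
    } else {
            /* private counters sized by this mnode's own bounds */
            physbase = mem_node_config[mnode].physbase;
            physmax = mem_node_config[mnode].physmax;
    }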
diff --git a/usr/src/uts/sun4/sys/memnode.h b/usr/src/uts/sun4/sys/memnode.h
index d8068b9235..745d03002f 100644
--- a/usr/src/uts/sun4/sys/memnode.h
+++ b/usr/src/uts/sun4/sys/memnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -66,16 +65,13 @@ extern "C" {
* nodes, so the platform can always make everything work.
*/
-#ifndef MAX_MEM_NODES
+#ifndef MAX_MEM_NODES
#define MAX_MEM_NODES (4)
#endif /* MAX_MEM_NODES */
#define PFN_2_MEM_NODE(pfn) \
((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0)
-#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \
- ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0)
-
#define MEM_NODE_2_LGRPHAND(mnode) \
((max_mem_nodes > 1) ? plat_mem_node_to_lgrphand(mnode) : \
LGRP_DEFAULT_HANDLE)
@@ -90,12 +86,14 @@ extern void plat_assign_lgrphand_to_mem_node(lgrp_handle_t, int);
extern lgrp_handle_t plat_mem_node_to_lgrphand(int);
extern void plat_slice_add(pfn_t, pfn_t);
extern void plat_slice_del(pfn_t, pfn_t);
+extern void plat_mem_node_intersect_range(pfn_t, pgcnt_t, int, pgcnt_t *);
#pragma weak plat_pfn_to_mem_node
#pragma weak plat_lgrphand_to_mem_node
#pragma weak plat_mem_node_to_lgrphand
#pragma weak plat_slice_add
#pragma weak plat_slice_del
+#pragma weak plat_mem_node_intersect_range
struct mem_node_conf {
int exists; /* only try if set, list may still be empty */
@@ -111,7 +109,8 @@ extern void mem_node_pre_del_slice(pfn_t, pfn_t);
extern void mem_node_post_del_slice(pfn_t, pfn_t, int);
extern int mem_node_alloc(void);
extern pgcnt_t mem_node_memlist_pages(int, struct memlist *);
-
+extern void mem_node_add_slice(pfn_t start, pfn_t end);
+extern void mem_node_max_range(pfn_t *, pfn_t *);
extern struct mem_node_conf mem_node_config[];
extern uint64_t mem_node_physalign;
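[Illustrative sketch, not part of this changeset: the weak-symbol hook
pattern used above. plat_mem_node_intersect_range() is declared with
#pragma weak, so generic code tests the symbol's address before calling it,
as mem_node_memlist_pages() and PLCNT_MODIFY_MAX do; basepfn, npages and
mnode are assumed in scope, and the range is assumed to intersect the mnode.]

    pgcnt_t npgs;

    if (&plat_mem_node_intersect_range != NULL) {
            /* the platform supplied the hook and knows how pfns interleave */
            plat_mem_node_intersect_range(basepfn, npages, mnode, &npgs);
    } else {
            /* no hook: the mnode is one contiguous pfn range; clamp to it */
            npgs = MIN(basepfn + npages - 1,
                mem_node_config[mnode].physmax) -
                MAX(basepfn, mem_node_config[mnode].physbase) + 1;
    }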
diff --git a/usr/src/uts/sun4/vm/vm_dep.h b/usr/src/uts/sun4/vm/vm_dep.h
index 6f150837f8..357f9ba0a3 100644
--- a/usr/src/uts/sun4/vm/vm_dep.h
+++ b/usr/src/uts/sun4/vm/vm_dep.h
@@ -107,6 +107,92 @@ extern kmutex_t *fpc_mutex[NPC_MUTEX];
extern kmutex_t *cpc_mutex[NPC_MUTEX];
/*
+ * Iterator provides the info needed to convert RA to PA.
+ * MEM_NODE_ITERATOR_INIT() should be called before
+ * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
+ * PAGE_NEXT_PFN_FOR_COLOR() call. The iterator caches color-to-hash
+ * translations, so the initializer must be called again whenever color or
+ * ceq_mask changes, even if pfn does not. MEM_NODE_ITERATOR_INIT() must
+ * also be called before any PFN_2_COLOR() call that uses a valid iterator
+ * argument.
+ */
+#ifdef sun4v
+
+typedef struct mem_node_iterator {
+ uint_t mi_mnode; /* mnode in which to iterate */
+ int mi_init; /* set to 1 when first init */
+ int mi_last_mblock; /* last mblock visited */
+ uint_t mi_hash_ceq_mask; /* cached copy of ceq_mask */
+ uint_t mi_hash_color; /* cached copy of color */
+ uint_t mi_mnode_mask; /* number of mask bits */
+ uint_t mi_mnode_pfn_shift; /* mnode position in pfn */
+ pfn_t mi_mblock_base; /* first valid pfn in current mblock */
+ pfn_t mi_mblock_end; /* last valid pfn in current mblock */
+ pfn_t mi_ra_to_pa; /* ra adjustment for current mblock */
+ pfn_t mi_mnode_pfn_mask; /* mask to obtain mnode id bits */
+} mem_node_iterator_t;
+
+#define MEM_NODE_ITERATOR_DECL(it) \
+ mem_node_iterator_t it
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it) \
+ (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (it), 1)
+
+extern pfn_t plat_mem_node_iterator_init(pfn_t, int,
+ mem_node_iterator_t *, int);
+extern pfn_t plat_rapfn_to_papfn(pfn_t);
+extern int interleaved_mnodes;
+
+#else /* sun4v */
+
+#define MEM_NODE_ITERATOR_DECL(it) \
+ void *it = NULL
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it)
+
+#endif /* sun4v */
+
+/*
+ * Return the mnode limits so that hpc_counters length and base
+ * index can be determined. When interleaved_mnodes is set, we
+ * create an array only for the first mnode that exists. All other
+ * mnodes will share the array in this case.
+ * If interleaved_mnodes is not set, simply return the limits for
+ * the given mnode.
+ */
+#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \
+ if (!interleaved_mnodes) { \
+ (physbase) = mem_node_config[(mnode)].physbase; \
+ (physmax) = mem_node_config[(mnode)].physmax; \
+ (first) = (mnode); \
+ } else if ((first) < 0) { \
+ mem_node_max_range(&(physbase), &(physmax)); \
+ (first) = (mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_LOCK(mnode) \
+ if (!interleaved_mnodes) { \
+ rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER); \
+ page_freelist_lock(mnode); \
+ } else { \
+ /* changing shared hpm_counters */ \
+ int _i; \
+ for (_i = 0; _i < max_mem_nodes; _i++) { \
+ rw_enter(&page_ctrs_rwlock[_i], RW_WRITER); \
+ page_freelist_lock(_i); \
+ } \
+ }
+
+#define PAGE_CTRS_WRITE_UNLOCK(mnode) \
+ if (!interleaved_mnodes) { \
+ page_freelist_unlock(mnode); \
+ rw_exit(&page_ctrs_rwlock[(mnode)]); \
+ } else { \
+ int _i; \
+ for (_i = 0; _i < max_mem_nodes; _i++) { \
+ page_freelist_unlock(_i); \
+ rw_exit(&page_ctrs_rwlock[_i]); \
+ } \
+ }
+
+/*
* cpu specific color conversion functions
*/
extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
@@ -118,11 +204,14 @@ extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
#pragma weak page_get_color_shift_cpu
+extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
+#pragma weak page_convert_color_cpu
+
extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
- uchar_t, uint_t, uint_t, uint_t);
+ uchar_t, uint_t, uint_t, uint_t, void *);
#pragma weak page_next_pfn_for_color_cpu
-extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
+extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
#pragma weak page_pfn_2_color_cpu
#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
@@ -131,9 +220,14 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
(hw_page_array[(nszc)].hp_shift - \
hw_page_array[(szc)].hp_shift))
-#define PFN_2_COLOR(pfn, szc) \
+#define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \
+ ((&page_convert_color_cpu != NULL) ? \
+ page_convert_color_cpu(ncolor, szc, nszc) : \
+ ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))))
+
+#define PFN_2_COLOR(pfn, szc, it) \
((&page_pfn_2_color_cpu != NULL) ? \
- page_pfn_2_color_cpu(pfn, szc) : \
+ page_pfn_2_color_cpu(pfn, szc, it) : \
((pfn & (hw_page_array[0].hp_colors - 1)) >> \
(hw_page_array[szc].hp_shift - \
hw_page_array[0].hp_shift)))
@@ -151,7 +245,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
* This macro calculates the next sequential pfn with the specified
* color using color equivalency mask
*/
-#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \
+#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \
ASSERT(((color) & ~(ceq_mask)) == 0); \
if (&page_next_pfn_for_color_cpu == NULL) { \
uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \
@@ -165,8 +259,8 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \
} \
} else { \
- pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \
- ceq_mask, color_mask); \
+ pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \
+ ceq_mask, color_mask, it); \
}
/* get the color equivalency mask for the next szc */
@@ -182,7 +276,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
page_get_nsz_color_cpu(szc, color))
/* Find the bin for the given page if it was of size szc */
-#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc))
+#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1)))
#define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc))
@@ -335,16 +429,31 @@ typedef struct {
* when memory is added (kphysm_add_memory_dynamic) or deleted
* (kphysm_del_cleanup).
*/
-#define PLCNT_MODIFY_MAX(startpfn, cnt) { \
- pfn_t pfn = startpfn, endpfn = startpfn + ABS(cnt); \
- while (pfn < endpfn) { \
- int mn = PFN_2_MEM_NODE(pfn); \
- long inc = MIN(endpfn, mem_node_config[mn].physmax + 1) \
- - pfn; \
- pfn += inc; \
- atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, \
- ((cnt) < 0) ? -inc: inc); \
- } \
+#define PLCNT_MODIFY_MAX(pfn, cnt) { \
+ spgcnt_t _cnt = (spgcnt_t)(cnt); \
+ pgcnt_t _acnt = ABS(_cnt); \
+ int _mn; \
+ pgcnt_t _np; \
+ if (&plat_mem_node_intersect_range != NULL) { \
+ for (_mn = 0; _mn < max_mem_nodes; _mn++) { \
+ plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
+ if (_np == 0) \
+ continue; \
+ atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
+ (_cnt < 0) ? -_np : _np); \
+ } \
+ } else { \
+ pfn_t _pfn = (pfn); \
+ pfn_t _endpfn = _pfn + _acnt; \
+ while (_pfn < _endpfn) { \
+ _mn = PFN_2_MEM_NODE(_pfn); \
+ _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
+ _pfn; \
+ _pfn += _np; \
+ atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
+ (_cnt < 0) ? -_np : _np); \
+ } \
+ } \
}
extern plcnt_t plcnt;
@@ -495,17 +604,17 @@ switch (consistent_coloring) { \
(vac_shift - MMU_PAGESHIFT)); \
if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) { \
pfn += slew; \
- bin = PFN_2_COLOR(pfn, szc); \
+ bin = PFN_2_COLOR(pfn, szc, NULL); \
} else { \
- bin = PFN_2_COLOR(pfn, szc); \
+ bin = PFN_2_COLOR(pfn, szc, NULL); \
bin += slew >> (vac_shift - MMU_PAGESHIFT); \
bin &= hw_page_array[(szc)].hp_colors - 1; \
} \
break; \
} \
case 1: \
- bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \
- szc); \
+ bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \
+ szc, NULL); \
break; \
case 2: { \
int cnt = as_color_bin(as); \
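[Illustrative sketch, not part of this changeset: what the rewritten
PLCNT_MODIFY_MAX above buys. With the intersect hook present, each mnode is
credited only for the pages that actually fall in it, so a DR add that
interleaves across mnodes no longer charges every page to the mnode of the
starting pfn. The pfn, count and even split below are made up for the
example.]

    /* DR adds 512 pages at pfn 0x10000, interleaved evenly across mnodes 0/1 */
    pgcnt_t np;

    plat_mem_node_intersect_range(0x10000, 512, 0, &np);   /* np == 256 here */
    plat_mem_node_intersect_range(0x10000, 512, 1, &np);   /* np == 256 here */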
diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c
index 86a021f3d1..20aa7855c8 100644
--- a/usr/src/uts/sun4u/os/cmp.c
+++ b/usr/src/uts/sun4u/os/cmp.c
@@ -280,3 +280,30 @@ cmp_set_nosteal_interval(void)
/* Set the nosteal interval (used by disp_getbest()) to 100us */
nosteal_nsec = 100000UL;
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_CHIP)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files
index d153205ab1..b7c02c1e22 100644
--- a/usr/src/uts/sun4v/Makefile.files
+++ b/usr/src/uts/sun4v/Makefile.files
@@ -62,6 +62,7 @@ CORE_OBJS += mach_xc.o
CORE_OBJS += mem_cage.o
CORE_OBJS += mem_config.o
CORE_OBJS += memlist_new.o
+CORE_OBJS += mpo.o
CORE_OBJS += ppage.o
CORE_OBJS += promif_asr.o
CORE_OBJS += promif_cpu.o
diff --git a/usr/src/uts/sun4v/Makefile.sun4v.shared b/usr/src/uts/sun4v/Makefile.sun4v.shared
index 82dbb2b21c..a299bb9a56 100644
--- a/usr/src/uts/sun4v/Makefile.sun4v.shared
+++ b/usr/src/uts/sun4v/Makefile.sun4v.shared
@@ -215,6 +215,7 @@ FDOFFSETS = $(UTSBASE)/sun/io/fd_offsets.in
#
MACHINE_DEFS = -D$(PLATFORM) -D_MACHDEP -DSFMMU
+MACHINE_DEFS += -DMAX_MEM_NODES=8
$(MPSAS_BUILD)MACHINE_DEFS += -DMPSAS
diff --git a/usr/src/uts/sun4v/cpu/generic.c b/usr/src/uts/sun4v/cpu/generic.c
index eab39b9fe9..21771a5f71 100644
--- a/usr/src/uts/sun4v/cpu/generic.c
+++ b/usr/src/uts/sun4v/cpu/generic.c
@@ -35,6 +35,7 @@
#include <sys/elf_SPARC.h>
#include <vm/hat_sfmmu.h>
#include <vm/page.h>
+#include <vm/vm_dep.h>
#include <sys/cpuvar.h>
#include <sys/async.h>
#include <sys/cmn_err.h>
@@ -167,6 +168,10 @@ cpu_map_exec_units(struct cpu *cp)
if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND)
cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id);
+ cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
+ if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
+ cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
+
cp->cpu_m.cpu_core = (id_t)(cp->cpu_id);
/*
diff --git a/usr/src/uts/sun4v/cpu/niagara.c b/usr/src/uts/sun4v/cpu/niagara.c
index cb7e182d27..d607c2625c 100644
--- a/usr/src/uts/sun4v/cpu/niagara.c
+++ b/usr/src/uts/sun4v/cpu/niagara.c
@@ -193,9 +193,10 @@ cpu_map_exec_units(struct cpu *cp)
/*
* Niagara systems just have one chip. Therefore, the chip id
- * is always 0.
+ * and mpipe id are always 0.
*/
cp->cpu_m.cpu_chip = 0;
+ cp->cpu_m.cpu_mpipe = 0;
}
static int niagara_cpucnt;
diff --git a/usr/src/uts/sun4v/cpu/niagara2.c b/usr/src/uts/sun4v/cpu/niagara2.c
index e791361578..e77b2ef3b4 100644
--- a/usr/src/uts/sun4v/cpu/niagara2.c
+++ b/usr/src/uts/sun4v/cpu/niagara2.c
@@ -198,9 +198,9 @@ cpu_map_exec_units(struct cpu *cp)
* share the same L2 cache. If no such info is available, we
* set the cpu to belong to the defacto chip 0.
*/
- cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping;
- if (cp->cpu_m.cpu_chip == NO_CHIP_MAPPING_FOUND)
- cp->cpu_m.cpu_chip = 0;
+ cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
+ if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
+ cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
}
static int cpucnt;
@@ -283,22 +283,112 @@ cpu_trapstat_data(void *buf, uint_t tstat_pgszs)
}
}
+/*
+ * Page coloring support for hashed cache index mode
+ */
+
+/*
+ * Node id bits from machine description (MD). Node id distinguishes
+ * local versus remote memory. Because of MPO, page allocation does
+ * not cross node boundaries. Therefore, remove the node id bits from
+ * the color, since they are fixed. Either bit 30, or 31:30 in
+ * Victoria Falls processors.
+ * The number of node id bits is always 0 in Niagara2.
+ */
+typedef struct n2color {
+ uchar_t nnbits; /* number of node id bits */
+ uchar_t nnmask; /* mask for node id bits */
+ uchar_t lomask; /* mask for bits below node id */
+ uchar_t lobits; /* number of bits below node id */
+} n2color_t;
+
+n2color_t n2color[MMU_PAGE_SIZES];
+static uchar_t nhbits[] = {7, 7, 6, 5, 5, 5};
+
+/*
+ * Remove node id bits from color bits 32:28.
+ * This will reduce the number of colors.
+ * No change if number of node bits is zero.
+ */
+static inline uint_t
+n2_hash2color(uint_t color, uchar_t szc)
+{
+ n2color_t m = n2color[szc];
+
+ if (m.nnbits > 0) {
+ color = ((color >> m.nnbits) & ~m.lomask) | (color & m.lomask);
+ ASSERT((color & ~(hw_page_array[szc].hp_colors - 1)) == 0);
+ }
+
+ return (color);
+}
+
+/*
+ * Restore node id bits into page color.
+ * This will increase the number of colors to match N2.
+ * No change if number of node bits is zero.
+ */
+static inline uint_t
+n2_color2hash(uint_t color, uchar_t szc, uint_t node)
+{
+ n2color_t m = n2color[szc];
+
+ if (m.nnbits > 0) {
+ color = ((color & ~m.lomask) << m.nnbits) | (color & m.lomask);
+ color |= (node & m.nnmask) << m.lobits;
+ }
+
+ return (color);
+}
+
/* NI2 L2$ index is pa[32:28]^pa[17:13].pa[19:18]^pa[12:11].pa[10:6] */
+
+/*
+ * iterator NULL means pfn is VA, do not adjust ra_to_pa
+ * iterator (-1) means pfn is RA, need to convert to PA
+ * iterator non-null means pfn is RA, use ra_to_pa
+ */
uint_t
-page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc)
+page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie)
{
+ mem_node_iterator_t *it = cookie;
uint_t color;
ASSERT(szc <= TTE256M);
+ if (it == ((mem_node_iterator_t *)(-1))) {
+ pfn = plat_rapfn_to_papfn(pfn);
+ } else if (it != NULL) {
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+ pfn = pfn + it->mi_ra_to_pa;
+ }
pfn = PFN_BASE(pfn, szc);
color = ((pfn >> 15) ^ pfn) & 0x1f;
- if (szc >= TTE4M)
- return (color);
+ if (szc < TTE4M) {
+ /* 19:18 */
+ color = (color << 2) | ((pfn >> 5) & 0x3);
+ if (szc > TTE64K)
+ color >>= 1; /* 19 */
+ }
+ return (n2_hash2color(color, szc));
+}
- color = (color << 2) | ((pfn >> 5) & 0x3);
+static uint_t
+page_papfn_2_color_cpu(pfn_t papfn, uchar_t szc)
+{
+ uint_t color;
+
+ ASSERT(szc <= TTE256M);
- return (szc <= TTE64K ? color : (color >> 1));
+ papfn = PFN_BASE(papfn, szc);
+ color = ((papfn >> 15) ^ papfn) & 0x1f;
+ if (szc < TTE4M) {
+ /* 19:18 */
+ color = (color << 2) | ((papfn >> 5) & 0x3);
+ if (szc > TTE64K)
+ color >>= 1; /* 19 */
+ }
+ return (color);
}
#if TTE256M != 5
@@ -310,46 +400,91 @@ page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask)
{
static uint_t ni2_color_masks[5] = {0x63, 0x1e, 0x3e, 0x1f, 0x1f};
ASSERT(szc < TTE256M);
-
+ mask = n2_color2hash(mask, szc, 0);
mask &= ni2_color_masks[szc];
- return ((szc == TTE64K || szc == TTE512K) ? (mask >> 1) : mask);
+ if (szc == TTE64K || szc == TTE512K)
+ mask >>= 1;
+ return (n2_hash2color(mask, szc + 1));
}
uint_t
page_get_nsz_color_cpu(uchar_t szc, uint_t color)
{
ASSERT(szc < TTE256M);
- return ((szc == TTE64K || szc == TTE512K) ? (color >> 1) : color);
+ color = n2_color2hash(color, szc, 0);
+ if (szc == TTE64K || szc == TTE512K)
+ color >>= 1;
+ return (n2_hash2color(color, szc + 1));
}
uint_t
page_get_color_shift_cpu(uchar_t szc, uchar_t nszc)
{
+ uint_t s;
ASSERT(nszc >= szc);
ASSERT(nszc <= TTE256M);
- if (szc == nszc)
- return (0);
- if (szc <= TTE64K)
- return ((nszc >= TTE4M) ? 2 : ((nszc >= TTE512K) ? 1 : 0));
- if (szc == TTE512K)
- return (1);
+ s = nhbits[szc] - n2color[szc].nnbits;
+ s -= nhbits[nszc] - n2color[nszc].nnbits;
- return (0);
+ return (s);
+}
+
+uint_t
+page_convert_color_cpu(uint_t ncolor, uchar_t szc, uchar_t nszc)
+{
+ uint_t color;
+
+ ASSERT(nszc > szc);
+ ASSERT(nszc <= TTE256M);
+ ncolor = n2_color2hash(ncolor, nszc, 0);
+ color = ncolor << (nhbits[szc] - nhbits[nszc]);
+ color = n2_hash2color(color, szc);
+ return (color);
}
+#define PAPFN_2_MNODE(pfn) \
+ (((pfn) & it->mi_mnode_pfn_mask) >> it->mi_mnode_pfn_shift)
+
/*ARGSUSED*/
pfn_t
page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
- uint_t ceq_mask, uint_t color_mask)
+ uint_t ceq_mask, uint_t color_mask, void *cookie)
{
+ mem_node_iterator_t *it = cookie;
pfn_t pstep = PNUM_SIZE(szc);
pfn_t npfn, pfn_ceq_mask, pfn_color;
pfn_t tmpmask, mask = (pfn_t)-1;
+ uint_t pfnmn;
ASSERT((color & ~ceq_mask) == 0);
-
- if (((page_pfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0) {
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+
+ /* convert RA to PA for accurate color calculation */
+ if (it->mi_init) {
+ /* first call after it, so cache these values */
+ it->mi_hash_ceq_mask =
+ n2_color2hash(ceq_mask, szc, it->mi_mnode_mask);
+ it->mi_hash_color =
+ n2_color2hash(color, szc, it->mi_mnode);
+ it->mi_init = 0;
+ } else {
+ ASSERT(it->mi_hash_ceq_mask ==
+ n2_color2hash(ceq_mask, szc, it->mi_mnode_mask));
+ ASSERT(it->mi_hash_color ==
+ n2_color2hash(color, szc, it->mi_mnode));
+ }
+ ceq_mask = it->mi_hash_ceq_mask;
+ color = it->mi_hash_color;
+ pfn += it->mi_ra_to_pa;
+
+ /* restart here when we switch memblocks */
+next_mem_block:
+ if (szc <= TTE64K) {
+ pfnmn = PAPFN_2_MNODE(pfn);
+ }
+ if (((page_papfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0 &&
+ (szc > TTE64K || pfnmn == it->mi_mnode)) {
/* we start from the page with correct color */
if (szc >= TTE512K) {
@@ -361,18 +496,19 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
pfn_ceq_mask = ((ceq_mask & 1) << 6) |
((ceq_mask >> 1) << 15);
}
- pfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
- return (pfn);
+ npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
+ goto done;
} else {
/*
* We deal 64K or 8K page. Check if we could the
* satisfy the request without changing PA[32:28]
*/
pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2);
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
- return (npfn);
+ goto done;
/*
* for next pfn we have to change bits PA[32:28]
@@ -382,15 +518,14 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
npfn |= (ceq_mask & color & 3) << 5;
pfn_ceq_mask = (szc == TTE8K) ? 0 :
(ceq_mask & 0x1c) << 13;
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
npfn = ADD_MASKED(npfn, (1 << 15), pfn_ceq_mask, mask);
/*
* set bits PA[17:13] to match the color
*/
- ceq_mask >>= 2;
- color = (color >> 2) & ceq_mask;
- npfn |= ((npfn >> 15) ^ color) & ceq_mask;
- return (npfn);
+ npfn |= ((npfn >> 15) ^ (color >> 2)) & (ceq_mask >> 2);
+ goto done;
}
}
@@ -405,9 +540,9 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
} else {
/* try get the right color by changing bit PA[19:19] */
npfn = pfn + pstep;
- if (((page_pfn_2_color_cpu(npfn, szc) ^ color) &
+ if (((page_papfn_2_color_cpu(npfn, szc) ^ color) &
ceq_mask) == 0)
- return (npfn);
+ goto done;
/* page color is PA[32:28].PA[19:19] */
pfn_ceq_mask = ((ceq_mask & 1) << 6) |
@@ -419,34 +554,45 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
while (npfn <= pfn) {
npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask);
}
- return (npfn);
+ goto done;
}
/*
- * We deal 64K or 8K page of incorrect color.
+	 * We deal with a 64K or 8K page of incorrect color.
* Try correcting color without changing PA[32:28]
*/
-
pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2);
pfn_color = ((color & 3) << 5) | (color >> 2);
- npfn = (pfn & ~(pfn_t)0x7f);
- npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
- npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
-
- if (((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0) {
-
- /* the color is fixed - find the next page */
- while (npfn <= pfn) {
- npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask);
+ if (pfnmn == it->mi_mnode) {
+ npfn = (pfn & ~(pfn_t)0x7f);
+ npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
+ npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
+
+ if (((page_papfn_2_color_cpu(npfn, szc) ^ color) &
+ ceq_mask) == 0) {
+ /* the color is fixed - find the next page */
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
+ while (npfn <= pfn) {
+ npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask,
+ mask);
+ }
+ if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
+ goto done;
}
- if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
- return (npfn);
}
/* to fix the color need to touch PA[32:28] */
npfn = (szc == TTE8K) ? ((pfn >> 15) << 15) :
(((pfn >> 18) << 18) | ((color & 0x1c) << 13));
+
+ /* fix mnode if input pfn is in the wrong mnode. */
+ if ((pfnmn = PAPFN_2_MNODE(npfn)) != it->mi_mnode) {
+ npfn += ((it->mi_mnode - pfnmn) & it->mi_mnode_mask) <<
+ it->mi_mnode_pfn_shift;
+ }
+
tmpmask = (szc == TTE8K) ? 0 : (ceq_mask & 0x1c) << 13;
+ tmpmask |= it->mi_mnode_pfn_mask;
while (npfn <= pfn) {
npfn = ADD_MASKED(npfn, (1 << 15), tmpmask, mask);
@@ -456,25 +602,58 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
npfn |= (((npfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
- ASSERT(((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0);
+done:
+ ASSERT(((page_papfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0);
+ ASSERT(PAPFN_2_MNODE(npfn) == it->mi_mnode);
+
+ /* PA to RA */
+ npfn -= it->mi_ra_to_pa;
+
+ /* check for possible memblock switch */
+ if (npfn > it->mi_mblock_end) {
+ pfn = plat_mem_node_iterator_init(npfn, it->mi_mnode, it, 0);
+ if (pfn == (pfn_t)-1)
+ return (pfn);
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+ pfn += it->mi_ra_to_pa;
+ goto next_mem_block;
+ }
return (npfn);
}
/*
* init page coloring
+ * VF encodes node_id for an L-group in either bit 30 or bits 31:30,
+ * which effectively reduces the number of colors available per mnode.
*/
void
page_coloring_init_cpu()
{
int i;
-
- hw_page_array[0].hp_colors = 1 << 7;
- hw_page_array[1].hp_colors = 1 << 7;
- hw_page_array[2].hp_colors = 1 << 6;
-
- for (i = 3; i < mmu_page_sizes; i++) {
- hw_page_array[i].hp_colors = 1 << 5;
+ uchar_t id;
+ uchar_t lo;
+ uchar_t hi;
+ n2color_t m;
+ mem_node_iterator_t it;
+ static uchar_t idmask[] = {0, 0x7, 0x1f, 0x1f, 0x1f, 0x1f};
+
+ (void) plat_mem_node_iterator_init(0, 0, &it, 1);
+ for (i = 0; i < mmu_page_sizes; i++) {
+ memset(&m, 0, sizeof (m));
+ id = it.mi_mnode_pfn_mask >> 15; /* node id mask */
+ id &= idmask[i];
+ lo = lowbit(id);
+ if (lo > 0) {
+ hi = highbit(id);
+ m.nnbits = hi - lo + 1;
+ m.nnmask = (1 << m.nnbits) - 1;
+ lo += nhbits[i] - 5;
+ m.lomask = (1 << (lo - 1)) - 1;
+ m.lobits = lo - 1;
+ }
+ hw_page_array[i].hp_colors = 1 << (nhbits[i] - m.nnbits);
+ n2color[i] = m;
}
}
@@ -486,6 +665,7 @@ page_set_colorequiv_arr_cpu(void)
{
static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {2, 5, 0, 0, 0, 0};
+ nequiv_shades_log2[1] -= n2color[1].nnbits;
if (colorequiv > 1) {
int i;
uint_t sv_a = lowbit(colorequiv) - 1;
diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c
index 1503ef4b47..d5a9e3087d 100644
--- a/usr/src/uts/sun4v/os/cmp.c
+++ b/usr/src/uts/sun4v/os/cmp.c
@@ -97,7 +97,7 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
return (1);
case PGHW_FPU:
return (1);
- case PGHW_CHIP:
+ case PGHW_MPIPE:
return (1);
}
return (0);
@@ -120,8 +120,8 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
switch (hw) {
case PGHW_IPIPE:
return (cpu->cpu_m.cpu_ipipe);
- case PGHW_CHIP:
- return (cpu->cpu_m.cpu_chip);
+ case PGHW_MPIPE:
+ return (cpu->cpu_m.cpu_mpipe);
case PGHW_FPU:
return (cpu->cpu_m.cpu_fpu);
default:
@@ -143,7 +143,7 @@ pg_plat_hw_level(pghw_type_t hw)
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_FPU,
- PGHW_CHIP,
+ PGHW_MPIPE,
PGHW_NUM_COMPONENTS
};
@@ -164,7 +164,7 @@ pg_plat_cmt_load_bal_hw(pghw_type_t hw)
{
if (hw == PGHW_IPIPE ||
hw == PGHW_FPU ||
- hw == PGHW_CHIP)
+ hw == PGHW_MPIPE)
return (1);
else
return (0);
@@ -195,3 +195,30 @@ cmp_set_nosteal_interval(void)
{
nosteal_nsec = 0;
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_MPIPE)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/sun4v/os/mpo.c b/usr/src/uts/sun4v/os/mpo.c
new file mode 100644
index 0000000000..d98ce96438
--- /dev/null
+++ b/usr/src/uts/sun4v/os/mpo.c
@@ -0,0 +1,1264 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/machparam.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/mach_descrip.h>
+#include <sys/memnode.h>
+#include <sys/mdesc.h>
+#include <sys/mpo.h>
+#include <vm/vm_dep.h>
+
+/*
+ * MPO and the sun4v memory representation
+ * ---------------------------------------
+ *
+ * Latency groups are defined in the sun4v architecture by memory-latency-group
+ * nodes in the Machine Description, as specified in FWARC/2007/260. These
+ * tie together cpu nodes and mblock nodes, and contain mask and match
+ * properties that identify the portion of an mblock that belongs to the
+ * lgroup. Mask and match are defined in the Physical Address (PA) space,
+ * but an mblock defines Real Addresses (RA). To translate, the mblock
+ * includes the property address-congruence-offset, hereafter referred to as
+ * ra_to_pa. A real address ra is a member of an lgroup if
+ *
+ * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
+ *
+ * The MD is traversed, and information on all mblocks is kept in the array
+ * mpo_mblock[]. Information on all CPUs, including which lgroup they map
+ * to, is kept in the array mpo_cpu[].
+ *
+ * This implementation makes (and verifies) the simplifying assumption that
+ * the mask bits are the same for all defined lgroups, and that all 1 bits in
+ * the mask are contiguous. Thus the number of lgroups is bounded by the
+ * number of possible mask values, and the lgrp_handle_t is defined as the
+ * mask value, shifted right to eliminate the 0 bit positions in mask. The
+ * masks and values are also referred to as "home bits" in the code.
+ *
+ * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
+ * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
+ * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
+ * home bits. This yields the mem_node.
+ *
+ * Interfaces
+ * ----------
+ *
+ * This file exports the following entry points:
+ *
+ * plat_lgrp_init()
+ * plat_build_mem_nodes()
+ * plat_lgrp_cpu_to_hand()
+ * plat_lgrp_latency()
+ * plat_pfn_to_mem_node()
+ * These implement the usual platform lgroup interfaces.
+ *
+ * plat_rapfn_to_papfn()
+ * Recover the PA page coloring bits from an RA.
+ *
+ * plat_mem_node_iterator_init()
+ * Initialize an iterator to efficiently step through pages in a mem_node.
+ *
+ * plat_mem_node_intersect_range()
+ * Find the intersection with a mem_node.
+ */
+
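
A small standalone sketch (not part of the patch) of the membership rule and home-bit
extraction described above; the mask, match, and ra_to_pa values are invented for
illustration and assume a contiguous home mask in PA[33:32]:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t lgrp_mask  = 0x300000000ULL;	/* home mask, PA[33:32] */
		uint64_t lgrp_match = 0x100000000ULL;	/* this lgroup's match value */
		uint64_t ra_to_pa   = 0x080000000ULL;	/* mblock address-congruence-offset */
		uint64_t ra         = 0x088000000ULL;	/* a real address in the mblock */

		uint64_t pa = ra + ra_to_pa;
		int member = ((pa & lgrp_mask) == lgrp_match);
		/* lgrp_handle_t: home bits shifted right by lowbit(mask) - 1 = 32 */
		int home = (int)((pa & lgrp_mask) >> 32);

		printf("pa = 0x%llx, member = %d, home = %d\n",
		    (unsigned long long)pa, member, home);
		return (0);
	}

Here pa is 0x108000000, so the address belongs to the lgroup with match value
0x100000000, and its home (and hence its mem_node) is 1.
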
+int sun4v_mpo_enable = 1;
+int sun4v_mpo_debug = 0;
+char sun4v_mpo_status[256] = "";
+
+/* Save CPU info from the MD and associate CPUs with lgroups */
+static struct cpu_md mpo_cpu[NCPU];
+
+/* Save lgroup info from the MD */
+#define MAX_MD_LGROUPS 32
+static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
+static int n_lgrpnodes = 0;
+static int n_locality_groups = 0;
+static int max_locality_groups = 0;
+
+/* Save mblocks from the MD */
+static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
+static int n_mblocks = 0;
+
+/* Save mem_node stripes calculated from mblocks and lgroups. */
+static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
+static int n_mem_stripes = 0;
+static pfn_t mnode_stride; /* distance between stripes, start to start */
+static int stripe_shift; /* stride/stripes expressed as a shift */
+static pfn_t mnode_pages; /* mem_node stripe width */
+
+/* Save home mask and shift used to calculate lgrp_handle_t values */
+static uint64_t home_mask = 0;
+static pfn_t home_mask_pfn = 0;
+static int home_mask_shift = 0;
+static uint_t home_mask_pfn_shift = 0;
+
+/* Save lowest and highest latencies found across all lgroups */
+static int lower_latency = 0;
+static int higher_latency = 0;
+
+static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
+
+static int valid_pages(md_t *md, mde_cookie_t cpu0);
+static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
+static int fix_interleave(void);
+
+/* Debug support */
+#if defined(DEBUG) && !defined(lint)
+#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
+#else
+#define MPO_DEBUG(...)
+#endif /* DEBUG */
+
+/* Record status message, viewable from mdb */
+#define MPO_STATUS(args...) { \
+ (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
+ MPO_DEBUG(sun4v_mpo_status); \
+}
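
As the comment notes, the status string is meant to be inspected from mdb after boot; a
typical (not patch-mandated) way to read it is:

	# echo 'sun4v_mpo_status/s' | mdb -k
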
+
+/*
+ * Routine to read a uint64_t from a given md
+ */
+static int64_t
+get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
+{
+ int err = md_get_prop_val(md, node, propname, val);
+ return (err);
+}
+
+static int
+mblock_cmp(const void *a, const void *b)
+{
+ struct mblock_md *m1 = (struct mblock_md *)a;
+ struct mblock_md *m2 = (struct mblock_md *)b;
+
+ if (m1->base < m2->base)
+ return (-1);
+ else if (m1->base == m2->base)
+ return (0);
+ else
+ return (1);
+}
+
+static void
+mblock_sort(struct mblock_md *mblocks, int n)
+{
+ extern void qsort(void *, size_t, size_t,
+ int (*)(const void *, const void *));
+
+ qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
+}
+
+/*
+ *
+ * Traverse the MD to determine:
+ *
+ * Number of CPU nodes, lgrp_nodes, and mblocks
+ * Then for each lgrp_node, obtain the appropriate data.
+ * For each CPU, determine its home locality and store it.
+ * For each mblock, retrieve its data and store it.
+ */
+static int
+lgrp_traverse(md_t *md)
+{
+ mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
+ uint64_t i, j, k, o, n_nodes;
+ uint64_t n_lgroups = 0;
+ uint64_t mem_lg_homeset = 0;
+ int ret_val = 0;
+ int result = 0;
+ int n_cpunodes = 0;
+ int sub_page_fix;
+
+ n_nodes = md_node_count(md);
+
+ if (n_nodes <= 0) {
+ MPO_STATUS("lgrp_traverse: No nodes in node count\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ root = md_root_node(md);
+
+ if (root == MDE_INVAL_ELEM_COOKIE) {
+ MPO_STATUS("lgrp_traverse: Root node is missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Build the Memory Nodes. Do this before any possibility of
+ * bailing from this routine so we obtain ra_to_pa (needed for page
+ * coloring) even when there are no lgroups defined.
+ */
+
+ n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
+ "fwd", &mblocknodes);
+
+ if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
+ MPO_STATUS("lgrp_traverse: No mblock "
+ "nodes detected in Machine Descriptor\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ mpo_mblock[i].node = mblocknodes[i];
+
+ /* Without a base or size value we will fail */
+ result = get_int(md, mblocknodes[i], PROP_LG_BASE,
+ &mpo_mblock[i].base);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_BASE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
+ &mpo_mblock[i].size);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_SIZE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i],
+ PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
+
+ /* If we don't have an ra_pa_offset, just set it to 0 */
+ if (result < 0)
+ mpo_mblock[i].ra_to_pa = 0;
+
+ MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
+ "ra_to_pa = %lx\n", i,
+ mpo_mblock[i].base,
+ mpo_mblock[i].size,
+ mpo_mblock[i].ra_to_pa);
+ }
+
+ /* Must sort mblocks by address for mem_node_iterator_init() */
+ mblock_sort(mpo_mblock, n_mblocks);
+
+ base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
+
+ /* Page coloring hook is required so we can iterate through mnodes */
+ if (&page_next_pfn_for_color_cpu == NULL) {
+ MPO_STATUS("lgrp_traverse: No page coloring support\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Global enable for mpo */
+ if (sun4v_mpo_enable == 0) {
+ MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
+ "fwd", &lgrpnodes);
+
+ if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
+ MPO_STATUS("lgrp_traverse: No Lgroups\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
+
+ if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
+ MPO_STATUS("lgrp_traverse: No CPU nodes detected "
+ "in MD\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
+ MPO_DEBUG("lgrp_traverse: md: %p\n", md);
+ MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
+ MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
+ MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
+ MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ mpo_lgroup[i].node = lgrpnodes[i];
+ mpo_lgroup[i].id = i;
+ mpo_lgroup[i].ncpu = 0;
+ result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
+ &mpo_lgroup[i].addr_mask);
+ result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
+ &mpo_lgroup[i].addr_match);
+
+ /*
+ * If either the mask or match properties are missing, set to 0
+ */
+ if (result < 0) {
+ mpo_lgroup[i].addr_mask = 0;
+ mpo_lgroup[i].addr_match = 0;
+ }
+
+ /* Set latency to 0 if property not present */
+
+ result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
+ &mpo_lgroup[i].latency);
+ if (result < 0)
+ mpo_lgroup[i].latency = 0;
+ }
+
+ /*
+ * Sub-page level interleave is not yet supported. Check for it,
+ * and remove sub-page interleaved lgroups from mpo_lgroup and
+ * n_lgrpnodes. If no lgroups are left, return.
+ */
+
+ sub_page_fix = fix_interleave();
+ if (n_lgrpnodes == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Ensure that all of the addr_mask values are the same */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
+ MPO_STATUS("lgrp_traverse: "
+ "addr_mask values are not the same\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+
+ /*
+ * Ensure that all lgrp nodes see all the mblocks. However, if
+ * sub-page interleave is being fixed, they do not, so skip
+ * the check.
+ */
+
+ if (sub_page_fix == 0) {
+ for (i = 0; i < n_lgrpnodes; i++) {
+ j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
+ PROP_LG_MBLOCK, "fwd", &nodes);
+ md_free_scan_dag(md, &nodes);
+ if (j != n_mblocks) {
+ MPO_STATUS("lgrp_traverse: "
+ "sub-page interleave is being fixed\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+ }
+
+ /*
+ * Use the address mask from the first lgroup node
+ * to establish our home_mask.
+ */
+ home_mask = mpo_lgroup[0].addr_mask;
+ home_mask_pfn = btop(home_mask);
+ home_mask_shift = lowbit(home_mask) - 1;
+ home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
+ mnode_pages = btop(1ULL << home_mask_shift);
+
+ /*
+ * How many values are possible in home mask? Assume the mask
+ * bits are contiguous.
+ */
+ max_locality_groups =
+ 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
+
+ /* Now verify the home mask bits are contiguous */
+
+ if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
+ MPO_STATUS("lgrp_traverse: "
+ "home mask bits are not contiguous\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Record all of the home bits */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ HOMESET_ADD(mem_lg_homeset,
+ mpo_lgroup[i].addr_match >> home_mask_shift);
+ }
+
+	/* Count the number of different "home" mem_lg's we've discovered */
+
+ n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
+
+ /* If we have only 1 locality group then we can exit */
+ if (n_locality_groups == 1) {
+ MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Set the latencies. A CPU's lgroup is defined by the lowest
+ * latency found. All other memory is considered remote, and the
+ * remote latency is represented by the highest latency found.
+ * Thus hierarchical lgroups, if any, are approximated by a
+ * two level scheme.
+ *
+ * The Solaris MPO framework by convention wants to see latencies
+ * in units of nano-sec/10. In the MD, the units are defined to be
+ * pico-seconds.
+ */
+
+ lower_latency = mpo_lgroup[0].latency;
+ higher_latency = mpo_lgroup[0].latency;
+
+ for (i = 1; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[i].latency < lower_latency) {
+ lower_latency = mpo_lgroup[i].latency;
+ }
+ if (mpo_lgroup[i].latency > higher_latency) {
+ higher_latency = mpo_lgroup[i].latency;
+ }
+ }
+ lower_latency /= 10000;
+ higher_latency /= 10000;
+
+ /* Clear our CPU data */
+
+ for (i = 0; i < NCPU; i++) {
+ mpo_cpu[i].home = 0;
+ mpo_cpu[i].latency = (uint_t)(-1);
+ }
+
+ /* Build the CPU nodes */
+ for (i = 0; i < n_cpunodes; i++) {
+
+ /* Read in the lgroup nodes */
+
+ result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
+ "fwd", &nodes);
+ if (n_lgroups <= 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Find the lgroup this cpu belongs to with the lowest latency.
+ * Check all the lgrp nodes connected to this CPU to determine
+ * which has the smallest latency.
+ */
+
+ for (j = 0; j < n_lgroups; j++) {
+ for (o = 0; o < n_lgrpnodes; o++) {
+ if (nodes[j] == mpo_lgroup[o].node) {
+ if (mpo_lgroup[o].latency <
+ mpo_cpu[k].latency) {
+ mpo_cpu[k].home =
+ mpo_lgroup[o].addr_match
+ >> home_mask_shift;
+ mpo_cpu[k].latency =
+ mpo_lgroup[o].latency;
+ mpo_lgroup[o].ncpu++;
+ }
+ }
+ }
+ }
+ md_free_scan_dag(md, &nodes);
+ }
+
+ /* Validate that no large pages cross mnode boundaries. */
+ if (valid_pages(md, cpunodes[0]) == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+fail:
+ /* MD cookies are no longer valid; ensure they are not used again. */
+ for (i = 0; i < n_mblocks; i++)
+ mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
+ for (i = 0; i < n_lgrpnodes; i++)
+ mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
+
+ if (n_cpunodes > 0)
+ md_free_scan_dag(md, &cpunodes);
+ if (n_lgrpnodes > 0)
+ md_free_scan_dag(md, &lgrpnodes);
+ if (n_mblocks > 0)
+ md_free_scan_dag(md, &mblocknodes);
+ else
+ panic("lgrp_traverse: No memory blocks found");
+
+ if (ret_val == 0)
+ MPO_STATUS("MPO feature is enabled.\n");
+
+ return (ret_val);
+}
+
+/*
+ * Determine the number of unique mem_lg's present in our system
+ */
+static int
+unique_home_mem_lg_count(uint64_t mem_lg_homeset)
+{
+ int homeid;
+ int count = 0;
+
+ /*
+ * Scan the "home" bits of the mem_lgs, count
+ * the number that are unique.
+ */
+
+ for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
+ if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
+ count++;
+ }
+ }
+
+ MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
+ mem_lg_homeset);
+ MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
+
+ /* Default must be at least one */
+ if (count == 0)
+ count = 1;
+
+ return (count);
+}
+
+/*
+ * Platform specific lgroup initialization
+ */
+void
+plat_lgrp_init(void)
+{
+ md_t *md;
+ int i, rc, ncpu_min;
+
+ /* Get the Machine Descriptor handle */
+
+ md = md_get_handle();
+
+ /* If not, we cannot continue */
+
+ if (md == NULL) {
+ panic("cannot access machine descriptor\n");
+ } else {
+ rc = lgrp_traverse(md);
+ (void) md_fini_handle(md);
+ }
+
+ /*
+ * If we can't process the MD for lgroups then at least let the
+ * system try to boot. Assume we have one lgroup so that
+ * when plat_build_mem_nodes is called, it will attempt to init
+ * an mnode based on the supplied memory segment.
+ */
+
+ if (rc == -1) {
+ home_mask_pfn = 0;
+ max_locality_groups = 1;
+ n_locality_groups = 1;
+ return;
+ }
+
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+
+ /* Use lgroup-aware TSB allocations */
+ tsb_lgrp_affinity = 1;
+
+ /*
+ * lgrp_expand_proc_thresh is the minimum load on the lgroups
+ * this process is currently running on before considering
+ * expanding threads to another lgroup.
+ *
+ * lgrp_expand_proc_diff determines how much less the remote lgroup
+ * must be loaded before expanding to it.
+ *
+ * On sun4v CMT processors, threads share a core pipeline, and
+ * at less than 100% utilization, best throughput is obtained by
+ * spreading threads across more cores, even if some are in a
+ * different lgroup. Spread threads to a new lgroup if the
+ * current group is more than 50% loaded. Because of virtualization,
+ * lgroups may have different numbers of CPUs, but the tunables
+ * apply to all lgroups, so find the smallest lgroup and compute
+ * 50% loading.
+ */
+
+ ncpu_min = NCPU;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ int ncpu = mpo_lgroup[i].ncpu;
+ if (ncpu != 0 && ncpu < ncpu_min)
+ ncpu_min = ncpu;
+ }
+ lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
+
+ /* new home may only be half as loaded as the existing home to use it */
+ lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
+
+ lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
+
+ /* Require that a home lgroup have some memory to be chosen */
+ lgrp_mem_free_thresh = 1;
+
+ /* Standard home-on-next-touch policy */
+ lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
+
+ /* Disable option to choose root lgroup if all leaf lgroups are busy */
+ lgrp_load_thresh = UINT32_MAX;
+}
+
+/*
+ * Helper routine for debugging calls to mem_node_add_slice()
+ */
+static void
+mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
+{
+#if defined(DEBUG) && !defined(lint)
+ static int slice_count = 0;
+
+ slice_count++;
+ MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
+ slice_count, basepfn, endpfn);
+#endif
+ mem_node_add_slice(basepfn, endpfn);
+}
+
+/*
+ * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
+ */
+static void
+mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
+{
+ MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
+ "mnode index: %d\n", plathand, mnode);
+ plat_assign_lgrphand_to_mem_node(plathand, mnode);
+}
+
+/*
+ * plat_build_mem_nodes()
+ *
+ * Define the mem_nodes based on the modified boot memory list,
+ * or based on info read from the MD in plat_lgrp_init().
+ *
+ * When the home mask lies in the middle of the address bits (as it does on
+ * Victoria Falls), then the memory in one mem_node is no longer contiguous;
+ * it is striped across an mblock in a repeating pattern of contiguous memory
+ * followed by a gap. The stripe width is the size of the contiguous piece.
+ * The stride is the distance from the start of one contiguous piece to the
+ * start of the next. The gap is thus stride - stripe_width.
+ *
+ * The stripe of an mnode that falls within an mblock is described by the type
+ * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
+ * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
+ * this array is predetermined. The mem_stripe_t that describes mnode m
+ * within mpo_mblock[i] is stored at
+ * mem_stripes[ m + i * max_locality_groups ]
+ *
+ * max_locality_groups is the total number of possible locality groups,
+ * as defined by the size of the home mask, even if the memory assigned
+ * to the domain is small and does not cover all the lgroups. Thus some
+ * mem_stripe_t's may be empty.
+ *
+ * The members of mem_stripe_t are:
+ * physbase: First valid page in mem_node in the corresponding mblock
+ * physmax: Last valid page in mem_node in mblock
+ * offset: The full stripe width starts at physbase - offset.
+ * Thus if offset is non-zero, this mem_node starts in the middle
+ * of a stripe width, and the second full stripe starts at
+ * physbase - offset + stride. (even though physmax may fall in the
+ * middle of a stripe width, we do not save the ending fragment size
+ * in this data structure.)
+ * exists: Set to 1 if the mblock has memory in this mem_node stripe.
+ *
+ * The stripe width is kept in the global mnode_pages.
+ * The stride is kept in the global mnode_stride.
+ * All the above use pfn's as the unit.
+ *
+ * As an example, the memory layout for a domain with 2 mblocks and 4
+ * mem_nodes 0,1,2,3 could look like this:
+ *
+ * 123012301230 ... 012301230123 ...
+ * mblock 0 mblock 1
+ */
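
A minimal standalone sketch (not part of the patch) of the geometry and indexing described
above, with invented numbers: 4 possible locality groups and a stripe width (mnode_pages)
of 0x8000 pages:

	#include <stdio.h>

	#define MAX_LOCALITY_GROUPS	4		/* from the home mask width */
	#define STRIPE_PAGES		0x8000UL	/* mnode_pages */

	int
	main(void)
	{
		unsigned long stride = MAX_LOCALITY_GROUPS * STRIPE_PAGES;
		unsigned long gap = stride - STRIPE_PAGES;
		int mblock = 1, mnode = 2;
		int index = mnode + mblock * MAX_LOCALITY_GROUPS;

		printf("stride = 0x%lx pages, gap = 0x%lx pages, "
		    "mem_stripes index for mnode %d in mblock %d = %d\n",
		    stride, gap, mnode, mblock, index);
		return (0);
	}

This prints a stride of 0x20000 pages, a gap of 0x18000 pages, and index 6, matching
mem_stripes[ m + i * max_locality_groups ].
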
+
+void
+plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
+{
+ lgrp_handle_t lgrphand, lgrp_start;
+ int i, mnode, elem;
+ uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
+ uint64_t stripe, frag, remove;
+ mem_stripe_t *ms;
+
+ /* Check for non-MPO sun4v platforms */
+
+ if (n_locality_groups <= 1) {
+ mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
+ for (elem = 0; elem < nelems; elem += 2) {
+ base = list[elem];
+ len = list[elem+1];
+
+ mpo_mem_node_add_slice(btop(base),
+ btop(base + len - 1));
+ }
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+ n_mem_stripes = 0;
+ return;
+ }
+
+ /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
+ max_mem_nodes = max_locality_groups;
+ bzero(mem_stripes, sizeof (mem_stripes));
+ stripe = ptob(mnode_pages);
+ stride = max_locality_groups * stripe;
+
+ /* Save commonly used values in globals */
+ mnode_stride = btop(stride);
+ n_mem_stripes = max_locality_groups * n_mblocks;
+ stripe_shift = highbit(max_locality_groups) - 1;
+
+ for (i = 0; i < n_mblocks; i++) {
+
+ base = mpo_mblock[i].base;
+ end = mpo_mblock[i].base + mpo_mblock[i].size;
+ ra_to_pa = mpo_mblock[i].ra_to_pa;
+ mpo_mblock[i].base_pfn = btop(base);
+ mpo_mblock[i].end_pfn = btop(end - 1);
+
+ /* Find the offset from the prev stripe boundary in PA space. */
+ offset = (base + ra_to_pa) & (stripe - 1);
+
+ /* Set the next stripe boundary. */
+ stripe_end = base - offset + stripe;
+
+ lgrp_start = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ lgrphand = lgrp_start;
+
+ /*
+ * Loop over all lgroups covered by the mblock, creating a
+ * stripe for each. Stop when lgrp_start is visited again.
+ */
+ do {
+ /* mblock may not span all lgroups */
+ if (base >= end)
+ break;
+
+ mnode = lgrphand;
+ ASSERT(mnode < max_mem_nodes);
+
+ /*
+ * Calculate the size of the fragment that does not
+ * belong to the mnode in the last partial stride.
+ */
+ frag = (end - (base - offset)) & (stride - 1);
+ if (frag == 0) {
+ /* remove the gap */
+ remove = stride - stripe;
+ } else if (frag < stripe) {
+ /* fragment fits in stripe; keep it all */
+ remove = 0;
+ } else {
+ /* fragment is large; trim after whole stripe */
+ remove = frag - stripe;
+ }
+
+ ms = &mem_stripes[i * max_locality_groups + mnode];
+ ms->physbase = btop(base);
+ ms->physmax = btop(end - 1 - remove);
+ ms->offset = btop(offset);
+ ms->exists = 1;
+
+ mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
+ mpo_mem_node_add_slice(ms->physbase, ms->physmax);
+
+ base = stripe_end;
+ stripe_end += stripe;
+ offset = 0;
+ lgrphand = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ } while (lgrphand != lgrp_start);
+ }
+
+ /*
+ * Indicate to vm_pagelist that the hpm_counters array
+ * should be shared because the ranges overlap.
+ */
+ if (max_mem_nodes > 1) {
+ interleaved_mnodes = 1;
+ }
+}
+
+/*
+ * Return the locality group value for the supplied processor
+ */
+lgrp_handle_t
+plat_lgrp_cpu_to_hand(processorid_t id)
+{
+ if (n_locality_groups > 1) {
+ return ((lgrp_handle_t)mpo_cpu[(int)id].home);
+ } else {
+ return ((lgrp_handle_t)0); /* Default */
+ }
+}
+
+int
+plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
+{
+ /*
+ * Return min remote latency when there are more than two lgroups
+ * (root and child) and getting latency between two different lgroups
+ * or root is involved.
+ */
+ if (lgrp_optimizations() && (from != to ||
+ from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
+ return ((int)higher_latency);
+ } else {
+ return ((int)lower_latency);
+ }
+}
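
A brief worked example of the latency handling (values invented): if the MD reports
120000 ps for a CPU's local lgroup and 420000 ps for the remote one, lgrp_traverse()
leaves lower_latency = 12 and higher_latency = 42 after the divide by 10000 (120 ns and
420 ns in the ns/10 units the framework expects), so with lgrp_optimizations() enabled
plat_lgrp_latency() returns 12 when a leaf lgroup is asked about itself and 42 for any
cross-lgroup or root query.
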
+
+int
+plat_pfn_to_mem_node(pfn_t pfn)
+{
+ int i, mnode;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ if (n_locality_groups <= 1)
+ return (0);
+
+ /*
+ * The mnode is defined to be 1:1 with the lgroup handle, which
+	 * is taken from the home bits.  Find the mblock in which
+ * the pfn falls to get the ra_to_pa adjustment, and extract
+ * the home bits.
+ */
+ mb = &mpo_mblock[0];
+ for (i = 0; i < n_mblocks; i++) {
+ if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
+ home_mask_pfn_shift);
+ ASSERT(mnode < max_mem_nodes);
+ return (mnode);
+ }
+ mb++;
+ }
+
+ panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_rapfn_to_papfn
+ *
+ * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
+ * and home mask bits are correct. The upper bits do not necessarily
+ * match the actual PA, however.
+ */
+pfn_t
+plat_rapfn_to_papfn(pfn_t pfn)
+{
+ int i;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ ASSERT(n_mblocks > 0);
+ if (n_mblocks == 1)
+ return (pfn + base_ra_to_pa_pfn);
+
+ /*
+ * Find the mblock in which the pfn falls
+ * in order to get the ra_to_pa adjustment.
+ */
+ for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
+ if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ return (pfn + ra_to_pa_pfn);
+ }
+ }
+
+ panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_mem_node_iterator_init()
+ * Initialize cookie to iterate over pfn's in an mnode. There is
+ * no additional iterator function. The caller uses the info from
+ * the iterator structure directly.
+ *
+ * pfn: starting pfn.
+ * mnode: desired mnode.
+ * init: set to 1 for full init, 0 for continuation
+ *
+ * Returns the appropriate starting pfn for the iteration,
+ *	which is the same as the input pfn if it falls in an mblock.
+ * Returns the (pfn_t)-1 value if the input pfn lies past
+ * the last valid mnode pfn.
+ */
+pfn_t
+plat_mem_node_iterator_init(pfn_t pfn, int mnode,
+ mem_node_iterator_t *it, int init)
+{
+ int i;
+ struct mblock_md *mblock;
+ pfn_t base, end;
+
+ ASSERT(it != NULL);
+ ASSERT(mnode >= 0 && mnode < max_mem_nodes);
+ ASSERT(n_mblocks > 0);
+
+ if (init) {
+ it->mi_last_mblock = 0;
+ it->mi_init = 1;
+ }
+
+ /* Check if mpo is not enabled and we only have one mblock */
+ if (n_locality_groups == 1 && n_mblocks == 1) {
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = base_ra_to_pa_pfn;
+ it->mi_mnode_pfn_mask = 0;
+ it->mi_mnode_pfn_shift = 0;
+ it->mi_mnode_mask = 0;
+ it->mi_mblock_base = mem_node_config[mnode].physbase;
+ it->mi_mblock_end = mem_node_config[mnode].physmax;
+ if (pfn < it->mi_mblock_base)
+ pfn = it->mi_mblock_base;
+ else if (pfn > it->mi_mblock_end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+ }
+
+ /*
+ * Find mblock that contains pfn, or first mblock after pfn,
+ * else pfn is out of bounds, so use the last mblock.
+ * mblocks are sorted in ascending address order.
+ */
+ ASSERT(it->mi_last_mblock < n_mblocks);
+ ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
+ i = init ? 0 : it->mi_last_mblock + 1;
+ if (i == n_mblocks)
+ return ((pfn_t)-1);
+
+ for (; i < n_mblocks; i++) {
+ if (pfn <= mpo_mblock[i].end_pfn)
+ break;
+ }
+ if (i == n_mblocks) {
+ it->mi_last_mblock = i - 1;
+ return ((pfn_t)-1);
+ }
+ it->mi_last_mblock = i;
+
+ /*
+ * Memory stripes are defined if there is more than one locality
+ * group, so use the stripe bounds. Otherwise use mblock bounds.
+ */
+ mblock = &mpo_mblock[i];
+ if (n_mem_stripes > 0) {
+ mem_stripe_t *ms =
+ &mem_stripes[i * max_locality_groups + mnode];
+ base = ms->physbase;
+ end = ms->physmax;
+ } else {
+ ASSERT(mnode == 0);
+ base = mblock->base_pfn;
+ end = mblock->end_pfn;
+ }
+
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = btop(mblock->ra_to_pa);
+ it->mi_mblock_base = base;
+ it->mi_mblock_end = end;
+ it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
+ it->mi_mnode_pfn_shift = home_mask_pfn_shift;
+ it->mi_mnode_mask = max_locality_groups - 1;
+ if (pfn < base)
+ pfn = base;
+ else if (pfn > end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+}
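
A hedged sketch of how a caller uses the cookie (illustrative only; page_next_pfn_for_color_cpu()
earlier in this patch is the real consumer, and the helper name below is hypothetical):

	mem_node_iterator_t it;
	pfn_t pfn;

	/* full init: clamp start_pfn into the mnode's first usable stripe */
	pfn = plat_mem_node_iterator_init(start_pfn, mnode, &it, 1);
	if (pfn != (pfn_t)-1) {
		/*
		 * pfn .. it.mi_mblock_end is the usable RA range in this
		 * mblock; adding it.mi_ra_to_pa yields PA-accurate coloring
		 * bits, and it.mi_mnode_pfn_mask/shift recover the home bits.
		 */
		process_range(pfn, it.mi_mblock_end);	/* hypothetical */
	}
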
+
+/*
+ * plat_mem_node_intersect_range()
+ *
+ * Find the intersection between a memnode and a range of pfn's.
+ */
+void
+plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
+ int mnode, pgcnt_t *npages_out)
+{
+ pfn_t offset, len, hole, base, end, test_end, frag;
+ pfn_t nearest;
+ mem_stripe_t *ms;
+ int i, npages;
+
+ *npages_out = 0;
+
+ if (!mem_node_config[mnode].exists || test_len == 0)
+ return;
+
+ base = mem_node_config[mnode].physbase;
+ end = mem_node_config[mnode].physmax;
+
+ test_end = test_base + test_len - 1;
+ if (end < test_base || base > test_end)
+ return;
+
+ if (n_locality_groups == 1) {
+ *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
+ return;
+ }
+
+ hole = mnode_stride - mnode_pages;
+ npages = 0;
+
+ /*
+ * Iterate over all the stripes for this mnode (one per mblock),
+ * find the intersection with each, and accumulate the intersections.
+ *
+	 * Determining the intersection with a stripe is tricky.  If base or end
+ * fall outside the mem_node bounds, round them to physbase/physmax of
+ * mem_node. If base or end fall in a gap, round them to start of
+ * nearest stripe. If they fall within a stripe, keep base or end,
+ * but calculate the fragment size that should be excluded from the
+ * stripe. Calculate how many strides fall in the adjusted range,
+ * multiply by stripe width, and add the start and end fragments.
+ */
+
+ for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
+ ms = &mem_stripes[i];
+ if (ms->exists &&
+ test_base <= (end = ms->physmax) &&
+ test_end >= (base = ms->physbase)) {
+
+ offset = ms->offset;
+
+ if (test_base > base) {
+ /* Round test_base to next multiple of stride */
+ len = P2ROUNDUP(test_base - (base - offset),
+ mnode_stride);
+ nearest = base - offset + len;
+ /*
+ * Compute distance from test_base to the
+ * stride boundary to see if test_base falls
+ * in the stripe or in the hole.
+ */
+ if (nearest - test_base > hole) {
+ /*
+ * test_base lies in stripe,
+ * and offset should be excluded.
+ */
+ offset = test_base -
+ (nearest - mnode_stride);
+ base = test_base;
+ } else {
+ /* round up to next stripe start */
+ offset = 0;
+ base = nearest;
+ if (base > end)
+ continue;
+ }
+
+ }
+
+ if (test_end < end)
+ end = test_end;
+ end++; /* adjust to an exclusive bound */
+
+ /* Round end to next multiple of stride */
+ len = P2ROUNDUP(end - (base - offset), mnode_stride);
+ nearest = (base - offset) + len;
+ if (nearest - end <= hole) {
+ /* end falls in hole, use entire last stripe */
+ frag = 0;
+ } else {
+ /* end falls in stripe, compute fragment */
+ frag = nearest - hole - end;
+ }
+
+ len = (len >> stripe_shift) - offset - frag;
+ npages += len;
+ }
+ }
+
+ *npages_out = npages;
+}
+
+/*
+ * valid_pages()
+ *
+ * Return 1 if pages are valid and do not cross mnode boundaries
+ * (which would break page free list assumptions), and 0 otherwise.
+ */
+
+#define MNODE(pa) \
+ ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
+
+static int
+valid_pages(md_t *md, mde_cookie_t cpu0)
+{
+ int i, max_szc;
+ uint64_t last_page_base, szc_mask;
+ uint64_t max_page_len, max_coalesce_len;
+ struct mblock_md *mb = mpo_mblock;
+
+ /*
+ * Find the smaller of the largest page possible and supported.
+ * mmu_exported_pagesize_mask is not yet initialized, so read
+ * it from the MD. Apply minimal fixups in case of broken MDs
+ * to get a sane mask.
+ */
+
+ if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
+ szc_mask = 0;
+ szc_mask |= (1 << TTE4M); /* largest in sun4v default support */
+ max_szc = highbit(szc_mask) - 1;
+ if (max_szc > TTE256M)
+ max_szc = TTE256M;
+ max_page_len = TTEBYTES(max_szc);
+
+ /*
+ * Page coalescing code coalesces all sizes up to 256M on sun4v, even
+ * if mmu-page-size-list does not contain it, so 256M pages must fall
+ * within one mnode to use MPO.
+ */
+ max_coalesce_len = TTEBYTES(TTE256M);
+ ASSERT(max_coalesce_len >= max_page_len);
+
+ if (ptob(mnode_pages) < max_coalesce_len) {
+ MPO_STATUS("Page too large; MPO disabled: page = %lx, "
+ "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
+ return (0);
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ uint64_t base = mb->base;
+ uint64_t end = mb->base + mb->size - 1;
+ uint64_t ra_to_pa = mb->ra_to_pa;
+
+ /*
+ * If mblock is smaller than the max page size, then
+ * RA = PA mod MAXPAGE is not guaranteed, but it must
+ * not span mnodes.
+ */
+ if (mb->size < max_page_len) {
+ if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
+ MPO_STATUS("Small mblock spans mnodes; "
+ "MPO disabled: base = %lx, end = %lx, "
+ "ra2pa = %lx\n", base, end, ra_to_pa);
+ return (0);
+ }
+ } else {
+ /* Verify RA = PA mod MAXPAGE, using coalesce size */
+ uint64_t pa_base = base + ra_to_pa;
+ if ((base & (max_coalesce_len - 1)) !=
+ (pa_base & (max_coalesce_len - 1))) {
+ MPO_STATUS("bad page alignment; MPO disabled: "
+ "ra = %lx, pa = %lx, pagelen = %lx\n",
+ base, pa_base, max_coalesce_len);
+ return (0);
+ }
+ }
+
+ /*
+ * Find start of last large page in mblock in RA space.
+ * If page extends into the next mblock, verify the
+ * mnode does not change.
+ */
+ last_page_base = P2ALIGN(end, max_coalesce_len);
+ if (i + 1 < n_mblocks &&
+ last_page_base + max_coalesce_len > mb[1].base &&
+ MNODE(last_page_base + ra_to_pa) !=
+ MNODE(mb[1].base + mb[1].ra_to_pa)) {
+ MPO_STATUS("Large page spans mblocks; MPO disabled: "
+ "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
+ "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
+ mb[1].ra_to_pa, max_coalesce_len);
+ return (0);
+ }
+
+ mb++;
+ }
+ return (1);
+}
+
+
+/*
+ * fix_interleave() - Find lgroups with sub-page sized memory interleave,
+ * if any, and remove them. This yields a config where the "coarse
+ * grained" lgroups cover all of memory, even though part of that memory
+ * is fine grain interleaved and does not deliver a purely local memory
+ * latency.
+ *
+ * This function reads and modifies the globals:
+ * mpo_lgroup[], n_lgrpnodes
+ *
+ * Returns 1 if lgroup nodes were removed, 0 otherwise.
+ */
+
+static int
+fix_interleave(void)
+{
+ int i, j;
+ uint64_t mask = 0;
+
+ j = 0;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
+ /* remove this lgroup */
+ mask = mpo_lgroup[i].addr_mask;
+ } else {
+ mpo_lgroup[j++] = mpo_lgroup[i];
+ }
+ }
+ n_lgrpnodes = j;
+
+ if (mask != 0)
+ MPO_STATUS("sub-page interleave %lx found; "
+ "removing lgroup.\n", mask);
+
+ return (mask != 0);
+}
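
The sub-page test above reduces to checking whether any home-mask bits fall below the
base page size; a small standalone illustration (not part of the patch; sun4v 8K base
pages and invented masks assumed):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGESIZE	8192UL		/* sun4v base page */
	#define PAGEOFFSET	(PAGESIZE - 1)

	int
	main(void)
	{
		uint64_t fine_mask   = 0x0000000400ULL;	/* ~1 KB interleave: sub-page */
		uint64_t coarse_mask = 0x0100000000ULL;	/* ~4 GB interleave: ok */

		printf("fine mask sub-page?   %d\n", (fine_mask & PAGEOFFSET) != 0);
		printf("coarse mask sub-page? %d\n", (coarse_mask & PAGEOFFSET) != 0);
		return (0);
	}
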
diff --git a/usr/src/uts/sun4v/sys/cpu_module.h b/usr/src/uts/sun4v/sys/cpu_module.h
index 0786951416..2d7c909a0f 100644
--- a/usr/src/uts/sun4v/sys/cpu_module.h
+++ b/usr/src/uts/sun4v/sys/cpu_module.h
@@ -146,11 +146,12 @@ extern void bzero(void *addr, size_t count);
int cpu_trapstat_conf(int cmd);
void cpu_trapstat_data(void *buf, uint_t pgszs);
-
+/* Used by the fill_cpu() function */
#define NO_MAPPING_FOUND 0xffffffff
#define NO_EU_MAPPING_FOUND NO_MAPPING_FOUND
#define NO_CHIP_MAPPING_FOUND NO_MAPPING_FOUND
#define NO_CORE_MAPPING_FOUND NO_MAPPING_FOUND
+#define NO_L2_CACHE_MAPPING_FOUND NO_MAPPING_FOUND
/*
* Default MMU pagesize mask for sun4v architecture.
*/
diff --git a/usr/src/uts/sun4v/sys/machcpuvar.h b/usr/src/uts/sun4v/sys/machcpuvar.h
index 921d6c584d..2113747d55 100644
--- a/usr/src/uts/sun4v/sys/machcpuvar.h
+++ b/usr/src/uts/sun4v/sys/machcpuvar.h
@@ -104,6 +104,7 @@ typedef struct ptl1_state {
*/
#define CPU_CHIPID_INVALID -1
#define CPU_COREID_INVALID -1
+#define CPU_L2_CACHEID_INVALID -1
/*
* Machine specific fields of the cpu struct
@@ -177,6 +178,7 @@ struct machcpu {
uint16_t *cpu_list; /* uint16_t [NCPU] */
uint64_t cpu_list_ra; /* cpu list ra */
id_t cpu_ipipe; /* cpu int exec unit id */
+ id_t cpu_mpipe; /* cpu memory pipe id */
id_t cpu_fpu; /* cpu fpu unit id */
id_t cpu_core; /* cpu core id */
id_t cpu_chip; /* cpu chip id */
diff --git a/usr/src/uts/sun4v/sys/mpo.h b/usr/src/uts/sun4v/sys/mpo.h
new file mode 100644
index 0000000000..e390b5e483
--- /dev/null
+++ b/usr/src/uts/sun4v/sys/mpo.h
@@ -0,0 +1,112 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MPO_H
+#define _SYS_MPO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * mpo.h - Sun4v MPO common header file
+ *
+ */
+#define MPO_MAX_MBLOCKS 16
+#define MAX_MEM_STRIPES (MAX_MEM_NODES * MPO_MAX_MBLOCKS)
+
+#define PROP_LG_CPU_ID "id"
+#define PROP_LG_MASK "address-mask"
+#define PROP_LG_LATENCY "latency"
+#define PROP_LG_MATCH "address-match"
+#define PROP_LG_MEM_LG "memory-latency-group"
+#define PROP_LG_CPU "cpu"
+#define PROP_LG_MBLOCK "mblock"
+#define PROP_LG_BASE "base"
+#define PROP_LG_SIZE "size"
+#define PROP_LG_RA_PA_OFFSET "address-congruence-offset"
+
+/* Macro to set the corresponding bit if a mem-lg homeid is a member */
+#define HOMESET_ADD(homeset, home)\
+ homeset |= ((int)1 << (home))
+
+/* Macro to check if a mem_lg homeid is a member of the homeset */
+#define MEM_LG_ISMEMBER(homeset, home)\
+ ((homeset) & ((uint64_t)1 << (home)))
+
+/* Structure to store CPU information from the MD */
+
+struct cpu_md {
+ uint_t home;
+ uint64_t latency;
+};
+
+/* Structure to store mem-lg information from the MD */
+
+struct lgrp_md {
+ uint64_t id;
+ uint64_t addr_mask;
+ uint64_t addr_match;
+ uint64_t latency;
+ mde_cookie_t node;
+ int ncpu;
+};
+
+/* Structure to store mblock information retrieved from the MD */
+
+struct mblock_md {
+ uint64_t base;
+ uint64_t size;
+ uint64_t ra_to_pa;
+ mde_cookie_t node;
+ pfn_t base_pfn;
+ pfn_t end_pfn;
+};
+
+/* Structure for memnode information for use by plat_pfn_to_mem_node */
+
+struct mnode_info {
+ pfn_t base_pfn;
+ pfn_t end_pfn;
+};
+
+/* A stripe defines the portion of a mem_node that falls in one mblock */
+typedef struct {
+ pfn_t physbase; /* first page in mnode in the corresponding mblock */
+ pfn_t physmax; /* last valid page in mnode in mblock */
+ pfn_t offset; /* stripe starts at physbase - offset */
+ int exists; /* set to 1 if mblock has memory in this mnode stripe */
+} mem_stripe_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MPO_H */