author     dp78419 <none@none>  2007-07-31 16:27:12 -0700
committer  dp78419 <none@none>  2007-07-31 16:27:12 -0700
commit     ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2 (patch)
tree       772441639680866ab4a841bbef119c6a813e6c09
parent     79777a7dd0179283917bda2ba98999c382d31c2c (diff)
download   illumos-joyent-ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2.tar.gz
PSARC 2006/675 MPO for Victoria Falls/Maramba project
6539930 MPO for sun4v platforms
-rw-r--r--  usr/src/uts/common/os/mem_cage.c             24
-rw-r--r--  usr/src/uts/common/sys/lgrp.h                 9
-rw-r--r--  usr/src/uts/common/sys/pghw.h                 7
-rw-r--r--  usr/src/uts/common/vm/vm_pagelist.c         345
-rw-r--r--  usr/src/uts/i86pc/os/memnode.c               13
-rw-r--r--  usr/src/uts/i86pc/os/mp_machdep.c            27
-rw-r--r--  usr/src/uts/i86pc/sys/memnode.h              11
-rw-r--r--  usr/src/uts/i86pc/vm/vm_dep.h                40
-rw-r--r--  usr/src/uts/sun4/os/memnode.c                70
-rw-r--r--  usr/src/uts/sun4/sys/memnode.h               17
-rw-r--r--  usr/src/uts/sun4/vm/vm_dep.h                153
-rw-r--r--  usr/src/uts/sun4u/os/cmp.c                   27
-rw-r--r--  usr/src/uts/sun4v/Makefile.files              1
-rw-r--r--  usr/src/uts/sun4v/Makefile.sun4v.shared       1
-rw-r--r--  usr/src/uts/sun4v/cpu/generic.c               5
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara.c               3
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara2.c            284
-rw-r--r--  usr/src/uts/sun4v/os/cmp.c                   37
-rw-r--r--  usr/src/uts/sun4v/os/mpo.c                 1264
-rw-r--r--  usr/src/uts/sun4v/sys/cpu_module.h            3
-rw-r--r--  usr/src/uts/sun4v/sys/machcpuvar.h            2
-rw-r--r--  usr/src/uts/sun4v/sys/mpo.h                 112
22 files changed, 2197 insertions, 258 deletions
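The new file usr/src/uts/sun4v/os/mpo.c (added near the end of the patch below) documents the core of this change: a real address ra belongs to a latency group when (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match, where ra_to_pa is the mblock's address-congruence-offset taken from the Machine Description. The following is a minimal standalone sketch of that membership test, not code from the patch; the struct and function names are illustrative only.

```c
/*
 * Minimal sketch of the lgroup membership test described in the
 * mpo.c block comment in the patch below.  Identifiers here are
 * illustrative, not the names used by the actual patch.
 */
#include <stdint.h>

struct md_mblock {
	uint64_t ra_base;	/* first real address in the mblock */
	uint64_t size;		/* span of the mblock in bytes */
	uint64_t ra_to_pa;	/* address-congruence-offset from the MD */
};

struct md_lgroup {
	uint64_t mask;		/* home bits; the same mask for every lgroup */
	uint64_t match;		/* home-bit value that selects this lgroup */
};

/* Return 1 if real address ra, covered by mblock mb, belongs to lgroup lg. */
static int
ra_in_lgroup(uint64_t ra, const struct md_mblock *mb,
    const struct md_lgroup *lg)
{
	uint64_t pa = ra + mb->ra_to_pa;	/* translate RA to PA */

	return ((pa & lg->mask) == lg->match);
}
```

The per-pfn lookup described for plat_pfn_to_mem_node() follows the same pattern: find the mblock that covers the pfn, apply that mblock's ra_to_pa adjustment, and extract the home bits to obtain the mem_node.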
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c index beb2fe3cbe..820fe555dd 100644 --- a/usr/src/uts/common/os/mem_cage.c +++ b/usr/src/uts/common/os/mem_cage.c @@ -335,7 +335,7 @@ kcage_next_range(int incage, pfn_t lo, pfn_t hi, rw_enter(&kcage_range_rwlock, RW_READER); for (lp = incage ? kcage_glist : kcage_current_glist; - lp != NULL; lp = lp->next) { + lp != NULL; lp = lp->next) { pfn_t klo, khi; @@ -886,7 +886,7 @@ kcage_recalc_preferred_size(pgcnt_t preferred_size) segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; } kcage_kmemlp_mincage = MIN(lpmincage, - (segkmem_kmemlp_max / PAGESIZE)); + (segkmem_kmemlp_max / PAGESIZE)); preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); } return (preferred_size); @@ -1006,11 +1006,7 @@ kcage_init(pgcnt_t preferred_size) */ if (SEGKMEM_USE_LARGEPAGES) { extern void page_freelist_coalesce_all(int mnode); - extern int max_mem_nodes; - int mnode, max_mnodes = max_mem_nodes; - for (mnode = 0; mnode < max_mnodes; mnode++) { - page_freelist_coalesce_all(mnode); - } + page_freelist_coalesce_all(-1); /* do all mnodes */ } ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", @@ -1288,7 +1284,7 @@ kcage_freemem_add(pgcnt_t npages) wakeup_pcgs(); /* wakeup threads in pcgs() */ if (kcage_needfree != 0 && - kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { + kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { mutex_enter(&kcage_throttle_mutex); cv_broadcast(&kcage_throttle_cv); @@ -1467,7 +1463,7 @@ kcage_expand() * have enough free pages to page_relocate() even a single page. */ wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) - - kcage_freemem; + - kcage_freemem; if (wanted <= 0) return (0); else if (freemem < pageout_reserve + 1) { @@ -1683,7 +1679,7 @@ kcage_cageout() #endif CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, - callb_generic_cpr, "cageout"); + callb_generic_cpr, "cageout"); mutex_enter(&kcage_cageout_mutex); kcage_cageout_thread = curthread; @@ -1724,7 +1720,7 @@ again: pages_skipped = 0; shared_skipped = 0; while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && - (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { + (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { if (start_pfn == PFN_INVALID) start_pfn = pfn; @@ -1820,7 +1816,7 @@ again: * In pass {0, 1, 2}, skip page if mod bit is set. 
*/ prm = hat_pagesync(pp, - HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); + HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); /* On first pass ignore ref'd pages */ if (pass <= 1 && (prm & P_REF)) { @@ -1833,7 +1829,7 @@ again: /* On pass 2, page_destroy if mod bit is not set */ if (pass <= 2) { if (pp->p_szc != 0 || (prm & P_MOD) || - pp->p_lckcnt || pp->p_cowcnt) { + pp->p_lckcnt || pp->p_cowcnt) { pages_skipped = 1; page_unlock(pp); } else { @@ -1843,7 +1839,7 @@ again: * checking if mod bit is set */ (void) hat_pageunload(pp, - HAT_FORCE_PGUNLOAD); + HAT_FORCE_PGUNLOAD); /* * skip this page if modified diff --git a/usr/src/uts/common/sys/lgrp.h b/usr/src/uts/common/sys/lgrp.h index c0ed75d981..48ad8e8757 100644 --- a/usr/src/uts/common/sys/lgrp.h +++ b/usr/src/uts/common/sys/lgrp.h @@ -598,6 +598,15 @@ int lgrp_plat_latency(lgrp_handle_t, lgrp_handle_t); lgrp_handle_t lgrp_plat_root_hand(void); void lgrp_plat_probe(void); +extern uint32_t lgrp_expand_proc_thresh; +extern uint32_t lgrp_expand_proc_diff; +extern pgcnt_t lgrp_mem_free_thresh; +extern uint32_t lgrp_loadavg_tolerance; +extern uint32_t lgrp_loadavg_max_effect; +extern uint32_t lgrp_load_thresh; +extern lgrp_mem_policy_t lgrp_mem_policy_root; +extern int tsb_lgrp_affinity; + #endif /* _KERNEL && _KMEMUSER */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h index e78be92032..f22afc021b 100644 --- a/usr/src/uts/common/sys/pghw.h +++ b/usr/src/uts/common/sys/pghw.h @@ -52,17 +52,12 @@ typedef enum pghw_type { PGHW_CACHE, PGHW_FPU, PGHW_MPIPE, + PGHW_CHIP, PGHW_MEMORY, PGHW_NUM_COMPONENTS } pghw_type_t; /* - * Consider the physical processor sharing relationship - * equivalant to a shared pipe to memory. - */ -#define PGHW_CHIP PGHW_MPIPE - -/* * Anonymous instance id */ #define PGHW_INSTANCE_ANON ((id_t)0xdecafbad) diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c index cef95452bf..d45b8cd0fe 100644 --- a/usr/src/uts/common/vm/vm_pagelist.c +++ b/usr/src/uts/common/vm/vm_pagelist.c @@ -497,21 +497,37 @@ page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); color &= ceq_mask; - ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc); + ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); return (color | (ncolor & ~ceq_mask)); } /* + * The interleaved_mnodes flag is set when mnodes overlap in + * the physbase..physmax range, but have disjoint slices. + * In this case hpm_counters is shared by all mnodes. + * This flag is set dynamically by the platform. + */ +int interleaved_mnodes = 0; + +/* * Called by startup(). * Size up the per page size free list counters based on physmax * of each node and max_mem_nodes. + * + * If interleaved_mnodes is set we need to find the first mnode that + * exists. hpm_counters for the first mnode will then be shared by + * all other mnodes. If interleaved_mnodes is not set, just set + * first=mnode each time. That means there will be no sharing. 
*/ size_t page_ctrs_sz(void) { int r; /* region size */ int mnode; + int firstmn; /* first mnode that exists */ int nranges; + pfn_t physbase; + pfn_t physmax; uint_t ctrs_sz = 0; int i; pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; @@ -525,7 +541,7 @@ page_ctrs_sz(void) colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); } - for (mnode = 0; mnode < max_mem_nodes; mnode++) { + for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { pgcnt_t r_pgcnt; pfn_t r_base; @@ -534,6 +550,7 @@ page_ctrs_sz(void) if (mem_node_config[mnode].exists == 0) continue; + HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); nranges = MNODE_RANGE_CNT(mnode); mnode_nranges[mnode] = nranges; mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); @@ -543,22 +560,25 @@ page_ctrs_sz(void) * base aligned to large page size. */ for (r = 1; r < mmu_page_sizes; r++) { + /* add in space for hpm_color_current */ + ctrs_sz += sizeof (size_t) * + colors_per_szc[r] * nranges; + + if (firstmn != mnode) + continue; + /* add in space for hpm_counters */ r_align = page_get_pagecnt(r); - r_base = mem_node_config[mnode].physbase; + r_base = physbase; r_base &= ~(r_align - 1); - r_pgcnt = howmany(mem_node_config[mnode].physmax - - r_base + 1, r_align); + r_pgcnt = howmany(physmax - r_base + 1, r_align); + /* * Round up to always allocate on pointer sized * boundaries. */ ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), sizeof (hpmctr_t *)); - - /* add in space for hpm_color_current */ - ctrs_sz += sizeof (size_t) * - colors_per_szc[r] * nranges; } } @@ -605,6 +625,9 @@ page_ctrs_alloc(caddr_t alloc_base) int mrange, nranges; int r; /* region size */ int i; + int firstmn; /* first mnode that exists */ + pfn_t physbase; + pfn_t physmax; pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; /* @@ -660,7 +683,7 @@ page_ctrs_alloc(caddr_t alloc_base) /* initialize page list counts */ PLCNT_INIT(alloc_base); - for (mnode = 0; mnode < max_mem_nodes; mnode++) { + for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { pgcnt_t r_pgcnt; pfn_t r_base; @@ -671,6 +694,8 @@ page_ctrs_alloc(caddr_t alloc_base) if (mem_node_config[mnode].exists == 0) continue; + HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); + for (r = 1; r < mmu_page_sizes; r++) { /* * the page_counters base has to be aligned to the @@ -678,11 +703,10 @@ page_ctrs_alloc(caddr_t alloc_base) * will cross large page boundaries. */ r_align = page_get_pagecnt(r); - r_base = mem_node_config[mnode].physbase; + r_base = physbase; /* base needs to be aligned - lower to aligned value */ r_base &= ~(r_align - 1); - r_pgcnt = howmany(mem_node_config[mnode].physmax - - r_base + 1, r_align); + r_pgcnt = howmany(physmax - r_base + 1, r_align); r_shift = PAGE_BSZS_SHIFT(r); PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; @@ -699,9 +723,12 @@ page_ctrs_alloc(caddr_t alloc_base) pfn_t pfnum = r_base; size_t idx; int mrange; + MEM_NODE_ITERATOR_DECL(it); + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); + ASSERT(pfnum != (pfn_t)-1); PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, - color_mask, color_mask); + color_mask, color_mask, &it); idx = PNUM_TO_IDX(mnode, r, pfnum); idx = (idx >= r_pgcnt) ? 0 : idx; for (mrange = 0; mrange < nranges; mrange++) { @@ -709,14 +736,18 @@ page_ctrs_alloc(caddr_t alloc_base) r, i, mrange) = idx; } } - PAGE_COUNTERS_COUNTERS(mnode, r) = - (hpmctr_t *)alloc_base; - /* - * Round up to make alloc_base always be aligned on - * a pointer boundary. 
- */ - alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), - sizeof (hpmctr_t *)); + + /* hpm_counters may be shared by all mnodes */ + if (firstmn == mnode) { + PAGE_COUNTERS_COUNTERS(mnode, r) = + (hpmctr_t *)alloc_base; + alloc_base += + P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), + sizeof (hpmctr_t *)); + } else { + PAGE_COUNTERS_COUNTERS(mnode, r) = + PAGE_COUNTERS_COUNTERS(firstmn, r); + } /* * Verify that PNUM_TO_IDX and IDX_TO_PNUM @@ -735,7 +766,7 @@ page_ctrs_alloc(caddr_t alloc_base) * page_ctrs_sz() has added some slop for these roundups. */ alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, - L2CACHE_ALIGN); + L2CACHE_ALIGN); } /* Initialize other page counter specific data structures. */ @@ -894,6 +925,7 @@ page_ctrs_adjust(int mnode) size_t pcsz, old_csz; hpmctr_t *new_ctr, *old_ctr; pfn_t oldbase, newbase; + pfn_t physbase, physmax; size_t old_npgs; hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; size_t size_cache[MMU_PAGE_SIZES]; @@ -908,15 +940,17 @@ page_ctrs_adjust(int mnode) int old_maxmrange, new_maxmrange; int rc = 0; - newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; - npgs = roundup(mem_node_config[mnode].physmax, - PC_BASE_ALIGN) - newbase; - cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES, KM_NOSLEEP); if (cands_cache == NULL) return (ENOMEM); + i = -1; + HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); + + newbase = physbase & ~PC_BASE_ALIGN_MASK; + npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; + /* prepare to free non-null pointers on the way out */ cands_cache_nranges = nranges; bzero(ctr_cache, sizeof (ctr_cache)); @@ -997,8 +1031,7 @@ page_ctrs_adjust(int mnode) * Grab the write lock to prevent others from walking these arrays * while we are modifying them. */ - rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); - page_freelist_lock(mnode); + PAGE_CTRS_WRITE_LOCK(mnode); old_nranges = mnode_nranges[mnode]; cands_cache_nranges = old_nranges; @@ -1016,7 +1049,7 @@ page_ctrs_adjust(int mnode) for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { old_color_array[mrange] = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, - r, mrange); + r, mrange); } pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); @@ -1048,6 +1081,21 @@ page_ctrs_adjust(int mnode) PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; PAGE_COUNTERS_BASE(mnode, r) = newbase; + + /* update shared hpm_counters in other mnodes */ + if (interleaved_mnodes) { + for (i = 0; i < max_mem_nodes; i++) { + if (i == mnode) + continue; + if (mem_node_config[i].exists == 0) + continue; + ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr); + PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; + PAGE_COUNTERS_ENTRIES(i, r) = pcsz; + PAGE_COUNTERS_BASE(i, r) = newbase; + } + } + for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = color_cache[r][mrange]; @@ -1059,16 +1107,27 @@ page_ctrs_adjust(int mnode) */ for (i = 0; i < colors_per_szc[r]; i++) { uint_t color_mask = colors_per_szc[r] - 1; + int mlo = interleaved_mnodes ? 0 : mnode; + int mhi = interleaved_mnodes ? max_mem_nodes : + (mnode + 1); + int m; pfn_t pfnum = newbase; size_t idx; + MEM_NODE_ITERATOR_DECL(it); - PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask, - color_mask); - idx = PNUM_TO_IDX(mnode, r, pfnum); - idx = (idx < pcsz) ? 
idx : 0; - for (mrange = 0; mrange < nranges; mrange++) { - PAGE_COUNTERS_CURRENT_COLOR(mnode, - r, i, mrange) = idx; + for (m = mlo; m < mhi; m++) { + if (mem_node_config[m].exists == 0) + continue; + MEM_NODE_ITERATOR_INIT(pfnum, m, &it); + ASSERT(pfnum != (pfn_t)-1); + PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask, + color_mask, &it); + idx = PNUM_TO_IDX(m, r, pfnum); + idx = (idx < pcsz) ? idx : 0; + for (mrange = 0; mrange < nranges; mrange++) { + PAGE_COUNTERS_CURRENT_COLOR(m, + r, i, mrange) = idx; + } } } @@ -1129,8 +1188,7 @@ page_ctrs_adjust(int mnode) } } } - page_freelist_unlock(mnode); - rw_exit(&page_ctrs_rwlock[mnode]); + PAGE_CTRS_WRITE_UNLOCK(mnode); /* * Now that we have dropped the write lock, it is safe to free all @@ -2130,6 +2188,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, size_t len, idx, idx0; pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); page_t *ret_pp; + MEM_NODE_ITERATOR_DECL(it); #if defined(__sparc) pfn_t pfnum0, nlo, nhi; #endif @@ -2169,11 +2228,15 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, /* round to szcpgcnt boundaries */ lo = P2ROUNDUP(lo, szcpgcnt); + MEM_NODE_ITERATOR_INIT(lo, mnode, &it); + ASSERT(lo != (pfn_t)-1); hi = hi & ~(szcpgcnt - 1); /* set lo to the closest pfn of the right color */ - if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) { - PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask); + if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || + (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { + PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, + &it); } if (hi <= lo) { @@ -2208,11 +2271,22 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, pfnum = IDX_TO_PNUM(mnode, r, idx0); if (pfnum < lo || pfnum >= hi) { pfnum = lo; - } else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) { - /* pfnum has invalid color get the closest correct pfn */ - PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, - color_mask); - pfnum = (pfnum >= hi) ? 
lo : pfnum; + } else { + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); + if (pfnum == (pfn_t)-1) { + pfnum = lo; + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); + ASSERT(pfnum != (pfn_t)-1); + } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || + (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { + /* invalid color, get the closest correct pfn */ + PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, + color_mask, &it); + if (pfnum >= hi) { + pfnum = lo; + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); + } + } } /* set starting index */ @@ -2239,12 +2313,16 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, /* jump to the next page in the range */ if (pfnum < nlo) { pfnum = P2ROUNDUP(nlo, szcpgcnt); + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); idx = PNUM_TO_IDX(mnode, r, pfnum); if (idx >= len || pfnum >= hi) goto wrapit; - if ((PFN_2_COLOR(pfnum, szc) ^ color) & + if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask) goto next; + if (interleaved_mnodes && + PFN_2_MEM_NODE(pfnum) != mnode) + goto next; } } #endif @@ -2264,7 +2342,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, if (ret_pp != NULL) { VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); PAGE_COUNTERS_CURRENT_COLOR(mnode, r, - PFN_2_COLOR(pfnum, szc), mrange) = idx; + PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; page_freelist_unlock(mnode); rw_exit(&page_ctrs_rwlock[mnode]); #if defined(__sparc) @@ -2299,11 +2377,12 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, } next: PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, - color_mask); + color_mask, &it); idx = PNUM_TO_IDX(mnode, r, pfnum); if (idx >= len || pfnum >= hi) { wrapit: pfnum = lo; + MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); idx = PNUM_TO_IDX(mnode, r, pfnum); wrap++; #if defined(__sparc) @@ -2319,14 +2398,17 @@ wrapit: /* * For the given mnode, promote as many small pages to large pages as possible. + * mnode can be -1, which means do them all */ void page_freelist_coalesce_all(int mnode) { int r; /* region size */ int idx, full; - pfn_t pfnum; size_t len; + int doall = interleaved_mnodes || mnode < 0; + int mlo = doall ? 0 : mnode; + int mhi = doall ? max_mem_nodes : (mnode + 1); VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); @@ -2340,39 +2422,54 @@ page_freelist_coalesce_all(int mnode) * Always promote to the largest page possible * first to reduce the number of page promotions. */ - rw_enter(&page_ctrs_rwlock[mnode], RW_READER); - page_freelist_lock(mnode); + for (mnode = mlo; mnode < mhi; mnode++) { + rw_enter(&page_ctrs_rwlock[mnode], RW_READER); + page_freelist_lock(mnode); + } for (r = mmu_page_sizes - 1; r > 0; r--) { - pgcnt_t cands = 0; - int mrange, nranges = mnode_nranges[mnode]; + for (mnode = mlo; mnode < mhi; mnode++) { + pgcnt_t cands = 0; + int mrange, nranges = mnode_nranges[mnode]; - for (mrange = 0; mrange < nranges; mrange++) { - PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); - if (cands != 0) - break; - } - if (cands == 0) { - VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); - continue; - } + for (mrange = 0; mrange < nranges; mrange++) { + PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); + if (cands != 0) + break; + } + if (cands == 0) { + VM_STAT_ADD(vmm_vmstats. 
+ page_ctrs_cands_skip_all); + continue; + } - full = FULL_REGION_CNT(r); - len = PAGE_COUNTERS_ENTRIES(mnode, r); - - for (idx = 0; idx < len; idx++) { - if (PAGE_COUNTERS(mnode, r, idx) == full) { - pfnum = IDX_TO_PNUM(mnode, r, idx); - ASSERT(pfnum >= - mem_node_config[mnode].physbase && - pfnum < - mem_node_config[mnode].physmax); - (void) page_promote(mnode, - pfnum, r, PC_FREE, PC_MTYPE_ANY); + full = FULL_REGION_CNT(r); + len = PAGE_COUNTERS_ENTRIES(mnode, r); + + for (idx = 0; idx < len; idx++) { + if (PAGE_COUNTERS(mnode, r, idx) == full) { + pfn_t pfnum = + IDX_TO_PNUM(mnode, r, idx); + int tmnode = interleaved_mnodes ? + PFN_2_MEM_NODE(pfnum) : mnode; + + ASSERT(pfnum >= + mem_node_config[tmnode].physbase && + pfnum < + mem_node_config[tmnode].physmax); + + (void) page_promote(tmnode, + pfnum, r, PC_FREE, PC_MTYPE_ANY); + } } + /* shared hpm_counters covers all mnodes, so we quit */ + if (interleaved_mnodes) + break; } } - page_freelist_unlock(mnode); - rw_exit(&page_ctrs_rwlock[mnode]); + for (mnode = mlo; mnode < mhi; mnode++) { + page_freelist_unlock(mnode); + rw_exit(&page_ctrs_rwlock[mnode]); + } } /* @@ -2601,22 +2698,22 @@ page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, /* we can split pages in the freelist, but not the cachelist */ if (can_split) { - plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; + plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; - /* calculate next sizes color masks and number of free list bins */ - for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { - plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, - plw->plw_ceq_mask[szc]); - plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); - } - plw->plw_ceq_mask[nszc] = INVALID_MASK; - plw->plw_bins[nszc] = 0; + /* set next szc color masks and number of free list bins */ + for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { + plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, + plw->plw_ceq_mask[szc]); + plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); + } + plw->plw_ceq_mask[nszc] = INVALID_MASK; + plw->plw_bins[nszc] = 0; } else { - ASSERT(szc == 0); - plw->plw_do_split = 0; - plw->plw_bins[1] = 0; - plw->plw_ceq_mask[1] = INVALID_MASK; + ASSERT(szc == 0); + plw->plw_do_split = 0; + plw->plw_bins[1] = 0; + plw->plw_ceq_mask[1] = INVALID_MASK; } } @@ -2664,7 +2761,7 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) if (vac_colors > 1 && nbin == plw->plw_bin_marker) { plw->plw_bin_marker = nbin = INC_MASKED(nbin, neq_mask, - plw->plw_color_mask); + plw->plw_color_mask); plw->plw_bin_split_prev = plw->plw_bin0; /* * large pages all have the same vac color @@ -2710,10 +2807,10 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) } if (plw->plw_bins[nszc] != 0) { - nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); - if (!((plw->plw_split_next ^ nbin_nsz) & - plw->plw_ceq_mask[nszc])) - plw->plw_do_split = 1; + nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); + if (!((plw->plw_split_next ^ nbin_nsz) & + plw->plw_ceq_mask[nszc])) + plw->plw_do_split = 1; } return (nbin); @@ -2864,8 +2961,8 @@ bin_empty_1: */ if (plw.plw_do_split && (pp = page_freelist_split(szc, bin, mnode, - mtype, PFNNULL, &plw)) != NULL) - return (pp); + mtype, PFNNULL, &plw)) != NULL) + return (pp); if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) @@ -3229,6 +3326,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, uint_t color_mask; pfn_t hi, lo; uint_t skip; + 
MEM_NODE_ITERATOR_DECL(it); ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); @@ -3308,6 +3406,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, /* round to szcpgcnt boundaries */ lo = P2ROUNDUP(lo, szcpgcnt); + MEM_NODE_ITERATOR_INIT(lo, mnode, &it); hi = hi & ~(szcpgcnt - 1); if (hi <= lo) @@ -3318,10 +3417,14 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, * page sizes may only have a single page color */ skip = szcpgcnt; - if (ceq_mask > 0) { + if (ceq_mask > 0 || interleaved_mnodes) { /* set lo to point at appropriate color */ - PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, - color_mask); + if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || + (interleaved_mnodes && + PFN_2_MEM_NODE(lo) != mnode)) { + PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, + color_mask, &it); + } if (hi <= lo) /* mseg cannot satisfy color request */ continue; @@ -3331,10 +3434,15 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, randpfn = (pfn_t)GETTICK(); randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); - if (ceq_mask) { - PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask, - color_mask); - randpfn = (randpfn >= hi) ? lo : randpfn; + MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); + if (ceq_mask || interleaved_mnodes) { + if (randpfn != (pfn_t)-1) + PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, + ceq_mask, color_mask, &it); + if (randpfn >= hi) { + randpfn = lo; + MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); + } } randpp = mseg->pages + (randpfn - mseg->pages_base); @@ -3357,17 +3465,23 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, } } - if (ceq_mask == 0) { + if (ceq_mask == 0 && !interleaved_mnodes) { pp += skip; } else { pfn_t pfn = pp->p_pagenum; PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, - ceq_mask, color_mask); - pp = mseg->pages + (pfn - mseg->pages_base); + ceq_mask, color_mask, &it); + if (pfn == (pfn_t)-1) { + pp = endpp; + } else { + pp = mseg->pages + + (pfn - mseg->pages_base); + } } if (pp >= endpp) { /* start from the beginning */ + MEM_NODE_ITERATOR_INIT(lo, mnode, &it); pp = mseg->pages + (lo - mseg->pages_base); ASSERT(pp->p_pagenum == lo); ASSERT(pp + szcpgcnt <= endpp); @@ -3947,9 +4061,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, while ((pplist == NULL) && (mnode = lgrp_memnode_choose(&lgrp_cookie)) != -1) { - pplist = page_get_mnode_freelist( - mnode, bin, mtype, szc, - flags); + pplist = + page_get_mnode_freelist(mnode, bin, + mtype, szc, flags); } /* @@ -3968,8 +4082,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, while ((pplist == NULL) && (mnode = lgrp_memnode_choose(&lgrp_cookie)) != -1) { - pplist = page_get_mnode_cachelist( - bin, flags, mnode, mtype); + pplist = + page_get_mnode_cachelist(bin, flags, + mnode, mtype); } if (pplist != NULL) { page_hashout(pplist, NULL); @@ -4079,11 +4194,11 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, while ((pplist == NULL) && (mnode = - lgrp_memnode_choose(&lgrp_cookie)) + lgrp_memnode_choose(&lgrp_cookie)) != -1) { pplist = page_get_contig_pages( - mnode, bin, mtype, szc, - flags | PGI_PGCPHIPRI); + mnode, bin, mtype, szc, + flags | PGI_PGCPHIPRI); } break; } diff --git a/usr/src/uts/i86pc/os/memnode.c b/usr/src/uts/i86pc/os/memnode.c index e64fd2b0c6..9440ad17f6 100644 --- a/usr/src/uts/i86pc/os/memnode.c +++ b/usr/src/uts/i86pc/os/memnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common 
Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -152,7 +151,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled) if (!cancelled) { delta_pgcnt = end - start; node_size = mem_node_config[mnode].physmax - - mem_node_config[mnode].physbase; + mem_node_config[mnode].physbase; if (node_size > delta_pgcnt) { /* @@ -232,7 +231,7 @@ mem_node_alloc() */ for (mnode = 0; mnode < max_mem_nodes; mnode++) if (cas32((uint32_t *)&mem_node_config[mnode].exists, - 0, 1) == 0) + 0, 1) == 0) break; if (mnode >= max_mem_nodes) @@ -273,7 +272,7 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist) for (pmem = mlist; pmem; pmem = pmem->next) { cur_base = btop(pmem->address); cur_end = cur_base + btop(pmem->size) - 1; - if (end <= cur_base || base >= cur_end) + if (end < cur_base || base > cur_end) continue; npgs = npgs + (MIN(cur_end, end) - MAX(cur_base, base)) + 1; diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 4605f6e517..a44c266f27 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -1343,3 +1343,30 @@ mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, } return (PSM_SUCCESS); } +/* + * Return 1 if CMT load balancing policies should be + * implemented across instances of the specified hardware + * sharing relationship. + */ +int +pg_cmt_load_bal_hw(pghw_type_t hw) +{ + if (hw == PGHW_IPIPE || + hw == PGHW_FPU || + hw == PGHW_CHIP) + return (1); + else + return (0); +} +/* + * Return 1 if thread affinity polices should be implemented + * for instances of the specifed hardware sharing relationship. + */ +int +pg_cmt_affinity_hw(pghw_type_t hw) +{ + if (hw == PGHW_CACHE) + return (1); + else + return (0); +} diff --git a/usr/src/uts/i86pc/sys/memnode.h b/usr/src/uts/i86pc/sys/memnode.h index c76f90216e..21a059ac44 100644 --- a/usr/src/uts/i86pc/sys/memnode.h +++ b/usr/src/uts/i86pc/sys/memnode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,9 +50,6 @@ extern "C" { #define PFN_2_MEM_NODE(pfn) \ ((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0) -#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \ - ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0) - #define MEM_NODE_2_LGRPHAND(mnode) \ ((max_mem_nodes > 1) ? 
plat_mem_node_to_lgrphand(mnode) : \ LGRP_DEFAULT_HANDLE) @@ -90,7 +86,6 @@ extern void mem_node_post_del_slice(pfn_t, pfn_t, int); extern int mem_node_alloc(void); extern pgcnt_t mem_node_memlist_pages(int, struct memlist *); - extern struct mem_node_conf mem_node_config[]; extern uint64_t mem_node_physalign; extern int mem_node_pfn_shift; diff --git a/usr/src/uts/i86pc/vm/vm_dep.h b/usr/src/uts/i86pc/vm/vm_dep.h index b95f6b8e17..49e9386d81 100644 --- a/usr/src/uts/i86pc/vm/vm_dep.h +++ b/usr/src/uts/i86pc/vm/vm_dep.h @@ -39,6 +39,7 @@ extern "C" { #include <sys/clock.h> #include <vm/hat_pte.h> #include <sys/param.h> +#include <sys/memnode.h> /* * WARNING: vm_dep.h is included by files in common. As such, macros @@ -285,10 +286,41 @@ extern kmutex_t *cpc_mutex[NPC_MUTEX]; extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t); extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); -#define PAGE_GET_COLOR_SHIFT(szc, nszc) \ +/* mem node iterator is not used on x86 */ +#define MEM_NODE_ITERATOR_DECL(it) +#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it) + +/* + * interleaved_mnodes mode is never set on x86, therefore, + * simply return the limits of the given mnode, which then + * determines the length of hpm_counters array for the mnode. + */ +#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \ + { \ + (physbase) = mem_node_config[(mnode)].physbase; \ + (physmax) = mem_node_config[(mnode)].physmax; \ + (first) = (mnode); \ + } + +#define PAGE_CTRS_WRITE_LOCK(mnode) \ + { \ + rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);\ + page_freelist_lock(mnode); \ + } + +#define PAGE_CTRS_WRITE_UNLOCK(mnode) \ + { \ + page_freelist_unlock(mnode); \ + rw_exit(&page_ctrs_rwlock[(mnode)]); \ + } + +#define PAGE_GET_COLOR_SHIFT(szc, nszc) \ (hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift) -#define PFN_2_COLOR(pfn, szc) \ +#define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \ + ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))) + +#define PFN_2_COLOR(pfn, szc, it) \ (((pfn) & page_colors_mask) >> \ (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift)) @@ -305,7 +337,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); * This macro calculates the next sequential pfn with the specified * color using color equivalency mask */ -#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \ +#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \ ASSERT(((color) & ~(ceq_mask)) == 0); \ { \ uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \ @@ -329,7 +361,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) /* Find the bin for the given page if it was of size szc */ -#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc)) +#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, NULL)) #define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc)) diff --git a/usr/src/uts/sun4/os/memnode.c b/usr/src/uts/sun4/os/memnode.c index 849bec22c8..cb21287ebd 100644 --- a/usr/src/uts/sun4/os/memnode.c +++ b/usr/src/uts/sun4/os/memnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -83,7 +82,7 @@ mem_node_add_slice(pfn_t start, pfn_t end) end = roundup(end, btop(mem_node_physalign)) - 1; } - if (&plat_slice_add) + if (&plat_slice_add != NULL) plat_slice_add(start, end); mnode = PFN_2_MEM_NODE(start); @@ -148,7 +147,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled) if (!cancelled) { delta_pgcnt = end - start; node_size = mem_node_config[mnode].physmax - - mem_node_config[mnode].physbase; + mem_node_config[mnode].physbase; if (node_size > delta_pgcnt) { /* @@ -180,7 +179,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled) mem_node_config[mnode].exists = 0; } - if (&plat_slice_del) + if (&plat_slice_del != NULL) plat_slice_del(start, end); } } @@ -195,7 +194,7 @@ startup_build_mem_nodes(u_longlong_t *list, size_t nelems) /* LINTED: ASSERT will always true or false */ ASSERT(NBBY * sizeof (mnodeset_t) >= max_mem_nodes); - if (&plat_build_mem_nodes) { + if (&plat_build_mem_nodes != NULL) { plat_build_mem_nodes(list, nelems); } else { /* @@ -226,7 +225,7 @@ mem_node_alloc() */ for (mnode = 0; mnode < max_mem_nodes; mnode++) if (cas32((uint32_t *)&mem_node_config[mnode].exists, - 0, 1) == 0) + 0, 1) == 0) break; if (mnode >= max_mem_nodes) @@ -247,27 +246,39 @@ mem_node_alloc() * Find the intersection between a memnode and a memlist * and returns the number of pages that overlap. * - * Assumes the list is protected from DR operations by - * the memlist lock. + * Grab the memlist lock to protect the list from DR operations. */ pgcnt_t mem_node_memlist_pages(int mnode, struct memlist *mlist) { pfn_t base, end; pfn_t cur_base, cur_end; - pgcnt_t npgs; + pgcnt_t npgs = 0; + pgcnt_t pages; struct memlist *pmem; + if (&plat_mem_node_intersect_range != NULL) { + memlist_read_lock(); + + for (pmem = mlist; pmem; pmem = pmem->next) { + plat_mem_node_intersect_range(btop(pmem->address), + btop(pmem->size), mnode, &pages); + npgs += pages; + } + + memlist_read_unlock(); + return (npgs); + } + base = mem_node_config[mnode].physbase; end = mem_node_config[mnode].physmax; - npgs = 0; memlist_read_lock(); for (pmem = mlist; pmem; pmem = pmem->next) { cur_base = btop(pmem->address); cur_end = cur_base + btop(pmem->size) - 1; - if (end <= cur_base || base >= cur_end) + if (end < cur_base || base > cur_end) continue; npgs = npgs + (MIN(cur_end, end) - MAX(cur_base, base)) + 1; @@ -277,3 +288,34 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist) return (npgs); } + +/* + * Find MIN(physbase) and MAX(physmax) over all mnodes + * + * Called during startup and DR to find hpm_counters limits when + * interleaved_mnodes is set. + * NOTE: there is a race condition with DR if it tries to change more than + * one mnode in parallel. Sizing shared hpm_counters depends on finding the + * min(physbase) and max(physmax) across all mnodes. Therefore, the caller of + * page_ctrs_adjust must ensure that mem_node_config does not change while it + * is running. 
+ */ +void +mem_node_max_range(pfn_t *basep, pfn_t *maxp) +{ + int mnode; + pfn_t max = 0; + pfn_t base = (pfn_t)-1; + + for (mnode = 0; mnode < max_mem_nodes; mnode++) { + if (mem_node_config[mnode].exists == 0) + continue; + if (max < mem_node_config[mnode].physmax) + max = mem_node_config[mnode].physmax; + if (base > mem_node_config[mnode].physbase) + base = mem_node_config[mnode].physbase; + } + ASSERT(base != (pfn_t)-1 && max != 0); + *basep = base; + *maxp = max; +} diff --git a/usr/src/uts/sun4/sys/memnode.h b/usr/src/uts/sun4/sys/memnode.h index d8068b9235..745d03002f 100644 --- a/usr/src/uts/sun4/sys/memnode.h +++ b/usr/src/uts/sun4/sys/memnode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -66,16 +65,13 @@ extern "C" { * nodes, so the platform can always make everything work. */ -#ifndef MAX_MEM_NODES +#ifndef MAX_MEM_NODES #define MAX_MEM_NODES (4) #endif /* MAX_MEM_NODES */ #define PFN_2_MEM_NODE(pfn) \ ((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0) -#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \ - ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0) - #define MEM_NODE_2_LGRPHAND(mnode) \ ((max_mem_nodes > 1) ? plat_mem_node_to_lgrphand(mnode) : \ LGRP_DEFAULT_HANDLE) @@ -90,12 +86,14 @@ extern void plat_assign_lgrphand_to_mem_node(lgrp_handle_t, int); extern lgrp_handle_t plat_mem_node_to_lgrphand(int); extern void plat_slice_add(pfn_t, pfn_t); extern void plat_slice_del(pfn_t, pfn_t); +extern void plat_mem_node_intersect_range(pfn_t, pgcnt_t, int, pgcnt_t *); #pragma weak plat_pfn_to_mem_node #pragma weak plat_lgrphand_to_mem_node #pragma weak plat_mem_node_to_lgrphand #pragma weak plat_slice_add #pragma weak plat_slice_del +#pragma weak plat_mem_node_intersect_range struct mem_node_conf { int exists; /* only try if set, list may still be empty */ @@ -111,7 +109,8 @@ extern void mem_node_pre_del_slice(pfn_t, pfn_t); extern void mem_node_post_del_slice(pfn_t, pfn_t, int); extern int mem_node_alloc(void); extern pgcnt_t mem_node_memlist_pages(int, struct memlist *); - +extern void mem_node_add_slice(pfn_t start, pfn_t end); +extern void mem_node_max_range(pfn_t *, pfn_t *); extern struct mem_node_conf mem_node_config[]; extern uint64_t mem_node_physalign; diff --git a/usr/src/uts/sun4/vm/vm_dep.h b/usr/src/uts/sun4/vm/vm_dep.h index 6f150837f8..357f9ba0a3 100644 --- a/usr/src/uts/sun4/vm/vm_dep.h +++ b/usr/src/uts/sun4/vm/vm_dep.h @@ -107,6 +107,92 @@ extern kmutex_t *fpc_mutex[NPC_MUTEX]; extern kmutex_t *cpc_mutex[NPC_MUTEX]; /* + * Iterator provides the info needed to convert RA to PA. + * MEM_NODE_ITERATOR_INIT() should be called before + * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous + * PAGE_NEXT_PFN_FOR_COLOR() call. Iterator caches color 2 hash + * translations requiring initializer call if color or ceq_mask changes, + * even if pfn doesn't. 
MEM_NODE_ITERATOR_INIT() must also be called before + * PFN_2_COLOR() that uses a valid iterator argument. + */ +#ifdef sun4v + +typedef struct mem_node_iterator { + uint_t mi_mnode; /* mnode in which to iterate */ + int mi_init; /* set to 1 when first init */ + int mi_last_mblock; /* last mblock visited */ + uint_t mi_hash_ceq_mask; /* cached copy of ceq_mask */ + uint_t mi_hash_color; /* cached copy of color */ + uint_t mi_mnode_mask; /* number of mask bits */ + uint_t mi_mnode_pfn_shift; /* mnode position in pfn */ + pfn_t mi_mblock_base; /* first valid pfn in current mblock */ + pfn_t mi_mblock_end; /* last valid pfn in current mblock */ + pfn_t mi_ra_to_pa; /* ra adjustment for current mblock */ + pfn_t mi_mnode_pfn_mask; /* mask to obtain mnode id bits */ +} mem_node_iterator_t; + +#define MEM_NODE_ITERATOR_DECL(it) \ + mem_node_iterator_t it +#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it) \ + (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (it), 1) + +extern pfn_t plat_mem_node_iterator_init(pfn_t, int, + mem_node_iterator_t *, int); +extern pfn_t plat_rapfn_to_papfn(pfn_t); +extern int interleaved_mnodes; + +#else /* sun4v */ + +#define MEM_NODE_ITERATOR_DECL(it) \ + void *it = NULL +#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it) + +#endif /* sun4v */ + +/* + * Return the mnode limits so that hpc_counters length and base + * index can be determined. When interleaved_mnodes is set, we + * create an array only for the first mnode that exists. All other + * mnodes will share the array in this case. + * If interleaved_mnodes is not set, simply return the limits for + * the given mnode. + */ +#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \ + if (!interleaved_mnodes) { \ + (physbase) = mem_node_config[(mnode)].physbase; \ + (physmax) = mem_node_config[(mnode)].physmax; \ + (first) = (mnode); \ + } else if ((first) < 0) { \ + mem_node_max_range(&(physbase), &(physmax)); \ + (first) = (mnode); \ + } + +#define PAGE_CTRS_WRITE_LOCK(mnode) \ + if (!interleaved_mnodes) { \ + rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER); \ + page_freelist_lock(mnode); \ + } else { \ + /* changing shared hpm_counters */ \ + int _i; \ + for (_i = 0; _i < max_mem_nodes; _i++) { \ + rw_enter(&page_ctrs_rwlock[_i], RW_WRITER); \ + page_freelist_lock(_i); \ + } \ + } + +#define PAGE_CTRS_WRITE_UNLOCK(mnode) \ + if (!interleaved_mnodes) { \ + page_freelist_unlock(mnode); \ + rw_exit(&page_ctrs_rwlock[(mnode)]); \ + } else { \ + int _i; \ + for (_i = 0; _i < max_mem_nodes; _i++) { \ + page_freelist_unlock(_i); \ + rw_exit(&page_ctrs_rwlock[_i]); \ + } \ + } + +/* * cpu specific color conversion functions */ extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t); @@ -118,11 +204,14 @@ extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t); extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t); #pragma weak page_get_color_shift_cpu +extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t); +#pragma weak page_convert_color_cpu + extern pfn_t page_next_pfn_for_color_cpu(pfn_t, - uchar_t, uint_t, uint_t, uint_t); + uchar_t, uint_t, uint_t, uint_t, void *); #pragma weak page_next_pfn_for_color_cpu -extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t); +extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t, void *); #pragma weak page_pfn_2_color_cpu #define PAGE_GET_COLOR_SHIFT(szc, nszc) \ @@ -131,9 +220,14 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t); (hw_page_array[(nszc)].hp_shift - \ hw_page_array[(szc)].hp_shift)) -#define PFN_2_COLOR(pfn, szc) \ +#define PAGE_CONVERT_COLOR(ncolor, 
szc, nszc) \ + ((&page_convert_color_cpu != NULL) ? \ + page_convert_color_cpu(ncolor, szc, nszc) : \ + ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))) + +#define PFN_2_COLOR(pfn, szc, it) \ ((&page_pfn_2_color_cpu != NULL) ? \ - page_pfn_2_color_cpu(pfn, szc) : \ + page_pfn_2_color_cpu(pfn, szc, it) : \ ((pfn & (hw_page_array[0].hp_colors - 1)) >> \ (hw_page_array[szc].hp_shift - \ hw_page_array[0].hp_shift))) @@ -151,7 +245,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t); * This macro calculates the next sequential pfn with the specified * color using color equivalency mask */ -#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \ +#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \ ASSERT(((color) & ~(ceq_mask)) == 0); \ if (&page_next_pfn_for_color_cpu == NULL) { \ uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \ @@ -165,8 +259,8 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t); pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \ } \ } else { \ - pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \ - ceq_mask, color_mask); \ + pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \ + ceq_mask, color_mask, it); \ } /* get the color equivalency mask for the next szc */ @@ -182,7 +276,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t); page_get_nsz_color_cpu(szc, color)) /* Find the bin for the given page if it was of size szc */ -#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc)) +#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1))) #define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc)) @@ -335,16 +429,31 @@ typedef struct { * when memory is added (kphysm_add_memory_dynamic) or deleted * (kphysm_del_cleanup). */ -#define PLCNT_MODIFY_MAX(startpfn, cnt) { \ - pfn_t pfn = startpfn, endpfn = startpfn + ABS(cnt); \ - while (pfn < endpfn) { \ - int mn = PFN_2_MEM_NODE(pfn); \ - long inc = MIN(endpfn, mem_node_config[mn].physmax + 1) \ - - pfn; \ - pfn += inc; \ - atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, \ - ((cnt) < 0) ? -inc: inc); \ - } \ +#define PLCNT_MODIFY_MAX(pfn, cnt) { \ + spgcnt_t _cnt = (spgcnt_t)(cnt); \ + pgcnt_t _acnt = ABS(_cnt); \ + int _mn; \ + pgcnt_t _np; \ + if (&plat_mem_node_intersect_range != NULL) { \ + for (_mn = 0; _mn < max_mem_nodes; _mn++) { \ + plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\ + if (_np == 0) \ + continue; \ + atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \ + (_cnt < 0) ? -_np : _np); \ + } \ + } else { \ + pfn_t _pfn = (pfn); \ + pfn_t _endpfn = _pfn + _acnt; \ + while (_pfn < _endpfn) { \ + _mn = PFN_2_MEM_NODE(_pfn); \ + _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \ + _pfn; \ + _pfn += _np; \ + atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \ + (_cnt < 0) ? 
-_np : _np); \ + } \ + } \ } extern plcnt_t plcnt; @@ -495,17 +604,17 @@ switch (consistent_coloring) { \ (vac_shift - MMU_PAGESHIFT)); \ if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) { \ pfn += slew; \ - bin = PFN_2_COLOR(pfn, szc); \ + bin = PFN_2_COLOR(pfn, szc, NULL); \ } else { \ - bin = PFN_2_COLOR(pfn, szc); \ + bin = PFN_2_COLOR(pfn, szc, NULL); \ bin += slew >> (vac_shift - MMU_PAGESHIFT); \ bin &= hw_page_array[(szc)].hp_colors - 1; \ } \ break; \ } \ case 1: \ - bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \ - szc); \ + bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \ + szc, NULL); \ break; \ case 2: { \ int cnt = as_color_bin(as); \ diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c index 86a021f3d1..20aa7855c8 100644 --- a/usr/src/uts/sun4u/os/cmp.c +++ b/usr/src/uts/sun4u/os/cmp.c @@ -280,3 +280,30 @@ cmp_set_nosteal_interval(void) /* Set the nosteal interval (used by disp_getbest()) to 100us */ nosteal_nsec = 100000UL; } +/* + * Return 1 if CMT load balancing policies should be + * implemented across instances of the specified hardware + * sharing relationship. + */ +int +pg_cmt_load_bal_hw(pghw_type_t hw) +{ + if (hw == PGHW_IPIPE || + hw == PGHW_FPU || + hw == PGHW_CHIP) + return (1); + else + return (0); +} +/* + * Return 1 if thread affinity polices should be implemented + * for instances of the specifed hardware sharing relationship. + */ +int +pg_cmt_affinity_hw(pghw_type_t hw) +{ + if (hw == PGHW_CACHE) + return (1); + else + return (0); +} diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files index d153205ab1..b7c02c1e22 100644 --- a/usr/src/uts/sun4v/Makefile.files +++ b/usr/src/uts/sun4v/Makefile.files @@ -62,6 +62,7 @@ CORE_OBJS += mach_xc.o CORE_OBJS += mem_cage.o CORE_OBJS += mem_config.o CORE_OBJS += memlist_new.o +CORE_OBJS += mpo.o CORE_OBJS += ppage.o CORE_OBJS += promif_asr.o CORE_OBJS += promif_cpu.o diff --git a/usr/src/uts/sun4v/Makefile.sun4v.shared b/usr/src/uts/sun4v/Makefile.sun4v.shared index 82dbb2b21c..a299bb9a56 100644 --- a/usr/src/uts/sun4v/Makefile.sun4v.shared +++ b/usr/src/uts/sun4v/Makefile.sun4v.shared @@ -215,6 +215,7 @@ FDOFFSETS = $(UTSBASE)/sun/io/fd_offsets.in # MACHINE_DEFS = -D$(PLATFORM) -D_MACHDEP -DSFMMU +MACHINE_DEFS += -DMAX_MEM_NODES=8 $(MPSAS_BUILD)MACHINE_DEFS += -DMPSAS diff --git a/usr/src/uts/sun4v/cpu/generic.c b/usr/src/uts/sun4v/cpu/generic.c index eab39b9fe9..21771a5f71 100644 --- a/usr/src/uts/sun4v/cpu/generic.c +++ b/usr/src/uts/sun4v/cpu/generic.c @@ -35,6 +35,7 @@ #include <sys/elf_SPARC.h> #include <vm/hat_sfmmu.h> #include <vm/page.h> +#include <vm/vm_dep.h> #include <sys/cpuvar.h> #include <sys/async.h> #include <sys/cmn_err.h> @@ -167,6 +168,10 @@ cpu_map_exec_units(struct cpu *cp) if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND) cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id); + cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping; + if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND) + cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID; + cp->cpu_m.cpu_core = (id_t)(cp->cpu_id); /* diff --git a/usr/src/uts/sun4v/cpu/niagara.c b/usr/src/uts/sun4v/cpu/niagara.c index cb7e182d27..d607c2625c 100644 --- a/usr/src/uts/sun4v/cpu/niagara.c +++ b/usr/src/uts/sun4v/cpu/niagara.c @@ -193,9 +193,10 @@ cpu_map_exec_units(struct cpu *cp) /* * Niagara systems just have one chip. Therefore, the chip id - * is always 0. + * mpipe id are always 0. 
*/ cp->cpu_m.cpu_chip = 0; + cp->cpu_m.cpu_mpipe = 0; } static int niagara_cpucnt; diff --git a/usr/src/uts/sun4v/cpu/niagara2.c b/usr/src/uts/sun4v/cpu/niagara2.c index e791361578..e77b2ef3b4 100644 --- a/usr/src/uts/sun4v/cpu/niagara2.c +++ b/usr/src/uts/sun4v/cpu/niagara2.c @@ -198,9 +198,9 @@ cpu_map_exec_units(struct cpu *cp) * share the same L2 cache. If no such info is available, we * set the cpu to belong to the defacto chip 0. */ - cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping; - if (cp->cpu_m.cpu_chip == NO_CHIP_MAPPING_FOUND) - cp->cpu_m.cpu_chip = 0; + cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping; + if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND) + cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID; } static int cpucnt; @@ -283,22 +283,112 @@ cpu_trapstat_data(void *buf, uint_t tstat_pgszs) } } +/* + * Page coloring support for hashed cache index mode + */ + +/* + * Node id bits from machine description (MD). Node id distinguishes + * local versus remote memory. Because of MPO, page allocation does + * not cross node boundaries. Therefore, remove the node id bits from + * the color, since they are fixed. Either bit 30, or 31:30 in + * Victoria Falls processors. + * The number of node id bits is always 0 in Niagara2. + */ +typedef struct n2color { + uchar_t nnbits; /* number of node id bits */ + uchar_t nnmask; /* mask for node id bits */ + uchar_t lomask; /* mask for bits below node id */ + uchar_t lobits; /* number of bits below node id */ +} n2color_t; + +n2color_t n2color[MMU_PAGE_SIZES]; +static uchar_t nhbits[] = {7, 7, 6, 5, 5, 5}; + +/* + * Remove node id bits from color bits 32:28. + * This will reduce the number of colors. + * No change if number of node bits is zero. + */ +static inline uint_t +n2_hash2color(uint_t color, uchar_t szc) +{ + n2color_t m = n2color[szc]; + + if (m.nnbits > 0) { + color = ((color >> m.nnbits) & ~m.lomask) | (color & m.lomask); + ASSERT((color & ~(hw_page_array[szc].hp_colors - 1)) == 0); + } + + return (color); +} + +/* + * Restore node id bits into page color. + * This will increase the number of colors to match N2. + * No change if number of node bits is zero. + */ +static inline uint_t +n2_color2hash(uint_t color, uchar_t szc, uint_t node) +{ + n2color_t m = n2color[szc]; + + if (m.nnbits > 0) { + color = ((color & ~m.lomask) << m.nnbits) | (color & m.lomask); + color |= (node & m.nnmask) << m.lobits; + } + + return (color); +} + /* NI2 L2$ index is pa[32:28]^pa[17:13].pa[19:18]^pa[12:11].pa[10:6] */ + +/* + * iterator NULL means pfn is VA, do not adjust ra_to_pa + * iterator (-1) means pfn is RA, need to convert to PA + * iterator non-null means pfn is RA, use ra_to_pa + */ uint_t -page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc) +page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie) { + mem_node_iterator_t *it = cookie; uint_t color; ASSERT(szc <= TTE256M); + if (it == ((mem_node_iterator_t *)(-1))) { + pfn = plat_rapfn_to_papfn(pfn); + } else if (it != NULL) { + ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end); + pfn = pfn + it->mi_ra_to_pa; + } pfn = PFN_BASE(pfn, szc); color = ((pfn >> 15) ^ pfn) & 0x1f; - if (szc >= TTE4M) - return (color); + if (szc < TTE4M) { + /* 19:18 */ + color = (color << 2) | ((pfn >> 5) & 0x3); + if (szc > TTE64K) + color >>= 1; /* 19 */ + } + return (n2_hash2color(color, szc)); +} - color = (color << 2) | ((pfn >> 5) & 0x3); +static uint_t +page_papfn_2_color_cpu(pfn_t papfn, uchar_t szc) +{ + uint_t color; + + ASSERT(szc <= TTE256M); - return (szc <= TTE64K ? 
color : (color >> 1)); + papfn = PFN_BASE(papfn, szc); + color = ((papfn >> 15) ^ papfn) & 0x1f; + if (szc < TTE4M) { + /* 19:18 */ + color = (color << 2) | ((papfn >> 5) & 0x3); + if (szc > TTE64K) + color >>= 1; /* 19 */ + } + return (color); } #if TTE256M != 5 @@ -310,46 +400,91 @@ page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask) { static uint_t ni2_color_masks[5] = {0x63, 0x1e, 0x3e, 0x1f, 0x1f}; ASSERT(szc < TTE256M); - + mask = n2_color2hash(mask, szc, 0); mask &= ni2_color_masks[szc]; - return ((szc == TTE64K || szc == TTE512K) ? (mask >> 1) : mask); + if (szc == TTE64K || szc == TTE512K) + mask >>= 1; + return (n2_hash2color(mask, szc + 1)); } uint_t page_get_nsz_color_cpu(uchar_t szc, uint_t color) { ASSERT(szc < TTE256M); - return ((szc == TTE64K || szc == TTE512K) ? (color >> 1) : color); + color = n2_color2hash(color, szc, 0); + if (szc == TTE64K || szc == TTE512K) + color >>= 1; + return (n2_hash2color(color, szc + 1)); } uint_t page_get_color_shift_cpu(uchar_t szc, uchar_t nszc) { + uint_t s; ASSERT(nszc >= szc); ASSERT(nszc <= TTE256M); - if (szc == nszc) - return (0); - if (szc <= TTE64K) - return ((nszc >= TTE4M) ? 2 : ((nszc >= TTE512K) ? 1 : 0)); - if (szc == TTE512K) - return (1); + s = nhbits[szc] - n2color[szc].nnbits; + s -= nhbits[nszc] - n2color[nszc].nnbits; - return (0); + return (s); +} + +uint_t +page_convert_color_cpu(uint_t ncolor, uchar_t szc, uchar_t nszc) +{ + uint_t color; + + ASSERT(nszc > szc); + ASSERT(nszc <= TTE256M); + ncolor = n2_color2hash(ncolor, nszc, 0); + color = ncolor << (nhbits[szc] - nhbits[nszc]); + color = n2_hash2color(color, szc); + return (color); } +#define PAPFN_2_MNODE(pfn) \ + (((pfn) & it->mi_mnode_pfn_mask) >> it->mi_mnode_pfn_shift) + /*ARGSUSED*/ pfn_t page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, - uint_t ceq_mask, uint_t color_mask) + uint_t ceq_mask, uint_t color_mask, void *cookie) { + mem_node_iterator_t *it = cookie; pfn_t pstep = PNUM_SIZE(szc); pfn_t npfn, pfn_ceq_mask, pfn_color; pfn_t tmpmask, mask = (pfn_t)-1; + uint_t pfnmn; ASSERT((color & ~ceq_mask) == 0); - - if (((page_pfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0) { + ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end); + + /* convert RA to PA for accurate color calculation */ + if (it->mi_init) { + /* first call after it, so cache these values */ + it->mi_hash_ceq_mask = + n2_color2hash(ceq_mask, szc, it->mi_mnode_mask); + it->mi_hash_color = + n2_color2hash(color, szc, it->mi_mnode); + it->mi_init = 0; + } else { + ASSERT(it->mi_hash_ceq_mask == + n2_color2hash(ceq_mask, szc, it->mi_mnode_mask)); + ASSERT(it->mi_hash_color == + n2_color2hash(color, szc, it->mi_mnode)); + } + ceq_mask = it->mi_hash_ceq_mask; + color = it->mi_hash_color; + pfn += it->mi_ra_to_pa; + + /* restart here when we switch memblocks */ +next_mem_block: + if (szc <= TTE64K) { + pfnmn = PAPFN_2_MNODE(pfn); + } + if (((page_papfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0 && + (szc > TTE64K || pfnmn == it->mi_mnode)) { /* we start from the page with correct color */ if (szc >= TTE512K) { @@ -361,18 +496,19 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, pfn_ceq_mask = ((ceq_mask & 1) << 6) | ((ceq_mask >> 1) << 15); } - pfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask); - return (pfn); + npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask); + goto done; } else { /* * We deal 64K or 8K page. 
Check if we could the * satisfy the request without changing PA[32:28] */ pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2); + pfn_ceq_mask |= it->mi_mnode_pfn_mask; npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask); if ((((npfn ^ pfn) >> 15) & 0x1f) == 0) - return (npfn); + goto done; /* * for next pfn we have to change bits PA[32:28] @@ -382,15 +518,14 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, npfn |= (ceq_mask & color & 3) << 5; pfn_ceq_mask = (szc == TTE8K) ? 0 : (ceq_mask & 0x1c) << 13; + pfn_ceq_mask |= it->mi_mnode_pfn_mask; npfn = ADD_MASKED(npfn, (1 << 15), pfn_ceq_mask, mask); /* * set bits PA[17:13] to match the color */ - ceq_mask >>= 2; - color = (color >> 2) & ceq_mask; - npfn |= ((npfn >> 15) ^ color) & ceq_mask; - return (npfn); + npfn |= ((npfn >> 15) ^ (color >> 2)) & (ceq_mask >> 2); + goto done; } } @@ -405,9 +540,9 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, } else { /* try get the right color by changing bit PA[19:19] */ npfn = pfn + pstep; - if (((page_pfn_2_color_cpu(npfn, szc) ^ color) & + if (((page_papfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0) - return (npfn); + goto done; /* page color is PA[32:28].PA[19:19] */ pfn_ceq_mask = ((ceq_mask & 1) << 6) | @@ -419,34 +554,45 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, while (npfn <= pfn) { npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask); } - return (npfn); + goto done; } /* - * We deal 64K or 8K page of incorrect color. + * We deal 64K or 8K page of incorrect color. * Try correcting color without changing PA[32:28] */ - pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2); pfn_color = ((color & 3) << 5) | (color >> 2); - npfn = (pfn & ~(pfn_t)0x7f); - npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask; - npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn; - - if (((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0) { - - /* the color is fixed - find the next page */ - while (npfn <= pfn) { - npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask); + if (pfnmn == it->mi_mnode) { + npfn = (pfn & ~(pfn_t)0x7f); + npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask; + npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn; + + if (((page_papfn_2_color_cpu(npfn, szc) ^ color) & + ceq_mask) == 0) { + /* the color is fixed - find the next page */ + pfn_ceq_mask |= it->mi_mnode_pfn_mask; + while (npfn <= pfn) { + npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, + mask); + } + if ((((npfn ^ pfn) >> 15) & 0x1f) == 0) + goto done; } - if ((((npfn ^ pfn) >> 15) & 0x1f) == 0) - return (npfn); } /* to fix the color need to touch PA[32:28] */ npfn = (szc == TTE8K) ? ((pfn >> 15) << 15) : (((pfn >> 18) << 18) | ((color & 0x1c) << 13)); + + /* fix mnode if input pfn is in the wrong mnode. */ + if ((pfnmn = PAPFN_2_MNODE(npfn)) != it->mi_mnode) { + npfn += ((it->mi_mnode - pfnmn) & it->mi_mnode_mask) << + it->mi_mnode_pfn_shift; + } + tmpmask = (szc == TTE8K) ? 0 : (ceq_mask & 0x1c) << 13; + tmpmask |= it->mi_mnode_pfn_mask; while (npfn <= pfn) { npfn = ADD_MASKED(npfn, (1 << 15), tmpmask, mask); @@ -456,25 +602,58 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, npfn |= (((npfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask; npfn = (szc == TTE64K) ? 
(npfn & ~(pfn_t)0x7) : npfn; - ASSERT(((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0); +done: + ASSERT(((page_papfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0); + ASSERT(PAPFN_2_MNODE(npfn) == it->mi_mnode); + + /* PA to RA */ + npfn -= it->mi_ra_to_pa; + + /* check for possible memblock switch */ + if (npfn > it->mi_mblock_end) { + pfn = plat_mem_node_iterator_init(npfn, it->mi_mnode, it, 0); + if (pfn == (pfn_t)-1) + return (pfn); + ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end); + pfn += it->mi_ra_to_pa; + goto next_mem_block; + } return (npfn); } /* * init page coloring + * VF encodes node_id for an L-group in either bit 30 or 31:30, + * which effectively reduces the number of colors available per mnode. */ void page_coloring_init_cpu() { int i; - - hw_page_array[0].hp_colors = 1 << 7; - hw_page_array[1].hp_colors = 1 << 7; - hw_page_array[2].hp_colors = 1 << 6; - - for (i = 3; i < mmu_page_sizes; i++) { - hw_page_array[i].hp_colors = 1 << 5; + uchar_t id; + uchar_t lo; + uchar_t hi; + n2color_t m; + mem_node_iterator_t it; + static uchar_t idmask[] = {0, 0x7, 0x1f, 0x1f, 0x1f, 0x1f}; + + (void) plat_mem_node_iterator_init(0, 0, &it, 1); + for (i = 0; i < mmu_page_sizes; i++) { + memset(&m, 0, sizeof (m)); + id = it.mi_mnode_pfn_mask >> 15; /* node id mask */ + id &= idmask[i]; + lo = lowbit(id); + if (lo > 0) { + hi = highbit(id); + m.nnbits = hi - lo + 1; + m.nnmask = (1 << m.nnbits) - 1; + lo += nhbits[i] - 5; + m.lomask = (1 << (lo - 1)) - 1; + m.lobits = lo - 1; + } + hw_page_array[i].hp_colors = 1 << (nhbits[i] - m.nnbits); + n2color[i] = m; } } @@ -486,6 +665,7 @@ page_set_colorequiv_arr_cpu(void) { static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {2, 5, 0, 0, 0, 0}; + nequiv_shades_log2[1] -= n2color[1].nnbits; if (colorequiv > 1) { int i; uint_t sv_a = lowbit(colorequiv) - 1; diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c index 1503ef4b47..d5a9e3087d 100644 --- a/usr/src/uts/sun4v/os/cmp.c +++ b/usr/src/uts/sun4v/os/cmp.c @@ -97,7 +97,7 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw) return (1); case PGHW_FPU: return (1); - case PGHW_CHIP: + case PGHW_MPIPE: return (1); } return (0); @@ -120,8 +120,8 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw) switch (hw) { case PGHW_IPIPE: return (cpu->cpu_m.cpu_ipipe); - case PGHW_CHIP: - return (cpu->cpu_m.cpu_chip); + case PGHW_MPIPE: + return (cpu->cpu_m.cpu_mpipe); case PGHW_FPU: return (cpu->cpu_m.cpu_fpu); default: @@ -143,7 +143,7 @@ pg_plat_hw_level(pghw_type_t hw) static pghw_type_t hw_hier[] = { PGHW_IPIPE, PGHW_FPU, - PGHW_CHIP, + PGHW_MPIPE, PGHW_NUM_COMPONENTS }; @@ -164,7 +164,7 @@ pg_plat_cmt_load_bal_hw(pghw_type_t hw) { if (hw == PGHW_IPIPE || hw == PGHW_FPU || - hw == PGHW_CHIP) + hw == PGHW_MPIPE) return (1); else return (0); @@ -195,3 +195,30 @@ cmp_set_nosteal_interval(void) { nosteal_nsec = 0; } +/* + * Return 1 if CMT load balancing policies should be + * implemented across instances of the specified hardware + * sharing relationship. + */ +int +pg_cmt_load_bal_hw(pghw_type_t hw) +{ + if (hw == PGHW_IPIPE || + hw == PGHW_FPU || + hw == PGHW_MPIPE) + return (1); + else + return (0); +} +/* + * Return 1 if thread affinity polices should be implemented + * for instances of the specifed hardware sharing relationship. 
+ */ +int +pg_cmt_affinity_hw(pghw_type_t hw) +{ + if (hw == PGHW_CACHE) + return (1); + else + return (0); +} diff --git a/usr/src/uts/sun4v/os/mpo.c b/usr/src/uts/sun4v/os/mpo.c new file mode 100644 index 0000000000..d98ce96438 --- /dev/null +++ b/usr/src/uts/sun4v/os/mpo.c @@ -0,0 +1,1264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/machparam.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/mach_descrip.h> +#include <sys/memnode.h> +#include <sys/mdesc.h> +#include <sys/mpo.h> +#include <vm/vm_dep.h> + +/* + * MPO and the sun4v memory representation + * --------------------------------------- + * + * Latency groups are defined in the sun4v achitecture by memory-latency-group + * nodes in the Machine Description, as specified in FWARC/2007/260. These + * tie together cpu nodes and mblock nodes, and contain mask and match + * properties that identify the portion of an mblock that belongs to the + * lgroup. Mask and match are defined in the Physical Address (PA) space, + * but an mblock defines Real Addresses (RA). To translate, the mblock + * includes the property address-congruence-offset, hereafter referred to as + * ra_to_pa. A real address ra is a member of an lgroup if + * + * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match + * + * The MD is traversed, and information on all mblocks is kept in the array + * mpo_mblock[]. Information on all CPUs, including which lgroup they map + * to, is kept in the array mpo_cpu[]. + * + * This implementation makes (and verifies) the simplifying assumption that + * the mask bits are the same for all defined lgroups, and that all 1 bits in + * the mask are contiguous. Thus the number of lgroups is bounded by the + * number of possible mask values, and the lgrp_handle_t is defined as the + * mask value, shifted right to eliminate the 0 bit positions in mask. The + * masks and values are also referred to as "home bits" in the code. + * + * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup + * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock + * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the + * home bits. This yields the mem_node. + * + * Interfaces + * ---------- + * + * This file exports the following entry points: + * + * plat_lgrp_init() + * plat_build_mem_nodes() + * plat_lgrp_cpu_to_hand() + * plat_lgrp_latency() + * plat_pfn_to_mem_node() + * These implement the usual platform lgroup interfaces. 
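The membership rule quoted above is worth seeing as code. A minimal sketch (not from the patch) of the test a real address must pass to belong to an lgroup, with the MD properties address-mask, address-match and address-congruence-offset passed as plain parameters:

/*
 * Sketch only: is RA "ra" a member of the lgroup described by
 * (mask, match), given the owning mblock's ra_to_pa offset?
 */
static int
ra_in_lgroup(uint64_t ra, uint64_t ra_to_pa, uint64_t mask, uint64_t match)
{
	return (((ra + ra_to_pa) & mask) == match);
}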
+ * + * plat_rapfn_to_papfn() + * Recover the PA page coloring bits from an RA. + * + * plat_mem_node_iterator_init() + * Initialize an iterator to efficiently step through pages in a mem_node. + * + * plat_mem_node_intersect_range() + * Find the intersection with a mem_node. + */ + +int sun4v_mpo_enable = 1; +int sun4v_mpo_debug = 0; +char sun4v_mpo_status[256] = ""; + +/* Save CPU info from the MD and associate CPUs with lgroups */ +static struct cpu_md mpo_cpu[NCPU]; + +/* Save lgroup info from the MD */ +#define MAX_MD_LGROUPS 32 +static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS]; +static int n_lgrpnodes = 0; +static int n_locality_groups = 0; +static int max_locality_groups = 0; + +/* Save mblocks from the MD */ +static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS]; +static int n_mblocks = 0; + +/* Save mem_node stripes calculate from mblocks and lgroups. */ +static mem_stripe_t mem_stripes[MAX_MEM_STRIPES]; +static int n_mem_stripes = 0; +static pfn_t mnode_stride; /* distance between stripes, start to start */ +static int stripe_shift; /* stride/stripes expressed as a shift */ +static pfn_t mnode_pages; /* mem_node stripe width */ + +/* Save home mask and shift used to calculate lgrp_handle_t values */ +static uint64_t home_mask = 0; +static pfn_t home_mask_pfn = 0; +static int home_mask_shift = 0; +static uint_t home_mask_pfn_shift = 0; + +/* Save lowest and highest latencies found across all lgroups */ +static int lower_latency = 0; +static int higher_latency = 0; + +static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */ + +static int valid_pages(md_t *md, mde_cookie_t cpu0); +static int unique_home_mem_lg_count(uint64_t mem_lg_homeset); +static int fix_interleave(void); + +/* Debug support */ +#if defined(DEBUG) && !defined(lint) +#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args) +#else +#define MPO_DEBUG(...) +#endif /* DEBUG */ + +/* Record status message, viewable from mdb */ +#define MPO_STATUS(args...) { \ + (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \ + MPO_DEBUG(sun4v_mpo_status); \ +} + +/* + * Routine to read a uint64_t from a given md + */ +static int64_t +get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val) +{ + int err = md_get_prop_val(md, node, propname, val); + return (err); +} + +static int +mblock_cmp(const void *a, const void *b) +{ + struct mblock_md *m1 = (struct mblock_md *)a; + struct mblock_md *m2 = (struct mblock_md *)b; + + if (m1->base < m2->base) + return (-1); + else if (m1->base == m2->base) + return (0); + else + return (1); +} + +static void +mblock_sort(struct mblock_md *mblocks, int n) +{ + extern void qsort(void *, size_t, size_t, + int (*)(const void *, const void *)); + + qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp); +} + +/* + * + * Traverse the MD to determine: + * + * Number of CPU nodes, lgrp_nodes, and mblocks + * Then for each lgrp_node, obtain the appropriate data. + * For each CPU, determine its home locality and store it. + * For each mblock, retrieve its data and store it. 
+ */ +static int +lgrp_traverse(md_t *md) +{ + mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes; + uint64_t i, j, k, o, n_nodes; + uint64_t n_lgroups = 0; + uint64_t mem_lg_homeset = 0; + int ret_val = 0; + int result = 0; + int n_cpunodes = 0; + int sub_page_fix; + + n_nodes = md_node_count(md); + + if (n_nodes <= 0) { + MPO_STATUS("lgrp_traverse: No nodes in node count\n"); + ret_val = -1; + goto fail; + } + + root = md_root_node(md); + + if (root == MDE_INVAL_ELEM_COOKIE) { + MPO_STATUS("lgrp_traverse: Root node is missing\n"); + ret_val = -1; + goto fail; + } + + /* + * Build the Memory Nodes. Do this before any possibility of + * bailing from this routine so we obtain ra_to_pa (needed for page + * coloring) even when there are no lgroups defined. + */ + + n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, + "fwd", &mblocknodes); + + if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) { + MPO_STATUS("lgrp_traverse: No mblock " + "nodes detected in Machine Descriptor\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + for (i = 0; i < n_mblocks; i++) { + mpo_mblock[i].node = mblocknodes[i]; + + /* Without a base or size value we will fail */ + result = get_int(md, mblocknodes[i], PROP_LG_BASE, + &mpo_mblock[i].base); + if (result < 0) { + MPO_STATUS("lgrp_traverse: " + "PROP_LG_BASE is missing\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + result = get_int(md, mblocknodes[i], PROP_LG_SIZE, + &mpo_mblock[i].size); + if (result < 0) { + MPO_STATUS("lgrp_traverse: " + "PROP_LG_SIZE is missing\n"); + n_mblocks = 0; + ret_val = -1; + goto fail; + } + + result = get_int(md, mblocknodes[i], + PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa); + + /* If we don't have an ra_pa_offset, just set it to 0 */ + if (result < 0) + mpo_mblock[i].ra_to_pa = 0; + + MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, " + "ra_to_pa = %lx\n", i, + mpo_mblock[i].base, + mpo_mblock[i].size, + mpo_mblock[i].ra_to_pa); + } + + /* Must sort mblocks by address for mem_node_iterator_init() */ + mblock_sort(mpo_mblock, n_mblocks); + + base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa); + + /* Page coloring hook is required so we can iterate through mnodes */ + if (&page_next_pfn_for_color_cpu == NULL) { + MPO_STATUS("lgrp_traverse: No page coloring support\n"); + ret_val = -1; + goto fail; + } + + /* Global enable for mpo */ + if (sun4v_mpo_enable == 0) { + MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n"); + ret_val = -1; + goto fail; + } + + n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG, + "fwd", &lgrpnodes); + + if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) { + MPO_STATUS("lgrp_traverse: No Lgroups\n"); + ret_val = -1; + goto fail; + } + + n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes); + + if (n_cpunodes <= 0 || n_cpunodes > NCPU) { + MPO_STATUS("lgrp_traverse: No CPU nodes detected " + "in MD\n"); + ret_val = -1; + goto fail; + } + + MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes); + MPO_DEBUG("lgrp_traverse: md: %p\n", md); + MPO_DEBUG("lgrp_traverse: root: %lx\n", root); + MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes); + MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes); + MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks); + + for (i = 0; i < n_lgrpnodes; i++) { + mpo_lgroup[i].node = lgrpnodes[i]; + mpo_lgroup[i].id = i; + mpo_lgroup[i].ncpu = 0; + result = get_int(md, lgrpnodes[i], PROP_LG_MASK, + &mpo_lgroup[i].addr_mask); + result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH, + &mpo_lgroup[i].addr_match); + + 
/* + * If either the mask or match properties are missing, set to 0 + */ + if (result < 0) { + mpo_lgroup[i].addr_mask = 0; + mpo_lgroup[i].addr_match = 0; + } + + /* Set latency to 0 if property not present */ + + result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY, + &mpo_lgroup[i].latency); + if (result < 0) + mpo_lgroup[i].latency = 0; + } + + /* + * Sub-page level interleave is not yet supported. Check for it, + * and remove sub-page interleaved lgroups from mpo_lgroup and + * n_lgrpnodes. If no lgroups are left, return. + */ + + sub_page_fix = fix_interleave(); + if (n_lgrpnodes == 0) { + ret_val = -1; + goto fail; + } + + /* Ensure that all of the addr_mask values are the same */ + + for (i = 0; i < n_lgrpnodes; i++) { + if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) { + MPO_STATUS("lgrp_traverse: " + "addr_mask values are not the same\n"); + ret_val = -1; + goto fail; + } + } + + /* + * Ensure that all lgrp nodes see all the mblocks. However, if + * sub-page interleave is being fixed, they do not, so skip + * the check. + */ + + if (sub_page_fix == 0) { + for (i = 0; i < n_lgrpnodes; i++) { + j = md_alloc_scan_dag(md, mpo_lgroup[i].node, + PROP_LG_MBLOCK, "fwd", &nodes); + md_free_scan_dag(md, &nodes); + if (j != n_mblocks) { + MPO_STATUS("lgrp_traverse: " + "sub-page interleave is being fixed\n"); + ret_val = -1; + goto fail; + } + } + } + + /* + * Use the address mask from the first lgroup node + * to establish our home_mask. + */ + home_mask = mpo_lgroup[0].addr_mask; + home_mask_pfn = btop(home_mask); + home_mask_shift = lowbit(home_mask) - 1; + home_mask_pfn_shift = home_mask_shift - PAGESHIFT; + mnode_pages = btop(1ULL << home_mask_shift); + + /* + * How many values are possible in home mask? Assume the mask + * bits are contiguous. + */ + max_locality_groups = + 1 << highbit(home_mask_pfn >> home_mask_pfn_shift); + + /* Now verify the home mask bits are contiguous */ + + if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) { + MPO_STATUS("lgrp_traverse: " + "home mask bits are not contiguous\n"); + ret_val = -1; + goto fail; + } + + /* Record all of the home bits */ + + for (i = 0; i < n_lgrpnodes; i++) { + HOMESET_ADD(mem_lg_homeset, + mpo_lgroup[i].addr_match >> home_mask_shift); + } + + /* Count the number different "home" mem_lg's we've discovered */ + + n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset); + + /* If we have only 1 locality group then we can exit */ + if (n_locality_groups == 1) { + MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n"); + ret_val = -1; + goto fail; + } + + /* + * Set the latencies. A CPU's lgroup is defined by the lowest + * latency found. All other memory is considered remote, and the + * remote latency is represented by the highest latency found. + * Thus hierarchical lgroups, if any, are approximated by a + * two level scheme. + * + * The Solaris MPO framework by convention wants to see latencies + * in units of nano-sec/10. In the MD, the units are defined to be + * pico-seconds. 
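To make the home-mask arithmetic below concrete, here is a worked example with hypothetical values (a two-bit home field at PA bits 31:30, and 8K base pages so PAGESHIFT is 13); the numbers are not taken from any particular MD:

/*
 * home_mask           = 0xC0000000              (PA bits 31:30)
 * home_mask_shift     = lowbit(home_mask) - 1        = 30
 * home_mask_pfn       = btop(home_mask)              = 0x60000
 * home_mask_pfn_shift = home_mask_shift - PAGESHIFT  = 17
 * mnode_pages         = btop(1ULL << 30)             = 0x20000 (1GB stripe)
 * max_locality_groups = 1 << highbit(0x60000 >> 17)  = 1 << highbit(3) = 4
 * contiguity check:  4 - 1 == (0x60000 >> 17) == 3, so the mask is accepted.
 */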
+ */ + + lower_latency = mpo_lgroup[0].latency; + higher_latency = mpo_lgroup[0].latency; + + for (i = 1; i < n_lgrpnodes; i++) { + if (mpo_lgroup[i].latency < lower_latency) { + lower_latency = mpo_lgroup[i].latency; + } + if (mpo_lgroup[i].latency > higher_latency) { + higher_latency = mpo_lgroup[i].latency; + } + } + lower_latency /= 10000; + higher_latency /= 10000; + + /* Clear our CPU data */ + + for (i = 0; i < NCPU; i++) { + mpo_cpu[i].home = 0; + mpo_cpu[i].latency = (uint_t)(-1); + } + + /* Build the CPU nodes */ + for (i = 0; i < n_cpunodes; i++) { + + /* Read in the lgroup nodes */ + + result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k); + if (result < 0) { + MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n"); + ret_val = -1; + goto fail; + } + + n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG, + "fwd", &nodes); + if (n_lgroups <= 0) { + MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing"); + ret_val = -1; + goto fail; + } + + /* + * Find the lgroup this cpu belongs to with the lowest latency. + * Check all the lgrp nodes connected to this CPU to determine + * which has the smallest latency. + */ + + for (j = 0; j < n_lgroups; j++) { + for (o = 0; o < n_lgrpnodes; o++) { + if (nodes[j] == mpo_lgroup[o].node) { + if (mpo_lgroup[o].latency < + mpo_cpu[k].latency) { + mpo_cpu[k].home = + mpo_lgroup[o].addr_match + >> home_mask_shift; + mpo_cpu[k].latency = + mpo_lgroup[o].latency; + mpo_lgroup[o].ncpu++; + } + } + } + } + md_free_scan_dag(md, &nodes); + } + + /* Validate that no large pages cross mnode boundaries. */ + if (valid_pages(md, cpunodes[0]) == 0) { + ret_val = -1; + goto fail; + } + +fail: + /* MD cookies are no longer valid; ensure they are not used again. */ + for (i = 0; i < n_mblocks; i++) + mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE; + for (i = 0; i < n_lgrpnodes; i++) + mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE; + + if (n_cpunodes > 0) + md_free_scan_dag(md, &cpunodes); + if (n_lgrpnodes > 0) + md_free_scan_dag(md, &lgrpnodes); + if (n_mblocks > 0) + md_free_scan_dag(md, &mblocknodes); + else + panic("lgrp_traverse: No memory blocks found"); + + if (ret_val == 0) + MPO_STATUS("MPO feature is enabled.\n"); + + return (ret_val); +} + +/* + * Determine the number of unique mem_lg's present in our system + */ +static int +unique_home_mem_lg_count(uint64_t mem_lg_homeset) +{ + int homeid; + int count = 0; + + /* + * Scan the "home" bits of the mem_lgs, count + * the number that are unique. + */ + + for (homeid = 0; homeid < NLGRPS_MAX; homeid++) { + if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) { + count++; + } + } + + MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n", + mem_lg_homeset); + MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count); + + /* Default must be at least one */ + if (count == 0) + count = 1; + + return (count); +} + +/* + * Platform specific lgroup initialization + */ +void +plat_lgrp_init(void) +{ + md_t *md; + int i, rc, ncpu_min; + + /* Get the Machine Descriptor handle */ + + md = md_get_handle(); + + /* If not, we cannot continue */ + + if (md == NULL) { + panic("cannot access machine descriptor\n"); + } else { + rc = lgrp_traverse(md); + (void) md_fini_handle(md); + } + + /* + * If we can't process the MD for lgroups then at least let the + * system try to boot. Assume we have one lgroup so that + * when plat_build_mem_nodes is called, it will attempt to init + * an mnode based on the supplied memory segment. 
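The CPU loop above reduces each CPU to a single home value. A minimal sketch (not from the patch; the function name is hypothetical) of that reduction, which takes the lowest-latency lgroup a CPU is attached to and shifts its address-match down by the home-mask shift:

/* Sketch only: derive the lgrp handle ("home") from an lgroup's match value. */
static uint_t
lgroup_home(uint64_t addr_match, int home_mask_shift)
{
	return ((uint_t)(addr_match >> home_mask_shift));
}

With the hypothetical mask from the earlier worked example (bits 31:30), an lgroup whose address-match is 0x40000000 gets home 1.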
+ */ + + if (rc == -1) { + home_mask_pfn = 0; + max_locality_groups = 1; + n_locality_groups = 1; + return; + } + + mem_node_pfn_shift = 0; + mem_node_physalign = 0; + + /* Use lgroup-aware TSB allocations */ + tsb_lgrp_affinity = 1; + + /* + * lgrp_expand_proc_thresh is the minimum load on the lgroups + * this process is currently running on before considering + * expanding threads to another lgroup. + * + * lgrp_expand_proc_diff determines how much less the remote lgroup + * must be loaded before expanding to it. + * + * On sun4v CMT processors, threads share a core pipeline, and + * at less than 100% utilization, best throughput is obtained by + * spreading threads across more cores, even if some are in a + * different lgroup. Spread threads to a new lgroup if the + * current group is more than 50% loaded. Because of virtualization, + * lgroups may have different numbers of CPUs, but the tunables + * apply to all lgroups, so find the smallest lgroup and compute + * 50% loading. + */ + + ncpu_min = NCPU; + for (i = 0; i < n_lgrpnodes; i++) { + int ncpu = mpo_lgroup[i].ncpu; + if (ncpu != 0 && ncpu < ncpu_min) + ncpu_min = ncpu; + } + lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2; + + /* new home may only be half as loaded as the existing home to use it */ + lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2; + + lgrp_loadavg_tolerance = lgrp_loadavg_max_effect; + + /* Require that a home lgroup have some memory to be chosen */ + lgrp_mem_free_thresh = 1; + + /* Standard home-on-next-touch policy */ + lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT; + + /* Disable option to choose root lgroup if all leaf lgroups are busy */ + lgrp_load_thresh = UINT32_MAX; +} + +/* + * Helper routine for debugging calls to mem_node_add_slice() + */ +static void +mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn) +{ +#if defined(DEBUG) && !defined(lint) + static int slice_count = 0; + + slice_count++; + MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n", + slice_count, basepfn, endpfn); +#endif + mem_node_add_slice(basepfn, endpfn); +} + +/* + * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node() + */ +static void +mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode) +{ + MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld," + "mnode index: %d\n", plathand, mnode); + plat_assign_lgrphand_to_mem_node(plathand, mnode); +} + +/* + * plat_build_mem_nodes() + * + * Define the mem_nodes based on the modified boot memory list, + * or based on info read from the MD in plat_lgrp_init(). + * + * When the home mask lies in the middle of the address bits (as it does on + * Victoria Falls), then the memory in one mem_node is no longer contiguous; + * it is striped across an mblock in a repeating pattern of contiguous memory + * followed by a gap. The stripe width is the size of the contiguous piece. + * The stride is the distance from the start of one contiguous piece to the + * start of the next. The gap is thus stride - stripe_width. + * + * The stripe of an mnode that falls within an mblock is described by the type + * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The + * mem_stripe_t's are kept in a global array mem_stripes[]. The index into + * this array is predetermined. 
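Returning briefly to the tunables set in plat_lgrp_init() above, a quick check of the arithmetic with hypothetical numbers, reading lgrp_loadavg_max_effect as the load of one fully busy CPU per the 50%-loading rationale in the comment:

/*
 * Worked example (hypothetical): if the smallest lgroup has ncpu_min = 8
 * CPUs, then
 *     lgrp_expand_proc_thresh = 8 * lgrp_loadavg_max_effect / 2
 * i.e. roughly 4 CPUs' worth of load, and
 *     lgrp_expand_proc_diff   = lgrp_expand_proc_thresh / 2
 * i.e. roughly 2 CPUs' worth, so a remote lgroup must be at most half as
 * loaded as the home before threads spread to it.
 */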
The mem_stripe_t that describes mnode m + * within mpo_mblock[i] is stored at + * mem_stripes[ m + i * max_locality_groups ] + * + * max_locality_groups is the total number of possible locality groups, + * as defined by the size of the home mask, even if the memory assigned + * to the domain is small and does not cover all the lgroups. Thus some + * mem_stripe_t's may be empty. + * + * The members of mem_stripe_t are: + * physbase: First valid page in mem_node in the corresponding mblock + * physmax: Last valid page in mem_node in mblock + * offset: The full stripe width starts at physbase - offset. + * Thus if offset is non-zero, this mem_node starts in the middle + * of a stripe width, and the second full stripe starts at + * physbase - offset + stride. (even though physmax may fall in the + * middle of a stripe width, we do not save the ending fragment size + * in this data structure.) + * exists: Set to 1 if the mblock has memory in this mem_node stripe. + * + * The stripe width is kept in the global mnode_pages. + * The stride is kept in the global mnode_stride. + * All the above use pfn's as the unit. + * + * As an example, the memory layout for a domain with 2 mblocks and 4 + * mem_nodes 0,1,2,3 could look like this: + * + * 123012301230 ... 012301230123 ... + * mblock 0 mblock 1 + */ + +void +plat_build_mem_nodes(u_longlong_t *list, size_t nelems) +{ + lgrp_handle_t lgrphand, lgrp_start; + int i, mnode, elem; + uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride; + uint64_t stripe, frag, remove; + mem_stripe_t *ms; + + /* Check for non-MPO sun4v platforms */ + + if (n_locality_groups <= 1) { + mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0); + for (elem = 0; elem < nelems; elem += 2) { + base = list[elem]; + len = list[elem+1]; + + mpo_mem_node_add_slice(btop(base), + btop(base + len - 1)); + } + mem_node_pfn_shift = 0; + mem_node_physalign = 0; + n_mem_stripes = 0; + return; + } + + /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */ + max_mem_nodes = max_locality_groups; + bzero(mem_stripes, sizeof (mem_stripes)); + stripe = ptob(mnode_pages); + stride = max_locality_groups * stripe; + + /* Save commonly used values in globals */ + mnode_stride = btop(stride); + n_mem_stripes = max_locality_groups * n_mblocks; + stripe_shift = highbit(max_locality_groups) - 1; + + for (i = 0; i < n_mblocks; i++) { + + base = mpo_mblock[i].base; + end = mpo_mblock[i].base + mpo_mblock[i].size; + ra_to_pa = mpo_mblock[i].ra_to_pa; + mpo_mblock[i].base_pfn = btop(base); + mpo_mblock[i].end_pfn = btop(end - 1); + + /* Find the offset from the prev stripe boundary in PA space. */ + offset = (base + ra_to_pa) & (stripe - 1); + + /* Set the next stripe boundary. */ + stripe_end = base - offset + stripe; + + lgrp_start = (((base + ra_to_pa) & home_mask) >> + home_mask_shift); + lgrphand = lgrp_start; + + /* + * Loop over all lgroups covered by the mblock, creating a + * stripe for each. Stop when lgrp_start is visited again. + */ + do { + /* mblock may not span all lgroups */ + if (base >= end) + break; + + mnode = lgrphand; + ASSERT(mnode < max_mem_nodes); + + /* + * Calculate the size of the fragment that does not + * belong to the mnode in the last partial stride. 
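Before the fragment handling continues below, a standalone sketch (not part of the patch; the helper name is hypothetical) of the stripe bookkeeping described in the block comment above, together with the stripe/stride/gap relations:

/*
 * Sketch only: locate the mem_stripe_t for memory node "mnode" within
 * mblock "i".  Geometry, in pfn/byte terms as used above:
 *   stripe = ptob(mnode_pages)               width of one contiguous piece
 *   stride = max_locality_groups * stripe    start-to-start distance
 *   gap    = stride - stripe
 */
static mem_stripe_t *
stripe_for(int i, int mnode)
{
	return (&mem_stripes[i * max_locality_groups + mnode]);
}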
+ */ + frag = (end - (base - offset)) & (stride - 1); + if (frag == 0) { + /* remove the gap */ + remove = stride - stripe; + } else if (frag < stripe) { + /* fragment fits in stripe; keep it all */ + remove = 0; + } else { + /* fragment is large; trim after whole stripe */ + remove = frag - stripe; + } + + ms = &mem_stripes[i * max_locality_groups + mnode]; + ms->physbase = btop(base); + ms->physmax = btop(end - 1 - remove); + ms->offset = btop(offset); + ms->exists = 1; + + mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode); + mpo_mem_node_add_slice(ms->physbase, ms->physmax); + + base = stripe_end; + stripe_end += stripe; + offset = 0; + lgrphand = (((base + ra_to_pa) & home_mask) >> + home_mask_shift); + } while (lgrphand != lgrp_start); + } + + /* + * Indicate to vm_pagelist that the hpm_counters array + * should be shared because the ranges overlap. + */ + if (max_mem_nodes > 1) { + interleaved_mnodes = 1; + } +} + +/* + * Return the locality group value for the supplied processor + */ +lgrp_handle_t +plat_lgrp_cpu_to_hand(processorid_t id) +{ + if (n_locality_groups > 1) { + return ((lgrp_handle_t)mpo_cpu[(int)id].home); + } else { + return ((lgrp_handle_t)0); /* Default */ + } +} + +int +plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to) +{ + /* + * Return min remote latency when there are more than two lgroups + * (root and child) and getting latency between two different lgroups + * or root is involved. + */ + if (lgrp_optimizations() && (from != to || + from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) { + return ((int)higher_latency); + } else { + return ((int)lower_latency); + } +} + +int +plat_pfn_to_mem_node(pfn_t pfn) +{ + int i, mnode; + pfn_t ra_to_pa_pfn; + struct mblock_md *mb; + + if (n_locality_groups <= 1) + return (0); + + /* + * The mnode is defined to be 1:1 with the lgroup handle, which + * is taken from from the home bits. Find the mblock in which + * the pfn falls to get the ra_to_pa adjustment, and extract + * the home bits. + */ + mb = &mpo_mblock[0]; + for (i = 0; i < n_mblocks; i++) { + if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) { + ra_to_pa_pfn = btop(mb->ra_to_pa); + mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >> + home_mask_pfn_shift); + ASSERT(mnode < max_mem_nodes); + return (mnode); + } + mb++; + } + + panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn); + return (pfn); +} + +/* + * plat_rapfn_to_papfn + * + * Convert a pfn in RA space to a pfn in PA space, in which the page coloring + * and home mask bits are correct. The upper bits do not necessarily + * match the actual PA, however. + */ +pfn_t +plat_rapfn_to_papfn(pfn_t pfn) +{ + int i; + pfn_t ra_to_pa_pfn; + struct mblock_md *mb; + + ASSERT(n_mblocks > 0); + if (n_mblocks == 1) + return (pfn + base_ra_to_pa_pfn); + + /* + * Find the mblock in which the pfn falls + * in order to get the ra_to_pa adjustment. + */ + for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) { + if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) { + ra_to_pa_pfn = btop(mb->ra_to_pa); + return (pfn + ra_to_pa_pfn); + } + } + + panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn); + return (pfn); +} + +/* + * plat_mem_node_iterator_init() + * Initialize cookie to iterate over pfn's in an mnode. There is + * no additional iterator function. The caller uses the info from + * the iterator structure directly. + * + * pfn: starting pfn. + * mnode: desired mnode. 
+ * init: set to 1 for full init, 0 for continuation + * + * Returns the appropriate starting pfn for the iteration + * the same as the input pfn if it falls in an mblock. + * Returns the (pfn_t)-1 value if the input pfn lies past + * the last valid mnode pfn. + */ +pfn_t +plat_mem_node_iterator_init(pfn_t pfn, int mnode, + mem_node_iterator_t *it, int init) +{ + int i; + struct mblock_md *mblock; + pfn_t base, end; + + ASSERT(it != NULL); + ASSERT(mnode >= 0 && mnode < max_mem_nodes); + ASSERT(n_mblocks > 0); + + if (init) { + it->mi_last_mblock = 0; + it->mi_init = 1; + } + + /* Check if mpo is not enabled and we only have one mblock */ + if (n_locality_groups == 1 && n_mblocks == 1) { + it->mi_mnode = mnode; + it->mi_ra_to_pa = base_ra_to_pa_pfn; + it->mi_mnode_pfn_mask = 0; + it->mi_mnode_pfn_shift = 0; + it->mi_mnode_mask = 0; + it->mi_mblock_base = mem_node_config[mnode].physbase; + it->mi_mblock_end = mem_node_config[mnode].physmax; + if (pfn < it->mi_mblock_base) + pfn = it->mi_mblock_base; + else if (pfn > it->mi_mblock_end) + pfn = (pfn_t)-1; + return (pfn); + } + + /* + * Find mblock that contains pfn, or first mblock after pfn, + * else pfn is out of bounds, so use the last mblock. + * mblocks are sorted in ascending address order. + */ + ASSERT(it->mi_last_mblock < n_mblocks); + ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn); + i = init ? 0 : it->mi_last_mblock + 1; + if (i == n_mblocks) + return ((pfn_t)-1); + + for (; i < n_mblocks; i++) { + if (pfn <= mpo_mblock[i].end_pfn) + break; + } + if (i == n_mblocks) { + it->mi_last_mblock = i - 1; + return ((pfn_t)-1); + } + it->mi_last_mblock = i; + + /* + * Memory stripes are defined if there is more than one locality + * group, so use the stripe bounds. Otherwise use mblock bounds. + */ + mblock = &mpo_mblock[i]; + if (n_mem_stripes > 0) { + mem_stripe_t *ms = + &mem_stripes[i * max_locality_groups + mnode]; + base = ms->physbase; + end = ms->physmax; + } else { + ASSERT(mnode == 0); + base = mblock->base_pfn; + end = mblock->end_pfn; + } + + it->mi_mnode = mnode; + it->mi_ra_to_pa = btop(mblock->ra_to_pa); + it->mi_mblock_base = base; + it->mi_mblock_end = end; + it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */ + it->mi_mnode_pfn_shift = home_mask_pfn_shift; + it->mi_mnode_mask = max_locality_groups - 1; + if (pfn < base) + pfn = base; + else if (pfn > end) + pfn = (pfn_t)-1; + return (pfn); +} + +/* + * plat_mem_node_intersect_range() + * + * Find the intersection between a memnode and a range of pfn's. + */ +void +plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len, + int mnode, pgcnt_t *npages_out) +{ + pfn_t offset, len, hole, base, end, test_end, frag; + pfn_t nearest; + mem_stripe_t *ms; + int i, npages; + + *npages_out = 0; + + if (!mem_node_config[mnode].exists || test_len == 0) + return; + + base = mem_node_config[mnode].physbase; + end = mem_node_config[mnode].physmax; + + test_end = test_base + test_len - 1; + if (end < test_base || base > test_end) + return; + + if (n_locality_groups == 1) { + *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1; + return; + } + + hole = mnode_stride - mnode_pages; + npages = 0; + + /* + * Iterate over all the stripes for this mnode (one per mblock), + * find the intersection with each, and accumulate the intersections. + * + * Determing the intersection with a stripe is tricky. If base or end + * fall outside the mem_node bounds, round them to physbase/physmax of + * mem_node. 
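Stepping back to the iterator interface defined above: plat_mem_node_iterator_init() is paired with the CPU module's page_next_pfn_for_color_cpu(), which advances the pfn and switches mblocks internally. A hedged sketch of how a caller in the page freelist code might drive the pair; this is a simplified picture (hypothetical function name, no color check on the starting pfn), not code from the patch:

/* Sketch only: walk the pfns of one color class within a memory node. */
static void
walk_mnode_color(int mnode, pfn_t start, uchar_t szc, uint_t color,
    uint_t ceq_mask, uint_t color_mask)
{
	mem_node_iterator_t it;
	pfn_t pfn;

	pfn = plat_mem_node_iterator_init(start, mnode, &it, 1);
	while (pfn != (pfn_t)-1) {
		/* ... examine the page at pfn ... */

		/* returns the next matching pfn, or (pfn_t)-1 at the end */
		pfn = page_next_pfn_for_color_cpu(pfn, szc, color,
		    ceq_mask, color_mask, &it);
	}
}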
If base or end fall in a gap, round them to start of + * nearest stripe. If they fall within a stripe, keep base or end, + * but calculate the fragment size that should be excluded from the + * stripe. Calculate how many strides fall in the adjusted range, + * multiply by stripe width, and add the start and end fragments. + */ + + for (i = mnode; i < n_mem_stripes; i += max_locality_groups) { + ms = &mem_stripes[i]; + if (ms->exists && + test_base <= (end = ms->physmax) && + test_end >= (base = ms->physbase)) { + + offset = ms->offset; + + if (test_base > base) { + /* Round test_base to next multiple of stride */ + len = P2ROUNDUP(test_base - (base - offset), + mnode_stride); + nearest = base - offset + len; + /* + * Compute distance from test_base to the + * stride boundary to see if test_base falls + * in the stripe or in the hole. + */ + if (nearest - test_base > hole) { + /* + * test_base lies in stripe, + * and offset should be excluded. + */ + offset = test_base - + (nearest - mnode_stride); + base = test_base; + } else { + /* round up to next stripe start */ + offset = 0; + base = nearest; + if (base > end) + continue; + } + + } + + if (test_end < end) + end = test_end; + end++; /* adjust to an exclusive bound */ + + /* Round end to next multiple of stride */ + len = P2ROUNDUP(end - (base - offset), mnode_stride); + nearest = (base - offset) + len; + if (nearest - end <= hole) { + /* end falls in hole, use entire last stripe */ + frag = 0; + } else { + /* end falls in stripe, compute fragment */ + frag = nearest - hole - end; + } + + len = (len >> stripe_shift) - offset - frag; + npages += len; + } + } + + *npages_out = npages; +} + +/* + * valid_pages() + * + * Return 1 if pages are valid and do not cross mnode boundaries + * (which would break page free list assumptions), and 0 otherwise. + */ + +#define MNODE(pa) \ + ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift) + +static int +valid_pages(md_t *md, mde_cookie_t cpu0) +{ + int i, max_szc; + uint64_t last_page_base, szc_mask; + uint64_t max_page_len, max_coalesce_len; + struct mblock_md *mb = mpo_mblock; + + /* + * Find the smaller of the largest page possible and supported. + * mmu_exported_pagesize_mask is not yet initialized, so read + * it from the MD. Apply minimal fixups in case of broken MDs + * to get a sane mask. + */ + + if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask)) + szc_mask = 0; + szc_mask |= (1 << TTE4M); /* largest in sun4v default support */ + max_szc = highbit(szc_mask) - 1; + if (max_szc > TTE256M) + max_szc = TTE256M; + max_page_len = TTEBYTES(max_szc); + + /* + * Page coalescing code coalesces all sizes up to 256M on sun4v, even + * if mmu-page-size-list does not contain it, so 256M pages must fall + * within one mnode to use MPO. + */ + max_coalesce_len = TTEBYTES(TTE256M); + ASSERT(max_coalesce_len >= max_page_len); + + if (ptob(mnode_pages) < max_coalesce_len) { + MPO_STATUS("Page too large; MPO disabled: page = %lx, " + "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages)); + return (0); + } + + for (i = 0; i < n_mblocks; i++) { + uint64_t base = mb->base; + uint64_t end = mb->base + mb->size - 1; + uint64_t ra_to_pa = mb->ra_to_pa; + + /* + * If mblock is smaller than the max page size, then + * RA = PA mod MAXPAGE is not guaranteed, but it must + * not span mnodes. 
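The stripe-intersection arithmetic above is dense, so here is a worked example with hypothetical numbers (one mnode out of four, 0x100-page stripes) that can be checked by hand against the loop:

/*
 * Worked example (hypothetical): mnode_pages = 0x100, mnode_stride = 0x400,
 * hole = 0x300, stripe_shift = 2.  One stripe of this mnode has
 * physbase = 0x1000, physmax = 0x1CFF, offset = 0, so its pieces are
 * [0x1000,0x10FF] [0x1400,0x14FF] [0x1800,0x18FF] [0x1C00,0x1CFF].
 * Intersect with test_base = 0x1450, test_end = 0x1A7F:
 *   test_base > physbase:  len = P2ROUNDUP(0x450, 0x400) = 0x800,
 *     nearest = 0x1800; 0x1800 - 0x1450 = 0x3B0 > hole, so test_base lies
 *     inside a stripe piece: base = 0x1450, offset = 0x50.
 *   end = test_end + 1 = 0x1A80:  len = P2ROUNDUP(0x680, 0x400) = 0x800,
 *     nearest = 0x1C00; 0x1C00 - 0x1A80 = 0x180 <= hole, so end falls in a
 *     gap and frag = 0.
 *   pages = (0x800 >> 2) - 0x50 - 0 = 0x1B0, which matches the visible
 *   overlap: [0x1450,0x14FF] (0xB0 pages) plus [0x1800,0x18FF] (0x100 pages).
 */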
+ */ + if (mb->size < max_page_len) { + if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) { + MPO_STATUS("Small mblock spans mnodes; " + "MPO disabled: base = %lx, end = %lx, " + "ra2pa = %lx\n", base, end, ra_to_pa); + return (0); + } + } else { + /* Verify RA = PA mod MAXPAGE, using coalesce size */ + uint64_t pa_base = base + ra_to_pa; + if ((base & (max_coalesce_len - 1)) != + (pa_base & (max_coalesce_len - 1))) { + MPO_STATUS("bad page alignment; MPO disabled: " + "ra = %lx, pa = %lx, pagelen = %lx\n", + base, pa_base, max_coalesce_len); + return (0); + } + } + + /* + * Find start of last large page in mblock in RA space. + * If page extends into the next mblock, verify the + * mnode does not change. + */ + last_page_base = P2ALIGN(end, max_coalesce_len); + if (i + 1 < n_mblocks && + last_page_base + max_coalesce_len > mb[1].base && + MNODE(last_page_base + ra_to_pa) != + MNODE(mb[1].base + mb[1].ra_to_pa)) { + MPO_STATUS("Large page spans mblocks; MPO disabled: " + "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, " + "pagelen = %lx\n", end, ra_to_pa, mb[1].base, + mb[1].ra_to_pa, max_coalesce_len); + return (0); + } + + mb++; + } + return (1); +} + + +/* + * fix_interleave() - Find lgroups with sub-page sized memory interleave, + * if any, and remove them. This yields a config where the "coarse + * grained" lgroups cover all of memory, even though part of that memory + * is fine grain interleaved and does not deliver a purely local memory + * latency. + * + * This function reads and modifies the globals: + * mpo_lgroup[], n_lgrpnodes + * + * Returns 1 if lgroup nodes were removed, 0 otherwise. + */ + +static int +fix_interleave(void) +{ + int i, j; + uint64_t mask = 0; + + j = 0; + for (i = 0; i < n_lgrpnodes; i++) { + if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) { + /* remove this lgroup */ + mask = mpo_lgroup[i].addr_mask; + } else { + mpo_lgroup[j++] = mpo_lgroup[i]; + } + } + n_lgrpnodes = j; + + if (mask != 0) + MPO_STATUS("sub-page interleave %lx found; " + "removing lgroup.\n", mask); + + return (mask != 0); +} diff --git a/usr/src/uts/sun4v/sys/cpu_module.h b/usr/src/uts/sun4v/sys/cpu_module.h index 0786951416..2d7c909a0f 100644 --- a/usr/src/uts/sun4v/sys/cpu_module.h +++ b/usr/src/uts/sun4v/sys/cpu_module.h @@ -146,11 +146,12 @@ extern void bzero(void *addr, size_t count); int cpu_trapstat_conf(int cmd); void cpu_trapstat_data(void *buf, uint_t pgszs); - +/* Used by the fill_cpu() function */ #define NO_MAPPING_FOUND 0xffffffff #define NO_EU_MAPPING_FOUND NO_MAPPING_FOUND #define NO_CHIP_MAPPING_FOUND NO_MAPPING_FOUND #define NO_CORE_MAPPING_FOUND NO_MAPPING_FOUND +#define NO_L2_CACHE_MAPPING_FOUND NO_MAPPING_FOUND /* * Default MMU pagesize mask for sun4v architecture. 
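Looking back at fix_interleave() above: an lgroup node is dropped when its address-mask has bits set below the base page size. A short hypothetical example of the test, assuming 8K base pages:

/*
 * Hypothetical example for fix_interleave(): with 8K pages,
 * PAGEOFFSET = 0x1fff, so
 *   addr_mask = 0xC0000000  ->  (mask & PAGEOFFSET) == 0, lgroup is kept;
 *   addr_mask = 0x00000600  ->  (mask & PAGEOFFSET) != 0, memory is
 *                               interleaved below the page size and the
 *                               lgroup node is removed.
 */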
*/ diff --git a/usr/src/uts/sun4v/sys/machcpuvar.h b/usr/src/uts/sun4v/sys/machcpuvar.h index 921d6c584d..2113747d55 100644 --- a/usr/src/uts/sun4v/sys/machcpuvar.h +++ b/usr/src/uts/sun4v/sys/machcpuvar.h @@ -104,6 +104,7 @@ typedef struct ptl1_state { */ #define CPU_CHIPID_INVALID -1 #define CPU_COREID_INVALID -1 +#define CPU_L2_CACHEID_INVALID -1 /* * Machine specific fields of the cpu struct @@ -177,6 +178,7 @@ struct machcpu { uint16_t *cpu_list; /* uint16_t [NCPU] */ uint64_t cpu_list_ra; /* cpu list ra */ id_t cpu_ipipe; /* cpu int exec unit id */ + id_t cpu_mpipe; /* cpu memory pipe id */ id_t cpu_fpu; /* cpu fpu unit id */ id_t cpu_core; /* cpu core id */ id_t cpu_chip; /* cpu chip id */ diff --git a/usr/src/uts/sun4v/sys/mpo.h b/usr/src/uts/sun4v/sys/mpo.h new file mode 100644 index 0000000000..e390b5e483 --- /dev/null +++ b/usr/src/uts/sun4v/sys/mpo.h @@ -0,0 +1,112 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_MPO_H +#define _SYS_MPO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * mpo.h - Sun4v MPO common header file + * + */ +#define MPO_MAX_MBLOCKS 16 +#define MAX_MEM_STRIPES (MAX_MEM_NODES * MPO_MAX_MBLOCKS) + +#define PROP_LG_CPU_ID "id" +#define PROP_LG_MASK "address-mask" +#define PROP_LG_LATENCY "latency" +#define PROP_LG_MATCH "address-match" +#define PROP_LG_MEM_LG "memory-latency-group" +#define PROP_LG_CPU "cpu" +#define PROP_LG_MBLOCK "mblock" +#define PROP_LG_BASE "base" +#define PROP_LG_SIZE "size" +#define PROP_LG_RA_PA_OFFSET "address-congruence-offset" + +/* Macro to set the correspending bit if an mem-lg homeid is a member */ +#define HOMESET_ADD(homeset, home)\ + homeset |= ((int)1 << (home)) + +/* Macro to check if an mem_lg homeid is a member of the homeset */ +#define MEM_LG_ISMEMBER(homeset, home)\ + ((homeset) & ((uint64_t)1 << (home))) + +/* Structure to store CPU information from the MD */ + +struct cpu_md { + uint_t home; + uint64_t latency; +}; + +/* Structure to store mem-lg information from the MD */ + +struct lgrp_md { + uint64_t id; + uint64_t addr_mask; + uint64_t addr_match; + uint64_t latency; + mde_cookie_t node; + int ncpu; +}; + +/* Structure to store mblock information retrieved from the MD */ + +struct mblock_md { + uint64_t base; + uint64_t size; + uint64_t ra_to_pa; + mde_cookie_t node; + pfn_t base_pfn; + pfn_t end_pfn; +}; + +/* Structure for memnode information for use by plat_pfn_to_mem_node */ + +struct mnode_info { + pfn_t base_pfn; + pfn_t end_pfn; +}; + +/* A stripe defines the portion of a mem_node that falls in one mblock */ +typedef struct { + pfn_t physbase; /* first page in mnode in the corresponding mblock */ + pfn_t physmax; /* last valid page in mnode in mblock */ + pfn_t offset; /* stripe starts at physbase - offset */ + int exists; /* set to 1 if mblock has memory in this mnode stripe */ +} mem_stripe_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MPO_H */ |
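As a closing note on the homeset macros in mpo.h above, a small sketch (not from the patch; the function name is hypothetical) of how the bitmask is meant to be used, mirroring what lgrp_traverse() and unique_home_mem_lg_count() do in mpo.c:

/* Sketch only: count the distinct home ids recorded in a homeset. */
static int
count_homes(uint64_t homeset)
{
	int home, count = 0;

	for (home = 0; home < NLGRPS_MAX; home++) {
		if (MEM_LG_ISMEMBER(homeset, home))
			count++;
	}
	return (count);
}

/* Each lgroup contributes HOMESET_ADD(homeset, addr_match >> home_mask_shift). */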