author     dp78419 <none@none>  2007-07-31 16:27:12 -0700
committer  dp78419 <none@none>  2007-07-31 16:27:12 -0700
commit     ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2 (patch)
tree       772441639680866ab4a841bbef119c6a813e6c09
parent     79777a7dd0179283917bda2ba98999c382d31c2c (diff)
download   illumos-joyent-ce8eb11a8717b4a57c68fd77ab9f8aac15b16bf2.tar.gz
PSARC 2006/675 MPO for Victoria Falls/Maramba project
6539930 MPO for sun4v platforms
-rw-r--r--  usr/src/uts/common/os/mem_cage.c          24
-rw-r--r--  usr/src/uts/common/sys/lgrp.h              9
-rw-r--r--  usr/src/uts/common/sys/pghw.h              7
-rw-r--r--  usr/src/uts/common/vm/vm_pagelist.c      345
-rw-r--r--  usr/src/uts/i86pc/os/memnode.c            13
-rw-r--r--  usr/src/uts/i86pc/os/mp_machdep.c         27
-rw-r--r--  usr/src/uts/i86pc/sys/memnode.h           11
-rw-r--r--  usr/src/uts/i86pc/vm/vm_dep.h             40
-rw-r--r--  usr/src/uts/sun4/os/memnode.c             70
-rw-r--r--  usr/src/uts/sun4/sys/memnode.h            17
-rw-r--r--  usr/src/uts/sun4/vm/vm_dep.h             153
-rw-r--r--  usr/src/uts/sun4u/os/cmp.c                27
-rw-r--r--  usr/src/uts/sun4v/Makefile.files           1
-rw-r--r--  usr/src/uts/sun4v/Makefile.sun4v.shared    1
-rw-r--r--  usr/src/uts/sun4v/cpu/generic.c            5
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara.c            3
-rw-r--r--  usr/src/uts/sun4v/cpu/niagara2.c         284
-rw-r--r--  usr/src/uts/sun4v/os/cmp.c                37
-rw-r--r--  usr/src/uts/sun4v/os/mpo.c              1264
-rw-r--r--  usr/src/uts/sun4v/sys/cpu_module.h         3
-rw-r--r--  usr/src/uts/sun4v/sys/machcpuvar.h         2
-rw-r--r--  usr/src/uts/sun4v/sys/mpo.h              112
22 files changed, 2197 insertions(+), 258 deletions(-)
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c
index beb2fe3cbe..820fe555dd 100644
--- a/usr/src/uts/common/os/mem_cage.c
+++ b/usr/src/uts/common/os/mem_cage.c
@@ -335,7 +335,7 @@ kcage_next_range(int incage, pfn_t lo, pfn_t hi,
rw_enter(&kcage_range_rwlock, RW_READER);
for (lp = incage ? kcage_glist : kcage_current_glist;
- lp != NULL; lp = lp->next) {
+ lp != NULL; lp = lp->next) {
pfn_t klo, khi;
@@ -886,7 +886,7 @@ kcage_recalc_preferred_size(pgcnt_t preferred_size)
segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
}
kcage_kmemlp_mincage = MIN(lpmincage,
- (segkmem_kmemlp_max / PAGESIZE));
+ (segkmem_kmemlp_max / PAGESIZE));
preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
}
return (preferred_size);
@@ -1006,11 +1006,7 @@ kcage_init(pgcnt_t preferred_size)
*/
if (SEGKMEM_USE_LARGEPAGES) {
extern void page_freelist_coalesce_all(int mnode);
- extern int max_mem_nodes;
- int mnode, max_mnodes = max_mem_nodes;
- for (mnode = 0; mnode < max_mnodes; mnode++) {
- page_freelist_coalesce_all(mnode);
- }
+ page_freelist_coalesce_all(-1); /* do all mnodes */
}
ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
@@ -1288,7 +1284,7 @@ kcage_freemem_add(pgcnt_t npages)
wakeup_pcgs(); /* wakeup threads in pcgs() */
if (kcage_needfree != 0 &&
- kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
+ kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
mutex_enter(&kcage_throttle_mutex);
cv_broadcast(&kcage_throttle_cv);
@@ -1467,7 +1463,7 @@ kcage_expand()
* have enough free pages to page_relocate() even a single page.
*/
wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
- - kcage_freemem;
+ - kcage_freemem;
if (wanted <= 0)
return (0);
else if (freemem < pageout_reserve + 1) {
@@ -1683,7 +1679,7 @@ kcage_cageout()
#endif
CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
- callb_generic_cpr, "cageout");
+ callb_generic_cpr, "cageout");
mutex_enter(&kcage_cageout_mutex);
kcage_cageout_thread = curthread;
@@ -1724,7 +1720,7 @@ again:
pages_skipped = 0;
shared_skipped = 0;
while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
- (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
+ (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
if (start_pfn == PFN_INVALID)
start_pfn = pfn;
@@ -1820,7 +1816,7 @@ again:
* In pass {0, 1, 2}, skip page if mod bit is set.
*/
prm = hat_pagesync(pp,
- HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
+ HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
/* On first pass ignore ref'd pages */
if (pass <= 1 && (prm & P_REF)) {
@@ -1833,7 +1829,7 @@ again:
/* On pass 2, page_destroy if mod bit is not set */
if (pass <= 2) {
if (pp->p_szc != 0 || (prm & P_MOD) ||
- pp->p_lckcnt || pp->p_cowcnt) {
+ pp->p_lckcnt || pp->p_cowcnt) {
pages_skipped = 1;
page_unlock(pp);
} else {
@@ -1843,7 +1839,7 @@ again:
* checking if mod bit is set
*/
(void) hat_pageunload(pp,
- HAT_FORCE_PGUNLOAD);
+ HAT_FORCE_PGUNLOAD);
/*
* skip this page if modified
diff --git a/usr/src/uts/common/sys/lgrp.h b/usr/src/uts/common/sys/lgrp.h
index c0ed75d981..48ad8e8757 100644
--- a/usr/src/uts/common/sys/lgrp.h
+++ b/usr/src/uts/common/sys/lgrp.h
@@ -598,6 +598,15 @@ int lgrp_plat_latency(lgrp_handle_t, lgrp_handle_t);
lgrp_handle_t lgrp_plat_root_hand(void);
void lgrp_plat_probe(void);
+extern uint32_t lgrp_expand_proc_thresh;
+extern uint32_t lgrp_expand_proc_diff;
+extern pgcnt_t lgrp_mem_free_thresh;
+extern uint32_t lgrp_loadavg_tolerance;
+extern uint32_t lgrp_loadavg_max_effect;
+extern uint32_t lgrp_load_thresh;
+extern lgrp_mem_policy_t lgrp_mem_policy_root;
+extern int tsb_lgrp_affinity;
+
#endif /* _KERNEL && _KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/pghw.h b/usr/src/uts/common/sys/pghw.h
index e78be92032..f22afc021b 100644
--- a/usr/src/uts/common/sys/pghw.h
+++ b/usr/src/uts/common/sys/pghw.h
@@ -52,17 +52,12 @@ typedef enum pghw_type {
PGHW_CACHE,
PGHW_FPU,
PGHW_MPIPE,
+ PGHW_CHIP,
PGHW_MEMORY,
PGHW_NUM_COMPONENTS
} pghw_type_t;
/*
- * Consider the physical processor sharing relationship
- * equivalant to a shared pipe to memory.
- */
-#define PGHW_CHIP PGHW_MPIPE
-
-/*
* Anonymous instance id
*/
#define PGHW_INSTANCE_ANON ((id_t)0xdecafbad)
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
index cef95452bf..d45b8cd0fe 100644
--- a/usr/src/uts/common/vm/vm_pagelist.c
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -497,21 +497,37 @@ page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
color &= ceq_mask;
- ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc);
+ ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
return (color | (ncolor & ~ceq_mask));
}
/*
+ * The interleaved_mnodes flag is set when mnodes overlap in
+ * the physbase..physmax range, but have disjoint slices.
+ * In this case hpm_counters is shared by all mnodes.
+ * This flag is set dynamically by the platform.
+ */
+int interleaved_mnodes = 0;
+
+/*
* Called by startup().
* Size up the per page size free list counters based on physmax
* of each node and max_mem_nodes.
+ *
+ * If interleaved_mnodes is set we need to find the first mnode that
+ * exists. hpm_counters for the first mnode will then be shared by
+ * all other mnodes. If interleaved_mnodes is not set, just set
+ * first=mnode each time. That means there will be no sharing.
*/
size_t
page_ctrs_sz(void)
{
int r; /* region size */
int mnode;
+ int firstmn; /* first mnode that exists */
int nranges;
+ pfn_t physbase;
+ pfn_t physmax;
uint_t ctrs_sz = 0;
int i;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
@@ -525,7 +541,7 @@ page_ctrs_sz(void)
colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
}
- for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
pgcnt_t r_pgcnt;
pfn_t r_base;
@@ -534,6 +550,7 @@ page_ctrs_sz(void)
if (mem_node_config[mnode].exists == 0)
continue;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
nranges = MNODE_RANGE_CNT(mnode);
mnode_nranges[mnode] = nranges;
mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
@@ -543,22 +560,25 @@ page_ctrs_sz(void)
* base aligned to large page size.
*/
for (r = 1; r < mmu_page_sizes; r++) {
+ /* add in space for hpm_color_current */
+ ctrs_sz += sizeof (size_t) *
+ colors_per_szc[r] * nranges;
+
+ if (firstmn != mnode)
+ continue;
+
/* add in space for hpm_counters */
r_align = page_get_pagecnt(r);
- r_base = mem_node_config[mnode].physbase;
+ r_base = physbase;
r_base &= ~(r_align - 1);
- r_pgcnt = howmany(mem_node_config[mnode].physmax -
- r_base + 1, r_align);
+ r_pgcnt = howmany(physmax - r_base + 1, r_align);
+
/*
* Round up to always allocate on pointer sized
* boundaries.
*/
ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
sizeof (hpmctr_t *));
-
- /* add in space for hpm_color_current */
- ctrs_sz += sizeof (size_t) *
- colors_per_szc[r] * nranges;
}
}
@@ -605,6 +625,9 @@ page_ctrs_alloc(caddr_t alloc_base)
int mrange, nranges;
int r; /* region size */
int i;
+ int firstmn; /* first mnode that exists */
+ pfn_t physbase;
+ pfn_t physmax;
pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
/*
@@ -660,7 +683,7 @@ page_ctrs_alloc(caddr_t alloc_base)
/* initialize page list counts */
PLCNT_INIT(alloc_base);
- for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
pgcnt_t r_pgcnt;
pfn_t r_base;
@@ -671,6 +694,8 @@ page_ctrs_alloc(caddr_t alloc_base)
if (mem_node_config[mnode].exists == 0)
continue;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
+
for (r = 1; r < mmu_page_sizes; r++) {
/*
* the page_counters base has to be aligned to the
@@ -678,11 +703,10 @@ page_ctrs_alloc(caddr_t alloc_base)
* will cross large page boundaries.
*/
r_align = page_get_pagecnt(r);
- r_base = mem_node_config[mnode].physbase;
+ r_base = physbase;
/* base needs to be aligned - lower to aligned value */
r_base &= ~(r_align - 1);
- r_pgcnt = howmany(mem_node_config[mnode].physmax -
- r_base + 1, r_align);
+ r_pgcnt = howmany(physmax - r_base + 1, r_align);
r_shift = PAGE_BSZS_SHIFT(r);
PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
@@ -699,9 +723,12 @@ page_ctrs_alloc(caddr_t alloc_base)
pfn_t pfnum = r_base;
size_t idx;
int mrange;
+ MEM_NODE_ITERATOR_DECL(it);
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ ASSERT(pfnum != (pfn_t)-1);
PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
- color_mask, color_mask);
+ color_mask, color_mask, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
idx = (idx >= r_pgcnt) ? 0 : idx;
for (mrange = 0; mrange < nranges; mrange++) {
@@ -709,14 +736,18 @@ page_ctrs_alloc(caddr_t alloc_base)
r, i, mrange) = idx;
}
}
- PAGE_COUNTERS_COUNTERS(mnode, r) =
- (hpmctr_t *)alloc_base;
- /*
- * Round up to make alloc_base always be aligned on
- * a pointer boundary.
- */
- alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
- sizeof (hpmctr_t *));
+
+ /* hpm_counters may be shared by all mnodes */
+ if (firstmn == mnode) {
+ PAGE_COUNTERS_COUNTERS(mnode, r) =
+ (hpmctr_t *)alloc_base;
+ alloc_base +=
+ P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
+ sizeof (hpmctr_t *));
+ } else {
+ PAGE_COUNTERS_COUNTERS(mnode, r) =
+ PAGE_COUNTERS_COUNTERS(firstmn, r);
+ }
/*
* Verify that PNUM_TO_IDX and IDX_TO_PNUM
@@ -735,7 +766,7 @@ page_ctrs_alloc(caddr_t alloc_base)
* page_ctrs_sz() has added some slop for these roundups.
*/
alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
- L2CACHE_ALIGN);
+ L2CACHE_ALIGN);
}
/* Initialize other page counter specific data structures. */
@@ -894,6 +925,7 @@ page_ctrs_adjust(int mnode)
size_t pcsz, old_csz;
hpmctr_t *new_ctr, *old_ctr;
pfn_t oldbase, newbase;
+ pfn_t physbase, physmax;
size_t old_npgs;
hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
size_t size_cache[MMU_PAGE_SIZES];
@@ -908,15 +940,17 @@ page_ctrs_adjust(int mnode)
int old_maxmrange, new_maxmrange;
int rc = 0;
- newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
- npgs = roundup(mem_node_config[mnode].physmax,
- PC_BASE_ALIGN) - newbase;
-
cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
MMU_PAGE_SIZES, KM_NOSLEEP);
if (cands_cache == NULL)
return (ENOMEM);
+ i = -1;
+ HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
+
+ newbase = physbase & ~PC_BASE_ALIGN_MASK;
+ npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
+
/* prepare to free non-null pointers on the way out */
cands_cache_nranges = nranges;
bzero(ctr_cache, sizeof (ctr_cache));
@@ -997,8 +1031,7 @@ page_ctrs_adjust(int mnode)
* Grab the write lock to prevent others from walking these arrays
* while we are modifying them.
*/
- rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
- page_freelist_lock(mnode);
+ PAGE_CTRS_WRITE_LOCK(mnode);
old_nranges = mnode_nranges[mnode];
cands_cache_nranges = old_nranges;
@@ -1016,7 +1049,7 @@ page_ctrs_adjust(int mnode)
for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
old_color_array[mrange] =
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
- r, mrange);
+ r, mrange);
}
pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
@@ -1048,6 +1081,21 @@ page_ctrs_adjust(int mnode)
PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
PAGE_COUNTERS_BASE(mnode, r) = newbase;
+
+ /* update shared hpm_counters in other mnodes */
+ if (interleaved_mnodes) {
+ for (i = 0; i < max_mem_nodes; i++) {
+ if (i == mnode)
+ continue;
+ if (mem_node_config[i].exists == 0)
+ continue;
+ ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
+ PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
+ PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
+ PAGE_COUNTERS_BASE(i, r) = newbase;
+ }
+ }
+
for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
color_cache[r][mrange];
@@ -1059,16 +1107,27 @@ page_ctrs_adjust(int mnode)
*/
for (i = 0; i < colors_per_szc[r]; i++) {
uint_t color_mask = colors_per_szc[r] - 1;
+ int mlo = interleaved_mnodes ? 0 : mnode;
+ int mhi = interleaved_mnodes ? max_mem_nodes :
+ (mnode + 1);
+ int m;
pfn_t pfnum = newbase;
size_t idx;
+ MEM_NODE_ITERATOR_DECL(it);
- PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
- color_mask);
- idx = PNUM_TO_IDX(mnode, r, pfnum);
- idx = (idx < pcsz) ? idx : 0;
- for (mrange = 0; mrange < nranges; mrange++) {
- PAGE_COUNTERS_CURRENT_COLOR(mnode,
- r, i, mrange) = idx;
+ for (m = mlo; m < mhi; m++) {
+ if (mem_node_config[m].exists == 0)
+ continue;
+ MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
+ ASSERT(pfnum != (pfn_t)-1);
+ PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
+ color_mask, &it);
+ idx = PNUM_TO_IDX(m, r, pfnum);
+ idx = (idx < pcsz) ? idx : 0;
+ for (mrange = 0; mrange < nranges; mrange++) {
+ PAGE_COUNTERS_CURRENT_COLOR(m,
+ r, i, mrange) = idx;
+ }
}
}
@@ -1129,8 +1188,7 @@ page_ctrs_adjust(int mnode)
}
}
}
- page_freelist_unlock(mnode);
- rw_exit(&page_ctrs_rwlock[mnode]);
+ PAGE_CTRS_WRITE_UNLOCK(mnode);
/*
* Now that we have dropped the write lock, it is safe to free all
@@ -2130,6 +2188,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
size_t len, idx, idx0;
pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
page_t *ret_pp;
+ MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
pfn_t pfnum0, nlo, nhi;
#endif
@@ -2169,11 +2228,15 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
/* round to szcpgcnt boundaries */
lo = P2ROUNDUP(lo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
+ ASSERT(lo != (pfn_t)-1);
hi = hi & ~(szcpgcnt - 1);
/* set lo to the closest pfn of the right color */
- if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) {
- PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask);
+ if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
+ (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
+ PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
+ &it);
}
if (hi <= lo) {
@@ -2208,11 +2271,22 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
pfnum = IDX_TO_PNUM(mnode, r, idx0);
if (pfnum < lo || pfnum >= hi) {
pfnum = lo;
- } else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) {
- /* pfnum has invalid color get the closest correct pfn */
- PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
- color_mask);
- pfnum = (pfnum >= hi) ? lo : pfnum;
+ } else {
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ if (pfnum == (pfn_t)-1) {
+ pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ ASSERT(pfnum != (pfn_t)-1);
+ } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
+ (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
+ /* invalid color, get the closest correct pfn */
+ PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
+ color_mask, &it);
+ if (pfnum >= hi) {
+ pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
+ }
+ }
}
/* set starting index */
@@ -2239,12 +2313,16 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
/* jump to the next page in the range */
if (pfnum < nlo) {
pfnum = P2ROUNDUP(nlo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
if (idx >= len || pfnum >= hi)
goto wrapit;
- if ((PFN_2_COLOR(pfnum, szc) ^ color) &
+ if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
ceq_mask)
goto next;
+ if (interleaved_mnodes &&
+ PFN_2_MEM_NODE(pfnum) != mnode)
+ goto next;
}
}
#endif
@@ -2264,7 +2342,7 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
if (ret_pp != NULL) {
VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
- PFN_2_COLOR(pfnum, szc), mrange) = idx;
+ PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
page_freelist_unlock(mnode);
rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
@@ -2299,11 +2377,12 @@ page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
}
next:
PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
- color_mask);
+ color_mask, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
if (idx >= len || pfnum >= hi) {
wrapit:
pfnum = lo;
+ MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
idx = PNUM_TO_IDX(mnode, r, pfnum);
wrap++;
#if defined(__sparc)
@@ -2319,14 +2398,17 @@ wrapit:
/*
* For the given mnode, promote as many small pages to large pages as possible.
+ * mnode can be -1, which means do them all
*/
void
page_freelist_coalesce_all(int mnode)
{
int r; /* region size */
int idx, full;
- pfn_t pfnum;
size_t len;
+ int doall = interleaved_mnodes || mnode < 0;
+ int mlo = doall ? 0 : mnode;
+ int mhi = doall ? max_mem_nodes : (mnode + 1);
VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
@@ -2340,39 +2422,54 @@ page_freelist_coalesce_all(int mnode)
* Always promote to the largest page possible
* first to reduce the number of page promotions.
*/
- rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
- page_freelist_lock(mnode);
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
+ page_freelist_lock(mnode);
+ }
for (r = mmu_page_sizes - 1; r > 0; r--) {
- pgcnt_t cands = 0;
- int mrange, nranges = mnode_nranges[mnode];
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ pgcnt_t cands = 0;
+ int mrange, nranges = mnode_nranges[mnode];
- for (mrange = 0; mrange < nranges; mrange++) {
- PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
- if (cands != 0)
- break;
- }
- if (cands == 0) {
- VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
- continue;
- }
+ for (mrange = 0; mrange < nranges; mrange++) {
+ PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
+ if (cands != 0)
+ break;
+ }
+ if (cands == 0) {
+ VM_STAT_ADD(vmm_vmstats.
+ page_ctrs_cands_skip_all);
+ continue;
+ }
- full = FULL_REGION_CNT(r);
- len = PAGE_COUNTERS_ENTRIES(mnode, r);
-
- for (idx = 0; idx < len; idx++) {
- if (PAGE_COUNTERS(mnode, r, idx) == full) {
- pfnum = IDX_TO_PNUM(mnode, r, idx);
- ASSERT(pfnum >=
- mem_node_config[mnode].physbase &&
- pfnum <
- mem_node_config[mnode].physmax);
- (void) page_promote(mnode,
- pfnum, r, PC_FREE, PC_MTYPE_ANY);
+ full = FULL_REGION_CNT(r);
+ len = PAGE_COUNTERS_ENTRIES(mnode, r);
+
+ for (idx = 0; idx < len; idx++) {
+ if (PAGE_COUNTERS(mnode, r, idx) == full) {
+ pfn_t pfnum =
+ IDX_TO_PNUM(mnode, r, idx);
+ int tmnode = interleaved_mnodes ?
+ PFN_2_MEM_NODE(pfnum) : mnode;
+
+ ASSERT(pfnum >=
+ mem_node_config[tmnode].physbase &&
+ pfnum <
+ mem_node_config[tmnode].physmax);
+
+ (void) page_promote(tmnode,
+ pfnum, r, PC_FREE, PC_MTYPE_ANY);
+ }
}
+ /* shared hpm_counters covers all mnodes, so we quit */
+ if (interleaved_mnodes)
+ break;
}
}
- page_freelist_unlock(mnode);
- rw_exit(&page_ctrs_rwlock[mnode]);
+ for (mnode = mlo; mnode < mhi; mnode++) {
+ page_freelist_unlock(mnode);
+ rw_exit(&page_ctrs_rwlock[mnode]);
+ }
}
/*
@@ -2601,22 +2698,22 @@ page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
/* we can split pages in the freelist, but not the cachelist */
if (can_split) {
- plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
+ plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
- /* calculate next sizes color masks and number of free list bins */
- for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
- plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
- plw->plw_ceq_mask[szc]);
- plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
- }
- plw->plw_ceq_mask[nszc] = INVALID_MASK;
- plw->plw_bins[nszc] = 0;
+ /* set next szc color masks and number of free list bins */
+ for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
+ plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
+ plw->plw_ceq_mask[szc]);
+ plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
+ }
+ plw->plw_ceq_mask[nszc] = INVALID_MASK;
+ plw->plw_bins[nszc] = 0;
} else {
- ASSERT(szc == 0);
- plw->plw_do_split = 0;
- plw->plw_bins[1] = 0;
- plw->plw_ceq_mask[1] = INVALID_MASK;
+ ASSERT(szc == 0);
+ plw->plw_do_split = 0;
+ plw->plw_bins[1] = 0;
+ plw->plw_ceq_mask[1] = INVALID_MASK;
}
}
@@ -2664,7 +2761,7 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
plw->plw_bin_marker =
nbin = INC_MASKED(nbin, neq_mask,
- plw->plw_color_mask);
+ plw->plw_color_mask);
plw->plw_bin_split_prev = plw->plw_bin0;
/*
* large pages all have the same vac color
@@ -2710,10 +2807,10 @@ page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
}
if (plw->plw_bins[nszc] != 0) {
- nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
- if (!((plw->plw_split_next ^ nbin_nsz) &
- plw->plw_ceq_mask[nszc]))
- plw->plw_do_split = 1;
+ nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
+ if (!((plw->plw_split_next ^ nbin_nsz) &
+ plw->plw_ceq_mask[nszc]))
+ plw->plw_do_split = 1;
}
return (nbin);
@@ -2864,8 +2961,8 @@ bin_empty_1:
*/
if (plw.plw_do_split &&
(pp = page_freelist_split(szc, bin, mnode,
- mtype, PFNNULL, &plw)) != NULL)
- return (pp);
+ mtype, PFNNULL, &plw)) != NULL)
+ return (pp);
if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
@@ -3229,6 +3326,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
uint_t color_mask;
pfn_t hi, lo;
uint_t skip;
+ MEM_NODE_ITERATOR_DECL(it);
ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
@@ -3308,6 +3406,7 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
/* round to szcpgcnt boundaries */
lo = P2ROUNDUP(lo, szcpgcnt);
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
hi = hi & ~(szcpgcnt - 1);
if (hi <= lo)
@@ -3318,10 +3417,14 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
* page sizes may only have a single page color
*/
skip = szcpgcnt;
- if (ceq_mask > 0) {
+ if (ceq_mask > 0 || interleaved_mnodes) {
/* set lo to point at appropriate color */
- PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
- color_mask);
+ if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
+ (interleaved_mnodes &&
+ PFN_2_MEM_NODE(lo) != mnode)) {
+ PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
+ color_mask, &it);
+ }
if (hi <= lo)
/* mseg cannot satisfy color request */
continue;
@@ -3331,10 +3434,15 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
randpfn = (pfn_t)GETTICK();
randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
- if (ceq_mask) {
- PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask,
- color_mask);
- randpfn = (randpfn >= hi) ? lo : randpfn;
+ MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
+ if (ceq_mask || interleaved_mnodes) {
+ if (randpfn != (pfn_t)-1)
+ PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
+ ceq_mask, color_mask, &it);
+ if (randpfn >= hi) {
+ randpfn = lo;
+ MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
+ }
}
randpp = mseg->pages + (randpfn - mseg->pages_base);
@@ -3357,17 +3465,23 @@ page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
}
}
- if (ceq_mask == 0) {
+ if (ceq_mask == 0 && !interleaved_mnodes) {
pp += skip;
} else {
pfn_t pfn = pp->p_pagenum;
PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
- ceq_mask, color_mask);
- pp = mseg->pages + (pfn - mseg->pages_base);
+ ceq_mask, color_mask, &it);
+ if (pfn == (pfn_t)-1) {
+ pp = endpp;
+ } else {
+ pp = mseg->pages +
+ (pfn - mseg->pages_base);
+ }
}
if (pp >= endpp) {
/* start from the beginning */
+ MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
pp = mseg->pages + (lo - mseg->pages_base);
ASSERT(pp->p_pagenum == lo);
ASSERT(pp + szcpgcnt <= endpp);
@@ -3947,9 +4061,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
- pplist = page_get_mnode_freelist(
- mnode, bin, mtype, szc,
- flags);
+ pplist =
+ page_get_mnode_freelist(mnode, bin,
+ mtype, szc, flags);
}
/*
@@ -3968,8 +4082,9 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode = lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
- pplist = page_get_mnode_cachelist(
- bin, flags, mnode, mtype);
+ pplist =
+ page_get_mnode_cachelist(bin, flags,
+ mnode, mtype);
}
if (pplist != NULL) {
page_hashout(pplist, NULL);
@@ -4079,11 +4194,11 @@ page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
while ((pplist == NULL) &&
(mnode =
- lgrp_memnode_choose(&lgrp_cookie))
+ lgrp_memnode_choose(&lgrp_cookie))
!= -1) {
pplist = page_get_contig_pages(
- mnode, bin, mtype, szc,
- flags | PGI_PGCPHIPRI);
+ mnode, bin, mtype, szc,
+ flags | PGI_PGCPHIPRI);
}
break;
}
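[Illustrative sketch, not part of this changeset: how the new iterator-aware
color macros fit together. It mirrors the walk done by page_freelist_coalesce()
above; lo, hi, mnode, szc, color, ceq_mask and color_mask are assumed to be
already in scope, and color is a subset of ceq_mask.]

    MEM_NODE_ITERATOR_DECL(it);
    pfn_t pfn = lo;

    /* bind the iterator to this mnode; on sun4v this may yield (pfn_t)-1 */
    MEM_NODE_ITERATOR_INIT(pfn, mnode, &it);

    /* align the starting pfn to the requested color, as the code above does */
    if (pfn != (pfn_t)-1 &&
        ((PFN_2_COLOR(pfn, szc, &it) ^ color) & ceq_mask)) {
            PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
                color_mask, &it);
    }

    while (pfn != (pfn_t)-1 && pfn < hi) {
            /* ... examine the candidate page at pfn ... */
            PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask,
                color_mask, &it);
    }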
diff --git a/usr/src/uts/i86pc/os/memnode.c b/usr/src/uts/i86pc/os/memnode.c
index e64fd2b0c6..9440ad17f6 100644
--- a/usr/src/uts/i86pc/os/memnode.c
+++ b/usr/src/uts/i86pc/os/memnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -152,7 +151,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
if (!cancelled) {
delta_pgcnt = end - start;
node_size = mem_node_config[mnode].physmax -
- mem_node_config[mnode].physbase;
+ mem_node_config[mnode].physbase;
if (node_size > delta_pgcnt) {
/*
@@ -232,7 +231,7 @@ mem_node_alloc()
*/
for (mnode = 0; mnode < max_mem_nodes; mnode++)
if (cas32((uint32_t *)&mem_node_config[mnode].exists,
- 0, 1) == 0)
+ 0, 1) == 0)
break;
if (mnode >= max_mem_nodes)
@@ -273,7 +272,7 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist)
for (pmem = mlist; pmem; pmem = pmem->next) {
cur_base = btop(pmem->address);
cur_end = cur_base + btop(pmem->size) - 1;
- if (end <= cur_base || base >= cur_end)
+ if (end < cur_base || base > cur_end)
continue;
npgs = npgs + (MIN(cur_end, end) -
MAX(cur_base, base)) + 1;
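[Illustrative sketch, not part of this changeset: why the intersection test
changes from <= / >= to < / >. Both the mnode bounds and each memlist entry
are inclusive [base, end] pfn ranges, so ranges that merely touch at an
endpoint still share one page and must not be skipped.]

    /* inclusive ranges overlap iff neither ends before the other starts */
    static int
    pfn_ranges_overlap(pfn_t base, pfn_t end, pfn_t cur_base, pfn_t cur_end)
    {
            return (!(end < cur_base || base > cur_end));
    }

    /* e.g. [0x100, 0x1ff] and [0x1ff, 0x2ff] share pfn 0x1ff: they overlap */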
diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c
index 4605f6e517..a44c266f27 100644
--- a/usr/src/uts/i86pc/os/mp_machdep.c
+++ b/usr/src/uts/i86pc/os/mp_machdep.c
@@ -1343,3 +1343,30 @@ mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
}
return (PSM_SUCCESS);
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_CHIP)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/i86pc/sys/memnode.h b/usr/src/uts/i86pc/sys/memnode.h
index c76f90216e..21a059ac44 100644
--- a/usr/src/uts/i86pc/sys/memnode.h
+++ b/usr/src/uts/i86pc/sys/memnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -51,9 +50,6 @@ extern "C" {
#define PFN_2_MEM_NODE(pfn) \
((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0)
-#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \
- ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0)
-
#define MEM_NODE_2_LGRPHAND(mnode) \
((max_mem_nodes > 1) ? plat_mem_node_to_lgrphand(mnode) : \
LGRP_DEFAULT_HANDLE)
@@ -90,7 +86,6 @@ extern void mem_node_post_del_slice(pfn_t, pfn_t, int);
extern int mem_node_alloc(void);
extern pgcnt_t mem_node_memlist_pages(int, struct memlist *);
-
extern struct mem_node_conf mem_node_config[];
extern uint64_t mem_node_physalign;
extern int mem_node_pfn_shift;
diff --git a/usr/src/uts/i86pc/vm/vm_dep.h b/usr/src/uts/i86pc/vm/vm_dep.h
index b95f6b8e17..49e9386d81 100644
--- a/usr/src/uts/i86pc/vm/vm_dep.h
+++ b/usr/src/uts/i86pc/vm/vm_dep.h
@@ -39,6 +39,7 @@ extern "C" {
#include <sys/clock.h>
#include <vm/hat_pte.h>
#include <sys/param.h>
+#include <sys/memnode.h>
/*
* WARNING: vm_dep.h is included by files in common. As such, macros
@@ -285,10 +286,41 @@ extern kmutex_t *cpc_mutex[NPC_MUTEX];
extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
-#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
+/* mem node iterator is not used on x86 */
+#define MEM_NODE_ITERATOR_DECL(it)
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it)
+
+/*
+ * interleaved_mnodes mode is never set on x86, therefore,
+ * simply return the limits of the given mnode, which then
+ * determines the length of hpm_counters array for the mnode.
+ */
+#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \
+ { \
+ (physbase) = mem_node_config[(mnode)].physbase; \
+ (physmax) = mem_node_config[(mnode)].physmax; \
+ (first) = (mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_LOCK(mnode) \
+ { \
+ rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);\
+ page_freelist_lock(mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_UNLOCK(mnode) \
+ { \
+ page_freelist_unlock(mnode); \
+ rw_exit(&page_ctrs_rwlock[(mnode)]); \
+ }
+
+#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
(hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift)
-#define PFN_2_COLOR(pfn, szc) \
+#define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \
+ ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))
+
+#define PFN_2_COLOR(pfn, szc, it) \
(((pfn) & page_colors_mask) >> \
(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
@@ -305,7 +337,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
* This macro calculates the next sequential pfn with the specified
* color using color equivalency mask
*/
-#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \
+#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \
ASSERT(((color) & ~(ceq_mask)) == 0); \
{ \
uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \
@@ -329,7 +361,7 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
/* Find the bin for the given page if it was of size szc */
-#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc))
+#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, NULL))
#define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc))
diff --git a/usr/src/uts/sun4/os/memnode.c b/usr/src/uts/sun4/os/memnode.c
index 849bec22c8..cb21287ebd 100644
--- a/usr/src/uts/sun4/os/memnode.c
+++ b/usr/src/uts/sun4/os/memnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -83,7 +82,7 @@ mem_node_add_slice(pfn_t start, pfn_t end)
end = roundup(end, btop(mem_node_physalign)) - 1;
}
- if (&plat_slice_add)
+ if (&plat_slice_add != NULL)
plat_slice_add(start, end);
mnode = PFN_2_MEM_NODE(start);
@@ -148,7 +147,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
if (!cancelled) {
delta_pgcnt = end - start;
node_size = mem_node_config[mnode].physmax -
- mem_node_config[mnode].physbase;
+ mem_node_config[mnode].physbase;
if (node_size > delta_pgcnt) {
/*
@@ -180,7 +179,7 @@ mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
mem_node_config[mnode].exists = 0;
}
- if (&plat_slice_del)
+ if (&plat_slice_del != NULL)
plat_slice_del(start, end);
}
}
@@ -195,7 +194,7 @@ startup_build_mem_nodes(u_longlong_t *list, size_t nelems)
/* LINTED: ASSERT will always true or false */
ASSERT(NBBY * sizeof (mnodeset_t) >= max_mem_nodes);
- if (&plat_build_mem_nodes) {
+ if (&plat_build_mem_nodes != NULL) {
plat_build_mem_nodes(list, nelems);
} else {
/*
@@ -226,7 +225,7 @@ mem_node_alloc()
*/
for (mnode = 0; mnode < max_mem_nodes; mnode++)
if (cas32((uint32_t *)&mem_node_config[mnode].exists,
- 0, 1) == 0)
+ 0, 1) == 0)
break;
if (mnode >= max_mem_nodes)
@@ -247,27 +246,39 @@ mem_node_alloc()
* Find the intersection between a memnode and a memlist
* and returns the number of pages that overlap.
*
- * Assumes the list is protected from DR operations by
- * the memlist lock.
+ * Grab the memlist lock to protect the list from DR operations.
*/
pgcnt_t
mem_node_memlist_pages(int mnode, struct memlist *mlist)
{
pfn_t base, end;
pfn_t cur_base, cur_end;
- pgcnt_t npgs;
+ pgcnt_t npgs = 0;
+ pgcnt_t pages;
struct memlist *pmem;
+ if (&plat_mem_node_intersect_range != NULL) {
+ memlist_read_lock();
+
+ for (pmem = mlist; pmem; pmem = pmem->next) {
+ plat_mem_node_intersect_range(btop(pmem->address),
+ btop(pmem->size), mnode, &pages);
+ npgs += pages;
+ }
+
+ memlist_read_unlock();
+ return (npgs);
+ }
+
base = mem_node_config[mnode].physbase;
end = mem_node_config[mnode].physmax;
- npgs = 0;
memlist_read_lock();
for (pmem = mlist; pmem; pmem = pmem->next) {
cur_base = btop(pmem->address);
cur_end = cur_base + btop(pmem->size) - 1;
- if (end <= cur_base || base >= cur_end)
+ if (end < cur_base || base > cur_end)
continue;
npgs = npgs + (MIN(cur_end, end) -
MAX(cur_base, base)) + 1;
@@ -277,3 +288,34 @@ mem_node_memlist_pages(int mnode, struct memlist *mlist)
return (npgs);
}
+
+/*
+ * Find MIN(physbase) and MAX(physmax) over all mnodes
+ *
+ * Called during startup and DR to find hpm_counters limits when
+ * interleaved_mnodes is set.
+ * NOTE: there is a race condition with DR if it tries to change more than
+ * one mnode in parallel. Sizing shared hpm_counters depends on finding the
+ * min(physbase) and max(physmax) across all mnodes. Therefore, the caller of
+ * page_ctrs_adjust must ensure that mem_node_config does not change while it
+ * is running.
+ */
+void
+mem_node_max_range(pfn_t *basep, pfn_t *maxp)
+{
+ int mnode;
+ pfn_t max = 0;
+ pfn_t base = (pfn_t)-1;
+
+ for (mnode = 0; mnode < max_mem_nodes; mnode++) {
+ if (mem_node_config[mnode].exists == 0)
+ continue;
+ if (max < mem_node_config[mnode].physmax)
+ max = mem_node_config[mnode].physmax;
+ if (base > mem_node_config[mnode].physbase)
+ base = mem_node_config[mnode].physbase;
+ }
+ ASSERT(base != (pfn_t)-1 && max != 0);
+ *basep = base;
+ *maxp = max;
+}
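[Illustrative sketch, not part of this changeset: how mem_node_max_range()
is consumed. The sun4 HPM_COUNTERS_LIMITS macro (sun4/vm/vm_dep.h below)
uses it to size one hpm_counters array spanning every mnode when
interleaved_mnodes is set; mnode, physbase and physmax are assumed in scope.]

    if (interleaved_mnodes) {
            /* one shared counters array covers min(physbase)..max(physmax) */
            mem_node_max_range(&physbase, &physmax);
    } else {
            /* private counters sized by this mnode's own bounds */
            physbase = mem_node_config[mnode].physbase;
            physmax = mem_node_config[mnode].physmax;
    }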
diff --git a/usr/src/uts/sun4/sys/memnode.h b/usr/src/uts/sun4/sys/memnode.h
index d8068b9235..745d03002f 100644
--- a/usr/src/uts/sun4/sys/memnode.h
+++ b/usr/src/uts/sun4/sys/memnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -66,16 +65,13 @@ extern "C" {
* nodes, so the platform can always make everything work.
*/
-#ifndef MAX_MEM_NODES
+#ifndef MAX_MEM_NODES
#define MAX_MEM_NODES (4)
#endif /* MAX_MEM_NODES */
#define PFN_2_MEM_NODE(pfn) \
((max_mem_nodes > 1) ? plat_pfn_to_mem_node(pfn) : 0)
-#define LGRPHAND_2_MEM_NODE(lgrp_plat_hand) \
- ((max_mem_nodes > 1) ? plat_lgrphand_to_mem_node(lgrp_plat_hand) : 0)
-
#define MEM_NODE_2_LGRPHAND(mnode) \
((max_mem_nodes > 1) ? plat_mem_node_to_lgrphand(mnode) : \
LGRP_DEFAULT_HANDLE)
@@ -90,12 +86,14 @@ extern void plat_assign_lgrphand_to_mem_node(lgrp_handle_t, int);
extern lgrp_handle_t plat_mem_node_to_lgrphand(int);
extern void plat_slice_add(pfn_t, pfn_t);
extern void plat_slice_del(pfn_t, pfn_t);
+extern void plat_mem_node_intersect_range(pfn_t, pgcnt_t, int, pgcnt_t *);
#pragma weak plat_pfn_to_mem_node
#pragma weak plat_lgrphand_to_mem_node
#pragma weak plat_mem_node_to_lgrphand
#pragma weak plat_slice_add
#pragma weak plat_slice_del
+#pragma weak plat_mem_node_intersect_range
struct mem_node_conf {
int exists; /* only try if set, list may still be empty */
@@ -111,7 +109,8 @@ extern void mem_node_pre_del_slice(pfn_t, pfn_t);
extern void mem_node_post_del_slice(pfn_t, pfn_t, int);
extern int mem_node_alloc(void);
extern pgcnt_t mem_node_memlist_pages(int, struct memlist *);
-
+extern void mem_node_add_slice(pfn_t start, pfn_t end);
+extern void mem_node_max_range(pfn_t *, pfn_t *);
extern struct mem_node_conf mem_node_config[];
extern uint64_t mem_node_physalign;
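[Illustrative sketch, not part of this changeset: the weak-symbol hook
pattern used above. plat_mem_node_intersect_range() is declared with
#pragma weak, so generic code tests the symbol's address before calling it,
as mem_node_memlist_pages() and PLCNT_MODIFY_MAX do; basepfn, npages and
mnode are assumed in scope, and the range is assumed to intersect the mnode.]

    pgcnt_t npgs;

    if (&plat_mem_node_intersect_range != NULL) {
            /* the platform supplied the hook and knows how pfns interleave */
            plat_mem_node_intersect_range(basepfn, npages, mnode, &npgs);
    } else {
            /* no hook: the mnode is one contiguous pfn range; clamp to it */
            npgs = MIN(basepfn + npages - 1,
                mem_node_config[mnode].physmax) -
                MAX(basepfn, mem_node_config[mnode].physbase) + 1;
    }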
diff --git a/usr/src/uts/sun4/vm/vm_dep.h b/usr/src/uts/sun4/vm/vm_dep.h
index 6f150837f8..357f9ba0a3 100644
--- a/usr/src/uts/sun4/vm/vm_dep.h
+++ b/usr/src/uts/sun4/vm/vm_dep.h
@@ -107,6 +107,92 @@ extern kmutex_t *fpc_mutex[NPC_MUTEX];
extern kmutex_t *cpc_mutex[NPC_MUTEX];
/*
+ * Iterator provides the info needed to convert RA to PA.
+ * MEM_NODE_ITERATOR_INIT() should be called before
+ * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
+ * PAGE_NEXT_PFN_FOR_COLOR() call. The iterator caches color-to-hash
+ * translations, so the initializer must be called again whenever color or
+ * ceq_mask changes, even if pfn does not. MEM_NODE_ITERATOR_INIT() must
+ * also be called before any PFN_2_COLOR() call that uses a valid iterator
+ * argument.
+ */
+#ifdef sun4v
+
+typedef struct mem_node_iterator {
+ uint_t mi_mnode; /* mnode in which to iterate */
+ int mi_init; /* set to 1 when first init */
+ int mi_last_mblock; /* last mblock visited */
+ uint_t mi_hash_ceq_mask; /* cached copy of ceq_mask */
+ uint_t mi_hash_color; /* cached copy of color */
+ uint_t mi_mnode_mask; /* number of mask bits */
+ uint_t mi_mnode_pfn_shift; /* mnode position in pfn */
+ pfn_t mi_mblock_base; /* first valid pfn in current mblock */
+ pfn_t mi_mblock_end; /* last valid pfn in current mblock */
+ pfn_t mi_ra_to_pa; /* ra adjustment for current mblock */
+ pfn_t mi_mnode_pfn_mask; /* mask to obtain mnode id bits */
+} mem_node_iterator_t;
+
+#define MEM_NODE_ITERATOR_DECL(it) \
+ mem_node_iterator_t it
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it) \
+ (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (it), 1)
+
+extern pfn_t plat_mem_node_iterator_init(pfn_t, int,
+ mem_node_iterator_t *, int);
+extern pfn_t plat_rapfn_to_papfn(pfn_t);
+extern int interleaved_mnodes;
+
+#else /* sun4v */
+
+#define MEM_NODE_ITERATOR_DECL(it) \
+ void *it = NULL
+#define MEM_NODE_ITERATOR_INIT(pfn, mnode, it)
+
+#endif /* sun4v */
+
+/*
+ * Return the mnode limits so that hpc_counters length and base
+ * index can be determined. When interleaved_mnodes is set, we
+ * create an array only for the first mnode that exists. All other
+ * mnodes will share the array in this case.
+ * If interleaved_mnodes is not set, simply return the limits for
+ * the given mnode.
+ */
+#define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \
+ if (!interleaved_mnodes) { \
+ (physbase) = mem_node_config[(mnode)].physbase; \
+ (physmax) = mem_node_config[(mnode)].physmax; \
+ (first) = (mnode); \
+ } else if ((first) < 0) { \
+ mem_node_max_range(&(physbase), &(physmax)); \
+ (first) = (mnode); \
+ }
+
+#define PAGE_CTRS_WRITE_LOCK(mnode) \
+ if (!interleaved_mnodes) { \
+ rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER); \
+ page_freelist_lock(mnode); \
+ } else { \
+ /* changing shared hpm_counters */ \
+ int _i; \
+ for (_i = 0; _i < max_mem_nodes; _i++) { \
+ rw_enter(&page_ctrs_rwlock[_i], RW_WRITER); \
+ page_freelist_lock(_i); \
+ } \
+ }
+
+#define PAGE_CTRS_WRITE_UNLOCK(mnode) \
+ if (!interleaved_mnodes) { \
+ page_freelist_unlock(mnode); \
+ rw_exit(&page_ctrs_rwlock[(mnode)]); \
+ } else { \
+ int _i; \
+ for (_i = 0; _i < max_mem_nodes; _i++) { \
+ page_freelist_unlock(_i); \
+ rw_exit(&page_ctrs_rwlock[_i]); \
+ } \
+ }
+
+/*
* cpu specific color conversion functions
*/
extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
@@ -118,11 +204,14 @@ extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
#pragma weak page_get_color_shift_cpu
+extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
+#pragma weak page_convert_color_cpu
+
extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
- uchar_t, uint_t, uint_t, uint_t);
+ uchar_t, uint_t, uint_t, uint_t, void *);
#pragma weak page_next_pfn_for_color_cpu
-extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
+extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
#pragma weak page_pfn_2_color_cpu
#define PAGE_GET_COLOR_SHIFT(szc, nszc) \
@@ -131,9 +220,14 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
(hw_page_array[(nszc)].hp_shift - \
hw_page_array[(szc)].hp_shift))
-#define PFN_2_COLOR(pfn, szc) \
+#define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \
+ ((&page_convert_color_cpu != NULL) ? \
+ page_convert_color_cpu(ncolor, szc, nszc) : \
+ ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))))
+
+#define PFN_2_COLOR(pfn, szc, it) \
((&page_pfn_2_color_cpu != NULL) ? \
- page_pfn_2_color_cpu(pfn, szc) : \
+ page_pfn_2_color_cpu(pfn, szc, it) : \
((pfn & (hw_page_array[0].hp_colors - 1)) >> \
(hw_page_array[szc].hp_shift - \
hw_page_array[0].hp_shift)))
@@ -151,7 +245,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
* This macro calculates the next sequential pfn with the specified
* color using color equivalency mask
*/
-#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask) \
+#define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \
ASSERT(((color) & ~(ceq_mask)) == 0); \
if (&page_next_pfn_for_color_cpu == NULL) { \
uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \
@@ -165,8 +259,8 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
pfn = (pfn > spfn ? pfn : pfn + stride) << pfn_shift; \
} \
} else { \
- pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \
- ceq_mask, color_mask); \
+ pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \
+ ceq_mask, color_mask, it); \
}
/* get the color equivalency mask for the next szc */
@@ -182,7 +276,7 @@ extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t);
page_get_nsz_color_cpu(szc, color))
/* Find the bin for the given page if it was of size szc */
-#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc))
+#define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1)))
#define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc))
@@ -335,16 +429,31 @@ typedef struct {
* when memory is added (kphysm_add_memory_dynamic) or deleted
* (kphysm_del_cleanup).
*/
-#define PLCNT_MODIFY_MAX(startpfn, cnt) { \
- pfn_t pfn = startpfn, endpfn = startpfn + ABS(cnt); \
- while (pfn < endpfn) { \
- int mn = PFN_2_MEM_NODE(pfn); \
- long inc = MIN(endpfn, mem_node_config[mn].physmax + 1) \
- - pfn; \
- pfn += inc; \
- atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, \
- ((cnt) < 0) ? -inc: inc); \
- } \
+#define PLCNT_MODIFY_MAX(pfn, cnt) { \
+ spgcnt_t _cnt = (spgcnt_t)(cnt); \
+ pgcnt_t _acnt = ABS(_cnt); \
+ int _mn; \
+ pgcnt_t _np; \
+ if (&plat_mem_node_intersect_range != NULL) { \
+ for (_mn = 0; _mn < max_mem_nodes; _mn++) { \
+ plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
+ if (_np == 0) \
+ continue; \
+ atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
+ (_cnt < 0) ? -_np : _np); \
+ } \
+ } else { \
+ pfn_t _pfn = (pfn); \
+ pfn_t _endpfn = _pfn + _acnt; \
+ while (_pfn < _endpfn) { \
+ _mn = PFN_2_MEM_NODE(_pfn); \
+ _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
+ _pfn; \
+ _pfn += _np; \
+ atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
+ (_cnt < 0) ? -_np : _np); \
+ } \
+ } \
}
extern plcnt_t plcnt;
@@ -495,17 +604,17 @@ switch (consistent_coloring) { \
(vac_shift - MMU_PAGESHIFT)); \
if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) { \
pfn += slew; \
- bin = PFN_2_COLOR(pfn, szc); \
+ bin = PFN_2_COLOR(pfn, szc, NULL); \
} else { \
- bin = PFN_2_COLOR(pfn, szc); \
+ bin = PFN_2_COLOR(pfn, szc, NULL); \
bin += slew >> (vac_shift - MMU_PAGESHIFT); \
bin &= hw_page_array[(szc)].hp_colors - 1; \
} \
break; \
} \
case 1: \
- bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \
- szc); \
+ bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \
+ szc, NULL); \
break; \
case 2: { \
int cnt = as_color_bin(as); \
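[Illustrative sketch, not part of this changeset: what the rewritten
PLCNT_MODIFY_MAX above buys. With the intersect hook present, each mnode is
credited only for the pages that actually fall in it, so a DR add that
interleaves across mnodes no longer charges every page to the mnode of the
starting pfn. The pfn, count and even split below are made up for the
example.]

    /* DR adds 512 pages at pfn 0x10000, interleaved evenly across mnodes 0/1 */
    pgcnt_t np;

    plat_mem_node_intersect_range(0x10000, 512, 0, &np);   /* np == 256 here */
    plat_mem_node_intersect_range(0x10000, 512, 1, &np);   /* np == 256 here */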
diff --git a/usr/src/uts/sun4u/os/cmp.c b/usr/src/uts/sun4u/os/cmp.c
index 86a021f3d1..20aa7855c8 100644
--- a/usr/src/uts/sun4u/os/cmp.c
+++ b/usr/src/uts/sun4u/os/cmp.c
@@ -280,3 +280,30 @@ cmp_set_nosteal_interval(void)
/* Set the nosteal interval (used by disp_getbest()) to 100us */
nosteal_nsec = 100000UL;
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_CHIP)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files
index d153205ab1..b7c02c1e22 100644
--- a/usr/src/uts/sun4v/Makefile.files
+++ b/usr/src/uts/sun4v/Makefile.files
@@ -62,6 +62,7 @@ CORE_OBJS += mach_xc.o
CORE_OBJS += mem_cage.o
CORE_OBJS += mem_config.o
CORE_OBJS += memlist_new.o
+CORE_OBJS += mpo.o
CORE_OBJS += ppage.o
CORE_OBJS += promif_asr.o
CORE_OBJS += promif_cpu.o
diff --git a/usr/src/uts/sun4v/Makefile.sun4v.shared b/usr/src/uts/sun4v/Makefile.sun4v.shared
index 82dbb2b21c..a299bb9a56 100644
--- a/usr/src/uts/sun4v/Makefile.sun4v.shared
+++ b/usr/src/uts/sun4v/Makefile.sun4v.shared
@@ -215,6 +215,7 @@ FDOFFSETS = $(UTSBASE)/sun/io/fd_offsets.in
#
MACHINE_DEFS = -D$(PLATFORM) -D_MACHDEP -DSFMMU
+MACHINE_DEFS += -DMAX_MEM_NODES=8
$(MPSAS_BUILD)MACHINE_DEFS += -DMPSAS
diff --git a/usr/src/uts/sun4v/cpu/generic.c b/usr/src/uts/sun4v/cpu/generic.c
index eab39b9fe9..21771a5f71 100644
--- a/usr/src/uts/sun4v/cpu/generic.c
+++ b/usr/src/uts/sun4v/cpu/generic.c
@@ -35,6 +35,7 @@
#include <sys/elf_SPARC.h>
#include <vm/hat_sfmmu.h>
#include <vm/page.h>
+#include <vm/vm_dep.h>
#include <sys/cpuvar.h>
#include <sys/async.h>
#include <sys/cmn_err.h>
@@ -167,6 +168,10 @@ cpu_map_exec_units(struct cpu *cp)
if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND)
cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id);
+ cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
+ if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
+ cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
+
cp->cpu_m.cpu_core = (id_t)(cp->cpu_id);
/*
diff --git a/usr/src/uts/sun4v/cpu/niagara.c b/usr/src/uts/sun4v/cpu/niagara.c
index cb7e182d27..d607c2625c 100644
--- a/usr/src/uts/sun4v/cpu/niagara.c
+++ b/usr/src/uts/sun4v/cpu/niagara.c
@@ -193,9 +193,10 @@ cpu_map_exec_units(struct cpu *cp)
/*
* Niagara systems just have one chip. Therefore, the chip id
- * is always 0.
+ * and mpipe id are always 0.
*/
cp->cpu_m.cpu_chip = 0;
+ cp->cpu_m.cpu_mpipe = 0;
}
static int niagara_cpucnt;
diff --git a/usr/src/uts/sun4v/cpu/niagara2.c b/usr/src/uts/sun4v/cpu/niagara2.c
index e791361578..e77b2ef3b4 100644
--- a/usr/src/uts/sun4v/cpu/niagara2.c
+++ b/usr/src/uts/sun4v/cpu/niagara2.c
@@ -198,9 +198,9 @@ cpu_map_exec_units(struct cpu *cp)
* share the same L2 cache. If no such info is available, we
* set the cpu to belong to the defacto chip 0.
*/
- cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping;
- if (cp->cpu_m.cpu_chip == NO_CHIP_MAPPING_FOUND)
- cp->cpu_m.cpu_chip = 0;
+ cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
+ if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
+ cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
}
static int cpucnt;
@@ -283,22 +283,112 @@ cpu_trapstat_data(void *buf, uint_t tstat_pgszs)
}
}
+/*
+ * Page coloring support for hashed cache index mode
+ */
+
+/*
+ * Node id bits from machine description (MD). Node id distinguishes
+ * local versus remote memory. Because of MPO, page allocation does
+ * not cross node boundaries. Therefore, remove the node id bits from
+ * the color, since they are fixed. Either bit 30, or 31:30 in
+ * Victoria Falls processors.
+ * The number of node id bits is always 0 in Niagara2.
+ */
+typedef struct n2color {
+ uchar_t nnbits; /* number of node id bits */
+ uchar_t nnmask; /* mask for node id bits */
+ uchar_t lomask; /* mask for bits below node id */
+ uchar_t lobits; /* number of bits below node id */
+} n2color_t;
+
+n2color_t n2color[MMU_PAGE_SIZES];
+static uchar_t nhbits[] = {7, 7, 6, 5, 5, 5};
+
+/*
+ * Remove node id bits from color bits 32:28.
+ * This will reduce the number of colors.
+ * No change if number of node bits is zero.
+ */
+static inline uint_t
+n2_hash2color(uint_t color, uchar_t szc)
+{
+ n2color_t m = n2color[szc];
+
+ if (m.nnbits > 0) {
+ color = ((color >> m.nnbits) & ~m.lomask) | (color & m.lomask);
+ ASSERT((color & ~(hw_page_array[szc].hp_colors - 1)) == 0);
+ }
+
+ return (color);
+}
+
+/*
+ * Restore node id bits into page color.
+ * This will increase the number of colors to match N2.
+ * No change if number of node bits is zero.
+ */
+static inline uint_t
+n2_color2hash(uint_t color, uchar_t szc, uint_t node)
+{
+ n2color_t m = n2color[szc];
+
+ if (m.nnbits > 0) {
+ color = ((color & ~m.lomask) << m.nnbits) | (color & m.lomask);
+ color |= (node & m.nnmask) << m.lobits;
+ }
+
+ return (color);
+}
+
/* NI2 L2$ index is pa[32:28]^pa[17:13].pa[19:18]^pa[12:11].pa[10:6] */
+
+/*
+ * iterator NULL means pfn is VA, do not adjust ra_to_pa
+ * iterator (-1) means pfn is RA, need to convert to PA
+ * iterator non-null means pfn is RA, use ra_to_pa
+ */
uint_t
-page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc)
+page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie)
{
+ mem_node_iterator_t *it = cookie;
uint_t color;
ASSERT(szc <= TTE256M);
+ if (it == ((mem_node_iterator_t *)(-1))) {
+ pfn = plat_rapfn_to_papfn(pfn);
+ } else if (it != NULL) {
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+ pfn = pfn + it->mi_ra_to_pa;
+ }
pfn = PFN_BASE(pfn, szc);
color = ((pfn >> 15) ^ pfn) & 0x1f;
- if (szc >= TTE4M)
- return (color);
+ if (szc < TTE4M) {
+ /* 19:18 */
+ color = (color << 2) | ((pfn >> 5) & 0x3);
+ if (szc > TTE64K)
+ color >>= 1; /* 19 */
+ }
+ return (n2_hash2color(color, szc));
+}
- color = (color << 2) | ((pfn >> 5) & 0x3);
+static uint_t
+page_papfn_2_color_cpu(pfn_t papfn, uchar_t szc)
+{
+ uint_t color;
+
+ ASSERT(szc <= TTE256M);
- return (szc <= TTE64K ? color : (color >> 1));
+ papfn = PFN_BASE(papfn, szc);
+ color = ((papfn >> 15) ^ papfn) & 0x1f;
+ if (szc < TTE4M) {
+ /* 19:18 */
+ color = (color << 2) | ((papfn >> 5) & 0x3);
+ if (szc > TTE64K)
+ color >>= 1; /* 19 */
+ }
+ return (color);
}
#if TTE256M != 5
@@ -310,46 +400,91 @@ page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask)
{
static uint_t ni2_color_masks[5] = {0x63, 0x1e, 0x3e, 0x1f, 0x1f};
ASSERT(szc < TTE256M);
-
+ mask = n2_color2hash(mask, szc, 0);
mask &= ni2_color_masks[szc];
- return ((szc == TTE64K || szc == TTE512K) ? (mask >> 1) : mask);
+ if (szc == TTE64K || szc == TTE512K)
+ mask >>= 1;
+ return (n2_hash2color(mask, szc + 1));
}
uint_t
page_get_nsz_color_cpu(uchar_t szc, uint_t color)
{
ASSERT(szc < TTE256M);
- return ((szc == TTE64K || szc == TTE512K) ? (color >> 1) : color);
+ color = n2_color2hash(color, szc, 0);
+ if (szc == TTE64K || szc == TTE512K)
+ color >>= 1;
+ return (n2_hash2color(color, szc + 1));
}
uint_t
page_get_color_shift_cpu(uchar_t szc, uchar_t nszc)
{
+ uint_t s;
ASSERT(nszc >= szc);
ASSERT(nszc <= TTE256M);
- if (szc == nszc)
- return (0);
- if (szc <= TTE64K)
- return ((nszc >= TTE4M) ? 2 : ((nszc >= TTE512K) ? 1 : 0));
- if (szc == TTE512K)
- return (1);
+ s = nhbits[szc] - n2color[szc].nnbits;
+ s -= nhbits[nszc] - n2color[nszc].nnbits;
- return (0);
+ return (s);
+}
+
+uint_t
+page_convert_color_cpu(uint_t ncolor, uchar_t szc, uchar_t nszc)
+{
+ uint_t color;
+
+ ASSERT(nszc > szc);
+ ASSERT(nszc <= TTE256M);
+ ncolor = n2_color2hash(ncolor, nszc, 0);
+ color = ncolor << (nhbits[szc] - nhbits[nszc]);
+ color = n2_hash2color(color, szc);
+ return (color);
}
+#define PAPFN_2_MNODE(pfn) \
+ (((pfn) & it->mi_mnode_pfn_mask) >> it->mi_mnode_pfn_shift)
+
/*ARGSUSED*/
pfn_t
page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
- uint_t ceq_mask, uint_t color_mask)
+ uint_t ceq_mask, uint_t color_mask, void *cookie)
{
+ mem_node_iterator_t *it = cookie;
pfn_t pstep = PNUM_SIZE(szc);
pfn_t npfn, pfn_ceq_mask, pfn_color;
pfn_t tmpmask, mask = (pfn_t)-1;
+ uint_t pfnmn;
ASSERT((color & ~ceq_mask) == 0);
-
- if (((page_pfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0) {
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+
+ /* convert RA to PA for accurate color calculation */
+ if (it->mi_init) {
+ /* first call after it, so cache these values */
+ it->mi_hash_ceq_mask =
+ n2_color2hash(ceq_mask, szc, it->mi_mnode_mask);
+ it->mi_hash_color =
+ n2_color2hash(color, szc, it->mi_mnode);
+ it->mi_init = 0;
+ } else {
+ ASSERT(it->mi_hash_ceq_mask ==
+ n2_color2hash(ceq_mask, szc, it->mi_mnode_mask));
+ ASSERT(it->mi_hash_color ==
+ n2_color2hash(color, szc, it->mi_mnode));
+ }
+ ceq_mask = it->mi_hash_ceq_mask;
+ color = it->mi_hash_color;
+ pfn += it->mi_ra_to_pa;
+
+ /* restart here when we switch memblocks */
+next_mem_block:
+ if (szc <= TTE64K) {
+ pfnmn = PAPFN_2_MNODE(pfn);
+ }
+ if (((page_papfn_2_color_cpu(pfn, szc) ^ color) & ceq_mask) == 0 &&
+ (szc > TTE64K || pfnmn == it->mi_mnode)) {
/* we start from the page with correct color */
if (szc >= TTE512K) {
@@ -361,18 +496,19 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
pfn_ceq_mask = ((ceq_mask & 1) << 6) |
((ceq_mask >> 1) << 15);
}
- pfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
- return (pfn);
+ npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
+ goto done;
} else {
/*
* We deal 64K or 8K page. Check if we could the
* satisfy the request without changing PA[32:28]
*/
pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2);
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
npfn = ADD_MASKED(pfn, pstep, pfn_ceq_mask, mask);
if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
- return (npfn);
+ goto done;
/*
* for next pfn we have to change bits PA[32:28]
@@ -382,15 +518,14 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
npfn |= (ceq_mask & color & 3) << 5;
pfn_ceq_mask = (szc == TTE8K) ? 0 :
(ceq_mask & 0x1c) << 13;
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
npfn = ADD_MASKED(npfn, (1 << 15), pfn_ceq_mask, mask);
/*
* set bits PA[17:13] to match the color
*/
- ceq_mask >>= 2;
- color = (color >> 2) & ceq_mask;
- npfn |= ((npfn >> 15) ^ color) & ceq_mask;
- return (npfn);
+ npfn |= ((npfn >> 15) ^ (color >> 2)) & (ceq_mask >> 2);
+ goto done;
}
}
@@ -405,9 +540,9 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
} else {
/* try get the right color by changing bit PA[19:19] */
npfn = pfn + pstep;
- if (((page_pfn_2_color_cpu(npfn, szc) ^ color) &
+ if (((page_papfn_2_color_cpu(npfn, szc) ^ color) &
ceq_mask) == 0)
- return (npfn);
+ goto done;
/* page color is PA[32:28].PA[19:19] */
pfn_ceq_mask = ((ceq_mask & 1) << 6) |
@@ -419,34 +554,45 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
while (npfn <= pfn) {
npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask);
}
- return (npfn);
+ goto done;
}
/*
- * We deal 64K or 8K page of incorrect color.
+	 * We deal with a 64K or 8K page of incorrect color.
* Try correcting color without changing PA[32:28]
*/
-
pfn_ceq_mask = ((ceq_mask & 3) << 5) | (ceq_mask >> 2);
pfn_color = ((color & 3) << 5) | (color >> 2);
- npfn = (pfn & ~(pfn_t)0x7f);
- npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
- npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
-
- if (((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0) {
-
- /* the color is fixed - find the next page */
- while (npfn <= pfn) {
- npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask, mask);
+ if (pfnmn == it->mi_mnode) {
+ npfn = (pfn & ~(pfn_t)0x7f);
+ npfn |= (((pfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
+ npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
+
+ if (((page_papfn_2_color_cpu(npfn, szc) ^ color) &
+ ceq_mask) == 0) {
+ /* the color is fixed - find the next page */
+ pfn_ceq_mask |= it->mi_mnode_pfn_mask;
+ while (npfn <= pfn) {
+ npfn = ADD_MASKED(npfn, pstep, pfn_ceq_mask,
+ mask);
+ }
+ if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
+ goto done;
}
- if ((((npfn ^ pfn) >> 15) & 0x1f) == 0)
- return (npfn);
}
/* to fix the color need to touch PA[32:28] */
npfn = (szc == TTE8K) ? ((pfn >> 15) << 15) :
(((pfn >> 18) << 18) | ((color & 0x1c) << 13));
+
+ /* fix mnode if input pfn is in the wrong mnode. */
+ if ((pfnmn = PAPFN_2_MNODE(npfn)) != it->mi_mnode) {
+ npfn += ((it->mi_mnode - pfnmn) & it->mi_mnode_mask) <<
+ it->mi_mnode_pfn_shift;
+ }
+
tmpmask = (szc == TTE8K) ? 0 : (ceq_mask & 0x1c) << 13;
+ tmpmask |= it->mi_mnode_pfn_mask;
while (npfn <= pfn) {
npfn = ADD_MASKED(npfn, (1 << 15), tmpmask, mask);
@@ -456,25 +602,58 @@ page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
npfn |= (((npfn >> 15) & 0x1f) ^ pfn_color) & pfn_ceq_mask;
npfn = (szc == TTE64K) ? (npfn & ~(pfn_t)0x7) : npfn;
- ASSERT(((page_pfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0);
+done:
+ ASSERT(((page_papfn_2_color_cpu(npfn, szc) ^ color) & ceq_mask) == 0);
+ ASSERT(PAPFN_2_MNODE(npfn) == it->mi_mnode);
+
+ /* PA to RA */
+ npfn -= it->mi_ra_to_pa;
+
+ /* check for possible memblock switch */
+ if (npfn > it->mi_mblock_end) {
+ pfn = plat_mem_node_iterator_init(npfn, it->mi_mnode, it, 0);
+ if (pfn == (pfn_t)-1)
+ return (pfn);
+ ASSERT(pfn >= it->mi_mblock_base && pfn <= it->mi_mblock_end);
+ pfn += it->mi_ra_to_pa;
+ goto next_mem_block;
+ }
return (npfn);
}
/*
* init page coloring
+ * VF encodes node_id for an L-group in either bit 30 or bits 31:30,
+ * which effectively reduces the number of colors available per mnode.
*/
void
page_coloring_init_cpu()
{
int i;
-
- hw_page_array[0].hp_colors = 1 << 7;
- hw_page_array[1].hp_colors = 1 << 7;
- hw_page_array[2].hp_colors = 1 << 6;
-
- for (i = 3; i < mmu_page_sizes; i++) {
- hw_page_array[i].hp_colors = 1 << 5;
+ uchar_t id;
+ uchar_t lo;
+ uchar_t hi;
+ n2color_t m;
+ mem_node_iterator_t it;
+ static uchar_t idmask[] = {0, 0x7, 0x1f, 0x1f, 0x1f, 0x1f};
+
+ (void) plat_mem_node_iterator_init(0, 0, &it, 1);
+ for (i = 0; i < mmu_page_sizes; i++) {
+ memset(&m, 0, sizeof (m));
+ id = it.mi_mnode_pfn_mask >> 15; /* node id mask */
+ id &= idmask[i];
+ lo = lowbit(id);
+ if (lo > 0) {
+ hi = highbit(id);
+ m.nnbits = hi - lo + 1;
+ m.nnmask = (1 << m.nnbits) - 1;
+ lo += nhbits[i] - 5;
+ m.lomask = (1 << (lo - 1)) - 1;
+ m.lobits = lo - 1;
+ }
+ hw_page_array[i].hp_colors = 1 << (nhbits[i] - m.nnbits);
+ n2color[i] = m;
}
}
@@ -486,6 +665,7 @@ page_set_colorequiv_arr_cpu(void)
{
static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {2, 5, 0, 0, 0, 0};
+ nequiv_shades_log2[1] -= n2color[1].nnbits;
if (colorequiv > 1) {
int i;
uint_t sv_a = lowbit(colorequiv) - 1;
diff --git a/usr/src/uts/sun4v/os/cmp.c b/usr/src/uts/sun4v/os/cmp.c
index 1503ef4b47..d5a9e3087d 100644
--- a/usr/src/uts/sun4v/os/cmp.c
+++ b/usr/src/uts/sun4v/os/cmp.c
@@ -97,7 +97,7 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
return (1);
case PGHW_FPU:
return (1);
- case PGHW_CHIP:
+ case PGHW_MPIPE:
return (1);
}
return (0);
@@ -120,8 +120,8 @@ pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
switch (hw) {
case PGHW_IPIPE:
return (cpu->cpu_m.cpu_ipipe);
- case PGHW_CHIP:
- return (cpu->cpu_m.cpu_chip);
+ case PGHW_MPIPE:
+ return (cpu->cpu_m.cpu_mpipe);
case PGHW_FPU:
return (cpu->cpu_m.cpu_fpu);
default:
@@ -143,7 +143,7 @@ pg_plat_hw_level(pghw_type_t hw)
static pghw_type_t hw_hier[] = {
PGHW_IPIPE,
PGHW_FPU,
- PGHW_CHIP,
+ PGHW_MPIPE,
PGHW_NUM_COMPONENTS
};
@@ -164,7 +164,7 @@ pg_plat_cmt_load_bal_hw(pghw_type_t hw)
{
if (hw == PGHW_IPIPE ||
hw == PGHW_FPU ||
- hw == PGHW_CHIP)
+ hw == PGHW_MPIPE)
return (1);
else
return (0);
@@ -195,3 +195,30 @@ cmp_set_nosteal_interval(void)
{
nosteal_nsec = 0;
}
+/*
+ * Return 1 if CMT load balancing policies should be
+ * implemented across instances of the specified hardware
+ * sharing relationship.
+ */
+int
+pg_cmt_load_bal_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_IPIPE ||
+ hw == PGHW_FPU ||
+ hw == PGHW_MPIPE)
+ return (1);
+ else
+ return (0);
+}
+/*
+ * Return 1 if thread affinity policies should be implemented
+ * for instances of the specified hardware sharing relationship.
+ */
+int
+pg_cmt_affinity_hw(pghw_type_t hw)
+{
+ if (hw == PGHW_CACHE)
+ return (1);
+ else
+ return (0);
+}
diff --git a/usr/src/uts/sun4v/os/mpo.c b/usr/src/uts/sun4v/os/mpo.c
new file mode 100644
index 0000000000..d98ce96438
--- /dev/null
+++ b/usr/src/uts/sun4v/os/mpo.c
@@ -0,0 +1,1264 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/machparam.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/mach_descrip.h>
+#include <sys/memnode.h>
+#include <sys/mdesc.h>
+#include <sys/mpo.h>
+#include <vm/vm_dep.h>
+
+/*
+ * MPO and the sun4v memory representation
+ * ---------------------------------------
+ *
+ * Latency groups are defined in the sun4v architecture by memory-latency-group
+ * nodes in the Machine Description, as specified in FWARC/2007/260. These
+ * tie together cpu nodes and mblock nodes, and contain mask and match
+ * properties that identify the portion of an mblock that belongs to the
+ * lgroup. Mask and match are defined in the Physical Address (PA) space,
+ * but an mblock defines Real Addresses (RA). To translate, the mblock
+ * includes the property address-congruence-offset, hereafter referred to as
+ * ra_to_pa. A real address ra is a member of an lgroup if
+ *
+ * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
+ *
+ * The MD is traversed, and information on all mblocks is kept in the array
+ * mpo_mblock[]. Information on all CPUs, including which lgroup they map
+ * to, is kept in the array mpo_cpu[].
+ *
+ * This implementation makes (and verifies) the simplifying assumption that
+ * the mask bits are the same for all defined lgroups, and that all 1 bits in
+ * the mask are contiguous. Thus the number of lgroups is bounded by the
+ * number of possible mask values, and the lgrp_handle_t is defined as the
+ * mask value, shifted right to eliminate the 0 bit positions in mask. The
+ * masks and values are also referred to as "home bits" in the code.
+ *
+ * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
+ * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
+ * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
+ * home bits. This yields the mem_node.
+ *
+ * Interfaces
+ * ----------
+ *
+ * This file exports the following entry points:
+ *
+ * plat_lgrp_init()
+ * plat_build_mem_nodes()
+ * plat_lgrp_cpu_to_hand()
+ * plat_lgrp_latency()
+ * plat_pfn_to_mem_node()
+ * These implement the usual platform lgroup interfaces.
+ *
+ * plat_rapfn_to_papfn()
+ * Recover the PA page coloring bits from an RA.
+ *
+ * plat_mem_node_iterator_init()
+ * Initialize an iterator to efficiently step through pages in a mem_node.
+ *
+ * plat_mem_node_intersect_range()
+ * Find the intersection with a mem_node.
+ */
+
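
A small standalone sketch (not part of the patch) of the membership rule and home-bit
extraction described above; the mask, match, and ra_to_pa values are invented for
illustration and assume a contiguous home mask in PA[33:32]:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t lgrp_mask  = 0x300000000ULL;	/* home mask, PA[33:32] */
		uint64_t lgrp_match = 0x100000000ULL;	/* this lgroup's match value */
		uint64_t ra_to_pa   = 0x080000000ULL;	/* mblock address-congruence-offset */
		uint64_t ra         = 0x088000000ULL;	/* a real address in the mblock */

		uint64_t pa = ra + ra_to_pa;
		int member = ((pa & lgrp_mask) == lgrp_match);
		/* lgrp_handle_t: home bits shifted right by lowbit(mask) - 1 = 32 */
		int home = (int)((pa & lgrp_mask) >> 32);

		printf("pa = 0x%llx, member = %d, home = %d\n",
		    (unsigned long long)pa, member, home);
		return (0);
	}

Here pa is 0x108000000, so the address belongs to the lgroup with match value
0x100000000, and its home (and hence its mem_node) is 1.
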
+int sun4v_mpo_enable = 1;
+int sun4v_mpo_debug = 0;
+char sun4v_mpo_status[256] = "";
+
+/* Save CPU info from the MD and associate CPUs with lgroups */
+static struct cpu_md mpo_cpu[NCPU];
+
+/* Save lgroup info from the MD */
+#define MAX_MD_LGROUPS 32
+static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
+static int n_lgrpnodes = 0;
+static int n_locality_groups = 0;
+static int max_locality_groups = 0;
+
+/* Save mblocks from the MD */
+static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
+static int n_mblocks = 0;
+
+/* Save mem_node stripes calculated from mblocks and lgroups. */
+static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
+static int n_mem_stripes = 0;
+static pfn_t mnode_stride; /* distance between stripes, start to start */
+static int stripe_shift; /* stride/stripes expressed as a shift */
+static pfn_t mnode_pages; /* mem_node stripe width */
+
+/* Save home mask and shift used to calculate lgrp_handle_t values */
+static uint64_t home_mask = 0;
+static pfn_t home_mask_pfn = 0;
+static int home_mask_shift = 0;
+static uint_t home_mask_pfn_shift = 0;
+
+/* Save lowest and highest latencies found across all lgroups */
+static int lower_latency = 0;
+static int higher_latency = 0;
+
+static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
+
+static int valid_pages(md_t *md, mde_cookie_t cpu0);
+static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
+static int fix_interleave(void);
+
+/* Debug support */
+#if defined(DEBUG) && !defined(lint)
+#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
+#else
+#define MPO_DEBUG(...)
+#endif /* DEBUG */
+
+/* Record status message, viewable from mdb */
+#define MPO_STATUS(args...) { \
+ (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
+ MPO_DEBUG(sun4v_mpo_status); \
+}
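
As the comment notes, the status string is meant to be inspected from mdb after boot; a
typical (not patch-mandated) way to read it is:

	# echo 'sun4v_mpo_status/s' | mdb -k
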
+
+/*
+ * Routine to read a uint64_t from a given md
+ */
+static int64_t
+get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
+{
+ int err = md_get_prop_val(md, node, propname, val);
+ return (err);
+}
+
+static int
+mblock_cmp(const void *a, const void *b)
+{
+ struct mblock_md *m1 = (struct mblock_md *)a;
+ struct mblock_md *m2 = (struct mblock_md *)b;
+
+ if (m1->base < m2->base)
+ return (-1);
+ else if (m1->base == m2->base)
+ return (0);
+ else
+ return (1);
+}
+
+static void
+mblock_sort(struct mblock_md *mblocks, int n)
+{
+ extern void qsort(void *, size_t, size_t,
+ int (*)(const void *, const void *));
+
+ qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
+}
+
+/*
+ *
+ * Traverse the MD to determine:
+ *
+ * Number of CPU nodes, lgrp_nodes, and mblocks
+ * Then for each lgrp_node, obtain the appropriate data.
+ * For each CPU, determine its home locality and store it.
+ * For each mblock, retrieve its data and store it.
+ */
+static int
+lgrp_traverse(md_t *md)
+{
+ mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
+ uint64_t i, j, k, o, n_nodes;
+ uint64_t n_lgroups = 0;
+ uint64_t mem_lg_homeset = 0;
+ int ret_val = 0;
+ int result = 0;
+ int n_cpunodes = 0;
+ int sub_page_fix;
+
+ n_nodes = md_node_count(md);
+
+ if (n_nodes <= 0) {
+ MPO_STATUS("lgrp_traverse: No nodes in node count\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ root = md_root_node(md);
+
+ if (root == MDE_INVAL_ELEM_COOKIE) {
+ MPO_STATUS("lgrp_traverse: Root node is missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Build the Memory Nodes. Do this before any possibility of
+ * bailing from this routine so we obtain ra_to_pa (needed for page
+ * coloring) even when there are no lgroups defined.
+ */
+
+ n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
+ "fwd", &mblocknodes);
+
+ if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
+ MPO_STATUS("lgrp_traverse: No mblock "
+ "nodes detected in Machine Descriptor\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ mpo_mblock[i].node = mblocknodes[i];
+
+ /* Without a base or size value we will fail */
+ result = get_int(md, mblocknodes[i], PROP_LG_BASE,
+ &mpo_mblock[i].base);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_BASE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
+ &mpo_mblock[i].size);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: "
+ "PROP_LG_SIZE is missing\n");
+ n_mblocks = 0;
+ ret_val = -1;
+ goto fail;
+ }
+
+ result = get_int(md, mblocknodes[i],
+ PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
+
+ /* If we don't have an ra_pa_offset, just set it to 0 */
+ if (result < 0)
+ mpo_mblock[i].ra_to_pa = 0;
+
+ MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
+ "ra_to_pa = %lx\n", i,
+ mpo_mblock[i].base,
+ mpo_mblock[i].size,
+ mpo_mblock[i].ra_to_pa);
+ }
+
+ /* Must sort mblocks by address for mem_node_iterator_init() */
+ mblock_sort(mpo_mblock, n_mblocks);
+
+ base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
+
+ /* Page coloring hook is required so we can iterate through mnodes */
+ if (&page_next_pfn_for_color_cpu == NULL) {
+ MPO_STATUS("lgrp_traverse: No page coloring support\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Global enable for mpo */
+ if (sun4v_mpo_enable == 0) {
+ MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
+ "fwd", &lgrpnodes);
+
+ if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
+ MPO_STATUS("lgrp_traverse: No Lgroups\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
+
+ if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
+ MPO_STATUS("lgrp_traverse: No CPU nodes detected "
+ "in MD\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
+ MPO_DEBUG("lgrp_traverse: md: %p\n", md);
+ MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
+ MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
+ MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
+ MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ mpo_lgroup[i].node = lgrpnodes[i];
+ mpo_lgroup[i].id = i;
+ mpo_lgroup[i].ncpu = 0;
+ result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
+ &mpo_lgroup[i].addr_mask);
+ result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
+ &mpo_lgroup[i].addr_match);
+
+ /*
+ * If either the mask or match properties are missing, set to 0
+ */
+ if (result < 0) {
+ mpo_lgroup[i].addr_mask = 0;
+ mpo_lgroup[i].addr_match = 0;
+ }
+
+ /* Set latency to 0 if property not present */
+
+ result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
+ &mpo_lgroup[i].latency);
+ if (result < 0)
+ mpo_lgroup[i].latency = 0;
+ }
+
+ /*
+ * Sub-page level interleave is not yet supported. Check for it,
+ * and remove sub-page interleaved lgroups from mpo_lgroup and
+ * n_lgrpnodes. If no lgroups are left, return.
+ */
+
+ sub_page_fix = fix_interleave();
+ if (n_lgrpnodes == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Ensure that all of the addr_mask values are the same */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
+ MPO_STATUS("lgrp_traverse: "
+ "addr_mask values are not the same\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+
+ /*
+ * Ensure that all lgrp nodes see all the mblocks. However, if
+ * sub-page interleave is being fixed, they do not, so skip
+ * the check.
+ */
+
+ if (sub_page_fix == 0) {
+ for (i = 0; i < n_lgrpnodes; i++) {
+ j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
+ PROP_LG_MBLOCK, "fwd", &nodes);
+ md_free_scan_dag(md, &nodes);
+ if (j != n_mblocks) {
+ MPO_STATUS("lgrp_traverse: "
+ "sub-page interleave is being fixed\n");
+ ret_val = -1;
+ goto fail;
+ }
+ }
+ }
+
+ /*
+ * Use the address mask from the first lgroup node
+ * to establish our home_mask.
+ */
+ home_mask = mpo_lgroup[0].addr_mask;
+ home_mask_pfn = btop(home_mask);
+ home_mask_shift = lowbit(home_mask) - 1;
+ home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
+ mnode_pages = btop(1ULL << home_mask_shift);
+
+ /*
+ * How many values are possible in home mask? Assume the mask
+ * bits are contiguous.
+ */
+ max_locality_groups =
+ 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
+
+ /* Now verify the home mask bits are contiguous */
+
+ if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
+ MPO_STATUS("lgrp_traverse: "
+ "home mask bits are not contiguous\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /* Record all of the home bits */
+
+ for (i = 0; i < n_lgrpnodes; i++) {
+ HOMESET_ADD(mem_lg_homeset,
+ mpo_lgroup[i].addr_match >> home_mask_shift);
+ }
+
+	/* Count the number of different "home" mem_lg's we've discovered */
+
+ n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
+
+ /* If we have only 1 locality group then we can exit */
+ if (n_locality_groups == 1) {
+ MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Set the latencies. A CPU's lgroup is defined by the lowest
+ * latency found. All other memory is considered remote, and the
+ * remote latency is represented by the highest latency found.
+ * Thus hierarchical lgroups, if any, are approximated by a
+ * two level scheme.
+ *
+ * The Solaris MPO framework by convention wants to see latencies
+ * in units of nano-sec/10. In the MD, the units are defined to be
+ * pico-seconds.
+ */
+
+ lower_latency = mpo_lgroup[0].latency;
+ higher_latency = mpo_lgroup[0].latency;
+
+ for (i = 1; i < n_lgrpnodes; i++) {
+ if (mpo_lgroup[i].latency < lower_latency) {
+ lower_latency = mpo_lgroup[i].latency;
+ }
+ if (mpo_lgroup[i].latency > higher_latency) {
+ higher_latency = mpo_lgroup[i].latency;
+ }
+ }
+ lower_latency /= 10000;
+ higher_latency /= 10000;
+
+ /* Clear our CPU data */
+
+ for (i = 0; i < NCPU; i++) {
+ mpo_cpu[i].home = 0;
+ mpo_cpu[i].latency = (uint_t)(-1);
+ }
+
+ /* Build the CPU nodes */
+ for (i = 0; i < n_cpunodes; i++) {
+
+ /* Read in the lgroup nodes */
+
+ result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
+ if (result < 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
+ ret_val = -1;
+ goto fail;
+ }
+
+ n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
+ "fwd", &nodes);
+ if (n_lgroups <= 0) {
+ MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
+ ret_val = -1;
+ goto fail;
+ }
+
+ /*
+ * Find the lgroup this cpu belongs to with the lowest latency.
+ * Check all the lgrp nodes connected to this CPU to determine
+ * which has the smallest latency.
+ */
+
+ for (j = 0; j < n_lgroups; j++) {
+ for (o = 0; o < n_lgrpnodes; o++) {
+ if (nodes[j] == mpo_lgroup[o].node) {
+ if (mpo_lgroup[o].latency <
+ mpo_cpu[k].latency) {
+ mpo_cpu[k].home =
+ mpo_lgroup[o].addr_match
+ >> home_mask_shift;
+ mpo_cpu[k].latency =
+ mpo_lgroup[o].latency;
+ mpo_lgroup[o].ncpu++;
+ }
+ }
+ }
+ }
+ md_free_scan_dag(md, &nodes);
+ }
+
+ /* Validate that no large pages cross mnode boundaries. */
+ if (valid_pages(md, cpunodes[0]) == 0) {
+ ret_val = -1;
+ goto fail;
+ }
+
+fail:
+ /* MD cookies are no longer valid; ensure they are not used again. */
+ for (i = 0; i < n_mblocks; i++)
+ mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
+ for (i = 0; i < n_lgrpnodes; i++)
+ mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
+
+ if (n_cpunodes > 0)
+ md_free_scan_dag(md, &cpunodes);
+ if (n_lgrpnodes > 0)
+ md_free_scan_dag(md, &lgrpnodes);
+ if (n_mblocks > 0)
+ md_free_scan_dag(md, &mblocknodes);
+ else
+ panic("lgrp_traverse: No memory blocks found");
+
+ if (ret_val == 0)
+ MPO_STATUS("MPO feature is enabled.\n");
+
+ return (ret_val);
+}
+
+/*
+ * Determine the number of unique mem_lg's present in our system
+ */
+static int
+unique_home_mem_lg_count(uint64_t mem_lg_homeset)
+{
+ int homeid;
+ int count = 0;
+
+ /*
+ * Scan the "home" bits of the mem_lgs, count
+ * the number that are unique.
+ */
+
+ for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
+ if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
+ count++;
+ }
+ }
+
+ MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
+ mem_lg_homeset);
+ MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
+
+ /* Default must be at least one */
+ if (count == 0)
+ count = 1;
+
+ return (count);
+}
+
+/*
+ * Platform specific lgroup initialization
+ */
+void
+plat_lgrp_init(void)
+{
+ md_t *md;
+ int i, rc, ncpu_min;
+
+ /* Get the Machine Descriptor handle */
+
+ md = md_get_handle();
+
+ /* If not, we cannot continue */
+
+ if (md == NULL) {
+ panic("cannot access machine descriptor\n");
+ } else {
+ rc = lgrp_traverse(md);
+ (void) md_fini_handle(md);
+ }
+
+ /*
+ * If we can't process the MD for lgroups then at least let the
+ * system try to boot. Assume we have one lgroup so that
+ * when plat_build_mem_nodes is called, it will attempt to init
+ * an mnode based on the supplied memory segment.
+ */
+
+ if (rc == -1) {
+ home_mask_pfn = 0;
+ max_locality_groups = 1;
+ n_locality_groups = 1;
+ return;
+ }
+
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+
+ /* Use lgroup-aware TSB allocations */
+ tsb_lgrp_affinity = 1;
+
+ /*
+ * lgrp_expand_proc_thresh is the minimum load on the lgroups
+ * this process is currently running on before considering
+ * expanding threads to another lgroup.
+ *
+ * lgrp_expand_proc_diff determines how much less the remote lgroup
+ * must be loaded before expanding to it.
+ *
+ * On sun4v CMT processors, threads share a core pipeline, and
+ * at less than 100% utilization, best throughput is obtained by
+ * spreading threads across more cores, even if some are in a
+ * different lgroup. Spread threads to a new lgroup if the
+ * current group is more than 50% loaded. Because of virtualization,
+ * lgroups may have different numbers of CPUs, but the tunables
+ * apply to all lgroups, so find the smallest lgroup and compute
+ * 50% loading.
+ */
+
+ ncpu_min = NCPU;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ int ncpu = mpo_lgroup[i].ncpu;
+ if (ncpu != 0 && ncpu < ncpu_min)
+ ncpu_min = ncpu;
+ }
+ lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
+
+ /* new home may only be half as loaded as the existing home to use it */
+ lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
+
+ lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
+
+ /* Require that a home lgroup have some memory to be chosen */
+ lgrp_mem_free_thresh = 1;
+
+ /* Standard home-on-next-touch policy */
+ lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
+
+ /* Disable option to choose root lgroup if all leaf lgroups are busy */
+ lgrp_load_thresh = UINT32_MAX;
+}
+
+/*
+ * Helper routine for debugging calls to mem_node_add_slice()
+ */
+static void
+mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
+{
+#if defined(DEBUG) && !defined(lint)
+ static int slice_count = 0;
+
+ slice_count++;
+ MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
+ slice_count, basepfn, endpfn);
+#endif
+ mem_node_add_slice(basepfn, endpfn);
+}
+
+/*
+ * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
+ */
+static void
+mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
+{
+ MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
+ "mnode index: %d\n", plathand, mnode);
+ plat_assign_lgrphand_to_mem_node(plathand, mnode);
+}
+
+/*
+ * plat_build_mem_nodes()
+ *
+ * Define the mem_nodes based on the modified boot memory list,
+ * or based on info read from the MD in plat_lgrp_init().
+ *
+ * When the home mask lies in the middle of the address bits (as it does on
+ * Victoria Falls), then the memory in one mem_node is no longer contiguous;
+ * it is striped across an mblock in a repeating pattern of contiguous memory
+ * followed by a gap. The stripe width is the size of the contiguous piece.
+ * The stride is the distance from the start of one contiguous piece to the
+ * start of the next. The gap is thus stride - stripe_width.
+ *
+ * The stripe of an mnode that falls within an mblock is described by the type
+ * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
+ * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
+ * this array is predetermined. The mem_stripe_t that describes mnode m
+ * within mpo_mblock[i] is stored at
+ * mem_stripes[ m + i * max_locality_groups ]
+ *
+ * max_locality_groups is the total number of possible locality groups,
+ * as defined by the size of the home mask, even if the memory assigned
+ * to the domain is small and does not cover all the lgroups. Thus some
+ * mem_stripe_t's may be empty.
+ *
+ * The members of mem_stripe_t are:
+ * physbase: First valid page in mem_node in the corresponding mblock
+ * physmax: Last valid page in mem_node in mblock
+ * offset: The full stripe width starts at physbase - offset.
+ * Thus if offset is non-zero, this mem_node starts in the middle
+ * of a stripe width, and the second full stripe starts at
+ * physbase - offset + stride. (even though physmax may fall in the
+ * middle of a stripe width, we do not save the ending fragment size
+ * in this data structure.)
+ * exists: Set to 1 if the mblock has memory in this mem_node stripe.
+ *
+ * The stripe width is kept in the global mnode_pages.
+ * The stride is kept in the global mnode_stride.
+ * All the above use pfn's as the unit.
+ *
+ * As an example, the memory layout for a domain with 2 mblocks and 4
+ * mem_nodes 0,1,2,3 could look like this:
+ *
+ * 123012301230 ... 012301230123 ...
+ * mblock 0 mblock 1
+ */
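
A minimal standalone sketch (not part of the patch) of the geometry and indexing described
above, with invented numbers: 4 possible locality groups and a stripe width (mnode_pages)
of 0x8000 pages:

	#include <stdio.h>

	#define MAX_LOCALITY_GROUPS	4		/* from the home mask width */
	#define STRIPE_PAGES		0x8000UL	/* mnode_pages */

	int
	main(void)
	{
		unsigned long stride = MAX_LOCALITY_GROUPS * STRIPE_PAGES;
		unsigned long gap = stride - STRIPE_PAGES;
		int mblock = 1, mnode = 2;
		int index = mnode + mblock * MAX_LOCALITY_GROUPS;

		printf("stride = 0x%lx pages, gap = 0x%lx pages, "
		    "mem_stripes index for mnode %d in mblock %d = %d\n",
		    stride, gap, mnode, mblock, index);
		return (0);
	}

This prints a stride of 0x20000 pages, a gap of 0x18000 pages, and index 6, matching
mem_stripes[ m + i * max_locality_groups ].
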
+
+void
+plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
+{
+ lgrp_handle_t lgrphand, lgrp_start;
+ int i, mnode, elem;
+ uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
+ uint64_t stripe, frag, remove;
+ mem_stripe_t *ms;
+
+ /* Check for non-MPO sun4v platforms */
+
+ if (n_locality_groups <= 1) {
+ mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
+ for (elem = 0; elem < nelems; elem += 2) {
+ base = list[elem];
+ len = list[elem+1];
+
+ mpo_mem_node_add_slice(btop(base),
+ btop(base + len - 1));
+ }
+ mem_node_pfn_shift = 0;
+ mem_node_physalign = 0;
+ n_mem_stripes = 0;
+ return;
+ }
+
+ /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
+ max_mem_nodes = max_locality_groups;
+ bzero(mem_stripes, sizeof (mem_stripes));
+ stripe = ptob(mnode_pages);
+ stride = max_locality_groups * stripe;
+
+ /* Save commonly used values in globals */
+ mnode_stride = btop(stride);
+ n_mem_stripes = max_locality_groups * n_mblocks;
+ stripe_shift = highbit(max_locality_groups) - 1;
+
+ for (i = 0; i < n_mblocks; i++) {
+
+ base = mpo_mblock[i].base;
+ end = mpo_mblock[i].base + mpo_mblock[i].size;
+ ra_to_pa = mpo_mblock[i].ra_to_pa;
+ mpo_mblock[i].base_pfn = btop(base);
+ mpo_mblock[i].end_pfn = btop(end - 1);
+
+ /* Find the offset from the prev stripe boundary in PA space. */
+ offset = (base + ra_to_pa) & (stripe - 1);
+
+ /* Set the next stripe boundary. */
+ stripe_end = base - offset + stripe;
+
+ lgrp_start = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ lgrphand = lgrp_start;
+
+ /*
+ * Loop over all lgroups covered by the mblock, creating a
+ * stripe for each. Stop when lgrp_start is visited again.
+ */
+ do {
+ /* mblock may not span all lgroups */
+ if (base >= end)
+ break;
+
+ mnode = lgrphand;
+ ASSERT(mnode < max_mem_nodes);
+
+ /*
+ * Calculate the size of the fragment that does not
+ * belong to the mnode in the last partial stride.
+ */
+ frag = (end - (base - offset)) & (stride - 1);
+ if (frag == 0) {
+ /* remove the gap */
+ remove = stride - stripe;
+ } else if (frag < stripe) {
+ /* fragment fits in stripe; keep it all */
+ remove = 0;
+ } else {
+ /* fragment is large; trim after whole stripe */
+ remove = frag - stripe;
+ }
+
+ ms = &mem_stripes[i * max_locality_groups + mnode];
+ ms->physbase = btop(base);
+ ms->physmax = btop(end - 1 - remove);
+ ms->offset = btop(offset);
+ ms->exists = 1;
+
+ mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
+ mpo_mem_node_add_slice(ms->physbase, ms->physmax);
+
+ base = stripe_end;
+ stripe_end += stripe;
+ offset = 0;
+ lgrphand = (((base + ra_to_pa) & home_mask) >>
+ home_mask_shift);
+ } while (lgrphand != lgrp_start);
+ }
+
+ /*
+ * Indicate to vm_pagelist that the hpm_counters array
+ * should be shared because the ranges overlap.
+ */
+ if (max_mem_nodes > 1) {
+ interleaved_mnodes = 1;
+ }
+}
+
+/*
+ * Return the locality group value for the supplied processor
+ */
+lgrp_handle_t
+plat_lgrp_cpu_to_hand(processorid_t id)
+{
+ if (n_locality_groups > 1) {
+ return ((lgrp_handle_t)mpo_cpu[(int)id].home);
+ } else {
+ return ((lgrp_handle_t)0); /* Default */
+ }
+}
+
+int
+plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
+{
+ /*
+ * Return min remote latency when there are more than two lgroups
+ * (root and child) and getting latency between two different lgroups
+ * or root is involved.
+ */
+ if (lgrp_optimizations() && (from != to ||
+ from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
+ return ((int)higher_latency);
+ } else {
+ return ((int)lower_latency);
+ }
+}
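
A brief worked example of the latency handling (values invented): if the MD reports
120000 ps for a CPU's local lgroup and 420000 ps for the remote one, lgrp_traverse()
leaves lower_latency = 12 and higher_latency = 42 after the divide by 10000 (120 ns and
420 ns in the ns/10 units the framework expects), so with lgrp_optimizations() enabled
plat_lgrp_latency() returns 12 when a leaf lgroup is asked about itself and 42 for any
cross-lgroup or root query.
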
+
+int
+plat_pfn_to_mem_node(pfn_t pfn)
+{
+ int i, mnode;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ if (n_locality_groups <= 1)
+ return (0);
+
+ /*
+ * The mnode is defined to be 1:1 with the lgroup handle, which
+	 * is taken from the home bits.  Find the mblock in which
+ * the pfn falls to get the ra_to_pa adjustment, and extract
+ * the home bits.
+ */
+ mb = &mpo_mblock[0];
+ for (i = 0; i < n_mblocks; i++) {
+ if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
+ home_mask_pfn_shift);
+ ASSERT(mnode < max_mem_nodes);
+ return (mnode);
+ }
+ mb++;
+ }
+
+ panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_rapfn_to_papfn
+ *
+ * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
+ * and home mask bits are correct. The upper bits do not necessarily
+ * match the actual PA, however.
+ */
+pfn_t
+plat_rapfn_to_papfn(pfn_t pfn)
+{
+ int i;
+ pfn_t ra_to_pa_pfn;
+ struct mblock_md *mb;
+
+ ASSERT(n_mblocks > 0);
+ if (n_mblocks == 1)
+ return (pfn + base_ra_to_pa_pfn);
+
+ /*
+ * Find the mblock in which the pfn falls
+ * in order to get the ra_to_pa adjustment.
+ */
+ for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
+ if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
+ ra_to_pa_pfn = btop(mb->ra_to_pa);
+ return (pfn + ra_to_pa_pfn);
+ }
+ }
+
+ panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
+ return (pfn);
+}
+
+/*
+ * plat_mem_node_iterator_init()
+ * Initialize cookie to iterate over pfn's in an mnode. There is
+ * no additional iterator function. The caller uses the info from
+ * the iterator structure directly.
+ *
+ * pfn: starting pfn.
+ * mnode: desired mnode.
+ * init: set to 1 for full init, 0 for continuation
+ *
+ * Returns the appropriate starting pfn for the iteration,
+ *	which is the same as the input pfn if it falls in an mblock.
+ * Returns the (pfn_t)-1 value if the input pfn lies past
+ * the last valid mnode pfn.
+ */
+pfn_t
+plat_mem_node_iterator_init(pfn_t pfn, int mnode,
+ mem_node_iterator_t *it, int init)
+{
+ int i;
+ struct mblock_md *mblock;
+ pfn_t base, end;
+
+ ASSERT(it != NULL);
+ ASSERT(mnode >= 0 && mnode < max_mem_nodes);
+ ASSERT(n_mblocks > 0);
+
+ if (init) {
+ it->mi_last_mblock = 0;
+ it->mi_init = 1;
+ }
+
+ /* Check if mpo is not enabled and we only have one mblock */
+ if (n_locality_groups == 1 && n_mblocks == 1) {
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = base_ra_to_pa_pfn;
+ it->mi_mnode_pfn_mask = 0;
+ it->mi_mnode_pfn_shift = 0;
+ it->mi_mnode_mask = 0;
+ it->mi_mblock_base = mem_node_config[mnode].physbase;
+ it->mi_mblock_end = mem_node_config[mnode].physmax;
+ if (pfn < it->mi_mblock_base)
+ pfn = it->mi_mblock_base;
+ else if (pfn > it->mi_mblock_end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+ }
+
+ /*
+ * Find mblock that contains pfn, or first mblock after pfn,
+ * else pfn is out of bounds, so use the last mblock.
+ * mblocks are sorted in ascending address order.
+ */
+ ASSERT(it->mi_last_mblock < n_mblocks);
+ ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
+ i = init ? 0 : it->mi_last_mblock + 1;
+ if (i == n_mblocks)
+ return ((pfn_t)-1);
+
+ for (; i < n_mblocks; i++) {
+ if (pfn <= mpo_mblock[i].end_pfn)
+ break;
+ }
+ if (i == n_mblocks) {
+ it->mi_last_mblock = i - 1;
+ return ((pfn_t)-1);
+ }
+ it->mi_last_mblock = i;
+
+ /*
+ * Memory stripes are defined if there is more than one locality
+ * group, so use the stripe bounds. Otherwise use mblock bounds.
+ */
+ mblock = &mpo_mblock[i];
+ if (n_mem_stripes > 0) {
+ mem_stripe_t *ms =
+ &mem_stripes[i * max_locality_groups + mnode];
+ base = ms->physbase;
+ end = ms->physmax;
+ } else {
+ ASSERT(mnode == 0);
+ base = mblock->base_pfn;
+ end = mblock->end_pfn;
+ }
+
+ it->mi_mnode = mnode;
+ it->mi_ra_to_pa = btop(mblock->ra_to_pa);
+ it->mi_mblock_base = base;
+ it->mi_mblock_end = end;
+ it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
+ it->mi_mnode_pfn_shift = home_mask_pfn_shift;
+ it->mi_mnode_mask = max_locality_groups - 1;
+ if (pfn < base)
+ pfn = base;
+ else if (pfn > end)
+ pfn = (pfn_t)-1;
+ return (pfn);
+}
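
A hedged sketch of how a caller uses the cookie (illustrative only; page_next_pfn_for_color_cpu()
earlier in this patch is the real consumer, and the helper name below is hypothetical):

	mem_node_iterator_t it;
	pfn_t pfn;

	/* full init: clamp start_pfn into the mnode's first usable stripe */
	pfn = plat_mem_node_iterator_init(start_pfn, mnode, &it, 1);
	if (pfn != (pfn_t)-1) {
		/*
		 * pfn .. it.mi_mblock_end is the usable RA range in this
		 * mblock; adding it.mi_ra_to_pa yields PA-accurate coloring
		 * bits, and it.mi_mnode_pfn_mask/shift recover the home bits.
		 */
		process_range(pfn, it.mi_mblock_end);	/* hypothetical */
	}
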
+
+/*
+ * plat_mem_node_intersect_range()
+ *
+ * Find the intersection between a memnode and a range of pfn's.
+ */
+void
+plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
+ int mnode, pgcnt_t *npages_out)
+{
+ pfn_t offset, len, hole, base, end, test_end, frag;
+ pfn_t nearest;
+ mem_stripe_t *ms;
+ int i, npages;
+
+ *npages_out = 0;
+
+ if (!mem_node_config[mnode].exists || test_len == 0)
+ return;
+
+ base = mem_node_config[mnode].physbase;
+ end = mem_node_config[mnode].physmax;
+
+ test_end = test_base + test_len - 1;
+ if (end < test_base || base > test_end)
+ return;
+
+ if (n_locality_groups == 1) {
+ *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
+ return;
+ }
+
+ hole = mnode_stride - mnode_pages;
+ npages = 0;
+
+ /*
+ * Iterate over all the stripes for this mnode (one per mblock),
+ * find the intersection with each, and accumulate the intersections.
+ *
+	 * Determining the intersection with a stripe is tricky.  If base or end
+ * fall outside the mem_node bounds, round them to physbase/physmax of
+ * mem_node. If base or end fall in a gap, round them to start of
+ * nearest stripe. If they fall within a stripe, keep base or end,
+ * but calculate the fragment size that should be excluded from the
+ * stripe. Calculate how many strides fall in the adjusted range,
+ * multiply by stripe width, and add the start and end fragments.
+ */
+
+ for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
+ ms = &mem_stripes[i];
+ if (ms->exists &&
+ test_base <= (end = ms->physmax) &&
+ test_end >= (base = ms->physbase)) {
+
+ offset = ms->offset;
+
+ if (test_base > base) {
+ /* Round test_base to next multiple of stride */
+ len = P2ROUNDUP(test_base - (base - offset),
+ mnode_stride);
+ nearest = base - offset + len;
+ /*
+ * Compute distance from test_base to the
+ * stride boundary to see if test_base falls
+ * in the stripe or in the hole.
+ */
+ if (nearest - test_base > hole) {
+ /*
+ * test_base lies in stripe,
+ * and offset should be excluded.
+ */
+ offset = test_base -
+ (nearest - mnode_stride);
+ base = test_base;
+ } else {
+ /* round up to next stripe start */
+ offset = 0;
+ base = nearest;
+ if (base > end)
+ continue;
+ }
+
+ }
+
+ if (test_end < end)
+ end = test_end;
+ end++; /* adjust to an exclusive bound */
+
+ /* Round end to next multiple of stride */
+ len = P2ROUNDUP(end - (base - offset), mnode_stride);
+ nearest = (base - offset) + len;
+ if (nearest - end <= hole) {
+ /* end falls in hole, use entire last stripe */
+ frag = 0;
+ } else {
+ /* end falls in stripe, compute fragment */
+ frag = nearest - hole - end;
+ }
+
+ len = (len >> stripe_shift) - offset - frag;
+ npages += len;
+ }
+ }
+
+ *npages_out = npages;
+}
+
+/*
+ * valid_pages()
+ *
+ * Return 1 if pages are valid and do not cross mnode boundaries
+ * (which would break page free list assumptions), and 0 otherwise.
+ */
+
+#define MNODE(pa) \
+ ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
+
+static int
+valid_pages(md_t *md, mde_cookie_t cpu0)
+{
+ int i, max_szc;
+ uint64_t last_page_base, szc_mask;
+ uint64_t max_page_len, max_coalesce_len;
+ struct mblock_md *mb = mpo_mblock;
+
+ /*
+ * Find the smaller of the largest page possible and supported.
+ * mmu_exported_pagesize_mask is not yet initialized, so read
+ * it from the MD. Apply minimal fixups in case of broken MDs
+ * to get a sane mask.
+ */
+
+ if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
+ szc_mask = 0;
+ szc_mask |= (1 << TTE4M); /* largest in sun4v default support */
+ max_szc = highbit(szc_mask) - 1;
+ if (max_szc > TTE256M)
+ max_szc = TTE256M;
+ max_page_len = TTEBYTES(max_szc);
+
+ /*
+ * Page coalescing code coalesces all sizes up to 256M on sun4v, even
+ * if mmu-page-size-list does not contain it, so 256M pages must fall
+ * within one mnode to use MPO.
+ */
+ max_coalesce_len = TTEBYTES(TTE256M);
+ ASSERT(max_coalesce_len >= max_page_len);
+
+ if (ptob(mnode_pages) < max_coalesce_len) {
+ MPO_STATUS("Page too large; MPO disabled: page = %lx, "
+ "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
+ return (0);
+ }
+
+ for (i = 0; i < n_mblocks; i++) {
+ uint64_t base = mb->base;
+ uint64_t end = mb->base + mb->size - 1;
+ uint64_t ra_to_pa = mb->ra_to_pa;
+
+ /*
+ * If mblock is smaller than the max page size, then
+ * RA = PA mod MAXPAGE is not guaranteed, but it must
+ * not span mnodes.
+ */
+ if (mb->size < max_page_len) {
+ if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
+ MPO_STATUS("Small mblock spans mnodes; "
+ "MPO disabled: base = %lx, end = %lx, "
+ "ra2pa = %lx\n", base, end, ra_to_pa);
+ return (0);
+ }
+ } else {
+ /* Verify RA = PA mod MAXPAGE, using coalesce size */
+ uint64_t pa_base = base + ra_to_pa;
+ if ((base & (max_coalesce_len - 1)) !=
+ (pa_base & (max_coalesce_len - 1))) {
+ MPO_STATUS("bad page alignment; MPO disabled: "
+ "ra = %lx, pa = %lx, pagelen = %lx\n",
+ base, pa_base, max_coalesce_len);
+ return (0);
+ }
+ }
+
+ /*
+ * Find start of last large page in mblock in RA space.
+ * If page extends into the next mblock, verify the
+ * mnode does not change.
+ */
+ last_page_base = P2ALIGN(end, max_coalesce_len);
+ if (i + 1 < n_mblocks &&
+ last_page_base + max_coalesce_len > mb[1].base &&
+ MNODE(last_page_base + ra_to_pa) !=
+ MNODE(mb[1].base + mb[1].ra_to_pa)) {
+ MPO_STATUS("Large page spans mblocks; MPO disabled: "
+ "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
+ "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
+ mb[1].ra_to_pa, max_coalesce_len);
+ return (0);
+ }
+
+ mb++;
+ }
+ return (1);
+}
+
+
+/*
+ * fix_interleave() - Find lgroups with sub-page sized memory interleave,
+ * if any, and remove them. This yields a config where the "coarse
+ * grained" lgroups cover all of memory, even though part of that memory
+ * is fine grain interleaved and does not deliver a purely local memory
+ * latency.
+ *
+ * This function reads and modifies the globals:
+ * mpo_lgroup[], n_lgrpnodes
+ *
+ * Returns 1 if lgroup nodes were removed, 0 otherwise.
+ */
+
+static int
+fix_interleave(void)
+{
+ int i, j;
+ uint64_t mask = 0;
+
+ j = 0;
+ for (i = 0; i < n_lgrpnodes; i++) {
+ if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
+ /* remove this lgroup */
+ mask = mpo_lgroup[i].addr_mask;
+ } else {
+ mpo_lgroup[j++] = mpo_lgroup[i];
+ }
+ }
+ n_lgrpnodes = j;
+
+ if (mask != 0)
+ MPO_STATUS("sub-page interleave %lx found; "
+ "removing lgroup.\n", mask);
+
+ return (mask != 0);
+}
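
The sub-page test above reduces to checking whether any home-mask bits fall below the
base page size; a small standalone illustration (not part of the patch; sun4v 8K base
pages and invented masks assumed):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGESIZE	8192UL		/* sun4v base page */
	#define PAGEOFFSET	(PAGESIZE - 1)

	int
	main(void)
	{
		uint64_t fine_mask   = 0x0000000400ULL;	/* ~1 KB interleave: sub-page */
		uint64_t coarse_mask = 0x0100000000ULL;	/* ~4 GB interleave: ok */

		printf("fine mask sub-page?   %d\n", (fine_mask & PAGEOFFSET) != 0);
		printf("coarse mask sub-page? %d\n", (coarse_mask & PAGEOFFSET) != 0);
		return (0);
	}
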
diff --git a/usr/src/uts/sun4v/sys/cpu_module.h b/usr/src/uts/sun4v/sys/cpu_module.h
index 0786951416..2d7c909a0f 100644
--- a/usr/src/uts/sun4v/sys/cpu_module.h
+++ b/usr/src/uts/sun4v/sys/cpu_module.h
@@ -146,11 +146,12 @@ extern void bzero(void *addr, size_t count);
int cpu_trapstat_conf(int cmd);
void cpu_trapstat_data(void *buf, uint_t pgszs);
-
+/* Used by the fill_cpu() function */
#define NO_MAPPING_FOUND 0xffffffff
#define NO_EU_MAPPING_FOUND NO_MAPPING_FOUND
#define NO_CHIP_MAPPING_FOUND NO_MAPPING_FOUND
#define NO_CORE_MAPPING_FOUND NO_MAPPING_FOUND
+#define NO_L2_CACHE_MAPPING_FOUND NO_MAPPING_FOUND
/*
* Default MMU pagesize mask for sun4v architecture.
*/
diff --git a/usr/src/uts/sun4v/sys/machcpuvar.h b/usr/src/uts/sun4v/sys/machcpuvar.h
index 921d6c584d..2113747d55 100644
--- a/usr/src/uts/sun4v/sys/machcpuvar.h
+++ b/usr/src/uts/sun4v/sys/machcpuvar.h
@@ -104,6 +104,7 @@ typedef struct ptl1_state {
*/
#define CPU_CHIPID_INVALID -1
#define CPU_COREID_INVALID -1
+#define CPU_L2_CACHEID_INVALID -1
/*
* Machine specific fields of the cpu struct
@@ -177,6 +178,7 @@ struct machcpu {
uint16_t *cpu_list; /* uint16_t [NCPU] */
uint64_t cpu_list_ra; /* cpu list ra */
id_t cpu_ipipe; /* cpu int exec unit id */
+ id_t cpu_mpipe; /* cpu memory pipe id */
id_t cpu_fpu; /* cpu fpu unit id */
id_t cpu_core; /* cpu core id */
id_t cpu_chip; /* cpu chip id */
diff --git a/usr/src/uts/sun4v/sys/mpo.h b/usr/src/uts/sun4v/sys/mpo.h
new file mode 100644
index 0000000000..e390b5e483
--- /dev/null
+++ b/usr/src/uts/sun4v/sys/mpo.h
@@ -0,0 +1,112 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MPO_H
+#define _SYS_MPO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * mpo.h - Sun4v MPO common header file
+ *
+ */
+#define MPO_MAX_MBLOCKS 16
+#define MAX_MEM_STRIPES (MAX_MEM_NODES * MPO_MAX_MBLOCKS)
+
+#define PROP_LG_CPU_ID "id"
+#define PROP_LG_MASK "address-mask"
+#define PROP_LG_LATENCY "latency"
+#define PROP_LG_MATCH "address-match"
+#define PROP_LG_MEM_LG "memory-latency-group"
+#define PROP_LG_CPU "cpu"
+#define PROP_LG_MBLOCK "mblock"
+#define PROP_LG_BASE "base"
+#define PROP_LG_SIZE "size"
+#define PROP_LG_RA_PA_OFFSET "address-congruence-offset"
+
+/* Macro to set the corresponding bit if a mem-lg homeid is a member */
+#define HOMESET_ADD(homeset, home)\
+ homeset |= ((int)1 << (home))
+
+/* Macro to check if a mem_lg homeid is a member of the homeset */
+#define MEM_LG_ISMEMBER(homeset, home)\
+ ((homeset) & ((uint64_t)1 << (home)))
+
+/* Structure to store CPU information from the MD */
+
+struct cpu_md {
+ uint_t home;
+ uint64_t latency;
+};
+
+/* Structure to store mem-lg information from the MD */
+
+struct lgrp_md {
+ uint64_t id;
+ uint64_t addr_mask;
+ uint64_t addr_match;
+ uint64_t latency;
+ mde_cookie_t node;
+ int ncpu;
+};
+
+/* Structure to store mblock information retrieved from the MD */
+
+struct mblock_md {
+ uint64_t base;
+ uint64_t size;
+ uint64_t ra_to_pa;
+ mde_cookie_t node;
+ pfn_t base_pfn;
+ pfn_t end_pfn;
+};
+
+/* Structure for memnode information for use by plat_pfn_to_mem_node */
+
+struct mnode_info {
+ pfn_t base_pfn;
+ pfn_t end_pfn;
+};
+
+/* A stripe defines the portion of a mem_node that falls in one mblock */
+typedef struct {
+ pfn_t physbase; /* first page in mnode in the corresponding mblock */
+ pfn_t physmax; /* last valid page in mnode in mblock */
+ pfn_t offset; /* stripe starts at physbase - offset */
+ int exists; /* set to 1 if mblock has memory in this mnode stripe */
+} mem_stripe_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MPO_H */