Diffstat (limited to 'usr/src/uts/i86pc/os/startup.c')
-rw-r--r--	usr/src/uts/i86pc/os/startup.c | 226
1 file changed, 100 insertions(+), 126 deletions(-)
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 8c1d4e0697..f63973f092 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -319,22 +319,16 @@ static struct seg *segmap = &kmapseg; /* easier to use name for in here */
struct seg *segkp = &kpseg; /* Pageable kernel virtual memory segment */
-#if defined(__amd64)
struct seg kvseg_core; /* Segment used for the core heap */
struct seg kpmseg; /* Segment used for physical mapping */
struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */
-#else
-struct seg *segkpm = NULL; /* Unused on IA32 */
-#endif
caddr_t segkp_base; /* Base address of segkp */
caddr_t segzio_base; /* Base address of segzio */
-#if defined(__amd64)
pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */
-#else
-pgcnt_t segkpsize = 0;
-#endif
-pgcnt_t segziosize = 0; /* size of zio segment in pages */
+caddr_t segkvmm_base;
+pgcnt_t segkvmmsize;
+pgcnt_t segziosize;
/*
* A static DR page_t VA map is reserved that can map the page structures
@@ -455,23 +449,32 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
* | Kernel |
* | heap |
+ * | |
+ * | |
* 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating)
* | segmap |
* 0xFFFFFXXX.XXX00000 |-----------------------|- segmap_start (floating)
* | device mappings |
* 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating)
- * | segzio |
+ * | segzio |
* 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
- * | segkp |
- * --- |-----------------------|- segkp_base (floating)
+ * | segkvmm |
+ * | |
+ * | |
+ * | |
+ * 0xFFFFFXXX.XXX00000 |-----------------------|- segkvmm_base (floating)
+ * | segkp |
+ * |-----------------------|- segkp_base (floating)
* | page_t structures | valloc_base + valloc_sz
* | memsegs, memlists, |
* | page hash, etc. |
- * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB)
+ * 0xFFFFFE00.00000000 |-----------------------|- valloc_base (lower if >256GB)
* | segkpm |
- * 0xFFFFFE00.00000000 |-----------------------|
+ * | |
+ * 0xFFFFFD00.00000000 |-----------------------|- SEGKPM_BASE (lower if >256GB)
* | Red Zone |
- * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
+ * 0xFFFFFC80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
+ * 0xFFFFFC7F.FFE00000 |-----------------------|- USERLIMIT (lower if >256GB)
* | User stack |- User space memory
* | |
* | shared objects, etc | (grows downwards)
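The constants in this map can be cross-checked with a little arithmetic:
valloc_base - SEGKPM_BASE = 1Tb of room for segkpm, and SEGKPM_BASE -
KERNELBASE = 512Gb for the Red Zone (the span of one top-level PTE, per
the comment removed later in this diff). A standalone sketch using the
addresses from the diagram, ignoring the "(lower if >256GB)" adjustment:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t valloc_base = 0xFFFFFE0000000000ULL;
	uint64_t segkpm_base = 0xFFFFFD0000000000ULL;	/* SEGKPM_BASE */
	uint64_t kernelbase = 0xFFFFFC8000000000ULL;	/* KERNELBASE */

	/* 1Tb window for segkpm, below valloc_base: prints 1024 GiB. */
	printf("segkpm room: %llu GiB\n",
	    (unsigned long long)((valloc_base - segkpm_base) >> 30));
	/* Red Zone between KERNELBASE and SEGKPM_BASE: prints 512 GiB. */
	printf("red zone:    %llu GiB\n",
	    (unsigned long long)((segkpm_base - kernelbase) >> 30));
	return (0);
}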
@@ -697,6 +700,7 @@ startup_smap(void)
uint32_t inst;
uint8_t *instp;
char sym[128];
+ struct modctl *modp;
extern int _smap_enable_patch_count;
extern int _smap_disable_patch_count;
@@ -730,8 +734,15 @@ startup_smap(void)
hot_patch_kernel_text((caddr_t)instp, inst, 4);
}
- hot_patch_kernel_text((caddr_t)smap_enable, SMAP_CLAC_INSTR, 4);
- hot_patch_kernel_text((caddr_t)smap_disable, SMAP_STAC_INSTR, 4);
+ /*
+ * Hotinline the calls to smap_enable() and smap_disable() within the
+ * unix module. Hotinlines in other modules are done on mod_load().
+ */
+ modp = mod_hold_by_name("unix");
+ do_hotinlines(modp->mod_mp);
+ mod_release_mod(modp);
+
setcr4(getcr4() | CR4_SMAP);
smap_enable();
}
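clac and stac are 3-byte instructions (0f 01 ca and 0f 01 cb), so each
call site can be rewritten with a single 4-byte store once a nop pad is
added; that is what the SMAP_CLAC_INSTR/SMAP_STAC_INSTR constants used
by the removed lines encode. A minimal sketch of that detail (the helper
below is invented for illustration; in the kernel,
hot_patch_kernel_text() also handles write protection and cross-CPU
instruction coherence, and do_hotinlines() finds the call sites):

#include <stdint.h>
#include <string.h>

/* clac = 0f 01 ca, stac = 0f 01 cb; padded to 4 bytes with nop (90). */
#define	SMAP_CLAC_INSTR	0x90ca010fU
#define	SMAP_STAC_INSTR	0x90cb010fU

/* Hypothetical: overwrite a 4-byte patch site with the inline form. */
static void
patch_site(uint8_t *instp, uint32_t inst)
{
	memcpy(instp, &inst, sizeof (inst));
}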
@@ -1076,22 +1087,9 @@ startup_memlist(void)
PRM_DEBUG(memblocks);
/*
- * Compute maximum physical address for memory DR operations.
- * Memory DR operations are unsupported on xpv or 32bit OSes.
+ * We no longer support any form of memory DR.
*/
-#ifdef __amd64
- if (plat_dr_support_memory()) {
- if (plat_dr_physmax == 0) {
- uint_t pabits = UINT_MAX;
-
- cpuid_get_addrsize(CPU, &pabits, NULL);
- plat_dr_physmax = btop(1ULL << pabits);
- }
- if (plat_dr_physmax > PHYSMEM_MAX64)
- plat_dr_physmax = PHYSMEM_MAX64;
- } else
-#endif
- plat_dr_physmax = 0;
+ plat_dr_physmax = 0;
/*
* Examine the bios reserved memory to find out:
@@ -1252,68 +1250,55 @@ startup_memlist(void)
pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
-#if defined(__amd64)
valloc_sz = ROUND_UP_LPAGE(valloc_sz);
valloc_base = VALLOC_BASE;
/*
- * The default values of VALLOC_BASE and SEGKPM_BASE should work
- * for values of physmax up to 256GB (1/4 TB). They need adjusting when
- * memory is at addresses above 256GB. When adjusted, segkpm_base must
- * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
+ * The significant memory-sized regions are roughly sized as follows in
+ * the default layout with max physmem:
+ * segkpm: 1x physmem allocated (but 1Tb room, below VALLOC_BASE)
+ * segzio: 1.5x physmem
+ * segkvmm: 4x physmem
+ * heap: whatever's left up to COREHEAP_BASE, at least 1.5x physmem
*
- * In the general case (>256GB), we use (4 * physmem) for the
- * kernel's virtual addresses, which is divided approximately
- * as follows:
- * - 1 * physmem for segkpm
- * - 1.5 * physmem for segzio
- * - 1.5 * physmem for heap
- * Total: 4.0 * physmem
+ * The idea is that we leave enough room to avoid fragmentation issues,
+ * so we would like the VA arenas to have some extra.
*
- * Note that the segzio and heap sizes are more than physmem so that
- * VA fragmentation does not prevent either of them from being
- * able to use nearly all of physmem. The value of 1.5x is determined
- * experimentally and may need to change if the workload changes.
+ * Ignoring the loose change of segkp, valloc, and such, this means that
+ * since COREHEAP_BASE - VALLOC_BASE = 2Tb, we can accommodate a physmem
+ * of up to about (2Tb / 7.0), rounded down to 256Gb in the check below.
+ *
+ * Note that KPM lives below VALLOC_BASE, but we want to include it in
+ * adjustments, hence the 8 below.
+ *
+ * Beyond 256Gb, we push segkpm_base (and hence kernelbase and
+ * _userlimit) down to accommodate the VA requirements above.
*/
- if (physmax + 1 > mmu_btop(TERABYTE / 4) ||
- plat_dr_physmax > mmu_btop(TERABYTE / 4)) {
- uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
+ if (physmax + 1 > mmu_btop(TERABYTE / 4)) {
+ uint64_t physmem_bytes = mmu_ptob(physmax + 1);
+ uint64_t adjustment = 8 * (physmem_bytes - (TERABYTE / 4));
- if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
- kpm_resv_amount = mmu_ptob(plat_dr_physmax);
- }
+ PRM_DEBUG(adjustment);
/*
- * This is what actually controls the KVA : UVA split.
- * The kernel uses high VA, and this is lowering the
- * boundary, thus increasing the amount of VA for the kernel.
- * This gives the kernel 4 * (amount of physical memory) VA.
- *
- * The maximum VA is UINT64_MAX and we are using
- * 64-bit 2's complement math, so e.g. if you have 512GB
- * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
- * UINT64_MAX - 2TB (approximately). So the kernel's
- * VA is [UINT64_MAX-2TB to UINT64_MAX].
+ * segkpm_base is always aligned on an L3 PTE boundary.
*/
- segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount),
- KERNEL_REDZONE_SIZE));
+ segkpm_base -= P2ROUNDUP(adjustment, KERNEL_REDZONE_SIZE);
- /* make sure we leave some space for user apps above hole */
+ /*
+ * But make sure we leave some space for user apps above the hole.
+ */
segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
- if (segkpm_base > SEGKPM_BASE)
- segkpm_base = SEGKPM_BASE;
- PRM_DEBUG(segkpm_base);
- valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG);
+ ASSERT(segkpm_base <= SEGKPM_BASE);
+
+ valloc_base = segkpm_base + P2ROUNDUP(physmem_bytes, ONE_GIG);
if (valloc_base < segkpm_base)
panic("not enough kernel VA to support memory size");
- PRM_DEBUG(valloc_base);
}
-#else /* __i386 */
- valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz);
- valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]);
+
+ PRM_DEBUG(segkpm_base);
PRM_DEBUG(valloc_base);
-#endif /* __i386 */
/*
* do all the initial allocations
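A worked example of the adjustment above: on a hypothetical machine with
1Tb of physical memory, adjustment = 8 * (1Tb - 0.25Tb) = 6Tb, so
segkpm_base drops from 0xFFFFFD00.00000000 to 0xFFFFF700.00000000. A
standalone check of that arithmetic (KERNEL_REDZONE_SIZE is taken to be
512Gb, the span of a top-level PTE per the removed comment, and the
AMD64_VA_HOLE_END clamp is omitted):

#include <stdio.h>
#include <stdint.h>

#define	TERABYTE	(1ULL << 40)
#define	GIGABYTE	(1ULL << 30)
#define	P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int
main(void)
{
	uint64_t segkpm_base = 0xFFFFFD0000000000ULL;	/* SEGKPM_BASE */
	uint64_t redzone = 512 * GIGABYTE;	/* assumed redzone size */
	uint64_t physmem_bytes = 1 * TERABYTE;	/* hypothetical machine */

	if (physmem_bytes > TERABYTE / 4) {
		/* 8x: segkpm (1) + segzio (1.5) + segkvmm (4) + heap (1.5). */
		uint64_t adjustment = 8 * (physmem_bytes - TERABYTE / 4);

		segkpm_base -= P2ROUNDUP(adjustment, redzone);
	}
	/* Prints segkpm_base = 0xfffff70000000000, 6Tb below the default. */
	printf("segkpm_base = 0x%016llx\n", (unsigned long long)segkpm_base);
	return (0);
}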
@@ -1901,73 +1886,70 @@ protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
}
/*
- *
+ * Establish the final size of the kernel's heap, size of segmap, segkp, etc.
*/
static void
layout_kernel_va(void)
{
- PRM_POINT("layout_kernel_va() starting...");
- /*
- * Establish the final size of the kernel's heap, size of segmap,
- * segkp, etc.
- */
+ const size_t physmem_size = mmu_ptob(physmem);
+ size_t size;
-#if defined(__amd64)
+ PRM_POINT("layout_kernel_va() starting...");
kpm_vbase = (caddr_t)segkpm_base;
- if (physmax + 1 < plat_dr_physmax) {
- kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax));
- } else {
- kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
- }
+ kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
panic("not enough room for kpm!");
PRM_DEBUG(kpm_size);
PRM_DEBUG(kpm_vbase);
- /*
- * By default we create a seg_kp in 64 bit kernels, it's a little
- * faster to access than embedding it in the heap.
- */
segkp_base = (caddr_t)valloc_base + valloc_sz;
if (!segkp_fromheap) {
- size_t sz = mmu_ptob(segkpsize);
+ size = mmu_ptob(segkpsize);
/*
* determine size of segkp
*/
- if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) {
- sz = SEGKPDEFSIZE;
+ if (size < SEGKPMINSIZE || size > SEGKPMAXSIZE) {
+ size = SEGKPDEFSIZE;
cmn_err(CE_WARN, "!Illegal value for segkpsize. "
"segkpsize has been reset to %ld pages",
- mmu_btop(sz));
+ mmu_btop(size));
}
- sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem)));
+ size = MIN(size, MAX(SEGKPMINSIZE, physmem_size));
- segkpsize = mmu_btop(ROUND_UP_LPAGE(sz));
+ segkpsize = mmu_btop(ROUND_UP_LPAGE(size));
}
PRM_DEBUG(segkp_base);
PRM_DEBUG(segkpsize);
/*
- * segzio is used for ZFS cached data. It uses a distinct VA
- * segment (from kernel heap) so that we can easily tell not to
- * include it in kernel crash dumps on 64 bit kernels. The trick is
- * to give it lots of VA, but not constrain the kernel heap.
- * We can use 1.5x physmem for segzio, leaving approximately
- * another 1.5x physmem for heap. See also the comment in
- * startup_memlist().
+ * segkvmm: backing for vmm guest memory. Like segzio, we have a
+ * separate segment for two reasons: it makes it easy to skip our pages
+ * on kernel crash dumps, and it helps avoid fragmentation. With this
+ * segment, we're expecting significantly-sized allocations only; we'll
+ * default to 4x the size of physmem.
*/
- segzio_base = segkp_base + mmu_ptob(segkpsize);
+ segkvmm_base = segkp_base + mmu_ptob(segkpsize);
+ size = segkvmmsize != 0 ? mmu_ptob(segkvmmsize) : (physmem_size * 4);
+
+ size = MAX(size, SEGVMMMINSIZE);
+ segkvmmsize = mmu_btop(ROUND_UP_LPAGE(size));
+
+ PRM_DEBUG(segkvmmsize);
+ PRM_DEBUG(segkvmm_base);
+
+ /*
+ * segzio is used for ZFS cached data; we size it at 1.5x physmem.
+ */
+ segzio_base = segkvmm_base + mmu_ptob(segkvmmsize);
if (segzio_fromheap) {
segziosize = 0;
} else {
- size_t physmem_size = mmu_ptob(physmem);
- size_t size = (segziosize == 0) ?
- physmem_size * 3 / 2 : mmu_ptob(segziosize);
+ size = (segziosize != 0) ? mmu_ptob(segziosize) :
+ (physmem_size * 3) / 2;
- if (size < SEGZIOMINSIZE)
- size = SEGZIOMINSIZE;
+ size = MAX(size, SEGZIOMINSIZE);
segziosize = mmu_btop(ROUND_UP_LPAGE(size));
}
PRM_DEBUG(segziosize);
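Taken together, layout_kernel_va() simply stacks the segments upward
from valloc_base + valloc_sz: segkp, then segkvmm, then segzio. A sketch
with illustrative numbers for a hypothetical 256Gb machine (small enough
that no base adjustment applies; the valloc_sz and segkp values below
are assumptions, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define	GIGABYTE	(1ULL << 30)

int
main(void)
{
	uint64_t physmem_size = 256 * GIGABYTE;	/* hypothetical */
	uint64_t valloc_base = 0xFFFFFE0000000000ULL;
	uint64_t valloc_sz = 8 * GIGABYTE;	/* made-up valloc size */
	uint64_t segkp_size = 2 * GIGABYTE;	/* assumed default segkp */

	uint64_t segkp_base = valloc_base + valloc_sz;
	uint64_t segkvmm_base = segkp_base + segkp_size;
	uint64_t segkvmm_size = physmem_size * 4;	/* 4x physmem */
	uint64_t segzio_base = segkvmm_base + segkvmm_size;
	uint64_t segzio_size = physmem_size * 3 / 2;	/* 1.5x physmem */

	printf("segkp   @ 0x%016llx\n", (unsigned long long)segkp_base);
	printf("segkvmm @ 0x%016llx (%llu Gb)\n",
	    (unsigned long long)segkvmm_base,
	    (unsigned long long)(segkvmm_size / GIGABYTE));
	printf("segzio  @ 0x%016llx (%llu Gb)\n",
	    (unsigned long long)segzio_base,
	    (unsigned long long)(segzio_size / GIGABYTE));
	return (0);
}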
@@ -1981,10 +1963,6 @@ layout_kernel_va(void)
ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
PRM_DEBUG(toxic_addr);
segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
-#else /* __i386 */
- segmap_start = ROUND_UP_LPAGE(kernelbase);
-#endif /* __i386 */
- PRM_DEBUG(segmap_start);
/*
* Users can change segmapsize through eeprom. If the variable
@@ -1993,16 +1971,6 @@ layout_kernel_va(void)
*/
segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);
-#if defined(__i386)
- /*
- * 32-bit systems don't have segkpm or segkp, so segmap appears at
- * the bottom of the kernel's address range. Set aside space for a
- * small red zone just below the start of segmap.
- */
- segmap_start += KERNEL_REDZONE_SIZE;
- segmapsize -= KERNEL_REDZONE_SIZE;
-#endif
-
PRM_DEBUG(segmap_start);
PRM_DEBUG(segmapsize);
kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
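ROUND_UP_LPAGE() keeps each of these boundaries aligned for large-page
mappings. A plausible reading of the macro, assuming the common 2Mb x86
large page (the kernel's definition rounds to mmu.level_size[1] rather
than a hard-coded size):

#include <stdint.h>

#define	LPAGE_SIZE	(1UL << 21)	/* assumed 2Mb level-1 page */
#define	ROUND_UP_LPAGE(x) \
	(((uintptr_t)(x) + LPAGE_SIZE - 1) & ~(LPAGE_SIZE - 1))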
@@ -2603,6 +2571,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum)
pp->p_mapping = NULL;
pp->p_embed = 0;
pp->p_share = 0;
+ pp->p_zoneid = ALL_ZONES;
pp->p_mlentry = 0;
}
@@ -2790,11 +2759,16 @@ kvm_init(void)
(void) segkmem_create(&kvseg_core);
}
+ PRM_POINT("attaching segkvmm");
+ (void) seg_attach(&kas, segkvmm_base, mmu_ptob(segkvmmsize), &kvmmseg);
+ (void) segkmem_create(&kvmmseg);
+ segkmem_kvmm_init(segkvmm_base, mmu_ptob(segkvmmsize));
+
if (segziosize > 0) {
PRM_POINT("attaching segzio");
(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
&kzioseg);
- (void) segkmem_zio_create(&kzioseg);
+ (void) segkmem_create(&kzioseg);
/* create zio area covering new segment */
segkmem_zio_init(segzio_base, mmu_ptob(segziosize));