Diffstat (limited to 'usr/src/uts/i86pc')
-rw-r--r--  usr/src/uts/i86pc/Makefile.i86pc              1
-rw-r--r--  usr/src/uts/i86pc/dboot/dboot_printf.c        4
-rw-r--r--  usr/src/uts/i86pc/dboot/dboot_printf.h       13
-rw-r--r--  usr/src/uts/i86pc/dboot/dboot_startkern.c   422
-rw-r--r--  usr/src/uts/i86pc/dboot/dboot_xboot.h        12
-rw-r--r--  usr/src/uts/i86pc/io/mp_platform_common.c     1
-rw-r--r--  usr/src/uts/i86pc/io/psm/psm_common.c         1
-rw-r--r--  usr/src/uts/i86pc/ml/kpti_trampolines.s       2
-rw-r--r--  usr/src/uts/i86pc/ml/offsets.in               1
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm_amd64.s    166
-rw-r--r--  usr/src/uts/i86pc/os/cpr_impl.c              19
-rw-r--r--  usr/src/uts/i86pc/os/ibft.c                   6
-rw-r--r--  usr/src/uts/i86pc/os/lgrpplat.c              14
-rw-r--r--  usr/src/uts/i86pc/os/startup.c                1
-rw-r--r--  usr/src/uts/i86pc/os/trap.c                  12
-rw-r--r--  usr/src/uts/i86pc/sys/apic.h                  2
-rw-r--r--  usr/src/uts/i86pc/sys/comm_page.h             1
-rw-r--r--  usr/src/uts/i86pc/sys/vm_machparam.h          6
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.c               78
-rw-r--r--  usr/src/uts/i86pc/vm/hment.c                  7
-rw-r--r--  usr/src/uts/i86pc/vm/vm_machdep.c             8
21 files changed, 594 insertions, 183 deletions
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc
index bed1885700..a398d741dc 100644
--- a/usr/src/uts/i86pc/Makefile.i86pc
+++ b/usr/src/uts/i86pc/Makefile.i86pc
@@ -24,6 +24,7 @@
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2013 Andrew Stormont. All rights reserved.
+# Copyright 2020 Joyent, Inc.
# Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
# Copyright 2019 Joyent, Inc.
#
diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.c b/usr/src/uts/i86pc/dboot/dboot_printf.c
index 9d02c1943a..59d4e247f0 100644
--- a/usr/src/uts/i86pc/dboot/dboot_printf.c
+++ b/usr/src/uts/i86pc/dboot/dboot_printf.c
@@ -203,6 +203,10 @@ unsigned_num:
dboot_putnum(x, B_FALSE, base);
break;
+ case 'z':
+ size = sizeof (size_t);
+ goto again;
+
default:
dboot_puts("dboot_printf(): unknown % escape\n");
}
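
The new 'z' case is a length modifier rather than a conversion: it widens the operand to sizeof (size_t) and jumps back for the conversion character, so it composes with the existing x/d handling. A usage fragment as it might appear inside dboot (the variable is hypothetical):

	size_t map_len = 0x2000;	/* hypothetical */

	/* %zx prints a size_t without casting it to a fixed width */
	dboot_printf("mapped 0x%zx bytes\n", map_len);
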
diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.h b/usr/src/uts/i86pc/dboot/dboot_printf.h
index 22cf561e51..94b3db92e7 100644
--- a/usr/src/uts/i86pc/dboot/dboot_printf.h
+++ b/usr/src/uts/i86pc/dboot/dboot_printf.h
@@ -22,32 +22,29 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2020 Joyent, Inc.
*/
#ifndef _DBOOT_PRINTF_H
#define _DBOOT_PRINTF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
/*
- * Very primitive printf. This only understands the following simple formats:
- * %%, %c, %s, %d, %ld, %lld, %x, %lx, %llx, %p
+ * Very primitive printf. We mark this as PRINTFLIKE so we can use %z.
*/
-/*PRINTFLIKE1*/
extern void dboot_printf(char *fmt, ...)
- __KPRINTFLIKE(1);
+ __PRINTFLIKE(1);
/*
* Primitive version of panic, prints a message, waits for a keystroke,
* then resets the system
*/
-/*PRINTFLIKE1*/
extern void dboot_panic(char *fmt, ...)
- __KPRINTFLIKE(1);
+ __NORETURN __PRINTFLIKE(1);
#ifdef __cplusplus
diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c
index 6621356133..6654244be2 100644
--- a/usr/src/uts/i86pc/dboot/dboot_startkern.c
+++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c
@@ -75,6 +75,10 @@ extern int have_cpuid(void);
#define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
+#define ULL(v) ((u_longlong_t)(v))
+
+static void *page_alloc(void);
+
/*
* This file contains code that runs to transition us from either a multiboot
* compliant loader (32 bit non-paging) or a XPV domain loader to
@@ -105,7 +109,10 @@ x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
* virtual address.
*/
paddr_t ktext_phys;
-uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */
+/*
+ * Nucleus size is 8Mb, including text, data, and BSS.
+ */
+uint32_t ksize = 2 * FOUR_MEG;
static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */
@@ -115,9 +122,16 @@ static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */
char stack_space[STACK_SIZE];
/*
- * Used to track physical memory allocation
+ * The highest address we build page tables for.
*/
-static paddr_t next_avail_addr = 0;
+static paddr_t boot_map_end;
+
+/*
+ * The dboot allocator. This is a small area we use for allocating the
+ * kernel nucleus and pages for the identity page tables we build here.
+ */
+static paddr_t alloc_addr;
+static paddr_t alloc_end;
#if defined(__xpv)
/*
@@ -127,7 +141,6 @@ static paddr_t next_avail_addr = 0;
* to derive a pfn from a pointer, you subtract mfn_base.
*/
-static paddr_t scratch_end = 0; /* we can't write all of mem here */
static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */
start_info_t *xen_info;
@@ -233,6 +246,12 @@ uint_t map_debug = 0;
static char noname[2] = "-";
+static boolean_t
+ranges_intersect(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2)
+{
+ return (s1 < e2 && e1 >= s2);
+}
+
/*
* Either hypervisor-specific or grub-specific code builds the initial
* memlists. This code does the sort/merge/link for final use.
@@ -288,8 +307,16 @@ sort_physinstall(void)
if (prom_debug) {
dboot_printf("\nFinal memlists:\n");
for (i = 0; i < memlists_used; ++i) {
- dboot_printf("\t%d: addr=%" PRIx64 " size=%"
- PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
+ dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n",
+ i, ULL(memlists[i].addr), ULL(memlists[i].addr +
+ memlists[i].size), ULL(memlists[i].size));
+ }
+
+ dboot_printf("\nBoot modules:\n");
+ for (i = 0; i < bi->bi_module_cnt; i++) {
+ dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n",
+ i, ULL(modules[i].bm_addr), ULL(modules[i].bm_addr +
+ modules[i].bm_size), ULL(modules[i].bm_size));
}
}
@@ -341,6 +368,8 @@ dboot_halt(void)
while (--i)
(void) HYPERVISOR_yield();
(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+ for (;;)
+ ;
}
/*
@@ -427,7 +456,7 @@ set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
- paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
+ paddr_t new_table = (paddr_t)(uintptr_t)page_alloc();
if (level == top_level && level == 2)
*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
@@ -659,18 +688,6 @@ exclude_from_pci(uint64_t start, uint64_t end)
}
}
-/*
- * During memory allocation, find the highest address not used yet.
- */
-static void
-check_higher(paddr_t a)
-{
- if (a < next_avail_addr)
- return;
- next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
- DBG(next_avail_addr);
-}
-
static int
dboot_loader_mmap_entries(void)
{
@@ -687,7 +704,6 @@ dboot_loader_mmap_entries(void)
DBG(mb_info->mmap_addr);
DBG(mb_info->mmap_length);
- check_higher(mb_info->mmap_addr + mb_info->mmap_length);
for (mmap_addr = mb_info->mmap_addr;
mmap_addr < mb_info->mmap_addr +
@@ -894,17 +910,13 @@ build_pcimemlists(void)
}
#if defined(__xpv)
-/*
- * Initialize memory allocator stuff from hypervisor-supplied start info.
- */
static void
-init_mem_alloc(void)
+init_dboot_alloc(void)
{
int local; /* variables needed to find start region */
- paddr_t scratch_start;
xen_memory_map_t map;
- DBG_MSG("Entered init_mem_alloc()\n");
+ DBG_MSG("Entered init_dboot_alloc()\n");
/*
* Free memory follows the stack. There's at least 512KB of scratch
@@ -913,17 +925,17 @@ init_mem_alloc(void)
* allocated last and will be outside the addressible range. We'll
* switch to new page tables before we unpack the kernel
*/
- scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
- DBG(scratch_start);
- scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
- DBG(scratch_end);
+ alloc_addr = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
+ DBG(alloc_addr);
+ alloc_end = RNDUP((paddr_t)alloc_addr + 512 * 1024, TWO_MEG);
+ DBG(alloc_end);
/*
* For paranoia, leave some space between hypervisor data and ours.
* Use 500 instead of 512.
*/
- next_avail_addr = scratch_end - 500 * 1024;
- DBG(next_avail_addr);
+ alloc_addr = alloc_end - 500 * 1024;
+ DBG(alloc_addr);
/*
* The domain builder gives us at most 1 module
@@ -1271,7 +1283,6 @@ process_module(int midx)
char *cmdline = dboot_multiboot_modcmdline(midx);
char *p, *q;
- check_higher(mod_end);
if (prom_debug) {
dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
@@ -1435,7 +1446,6 @@ static void
dboot_process_modules(void)
{
int i, modcount;
- extern char _end[];
DBG_MSG("\nFinding Modules\n");
modcount = dboot_multiboot_modcount();
@@ -1443,11 +1453,11 @@ dboot_process_modules(void)
dboot_panic("Too many modules (%d) -- the maximum is %d.",
modcount, MAX_BOOT_MODULES);
}
+
/*
* search the modules to find the last used address
* we'll build the module list while we're walking through here
*/
- check_higher((paddr_t)(uintptr_t)&_end);
for (i = 0; i < modcount; ++i) {
process_module(i);
modules_used++;
@@ -1462,6 +1472,80 @@ dboot_process_modules(void)
check_images();
}
+#define CORRUPT_REGION_START 0xc700000
+#define CORRUPT_REGION_SIZE 0x100000
+#define CORRUPT_REGION_END (CORRUPT_REGION_START + CORRUPT_REGION_SIZE)
+
+static void
+dboot_add_memlist(uint64_t start, uint64_t end)
+{
+ if (end > max_mem)
+ max_mem = end;
+
+ /*
+ * Well, this is sad. On some systems, there is a region of memory that
+ * can be corrupted until some number of seconds after we have booted.
+ * And the BIOS doesn't tell us that this memory is unsafe to use. And
+ * we don't know how long it's dangerous. So we'll chop out this range
+ * from any memory list that would otherwise be usable. Note that any
+ * system of this type will give us the new-style (0x40) memlist, so we
+ * need not fix up the other path below.
+ *
+ * However, if we're boot-loaded from something that doesn't have a
+ * RICHMOND-16 workaround (which on many systems is just fine), it could
+ * actually use this region for the boot modules; if we remove it from
+ * the memlist, we'll keel over when trying to access the region.
+ *
+ * So, if we see that a module intersects the region, we presume it's
+ * OK.
+ */
+
+ if (find_boot_prop("disable-RICHMOND-16") != NULL)
+ goto out;
+
+ for (uint32_t i = 0; i < bi->bi_module_cnt; i++) {
+ native_ptr_t mod_start = modules[i].bm_addr;
+ native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size;
+
+ if (ranges_intersect(mod_start, mod_end, CORRUPT_REGION_START,
+ CORRUPT_REGION_END)) {
+ if (prom_debug) {
+ dboot_printf("disabling RICHMOND-16 workaround "
+ "due to module #%d: "
+ "name %s addr %lx size %lx\n",
+ i, (char *)(uintptr_t)modules[i].bm_name,
+ (ulong_t)modules[i].bm_addr,
+ (ulong_t)modules[i].bm_size);
+ }
+ goto out;
+ }
+ }
+
+ if (start < CORRUPT_REGION_START && end > CORRUPT_REGION_START) {
+ memlists[memlists_used].addr = start;
+ memlists[memlists_used].size =
+ CORRUPT_REGION_START - start;
+ ++memlists_used;
+ if (end > CORRUPT_REGION_END)
+ start = CORRUPT_REGION_END;
+ else
+ return;
+ }
+
+ if (start >= CORRUPT_REGION_START && start < CORRUPT_REGION_END) {
+ if (end <= CORRUPT_REGION_END)
+ return;
+ start = CORRUPT_REGION_END;
+ }
+
+out:
+ memlists[memlists_used].addr = start;
+ memlists[memlists_used].size = end - start;
+ ++memlists_used;
+ if (memlists_used > MAX_MEMLIST)
+ dboot_panic("too many memlists");
+}
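
The splitting logic reads more easily in isolation. Below is a self-contained sketch of the three cases (range below the window, range straddling it, range inside it); only the 0xc700000/0x100000 window constants come from the diff, the harness itself is illustrative:

#include <stdio.h>
#include <stdint.h>

#define	CR_START	0xc700000ULL
#define	CR_END		0xc800000ULL

static void
add_range(uint64_t start, uint64_t end)
{
	printf("memlist: 0x%llx-0x%llx\n", (unsigned long long)start,
	    (unsigned long long)end);
}

static void
add_memlist(uint64_t start, uint64_t end)
{
	/* Emit the chunk below the corrupt window, if there is one. */
	if (start < CR_START && end > CR_START) {
		add_range(start, CR_START);
		if (end <= CR_END)
			return;		/* nothing usable above it */
		start = CR_END;
	}

	/* A range starting inside the window is clipped or dropped. */
	if (start >= CR_START && start < CR_END) {
		if (end <= CR_END)
			return;
		start = CR_END;
	}

	add_range(start, end);
}

int
main(void)
{
	/* Straddles the window: split into two memlists. */
	add_memlist(0xc000000ULL, 0xd000000ULL);
	/* Entirely inside the window: dropped. */
	add_memlist(0xc710000ULL, 0xc720000ULL);
	return (0);
}
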
+
/*
* We then build the phys_install memlist from the multiboot information.
*/
@@ -1505,13 +1589,7 @@ dboot_process_mmap(void)
*/
switch (type) {
case 1:
- if (end > max_mem)
- max_mem = end;
- memlists[memlists_used].addr = start;
- memlists[memlists_used].size = end - start;
- ++memlists_used;
- if (memlists_used > MAX_MEMLIST)
- dboot_panic("too many memlists");
+ dboot_add_memlist(start, end);
break;
case 2:
rsvdmemlists[rsvdmemlists_used].addr = start;
@@ -1593,21 +1671,15 @@ dboot_multiboot1_highest_addr(void)
return (addr);
}
-static void
+static uint64_t
dboot_multiboot_highest_addr(void)
{
- paddr_t addr;
-
switch (multiboot_version) {
case 1:
- addr = dboot_multiboot1_highest_addr();
- if (addr != (paddr_t)(uintptr_t)NULL)
- check_higher(addr);
+ return (dboot_multiboot1_highest_addr());
break;
case 2:
- addr = dboot_multiboot2_highest_addr(mb2_info);
- if (addr != (paddr_t)(uintptr_t)NULL)
- check_higher(addr);
+ return (dboot_multiboot2_highest_addr(mb2_info));
break;
default:
dboot_panic("Unknown multiboot version: %d\n",
@@ -1617,15 +1689,97 @@ dboot_multiboot_highest_addr(void)
}
/*
- * Walk the boot loader provided information and find the highest free address.
+ * Set up our simple physical memory allocator. This is used to allocate both
+ * the kernel nucleus (ksize) and our page table pages.
+ *
+ * We need to find a contiguous region in the memlists that is below 4Gb (as
+ * we're 32-bit and need to use the addresses), and isn't otherwise in use by
+ * dboot, multiboot allocations, or boot modules. The memlist is sorted and
+ * merged by this point.
+ *
+ * Historically, this code always did the allocations past the end of the
+ * highest used address, even if there was space below. For reasons unclear, if
+ * we don't do this, then we get massive corruption during early kernel boot.
+ *
+ * Note that find_kalloc_start() starts its search at the end of this
+ * allocation.
+ *
+ * This all falls apart horribly on some EFI systems booting under iPXE, where
+ * we end up with boot module allocation such that there is no room between the
+ * highest used address and our 4Gb limit. To that end, we have an iPXE hack
+ * that limits the maximum address used by its allocations in an attempt to give
+ * us room.
*/
static void
-init_mem_alloc(void)
+init_dboot_alloc(void)
{
- DBG_MSG("Entered init_mem_alloc()\n");
+ extern char _end[];
+
+ DBG_MSG("Entered init_dboot_alloc()\n");
+
dboot_process_modules();
dboot_process_mmap();
- dboot_multiboot_highest_addr();
+
+ size_t align = FOUR_MEG;
+
+ /*
+ * We need enough alloc space for the nucleus memory...
+ */
+ size_t size = RNDUP(ksize, align);
+
+ /*
+ * And enough page table pages to cover potentially 4Gb. Each leaf PT
+ * covers 2Mb, so we need a maximum of 2048 pages for those. Next level
+ * up each covers 1Gb, and so on, so we'll just add a little slop (which
+ * gets aligned up anyway).
+ */
+ size += RNDUP(MMU_PAGESIZE * (2048 + 256), align);
+
+ uint64_t start = MAX(dboot_multiboot_highest_addr(),
+ (paddr_t)(uintptr_t)&_end);
+ start = RNDUP(start, align);
+
+ /*
+ * As mentioned above, only start our search after all the boot modules.
+ */
+ for (uint_t i = 0; i < bi->bi_module_cnt; i++) {
+ native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size;
+
+ start = MAX(start, RNDUP(mod_end, MMU_PAGESIZE));
+ }
+
+ uint64_t end = start + size;
+
+ DBG(start);
+ DBG(end);
+
+ for (uint_t i = 0; i < memlists_used; i++) {
+ uint64_t ml_start = memlists[i].addr;
+ uint64_t ml_end = memlists[i].addr + memlists[i].size;
+
+ /*
+ * If we're past our starting point for search, begin at this
+ * memlist.
+ */
+ if (start < ml_start) {
+ start = RNDUP(ml_start, align);
+ end = start + size;
+ }
+
+ if (end >= (uint64_t)UINT32_MAX) {
+ dboot_panic("couldn't find alloc space below 4Gb");
+ }
+
+ if (end < ml_end) {
+ alloc_addr = start;
+ alloc_end = end;
+ DBG(alloc_addr);
+ DBG(alloc_end);
+ return;
+ }
+ }
+
+ dboot_panic("couldn't find alloc space in memlists");
}
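
Setting aside the module and _end clamping, the search loop itself is small enough to lift out. A sketch under a simplified memlist shape (names are illustrative; the real code panics where this returns 0):

#include <stdint.h>
#include <stddef.h>

#define	RNDUP(x, y)	(((x) + ((y) - 1)) & ~((uint64_t)(y) - 1))

typedef struct {
	uint64_t addr;
	uint64_t size;
} memlist_t;

/*
 * Find a [start, start + size) window at or after min_start that fits
 * inside one sorted, merged memlist and stays below 4Gb.
 */
static uint64_t
find_window(const memlist_t *ml, size_t n, uint64_t min_start,
    uint64_t size, uint64_t align)
{
	uint64_t start = RNDUP(min_start, align);
	uint64_t end = start + size;

	for (size_t i = 0; i < n; i++) {
		uint64_t ml_start = ml[i].addr;
		uint64_t ml_end = ml[i].addr + ml[i].size;

		/* Never search below min_start; only skip forward. */
		if (start < ml_start) {
			start = RNDUP(ml_start, align);
			end = start + size;
		}
		if (end >= UINT32_MAX)
			return (0);	/* would cross the 4Gb limit */
		if (end < ml_end)
			return (start);	/* fits inside this memlist */
	}
	return (0);			/* no memlist can hold it */
}
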
static int
@@ -1869,77 +2023,89 @@ print_efi64(EFI_SYSTEM_TABLE64 *efi)
#endif /* !__xpv */
/*
- * Simple memory allocator, allocates aligned physical memory.
- * Note that startup_kernel() only allocates memory, never frees.
- * Memory usage just grows in an upward direction.
+ * Simple memory allocator for aligned physical memory from the area provided by
+ * init_dboot_alloc(). This is a simple bump allocator, and it's never directly
+ * freed by dboot.
*/
static void *
-do_mem_alloc(uint32_t size, uint32_t align)
+dboot_alloc(uint32_t size, uint32_t align)
{
- uint_t i;
- uint64_t best;
- uint64_t start;
- uint64_t end;
+ uint32_t start = RNDUP(alloc_addr, align);
- /*
- * make sure size is a multiple of pagesize
- */
size = RNDUP(size, MMU_PAGESIZE);
- next_avail_addr = RNDUP(next_avail_addr, align);
- /*
- * XXPV fixme joe
- *
- * a really large bootarchive that causes you to run out of memory
- * may cause this to blow up
- */
- /* LINTED E_UNEXPECTED_UINT_PROMOTION */
- best = (uint64_t)-size;
- for (i = 0; i < memlists_used; ++i) {
- start = memlists[i].addr;
-#if defined(__xpv)
- start += mfn_base;
-#endif
- end = start + memlists[i].size;
+ if (start + size > alloc_end) {
+ dboot_panic("%s: couldn't allocate 0x%x bytes aligned 0x%x "
+ "alloc_addr = 0x%llx, alloc_end = 0x%llx", __func__,
+ size, align, (u_longlong_t)alloc_addr,
+ (u_longlong_t)alloc_end);
+ }
- /*
- * did we find the desired address?
- */
- if (start <= next_avail_addr && next_avail_addr + size <= end) {
- best = next_avail_addr;
- goto done;
- }
+ alloc_addr = start + size;
- /*
- * if not is this address the best so far?
- */
- if (start > next_avail_addr && start < best &&
- RNDUP(start, align) + size <= end)
- best = RNDUP(start, align);
+ if (map_debug) {
+ dboot_printf("%s(0x%x, 0x%x) = 0x%x\n", __func__, size,
+ align, start);
}
- /*
- * We didn't find exactly the address we wanted, due to going off the
- * end of a memory region. Return the best found memory address.
- */
-done:
- next_avail_addr = best + size;
-#if defined(__xpv)
- if (next_avail_addr > scratch_end)
- dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
- "0x%lx", (ulong_t)next_avail_addr,
- (ulong_t)scratch_end);
-#endif
- (void) memset((void *)(uintptr_t)best, 0, size);
- return ((void *)(uintptr_t)best);
+ (void) memset((void *)(uintptr_t)start, 0, size);
+ return ((void *)(uintptr_t)start);
}
-void *
-mem_alloc(uint32_t size)
+static void *
+page_alloc(void)
{
- return (do_mem_alloc(size, MMU_PAGESIZE));
+ return (dboot_alloc(MMU_PAGESIZE, MMU_PAGESIZE));
}
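
The allocator pair above is a plain bump allocator over the window found by init_dboot_alloc(). The same shape as a standalone sketch, backed by ordinary heap memory instead of physical pages (names are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define	PAGESIZE	4096
#define	RNDUP(x, y)	(((x) + ((y) - 1)) & ~((uintptr_t)(y) - 1))

static uintptr_t alloc_addr, alloc_end;

static void *
bump_alloc(uint32_t size, uint32_t align)
{
	uintptr_t start = RNDUP(alloc_addr, align);

	size = RNDUP(size, PAGESIZE);
	if (start + size > alloc_end)
		return (NULL);		/* the real code panics instead */
	alloc_addr = start + size;
	memset((void *)start, 0, size);	/* dboot hands out zeroed pages */
	return ((void *)start);
}

int
main(void)
{
	/* Back the "physical" window with heap memory. */
	void *arena = malloc(64 * PAGESIZE);

	alloc_addr = RNDUP((uintptr_t)arena, PAGESIZE);
	alloc_end = (uintptr_t)arena + 64 * PAGESIZE;

	printf("page at %p\n", bump_alloc(PAGESIZE, PAGESIZE));
	printf("page at %p\n", bump_alloc(PAGESIZE, PAGESIZE));
	free(arena);
	return (0);
}
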
+/*
+ * This is where we tell the kernel to start physical allocations from, beyond
+ * the end of our allocation area and all boot modules. It might be beyond 4Gb,
+ * so we can't touch that area ourselves.
+ *
+ * We might set kalloc_start to the end of a memlist; if so make sure we skip it
+ * along to the next one.
+ *
+ * This is making the massive assumption that there is a suitably large area for
+ * kernel allocations past the end of the last boot module and the dboot
+ * allocated region. Worse, we don't have a simple way to assert that is so.
+ */
+static paddr_t
+find_kalloc_start(void)
+{
+ paddr_t kalloc_start = alloc_end;
+ uint_t i;
+
+ for (i = 0; i < bi->bi_module_cnt; i++) {
+ native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size;
+
+ kalloc_start = MAX(kalloc_start, RNDUP(mod_end, MMU_PAGESIZE));
+ }
+
+ boot_map_end = kalloc_start;
+ DBG(boot_map_end);
+
+ for (i = 0; i < memlists_used; i++) {
+ uint64_t ml_start = memlists[i].addr;
+ uint64_t ml_end = memlists[i].addr + memlists[i].size;
+
+ if (kalloc_start >= ml_end)
+ continue;
+
+ if (kalloc_start < ml_start)
+ kalloc_start = ml_start;
+ break;
+ }
+
+ if (i == memlists_used) {
+ dboot_panic("fell off the end of memlists finding a "
+ "kalloc_start value > 0x%llx", (u_longlong_t)kalloc_start);
+ }
+
+ DBG(kalloc_start);
+
+ return (kalloc_start);
+}
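
The memlist walk at the end of find_kalloc_start() clamps a candidate address up to the first memlist that can still hold it, skipping any hole it may have landed in. A sketch of just that step (same simplified memlist shape as the earlier example):

#include <stdint.h>
#include <stddef.h>

typedef struct {
	uint64_t addr;
	uint64_t size;
} memlist_t;

static uint64_t
clamp_to_memlists(const memlist_t *ml, size_t n, uint64_t addr)
{
	for (size_t i = 0; i < n; i++) {
		uint64_t ml_start = ml[i].addr;
		uint64_t ml_end = ml[i].addr + ml[i].size;

		if (addr >= ml_end)
			continue;	/* entirely past this memlist */
		if (addr < ml_start)
			addr = ml_start; /* landed in a hole */
		return (addr);
	}
	return (0);	/* fell off the end; the real code panics */
}
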
/*
* Build page tables to map all of memory used so far as well as the kernel.
@@ -1962,7 +2128,7 @@ build_page_tables(void)
#if defined(__xpv)
top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
- top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
+ top_page_table = (paddr_t)(uintptr_t)page_alloc();
#endif /* __xpv */
DBG((uintptr_t)top_page_table);
@@ -1988,7 +2154,7 @@ build_page_tables(void)
/*
* The kernel will need a 1 page window to work with page tables
*/
- bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
+ bi->bi_pt_window = (native_ptr_t)(uintptr_t)page_alloc();
DBG(bi->bi_pt_window);
bi->bi_pte_to_pt_window =
(native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
@@ -2029,6 +2195,10 @@ build_page_tables(void)
#if !defined(__xpv)
+ /*
+ * Map every valid memlist address up until boot_map_end: this will
+ * cover at least our alloc region and all boot modules.
+ */
for (i = 0; i < memlists_used; ++i) {
start = memlists[i].addr;
end = start + memlists[i].size;
@@ -2036,11 +2206,11 @@ build_page_tables(void)
if (map_debug)
dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
start, end);
- while (start < end && start < next_avail_addr) {
+ while (start < end && start < boot_map_end) {
map_pa_at_va(start, start, 0);
start += MMU_PAGESIZE;
}
- if (start >= next_avail_addr)
+ if (start >= boot_map_end)
break;
}
@@ -2302,7 +2472,9 @@ startup_kernel(void)
/*
* Need correct target_kernel_text value
*/
+#if defined(_BOOT_TARGET_amd64)
target_kernel_text = KERNEL_TEXT;
+#endif
DBG(target_kernel_text);
#if defined(__xpv)
@@ -2462,7 +2634,7 @@ startup_kernel(void)
/*
* initialize the simple memory allocator
*/
- init_mem_alloc();
+ init_dboot_alloc();
#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
/*
@@ -2516,7 +2688,7 @@ startup_kernel(void)
* For grub, copy kernel bits from the ELF64 file to final place.
*/
DBG_MSG("\nAllocating nucleus pages.\n");
- ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
+ ktext_phys = (uintptr_t)dboot_alloc(ksize, FOUR_MEG);
if (ktext_phys == 0)
dboot_panic("failed to allocate aligned kernel memory");
@@ -2527,6 +2699,8 @@ startup_kernel(void)
DBG(ktext_phys);
+ paddr_t kalloc_start = find_kalloc_start();
+
/*
* Allocate page tables.
*/
@@ -2544,18 +2718,18 @@ startup_kernel(void)
#if defined(__xpv)
- bi->bi_next_paddr = next_avail_addr - mfn_base;
+ bi->bi_next_paddr = kalloc_start - mfn_base;
DBG(bi->bi_next_paddr);
- bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
+ bi->bi_next_vaddr = (native_ptr_t)kalloc_start;
DBG(bi->bi_next_vaddr);
/*
* unmap unused pages in start area to make them available for DMA
*/
- while (next_avail_addr < scratch_end) {
- (void) HYPERVISOR_update_va_mapping(next_avail_addr,
+ while (alloc_addr < alloc_end) {
+ (void) HYPERVISOR_update_va_mapping(alloc_addr,
0, UVMF_INVLPG | UVMF_LOCAL);
- next_avail_addr += MMU_PAGESIZE;
+ alloc_addr += MMU_PAGESIZE;
}
bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
@@ -2565,9 +2739,9 @@ startup_kernel(void)
#else /* __xpv */
- bi->bi_next_paddr = next_avail_addr;
+ bi->bi_next_paddr = kalloc_start;
DBG(bi->bi_next_paddr);
- bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
+ bi->bi_next_vaddr = (native_ptr_t)kalloc_start;
DBG(bi->bi_next_vaddr);
bi->bi_mb_version = multiboot_version;
diff --git a/usr/src/uts/i86pc/dboot/dboot_xboot.h b/usr/src/uts/i86pc/dboot/dboot_xboot.h
index 7d0876c79c..f261f3f2b1 100644
--- a/usr/src/uts/i86pc/dboot/dboot_xboot.h
+++ b/usr/src/uts/i86pc/dboot/dboot_xboot.h
@@ -22,6 +22,8 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2020 Joyent, Inc.
*/
#ifndef _DBOOT_XBOOT_H
@@ -52,16 +54,14 @@ extern uint_t prom_debug;
#define DBG_MSG(s) do { if (prom_debug) \
dboot_printf(s); \
- _NOTE(CONSTANTCONDITION) \
} while (0)
-#define DBG(x) do { if (prom_debug) { \
- dboot_printf("%s is 0x%" PRIx64 "\n", #x, (uint64_t)(x)); \
- _NOTE(CONSTANTCONDITION) \
+#define DBG(x) do { if (prom_debug) { \
+ dboot_printf("%s: %s is 0x%" PRIx64 "\n", \
+ __func__, #x, (uint64_t)(x)); \
} } while (0)
-extern void dboot_halt(void);
-extern void *mem_alloc(uint32_t size);
+extern void dboot_halt(void) __NORETURN;
#define RNDUP(x, y) (((x) + ((y) - 1ul)) & ~((y) - 1ul))
diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c
index bff745b483..54a0ac3506 100644
--- a/usr/src/uts/i86pc/io/mp_platform_common.c
+++ b/usr/src/uts/i86pc/io/mp_platform_common.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 by Delphix. All rights reserved.
* Copyright 2020 Joyent, Inc.
* Copyright 2020 RackTop Systems, Inc.
diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c
index b59d87bdcc..623c6e5617 100644
--- a/usr/src/uts/i86pc/io/psm/psm_common.c
+++ b/usr/src/uts/i86pc/io/psm/psm_common.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
#include <sys/types.h>
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
index 4b5102d547..17249eb747 100644
--- a/usr/src/uts/i86pc/ml/kpti_trampolines.s
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -667,6 +667,8 @@ tr_intr_ret_end:
MK_INTR_TRAMPOLINE_NOERR(invaltrap)
MK_INTR_TRAMPOLINE_NOERR(fasttrap)
MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
+ MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80)
+ MK_INTR_TRAMPOLINE_NOERR(sys_int80)
/*
* These are special because they can interrupt other traps, and
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index 622f7cd2a3..6c1de5c145 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -144,6 +144,7 @@ _klwp
lwp_thread
lwp_procp
lwp_brand
+ lwp_brand_syscall
lwp_eosys
lwp_regs
lwp_arg
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index 8a68b4bced..8040e35297 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -515,6 +515,7 @@ noprod_sys_syscall:
movq T_LWP(%r15), %r14
ASSERT_NO_RUPDATE_PENDING(%r14)
+
ENABLE_INTR_FLAGS
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
@@ -528,6 +529,37 @@ noprod_sys_syscall:
incq %gs:CPU_STATS_SYS_SYSCALL
+ /*
+ * If our LWP has an alternate system call handler, run that instead of
+ * the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rdi
+ testq %rdi, %rdi
+ jz _syscall_no_brand
+
+ pushq %rax
+ subq $8, %rsp /* align stack for call to C */
+ INDIRECT_CALL_REG(rdi)
+ addq $8, %rsp
+
+ /*
+ * If the alternate handler returns non-zero, the normal system call
+ * processing is resumed.
+ */
+ testl %eax, %eax
+ popq %rax
+ jnz _syscall_no_brand
+
+ /*
+ * For branded syscalls which were handled in-kernel, shuffle the
+ * register state as would be done by the native handler before jumping
+ * to the post-syscall logic.
+ */
+ movq REGOFF_RAX(%rsp), %r12
+ movq REGOFF_RDX(%rsp), %r13
+ jmp _syscall_after_brand
+
+_syscall_no_brand:
movw %ax, T_SYSNUM(%r15)
movzbl T_PRE_SYS(%r15), %ebx
ORL_SYSCALLTRACE(%ebx)
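
In C terms, the fast path added in this hunk behaves roughly like the sketch below. Only the lwp_brand_syscall name and its return convention (zero means the brand handled the call in-kernel) come from the diff; the rest is an illustrative stand-in for the register shuffling done in assembly:

#include <stdio.h>

/* Hypothetical stand-ins for the lwp and the handler slot. */
typedef int (*brand_syscall_t)(void);

struct fake_lwp {
	brand_syscall_t lwp_brand_syscall;
};

static int
emulated(void)
{
	printf("brand emulated the syscall in-kernel\n");
	return (0);	/* zero: handled; non-zero: use native path */
}

static void
dispatch(struct fake_lwp *lwp)
{
	brand_syscall_t h = lwp->lwp_brand_syscall;

	if (h != NULL && h() == 0) {
		/* rax/rdx were set by the handler; go to post-syscall */
		printf("-> _syscall_after_brand\n");
		return;
	}
	printf("-> _syscall_no_brand (native path)\n");
}

int
main(void)
{
	struct fake_lwp l = { emulated };

	dispatch(&l);
	l.lwp_brand_syscall = NULL;
	dispatch(&l);
	return (0);
}
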
@@ -563,6 +595,8 @@ _syscall_invoke:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %r12d, %r12d /* lower 32-bits into %eax */
5:
+
+_syscall_after_brand:
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -825,11 +859,46 @@ _syscall32_save:
incq %gs:CPU_STATS_SYS_SYSCALL
/*
+ * If our lwp has an alternate system call handler, run that instead
+ * of the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rax
+ testq %rax, %rax
+ jz _syscall32_no_brand
+
+ movb $LWP_SYS, LWP_STATE(%r14)
+ INDIRECT_CALL_REG(rax)
+
+ /*
+ * If the alternate handler returns non-zero, the normal system call
+ * processing is resumed.
+ */
+ testl %eax, %eax
+ jnz _syscall32_no_brand
+
+ /*
+ * For branded syscalls which were handled in-kernel, shuffle the
+ * register state as would be done by the native handler before jumping
+ * to the post-syscall logic.
+ */
+ movl REGOFF_RAX(%rsp), %r12d
+ movl REGOFF_RDX(%rsp), %r13d
+ jmp _syscall32_after_brand
+
+_syscall32_no_brand:
+ /*
* Make some space for MAXSYSARGS (currently 8) 32-bit args placed
* into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
* more succinctly:
*
* SA(MAXSYSARGS * sizeof (long)) == 64
+ *
+ * Note, this space is used both to copy in the arguments from user
+ * land, and also as part of the old UNIX style syscall_ap() method.
+ * syscall_entry expects that we do not change the values of this space
+ * that we give it. However, this means that when we end up in the more
+ * recent model of passing the arguments based on the calling
+ * conventions, we'll need to save an additional 16 bytes of stack.
*/
#define SYS_DROP 64 /* drop for args */
subq $SYS_DROP, %rsp
@@ -857,12 +926,16 @@ _syscall32_save:
*/
movq %rax, %rbx
- movl 0(%rsp), %edi
- movl 8(%rsp), %esi
- movl 0x10(%rsp), %edx
- movl 0x18(%rsp), %ecx
- movl 0x20(%rsp), %r8d
- movl 0x28(%rsp), %r9d
+ movl 0x0(%rsp), %edi /* arg0 */
+ movl 0x8(%rsp), %esi /* arg1 */
+ movl 0x10(%rsp), %edx /* arg2 */
+ movl 0x38(%rsp), %eax /* arg7 load */
+ movl 0x18(%rsp), %ecx /* arg3 */
+ pushq %rax /* arg7 saved to stack */
+ movl 0x28(%rsp), %r8d /* arg4 */
+ movl 0x38(%rsp), %eax /* arg6 load */
+ movl 0x30(%rsp), %r9d /* arg5 */
+ pushq %rax /* arg6 saved to stack */
movq SY_CALLC(%rbx), %rax
INDIRECT_CALL_REG(rax)
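
One subtlety in the reworked argument loading: both the "arg7 load" and the "arg6 load" use displacement 0x38(%rsp), because the intervening pushq moves %rsp down by 8 and renames every slot. A small model of that arithmetic (values hypothetical):

#include <stdio.h>

int
main(void)
{
	unsigned long rsp = 0x100;		/* hypothetical %rsp */
	unsigned long arg7 = rsp + 0x38;	/* eighth 8-byte slot */

	rsp -= 8;				/* pushq %rax (save arg7) */
	unsigned long arg6 = rsp + 0x38;	/* same displacement... */

	/* ...but it now names the old 0x30 slot, i.e. arg6. */
	printf("arg7 slot 0x%lx, arg6 slot 0x%lx\n", arg7, arg6);
	return (0);
}
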
@@ -881,6 +954,8 @@ _syscall32_save:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %eax, %r12d /* lower 32-bits into %eax */
+_syscall32_after_brand:
+
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -1133,15 +1208,20 @@ _full_syscall_postsys32:
/*
* Fetch the arguments copied onto the kernel stack and put
* them in the right registers to invoke a C-style syscall handler.
- * %rax contains the handler address.
+ * %rax contains the handler address. For the last two arguments, we
+ * push them onto the stack -- we can't clobber the old arguments.
*/
movq %rax, %rbx
- movl 0(%rsp), %edi
- movl 8(%rsp), %esi
- movl 0x10(%rsp), %edx
- movl 0x18(%rsp), %ecx
- movl 0x20(%rsp), %r8d
- movl 0x28(%rsp), %r9d
+ movl 0x0(%rsp), %edi /* arg0 */
+ movl 0x8(%rsp), %esi /* arg1 */
+ movl 0x10(%rsp), %edx /* arg2 */
+ movl 0x38(%rsp), %eax /* arg7 load */
+ movl 0x18(%rsp), %ecx /* arg3 */
+ pushq %rax /* arg7 saved to stack */
+ movl 0x28(%rsp), %r8d /* arg4 */
+ movl 0x38(%rsp), %eax /* arg6 load */
+ movl 0x30(%rsp), %r9d /* arg5 */
+ pushq %rax /* arg6 saved to stack */
movq SY_CALLC(%rbx), %rax
INDIRECT_CALL_REG(rax)
@@ -1220,6 +1300,66 @@ _full_syscall_postsys32:
SET_SIZE(brand_sys_sysenter)
/*
+ * System call via an int80. This entry point is only used by the Linux
+ * application environment. Unlike the other entry points, there is no
+ * default action to take if no callback is registered for this process.
+ */
+
+ ENTRY_NP(brand_sys_int80)
+ SWAPGS /* kernel gsbase */
+ XPV_TRAP_POP
+ call smap_enable
+
+ /*
+ * We first attempt to call the "b_int80" handler from the "struct
+ * brand_mach_ops" for this brand. If no handler function is installed
+ * for this brand, the BRAND_CALLBACK() macro returns here and we
+ * check the lwp for a "lwp_brand_syscall" handler.
+ */
+ BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
+
+ /*
+ * Check to see if this lwp provides "lwp_brand_syscall". If so, we
+ * will route this int80 through the regular system call handling path.
+ */
+ movq %r15, %gs:CPU_RTMP_R15
+ movq %gs:CPU_THREAD, %r15
+ movq T_LWP(%r15), %r15
+ movq LWP_BRAND_SYSCALL(%r15), %r15
+ testq %r15, %r15
+ movq %gs:CPU_RTMP_R15, %r15
+ jnz nopop_syscall_int
+
+ /*
+ * The brand provided neither a "b_int80" nor a "lwp_brand_syscall"
+ * function, and has thus opted out of handling this trap.
+ */
+ SWAPGS /* user gsbase */
+ jmp nopop_int80
+
+ ENTRY_NP(sys_int80)
+ /*
+ * We hit an int80, but this process isn't of a brand with an int80
+ * handler. Bad process! Make it look as if the INT failed.
+ * Modify %rip to point before the INT, push the expected error
+ * code and fake a GP fault. Note on 64-bit hypervisor we need
+ * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
+ * because gptrap will pop them again with its own XPV_TRAP_POP.
+ */
+ XPV_TRAP_POP
+ call smap_enable
+nopop_int80:
+ subq $2, (%rsp) /* int insn 2-bytes */
+ pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+#if defined(__xpv)
+ push %r11
+ push %rcx
+#endif
+ jmp gptrap / GP fault
+ SET_SIZE(sys_int80)
+ SET_SIZE(brand_sys_int80)
+
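
The two entry points above implement a three-way decision. A compact model (the b_int80 and lwp_brand_syscall names come from the diff; the struct and flags are illustrative):

#include <stdio.h>

/* Illustrative stand-ins for the brand callback slots. */
struct fake_brand {
	int has_b_int80;	/* brand_mach_ops b_int80 present? */
	int has_lwp_handler;	/* lwp_brand_syscall present? */
};

static void
int80_entry(const struct fake_brand *b)
{
	if (b->has_b_int80) {
		printf("BRAND_CALLBACK(BRAND_CB_INT80) takes it\n");
		return;
	}
	if (b->has_lwp_handler) {
		printf("route into the normal syscall path\n");
		return;
	}
	/* Neither: pretend the gate never existed. */
	printf("fake a #GP back to the process\n");
}

int
main(void)
{
	int80_entry(&(struct fake_brand){ 1, 0 });
	int80_entry(&(struct fake_brand){ 0, 1 });
	int80_entry(&(struct fake_brand){ 0, 0 });
	return (0);
}
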
+/*
* This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
* the generic i386 libc to do system calls. We do a small amount of setup
* before jumping into the existing sys_syscall32 path.
diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c
index 4a5c71b35d..e878f765ef 100644
--- a/usr/src/uts/i86pc/os/cpr_impl.c
+++ b/usr/src/uts/i86pc/os/cpr_impl.c
@@ -23,6 +23,10 @@
*/
/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
* Platform specific implementation code
* Currently only suspend to RAM is supported (ACPI S3)
*/
@@ -737,6 +741,20 @@ i_cpr_is_supported(int sleeptype)
if (sleeptype != CPR_TORAM)
return (0);
+ /*
+ * Unfortunately, the x86 resume code was never implemented for GAS.
+ * The only obvious problem is that a trick necessary to appease Sun
+ * Studio does the wrong thing for GAS. Doubly unfortunate is that
+ * the condition used to detect GAS is incorrect, so we do in fact
+ * compile the Studio path; it just immediately fails in resume.
+ *
+ * Given that, if we were built using GCC, never allow CPR to be
+ * attempted.
+ */
+#ifdef __GNUC__
+ return (0);
+#else
+
/*
* The next statement tests if a specific platform has turned off
* cpr support.
@@ -751,6 +769,7 @@ i_cpr_is_supported(int sleeptype)
return (1);
return (pm_S3_enabled);
+#endif
}
void
diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c
index d9ed882705..fab1324787 100644
--- a/usr/src/uts/i86pc/os/ibft.c
+++ b/usr/src/uts/i86pc/os/ibft.c
@@ -39,6 +39,7 @@
#include <sys/kmem.h>
#include <sys/psm.h>
#include <sys/bootconf.h>
+#include <sys/reboot.h>
typedef enum ibft_structure_type {
Reserved = 0,
@@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp);
static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft,
iscsi_ibft_tgt_t *tgtp);
+extern int boothowto;
/*
* Return value:
@@ -759,7 +761,9 @@ ld_ib_prop()
* 1) pass "-B ibft-noprobe=1" on kernel command line
* 2) add line "set ibft_noprobe=1" in /etc/system
*/
- cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+ if (boothowto & RB_VERBOSE) {
+ cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+ }
return;
}
diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c
index ed463fba8f..6320c0a949 100644
--- a/usr/src/uts/i86pc/os/lgrpplat.c
+++ b/usr/src/uts/i86pc/os/lgrpplat.c
@@ -2800,7 +2800,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
/*
* Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
* and memory are local to each other in the same NUMA node and return number
- * of nodes
+ * of nodes.
+ *
+ * The SRAT table pointer is populated during bootup by
+ * build_firmware_properties() in fakebop.c. Several motherboard and BIOS
+ * manufacturers are guilty of not having a SRAT table.
*/
static int
lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
@@ -2817,9 +2821,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
/*
* Nothing to do when no SRAT or disabled
*/
- if (tp == NULL || !lgrp_plat_srat_enable)
+ if (!lgrp_plat_srat_enable)
return (-1);
+ if (tp == NULL) {
+ cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. "
+ "lgrp support will be limited to one group.\n");
+ return (-1);
+ }
+
/*
* Try to get domain information from MSCT table.
* ACPI4.0: OSPM will use information provided by the MSCT only
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index a0bb296e70..e1e92ffe4f 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -2450,6 +2450,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum)
pp->p_mapping = NULL;
pp->p_embed = 0;
pp->p_share = 0;
+ pp->p_zoneid = ALL_ZONES;
pp->p_mlentry = 0;
}
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index b7c18bb8c9..063fac49f7 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -99,6 +99,7 @@
#include <sys/hypervisor.h>
#endif
#include <sys/contract/process_impl.h>
+#include <sys/brand.h>
#define USER 0x10000 /* user-mode flag added to trap type */
@@ -810,6 +811,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
fault_type = F_INVAL;
}
+ /*
+ * Allow the brand to interpose on invalid memory accesses
+ * prior to running the native pagefault handler. If this
+ * brand hook returns zero, it was able to handle the fault
+ * completely. Otherwise, drive on and call pagefault().
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL &&
+ BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) {
+ goto out;
+ }
+
res = pagefault(addr, fault_type, rw, 0);
/*
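
The brand hook gets first crack at the fault before pagefault() runs. A compact model of that control flow (the zero-means-handled convention comes from the comment above; the types and names are stand-ins for the b_pagefault brand op):

#include <stdio.h>

/* Hypothetical hook type; zero return means fully handled. */
typedef int (*brand_pagefault_t)(void *addr);

static int
brand_fault(void *addr)
{
	printf("brand handled fault at %p\n", addr);
	return (0);
}

static void
fault_path(brand_pagefault_t hook, void *addr)
{
	if (hook != NULL && hook(addr) == 0)
		return;		/* fully handled; skip pagefault() */
	printf("native pagefault() for %p\n", addr);
}

int
main(void)
{
	fault_path(brand_fault, (void *)0x1000);
	fault_path(NULL, (void *)0x2000);
	return (0);
}
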
diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h
index 26626ec5a4..f2528a632f 100644
--- a/usr/src/uts/i86pc/sys/apic.h
+++ b/usr/src/uts/i86pc/sys/apic.h
@@ -386,7 +386,7 @@ struct apic_io_intr {
/* special or reserve vectors */
#define APIC_CHECK_RESERVE_VECTORS(v) \
(((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \
- ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET))
+ ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80))
/* cmos shutdown code for BIOS */
#define BIOS_SHUTDOWN 0x0a
diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h
index 520ad9001d..ea19c856a8 100644
--- a/usr/src/uts/i86pc/sys/comm_page.h
+++ b/usr/src/uts/i86pc/sys/comm_page.h
@@ -27,6 +27,7 @@ extern "C" {
#endif
#define COMM_PAGE_SIZE PAGESIZE
+#define COMM_PAGE_ALIGN 0x4000
#ifndef _ASM
diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h
index 90a5245217..0d0c95535c 100644
--- a/usr/src/uts/i86pc/sys/vm_machparam.h
+++ b/usr/src/uts/i86pc/sys/vm_machparam.h
@@ -23,6 +23,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#ifndef _SYS_VM_MACHPARAM_H
@@ -129,11 +130,12 @@ extern "C" {
*
* XXX - The system doesn't account for multiple swap devices.
*/
-#define DISKRPM 60
+#define DISKRPM 600
/*
* The maximum value for handspreadpages, which is the distance
- * between the two clock hands in pages.
+ * between the two clock hands in pages. This is only used when the page
+ * scanner is first started.
*/
#define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE)
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index 7650d28f41..ea9436e881 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -3808,7 +3808,7 @@ hat_page_getattr(struct page *pp, uint_t flag)
/*
- * common code used by hat_pageunload() and hment_steal()
+ * common code used by hat_page_inval() and hment_steal()
*/
hment_t *
hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
@@ -3864,15 +3864,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
extern int vpm_enable;
/*
- * Unload all translations to a page. If the page is a subpage of a large
+ * Unload translations to a page. If the page is a subpage of a large
* page, the large page mappings are also removed.
- *
- * The forceflags are unused.
+ * If curhat is not NULL, then we only unload the translation
+ * for the given process, otherwise all translations are unloaded.
*/
-
-/*ARGSUSED*/
-static int
-hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
+void
+hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat)
{
page_t *cur_pp = pp;
hment_t *hm;
@@ -3880,16 +3878,11 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
htable_t *ht;
uint_t entry;
level_t level;
+ ulong_t cnt = 0;
XPV_DISALLOW_MIGRATE();
/*
- * prevent recursion due to kmem_free()
- */
- ++curthread->t_hatdepth;
- ASSERT(curthread->t_hatdepth < 16);
-
- /*
* clear the vpm ref.
*/
if (vpm_enable) {
@@ -3899,6 +3892,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
* The loop with next_size handles pages with multiple pagesize mappings
*/
next_size:
+ if (curhat != NULL)
+ cnt = hat_page_getshare(cur_pp);
for (;;) {
/*
@@ -3910,14 +3905,13 @@ next_size:
if (hm == NULL) {
x86_hm_exit(cur_pp);
+curproc_done:
/*
* If not part of a larger page, we're done.
*/
if (cur_pp->p_szc <= pg_szcd) {
- ASSERT(curthread->t_hatdepth > 0);
- --curthread->t_hatdepth;
XPV_ALLOW_MIGRATE();
- return (0);
+ return;
}
/*
@@ -3936,8 +3930,20 @@ next_size:
* If this mapping size matches, remove it.
*/
level = ht->ht_level;
- if (level == pg_szcd)
- break;
+ if (level == pg_szcd) {
+ if (curhat == NULL || ht->ht_hat == curhat)
+ break;
+ /*
+ * We are unloading only the given process, but
+ * this mapping belongs to another hat. Leave the
+ * entry in place. Also do a safety check to
+ * ensure we don't get stuck in an infinite loop.
+ */
+ if (cnt-- == 0) {
+ x86_hm_exit(cur_pp);
+ goto curproc_done;
+ }
+ }
}
/*
@@ -3947,14 +3953,44 @@ next_size:
hm = hati_page_unmap(cur_pp, ht, entry);
if (hm != NULL)
hment_free(hm);
+
+ /* Perform check above for being part of a larger page. */
+ if (curhat != NULL)
+ goto curproc_done;
}
}
+/*
+ * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then
+ * we only unload the translation for the current process, otherwise all
+ * translations are unloaded.
+ */
+static int
+hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
+{
+ struct hat *curhat = NULL;
+
+ /*
+ * prevent recursion due to kmem_free()
+ */
+ ++curthread->t_hatdepth;
+ ASSERT(curthread->t_hatdepth < 16);
+
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ curhat = curthread->t_procp->p_as->a_hat;
+
+ hat_page_inval(pp, pg_szcd, curhat);
+
+ ASSERT(curthread->t_hatdepth > 0);
+ --curthread->t_hatdepth;
+ return (0);
+}
+
int
-hat_pageunload(struct page *pp, uint_t forceflag)
+hat_pageunload(struct page *pp, uint_t unloadflag)
{
ASSERT(PAGE_EXCL(pp));
- return (hati_pageunload(pp, 0, forceflag));
+ return (hati_pageunload(pp, 0, unloadflag));
}
/*
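
Callers can now choose between unloading one process's translation and unloading all of them. A usage sketch (kernel context; the wrapper is hypothetical, the HAT_CURPROC_PGUNLOAD flag comes from this change):

#include <sys/debug.h>
#include <vm/hat.h>
#include <vm/page.h>

/*
 * Unload only the calling process's translation for pp; other
 * processes' mappings of the page stay intact. The page must be
 * held exclusively, per the ASSERT in hat_pageunload().
 */
static void
unload_for_curproc(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
}
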
diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c
index bb18b5c462..769bbd15d2 100644
--- a/usr/src/uts/i86pc/vm/hment.c
+++ b/usr/src/uts/i86pc/vm/hment.c
@@ -21,6 +21,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -35,6 +36,7 @@
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/avl.h>
+#include <sys/zone.h>
/*
@@ -319,6 +321,8 @@ hment_insert(hment_t *hm, page_t *pp)
((hment_t *)pp->p_mapping)->hm_prev = hm;
pp->p_mapping = hm;
+ zone_add_page(pp);
+
/*
* Add the hment to the system-wide hash table.
*/
@@ -460,6 +464,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
pp->p_embed = 1;
pp->p_mapping = htable;
pp->p_mlentry = entry;
+ zone_add_page(pp);
return;
}
@@ -541,6 +546,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
pp->p_mapping = NULL;
pp->p_mlentry = 0;
pp->p_embed = 0;
+ zone_rm_page(pp);
return (NULL);
}
@@ -576,6 +582,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
hm->hm_hashlink = null_avl_link;
hm->hm_next = NULL;
hm->hm_prev = NULL;
+ zone_rm_page(pp);
return (hm);
}
diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c
index 225628b1c8..bc9d03e7f5 100644
--- a/usr/src/uts/i86pc/vm/vm_machdep.c
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c
@@ -711,10 +711,8 @@ void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
struct proc *p = curproc;
- caddr_t userlimit = (flags & _MAP_LOW32) ?
- (caddr_t)_userlimit32 : p->p_as->a_userlimit;
-
- map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
+ map_addr_proc(addrp, len, off, vacalign,
+ map_userlimit(p, p->p_as, flags), curproc, flags);
}
/*ARGSUSED*/
@@ -3546,7 +3544,7 @@ page_create_io(
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
}
if (flags & PG_PHYSCONTIG) {