Diffstat (limited to 'usr/src/uts/i86pc')
 usr/src/uts/i86pc/Makefile.i86pc          |   1
 usr/src/uts/i86pc/dboot/dboot_printf.c    |   4
 usr/src/uts/i86pc/dboot/dboot_printf.h    |  13
 usr/src/uts/i86pc/dboot/dboot_startkern.c | 422
 usr/src/uts/i86pc/dboot/dboot_xboot.h     |  12
 usr/src/uts/i86pc/io/mp_platform_common.c |   1
 usr/src/uts/i86pc/io/psm/psm_common.c     |   1
 usr/src/uts/i86pc/ml/kpti_trampolines.s   |   2
 usr/src/uts/i86pc/ml/offsets.in           |   1
 usr/src/uts/i86pc/ml/syscall_asm_amd64.s  | 166
 usr/src/uts/i86pc/os/cpr_impl.c           |  19
 usr/src/uts/i86pc/os/ibft.c               |   6
 usr/src/uts/i86pc/os/lgrpplat.c           |  14
 usr/src/uts/i86pc/os/startup.c            |   1
 usr/src/uts/i86pc/os/trap.c               |  12
 usr/src/uts/i86pc/sys/apic.h              |   2
 usr/src/uts/i86pc/sys/comm_page.h         |   1
 usr/src/uts/i86pc/sys/vm_machparam.h      |   6
 usr/src/uts/i86pc/vm/hat_i86.c            |  78
 usr/src/uts/i86pc/vm/hment.c              |   7
 usr/src/uts/i86pc/vm/vm_machdep.c         |   8
 21 files changed, 594 insertions(+), 183 deletions(-)
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index bed1885700..a398d741dc 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -24,6 +24,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. +# Copyright 2019 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. # Copyright 2019 Joyent, Inc. # diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.c b/usr/src/uts/i86pc/dboot/dboot_printf.c index 9d02c1943a..59d4e247f0 100644 --- a/usr/src/uts/i86pc/dboot/dboot_printf.c +++ b/usr/src/uts/i86pc/dboot/dboot_printf.c @@ -203,6 +203,10 @@ unsigned_num: dboot_putnum(x, B_FALSE, base); break; + case 'z': + size = sizeof (size_t); + goto again; + default: dboot_puts("dboot_printf(): unknown % escape\n"); } diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.h b/usr/src/uts/i86pc/dboot/dboot_printf.h index 22cf561e51..94b3db92e7 100644 --- a/usr/src/uts/i86pc/dboot/dboot_printf.h +++ b/usr/src/uts/i86pc/dboot/dboot_printf.h @@ -22,32 +22,29 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2020 Joyent, Inc. */ #ifndef _DBOOT_PRINTF_H #define _DBOOT_PRINTF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif /* - * Very primitive printf. This only understands the following simple formats: - * %%, %c, %s, %d, %ld, %lld, %x, %lx, %llx, %p + * Very primitive printf. We mark this as PRINTFLIKE so we can use %z */ -/*PRINTFLIKE1*/ extern void dboot_printf(char *fmt, ...) - __KPRINTFLIKE(1); + __PRINTFLIKE(1); /* * Primitive version of panic, prints a message, waits for a keystroke, * then resets the system */ -/*PRINTFLIKE1*/ extern void dboot_panic(char *fmt, ...) - __KPRINTFLIKE(1); + __NORETURN __PRINTFLIKE(1); #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c index 6621356133..6654244be2 100644 --- a/usr/src/uts/i86pc/dboot/dboot_startkern.c +++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c @@ -75,6 +75,10 @@ extern int have_cpuid(void); #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2) +#define ULL(v) ((u_longlong_t)(v)) + +static void *page_alloc(void); + /* * This file contains code that runs to transition us from either a multiboot * compliant loader (32 bit non-paging) or a XPV domain loader to @@ -105,7 +109,10 @@ x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST; * virtual address. */ paddr_t ktext_phys; -uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */ +/* + * Nucleus size is 8Mb, including text, data, and BSS. + */ +uint32_t ksize = 2 * FOUR_MEG; static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ @@ -115,9 +122,16 @@ static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ char stack_space[STACK_SIZE]; /* - * Used to track physical memory allocation + * The highest address we build page tables for. */ -static paddr_t next_avail_addr = 0; +static paddr_t boot_map_end; + +/* + * The dboot allocator. This is a small area we use for allocating the + * kernel nucleus and pages for the identity page tables we build here. + */ +static paddr_t alloc_addr; +static paddr_t alloc_end; #if defined(__xpv) /* @@ -127,7 +141,6 @@ static paddr_t next_avail_addr = 0; * to derive a pfn from a pointer, you subtract mfn_base. 
*/ -static paddr_t scratch_end = 0; /* we can't write all of mem here */ static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */ start_info_t *xen_info; @@ -233,6 +246,12 @@ uint_t map_debug = 0; static char noname[2] = "-"; +static boolean_t +ranges_intersect(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2) +{ + return (s1 < e2 && e1 >= s2); +} + /* * Either hypervisor-specific or grub-specific code builds the initial * memlists. This code does the sort/merge/link for final use. @@ -288,8 +307,16 @@ sort_physinstall(void) if (prom_debug) { dboot_printf("\nFinal memlists:\n"); for (i = 0; i < memlists_used; ++i) { - dboot_printf("\t%d: addr=%" PRIx64 " size=%" - PRIx64 "\n", i, memlists[i].addr, memlists[i].size); + dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n", + i, ULL(memlists[i].addr), ULL(memlists[i].addr + + memlists[i].size), ULL(memlists[i].size)); + } + + dboot_printf("\nBoot modules:\n"); + for (i = 0; i < bi->bi_module_cnt; i++) { + dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n", + i, ULL(modules[i].bm_addr), ULL(modules[i].bm_addr + + modules[i].bm_size), ULL(modules[i].bm_size)); } } @@ -341,6 +368,8 @@ dboot_halt(void) while (--i) (void) HYPERVISOR_yield(); (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff); + for (;;) + ; } /* @@ -427,7 +456,7 @@ set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval) paddr_t make_ptable(x86pte_t *pteval, uint_t level) { - paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + paddr_t new_table = (paddr_t)(uintptr_t)page_alloc(); if (level == top_level && level == 2) *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID; @@ -659,18 +688,6 @@ exclude_from_pci(uint64_t start, uint64_t end) } } -/* - * During memory allocation, find the highest address not used yet. - */ -static void -check_higher(paddr_t a) -{ - if (a < next_avail_addr) - return; - next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); - DBG(next_avail_addr); -} - static int dboot_loader_mmap_entries(void) { @@ -687,7 +704,6 @@ dboot_loader_mmap_entries(void) DBG(mb_info->mmap_addr); DBG(mb_info->mmap_length); - check_higher(mb_info->mmap_addr + mb_info->mmap_length); for (mmap_addr = mb_info->mmap_addr; mmap_addr < mb_info->mmap_addr + @@ -894,17 +910,13 @@ build_pcimemlists(void) } #if defined(__xpv) -/* - * Initialize memory allocator stuff from hypervisor-supplied start info. - */ static void -init_mem_alloc(void) +init_dboot_alloc(void) { int local; /* variables needed to find start region */ - paddr_t scratch_start; xen_memory_map_t map; - DBG_MSG("Entered init_mem_alloc()\n"); + DBG_MSG("Entered init_dboot_alloc()\n"); /* * Free memory follows the stack. There's at least 512KB of scratch @@ -913,17 +925,17 @@ init_mem_alloc(void) * allocated last and will be outside the addressible range. We'll * switch to new page tables before we unpack the kernel */ - scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); - DBG(scratch_start); - scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG); - DBG(scratch_end); + alloc_addr = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); + DBG(alloc_addr); + alloc_end = RNDUP((paddr_t)alloc_addr + 512 * 1024, TWO_MEG); + DBG(alloc_end); /* * For paranoia, leave some space between hypervisor data and ours. * Use 500 instead of 512. 
*/ - next_avail_addr = scratch_end - 500 * 1024; - DBG(next_avail_addr); + alloc_addr = alloc_end - 500 * 1024; + DBG(alloc_addr); /* * The domain builder gives us at most 1 module @@ -1271,7 +1283,6 @@ process_module(int midx) char *cmdline = dboot_multiboot_modcmdline(midx); char *p, *q; - check_higher(mod_end); if (prom_debug) { dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n", midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end); @@ -1435,7 +1446,6 @@ static void dboot_process_modules(void) { int i, modcount; - extern char _end[]; DBG_MSG("\nFinding Modules\n"); modcount = dboot_multiboot_modcount(); @@ -1443,11 +1453,11 @@ dboot_process_modules(void) dboot_panic("Too many modules (%d) -- the maximum is %d.", modcount, MAX_BOOT_MODULES); } + /* * search the modules to find the last used address * we'll build the module list while we're walking through here */ - check_higher((paddr_t)(uintptr_t)&_end); for (i = 0; i < modcount; ++i) { process_module(i); modules_used++; @@ -1462,6 +1472,80 @@ dboot_process_modules(void) check_images(); } +#define CORRUPT_REGION_START 0xc700000 +#define CORRUPT_REGION_SIZE 0x100000 +#define CORRUPT_REGION_END (CORRUPT_REGION_START + CORRUPT_REGION_SIZE) + +static void +dboot_add_memlist(uint64_t start, uint64_t end) +{ + if (end > max_mem) + max_mem = end; + + /* + * Well, this is sad. On some systems, there is a region of memory that + * can be corrupted until some number of seconds after we have booted. + * And the BIOS doesn't tell us that this memory is unsafe to use. And + * we don't know how long it's dangerous. So we'll chop out this range + * from any memory list that would otherwise be usable. Note that any + * system of this type will give us the new-style (0x40) memlist, so we + * need not fix up the other path below. + * + * However, if we're boot-loaded from something that doesn't have a + * RICHMOND-16 workaround (which on many systems is just fine), it could + * actually use this region for the boot modules; if we remove it from + * the memlist, we'll keel over when trying to access the region. + * + * So, if we see that a module intersects the region, we presume it's + * OK. + */ + + if (find_boot_prop("disable-RICHMOND-16") != NULL) + goto out; + + for (uint32_t i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_start = modules[i].bm_addr; + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + if (ranges_intersect(mod_start, mod_end, CORRUPT_REGION_START, + CORRUPT_REGION_END)) { + if (prom_debug) { + dboot_printf("disabling RICHMOND-16 workaround " + "due to module #%d: " + "name %s addr %lx size %lx\n", + i, (char *)(uintptr_t)modules[i].bm_name, + (ulong_t)modules[i].bm_addr, + (ulong_t)modules[i].bm_size); + } + goto out; + } + } + + if (start < CORRUPT_REGION_START && end > CORRUPT_REGION_START) { + memlists[memlists_used].addr = start; + memlists[memlists_used].size = + CORRUPT_REGION_START - start; + ++memlists_used; + if (end > CORRUPT_REGION_END) + start = CORRUPT_REGION_END; + else + return; + } + + if (start >= CORRUPT_REGION_START && start < CORRUPT_REGION_END) { + if (end <= CORRUPT_REGION_END) + return; + start = CORRUPT_REGION_END; + } + +out: + memlists[memlists_used].addr = start; + memlists[memlists_used].size = end - start; + ++memlists_used; + if (memlists_used > MAX_MEMLIST) + dboot_panic("too many memlists"); +} + /* * We then build the phys_install memlist from the multiboot information. 
*/ @@ -1505,13 +1589,7 @@ dboot_process_mmap(void) */ switch (type) { case 1: - if (end > max_mem) - max_mem = end; - memlists[memlists_used].addr = start; - memlists[memlists_used].size = end - start; - ++memlists_used; - if (memlists_used > MAX_MEMLIST) - dboot_panic("too many memlists"); + dboot_add_memlist(start, end); break; case 2: rsvdmemlists[rsvdmemlists_used].addr = start; @@ -1593,21 +1671,15 @@ dboot_multiboot1_highest_addr(void) return (addr); } -static void +static uint64_t dboot_multiboot_highest_addr(void) { - paddr_t addr; - switch (multiboot_version) { case 1: - addr = dboot_multiboot1_highest_addr(); - if (addr != (paddr_t)(uintptr_t)NULL) - check_higher(addr); + return (dboot_multiboot1_highest_addr()); break; case 2: - addr = dboot_multiboot2_highest_addr(mb2_info); - if (addr != (paddr_t)(uintptr_t)NULL) - check_higher(addr); + return (dboot_multiboot2_highest_addr(mb2_info)); break; default: dboot_panic("Unknown multiboot version: %d\n", @@ -1617,15 +1689,97 @@ dboot_multiboot_highest_addr(void) } /* - * Walk the boot loader provided information and find the highest free address. + * Set up our simple physical memory allocator. This is used to allocate both + * the kernel nucleus (ksize) and our page table pages. + * + * We need to find a contiguous region in the memlists that is below 4Gb (as + * we're 32-bit and need to use the addresses), and isn't otherwise in use by + * dboot, multiboot allocations, or boot modules. The memlist is sorted and + * merged by this point. + * + * Historically, this code always did the allocations past the end of the + * highest used address, even if there was space below. For reasons unclear, if + * we don't do this, then we get massive corruption during early kernel boot. + * + * Note that find_kalloc_start() starts its search at the end of this + * allocation. + * + * This all falls apart horribly on some EFI systems booting under iPXE, where + * we end up with boot module allocation such that there is no room between the + * highest used address and our 4Gb limit. To that end, we have an iPXE hack + * that limits the maximum address used by its allocations in an attempt to give + * us room. */ static void -init_mem_alloc(void) +init_dboot_alloc(void) { - DBG_MSG("Entered init_mem_alloc()\n"); + extern char _end[]; + + DBG_MSG("Entered init_dboot_alloc()\n"); + dboot_process_modules(); dboot_process_mmap(); - dboot_multiboot_highest_addr(); + + size_t align = FOUR_MEG; + + /* + * We need enough alloc space for the nucleus memory... + */ + size_t size = RNDUP(ksize, align); + + /* + * And enough page table pages to cover potentially 4Gb. Each leaf PT + * covers 2Mb, so we need a maximum of 2048 pages for those. Next level + * up each covers 1Gb, and so on, so we'll just add a little slop (which + * gets aligned up anyway). + */ + size += RNDUP(MMU_PAGESIZE * (2048 + 256), align); + + uint64_t start = MAX(dboot_multiboot_highest_addr(), + (paddr_t)(uintptr_t)&_end); + start = RNDUP(start, align); + + /* + * As mentioned above, only start our search after all the boot modules. 
+ */ + for (uint_t i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + start = MAX(start, RNDUP(mod_end, MMU_PAGESIZE)); + } + + uint64_t end = start + size; + + DBG(start); + DBG(end); + + for (uint_t i = 0; i < memlists_used; i++) { + uint64_t ml_start = memlists[i].addr; + uint64_t ml_end = memlists[i].addr + memlists[i].size; + + /* + * If we're past our starting point for search, begin at this + * memlist. + */ + if (start < ml_start) { + start = RNDUP(ml_start, align); + end = start + size; + } + + if (end >= (uint64_t)UINT32_MAX) { + dboot_panic("couldn't find alloc space below 4Gb"); + } + + if (end < ml_end) { + alloc_addr = start; + alloc_end = end; + DBG(alloc_addr); + DBG(alloc_end); + return; + } + } + + dboot_panic("couldn't find alloc space in memlists"); } static int @@ -1869,77 +2023,89 @@ print_efi64(EFI_SYSTEM_TABLE64 *efi) #endif /* !__xpv */ /* - * Simple memory allocator, allocates aligned physical memory. - * Note that startup_kernel() only allocates memory, never frees. - * Memory usage just grows in an upward direction. + * Simple memory allocator for aligned physical memory from the area provided by + * init_dboot_alloc(). This is a simple bump allocator, and it's never directly + * freed by dboot. */ static void * -do_mem_alloc(uint32_t size, uint32_t align) +dboot_alloc(uint32_t size, uint32_t align) { - uint_t i; - uint64_t best; - uint64_t start; - uint64_t end; + uint32_t start = RNDUP(alloc_addr, align); - /* - * make sure size is a multiple of pagesize - */ size = RNDUP(size, MMU_PAGESIZE); - next_avail_addr = RNDUP(next_avail_addr, align); - /* - * XXPV fixme joe - * - * a really large bootarchive that causes you to run out of memory - * may cause this to blow up - */ - /* LINTED E_UNEXPECTED_UINT_PROMOTION */ - best = (uint64_t)-size; - for (i = 0; i < memlists_used; ++i) { - start = memlists[i].addr; -#if defined(__xpv) - start += mfn_base; -#endif - end = start + memlists[i].size; + if (start + size > alloc_end) { + dboot_panic("%s: couldn't allocate 0x%x bytes aligned 0x%x " + "alloc_addr = 0x%llx, alloc_end = 0x%llx", __func__, + size, align, (u_longlong_t)alloc_addr, + (u_longlong_t)alloc_end); + } - /* - * did we find the desired address? - */ - if (start <= next_avail_addr && next_avail_addr + size <= end) { - best = next_avail_addr; - goto done; - } + alloc_addr = start + size; - /* - * if not is this address the best so far? - */ - if (start > next_avail_addr && start < best && - RNDUP(start, align) + size <= end) - best = RNDUP(start, align); + if (map_debug) { + dboot_printf("%s(0x%x, 0x%x) = 0x%x\n", __func__, size, + align, start); } - /* - * We didn't find exactly the address we wanted, due to going off the - * end of a memory region. Return the best found memory address. - */ -done: - next_avail_addr = best + size; -#if defined(__xpv) - if (next_avail_addr > scratch_end) - dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: " - "0x%lx", (ulong_t)next_avail_addr, - (ulong_t)scratch_end); -#endif - (void) memset((void *)(uintptr_t)best, 0, size); - return ((void *)(uintptr_t)best); + (void) memset((void *)(uintptr_t)start, 0, size); + return ((void *)(uintptr_t)start); } -void * -mem_alloc(uint32_t size) +static void * +page_alloc(void) { - return (do_mem_alloc(size, MMU_PAGESIZE)); + return (dboot_alloc(MMU_PAGESIZE, MMU_PAGESIZE)); } +/* + * This is where we tell the kernel to start physical allocations from, beyond + * the end of our allocation area and all boot modules. 
It might be beyond 4Gb, + * so we can't touch that area ourselves. + * + * We might set kalloc_start to the end of a memlist; if so make sure we skip it + * along to the next one. + * + * This is making the massive assumption that there is a suitably large area for + * kernel allocations past the end of the last boot module and the dboot + * allocated region. Worse, we don't have a simple way to assert that is so. + */ +static paddr_t +find_kalloc_start(void) +{ + paddr_t kalloc_start = alloc_end; + uint_t i; + + for (i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + kalloc_start = MAX(kalloc_start, RNDUP(mod_end, MMU_PAGESIZE)); + } + + boot_map_end = kalloc_start; + DBG(boot_map_end); + + for (i = 0; i < memlists_used; i++) { + uint64_t ml_start = memlists[i].addr; + uint64_t ml_end = memlists[i].addr + memlists[i].size; + + if (kalloc_start >= ml_end) + continue; + + if (kalloc_start < ml_start) + kalloc_start = ml_start; + break; + } + + if (i == memlists_used) { + dboot_panic("fell off the end of memlists finding a " + "kalloc_start value > 0x%llx", (u_longlong_t)kalloc_start); + } + + DBG(kalloc_start); + + return (kalloc_start); +} /* * Build page tables to map all of memory used so far as well as the kernel. @@ -1962,7 +2128,7 @@ build_page_tables(void) #if defined(__xpv) top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base; #else /* __xpv */ - top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + top_page_table = (paddr_t)(uintptr_t)page_alloc(); #endif /* __xpv */ DBG((uintptr_t)top_page_table); @@ -1988,7 +2154,7 @@ build_page_tables(void) /* * The kernel will need a 1 page window to work with page tables */ - bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + bi->bi_pt_window = (native_ptr_t)(uintptr_t)page_alloc(); DBG(bi->bi_pt_window); bi->bi_pte_to_pt_window = (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0); @@ -2029,6 +2195,10 @@ build_page_tables(void) #if !defined(__xpv) + /* + * Map every valid memlist address up until boot_map_end: this will + * cover at least our alloc region and all boot modules. + */ for (i = 0; i < memlists_used; ++i) { start = memlists[i].addr; end = start + memlists[i].size; @@ -2036,11 +2206,11 @@ build_page_tables(void) if (map_debug) dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", start, end); - while (start < end && start < next_avail_addr) { + while (start < end && start < boot_map_end) { map_pa_at_va(start, start, 0); start += MMU_PAGESIZE; } - if (start >= next_avail_addr) + if (start >= boot_map_end) break; } @@ -2302,7 +2472,9 @@ startup_kernel(void) /* * Need correct target_kernel_text value */ +#if defined(_BOOT_TARGET_amd64) target_kernel_text = KERNEL_TEXT; +#endif DBG(target_kernel_text); #if defined(__xpv) @@ -2462,7 +2634,7 @@ startup_kernel(void) /* * initialize the simple memory allocator */ - init_mem_alloc(); + init_dboot_alloc(); #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64) /* @@ -2516,7 +2688,7 @@ startup_kernel(void) * For grub, copy kernel bits from the ELF64 file to final place. */ DBG_MSG("\nAllocating nucleus pages.\n"); - ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); + ktext_phys = (uintptr_t)dboot_alloc(ksize, FOUR_MEG); if (ktext_phys == 0) dboot_panic("failed to allocate aligned kernel memory"); @@ -2527,6 +2699,8 @@ startup_kernel(void) DBG(ktext_phys); + paddr_t kalloc_start = find_kalloc_start(); + /* * Allocate page tables. 
*/ @@ -2544,18 +2718,18 @@ startup_kernel(void) #if defined(__xpv) - bi->bi_next_paddr = next_avail_addr - mfn_base; + bi->bi_next_paddr = kalloc_start - mfn_base; DBG(bi->bi_next_paddr); - bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; + bi->bi_next_vaddr = (native_ptr_t)kalloc_start; DBG(bi->bi_next_vaddr); /* * unmap unused pages in start area to make them available for DMA */ - while (next_avail_addr < scratch_end) { - (void) HYPERVISOR_update_va_mapping(next_avail_addr, + while (alloc_addr < alloc_end) { + (void) HYPERVISOR_update_va_mapping(alloc_addr, 0, UVMF_INVLPG | UVMF_LOCAL); - next_avail_addr += MMU_PAGESIZE; + alloc_addr += MMU_PAGESIZE; } bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info; @@ -2565,9 +2739,9 @@ startup_kernel(void) #else /* __xpv */ - bi->bi_next_paddr = next_avail_addr; + bi->bi_next_paddr = kalloc_start; DBG(bi->bi_next_paddr); - bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; + bi->bi_next_vaddr = (native_ptr_t)kalloc_start; DBG(bi->bi_next_vaddr); bi->bi_mb_version = multiboot_version; diff --git a/usr/src/uts/i86pc/dboot/dboot_xboot.h b/usr/src/uts/i86pc/dboot/dboot_xboot.h index 7d0876c79c..f261f3f2b1 100644 --- a/usr/src/uts/i86pc/dboot/dboot_xboot.h +++ b/usr/src/uts/i86pc/dboot/dboot_xboot.h @@ -22,6 +22,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2020 Joyent, Inc. */ #ifndef _DBOOT_XBOOT_H @@ -52,16 +54,14 @@ extern uint_t prom_debug; #define DBG_MSG(s) do { if (prom_debug) \ dboot_printf(s); \ - _NOTE(CONSTANTCONDITION) \ } while (0) -#define DBG(x) do { if (prom_debug) { \ - dboot_printf("%s is 0x%" PRIx64 "\n", #x, (uint64_t)(x)); \ - _NOTE(CONSTANTCONDITION) \ +#define DBG(x) do { if (prom_debug) { \ + dboot_printf("%s: %s is 0x%" PRIx64 "\n", \ + __func__, #x, (uint64_t)(x)); \ } } while (0) -extern void dboot_halt(void); -extern void *mem_alloc(uint32_t size); +extern void dboot_halt(void) __NORETURN; #define RNDUP(x, y) (((x) + ((y) - 1ul)) & ~((y) - 1ul)) diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index bff745b483..54a0ac3506 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2020 Joyent, Inc. * Copyright 2020 RackTop Systems, Inc. diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c index b59d87bdcc..623c6e5617 100644 --- a/usr/src/uts/i86pc/io/psm/psm_common.c +++ b/usr/src/uts/i86pc/io/psm/psm_common.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
 */

#include <sys/types.h>
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
index 4b5102d547..17249eb747 100644
--- a/usr/src/uts/i86pc/ml/kpti_trampolines.s
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -667,6 +667,8 @@ tr_intr_ret_end:
 	MK_INTR_TRAMPOLINE_NOERR(invaltrap)
 	MK_INTR_TRAMPOLINE_NOERR(fasttrap)
 	MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
+	MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80)
+	MK_INTR_TRAMPOLINE_NOERR(sys_int80)
 
 	/*
 	 * These are special because they can interrupt other traps, and
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index 622f7cd2a3..6c1de5c145 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -144,6 +144,7 @@ _klwp
 	lwp_thread
 	lwp_procp
 	lwp_brand
+	lwp_brand_syscall
 	lwp_eosys
 	lwp_regs
 	lwp_arg
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index 8a68b4bced..8040e35297 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -515,6 +515,7 @@ noprod_sys_syscall:
 	movq	T_LWP(%r15), %r14
 	ASSERT_NO_RUPDATE_PENDING(%r14)
+	ENABLE_INTR_FLAGS
 
 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
@@ -528,6 +529,37 @@ noprod_sys_syscall:
 	incq	%gs:CPU_STATS_SYS_SYSCALL
 
+	/*
+	 * If our LWP has an alternate system call handler, run that instead of
+	 * the regular system call path.
+	 */
+	movq	LWP_BRAND_SYSCALL(%r14), %rdi
+	testq	%rdi, %rdi
+	jz	_syscall_no_brand
+
+	pushq	%rax
+	subq	$8, %rsp	/* align stack for call to C */
+	INDIRECT_CALL_REG(rdi)
+	addq	$8, %rsp
+
+	/*
+	 * If the alternate handler returns non-zero, the normal system call
+	 * processing is resumed.
+	 */
+	testl	%eax, %eax
+	popq	%rax
+	jnz	_syscall_no_brand
+
+	/*
+	 * For branded syscalls which were handled in-kernel, shuffle the
+	 * register state as would be done by the native handler before jumping
+	 * to the post-syscall logic.
+	 */
+	movq	REGOFF_RAX(%rsp), %r12
+	movq	REGOFF_RDX(%rsp), %r13
+	jmp	_syscall_after_brand
+
+_syscall_no_brand:
 	movw	%ax, T_SYSNUM(%r15)
 	movzbl	T_PRE_SYS(%r15), %ebx
 	ORL_SYSCALLTRACE(%ebx)
@@ -563,6 +595,8 @@ _syscall_invoke:
 	shrq	$32, %r13	/* upper 32-bits into %edx */
 	movl	%r12d, %r12d	/* lower 32-bits into %eax */
 5:
+
+_syscall_after_brand:
 	/*
 	 * Optimistically assume that there's no post-syscall
 	 * work to do. (This is to avoid having to call syscall_mstate()
@@ -825,11 +859,46 @@ _syscall32_save:
 	incq	 %gs:CPU_STATS_SYS_SYSCALL
 
 	/*
+	 * If our lwp has an alternate system call handler, run that instead
+	 * of the regular system call path.
+	 */
+	movq	LWP_BRAND_SYSCALL(%r14), %rax
+	testq	%rax, %rax
+	jz	_syscall32_no_brand
+
+	movb	$LWP_SYS, LWP_STATE(%r14)
+	INDIRECT_CALL_REG(rax)
+
+	/*
+	 * If the alternate handler returns non-zero, the normal system call
+	 * processing is resumed.
+	 */
+	testl	%eax, %eax
+	jnz	_syscall32_no_brand
+
+	/*
+	 * For branded syscalls which were handled in-kernel, shuffle the
+	 * register state as would be done by the native handler before jumping
+	 * to the post-syscall logic.
+	 */
+	movl	REGOFF_RAX(%rsp), %r12d
+	movl	REGOFF_RDX(%rsp), %r13d
+	jmp	_syscall32_after_brand
+
+_syscall32_no_brand:
+	/*
 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
 	 * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
 	 * more succinctly:
 	 *
 	 *	SA(MAXSYSARGS * sizeof (long)) == 64
+	 *
+	 * Note, this space is used not only to copy in the arguments from
+	 * user land, but also as part of the old UNIX style syscall_ap() method.
+ * syscall_entry expects that we do not change the values of this space + * that we give it. However, this means that when we end up in the more + * recent model of passing the arguments based on the calling + * conventions, we'll need to save an additional 16 bytes of stack. */ #define SYS_DROP 64 /* drop for args */ subq $SYS_DROP, %rsp @@ -857,12 +926,16 @@ _syscall32_save: */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -881,6 +954,8 @@ _syscall32_save: shrq $32, %r13 /* upper 32-bits into %edx */ movl %eax, %r12d /* lower 32-bits into %eax */ +_syscall32_after_brand: + /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -1133,15 +1208,20 @@ _full_syscall_postsys32: /* * Fetch the arguments copied onto the kernel stack and put * them in the right registers to invoke a C-style syscall handler. - * %rax contains the handler address. + * %rax contains the handler address. For the last two arguments, we + * push them onto the stack -- we can't clobber the old arguments. */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -1220,6 +1300,66 @@ _full_syscall_postsys32: SET_SIZE(brand_sys_sysenter) /* + * System call via an int80. This entry point is only used by the Linux + * application environment. Unlike the other entry points, there is no + * default action to take if no callback is registered for this process. + */ + + ENTRY_NP(brand_sys_int80) + SWAPGS /* kernel gsbase */ + XPV_TRAP_POP + call smap_enable + + /* + * We first attempt to call the "b_int80" handler from the "struct + * brand_mach_ops" for this brand. If no handler function is installed + * for this brand, the BRAND_CALLBACK() macro returns here and we + * check the lwp for a "lwp_brand_syscall" handler. + */ + BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK()) + + /* + * Check to see if this lwp provides "lwp_brand_syscall". If so, we + * will route this int80 through the regular system call handling path. + */ + movq %r15, %gs:CPU_RTMP_R15 + movq %gs:CPU_THREAD, %r15 + movq T_LWP(%r15), %r15 + movq LWP_BRAND_SYSCALL(%r15), %r15 + testq %r15, %r15 + movq %gs:CPU_RTMP_R15, %r15 + jnz nopop_syscall_int + + /* + * The brand provided neither a "b_int80", nor a "lwp_brand_syscall" + * function, and has thus opted out of handling this trap. + */ + SWAPGS /* user gsbase */ + jmp nopop_int80 + + ENTRY_NP(sys_int80) + /* + * We hit an int80, but this process isn't of a brand with an int80 + * handler. Bad process! 
Make it look as if the INT failed.
+	 * Modify %rip to point before the INT, push the expected error
+	 * code and fake a GP fault. Note on 64-bit hypervisor we need
+	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
+	 * because gptrap will pop them again with its own XPV_TRAP_POP.
+	 */
+	XPV_TRAP_POP
+	call	smap_enable
+nopop_int80:
+	subq	$2, (%rsp)	/* int insn 2-bytes */
+	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+#if defined(__xpv)
+	push	%r11
+	push	%rcx
+#endif
+	jmp	gptrap			/ GP fault
+	SET_SIZE(sys_int80)
+	SET_SIZE(brand_sys_int80)
+
 /*
  * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
  * the generic i386 libc to do system calls. We do a small amount of setup
  * before jumping into the existing sys_syscall32 path.
diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c
index 4a5c71b35d..e878f765ef 100644
--- a/usr/src/uts/i86pc/os/cpr_impl.c
+++ b/usr/src/uts/i86pc/os/cpr_impl.c
@@ -23,6 +23,10 @@
  */
 
 /*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
  * Platform specific implementation code
  * Currently only suspend to RAM is supported (ACPI S3)
  */
@@ -737,6 +741,20 @@ i_cpr_is_supported(int sleeptype)
 	if (sleeptype != CPR_TORAM)
 		return (0);
 
+	/*
+	 * Unfortunately, the x86 resume code was never implemented for GAS.
+	 * The only obvious problem is that a trick necessary to appease Sun
+	 * Studio does the wrong thing for GAS. Doubly unfortunate is that
+	 * the condition used to detect GAS is incorrect, so we do in fact
+	 * compile the Studio path; it just immediately fails in resume.
+	 *
+	 * Given that, if we were built using GCC, never allow CPR to be
+	 * attempted.
+	 */
+#ifdef __GNUC__
+	return (0);
+#else
+
 	/*
 	 * The next statement tests if a specific platform has turned off
 	 * cpr support.
@@ -751,6 +769,7 @@ i_cpr_is_supported(int sleeptype)
 		return (1);
 
 	return (pm_S3_enabled);
+#endif
 }
 
 void
diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c
index d9ed882705..fab1324787 100644
--- a/usr/src/uts/i86pc/os/ibft.c
+++ b/usr/src/uts/i86pc/os/ibft.c
@@ -39,6 +39,7 @@
 #include <sys/kmem.h>
 #include <sys/psm.h>
 #include <sys/bootconf.h>
+#include <sys/reboot.h>
 
 typedef enum ibft_structure_type {
 	Reserved = 0,
@@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp);
 static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft,
 	    iscsi_ibft_tgt_t *tgtp);
 
+extern int boothowto;
 
 /*
  * Return value:
@@ -759,7 +761,9 @@ ld_ib_prop()
 	 * 1) pass "-B ibft-noprobe=1" on kernel command line
 	 * 2) add line "set ibft_noprobe=1" in /etc/system
 	 */
-	cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+	if (boothowto & RB_VERBOSE) {
+		cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+	}
 	return;
 }
diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c
index ed463fba8f..6320c0a949 100644
--- a/usr/src/uts/i86pc/os/lgrpplat.c
+++ b/usr/src/uts/i86pc/os/lgrpplat.c
@@ -2800,7 +2800,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
 /*
  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
  * and memory are local to each other in the same NUMA node and return number
- * of nodes
+ * of nodes.
+ *
+ * The SRAT table pointer is populated during bootup by
+ * build_firmware_properties() in fakebop.c. Several motherboard and BIOS
+ * manufacturers are guilty of not having a SRAT table.
*/ static int lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, @@ -2817,9 +2821,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, /* * Nothing to do when no SRAT or disabled */ - if (tp == NULL || !lgrp_plat_srat_enable) + if (!lgrp_plat_srat_enable) return (-1); + if (tp == NULL) { + cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. " + "lgrp support will be limited to one group.\n"); + return (-1); + } + /* * Try to get domain information from MSCT table. * ACPI4.0: OSPM will use information provided by the MSCT only diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index a0bb296e70..e1e92ffe4f 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -2450,6 +2450,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; + pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index b7c18bb8c9..063fac49f7 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -99,6 +99,7 @@ #include <sys/hypervisor.h> #endif #include <sys/contract/process_impl.h> +#include <sys/brand.h> #define USER 0x10000 /* user-mode flag added to trap type */ @@ -810,6 +811,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) fault_type = F_INVAL; } + /* + * Allow the brand to interpose on invalid memory accesses + * prior to running the native pagefault handler. If this + * brand hook returns zero, it was able to handle the fault + * completely. Otherwise, drive on and call pagefault(). + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL && + BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) { + goto out; + } + res = pagefault(addr, fault_type, rw, 0); /* diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 26626ec5a4..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -386,7 +386,7 @@ struct apic_io_intr { /* special or reserve vectors */ #define APIC_CHECK_RESERVE_VECTORS(v) \ (((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \ - ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET)) + ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80)) /* cmos shutdown code for BIOS */ #define BIOS_SHUTDOWN 0x0a diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h index 520ad9001d..ea19c856a8 100644 --- a/usr/src/uts/i86pc/sys/comm_page.h +++ b/usr/src/uts/i86pc/sys/comm_page.h @@ -27,6 +27,7 @@ extern "C" { #endif #define COMM_PAGE_SIZE PAGESIZE +#define COMM_PAGE_ALIGN 0x4000 #ifndef _ASM diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h index 90a5245217..0d0c95535c 100644 --- a/usr/src/uts/i86pc/sys/vm_machparam.h +++ b/usr/src/uts/i86pc/sys/vm_machparam.h @@ -23,6 +23,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_VM_MACHPARAM_H @@ -129,11 +130,12 @@ extern "C" { * * XXX - The system doesn't account for multiple swap devices. */ -#define DISKRPM 60 +#define DISKRPM 600 /* * The maximum value for handspreadpages which is the the distance - * between the two clock hands in pages. + * between the two clock hands in pages. This is only used when the page + * scanner is first started. 
*/ #define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE) diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 7650d28f41..ea9436e881 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -3808,7 +3808,7 @@ hat_page_getattr(struct page *pp, uint_t flag) /* - * common code used by hat_pageunload() and hment_steal() + * common code used by hat_page_inval() and hment_steal() */ hment_t * hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) @@ -3864,15 +3864,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) extern int vpm_enable; /* - * Unload all translations to a page. If the page is a subpage of a large + * Unload translations to a page. If the page is a subpage of a large * page, the large page mappings are also removed. - * - * The forceflags are unused. + * If curhat is not NULL, then we only unload the translation + * for the given process, otherwise all translations are unloaded. */ - -/*ARGSUSED*/ -static int -hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) +void +hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat) { page_t *cur_pp = pp; hment_t *hm; @@ -3880,16 +3878,11 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) htable_t *ht; uint_t entry; level_t level; + ulong_t cnt = 0; XPV_DISALLOW_MIGRATE(); /* - * prevent recursion due to kmem_free() - */ - ++curthread->t_hatdepth; - ASSERT(curthread->t_hatdepth < 16); - - /* * clear the vpm ref. */ if (vpm_enable) { @@ -3899,6 +3892,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) * The loop with next_size handles pages with multiple pagesize mappings */ next_size: + if (curhat != NULL) + cnt = hat_page_getshare(cur_pp); for (;;) { /* @@ -3910,14 +3905,13 @@ next_size: if (hm == NULL) { x86_hm_exit(cur_pp); +curproc_done: /* * If not part of a larger page, we're done. */ if (cur_pp->p_szc <= pg_szcd) { - ASSERT(curthread->t_hatdepth > 0); - --curthread->t_hatdepth; XPV_ALLOW_MIGRATE(); - return (0); + return; } /* @@ -3936,8 +3930,20 @@ next_size: * If this mapping size matches, remove it. */ level = ht->ht_level; - if (level == pg_szcd) - break; + if (level == pg_szcd) { + if (curhat == NULL || ht->ht_hat == curhat) + break; + /* + * Unloading only the given process but it's + * not the hat for the current process. Leave + * entry in place. Also do a safety check to + * ensure we don't get in an infinite loop + */ + if (cnt-- == 0) { + x86_hm_exit(cur_pp); + goto curproc_done; + } + } } /* @@ -3947,14 +3953,44 @@ next_size: hm = hati_page_unmap(cur_pp, ht, entry); if (hm != NULL) hment_free(hm); + + /* Perform check above for being part of a larger page. */ + if (curhat != NULL) + goto curproc_done; } } +/* + * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then + * we only unload the translation for the current process, otherwise all + * translations are unloaded. 
+ */ +static int +hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag) +{ + struct hat *curhat = NULL; + + /* + * prevent recursion due to kmem_free() + */ + ++curthread->t_hatdepth; + ASSERT(curthread->t_hatdepth < 16); + + if (unloadflag == HAT_CURPROC_PGUNLOAD) + curhat = curthread->t_procp->p_as->a_hat; + + hat_page_inval(pp, pg_szcd, curhat); + + ASSERT(curthread->t_hatdepth > 0); + --curthread->t_hatdepth; + return (0); +} + int -hat_pageunload(struct page *pp, uint_t forceflag) +hat_pageunload(struct page *pp, uint_t unloadflag) { ASSERT(PAGE_EXCL(pp)); - return (hati_pageunload(pp, 0, forceflag)); + return (hati_pageunload(pp, 0, unloadflag)); } /* diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index bb18b5c462..769bbd15d2 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,6 +21,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -35,6 +36,7 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> +#include <sys/zone.h> /* @@ -319,6 +321,8 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; + zone_add_page(pp); + /* * Add the hment to the system-wide hash table. */ @@ -460,6 +464,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; + zone_add_page(pp); return; } @@ -541,6 +546,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; + zone_rm_page(pp); return (NULL); } @@ -576,6 +582,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; + zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index 225628b1c8..bc9d03e7f5 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -711,10 +711,8 @@ void map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) { struct proc *p = curproc; - caddr_t userlimit = (flags & _MAP_LOW32) ? - (caddr_t)_userlimit32 : p->p_as->a_userlimit; - - map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); + map_addr_proc(addrp, len, off, vacalign, + map_userlimit(p, p->p_as, flags), curproc, flags); } /*ARGSUSED*/ @@ -3546,7 +3544,7 @@ page_create_io( if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } if (flags & PG_PHYSCONTIG) { |
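Two of the dboot changes above lend themselves to small standalone illustrations. First, the allocator: dboot_startkern.c replaces the old best-fit mem_alloc() loop over the memlists with a bump allocator that works within a fixed window [alloc_addr, alloc_end) found once, up front, by init_dboot_alloc(). What follows is a minimal user-space C sketch of that model, not the kernel code itself; the function names mirror the patch, but the malloc-backed arena, the sizes, and the error handling are illustrative stand-ins so the sketch can run anywhere.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MMU_PAGESIZE	4096u
#define RNDUP(x, y)	(((x) + ((y) - 1ul)) & ~((y) - 1ul))

/* The window that init_dboot_alloc() carves out of the memlists. */
static uintptr_t alloc_addr;
static uintptr_t alloc_end;

/*
 * Bump allocation: round the cursor up to the requested alignment, hand
 * back page-rounded, zeroed memory, and never free.  A request that would
 * run past alloc_end is a hard failure, as in the patch.
 */
static void *
dboot_alloc(uint32_t size, uint32_t align)
{
	uintptr_t start = RNDUP(alloc_addr, align);

	size = RNDUP(size, MMU_PAGESIZE);
	if (start + size > alloc_end) {
		fprintf(stderr, "dboot_alloc: 0x%x bytes overflow window\n",
		    (unsigned)size);
		abort();
	}
	alloc_addr = start + size;
	memset((void *)start, 0, size);
	return ((void *)start);
}

/* page_alloc() in the patch is the one-page, page-aligned case. */
static void *
page_alloc(void)
{
	return (dboot_alloc(MMU_PAGESIZE, MMU_PAGESIZE));
}

int
main(void)
{
	/* Stand-in for the physical window found in the memlists. */
	size_t arena_size = 64 * MMU_PAGESIZE;
	void *arena = aligned_alloc(MMU_PAGESIZE, arena_size);

	if (arena == NULL)
		return (1);
	alloc_addr = (uintptr_t)arena;
	alloc_end = alloc_addr + arena_size;

	void *nucleus = dboot_alloc(8 * MMU_PAGESIZE, 4 * MMU_PAGESIZE);
	void *ptable = page_alloc();

	printf("nucleus %p, page table page %p\n", nucleus, ptable);
	free(arena);
	return (0);
}

Because the window is sized and placed once (nucleus plus a worst-case page-table budget, below 4Gb, clear of boot modules), exhaustion surfaces as an immediate panic in init_dboot_alloc() or dboot_alloc() rather than as corruption later, which appears to be the point of the restructuring.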
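Second, the RICHMOND-16 carve-out: dboot_add_memlist() punches the corruptible window [CORRUPT_REGION_START, CORRUPT_REGION_END) out of any usable range the BIOS reports, unless the disable-RICHMOND-16 property is set or a boot module already occupies the window. Here is a self-contained sketch of just the range-splitting arithmetic, assuming half-open [start, end) ranges; BAD_START, BAD_END, and emit() are illustrative stand-ins for the CORRUPT_REGION_* constants and the memlists[] bookkeeping, and the property/module checks are omitted.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for CORRUPT_REGION_START/_END in the patch. */
#define BAD_START	0xc700000ull
#define BAD_END		(BAD_START + 0x100000ull)

/* Stand-in for appending an entry to memlists[]; [start, end) is half-open. */
static void
emit(uint64_t start, uint64_t end)
{
	printf("memlist: 0x%llx-0x%llx\n",
	    (unsigned long long)start, (unsigned long long)end);
}

/*
 * Add [start, end) as usable memory, splitting it around the
 * known-corruptible window -- the same three cases dboot_add_memlist()
 * handles:
 *   1. the range straddles the window's start: emit the low piece and
 *      either resume past the window or stop;
 *   2. the range starts inside the window: clip it to BAD_END, or drop
 *      it entirely if it never escapes;
 *   3. the range misses the window: emit it unchanged.
 */
static void
add_memlist(uint64_t start, uint64_t end)
{
	if (start < BAD_START && end > BAD_START) {
		emit(start, BAD_START);		/* usable low piece */
		if (end <= BAD_END)
			return;			/* remainder is all bad */
		start = BAD_END;		/* resume past the window */
	} else if (start >= BAD_START && start < BAD_END) {
		if (end <= BAD_END)
			return;			/* swallowed entirely */
		start = BAD_END;
	}
	emit(start, end);
}

int
main(void)
{
	add_memlist(0xc000000, 0xd000000);	/* straddles: split in two */
	add_memlist(0xc780000, 0xc800000);	/* inside: dropped */
	add_memlist(0x100000, 0xc000000);	/* below: unchanged */
	return (0);
}

One subtlety in the committed helper: ranges_intersect() tests s1 < e2 && e1 >= s2, so two ranges that merely abut (a module ending exactly at CORRUPT_REGION_START) still count as intersecting; for this caller that means the workaround is disabled even when a module only touches the window's edge, consistent with the comment's preference for keeping module-bearing memory mapped over applying the carve-out.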