diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2016-04-29 11:35:48 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2016-04-29 11:35:48 +0000 |
commit | d006a4cbd4e9a251a94eec493f1618a9e452992a (patch) | |
tree | 65a5d74b65e9786362a556a4a09f82fd2bbcfbe6 | |
parent | 3e4225906c66015597649ddcdb4fe37a2204fec2 (diff) | |
parent | af868f46a5b794687741d5424de9e3a2d684a84a (diff) | |
download | illumos-joyent-d006a4cbd4e9a251a94eec493f1618a9e452992a.tar.gz |
[illumos-gate merge]
commit af868f46a5b794687741d5424de9e3a2d684a84a
6914 kernel virtual memory fragmentation leads to hang
commit 23a268cfbc75530b746495f3e157b9bc71069420
6565 pargs crashes on growing env
commit 1872b0b513cebbb59f48d3164530f8598d11df23
6929 couple of updates to i386 disassembler tables
-rw-r--r-- | usr/src/cmd/ptools/pargs/pargs.c | 6 | ||||
-rw-r--r-- | usr/src/common/dis/i386/dis_tables.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/arc.c | 21 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/startup.c | 84 | ||||
-rw-r--r-- | usr/src/uts/i86pc/sys/machparam.h | 2 |
5 files changed, 71 insertions, 46 deletions
diff --git a/usr/src/cmd/ptools/pargs/pargs.c b/usr/src/cmd/ptools/pargs/pargs.c index 54792b216a..d8072b56a1 100644 --- a/usr/src/cmd/ptools/pargs/pargs.c +++ b/usr/src/cmd/ptools/pargs/pargs.c @@ -91,6 +91,7 @@ typedef struct pargs_data { uintptr_t *pd_argv; char **pd_argv_strs; size_t pd_envc; + size_t pd_envc_curr; uintptr_t *pd_envp; char **pd_envp_strs; size_t pd_auxc; @@ -641,6 +642,10 @@ build_env(void *data, struct ps_prochandle *pr, uintptr_t addr, const char *str) pargs_data_t *datap = data; if (datap->pd_envp != NULL) { + /* env has more items than last time, skip the newer ones */ + if (datap->pd_envc > datap->pd_envc_curr) + return (0); + datap->pd_envp[datap->pd_envc] = addr; if (str == NULL) datap->pd_envp_strs[datap->pd_envc] = NULL; @@ -660,6 +665,7 @@ get_env(pargs_data_t *datap) datap->pd_envc = 0; (void) Penv_iter(pr, build_env, datap); + datap->pd_envc_curr = datap->pd_envc; datap->pd_envp = safe_zalloc(sizeof (uintptr_t) * datap->pd_envc); datap->pd_envp_strs = safe_zalloc(sizeof (char *) * datap->pd_envc); diff --git a/usr/src/common/dis/i386/dis_tables.c b/usr/src/common/dis/i386/dis_tables.c index eeff5fb1e9..c21c392d77 100644 --- a/usr/src/common/dis/i386/dis_tables.c +++ b/usr/src/common/dis/i386/dis_tables.c @@ -1456,7 +1456,7 @@ const instable_t dis_op0F38[256] = { /* [78] */ INVALID, INVALID, INVALID, INVALID, /* [7C] */ INVALID, INVALID, INVALID, INVALID, -/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),INVALID, INVALID, +/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),TNSy("invpcid", RM_66r),INVALID, /* [84] */ INVALID, INVALID, INVALID, INVALID, /* [88] */ INVALID, INVALID, INVALID, INVALID, /* [8C] */ INVALID, INVALID, INVALID, INVALID, @@ -2214,7 +2214,7 @@ const instable_t dis_distable[16][16] = { /* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), /* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), /* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), 
TNS("fwait",NORM), -/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNSx("sahf",NORM), TNSx("lahf",NORM), +/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNS("sahf",NORM), TNS("lahf",NORM), }, { /* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO), /* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD), diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index c3e4101a42..7cd1a6d7a8 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -5008,18 +5008,6 @@ arc_init(void) /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; - /* Start out with 1/8 of all memory */ - arc_c = allmem / 8; - -#ifdef _KERNEL - /* - * On architectures where the physical memory can be larger - * than the addressable space (intel in 32-bit mode), we may - * need to limit the cache to 1/8 of VM size. - */ - arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); -#endif - /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(allmem / 32, 64 << 20); /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ @@ -5054,6 +5042,15 @@ arc_init(void) /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; +#ifdef _KERNEL + /* + * Metadata is stored in the kernel's heap. Don't let us + * use more than half the heap for the ARC. + */ + arc_meta_limit = MIN(arc_meta_limit, + vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); +#endif + /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 0f16f3cc63..16c683d993 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -23,6 +23,7 @@ * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. 
* Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2015 Joyent, Inc. + * Copyright (c) 2015 by Delphix. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -404,9 +405,9 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * |--- GDT ---|- GDT page (GDT_VA) * |--- debug info ---|- debug info (DEBUG_INFO_VA) * | | - * | page_t structures | - * | memsegs, memlists, | - * | page hash, etc. | + * | page_t structures | + * | memsegs, memlists, | + * | page hash, etc. | * --- -|-----------------------|- ekernelheap, valloc_base (floating) * | | (segkp is just an arena in the heap) * | | @@ -414,7 +415,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * | | * | | * --- -|-----------------------|- kernelheap (floating) - * | Segkmap | + * | Segkmap | * 0xC3002000 -|-----------------------|- segmap_start (floating) * | Red Zone | * 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating) @@ -438,7 +439,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE * | debugger (?) | * 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE - * | unused | + * | unused | * +-----------------------+ * | Kernel Data | * 0xFFFFFFFF.FBC00000 |-----------------------| @@ -447,7 +448,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * |--- GDT ---|- GDT page (GDT_VA) * |--- debug info ---|- debug info (DEBUG_INFO_VA) * | | - * | Core heap | (used for loadable modules) + * | Core heap | (used for loadable modules) * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap * | Kernel | * | heap | @@ -460,23 +461,23 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating) * | segkp | * --- |-----------------------|- segkp_base (floating) - * | page_t structures | valloc_base + valloc_sz - * | memsegs, memlists, | - * | page hash, etc. 
| - * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if > 1TB) + * | page_t structures | valloc_base + valloc_sz + * | memsegs, memlists, | + * | page hash, etc. | + * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB) * | segkpm | * 0xFFFFFE00.00000000 |-----------------------| * | Red Zone | - * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if > 1TB) + * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB) * | User stack |- User space memory - * | | - * | shared objects, etc | (grows downwards) + * | | + * | shared objects, etc | (grows downwards) * : : - * | | + * | | * 0xFFFF8000.00000000 |-----------------------| - * | | - * | VA Hole / unused | - * | | + * | | + * | VA Hole / unused | + * | | * 0x00008000.00000000 |-----------------------| * | | * | | @@ -1243,20 +1244,45 @@ startup_memlist(void) /* * The default values of VALLOC_BASE and SEGKPM_BASE should work - * for values of physmax up to 1 Terabyte. They need adjusting when - * memory is at addresses above 1 TB. When adjusted, segkpm_base must + * for values of physmax up to 256GB (1/4 TB). They need adjusting when + * memory is at addresses above 256GB. When adjusted, segkpm_base must * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte). + * + * In the general case (>256GB), we use (4 * physmem) for the + * kernel's virtual addresses, which is divided approximately + * as follows: + * - 1 * physmem for segkpm + * - 1.5 * physmem for segzio + * - 1.5 * physmem for heap + * Total: 4.0 * physmem + * + * Note that the segzio and heap sizes are more than physmem so that + * VA fragmentation does not prevent either of them from being + * able to use nearly all of physmem. The value of 1.5x is determined + * experimentally and may need to change if the workload changes. 
*/ - if (physmax + 1 > mmu_btop(TERABYTE) || - plat_dr_physmax > mmu_btop(TERABYTE)) { + if (physmax + 1 > mmu_btop(TERABYTE / 4) || + plat_dr_physmax > mmu_btop(TERABYTE / 4)) { uint64_t kpm_resv_amount = mmu_ptob(physmax + 1); if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) { kpm_resv_amount = mmu_ptob(plat_dr_physmax); } - segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount), - KERNEL_REDZONE_SIZE)); /* down from top VA */ + /* + * This is what actually controls the KVA : UVA split. + * The kernel uses high VA, and this is lowering the + * boundary, thus increasing the amount of VA for the kernel. + * This gives the kernel 4 * (amount of physical memory) VA. + * + * The maximum VA is UINT64_MAX and we are using + * 64-bit 2's complement math, so e.g. if you have 512GB + * of memory, segkpm_base = -(4 * 512GB) == -2TB == + * UINT64_MAX - 2TB (approximately). So the kernel's + * VA is [UINT64_MAX-2TB to UINT64_MAX]. + */ + segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount), + KERNEL_REDZONE_SIZE)); /* make sure we leave some space for user apps above hole */ segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE); @@ -1906,8 +1932,9 @@ layout_kernel_va(void) * segment (from kernel heap) so that we can easily tell not to * include it in kernel crash dumps on 64 bit kernels. The trick is * to give it lots of VA, but not constrain the kernel heap. - * We scale the size of segzio linearly with physmem up to - * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem. + * We can use 1.5x physmem for segzio, leaving approximately + * another 1.5x physmem for heap. See also the comment in + * startup_memlist(). */ segzio_base = segkp_base + mmu_ptob(segkpsize); if (segzio_fromheap) { @@ -1915,15 +1942,10 @@ layout_kernel_va(void) } else { size_t physmem_size = mmu_ptob(physmem); size_t size = (segziosize == 0) ? 
- physmem_size : mmu_ptob(segziosize); + physmem_size * 3 / 2 : mmu_ptob(segziosize); if (size < SEGZIOMINSIZE) size = SEGZIOMINSIZE; - if (size > SEGZIOMAXSIZE) { - size = SEGZIOMAXSIZE; - if (physmem_size > size) - size += (physmem_size - size) / 2; - } segziosize = mmu_btop(ROUND_UP_LPAGE(size)); } PRM_DEBUG(segziosize); diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index a0fa08db16..99ae0d4d3b 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ /* Copyright (c) 1988 AT&T */ @@ -191,7 +192,6 @@ extern "C" { * minimum size for segzio */ #define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ -#define SEGZIOMAXSIZE (512L * 1024L * 1024L * 1024L) /* 512G */ /* * During initial boot we limit heap to the top 4Gig. |