summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2016-04-29 11:35:48 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2016-04-29 11:35:48 +0000
commitd006a4cbd4e9a251a94eec493f1618a9e452992a (patch)
tree65a5d74b65e9786362a556a4a09f82fd2bbcfbe6
parent3e4225906c66015597649ddcdb4fe37a2204fec2 (diff)
parentaf868f46a5b794687741d5424de9e3a2d684a84a (diff)
downloadillumos-joyent-d006a4cbd4e9a251a94eec493f1618a9e452992a.tar.gz
[illumos-gate merge]
commit af868f46a5b794687741d5424de9e3a2d684a84a 6914 kernel virtual memory fragmentation leads to hang commit 23a268cfbc75530b746495f3e157b9bc71069420 6565 pargs crashes on growing env commit 1872b0b513cebbb59f48d3164530f8598d11df23 6929 couple of updates to i386 disassembler tables
-rw-r--r--usr/src/cmd/ptools/pargs/pargs.c6
-rw-r--r--usr/src/common/dis/i386/dis_tables.c4
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c21
-rw-r--r--usr/src/uts/i86pc/os/startup.c84
-rw-r--r--usr/src/uts/i86pc/sys/machparam.h2
5 files changed, 71 insertions, 46 deletions
diff --git a/usr/src/cmd/ptools/pargs/pargs.c b/usr/src/cmd/ptools/pargs/pargs.c
index 54792b216a..d8072b56a1 100644
--- a/usr/src/cmd/ptools/pargs/pargs.c
+++ b/usr/src/cmd/ptools/pargs/pargs.c
@@ -91,6 +91,7 @@ typedef struct pargs_data {
uintptr_t *pd_argv;
char **pd_argv_strs;
size_t pd_envc;
+ size_t pd_envc_curr;
uintptr_t *pd_envp;
char **pd_envp_strs;
size_t pd_auxc;
@@ -641,6 +642,10 @@ build_env(void *data, struct ps_prochandle *pr, uintptr_t addr, const char *str)
pargs_data_t *datap = data;
if (datap->pd_envp != NULL) {
+ /* env has more items than last time, skip the newer ones */
+ if (datap->pd_envc > datap->pd_envc_curr)
+ return (0);
+
datap->pd_envp[datap->pd_envc] = addr;
if (str == NULL)
datap->pd_envp_strs[datap->pd_envc] = NULL;
@@ -660,6 +665,7 @@ get_env(pargs_data_t *datap)
datap->pd_envc = 0;
(void) Penv_iter(pr, build_env, datap);
+ datap->pd_envc_curr = datap->pd_envc;
datap->pd_envp = safe_zalloc(sizeof (uintptr_t) * datap->pd_envc);
datap->pd_envp_strs = safe_zalloc(sizeof (char *) * datap->pd_envc);
diff --git a/usr/src/common/dis/i386/dis_tables.c b/usr/src/common/dis/i386/dis_tables.c
index eeff5fb1e9..c21c392d77 100644
--- a/usr/src/common/dis/i386/dis_tables.c
+++ b/usr/src/common/dis/i386/dis_tables.c
@@ -1456,7 +1456,7 @@ const instable_t dis_op0F38[256] = {
/* [78] */ INVALID, INVALID, INVALID, INVALID,
/* [7C] */ INVALID, INVALID, INVALID, INVALID,
-/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),INVALID, INVALID,
+/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),TNSy("invpcid", RM_66r),INVALID,
/* [84] */ INVALID, INVALID, INVALID, INVALID,
/* [88] */ INVALID, INVALID, INVALID, INVALID,
/* [8C] */ INVALID, INVALID, INVALID, INVALID,
@@ -2214,7 +2214,7 @@ const instable_t dis_distable[16][16] = {
/* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA),
/* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA),
/* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), TNS("fwait",NORM),
-/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNSx("sahf",NORM), TNSx("lahf",NORM),
+/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNS("sahf",NORM), TNS("lahf",NORM),
}, {
/* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO),
/* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD),
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index c3e4101a42..7cd1a6d7a8 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -5008,18 +5008,6 @@ arc_init(void)
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
- /* Start out with 1/8 of all memory */
- arc_c = allmem / 8;
-
-#ifdef _KERNEL
- /*
- * On architectures where the physical memory can be larger
- * than the addressable space (intel in 32-bit mode), we may
- * need to limit the cache to 1/8 of VM size.
- */
- arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
-#endif
-
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(allmem / 32, 64 << 20);
/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
@@ -5054,6 +5042,15 @@ arc_init(void)
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+#ifdef _KERNEL
+ /*
+ * Metadata is stored in the kernel's heap. Don't let us
+ * use more than half the heap for the ARC.
+ */
+ arc_meta_limit = MIN(arc_meta_limit,
+ vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
+#endif
+
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 0f16f3cc63..16c683d993 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -23,6 +23,7 @@
* Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 Joyent, Inc.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -404,9 +405,9 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | page_t structures |
- * | memsegs, memlists, |
- * | page hash, etc. |
+ * | page_t structures |
+ * | memsegs, memlists, |
+ * | page hash, etc. |
* --- -|-----------------------|- ekernelheap, valloc_base (floating)
* | | (segkp is just an arena in the heap)
* | |
@@ -414,7 +415,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* | |
* | |
* --- -|-----------------------|- kernelheap (floating)
- * | Segkmap |
+ * | Segkmap |
* 0xC3002000 -|-----------------------|- segmap_start (floating)
* | Red Zone |
* 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating)
@@ -438,7 +439,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE
* | debugger (?) |
* 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE
- * | unused |
+ * | unused |
* +-----------------------+
* | Kernel Data |
* 0xFFFFFFFF.FBC00000 |-----------------------|
@@ -447,7 +448,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | Core heap | (used for loadable modules)
+ * | Core heap | (used for loadable modules)
* 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
* | Kernel |
* | heap |
@@ -460,23 +461,23 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
* | segkp |
* --- |-----------------------|- segkp_base (floating)
- * | page_t structures | valloc_base + valloc_sz
- * | memsegs, memlists, |
- * | page hash, etc. |
- * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if > 1TB)
+ * | page_t structures | valloc_base + valloc_sz
+ * | memsegs, memlists, |
+ * | page hash, etc. |
+ * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB)
* | segkpm |
* 0xFFFFFE00.00000000 |-----------------------|
* | Red Zone |
- * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if > 1TB)
+ * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
* | User stack |- User space memory
- * | |
- * | shared objects, etc | (grows downwards)
+ * | |
+ * | shared objects, etc | (grows downwards)
* : :
- * | |
+ * | |
* 0xFFFF8000.00000000 |-----------------------|
- * | |
- * | VA Hole / unused |
- * | |
+ * | |
+ * | VA Hole / unused |
+ * | |
* 0x00008000.00000000 |-----------------------|
* | |
* | |
@@ -1243,20 +1244,45 @@ startup_memlist(void)
/*
* The default values of VALLOC_BASE and SEGKPM_BASE should work
- * for values of physmax up to 1 Terabyte. They need adjusting when
- * memory is at addresses above 1 TB. When adjusted, segkpm_base must
+ * for values of physmax up to 256GB (1/4 TB). They need adjusting when
+ * memory is at addresses above 256GB. When adjusted, segkpm_base must
* be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
+ *
+ * In the general case (>256GB), we use (4 * physmem) for the
+ * kernel's virtual addresses, which is divided approximately
+ * as follows:
+ * - 1 * physmem for segkpm
+ * - 1.5 * physmem for segzio
+ * - 1.5 * physmem for heap
+ * Total: 4.0 * physmem
+ *
+ * Note that the segzio and heap sizes are more than physmem so that
+ * VA fragmentation does not prevent either of them from being
+ * able to use nearly all of physmem. The value of 1.5x is determined
+ * experimentally and may need to change if the workload changes.
*/
- if (physmax + 1 > mmu_btop(TERABYTE) ||
- plat_dr_physmax > mmu_btop(TERABYTE)) {
+ if (physmax + 1 > mmu_btop(TERABYTE / 4) ||
+ plat_dr_physmax > mmu_btop(TERABYTE / 4)) {
uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
kpm_resv_amount = mmu_ptob(plat_dr_physmax);
}
- segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
- KERNEL_REDZONE_SIZE)); /* down from top VA */
+ /*
+ * This is what actually controls the KVA : UVA split.
+ * The kernel uses high VA, and this is lowering the
+ * boundary, thus increasing the amount of VA for the kernel.
+ * This gives the kernel 4 * (amount of physical memory) VA.
+ *
+ * The maximum VA is UINT64_MAX and we are using
+ * 64-bit 2's complement math, so e.g. if you have 512GB
+ * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
+ * UINT64_MAX - 2TB (approximately). So the kernel's
+ * VA is [UINT64_MAX-2TB to UINT64_MAX].
+ */
+ segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount),
+ KERNEL_REDZONE_SIZE));
/* make sure we leave some space for user apps above hole */
segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
@@ -1906,8 +1932,9 @@ layout_kernel_va(void)
* segment (from kernel heap) so that we can easily tell not to
* include it in kernel crash dumps on 64 bit kernels. The trick is
* to give it lots of VA, but not constrain the kernel heap.
- * We scale the size of segzio linearly with physmem up to
- * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
+ * We can use 1.5x physmem for segzio, leaving approximately
+ * another 1.5x physmem for heap. See also the comment in
+ * startup_memlist().
*/
segzio_base = segkp_base + mmu_ptob(segkpsize);
if (segzio_fromheap) {
@@ -1915,15 +1942,10 @@ layout_kernel_va(void)
} else {
size_t physmem_size = mmu_ptob(physmem);
size_t size = (segziosize == 0) ?
- physmem_size : mmu_ptob(segziosize);
+ physmem_size * 3 / 2 : mmu_ptob(segziosize);
if (size < SEGZIOMINSIZE)
size = SEGZIOMINSIZE;
- if (size > SEGZIOMAXSIZE) {
- size = SEGZIOMAXSIZE;
- if (physmem_size > size)
- size += (physmem_size - size) / 2;
- }
segziosize = mmu_btop(ROUND_UP_LPAGE(size));
}
PRM_DEBUG(segziosize);
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index a0fa08db16..99ae0d4d3b 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/* Copyright (c) 1988 AT&T */
@@ -191,7 +192,6 @@ extern "C" {
* minimum size for segzio
*/
#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
-#define SEGZIOMAXSIZE (512L * 1024L * 1024L * 1024L) /* 512G */
/*
 * During initial boot we limit heap to the top 4Gig.