author		Jerry Jelinek <jerry.jelinek@joyent.com>	2016-04-29 11:35:48 +0000
committer	Jerry Jelinek <jerry.jelinek@joyent.com>	2016-04-29 11:35:48 +0000
commit		d006a4cbd4e9a251a94eec493f1618a9e452992a
tree		65a5d74b65e9786362a556a4a09f82fd2bbcfbe6 /usr/src
parent		3e4225906c66015597649ddcdb4fe37a2204fec2
parent		af868f46a5b794687741d5424de9e3a2d684a84a
[illumos-gate merge]
commit af868f46a5b794687741d5424de9e3a2d684a84a
	6914 kernel virtual memory fragmentation leads to hang
commit 23a268cfbc75530b746495f3e157b9bc71069420
	6565 pargs crashes on growing env
commit 1872b0b513cebbb59f48d3164530f8598d11df23
	6929 couple of updates to i386 disassembler tables
Diffstat (limited to 'usr/src')
-rw-r--r--	usr/src/cmd/ptools/pargs/pargs.c	 6
-rw-r--r--	usr/src/common/dis/i386/dis_tables.c	 4
-rw-r--r--	usr/src/uts/common/fs/zfs/arc.c	21
-rw-r--r--	usr/src/uts/i86pc/os/startup.c	84
-rw-r--r--	usr/src/uts/i86pc/sys/machparam.h	 2
5 files changed, 71 insertions, 46 deletions
diff --git a/usr/src/cmd/ptools/pargs/pargs.c b/usr/src/cmd/ptools/pargs/pargs.c
index 54792b216a..d8072b56a1 100644
--- a/usr/src/cmd/ptools/pargs/pargs.c
+++ b/usr/src/cmd/ptools/pargs/pargs.c
@@ -91,6 +91,7 @@ typedef struct pargs_data {
uintptr_t *pd_argv;
char **pd_argv_strs;
size_t pd_envc;
+ size_t pd_envc_curr;
uintptr_t *pd_envp;
char **pd_envp_strs;
size_t pd_auxc;
@@ -641,6 +642,10 @@ build_env(void *data, struct ps_prochandle *pr, uintptr_t addr, const char *str)
pargs_data_t *datap = data;
if (datap->pd_envp != NULL) {
+ /* env has more items than last time, skip the newer ones */
+ if (datap->pd_envc > datap->pd_envc_curr)
+ return (0);
+
datap->pd_envp[datap->pd_envc] = addr;
if (str == NULL)
datap->pd_envp_strs[datap->pd_envc] = NULL;
@@ -660,6 +665,7 @@ get_env(pargs_data_t *datap)
datap->pd_envc = 0;
(void) Penv_iter(pr, build_env, datap);
+ datap->pd_envc_curr = datap->pd_envc;
datap->pd_envp = safe_zalloc(sizeof (uintptr_t) * datap->pd_envc);
datap->pd_envp_strs = safe_zalloc(sizeof (char *) * datap->pd_envc);
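
The hunks above are the fix for 6565: build_env() is run by Penv_iter() twice, first with pd_envp still NULL purely to count entries, and again after pd_envp/pd_envp_strs have been sized from that count to fill them in. If the target's environment grows between the two passes, the extra entries would overrun the arrays, so the count is remembered in pd_envc_curr and later entries are skipped. A minimal standalone sketch of the same count-then-fill guard (the names below are hypothetical, not pargs code):

#include <stdlib.h>
#include <string.h>

typedef struct env_snap {
	size_t	es_count;	/* entries seen so far in this pass */
	size_t	es_limit;	/* entries counted during the first pass */
	char	**es_strs;	/* NULL during the counting pass */
} env_snap_t;

/* callback run once per environment entry, in both passes */
static int
snap_entry(env_snap_t *es, const char *str)
{
	if (es->es_strs != NULL) {
		/* filling pass: ignore entries added after the count was taken */
		if (es->es_count >= es->es_limit)
			return (0);
		es->es_strs[es->es_count] = (str == NULL) ? NULL : strdup(str);
	}
	es->es_count++;
	return (0);
}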
diff --git a/usr/src/common/dis/i386/dis_tables.c b/usr/src/common/dis/i386/dis_tables.c
index eeff5fb1e9..c21c392d77 100644
--- a/usr/src/common/dis/i386/dis_tables.c
+++ b/usr/src/common/dis/i386/dis_tables.c
@@ -1456,7 +1456,7 @@ const instable_t dis_op0F38[256] = {
/* [78] */ INVALID, INVALID, INVALID, INVALID,
/* [7C] */ INVALID, INVALID, INVALID, INVALID,
-/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),INVALID, INVALID,
+/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),TNSy("invpcid", RM_66r),INVALID,
/* [84] */ INVALID, INVALID, INVALID, INVALID,
/* [88] */ INVALID, INVALID, INVALID, INVALID,
/* [8C] */ INVALID, INVALID, INVALID, INVALID,
@@ -2214,7 +2214,7 @@ const instable_t dis_distable[16][16] = {
/* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA),
/* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA),
/* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), TNS("fwait",NORM),
-/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNSx("sahf",NORM), TNSx("lahf",NORM),
+/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNS("sahf",NORM), TNS("lahf",NORM),
}, {
/* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO),
/* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD),
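
For orientation, dis_op0F38 above is the map for three-byte opcodes of the form 0F 38 xx, so the added invpcid entry at [82] decodes 66 0F 38 82, next to invept (80) and invvpid (81); dis_distable is a 16x16 map of one-byte opcodes indexed by nibble, so [9,E] and [9,F] are sahf and lahf, and dropping the "x" suffix from TNSx marks them as decodable in 64-bit mode again. A hypothetical helper (not part of dis_tables.c) just to show the nibble indexing:

#include <stdint.h>

/*
 * A one-byte opcode maps into the 16x16 dis_distable by nibble:
 * 0x9e -> row 9, column E ("sahf"), 0x9f -> row 9, column F ("lahf").
 */
static void
distable_coords(uint8_t opcode, unsigned *row, unsigned *col)
{
	*row = opcode >> 4;	/* high nibble selects the row */
	*col = opcode & 0xf;	/* low nibble selects the column */
}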
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index c3e4101a42..7cd1a6d7a8 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -5008,18 +5008,6 @@ arc_init(void)
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
- /* Start out with 1/8 of all memory */
- arc_c = allmem / 8;
-
-#ifdef _KERNEL
- /*
- * On architectures where the physical memory can be larger
- * than the addressable space (intel in 32-bit mode), we may
- * need to limit the cache to 1/8 of VM size.
- */
- arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
-#endif
-
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(allmem / 32, 64 << 20);
/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
@@ -5054,6 +5042,15 @@ arc_init(void)
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+#ifdef _KERNEL
+ /*
+ * Metadata is stored in the kernel's heap. Don't let us
+ * use more than half the heap for the ARC.
+ */
+ arc_meta_limit = MIN(arc_meta_limit,
+ vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
+#endif
+
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
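
A worked example of the sizing rules in the hunks above, assuming an 8 GB machine whose kernel heap arena spans 6 GB (both figures invented for illustration; the variables below are local to the sketch, not the arc.c symbols):

#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t allmem = 8ULL << 30;	/* 8 GB of physical memory */
	uint64_t heap = 6ULL << 30;	/* assumed kernel heap VA span */

	/* min cache: 1/32 of memory or 64MB, whichever is more -> 256 MB */
	uint64_t arc_c_min = MAX(allmem / 32, 64ULL << 20);
	/* max cache: 3/4 of memory or all but 1GB, whichever is more -> 7 GB */
	uint64_t arc_c_max = MAX(allmem * 3 / 4, allmem - (1ULL << 30));
	/* meta limit: 1/4 of max capacity, now also capped at half the heap */
	uint64_t arc_meta_limit = MIN(arc_c_max / 4, heap / 2);

	(void) printf("min %llu MB, max %llu MB, meta %llu MB\n",
	    (unsigned long long)(arc_c_min >> 20),
	    (unsigned long long)(arc_c_max >> 20),
	    (unsigned long long)(arc_meta_limit >> 20));
	return (0);
}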
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 0f16f3cc63..16c683d993 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -23,6 +23,7 @@
* Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 Joyent, Inc.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -404,9 +405,9 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | page_t structures |
- * | memsegs, memlists, |
- * | page hash, etc. |
+ * | page_t structures |
+ * | memsegs, memlists, |
+ * | page hash, etc. |
* --- -|-----------------------|- ekernelheap, valloc_base (floating)
* | | (segkp is just an arena in the heap)
* | |
@@ -414,7 +415,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* | |
* | |
* --- -|-----------------------|- kernelheap (floating)
- * | Segkmap |
+ * | Segkmap |
* 0xC3002000 -|-----------------------|- segmap_start (floating)
* | Red Zone |
* 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating)
@@ -438,7 +439,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE
* | debugger (?) |
* 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE
- * | unused |
+ * | unused |
* +-----------------------+
* | Kernel Data |
* 0xFFFFFFFF.FBC00000 |-----------------------|
@@ -447,7 +448,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | Core heap | (used for loadable modules)
+ * | Core heap | (used for loadable modules)
* 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
* | Kernel |
* | heap |
@@ -460,23 +461,23 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
* | segkp |
* --- |-----------------------|- segkp_base (floating)
- * | page_t structures | valloc_base + valloc_sz
- * | memsegs, memlists, |
- * | page hash, etc. |
- * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if > 1TB)
+ * | page_t structures | valloc_base + valloc_sz
+ * | memsegs, memlists, |
+ * | page hash, etc. |
+ * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB)
* | segkpm |
* 0xFFFFFE00.00000000 |-----------------------|
* | Red Zone |
- * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if > 1TB)
+ * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
* | User stack |- User space memory
- * | |
- * | shared objects, etc | (grows downwards)
+ * | |
+ * | shared objects, etc | (grows downwards)
* : :
- * | |
+ * | |
* 0xFFFF8000.00000000 |-----------------------|
- * | |
- * | VA Hole / unused |
- * | |
+ * | |
+ * | VA Hole / unused |
+ * | |
* 0x00008000.00000000 |-----------------------|
* | |
* | |
@@ -1243,20 +1244,45 @@ startup_memlist(void)
/*
* The default values of VALLOC_BASE and SEGKPM_BASE should work
- * for values of physmax up to 1 Terabyte. They need adjusting when
- * memory is at addresses above 1 TB. When adjusted, segkpm_base must
+ * for values of physmax up to 256GB (1/4 TB). They need adjusting when
+ * memory is at addresses above 256GB. When adjusted, segkpm_base must
* be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
+ *
+ * In the general case (>256GB), we use (4 * physmem) for the
+ * kernel's virtual addresses, which is divided approximately
+ * as follows:
+ * - 1 * physmem for segkpm
+ * - 1.5 * physmem for segzio
+ * - 1.5 * physmem for heap
+ * Total: 4.0 * physmem
+ *
+ * Note that the segzio and heap sizes are more than physmem so that
+ * VA fragmentation does not prevent either of them from being
+ * able to use nearly all of physmem. The value of 1.5x is determined
+ * experimentally and may need to change if the workload changes.
*/
- if (physmax + 1 > mmu_btop(TERABYTE) ||
- plat_dr_physmax > mmu_btop(TERABYTE)) {
+ if (physmax + 1 > mmu_btop(TERABYTE / 4) ||
+ plat_dr_physmax > mmu_btop(TERABYTE / 4)) {
uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
kpm_resv_amount = mmu_ptob(plat_dr_physmax);
}
- segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
- KERNEL_REDZONE_SIZE)); /* down from top VA */
+ /*
+ * This is what actually controls the KVA : UVA split.
+ * The kernel uses high VA, and this is lowering the
+ * boundary, thus increasing the amount of VA for the kernel.
+ * This gives the kernel 4 * (amount of physical memory) VA.
+ *
+ * The maximum VA is UINT64_MAX and we are using
+ * 64-bit 2's complement math, so e.g. if you have 512GB
+ * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
+ * UINT64_MAX - 2TB (approximately). So the kernel's
+ * VA is [UINT64_MAX-2TB to UINT64_MAX].
+ */
+ segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount),
+ KERNEL_REDZONE_SIZE));
/* make sure we leave some space for user apps above hole */
segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
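
The two's complement arithmetic in the comment above can be checked in isolation. A small userland sketch, assuming 512 GB of physical memory and ignoring the P2ROUNDUP and the AMD64_VA_HOLE_END clamp:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t kpm_resv_amount = 512ULL << 30;	/* 512 GB */
	uint64_t segkpm_base = -(4 * kpm_resv_amount);	/* -(2 TB), wraps */

	/* prints fffffe0000000000: the kernel gets the top 2 TB of VA */
	(void) printf("%016llx\n", (unsigned long long)segkpm_base);
	return (0);
}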
@@ -1906,8 +1932,9 @@ layout_kernel_va(void)
* segment (from kernel heap) so that we can easily tell not to
* include it in kernel crash dumps on 64 bit kernels. The trick is
* to give it lots of VA, but not constrain the kernel heap.
- * We scale the size of segzio linearly with physmem up to
- * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
+ * We can use 1.5x physmem for segzio, leaving approximately
+ * another 1.5x physmem for heap. See also the comment in
+ * startup_memlist().
*/
segzio_base = segkp_base + mmu_ptob(segkpsize);
if (segzio_fromheap) {
@@ -1915,15 +1942,10 @@ layout_kernel_va(void)
} else {
size_t physmem_size = mmu_ptob(physmem);
size_t size = (segziosize == 0) ?
- physmem_size : mmu_ptob(segziosize);
+ physmem_size * 3 / 2 : mmu_ptob(segziosize);
if (size < SEGZIOMINSIZE)
size = SEGZIOMINSIZE;
- if (size > SEGZIOMAXSIZE) {
- size = SEGZIOMAXSIZE;
- if (physmem_size > size)
- size += (physmem_size - size) / 2;
- }
segziosize = mmu_btop(ROUND_UP_LPAGE(size));
}
PRM_DEBUG(segziosize);
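
The new segzio sizing pulled out as a standalone helper for illustration (the function and its byte-sized parameters are hypothetical; in the kernel the values are page counts converted with mmu_ptob()/mmu_btop() and rounded up to a large page):

#include <stdint.h>

#define	SEGZIOMINSIZE	(400ULL * 1024 * 1024)	/* 400 MB, as in machparam.h */

/*
 * With no segziosize tunable set, segzio now gets 1.5x physmem and is
 * clamped only at the minimum; the old SEGZIOMAXSIZE cap is gone.
 */
static uint64_t
segzio_bytes(uint64_t physmem_bytes, uint64_t tunable_bytes)
{
	uint64_t size = (tunable_bytes == 0) ?
	    physmem_bytes * 3 / 2 : tunable_bytes;

	if (size < SEGZIOMINSIZE)
		size = SEGZIOMINSIZE;
	return (size);
}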
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index a0fa08db16..99ae0d4d3b 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/* Copyright (c) 1988 AT&T */
@@ -191,7 +192,6 @@ extern "C" {
* minimum size for segzio
*/
#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
-#define SEGZIOMAXSIZE (512L * 1024L * 1024L * 1024L) /* 512G */
/*
* During intial boot we limit heap to the top 4Gig.