summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorMatthew Ahrens <mahrens@delphix.com>2016-04-20 14:43:23 -0700
committerMatthew Ahrens <mahrens@delphix.com>2016-04-28 21:04:28 -0700
commitaf868f46a5b794687741d5424de9e3a2d684a84a (patch)
treef5a8534e5532ec8e14922c3faff48696c9a7ff31 /usr/src
parent23a268cfbc75530b746495f3e157b9bc71069420 (diff)
downloadillumos-joyent-af868f46a5b794687741d5424de9e3a2d684a84a.tar.gz
6914 kernel virtual memory fragmentation leads to hang
Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: John Kennedy <john.kennedy@delphix.com> Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c21
-rw-r--r--usr/src/uts/i86pc/os/startup.c84
-rw-r--r--usr/src/uts/i86pc/sys/machparam.h2
3 files changed, 63 insertions, 44 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 1eec1f84b2..681368e437 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -4999,18 +4999,6 @@ arc_init(void)
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
- /* Start out with 1/8 of all memory */
- arc_c = allmem / 8;
-
-#ifdef _KERNEL
- /*
- * On architectures where the physical memory can be larger
- * than the addressable space (intel in 32-bit mode), we may
- * need to limit the cache to 1/8 of VM size.
- */
- arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
-#endif
-
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(allmem / 32, 64 << 20);
/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
@@ -5045,6 +5033,15 @@ arc_init(void)
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+#ifdef _KERNEL
+ /*
+ * Metadata is stored in the kernel's heap. Don't let us
+ * use more than half the heap for the ARC.
+ */
+ arc_meta_limit = MIN(arc_meta_limit,
+ vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
+#endif
+
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 0f16f3cc63..16c683d993 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -23,6 +23,7 @@
* Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 Joyent, Inc.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -404,9 +405,9 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | page_t structures |
- * | memsegs, memlists, |
- * | page hash, etc. |
+ * | page_t structures |
+ * | memsegs, memlists, |
+ * | page hash, etc. |
* --- -|-----------------------|- ekernelheap, valloc_base (floating)
* | | (segkp is just an arena in the heap)
* | |
@@ -414,7 +415,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* | |
* | |
* --- -|-----------------------|- kernelheap (floating)
- * | Segkmap |
+ * | Segkmap |
* 0xC3002000 -|-----------------------|- segmap_start (floating)
* | Red Zone |
* 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating)
@@ -438,7 +439,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE
* | debugger (?) |
* 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE
- * | unused |
+ * | unused |
* +-----------------------+
* | Kernel Data |
* 0xFFFFFFFF.FBC00000 |-----------------------|
@@ -447,7 +448,7 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
* | |
- * | Core heap | (used for loadable modules)
+ * | Core heap | (used for loadable modules)
* 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
* | Kernel |
* | heap |
@@ -460,23 +461,23 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
* | segkp |
* --- |-----------------------|- segkp_base (floating)
- * | page_t structures | valloc_base + valloc_sz
- * | memsegs, memlists, |
- * | page hash, etc. |
- * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if > 1TB)
+ * | page_t structures | valloc_base + valloc_sz
+ * | memsegs, memlists, |
+ * | page hash, etc. |
+ * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB)
* | segkpm |
* 0xFFFFFE00.00000000 |-----------------------|
* | Red Zone |
- * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if > 1TB)
+ * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB)
* | User stack |- User space memory
- * | |
- * | shared objects, etc | (grows downwards)
+ * | |
+ * | shared objects, etc | (grows downwards)
* : :
- * | |
+ * | |
* 0xFFFF8000.00000000 |-----------------------|
- * | |
- * | VA Hole / unused |
- * | |
+ * | |
+ * | VA Hole / unused |
+ * | |
* 0x00008000.00000000 |-----------------------|
* | |
* | |
@@ -1243,20 +1244,45 @@ startup_memlist(void)
/*
* The default values of VALLOC_BASE and SEGKPM_BASE should work
- * for values of physmax up to 1 Terabyte. They need adjusting when
- * memory is at addresses above 1 TB. When adjusted, segkpm_base must
+ * for values of physmax up to 256GB (1/4 TB). They need adjusting when
+ * memory is at addresses above 256GB. When adjusted, segkpm_base must
* be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
+ *
+ * In the general case (>256GB), we use (4 * physmem) for the
+ * kernel's virtual addresses, which is divided approximately
+ * as follows:
+ * - 1 * physmem for segkpm
+ * - 1.5 * physmem for segzio
+ * - 1.5 * physmem for heap
+ * Total: 4.0 * physmem
+ *
+ * Note that the segzio and heap sizes are more than physmem so that
+ * VA fragmentation does not prevent either of them from being
+ * able to use nearly all of physmem. The value of 1.5x is determined
+ * experimentally and may need to change if the workload changes.
*/
- if (physmax + 1 > mmu_btop(TERABYTE) ||
- plat_dr_physmax > mmu_btop(TERABYTE)) {
+ if (physmax + 1 > mmu_btop(TERABYTE / 4) ||
+ plat_dr_physmax > mmu_btop(TERABYTE / 4)) {
uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
kpm_resv_amount = mmu_ptob(plat_dr_physmax);
}
- segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
- KERNEL_REDZONE_SIZE)); /* down from top VA */
+ /*
+ * This is what actually controls the KVA : UVA split.
+ * The kernel uses high VA, and this is lowering the
+ * boundary, thus increasing the amount of VA for the kernel.
+ * This gives the kernel 4 * (amount of physical memory) VA.
+ *
+ * The maximum VA is UINT64_MAX and we are using
+ * 64-bit 2's complement math, so e.g. if you have 512GB
+ * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
+ * UINT64_MAX - 2TB (approximately). So the kernel's
+ * VA is [UINT64_MAX-2TB to UINT64_MAX].
+ */
+ segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount),
+ KERNEL_REDZONE_SIZE));
/* make sure we leave some space for user apps above hole */
segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
@@ -1906,8 +1932,9 @@ layout_kernel_va(void)
* segment (from kernel heap) so that we can easily tell not to
* include it in kernel crash dumps on 64 bit kernels. The trick is
* to give it lots of VA, but not constrain the kernel heap.
- * We scale the size of segzio linearly with physmem up to
- * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
+ * We can use 1.5x physmem for segzio, leaving approximately
+ * another 1.5x physmem for heap. See also the comment in
+ * startup_memlist().
*/
segzio_base = segkp_base + mmu_ptob(segkpsize);
if (segzio_fromheap) {
@@ -1915,15 +1942,10 @@ layout_kernel_va(void)
} else {
size_t physmem_size = mmu_ptob(physmem);
size_t size = (segziosize == 0) ?
- physmem_size : mmu_ptob(segziosize);
+ physmem_size * 3 / 2 : mmu_ptob(segziosize);
if (size < SEGZIOMINSIZE)
size = SEGZIOMINSIZE;
- if (size > SEGZIOMAXSIZE) {
- size = SEGZIOMAXSIZE;
- if (physmem_size > size)
- size += (physmem_size - size) / 2;
- }
segziosize = mmu_btop(ROUND_UP_LPAGE(size));
}
PRM_DEBUG(segziosize);
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index a0fa08db16..99ae0d4d3b 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
/* Copyright (c) 1988 AT&T */
@@ -191,7 +192,6 @@ extern "C" {
* minimum size for segzio
*/
#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
-#define SEGZIOMAXSIZE (512L * 1024L * 1024L * 1024L) /* 512G */
/*
* During intial boot we limit heap to the top 4Gig.