summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorStan Studzinski <Stan.Studzinski@Sun.COM>2010-04-13 11:03:56 -0700
committerStan Studzinski <Stan.Studzinski@Sun.COM>2010-04-13 11:03:56 -0700
commit23a80de1aec78d238d06caf311eaceb81dd5a440 (patch)
tree0b52be831a38566af69b540ddc88de0ea39c683b /usr/src
parentbf994817a71d4ac680198e25fe79d13c247306e0 (diff)
downloadillumos-joyent-23a80de1aec78d238d06caf311eaceb81dd5a440.tar.gz
6675738 KM_NOSLEEP may still try too hard for some allocations
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/uts/common/os/mem_cage.c201
-rw-r--r--usr/src/uts/common/sys/kmem.h1
-rw-r--r--usr/src/uts/common/sys/vmem.h4
-rw-r--r--usr/src/uts/common/vm/page.h7
-rw-r--r--usr/src/uts/common/vm/seg_kmem.c9
-rw-r--r--usr/src/uts/common/vm/vm_page.c12
6 files changed, 68 insertions, 166 deletions
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c
index 74904a0a38..9e87a9c4f0 100644
--- a/usr/src/uts/common/os/mem_cage.c
+++ b/usr/src/uts/common/os/mem_cage.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -75,7 +74,6 @@ struct kcage_stats_scan {
uint_t kt_cantlock;
uint_t kt_gotone;
uint_t kt_gotonefree;
- uint_t kt_skiplevel;
uint_t kt_skipshared;
uint_t kt_skiprefd;
uint_t kt_destroy;
@@ -1185,15 +1183,9 @@ kcage_cageout_init()
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
- int niter = 0;
- pgcnt_t lastfree;
- int enough = kcage_freemem > kcage_throttlefree + npages;
KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */
- kcage_cageout_wakeup(); /* just to be sure */
- KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */
-
/*
* Obviously, we can't throttle the cageout thread since
* we depend on it. We also can't throttle the panic thread.
@@ -1209,7 +1201,7 @@ kcage_create_throttle(pgcnt_t npages, int flags)
* if freemem is very low.
*/
if (NOMEMWAIT()) {
- if (enough) {
+ if (kcage_freemem > kcage_throttlefree + npages) {
KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */
return (KCT_CRIT);
} else if (freemem < minfree) {
@@ -1235,9 +1227,6 @@ kcage_create_throttle(pgcnt_t npages, int flags)
*/
while (kcage_freemem < kcage_throttlefree + npages) {
ASSERT(kcage_on);
-
- lastfree = kcage_freemem;
-
if (kcage_cageout_ready) {
mutex_enter(&kcage_throttle_mutex);
@@ -1265,23 +1254,20 @@ kcage_create_throttle(pgcnt_t npages, int flags)
atomic_add_long(&kcage_needfree, -npages);
}
- if ((flags & PG_WAIT) == 0) {
- if (kcage_freemem > lastfree) {
- KCAGE_STAT_INCR(kct_progress);
- niter = 0;
- } else {
- KCAGE_STAT_INCR(kct_noprogress);
- if (++niter >= kcage_maxwait) {
- KCAGE_STAT_INCR(kct_timeout);
- return (KCT_FAILURE);
- }
- }
- }
-
if (NOMEMWAIT() && freemem < minfree) {
return (KCT_CRIT);
}
+ if ((flags & PG_WAIT) == 0) {
+ pgcnt_t limit = (flags & PG_NORMALPRI) ?
+ throttlefree : pageout_reserve;
+ if ((kcage_freemem < kcage_throttlefree + npages) &&
+ (freemem < limit + npages)) {
+ return (KCT_FAILURE);
+ } else {
+ return (KCT_NONCRIT);
+ }
+ }
}
return (KCT_NONCRIT);
}
@@ -1393,9 +1379,9 @@ check_free_and_return:
if (page_trylock(pp, SE_SHARED)) {
if (PP_ISNORELOC(pp))
goto check_free_and_return;
- } else
+ } else {
return (EAGAIN);
-
+ }
if (!PP_ISFREE(pp)) {
page_unlock(pp);
return (EAGAIN);
@@ -1471,14 +1457,13 @@ kcage_expand()
* Exit early if expansion amount is equal to or less than zero.
* (<0 is possible if kcage_freemem rises suddenly.)
*
- * Exit early when the global page pool (apparently) does not
- * have enough free pages to page_relocate() even a single page.
+ * Exit early when freemem drops below pageout_reserve plus the request.
*/
wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
 - kcage_freemem;
- if (wanted <= 0)
+ if (wanted <= 0) {
return (0);
- else if (freemem < pageout_reserve + 1) {
+ } else if (freemem < pageout_reserve + wanted) {
KCAGE_STAT_INCR(ke_lowfreemem);
return (0);
}
@@ -1670,6 +1655,18 @@ kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
return (0);
}
+/*
+ * Expand cage only if there is not enough memory to satisfy
+ * current request. We only do one (complete) scan of the cage.
+ * Dirty pages and pages with shared mappings are skipped;
+ * Locked pages (p_lckcnt and p_cowcnt) are also skipped.
+ * All other pages are freed (if they can be locked).
+ * This may affect caching of user pages which are in cage by freeing/
+ * reclaiming them more often. However cage is mainly for kernel (heap)
+ * pages and we want to keep user pages outside of cage. The above policy
+ * should also reduce cage expansion plus it should speed up cage mem
+ * allocations.
+ */
static void
kcage_cageout()
{
@@ -1677,12 +1674,7 @@ kcage_cageout()
page_t *pp;
callb_cpr_t cprinfo;
int did_something;
- int scan_again;
pfn_t start_pfn;
- int pass;
- int last_pass;
- int pages_skipped;
- int shared_skipped;
ulong_t shared_level = 8;
pgcnt_t nfreed;
#ifdef KCAGE_STATS
@@ -1713,14 +1705,9 @@ loop:
KCAGE_STAT_INCR(kt_wakeups);
KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
- pass = 0;
- last_pass = 0;
-
#ifdef KCAGE_STATS
scan_start = ddi_get_lbolt();
#endif
-
-again:
if (!kcage_on)
goto loop;
@@ -1728,16 +1715,16 @@ again:
KCAGE_STAT_INCR_SCAN(kt_passes);
did_something = 0;
- pages_skipped = 0;
- shared_skipped = 0;
- while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
- (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
+ while (kcage_freemem < kcage_lotsfree + kcage_needfree) {
+
+ if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) ==
+ PFN_INVALID) {
+ break;
+ }
if (start_pfn == PFN_INVALID)
start_pfn = pfn;
else if (start_pfn == pfn) {
- last_pass = pass;
- pass += 1;
/*
* Did a complete walk of kernel cage, but didn't free
* any pages. If only one cpu is active then
@@ -1813,63 +1800,12 @@ again:
continue;
}
- KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
if (hat_page_checkshare(pp, shared_level)) {
page_unlock(pp);
- pages_skipped = 1;
- shared_skipped = 1;
KCAGE_STAT_INCR_SCAN(kt_skipshared);
continue;
}
- /*
- * In pass {0, 1}, skip page if ref bit is set.
- * In pass {0, 1, 2}, skip page if mod bit is set.
- */
- prm = hat_pagesync(pp,
- HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
-
- /* On first pass ignore ref'd pages */
- if (pass <= 1 && (prm & P_REF)) {
- KCAGE_STAT_INCR_SCAN(kt_skiprefd);
- pages_skipped = 1;
- page_unlock(pp);
- continue;
- }
-
- /* On pass 2, VN_DISPOSE if mod bit is not set */
- if (pass <= 2) {
- if (pp->p_szc != 0 || (prm & P_MOD) ||
- pp->p_lckcnt || pp->p_cowcnt) {
- pages_skipped = 1;
- page_unlock(pp);
- } else {
-
- /*
- * unload the mappings before
- * checking if mod bit is set
- */
- (void) hat_pageunload(pp,
- HAT_FORCE_PGUNLOAD);
-
- /*
- * skip this page if modified
- */
- if (hat_ismod(pp)) {
- pages_skipped = 1;
- page_unlock(pp);
- continue;
- }
-
- KCAGE_STAT_INCR_SCAN(kt_destroy);
- /* constant in conditional context */
- /* LINTED */
- VN_DISPOSE(pp, B_INVAL, 0, kcred);
- did_something = 1;
- }
- continue;
- }
-
if (kcage_invalidate_page(pp, &nfreed) == 0) {
did_something = 1;
KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
@@ -1883,66 +1819,17 @@ again:
}
}
- /*
- * Expand the cage only if available cage memory is really low.
- * This test is done only after a complete scan of the cage.
- * The reason for not checking and expanding more often is to
- * avoid rapid expansion of the cage. Naturally, scanning the
- * cage takes time. So by scanning first, we use that work as a
- * delay loop in between expand decisions.
- */
-
- scan_again = 0;
- if (kcage_freemem < kcage_minfree || kcage_needfree) {
- /*
- * Kcage_expand() will return a non-zero value if it was
- * able to expand the cage -- whether or not the new
- * pages are free and immediately usable. If non-zero,
- * we do another scan of the cage. The pages might be
- * freed during that scan or by time we get back here.
- * If not, we will attempt another expansion.
- * However, if kcage_expand() returns zero, then it was
- * unable to expand the cage. This is the case when
- * the growth list is exhausted, therefore no work was done
- * and there is no reason to scan the cage again.
- * Note: Kernel cage scan is not repeated when only one
- * cpu is active to avoid kernel cage thread hogging cpu.
- */
- if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1)
- scan_again = 1;
- else
- (void) kcage_expand(); /* don't scan again */
- } else if (kcage_freemem < kcage_lotsfree) {
- /*
- * If available cage memory is less than abundant
- * and a full scan of the cage has not yet been completed,
- * or a scan has completed and some work was performed,
- * or pages were skipped because of sharing,
- * or we simply have not yet completed two passes,
- * then do another scan.
- */
- if (pass <= 2 && pages_skipped)
- scan_again = 1;
- if (pass == last_pass || did_something)
- scan_again = 1;
- else if (shared_skipped && shared_level < (8<<24)) {
- shared_level <<= 1;
- scan_again = 1;
- }
- }
+ if (kcage_freemem < kcage_throttlefree + kcage_needfree)
+ (void) kcage_expand();
- if (scan_again && cp_default.cp_ncpus > 1)
- goto again;
- else {
- if (shared_level > 8)
- shared_level >>= 1;
+ if (kcage_on && kcage_cageout_ready)
+ cv_broadcast(&kcage_throttle_cv);
- KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
- KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
- KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
- KCAGE_STAT_INC_SCAN_INDEX;
- goto loop;
- }
+ KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
+ KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
+ KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
+ KCAGE_STAT_INC_SCAN_INDEX;
+ goto loop;
/*NOTREACHED*/
}
diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h
index 009e4e5981..3a37f63fa2 100644
--- a/usr/src/uts/common/sys/kmem.h
+++ b/usr/src/uts/common/sys/kmem.h
@@ -45,6 +45,7 @@ extern "C" {
#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
#define KM_PANIC 0x0002 /* if memory cannot be allocated, panic */
#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
+#define KM_NORMALPRI 0x0008 /* with KM_NOSLEEP, lower priority allocation */
#define KM_VMFLAGS 0x00ff /* flags that must match VM_* flags */
#define KM_FLAGS 0xffff /* all settable kmem flags */
diff --git a/usr/src/uts/common/sys/vmem.h b/usr/src/uts/common/sys/vmem.h
index 81e9bff5c8..669f32699d 100644
--- a/usr/src/uts/common/sys/vmem.h
+++ b/usr/src/uts/common/sys/vmem.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VMEM_H
@@ -40,6 +39,7 @@ extern "C" {
#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */
#define VM_PANIC 0x00000002 /* same as KM_PANIC */
#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */
+#define VM_NORMALPRI 0x00000008 /* same as KM_NORMALPRI */
#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */
#define VM_BESTFIT 0x00000100
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 1edf9b3326..026ea7c29b 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -604,7 +604,7 @@ extern pad_mutex_t ph_mutex[];
* Flags used while creating pages.
*/
#define PG_EXCL 0x0001
-#define PG_WAIT 0x0002
+#define PG_WAIT 0x0002 /* Blocking memory allocations */
#define PG_PHYSCONTIG 0x0004 /* NOT SUPPORTED */
#define PG_MATCH_COLOR 0x0008 /* SUPPORTED by free list routines */
#define PG_NORELOC 0x0010 /* Non-relocatable alloc hint. */
@@ -612,7 +612,8 @@ extern pad_mutex_t ph_mutex[];
#define PG_PANIC 0x0020 /* system will panic if alloc fails */
#define PG_PUSHPAGE 0x0040 /* alloc may use reserve */
#define PG_LOCAL 0x0080 /* alloc from given lgrp only */
-
+#define PG_NORMALPRI 0x0100 /* PG_WAIT like priority, but */
+ /* non-blocking */
/*
* When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL
* access are given priority over all other waiting threads.
@@ -968,7 +969,7 @@ int page_szc_user_filtered(size_t);
#define PR_UE 0x02 /* page has an unhandled UE */
#define PR_UE_SCRUBBED 0x04 /* page has seen a UE but was cleaned */
#define PR_FMA 0x08 /* A DE wants this page retired */
-#define PR_CAPTURE 0x10 /* Generic page capture flag */
+#define PR_CAPTURE 0x10 /* page is hashed on page_capture_hash[] */
#define PR_RESV 0x20 /* Reserved for future use */
#define PR_MSG 0x40 /* message(s) already printed for this page */
#define PR_RETIRED 0x80 /* This page has been retired */
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index f3197a4b47..10ea3f7dc2 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -842,6 +841,10 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
pgflags |= PG_PANIC;
if (vmflag & VM_PUSHPAGE)
pgflags |= PG_PUSHPAGE;
+ if (vmflag & VM_NORMALPRI) {
+ ASSERT(vmflag & VM_NOSLEEP);
+ pgflags |= PG_NORMALPRI;
+ }
return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size,
pgflags, &kseg, addr));
@@ -1109,6 +1112,8 @@ segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
pgflags |= PG_WAIT;
if (vmflag & VM_PUSHPAGE)
pgflags |= PG_PUSHPAGE;
+ if (vmflag & VM_NORMALPRI)
+ pgflags |= PG_NORMALPRI;
return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
pgflags, &kvseg, addr, arg));
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 5895bcb6fa..a35f7cc196 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -1474,6 +1474,14 @@ page_create_throttle(pgcnt_t npages, int flags)
pgcnt_t tf; /* effective value of throttlefree */
/*
+ * Normal priority allocations.
+ */
+ if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
+ ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
+ return (freemem >= npages + throttlefree);
+ }
+
+ /*
* Never deny pages when:
* - it's a thread that cannot block [NOMEMWAIT()]
* - the allocation cannot block and must not fail
@@ -2141,7 +2149,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
ASSERT(vp != NULL);
ASSERT((flags & ~(PG_EXCL | PG_WAIT |
- PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
+ PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
/* but no others */
ASSERT((flags & PG_EXCL) == PG_EXCL);
@@ -2276,7 +2284,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
/*NOTREACHED*/
}
ASSERT((flags & ~(PG_EXCL | PG_WAIT |
- PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
+ PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
/* but no others */
pages_req = npages = btopr(bytes);