diff options
author | Stan Studzinski <Stan.Studzinski@Sun.COM> | 2010-04-13 11:03:56 -0700 |
---|---|---|
committer | Stan Studzinski <Stan.Studzinski@Sun.COM> | 2010-04-13 11:03:56 -0700 |
commit | 23a80de1aec78d238d06caf311eaceb81dd5a440 (patch) | |
tree | 0b52be831a38566af69b540ddc88de0ea39c683b | |
parent | bf994817a71d4ac680198e25fe79d13c247306e0 (diff) | |
download | illumos-joyent-23a80de1aec78d238d06caf311eaceb81dd5a440.tar.gz |
6675738 KM_NOSLEEP may still try too hard for some allocations
-rw-r--r-- | usr/src/uts/common/os/mem_cage.c | 201 | ||||
-rw-r--r-- | usr/src/uts/common/sys/kmem.h | 1 | ||||
-rw-r--r-- | usr/src/uts/common/sys/vmem.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
-rw-r--r-- | usr/src/uts/common/vm/seg_kmem.c | 9 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_page.c | 12 |
6 files changed, 68 insertions, 166 deletions
diff --git a/usr/src/uts/common/os/mem_cage.c b/usr/src/uts/common/os/mem_cage.c index 74904a0a38..9e87a9c4f0 100644 --- a/usr/src/uts/common/os/mem_cage.c +++ b/usr/src/uts/common/os/mem_cage.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -75,7 +74,6 @@ struct kcage_stats_scan { uint_t kt_cantlock; uint_t kt_gotone; uint_t kt_gotonefree; - uint_t kt_skiplevel; uint_t kt_skipshared; uint_t kt_skiprefd; uint_t kt_destroy; @@ -1185,15 +1183,9 @@ kcage_cageout_init() int kcage_create_throttle(pgcnt_t npages, int flags) { - int niter = 0; - pgcnt_t lastfree; - int enough = kcage_freemem > kcage_throttlefree + npages; KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ - kcage_cageout_wakeup(); /* just to be sure */ - KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ - /* * Obviously, we can't throttle the cageout thread since * we depend on it. We also can't throttle the panic thread. @@ -1209,7 +1201,7 @@ kcage_create_throttle(pgcnt_t npages, int flags) * if freemem is very low. */ if (NOMEMWAIT()) { - if (enough) { + if (kcage_freemem > kcage_throttlefree + npages) { KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ return (KCT_CRIT); } else if (freemem < minfree) { @@ -1235,9 +1227,6 @@ kcage_create_throttle(pgcnt_t npages, int flags) */ while (kcage_freemem < kcage_throttlefree + npages) { ASSERT(kcage_on); - - lastfree = kcage_freemem; - if (kcage_cageout_ready) { mutex_enter(&kcage_throttle_mutex); @@ -1265,23 +1254,20 @@ kcage_create_throttle(pgcnt_t npages, int flags) atomic_add_long(&kcage_needfree, -npages); } - if ((flags & PG_WAIT) == 0) { - if (kcage_freemem > lastfree) { - KCAGE_STAT_INCR(kct_progress); - niter = 0; - } else { - KCAGE_STAT_INCR(kct_noprogress); - if (++niter >= kcage_maxwait) { - KCAGE_STAT_INCR(kct_timeout); - return (KCT_FAILURE); - } - } - } - if (NOMEMWAIT() && freemem < minfree) { return (KCT_CRIT); } + if ((flags & PG_WAIT) == 0) { + pgcnt_t limit = (flags & PG_NORMALPRI) ? + throttlefree : pageout_reserve; + if ((kcage_freemem < kcage_throttlefree + npages) && + (freemem < limit + npages)) { + return (KCT_FAILURE); + } else { + return (KCT_NONCRIT); + } + } } return (KCT_NONCRIT); } @@ -1393,9 +1379,9 @@ check_free_and_return: if (page_trylock(pp, SE_SHARED)) { if (PP_ISNORELOC(pp)) goto check_free_and_return; - } else + } else { return (EAGAIN); - + } if (!PP_ISFREE(pp)) { page_unlock(pp); return (EAGAIN); @@ -1471,14 +1457,13 @@ kcage_expand() * Exit early if expansion amount is equal to or less than zero. * (<0 is possible if kcage_freemem rises suddenly.) * - * Exit early when the global page pool (apparently) does not - * have enough free pages to page_relocate() even a single page. + * Exit early when freemem drops below pageout_reserve plus the request. */ wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) - kcage_freemem; - if (wanted <= 0) + if (wanted <= 0) { return (0); - else if (freemem < pageout_reserve + 1) { + } else if (freemem < pageout_reserve + wanted) { KCAGE_STAT_INCR(ke_lowfreemem); return (0); } @@ -1670,6 +1655,18 @@ kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) return (0); } +/* + * Expand cage only if there is not enough memory to satisfy + * current request. We only do one (complete) scan of the cage. + * Dirty pages and pages with shared mappings are skipped; + * Locked pages (p_lckcnt and p_cowcnt) are also skipped. + * All other pages are freed (if they can be locked). + * This may affect caching of user pages which are in cage by freeing/ + * reclaiming them more often. However cage is mainly for kernel (heap) + * pages and we want to keep user pages outside of cage. The above policy + * should also reduce cage expansion plus it should speed up cage mem + * allocations. + */ static void kcage_cageout() { @@ -1677,12 +1674,7 @@ kcage_cageout() page_t *pp; callb_cpr_t cprinfo; int did_something; - int scan_again; pfn_t start_pfn; - int pass; - int last_pass; - int pages_skipped; - int shared_skipped; ulong_t shared_level = 8; pgcnt_t nfreed; #ifdef KCAGE_STATS @@ -1713,14 +1705,9 @@ loop: KCAGE_STAT_INCR(kt_wakeups); KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); - pass = 0; - last_pass = 0; - #ifdef KCAGE_STATS scan_start = ddi_get_lbolt(); #endif - -again: if (!kcage_on) goto loop; @@ -1728,16 +1715,16 @@ again: KCAGE_STAT_INCR_SCAN(kt_passes); did_something = 0; - pages_skipped = 0; - shared_skipped = 0; - while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && - (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { + while (kcage_freemem < kcage_lotsfree + kcage_needfree) { + + if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) == + PFN_INVALID) { + break; + } if (start_pfn == PFN_INVALID) start_pfn = pfn; else if (start_pfn == pfn) { - last_pass = pass; - pass += 1; /* * Did a complete walk of kernel cage, but didn't free * any pages. If only one cpu is active then @@ -1813,63 +1800,12 @@ again: continue; } - KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level); if (hat_page_checkshare(pp, shared_level)) { page_unlock(pp); - pages_skipped = 1; - shared_skipped = 1; KCAGE_STAT_INCR_SCAN(kt_skipshared); continue; } - /* - * In pass {0, 1}, skip page if ref bit is set. - * In pass {0, 1, 2}, skip page if mod bit is set. - */ - prm = hat_pagesync(pp, - HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); - - /* On first pass ignore ref'd pages */ - if (pass <= 1 && (prm & P_REF)) { - KCAGE_STAT_INCR_SCAN(kt_skiprefd); - pages_skipped = 1; - page_unlock(pp); - continue; - } - - /* On pass 2, VN_DISPOSE if mod bit is not set */ - if (pass <= 2) { - if (pp->p_szc != 0 || (prm & P_MOD) || - pp->p_lckcnt || pp->p_cowcnt) { - pages_skipped = 1; - page_unlock(pp); - } else { - - /* - * unload the mappings before - * checking if mod bit is set - */ - (void) hat_pageunload(pp, - HAT_FORCE_PGUNLOAD); - - /* - * skip this page if modified - */ - if (hat_ismod(pp)) { - pages_skipped = 1; - page_unlock(pp); - continue; - } - - KCAGE_STAT_INCR_SCAN(kt_destroy); - /* constant in conditional context */ - /* LINTED */ - VN_DISPOSE(pp, B_INVAL, 0, kcred); - did_something = 1; - } - continue; - } - if (kcage_invalidate_page(pp, &nfreed) == 0) { did_something = 1; KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); @@ -1883,66 +1819,17 @@ again: } } - /* - * Expand the cage only if available cage memory is really low. - * This test is done only after a complete scan of the cage. - * The reason for not checking and expanding more often is to - * avoid rapid expansion of the cage. Naturally, scanning the - * cage takes time. So by scanning first, we use that work as a - * delay loop in between expand decisions. - */ - - scan_again = 0; - if (kcage_freemem < kcage_minfree || kcage_needfree) { - /* - * Kcage_expand() will return a non-zero value if it was - * able to expand the cage -- whether or not the new - * pages are free and immediately usable. If non-zero, - * we do another scan of the cage. The pages might be - * freed during that scan or by time we get back here. - * If not, we will attempt another expansion. - * However, if kcage_expand() returns zero, then it was - * unable to expand the cage. This is the case when the - * the growth list is exausted, therefore no work was done - * and there is no reason to scan the cage again. - * Note: Kernel cage scan is not repeated when only one - * cpu is active to avoid kernel cage thread hogging cpu. - */ - if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1) - scan_again = 1; - else - (void) kcage_expand(); /* don't scan again */ - } else if (kcage_freemem < kcage_lotsfree) { - /* - * If available cage memory is less than abundant - * and a full scan of the cage has not yet been completed, - * or a scan has completed and some work was performed, - * or pages were skipped because of sharing, - * or we simply have not yet completed two passes, - * then do another scan. - */ - if (pass <= 2 && pages_skipped) - scan_again = 1; - if (pass == last_pass || did_something) - scan_again = 1; - else if (shared_skipped && shared_level < (8<<24)) { - shared_level <<= 1; - scan_again = 1; - } - } + if (kcage_freemem < kcage_throttlefree + kcage_needfree) + (void) kcage_expand(); - if (scan_again && cp_default.cp_ncpus > 1) - goto again; - else { - if (shared_level > 8) - shared_level >>= 1; + if (kcage_on && kcage_cageout_ready) + cv_broadcast(&kcage_throttle_cv); - KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); - KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); - KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start); - KCAGE_STAT_INC_SCAN_INDEX; - goto loop; - } + KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); + KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); + KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start); + KCAGE_STAT_INC_SCAN_INDEX; + goto loop; /*NOTREACHED*/ } diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h index 009e4e5981..3a37f63fa2 100644 --- a/usr/src/uts/common/sys/kmem.h +++ b/usr/src/uts/common/sys/kmem.h @@ -45,6 +45,7 @@ extern "C" { #define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ #define KM_PANIC 0x0002 /* if memory cannot be allocated, panic */ #define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_NORMALPRI 0x0008 /* with KM_NOSLEEP, lower priority allocation */ #define KM_VMFLAGS 0x00ff /* flags that must match VM_* flags */ #define KM_FLAGS 0xffff /* all settable kmem flags */ diff --git a/usr/src/uts/common/sys/vmem.h b/usr/src/uts/common/sys/vmem.h index 81e9bff5c8..669f32699d 100644 --- a/usr/src/uts/common/sys/vmem.h +++ b/usr/src/uts/common/sys/vmem.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VMEM_H @@ -40,6 +39,7 @@ extern "C" { #define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ #define VM_PANIC 0x00000002 /* same as KM_PANIC */ #define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_NORMALPRI 0x00000008 /* same as KM_NORMALPRI */ #define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ #define VM_BESTFIT 0x00000100 diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 1edf9b3326..026ea7c29b 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -604,7 +604,7 @@ extern pad_mutex_t ph_mutex[]; * Flags used while creating pages. */ #define PG_EXCL 0x0001 -#define PG_WAIT 0x0002 +#define PG_WAIT 0x0002 /* Blocking memory allocations */ #define PG_PHYSCONTIG 0x0004 /* NOT SUPPORTED */ #define PG_MATCH_COLOR 0x0008 /* SUPPORTED by free list routines */ #define PG_NORELOC 0x0010 /* Non-relocatable alloc hint. */ @@ -612,7 +612,8 @@ extern pad_mutex_t ph_mutex[]; #define PG_PANIC 0x0020 /* system will panic if alloc fails */ #define PG_PUSHPAGE 0x0040 /* alloc may use reserve */ #define PG_LOCAL 0x0080 /* alloc from given lgrp only */ - +#define PG_NORMALPRI 0x0100 /* PG_WAIT like priority, but */ + /* non-blocking */ /* * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL * access are given priority over all other waiting threads. @@ -968,7 +969,7 @@ int page_szc_user_filtered(size_t); #define PR_UE 0x02 /* page has an unhandled UE */ #define PR_UE_SCRUBBED 0x04 /* page has seen a UE but was cleaned */ #define PR_FMA 0x08 /* A DE wants this page retired */ -#define PR_CAPTURE 0x10 /* Generic page capture flag */ +#define PR_CAPTURE 0x10 /* page is hashed on page_capture_hash[] */ #define PR_RESV 0x20 /* Reserved for future use */ #define PR_MSG 0x40 /* message(s) already printed for this page */ #define PR_RETIRED 0x80 /* This page has been retired */ diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index f3197a4b47..10ea3f7dc2 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -842,6 +841,10 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg) pgflags |= PG_PANIC; if (vmflag & VM_PUSHPAGE) pgflags |= PG_PUSHPAGE; + if (vmflag & VM_NORMALPRI) { + ASSERT(vmflag & VM_NOSLEEP); + pgflags |= PG_NORMALPRI; + } return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size, pgflags, &kseg, addr)); @@ -1109,6 +1112,8 @@ segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg) pgflags |= PG_WAIT; if (vmflag & VM_PUSHPAGE) pgflags |= PG_PUSHPAGE; + if (vmflag & VM_NORMALPRI) + pgflags |= PG_NORMALPRI; return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size, pgflags, &kvseg, addr, arg)); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 5895bcb6fa..a35f7cc196 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -1474,6 +1474,14 @@ page_create_throttle(pgcnt_t npages, int flags) pgcnt_t tf; /* effective value of throttlefree */ /* + * Normal priority allocations. + */ + if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) { + ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE))); + return (freemem >= npages + throttlefree); + } + + /* * Never deny pages when: * - it's a thread that cannot block [NOMEMWAIT()] * - the allocation cannot block and must not fail @@ -2141,7 +2149,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, ASSERT(vp != NULL); ASSERT((flags & ~(PG_EXCL | PG_WAIT | - PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0); /* but no others */ ASSERT((flags & PG_EXCL) == PG_EXCL); @@ -2276,7 +2284,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, /*NOTREACHED*/ } ASSERT((flags & ~(PG_EXCL | PG_WAIT | - PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); + PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0); /* but no others */ pages_req = npages = btopr(bytes); |