author     Dan McDonald <danmcd@joyent.com>    2021-02-26 11:50:34 -0500
committer  Dan McDonald <danmcd@joyent.com>    2021-02-26 15:07:49 -0500
commit     e995d04dc2acdf768fe17d155ce98e9cebe2694e (patch)
tree       96ce8e5799c46b973244f96e187dd66b11c830cf
parent     30e46d865c328134f725b0f2a58f9ce75e7f5c21 (diff)
download   illumos-joyent-e995d04dc2acdf768fe17d155ce98e9cebe2694e.tar.gz

13097 improve VM tunables for modern systems (fix mismerge)

-rw-r--r--  usr/src/uts/common/os/vm_pageout.c      746
-rw-r--r--  usr/src/uts/common/vm/vm_page.c           1
-rw-r--r--  usr/src/uts/i86pc/sys/vm_machparam.h      2

3 files changed, 387 insertions, 362 deletions
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index f3c2fe1e88..c675a4dfb8 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -242,6 +242,7 @@ pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;
+pri_t pageout_pri;
pgcnt_t deficit;
pgcnt_t nscan;
@@ -250,7 +251,8 @@ pgcnt_t desscan;
/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;
-uint64_t n_throttle;
+
+#define MAX_PSCAN_THREADS 16
/*
* Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
@@ -270,33 +272,28 @@ uint64_t n_throttle;
* depending on memory pressure or zones over their cap.
*
* zone_pageout_nsec:
- * Number of nanoseconds budget for each cycle when a zone
- * is over its memory cap. If this is zero, then the value
- * of max_pageout_nsec is used instead.
+ * Number of nanoseconds budget for each cycle when a zone
+ * is over its memory cap. If this is zero, then the value
+ * of max_pageout_nsec is used instead.
*/
-
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;
static hrtime_t zone_pageout_nsec;
-#define MAX_PSCAN_THREADS 16
-static boolean_t reset_hands[MAX_PSCAN_THREADS];
+static boolean_t reset_hands[MAX_PSCAN_THREADS];
+
+#define PAGES_POLL_MASK 1023
+#define SCHEDPAGING_HZ 4
/*
- * These can be tuned in /etc/system or set with mdb.
- * 'des_page_scanners' is the desired number of page scanner threads. The
- * system will bring the actual number of threads into line with the desired
- * number. If des_page_scanners is set to an invalid value, the system will
- * correct the setting.
+ * despagescanners:
+ * The desired number of page scanner threads. The value can be set in
+ * /etc/system or tuned directly with 'mdb -kw'. The system will bring
+ * the actual number of threads into line with the desired number. If set
+ * to an invalid value, the system will correct the setting.
*/
-uint_t des_page_scanners;
-uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
-
-uint_t n_page_scanners;
-static pgcnt_t pscan_region_sz; /* informational only */
-
-#define PAGES_POLL_MASK 1023
+uint_t despagescanners = 0;
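The comment above names the two supported tuning paths. As a hedged illustration only (the value 8 is arbitrary, and 0t marks a decimal value in mdb):

    * in /etc/system, applied at the next boot:
    set despagescanners = 8

    * on a live system, via mdb -kw:
    > despagescanners/W 0t8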
/*
* pageout_sample_lim:
@@ -323,20 +320,23 @@ static pgcnt_t pscan_region_sz; /* informational only */
* enough samples have been recorded (pageout_sample_cnt). Once set, this
* new value is used for fastscan and handspreadpages.
*/
-
typedef hrtime_t hrrate_t;
static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
+static hrtime_t pageout_sample_etime = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
-static hrtime_t pageout_sample_etime = 0;
-
-/* True if page scanner is first starting up */
+/* True if the page scanner is first starting up */
#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
+/* The current number of page scanner threads */
+static uint_t n_page_scanners = 1;
+/* The number of page scanner threads that are actively scanning. */
+static uint_t pageouts_running;
+
/*
* Record number of times a pageout_scanner() wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
@@ -385,9 +385,10 @@ static struct clockinit {
pgcnt_t ci_fastscan;
pgcnt_t ci_slowscan;
pgcnt_t ci_handspreadpages;
+ uint_t ci_despagescanners;
} clockinit = { .ci_init = false };
-static pgcnt_t
+static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
if (value < minimum) {
@@ -421,6 +422,72 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
static boolean_t zones_over = B_FALSE;
/*
+ * On large memory systems, multiple instances of the page scanner are run,
+ * each responsible for a separate region of memory. This speeds up page
+ * invalidation under low memory conditions.
+ *
+ * despagescanners can be set in /etc/system or via mdb and it will
+ * be used as a guide for how many page scanners to create; the value
+ * will be adjusted if it is not sensible. Otherwise, the number of
+ * page scanners is determined dynamically based on handspreadpages.
+ */
+static void
+recalc_pagescanners(void)
+{
+ pgcnt_t sz;
+ uint_t des;
+
+ /* If the initial calibration has not been done, take no action. */
+ if (pageout_new_spread == 0)
+ return;
+
+ /*
+ * If the desired number of scanners is set in /etc/system
+ * then try to use it.
+ */
+ if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
+ despagescanners = clockinit.ci_despagescanners;
+
+ if (despagescanners != 0) {
+ /*
+ * We have a desired number of page scanners, either from
+ * /etc/system or set via mdb. Try and use it (it will be
+ * clamped below).
+ */
+ des = despagescanners;
+ } else {
+ /*
+ * Calculate the number of desired scanners based on the
+ * system's memory size.
+ *
+ * A 64GiB region size is used as the basis for calculating how
+ * many scanner threads should be created. For systems with up
+ * to 64GiB of RAM, a single thread is used; for very large
+ * memory systems the threads are limited to MAX_PSCAN_THREADS.
+ */
+ sz = btop(64ULL << 30);
+
+ if (sz > looppages) {
+ des = 1;
+ } else {
+ pgcnt_t tmp = sz;
+
+ for (des = 1; tmp < looppages; des++)
+ tmp += sz;
+ }
+ }
+
+ /*
+ * clamp the number of scanners so that we are under MAX_PSCAN_THREADS
+ * and so that each scanner covers at least 10% more than
+ * handspreadpages.
+ */
+ des = clamp(des, 1,
+ looppages / (handspreadpages + handspreadpages / 10));
+ despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
+}
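As a rough illustration of the sizing loop in recalc_pagescanners() above, here is a stand-alone user-space sketch. The 4 KiB page size and the memory sizes are assumptions chosen for the example, and the handspreadpages clamp is omitted:

    #include <stdio.h>
    #include <stdint.h>

    #define EX_MAX_PSCAN_THREADS	16

    static uint64_t
    scanners_for(uint64_t looppages, uint64_t pagesize)
    {
    	uint64_t sz = (64ULL << 30) / pagesize;	/* pages per 64 GiB region */
    	uint64_t des;

    	if (sz > looppages) {
    		des = 1;
    	} else {
    		uint64_t tmp = sz;

    		for (des = 1; tmp < looppages; des++)
    			tmp += sz;
    	}

    	return (des > EX_MAX_PSCAN_THREADS ? EX_MAX_PSCAN_THREADS : des);
    }

    int
    main(void)
    {
    	uint64_t pagesize = 4096;		/* assumed x86 page size */
    	uint64_t gib[] = { 32, 64, 256, 2048 };

    	for (int i = 0; i < 4; i++) {
    		uint64_t looppages = (gib[i] << 30) / pagesize;

    		printf("%4llu GiB -> %llu scanner(s)\n",
    		    (unsigned long long)gib[i],
    		    (unsigned long long)scanners_for(looppages, pagesize));
    	}

    	return (0);
    }

With these inputs it prints one scanner for 32 GiB and 64 GiB, four for 256 GiB, and the MAX_PSCAN_THREADS cap of 16 for 2 TiB.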
+
+/*
* Set up the paging constants for the clock algorithm used by
* pageout_scanner(), and by the virtual memory system overall. See the
* comments at the top of this file for more information about the threshold
@@ -434,9 +501,6 @@ static boolean_t zones_over = B_FALSE;
void
setupclock(void)
{
- uint_t i;
- pgcnt_t sz, tmp;
- pgcnt_t defval;
bool half = (pageout_threshold_style == 1);
bool recalc = true;
@@ -450,7 +514,6 @@ setupclock(void)
* A value of zero for any tunable means we will use the default
* sizing.
*/
-
if (!clockinit.ci_init) {
clockinit.ci_init = true;
@@ -466,6 +529,7 @@ setupclock(void)
clockinit.ci_fastscan = fastscan;
clockinit.ci_slowscan = slowscan;
clockinit.ci_handspreadpages = handspreadpages;
+ clockinit.ci_despagescanners = despagescanners;
/*
* The first call does not trigger a recalculation, only
@@ -647,7 +711,7 @@ setupclock(void)
}
/*
- * Handspreadpages is distance (in pages) between front and back
+ * Handspreadpages is the distance (in pages) between front and back
* pageout daemon hands. The amount of time to reclaim a page
* once pageout examines it increases with this distance and
* decreases as the scan rate rises. It must be < the amount
@@ -682,58 +746,32 @@ setupclock(void)
handspreadpages = looppages - 1;
}
- if (!recalc) {
- /*
- * Setup basic values at initialization.
- */
- pscan_region_sz = total_pages;
- des_page_scanners = n_page_scanners = 1;
- reset_hands[0] = B_TRUE;
- return;
- }
-
/*
- * Recalculating
- *
- * We originally set the number of page scanners to 1. Now that we
- * know what the handspreadpages is for a scanner, figure out how many
- * scanners we should run. We want to ensure that the regions don't
- * overlap and that they are not touching.
- *
- * A default 64GB region size is used as the initial value to calculate
- * how many scanner threads we should create on lower memory systems.
- * The idea is to limit the number of threads to a practical value
- * (e.g. a 64GB machine really only needs one scanner thread). For very
- * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
- * threads.
- *
- * The scanner threads themselves are evenly spread out around the
- * memory "clock" in pageout_scanner when we reset the hands, and each
- * thread will scan all of memory.
+ * Establish the minimum and maximum length of time to be spent
+ * scanning pages per wakeup, limiting the scanner duty cycle. The
+ * input percentage values (0-100) must be converted to a fraction of
+ * the number of nanoseconds in a second of wall time, then further
+ * scaled down by the number of scanner wakeups in a second.
*/
- sz = (btop(64ULL * 0x40000000ULL));
- if (sz < handspreadpages) {
- /*
- * 64GB is smaller than the separation between the front
- * and back hands; use double handspreadpages.
- */
- sz = handspreadpages << 1;
- }
- if (sz > total_pages) {
- sz = total_pages;
- }
- /* Record region size for inspection with mdb, otherwise unused */
- pscan_region_sz = sz;
+ min_pageout_nsec = MAX(1,
+ NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
+ max_pageout_nsec = MAX(min_pageout_nsec,
+ NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
- tmp = sz;
- for (i = 1; tmp < total_pages; i++) {
- tmp += sz;
- }
+ /*
+ * If not called for recalculation, return and skip the remaining
+ * steps.
+ */
+ if (!recalc)
+ return;
- if (i > MAX_PSCAN_THREADS)
- i = MAX_PSCAN_THREADS;
+ /*
+ * Set a flag to re-evaluate the clock hand positions.
+ */
+ for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
+ reset_hands[i] = B_TRUE;
- des_page_scanners = i;
+ recalc_pagescanners();
}
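A worked example of the min_pageout_nsec/max_pageout_nsec calculation above. The percentages used here (min_percent_cpu = 4, max_percent_cpu = 80) are the long-standing defaults and are assumptions for this sketch, not values introduced by this change:

    #include <stdio.h>

    #define NANOSEC		1000000000LL
    #define SCHEDPAGING_HZ	4

    int
    main(void)
    {
    	long long min_percent_cpu = 4;	/* assumed default */
    	long long max_percent_cpu = 80;	/* assumed default */

    	long long min_nsec = NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ;
    	long long max_nsec = NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ;

    	/* 10 ms and 200 ms per wakeup; at 4 wakeups/sec, 4% and 80% of a CPU. */
    	printf("min %lld ns, max %lld ns per wakeup\n", min_nsec, max_nsec);
    	return (0);
    }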
/*
@@ -747,9 +785,8 @@ setupclock(void)
* in its next pass; schedpaging() sets this value based on the amount of
* currently available memory.
*/
-#define SCHEDPAGING_HZ 4
-static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
+static kmutex_t pageout_mutex;
/*
* Pool of available async pageout putpage requests.
@@ -777,7 +814,7 @@ static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;
-static int async_list_size = 256; /* number of async request structs */
+static int async_list_size = 8192; /* number of async request structs */
static void pageout_scanner(void *);
@@ -808,157 +845,142 @@ schedpaging(void *arg)
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
- (void) atomic_swap_ulong(&nscan, 0);
- vavail = freemem - deficit;
- if (pageout_new_spread != 0)
- vavail -= needfree;
- if (vavail < 0)
- vavail = 0;
- if (vavail > lotsfree)
- vavail = lotsfree;
+ if (mutex_tryenter(&pageout_mutex)) {
- /*
- * Fix for 1161438 (CRS SPR# 73922). All variables
- * in the original calculation for desscan were 32 bit signed
- * ints. As freemem approaches 0x0 on a system with 1 Gig or
- * more of memory, the calculation can overflow. When this
- * happens, desscan becomes negative and pageout_scanner()
- * stops paging out.
- */
- if (needfree > 0 && pageout_new_spread == 0) {
- /*
- * If we've not yet collected enough samples to
- * calculate a spread, kick into high gear anytime
- * needfree is non-zero. Note that desscan will not be
- * the limiting factor for systems with larger memory;
- * the %CPU will limit the scan. That will also be
- * maxed out below.
- */
- desscan = fastscan / SCHEDPAGING_HZ;
- } else {
- /*
- * Once we've calculated a spread based on system
- * memory and usage, just treat needfree as another
- * form of deficit.
- */
- spgcnt_t faststmp, slowstmp, result;
+ if (pageouts_running != 0)
+ goto out;
- slowstmp = slowscan * vavail;
- faststmp = fastscan * (lotsfree - vavail);
- result = (slowstmp + faststmp) /
- nz(lotsfree) / SCHEDPAGING_HZ;
- desscan = (pgcnt_t)result;
- }
+ /* No pageout scanner threads running. */
+ nscan = 0;
+ vavail = freemem - deficit;
+ if (pageout_new_spread != 0)
+ vavail -= needfree;
+ vavail = clamp(vavail, 0, lotsfree);
- /*
- * If we've not yet collected enough samples to calculate a
- * spread, also kick %CPU to the max.
- */
- if (pageout_new_spread == 0) {
- pageout_nsec = max_pageout_nsec;
- } else {
- pageout_nsec = min_pageout_nsec +
- (lotsfree - vavail) *
- (max_pageout_nsec - min_pageout_nsec) /
- nz(lotsfree);
- }
+ if (needfree > 0 && pageout_new_spread == 0) {
+ /*
+ * If we've not yet collected enough samples to
+ * calculate a spread, use the old logic of kicking
+ * into high gear anytime needfree is non-zero.
+ */
+ desscan = fastscan / SCHEDPAGING_HZ;
+ } else {
+ /*
+ * Once we've calculated a spread based on system
+ * memory and usage, just treat needfree as another
+ * form of deficit.
+ */
+ spgcnt_t faststmp, slowstmp, result;
- if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
- /*
- * We have finished the pagescan initialization and the desired
- * number of page scanners has changed, either because
- * initialization just finished, because of a memory DR, or
- * because des_page_scanners has been modified on the fly (i.e.
- * by mdb). If we need more scanners, start them now, otherwise
- * the excess scanners will terminate on their own when they
- * reset their hands.
- */
- uint_t i;
- uint_t curr_nscan = n_page_scanners;
- pgcnt_t max = total_pages / handspreadpages;
+ slowstmp = slowscan * vavail;
+ faststmp = fastscan * (lotsfree - vavail);
+ result = (slowstmp + faststmp) /
+ nz(lotsfree) / SCHEDPAGING_HZ;
+ desscan = (pgcnt_t)result;
+ }
- if (des_page_scanners > max)
- des_page_scanners = max;
+ pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
+ (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
- if (des_page_scanners > MAX_PSCAN_THREADS) {
- des_page_scanners = MAX_PSCAN_THREADS;
- } else if (des_page_scanners == 0) {
- des_page_scanners = 1;
- }
+ DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
+ pageout_nsec);
- /*
- * Each thread has its own entry in the reset_hands array, so
- * we don't need any locking in pageout_scanner to check the
- * thread's reset_hands entry. Thus, we use a pre-allocated
- * fixed size reset_hands array and upper limit on the number
- * of pagescan threads.
- *
- * The reset_hands entries need to be true before we start new
- * scanners, but if we're reducing, we don't want a race on the
- * recalculation for the existing threads, so we set
- * n_page_scanners first.
- */
- n_page_scanners = des_page_scanners;
- for (i = 0; i < MAX_PSCAN_THREADS; i++) {
- reset_hands[i] = B_TRUE;
- }
+ if (pageout_new_spread != 0 && despagescanners != 0 &&
+ despagescanners != n_page_scanners) {
+ /*
+ * We have finished the pagescan initialisation and the
+ * desired number of page scanners has changed, either
+ * because initialisation just finished, because of a
+ * memory DR, or because despagescanners has been
+ * modified on the fly (i.e. by mdb).
+ */
+ uint_t i, curr_nscan = n_page_scanners;
+
+ /* Re-validate despagescanners */
+ recalc_pagescanners();
+
+ n_page_scanners = despagescanners;
+
+ for (i = 0; i < MAX_PSCAN_THREADS; i++)
+ reset_hands[i] = B_TRUE;
+
+ /* If we need more scanners, start them now. */
+ if (n_page_scanners > curr_nscan) {
+ for (i = curr_nscan; i < n_page_scanners; i++) {
+ (void) lwp_kernel_create(proc_pageout,
+ pageout_scanner,
+ (void *)(uintptr_t)i, TS_RUN,
+ pageout_pri);
+ }
+ }
- if (des_page_scanners > curr_nscan) {
- /* Create additional pageout scanner threads. */
- for (i = curr_nscan; i < des_page_scanners; i++) {
- (void) lwp_kernel_create(proc_pageout,
- pageout_scanner, (void *)(uintptr_t)i,
- TS_RUN, curthread->t_pri);
+ /*
+ * If the number of scanners has decreased, trigger a
+ * wakeup so that the excess threads will terminate.
+ */
+ if (n_page_scanners < curr_nscan) {
+ WAKE_PAGEOUT_SCANNER();
}
}
- }
- zones_over = B_FALSE;
+ zones_over = B_FALSE;
- if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
- if (!PAGE_SCAN_STARTUP)
- low_mem_scan++;
- /*
- * Either we need more memory, or we still need to
- * measure the average scan rate. Wake the scanner.
- */
- DTRACE_PROBE(schedpage__wake__low);
- WAKE_PAGEOUT_SCANNER();
-
- } else if (zone_num_over_cap > 0) {
- /* One or more zones are over their cap. */
-
- /* No page limit */
- desscan = total_pages;
-
- /*
- * Increase the scanning CPU% to the max. This implies
- * 80% of one CPU/sec if the scanner can run each
- * opportunity. Can also be tuned via setting
- * zone_pageout_nsec in /etc/system or with mdb.
- */
- pageout_nsec = (zone_pageout_nsec != 0) ?
- zone_pageout_nsec : max_pageout_nsec;
+ if (PAGE_SCAN_STARTUP) {
+ /*
+ * We still need to measure the rate at which the
+ * system is able to scan pages of memory. Each of
+ * these initial samples is a scan of as much system
+ * memory as practical, regardless of whether or not we
+ * are experiencing memory pressure.
+ */
+ desscan = total_pages;
+ pageout_nsec = max_pageout_nsec;
- zones_over = B_TRUE;
- zone_cap_scan++;
+ DTRACE_PROBE(schedpage__wake__sample);
+ WAKE_PAGEOUT_SCANNER();
+ } else if (freemem < lotsfree + needfree) {
+ /*
+ * We need more memory.
+ */
+ low_mem_scan++;
- DTRACE_PROBE(schedpage__wake__zone);
- WAKE_PAGEOUT_SCANNER();
+ DTRACE_PROBE(schedpage__wake__low);
+ WAKE_PAGEOUT_SCANNER();
+ } else if (zone_num_over_cap > 0) {
+ /*
+ * One or more zones are over their cap.
+ */
- } else {
- /*
- * There are enough free pages, no need to
- * kick the scanner thread. And next time
- * around, keep more of the `highly shared'
- * pages.
- */
- cv_signal_pageout();
+ /* No page limit */
+ desscan = total_pages;
- mutex_enter(&pageout_mutex);
- if (po_share > MIN_PO_SHARE) {
- po_share >>= 1;
+ /*
+ * Increase the scanning CPU% to the max. This implies
+ * 80% of one CPU/sec if the scanner can run each
+ * opportunity. Can also be tuned via setting
+ * zone_pageout_nsec in /etc/system or with mdb.
+ */
+ pageout_nsec = (zone_pageout_nsec != 0) ?
+ zone_pageout_nsec : max_pageout_nsec;
+
+ zones_over = B_TRUE;
+ zone_cap_scan++;
+
+ DTRACE_PROBE(schedpage__wake__zone);
+ WAKE_PAGEOUT_SCANNER();
+ } else {
+ /*
+ * There are enough free pages, no need to
+ * kick the scanner thread. And next time
+ * around, keep more of the `highly shared'
+ * pages.
+ */
+ cv_signal_pageout();
+ if (po_share > MIN_PO_SHARE) {
+ po_share >>= 1;
+ }
}
+out:
mutex_exit(&pageout_mutex);
}
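To illustrate what the schedpaging() calculations above produce, here is a stand-alone sketch of the desscan and pageout_nsec interpolation. The lotsfree, slowscan, fastscan, and nanosecond bounds below are placeholders, not values from a real system:

    #include <stdio.h>

    #define SCHEDPAGING_HZ	4

    int
    main(void)
    {
    	long long lotsfree = 131072;		/* assumed: pages */
    	long long slowscan = 8192, fastscan = 1048576;
    	long long min_nsec = 10000000LL, max_nsec = 200000000LL;

    	for (long long vavail = lotsfree; vavail >= 0; vavail -= lotsfree / 4) {
    		long long desscan = (slowscan * vavail +
    		    fastscan * (lotsfree - vavail)) / lotsfree / SCHEDPAGING_HZ;
    		long long nsec = min_nsec +
    		    (lotsfree - vavail) * (max_nsec - min_nsec) / lotsfree;

    		printf("vavail %7lld  desscan %7lld  budget %3lld ms\n",
    		    vavail, desscan, nsec / 1000000);
    	}
    	return (0);
    }

With these placeholder figures, at vavail == lotsfree the scanner is asked for slowscan/4 pages on a 10 ms budget; at vavail == 0 it is asked for fastscan/4 pages on the full 200 ms budget.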
@@ -987,47 +1009,39 @@ uint_t dopageout = 1;
/*
* The page out daemon, which runs as process 2.
*
- * Page out occurs when either:
- * a) there is less than lotsfree pages,
- * b) there are one or more zones over their physical memory cap.
- *
- * The daemon treats physical memory as a circular array of pages and scans the
- * pages using a 'two-handed clock' algorithm. The front hand moves through
- * the pages, clearing the reference bit. The back hand travels a distance
- * (handspreadpages) behind the front hand, freeing the pages that have not
- * been referenced in the time since the front hand passed. If modified, they
- * are first written to their backing store before being freed.
+ * The daemon treats physical memory as a circular array of pages and scans
+ * the pages using a 'two-handed clock' algorithm. The front hand moves
+ * through the pages, clearing the reference bit. The back hand travels a
+ * distance (handspreadpages) behind the front hand, freeing the pages that
+ * have not been referenced in the time since the front hand passed. If
+ * modified, they are first written to their backing store before being
+ * freed.
*
- * In order to make page invalidation more responsive on machines with larger
- * memory, multiple pageout_scanner threads may be created. In this case, the
- * threads are evenly distributed around the the memory "clock face" so that
- * memory can be reclaimed more quickly (that is, there can be large regions in
- * which no pages can be reclaimed by a single thread, leading to lag which
- * causes undesirable behavior such as htable stealing).
+ * In order to make page invalidation more responsive on machines with
+ * larger memory, multiple pageout_scanner threads may be created. In this
+ * case, each thread is given a segment of the memory "clock face" so that
+ * memory can be reclaimed more quickly.
*
- * As long as there are at least lotsfree pages, or no zones over their cap,
- * then pageout_scanner threads are not run. When pageout_scanner threads are
- * running for case (a), all pages are considered for pageout. For case (b),
- * only pages belonging to a zone over its cap will be considered for pageout.
+ * As long as there are at least lotsfree pages, or no zones over their
+ * cap, then pageout_scanner threads are not run. When pageout_scanner
+ * threads are running for case (a), all pages are considered for pageout.
+ * For case (b), only pages belonging to a zone over its cap will be
+ * considered for pageout.
*
- * There are multiple threads that act on behalf of the pageout process.
- * A set of threads scan pages (pageout_scanner) and frees them up if
- * they don't require any VOP_PUTPAGE operation. If a page must be
- * written back to its backing store, the request is put on a list
- * and the other (pageout) thread is signaled. The pageout thread
- * grabs VOP_PUTPAGE requests from the list, and processes them.
- * Some filesystems may require resources for the VOP_PUTPAGE
- * operations (like memory) and hence can block the pageout
- * thread, but the pageout_scanner threads can still operate. There is still
- * no guarantee that memory deadlocks cannot occur.
- *
- * The pageout_scanner parameters are determined in schedpaging().
+ * There are multiple threads that act on behalf of the pageout process. A
+ * set of threads scan pages (pageout_scanner) and frees them up if they
+ * don't require any VOP_PUTPAGE operation. If a page must be written back
+ * to its backing store, the request is put on a list and the other
+ * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
+ * requests from the list, and processes them. Some filesystems may require
+ * resources for the VOP_PUTPAGE operations (like memory) and hence can
+ * block the pageout thread, but the scanner thread can still operate.
+ * There is still no guarantee that memory deadlocks cannot occur.
*/
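A heavily simplified user-space illustration of the two-handed clock described above; the page structure, hand spacing, and reclaim policy are all stand-ins, and the real scanner additionally honours scan and CPU budgets and writes dirty pages to backing store before freeing them:

    #include <stdbool.h>
    #include <stddef.h>

    struct toy_page {
    	bool referenced;	/* set by "use", cleared by the front hand */
    	bool freed;		/* reclaimed by the back hand */
    };

    /*
     * Sweep nscan steps around a circular array of npages pages.  The front
     * hand clears the reference bit; the back hand, handspread slots behind,
     * frees any page that is still unreferenced when it gets there.
     */
    static void
    toy_clock_scan(struct toy_page *pages, size_t npages, size_t handspread,
        size_t nscan)
    {
    	size_t front = handspread % npages;
    	size_t back = 0;

    	for (size_t i = 0; i < nscan; i++) {
    		pages[front].referenced = false;

    		if (!pages[back].referenced)
    			pages[back].freed = true;

    		front = (front + 1) % npages;
    		back = (back + 1) % npages;
    	}
    }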
void
pageout()
{
struct async_reqs *arg;
- pri_t pageout_pri;
int i;
pgcnt_t max_pushes;
callb_cpr_t cprinfo;
@@ -1058,11 +1072,12 @@ pageout()
push_req[i].a_next = &push_req[i + 1];
}
- pageout_pri = curthread->t_pri;
+ pageout_pri = curthread->t_pri - 1;
- /* Create the (first) pageout scanner thread. */
- (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
- pageout_pri - 1);
+ /* Create the first pageout scanner thread. */
+ (void) lwp_kernel_create(proc_pageout, pageout_scanner,
+ (void *)0, /* this is instance 0, not NULL */
+ TS_RUN, pageout_pri);
/*
* kick off pageout scheduler.
@@ -1098,6 +1113,7 @@ pageout()
mutex_exit(&push_lock);
DTRACE_PROBE(pageout__push);
+
if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
pushes++;
@@ -1122,97 +1138,116 @@ pageout()
static void
pageout_scanner(void *a)
{
- struct page *fronthand, *backhand;
- uint_t laps, iter = 0;
+ struct page *fronthand, *backhand, *fronthandstart;
+ struct page *regionstart, *regionend;
+ uint_t laps;
callb_cpr_t cprinfo;
- pgcnt_t nscan_cnt, nscan_limit;
+ pgcnt_t nscan_cnt, tick;
pgcnt_t pcount;
- uint_t inst = (uint_t)(uintptr_t)a;
+ bool bhwrapping, fhwrapping;
hrtime_t sample_start, sample_end;
- kmutex_t pscan_mutex;
- bool sampling;
+ uint_t inst = (uint_t)(uintptr_t)a;
VERIFY3U(inst, <, MAX_PSCAN_THREADS);
- mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
-
- CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
- mutex_enter(&pscan_mutex);
+ CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
+ mutex_enter(&pageout_mutex);
/*
- * Establish the minimum and maximum length of time to be spent
- * scanning pages per wakeup, limiting the scanner duty cycle. The
- * input percentage values (0-100) must be converted to a fraction of
- * the number of nanoseconds in a second of wall time, then further
- * scaled down by the number of scanner wakeups in a second:
+ * The restart case does not attempt to point the hands at roughly
+ * the right point on the assumption that after one circuit things
+ * will have settled down, and restarts shouldn't be that often.
*/
- min_pageout_nsec = MAX(1,
- NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
- max_pageout_nsec = MAX(min_pageout_nsec,
- NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
+ reset_hands[inst] = B_TRUE;
+
+ pageouts_running++;
+ mutex_exit(&pageout_mutex);
loop:
cv_signal_pageout();
+ mutex_enter(&pageout_mutex);
+ pageouts_running--;
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&proc_pageout->p_cv, &pscan_mutex);
- CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
+ cv_wait(&proc_pageout->p_cv, &pageout_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
+ pageouts_running++;
+ mutex_exit(&pageout_mutex);
/*
- * Check if pageout has been disabled for debugging purposes:
+ * Check if pageout has been disabled for debugging purposes.
*/
if (!dopageout) {
goto loop;
}
/*
- * One may reset the clock hands for debugging purposes. Hands will
- * also be reset if memory is added to or removed from the system.
+ * One may reset the clock hands and scanned region for debugging
+ * purposes. Hands will also be reset on first thread startup, if
+ * the number of scanning threads (n_page_scanners) changes, or if
+ * memory is added to, or removed from, the system.
*/
if (reset_hands[inst]) {
struct page *first;
- pgcnt_t offset = total_pages / n_page_scanners;
reset_hands[inst] = B_FALSE;
+
if (inst >= n_page_scanners) {
/*
- * The desired number of page scanners has been
- * reduced and this instance is no longer wanted.
- * Exit the lwp.
- */
+ * The desired number of page scanners has been
+ * reduced and this instance is no longer wanted.
+ * Exit the lwp.
+ */
VERIFY3U(inst, !=, 0);
- mutex_exit(&pscan_mutex);
+ DTRACE_PROBE1(pageout__exit, uint_t, inst);
+ mutex_enter(&pageout_mutex);
+ pageouts_running--;
+ mutex_exit(&pageout_mutex);
mutex_enter(&curproc->p_lock);
lwp_exit();
+ /* NOTREACHED */
}
+ first = page_first();
+
/*
- * The reset case repositions the hands at the proper place
- * on the memory clock face to prevent creep into another
- * thread's active region or when the number of threads has
- * changed.
- *
- * Set the two clock hands to be separated by a reasonable
- * amount, but no more than 360 degrees apart.
- *
- * If inst == 0, backhand starts at first page, otherwise
- * it is (inst * offset) around the memory "clock face" so that
- * we spread out each scanner instance evenly.
+ * Each scanner thread gets its own sector of the memory
+ * clock face.
*/
- first = page_first();
- backhand = page_nextn(first, offset * inst);
- if (handspreadpages >= total_pages) {
- fronthand = page_nextn(backhand, total_pages - 1);
+ pgcnt_t span, offset;
+
+ span = looppages / n_page_scanners;
+ VERIFY3U(span, >, handspreadpages);
+
+ offset = inst * span;
+ regionstart = page_nextn(first, offset);
+ if (inst == n_page_scanners - 1) {
+ /* The last instance goes up to the last page */
+ regionend = page_nextn(first, looppages - 1);
} else {
- fronthand = page_nextn(backhand, handspreadpages);
+ regionend = page_nextn(regionstart, span - 1);
}
+
+ backhand = regionstart;
+ fronthand = page_nextn(backhand, handspreadpages);
+ tick = 1;
+
+ bhwrapping = fhwrapping = B_FALSE;
+
+ DTRACE_PROBE4(pageout__reset, uint_t, inst,
+ pgcnt_t, regionstart, pgcnt_t, regionend,
+ pgcnt_t, fronthand);
}
+ /*
+ * This CPU kstat is only incremented here and we're obviously
+ * on this CPU, so no lock.
+ */
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
/*
* Keep track of the number of times we have scanned all the way around
- * the loop:
+ * the loop on this wakeup.
*/
laps = 0;
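The reset path above carves the clock face into one sector per scanner. A small sketch with invented numbers (1,000,000 pages, 4 scanners) shows how regionstart and regionend fall out, with the last instance running through the final page:

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned long long looppages = 1000000;	/* assumed page count */
    	unsigned int n_page_scanners = 4;
    	unsigned long long span = looppages / n_page_scanners;

    	for (unsigned int inst = 0; inst < n_page_scanners; inst++) {
    		unsigned long long start = (unsigned long long)inst * span;
    		unsigned long long end = (inst == n_page_scanners - 1) ?
    		    looppages - 1 : start + span - 1;

    		printf("scanner %u: pages %llu .. %llu\n", inst, start, end);
    	}
    	return (0);
    }

Scanner 0 covers pages 0..249999, scanner 1 covers 250000..499999, and scanner 3 absorbs any remainder up to page 999999.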
@@ -1220,48 +1255,43 @@ loop:
* Track the number of pages visited during this scan so that we can
* periodically measure our duty cycle.
*/
- pcount = 0;
nscan_cnt = 0;
+ pcount = 0;
- if (PAGE_SCAN_STARTUP) {
- /*
- * We need to measure the rate at which the system is able to
- * scan pages of memory. Each of these initial samples is a
- * scan of all system memory, regardless of whether or not we
- * are experiencing memory pressure.
- */
- nscan_limit = total_pages;
- sampling = true;
- } else {
- nscan_limit = desscan;
- sampling = false;
- }
+ DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
+ hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
- DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
- page_t *, backhand, page_t *, fronthand);
+ /*
+ * Record the initial position of the front hand for this cycle so
+ * that we can detect when the hand wraps around.
+ */
+ fronthandstart = fronthand;
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
- * Only scan while at least one of these is true:
- * 1) one or more zones is over its cap
- * 2) there is not enough free memory
- * 3) during page scan startup when determining sample data
*/
- while (nscan_cnt < nscan_limit) {
+ while (nscan_cnt < desscan) {
checkpage_result_t rvfront, rvback;
- if (!sampling && !zones_over &&
- freemem >= lotsfree + needfree) {
+ /*
+ * Only scan while at least one of these is true:
+ * 1) one or more zones is over its cap
+ * 2) there is not enough free memory
+ * 3) during page scan startup when determining sample data
+ */
+ if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
+ !zones_over) {
/*
* We are not sampling and enough memory has become
* available that scanning is no longer required.
*/
+ DTRACE_PROBE1(pageout__memfree, uint_t, inst);
break;
}
- DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
+ DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
/*
* Periodically check to see if we have exceeded the CPU duty
@@ -1272,13 +1302,8 @@ loop:
pageout_cycle_nsec = gethrtime() - sample_start;
if (pageout_cycle_nsec >= pageout_nsec) {
- /*
- * This is where we normally break out of the
- * loop when scanning zones or sampling.
- */
- if (!zones_over) {
+ if (!zones_over)
atomic_inc_64(&pageout_timeouts);
- }
DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
@@ -1310,22 +1335,31 @@ loop:
nscan_cnt++;
}
- backhand = page_next(backhand);
- fronthand = page_next(fronthand);
+ if (bhwrapping) {
+ backhand = regionstart;
+ bhwrapping = B_FALSE;
+ } else {
+ backhand = page_nextn(backhand, tick);
+ if (backhand == regionend)
+ bhwrapping = B_TRUE;
+ }
+
+ if (fhwrapping) {
+ fronthand = regionstart;
+ fhwrapping = B_FALSE;
+ } else {
+ fronthand = page_nextn(fronthand, tick);
+ if (fronthand == regionend)
+ fhwrapping = B_TRUE;
+ }
/*
- * The front hand has wrapped around to the first page in the
- * loop.
+ * The front hand has wrapped around during this wakeup.
*/
- if (fronthand == page_first()) {
- DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
-
- /*
- * Every 64 wraps we reposition our hands within our
- * region to prevent creep into another thread.
- */
- if ((++iter % pageout_reset_cnt) == 0)
- reset_hands[inst] = B_TRUE;
+ if (fronthand == fronthandstart) {
+ laps++;
+ DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
+ uint_t, laps);
/*
* This CPU kstat is only incremented here and we're
@@ -1334,15 +1368,13 @@ loop:
CPU_STATS_ADDQ(CPU, vm, rev, 1);
/*
- * If scanning because the system is low on memory,
* then when we wraparound memory we want to try to
* reclaim more pages.
* If scanning only because zones are over their cap,
* then wrapping is common and we simply keep going.
- */
- if (freemem < lotsfree + needfree && ++laps > 1) {
+ */
+ if (laps > 1 && freemem < lotsfree + needfree) {
/*
- * The system is low on memory.
* Extremely unlikely, but it happens.
* We went around the loop at least once
* and didn't get far enough.
@@ -1350,40 +1382,33 @@ loop:
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
- mutex_enter(&pageout_mutex);
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
- mutex_exit(&pageout_mutex);
} else {
- mutex_exit(&pageout_mutex);
break;
}
}
}
}
- atomic_add_long(&nscan, nscan_cnt);
-
sample_end = gethrtime();
+ atomic_add_long(&nscan, nscan_cnt);
- DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
- uint_t, inst);
+ DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
+ pgcnt_t, nscan_cnt, pgcnt_t, pcount)
/*
- * The following two blocks are only relevant when the scanner is
- * first started up. After the scanner runs for a while, neither of
- * the conditions will ever be true again.
- *
* The global variables used below are only modified by this thread and
* only during initial scanning when there is a single page scanner
- * thread running. Thus, we don't use any locking.
+ * thread running.
*/
if (pageout_new_spread == 0) {
VERIFY3U(inst, ==, 0);
+
if (PAGE_SCAN_STARTUP) {
/*
* Continue accumulating samples until we have enough
- * to get a reasonable value for average scan rate:
+ * to get a reasonable value for average scan rate.
*/
pageout_sample_pages += pcount;
pageout_sample_etime += sample_end - sample_start;
@@ -1525,9 +1550,9 @@ checkpage(struct page *pp, pageout_hand_t whichhand)
if (pp->p_zoneid == ALL_ZONES ||
zone_pdata[pp->p_zoneid].zpers_over == 0) {
/*
- * Cross-zone shared page, or zone not over it's cap.
- * Leave the page alone.
- */
+ * Cross-zone shared page, or zone not over its cap.
+ * Leave the page alone.
+ */
page_unlock(pp);
return (CKP_INELIGIBLE);
}
@@ -1537,7 +1562,6 @@ checkpage(struct page *pp, pageout_hand_t whichhand)
/*
* Maintain statistics for what we are freeing
*/
-
if (pp->p_vnode != NULL) {
if (pp->p_vnode->v_flag & VVMEXEC)
isexec = 1;
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index c2123db013..b3a3e03fa3 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -84,6 +84,7 @@
static pgcnt_t max_page_get; /* max page_get request size in pages */
pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
+uint64_t n_throttle = 0; /* num times page create throttled */
/*
* freemem_lock protects all freemem variables:
diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h
index fde81e59ed..0d0c95535c 100644
--- a/usr/src/uts/i86pc/sys/vm_machparam.h
+++ b/usr/src/uts/i86pc/sys/vm_machparam.h
@@ -130,7 +130,7 @@ extern "C" {
*
* XXX - The system doesn't account for multiple swap devices.
*/
-#define DISKRPM 60
+#define DISKRPM 600
/*
* The maximum value for handspreadpages which is the distance