/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int checkpage(page_t *, int); /* * The following parameters control operation of the page replacement * algorithm. They are initialized to 0, and then computed at boot time * based on the size of the system. If they are patched non-zero in * a loaded vmunix they are left alone and may thus be changed per system * using mdb on the loaded system. */ pgcnt_t slowscan = 0; pgcnt_t fastscan = 0; static pgcnt_t handspreadpages = 0; static int loopfraction = 2; static pgcnt_t looppages; /* See comment below describing 4% and 80% */ static int min_percent_cpu = 4; static int max_percent_cpu = 80; static pgcnt_t maxfastscan = 0; static pgcnt_t maxslowscan = 100; pgcnt_t maxpgio = 0; pgcnt_t minfree = 0; pgcnt_t desfree = 0; pgcnt_t lotsfree = 0; pgcnt_t needfree = 0; pgcnt_t throttlefree = 0; pgcnt_t pageout_reserve = 0; pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; /* kstats */ uint64_t low_mem_scan; uint64_t zone_cap_scan; uint64_t n_throttle; clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */ /* * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks * are the number of ticks in each wakeup cycle that gives the * equivalent of some underlying %CPU duty cycle. * * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() * will run 4 times/sec to update pageout scanning parameters and kickoff * the pageout_scanner() thread if necessary. * * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). * * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed * by the scanner in a 1 second interval is 80% of a CPU * (RATETOSCHEDPAGING * 20). 
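 *
 * Expressed as code, this is the same computation that pageout_scanner()
 * performs when it starts up (restated here only for reference):
 *
 *	min_pageout_ticks = MAX(1,
 *	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
 *	max_pageout_ticks = MAX(min_pageout_ticks,
 *	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);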
There is no point making max_pageout_ticks >25 * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. * * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks * will be 200, so the CPU percentages are the same as when hz is 100. * * min_pageout_ticks: * ticks/wakeup equivalent of min_percent_cpu. * * max_pageout_ticks: * ticks/wakeup equivalent of max_percent_cpu. * * pageout_ticks: * Number of clock ticks budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_ticks .. max_pageout_ticks, * depending on memory pressure or zones over their cap. */ static clock_t min_pageout_ticks; static clock_t max_pageout_ticks; static clock_t pageout_ticks; #define MAX_PSCAN_THREADS 16 static boolean_t reset_hands[MAX_PSCAN_THREADS]; /* * These can be tuned in /etc/system or set with mdb. * 'des_page_scanners' is the desired number of page scanner threads. The * system will bring the actual number of threads into line with the desired * number. If des_page_scanners is set to an invalid value, the system will * correct the setting. */ uint_t des_page_scanners; uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */ uint_t n_page_scanners; static pgcnt_t pscan_region_sz; /* informational only */ #define PAGES_POLL_MASK 1023 /* * pageout_sample_lim: * The limit on the number of samples needed to establish a value * for new pageout parameters, fastscan, slowscan, and handspreadpages. * * pageout_sample_cnt: * Current sample number. Once the sample gets large enough, * set new values for handspreadpages, fastscan and slowscan. * * pageout_sample_pages: * The accumulated number of pages scanned during sampling. * * pageout_sample_etime: * The accumulated number of nanoseconds for the sample. * * pageout_rate: * Rate in pages/second, computed at the end of sampling. * * pageout_new_spread: * The new value to use for maxfastscan and (perhaps) handspreadpages. * Intended to be the number pages that can be scanned per sec using ~10% * of a CPU. Calculated after enough samples have been taken. * pageout_rate / 10 */ typedef hrtime_t hrrate_t; static uint_t pageout_sample_lim = 4; static uint_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; static hrtime_t pageout_sample_etime = 0; /* True if page scanner is first starting up */ #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) /* * Record number of times a pageout_scanner wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited * its budgeted number of pages. This is only done when scanning under low * free memory conditions, not when scanning for zones over their cap. */ uint64_t pageout_timeouts = 0; #ifdef VM_STATS static struct pageoutvmstats_str { ulong_t checkpage[3]; } pageoutvmstats; #endif /* VM_STATS */ /* * Threads waiting for free memory use this condition variable and lock until * memory becomes available. */ kmutex_t memavail_lock; kcondvar_t memavail_cv; /* * The size of the clock loop. */ #define LOOPPAGES total_pages /* * Local boolean to control scanning when zones are over their cap. Avoids * accessing the zone_num_over_cap variable except within schedpaging(), which * only runs periodically. This is here only to reduce our access to * zone_num_over_cap, since it is already accessed a lot during paging, and * the page scanner accesses the zones_over variable on each page during a * scan. 
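 * (Concretely, when zones_over is set, checkpage() consults
 * zone_pdata[pp->p_zoneid].zpers_over for each candidate page to decide
 * whether that page's zone is actually over its cap; see checkpage()
 * below.)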
 * There is no lock needed for zone_num_over_cap since schedpaging()
 * doesn't modify the variable; it only cares whether the variable is 0 or
 * non-0.
 */
static boolean_t zones_over = B_FALSE;

/*
 * Set up the paging constants for the page scanner clock-hand algorithm.
 * Called at startup after the system is initialized and the amount of memory
 * and number of paging devices is known (recalc will be 0). Called again
 * (recalc will be 1) once the scanner has collected enough samples and
 * PAGE_SCAN_STARTUP is no longer true.
 *
 * Will also be called after a memory dynamic reconfiguration operation, and
 * recalc will be 1 in those cases too.
 *
 * lotsfree is 1/64 of memory, but at least 512K (ha!).
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 */
void
setupclock(int recalc)
{
	uint_t i;
	pgcnt_t sz, tmp;
	static spgcnt_t init_lfree, init_dfree, init_mfree;
	static spgcnt_t init_tfree, init_preserve, init_mpgio;
	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

	looppages = LOOPPAGES;

	/*
	 * setupclock can be called to recalculate the paging
	 * parameters in the case of dynamic reconfiguration of memory.
	 * So to make sure we make the proper calculations, if such a
	 * situation should arise, we save away the initial values
	 * of each parameter so we can recall them when needed. This
	 * way we don't lose the settings an admin might have made
	 * through the /etc/system file.
	 */
	if (!recalc) {
		init_lfree = lotsfree;
		init_dfree = desfree;
		init_mfree = minfree;
		init_tfree = throttlefree;
		init_preserve = pageout_reserve;
		init_mpgio = maxpgio;
		init_mfscan = maxfastscan;
		init_fscan = fastscan;
		init_sscan = slowscan;
		init_hspages = handspreadpages;
	}

	/*
	 * Set up thresholds for paging:
	 */

	/*
	 * Lotsfree is the threshold at which the paging daemon turns on.
	 */
	if (init_lfree == 0 || init_lfree >= looppages)
		lotsfree = MAX(looppages / 64, btop(512 * 1024));
	else
		lotsfree = init_lfree;

	/*
	 * Desfree is the amount of memory desired free.
	 * If less than this for an extended period, start swapping.
	 */
	if (init_dfree == 0 || init_dfree >= lotsfree)
		desfree = lotsfree / 2;
	else
		desfree = init_dfree;

	/*
	 * Minfree is the minimal amount of free memory which is tolerable.
	 */
	if (init_mfree == 0 || init_mfree >= desfree)
		minfree = desfree / 2;
	else
		minfree = init_mfree;

	/*
	 * Throttlefree is the point at which we start throttling
	 * PG_WAIT requests until enough memory becomes available.
	 */
	if (init_tfree == 0 || init_tfree >= desfree)
		throttlefree = minfree;
	else
		throttlefree = init_tfree;

	/*
	 * Pageout_reserve is the number of pages that we keep in
	 * stock for pageout's own use. Having a few such pages
	 * provides insurance against system deadlock due to
	 * pageout needing pages. When freemem < pageout_reserve,
	 * non-blocking allocations are denied to any threads
	 * other than pageout and sched. (At some point we might
	 * want to consider a per-thread flag like T_PUSHING_PAGES
	 * to indicate that a thread is part of the page-pushing
	 * dance (e.g. an interrupt thread) and thus is entitled
	 * to the same special dispensation we accord pageout.)
	 */
	if (init_preserve == 0 || init_preserve >= throttlefree)
		pageout_reserve = throttlefree / 2;
	else
		pageout_reserve = init_preserve;

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging. We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
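	 *
	 * For example, assuming the historical DISKRPM value of 60, the
	 * default works out to maxpgio = (60 * 2) / 3 = 40 pageout I/O
	 * operations per second when maxpgio has not been set in
	 * /etc/system.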
*/ if (init_mpgio == 0) maxpgio = (DISKRPM * 2) / 3; else maxpgio = init_mpgio; /* * When the system is in a low memory state, the page scan rate varies * between fastscan and slowscan based on the amount of free memory * available. When only zones are over their memory cap, the scan rate * is always fastscan. * * The fastscan rate should be set based on the number pages that can * be scanned per sec using ~10% of a CPU. Since this value depends on * the processor, MMU, Ghz etc., it must be determined dynamically. * * When the scanner first starts up, fastscan will be set to 0 and * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). * However, once the scanner has collected enough samples, then fastscan * is set to be the smaller of 1/2 of memory (looppages / loopfraction) * or maxfastscan (which is set from pageout_new_spread). Thus, * MAXHANDSPREADPAGES is irrelevant after the scanner is fully * initialized. * * pageout_new_spread is calculated when the scanner first starts * running. During this initial sampling period the nscan_limit * is set to the total_pages of system memory. Thus, the scanner could * theoretically scan all of memory in one pass. However, each sample * is also limited by the %CPU budget. This is controlled by * pageout_ticks which is set in schedpaging(). During the sampling * period, pageout_ticks is set to max_pageout_ticks. This tick value * is derived from the max_percent_cpu (80%) described above. On a * system with more than a small amount of memory (~8GB), the scanner's * %CPU will be the limiting factor in calculating pageout_new_spread. * * At the end of the sampling period, the pageout_rate indicates how * many pages could be scanned per second. The pageout_new_spread is * then set to be 1/10th of that (i.e. approximating 10% of a CPU). * Of course, this value could still be more than the physical memory * on the system. If so, fastscan is set to 1/2 of memory, as * mentioned above. * * All of this leads up to the setting of handspreadpages, which is * set to fastscan. This is the distance, in pages, between the front * and back hands during scanning. It will dictate which pages will * be considered "hot" on the backhand and which pages will be "cold" * and reclaimed * * If the scanner is limited by desscan, then at the highest rate it * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the * scanner is limited by the %CPU, then at the highest rate (20% of a * CPU per cycle) the number of pages scanned could be much less. * * Thus, if the scanner is limited by desscan, then the handspreadpages * setting means 1sec between the front and back hands, but if the * scanner is limited by %CPU, it could be several seconds between the * two hands. * * The basic assumption is that at the worst case, stealing pages * not accessed within 1 sec seems reasonable and ensures that active * user processes don't thrash. This is especially true when the system * is in a low memory state. * * There are some additional factors to consider for the case of * scanning when zones are over their cap. In this situation it is * also likely that the machine will have a large physical memory which * will take many seconds to fully scan (due to the %CPU and desscan * limits per cycle). It is probable that there will be few (or 0) * pages attributed to these zones in any single scanning cycle. 
The * result is that reclaiming enough pages for these zones might take * several additional seconds (this is generally not a problem since * the zone physical cap is just a soft cap). * * This is similar to the typical multi-processor situation in which * pageout is often unable to maintain the minimum paging thresholds * under heavy load due to the fact that user processes running on * other CPU's can be dirtying memory at a much faster pace than * pageout can find pages to free. * * One potential approach to address both of these cases is to enable * more than one CPU to run the page scanner, in such a manner that the * various clock hands don't overlap. However, this also makes it more * difficult to determine the values for fastscan, slowscan and * handspreadpages. This is left as a future enhancement, if necessary. * * When free memory falls just below lotsfree, the scan rate goes from * 0 to slowscan (i.e., the page scanner starts running). This * transition needs to be smooth and is achieved by ensuring that * pageout scans a small number of pages to satisfy the transient * memory demand. This is set to not exceed 100 pages/sec (25 per * wakeup) since scanning that many pages has no noticible impact * on system performance. * * The swapper is currently used to free up memory when pageout is * unable to meet memory demands. It does this by swapping out entire * processes. In addition to freeing up memory, swapping also reduces * the demand for memory because the swapped out processes cannot * run, and thereby consume memory. However, this is a pathological * state and performance will generally be considered unacceptable. */ if (init_mfscan == 0) { if (pageout_new_spread != 0) maxfastscan = pageout_new_spread; else maxfastscan = MAXHANDSPREADPAGES; } else { maxfastscan = init_mfscan; } if (init_fscan == 0) { fastscan = MIN(looppages / loopfraction, maxfastscan); } else { fastscan = init_fscan; if (fastscan > looppages / loopfraction) fastscan = looppages / loopfraction; } /* * Set slow scan time to 1/10 the fast scan time, but * not to exceed maxslowscan. */ if (init_sscan == 0) slowscan = MIN(fastscan / 10, maxslowscan); else slowscan = init_sscan; if (slowscan > fastscan / 2) slowscan = fastscan / 2; /* * Handspreadpages is distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount * of pageable memory. * * Since pageout is limited to the %CPU per cycle, setting * handspreadpages to be "fastscan" results in the front hand being * a few secs (varies based on the processor speed) ahead of the back * hand at fastscan rates. * * As a result, user processes have a much better chance of * referencing their pages before the back hand examines them. * This also significantly lowers the number of reclaims from * the freelist since pageout does not end up freeing pages which * may be referenced a sec later. */ if (init_hspages == 0) handspreadpages = fastscan; else handspreadpages = init_hspages; /* * Make sure that back hand follows front hand by at least * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible * for the back hand to look at a page during the same wakeup of * the pageout daemon in which the front hand cleared its ref bit. */ if (handspreadpages >= looppages) handspreadpages = looppages - 1; if (recalc == 0) { /* * Setup basic values at initialization. 
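		 *
		 * (For a concrete, purely hypothetical illustration of the
		 * values computed above: with 4,194,304 pageable pages (a
		 * 16 GB system with 4K pages) and a pageout_new_spread of
		 * 400,000 from a completed sampling pass, the code above
		 * would yield fastscan = MIN(4,194,304 / 2, 400,000) =
		 * 400,000 pages/sec, slowscan = MIN(400,000 / 10, 100) =
		 * 100 pages/sec, and handspreadpages = fastscan = 400,000
		 * pages.)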
*/ pscan_region_sz = total_pages; des_page_scanners = n_page_scanners = 1; reset_hands[0] = B_TRUE; return; } /* * Recalculating * * We originally set the number of page scanners to 1. Now that we * know what the handspreadpages is for a scanner, figure out how many * scanners we should run. We want to ensure that the regions don't * overlap and that they are not touching. * * A default 64GB region size is used as the initial value to calculate * how many scanner threads we should create on lower memory systems. * The idea is to limit the number of threads to a practical value * (e.g. a 64GB machine really only needs one scanner thread). For very * large memory systems, we limit ourselves to MAX_PSCAN_THREADS * threads. * * The scanner threads themselves are evenly spread out around the * memory "clock" in pageout_scanner when we reset the hands, and each * thread will scan all of memory. */ sz = (btop(64ULL * 0x40000000ULL)); if (sz < handspreadpages) { /* * 64GB is smaller than the separation between the front * and back hands; use double handspreadpages. */ sz = handspreadpages << 1; } if (sz > total_pages) { sz = total_pages; } /* Record region size for inspection with mdb, otherwise unused */ pscan_region_sz = sz; tmp = sz; for (i = 1; tmp < total_pages; i++) { tmp += sz; } if (i > MAX_PSCAN_THREADS) i = MAX_PSCAN_THREADS; des_page_scanners = i; } /* * Pageout scheduling. * * Schedpaging controls the rate at which the page out daemon runs by * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING * times a second. The pageout_ticks variable controls the percent of one * CPU that each page scanner thread should consume (see min_percent_cpu * and max_percent_cpu descriptions). The desscan variable records the number * of pages pageout should examine in its next pass; schedpaging sets this * value based on the amount of currently available memory. In addtition, the * nscan variable records the number of pages pageout has examined in its * current pass; schedpaging resets this value to zero each time it runs. */ #define RATETOSCHEDPAGING 4 /* times/second */ /* held while pageout_scanner or schedpaging are modifying shared data */ static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests. */ static struct async_reqs *push_req; static struct async_reqs *req_freelist; /* available req structs */ static struct async_reqs *push_list; /* pending reqs */ static kmutex_t push_lock; /* protects req pool */ static kcondvar_t push_cv; static int async_list_size = 256; /* number of async request structs */ static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times * then leave it alone- don't page it out. */ #define MIN_PO_SHARE (8) #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24) ulong_t po_share = MIN_PO_SHARE; /* * Schedule rate for paging. * Rate is linear interpolation between * slowscan with lotsfree and fastscan when out of memory. */ static void schedpaging(void *arg) { spgcnt_t vavail; if (freemem < lotsfree + needfree + kmem_reapahead) kmem_reap(); if (freemem < lotsfree + needfree) seg_preap(); if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) kcage_cageout_wakeup(); (void) atomic_swap_ulong(&nscan, 0); vavail = freemem - deficit; if (pageout_new_spread != 0) vavail -= needfree; if (vavail < 0) vavail = 0; if (vavail > lotsfree) vavail = lotsfree; /* * Fix for 1161438 (CRS SPR# 73922). All variables * in the original calculation for desscan were 32 bit signed * ints. 
As freemem approaches 0x0 on a system with 1 Gig or * more of memory, the calculation can overflow. When this * happens, desscan becomes negative and pageout_scanner() * stops paging out. */ if ((needfree) && (pageout_new_spread == 0)) { /* * If we've not yet collected enough samples to * calculate a spread, kick into high gear anytime * needfree is non-zero. Note that desscan will not be * the limiting factor for systems with larger memory; * the %CPU will limit the scan. That will also be * maxed out below. */ desscan = fastscan / RATETOSCHEDPAGING; } else { /* * Once we've calculated a spread based on system * memory and usage, just treat needfree as another * form of deficit. */ spgcnt_t faststmp, slowstmp, result; slowstmp = slowscan * vavail; faststmp = fastscan * (lotsfree - vavail); result = (slowstmp + faststmp) / nz(lotsfree) / RATETOSCHEDPAGING; desscan = (pgcnt_t)result; } /* * If we've not yet collected enough samples to calculate a * spread, also kick %CPU to the max. */ if (pageout_new_spread == 0) { pageout_ticks = max_pageout_ticks; } else { pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); } if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { /* * We have finished the pagescan initialization and the desired * number of page scanners has changed, either because * initialization just finished, because of a memory DR, or * because des_page_scanners has been modified on the fly (i.e. * by mdb). If we need more scanners, start them now, otherwise * the excess scanners will terminate on their own when they * reset their hands. */ uint_t i; uint_t curr_nscan = n_page_scanners; pgcnt_t max = total_pages / handspreadpages; if (des_page_scanners > max) des_page_scanners = max; if (des_page_scanners > MAX_PSCAN_THREADS) { des_page_scanners = MAX_PSCAN_THREADS; } else if (des_page_scanners == 0) { des_page_scanners = 1; } /* * Each thread has its own entry in the reset_hands array, so * we don't need any locking in pageout_scanner to check the * thread's reset_hands entry. Thus, we use a pre-allocated * fixed size reset_hands array and upper limit on the number * of pagescan threads. * * The reset_hands entries need to be true before we start new * scanners, but if we're reducing, we don't want a race on the * recalculation for the existing threads, so we set * n_page_scanners first. */ n_page_scanners = des_page_scanners; for (i = 0; i < MAX_PSCAN_THREADS; i++) { reset_hands[i] = B_TRUE; } if (des_page_scanners > curr_nscan) { /* Create additional pageout scanner threads. */ for (i = curr_nscan; i < des_page_scanners; i++) { (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *)(uintptr_t)i, TS_RUN, curthread->t_pri); } } } zones_over = B_FALSE; if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { if (!PAGE_SCAN_STARTUP) low_mem_scan++; DTRACE_PROBE(schedpage__wake__low); WAKE_PAGEOUT_SCANNER(); } else if (zone_num_over_cap > 0) { /* One or more zones are over their cap. */ /* No page limit */ desscan = total_pages; /* * Increase the scanning CPU% to the max. This implies * 80% of one CPU/sec if the scanner can run each * opportunity. Can also be tuned via setting * zone_pageout_ticks in /etc/system or with mdb. */ pageout_ticks = (zone_pageout_ticks != 0) ? zone_pageout_ticks : max_pageout_ticks; zones_over = B_TRUE; zone_cap_scan++; DTRACE_PROBE(schedpage__wake__zone); WAKE_PAGEOUT_SCANNER(); } else { /* * There are enough free pages, no need to * kick the scanner thread. 
		 * And next time around, keep more of the
		 * `highly shared' pages.
		 */
		cv_signal_pageout();

		mutex_enter(&pageout_mutex);
		if (po_share > MIN_PO_SHARE) {
			po_share >>= 1;
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast,
	 * but in this case it is not needed - the waiters will be woken up
	 * during the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}

pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

#define	FRONT	1
#define	BACK	2

int dopageout = 1;	/* /etc/system tunable to disable page reclamation */

/*
 * The page out daemon, which runs as process 2.
 *
 * Page out occurs when either:
 * a) there are fewer than lotsfree pages free,
 * b) there are one or more zones over their physical memory cap.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm. The front hand moves
 * through the pages, clearing the reference bit. The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed. If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with larger
 * memory, multiple pageout_scanner threads may be created. In this case, the
 * threads are evenly distributed around the memory "clock face" so that
 * memory can be reclaimed more quickly (with a single thread on a large
 * machine there can be long stretches during which no pages in a given
 * region are visited, and that lag causes undesirable behavior such as
 * htable stealing).
 *
 * As long as there are at least lotsfree pages free and no zones are over
 * their cap, pageout_scanner threads are not run. When pageout_scanner
 * threads are running for case (a), all pages are considered for pageout.
 * For case (b), only pages belonging to a zone over its cap will be
 * considered for pageout.
 *
 * There are multiple threads that act on behalf of the pageout process.
 * A set of threads (pageout_scanner) scans pages and frees them up if
 * they don't require any VOP_PUTPAGE operation. If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled. The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the pageout_scanner threads can still operate. There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * The pageout_scanner parameters are determined in schedpaging().
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
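	 * The pool is a single zeroed array of async_list_size (256 by
	 * default) request structures; the loop below threads the first
	 * async_list_size - 1 entries onto req_freelist, and the final
	 * entry's a_next is left NULL by kmem_zalloc().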
*/ push_req = (struct async_reqs *) kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); req_freelist = push_req; for (i = 0; i < async_list_size - 1; i++) push_req[i].a_next = &push_req[i + 1]; pageout_pri = curthread->t_pri; /* Create the (first) pageout scanner thread. */ (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, TS_RUN, pageout_pri - 1); /* * kick off pageout scheduler. */ schedpaging(NULL); /* * Create kernel cage thread. * The kernel cage thread is started under the pageout process * to take advantage of the less restricted page allocation * in page_create_throttle(). */ kcage_cageout_init(); /* * Limit pushes to avoid saturating pageout devices. */ max_pushes = maxpgio / RATETOSCHEDPAGING; CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); for (;;) { mutex_enter(&push_lock); while ((arg = push_list) == NULL || pushes > max_pushes) { CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&push_cv, &push_lock); pushes = 0; CALLB_CPR_SAFE_END(&cprinfo, &push_lock); } push_list = arg->a_next; arg->a_next = NULL; mutex_exit(&push_lock); DTRACE_PROBE(pageout__push); if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; } /* vp held by checkpage() */ VN_RELE(arg->a_vp); mutex_enter(&push_lock); arg->a_next = req_freelist; /* back on freelist */ req_freelist = arg; push_list_size--; mutex_exit(&push_lock); } } /* * Kernel thread that scans pages looking for ones to free */ static void pageout_scanner(void *a) { struct page *fronthand, *backhand; uint_t count, iter = 0; callb_cpr_t cprinfo; pgcnt_t nscan_cnt, nscan_limit; pgcnt_t pcount; uint_t inst = (uint_t)(uintptr_t)a; hrtime_t sample_start, sample_end; clock_t pageout_lbolt; kmutex_t pscan_mutex; VERIFY3U(inst, <, MAX_PSCAN_THREADS); mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); mutex_enter(&pscan_mutex); min_pageout_ticks = MAX(1, ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); max_pageout_ticks = MAX(min_pageout_ticks, ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING); loop: cv_signal_pageout(); CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&proc_pageout->p_cv, &pscan_mutex); CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex); if (!dopageout) goto loop; if (reset_hands[inst]) { struct page *first; pgcnt_t offset = total_pages / n_page_scanners; reset_hands[inst] = B_FALSE; if (inst >= n_page_scanners) { /* * The desired number of page scanners has been * reduced and this instance is no longer wanted. * Exit the lwp. */ VERIFY3U(inst, !=, 0); mutex_exit(&pscan_mutex); mutex_enter(&curproc->p_lock); lwp_exit(); } /* * The reset case repositions the hands at the proper place * on the memory clock face to prevent creep into another * thread's active region or when the number of threads has * changed. * * Set the two clock hands to be separated by a reasonable * amount, but no more than 360 degrees apart. * * If inst == 0, backhand starts at first page, otherwise * it is (inst * offset) around the memory "clock face" so that * we spread out each scanner instance evenly. */ first = page_first(); backhand = page_nextn(first, offset * inst); if (handspreadpages >= total_pages) { fronthand = page_nextn(backhand, total_pages - 1); } else { fronthand = page_nextn(backhand, handspreadpages); } } /* * This CPU kstat is only incremented here and we're obviously on this * CPU, so no lock. 
*/ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); count = 0; /* Kernel probe */ TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); pcount = 0; nscan_cnt = 0; if (PAGE_SCAN_STARTUP) { nscan_limit = total_pages; } else { nscan_limit = desscan; } DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, page_t *, backhand, page_t *, fronthand); pageout_lbolt = ddi_get_lbolt(); sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. * Only scan while at least one of these is true: * 1) one or more zones is over its cap * 2) there is not enough free memory * 3) during page scan startup when determining sample data */ while (nscan_cnt < nscan_limit && (zones_over || freemem < lotsfree + needfree || PAGE_SCAN_STARTUP)) { int rvfront, rvback; DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); /* * Check to see if we have exceeded our %CPU budget * for this wakeup, but not on every single page visited, * just every once in a while. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { clock_t pageout_cycle_ticks; pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; if (pageout_cycle_ticks >= pageout_ticks) { /* * This is where we normally break out of the * loop when scanning zones or sampling. */ if (!zones_over) { atomic_inc_64(&pageout_timeouts); } DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } /* * If checkpage manages to add a page to the free list, * we give ourselves another couple of trips around memory. */ if ((rvfront = checkpage(fronthand, FRONT)) == 1) count = 0; if ((rvback = checkpage(backhand, BACK)) == 1) count = 0; ++pcount; /* * This CPU kstat is only incremented here and we're obviously * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); /* * Don't include ineligible pages in the number scanned. */ if (rvfront != -1 || rvback != -1) nscan_cnt++; backhand = page_next(backhand); /* * backhand update and wraparound check are done separately * because lint barks when it finds an empty "if" body */ if ((fronthand = page_next(fronthand)) == page_first()) { DTRACE_PROBE1(pageout__wrap__front, uint_t, inst); /* * Every 64 wraps we reposition our hands within our * region to prevent creep into another thread. */ if ((++iter % pageout_reset_cnt) == 0) reset_hands[inst] = B_TRUE; /* * This CPU kstat is only incremented here and we're * obviously on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, rev, 1); /* * If scanning because the system is low on memory, * then when we wraparound memory we want to try to * reclaim more pages. * If scanning only because zones are over their cap, * then wrapping is common and we simply keep going. */ if (freemem < lotsfree + needfree && ++count > 1) { /* * The system is low on memory. * Extremely unlikely, but it happens. * We went around memory at least once * and didn't reclaim enough. * If we are still skipping `highly shared' * pages, skip fewer of them. Otherwise, * give up till the next clock tick. */ mutex_enter(&pageout_mutex); if (po_share < MAX_PO_SHARE) { po_share <<= 1; mutex_exit(&pageout_mutex); } else { /* * Really a "goto loop", but if someone * is tracing or TNF_PROBE_ing, hit * those probes first. 
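					 * (po_share moves in powers of two
					 * between MIN_PO_SHARE (8) and
					 * MAX_PO_SHARE (8 << 24): the branch
					 * above doubles it each time a full
					 * revolution fails to reclaim enough
					 * memory, and schedpaging() halves it
					 * again once free memory is
					 * plentiful. Once it reaches
					 * MAX_PO_SHARE we give up for this
					 * wakeup.)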
*/ mutex_exit(&pageout_mutex); break; } } } } atomic_add_long(&nscan, nscan_cnt); sample_end = gethrtime(); DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, uint_t, inst); /* Kernel probe */ TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, freemem); /* * The following two blocks are only relevant when the scanner is * first started up. After the scanner runs for a while, neither of * the conditions will ever be true again. * * The global variables used below are only modified by this thread and * only during initial scanning when there is a single page scanner * thread running. Thus, we don't use any locking. */ if (PAGE_SCAN_STARTUP) { VERIFY3U(inst, ==, 0); pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; } else if (pageout_new_spread == 0) { uint_t i; /* * We have run enough samples, set the spread. */ VERIFY3U(inst, ==, 0); pageout_rate = (hrrate_t)pageout_sample_pages * (hrrate_t)(NANOSEC) / pageout_sample_etime; pageout_new_spread = pageout_rate / 10; setupclock(1); } goto loop; } /* * Look at the page at hand. If it is locked (e.g., for physical i/o), * system (u., page table) or free, then leave it alone. Otherwise, * if we are running the front hand, turn off the page's reference bit. * If running the back hand, check whether the page has been reclaimed. * If not, free the page, pushing it to disk first if necessary. * * Return values: * -1 if the page is not a candidate at all, * 0 if not freed, or * 1 if we freed it. */ static int checkpage(struct page *pp, int whichhand) { int ppattr; int isfs = 0; int isexec = 0; int pagesync_flag; zoneid_t zid = ALL_ZONES; /* * Skip pages: * - associated with the kernel vnode since * they are always "exclusively" locked. * - that are free * - that are shared more than po_share'd times * - its already locked * * NOTE: These optimizations assume that reads are atomic. */ if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || hat_page_checkshare(pp, po_share)) { return (-1); } if (!page_trylock(pp, SE_EXCL)) { /* * Skip the page if we can't acquire the "exclusive" lock. */ return (-1); } else if (PP_ISFREE(pp)) { /* * It became free between the above check and our actually * locking the page. Oh, well there will be other pages. */ page_unlock(pp); return (-1); } /* * Reject pages that cannot be freed. The page_struct_lock * need not be acquired to examine these * fields since the page has an "exclusive" lock. */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); return (-1); } if (zones_over) { ASSERT(pp->p_zoneid == ALL_ZONES || pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); if (pp->p_zoneid == ALL_ZONES || zone_pdata[pp->p_zoneid].zpers_over == 0) { /* * Cross-zone shared page, or zone not over it's cap. * Leave the page alone. */ page_unlock(pp); return (-1); } zid = pp->p_zoneid; } /* * Maintain statistics for what we are freeing */ if (pp->p_vnode != NULL) { if (pp->p_vnode->v_flag & VVMEXEC) isexec = 1; if (!IS_SWAPFSVP(pp->p_vnode)) isfs = 1; } /* * Turn off REF and MOD bits with the front hand. * The back hand examines the REF bit and always considers * SHARED pages as referenced. 
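	 * Hence the flags below: the front hand uses HAT_SYNC_ZERORM to
	 * clear ref/mod while syncing, while the back hand uses
	 * HAT_SYNC_DONTZERO and asks hat_pagesync() to stop early as soon
	 * as a reference (or sharing) is seen, since such a page cannot be
	 * freed anyway.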
*/ if (whichhand == FRONT) pagesync_flag = HAT_SYNC_ZERORM; else pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_SHARED; ppattr = hat_pagesync(pp, pagesync_flag); recheck: /* * If page is referenced; fronthand makes unreferenced and reclaimable. * For the backhand, a process referenced the page since the front hand * went by, so it's not a candidate for freeing up. */ if (ppattr & P_REF) { DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand); if (whichhand == FRONT) { hat_clrref(pp); } page_unlock(pp); return (0); } /* * This page is not referenced, so it must be reclaimable and we can * add it to the free list. This can be done by either hand. */ VM_STAT_ADD(pageoutvmstats.checkpage[0]); /* * If large page, attempt to demote it. If successfully demoted, * retry the checkpage. */ if (pp->p_szc != 0) { if (!page_try_demote_pages(pp)) { VM_STAT_ADD(pageoutvmstats.checkpage[1]); page_unlock(pp); return (-1); } ASSERT(pp->p_szc == 0); VM_STAT_ADD(pageoutvmstats.checkpage[2]); /* * since page_try_demote_pages() could have unloaded some * mappings it makes sense to reload ppattr. */ ppattr = hat_page_getattr(pp, P_MOD | P_REF); } /* * If the page is currently dirty, we have to arrange * to have it cleaned before it can be freed. * * XXX - ASSERT(pp->p_vnode != NULL); */ if ((ppattr & P_MOD) && pp->p_vnode) { struct vnode *vp = pp->p_vnode; u_offset_t offset = pp->p_offset; /* * Note: There is no possibility to test for process being * swapped out or about to exit since we can't get back to * process(es) from the page. */ /* * Hold the vnode before releasing the page lock to * prevent it from being freed and re-used by some * other thread. */ VN_HOLD(vp); page_unlock(pp); /* * Queue i/o request for the pageout thread. */ if (!queue_io_request(vp, offset)) { VN_RELE(vp); return (0); } if (isfs) { zone_pageout_stat(zid, ZPO_DIRTY); } else { zone_pageout_stat(zid, ZPO_ANONDIRTY); } return (1); } /* * Now we unload all the translations, * and put the page back on to the free list. * If the page was used (referenced or modified) after * the pagesync but before it was unloaded we catch it * and handle the page properly. */ DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ppattr = hat_page_getattr(pp, P_MOD | P_REF); if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) goto recheck; /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_FREE, 0, kcred); CPU_STATS_ADD_K(vm, dfree, 1); if (isfs) { if (isexec) { CPU_STATS_ADD_K(vm, execfree, 1); } else { CPU_STATS_ADD_K(vm, fsfree, 1); } zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); zone_pageout_stat(zid, ZPO_ANON); } return (1); /* freed a page! */ } /* * Queue async i/o request from pageout_scanner and segment swapout * routines on one common list. This ensures that pageout devices (swap) * are not saturated by pageout_scanner or swapout requests. * The pageout thread empties this list by initiating i/o operations. */ int queue_io_request(vnode_t *vp, u_offset_t off) { struct async_reqs *arg; /* * If we cannot allocate an async request struct, * skip this page. */ mutex_enter(&push_lock); if ((arg = req_freelist) == NULL) { mutex_exit(&push_lock); return (0); } req_freelist = arg->a_next; /* adjust freelist */ push_list_size++; arg->a_vp = vp; arg->a_off = off; arg->a_len = PAGESIZE; arg->a_flags = B_ASYNC | B_FREE; arg->a_cred = kcred; /* always held */ /* * Add to list of pending write requests. 
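	 * The request is pushed on the head of push_list (LIFO). We only
	 * cv_signal() the pageout thread from here when the free list has
	 * been exhausted; in the common case it is woken instead by
	 * cv_signal_pageout(), called from the scanner's main loop and from
	 * schedpaging().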
*/ arg->a_next = push_list; push_list = arg; if (req_freelist == NULL) { /* * No free async requests left. The lock is held so we * might as well signal the pusher thread now. */ cv_signal(&push_cv); } mutex_exit(&push_lock); return (1); } /* * Wakeup pageout to initiate i/o if push_list is not empty. */ void cv_signal_pageout() { if (push_list != NULL) { mutex_enter(&push_lock); cv_signal(&push_cv); mutex_exit(&push_lock); } }