author     Jerry Jelinek <jerry.jelinek@joyent.com>  2017-10-12 14:43:52 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>  2017-10-12 15:05:24 +0000
commit     bf4caad4ba9209e6bcd3f68a6d9e197473022286 (patch)
tree       d019ac538cd100af5721431c04bacdb6418fb89c
parent     a298c587338cf16ca71d352bedfb494fba3f0378 (diff)
download   illumos-joyent-bf4caad4ba9209e6bcd3f68a6d9e197473022286.tar.gz
OS-6306 accurate in-kernel zone RSS tracking
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/cmd/zoneadmd/mcap.c                       236
-rw-r--r--  usr/src/uts/common/brand/lx/procfs/lx_prvnops.c    17
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_rlimit.c     6
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c   28
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vnops.c          19
-rw-r--r--  usr/src/uts/common/os/zone.c                      398
-rw-r--r--  usr/src/uts/common/sys/zone.h                      62
-rw-r--r--  usr/src/uts/common/syscall/sysconfig.c             46
-rw-r--r--  usr/src/uts/common/vm/page.h                        7
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c                   26
-rw-r--r--  usr/src/uts/i86pc/os/startup.c                      1
-rw-r--r--  usr/src/uts/i86pc/vm/hment.c                        9
-rw-r--r--  usr/src/uts/sfmmu/vm/hat_sfmmu.c                    6
13 files changed, 478 insertions(+), 383 deletions(-)
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 88b72b6c55..d280c49b5b 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -28,29 +28,18 @@
* the associated zone's physical memory. A thread to do this is started
* when the zone boots and is halted when the zone shuts down.
*
- * Because of the way that the VM system is currently implemented, there is no
- * way to go from the bottom up (page to process to zone). Thus, there is no
- * obvious way to hook an rctl into the kernel's paging code to enforce a hard
- * memory cap. Instead, we implement a soft physical memory cap which looks
- * at the zone's overall rss and once it is over the cap, works from the top
- * down (zone to process to page), looking at zone processes, to determine
- * what to try to pageout to get the zone under its memory cap.
- *
- * The code uses the fast, cheap, but potentially very inaccurate sum of the
- * rss values from psinfo_t to first approximate the zone's rss and will
- * fallback to the vm_getusage syscall to determine the zone's rss if needed.
+ * The code obtains the accurate in-kernel RSS for the zone.
* It then checks the rss against the zone's zone.max-physical-memory rctl.
* Once the zone goes over its cap, then this thread will work through the
* zone's /proc process list, Pgrab-bing each process and stepping through the
- * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
- * to pageout pages, until the zone is again under its cap.
+ * address space segments, using a private option (_RUSAGESYS_INVALMAP) to the
+ * private SYS_rusagesys syscall to attempt to unload page translations, until
+ * the zone is again under its cap.
*
* Although zone memory capping is implemented as a soft cap by this user-level
* thread, the interfaces around memory caps that are exposed to the user are
* the standard ones; an rctl and kstats. This thread uses the rctl value
- * to obtain the cap and works with the zone kernel code to update the kstats.
- * If the implementation ever moves into the kernel, these exposed interfaces
- * do not need to change.
+ * to obtain the cap.
*
* The thread adaptively sleeps, periodically checking the state of the
* zone. As the zone's rss gets closer to the cap, the thread will wake up
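
Taken together, the updated comment describes a simple control loop. A minimal
sketch of that loop follows, reusing get_mem_info() and check_suspend() from
this file; pageout_zone_processes() is a hypothetical stand-in for the /proc
walk, named here only for illustration:

	/*
	 * Sketch of the capping thread's main loop; the real logic lives
	 * in mcap_zone() below and adds tunables and more shutdown checks.
	 */
	static void
	mcap_loop_sketch(void)
	{
		while (!shutting_down) {
			uint64_t rss = get_mem_info();	/* zone RSS in KB */
			int64_t excess = (int64_t)(rss - zone_rss_cap);

			if (excess > 0)
				pageout_zone_processes(excess);

			/* Adaptive sleep; < 0 once the zone is halting. */
			if (check_suspend() < 0)
				break;
		}
	}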
@@ -129,14 +118,6 @@
#define TUNE_NPAGE "phys-mcap-no-pageout"
#define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
-/*
- * These are only used in get_mem_info but global. We always need scale_rss and
- * prev_fast_rss to be persistent but we also have the other two global so we
- * can easily see these with mdb.
- */
-uint64_t scale_rss = 0;
-uint64_t prev_fast_rss = 0;
-uint64_t fast_rss = 0;
uint64_t accurate_rss = 0;
/*
@@ -160,8 +141,6 @@ static boolean_t skip_vmusage = B_FALSE;
static boolean_t skip_pageout = B_FALSE;
static boolean_t skip_pf_throttle = B_FALSE;
-static zlog_t *logp;
-
static int64_t check_suspend();
static void get_mcap_tunables();
@@ -535,127 +514,12 @@ done:
static uint64_t
get_mem_info()
{
- uint64_t n = 1;
- zsd_vmusage64_t buf;
- uint64_t tmp_rss;
- DIR *pdir = NULL;
- struct dirent *dent;
-
- /*
- * Start by doing the fast, cheap RSS calculation using the rss value
- * in psinfo_t. Because that's per-process, it can lead to double
- * counting some memory and overestimating how much is being used, but
- * as long as that's not over the cap, then we don't need do the
- * expensive calculation.
- *
- * If we have to do the expensive calculation, we remember the scaling
- * factor so that we can try to use that on subsequent iterations for
- * the fast rss.
- */
- if (shutting_down)
- return (0);
-
- if ((pdir = opendir(zoneproc)) == NULL)
- return (0);
-
- accurate_rss = 0;
- fast_rss = 0;
- while (!shutting_down && (dent = readdir(pdir)) != NULL) {
- pid_t pid;
- int psfd;
- int64_t rss;
- char pathbuf[MAXPATHLEN];
- psinfo_t psinfo;
-
- if (strcmp(".", dent->d_name) == 0 ||
- strcmp("..", dent->d_name) == 0)
- continue;
-
- pid = atoi(dent->d_name);
- if (pid == 0 || pid == 1)
- continue;
-
- (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
- zoneproc, pid);
-
- rss = 0;
- if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
- if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
- sizeof (psinfo))
- rss = (int64_t)psinfo.pr_rssize;
-
- (void) close(psfd);
- }
-
- fast_rss += rss;
- }
-
- (void) closedir(pdir);
-
if (shutting_down)
return (0);
- debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
- scale_rss, prev_fast_rss);
-
- /* see if we can get by with a scaled fast rss */
- tmp_rss = fast_rss;
- if (scale_rss > 1 && prev_fast_rss > 0) {
- /*
- * Only scale the fast value if it hasn't ballooned too much
- * to trust.
- */
- if (fast_rss / prev_fast_rss < 2) {
- fast_rss /= scale_rss;
- debug("scaled fast rss: %lluKB\n", fast_rss);
- }
- }
-
- if (fast_rss <= zone_rss_cap || skip_vmusage) {
- uint64_t zone_rss_bytes;
-
- zone_rss_bytes = fast_rss * 1024;
- /* Use the zone's approx. RSS in the kernel */
- (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
- return (fast_rss);
- }
-
- buf.vmu_id = zid;
-
- /* get accurate usage (cached data may be up to 5 seconds old) */
- if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
- (uintptr_t)&buf, (uintptr_t)&n) != 0) {
- debug("vmusage failed\n");
- (void) sleep_shutdown(1);
- return (0);
- }
-
- if (n > 1) {
- /* This should never happen */
- debug("vmusage returned more than one result\n");
- (void) sleep_shutdown(1);
- return (0);
- }
-
- if (buf.vmu_id != zid) {
- /* This should never happen */
- debug("vmusage returned the incorrect zone\n");
- (void) sleep_shutdown(1);
- return (0);
- }
-
- accurate_rss = buf.vmu_rss_all / 1024;
-
- /* calculate scaling factor to use for fast_rss from now on */
- if (accurate_rss > 0) {
- scale_rss = fast_rss / accurate_rss;
- debug("new scaling factor: %llu\n", scale_rss);
- /* remember the fast rss when we had to get the accurate rss */
- prev_fast_rss = tmp_rss;
- }
-
- debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
- scale_rss, prev_fast_rss);
+ (void) zone_getattr(zid, ZONE_ATTR_RSS, &accurate_rss,
+ sizeof (accurate_rss));
+ accurate_rss /= 1024;
return (accurate_rss);
}
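
With the scaling machinery removed, get_mem_info() reduces to the single
zone_getattr() call above. For illustration, a standalone userland sketch of
the same fetch; ZONE_ATTR_RSS is a private attribute (its kernel side is added
in the zone.c hunk below), so this is not a stable interface:

	#include <sys/types.h>
	#include <sys/zone.h>
	#include <zone.h>
	#include <stdio.h>

	/* Print a zone's current RSS in kilobytes (requires privilege). */
	static int
	print_zone_rss(zoneid_t zid)
	{
		uint64_t rss_bytes = 0;

		if (zone_getattr(zid, ZONE_ATTR_RSS, &rss_bytes,
		    sizeof (rss_bytes)) < 0) {
			perror("zone_getattr");
			return (-1);
		}
		(void) printf("rss: %llu KB\n",
		    (u_longlong_t)(rss_bytes / 1024));
		return (0);
	}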
@@ -988,75 +852,6 @@ has_proc()
}
/*
- * We run this loop for brands with no /proc to simply update the RSS, using
- * the cheap GZ /proc data, every 5 minutes.
- */
-static void
-no_procfs()
-{
- DIR *pdir = NULL;
- struct dirent *dent;
- uint64_t zone_rss_bytes;
-
- (void) sleep_shutdown(30);
- while (!shutting_down) {
- /*
- * Just do the fast, cheap RSS calculation using the rss value
- * in psinfo_t. Because that's per-process, it can lead to
- * double counting some memory and overestimating how much is
- * being used. Since there is no /proc in the zone, we use the
- * GZ /proc and check for the correct zone.
- */
- if ((pdir = opendir("/proc")) == NULL)
- return;
-
- fast_rss = 0;
- while (!shutting_down && (dent = readdir(pdir)) != NULL) {
- pid_t pid;
- int psfd;
- int64_t rss;
- char pathbuf[MAXPATHLEN];
- psinfo_t psinfo;
-
- if (strcmp(".", dent->d_name) == 0 ||
- strcmp("..", dent->d_name) == 0)
- continue;
-
- pid = atoi(dent->d_name);
- if (pid == 0 || pid == 1)
- continue;
-
- (void) snprintf(pathbuf, sizeof (pathbuf),
- "/proc/%d/psinfo", pid);
-
- rss = 0;
- if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
- if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
- sizeof (psinfo)) {
- if (psinfo.pr_zoneid == zid)
- rss = (int64_t)psinfo.pr_rssize;
- }
-
- (void) close(psfd);
- }
-
- fast_rss += rss;
- }
-
- (void) closedir(pdir);
-
- if (shutting_down)
- return;
-
- zone_rss_bytes = fast_rss * 1024;
- /* Use the zone's approx. RSS in the kernel */
- (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
-
- (void) sleep_shutdown(300);
- }
-}
-
-/*
* Thread that checks zone's memory usage and when over the cap, goes through
* the zone's process list trying to pageout processes to get under the cap.
*/
@@ -1066,21 +861,17 @@ mcap_zone()
DIR *pdir = NULL;
int64_t excess;
- debug("thread startup\n");
-
- get_mcap_tunables();
-
/*
- * If the zone has no /proc filesystem, we can't use the fast algorithm
- * to check RSS or pageout any processes. All we can do is periodically
- * update it's RSS kstat using the expensive sycall.
+ * If the zone has no /proc filesystem (e.g. KVM), we can't pageout any
+ * processes. Terminate this thread.
*/
if (!has_proc()) {
- no_procfs();
- debug("thread shutdown\n");
return;
}
+ debug("thread startup\n");
+ get_mcap_tunables();
+
/*
* When first starting it is likely lots of other zones are starting
* too because the system is booting. Since we just started the zone
@@ -1172,7 +963,6 @@ create_mcap_thread(zlog_t *zlogp, zoneid_t id)
shutting_down = 0;
zid = id;
- logp = zlogp;
/* all but the lx brand currently use /proc */
if (strcmp(brand_name, "lx") == 0) {
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
index 59fac45556..8f58b3a5ad 100644
--- a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
@@ -3776,7 +3776,7 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
zone_t *zone = LXPTOZ(lxpnp);
lx_zone_data_t *lxzd = ztolxzd(zone);
- long total_mem, free_mem, total_swap;
+ ulong_t total_mem, free_mem, total_swap;
boolean_t swap_disabled;
ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
@@ -3784,21 +3784,16 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
ASSERT(lxzd != NULL);
swap_disabled = lxzd->lxzd_swap_disabled;
- if (zone->zone_phys_mem_ctl == UINT64_MAX) {
- total_mem = physmem * PAGESIZE;
- free_mem = freemem * PAGESIZE;
- } else {
- total_mem = zone->zone_phys_mem_ctl;
- free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
- if (free_mem < 0)
- free_mem = 0;
- }
+ zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem,
+ (pgcnt_t *)&free_mem);
+ total_mem = ptob(total_mem);
+ free_mem = ptob(free_mem);
if (swap_disabled) {
total_swap = 0;
} else {
if (zone->zone_max_swap_ctl == UINT64_MAX) {
- total_swap = k_anoninfo.ani_max * PAGESIZE;
+ total_swap = ptob(k_anoninfo.ani_max);
} else {
mutex_enter(&zone->zone_mem_lock);
total_swap = zone->zone_max_swap_ctl;
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
index 8fadf8d391..30fa996615 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
@@ -158,7 +158,11 @@ lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp)
case LX_RLIMIT_RSS:
/* zone.max-physical-memory */
- rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_phys_mem_ctl;
+ zone_get_physmem_data(curzone->zone_id,
+ (pgcnt_t *)&rlim64.rlim_cur,
+ (pgcnt_t *)&rlim64.rlim_max); /* max is dummy variable */
+ rlim64.rlim_cur = rlim64.rlim_max = ptob(rlim64.rlim_cur);
+
break;
case LX_RLIMIT_NPROC:
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
index 387471c0f5..052ad322a7 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
@@ -21,7 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
#include <vm/anon.h>
@@ -75,8 +75,9 @@ extern pgcnt_t swapfs_minfree;
static void
lx_sysinfo_common(lx_sysinfo_t *si)
{
- zone_t *zone = curthread->t_procp->p_zone;
- uint64_t zphysmem, zfreemem, ztotswap, zfreeswap;
+ zone_t *zone = curzone;
+ pgcnt_t zphysmem, zfreemem;
+ ulong_t ztotswap, zfreeswap;
si->si_uptime = gethrestime_sec() - zone->zone_boot_time;
@@ -90,26 +91,7 @@ lx_sysinfo_common(lx_sysinfo_t *si)
*/
si->si_procs = (int32_t)zone->zone_nlwps;
- /*
- * If memory or swap limits are set on the zone, use those, otherwise
- * use the system values. physmem and freemem are in pages, but the
- * zone values are in bytes. Likewise, ani_max and ani_free are in
- * pages.
- */
- if (zone->zone_phys_mem_ctl == UINT64_MAX) {
- zphysmem = physmem;
- zfreemem = freemem;
- } else {
- int64_t freemem;
-
- zphysmem = btop(zone->zone_phys_mem_ctl);
- freemem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
- if (freemem > 0) {
- zfreemem = btop(freemem);
- } else {
- zfreemem = 0;
- }
- }
+ zone_get_physmem_data(zone->zone_id, &zphysmem, &zfreemem);
if (zone->zone_max_swap_ctl == UINT64_MAX) {
ztotswap = k_anoninfo.ani_max;
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
index 1f7f3074d6..9bcc0f7e8b 100644
--- a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
@@ -1449,23 +1449,18 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
zone_t *zone = LXPTOZ(lxpnp);
int global = zone == global_zone;
- long total_mem, free_mem, total_swap, used_swap;
+ ulong_t total_mem, free_mem, total_swap, used_swap;
ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
- if (global || zone->zone_phys_mem_ctl == UINT64_MAX) {
- total_mem = physmem * PAGESIZE;
- free_mem = freemem * PAGESIZE;
- } else {
- total_mem = zone->zone_phys_mem_ctl;
- free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
- if (free_mem < 0)
- free_mem = 0;
- }
+ zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem,
+ (pgcnt_t *)&free_mem);
+ total_mem = ptob(total_mem);
+ free_mem = ptob(free_mem);
if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
- total_swap = k_anoninfo.ani_max * PAGESIZE;
- used_swap = k_anoninfo.ani_phys_resv * PAGESIZE;
+ total_swap = ptob(k_anoninfo.ani_max);
+ used_swap = ptob(k_anoninfo.ani_phys_resv);
} else {
mutex_enter(&zone->zone_mem_lock);
total_swap = zone->zone_max_swap_ctl;
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 2912df0a29..9c1ee8d750 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -328,8 +328,8 @@ static list_t zone_active;
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;
@@ -429,6 +429,55 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
static const int ZONE_SYSCALL_API_VERSION = 7;
/*
+ * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's
+ * current page usage, its page limit, a flag indicating if the zone is
+ * over its physical memory cap and various statistics. The zpcap_over flag is
+ * the interface for the page scanner to use when reclaiming pages for zones
+ * that are over their cap.
+ *
+ * All zone physical memory cap data is stored in this array instead of within
+ * the zone structure itself. This is because zone structures come and go, but
+ * paging-related work can be asynchronous to any particular zone. In
+ * particular:
+ * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
+ * associated with any zone.
+ * 2) Freeing segkp pages can occur long after the zone which first
+ * instantiated those pages has gone away.
+ * We want to be able to account for pages/zone without constantly having to
+ * take extra locks and finding the relevant zone structure, particularly during
+ * page scanning.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's zpcap_over entry in the array. The scanner should never modify
+ * either of these items. Internally the entries and the counter are managed
+ * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
+ * take care to ensure that we only take the zone_physcap_lock mutex when a
+ * zone is transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped" and "zone_decr_capped" functions are used manage
+ * the "zone_pcap_data" array and associated counter.
+ *
+ * The zone_pcap_t structure tracks the zone's physical cap and physical usage
+ * in terms of pages. These values are currently defined as uint32. Thus, the
+ * maximum number of pages we can track is UINT_MAX-1 (4,294,967,294) since
+ * UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
+ * In the future we may need to expand these counters to 64-bit, but for now
+ * we're using 32-bit to conserve memory, since this array is statically
+ * allocated within the kernel based on the maximum number of zones supported.
+ */
+uint_t zone_num_over_cap;
+zone_pcap_t zone_pcap_data[MAX_ZONES];
+static kmutex_t zone_physcap_lock;
+
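
As the comment above describes, the page scanner's consumer-side check is
deliberately lock-free: one read of the global counter, then one read of the
per-zone flag. A sketch of that check (the scanner integration itself is not
part of this hunk):

	/*
	 * Sketch only: the pageout scanner's lock-free test for whether
	 * a given zone's pages should be reclaimed.
	 */
	static boolean_t
	zone_pcap_should_scan(zoneid_t zid)
	{
		if (zone_num_over_cap == 0)
			return (B_FALSE);	/* no zone is over its cap */

		return (zone_pcap_data[zid].zpcap_over != 0);
	}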
+/*
* Certain filesystems (such as NFS and autofs) need to know which zone
* the mount is being placed in. Because of this, we need to be able to
* ensure that a zone isn't in the process of being created/destroyed such
@@ -1822,11 +1871,10 @@ static rctl_qty_t
zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
- zone_t *z = p->p_zone;
+ zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id];
ASSERT(MUTEX_HELD(&p->p_lock));
- /* No additional lock because not enforced in the kernel */
- q = z->zone_phys_mem;
+ q = ptob(zp->zpcap_pg_cnt);
return (q);
}
@@ -1835,11 +1883,30 @@ static int
zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
rctl_qty_t nv)
{
+ zoneid_t zid;
+ uint_t pg_val;
+
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(e->rcep_t == RCENTITY_ZONE);
if (e->rcep_p.zone == NULL)
return (0);
- e->rcep_p.zone->zone_phys_mem_ctl = nv;
+ zid = e->rcep_p.zone->zone_id;
+ if (nv == UINT64_MAX) {
+ pg_val = UINT32_MAX;
+ } else {
+ uint64_t pages = btop(nv);
+
+ /*
+ * Return from RCTLOP_SET is always ignored so just clamp an
+ * out-of-range value to our largest "limited" value.
+ */
+ if (pages >= UINT32_MAX) {
+ pg_val = UINT32_MAX - 1;
+ } else {
+ pg_val = (uint_t)pages;
+ }
+ }
+ zone_pcap_data[zid].zpcap_pg_limit = pg_val;
return (0);
}
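
To make the clamping concrete: with 4 KB pages, a 64 GB rctl value becomes
btop(64 GB) = 16,777,216 pages, well below UINT32_MAX; passing UINT64_MAX
selects the UINT32_MAX "unlimited" sentinel; and any byte count whose page
count reaches UINT32_MAX is clamped to UINT32_MAX - 1, the largest "limited"
value, since the return value of RCTLOP_SET is ignored and an error could not
be reported to the caller anyway.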
@@ -1949,12 +2016,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_kstat_t *zk = ksp->ks_data;
+ zone_pcap_t *zp = &zone_pcap_data[zone->zone_id];
if (rw == KSTAT_WRITE)
return (EACCES);
- zk->zk_usage.value.ui64 = zone->zone_phys_mem;
- zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+ zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit);
return (0);
}
@@ -2172,16 +2240,24 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
+ zone_pcap_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
- zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
- zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+ zp = &zone_pcap_data[zone->zone_id];
+
+ zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit);
zmp->zm_swap.value.ui64 = zone->zone_max_swap;
zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
- zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
- zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
+ zmp->zm_nover.value.ui64 = zp->zpcap_nover;
+#ifndef DEBUG
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out);
+#else
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty +
+ zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty);
+#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
@@ -2427,8 +2503,6 @@ zone_zsd_init(void)
zone0.zone_locked_mem_ctl = UINT64_MAX;
ASSERT(zone0.zone_max_swap == 0);
zone0.zone_max_swap_ctl = UINT64_MAX;
- zone0.zone_phys_mem = 0;
- zone0.zone_phys_mem_ctl = UINT64_MAX;
zone0.zone_max_lofi = 0;
zone0.zone_max_lofi_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
@@ -2770,6 +2844,9 @@ zone_free(zone_t *zone)
*/
cpucaps_zone_remove(zone);
+ /* Clear physical memory capping data. */
+ bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t));
+
ASSERT(zone->zone_cpucap == NULL);
/* remove from deathrow list */
@@ -3020,16 +3097,14 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
* The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
* to provide the physical memory capping kstats. Since physical memory
* capping is currently implemented in userland, that code uses the setattr
- * entry point to increment the kstats. We always simply increment nover
- * every time that setattr is called and we always add in the input value
- * to zone_mcap_pagedout every time that is called.
+ * entry point to increment the kstats. We ignore nover when that setattr is
+ * called and we always add in the input value to zone_mcap_pagedout every
+ * time that is called.
*/
/*ARGSUSED*/
static int
zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
{
- zone->zone_mcap_nover++;
-
return (0);
}
@@ -3039,8 +3114,17 @@ zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
uint64_t pageout;
int err;
- if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
- zone->zone_mcap_pagedout += pageout;
+ if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) {
+ zone_pcap_t *zp = &zone_pcap_data[zone->zone_id];
+ uint64_t pages;
+
+ pages = btop(pageout);
+#ifndef DEBUG
+ atomic_add_64(&zp->zpcap_pg_out, pages);
+#else
+ atomic_add_64(&zp->zpcap_pg_fs, pages);
+#endif
+ }
return (err);
}
@@ -3063,22 +3147,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
return (err);
}
-/*
- * The zone_set_rss function is used to set the zone's RSS when we do the
- * fast, approximate calculation in user-land.
- */
-static int
-zone_set_rss(zone_t *zone, const uint64_t *prss)
-{
- uint64_t rss;
- int err;
-
- if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
- zone->zone_phys_mem = rss;
-
- return (err);
-}
-
static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
@@ -5077,8 +5145,6 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_locked_mem_ctl = UINT64_MAX;
zone->zone_max_swap = 0;
zone->zone_max_swap_ctl = UINT64_MAX;
- zone->zone_phys_mem = 0;
- zone->zone_phys_mem_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
zone->zone_lockedmem_kstat = NULL;
@@ -5091,6 +5157,13 @@ zone_create(const char *zone_name, const char *zone_root,
*/
zone->zone_rctls = NULL;
+ /*
+ * Ensure page count is 0 (in case zoneid has wrapped).
+ * Initialize physical memory cap as unlimited.
+ */
+ zone_pcap_data[zoneid].zpcap_pg_cnt = 0;
+ zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX;
+
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
return (zone_create_error(error, 0, extended_error));
@@ -6228,6 +6301,19 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
bufsize) != 0)
error = EFAULT;
break;
+ case ZONE_ATTR_RSS: {
+ zone_pcap_t *zp = &zone_pcap_data[zone->zone_id];
+ uint64_t phys_mem;
+
+ phys_mem = ptob(zp->zpcap_pg_cnt);
+ size = sizeof (phys_mem);
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL &&
+ copyout(&phys_mem, buf, bufsize) != 0)
+ error = EFAULT;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -6281,8 +6367,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
*/
zone_status = zone_status_get(zone);
if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
- attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
- zone_status > ZONE_IS_READY) {
+ attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -6313,9 +6398,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_PG_FLT_DELAY:
err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
break;
- case ZONE_ATTR_RSS:
- err = zone_set_rss(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
@@ -8074,3 +8156,231 @@ done:
else
return (0);
}
+
+static void
+zone_incr_capped(zoneid_t zid)
+{
+ zone_pcap_t *zp = &zone_pcap_data[zid];
+
+ /* See if over (unlimited is UINT32_MAX), or already marked that way. */
+ if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck setting under mutex */
+ if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) {
+ zp->zpcap_over = 1;
+ zp->zpcap_nover++;
+ zone_num_over_cap++;
+ DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap. Using ~1% of the zone's page limit seems to be a good
+ * quantity. This table shows some various zone memory caps and the number of
+ * pages (assuming a 4k page size). Given this, we choose to shift the page
+ * limit by 7 places to get a hysteresis that is slightly less than 1%.
+ *
+ * cap        pages      pages      1%      shift7  shift7
+ * 128M       32768      0x0008000  327     256     0x00100
+ * 512M       131072     0x0020000  1310    1024    0x00400
+ * 1G         262144     0x0040000  2621    2048    0x00800
+ * 4G         1048576    0x0100000  10485   8192    0x02000
+ * 8G         2097152    0x0200000  20971   16384   0x04000
+ * 16G        4194304    0x0400000  41943   32768   0x08000
+ * 32G        8388608    0x0800000  83886   65536   0x10000
+ * 64G        16777216   0x1000000  167772  131072  0x20000
+ */
+static void
+zone_decr_capped(zoneid_t zid)
+{
+ zone_pcap_t *zp = &zone_pcap_data[zid];
+ uint32_t adjusted_limit;
+
+ /*
+ * See if under, or already marked that way. There is no need to
+ * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX)
+ * since we'll never set zpcap_over in zone_incr_capped().
+ */
+ if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) {
+ return;
+ }
+
+ adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7);
+
+ /* Recheck, accounting for our hysteresis. */
+ if (zp->zpcap_pg_cnt >= adjusted_limit) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck under mutex. */
+ if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) {
+ zp->zpcap_over = 0;
+ ASSERT(zone_num_over_cap > 0);
+ zone_num_over_cap--;
+ DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
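
Worked through for the 1G row of the table above: zpcap_pg_limit is 262,144
pages, so adjusted_limit = 262144 - (262144 >> 7) = 262144 - 2048 = 260,096
pages. With 4 KB pages the zone must therefore fall about 8 MB below its cap
before zpcap_over is cleared, which keeps a single page of jitter around the
cap from toggling the scanner on and off.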
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_pcap_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ ASSERT(!PP_ISFREE(pp));
+
+ zid = curzone->zone_id;
+ if (pp->p_zoneid == zid) {
+ /* Another mapping to this page for this zone, do nothing */
+ return;
+ }
+
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = page_get_pagecnt(pp->p_szc);
+ }
+
+ if (pp->p_share == 0) {
+ /* First mapping to this page. */
+ pp->p_zoneid = zid;
+ zp = &zone_pcap_data[zid];
+ ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt);
+ zone_incr_capped(zid);
+ return;
+ }
+
+ if (pp->p_zoneid != ALL_ZONES) {
+ /*
+ * The page is now being shared across a different zone.
+ * Decrement the original zone's usage.
+ */
+ zid = pp->p_zoneid;
+ pp->p_zoneid = ALL_ZONES;
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pcap_data[zid];
+
+ if (zp->zpcap_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ }
+}
+
+void
+zone_rm_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_pcap_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ zid = pp->p_zoneid;
+ if (zid == ALL_ZONES || pp->p_share != 0)
+ return;
+
+ /* This is the last mapping to the page for a zone. */
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = (int64_t)page_get_pagecnt(pp->p_szc);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pcap_data[zid];
+ if (zp->zpcap_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ pp->p_zoneid = ALL_ZONES;
+}
+
+void
+zone_pageout_stat(int zid, zone_pageout_op_t op)
+{
+ zone_pcap_t *zp;
+
+ if (zid == ALL_ZONES)
+ return;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pcap_data[zid];
+
+#ifndef DEBUG
+ atomic_add_64(&zp->zpcap_pg_out, 1);
+#else
+ switch (op) {
+ case ZPO_DIRTY:
+ atomic_add_64(&zp->zpcap_pg_fsdirty, 1);
+ break;
+ case ZPO_FS:
+ atomic_add_64(&zp->zpcap_pg_fs, 1);
+ break;
+ case ZPO_ANON:
+ atomic_add_64(&zp->zpcap_pg_anon, 1);
+ break;
+ case ZPO_ANONDIRTY:
+ atomic_add_64(&zp->zpcap_pg_anondirty, 1);
+ break;
+ default:
+ cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
+ break;
+ }
+#endif
+}
+
+/*
+ * Return the zone's physical memory cap and current free memory (in pages).
+ */
+void
+zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
+{
+ zone_pcap_t *zp;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pcap_data[zid];
+
+ /*
+ * If memory or swap limits are set on the zone, use those, otherwise
+ * use the system values. physmem and freemem are also in pages.
+ */
+ if (zp->zpcap_pg_limit == UINT32_MAX) {
+ *memcap = physmem;
+ *free = freemem;
+ } else {
+ int64_t freemem;
+
+ *memcap = (pgcnt_t)zp->zpcap_pg_limit;
+ freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt;
+ if (freemem > 0) {
+ *free = (pgcnt_t)freemem;
+ } else {
+ *free = (pgcnt_t)0;
+ }
+ }
+}
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3bf7979174..a08ef59959 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -22,7 +22,7 @@
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
*/
#ifndef _SYS_ZONE_H
@@ -51,15 +51,27 @@ extern "C" {
* NOTE
*
* The contents of this file are private to the implementation of
- * Solaris and are subject to change at any time without notice.
+ * illumos and are subject to change at any time without notice.
* Applications and drivers using these interfaces may fail to
* run on future releases.
*/
/* Available both in kernel and for user space */
-/* zone id restrictions and special ids */
-#define MAX_ZONEID 9999
+/*
+ * zone id restrictions and special ids.
+ * See 'maxzones' for run-time zone limit.
+ *
+ * The current 8k value for MAX_ZONES was originally derived from the virtual
+ * interface limit in IP when "shared-stack" was the only supported networking
+ * for zones. The virtual interface limit is the number of addresses allowed
+ * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k
+ * zone limit is still a reasonable choice at this time, given other limits
+ * within the kernel. Since we only support 8192 zones (which includes GZ),
+ * there is no point in allowing MAX_ZONEID > 8k.
+ */
+#define MAX_ZONES 8192
+#define MAX_ZONEID (MAX_ZONES - 1)
#define MIN_USERZONEID 1 /* lowest user-creatable zone ID */
#define MIN_ZONEID 0 /* minimum zone ID on system */
#define GLOBAL_ZONEID 0
@@ -563,7 +575,6 @@ typedef struct zone {
int zone_init_status; /* init's exit status */
int zone_boot_err; /* for zone_boot() if boot fails */
char *zone_bootargs; /* arguments passed via zone_boot() */
- rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */
/*
* zone_kthreads is protected by zone_status_lock.
*/
@@ -647,7 +658,7 @@ typedef struct zone {
zone_zfs_kstat_t *zone_zfs_stats;
/*
- * Solaris Auditing per-zone audit context
+ * illumos Auditing per-zone audit context
*/
struct au_kcontext *zone_audit_kctxt;
/*
@@ -667,11 +678,8 @@ typedef struct zone {
/*
* kstats and counters for physical memory capping.
*/
- rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */
kstat_t *zone_physmem_kstat;
- uint64_t zone_mcap_nover; /* # of times over phys. cap */
- uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */
- kmutex_t zone_mcap_lock; /* protects mcap statistics */
+ kmutex_t zone_mcap_lock; /* protects mcap statistics */
kstat_t *zone_mcap_ksp;
zone_mcap_kstat_t *zone_mcap_stats;
uint64_t zone_pgpgin; /* pages paged in */
@@ -739,6 +747,30 @@ typedef struct zone {
kmutex_t zone_mount_lock;
} zone_t;
+/* zpcap_over is treated as a boolean but is 32 bits for alignment. */
+typedef struct zone_pcap {
+ uint32_t zpcap_over; /* currently over cap */
+ uint32_t zpcap_pg_cnt; /* current RSS in pages */
+ uint32_t zpcap_pg_limit; /* current RSS limit in pages */
+ uint32_t zpcap_nover; /* # of times over phys. cap */
+#ifndef DEBUG
+ uint64_t zpcap_pg_out; /* # pages flushed */
+#else
+ /*
+ * To conserve memory, detailed pageout stats are only kept for DEBUG
+ * builds.
+ */
+ uint64_t zpcap_pg_anon; /* # clean anon pages flushed */
+ uint64_t zpcap_pg_anondirty; /* # dirty anon pages flushed */
+ uint64_t zpcap_pg_fs; /* # clean fs pages flushed */
+ uint64_t zpcap_pg_fsdirty; /* # dirty fs pages flushed */
+#endif
+} zone_pcap_t;
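
Assuming natural alignment, the non-DEBUG layout is four 32-bit fields plus
one 64-bit counter, 24 bytes per zone, so the statically allocated
zone_pcap_data[MAX_ZONES] array costs 8192 * 24 = 192 KiB; the DEBUG variant's
four 64-bit counters raise that to 48 bytes per zone, or 384 KiB.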
+
+typedef enum zone_pageout_op {
+ ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY
+} zone_pageout_op_t;
+
/*
* Special value of zone_psetid to indicate that pools are disabled.
*/
@@ -963,6 +995,16 @@ extern void mount_completed(zone_t *);
extern int zone_walk(int (*)(zone_t *, void *), void *);
+struct page;
+extern void zone_add_page(struct page *);
+extern void zone_rm_page(struct page *);
+extern void zone_pageout_stat(int, zone_pageout_op_t);
+extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *);
+
+/* Interfaces for page scanning */
+extern uint_t zone_num_over_cap;
+extern zone_pcap_t zone_pcap_data[MAX_ZONES];
+
extern rctl_hndl_t rc_zone_locked_mem;
extern rctl_hndl_t rc_zone_max_swap;
extern rctl_hndl_t rc_zone_phys_mem;
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 92daeed703..e09f4e85a2 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -171,41 +171,29 @@ sysconfig(int which)
/*
* If the non-global zone has a phys. memory cap, use that.
* We always report the system-wide value for the global zone,
- * even though rcapd can be used on the global zone too.
+ * even though memory capping can be used on the global zone
+ * too.
*/
- if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX)
- return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl),
- physinstalled));
+ if (!INGLOBALZONE(curproc)) {
+ pgcnt_t cap, free;
+
+ zone_get_physmem_data(curzone->zone_id, &cap, &free);
+ return (MIN(cap, physinstalled));
+ }
return (physinstalled);
case _CONFIG_AVPHYS_PAGES:
/*
- * If the non-global zone has a phys. memory cap, use
- * the phys. memory cap - zone's rss. We always
- * report the system-wide value for the global zone, even
- * though memory capping can be used on the global zone too.
- * We use the cached value for the RSS since vm_getusage()
- * is so expensive and we don't need this value to be exact.
+ * If the non-global zone has a phys. memory cap, use its
+ * free value. We always report the system-wide value for the
+ * global zone, even though memory capping can be used on the
+ * global zone too.
*/
- if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) {
- pgcnt_t cap, rss, free;
-
- cap = btop(curproc->p_zone->zone_phys_mem_ctl);
- if (cap > physinstalled)
- return (freemem);
-
- rss = btop(curproc->p_zone->zone_phys_mem);
- /*
- * Because this is a soft cap, it is possible
- * for rss to be temporarily over the cap.
- */
- if (cap > rss)
- free = cap - rss;
- else
- free = 0;
+ if (!INGLOBALZONE(curproc)) {
+ pgcnt_t cap, free;
+
+ zone_get_physmem_data(curzone->zone_id, &cap, &free);
return (MIN(free, freemem));
}
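
From inside a capped non-global zone, the effect of this change is visible
through the standard sysconf(3C) interface; a minimal sketch:

	#include <unistd.h>
	#include <stdio.h>

	/*
	 * In a capped zone these now report the zone's physical memory
	 * cap and cap minus RSS, each still bounded by the system-wide
	 * physinstalled/freemem values.
	 */
	int
	main(void)
	{
		long phys = sysconf(_SC_PHYS_PAGES);
		long avail = sysconf(_SC_AVPHYS_PAGES);
		long pgsz = sysconf(_SC_PAGESIZE);

		(void) printf("phys: %ld MB, avail: %ld MB\n",
		    phys * pgsz / (1024 * 1024),
		    avail * pgsz / (1024 * 1024));
		return (0);
	}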
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
* p_nrm
* p_mapping
* p_share
+ * p_zoneid
*
* The following field is file system dependent. How it is used and
* the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
pfn_t p_pagenum; /* physical page number */
uint_t p_share; /* number of translations */
-#if defined(_LP64)
- uint_t p_sharepad; /* pad for growing p_share */
-#endif
+ short p_zoneid; /* zone page use tracking */
+ short p_pad1; /* TBD */
uint_t p_slckcnt; /* number of softlocks */
#if defined(__sparc)
uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 1d5ef71e3e..91296e9c8d 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
*/
/*
@@ -1792,28 +1792,6 @@ vmu_cache_rele(vmu_cache_t *cache)
}
/*
- * When new data is calculated, update the phys_mem rctl usage value in the
- * zones.
- */
-static void
-vmu_update_zone_rctls(vmu_cache_t *cache)
-{
- vmusage_t *rp;
- size_t i = 0;
- zone_t *zp;
-
- for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
- if (rp->vmu_type == VMUSAGE_ZONE &&
- rp->vmu_zoneid != ALL_ZONES) {
- if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
- zp->zone_phys_mem = rp->vmu_rss_all;
- zone_rele(zp);
- }
- }
- }
-}
-
-/*
* Copy out the cached results to a caller. Inspect the callers flags
* and zone to determine which cached results should be copied.
*/
@@ -2112,8 +2090,6 @@ start:
mutex_exit(&vmu_data.vmu_lock);
- /* update zone's phys. mem. rctl usage */
- vmu_update_zone_rctls(cache);
/* copy cache */
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
req_zone_id, cpflg);
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index ba90b6627d..bfe8c2486b 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -2611,6 +2611,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum)
pp->p_mapping = NULL;
pp->p_embed = 0;
pp->p_share = 0;
+ pp->p_zoneid = ALL_ZONES;
pp->p_mlentry = 0;
}
diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c
index d00d756828..079f64e92e 100644
--- a/usr/src/uts/i86pc/vm/hment.c
+++ b/usr/src/uts/i86pc/vm/hment.c
@@ -21,10 +21,9 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
@@ -37,6 +36,7 @@
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/avl.h>
+#include <sys/zone.h>
/*
@@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp)
((hment_t *)pp->p_mapping)->hm_prev = hm;
pp->p_mapping = hm;
+ zone_add_page(pp);
+
/*
* Add the hment to the system-wide hash table.
*/
@@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
pp->p_embed = 1;
pp->p_mapping = htable;
pp->p_mlentry = entry;
+ zone_add_page(pp);
return;
}
@@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
pp->p_mapping = NULL;
pp->p_mlentry = 0;
pp->p_embed = 0;
+ zone_rm_page(pp);
return (NULL);
}
@@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
hm->hm_hashlink = null_avl_link;
hm->hm_next = NULL;
hm->hm_prev = NULL;
+ zone_rm_page(pp);
return (hm);
}
diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
index b7539c828c..2ef3ea20e8 100644
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
@@ -24,6 +24,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2016 Gary Mills
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -85,6 +86,7 @@
#include <sys/fpu/fpusystm.h>
#include <vm/mach_kpm.h>
#include <sys/callb.h>
+#include <sys/zone.h>
#ifdef DEBUG
#define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
@@ -933,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
} \
pp->p_mapping = hme; \
pp->p_share++; \
+ zone_add_page(pp); \
}
/*
@@ -953,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
\
ASSERT(pp->p_share > 0); \
pp->p_share--; \
+ zone_rm_page(pp); \
\
if (hme->hme_prev) { \
ASSERT(pp->p_mapping != hme); \
@@ -7350,6 +7354,8 @@ retry:
tpp->p_mapping = NULL;
dpp->p_share = tpp->p_share;
tpp->p_share = 0;
+ dpp->p_zoneid = tpp->p_zoneid;
+ tpp->p_zoneid = ALL_ZONES;
while (index != 0) {
index = index >> 1;