diff options
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-28 12:56:20 +0000 |
|---|---|---|
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-28 12:57:39 +0000 |
| commit | 5ae84a5233b723c890288b775cb5317db2e54d61 (patch) | |
| tree | f28877eae0fdbba8a58726efc3e464145fda1766 /usr/src | |
| parent | 597e7db3df75fb2976c1c29ef285e8bc6c289b4d (diff) | |
| download | illumos-joyent-5ae84a5233b723c890288b775cb5317db2e54d61.tar.gz | |
OS-6306 accurate in-kernel zone RSS tracking
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
| -rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 236 | ||||
| -rw-r--r-- | usr/src/uts/common/os/zone.c | 236 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/zone.h | 30 | ||||
| -rw-r--r-- | usr/src/uts/common/syscall/sysconfig.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/vm/vm_usage.c | 26 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/os/startup.c | 3 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/vm/hment.c | 9 | ||||
| -rw-r--r-- | usr/src/uts/sfmmu/vm/hat_sfmmu.c | 6 |
9 files changed, 272 insertions, 285 deletions
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c index 88b72b6c55..d280c49b5b 100644 --- a/usr/src/cmd/zoneadmd/mcap.c +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -28,29 +28,18 @@ * the associated zone's physical memory. A thread to do this is started * when the zone boots and is halted when the zone shuts down. * - * Because of the way that the VM system is currently implemented, there is no - * way to go from the bottom up (page to process to zone). Thus, there is no - * obvious way to hook an rctl into the kernel's paging code to enforce a hard - * memory cap. Instead, we implement a soft physical memory cap which looks - * at the zone's overall rss and once it is over the cap, works from the top - * down (zone to process to page), looking at zone processes, to determine - * what to try to pageout to get the zone under its memory cap. - * - * The code uses the fast, cheap, but potentially very inaccurate sum of the - * rss values from psinfo_t to first approximate the zone's rss and will - * fallback to the vm_getusage syscall to determine the zone's rss if needed. + * The code obtains the accurate in-kernel RSS for the zone. * It then checks the rss against the zone's zone.max-physical-memory rctl. * Once the zone goes over its cap, then this thread will work through the * zone's /proc process list, Pgrab-bing each process and stepping through the - * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) - * to pageout pages, until the zone is again under its cap. + * address space segments, using a private option (_RUSAGESYS_INVALMAP) to the + * private SYS_rusagesys syscall to attempt to unload page translations, until + * the zone is again under its cap. * * Although zone memory capping is implemented as a soft cap by this user-level * thread, the interfaces around memory caps that are exposed to the user are * the standard ones; an rctl and kstats. 
This thread uses the rctl value - * to obtain the cap and works with the zone kernel code to update the kstats. - * If the implementation ever moves into the kernel, these exposed interfaces - * do not need to change. + * to obtain the cap. * * The thread adaptively sleeps, periodically checking the state of the * zone. As the zone's rss gets closer to the cap, the thread will wake up @@ -129,14 +118,6 @@ #define TUNE_NPAGE "phys-mcap-no-pageout" #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" -/* - * These are only used in get_mem_info but global. We always need scale_rss and - * prev_fast_rss to be persistent but we also have the other two global so we - * can easily see these with mdb. - */ -uint64_t scale_rss = 0; -uint64_t prev_fast_rss = 0; -uint64_t fast_rss = 0; uint64_t accurate_rss = 0; /* @@ -160,8 +141,6 @@ static boolean_t skip_vmusage = B_FALSE; static boolean_t skip_pageout = B_FALSE; static boolean_t skip_pf_throttle = B_FALSE; -static zlog_t *logp; - static int64_t check_suspend(); static void get_mcap_tunables(); @@ -535,127 +514,12 @@ done: static uint64_t get_mem_info() { - uint64_t n = 1; - zsd_vmusage64_t buf; - uint64_t tmp_rss; - DIR *pdir = NULL; - struct dirent *dent; - - /* - * Start by doing the fast, cheap RSS calculation using the rss value - * in psinfo_t. Because that's per-process, it can lead to double - * counting some memory and overestimating how much is being used, but - * as long as that's not over the cap, then we don't need do the - * expensive calculation. - * - * If we have to do the expensive calculation, we remember the scaling - * factor so that we can try to use that on subsequent iterations for - * the fast rss. 
- */ - if (shutting_down) - return (0); - - if ((pdir = opendir(zoneproc)) == NULL) - return (0); - - accurate_rss = 0; - fast_rss = 0; - while (!shutting_down && (dent = readdir(pdir)) != NULL) { - pid_t pid; - int psfd; - int64_t rss; - char pathbuf[MAXPATHLEN]; - psinfo_t psinfo; - - if (strcmp(".", dent->d_name) == 0 || - strcmp("..", dent->d_name) == 0) - continue; - - pid = atoi(dent->d_name); - if (pid == 0 || pid == 1) - continue; - - (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", - zoneproc, pid); - - rss = 0; - if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { - if (pread(psfd, &psinfo, sizeof (psinfo), 0) == - sizeof (psinfo)) - rss = (int64_t)psinfo.pr_rssize; - - (void) close(psfd); - } - - fast_rss += rss; - } - - (void) closedir(pdir); - if (shutting_down) return (0); - debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss, - scale_rss, prev_fast_rss); - - /* see if we can get by with a scaled fast rss */ - tmp_rss = fast_rss; - if (scale_rss > 1 && prev_fast_rss > 0) { - /* - * Only scale the fast value if it hasn't ballooned too much - * to trust. - */ - if (fast_rss / prev_fast_rss < 2) { - fast_rss /= scale_rss; - debug("scaled fast rss: %lluKB\n", fast_rss); - } - } - - if (fast_rss <= zone_rss_cap || skip_vmusage) { - uint64_t zone_rss_bytes; - - zone_rss_bytes = fast_rss * 1024; - /* Use the zone's approx. 
RSS in the kernel */ - (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); - return (fast_rss); - } - - buf.vmu_id = zid; - - /* get accurate usage (cached data may be up to 5 seconds old) */ - if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5, - (uintptr_t)&buf, (uintptr_t)&n) != 0) { - debug("vmusage failed\n"); - (void) sleep_shutdown(1); - return (0); - } - - if (n > 1) { - /* This should never happen */ - debug("vmusage returned more than one result\n"); - (void) sleep_shutdown(1); - return (0); - } - - if (buf.vmu_id != zid) { - /* This should never happen */ - debug("vmusage returned the incorrect zone\n"); - (void) sleep_shutdown(1); - return (0); - } - - accurate_rss = buf.vmu_rss_all / 1024; - - /* calculate scaling factor to use for fast_rss from now on */ - if (accurate_rss > 0) { - scale_rss = fast_rss / accurate_rss; - debug("new scaling factor: %llu\n", scale_rss); - /* remember the fast rss when we had to get the accurate rss */ - prev_fast_rss = tmp_rss; - } - - debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss, - scale_rss, prev_fast_rss); + (void) zone_getattr(zid, ZONE_ATTR_RSS, &accurate_rss, + sizeof (accurate_rss)); + accurate_rss /= 1024; return (accurate_rss); } @@ -988,75 +852,6 @@ has_proc() } /* - * We run this loop for brands with no /proc to simply update the RSS, using - * the cheap GZ /proc data, every 5 minutes. - */ -static void -no_procfs() -{ - DIR *pdir = NULL; - struct dirent *dent; - uint64_t zone_rss_bytes; - - (void) sleep_shutdown(30); - while (!shutting_down) { - /* - * Just do the fast, cheap RSS calculation using the rss value - * in psinfo_t. Because that's per-process, it can lead to - * double counting some memory and overestimating how much is - * being used. Since there is no /proc in the zone, we use the - * GZ /proc and check for the correct zone. 
- */ - if ((pdir = opendir("/proc")) == NULL) - return; - - fast_rss = 0; - while (!shutting_down && (dent = readdir(pdir)) != NULL) { - pid_t pid; - int psfd; - int64_t rss; - char pathbuf[MAXPATHLEN]; - psinfo_t psinfo; - - if (strcmp(".", dent->d_name) == 0 || - strcmp("..", dent->d_name) == 0) - continue; - - pid = atoi(dent->d_name); - if (pid == 0 || pid == 1) - continue; - - (void) snprintf(pathbuf, sizeof (pathbuf), - "/proc/%d/psinfo", pid); - - rss = 0; - if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { - if (pread(psfd, &psinfo, sizeof (psinfo), 0) == - sizeof (psinfo)) { - if (psinfo.pr_zoneid == zid) - rss = (int64_t)psinfo.pr_rssize; - } - - (void) close(psfd); - } - - fast_rss += rss; - } - - (void) closedir(pdir); - - if (shutting_down) - return; - - zone_rss_bytes = fast_rss * 1024; - /* Use the zone's approx. RSS in the kernel */ - (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); - - (void) sleep_shutdown(300); - } -} - -/* * Thread that checks zone's memory usage and when over the cap, goes through * the zone's process list trying to pageout processes to get under the cap. */ @@ -1066,21 +861,17 @@ mcap_zone() DIR *pdir = NULL; int64_t excess; - debug("thread startup\n"); - - get_mcap_tunables(); - /* - * If the zone has no /proc filesystem, we can't use the fast algorithm - * to check RSS or pageout any processes. All we can do is periodically - * update it's RSS kstat using the expensive sycall. + * If the zone has no /proc filesystem (e.g. KVM), we can't pageout any + * processes. Terminate this thread. */ if (!has_proc()) { - no_procfs(); - debug("thread shutdown\n"); return; } + debug("thread startup\n"); + get_mcap_tunables(); + /* * When first starting it is likely lots of other zones are starting * too because the system is booting. 
Since we just started the zone @@ -1172,7 +963,6 @@ create_mcap_thread(zlog_t *zlogp, zoneid_t id) shutting_down = 0; zid = id; - logp = zlogp; /* all but the lx brand currently use /proc */ if (strcmp(brand_name, "lx") == 0) { diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 2912df0a29..1e5db33066 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -328,8 +328,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. */ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -3063,22 +3063,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) return (err); } -/* - * The zone_set_rss function is used to set the zone's RSS when we do the - * fast, approximate calculation in user-land. 
- */ -static int -zone_set_rss(zone_t *zone, const uint64_t *prss) -{ - uint64_t rss; - int err; - - if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) - zone->zone_phys_mem = rss; - - return (err); -} - static int zone_set_sched_class(zone_t *zone, const char *new_class) { @@ -6228,6 +6212,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) bufsize) != 0) error = EFAULT; break; + case ZONE_ATTR_RSS: + size = sizeof (zone->zone_phys_mem); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mem, buf, bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -6281,8 +6273,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) */ zone_status = zone_status_get(zone); if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && - attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && - zone_status > ZONE_IS_READY) { + attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -6313,9 +6304,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_PG_FLT_DELAY: err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); break; - case ZONE_ATTR_RSS: - err = zone_set_rss(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; @@ -8074,3 +8062,205 @@ done: else return (0); } + +/* + * "zone_over_cap" is an array indexed by zoneid, indicating which zones are + * over their physical memory cap. This is the interface for the page scanner + * to use when reclaiming pages for zones that are over their cap. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_over_cap" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. 
+ * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's entry in the array. The scanner should never modify either of + * these items. Internally the entries and the counter are managed with the + * "zone_physcap_lock" mutex as we add/remove mappings to pages. We take care + * to ensure that we only take the zone_physcap_lock mutex when a zone is + * transitioning over/under its physical memory cap. + * + * The "zone_incr_capped", "zone_decr_capped" and "zone_clr_capped" functions + * are used to manage the "zone_over_cap" array and associated counter. + */ +uint8_t zone_over_cap[MAX_ZONES]; +uint_t zone_num_over_cap; +static kmutex_t zone_physcap_lock; + +static void +zone_incr_capped(zone_t *zone) +{ + /* See if over (unlimited is UINT64_MAX), or already marked that way. */ + if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl || + zone_over_cap[zone->zone_id] == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zone->zone_phys_mem > zone->zone_phys_mem_ctl && + zone_over_cap[zone->zone_id] == 0) { + zone_over_cap[zone->zone_id] = 1; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zone_t *, zone); + } + mutex_exit(&zone_physcap_lock); +} + +static void +zone_decr_capped(zone_t *zone) +{ + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zone_phys_mem_ctl == UINT64_MAX) + * since we'll never add the zone in zone_incr_capped(). 
+ */ + if (zone_over_cap[zone->zone_id] == 0 || + zone->zone_phys_mem >= zone->zone_phys_mem_ctl) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zone->zone_phys_mem < zone->zone_phys_mem_ctl && + zone_over_cap[zone->zone_id] == 1) { + ASSERT(zone_num_over_cap > 0); + zone_over_cap[zone->zone_id] = 0; + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zone_t *, zone); + } + mutex_exit(&zone_physcap_lock); +} + +/* Clear out an entry for a zone which no longer exists. */ +static void +zone_clr_capped(zoneid_t zid) +{ + if (zone_over_cap[zid] == 0) + return; + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zone_over_cap[zid] == 1) { + ASSERT(zone_num_over_cap > 0); + zone_over_cap[zid] = 0; + zone_num_over_cap--; + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + int64_t psize; + zone_t *zone; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zone = curzone; + if (pp->p_zoneid == zone->zone_id) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + psize = (int64_t)PAGESIZE; + } else { + /* large page */ + psize = (int64_t)page_get_pagesize(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zone->zone_id; + atomic_add_64((uint64_t *)&zone->zone_phys_mem, psize); + zone_incr_capped(zone); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. 
+ */ + zoneid_t id; + + id = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + if ((zone = zone_find_by_id(id)) == NULL) { + /* + * Perhaps the zone has halted but since we have the + * page locked down, the page hasn't been freed yet. + * In any case, there is no zone RSS to update. + */ + zone_clr_capped(id); + return; + } + + atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize); + if ((int64_t)zone->zone_phys_mem < 0) { + DTRACE_PROBE1(zone__ap__neg, zoneid_t, id); + cmn_err(CE_WARN, "zone %d: RSS negative", id); + zone->zone_phys_mem = 0; + } + zone_decr_capped(zone); + zone_rele(zone); + } +} + +void +zone_rm_page(page_t *pp) +{ + zone_t *zone; + boolean_t do_rele = B_FALSE; + int64_t psize; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + if (pp->p_zoneid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. */ + if (pp->p_szc == 0) { + psize = (int64_t)PAGESIZE; + } else { + /* large page */ + psize = (int64_t)page_get_pagesize(pp->p_szc); + } + + if (pp->p_zoneid == curzone->zone_id) { + zone = curzone; + } else if ((zone = zone_find_by_id(pp->p_zoneid)) != NULL) { + do_rele = B_TRUE; + } + + if (zone != NULL) { + atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize); + if ((int64_t)zone->zone_phys_mem < 0) { + DTRACE_PROBE1(zone__rp__neg, zoneid_t, zone->zone_id); + cmn_err(CE_WARN, "zone %d: RSS negative", + zone->zone_id); + zone->zone_phys_mem = 0; + } + zone_decr_capped(zone); + if (do_rele) + zone_rele(zone); + } else { + zone_clr_capped(pp->p_zoneid); + } + pp->p_zoneid = ALL_ZONES; +} diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 3bf7979174..32b2b7bf38 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -22,7 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2016, Joyent, Inc. + * Copyright 2017, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -51,15 +51,27 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define MAX_ZONEID 9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
+ */ +#define MAX_ZONES 8192 +#define MAX_ZONEID (MAX_ZONES - 1) #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -647,7 +659,7 @@ typedef struct zone { zone_zfs_kstat_t *zone_zfs_stats; /* - * Solaris Auditing per-zone audit context + * illumos Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -963,6 +975,14 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern uint8_t zone_over_cap[MAX_ZONES]; + extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; extern rctl_hndl_t rc_zone_phys_mem; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 92daeed703..fc38f8770c 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,8 +186,6 @@ sysconfig(int which) * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even * though memory capping can be used on the global zone too. - * We use the cached value for the RSS since vm_getusage() - * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 8747b96acc..ae9b0be758 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -229,6 +230,7 @@ struct as; * p_nrm * p_mapping * p_share + * p_zoneid * * The following field is file system dependent. How it is used and * the locking strategies applied are up to the individual file system @@ -527,9 +529,8 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ -#if defined(_LP64) - uint_t p_sharepad; /* pad for growing p_share */ -#endif + short p_zoneid; /* zone page use tracking */ + short p_pad1; /* TBD */ uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 1d5ef71e3e..91296e9c8d 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2016, Joyent, Inc. + * Copyright 2017, Joyent, Inc. */ /* @@ -1792,28 +1792,6 @@ vmu_cache_rele(vmu_cache_t *cache) } /* - * When new data is calculated, update the phys_mem rctl usage value in the - * zones. - */ -static void -vmu_update_zone_rctls(vmu_cache_t *cache) -{ - vmusage_t *rp; - size_t i = 0; - zone_t *zp; - - for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { - if (rp->vmu_type == VMUSAGE_ZONE && - rp->vmu_zoneid != ALL_ZONES) { - if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { - zp->zone_phys_mem = rp->vmu_rss_all; - zone_rele(zp); - } - } - } -} - -/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ @@ -2112,8 +2090,6 @@ start: mutex_exit(&vmu_data.vmu_lock); - /* update zone's phys. mem. 
rctl usage */ - vmu_update_zone_rctls(cache); /* copy cache */ ret = vmu_copyout_results(cache, buf, nres, flags_orig, req_zone_id, cpflg); diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 16c683d993..8a6751000b 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -22,7 +22,7 @@ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2015 by Delphix. All rights reserved. */ /* @@ -2603,6 +2603,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; + pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index d00d756828..079f64e92e 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -37,6 +36,7 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> +#include <sys/zone.h> /* @@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; + zone_add_page(pp); + /* * Add the hment to the system-wide hash table. 
*/ @@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; + zone_add_page(pp); return; } @@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; + zone_rm_page(pp); return (NULL); } @@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; + zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c index b7539c828c..2ef3ea20e8 100644 --- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c @@ -24,6 +24,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 Gary Mills + * Copyright 2017 Joyent, Inc. */ /* @@ -85,6 +86,7 @@ #include <sys/fpu/fpusystm.h> #include <vm/mach_kpm.h> #include <sys/callb.h> +#include <sys/zone.h> #ifdef DEBUG #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ @@ -933,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { } \ pp->p_mapping = hme; \ pp->p_share++; \ + zone_add_page(pp); \ } /* @@ -953,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { \ ASSERT(pp->p_share > 0); \ pp->p_share--; \ + zone_rm_page(pp); \ \ if (hme->hme_prev) { \ ASSERT(pp->p_mapping != hme); \ @@ -7350,6 +7354,8 @@ retry: tpp->p_mapping = NULL; dpp->p_share = tpp->p_share; tpp->p_share = 0; + dpp->p_zoneid = tpp->p_zoneid; + tpp->p_zoneid = ALL_ZONES; while (index != 0) { index = index >> 1; |
