diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-29 15:07:06 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-29 15:07:06 +0000 |
commit | ffeae41cb1d13f7eed5a29287a0b7bbac7edc7b5 (patch) | |
tree | e17ea3d0ac0be23167b93e139add1d37231a6993 /usr | |
parent | 917c950a6b639832805b114bfefaa03d982e59f3 (diff) | |
download | illumos-joyent-ffeae41cb1d13f7eed5a29287a0b7bbac7edc7b5.tar.gz |
Revert "OS-6306 accurate in-kernel zone RSS tracking" [needs more work]
This reverts commit 5ae84a5233b723c890288b775cb5317db2e54d61.
Diffstat (limited to 'usr')
-rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 236 | ||||
-rw-r--r-- | usr/src/uts/common/os/zone.c | 236 | ||||
-rw-r--r-- | usr/src/uts/common/sys/zone.h | 30 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/sysconfig.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_usage.c | 26 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/startup.c | 3 | ||||
-rw-r--r-- | usr/src/uts/i86pc/vm/hment.c | 9 | ||||
-rw-r--r-- | usr/src/uts/sfmmu/vm/hat_sfmmu.c | 6 |
9 files changed, 285 insertions, 272 deletions
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c index d280c49b5b..88b72b6c55 100644 --- a/usr/src/cmd/zoneadmd/mcap.c +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -28,18 +28,29 @@ * the associated zone's physical memory. A thread to do this is started * when the zone boots and is halted when the zone shuts down. * - * The code obtains the accurate in-kernel RSS for the zone. + * Because of the way that the VM system is currently implemented, there is no + * way to go from the bottom up (page to process to zone). Thus, there is no + * obvious way to hook an rctl into the kernel's paging code to enforce a hard + * memory cap. Instead, we implement a soft physical memory cap which looks + * at the zone's overall rss and once it is over the cap, works from the top + * down (zone to process to page), looking at zone processes, to determine + * what to try to pageout to get the zone under its memory cap. + * + * The code uses the fast, cheap, but potentially very inaccurate sum of the + * rss values from psinfo_t to first approximate the zone's rss and will + * fallback to the vm_getusage syscall to determine the zone's rss if needed. * It then checks the rss against the zone's zone.max-physical-memory rctl. * Once the zone goes over its cap, then this thread will work through the * zone's /proc process list, Pgrab-bing each process and stepping through the - * address space segments, using a private option (_RUSAGESYS_INVALMAP) to the - * private SYS_rusagesys syscall to attempt to unload page translations, until - * the zone is again under its cap. + * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) + * to pageout pages, until the zone is again under its cap. * * Although zone memory capping is implemented as a soft cap by this user-level * thread, the interfaces around memory caps that are exposed to the user are * the standard ones; an rctl and kstats. This thread uses the rctl value - * to obtain the cap. + * to obtain the cap and works with the zone kernel code to update the kstats. + * If the implementation ever moves into the kernel, these exposed interfaces + * do not need to change. * * The thread adaptively sleeps, periodically checking the state of the * zone. As the zone's rss gets closer to the cap, the thread will wake up @@ -118,6 +129,14 @@ #define TUNE_NPAGE "phys-mcap-no-pageout" #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" +/* + * These are only used in get_mem_info but global. We always need scale_rss and + * prev_fast_rss to be persistent but we also have the other two global so we + * can easily see these with mdb. + */ +uint64_t scale_rss = 0; +uint64_t prev_fast_rss = 0; +uint64_t fast_rss = 0; uint64_t accurate_rss = 0; /* @@ -141,6 +160,8 @@ static boolean_t skip_vmusage = B_FALSE; static boolean_t skip_pageout = B_FALSE; static boolean_t skip_pf_throttle = B_FALSE; +static zlog_t *logp; + static int64_t check_suspend(); static void get_mcap_tunables(); @@ -514,12 +535,127 @@ done: static uint64_t get_mem_info() { + uint64_t n = 1; + zsd_vmusage64_t buf; + uint64_t tmp_rss; + DIR *pdir = NULL; + struct dirent *dent; + + /* + * Start by doing the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to double + * counting some memory and overestimating how much is being used, but + * as long as that's not over the cap, then we don't need do the + * expensive calculation. + * + * If we have to do the expensive calculation, we remember the scaling + * factor so that we can try to use that on subsequent iterations for + * the fast rss. + */ + if (shutting_down) + return (0); + + if ((pdir = opendir(zoneproc)) == NULL) + return (0); + + accurate_rss = 0; + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", + zoneproc, pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) + rss = (int64_t)psinfo.pr_rssize; + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + if (shutting_down) return (0); - (void) zone_getattr(zid, ZONE_ATTR_RSS, &accurate_rss, - sizeof (accurate_rss)); - accurate_rss /= 1024; + debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss, + scale_rss, prev_fast_rss); + + /* see if we can get by with a scaled fast rss */ + tmp_rss = fast_rss; + if (scale_rss > 1 && prev_fast_rss > 0) { + /* + * Only scale the fast value if it hasn't ballooned too much + * to trust. + */ + if (fast_rss / prev_fast_rss < 2) { + fast_rss /= scale_rss; + debug("scaled fast rss: %lluKB\n", fast_rss); + } + } + + if (fast_rss <= zone_rss_cap || skip_vmusage) { + uint64_t zone_rss_bytes; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + return (fast_rss); + } + + buf.vmu_id = zid; + + /* get accurate usage (cached data may be up to 5 seconds old) */ + if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5, + (uintptr_t)&buf, (uintptr_t)&n) != 0) { + debug("vmusage failed\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (n > 1) { + /* This should never happen */ + debug("vmusage returned more than one result\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (buf.vmu_id != zid) { + /* This should never happen */ + debug("vmusage returned the incorrect zone\n"); + (void) sleep_shutdown(1); + return (0); + } + + accurate_rss = buf.vmu_rss_all / 1024; + + /* calculate scaling factor to use for fast_rss from now on */ + if (accurate_rss > 0) { + scale_rss = fast_rss / accurate_rss; + debug("new scaling factor: %llu\n", scale_rss); + /* remember the fast rss when we had to get the accurate rss */ + prev_fast_rss = tmp_rss; + } + + debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss, + scale_rss, prev_fast_rss); return (accurate_rss); } @@ -852,6 +988,75 @@ has_proc() } /* + * We run this loop for brands with no /proc to simply update the RSS, using + * the cheap GZ /proc data, every 5 minutes. + */ +static void +no_procfs() +{ + DIR *pdir = NULL; + struct dirent *dent; + uint64_t zone_rss_bytes; + + (void) sleep_shutdown(30); + while (!shutting_down) { + /* + * Just do the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to + * double counting some memory and overestimating how much is + * being used. Since there is no /proc in the zone, we use the + * GZ /proc and check for the correct zone. + */ + if ((pdir = opendir("/proc")) == NULL) + return; + + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), + "/proc/%d/psinfo", pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) { + if (psinfo.pr_zoneid == zid) + rss = (int64_t)psinfo.pr_rssize; + } + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + + if (shutting_down) + return; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + + (void) sleep_shutdown(300); + } +} + +/* * Thread that checks zone's memory usage and when over the cap, goes through * the zone's process list trying to pageout processes to get under the cap. */ @@ -861,17 +1066,21 @@ mcap_zone() DIR *pdir = NULL; int64_t excess; + debug("thread startup\n"); + + get_mcap_tunables(); + /* - * If the zone has no /proc filesystem (e.g. KVM), we can't pageout any - * processes. Terminate this thread. + * If the zone has no /proc filesystem, we can't use the fast algorithm + * to check RSS or pageout any processes. All we can do is periodically + * update it's RSS kstat using the expensive sycall. */ if (!has_proc()) { + no_procfs(); + debug("thread shutdown\n"); return; } - debug("thread startup\n"); - get_mcap_tunables(); - /* * When first starting it is likely lots of other zones are starting * too because the system is booting. Since we just started the zone @@ -963,6 +1172,7 @@ create_mcap_thread(zlog_t *zlogp, zoneid_t id) shutting_down = 0; zid = id; + logp = zlogp; /* all but the lx brand currently use /proc */ if (strcmp(brand_name, "lx") == 0) { diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 1e5db33066..2912df0a29 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -328,8 +328,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* This can be dynamically reduced if various subsystems hit internal limits. */ -uint_t maxzones = MAX_ZONES; +/* number of zones is limited by virtual interface limit in IP */ +uint_t maxzones = 8192; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -3063,6 +3063,22 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) return (err); } +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; + + return (err); +} + static int zone_set_sched_class(zone_t *zone, const char *new_class) { @@ -6212,14 +6228,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) bufsize) != 0) error = EFAULT; break; - case ZONE_ATTR_RSS: - size = sizeof (zone->zone_phys_mem); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mem, buf, bufsize) != 0) - error = EFAULT; - break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -6273,7 +6281,8 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) */ zone_status = zone_status_get(zone); if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && - attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) { + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -6304,6 +6313,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_PG_FLT_DELAY: err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); + break; case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; @@ -8062,205 +8074,3 @@ done: else return (0); } - -/* - * "zone_over_cap" is an array indexed by zoneid, indicating which zones are - * over their physical memory cap. This is the interface for the page scanner - * to use when reclaiming pages for zones that are over their cap. - * - * The page scanner can run when "zone_num_over_cap" is non-zero. It can - * do a direct lookup of a zoneid into the "zone_over_cap" array to determine - * if that zone is over its cap. - * - * There is no locking for the page scanner to perform these two checks. - * We cannot have the page scanner blocking normal paging activity for - * running processes. Because the physical memory cap is a soft cap, it is - * fine for the scanner to simply read the current state of the counter and - * the zone's entry in the array. The scanner should never modify either of - * these items. Internally the entries and the counter are managed with the - * "zone_physcap_lock" mutex as we add/remove mappings to pages. We take care - * to ensure that we only take the zone_physcap_lock mutex when a zone is - * transitioning over/under its physical memory cap. - * - * The "zone_incr_capped", "zone_decr_capped" and "zone_clr_capped" functions - * are used manage the "zone_over_cap" array and associated counter. - */ -uint8_t zone_over_cap[MAX_ZONES]; -uint_t zone_num_over_cap; -static kmutex_t zone_physcap_lock; - -static void -zone_incr_capped(zone_t *zone) -{ - /* See if over (unlimited is UINT64_MAX), or already marked that way. */ - if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl || - zone_over_cap[zone->zone_id] == 1) { - return; - } - - mutex_enter(&zone_physcap_lock); - /* Recheck setting under mutex */ - if (zone->zone_phys_mem > zone->zone_phys_mem_ctl && - zone_over_cap[zone->zone_id] == 0) { - zone_over_cap[zone->zone_id] = 1; - zone_num_over_cap++; - DTRACE_PROBE1(zone__over__pcap, zone_t *, zone); - } - mutex_exit(&zone_physcap_lock); -} - -static void -zone_decr_capped(zone_t *zone) -{ - /* - * See if under, or already marked that way. There is no need to - * check for an unlimited cap (zone_phys_mem_ctl == UINT64_MAX) - * since we'll never add the zone in zone_incr_capped_zone(). - */ - if (zone_over_cap[zone->zone_id] == 0 || - zone->zone_phys_mem >= zone->zone_phys_mem_ctl) { - return; - } - - mutex_enter(&zone_physcap_lock); - /* Recheck setting under mutex */ - if (zone->zone_phys_mem < zone->zone_phys_mem_ctl && - zone_over_cap[zone->zone_id] == 1) { - ASSERT(zone_num_over_cap > 0); - zone_over_cap[zone->zone_id] = 0; - zone_num_over_cap--; - DTRACE_PROBE1(zone__under__pcap, zone_t *, zone); - } - mutex_exit(&zone_physcap_lock); -} - -/* Clear out an entry for a zone which no longer exists. */ -static void -zone_clr_capped(zoneid_t zid) -{ - if (zone_over_cap[zid] == 0) - return; - - mutex_enter(&zone_physcap_lock); - /* Recheck setting under mutex */ - if (zone_over_cap[zid] == 1) { - ASSERT(zone_num_over_cap > 0); - zone_over_cap[zid] = 0; - zone_num_over_cap--; - } - mutex_exit(&zone_physcap_lock); -} - -/* - * For zone_add_page() and zone_rm_page(), access to the page we're touching is - * controlled by our caller's locking. - * On x86 our callers already did: ASSERT(x86_hm_held(pp)) - * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) - */ -void -zone_add_page(page_t *pp) -{ - int64_t psize; - zone_t *zone; - - /* Skip pages in segkmem, etc. (KV_KVP, ...) */ - if (PP_ISKAS(pp)) - return; - - ASSERT(!PP_ISFREE(pp)); - - zone = curzone; - if (pp->p_zoneid == zone->zone_id) { - /* Another mapping to this page for this zone, do nothing */ - return; - } - - if (pp->p_szc == 0) { - psize = (int64_t)PAGESIZE; - } else { - /* large page */ - psize = (int64_t)page_get_pagesize(pp->p_szc); - } - - if (pp->p_share == 0) { - /* First mapping to this page. */ - pp->p_zoneid = zone->zone_id; - atomic_add_64((uint64_t *)&zone->zone_phys_mem, psize); - zone_incr_capped(zone); - return; - } - - if (pp->p_zoneid != ALL_ZONES) { - /* - * The page is now being shared across a different zone. - * Decrement the original zone's usage. - */ - zoneid_t id; - - id = pp->p_zoneid; - pp->p_zoneid = ALL_ZONES; - if ((zone = zone_find_by_id(id)) == NULL) { - /* - * Perhaps the zone has halted but since we have the - * page locked down, the page hasn't been freed yet. - * In any case, there is no zone RSS to update. - */ - zone_clr_capped(id); - return; - } - - atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize); - if ((int64_t)zone->zone_phys_mem < 0) { - DTRACE_PROBE1(zone__ap__neg, zoneid_t, id); - cmn_err(CE_WARN, "zone %d: RSS negative", id); - zone->zone_phys_mem = 0; - } - zone_decr_capped(zone); - zone_rele(zone); - } -} - -void -zone_rm_page(page_t *pp) -{ - zone_t *zone; - boolean_t do_rele = B_FALSE; - int64_t psize; - - /* Skip pages in segkmem, etc. (KV_KVP, ...) */ - if (PP_ISKAS(pp)) - return; - - if (pp->p_zoneid == ALL_ZONES || pp->p_share != 0) - return; - - /* This is the last mapping to the page for a zone. */ - if (pp->p_szc == 0) { - psize = (int64_t)PAGESIZE; - } else { - /* large page */ - psize = (int64_t)page_get_pagesize(pp->p_szc); - } - - if (pp->p_zoneid == curzone->zone_id) { - zone = curzone; - } else if ((zone = zone_find_by_id(pp->p_zoneid)) != NULL) { - do_rele = B_TRUE; - } - - if (zone != NULL) { - atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize); - if ((int64_t)zone->zone_phys_mem < 0) { - DTRACE_PROBE1(zone__rp__neg, zoneid_t, zone->zone_id); - cmn_err(CE_WARN, "zone %d: RSS negative", - zone->zone_id); - zone->zone_phys_mem = 0; - } - zone_decr_capped(zone); - if (do_rele) - zone_rele(zone); - } else { - zone_clr_capped(pp->p_zoneid); - } - pp->p_zoneid = ALL_ZONES; -} diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 32b2b7bf38..3bf7979174 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -22,7 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2017, Joyent, Inc. + * Copyright 2016, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -51,27 +51,15 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * illumos and are subject to change at any time without notice. + * Solaris and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* - * zone id restrictions and special ids. - * See 'maxzones' for run-time zone limit. - * - * The current 8k value for MAX_ZONES was originally derived from the virtual - * interface limit in IP when "shared-stack" was the only supported networking - * for zones. The virtual interface limit is the number of addresses allowed - * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k - * zone limit is still a reasonable choice at this time, given other limits - * within the kernel. Since we only support 8192 zones (which includes GZ), - * there is no point in allowing MAX_ZONEID > 8k. - */ -#define MAX_ZONES 8192 -#define MAX_ZONEID (MAX_ZONES - 1) +/* zone id restrictions and special ids */ +#define MAX_ZONEID 9999 #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -659,7 +647,7 @@ typedef struct zone { zone_zfs_kstat_t *zone_zfs_stats; /* - * illumos Auditing per-zone audit context + * Solaris Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -975,14 +963,6 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); -struct page; -extern void zone_add_page(struct page *); -extern void zone_rm_page(struct page *); - -/* Interfaces for page scanning */ -extern uint_t zone_num_over_cap; -extern uint8_t zone_over_cap[MAX_ZONES]; - extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; extern rctl_hndl_t rc_zone_phys_mem; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index fc38f8770c..92daeed703 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,6 +186,8 @@ sysconfig(int which) * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index ae9b0be758..8747b96acc 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,7 +20,6 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -230,7 +229,6 @@ struct as; * p_nrm * p_mapping * p_share - * p_zoneid * * The following field is file system dependent. How it is used and * the locking strategies applied are up to the individual file system @@ -529,8 +527,9 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ - short p_zoneid; /* zone page use tracking */ - short p_pad1; /* TBD */ +#if defined(_LP64) + uint_t p_sharepad; /* pad for growing p_share */ +#endif uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 91296e9c8d..1d5ef71e3e 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2017, Joyent, Inc. + * Copyright 2016, Joyent, Inc. */ /* @@ -1792,6 +1792,28 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. + */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ @@ -2090,6 +2112,8 @@ start: mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ ret = vmu_copyout_results(cache, buf, nres, flags_orig, req_zone_id, cpflg); diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 8a6751000b..16c683d993 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -22,7 +22,7 @@ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2015 by Delphix. All rights reserved. */ /* @@ -2603,7 +2603,6 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; - pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index 079f64e92e..d00d756828 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,9 +21,10 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. */ +#pragma ident "%Z%%M% %I% %E% SMI" + #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -36,7 +37,6 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> -#include <sys/zone.h> /* @@ -323,8 +323,6 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; - zone_add_page(pp); - /* * Add the hment to the system-wide hash table. */ @@ -466,7 +464,6 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; - zone_add_page(pp); return; } @@ -548,7 +545,6 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; - zone_rm_page(pp); return (NULL); } @@ -584,7 +580,6 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; - zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c index 2ef3ea20e8..b7539c828c 100644 --- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c @@ -24,7 +24,6 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 Gary Mills - * Copyright 2017 Joyent, Inc. */ /* @@ -86,7 +85,6 @@ #include <sys/fpu/fpusystm.h> #include <vm/mach_kpm.h> #include <sys/callb.h> -#include <sys/zone.h> #ifdef DEBUG #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ @@ -935,7 +933,6 @@ static kphysm_setup_vector_t sfmmu_update_vec = { } \ pp->p_mapping = hme; \ pp->p_share++; \ - zone_add_page(pp); \ } /* @@ -956,7 +953,6 @@ static kphysm_setup_vector_t sfmmu_update_vec = { \ ASSERT(pp->p_share > 0); \ pp->p_share--; \ - zone_rm_page(pp); \ \ if (hme->hme_prev) { \ ASSERT(pp->p_mapping != hme); \ @@ -7354,8 +7350,6 @@ retry: tpp->p_mapping = NULL; dpp->p_share = tpp->p_share; tpp->p_share = 0; - dpp->p_zoneid = tpp->p_zoneid; - tpp->p_zoneid = ALL_ZONES; while (index != 0) { index = index >> 1; |