diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-10-12 14:43:52 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-10-12 15:05:24 +0000 |
commit | bf4caad4ba9209e6bcd3f68a6d9e197473022286 (patch) | |
tree | d019ac538cd100af5721431c04bacdb6418fb89c | |
parent | a298c587338cf16ca71d352bedfb494fba3f0378 (diff) | |
download | illumos-joyent-bf4caad4ba9209e6bcd3f68a6d9e197473022286.tar.gz |
OS-6306 accurate in-kernel zone RSS tracking
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 236 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/procfs/lx_prvnops.c | 17 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_rlimit.c | 6 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c | 28 | ||||
-rw-r--r-- | usr/src/uts/common/fs/lxproc/lxpr_vnops.c | 19 | ||||
-rw-r--r-- | usr/src/uts/common/os/zone.c | 398 | ||||
-rw-r--r-- | usr/src/uts/common/sys/zone.h | 62 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/sysconfig.c | 46 | ||||
-rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
-rw-r--r-- | usr/src/uts/common/vm/vm_usage.c | 26 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/startup.c | 1 | ||||
-rw-r--r-- | usr/src/uts/i86pc/vm/hment.c | 9 | ||||
-rw-r--r-- | usr/src/uts/sfmmu/vm/hat_sfmmu.c | 6 |
13 files changed, 478 insertions, 383 deletions
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c index 88b72b6c55..d280c49b5b 100644 --- a/usr/src/cmd/zoneadmd/mcap.c +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -28,29 +28,18 @@ * the associated zone's physical memory. A thread to do this is started * when the zone boots and is halted when the zone shuts down. * - * Because of the way that the VM system is currently implemented, there is no - * way to go from the bottom up (page to process to zone). Thus, there is no - * obvious way to hook an rctl into the kernel's paging code to enforce a hard - * memory cap. Instead, we implement a soft physical memory cap which looks - * at the zone's overall rss and once it is over the cap, works from the top - * down (zone to process to page), looking at zone processes, to determine - * what to try to pageout to get the zone under its memory cap. - * - * The code uses the fast, cheap, but potentially very inaccurate sum of the - * rss values from psinfo_t to first approximate the zone's rss and will - * fallback to the vm_getusage syscall to determine the zone's rss if needed. + * The code obtains the accurate in-kernel RSS for the zone. * It then checks the rss against the zone's zone.max-physical-memory rctl. * Once the zone goes over its cap, then this thread will work through the * zone's /proc process list, Pgrab-bing each process and stepping through the - * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) - * to pageout pages, until the zone is again under its cap. + * address space segments, using a private option (_RUSAGESYS_INVALMAP) to the + * private SYS_rusagesys syscall to attempt to unload page translations, until + * the zone is again under its cap. * * Although zone memory capping is implemented as a soft cap by this user-level * thread, the interfaces around memory caps that are exposed to the user are * the standard ones; an rctl and kstats. 
This thread uses the rctl value - * to obtain the cap and works with the zone kernel code to update the kstats. - * If the implementation ever moves into the kernel, these exposed interfaces - * do not need to change. + * to obtain the cap. * * The thread adaptively sleeps, periodically checking the state of the * zone. As the zone's rss gets closer to the cap, the thread will wake up @@ -129,14 +118,6 @@ #define TUNE_NPAGE "phys-mcap-no-pageout" #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" -/* - * These are only used in get_mem_info but global. We always need scale_rss and - * prev_fast_rss to be persistent but we also have the other two global so we - * can easily see these with mdb. - */ -uint64_t scale_rss = 0; -uint64_t prev_fast_rss = 0; -uint64_t fast_rss = 0; uint64_t accurate_rss = 0; /* @@ -160,8 +141,6 @@ static boolean_t skip_vmusage = B_FALSE; static boolean_t skip_pageout = B_FALSE; static boolean_t skip_pf_throttle = B_FALSE; -static zlog_t *logp; - static int64_t check_suspend(); static void get_mcap_tunables(); @@ -535,127 +514,12 @@ done: static uint64_t get_mem_info() { - uint64_t n = 1; - zsd_vmusage64_t buf; - uint64_t tmp_rss; - DIR *pdir = NULL; - struct dirent *dent; - - /* - * Start by doing the fast, cheap RSS calculation using the rss value - * in psinfo_t. Because that's per-process, it can lead to double - * counting some memory and overestimating how much is being used, but - * as long as that's not over the cap, then we don't need do the - * expensive calculation. - * - * If we have to do the expensive calculation, we remember the scaling - * factor so that we can try to use that on subsequent iterations for - * the fast rss. 
- */ - if (shutting_down) - return (0); - - if ((pdir = opendir(zoneproc)) == NULL) - return (0); - - accurate_rss = 0; - fast_rss = 0; - while (!shutting_down && (dent = readdir(pdir)) != NULL) { - pid_t pid; - int psfd; - int64_t rss; - char pathbuf[MAXPATHLEN]; - psinfo_t psinfo; - - if (strcmp(".", dent->d_name) == 0 || - strcmp("..", dent->d_name) == 0) - continue; - - pid = atoi(dent->d_name); - if (pid == 0 || pid == 1) - continue; - - (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", - zoneproc, pid); - - rss = 0; - if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { - if (pread(psfd, &psinfo, sizeof (psinfo), 0) == - sizeof (psinfo)) - rss = (int64_t)psinfo.pr_rssize; - - (void) close(psfd); - } - - fast_rss += rss; - } - - (void) closedir(pdir); - if (shutting_down) return (0); - debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss, - scale_rss, prev_fast_rss); - - /* see if we can get by with a scaled fast rss */ - tmp_rss = fast_rss; - if (scale_rss > 1 && prev_fast_rss > 0) { - /* - * Only scale the fast value if it hasn't ballooned too much - * to trust. - */ - if (fast_rss / prev_fast_rss < 2) { - fast_rss /= scale_rss; - debug("scaled fast rss: %lluKB\n", fast_rss); - } - } - - if (fast_rss <= zone_rss_cap || skip_vmusage) { - uint64_t zone_rss_bytes; - - zone_rss_bytes = fast_rss * 1024; - /* Use the zone's approx. 
RSS in the kernel */ - (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); - return (fast_rss); - } - - buf.vmu_id = zid; - - /* get accurate usage (cached data may be up to 5 seconds old) */ - if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5, - (uintptr_t)&buf, (uintptr_t)&n) != 0) { - debug("vmusage failed\n"); - (void) sleep_shutdown(1); - return (0); - } - - if (n > 1) { - /* This should never happen */ - debug("vmusage returned more than one result\n"); - (void) sleep_shutdown(1); - return (0); - } - - if (buf.vmu_id != zid) { - /* This should never happen */ - debug("vmusage returned the incorrect zone\n"); - (void) sleep_shutdown(1); - return (0); - } - - accurate_rss = buf.vmu_rss_all / 1024; - - /* calculate scaling factor to use for fast_rss from now on */ - if (accurate_rss > 0) { - scale_rss = fast_rss / accurate_rss; - debug("new scaling factor: %llu\n", scale_rss); - /* remember the fast rss when we had to get the accurate rss */ - prev_fast_rss = tmp_rss; - } - - debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss, - scale_rss, prev_fast_rss); + (void) zone_getattr(zid, ZONE_ATTR_RSS, &accurate_rss, + sizeof (accurate_rss)); + accurate_rss /= 1024; return (accurate_rss); } @@ -988,75 +852,6 @@ has_proc() } /* - * We run this loop for brands with no /proc to simply update the RSS, using - * the cheap GZ /proc data, every 5 minutes. - */ -static void -no_procfs() -{ - DIR *pdir = NULL; - struct dirent *dent; - uint64_t zone_rss_bytes; - - (void) sleep_shutdown(30); - while (!shutting_down) { - /* - * Just do the fast, cheap RSS calculation using the rss value - * in psinfo_t. Because that's per-process, it can lead to - * double counting some memory and overestimating how much is - * being used. Since there is no /proc in the zone, we use the - * GZ /proc and check for the correct zone. 
- */ - if ((pdir = opendir("/proc")) == NULL) - return; - - fast_rss = 0; - while (!shutting_down && (dent = readdir(pdir)) != NULL) { - pid_t pid; - int psfd; - int64_t rss; - char pathbuf[MAXPATHLEN]; - psinfo_t psinfo; - - if (strcmp(".", dent->d_name) == 0 || - strcmp("..", dent->d_name) == 0) - continue; - - pid = atoi(dent->d_name); - if (pid == 0 || pid == 1) - continue; - - (void) snprintf(pathbuf, sizeof (pathbuf), - "/proc/%d/psinfo", pid); - - rss = 0; - if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { - if (pread(psfd, &psinfo, sizeof (psinfo), 0) == - sizeof (psinfo)) { - if (psinfo.pr_zoneid == zid) - rss = (int64_t)psinfo.pr_rssize; - } - - (void) close(psfd); - } - - fast_rss += rss; - } - - (void) closedir(pdir); - - if (shutting_down) - return; - - zone_rss_bytes = fast_rss * 1024; - /* Use the zone's approx. RSS in the kernel */ - (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); - - (void) sleep_shutdown(300); - } -} - -/* * Thread that checks zone's memory usage and when over the cap, goes through * the zone's process list trying to pageout processes to get under the cap. */ @@ -1066,21 +861,17 @@ mcap_zone() DIR *pdir = NULL; int64_t excess; - debug("thread startup\n"); - - get_mcap_tunables(); - /* - * If the zone has no /proc filesystem, we can't use the fast algorithm - * to check RSS or pageout any processes. All we can do is periodically - * update it's RSS kstat using the expensive sycall. + * If the zone has no /proc filesystem (e.g. KVM), we can't pageout any + * processes. Terminate this thread. */ if (!has_proc()) { - no_procfs(); - debug("thread shutdown\n"); return; } + debug("thread startup\n"); + get_mcap_tunables(); + /* * When first starting it is likely lots of other zones are starting * too because the system is booting. 
Since we just started the zone @@ -1172,7 +963,6 @@ create_mcap_thread(zlog_t *zlogp, zoneid_t id) shutting_down = 0; zid = id; - logp = zlogp; /* all but the lx brand currently use /proc */ if (strcmp(brand_name, "lx") == 0) { diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c index 59fac45556..8f58b3a5ad 100644 --- a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -3776,7 +3776,7 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) { zone_t *zone = LXPTOZ(lxpnp); lx_zone_data_t *lxzd = ztolxzd(zone); - long total_mem, free_mem, total_swap; + ulong_t total_mem, free_mem, total_swap; boolean_t swap_disabled; ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); @@ -3784,21 +3784,16 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) ASSERT(lxzd != NULL); swap_disabled = lxzd->lxzd_swap_disabled; - if (zone->zone_phys_mem_ctl == UINT64_MAX) { - total_mem = physmem * PAGESIZE; - free_mem = freemem * PAGESIZE; - } else { - total_mem = zone->zone_phys_mem_ctl; - free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; - if (free_mem < 0) - free_mem = 0; - } + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); if (swap_disabled) { total_swap = 0; } else { if (zone->zone_max_swap_ctl == UINT64_MAX) { - total_swap = k_anoninfo.ani_max * PAGESIZE; + total_swap = ptob(k_anoninfo.ani_max); } else { mutex_enter(&zone->zone_mem_lock); total_swap = zone->zone_max_swap_ctl; diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c index 8fadf8d391..30fa996615 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c @@ -158,7 +158,11 @@ lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp) case LX_RLIMIT_RSS: /* 
zone.max-physical-memory */ - rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_phys_mem_ctl; + zone_get_physmem_data(curzone->zone_id, + (pgcnt_t *)&rlim64.rlim_cur, + (pgcnt_t *)&rlim64.rlim_max); /* max is dummy variable */ + rlim64.rlim_cur = rlim64.rlim_max = ptob(rlim64.rlim_cur); + break; case LX_RLIMIT_NPROC: diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c index 387471c0f5..052ad322a7 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -21,7 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ #include <vm/anon.h> @@ -75,8 +75,9 @@ extern pgcnt_t swapfs_minfree; static void lx_sysinfo_common(lx_sysinfo_t *si) { - zone_t *zone = curthread->t_procp->p_zone; - uint64_t zphysmem, zfreemem, ztotswap, zfreeswap; + zone_t *zone = curzone; + pgcnt_t zphysmem, zfreemem; + ulong_t ztotswap, zfreeswap; si->si_uptime = gethrestime_sec() - zone->zone_boot_time; @@ -90,26 +91,7 @@ lx_sysinfo_common(lx_sysinfo_t *si) */ si->si_procs = (int32_t)zone->zone_nlwps; - /* - * If memory or swap limits are set on the zone, use those, otherwise - * use the system values. physmem and freemem are in pages, but the - * zone values are in bytes. Likewise, ani_max and ani_free are in - * pages. 
- */ - if (zone->zone_phys_mem_ctl == UINT64_MAX) { - zphysmem = physmem; - zfreemem = freemem; - } else { - int64_t freemem; - - zphysmem = btop(zone->zone_phys_mem_ctl); - freemem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; - if (freemem > 0) { - zfreemem = btop(freemem); - } else { - zfreemem = 0; - } - } + zone_get_physmem_data(zone->zone_id, &zphysmem, &zfreemem); if (zone->zone_max_swap_ctl == UINT64_MAX) { ztotswap = k_anoninfo.ani_max; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c index 1f7f3074d6..9bcc0f7e8b 100644 --- a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -1449,23 +1449,18 @@ lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) { zone_t *zone = LXPTOZ(lxpnp); int global = zone == global_zone; - long total_mem, free_mem, total_swap, used_swap; + ulong_t total_mem, free_mem, total_swap, used_swap; ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); - if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { - total_mem = physmem * PAGESIZE; - free_mem = freemem * PAGESIZE; - } else { - total_mem = zone->zone_phys_mem_ctl; - free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; - if (free_mem < 0) - free_mem = 0; - } + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); if (global || zone->zone_max_swap_ctl == UINT64_MAX) { - total_swap = k_anoninfo.ani_max * PAGESIZE; - used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + total_swap = ptob(k_anoninfo.ani_max); + used_swap = ptob(k_anoninfo.ani_phys_resv); } else { mutex_enter(&zone->zone_mem_lock); total_swap = zone->zone_max_swap_ctl; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 2912df0a29..9c1ee8d750 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -328,8 +328,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t 
zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. */ +uint_t maxzones = MAX_ZONES; /* Event channel to send zone state change notifications */ evchan_t *zone_event_chan; @@ -429,6 +429,55 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, static const int ZONE_SYSCALL_API_VERSION = 7; /* + * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's + * current page usage, its page limit, a flag indicating if the zone is + * over its physical memory cap and various statistics. The zpcap_over flag is + * the interface for the page scanner to use when reclaiming pages for zones + * that are over their cap. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpcap_over entry in the array. 
The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pcap_data" array and associated counter. + * + * The zone_pcap_t structure tracks the zone's physical cap and physical usage + * in terms of pages. These values are currently defined as uint32. Thus, the + * maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) since + * UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + */ +uint_t zone_num_over_cap; +zone_pcap_t zone_pcap_data[MAX_ZONES]; +static kmutex_t zone_physcap_lock; + +/* * Certain filesystems (such as NFS and autofs) need to know which zone
Because of this, we need to be able to * ensure that a zone isn't in the process of being created/destroyed such @@ -1822,11 +1871,10 @@ static rctl_qty_t zone_phys_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; - zone_t *z = p->p_zone; + zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id]; ASSERT(MUTEX_HELD(&p->p_lock)); - /* No additional lock because not enforced in the kernel */ - q = z->zone_phys_mem; + q = ptob(zp->zpcap_pg_cnt); return (q); } @@ -1835,11 +1883,30 @@ static int zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) { + zoneid_t zid; + uint_t pg_val; + ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(e->rcep_t == RCENTITY_ZONE); if (e->rcep_p.zone == NULL) return (0); - e->rcep_p.zone->zone_phys_mem_ctl = nv; + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. 
+ */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pcap_data[zid].zpcap_pg_limit = pg_val; return (0); } @@ -1949,12 +2016,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_kstat_t *zk = ksp->ks_data; + zone_pcap_t *zp = &zone_pcap_data[zone->zone_id]; if (rw == KSTAT_WRITE) return (EACCES); - zk->zk_usage.value.ui64 = zone->zone_phys_mem; - zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit); return (0); } @@ -2172,16 +2240,24 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_pcap_t *zp; if (rw == KSTAT_WRITE) return (EACCES); - zmp->zm_rss.value.ui64 = zone->zone_phys_mem; - zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zp = &zone_pcap_data[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit); zmp->zm_swap.value.ui64 = zone->zone_max_swap; zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; - zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; - zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_nover.value.ui64 = zp->zpcap_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty + + zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -2427,8 +2503,6 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; - zone0.zone_phys_mem = 0; - zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 
0; @@ -2770,6 +2844,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -3020,16 +3097,14 @@ zone_set_initname(zone_t *zone, const char *zone_initname) * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used * to provide the physical memory capping kstats. Since physical memory * capping is currently implemented in userland, that code uses the setattr - * entry point to increment the kstats. We always simply increment nover - * every time that setattr is called and we always add in the input value - * to zone_mcap_pagedout every time that is called. + * entry point to increment the kstats. We ignore nover when that setattr is + * called and we always add in the input value to zone_mcap_pagedout every + * time that is called. */ /*ARGSUSED*/ static int zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - zone->zone_mcap_nover++; - return (0); } @@ -3039,8 +3114,17 @@ zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) uint64_t pageout; int err; - if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) - zone->zone_mcap_pagedout += pageout; + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) { + zone_pcap_t *zp = &zone_pcap_data[zone->zone_id]; + uint64_t pages; + + pages = btop(pageout); +#ifndef DEBUG + atomic_add_64(&zp->zpcap_pg_out, pages); +#else + atomic_add_64(&zp->zpcap_pg_fs, pages); +#endif + } return (err); } @@ -3063,22 +3147,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) return (err); } -/* - * The zone_set_rss function is used to set the zone's RSS when we do the - * fast, approximate calculation in user-land. 
- */ -static int -zone_set_rss(zone_t *zone, const uint64_t *prss) -{ - uint64_t rss; - int err; - - if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) - zone->zone_phys_mem = rss; - - return (err); -} - static int zone_set_sched_class(zone_t *zone, const char *new_class) { @@ -5077,8 +5145,6 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; - zone->zone_phys_mem = 0; - zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; zone->zone_lockedmem_kstat = NULL; @@ -5091,6 +5157,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pcap_data[zoneid].zpcap_pg_cnt = 0; + zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -6228,6 +6301,19 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) bufsize) != 0) error = EFAULT; break; + case ZONE_ATTR_RSS: { + zone_pcap_t *zp = &zone_pcap_data[zone->zone_id]; + uint64_t phys_mem; + + phys_mem = ptob(zp->zpcap_pg_cnt); + size = sizeof (phys_mem); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&phys_mem, buf, bufsize) != 0) + error = EFAULT; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -6281,8 +6367,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) */ zone_status = zone_status_get(zone); if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && - attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && - zone_status > ZONE_IS_READY) { + attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -6313,9 
+6398,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_PG_FLT_DELAY: err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); break; - case ZONE_ATTR_RSS: - err = zone_set_rss(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; @@ -8074,3 +8156,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_pcap_t *zp = &zone_pcap_data[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) { + zp->zpcap_over = 1; + zp->zpcap_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_pcap_t *zp = &zone_pcap_data[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. 
There is no need to + * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX) + * since we'll never set zpcap_over in zone_incr_capped(). + */ + if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) { + return; + } + + adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis. */ + if (zp->zpcap_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) { + zp->zpcap_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_pcap_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pcap_data[zid]; + ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. 
+ */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pcap_data[zid]; + + if (zp->zpcap_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_pcap_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. */ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pcap_data[zid]; + if (zp->zpcap_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_pcap_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pcap_data[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpcap_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpcap_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpcap_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpcap_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpcap_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_pcap_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pcap_data[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. 
+ */ + if (zp->zpcap_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpcap_pg_limit; + freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +} diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 3bf7979174..a08ef59959 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -22,7 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2016, Joyent, Inc. + * Copyright 2017, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -51,15 +51,27 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define MAX_ZONEID 9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
+ */ +#define MAX_ZONES 8192 +#define MAX_ZONEID (MAX_ZONES - 1) #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -563,7 +575,6 @@ typedef struct zone { int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -647,7 +658,7 @@ typedef struct zone { zone_zfs_kstat_t *zone_zfs_stats; /* - * Solaris Auditing per-zone audit context + * illumos Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -667,11 +678,8 @@ typedef struct zone { /* * kstats and counters for physical memory capping. */ - rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */ kstat_t *zone_physmem_kstat; - uint64_t zone_mcap_nover; /* # of times over phys. cap */ - uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */ - kmutex_t zone_mcap_lock; /* protects mcap statistics */ + kmutex_t zone_mcap_lock; /* protects mcap statistics */ kstat_t *zone_mcap_ksp; zone_mcap_kstat_t *zone_mcap_stats; uint64_t zone_pgpgin; /* pages paged in */ @@ -739,6 +747,30 @@ typedef struct zone { kmutex_t zone_mount_lock; } zone_t; +/* zpcap_over is treated as a boolean but is 32 bits for alignment. */ +typedef struct zone_pcap { + uint32_t zpcap_over; /* currently over cap */ + uint32_t zpcap_pg_cnt; /* current RSS in pages */ + uint32_t zpcap_pg_limit; /* current RRS limit in pages */ + uint32_t zpcap_nover; /* # of times over phys. cap */ +#ifndef DEBUG + uint64_t zpcap_pg_out; /* # pages flushed */ +#else + /* + * To conserve memory, detailed pageout stats are only kept for DEBUG + * builds. 
+ */ + uint64_t zpcap_pg_anon; /* # clean anon pages flushed */ + uint64_t zpcap_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpcap_pg_fs; /* # clean fs pages flushed */ + uint64_t zpcap_pg_fsdirty; /* # dirty fs pages flushed */ +#endif +} zone_pcap_t; + +typedef enum zone_pageout_op { + ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY +} zone_pageout_op_t; + /* * Special value of zone_psetid to indicate that pools are disabled. */ @@ -963,6 +995,16 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); +extern void zone_pageout_stat(int, zone_pageout_op_t); +extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern zone_pcap_t zone_pcap_data[MAX_ZONES]; + extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; extern rctl_hndl_t rc_zone_phys_mem; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 92daeed703..e09f4e85a2 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -171,41 +171,29 @@ sysconfig(int which) /* * If the non-global zone has a phys. memory cap, use that. * We always report the system-wide value for the global zone, - * even though rcapd can be used on the global zone too. + * even though memory capping can be used on the global zone + * too. 
*/ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) - return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), - physinstalled)); + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); + return (MIN(cap, physinstalled)); + } return (physinstalled); case _CONFIG_AVPHYS_PAGES: /* - * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's rss. We always - * report the system-wide value for the global zone, even - * though memory capping can be used on the global zone too. - * We use the cached value for the RSS since vm_getusage() - * is so expensive and we don't need this value to be exact. + * If the non-global zone has a phys. memory cap, use its + * free value. We always report the system-wide value for the + * global zone, even though memory capping can be used on the + * global zone too. */ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { - pgcnt_t cap, rss, free; - - cap = btop(curproc->p_zone->zone_phys_mem_ctl); - if (cap > physinstalled) - return (freemem); - - rss = btop(curproc->p_zone->zone_phys_mem); - /* - * Because this is a soft cap, it is possible - * for rss to be temporarily over the cap. - */ - if (cap > rss) - free = cap - rss; - else - free = 0; + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); return (MIN(free, freemem)); } diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 8747b96acc..ae9b0be758 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -229,6 +230,7 @@ struct as; * p_nrm * p_mapping * p_share + * p_zoneid * * The following field is file system dependent. 
How it is used and * the locking strategies applied are up to the individual file system @@ -527,9 +529,8 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ -#if defined(_LP64) - uint_t p_sharepad; /* pad for growing p_share */ -#endif + short p_zoneid; /* zone page use tracking */ + short p_pad1; /* TBD */ uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 1d5ef71e3e..91296e9c8d 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2016, Joyent, Inc. + * Copyright 2017, Joyent, Inc. */ /* @@ -1792,28 +1792,6 @@ vmu_cache_rele(vmu_cache_t *cache) } /* - * When new data is calculated, update the phys_mem rctl usage value in the - * zones. - */ -static void -vmu_update_zone_rctls(vmu_cache_t *cache) -{ - vmusage_t *rp; - size_t i = 0; - zone_t *zp; - - for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { - if (rp->vmu_type == VMUSAGE_ZONE && - rp->vmu_zoneid != ALL_ZONES) { - if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { - zp->zone_phys_mem = rp->vmu_rss_all; - zone_rele(zp); - } - } - } -} - -/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ @@ -2112,8 +2090,6 @@ start: mutex_exit(&vmu_data.vmu_lock); - /* update zone's phys. mem. 
rctl usage */ - vmu_update_zone_rctls(cache); /* copy cache */ ret = vmu_copyout_results(cache, buf, nres, flags_orig, req_zone_id, cpflg); diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index ba90b6627d..bfe8c2486b 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -2611,6 +2611,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; + pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index d00d756828..079f64e92e 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -37,6 +36,7 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> +#include <sys/zone.h> /* @@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; + zone_add_page(pp); + /* * Add the hment to the system-wide hash table. 
*/ @@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; + zone_add_page(pp); return; } @@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; + zone_rm_page(pp); return (NULL); } @@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; + zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c index b7539c828c..2ef3ea20e8 100644 --- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c @@ -24,6 +24,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 Gary Mills + * Copyright 2017 Joyent, Inc. */ /* @@ -85,6 +86,7 @@ #include <sys/fpu/fpusystm.h> #include <vm/mach_kpm.h> #include <sys/callb.h> +#include <sys/zone.h> #ifdef DEBUG #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ @@ -933,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { } \ pp->p_mapping = hme; \ pp->p_share++; \ + zone_add_page(pp); \ } /* @@ -953,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { \ ASSERT(pp->p_share > 0); \ pp->p_share--; \ + zone_rm_page(pp); \ \ if (hme->hme_prev) { \ ASSERT(pp->p_mapping != hme); \ @@ -7350,6 +7354,8 @@ retry: tpp->p_mapping = NULL; dpp->p_share = tpp->p_share; tpp->p_share = 0; + dpp->p_zoneid = tpp->p_zoneid; + tpp->p_zoneid = ALL_ZONES; while (index != 0) { index = index >> 1; |