diff options
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-28 12:56:20 +0000 |
|---|---|---|
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-09-28 12:57:39 +0000 |
| commit | 5ae84a5233b723c890288b775cb5317db2e54d61 (patch) | |
| tree | f28877eae0fdbba8a58726efc3e464145fda1766 /usr/src/uts | |
| parent | 597e7db3df75fb2976c1c29ef285e8bc6c289b4d (diff) | |
| download | illumos-joyent-5ae84a5233b723c890288b775cb5317db2e54d61.tar.gz | |
OS-6306 accurate in-kernel zone RSS tracking
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts')
| -rw-r--r-- | usr/src/uts/common/os/zone.c | 236 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/zone.h | 30 | ||||
| -rw-r--r-- | usr/src/uts/common/syscall/sysconfig.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/vm/page.h | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/vm/vm_usage.c | 26 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/os/startup.c | 3 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/vm/hment.c | 9 | ||||
| -rw-r--r-- | usr/src/uts/sfmmu/vm/hat_sfmmu.c | 6 |
8 files changed, 259 insertions, 62 deletions
```diff
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 2912df0a29..1e5db33066 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -328,8 +328,8 @@ static list_t zone_active;
 static list_t zone_deathrow;
 static kmutex_t zone_deathrow_lock;
 
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
 
 /* Event channel to sent zone state change notifications */
 evchan_t *zone_event_chan;
@@ -3063,22 +3063,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
 	return (err);
 }
 
-/*
- * The zone_set_rss function is used to set the zone's RSS when we do the
- * fast, approximate calculation in user-land.
- */
-static int
-zone_set_rss(zone_t *zone, const uint64_t *prss)
-{
-	uint64_t rss;
-	int err;
-
-	if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
-		zone->zone_phys_mem = rss;
-
-	return (err);
-}
-
 static int
 zone_set_sched_class(zone_t *zone, const char *new_class)
 {
@@ -6228,6 +6212,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		    bufsize) != 0)
 			error = EFAULT;
 		break;
+	case ZONE_ATTR_RSS:
+		size = sizeof (zone->zone_phys_mem);
+		if (bufsize > size)
+			bufsize = size;
+		if (buf != NULL &&
+		    copyout(&zone->zone_phys_mem, buf, bufsize) != 0)
+			error = EFAULT;
+		break;
 	default:
 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
 			size = bufsize;
@@ -6281,8 +6273,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	 */
 	zone_status = zone_status_get(zone);
 	if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
-	    attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
-	    zone_status > ZONE_IS_READY) {
+	    attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) {
 		err = EINVAL;
 		goto done;
 	}
@@ -6313,9 +6304,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	case ZONE_ATTR_PG_FLT_DELAY:
 		err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
 		break;
-	case ZONE_ATTR_RSS:
-		err = zone_set_rss(zone, (const uint64_t *)buf);
-		break;
 	case ZONE_ATTR_SECFLAGS:
 		err = zone_set_secflags(zone, (psecflags_t *)buf);
 		break;
@@ -8074,3 +8062,205 @@ done:
 	else
 		return (0);
 }
+
+/*
+ * "zone_over_cap" is an array indexed by zoneid, indicating which zones are
+ * over their physical memory cap. This is the interface for the page scanner
+ * to use when reclaiming pages for zones that are over their cap.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_over_cap" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's entry in the array. The scanner should never modify either of
+ * these items. Internally the entries and the counter are managed with the
+ * "zone_physcap_lock" mutex as we add/remove mappings to pages. We take care
+ * to ensure that we only take the zone_physcap_lock mutex when a zone is
+ * transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped", "zone_decr_capped" and "zone_clr_capped" functions
+ * are used manage the "zone_over_cap" array and associated counter.
+ */
+uint8_t zone_over_cap[MAX_ZONES];
+uint_t zone_num_over_cap;
+static kmutex_t zone_physcap_lock;
+
+static void
+zone_incr_capped(zone_t *zone)
+{
+	/* See if over (unlimited is UINT64_MAX), or already marked that way. */
+	if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl ||
+	    zone_over_cap[zone->zone_id] == 1) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone->zone_phys_mem > zone->zone_phys_mem_ctl &&
+	    zone_over_cap[zone->zone_id] == 0) {
+		zone_over_cap[zone->zone_id] = 1;
+		zone_num_over_cap++;
+		DTRACE_PROBE1(zone__over__pcap, zone_t *, zone);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+static void
+zone_decr_capped(zone_t *zone)
+{
+	/*
+	 * See if under, or already marked that way. There is no need to
+	 * check for an unlimited cap (zone_phys_mem_ctl == UINT64_MAX)
+	 * since we'll never add the zone in zone_incr_capped_zone().
+	 */
+	if (zone_over_cap[zone->zone_id] == 0 ||
+	    zone->zone_phys_mem >= zone->zone_phys_mem_ctl) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone->zone_phys_mem < zone->zone_phys_mem_ctl &&
+	    zone_over_cap[zone->zone_id] == 1) {
+		ASSERT(zone_num_over_cap > 0);
+		zone_over_cap[zone->zone_id] = 0;
+		zone_num_over_cap--;
+		DTRACE_PROBE1(zone__under__pcap, zone_t *, zone);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/* Clear out an entry for a zone which no longer exists. */
+static void
+zone_clr_capped(zoneid_t zid)
+{
+	if (zone_over_cap[zid] == 0)
+		return;
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone_over_cap[zid] == 1) {
+		ASSERT(zone_num_over_cap > 0);
+		zone_over_cap[zid] = 0;
+		zone_num_over_cap--;
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+	int64_t psize;
+	zone_t *zone;
+
+	/* Skip pages in segkmem, etc. (KV_KVP, ...) */
+	if (PP_ISKAS(pp))
+		return;
+
+	ASSERT(!PP_ISFREE(pp));
+
+	zone = curzone;
+	if (pp->p_zoneid == zone->zone_id) {
+		/* Another mapping to this page for this zone, do nothing */
+		return;
+	}
+
+	if (pp->p_szc == 0) {
+		psize = (int64_t)PAGESIZE;
+	} else {
+		/* large page */
+		psize = (int64_t)page_get_pagesize(pp->p_szc);
+	}
+
+	if (pp->p_share == 0) {
+		/* First mapping to this page. */
+		pp->p_zoneid = zone->zone_id;
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, psize);
+		zone_incr_capped(zone);
+		return;
+	}
+
+	if (pp->p_zoneid != ALL_ZONES) {
+		/*
+		 * The page is now being shared across a different zone.
+		 * Decrement the original zone's usage.
+		 */
+		zoneid_t id;
+
+		id = pp->p_zoneid;
+		pp->p_zoneid = ALL_ZONES;
+		if ((zone = zone_find_by_id(id)) == NULL) {
+			/*
+			 * Perhaps the zone has halted but since we have the
+			 * page locked down, the page hasn't been freed yet.
+			 * In any case, there is no zone RSS to update.
+			 */
+			zone_clr_capped(id);
+			return;
+		}
+
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize);
+		if ((int64_t)zone->zone_phys_mem < 0) {
+			DTRACE_PROBE1(zone__ap__neg, zoneid_t, id);
+			cmn_err(CE_WARN, "zone %d: RSS negative", id);
+			zone->zone_phys_mem = 0;
+		}
+		zone_decr_capped(zone);
+		zone_rele(zone);
+	}
+}
+
+void
+zone_rm_page(page_t *pp)
+{
+	zone_t *zone;
+	boolean_t do_rele = B_FALSE;
+	int64_t psize;
+
+	/* Skip pages in segkmem, etc. (KV_KVP, ...) */
+	if (PP_ISKAS(pp))
+		return;
+
+	if (pp->p_zoneid == ALL_ZONES || pp->p_share != 0)
+		return;
+
+	/* This is the last mapping to the page for a zone. */
+	if (pp->p_szc == 0) {
+		psize = (int64_t)PAGESIZE;
+	} else {
+		/* large page */
+		psize = (int64_t)page_get_pagesize(pp->p_szc);
+	}
+
+	if (pp->p_zoneid == curzone->zone_id) {
+		zone = curzone;
+	} else if ((zone = zone_find_by_id(pp->p_zoneid)) != NULL) {
+		do_rele = B_TRUE;
+	}
+
+	if (zone != NULL) {
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize);
+		if ((int64_t)zone->zone_phys_mem < 0) {
+			DTRACE_PROBE1(zone__rp__neg, zoneid_t, zone->zone_id);
+			cmn_err(CE_WARN, "zone %d: RSS negative",
+			    zone->zone_id);
+			zone->zone_phys_mem = 0;
+		}
+		zone_decr_capped(zone);
+		if (do_rele)
+			zone_rele(zone);
+	} else {
+		zone_clr_capped(pp->p_zoneid);
+	}
+	pp->p_zoneid = ALL_ZONES;
+}
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3bf7979174..32b2b7bf38 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -22,7 +22,7 @@
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
  */
 
 #ifndef _SYS_ZONE_H
@@ -51,15 +51,27 @@ extern "C" {
  * NOTE
  *
  * The contents of this file are private to the implementation of
- * Solaris and are subject to change at any time without notice.
+ * illumos and are subject to change at any time without notice.
  * Applications and drivers using these interfaces may fail to
  * run on future releases.
  */
 
 /* Available both in kernel and for user space */
 
-/* zone id restrictions and special ids */
-#define MAX_ZONEID 9999
+/*
+ * zone id restrictions and special ids.
+ * See 'maxzones' for run-time zone limit.
+ *
+ * The current 8k value for MAX_ZONES was originally derived from the virtual
+ * interface limit in IP when "shared-stack" was the only supported networking
+ * for zones. The virtual interface limit is the number of addresses allowed
+ * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k
+ * zone limit is still a reasonable choice at this time, given other limits
+ * within the kernel. Since we only support 8192 zones (which includes GZ),
+ * there is no point in allowing MAX_ZONEID > 8k.
+ */
+#define MAX_ZONES 8192
+#define MAX_ZONEID (MAX_ZONES - 1)
 #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */
 #define MIN_ZONEID 0 /* minimum zone ID on system */
 #define GLOBAL_ZONEID 0
@@ -647,7 +659,7 @@ typedef struct zone {
 	zone_zfs_kstat_t *zone_zfs_stats;
 
 	/*
-	 * Solaris Auditing per-zone audit context
+	 * illumos Auditing per-zone audit context
 	 */
 	struct au_kcontext *zone_audit_kctxt;
 	/*
@@ -963,6 +975,14 @@ extern void mount_completed(zone_t *);
 
 extern int zone_walk(int (*)(zone_t *, void *), void *);
 
+struct page;
+extern void zone_add_page(struct page *);
+extern void zone_rm_page(struct page *);
+
+/* Interfaces for page scanning */
+extern uint_t zone_num_over_cap;
+extern uint8_t zone_over_cap[MAX_ZONES];
+
 extern rctl_hndl_t rc_zone_locked_mem;
 extern rctl_hndl_t rc_zone_max_swap;
 extern rctl_hndl_t rc_zone_phys_mem;
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 92daeed703..fc38f8770c 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
  */
 
 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -186,8 +186,6 @@ sysconfig(int which)
 		 * the phys. memory cap - zone's rss. We always
 		 * report the system-wide value for the global zone, even
 		 * though memory capping can be used on the global zone too.
-		 * We use the cached value for the RSS since vm_getusage()
-		 * is so expensive and we don't need this value to be exact.
 		 */
 		if (!INGLOBALZONE(curproc) &&
 		    curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) {
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
  */
 
 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
  * p_nrm
  * p_mapping
  * p_share
+ * p_zoneid
  *
  * The following field is file system dependent. How it is used and
  * the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
 	pfn_t p_pagenum; /* physical page number */
 
 	uint_t p_share; /* number of translations */
-#if defined(_LP64)
-	uint_t p_sharepad; /* pad for growing p_share */
-#endif
+	short p_zoneid; /* zone page use tracking */
+	short p_pad1; /* TBD */
 	uint_t p_slckcnt; /* number of softlocks */
 #if defined(__sparc)
 	uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 1d5ef71e3e..91296e9c8d 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
  */
 
 /*
@@ -1792,28 +1792,6 @@ vmu_cache_rele(vmu_cache_t *cache)
 }
 
 /*
- * When new data is calculated, update the phys_mem rctl usage value in the
- * zones.
- */
-static void
-vmu_update_zone_rctls(vmu_cache_t *cache)
-{
-	vmusage_t *rp;
-	size_t i = 0;
-	zone_t *zp;
-
-	for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
-		if (rp->vmu_type == VMUSAGE_ZONE &&
-		    rp->vmu_zoneid != ALL_ZONES) {
-			if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
-				zp->zone_phys_mem = rp->vmu_rss_all;
-				zone_rele(zp);
-			}
-		}
-	}
-}
-
-/*
  * Copy out the cached results to a caller. Inspect the callers flags
  * and zone to determine which cached results should be copied.
  */
@@ -2112,8 +2090,6 @@ start:
 
 	mutex_exit(&vmu_data.vmu_lock);
 
-	/* update zone's phys. mem. rctl usage */
-	vmu_update_zone_rctls(cache);
 	/* copy cache */
 	ret = vmu_copyout_results(cache, buf, nres, flags_orig,
 	    req_zone_id, cpflg);
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 16c683d993..8a6751000b 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -22,7 +22,7 @@
  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2015 by Delphix. All rights reserved.
  */
 /*
@@ -2603,6 +2603,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum)
 	pp->p_mapping = NULL;
 	pp->p_embed = 0;
 	pp->p_share = 0;
+	pp->p_zoneid = ALL_ZONES;
 	pp->p_mlentry = 0;
 }
 
diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c
index d00d756828..079f64e92e 100644
--- a/usr/src/uts/i86pc/vm/hment.c
+++ b/usr/src/uts/i86pc/vm/hment.c
@@ -21,10 +21,9 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
  */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
@@ -37,6 +36,7 @@
 #include <vm/hat_i86.h>
 #include <sys/cmn_err.h>
 #include <sys/avl.h>
+#include <sys/zone.h>
 
 
 /*
@@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp)
 		((hment_t *)pp->p_mapping)->hm_prev = hm;
 	pp->p_mapping = hm;
 
+	zone_add_page(pp);
+
 	/*
 	 * Add the hment to the system-wide hash table.
 	 */
@@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
 		pp->p_embed = 1;
 		pp->p_mapping = htable;
 		pp->p_mlentry = entry;
+		zone_add_page(pp);
 		return;
 	}
 
@@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
 		pp->p_mapping = NULL;
 		pp->p_mlentry = 0;
 		pp->p_embed = 0;
+		zone_rm_page(pp);
 		return (NULL);
 	}
 
@@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
 	hm->hm_hashlink = null_avl_link;
 	hm->hm_next = NULL;
 	hm->hm_prev = NULL;
+	zone_rm_page(pp);
 
 	return (hm);
 }
diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
index b7539c828c..2ef3ea20e8 100644
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
@@ -24,6 +24,7 @@
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2016 Gary Mills
+ * Copyright 2017 Joyent, Inc.
  */
 
 /*
@@ -85,6 +86,7 @@
 #include <sys/fpu/fpusystm.h>
 #include <vm/mach_kpm.h>
 #include <sys/callb.h>
+#include <sys/zone.h>
 
 #ifdef DEBUG
 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
@@ -933,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
 	} \
 	pp->p_mapping = hme; \
 	pp->p_share++; \
+	zone_add_page(pp); \
 }
 
 /*
@@ -953,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
 	\
 	ASSERT(pp->p_share > 0); \
 	pp->p_share--; \
+	zone_rm_page(pp); \
 	\
 	if (hme->hme_prev) { \
 		ASSERT(pp->p_mapping != hme); \
@@ -7350,6 +7354,8 @@ retry:
 	tpp->p_mapping = NULL;
 	dpp->p_share = tpp->p_share;
 	tpp->p_share = 0;
+	dpp->p_zoneid = tpp->p_zoneid;
+	tpp->p_zoneid = ALL_ZONES;
 
 	while (index != 0) {
 		index = index >> 1;
```
