Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r--   usr/src/uts/common/os/zone.c   236
1 file changed, 213 insertions, 23 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 2912df0a29..1e5db33066 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -328,8 +328,8 @@ static list_t zone_active;
 static list_t zone_deathrow;
 static kmutex_t zone_deathrow_lock;
 
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
 
 /* Event channel to sent zone state change notifications */
 evchan_t *zone_event_chan;
@@ -3063,22 +3063,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
 	return (err);
 }
 
-/*
- * The zone_set_rss function is used to set the zone's RSS when we do the
- * fast, approximate calculation in user-land.
- */
-static int
-zone_set_rss(zone_t *zone, const uint64_t *prss)
-{
-	uint64_t rss;
-	int err;
-
-	if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
-		zone->zone_phys_mem = rss;
-
-	return (err);
-}
-
 static int
 zone_set_sched_class(zone_t *zone, const char *new_class)
 {
@@ -6228,6 +6212,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		    bufsize) != 0)
 			error = EFAULT;
 		break;
+	case ZONE_ATTR_RSS:
+		size = sizeof (zone->zone_phys_mem);
+		if (bufsize > size)
+			bufsize = size;
+		if (buf != NULL &&
+		    copyout(&zone->zone_phys_mem, buf, bufsize) != 0)
+			error = EFAULT;
+		break;
 	default:
 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
 			size = bufsize;
@@ -6281,8 +6273,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	 */
 	zone_status = zone_status_get(zone);
 	if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
-	    attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
-	    zone_status > ZONE_IS_READY) {
+	    attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) {
 		err = EINVAL;
 		goto done;
 	}
@@ -6313,9 +6304,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	case ZONE_ATTR_PG_FLT_DELAY:
 		err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
 		break;
-	case ZONE_ATTR_RSS:
-		err = zone_set_rss(zone, (const uint64_t *)buf);
-		break;
 	case ZONE_ATTR_SECFLAGS:
 		err = zone_set_secflags(zone, (psecflags_t *)buf);
 		break;
@@ -8074,3 +8062,205 @@ done:
 	else
 		return (0);
 }
+
+/*
+ * "zone_over_cap" is an array indexed by zoneid, indicating which zones are
+ * over their physical memory cap. This is the interface for the page scanner
+ * to use when reclaiming pages for zones that are over their cap.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_over_cap" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's entry in the array. The scanner should never modify either of
+ * these items. Internally the entries and the counter are managed with the
+ * "zone_physcap_lock" mutex as we add/remove mappings to pages. We take care
+ * to ensure that we only take the zone_physcap_lock mutex when a zone is
+ * transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped", "zone_decr_capped" and "zone_clr_capped" functions
+ * are used to manage the "zone_over_cap" array and associated counter.
+ */
+uint8_t zone_over_cap[MAX_ZONES];
+uint_t zone_num_over_cap;
+static kmutex_t zone_physcap_lock;
+
+static void
+zone_incr_capped(zone_t *zone)
+{
+	/* See if over (unlimited is UINT64_MAX), or already marked that way. */
+	if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl ||
+	    zone_over_cap[zone->zone_id] == 1) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone->zone_phys_mem > zone->zone_phys_mem_ctl &&
+	    zone_over_cap[zone->zone_id] == 0) {
+		zone_over_cap[zone->zone_id] = 1;
+		zone_num_over_cap++;
+		DTRACE_PROBE1(zone__over__pcap, zone_t *, zone);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+static void
+zone_decr_capped(zone_t *zone)
+{
+	/*
+	 * See if under, or already marked that way. There is no need to
+	 * check for an unlimited cap (zone_phys_mem_ctl == UINT64_MAX)
+	 * since we'll never add the zone in zone_incr_capped().
+	 */
+	if (zone_over_cap[zone->zone_id] == 0 ||
+	    zone->zone_phys_mem >= zone->zone_phys_mem_ctl) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone->zone_phys_mem < zone->zone_phys_mem_ctl &&
+	    zone_over_cap[zone->zone_id] == 1) {
+		ASSERT(zone_num_over_cap > 0);
+		zone_over_cap[zone->zone_id] = 0;
+		zone_num_over_cap--;
+		DTRACE_PROBE1(zone__under__pcap, zone_t *, zone);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/* Clear out an entry for a zone which no longer exists. */
+static void
+zone_clr_capped(zoneid_t zid)
+{
+	if (zone_over_cap[zid] == 0)
+		return;
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone_over_cap[zid] == 1) {
+		ASSERT(zone_num_over_cap > 0);
+		zone_over_cap[zid] = 0;
+		zone_num_over_cap--;
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+	int64_t psize;
+	zone_t *zone;
+
+	/* Skip pages in segkmem, etc. (KV_KVP, ...) */
+	if (PP_ISKAS(pp))
+		return;
+
+	ASSERT(!PP_ISFREE(pp));
+
+	zone = curzone;
+	if (pp->p_zoneid == zone->zone_id) {
+		/* Another mapping to this page for this zone, do nothing */
+		return;
+	}
+
+	if (pp->p_szc == 0) {
+		psize = (int64_t)PAGESIZE;
+	} else {
+		/* large page */
+		psize = (int64_t)page_get_pagesize(pp->p_szc);
+	}
+
+	if (pp->p_share == 0) {
+		/* First mapping to this page. */
+		pp->p_zoneid = zone->zone_id;
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, psize);
+		zone_incr_capped(zone);
+		return;
+	}
+
+	if (pp->p_zoneid != ALL_ZONES) {
+		/*
+		 * The page is now being shared across a different zone.
+		 * Decrement the original zone's usage.
+		 */
+		zoneid_t id;
+
+		id = pp->p_zoneid;
+		pp->p_zoneid = ALL_ZONES;
+		if ((zone = zone_find_by_id(id)) == NULL) {
+			/*
+			 * Perhaps the zone has halted but since we have the
+			 * page locked down, the page hasn't been freed yet.
+			 * In any case, there is no zone RSS to update.
+			 */
+			zone_clr_capped(id);
+			return;
+		}
+
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize);
+		if ((int64_t)zone->zone_phys_mem < 0) {
+			DTRACE_PROBE1(zone__ap__neg, zoneid_t, id);
+			cmn_err(CE_WARN, "zone %d: RSS negative", id);
+			zone->zone_phys_mem = 0;
+		}
+		zone_decr_capped(zone);
+		zone_rele(zone);
+	}
+}
+
+void
+zone_rm_page(page_t *pp)
+{
+	zone_t *zone;
+	boolean_t do_rele = B_FALSE;
+	int64_t psize;
+
+	/* Skip pages in segkmem, etc. (KV_KVP, ...) */
+	if (PP_ISKAS(pp))
+		return;
+
+	if (pp->p_zoneid == ALL_ZONES || pp->p_share != 0)
+		return;
+
+	/* This is the last mapping to the page for a zone. */
+	if (pp->p_szc == 0) {
+		psize = (int64_t)PAGESIZE;
+	} else {
+		/* large page */
+		psize = (int64_t)page_get_pagesize(pp->p_szc);
+	}
+
+	if (pp->p_zoneid == curzone->zone_id) {
+		zone = curzone;
+	} else if ((zone = zone_find_by_id(pp->p_zoneid)) != NULL) {
+		do_rele = B_TRUE;
+	}
+
+	if (zone != NULL) {
+		atomic_add_64((uint64_t *)&zone->zone_phys_mem, -psize);
+		if ((int64_t)zone->zone_phys_mem < 0) {
+			DTRACE_PROBE1(zone__rp__neg, zoneid_t, zone->zone_id);
+			cmn_err(CE_WARN, "zone %d: RSS negative",
+			    zone->zone_id);
+			zone->zone_phys_mem = 0;
+		}
+		zone_decr_capped(zone);
+		if (do_rele)
+			zone_rele(zone);
+	} else {
+		zone_clr_capped(pp->p_zoneid);
+	}
+	pp->p_zoneid = ALL_ZONES;
+}
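With this change the kernel maintains zone_phys_mem itself (via zone_add_page()/zone_rm_page()), so ZONE_ATTR_RSS becomes a read-only attribute: the zone_setattr() path that accepted it is removed, and zone_getattr() gains a case that copies the value out. As a rough illustration, a user-land reader might look like the sketch below. It assumes the standard illumos zone_getattr(2) and getzoneidbyname(3C) libc interfaces, assumes ZONE_ATTR_RSS is visible to user land through <sys/zone.h>, and elides privilege handling (inspecting another zone generally requires the global zone).

/*
 * Hedged sketch, not part of the patch: read a zone's kernel-maintained
 * RSS via the now read-only ZONE_ATTR_RSS attribute.
 */
#include <sys/types.h>
#include <sys/zone.h>
#include <zone.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
        zoneid_t zid;
        uint64_t rss;

        if (argc != 2) {
                (void) fprintf(stderr, "usage: %s <zonename>\n", argv[0]);
                return (1);
        }
        if ((zid = getzoneidbyname(argv[1])) == -1) {
                perror("getzoneidbyname");
                return (1);
        }
        /* zone_getattr() returns the attribute size, or -1 with errno set. */
        if (zone_getattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) < 0) {
                perror("zone_getattr(ZONE_ATTR_RSS)");
                return (1);
        }
        (void) printf("zone %s rss: %llu bytes\n", argv[1],
            (unsigned long long)rss);
        return (0);
}

Since the kernel now keeps the counter current on every mapping change, the old "fast, approximate calculation in user-land" that zone_set_rss() existed to push in is no longer needed.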

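The subtler piece of zone_add_page()/zone_rm_page() is the charging protocol carried in p_zoneid: a page is charged to exactly one zone while only that zone maps it, and the moment a second zone maps it, the original charge is reversed and ownership moves to ALL_ZONES, so nobody is charged. Below is a toy user-land model of just that invariant; every name in it (toy_page_t, toy_add(), toy_rm(), ...) is hypothetical, and the share count is maintained locally here, whereas the kernel's p_share is maintained by the HAT layer.

/* Toy model (hypothetical names) of the p_zoneid charging protocol. */
#include <stdio.h>

#define	TOY_ALL_ZONES	(-1)

typedef struct toy_page {
        int     owner;          /* zone charged, or TOY_ALL_ZONES */
        int     nshare;         /* existing mappings (like p_share) */
        long    size;
} toy_page_t;

static long rss[8];             /* per-zone charged bytes */

/* Model of zone_add_page(): zone zid adds a mapping to pg. */
static void
toy_add(toy_page_t *pg, int zid)
{
        if (pg->owner == zid) {
                pg->nshare++;   /* already charged to this zone */
                return;
        }
        if (pg->nshare == 0) {
                pg->owner = zid;        /* first mapping: charge zid */
                rss[zid] += pg->size;
        } else if (pg->owner != TOY_ALL_ZONES) {
                rss[pg->owner] -= pg->size;     /* now cross-zone shared */
                pg->owner = TOY_ALL_ZONES;
        }
        pg->nshare++;
}

/* Model of zone_rm_page(): one mapping to pg goes away. */
static void
toy_rm(toy_page_t *pg)
{
        pg->nshare--;
        if (pg->owner == TOY_ALL_ZONES || pg->nshare != 0)
                return;
        rss[pg->owner] -= pg->size;     /* last mapping: uncharge */
        pg->owner = TOY_ALL_ZONES;
}

int
main(void)
{
        toy_page_t pg = { TOY_ALL_ZONES, 0, 4096 };

        toy_add(&pg, 1);        /* zone 1 maps: charged to zone 1 */
        toy_add(&pg, 2);        /* zone 2 maps too: charged to nobody */
        toy_rm(&pg);
        toy_rm(&pg);
        (void) printf("rss[1]=%ld rss[2]=%ld owner=%d\n",
            rss[1], rss[2], pg.owner);
        return (0);
}

Running the model charges the page to zone 1, reverses that charge as soon as zone 2 also maps it, and ends with both RSS counters back at zero, which is exactly the bookkeeping the kernel functions perform under the caller's page lock.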
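Separately, zone_incr_capped() and zone_decr_capped() are a double-checked, transition-only locking pattern: the page scanner reads zone_num_over_cap and zone_over_cap[] with no lock at all (stale reads are tolerable because the cap is soft), while updaters take zone_physcap_lock only when a zone appears to be crossing its cap, rechecking the condition once the mutex is held. A minimal stand-alone sketch of that pattern follows, with hypothetical names and pthreads standing in for kernel mutexes; it compiles as a translation unit, and usage updates are assumed to happen atomically elsewhere, mirroring the atomic_add_64() calls in the patch.

/*
 * Sketch only: transition-only locking as in zone_incr_capped() et al.
 * Consumers read the flag array and counter locklessly; updaters lock
 * and recheck only when a state appears to be crossing its limit.
 */
#include <pthread.h>
#include <stdint.h>

#define	NCAPS	8192

typedef struct cap_state {
        uint64_t usage;         /* assumed updated atomically elsewhere */
        uint64_t limit;         /* UINT64_MAX means "no cap" */
        int id;
} cap_state_t;

static uint8_t over_cap[NCAPS];         /* read locklessly by consumers */
static unsigned int num_over_cap;       /* ditto; staleness is acceptable */
static pthread_mutex_t cap_lock = PTHREAD_MUTEX_INITIALIZER;

void
mark_over(cap_state_t *cs)
{
        /* Unlocked fast path: return unless we appear to be transitioning. */
        if (cs->usage <= cs->limit || over_cap[cs->id] == 1)
                return;

        (void) pthread_mutex_lock(&cap_lock);
        /* Recheck under the mutex; a racing thread may have marked it. */
        if (cs->usage > cs->limit && over_cap[cs->id] == 0) {
                over_cap[cs->id] = 1;
                num_over_cap++;
        }
        (void) pthread_mutex_unlock(&cap_lock);
}

void
mark_under(cap_state_t *cs)
{
        if (over_cap[cs->id] == 0 || cs->usage >= cs->limit)
                return;

        (void) pthread_mutex_lock(&cap_lock);
        if (cs->usage < cs->limit && over_cap[cs->id] == 1) {
                over_cap[cs->id] = 0;
                num_over_cap--;
        }
        (void) pthread_mutex_unlock(&cap_lock);
}

/* Consumer side (the "page scanner"): lock-free, soft-cap semantics. */
int
is_over(int id)
{
        return (num_over_cap != 0 && over_cap[id] == 1);
}

The point of the unlocked fast path is the one the block comment in the patch makes: mapping changes happen constantly, cap crossings are rare, so the mutex is only ever taken, and therefore only ever contended, at transitions.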