summaryrefslogtreecommitdiff
path: root/usr/src/uts
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2017-09-28 12:56:20 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2017-09-28 12:57:39 +0000
commit5ae84a5233b723c890288b775cb5317db2e54d61 (patch)
treef28877eae0fdbba8a58726efc3e464145fda1766 /usr/src/uts
parent597e7db3df75fb2976c1c29ef285e8bc6c289b4d (diff)
downloadillumos-joyent-5ae84a5233b723c890288b775cb5317db2e54d61.tar.gz
OS-6306 accurate in-kernel zone RSS tracking
Reviewed by: Dan McDonald <danmcd@joyent.com> Reviewed by: Jason King <jason.king@joyent.com> Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/uts')
-rw-r--r--usr/src/uts/common/os/zone.c236
-rw-r--r--usr/src/uts/common/sys/zone.h30
-rw-r--r--usr/src/uts/common/syscall/sysconfig.c4
-rw-r--r--usr/src/uts/common/vm/page.h7
-rw-r--r--usr/src/uts/common/vm/vm_usage.c26
-rw-r--r--usr/src/uts/i86pc/os/startup.c3
-rw-r--r--usr/src/uts/i86pc/vm/hment.c9
-rw-r--r--usr/src/uts/sfmmu/vm/hat_sfmmu.c6
8 files changed, 259 insertions, 62 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 2912df0a29..1e5db33066 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -328,8 +328,8 @@ static list_t zone_active;
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;
@@ -3063,22 +3063,6 @@ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
return (err);
}
-/*
- * The zone_set_rss function is used to set the zone's RSS when we do the
- * fast, approximate calculation in user-land.
- */
-static int
-zone_set_rss(zone_t *zone, const uint64_t *prss)
-{
- uint64_t rss;
- int err;
-
- if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
- zone->zone_phys_mem = rss;
-
- return (err);
-}
-
static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
@@ -6228,6 +6212,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
bufsize) != 0)
error = EFAULT;
break;
+ case ZONE_ATTR_RSS:
+ size = sizeof (zone->zone_phys_mem);
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL &&
+ copyout(&zone->zone_phys_mem, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -6281,8 +6273,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
*/
zone_status = zone_status_get(zone);
if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
- attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
- zone_status > ZONE_IS_READY) {
+ attr != ZONE_ATTR_PG_FLT_DELAY && zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -6313,9 +6304,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_PG_FLT_DELAY:
err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
break;
- case ZONE_ATTR_RSS:
- err = zone_set_rss(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
@@ -8074,3 +8062,205 @@ done:
else
return (0);
}
+
+/*
+ * "zone_over_cap" is an array indexed by zoneid, indicating which zones are
+ * over their physical memory cap. This is the interface for the page scanner
+ * to use when reclaiming pages for zones that are over their cap.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_over_cap" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's entry in the array. The scanner should never modify either of
+ * these items. Internally the entries and the counter are managed with the
+ * "zone_physcap_lock" mutex as we add/remove mappings to pages. We take care
+ * to ensure that we only take the zone_physcap_lock mutex when a zone is
+ * transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped", "zone_decr_capped" and "zone_clr_capped" functions
+ * are used to manage the "zone_over_cap" array and associated counter.
+ */
+uint8_t zone_over_cap[MAX_ZONES];
+uint_t zone_num_over_cap;
+static kmutex_t zone_physcap_lock;
+
+/*
+ * Flag "zone" as over its physical memory cap when its RSS exceeds the cap
+ * and it is not flagged yet.  The unlocked tests up front let callers that
+ * cause no state transition return without taking zone_physcap_lock.
+ */
+static void
+zone_incr_capped(zone_t *zone)
+{
+	/* Not over the cap (an unlimited cap is UINT64_MAX). */
+	if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl)
+		return;
+
+	/* Already marked as over the cap. */
+	if (zone_over_cap[zone->zone_id] == 1)
+		return;
+
+	mutex_enter(&zone_physcap_lock);
+	/* Things may have changed before we got the mutex; look again. */
+	if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl ||
+	    zone_over_cap[zone->zone_id] != 0) {
+		mutex_exit(&zone_physcap_lock);
+		return;
+	}
+
+	zone_over_cap[zone->zone_id] = 1;
+	zone_num_over_cap++;
+	DTRACE_PROBE1(zone__over__pcap, zone_t *, zone);
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * Clear the over-cap flag for "zone" once its RSS is no longer above its
+ * physical memory cap.  "Over" means strictly greater than the cap, which is
+ * the same test zone_incr_capped() uses to set the flag; a zone whose RSS is
+ * exactly at its cap is therefore not left flagged.
+ */
+static void
+zone_decr_capped(zone_t *zone)
+{
+	/*
+	 * See if already marked as under, or still over.  There is no need to
+	 * check for an unlimited cap (zone_phys_mem_ctl == UINT64_MAX)
+	 * since we'll never add the zone in zone_incr_capped().
+	 */
+	if (zone_over_cap[zone->zone_id] == 0 ||
+	    zone->zone_phys_mem > zone->zone_phys_mem_ctl) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zone->zone_phys_mem <= zone->zone_phys_mem_ctl &&
+	    zone_over_cap[zone->zone_id] == 1) {
+		ASSERT(zone_num_over_cap > 0);
+		zone_over_cap[zone->zone_id] = 0;
+		zone_num_over_cap--;
+		DTRACE_PROBE1(zone__under__pcap, zone_t *, zone);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * Drop the over-cap state recorded for zone ID "zid" when the zone itself no
+ * longer exists.  The entry is tested once without the mutex so that the
+ * usual case, an entry that is already clear, takes no lock.
+ */
+static void
+zone_clr_capped(zoneid_t zid)
+{
+	if (zone_over_cap[zid] != 0) {
+		mutex_enter(&zone_physcap_lock);
+		/* Look again now that we hold the mutex. */
+		if (zone_over_cap[zid] == 1) {
+			ASSERT(zone_num_over_cap > 0);
+			zone_over_cap[zid] = 0;
+			zone_num_over_cap--;
+		}
+		mutex_exit(&zone_physcap_lock);
+	}
+}
+
+/*
+ * Account for a new mapping to page "pp" made by the current zone: charge the
+ * page to that zone if this is its first mapping, or stop charging it to the
+ * zone recorded in the page when a different zone maps it as well.
+ *
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+	zone_t *curz;
+	zone_t *owner;
+	zoneid_t prev_zid;
+	int64_t pgbytes;
+
+	/* Pages in segkmem, etc. (KV_KVP, ...) are skipped. */
+	if (PP_ISKAS(pp))
+		return;
+
+	ASSERT(!PP_ISFREE(pp));
+
+	curz = curzone;
+
+	/* Another mapping to this page for this zone, nothing to do. */
+	if (pp->p_zoneid == curz->zone_id)
+		return;
+
+	/* A non-zero p_szc denotes a large page. */
+	pgbytes = (pp->p_szc == 0) ? (int64_t)PAGESIZE :
+	    (int64_t)page_get_pagesize(pp->p_szc);
+
+	if (pp->p_share == 0) {
+		/* First mapping to this page: charge the current zone. */
+		pp->p_zoneid = curz->zone_id;
+		atomic_add_64((uint64_t *)&curz->zone_phys_mem, pgbytes);
+		zone_incr_capped(curz);
+		return;
+	}
+
+	/* No single zone is recorded for this page; nothing to adjust. */
+	if (pp->p_zoneid == ALL_ZONES)
+		return;
+
+	/*
+	 * The page is now being shared across a different zone.
+	 * Decrement the original zone's usage.
+	 */
+	prev_zid = pp->p_zoneid;
+	pp->p_zoneid = ALL_ZONES;
+
+	owner = zone_find_by_id(prev_zid);
+	if (owner == NULL) {
+		/*
+		 * Perhaps the zone has halted but since we have the
+		 * page locked down, the page hasn't been freed yet.
+		 * In any case, there is no zone RSS to update.
+		 */
+		zone_clr_capped(prev_zid);
+		return;
+	}
+
+	atomic_add_64((uint64_t *)&owner->zone_phys_mem, -pgbytes);
+	if ((int64_t)owner->zone_phys_mem < 0) {
+		/* The counter went negative; report it and clamp to zero. */
+		DTRACE_PROBE1(zone__ap__neg, zoneid_t, prev_zid);
+		cmn_err(CE_WARN, "zone %d: RSS negative", prev_zid);
+		owner->zone_phys_mem = 0;
+	}
+	zone_decr_capped(owner);
+	zone_rele(owner);
+}
+
+/*
+ * Account for the removal of a mapping to page "pp".  Once no mappings remain
+ * and the page is still recorded against a single zone, subtract the page's
+ * size from that zone's RSS and reset the page to ALL_ZONES.
+ */
+void
+zone_rm_page(page_t *pp)
+{
+	zone_t *zp;
+	boolean_t held;
+	int64_t pgbytes;
+
+	/* Pages in segkmem, etc. (KV_KVP, ...) are skipped. */
+	if (PP_ISKAS(pp))
+		return;
+
+	/* No single zone is recorded for this page. */
+	if (pp->p_zoneid == ALL_ZONES)
+		return;
+
+	/* Mappings remain, so the page stays charged. */
+	if (pp->p_share != 0)
+		return;
+
+	/*
+	 * This is the last mapping to the page for a zone.  A non-zero p_szc
+	 * denotes a large page.
+	 */
+	pgbytes = (pp->p_szc == 0) ? (int64_t)PAGESIZE :
+	    (int64_t)page_get_pagesize(pp->p_szc);
+
+	if (pp->p_zoneid == curzone->zone_id) {
+		zp = curzone;
+		held = B_FALSE;
+	} else {
+		/* A successful lookup must be paired with zone_rele(). */
+		zp = zone_find_by_id(pp->p_zoneid);
+		held = (zp != NULL) ? B_TRUE : B_FALSE;
+	}
+
+	if (zp == NULL) {
+		/* The zone is gone; only stale over-cap state can remain. */
+		zone_clr_capped(pp->p_zoneid);
+	} else {
+		atomic_add_64((uint64_t *)&zp->zone_phys_mem, -pgbytes);
+		if ((int64_t)zp->zone_phys_mem < 0) {
+			/* The counter went negative; report and clamp it. */
+			DTRACE_PROBE1(zone__rp__neg, zoneid_t, zp->zone_id);
+			cmn_err(CE_WARN, "zone %d: RSS negative",
+			    zp->zone_id);
+			zp->zone_phys_mem = 0;
+		}
+		zone_decr_capped(zp);
+		if (held)
+			zone_rele(zp);
+	}
+	pp->p_zoneid = ALL_ZONES;
+}
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3bf7979174..32b2b7bf38 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -22,7 +22,7 @@
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
*/
#ifndef _SYS_ZONE_H
@@ -51,15 +51,27 @@ extern "C" {
* NOTE
*
* The contents of this file are private to the implementation of
- * Solaris and are subject to change at any time without notice.
+ * illumos and are subject to change at any time without notice.
* Applications and drivers using these interfaces may fail to
* run on future releases.
*/
/* Available both in kernel and for user space */
-/* zone id restrictions and special ids */
-#define MAX_ZONEID 9999
+/*
+ * zone id restrictions and special ids.
+ * See 'maxzones' for run-time zone limit.
+ *
+ * The current 8k value for MAX_ZONES was originally derived from the virtual
+ * interface limit in IP when "shared-stack" was the only supported networking
+ * for zones. The virtual interface limit is the number of addresses allowed
+ * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k
+ * zone limit is still a reasonable choice at this time, given other limits
+ * within the kernel. Since we only support 8192 zones (which includes GZ),
+ * there is no point in allowing MAX_ZONEID > 8k.
+ */
+#define MAX_ZONES 8192
+#define MAX_ZONEID (MAX_ZONES - 1)
#define MIN_USERZONEID 1 /* lowest user-creatable zone ID */
#define MIN_ZONEID 0 /* minimum zone ID on system */
#define GLOBAL_ZONEID 0
@@ -647,7 +659,7 @@ typedef struct zone {
zone_zfs_kstat_t *zone_zfs_stats;
/*
- * Solaris Auditing per-zone audit context
+ * illumos Auditing per-zone audit context
*/
struct au_kcontext *zone_audit_kctxt;
/*
@@ -963,6 +975,14 @@ extern void mount_completed(zone_t *);
extern int zone_walk(int (*)(zone_t *, void *), void *);
+struct page;
+extern void zone_add_page(struct page *);
+extern void zone_rm_page(struct page *);
+
+/* Interfaces for page scanning */
+extern uint_t zone_num_over_cap;
+extern uint8_t zone_over_cap[MAX_ZONES];
+
extern rctl_hndl_t rc_zone_locked_mem;
extern rctl_hndl_t rc_zone_max_swap;
extern rctl_hndl_t rc_zone_phys_mem;
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 92daeed703..fc38f8770c 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -186,8 +186,6 @@ sysconfig(int which)
* the phys. memory cap - zone's rss. We always
* report the system-wide value for the global zone, even
* though memory capping can be used on the global zone too.
- * We use the cached value for the RSS since vm_getusage()
- * is so expensive and we don't need this value to be exact.
*/
if (!INGLOBALZONE(curproc) &&
curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) {
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 8747b96acc..ae9b0be758 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -229,6 +230,7 @@ struct as;
* p_nrm
* p_mapping
* p_share
+ * p_zoneid
*
* The following field is file system dependent. How it is used and
* the locking strategies applied are up to the individual file system
@@ -527,9 +529,8 @@ typedef struct page {
pfn_t p_pagenum; /* physical page number */
uint_t p_share; /* number of translations */
-#if defined(_LP64)
- uint_t p_sharepad; /* pad for growing p_share */
-#endif
+ short p_zoneid; /* zone page use tracking */
+ short p_pad1; /* TBD */
uint_t p_slckcnt; /* number of softlocks */
#if defined(__sparc)
uint_t p_kpmref; /* number of kpm mapping sharers */
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 1d5ef71e3e..91296e9c8d 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright 2016, Joyent, Inc.
+ * Copyright 2017, Joyent, Inc.
*/
/*
@@ -1792,28 +1792,6 @@ vmu_cache_rele(vmu_cache_t *cache)
}
/*
- * When new data is calculated, update the phys_mem rctl usage value in the
- * zones.
- */
-static void
-vmu_update_zone_rctls(vmu_cache_t *cache)
-{
- vmusage_t *rp;
- size_t i = 0;
- zone_t *zp;
-
- for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
- if (rp->vmu_type == VMUSAGE_ZONE &&
- rp->vmu_zoneid != ALL_ZONES) {
- if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
- zp->zone_phys_mem = rp->vmu_rss_all;
- zone_rele(zp);
- }
- }
- }
-}
-
-/*
* Copy out the cached results to a caller. Inspect the callers flags
* and zone to determine which cached results should be copied.
*/
@@ -2112,8 +2090,6 @@ start:
mutex_exit(&vmu_data.vmu_lock);
- /* update zone's phys. mem. rctl usage */
- vmu_update_zone_rctls(cache);
/* copy cache */
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
req_zone_id, cpflg);
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 16c683d993..8a6751000b 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -22,7 +22,7 @@
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
* Copyright (c) 2015 by Delphix. All rights reserved.
*/
/*
@@ -2603,6 +2603,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum)
pp->p_mapping = NULL;
pp->p_embed = 0;
pp->p_share = 0;
+ pp->p_zoneid = ALL_ZONES;
pp->p_mlentry = 0;
}
diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c
index d00d756828..079f64e92e 100644
--- a/usr/src/uts/i86pc/vm/hment.c
+++ b/usr/src/uts/i86pc/vm/hment.c
@@ -21,10 +21,9 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
@@ -37,6 +36,7 @@
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/avl.h>
+#include <sys/zone.h>
/*
@@ -323,6 +323,8 @@ hment_insert(hment_t *hm, page_t *pp)
((hment_t *)pp->p_mapping)->hm_prev = hm;
pp->p_mapping = hm;
+ zone_add_page(pp);
+
/*
* Add the hment to the system-wide hash table.
*/
@@ -464,6 +466,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm)
pp->p_embed = 1;
pp->p_mapping = htable;
pp->p_mlentry = entry;
+ zone_add_page(pp);
return;
}
@@ -545,6 +548,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
pp->p_mapping = NULL;
pp->p_mlentry = 0;
pp->p_embed = 0;
+ zone_rm_page(pp);
return (NULL);
}
@@ -580,6 +584,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry)
hm->hm_hashlink = null_avl_link;
hm->hm_next = NULL;
hm->hm_prev = NULL;
+ zone_rm_page(pp);
return (hm);
}
diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
index b7539c828c..2ef3ea20e8 100644
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
@@ -24,6 +24,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2016 Gary Mills
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -85,6 +86,7 @@
#include <sys/fpu/fpusystm.h>
#include <vm/mach_kpm.h>
#include <sys/callb.h>
+#include <sys/zone.h>
#ifdef DEBUG
#define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
@@ -933,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
} \
pp->p_mapping = hme; \
pp->p_share++; \
+ zone_add_page(pp); \
}
/*
@@ -953,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = {
\
ASSERT(pp->p_share > 0); \
pp->p_share--; \
+ zone_rm_page(pp); \
\
if (hme->hme_prev) { \
ASSERT(pp->p_mapping != hme); \
@@ -7350,6 +7354,8 @@ retry:
tpp->p_mapping = NULL;
dpp->p_share = tpp->p_share;
tpp->p_share = 0;
+ dpp->p_zoneid = tpp->p_zoneid;
+ tpp->p_zoneid = ALL_ZONES;
while (index != 0) {
index = index >> 1;