| | | |
|---|---|---|
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2011-05-05 07:46:18 -0700 |
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2011-05-05 07:46:18 -0700 |
| commit | 9230f57fd5dc4b0898a592a83cfa3f7b18b26718 | |
| tree | aaa4b6796395f90c30238d0af42360549edd2fd8 /usr/src/cmd | |
| parent | f3861a7cccef3296d09052f75b91c493572fd94a | |
| download | illumos-joyent-9230f57fd5dc4b0898a592a83cfa3f7b18b26718.tar.gz | |
OS-11 rcapd behaves poorly when under extreme load
Diffstat (limited to 'usr/src/cmd')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/cmd/rcap/common/utils.c | 75 |
| -rw-r--r-- | usr/src/cmd/rcap/common/utils.h | 2 |
| -rw-r--r-- | usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c | 61 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/Makefile | 5 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 882 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zcons.c | 2 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.c | 15 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.h | 6 |
8 files changed, 927 insertions, 121 deletions
diff --git a/usr/src/cmd/rcap/common/utils.c b/usr/src/cmd/rcap/common/utils.c index 799fdcef23..dd511c7c50 100644 --- a/usr/src/cmd/rcap/common/utils.c +++ b/usr/src/cmd/rcap/common/utils.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -257,77 +258,3 @@ xatoi(char *p) return (i); } } - -/* - * get_running_zones() calls zone_list(2) to find out how many zones are - * running. It then calls zone_list(2) again to fetch the list of running - * zones (stored in *zents). - */ -int -get_running_zones(uint_t *nzents, zone_entry_t **zents) -{ - zoneid_t *zids; - uint_t nzents_saved; - int i; - zone_entry_t *zentp; - zone_state_t zstate; - - *zents = NULL; - if (zone_list(NULL, nzents) != 0) { - warn(gettext("could not get zoneid list\n")); - return (E_ERROR); - } - -again: - if (*nzents == 0) - return (E_SUCCESS); - - if ((zids = (zoneid_t *)calloc(*nzents, sizeof (zoneid_t))) == NULL) { - warn(gettext("out of memory: zones will not be capped\n")); - return (E_ERROR); - } - - nzents_saved = *nzents; - - if (zone_list(zids, nzents) != 0) { - warn(gettext("could not get zone list\n")); - free(zids); - return (E_ERROR); - } - if (*nzents != nzents_saved) { - /* list changed, try again */ - free(zids); - goto again; - } - - *zents = calloc(*nzents, sizeof (zone_entry_t)); - if (*zents == NULL) { - warn(gettext("out of memory: zones will not be capped\n")); - free(zids); - return (E_ERROR); - } - - zentp = *zents; - for (i = 0; i < *nzents; i++) { - char name[ZONENAME_MAX]; - - if (getzonenamebyid(zids[i], name, sizeof (name)) < 0) { - warn(gettext("could not get name for " - "zoneid %d\n"), zids[i]); - continue; - } - - (void) strlcpy(zentp->zname, name, sizeof (zentp->zname)); - zentp->zid = zids[i]; - if (zone_get_state(name, &zstate) != Z_OK || - zstate != ZONE_STATE_RUNNING) - continue; - - - zentp++; - } - *nzents = zentp - *zents; - - free(zids); - return (E_SUCCESS); -} diff --git a/usr/src/cmd/rcap/common/utils.h b/usr/src/cmd/rcap/common/utils.h index 7196cfb4ce..cf2e17c080 100644 --- a/usr/src/cmd/rcap/common/utils.h +++ b/usr/src/cmd/rcap/common/utils.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #ifndef _UTILS_H @@ -98,7 +99,6 @@ extern void vdprintfe(int, const char *, va_list); extern void dprintfe(int, char *, ...); extern void hrt2ts(hrtime_t, timestruc_t *); extern int xatoi(char *); -extern int get_running_zones(uint_t *, zone_entry_t **); #ifdef __cplusplus } diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c index 798ed97707..88403dda37 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c @@ -121,61 +121,36 @@ get_zone_cap(zoneid_t zid) return (mcap); } -static void -update_zone(zone_entry_t *zent, void *walk_data) +/* + * For zones, rcapd only caps the global zone, since each non-global zone + * caps itself. 
+ */ +/* ARGSUSED */ +void +lcollection_update_zone(lcollection_update_type_t ut, + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) { - void(*update_notification_cb)(char *, char *, int, uint64_t, int) = - (void(*)(char *, char *, int, uint64_t, int))walk_data; int changes; int64_t max_rss; uint64_t mcap; lcollection_t *lcol; rcid_t colid; - mcap = get_zone_cap(zent->zid); - if (mcap != 0 && mcap != UINT64_MAX) + mcap = get_zone_cap(GLOBAL_ZONEID); + if (mcap != 0 && mcap != UINT64_MAX) { max_rss = ROUNDUP(mcap, 1024) / 1024; - else - max_rss = 0; - - if (zent->zid == GLOBAL_ZONEID) { - if (max_rss > 0) - gz_capped = B_TRUE; - else - gz_capped = B_FALSE; + gz_capped = B_TRUE; + } else { + max_rss = UINT64_MAX / 1024; + gz_capped = B_FALSE; } - colid.rcid_type = RCIDT_ZONE; - colid.rcid_val = zent->zid; + colid.rcid_val = GLOBAL_ZONEID; - lcol = lcollection_insert_update(&colid, max_rss, zent->zname, + lcol = lcollection_insert_update(&colid, max_rss, GLOBAL_ZONENAME, &changes); if (update_notification_cb != NULL) - update_notification_cb("zone", zent->zname, changes, max_rss, - (lcol != NULL) ? lcol->lcol_mark : 0); -} - - -/* ARGSUSED */ -void -lcollection_update_zone(lcollection_update_type_t ut, - void(*update_notification_cb)(char *, char *, int, uint64_t, int)) -{ - int i; - uint_t nzents; - zone_entry_t *zents; - - /* - * Enumerate running zones. - */ - if (get_running_zones(&nzents, &zents) != 0) - return; - - for (i = 0; i < nzents; i++) { - update_zone(&zents[i], (void *)update_notification_cb); - - } - - free(zents); + update_notification_cb("zone", GLOBAL_ZONENAME, changes, + max_rss, (lcol != NULL) ? lcol->lcol_mark : 0); } diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile index cb03ef459a..f8810c46ef 100644 --- a/usr/src/cmd/zoneadmd/Makefile +++ b/usr/src/cmd/zoneadmd/Makefile @@ -23,6 +23,7 @@ # # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2011, Joyent, Inc. All rights reserved. # PROG= zoneadmd @@ -31,7 +32,7 @@ include ../Makefile.cmd ROOTCMDDIR= $(ROOTLIB)/zones -OBJS= zoneadmd.o zcons.o vplat.o +OBJS= zoneadmd.o zcons.o vplat.o mcap.o SRCS = $(OBJS:.o=.c) POFILE=zoneadmd_all.po POFILES= $(OBJS:%.o=%.po) @@ -39,7 +40,7 @@ POFILES= $(OBJS:%.o=%.po) CFLAGS += $(CCVERBOSE) LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \ - -linetutil + -linetutil -lproc XGETFLAGS += -a -x zoneadmd.xcl .KEEP_STATE: diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c new file mode 100644 index 0000000000..4ae4bd0ecd --- /dev/null +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -0,0 +1,882 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +/* + * This file implements the code which runs a thread inside zoneadmd to cap + * the associated zone's physical memory. A thread to do this is started + * when the zone boots and is halted when the zone shuts down. + * + * Because of the way that the VM system is currently implemented, there is no + * way to go from the bottom up (page to process to zone). Thus, there is no + * obvious way to hook an rctl into the kernel's paging code to enforce a hard + * memory cap. Instead, we implement a soft physical memory cap which looks + * at the zone's overall rss and once it is over the cap, works from the top + * down (zone to process to page), looking at zone processes, to determine + * what to try to pageout to get the zone under its memory cap. + * + * The code uses the vm_getusage syscall to determine the zone's rss and + * checks that against the zone's zone.max-physical-memory rctl. Once the + * zone goes over its cap, then this thread will work through the zone's + * /proc process list, Pgrab-bing each process and stepping through the + * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...) + * to pageout pages, until the zone is again under its cap. + * + * Although zone memory capping is implemented as a soft cap by this user-level + * thread, the interfaces around memory caps that are exposed to the user are + * the standard ones; an rctl and kstats. This thread uses the rctl value + * to obtain the cap and works with the zone kernel code to update the kstats. + * If the implementation ever moves into the kernel, these exposed interfaces + * do not need to change. + * + * The thread adaptively sleeps, periodically checking the state of the + * zone. As the zone's rss gets closer to the cap, the thread will wake up + * more often to check the zone's status. Once the zone is over the cap, + * the thread will work to pageout until the zone is under the cap, as shown + * by updated vm_usage data. + * + * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be + * used to examine a processes mapped segments while we are trying to pageout. + * The observed xmap segement size data is frequently smaller than the + * pagedata segement size data, so it is less effective in practice. Thus we + * use pagedata to determine the size of each segment. + * + * The pagedata page maps (at least on x86) are not useful. Those flags + * are set by hrm_setbits() and on x86 that code path is only executed by + * segvn_pagelock -> hat_setstat -> hrm_setbits + * segvn_softunlock -^ + * On SPARC there is an additional code path which may make this data + * useful (sfmmu_ttesync), but since it is not generic, we ignore the page + * maps and only use the segement info from pagedata. If we ever fix this + * issue, then we could generalize this mcap code to do more with the data on + * active pages. + * + * For debugging, touch the file {zonepath}/mcap_debug.log. This will + * cause the thread to start logging its actions into that file (it may take + * a minute or two if the thread is currently sleeping). Removing that + * file will cause logging to stop. 
+ */ + +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libproc.h> +#include <limits.h> +#include <procfs.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <time.h> +#include <unistd.h> +#include <sys/priocntl.h> +#include <dirent.h> +#include <zone.h> +#include <libzonecfg.h> +#include <thread.h> +#include <values.h> +#include <sys/vm_usage.h> +#include <sys/resource.h> +#include <sys/debug.h> +#include <synch.h> +#include "zoneadmd.h" + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +#define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */ + +static char zonename[ZONENAME_MAX]; +static char zonepath[MAXPATHLEN]; +static char zoneproc[MAXPATHLEN]; +static char debug_log[MAXPATHLEN]; +static zoneid_t zid; +static mutex_t shutdown_mx; +static cond_t shutdown_cv; +static int shutting_down = 0; +static thread_t mcap_tid; +static FILE *debug_log_fp = NULL; +static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */ + +/* + * Structure to hold current state about a process address space that we're + * working on. + */ +typedef struct { + int pr_curr; /* the # of the mapping we're working on */ + int pr_nmap; /* number of mappings in address space */ + int pr_cnt; /* number of mappings processed */ + + prpageheader_t *pr_pghp; /* process's complete pagedata */ + prasmap_t *pr_asp; /* current address space pointer */ + + uintptr_t pr_addr; /* base of mapping */ + uint64_t pr_size; /* size of mapping */ +} proc_map_t; + +typedef struct zsd_vmusage64 { + id_t vmu_zoneid; + uint_t vmu_type; + id_t vmu_id; + /* + * An amd64 kernel will align the following uint64_t members, but a + * 32bit i386 process will not without help. + */ + int vmu_align_next_members_on_8_bytes; + uint64_t vmu_rss_all; + uint64_t vmu_rss_private; + uint64_t vmu_rss_shared; + uint64_t vmu_swap_all; + uint64_t vmu_swap_private; + uint64_t vmu_swap_shared; +} zsd_vmusage64_t; + +/* + * Output a debug log message. + */ +/*PRINTFLIKE1*/ +static void +debug(char *fmt, ...) +{ + va_list ap; + + if (debug_log_fp == NULL) + return; + + va_start(ap, fmt); + (void) vfprintf(debug_log_fp, fmt, ap); + va_end(ap); + (void) fflush(debug_log_fp); +} + +/* + * Like sleep(3C) but can be interupted by cond_signal which is posted when + * we're shutting down the mcap thread. + */ +static void +sleep_shutdown(int secs) +{ + timestruc_t to; + + to.tv_sec = secs; + to.tv_nsec = 0; + + (void) mutex_lock(&shutdown_mx); + if (!shutting_down) + (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to); + (void) mutex_unlock(&shutdown_mx); +} + +static boolean_t +proc_issystem(pid_t pid) +{ + char pc_clname[PC_CLNMSZ]; + + if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, + PC_KY_NULL) != -1) + return (strcmp(pc_clname, "SYS") == 0); + + return (B_TRUE); +} + +static struct ps_prochandle * +control_proc(pid_t pid) +{ + int res; + struct ps_prochandle *ph; + + /* Take control of the process. */ + if ((ph = Pgrab(pid, 0, &res)) == NULL) + return (NULL); + + if (Pcreate_agent(ph) != 0) { + (void) Prelease(ph, 0); + return (NULL); + } + + /* Verify agent LWP is actually stopped. 
*/ + errno = 0; + while (Pstate(ph) == PS_RUN) + (void) Pwait(ph, 0); + + if (Pstate(ph) != PS_STOP) { + Pdestroy_agent(ph); + (void) Prelease(ph, 0); + return (NULL); + } + + return (ph); +} + +/* + * Get data from the current prasmap_t and advance pr_asp to the next + * asmap in the pagedata. + */ +static uintptr_t +nextmapping(proc_map_t *pmp) +{ + prasmap_t *pap; + void *pdp; /* per-page data pointer */ + + pmp->pr_curr++; + if (pmp->pr_curr > pmp->pr_nmap) + return (NULL); + + pap = pmp->pr_asp; + + pmp->pr_addr = pap->pr_vaddr; + pmp->pr_size = pap->pr_npage * pap->pr_pagesize; + pmp->pr_cnt++; + + /* Advance the pr_asp pointer to the next asmap */ + pdp = pap + 1; + pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage); + + /* Skip to next 64-bit-aligned address to get the next prasmap_t. */ + pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7); + pmp->pr_asp = (prasmap_t *)pdp; + + return (pmp->pr_addr); +} + +/* + * Initialize the proc_map_t to access the first mapping of an address space. + */ +static void * +init_map(proc_map_t *pmp, pid_t pid) +{ + int fd; + int res; + struct stat st; + char pathbuf[MAXPATHLEN]; + + bzero(pmp, sizeof (proc_map_t)); + pmp->pr_nmap = -1; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc, + pid); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + return (NULL); + +redo: + errno = 0; + if (fstat(fd, &st) != 0) + return (NULL); + + if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) { + debug("cannot malloc() %ld bytes for pagedata", st.st_size); + return (NULL); + } + (void) bzero(pmp->pr_pghp, st.st_size); + + errno = 0; + if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) { + free(pmp->pr_pghp); + pmp->pr_pghp = NULL; + if (res > 0 || errno == E2BIG) { + goto redo; + } else { + debug("pid %ld cannot read pagedata\n", pid); + return (NULL); + } + } + + pmp->pr_nmap = pmp->pr_pghp->pr_nmap; + pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1); +done: + (void) close(fd); + return ((void *)nextmapping(pmp)); +} + +/* + * Attempt to page out a region of the given process's address space. May + * return nonzero if not all of the pages may are pageable, for any reason. + */ +static int +pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp) +{ + int res; + + errno = 0; + res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); + + /* + * EBUSY indicates none of the pages have backing store allocated, or + * some pages were locked. Don't care about this. + */ + if (res != 0 && errno == EBUSY) + res = 0; + + return (res); +} + +/* + * Compute the delta of the process RSS since the last call. If the + * psinfo cannot be obtained, no error is returned; its up to the caller to + * detect the process termination via other means. + */ +static int64_t +rss_delta(int64_t *old_rss, int psfd) +{ + int64_t d_rss = 0; + psinfo_t psinfo; + + if (pread(psfd, &psinfo, sizeof (psinfo_t), 0) == sizeof (psinfo_t)) { + d_rss = (int64_t)psinfo.pr_rssize - *old_rss; + *old_rss = (int64_t)psinfo.pr_rssize; + } + + return (d_rss); +} + + +/* + * Work through a process paging out mappings until the whole address space was + * examined or the excess is < 0. Return our estimate of the updated excess. 
+ */ +static int64_t +pageout_process(pid_t pid, int64_t excess) +{ + int psfd; + void *praddr; + proc_map_t cur; + struct ps_prochandle *ph = NULL; + int unpageable_mappings; + int64_t sum_d_rss, sum_att, d_rss; + int64_t old_rss; + psinfo_t psinfo; + int incr_rss_check = 0; + char pathbuf[MAXPATHLEN]; + + cur.pr_pghp = NULL; + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc, + pid); + if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0) + return (excess); + + if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo)) + goto done; + + old_rss = (int64_t)psinfo.pr_rssize; + + /* If unscannable, skip it. */ + if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) { + debug("pid: %ld system process, skipping %s\n", + pid, psinfo.pr_psargs); + goto done; + } + + /* If tiny RSS (16KB), skip it. */ + if (old_rss <= 16) { + debug("pid: %ld skipping, RSS %lldKB %s\n", + pid, old_rss, psinfo.pr_psargs); + goto done; + } + + /* Get segment residency information. */ + praddr = init_map(&cur, pid); + + /* Skip process if it has no mappings. */ + if (cur.pr_pghp == NULL) { + debug("%ld: pagedata unreadable; ignoring\n", pid); + goto done; + } + + debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n", + pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs); + + /* Take control of the process. */ + if ((ph = control_proc(pid)) == NULL) { + debug("%ld: cannot control\n", pid); + goto done; + } + + /* + * If the process RSS is not enough to erase the excess then no need + * to incrementally check the RSS delta after each pageout attempt. + * Instead check it after we've tried all of the segements. + */ + if (excess - old_rss < 0) + incr_rss_check = 1; + + /* + * Within the process's address space, attempt to page out mappings. + */ + sum_att = sum_d_rss = 0; + unpageable_mappings = 0; + while (excess > 0 && praddr != NULL && !shutting_down) { + /* Try to page out the mapping. */ + if (pageout_mapping(ph, &cur) < 0) { + debug("pid %ld: exited or unpageable\n", pid); + break; + } + + /* attempted is the size of the mapping */ + sum_att += (cur.pr_size / 1024); + + /* + * This processes RSS is potentially enough to clear the + * excess so check as we go along to see if we can stop + * paging out partway through the process. + */ + if (incr_rss_check) { + d_rss = rss_delta(&old_rss, psfd); + + /* + * If this pageout attempt was unsuccessful (the + * resident portion was not affected), then note it was + * unpageable. Mappings are unpageable when none of the + * pages paged out, such as when they are locked, or + * involved in asynchronous I/O. + */ + if (d_rss >= 0) { + unpageable_mappings++; + } else { + excess += d_rss; + sum_d_rss += d_rss; + sum_pageout += (-d_rss * 1024); + } + } + + praddr = (void *)nextmapping(&cur); + } + + if (!incr_rss_check) { + d_rss = rss_delta(&old_rss, psfd); + if (d_rss < 0) { + excess += d_rss; + sum_d_rss += d_rss; + sum_pageout += (-d_rss * 1024); + } + } + + debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n", + pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att, + (unsigned long long)sum_d_rss, (long long)excess); + +done: + /* If a process was grabbed, release it, destroying its agent. */ + if (ph != NULL) { + Pdestroy_agent(ph); + (void) Prelease(ph, 0); + } + + if (cur.pr_pghp != NULL) + free(cur.pr_pghp); + + (void) close(psfd); + + if (shutting_down) + return (0); + + return (excess); +} + +/* + * Get the zone's RSS data. 
+ */ +static uint64_t +get_mem_info(int age) +{ + uint64_t n = 400; /* Initial guess on number of zones */ + uint64_t got = n; + int i; + zsd_vmusage64_t *buf = NULL; + size_t size = sizeof (zsd_vmusage64_t) * n; + uint64_t zone_rss = 0; + + /* Preallocate to try to get all zone mem data with only 1 syscall. */ + if ((buf = (zsd_vmusage64_t *)malloc(size)) == NULL) { + debug("get_mem_info malloc failed\n"); + return (0); + } + +again: + if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_ALL_ZONES, + age, (uintptr_t)buf, (uintptr_t)&n) != 0) { + debug("vmusage failed\n"); + (void) sleep_shutdown(1); + if (shutting_down) { + free(buf); + return (0); + } + goto again; + } + + if (n > got) { + size_t size = sizeof (zsd_vmusage64_t) * n; + + if (buf != NULL) + free(buf); + buf = (zsd_vmusage64_t *)malloc(size); + if (buf == NULL) { + debug("get_mem_info malloc failed\n"); + return (0); + } + got = n; + goto again; + } + + for (i = 0; i < n; i++) { + if (buf[i].vmu_id == zid) { + zone_rss = buf[i].vmu_rss_all / 1024; + break; + } + } + + free(buf); + return (zone_rss); +} + +/* + * Needed to read the zones physical-memory-cap rctl. + */ +static struct ps_prochandle * +grab_zone_proc() +{ + DIR *dirp; + struct dirent *dentp; + struct ps_prochandle *ph = NULL; + int tmp; + + if ((dirp = opendir(zoneproc)) == NULL) + return (NULL); + + while (!shutting_down && (dentp = readdir(dirp))) { + int pid; + + if (strcmp(".", dentp->d_name) == 0 || + strcmp("..", dentp->d_name) == 0) + continue; + + pid = atoi(dentp->d_name); + /* attempt to grab process */ + if ((ph = Pgrab(pid, 0, &tmp)) != NULL) { + if (Psetflags(ph, PR_RLC) == 0) { + if (Pcreate_agent(ph) == 0) { + (void) closedir(dirp); + return (ph); + } + } + Prelease(ph, 0); + } + } + + (void) closedir(dirp); + return (NULL); +} + +static uint64_t +get_zone_cap() +{ + rctlblk_t *rblk; + uint64_t mcap; + struct ps_prochandle *ph; + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) + return (UINT64_MAX); + + if ((ph = grab_zone_proc()) == NULL) { + free(rblk); + return (UINT64_MAX); + } + + if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk, + RCTL_FIRST)) { + Pdestroy_agent(ph); + Prelease(ph, 0); + free(rblk); + return (UINT64_MAX); + } + + Pdestroy_agent(ph); + Prelease(ph, 0); + + mcap = rctlblk_get_value(rblk); + free(rblk); + return (mcap); +} + +/* + * check_suspend is invoked at the beginning of every pass through the process + * list or after we've paged out enough so that we think the excess is under + * the cap. The purpose is to periodically check the zone's rss and return + * the excess when the zone is over the cap. The rest of the time this + * function will sleep, periodically waking up to check the current rss. + * + * The age parameter is used to tell us how old the cached rss data can be. + * When first starting up, the cached data can be older, but after we + * start paging out, we want current data. + * + * Depending on the percentage of penetration of the zone's rss into the + * cap we sleep for longer or shorter amounts and accept older cached + * vmusage data. This reduces the impact of this work on the system, which + * is important considering that each zone will be monitoring its rss. + */ +static int64_t +check_suspend(int age) +{ + static hrtime_t last_cap_read = 0; + static uint64_t zone_rss_cap; /* RSS cap(KB) */ + static uint64_t addon; + static uint64_t lo_thresh; /* Thresholds for how long to sleep */ + static uint64_t hi_thresh; /* when under the cap (80% & 90%). 
*/ + + /* Wait a second to give the async pageout a chance to catch up. */ + (void) sleep_shutdown(1); + + while (!shutting_down) { + int64_t new_excess; + int sleep_time; + hrtime_t now; + struct stat st; + uint64_t zone_rss; /* total RSS(KB) */ + + /* + * Check if the debug log files exists and enable or disable + * debug. + */ + if (debug_log_fp == NULL) { + if (stat(debug_log, &st) == 0) + debug_log_fp = fopen(debug_log, "w"); + } else { + if (stat(debug_log, &st) == -1) { + (void) fclose(debug_log_fp); + debug_log_fp = NULL; + } + } + + /* + * If the CAP_REFRESH interval has passed, re-get the current + * cap in case it has been dynamically updated. + */ + now = gethrtime(); + if (now - last_cap_read > CAP_REFRESH) { + uint64_t mcap; + + last_cap_read = now; + + mcap = get_zone_cap(); + if (mcap != 0 && mcap != UINT64_MAX) + zone_rss_cap = ROUNDUP(mcap, 1024) / 1024; + else + zone_rss_cap = UINT64_MAX; + + lo_thresh = (uint64_t)(zone_rss_cap * .8); + hi_thresh = (uint64_t)(zone_rss_cap * .9); + addon = (uint64_t)(zone_rss_cap * 0.05); + + debug("current cap %lluKB lo %lluKB hi %lluKB\n", + zone_rss_cap, lo_thresh, hi_thresh); + } + + /* No cap, nothing to do. */ + if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) { + debug("no cap, sleep 120 seconds\n"); + (void) sleep_shutdown(120); + continue; + } + + /* + * If we did some paging out since our last invocation then + * update the kstat so we can track how much was paged out. + */ + if (sum_pageout != 0) { + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, + &sum_pageout, 0); + sum_pageout = 0; + } + + zone_rss = get_mem_info(age); + + /* calculate excess */ + new_excess = zone_rss - zone_rss_cap; + + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, new_excess); + + if (new_excess > 0) { + uint64_t n = 1; + + /* Increment "nover" kstat. */ + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0); + + /* + * Once we go over the cap, then we want to page out a + * little extra instead of stopping right at the cap. + * To do this we add 5% to the excess so that + * pageout_proces will work a little longer before + * stopping. + */ + return ((int64_t)(new_excess + addon)); + } + + /* + * At this point we are under the cap. + * + * Scale the amount of time we sleep before rechecking the + * zone's memory usage. Also, scale the accpetable age of + * cached results from vm_getusage. We do this based on the + * penetration into the capped limit. + */ + if (zone_rss <= lo_thresh) { + sleep_time = 120; + age = 15; + } else if (zone_rss <= hi_thresh) { + sleep_time = 60; + age = 10; + } else { + sleep_time = 30; + age = 5; + } + + debug("sleep %d seconds\n", sleep_time); + (void) sleep_shutdown(sleep_time); + } + + return (0); +} + +/* + * Thread that checks zone's memory usage and when over the cap, goes through + * the zone's process list trying to pageout processes to get under the cap. + */ +static void +mcap_zone() +{ + DIR *pdir = NULL; + int age = 10; /* initial cached vmusage can be 10 secs. old */ + int64_t excess; + + debug("thread startup\n"); + + /* + * When first starting it is likely lots of other zones are starting + * too because the system is booting. Since we just started the zone + * we're not worried about being over the cap right away, so we let + * things settle a bit and tolerate some older data here to minimize + * the load on the system. + */ + (void) sleep_shutdown(15); /* wait 15 secs. 
so the zone can get going */ + + /* Wait until zone's /proc is mounted */ + while (!shutting_down) { + struct stat st; + + if (stat(zoneproc, &st) == 0 && + strcmp(st.st_fstype, "proc") == 0) + break; + sleep_shutdown(5); + } + + /* Open zone's /proc and walk entries. */ + while (!shutting_down) { + if ((pdir = opendir(zoneproc)) != NULL) + break; + sleep_shutdown(5); + } + + while (!shutting_down) { + struct dirent *dirent; + + /* Wait until we've gone over the cap. */ + excess = check_suspend(age); + + debug("starting to scan, excess %lldk\n", (long long)excess); + + /* + * After the initial startup, we want the age of the cached + * vmusage to be only 1 second old since we are checking + * the current state after we've gone over the cap and have + * paged out some processes. + */ + age = 1; + + while (!shutting_down && (dirent = readdir(pdir)) != NULL) { + pid_t pid; + + if (strcmp(".", dirent->d_name) == 0 || + strcmp("..", dirent->d_name) == 0) + continue; + + pid = atoi(dirent->d_name); + if (pid == 0 || pid == 1) + continue; + + excess = pageout_process(pid, excess); + + if (excess <= 0) { + debug("done scanning; excess %lld\n", + (long long)excess); + /* Double check the current excess */ + excess = check_suspend(1); + } + } + + debug("process pass done; excess %lld\n", (long long)excess); + rewinddir(pdir); + } + + (void) closedir(pdir); + debug("thread shutdown\n"); +} + +void +create_mcap_thread(zlog_t *zlogp, zoneid_t id) +{ + int res; + + shutting_down = 0; + zid = id; + (void) getzonenamebyid(zid, zonename, sizeof (zonename)); + + if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0) + zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename); + (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc", zonepath); + (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log", + zonepath); + + res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL, + &mcap_tid); + if (res != 0) { + zerror(zlogp, B_FALSE, "error %d creating memory cap thread", + res); + mcap_tid = 0; + } +} + +void +destroy_mcap_thread() +{ + if (mcap_tid != 0) { + shutting_down = 1; + (void) cond_signal(&shutdown_cv); + (void) thr_join(mcap_tid, NULL, NULL); + mcap_tid = 0; + } +} diff --git a/usr/src/cmd/zoneadmd/zcons.c b/usr/src/cmd/zoneadmd/zcons.c index 8653b954fb..5bcc1b90e8 100644 --- a/usr/src/cmd/zoneadmd/zcons.c +++ b/usr/src/cmd/zoneadmd/zcons.c @@ -436,7 +436,7 @@ devlinks: } else if (errno != ENXIO) { break; } - sleep(1); + (void) sleep(1); } if (rv != 0) zerror(zlogp, B_TRUE, "ERROR: error while acquiring slave " diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c index 40cda3a665..45a16668d6 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.c +++ b/usr/src/cmd/zoneadmd/zoneadmd.c @@ -1051,6 +1051,9 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug) if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0) goto bad; + /* Startup a thread to perform memory capping for the zone. */ + create_mcap_thread(zlogp, zone_id); + return (0); bad: @@ -1073,6 +1076,9 @@ zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate, if (brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0) return (-1); + /* Shutting down, stop the memcap thread */ + destroy_mcap_thread(); + if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) { if (!bringup_failure_recovery) zerror(zlogp, B_FALSE, "unable to destroy zone"); @@ -1712,11 +1718,20 @@ top: * state. 
*/ if (zstate > ZONE_STATE_INSTALLED) { + static zoneid_t zid; + zerror(zlogp, B_FALSE, "zone '%s': WARNING: zone is in state '%s', but " "zoneadmd does not appear to be available; " "restarted zoneadmd to recover.", zone_name, zone_state_str(zstate)); + + /* + * Startup a thread to perform memory capping for the + * zone. + */ + if ((zid = getzoneidbyname(zone_name)) != -1) + create_mcap_thread(zlogp, zid); } (void) fdetach(zone_door_path); diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h index 8ca177fdb1..49aa9e8cf6 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.h +++ b/usr/src/cmd/zoneadmd/zoneadmd.h @@ -152,6 +152,12 @@ extern int init_console(zlog_t *); extern void serve_console(zlog_t *); /* + * Memory capping thread creation. + */ +extern void create_mcap_thread(zlog_t *, zoneid_t); +extern void destroy_mcap_thread(); + +/* * Contract handling. */ extern int init_template(void); |
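The core mechanism the new mcap.c relies on is memcntl(2) applied to another process through libproc's agent LWP. Below is a minimal sketch of that call sequence; in the committed code the address/size pair comes from walking /proc/&lt;pid&gt;/pagedata (init_map()/nextmapping()), and control_proc() additionally waits for the agent LWP to stop. The helper name here is illustrative, not part of the commit.

```c
#include <sys/types.h>
#include <sys/mman.h>
#include <libproc.h>
#include <errno.h>

/*
 * Sketch: ask the kernel to invalidate (page out) one range of a victim
 * process's address space, the same call pageout_mapping() makes.
 */
static int
pageout_range(pid_t pid, caddr_t addr, size_t size)
{
	struct ps_prochandle *ph;
	int gret, rv;

	if ((ph = Pgrab(pid, 0, &gret)) == NULL)
		return (-1);
	if (Pcreate_agent(ph) != 0) {
		(void) Prelease(ph, 0);
		return (-1);
	}

	/* MS_INVALIDATE requests that the resident pages be freed. */
	rv = pr_memcntl(ph, addr, size, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);

	/* EBUSY means locked pages or no backing store; not an error here. */
	if (rv != 0 && errno == EBUSY)
		rv = 0;

	Pdestroy_agent(ph);
	(void) Prelease(ph, 0);
	return (rv);
}
```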
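get_mem_info() in the diff calls SYS_rusagesys directly with a hand-padded zsd_vmusage64_t because 32-bit zoneadmd needs the 64-bit counters aligned. A rough equivalent, assuming the public getvmusage(2) wrapper and a suitably privileged caller in the global zone; the helper name is hypothetical:

```c
#include <sys/vm_usage.h>
#include <sys/types.h>
#include <stdlib.h>
#include <errno.h>
#include <zone.h>

/* Return the zone's total RSS in kilobytes, or 0 on failure. */
static uint64_t
zone_rss_kb(zoneid_t zid, time_t age)
{
	size_t nres = 8;		/* initial guess at result count */
	vmusage_t *buf = NULL;
	uint64_t rss = 0;
	size_t i;

	for (;;) {
		size_t want = nres;

		free(buf);
		if ((buf = calloc(want, sizeof (vmusage_t))) == NULL)
			return (0);
		if (getvmusage(VMUSAGE_ALL_ZONES, age, buf, &nres) == 0)
			break;
		/* EOVERFLOW: nres now holds the count actually needed. */
		if (errno != EOVERFLOW || nres <= want) {
			free(buf);
			return (0);
		}
	}

	for (i = 0; i < nres; i++) {
		if (buf[i].vmu_id == zid) {
			rss = buf[i].vmu_rss_all / 1024;
			break;
		}
	}
	free(buf);
	return (rss);
}
```

The age argument plays the same role as in check_suspend(): it bounds how stale the kernel's cached vm_usage data may be, which is how the thread trades accuracy for system load when the zone is well under its cap.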
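get_zone_cap() has to read zone.max-physical-memory by grabbing a process inside the zone and calling pr_getrctl() through the agent LWP, because zoneadmd itself runs in the global zone. For comparison, a process running inside the capped zone could read the same rctl directly; a sketch assuming getrctl(2), with a hypothetical helper name:

```c
#include <rctl.h>
#include <stdlib.h>
#include <stdint.h>

/* Return this zone's physical memory cap in bytes; UINT64_MAX if uncapped. */
static uint64_t
my_zone_mem_cap(void)
{
	rctlblk_t *rblk;
	uint64_t cap = UINT64_MAX;

	if ((rblk = malloc(rctlblk_size())) == NULL)
		return (cap);
	if (getrctl("zone.max-physical-memory", NULL, rblk, RCTL_FIRST) == 0)
		cap = rctlblk_get_value(rblk);
	free(rblk);
	return (cap);
}
```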
