| | | |
|---|---|---|
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2011-05-05 07:46:18 -0700 |
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2011-05-05 07:46:18 -0700 |
| commit | 9230f57fd5dc4b0898a592a83cfa3f7b18b26718 | |
| tree | aaa4b6796395f90c30238d0af42360549edd2fd8 /usr/src/cmd | |
| parent | f3861a7cccef3296d09052f75b91c493572fd94a | |
| download | illumos-joyent-9230f57fd5dc4b0898a592a83cfa3f7b18b26718.tar.gz | |
OS-11 rcapd behaves poorly when under extreme load
Diffstat (limited to 'usr/src/cmd')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/cmd/rcap/common/utils.c | 75 |
| -rw-r--r-- | usr/src/cmd/rcap/common/utils.h | 2 |
| -rw-r--r-- | usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c | 61 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/Makefile | 5 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 882 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zcons.c | 2 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.c | 15 |
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.h | 6 |
8 files changed, 927 insertions, 121 deletions
diff --git a/usr/src/cmd/rcap/common/utils.c b/usr/src/cmd/rcap/common/utils.c index 799fdcef23..dd511c7c50 100644 --- a/usr/src/cmd/rcap/common/utils.c +++ b/usr/src/cmd/rcap/common/utils.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -257,77 +258,3 @@ xatoi(char *p) return (i); } } - -/* - * get_running_zones() calls zone_list(2) to find out how many zones are - * running. It then calls zone_list(2) again to fetch the list of running - * zones (stored in *zents). - */ -int -get_running_zones(uint_t *nzents, zone_entry_t **zents) -{ - zoneid_t *zids; - uint_t nzents_saved; - int i; - zone_entry_t *zentp; - zone_state_t zstate; - - *zents = NULL; - if (zone_list(NULL, nzents) != 0) { - warn(gettext("could not get zoneid list\n")); - return (E_ERROR); - } - -again: - if (*nzents == 0) - return (E_SUCCESS); - - if ((zids = (zoneid_t *)calloc(*nzents, sizeof (zoneid_t))) == NULL) { - warn(gettext("out of memory: zones will not be capped\n")); - return (E_ERROR); - } - - nzents_saved = *nzents; - - if (zone_list(zids, nzents) != 0) { - warn(gettext("could not get zone list\n")); - free(zids); - return (E_ERROR); - } - if (*nzents != nzents_saved) { - /* list changed, try again */ - free(zids); - goto again; - } - - *zents = calloc(*nzents, sizeof (zone_entry_t)); - if (*zents == NULL) { - warn(gettext("out of memory: zones will not be capped\n")); - free(zids); - return (E_ERROR); - } - - zentp = *zents; - for (i = 0; i < *nzents; i++) { - char name[ZONENAME_MAX]; - - if (getzonenamebyid(zids[i], name, sizeof (name)) < 0) { - warn(gettext("could not get name for " - "zoneid %d\n"), zids[i]); - continue; - } - - (void) strlcpy(zentp->zname, name, sizeof (zentp->zname)); - zentp->zid = zids[i]; - if (zone_get_state(name, &zstate) != Z_OK || - zstate != ZONE_STATE_RUNNING) - continue; - - - zentp++; - } - *nzents = zentp - *zents; - - free(zids); - return (E_SUCCESS); -} diff --git a/usr/src/cmd/rcap/common/utils.h b/usr/src/cmd/rcap/common/utils.h index 7196cfb4ce..cf2e17c080 100644 --- a/usr/src/cmd/rcap/common/utils.h +++ b/usr/src/cmd/rcap/common/utils.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #ifndef _UTILS_H @@ -98,7 +99,6 @@ extern void vdprintfe(int, const char *, va_list); extern void dprintfe(int, char *, ...); extern void hrt2ts(hrtime_t, timestruc_t *); extern int xatoi(char *); -extern int get_running_zones(uint_t *, zone_entry_t **); #ifdef __cplusplus } diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c index 798ed97707..88403dda37 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c @@ -121,61 +121,36 @@ get_zone_cap(zoneid_t zid) return (mcap); } -static void -update_zone(zone_entry_t *zent, void *walk_data) +/* + * For zones, rcapd only caps the global zone, since each non-global zone + * caps itself. 
+ */ +/* ARGSUSED */ +void +lcollection_update_zone(lcollection_update_type_t ut, + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) { - void(*update_notification_cb)(char *, char *, int, uint64_t, int) = - (void(*)(char *, char *, int, uint64_t, int))walk_data; int changes; int64_t max_rss; uint64_t mcap; lcollection_t *lcol; rcid_t colid; - mcap = get_zone_cap(zent->zid); - if (mcap != 0 && mcap != UINT64_MAX) + mcap = get_zone_cap(GLOBAL_ZONEID); + if (mcap != 0 && mcap != UINT64_MAX) { max_rss = ROUNDUP(mcap, 1024) / 1024; - else - max_rss = 0; - - if (zent->zid == GLOBAL_ZONEID) { - if (max_rss > 0) - gz_capped = B_TRUE; - else - gz_capped = B_FALSE; + gz_capped = B_TRUE; + } else { + max_rss = UINT64_MAX / 1024; + gz_capped = B_FALSE; } - colid.rcid_type = RCIDT_ZONE; - colid.rcid_val = zent->zid; + colid.rcid_val = GLOBAL_ZONEID; - lcol = lcollection_insert_update(&colid, max_rss, zent->zname, + lcol = lcollection_insert_update(&colid, max_rss, GLOBAL_ZONENAME, &changes); if (update_notification_cb != NULL) - update_notification_cb("zone", zent->zname, changes, max_rss, - (lcol != NULL) ? lcol->lcol_mark : 0); -} - - -/* ARGSUSED */ -void -lcollection_update_zone(lcollection_update_type_t ut, - void(*update_notification_cb)(char *, char *, int, uint64_t, int)) -{ - int i; - uint_t nzents; - zone_entry_t *zents; - - /* - * Enumerate running zones. - */ - if (get_running_zones(&nzents, &zents) != 0) - return; - - for (i = 0; i < nzents; i++) { - update_zone(&zents[i], (void *)update_notification_cb); - - } - - free(zents); + update_notification_cb("zone", GLOBAL_ZONENAME, changes, + max_rss, (lcol != NULL) ? lcol->lcol_mark : 0); } diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile index cb03ef459a..f8810c46ef 100644 --- a/usr/src/cmd/zoneadmd/Makefile +++ b/usr/src/cmd/zoneadmd/Makefile @@ -23,6 +23,7 @@ # # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2011, Joyent, Inc. All rights reserved. # PROG= zoneadmd @@ -31,7 +32,7 @@ include ../Makefile.cmd ROOTCMDDIR= $(ROOTLIB)/zones -OBJS= zoneadmd.o zcons.o vplat.o +OBJS= zoneadmd.o zcons.o vplat.o mcap.o SRCS = $(OBJS:.o=.c) POFILE=zoneadmd_all.po POFILES= $(OBJS:%.o=%.po) @@ -39,7 +40,7 @@ POFILES= $(OBJS:%.o=%.po) CFLAGS += $(CCVERBOSE) LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \ - -linetutil + -linetutil -lproc XGETFLAGS += -a -x zoneadmd.xcl .KEEP_STATE: diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c new file mode 100644 index 0000000000..4ae4bd0ecd --- /dev/null +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -0,0 +1,882 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +/* + * This file implements the code which runs a thread inside zoneadmd to cap + * the associated zone's physical memory. A thread to do this is started + * when the zone boots and is halted when the zone shuts down. + * + * Because of the way that the VM system is currently implemented, there is no + * way to go from the bottom up (page to process to zone). Thus, there is no + * obvious way to hook an rctl into the kernel's paging code to enforce a hard + * memory cap. Instead, we implement a soft physical memory cap which looks + * at the zone's overall rss and once it is over the cap, works from the top + * down (zone to process to page), looking at zone processes, to determine + * what to try to pageout to get the zone under its memory cap. + * + * The code uses the vm_getusage syscall to determine the zone's rss and + * checks that against the zone's zone.max-physical-memory rctl. Once the + * zone goes over its cap, then this thread will work through the zone's + * /proc process list, Pgrab-bing each process and stepping through the + * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...) + * to pageout pages, until the zone is again under its cap. + * + * Although zone memory capping is implemented as a soft cap by this user-level + * thread, the interfaces around memory caps that are exposed to the user are + * the standard ones; an rctl and kstats. This thread uses the rctl value + * to obtain the cap and works with the zone kernel code to update the kstats. + * If the implementation ever moves into the kernel, these exposed interfaces + * do not need to change. + * + * The thread adaptively sleeps, periodically checking the state of the + * zone. As the zone's rss gets closer to the cap, the thread will wake up + * more often to check the zone's status. Once the zone is over the cap, + * the thread will work to pageout until the zone is under the cap, as shown + * by updated vm_usage data. + * + * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be + * used to examine a processes mapped segments while we are trying to pageout. + * The observed xmap segement size data is frequently smaller than the + * pagedata segement size data, so it is less effective in practice. Thus we + * use pagedata to determine the size of each segment. + * + * The pagedata page maps (at least on x86) are not useful. Those flags + * are set by hrm_setbits() and on x86 that code path is only executed by + * segvn_pagelock -> hat_setstat -> hrm_setbits + * segvn_softunlock -^ + * On SPARC there is an additional code path which may make this data + * useful (sfmmu_ttesync), but since it is not generic, we ignore the page + * maps and only use the segement info from pagedata. If we ever fix this + * issue, then we could generalize this mcap code to do more with the data on + * active pages. + * + * For debugging, touch the file {zonepath}/mcap_debug.log. This will + * cause the thread to start logging its actions into that file (it may take + * a minute or two if the thread is currently sleeping). Removing that + * file will cause logging to stop. 
+ */ + +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libproc.h> +#include <limits.h> +#include <procfs.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <time.h> +#include <unistd.h> +#include <sys/priocntl.h> +#include <dirent.h> +#include <zone.h> +#include <libzonecfg.h> +#include <thread.h> +#include <values.h> +#include <sys/vm_usage.h> +#include <sys/resource.h> +#include <sys/debug.h> +#include <synch.h> +#include "zoneadmd.h" + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +#define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */ + +static char zonename[ZONENAME_MAX]; +static char zonepath[MAXPATHLEN]; +static char zoneproc[MAXPATHLEN]; +static char debug_log[MAXPATHLEN]; +static zoneid_t zid; +static mutex_t shutdown_mx; +static cond_t shutdown_cv; +static int shutting_down = 0; +static thread_t mcap_tid; +static FILE *debug_log_fp = NULL; +static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */ + +/* + * Structure to hold current state about a process address space that we're + * working on. + */ +typedef struct { + int pr_curr; /* the # of the mapping we're working on */ + int pr_nmap; /* number of mappings in address space */ + int pr_cnt; /* number of mappings processed */ + + prpageheader_t *pr_pghp; /* process's complete pagedata */ + prasmap_t *pr_asp; /* current address space pointer */ + + uintptr_t pr_addr; /* base of mapping */ + uint64_t pr_size; /* size of mapping */ +} proc_map_t; + +typedef struct zsd_vmusage64 { + id_t vmu_zoneid; + uint_t vmu_type; + id_t vmu_id; + /* + * An amd64 kernel will align the following uint64_t members, but a + * 32bit i386 process will not without help. + */ + int vmu_align_next_members_on_8_bytes; + uint64_t vmu_rss_all; + uint64_t vmu_rss_private; + uint64_t vmu_rss_shared; + uint64_t vmu_swap_all; + uint64_t vmu_swap_private; + uint64_t vmu_swap_shared; +} zsd_vmusage64_t; + +/* + * Output a debug log message. + */ +/*PRINTFLIKE1*/ +static void +debug(char *fmt, ...) +{ + va_list ap; + + if (debug_log_fp == NULL) + return; + + va_start(ap, fmt); + (void) vfprintf(debug_log_fp, fmt, ap); + va_end(ap); + (void) fflush(debug_log_fp); +} + +/* + * Like sleep(3C) but can be interupted by cond_signal which is posted when + * we're shutting down the mcap thread. + */ +static void +sleep_shutdown(int secs) +{ + timestruc_t to; + + to.tv_sec = secs; + to.tv_nsec = 0; + + (void) mutex_lock(&shutdown_mx); + if (!shutting_down) + (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to); + (void) mutex_unlock(&shutdown_mx); +} + +static boolean_t +proc_issystem(pid_t pid) +{ + char pc_clname[PC_CLNMSZ]; + + if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, + PC_KY_NULL) != -1) + return (strcmp(pc_clname, "SYS") == 0); + + return (B_TRUE); +} + +static struct ps_prochandle * +control_proc(pid_t pid) +{ + int res; + struct ps_prochandle *ph; + + /* Take control of the process. */ + if ((ph = Pgrab(pid, 0, &res)) == NULL) + return (NULL); + + if (Pcreate_agent(ph) != 0) { + (void) Prelease(ph, 0); + return (NULL); + } + + /* Verify agent LWP is actually stopped. 
*/ + errno = 0; + while (Pstate(ph) == PS_RUN) + (void) Pwait(ph, 0); + + if (Pstate(ph) != PS_STOP) { + Pdestroy_agent(ph); + (void) Prelease(ph, 0); + return (NULL); + } + + return (ph); +} + +/* + * Get data from the current prasmap_t and advance pr_asp to the next + * asmap in the pagedata. + */ +static uintptr_t +nextmapping(proc_map_t *pmp) +{ + prasmap_t *pap; + void *pdp; /* per-page data pointer */ + + pmp->pr_curr++; + if (pmp->pr_curr > pmp->pr_nmap) + return (NULL); + + pap = pmp->pr_asp; + + pmp->pr_addr = pap->pr_vaddr; + pmp->pr_size = pap->pr_npage * pap->pr_pagesize; + pmp->pr_cnt++; + + /* Advance the pr_asp pointer to the next asmap */ + pdp = pap + 1; + pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage); + + /* Skip to next 64-bit-aligned address to get the next prasmap_t. */ + pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7); + pmp->pr_asp = (prasmap_t *)pdp; + + return (pmp->pr_addr); +} + +/* + * Initialize the proc_map_t to access the first mapping of an address space. + */ +static void * +init_map(proc_map_t *pmp, pid_t pid) +{ + int fd; + int res; + struct stat st; + char pathbuf[MAXPATHLEN]; + + bzero(pmp, sizeof (proc_map_t)); + pmp->pr_nmap = -1; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc, + pid); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + return (NULL); + +redo: + errno = 0; + if (fstat(fd, &st) != 0) + return (NULL); + + if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) { + debug("cannot malloc() %ld bytes for pagedata", st.st_size); + return (NULL); + } + (void) bzero(pmp->pr_pghp, st.st_size); + + errno = 0; + if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) { + free(pmp->pr_pghp); + pmp->pr_pghp = NULL; + if (res > 0 || errno == E2BIG) { + goto redo; + } else { + debug("pid %ld cannot read pagedata\n", pid); + return (NULL); + } + } + + pmp->pr_nmap = pmp->pr_pghp->pr_nmap; + pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1); +done: + (void) close(fd); + return ((void *)nextmapping(pmp)); +} + +/* + * Attempt to page out a region of the given process's address space. May + * return nonzero if not all of the pages may are pageable, for any reason. + */ +static int +pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp) +{ + int res; + + errno = 0; + res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); + + /* + * EBUSY indicates none of the pages have backing store allocated, or + * some pages were locked. Don't care about this. + */ + if (res != 0 && errno == EBUSY) + res = 0; + + return (res); +} + +/* + * Compute the delta of the process RSS since the last call. If the + * psinfo cannot be obtained, no error is returned; its up to the caller to + * detect the process termination via other means. + */ +static int64_t +rss_delta(int64_t *old_rss, int psfd) +{ + int64_t d_rss = 0; + psinfo_t psinfo; + + if (pread(psfd, &psinfo, sizeof (psinfo_t), 0) == sizeof (psinfo_t)) { + d_rss = (int64_t)psinfo.pr_rssize - *old_rss; + *old_rss = (int64_t)psinfo.pr_rssize; + } + + return (d_rss); +} + + +/* + * Work through a process paging out mappings until the whole address space was + * examined or the excess is < 0. Return our estimate of the updated excess. 
+ */ +static int64_t +pageout_process(pid_t pid, int64_t excess) +{ + int psfd; + void *praddr; + proc_map_t cur; + struct ps_prochandle *ph = NULL; + int unpageable_mappings; + int64_t sum_d_rss, sum_att, d_rss; + int64_t old_rss; + psinfo_t psinfo; + int incr_rss_check = 0; + char pathbuf[MAXPATHLEN]; + + cur.pr_pghp = NULL; + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc, + pid); + if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0) + return (excess); + + if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo)) + goto done; + + old_rss = (int64_t)psinfo.pr_rssize; + + /* If unscannable, skip it. */ + if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) { + debug("pid: %ld system process, skipping %s\n", + pid, psinfo.pr_psargs); + goto done; + } + + /* If tiny RSS (16KB), skip it. */ + if (old_rss <= 16) { + debug("pid: %ld skipping, RSS %lldKB %s\n", + pid, old_rss, psinfo.pr_psargs); + goto done; + } + + /* Get segment residency information. */ + praddr = init_map(&cur, pid); + + /* Skip process if it has no mappings. */ + if (cur.pr_pghp == NULL) { + debug("%ld: pagedata unreadable; ignoring\n", pid); + goto done; + } + + debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n", + pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs); + + /* Take control of the process. */ + if ((ph = control_proc(pid)) == NULL) { + debug("%ld: cannot control\n", pid); + goto done; + } + + /* + * If the process RSS is not enough to erase the excess then no need + * to incrementally check the RSS delta after each pageout attempt. + * Instead check it after we've tried all of the segements. + */ + if (excess - old_rss < 0) + incr_rss_check = 1; + + /* + * Within the process's address space, attempt to page out mappings. + */ + sum_att = sum_d_rss = 0; + unpageable_mappings = 0; + while (excess > 0 && praddr != NULL && !shutting_down) { + /* Try to page out the mapping. */ + if (pageout_mapping(ph, &cur) < 0) { + debug("pid %ld: exited or unpageable\n", pid); + break; + } + + /* attempted is the size of the mapping */ + sum_att += (cur.pr_size / 1024); + + /* + * This processes RSS is potentially enough to clear the + * excess so check as we go along to see if we can stop + * paging out partway through the process. + */ + if (incr_rss_check) { + d_rss = rss_delta(&old_rss, psfd); + + /* + * If this pageout attempt was unsuccessful (the + * resident portion was not affected), then note it was + * unpageable. Mappings are unpageable when none of the + * pages paged out, such as when they are locked, or + * involved in asynchronous I/O. + */ + if (d_rss >= 0) { + unpageable_mappings++; + } else { + excess += d_rss; + sum_d_rss += d_rss; + sum_pageout += (-d_rss * 1024); + } + } + + praddr = (void *)nextmapping(&cur); + } + + if (!incr_rss_check) { + d_rss = rss_delta(&old_rss, psfd); + if (d_rss < 0) { + excess += d_rss; + sum_d_rss += d_rss; + sum_pageout += (-d_rss * 1024); + } + } + + debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n", + pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att, + (unsigned long long)sum_d_rss, (long long)excess); + +done: + /* If a process was grabbed, release it, destroying its agent. */ + if (ph != NULL) { + Pdestroy_agent(ph); + (void) Prelease(ph, 0); + } + + if (cur.pr_pghp != NULL) + free(cur.pr_pghp); + + (void) close(psfd); + + if (shutting_down) + return (0); + + return (excess); +} + +/* + * Get the zone's RSS data. 
+ */ +static uint64_t +get_mem_info(int age) +{ + uint64_t n = 400; /* Initial guess on number of zones */ + uint64_t got = n; + int i; + zsd_vmusage64_t *buf = NULL; + size_t size = sizeof (zsd_vmusage64_t) * n; + uint64_t zone_rss = 0; + + /* Preallocate to try to get all zone mem data with only 1 syscall. */ + if ((buf = (zsd_vmusage64_t *)malloc(size)) == NULL) { + debug("get_mem_info malloc failed\n"); + return (0); + } + +again: + if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_ALL_ZONES, + age, (uintptr_t)buf, (uintptr_t)&n) != 0) { + debug("vmusage failed\n"); + (void) sleep_shutdown(1); + if (shutting_down) { + free(buf); + return (0); + } + goto again; + } + + if (n > got) { + size_t size = sizeof (zsd_vmusage64_t) * n; + + if (buf != NULL) + free(buf); + buf = (zsd_vmusage64_t *)malloc(size); + if (buf == NULL) { + debug("get_mem_info malloc failed\n"); + return (0); + } + got = n; + goto again; + } + + for (i = 0; i < n; i++) { + if (buf[i].vmu_id == zid) { + zone_rss = buf[i].vmu_rss_all / 1024; + break; + } + } + + free(buf); + return (zone_rss); +} + +/* + * Needed to read the zones physical-memory-cap rctl. + */ +static struct ps_prochandle * +grab_zone_proc() +{ + DIR *dirp; + struct dirent *dentp; + struct ps_prochandle *ph = NULL; + int tmp; + + if ((dirp = opendir(zoneproc)) == NULL) + return (NULL); + + while (!shutting_down && (dentp = readdir(dirp))) { + int pid; + + if (strcmp(".", dentp->d_name) == 0 || + strcmp("..", dentp->d_name) == 0) + continue; + + pid = atoi(dentp->d_name); + /* attempt to grab process */ + if ((ph = Pgrab(pid, 0, &tmp)) != NULL) { + if (Psetflags(ph, PR_RLC) == 0) { + if (Pcreate_agent(ph) == 0) { + (void) closedir(dirp); + return (ph); + } + } + Prelease(ph, 0); + } + } + + (void) closedir(dirp); + return (NULL); +} + +static uint64_t +get_zone_cap() +{ + rctlblk_t *rblk; + uint64_t mcap; + struct ps_prochandle *ph; + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) + return (UINT64_MAX); + + if ((ph = grab_zone_proc()) == NULL) { + free(rblk); + return (UINT64_MAX); + } + + if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk, + RCTL_FIRST)) { + Pdestroy_agent(ph); + Prelease(ph, 0); + free(rblk); + return (UINT64_MAX); + } + + Pdestroy_agent(ph); + Prelease(ph, 0); + + mcap = rctlblk_get_value(rblk); + free(rblk); + return (mcap); +} + +/* + * check_suspend is invoked at the beginning of every pass through the process + * list or after we've paged out enough so that we think the excess is under + * the cap. The purpose is to periodically check the zone's rss and return + * the excess when the zone is over the cap. The rest of the time this + * function will sleep, periodically waking up to check the current rss. + * + * The age parameter is used to tell us how old the cached rss data can be. + * When first starting up, the cached data can be older, but after we + * start paging out, we want current data. + * + * Depending on the percentage of penetration of the zone's rss into the + * cap we sleep for longer or shorter amounts and accept older cached + * vmusage data. This reduces the impact of this work on the system, which + * is important considering that each zone will be monitoring its rss. + */ +static int64_t +check_suspend(int age) +{ + static hrtime_t last_cap_read = 0; + static uint64_t zone_rss_cap; /* RSS cap(KB) */ + static uint64_t addon; + static uint64_t lo_thresh; /* Thresholds for how long to sleep */ + static uint64_t hi_thresh; /* when under the cap (80% & 90%). 
*/ + + /* Wait a second to give the async pageout a chance to catch up. */ + (void) sleep_shutdown(1); + + while (!shutting_down) { + int64_t new_excess; + int sleep_time; + hrtime_t now; + struct stat st; + uint64_t zone_rss; /* total RSS(KB) */ + + /* + * Check if the debug log files exists and enable or disable + * debug. + */ + if (debug_log_fp == NULL) { + if (stat(debug_log, &st) == 0) + debug_log_fp = fopen(debug_log, "w"); + } else { + if (stat(debug_log, &st) == -1) { + (void) fclose(debug_log_fp); + debug_log_fp = NULL; + } + } + + /* + * If the CAP_REFRESH interval has passed, re-get the current + * cap in case it has been dynamically updated. + */ + now = gethrtime(); + if (now - last_cap_read > CAP_REFRESH) { + uint64_t mcap; + + last_cap_read = now; + + mcap = get_zone_cap(); + if (mcap != 0 && mcap != UINT64_MAX) + zone_rss_cap = ROUNDUP(mcap, 1024) / 1024; + else + zone_rss_cap = UINT64_MAX; + + lo_thresh = (uint64_t)(zone_rss_cap * .8); + hi_thresh = (uint64_t)(zone_rss_cap * .9); + addon = (uint64_t)(zone_rss_cap * 0.05); + + debug("current cap %lluKB lo %lluKB hi %lluKB\n", + zone_rss_cap, lo_thresh, hi_thresh); + } + + /* No cap, nothing to do. */ + if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) { + debug("no cap, sleep 120 seconds\n"); + (void) sleep_shutdown(120); + continue; + } + + /* + * If we did some paging out since our last invocation then + * update the kstat so we can track how much was paged out. + */ + if (sum_pageout != 0) { + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, + &sum_pageout, 0); + sum_pageout = 0; + } + + zone_rss = get_mem_info(age); + + /* calculate excess */ + new_excess = zone_rss - zone_rss_cap; + + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, new_excess); + + if (new_excess > 0) { + uint64_t n = 1; + + /* Increment "nover" kstat. */ + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0); + + /* + * Once we go over the cap, then we want to page out a + * little extra instead of stopping right at the cap. + * To do this we add 5% to the excess so that + * pageout_proces will work a little longer before + * stopping. + */ + return ((int64_t)(new_excess + addon)); + } + + /* + * At this point we are under the cap. + * + * Scale the amount of time we sleep before rechecking the + * zone's memory usage. Also, scale the accpetable age of + * cached results from vm_getusage. We do this based on the + * penetration into the capped limit. + */ + if (zone_rss <= lo_thresh) { + sleep_time = 120; + age = 15; + } else if (zone_rss <= hi_thresh) { + sleep_time = 60; + age = 10; + } else { + sleep_time = 30; + age = 5; + } + + debug("sleep %d seconds\n", sleep_time); + (void) sleep_shutdown(sleep_time); + } + + return (0); +} + +/* + * Thread that checks zone's memory usage and when over the cap, goes through + * the zone's process list trying to pageout processes to get under the cap. + */ +static void +mcap_zone() +{ + DIR *pdir = NULL; + int age = 10; /* initial cached vmusage can be 10 secs. old */ + int64_t excess; + + debug("thread startup\n"); + + /* + * When first starting it is likely lots of other zones are starting + * too because the system is booting. Since we just started the zone + * we're not worried about being over the cap right away, so we let + * things settle a bit and tolerate some older data here to minimize + * the load on the system. + */ + (void) sleep_shutdown(15); /* wait 15 secs. 
so the zone can get going */ + + /* Wait until zone's /proc is mounted */ + while (!shutting_down) { + struct stat st; + + if (stat(zoneproc, &st) == 0 && + strcmp(st.st_fstype, "proc") == 0) + break; + sleep_shutdown(5); + } + + /* Open zone's /proc and walk entries. */ + while (!shutting_down) { + if ((pdir = opendir(zoneproc)) != NULL) + break; + sleep_shutdown(5); + } + + while (!shutting_down) { + struct dirent *dirent; + + /* Wait until we've gone over the cap. */ + excess = check_suspend(age); + + debug("starting to scan, excess %lldk\n", (long long)excess); + + /* + * After the initial startup, we want the age of the cached + * vmusage to be only 1 second old since we are checking + * the current state after we've gone over the cap and have + * paged out some processes. + */ + age = 1; + + while (!shutting_down && (dirent = readdir(pdir)) != NULL) { + pid_t pid; + + if (strcmp(".", dirent->d_name) == 0 || + strcmp("..", dirent->d_name) == 0) + continue; + + pid = atoi(dirent->d_name); + if (pid == 0 || pid == 1) + continue; + + excess = pageout_process(pid, excess); + + if (excess <= 0) { + debug("done scanning; excess %lld\n", + (long long)excess); + /* Double check the current excess */ + excess = check_suspend(1); + } + } + + debug("process pass done; excess %lld\n", (long long)excess); + rewinddir(pdir); + } + + (void) closedir(pdir); + debug("thread shutdown\n"); +} + +void +create_mcap_thread(zlog_t *zlogp, zoneid_t id) +{ + int res; + + shutting_down = 0; + zid = id; + (void) getzonenamebyid(zid, zonename, sizeof (zonename)); + + if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0) + zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename); + (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc", zonepath); + (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log", + zonepath); + + res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL, + &mcap_tid); + if (res != 0) { + zerror(zlogp, B_FALSE, "error %d creating memory cap thread", + res); + mcap_tid = 0; + } +} + +void +destroy_mcap_thread() +{ + if (mcap_tid != 0) { + shutting_down = 1; + (void) cond_signal(&shutdown_cv); + (void) thr_join(mcap_tid, NULL, NULL); + mcap_tid = 0; + } +} diff --git a/usr/src/cmd/zoneadmd/zcons.c b/usr/src/cmd/zoneadmd/zcons.c index 8653b954fb..5bcc1b90e8 100644 --- a/usr/src/cmd/zoneadmd/zcons.c +++ b/usr/src/cmd/zoneadmd/zcons.c @@ -436,7 +436,7 @@ devlinks: } else if (errno != ENXIO) { break; } - sleep(1); + (void) sleep(1); } if (rv != 0) zerror(zlogp, B_TRUE, "ERROR: error while acquiring slave " diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c index 40cda3a665..45a16668d6 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.c +++ b/usr/src/cmd/zoneadmd/zoneadmd.c @@ -1051,6 +1051,9 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug) if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0) goto bad; + /* Startup a thread to perform memory capping for the zone. */ + create_mcap_thread(zlogp, zone_id); + return (0); bad: @@ -1073,6 +1076,9 @@ zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate, if (brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0) return (-1); + /* Shutting down, stop the memcap thread */ + destroy_mcap_thread(); + if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) { if (!bringup_failure_recovery) zerror(zlogp, B_FALSE, "unable to destroy zone"); @@ -1712,11 +1718,20 @@ top: * state. 
*/ if (zstate > ZONE_STATE_INSTALLED) { + static zoneid_t zid; + zerror(zlogp, B_FALSE, "zone '%s': WARNING: zone is in state '%s', but " "zoneadmd does not appear to be available; " "restarted zoneadmd to recover.", zone_name, zone_state_str(zstate)); + + /* + * Startup a thread to perform memory capping for the + * zone. + */ + if ((zid = getzoneidbyname(zone_name)) != -1) + create_mcap_thread(zlogp, zid); } (void) fdetach(zone_door_path); diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h index 8ca177fdb1..49aa9e8cf6 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.h +++ b/usr/src/cmd/zoneadmd/zoneadmd.h @@ -152,6 +152,12 @@ extern int init_console(zlog_t *); extern void serve_console(zlog_t *); /* + * Memory capping thread creation. + */ +extern void create_mcap_thread(zlog_t *, zoneid_t); +extern void destroy_mcap_thread(); + +/* * Contract handling. */ extern int init_template(void); |
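The core mechanism the new mcap.c relies on is memcntl(2) applied to another process through libproc's agent LWP. Below is a minimal sketch of that call sequence; in the committed code the address/size pair comes from walking /proc/&lt;pid&gt;/pagedata (init_map()/nextmapping()), and control_proc() additionally waits for the agent LWP to stop. The helper name here is illustrative, not part of the commit.

```c
#include <sys/types.h>
#include <sys/mman.h>
#include <libproc.h>
#include <errno.h>

/*
 * Sketch: ask the kernel to invalidate (page out) one range of a victim
 * process's address space, the same call pageout_mapping() makes.
 */
static int
pageout_range(pid_t pid, caddr_t addr, size_t size)
{
	struct ps_prochandle *ph;
	int gret, rv;

	if ((ph = Pgrab(pid, 0, &gret)) == NULL)
		return (-1);
	if (Pcreate_agent(ph) != 0) {
		(void) Prelease(ph, 0);
		return (-1);
	}

	/* MS_INVALIDATE requests that the resident pages be freed. */
	rv = pr_memcntl(ph, addr, size, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);

	/* EBUSY means locked pages or no backing store; not an error here. */
	if (rv != 0 && errno == EBUSY)
		rv = 0;

	Pdestroy_agent(ph);
	(void) Prelease(ph, 0);
	return (rv);
}
```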
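get_mem_info() in the diff calls SYS_rusagesys directly with a hand-padded zsd_vmusage64_t because 32-bit zoneadmd needs the 64-bit counters aligned. A rough equivalent, assuming the public getvmusage(2) wrapper and a suitably privileged caller in the global zone; the helper name is hypothetical:

```c
#include <sys/vm_usage.h>
#include <sys/types.h>
#include <stdlib.h>
#include <errno.h>
#include <zone.h>

/* Return the zone's total RSS in kilobytes, or 0 on failure. */
static uint64_t
zone_rss_kb(zoneid_t zid, time_t age)
{
	size_t nres = 8;		/* initial guess at result count */
	vmusage_t *buf = NULL;
	uint64_t rss = 0;
	size_t i;

	for (;;) {
		size_t want = nres;

		free(buf);
		if ((buf = calloc(want, sizeof (vmusage_t))) == NULL)
			return (0);
		if (getvmusage(VMUSAGE_ALL_ZONES, age, buf, &nres) == 0)
			break;
		/* EOVERFLOW: nres now holds the count actually needed. */
		if (errno != EOVERFLOW || nres <= want) {
			free(buf);
			return (0);
		}
	}

	for (i = 0; i < nres; i++) {
		if (buf[i].vmu_id == zid) {
			rss = buf[i].vmu_rss_all / 1024;
			break;
		}
	}
	free(buf);
	return (rss);
}
```

The age argument plays the same role as in check_suspend(): it bounds how stale the kernel's cached vm_usage data may be, which is how the thread trades accuracy for system load when the zone is well under its cap.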
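get_zone_cap() has to read zone.max-physical-memory by grabbing a process inside the zone and calling pr_getrctl() through the agent LWP, because zoneadmd itself runs in the global zone. For comparison, a process running inside the capped zone could read the same rctl directly; a sketch assuming getrctl(2), with a hypothetical helper name:

```c
#include <rctl.h>
#include <stdlib.h>
#include <stdint.h>

/* Return this zone's physical memory cap in bytes; UINT64_MAX if uncapped. */
static uint64_t
my_zone_mem_cap(void)
{
	rctlblk_t *rblk;
	uint64_t cap = UINT64_MAX;

	if ((rblk = malloc(rctlblk_size())) == NULL)
		return (cap);
	if (getrctl("zone.max-physical-memory", NULL, rblk, RCTL_FIRST) == 0)
		cap = rctlblk_get_value(rblk);
	free(rblk);
	return (cap);
}
```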
