diff options
Diffstat (limited to 'usr/src/cmd/zoneadmd/mcap.c')
-rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 1182 |
1 files changed, 1182 insertions, 0 deletions
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c new file mode 100644 index 0000000000..16cd2dd07a --- /dev/null +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -0,0 +1,1182 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2014, Joyent, Inc. All rights reserved. + */ + +/* + * This file implements the code which runs a thread inside zoneadmd to cap + * the associated zone's physical memory. A thread to do this is started + * when the zone boots and is halted when the zone shuts down. + * + * Because of the way that the VM system is currently implemented, there is no + * way to go from the bottom up (page to process to zone). Thus, there is no + * obvious way to hook an rctl into the kernel's paging code to enforce a hard + * memory cap. Instead, we implement a soft physical memory cap which looks + * at the zone's overall rss and once it is over the cap, works from the top + * down (zone to process to page), looking at zone processes, to determine + * what to try to pageout to get the zone under its memory cap. + * + * The code uses the fast, cheap, but potentially very inaccurate sum of the + * rss values from psinfo_t to first approximate the zone's rss and will + * fallback to the vm_getusage syscall to determine the zone's rss if needed. + * It then checks the rss against the zone's zone.max-physical-memory rctl. + * Once the zone goes over its cap, then this thread will work through the + * zone's /proc process list, Pgrab-bing each process and stepping through the + * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) + * to pageout pages, until the zone is again under its cap. + * + * Although zone memory capping is implemented as a soft cap by this user-level + * thread, the interfaces around memory caps that are exposed to the user are + * the standard ones; an rctl and kstats. This thread uses the rctl value + * to obtain the cap and works with the zone kernel code to update the kstats. + * If the implementation ever moves into the kernel, these exposed interfaces + * do not need to change. + * + * The thread adaptively sleeps, periodically checking the state of the + * zone. As the zone's rss gets closer to the cap, the thread will wake up + * more often to check the zone's status. Once the zone is over the cap, + * the thread will work to pageout until the zone is under the cap, as shown + * by updated vm_usage data. + * + * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags + * are set by hrm_setbits() and on x86 that code path is only executed by + * segvn_pagelock -> hat_setstat -> hrm_setbits + * segvn_softunlock -^ + * On SPARC there is an additional code path which may make this data + * useful (sfmmu_ttesync), but since it is not generic, we ignore the page + * maps. If we ever fix this issue, then we could generalize this mcap code to + * do more with the data on active pages. + * + * For debugging, touch the file {zonepath}/mcap_debug.log. This will + * cause the thread to start logging its actions into that file (it may take + * a minute or two if the thread is currently sleeping). Removing that + * file will cause logging to stop. + */ + +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libproc.h> +#include <limits.h> +#include <procfs.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <time.h> +#include <unistd.h> +#include <sys/priocntl.h> +#include <dirent.h> +#include <zone.h> +#include <libzonecfg.h> +#include <thread.h> +#include <values.h> +#include <sys/vm_usage.h> +#include <sys/resource.h> +#include <sys/debug.h> +#include <synch.h> +#include <wait.h> +#include <libcontract.h> +#include <libcontract_priv.h> +#include <sys/contract/process.h> +#include "zoneadmd.h" + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +#define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */ + +/* + * zonecfg attribute tunables for memory capping. + * phys-mcap-cmd + * type: string + * specifies a command that can be run when over the cap + * phys-mcap-no-vmusage + * type: boolean + * true disables vm_getusage and just uses zone's proc. rss sum + * phys-mcap-no-pageout + * type: boolean + * true disables pageout when over + * phys-mcap-no-pf-throttle + * type: boolean + * true disables page fault throttling when over + */ +#define TUNE_CMD "phys-mcap-cmd" +#define TUNE_NVMU "phys-mcap-no-vmusage" +#define TUNE_NPAGE "phys-mcap-no-pageout" +#define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" + +/* + * These are only used in get_mem_info but global. We always need scale_rss and + * prev_fast_rss to be persistent but we also have the other two global so we + * can easily see these with mdb. + */ +uint64_t scale_rss = 0; +uint64_t prev_fast_rss = 0; +uint64_t fast_rss = 0; +uint64_t accurate_rss = 0; + +static char zoneproc[MAXPATHLEN]; +static char debug_log[MAXPATHLEN]; +static zoneid_t zid; +static mutex_t shutdown_mx; +static cond_t shutdown_cv; +static int shutting_down = 0; +static thread_t mcap_tid; +static FILE *debug_log_fp = NULL; +static uint64_t zone_rss_cap; /* RSS cap(KB) */ +static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */ +static boolean_t skip_vmusage = B_FALSE; +static boolean_t skip_pageout = B_FALSE; +static boolean_t skip_pf_throttle = B_FALSE; + +static zlog_t *logp; + +static int64_t check_suspend(); +static void get_mcap_tunables(); + +/* + * Structure to hold current state about a process address space that we're + * working on. + */ +typedef struct { + int pr_curr; /* the # of the mapping we're working on */ + int pr_nmap; /* number of mappings in address space */ + prmap_t *pr_mapp; /* process's map array */ +} proc_map_t; + +typedef struct zsd_vmusage64 { + id_t vmu_zoneid; + uint_t vmu_type; + id_t vmu_id; + /* + * An amd64 kernel will align the following uint64_t members, but a + * 32bit i386 process will not without help. + */ + int vmu_align_next_members_on_8_bytes; + uint64_t vmu_rss_all; + uint64_t vmu_rss_private; + uint64_t vmu_rss_shared; + uint64_t vmu_swap_all; + uint64_t vmu_swap_private; + uint64_t vmu_swap_shared; +} zsd_vmusage64_t; + +/* + * Output a debug log message. + */ +/*PRINTFLIKE1*/ +static void +debug(char *fmt, ...) +{ + va_list ap; + + if (debug_log_fp == NULL) + return; + + va_start(ap, fmt); + (void) vfprintf(debug_log_fp, fmt, ap); + va_end(ap); + (void) fflush(debug_log_fp); +} + +/* + * Like sleep(3C) but can be interupted by cond_signal which is posted when + * we're shutting down the mcap thread. + */ +static void +sleep_shutdown(int secs) +{ + timestruc_t to; + + to.tv_sec = secs; + to.tv_nsec = 0; + + (void) mutex_lock(&shutdown_mx); + if (!shutting_down) + (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to); + (void) mutex_unlock(&shutdown_mx); +} + +static boolean_t +proc_issystem(pid_t pid) +{ + char pc_clname[PC_CLNMSZ]; + + if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, + PC_KY_NULL) != -1) + return (strcmp(pc_clname, "SYS") == 0); + + return (B_TRUE); +} + +/* + * Fork a child that enters the zone and runs the "phys-mcap-cmd" command. + */ +static void +run_over_cmd() +{ + int ctfd; + int err; + pid_t childpid; + siginfo_t info; + ctid_t ct; + + /* + * Before we enter the zone, we need to create a new process contract + * for the child, as required by zone_enter(). + */ + if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1) + return; + if (ct_tmpl_set_critical(ctfd, 0) != 0 || + ct_tmpl_set_informative(ctfd, 0) != 0 || + ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 || + ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 || + ct_tmpl_activate(ctfd) != 0) { + (void) close(ctfd); + return; + } + + childpid = fork(); + switch (childpid) { + case -1: + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + break; + case 0: /* Child */ + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + if (zone_enter(zid) == -1) + _exit(errno); + err = system(over_cmd); + _exit(err); + break; + default: /* Parent */ + if (contract_latest(&ct) == -1) + ct = -1; + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + err = waitid(P_PID, childpid, &info, WEXITED); + (void) contract_abandon_id(ct); + if (err == -1 || info.si_status != 0) + debug("over_cmd failed"); + break; + } +} + +/* + * Get the next mapping. + */ +static prmap_t * +nextmapping(proc_map_t *pmp) +{ + if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap) + return (NULL); + + return (&pmp->pr_mapp[pmp->pr_curr++]); +} + +/* + * Initialize the proc_map_t to access the first mapping of an address space. + */ +static prmap_t * +init_map(proc_map_t *pmp, pid_t pid) +{ + int fd; + int res; + struct stat st; + char pathbuf[MAXPATHLEN]; + + bzero(pmp, sizeof (proc_map_t)); + pmp->pr_nmap = -1; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + return (NULL); + +redo: + errno = 0; + if (fstat(fd, &st) != 0) + goto done; + + if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) { + debug("cannot malloc() %ld bytes for xmap", st.st_size); + goto done; + } + (void) bzero(pmp->pr_mapp, st.st_size); + + errno = 0; + if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) { + free(pmp->pr_mapp); + pmp->pr_mapp = NULL; + if (res > 0 || errno == E2BIG) { + goto redo; + } else { + debug("pid %ld cannot read xmap\n", pid); + goto done; + } + } + + pmp->pr_nmap = st.st_size / sizeof (prmap_t); + +done: + (void) close(fd); + return (nextmapping(pmp)); +} + +/* + * Attempt to invalidate the entire mapping from within the given process's + * address space. May return nonzero with errno as: + * ESRCH - process not found + * ENOMEM - segment not found + * EINVAL - mapping exceeds a single segment + */ +static int +pageout_mapping(pid_t pid, prmap_t *pmp) +{ + int res; + + if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM) + return (0); + + errno = 0; + res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr, + pmp->pr_size); + + return (res); +} + +/* + * Work through a process paging out mappings until the whole address space was + * examined or the excess is < 0. Return our estimate of the updated excess. + */ +static int64_t +pageout_process(pid_t pid, int64_t excess) +{ + int psfd; + prmap_t *pmap; + proc_map_t cur; + int res; + int64_t sum_d_rss, d_rss; + int64_t old_rss; + int map_cnt; + psinfo_t psinfo; + char pathbuf[MAXPATHLEN]; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc, + pid); + if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0) + return (excess); + + cur.pr_mapp = NULL; + + if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo)) + goto done; + + old_rss = (int64_t)psinfo.pr_rssize; + map_cnt = 0; + + /* If unscannable, skip it. */ + if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) { + debug("pid %ld: system process, skipping %s\n", + pid, psinfo.pr_psargs); + goto done; + } + + /* If tiny RSS (16KB), skip it. */ + if (old_rss <= 16) { + debug("pid %ld: skipping, RSS %lldKB %s\n", + pid, old_rss, psinfo.pr_psargs); + goto done; + } + + /* Get segment residency information. */ + pmap = init_map(&cur, pid); + + /* Skip process if it has no mappings. */ + if (pmap == NULL) { + debug("pid %ld: map unreadable; ignoring\n", pid); + goto done; + } + + debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n", + pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs); + + /* + * Within the process's address space, attempt to page out mappings. + */ + sum_d_rss = 0; + while (excess > 0 && pmap != NULL && !shutting_down) { + /* invalidate the entire mapping */ + if ((res = pageout_mapping(pid, pmap)) < 0) + debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n", + pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno); + + map_cnt++; + + /* + * Re-check the process rss and get the delta. + */ + if (pread(psfd, &psinfo, sizeof (psinfo), 0) + != sizeof (psinfo)) { + excess -= old_rss; + goto done; + } + + d_rss = (int64_t)psinfo.pr_rssize - old_rss; + old_rss = (int64_t)psinfo.pr_rssize; + sum_d_rss += d_rss; + + /* + * d_rss hopefully should be negative (or 0 if nothing + * invalidated) but can be positive if more got paged in. + */ + excess += d_rss; + + if (excess <= 0) { + debug("pid %ld: (part.) nmap %d delta_rss %lldKB " + "excess %lldKB\n", pid, map_cnt, + (unsigned long long)sum_d_rss, (long long)excess); + map_cnt = 0; + + /* + * If we're actually under, this will suspend checking + * in the middle of this process's address space. + */ + excess = check_suspend(); + if (shutting_down) + goto done; + + /* + * since we might have suspended, re-read process's rss + */ + if (pread(psfd, &psinfo, sizeof (psinfo), 0) + != sizeof (psinfo)) { + excess -= old_rss; + goto done; + } + + old_rss = (int64_t)psinfo.pr_rssize; + + debug("pid %ld: resume pageout; excess %lld\n", pid, + (long long)excess); + sum_d_rss = 0; + } + + pmap = nextmapping(&cur); + } + + debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n", + pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess); + +done: + if (cur.pr_mapp != NULL) + free(cur.pr_mapp); + + (void) close(psfd); + + if (shutting_down) + return (0); + + return (excess); +} + +/* + * Get the zone's RSS data. + */ +static uint64_t +get_mem_info() +{ + uint64_t n = 1; + zsd_vmusage64_t buf; + uint64_t tmp_rss; + DIR *pdir = NULL; + struct dirent *dent; + + /* + * Start by doing the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to double + * counting some memory and overestimating how much is being used, but + * as long as that's not over the cap, then we don't need do the + * expensive calculation. + * + * If we have to do the expensive calculation, we remember the scaling + * factor so that we can try to use that on subsequent iterations for + * the fast rss. + */ + if (shutting_down) + return (0); + + if ((pdir = opendir(zoneproc)) == NULL) + return (0); + + accurate_rss = 0; + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", + zoneproc, pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) + rss = (int64_t)psinfo.pr_rssize; + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + + if (shutting_down) + return (0); + + debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss, + scale_rss, prev_fast_rss); + + /* see if we can get by with a scaled fast rss */ + tmp_rss = fast_rss; + if (scale_rss > 1 && prev_fast_rss > 0) { + /* + * Only scale the fast value if it hasn't ballooned too much + * to trust. + */ + if (fast_rss / prev_fast_rss < 2) { + fast_rss /= scale_rss; + debug("scaled fast rss: %lluKB\n", fast_rss); + } + } + + if (fast_rss <= zone_rss_cap || skip_vmusage) { + uint64_t zone_rss_bytes; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + return (fast_rss); + } + + buf.vmu_id = zid; + + /* get accurate usage (cached data may be up to 5 seconds old) */ + if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5, + (uintptr_t)&buf, (uintptr_t)&n) != 0) { + debug("vmusage failed\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (n > 1) { + /* This should never happen */ + debug("vmusage returned more than one result\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (buf.vmu_id != zid) { + /* This should never happen */ + debug("vmusage returned the incorrect zone\n"); + (void) sleep_shutdown(1); + return (0); + } + + accurate_rss = buf.vmu_rss_all / 1024; + + /* calculate scaling factor to use for fast_rss from now on */ + if (accurate_rss > 0) { + scale_rss = fast_rss / accurate_rss; + debug("new scaling factor: %llu\n", scale_rss); + /* remember the fast rss when we had to get the accurate rss */ + prev_fast_rss = tmp_rss; + } + + debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss, + scale_rss, prev_fast_rss); + return (accurate_rss); +} + +/* + * Needed to read the zones physical-memory-cap rctl. + */ +static struct ps_prochandle * +grab_zone_proc() +{ + DIR *dirp; + struct dirent *dentp; + struct ps_prochandle *ph = NULL; + int tmp; + + if ((dirp = opendir(zoneproc)) == NULL) + return (NULL); + + while (!shutting_down && (dentp = readdir(dirp))) { + int pid; + + if (strcmp(".", dentp->d_name) == 0 || + strcmp("..", dentp->d_name) == 0) + continue; + + pid = atoi(dentp->d_name); + /* attempt to grab process */ + if ((ph = Pgrab(pid, 0, &tmp)) != NULL) { + if (Psetflags(ph, PR_RLC) == 0) { + if (Pcreate_agent(ph) == 0) { + (void) closedir(dirp); + return (ph); + } + } + Prelease(ph, 0); + } + } + + (void) closedir(dirp); + return (NULL); +} + +static uint64_t +get_zone_cap() +{ + rctlblk_t *rblk; + uint64_t mcap; + struct ps_prochandle *ph; + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) + return (UINT64_MAX); + + if ((ph = grab_zone_proc()) == NULL) { + free(rblk); + return (UINT64_MAX); + } + + if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk, + RCTL_FIRST)) { + Pdestroy_agent(ph); + Prelease(ph, 0); + free(rblk); + return (UINT64_MAX); + } + + Pdestroy_agent(ph); + Prelease(ph, 0); + + mcap = rctlblk_get_value(rblk); + free(rblk); + return (mcap); +} + +/* + * check_suspend is invoked at the beginning of every pass through the process + * list or after we've paged out enough so that we think the excess is under + * the cap. The purpose is to periodically check the zone's rss and return + * the excess when the zone is over the cap. The rest of the time this + * function will sleep, periodically waking up to check the current rss. + * + * Depending on the percentage of penetration of the zone's rss into the + * cap we sleep for longer or shorter amounts. This reduces the impact of this + * work on the system, which is important considering that each zone will be + * monitoring its rss. + */ +static int64_t +check_suspend() +{ + static hrtime_t last_cap_read = 0; + static uint64_t addon; + static uint64_t lo_thresh; /* Thresholds for how long to sleep */ + static uint64_t hi_thresh; /* when under the cap (80% & 90%). */ + static uint64_t prev_zone_rss = 0; + static uint32_t pfdelay = 0; /* usec page fault delay when over */ + + /* Wait a second to give the async pageout a chance to catch up. */ + (void) sleep_shutdown(1); + + while (!shutting_down) { + int64_t new_excess; + int sleep_time; + hrtime_t now; + struct stat st; + uint64_t zone_rss; /* total RSS(KB) */ + + /* + * Check if the debug log files exists and enable or disable + * debug. + */ + if (debug_log_fp == NULL) { + if (stat(debug_log, &st) == 0) + debug_log_fp = fopen(debug_log, "w"); + } else { + if (stat(debug_log, &st) == -1) { + (void) fclose(debug_log_fp); + debug_log_fp = NULL; + } + } + + /* + * If the CAP_REFRESH interval has passed, re-get the current + * cap in case it has been dynamically updated. + */ + now = gethrtime(); + if (now - last_cap_read > CAP_REFRESH) { + uint64_t mcap; + + last_cap_read = now; + + mcap = get_zone_cap(); + if (mcap != 0 && mcap != UINT64_MAX) + zone_rss_cap = ROUNDUP(mcap, 1024) / 1024; + else + zone_rss_cap = UINT64_MAX; + + lo_thresh = (uint64_t)(zone_rss_cap * .8); + hi_thresh = (uint64_t)(zone_rss_cap * .9); + addon = (uint64_t)(zone_rss_cap * 0.05); + + /* + * We allow the memory cap tunables to be changed on + * the fly. + */ + get_mcap_tunables(); + + debug("%s: %s\n", TUNE_CMD, over_cmd); + debug("%s: %d\n", TUNE_NVMU, skip_vmusage); + debug("%s: %d\n", TUNE_NPAGE, skip_pageout); + debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle); + debug("current cap %lluKB lo %lluKB hi %lluKB\n", + zone_rss_cap, lo_thresh, hi_thresh); + } + + /* No cap, nothing to do. */ + if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) { + debug("no cap, sleep 120 seconds\n"); + (void) sleep_shutdown(120); + continue; + } + + zone_rss = get_mem_info(); + + /* calculate excess */ + new_excess = zone_rss - zone_rss_cap; + + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, new_excess); + + /* + * If necessary, updates stats. + */ + + /* + * If it looks like we did some paging out since last over the + * cap then update the kstat so we can approximate how much was + * paged out. + */ + if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) { + uint64_t diff; + + /* assume diff is num bytes we paged out */ + diff = (prev_zone_rss - zone_rss) * 1024; + + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, + &diff, 0); + } + prev_zone_rss = zone_rss; + + if (new_excess > 0) { + uint64_t n = 1; + + /* Increment "nover" kstat. */ + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0); + + if (!skip_pf_throttle) { + /* + * Tell the kernel to start throttling page + * faults by some number of usecs to help us + * catch up. If we are persistently over the + * cap the delay ramps up to a max of 2000usecs. + * Note that for delays less than 1 tick + * (i.e. all of these) we busy-wait in as_fault. + * delay faults/sec + * 125 8000 + * 250 4000 + * 500 2000 + * 1000 1000 + * 2000 500 + */ + if (pfdelay == 0) + pfdelay = 125; + else if (pfdelay < 2000) + pfdelay *= 2; + + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, + &pfdelay, 0); + } + + /* + * Once we go over the cap, then we want to + * page out a little extra instead of stopping + * right at the cap. To do this we add 5% to + * the excess so that pageout_proces will work + * a little longer before stopping. + */ + return ((int64_t)(new_excess + addon)); + } + + /* + * At this point we are under the cap. + * + * Tell the kernel to stop throttling page faults. + * + * Scale the amount of time we sleep before rechecking the + * zone's memory usage. Also, scale the accpetable age of + * cached results from vm_getusage. We do this based on the + * penetration into the capped limit. + */ + if (pfdelay > 0) { + pfdelay = 0; + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, + &pfdelay, 0); + } + + if (zone_rss <= lo_thresh) { + sleep_time = 120; + } else if (zone_rss <= hi_thresh) { + sleep_time = 60; + } else { + sleep_time = 30; + } + + debug("sleep %d seconds\n", sleep_time); + (void) sleep_shutdown(sleep_time); + } + + /* Shutting down, tell the kernel so it doesn't throttle */ + if (pfdelay > 0) { + pfdelay = 0; + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0); + } + + return (0); +} + +static void +get_mcap_tunables() +{ + zone_dochandle_t handle; + struct zone_attrtab attr; + + over_cmd[0] = '\0'; + if ((handle = zonecfg_init_handle()) == NULL) + return; + + if (zonecfg_get_handle(zone_name, handle) != Z_OK) + goto done; + + /* Reset to defaults in case rebooting and settings have changed */ + over_cmd[0] = '\0'; + skip_vmusage = B_FALSE; + skip_pageout = B_FALSE; + skip_pf_throttle = B_FALSE; + + if (zonecfg_setattrent(handle) != Z_OK) + goto done; + while (zonecfg_getattrent(handle, &attr) == Z_OK) { + if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) { + (void) strlcpy(over_cmd, attr.zone_attr_value, + sizeof (over_cmd)); + } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_vmusage = B_TRUE; + } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_pageout = B_TRUE; + } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_pf_throttle = B_TRUE; + } + } + (void) zonecfg_endattrent(handle); + +done: + zonecfg_fini_handle(handle); +} + +/* ARGSUSED */ +static int +chk_proc_fs(void *data, const char *spec, const char *dir, + const char *fstype, const char *opt) +{ + if (fstype != NULL && strcmp(fstype, "proc") == 0) + *((boolean_t *)data) = B_TRUE; + + return (0); +} + +static boolean_t +has_proc() +{ + brand_handle_t bh; + boolean_t fnd = B_FALSE; + + if ((bh = brand_open(brand_name)) != NULL) { + (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd); + } + + brand_close(bh); + return (fnd); +} + +/* + * We run this loop for brands with no /proc to simply update the RSS, using + * the cheap GZ /proc data, every 5 minutes. + */ +static void +no_procfs() +{ + DIR *pdir = NULL; + struct dirent *dent; + uint64_t zone_rss_bytes; + + (void) sleep_shutdown(30); + while (!shutting_down) { + /* + * Just do the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to + * double counting some memory and overestimating how much is + * being used. Since there is no /proc in the zone, we use the + * GZ /proc and check for the correct zone. + */ + if ((pdir = opendir("/proc")) == NULL) + return; + + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), + "/proc/%d/psinfo", pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) { + if (psinfo.pr_zoneid == zid) + rss = (int64_t)psinfo.pr_rssize; + } + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + + if (shutting_down) + return; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + + (void) sleep_shutdown(300); + } +} + +/* + * Thread that checks zone's memory usage and when over the cap, goes through + * the zone's process list trying to pageout processes to get under the cap. + */ +static void +mcap_zone() +{ + DIR *pdir = NULL; + int64_t excess; + + debug("thread startup\n"); + + get_mcap_tunables(); + + /* + * If the zone has no /proc filesystem, we can't use the fast algorithm + * to check RSS or pageout any processes. All we can do is periodically + * update it's RSS kstat using the expensive sycall. + */ + if (!has_proc()) { + no_procfs(); + debug("thread shutdown\n"); + return; + } + + /* + * When first starting it is likely lots of other zones are starting + * too because the system is booting. Since we just started the zone + * we're not worried about being over the cap right away, so we let + * things settle a bit and tolerate some older data here to minimize + * the load on the system. + */ + (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */ + + /* Wait until zone's /proc is mounted */ + while (!shutting_down) { + struct stat st; + + if (stat(zoneproc, &st) == 0 && + strcmp(st.st_fstype, "proc") == 0) + break; + sleep_shutdown(5); + } + + /* Open zone's /proc and walk entries. */ + while (!shutting_down) { + if ((pdir = opendir(zoneproc)) != NULL) + break; + sleep_shutdown(5); + } + + while (!shutting_down) { + struct dirent *dirent; + + /* Wait until we've gone over the cap. */ + excess = check_suspend(); + + debug("starting to scan, excess %lldk\n", (long long)excess); + + if (over_cmd[0] != '\0') { + uint64_t zone_rss; /* total RSS(KB) */ + + debug("run phys_mcap_cmd: %s\n", over_cmd); + run_over_cmd(); + + zone_rss = get_mem_info(); + excess = zone_rss - zone_rss_cap; + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, excess); + if (excess <= 0) + continue; + } + + while (!shutting_down && (dirent = readdir(pdir)) != NULL) { + pid_t pid; + + if (strcmp(".", dirent->d_name) == 0 || + strcmp("..", dirent->d_name) == 0) + continue; + + pid = atoi(dirent->d_name); + if (pid == 0 || pid == 1) + continue; + + if (skip_pageout) + (void) sleep_shutdown(2); + else + excess = pageout_process(pid, excess); + + if (excess <= 0) { + debug("apparently under; excess %lld\n", + (long long)excess); + /* Double check the current excess */ + excess = check_suspend(); + } + } + + debug("process pass done; excess %lld\n", (long long)excess); + rewinddir(pdir); + + if (skip_pageout) + (void) sleep_shutdown(120); + } + + if (pdir != NULL) + (void) closedir(pdir); + debug("thread shutdown\n"); +} + +void +create_mcap_thread(zlog_t *zlogp, zoneid_t id) +{ + int res; + + shutting_down = 0; + zid = id; + logp = zlogp; + + /* all but the lx brand currently use /proc */ + if (strcmp(brand_name, "lx") == 0) { + (void) snprintf(zoneproc, sizeof (zoneproc), + "%s/root/native/proc", zonepath); + } else { + (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc", + zonepath); + } + + (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log", + zonepath); + + res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL, + &mcap_tid); + if (res != 0) { + zerror(zlogp, B_FALSE, "error %d creating memory cap thread", + res); + mcap_tid = 0; + } +} + +void +destroy_mcap_thread() +{ + if (mcap_tid != 0) { + shutting_down = 1; + (void) cond_signal(&shutdown_cv); + (void) thr_join(mcap_tid, NULL, NULL); + mcap_tid = 0; + } +} |