/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Copyright 2011, 2012, Joyent, Inc.  All rights reserved.
 */

/*
 * This file implements the code which runs a thread inside zoneadmd to cap
 * the associated zone's physical memory.  A thread to do this is started
 * when the zone boots and is halted when the zone shuts down.
 *
 * Because of the way that the VM system is currently implemented, there is
 * no way to go from the bottom up (page to process to zone).  Thus, there is
 * no obvious way to hook an rctl into the kernel's paging code to enforce a
 * hard memory cap.  Instead, we implement a soft physical memory cap which
 * looks at the zone's overall rss and, once it is over the cap, works from
 * the top down (zone to process to page), looking at zone processes, to
 * determine what to try to pageout to get the zone under its memory cap.
 *
 * The code uses the vm_getusage syscall to determine the zone's rss and
 * checks that against the zone's zone.max-physical-memory rctl.  Once the
 * zone goes over its cap, then this thread will work through the zone's
 * /proc process list, Pgrab-bing each process and stepping through the
 * address space segments, attempting to use
 * pr_memcntl(...MS_INVALCURPROC...) to pageout pages, until the zone is
 * again under its cap.
 *
 * Although zone memory capping is implemented as a soft cap by this
 * user-level thread, the interfaces around memory caps that are exposed to
 * the user are the standard ones: an rctl and kstats.  This thread uses the
 * rctl value to obtain the cap and works with the zone kernel code to update
 * the kstats.  If the implementation ever moves into the kernel, these
 * exposed interfaces do not need to change.
 *
 * The thread adaptively sleeps, periodically checking the state of the
 * zone.  As the zone's rss gets closer to the cap, the thread will wake up
 * more often to check the zone's status.  Once the zone is over the cap,
 * the thread will work to pageout until the zone is under the cap, as shown
 * by updated vm_usage data.
 *
 * NOTE: The pagedata page maps (at least on x86) are not useful.  Those
 * flags are set by hrm_setbits() and on x86 that code path is only executed
 * by
 *	segvn_pagelock -> hat_setstat -> hrm_setbits
 *	segvn_softunlock -^
 * On SPARC there is an additional code path which may make this data
 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
 * maps.  If we ever fix this issue, then we could generalize this mcap code
 * to do more with the data on active pages.
 *
 * For debugging, touch the file {zonepath}/mcap_debug.log.  This will
 * cause the thread to start logging its actions into that file (it may take
 * a minute or two if the thread is currently sleeping).  Removing that
 * file will cause logging to stop.
 */
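/*
 * Example (with a hypothetical zonepath) of toggling the debug log
 * described above:
 *	# touch /zones/web01/mcap_debug.log	(start logging)
 *	# rm /zones/web01/mcap_debug.log	(stop logging)
 */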
#include <sys/contract/process.h>
#include <sys/ctfs.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/priocntl.h>
#include <sys/procset.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <sys/wait.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <libcontract.h>
#include <libcontract_priv.h>
#include <libproc.h>
#include <libzonecfg.h>
#include <procfs.h>
#include <rctl.h>
#include <signal.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <synch.h>
#include <thread.h>
#include <unistd.h>
#include <zone.h>

#include "zoneadmd.h"

/* round up to next y = 2^n */
#define	ROUNDUP(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

#define	CAP_REFRESH	((uint64_t)300 * NANOSEC)	/* every 5 minutes */
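/*
 * ROUNDUP illustration (y must be a power of two): ROUNDUP(1000, 1024)
 * yields 1024 and ROUNDUP(2048, 1024) yields 2048.  check_suspend() below
 * uses it to round the rctl byte value up to a whole number of KB.
 */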
/*
 * zonecfg attribute tunables for memory capping.
 *	phys-mcap-cmd
 *		type: string
 *		specifies a command that can be run when over the cap
 *	phys-mcap-no-vmusage
 *		type: boolean
 *		true disables vm_getusage and just uses zone's proc. rss sum
 *	phys-mcap-no-pageout
 *		type: boolean
 *		true disables pageout when over
 */
#define	TUNE_CMD	"phys-mcap-cmd"
#define	TUNE_NVMU	"phys-mcap-no-vmusage"
#define	TUNE_NPAGE	"phys-mcap-no-pageout"

static char	zonename[ZONENAME_MAX];
static char	zonepath[MAXPATHLEN];
static char	zoneproc[MAXPATHLEN];
static char	debug_log[MAXPATHLEN];
static zoneid_t	zid;
static mutex_t	shutdown_mx;
static cond_t	shutdown_cv;
static int	shutting_down = 0;
static thread_t	mcap_tid;
static FILE	*debug_log_fp = NULL;
static uint64_t	zone_rss_cap;		/* RSS cap (KB) */
static char	over_cmd[2 * BUFSIZ];	/* same size as zone_attr_value */
static boolean_t skip_vmusage = B_FALSE;
static boolean_t skip_pageout = B_FALSE;

/*
 * Structure to hold current state about a process address space that we're
 * working on.
 */
typedef struct {
	int pr_curr;		/* the # of the mapping we're working on */
	int pr_nmap;		/* number of mappings in address space */
	prxmap_t *pr_xmapp;	/* process's xmap array */
} proc_map_t;

typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;

/*
 * Output a debug log message.
 */
/*PRINTFLIKE1*/
static void
debug(char *fmt, ...)
{
	va_list ap;

	if (debug_log_fp == NULL)
		return;

	va_start(ap, fmt);
	(void) vfprintf(debug_log_fp, fmt, ap);
	va_end(ap);
	(void) fflush(debug_log_fp);
}

/*
 * Like sleep(3C) but can be interrupted by the cond_signal which is posted
 * when we're shutting down the mcap thread.
 */
static void
sleep_shutdown(int secs)
{
	timestruc_t to;

	to.tv_sec = secs;
	to.tv_nsec = 0;

	(void) mutex_lock(&shutdown_mx);
	if (!shutting_down)
		(void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
	(void) mutex_unlock(&shutdown_mx);
}
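/*
 * Usage note: sleep_shutdown(120) blocks for up to two minutes but returns
 * as soon as destroy_mcap_thread() signals shutdown_cv, so even the longest
 * sleeps in this file do not delay zone shutdown.
 */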
static boolean_t
proc_issystem(pid_t pid)
{
	char pc_clname[PC_CLNMSZ];

	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME,
	    pc_clname, PC_KY_NULL) != -1)
		return (strcmp(pc_clname, "SYS") == 0);

	return (B_TRUE);
}

/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 */
static void
run_over_cmd()
{
	int ctfd;
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template",
	    O_RDWR)) == -1)
		return;
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0:	/* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default:	/* Parent */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}

static struct ps_prochandle *
control_proc(pid_t pid)
{
	int res;
	struct ps_prochandle *ph;

	/* Take control of the process. */
	if ((ph = Pgrab(pid, 0, &res)) == NULL)
		return (NULL);

	if (Pcreate_agent(ph) != 0) {
		(void) Prelease(ph, 0);
		return (NULL);
	}

	/* Verify that the agent LWP is actually stopped. */
	errno = 0;
	while (Pstate(ph) == PS_RUN)
		(void) Pwait(ph, 0);

	if (Pstate(ph) != PS_STOP) {
		Pdestroy_agent(ph);
		(void) Prelease(ph, 0);
		return (NULL);
	}

	return (ph);
}
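/*
 * Design note: Pgrab() stops the victim process and Pcreate_agent() injects
 * an agent LWP into it; the pr_memcntl() calls made later are executed by
 * that agent, so the memcntl() system call effectively runs from inside the
 * victim's own address space.
 */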
/*
 * Get the next mapping.
 */
static prxmap_t *
nextmapping(proc_map_t *pmp)
{
	if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
		return (NULL);

	return (&pmp->pr_xmapp[pmp->pr_curr++]);
}

/*
 * Initialize the proc_map_t to access the first mapping of an address
 * space.
 */
static prxmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc,
	    pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_xmapp, st.st_size);

	errno = 0;
	if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) {
		free(pmp->pr_xmapp);
		pmp->pr_xmapp = NULL;
		if (res > 0 || errno == E2BIG) {
			/* The xmap grew since the fstat; retry. */
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prxmap_t);

done:
	(void) close(fd);
	return (nextmapping(pmp));
}

/*
 * Attempt to page out a region of the given process's address space.  May
 * return nonzero if not all of the pages are pageable, for any reason.
 */
static int
pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp)
{
	int res;

	/*
	 * We particularly want to avoid the pr_memcntl on anonymous mappings
	 * which show no resident anon pages (pr_anon == 0), since that would
	 * pull them back off of the free list and increase the zone's RSS,
	 * even though the process itself has freed them up.
	 */
	if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0)
		return (0);
	else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
		return (0);

	errno = 0;
	res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);

	/*
	 * EBUSY indicates none of the pages have backing store allocated, or
	 * some pages were locked.  We don't care about either case.
	 */
	if (res != 0 && errno == EBUSY)
		res = 0;

	return (res);
}
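/*
 * For reference, pr_memcntl() above is the agent-LWP equivalent of the
 * process issuing this call against its own mapping (a sketch, not code
 * that appears in this file):
 *	(void) memcntl(addr, len, MC_SYNC,
 *	    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
 * MS_INVALCURPROC is intended to invalidate the mapping for the calling
 * process only, unlike MS_INVALIDATE, which affects every process mapping
 * the page.
 */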
/*
 * Compute the delta of the process RSS since the last call.  If the
 * psinfo cannot be obtained, no error is returned; it's up to the caller to
 * detect the process termination via other means.
 */
static int64_t
rss_delta(int64_t *old_rss, int psfd)
{
	int64_t d_rss = 0;
	psinfo_t psinfo;

	if (pread(psfd, &psinfo, sizeof (psinfo_t), 0) ==
	    sizeof (psinfo_t)) {
		d_rss = (int64_t)psinfo.pr_rssize - *old_rss;
		*old_rss = (int64_t)psinfo.pr_rssize;
	}

	return (d_rss);
}

/*
 * Work through a process, paging out mappings, until the whole address
 * space has been examined or the excess is < 0.  Return our estimate of the
 * updated excess.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
	int psfd;
	prxmap_t *pxmap;
	proc_map_t cur;
	struct ps_prochandle *ph = NULL;
	int unpageable_mappings;
	int64_t sum_d_rss, sum_att, d_rss;
	int64_t old_rss;
	psinfo_t psinfo;
	int incr_rss_check = 0;
	char pathbuf[MAXPATHLEN];

	cur.pr_xmapp = NULL;	/* so the cleanup at "done" is always safe */

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
	    pid);
	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
		return (excess);

	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
		goto done;

	old_rss = (int64_t)psinfo.pr_rssize;

	/* If unscannable, skip it. */
	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
		debug("pid: %ld system process, skipping %s\n",
		    pid, psinfo.pr_psargs);
		goto done;
	}

	/* If tiny RSS (16KB), skip it. */
	if (old_rss <= 16) {
		debug("pid: %ld skipping, RSS %lldKB %s\n",
		    pid, old_rss, psinfo.pr_psargs);
		goto done;
	}

	/* Get segment residency information. */
	pxmap = init_map(&cur, pid);

	/* Skip process if it has no mappings. */
	if (pxmap == NULL) {
		debug("%ld: xmap unreadable; ignoring\n", pid);
		goto done;
	}

	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

	/* Take control of the process. */
	if ((ph = control_proc(pid)) == NULL) {
		debug("%ld: cannot control\n", pid);
		goto done;
	}

	/*
	 * If the process RSS is not enough to erase the excess then there is
	 * no need to incrementally check the RSS delta after each pageout
	 * attempt.  Instead, check it after we've tried all of the segments.
	 */
	if (excess - old_rss < 0)
		incr_rss_check = 1;

	/*
	 * Within the process's address space, attempt to page out mappings.
	 */
	sum_att = sum_d_rss = 0;
	unpageable_mappings = 0;
	while (excess > 0 && pxmap != NULL && !shutting_down) {
		/* Try to page out the mapping. */
		if (pageout_mapping(ph, pxmap) < 0) {
			debug("pid %ld: exited or unpageable\n", pid);
			break;
		}

		/* attempted is the size of the mapping */
		sum_att += pxmap->pr_size / 1024;

		/*
		 * This process's RSS is potentially enough to clear the
		 * excess, so check as we go along to see if we can stop
		 * paging out partway through the process.
		 */
		if (incr_rss_check) {
			d_rss = rss_delta(&old_rss, psfd);

			/*
			 * If this pageout attempt was unsuccessful (the
			 * resident portion was not affected), then note it
			 * was unpageable.  Mappings are unpageable when none
			 * of the pages paged out, such as when they are
			 * locked or involved in asynchronous I/O.
			 */
			if (d_rss >= 0) {
				unpageable_mappings++;
			} else {
				excess += d_rss;
				sum_d_rss += d_rss;
			}
		}

		pxmap = nextmapping(&cur);
	}

	if (!incr_rss_check) {
		d_rss = rss_delta(&old_rss, psfd);
		if (d_rss < 0) {
			excess += d_rss;
			sum_d_rss += d_rss;
		}
	}

	debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n",
	    pid, unpageable_mappings, (unsigned long long)sum_att,
	    (unsigned long long)sum_d_rss, (long long)excess);

done:
	/* If a process was grabbed, release it, destroying its agent. */
	if (ph != NULL) {
		Pdestroy_agent(ph);
		(void) Prelease(ph, 0);
	}

	if (cur.pr_xmapp != NULL)
		free(cur.pr_xmapp);

	(void) close(psfd);

	if (shutting_down)
		return (0);

	return (excess);
}

/*
 * Get the zone's RSS data.
 */
static uint64_t
get_mem_info(int age)
{
	uint64_t n = 1;
	zsd_vmusage64_t buf;
	uint64_t zone_rss;
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t.  Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used,
	 * but as long as that's not over the cap, then we don't need to do
	 * the expensive calculation.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	zone_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		zone_rss += rss;
	}
	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss %lluKB\n", zone_rss);
	if (zone_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = zone_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (zone_rss);
	}

	buf.vmu_id = zid;

	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE,
	    age, (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	zone_rss = buf.vmu_rss_all / 1024;
	return (zone_rss);
}
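/*
 * The raw syscall above is the private interface underlying getvmusage(3C).
 * The public equivalent would be roughly (a sketch only):
 *	vmusage_t result;
 *	size_t nres = 1;
 *	(void) getvmusage(VMUSAGE_A_ZONE, age, &result, &nres);
 * The private form is used together with zsd_vmusage64_t so that a 32-bit
 * zoneadmd still sees the kernel's 64-bit counters with correct alignment.
 */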
/*
 * Needed to read the zone's zone.max-physical-memory rctl.
 */
static struct ps_prochandle *
grab_zone_proc()
{
	DIR *dirp;
	struct dirent *dentp;
	struct ps_prochandle *ph = NULL;
	int tmp;

	if ((dirp = opendir(zoneproc)) == NULL)
		return (NULL);

	while (!shutting_down && (dentp = readdir(dirp))) {
		int pid;

		if (strcmp(".", dentp->d_name) == 0 ||
		    strcmp("..", dentp->d_name) == 0)
			continue;

		pid = atoi(dentp->d_name);

		/* attempt to grab process */
		if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
			if (Psetflags(ph, PR_RLC) == 0) {
				if (Pcreate_agent(ph) == 0) {
					(void) closedir(dirp);
					return (ph);
				}
			}
			Prelease(ph, 0);
		}
	}

	(void) closedir(dirp);
	return (NULL);
}

static uint64_t
get_zone_cap()
{
	rctlblk_t *rblk;
	uint64_t mcap;
	struct ps_prochandle *ph;

	if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
		return (UINT64_MAX);

	if ((ph = grab_zone_proc()) == NULL) {
		free(rblk);
		return (UINT64_MAX);
	}

	if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
	    RCTL_FIRST)) {
		Pdestroy_agent(ph);
		Prelease(ph, 0);
		free(rblk);
		return (UINT64_MAX);
	}

	Pdestroy_agent(ph);
	Prelease(ph, 0);

	mcap = rctlblk_get_value(rblk);
	free(rblk);
	return (mcap);
}

/*
 * check_suspend is invoked at the beginning of every pass through the
 * process list, or after we've paged out enough so that we think the excess
 * is under the cap.  The purpose is to periodically check the zone's rss
 * and return the excess when the zone is over the cap.  The rest of the
 * time this function will sleep, periodically waking up to check the
 * current rss.
 *
 * The age parameter tells us how old the cached rss data can be.  When
 * first starting up, the cached data can be older, but after we start
 * paging out, we want current data.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap, we sleep for longer or shorter amounts and accept older cached
 * vmusage data.  This reduces the impact of this work on the system, which
 * is important considering that each zone will be monitoring its rss.
 */
static int64_t
check_suspend(int age, boolean_t new_cycle)
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS (KB) */

		/*
		 * Check if the debug log file exists and enable or disable
		 * debug accordingly.
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info(age);

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If it looks like we did some paging out since last over
		 * the cap, then update the kstat so we can approximate how
		 * much was paged out.
		 */
		if (prev_zone_rss > zone_rss_cap &&
		    zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			if (new_cycle) {
				uint64_t n = 1;

				/* Increment "nover" kstat. */
				(void) zone_setattr(zid,
				    ZONE_ATTR_PMCAP_NOVER, &n, 0);
			}

			if (!skip_pageout) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up.  If we are persistently over the
				 * cap, the delay ramps up to a max of
				 * 2000usecs.  Note that for delays less than
				 * 1 tick (i.e. all of these) we busy-wait in
				 * as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid,
				    ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, we want to page out a
			 * little extra instead of stopping right at the cap.
			 * To do this we add 5% to the excess so that
			 * pageout_process will work a little longer before
			 * stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage.  Also, scale the acceptable age of
		 * cached results from vm_getusage.  We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
			age = 15;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
			age = 10;
		} else {
			sleep_time = 30;
			age = 5;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
		    &pfdelay, 0);
	}

	return (0);
}
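/*
 * Worked example (hypothetical numbers): with a 1 GB cap, zone_rss_cap is
 * 1048576 KB, so lo_thresh is 838860 KB, hi_thresh is 943718 KB and addon
 * is 52428 KB.  A zone sitting 100 MB over its cap would therefore return
 * an excess of 102400 + 52428 = 154828 KB for the pageout pass.
 */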
static void
get_mcap_tunables()
{
	zone_dochandle_t handle;
	struct zone_attrtab attr;

	over_cmd[0] = '\0';
	if ((handle = zonecfg_init_handle()) == NULL)
		return;

	if (zonecfg_get_handle(zonename, handle) != Z_OK)
		goto done;

	/* Reset to defaults in case rebooting and settings have changed */
	over_cmd[0] = '\0';
	skip_vmusage = B_FALSE;
	skip_pageout = B_FALSE;

	if (zonecfg_setattrent(handle) != Z_OK)
		goto done;
	while (zonecfg_getattrent(handle, &attr) == Z_OK) {
		if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
			(void) strlcpy(over_cmd, attr.zone_attr_value,
			    sizeof (over_cmd));
		} else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_vmusage = B_TRUE;
		} else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_pageout = B_TRUE;
		}
	}
	(void) zonecfg_endattrent(handle);

done:
	zonecfg_fini_handle(handle);
}

/*
 * Thread that checks the zone's memory usage and, when over the cap, goes
 * through the zone's process list trying to pageout processes to get under
 * the cap.
 */
static void
mcap_zone()
{
	DIR *pdir = NULL;
	int age = 10;	/* initial cached vmusage can be 10 secs. old */
	int64_t excess;

	debug("thread startup\n");

	get_mcap_tunables();

	/*
	 * When first starting it is likely lots of other zones are starting
	 * too because the system is booting.  Since we just started the zone
	 * we're not worried about being over the cap right away, so we let
	 * things settle a bit and tolerate some older data here to minimize
	 * the load on the system.
	 */
	(void) sleep_shutdown(15); /* wait 15 secs. so the zone can start */

	/* Wait until the zone's /proc is mounted */
	while (!shutting_down) {
		struct stat st;

		if (stat(zoneproc, &st) == 0 &&
		    strcmp(st.st_fstype, "proc") == 0)
			break;
		sleep_shutdown(5);
	}

	/* Open the zone's /proc and walk entries. */
	while (!shutting_down) {
		if ((pdir = opendir(zoneproc)) != NULL)
			break;
		sleep_shutdown(5);
	}

	while (!shutting_down) {
		struct dirent *dirent;

		/* Wait until we've gone over the cap. */
		excess = check_suspend(age, B_TRUE);

		debug("starting to scan, excess %lldKB\n",
		    (long long)excess);

		/*
		 * After the initial startup, we want the age of the cached
		 * vmusage to be only 1 second old, since we are checking
		 * the current state after we've gone over the cap and have
		 * paged out some processes.
		 */
		age = 1;

		if (over_cmd[0] != '\0') {
			uint64_t zone_rss;	/* total RSS (KB) */

			debug("run phys-mcap-cmd: %s\n", over_cmd);
			run_over_cmd();

			zone_rss = get_mem_info(0);
			excess = zone_rss - zone_rss_cap;
			debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
			    zone_rss, zone_rss_cap, excess);
			if (excess <= 0)
				continue;
		}

		while (!shutting_down &&
		    (dirent = readdir(pdir)) != NULL) {
			pid_t pid;

			if (strcmp(".", dirent->d_name) == 0 ||
			    strcmp("..", dirent->d_name) == 0)
				continue;

			pid = atoi(dirent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			if (skip_pageout)
				(void) sleep_shutdown(2);
			else
				excess = pageout_process(pid, excess);

			if (excess <= 0) {
				debug("apparently under; excess %lld\n",
				    (long long)excess);

				/* Double check the current excess */
				excess = check_suspend(1, B_FALSE);
			}
		}

		debug("process pass done; excess %lld\n",
		    (long long)excess);
		rewinddir(pdir);
	}

	if (pdir != NULL)
		(void) closedir(pdir);
	debug("thread shutdown\n");
}

void
create_mcap_thread(zlog_t *zlogp, zoneid_t id)
{
	int res;

	shutting_down = 0;
	zid = id;

	(void) getzonenamebyid(zid, zonename, sizeof (zonename));
	if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0)
		zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename);

	(void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
	    zonepath);
	(void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
	    zonepath);

	res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL,
	    NULL, &mcap_tid);
	if (res != 0) {
		zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
		    res);
		mcap_tid = 0;
	}
}

void
destroy_mcap_thread()
{
	if (mcap_tid != 0) {
		/*
		 * Set the flag and post the signal under the mutex so a
		 * concurrent sleep_shutdown() cannot miss the wakeup.
		 */
		(void) mutex_lock(&shutdown_mx);
		shutting_down = 1;
		(void) cond_signal(&shutdown_cv);
		(void) mutex_unlock(&shutdown_mx);

		(void) thr_join(mcap_tid, NULL, NULL);
		mcap_tid = 0;
	}
}
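/*
 * Example (hypothetical zone name) of setting one of the tunables consumed
 * by get_mcap_tunables() above:
 *	# zonecfg -z web01
 *	zonecfg:web01> add attr
 *	zonecfg:web01:attr> set name=phys-mcap-no-pageout
 *	zonecfg:web01:attr> set type=boolean
 *	zonecfg:web01:attr> set value=true
 *	zonecfg:web01:attr> end
 *	zonecfg:web01> commit
 */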