Diffstat (limited to 'usr/src/cmd/zoneadmd/mcap.c')
-rw-r--r--  usr/src/cmd/zoneadmd/mcap.c  183
1 file changed, 85 insertions(+), 98 deletions(-)
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 9e0fba65e2..d52eec9c97 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -40,7 +40,7 @@
  * checks that against the zone's zone.max-physical-memory rctl. Once the
  * zone goes over its cap, then this thread will work through the zone's
  * /proc process list, Pgrab-bing each process and stepping through the
- * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...)
+ * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
  * to pageout pages, until the zone is again under its cap.
  *
  * Although zone memory capping is implemented as a soft cap by this user-level
@@ -56,21 +56,14 @@
  * the thread will work to pageout until the zone is under the cap, as shown
  * by updated vm_usage data.
  *
- * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be
- * used to examine a processes mapped segments while we are trying to pageout.
- * The observed xmap segement size data is frequently smaller than the
- * pagedata segement size data, so it is less effective in practice. Thus we
- * use pagedata to determine the size of each segment.
- *
- * The pagedata page maps (at least on x86) are not useful. Those flags
+ * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
  * are set by hrm_setbits() and on x86 that code path is only executed by
  *      segvn_pagelock -> hat_setstat -> hrm_setbits
  *      segvn_softunlock -^
  * On SPARC there is an additional code path which may make this data
  * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
- * maps and only use the segement info from pagedata. If we ever fix this
- * issue, then we could generalize this mcap code to do more with the data on
- * active pages.
+ * maps. If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
  *
  * For debugging, touch the file {zonepath}/mcap_debug.log. This will
  * cause the thread to start logging its actions into that file (it may take
@@ -124,7 +117,6 @@ static cond_t shutdown_cv;
 static int shutting_down = 0;
 static thread_t mcap_tid;
 static FILE *debug_log_fp = NULL;
-static uint64_t sum_pageout = 0;        /* total bytes paged out in a pass */
 static uint64_t zone_rss_cap;           /* RSS cap(KB) */
 static char over_cmd[2 * BUFSIZ];       /* same size as zone_attr_value */

@@ -135,13 +127,7 @@ static char over_cmd[2 * BUFSIZ];       /* same size as zone_attr_value */
 typedef struct {
         int pr_curr;            /* the # of the mapping we're working on */
         int pr_nmap;            /* number of mappings in address space */
-        int pr_cnt;             /* number of mappings processed */
-
-        prpageheader_t *pr_pghp; /* process's complete pagedata */
-        prasmap_t *pr_asp;      /* current address space pointer */
-
-        uintptr_t pr_addr;      /* base of mapping */
-        uint64_t pr_size;       /* size of mapping */
+        prxmap_t *pr_xmapp;     /* process's xmap array */
 } proc_map_t;

 typedef struct zsd_vmusage64 {
@@ -293,40 +279,21 @@ control_proc(pid_t pid)
 }

 /*
- * Get data from the current prasmap_t and advance pr_asp to the next
- * asmap in the pagedata.
+ * Get the next mapping.
  */
-static uintptr_t
+static prxmap_t *
 nextmapping(proc_map_t *pmp)
 {
-        prasmap_t *pap;
-        void *pdp;      /* per-page data pointer */
-
-        pmp->pr_curr++;
-        if (pmp->pr_curr > pmp->pr_nmap)
+        if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
                 return (NULL);

-        pap = pmp->pr_asp;
-
-        pmp->pr_addr = pap->pr_vaddr;
-        pmp->pr_size = pap->pr_npage * pap->pr_pagesize;
-        pmp->pr_cnt++;
-
-        /* Advance the pr_asp pointer to the next asmap */
-        pdp = pap + 1;
-        pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage);
-
-        /* Skip to next 64-bit-aligned address to get the next prasmap_t. */
-        pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7);
-        pmp->pr_asp = (prasmap_t *)pdp;
-
-        return (pmp->pr_addr);
+        return (&pmp->pr_xmapp[pmp->pr_curr++]);
 }

 /*
  * Initialize the proc_map_t to access the first mapping of an address space.
  */
-static void *
+static prxmap_t *
 init_map(proc_map_t *pmp, pid_t pid)
 {
         int fd;
@@ -337,39 +304,37 @@ init_map(proc_map_t *pmp, pid_t pid)
         bzero(pmp, sizeof (proc_map_t));
         pmp->pr_nmap = -1;

-        (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc,
-            pid);
+        (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc, pid);
         if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
                 return (NULL);

 redo:
         errno = 0;
         if (fstat(fd, &st) != 0)
-                return (NULL);
+                goto done;

-        if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) {
-                debug("cannot malloc() %ld bytes for pagedata", st.st_size);
-                return (NULL);
+        if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) {
+                debug("cannot malloc() %ld bytes for xmap", st.st_size);
+                goto done;
         }
-        (void) bzero(pmp->pr_pghp, st.st_size);
+        (void) bzero(pmp->pr_xmapp, st.st_size);

         errno = 0;
-        if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) {
-                free(pmp->pr_pghp);
-                pmp->pr_pghp = NULL;
+        if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) {
+                free(pmp->pr_xmapp);
+                pmp->pr_xmapp = NULL;

                 if (res > 0 || errno == E2BIG) {
                         goto redo;
                 } else {
-                        debug("pid %ld cannot read pagedata\n", pid);
-                        return (NULL);
+                        debug("pid %ld cannot read xmap\n", pid);
+                        goto done;
                 }
         }

-        pmp->pr_nmap = pmp->pr_pghp->pr_nmap;
-        pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1);
+        pmp->pr_nmap = st.st_size / sizeof (prxmap_t);

 done:
         (void) close(fd);
-        return ((void *)nextmapping(pmp));
+        return (nextmapping(pmp));
 }

 /*
@@ -377,13 +342,24 @@ done:
  * return nonzero if not all of the pages may are pageable, for any reason.
  */
 static int
-pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp)
+pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp)
 {
         int res;

+        /*
+         * We particularly want to avoid the pr_memcntl on anonymous mappings
+         * which show 0 since that will pull them back off of the free list
+         * and increase the zone's RSS, even though the process itself has
+         * them freed up.
+         */
+        if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0)
+                return (0);
+        else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
+                return (0);
+
         errno = 0;
-        res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC,
-            (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+        res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC,
+            (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);

         /*
          * EBUSY indicates none of the pages have backing store allocated, or
@@ -423,7 +399,7 @@ static int64_t
 pageout_process(pid_t pid, int64_t excess)
 {
         int psfd;
-        void *praddr;
+        prxmap_t *pxmap;
         proc_map_t cur;
         struct ps_prochandle *ph = NULL;
         int unpageable_mappings;
@@ -433,7 +409,6 @@ pageout_process(pid_t pid, int64_t excess)
         int incr_rss_check = 0;
         char pathbuf[MAXPATHLEN];

-        cur.pr_pghp = NULL;
         (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
             pid);
         if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
@@ -459,11 +434,11 @@ pageout_process(pid_t pid, int64_t excess)
         }

         /* Get segment residency information. */
-        praddr = init_map(&cur, pid);
+        pxmap = init_map(&cur, pid);

         /* Skip process if it has no mappings. */
-        if (cur.pr_pghp == NULL) {
-                debug("%ld: pagedata unreadable; ignoring\n", pid);
+        if (pxmap == NULL) {
+                debug("%ld: xmap unreadable; ignoring\n", pid);
                 goto done;
         }

@@ -489,15 +464,15 @@
          */
         sum_att = sum_d_rss = 0;
         unpageable_mappings = 0;
-        while (excess > 0 && praddr != NULL && !shutting_down) {
+        while (excess > 0 && pxmap != NULL && !shutting_down) {
                 /* Try to page out the mapping. */
-                if (pageout_mapping(ph, &cur) < 0) {
+                if (pageout_mapping(ph, pxmap) < 0) {
                         debug("pid %ld: exited or unpageable\n", pid);
                         break;
                 }

                 /* attempted is the size of the mapping */
-                sum_att += (cur.pr_size / 1024);
+                sum_att += pxmap->pr_size / 1024;

                 /*
                  * This processes RSS is potentially enough to clear the
@@ -519,11 +494,10 @@
                         } else {
                                 excess += d_rss;
                                 sum_d_rss += d_rss;
-                                sum_pageout += (-d_rss * 1024);
                         }
                 }

-                praddr = (void *)nextmapping(&cur);
+                pxmap = nextmapping(&cur);
         }

         if (!incr_rss_check) {
@@ -531,12 +505,11 @@
                 if (d_rss < 0) {
                         excess += d_rss;
                         sum_d_rss += d_rss;
-                        sum_pageout += (-d_rss * 1024);
                 }
         }

-        debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n",
-            pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att,
+        debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n",
+            pid, unpageable_mappings, (unsigned long long)sum_att,
             (unsigned long long)sum_d_rss, (long long)excess);

 done:
@@ -546,8 +519,8 @@ done:
                 (void) Prelease(ph, 0);
         }

-        if (cur.pr_pghp != NULL)
-                free(cur.pr_pghp);
+        if (cur.pr_xmapp != NULL)
+                free(cur.pr_xmapp);

         (void) close(psfd);

@@ -680,12 +653,13 @@ get_zone_cap()
  * is important considering that each zone will be monitoring its rss.
  */
 static int64_t
-check_suspend(int age)
+check_suspend(int age, boolean_t new_cycle)
 {
         static hrtime_t last_cap_read = 0;
         static uint64_t addon;
         static uint64_t lo_thresh;      /* Thresholds for how long to sleep */
         static uint64_t hi_thresh;      /* when under the cap (80% & 90%). */
+        static uint64_t prev_zone_rss = 0;

         /* Wait a second to give the async pageout a chance to catch up. */
         (void) sleep_shutdown(1);
@@ -742,16 +716,6 @@
                         continue;
                 }

-                /*
-                 * If we did some paging out since our last invocation then
-                 * update the kstat so we can track how much was paged out.
-                 */
-                if (sum_pageout != 0) {
-                        (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
-                            &sum_pageout, 0);
-                        sum_pageout = 0;
-                }
-
                 zone_rss = get_mem_info(age);

                 /* calculate excess */
@@ -760,18 +724,41 @@
                 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
                     zone_rss, zone_rss_cap, new_excess);

+                /*
+                 * If necessary, updates stats.
+                 */
+
+                /*
+                 * If it looks like we did some paging out since last over the
+                 * cap then update the kstat so we can approximate how much was
+                 * paged out.
+                 */
+                if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+                        uint64_t diff;
+
+                        /* assume diff is num bytes we paged out */
+                        diff = (prev_zone_rss - zone_rss) * 1024;
+
+                        (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+                            &diff, 0);
+                }
+                prev_zone_rss = zone_rss;
+
                 if (new_excess > 0) {
-                        uint64_t n = 1;
+                        if (new_cycle) {
+                                uint64_t n = 1;

-                        /* Increment "nover" kstat. */
-                        (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+                                /* Increment "nover" kstat. */
+                                (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER,
+                                    &n, 0);
+                        }

                         /*
-                         * Once we go over the cap, then we want to page out a
-                         * little extra instead of stopping right at the cap.
-                         * To do this we add 5% to the excess so that
-                         * pageout_proces will work a little longer before
-                         * stopping.
+                         * Once we go over the cap, then we want to
+                         * page out a little extra instead of stopping
+                         * right at the cap. To do this we add 5% to
+                         * the excess so that pageout_proces will work
+                         * a little longer before stopping.
                          */
                         return ((int64_t)(new_excess + addon));
                 }
@@ -845,7 +832,7 @@ mcap_zone()
                 struct dirent *dirent;

                 /* Wait until we've gone over the cap. */
-                excess = check_suspend(age);
+                excess = check_suspend(age, B_TRUE);

                 debug("starting to scan, excess %lldk\n", (long long)excess);

@@ -885,10 +872,10 @@ mcap_zone()
                         excess = pageout_process(pid, excess);

                         if (excess <= 0) {
-                                debug("done scanning; excess %lld\n",
+                                debug("apparently under; excess %lld\n",
                                     (long long)excess);

                                 /* Double check the current excess */
-                                excess = check_suspend(1);
+                                excess = check_suspend(1, B_FALSE);
                         }
                 }
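The heart of the change is visible in init_map(): /proc/<pid>/xmap is a flat array of fixed-size prxmap_t entries, so the mapping count falls directly out of the file size (st.st_size / sizeof (prxmap_t)) and nextmapping() collapses into a plain array walk, where the old pagedata parsing had to step over variable-length per-page records and realign. The redo loop survives because the address space can grow between the fstat() and the read(). Below is a minimal standalone sketch of the same read-retry pattern on an illumos system; the program itself and its output format are illustrative, not part of the patch.

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <procfs.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char path[MAXPATHLEN];
	struct stat st;
	prxmap_t *xmapp;
	ssize_t res;
	int fd, i, nmap;

	(void) snprintf(path, sizeof (path), "/proc/%s/xmap",
	    (argc > 1) ? argv[1] : "self");
	if ((fd = open(path, O_RDONLY)) < 0)
		return (1);

redo:
	if (fstat(fd, &st) != 0) {
		(void) close(fd);
		return (1);
	}
	if ((xmapp = malloc(st.st_size)) == NULL) {
		(void) close(fd);
		return (1);
	}

	/*
	 * The mapping list can grow between the fstat() and the read();
	 * a short read (or E2BIG) means the snapshot is stale, so retry.
	 */
	if ((res = read(fd, xmapp, st.st_size)) != st.st_size) {
		free(xmapp);
		if (res > 0 || errno == E2BIG)
			goto redo;
		(void) close(fd);
		return (1);
	}
	(void) close(fd);

	/* xmap is a flat prxmap_t array; the count is implied by the size. */
	nmap = st.st_size / sizeof (prxmap_t);
	for (i = 0; i < nmap; i++)
		(void) printf("%p %luKB rss %luKB\n",
		    (void *)xmapp[i].pr_vaddr,
		    (ulong_t)(xmapp[i].pr_size / 1024),
		    (ulong_t)(xmapp[i].pr_rss * xmapp[i].pr_pagesize / 1024));
	free(xmapp);
	return (0);
}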
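The new guard at the top of pageout_mapping() is where the xmap data pays off: prxmap_t carries pr_anon, the count of resident anonymous pages, so an anonymous mapping showing zero is skipped outright (invalidating it would only pull freed pages back off the free list and inflate the zone's RSS), and ISM and SysV shared memory (MA_ISM, MA_SHM) are skipped because their pages are shared with other processes. MS_INVALCURPROC is the illumos-joyent flag that invalidates only the calling process's view of the pages, where MS_INVALIDATE would hit every mapping of the underlying object. A hedged sketch of the same logic, applied by a process to one of its own mappings via memcntl(2) rather than through libproc's pr_memcntl() on a Pgrab-bed victim; it assumes SmartOS headers (which define MS_INVALCURPROC), and pageout_one and its return convention are illustrative names, not from the source:

#include <sys/types.h>
#include <sys/mman.h>
#include <procfs.h>
#include <errno.h>

/*
 * Try to page out one of our own mappings, described by a prxmap_t
 * snapshot. Returns 0 if the mapping was skipped or paged out, -1 on
 * a hard failure.
 */
static int
pageout_one(const prxmap_t *pxp)
{
	/*
	 * An anonymous mapping with no resident anonymous pages has
	 * nothing left to page out; invalidating it would only grow RSS.
	 */
	if ((pxp->pr_mflags & MA_ANON) && pxp->pr_anon == 0)
		return (0);

	/* ISM and SysV shared memory are shared with other processes. */
	if (pxp->pr_mflags & (MA_ISM | MA_SHM))
		return (0);

	errno = 0;
	if (memcntl((caddr_t)pxp->pr_vaddr, pxp->pr_size, MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0) == 0)
		return (0);

	/*
	 * Per the comment in mcap.c, EBUSY means no backing store is
	 * allocated or the pages are locked; treat that as "not
	 * pageable" rather than as an error (this mapping is assumed).
	 */
	return (errno == EBUSY ? 0 : -1);
}

The (MA_ISM | MA_SHM) mask is just a tighter spelling of the patch's two-clause test; the semantics are identical.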
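Finally, with the global sum_pageout counter gone, the ZONE_ATTR_PMCAP_PAGEOUT kstat is fed from successive RSS samples in check_suspend(): if the previous sample was over the cap and RSS has since dropped, the whole drop is credited as pageout. That is explicitly an approximation, since the zone may also have freed memory on its own; hence the patch's own comment, "assume diff is num bytes we paged out". The bookkeeping reduces to a small pure function (the name and factoring here are illustrative):

#include <sys/types.h>

/*
 * Given the previous and current zone RSS samples and the cap (all in
 * KB), return the number of bytes to credit to the pageout kstat: the
 * RSS drop since the last sample, counted only when that sample was
 * over the cap.
 */
static uint64_t
pageout_credit(uint64_t prev_rss_kb, uint64_t rss_kb, uint64_t cap_kb)
{
	if (prev_rss_kb > cap_kb && rss_kb < prev_rss_kb)
		return ((prev_rss_kb - rss_kb) * 1024);
	return (0);
}

For example, with a 1048576KB (1GB) cap, a previous sample of 1150000KB and a current sample of 1040000KB credit 110000KB, i.e. 112640000 bytes, even if part of that drop came from the workload exiting rather than from pageout. The same sampling structure explains the new new_cycle argument: only the first check_suspend() call of a scan cycle (B_TRUE from mcap_zone()) bumps the "nover" kstat, so one excursion over the cap is counted once rather than once per rescan.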