diff options
Diffstat (limited to 'usr/src/cmd')
-rw-r--r-- | usr/src/cmd/fs.d/nfs/mountd/mountd.c | 8 | ||||
-rw-r--r-- | usr/src/cmd/fs.d/nfs/nfsd/nfsd.c | 8 | ||||
-rw-r--r-- | usr/src/cmd/rcap/rcapd/rcapd_scanner.c | 3 | ||||
-rw-r--r-- | usr/src/cmd/savecore/savecore.c | 17 | ||||
-rw-r--r-- | usr/src/cmd/stat/Makefile | 5 | ||||
-rw-r--r-- | usr/src/cmd/stat/zschedstat/Makefile | 50 | ||||
-rw-r--r-- | usr/src/cmd/stat/zschedstat/zschedstat.c | 335 | ||||
-rw-r--r-- | usr/src/cmd/truss/print.c | 9 | ||||
-rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 183 |
9 files changed, 513 insertions, 105 deletions
diff --git a/usr/src/cmd/fs.d/nfs/mountd/mountd.c b/usr/src/cmd/fs.d/nfs/mountd/mountd.c index dbf4c11ea1..04f8ff1def 100644 --- a/usr/src/cmd/fs.d/nfs/mountd/mountd.c +++ b/usr/src/cmd/fs.d/nfs/mountd/mountd.c @@ -20,6 +20,7 @@ * * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -390,6 +391,13 @@ main(int argc, char *argv[]) exit(1); } + /* Mountd cannot run in a non-global zone. */ + if (getzoneid() != GLOBAL_ZONEID) { + (void) fprintf(stderr, "%s: can only run in the global zone\n", + argv[0]); + exit(1); + } + maxthreads = 0; while ((c = getopt(argc, argv, "vrm:")) != EOF) { diff --git a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c index 6c0e0bda5e..c34c39a13e 100644 --- a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c +++ b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -176,6 +177,13 @@ main(int ac, char *av[]) exit(1); } + /* Nfsd cannot run in a non-global zone. */ + if (getzoneid() != GLOBAL_ZONEID) { + (void) fprintf(stderr, "%s: can only run in the global zone\n", + av[0]); + exit(1); + } + (void) enable_extended_FILE_stdio(-1, -1); /* diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c index b39811b552..254bb9e922 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ #pragma ident "%Z%%M% %I% %E% SMI" @@ -551,7 +552,7 @@ pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end) errno = 0; res = pr_memcntl(Pr, start, (end - start), MC_SYNC, - (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); + (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0); debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res); /* diff --git a/usr/src/cmd/savecore/savecore.c b/usr/src/cmd/savecore/savecore.c index 1315893f0d..4042d8f3ab 100644 --- a/usr/src/cmd/savecore/savecore.c +++ b/usr/src/cmd/savecore/savecore.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <stdio.h> @@ -1501,7 +1502,14 @@ getbounds(const char *f) long b = -1; const char *p = strrchr(f, '/'); - (void) sscanf(p ? p + 1 : f, "vmdump.%ld", &b); + if (p == NULL || strncmp(p, "vmdump", 6) != 0) + p = strstr(f, "vmdump"); + + if (p != NULL && *p == '/') + p++; + + (void) sscanf(p ? p : f, "vmdump.%ld", &b); + return (b); } @@ -1635,6 +1643,7 @@ main(int argc, char *argv[]) struct rlimit rl; long filebounds = -1; char namelist[30], corefile[30], boundstr[30]; + dumpfile = NULL; startts = gethrtime(); @@ -1675,7 +1684,11 @@ main(int argc, char *argv[]) } } - if (geteuid() != 0 && filebounds < 0) { + /* + * Extracting an existing dump (i.e. dumpfile has been provided as an + * option) is allowed for any user; anything else requires root. + */ + if (geteuid() != 0 && dumpfile == NULL) { (void) fprintf(stderr, "%s: %s %s\n", progname, gettext("you must be root to use"), progname); exit(1); diff --git a/usr/src/cmd/stat/Makefile b/usr/src/cmd/stat/Makefile index faaa19f42c..01b96d14d2 100644 --- a/usr/src/cmd/stat/Makefile +++ b/usr/src/cmd/stat/Makefile @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2011 Joyent, Inc. All rights reserved. +# Copyright 2011, 2012, Joyent, Inc. All rights reserved. # Use is subject to license terms.
# # cmd/stat/Makefile @@ -33,7 +33,8 @@ SUBDIRS= arcstat \ mpstat \ vfsstat \ vmstat \ - ziostat + ziostat \ + zschedstat all := TARGET = all install := TARGET = install diff --git a/usr/src/cmd/stat/zschedstat/Makefile b/usr/src/cmd/stat/zschedstat/Makefile new file mode 100644 index 0000000000..b8654d0ba4 --- /dev/null +++ b/usr/src/cmd/stat/zschedstat/Makefile @@ -0,0 +1,50 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+# + +include $(SRC)/cmd/Makefile.cmd + +PROG= zschedstat +OBJS = zschedstat.o +SRCS =$(OBJS:%.o=%.c) $(COMMON_SRCS) + +LDLIBS += -lkstat + +lint := LINTFLAGS = -muxs + +.KEEP_STATE: + +all: $(PROG) + +install: all .WAIT $(ROOTPROG) + +clean: + +$(ROOTBINPROG): $(PROG) + $(INS.file) + +lint: lint_SRCS + +check: + $(CSTYLE) -pP $(SRCS:%=%) + +include $(SRC)/cmd/Makefile.targ diff --git a/usr/src/cmd/stat/zschedstat/zschedstat.c b/usr/src/cmd/stat/zschedstat/zschedstat.c new file mode 100644 index 0000000000..ba89e2403f --- /dev/null +++ b/usr/src/cmd/stat/zschedstat/zschedstat.c @@ -0,0 +1,335 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012 Joyent, Inc. All rights reserved. 
+ */ + + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <kstat.h> +#include <errno.h> +#include <sys/zone.h> + +typedef struct { + boolean_t valid; + uint64_t rqueue; + uint64_t rticks; + uint32_t fss_share_pct; + uint64_t fss_pri_hi; + uint64_t fss_pri_avg; + double avrun1; + uint64_t ns_usr; + uint64_t ns_sys; + uint64_t ns_wt; + uint64_t cpu_cap; + uint64_t cpu_baseline; + uint64_t cpu_cap_usage; + uint64_t above_base_sec; + uint64_t delay_cnt; + uint64_t delay_time; + /* Values from the previous cycle so we can diff */ + uint64_t prv_rticks; + uint64_t prv_ns_usr; + uint64_t prv_ns_sys; + uint64_t prv_ns_wt; + uint64_t prv_above_base_sec; + uint64_t prv_delay_cnt; + uint64_t prv_delay_time; +} zinfo_t; + +/* + * MAX_ZONEID is only 10000, so it is a lot faster to go direct to the entry + * we want, even though valid entries in this array will be sparse. + */ + +static zinfo_t zinfo[MAX_ZONEID]; +static uint32_t nsec_per_tick = 0; + +static void +usage() +{ + (void) fprintf(stderr, "zschedstat [-r] [interval [count]]\n"); + exit(1); +} + +static void +get_zone_misc(int zid, kstat_t *ksp) +{ + kstat_named_t *kp; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "run_queue"); + zinfo[zid].rqueue = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "run_ticks"); + zinfo[zid].rticks = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_share_percent"); + zinfo[zid].fss_share_pct = kp->value.ui32; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_pri_hi"); + zinfo[zid].fss_pri_hi = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_pri_avg"); + zinfo[zid].fss_pri_avg = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "avenrun_1min"); + zinfo[zid].avrun1 = (double)kp->value.ui32 / FSCALE; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "nsec_user"); + zinfo[zid].ns_usr = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, 
"nsec_sys"); + zinfo[zid].ns_sys = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "nsec_waitrq"); + zinfo[zid].ns_wt = kp->value.ui64; +} + +static void +get_zone_caps(int zid, kstat_t *ksp) +{ + kstat_named_t *kp; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "value"); + zinfo[zid].cpu_cap = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "baseline"); + zinfo[zid].cpu_baseline = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "usage"); + zinfo[zid].cpu_cap_usage = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "above_base_sec"); + zinfo[zid].above_base_sec = kp->value.ui64; +} + +static void +get_zone_vfs(int zid, kstat_t *ksp) +{ + kstat_named_t *kp; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "delay_cnt"); + zinfo[zid].delay_cnt = kp->value.ui64; + + kp = (kstat_named_t *)kstat_data_lookup(ksp, "delay_time"); + zinfo[zid].delay_time = kp->value.ui64; +} + +static void +read_kstats() +{ + kstat_ctl_t *kc; + kstat_t *ksp; + + if ((kc = kstat_open()) == NULL) { + (void) fprintf(stderr, "open failed\n"); + exit(1); + } + + for (ksp = kc->kc_chain; ksp; ksp = ksp->ks_next) { + if (strcmp("zones", ksp->ks_module) == 0 && + strcmp("zone_misc", ksp->ks_class) == 0) { + if (kstat_read(kc, ksp, NULL) == -1) { + (void) fprintf(stderr, "read failed\n"); + exit(1); + } + zinfo[ksp->ks_instance].valid = B_TRUE; + + get_zone_misc(ksp->ks_instance, ksp); + + } else if (strcmp("caps", ksp->ks_module) == 0 && + strcmp("zone_caps", ksp->ks_class) == 0 && + strncmp("cpucaps_zone", ksp->ks_name, 12) == 0) { + if (kstat_read(kc, ksp, NULL) == -1) { + (void) fprintf(stderr, "read failed\n"); + exit(1); + } + zinfo[ksp->ks_instance].valid = B_TRUE; + + get_zone_caps(ksp->ks_instance, ksp); + + } else if (strcmp("zone_vfs", ksp->ks_module) == 0) { + if (kstat_read(kc, ksp, NULL) == -1) { + (void) fprintf(stderr, "read failed\n"); + exit(1); + } + zinfo[ksp->ks_instance].valid = B_TRUE; + + 
get_zone_vfs(ksp->ks_instance, ksp); + + } else if (nsec_per_tick == 0 && + strcmp("unix", ksp->ks_module) == 0 && + strcmp("system_misc", ksp->ks_name) == 0) { + kstat_named_t *kp; + + if (kstat_read(kc, ksp, NULL) == -1) { + (void) fprintf(stderr, "read failed\n"); + exit(1); + } + + kp = (kstat_named_t *)kstat_data_lookup(ksp, + "nsec_per_tick"); + nsec_per_tick = kp->value.ui32; + } + } + + (void) kstat_close(kc); +} + +static float +fmt_nsec(uint64_t curr, uint64_t prv) +{ + float s; + uint64_t nsec; + + nsec = curr - prv; + s = (float)nsec / (long)NANOSEC; + + return (s); +} + +/* convert usecs to msecs */ +static float +fmt_usec(uint64_t curr, uint64_t prv) +{ + float s; + uint64_t usec; + + usec = curr - prv; + s = (float)usec / (long)MILLISEC; + + return (s); +} + +static float +fmt_ticks(uint64_t curr, uint64_t prv) +{ + float s; + uint64_t ticks, nsec; + + ticks = curr - prv; + nsec = ticks * nsec_per_tick; + + s = (float)nsec / (long)NANOSEC; + + return (s); +} + +static void +print_data(boolean_t parse) +{ + int i; + char *fmt; + + if (parse) { + fmt = "%d,%lld,%.2f,%.1f,%lld,%lld,%lld,%lld,%lld," + "%.2f,%lld,%.2f,%.2f,%.2f,%.2f\n"; + } else { + fmt = "%4d %2lld %6.2f %5.1f %2lld %2lld %5lld %5lld %2lld " + "%5.2f %4lld %6.2f %6.2f %6.2f %6.2f\n"; + + (void) printf("%4s %2s %6s %5s %2s %2s %5s %5s %2s " + "%5s %4s %6s %6s %6s %6s\n", + "zid", "rq", "rsec", "sh%", "ph", "pa", "cap", "usage", + "bs", "1mla", "dcnt", "dms", "user", "sys", "wtrq"); + } + + for (i = 0; i < MAX_ZONEID; i++) { + if (zinfo[i].valid == B_FALSE) + continue; + + /*LINTED E_SEC_PRINTF_VAR_FMT*/ + (void) printf(fmt, + i, + zinfo[i].rqueue, + fmt_ticks(zinfo[i].rticks, zinfo[i].prv_rticks), + (float)zinfo[i].fss_share_pct / (float)10, + zinfo[i].fss_pri_hi, + zinfo[i].fss_pri_avg, + zinfo[i].cpu_cap, + zinfo[i].cpu_cap_usage, + zinfo[i].above_base_sec - zinfo[i].prv_above_base_sec, + zinfo[i].avrun1, + zinfo[i].delay_cnt - zinfo[i].prv_delay_cnt, + fmt_usec(zinfo[i].delay_time, 
zinfo[i].prv_delay_time), + fmt_nsec(zinfo[i].ns_usr, zinfo[i].prv_ns_usr), + fmt_nsec(zinfo[i].ns_sys, zinfo[i].prv_ns_sys), + fmt_nsec(zinfo[i].ns_wt, zinfo[i].prv_ns_wt)); + + zinfo[i].valid = B_FALSE; + zinfo[i].prv_rticks = zinfo[i].rticks; + zinfo[i].prv_ns_usr = zinfo[i].ns_usr; + zinfo[i].prv_ns_sys = zinfo[i].ns_sys; + zinfo[i].prv_ns_wt = zinfo[i].ns_wt; + zinfo[i].prv_above_base_sec = zinfo[i].above_base_sec; + zinfo[i].prv_delay_cnt = zinfo[i].delay_cnt; + zinfo[i].prv_delay_time = zinfo[i].delay_time; + } +} + +int +main(int argc, char **argv) +{ + int interval = 5; + int count; + int forever = 1; + int arg; + extern int optind; + boolean_t do_parse = B_FALSE; + + while ((arg = getopt(argc, argv, "r")) != EOF) { + switch (arg) { + case 'r': + do_parse = B_TRUE; + break; + default: + usage(); + } + } + + if (argc > optind) { + interval = atoi(argv[optind]); + optind++; + + if (argc > optind) { + count = atoi(argv[optind]); + forever = 0; + optind++; + } + } + if (argc > optind) + usage(); + + for (;;) { + read_kstats(); + print_data(do_parse); + if (forever == 0 && --count == 0) + break; + (void) sleep(interval); + } + + return (0); +} diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index 49d6da39f9..a8c923fae2 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2012, Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -825,7 +825,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */ return; case MC_SYNC: - if ((val & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) == 0) { + if ((val & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) + == 0) { *(s = pri->code_buf) = '\0'; if (val & MS_SYNC) (void) strlcat(s, "|MS_SYNC", CBSIZE); @@ -834,6 +836,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */ if (val & MS_INVALIDATE) (void) strlcat(s, "|MS_INVALIDATE", CBSIZE); + if (val & MS_INVALCURPROC) + (void) strlcat(s, "|MS_INVALCURPROC", + CBSIZE); } break; diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c index 9e0fba65e2..d52eec9c97 100644 --- a/usr/src/cmd/zoneadmd/mcap.c +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -40,7 +40,7 @@ * checks that against the zone's zone.max-physical-memory rctl. Once the * zone goes over its cap, then this thread will work through the zone's * /proc process list, Pgrab-bing each process and stepping through the - * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...) + * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) * to pageout pages, until the zone is again under its cap. * * Although zone memory capping is implemented as a soft cap by this user-level @@ -56,21 +56,14 @@ * the thread will work to pageout until the zone is under the cap, as shown * by updated vm_usage data. * - * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be - * used to examine a processes mapped segments while we are trying to pageout. - * The observed xmap segement size data is frequently smaller than the - * pagedata segement size data, so it is less effective in practice. Thus we - * use pagedata to determine the size of each segment. - * - * The pagedata page maps (at least on x86) are not useful. Those flags + * NOTE: The pagedata page maps (at least on x86) are not useful. 
Those flags * are set by hrm_setbits() and on x86 that code path is only executed by * segvn_pagelock -> hat_setstat -> hrm_setbits * segvn_softunlock -^ * On SPARC there is an additional code path which may make this data * useful (sfmmu_ttesync), but since it is not generic, we ignore the page - * maps and only use the segement info from pagedata. If we ever fix this - * issue, then we could generalize this mcap code to do more with the data on - * active pages. + * maps. If we ever fix this issue, then we could generalize this mcap code to + * do more with the data on active pages. * * For debugging, touch the file {zonepath}/mcap_debug.log. This will * cause the thread to start logging its actions into that file (it may take @@ -124,7 +117,6 @@ static cond_t shutdown_cv; static int shutting_down = 0; static thread_t mcap_tid; static FILE *debug_log_fp = NULL; -static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */ static uint64_t zone_rss_cap; /* RSS cap(KB) */ static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */ @@ -135,13 +127,7 @@ static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */ typedef struct { int pr_curr; /* the # of the mapping we're working on */ int pr_nmap; /* number of mappings in address space */ - int pr_cnt; /* number of mappings processed */ - - prpageheader_t *pr_pghp; /* process's complete pagedata */ - prasmap_t *pr_asp; /* current address space pointer */ - - uintptr_t pr_addr; /* base of mapping */ - uint64_t pr_size; /* size of mapping */ + prxmap_t *pr_xmapp; /* process's xmap array */ } proc_map_t; typedef struct zsd_vmusage64 { @@ -293,40 +279,21 @@ control_proc(pid_t pid) } /* - * Get data from the current prasmap_t and advance pr_asp to the next - * asmap in the pagedata. + * Get the next mapping. 
*/ -static uintptr_t +static prxmap_t * nextmapping(proc_map_t *pmp) { - prasmap_t *pap; - void *pdp; /* per-page data pointer */ - - pmp->pr_curr++; - if (pmp->pr_curr > pmp->pr_nmap) + if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap) return (NULL); - pap = pmp->pr_asp; - - pmp->pr_addr = pap->pr_vaddr; - pmp->pr_size = pap->pr_npage * pap->pr_pagesize; - pmp->pr_cnt++; - - /* Advance the pr_asp pointer to the next asmap */ - pdp = pap + 1; - pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage); - - /* Skip to next 64-bit-aligned address to get the next prasmap_t. */ - pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7); - pmp->pr_asp = (prasmap_t *)pdp; - - return (pmp->pr_addr); + return (&pmp->pr_xmapp[pmp->pr_curr++]); } /* * Initialize the proc_map_t to access the first mapping of an address space. */ -static void * +static prxmap_t * init_map(proc_map_t *pmp, pid_t pid) { int fd; @@ -337,39 +304,37 @@ init_map(proc_map_t *pmp, pid_t pid) bzero(pmp, sizeof (proc_map_t)); pmp->pr_nmap = -1; - (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc, - pid); + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc, pid); if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) return (NULL); redo: errno = 0; if (fstat(fd, &st) != 0) - return (NULL); + goto done; - if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) { - debug("cannot malloc() %ld bytes for pagedata", st.st_size); - return (NULL); + if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) { + debug("cannot malloc() %ld bytes for xmap", st.st_size); + goto done; } - (void) bzero(pmp->pr_pghp, st.st_size); + (void) bzero(pmp->pr_xmapp, st.st_size); errno = 0; - if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) { - free(pmp->pr_pghp); - pmp->pr_pghp = NULL; + if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) { + free(pmp->pr_xmapp); + pmp->pr_xmapp = NULL; if (res > 0 || errno == E2BIG) { goto redo; } else { - debug("pid %ld cannot read pagedata\n", pid); - return 
(NULL); + debug("pid %ld cannot read xmap\n", pid); + goto done; } } - pmp->pr_nmap = pmp->pr_pghp->pr_nmap; - pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1); + pmp->pr_nmap = st.st_size / sizeof (prxmap_t); done: (void) close(fd); - return ((void *)nextmapping(pmp)); + return (nextmapping(pmp)); } /* @@ -377,13 +342,24 @@ done: * return nonzero if not all of the pages may are pageable, for any reason. */ static int -pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp) +pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp) { int res; + /* + * We particularly want to avoid the pr_memcntl on anonymous mappings + * which show 0 since that will pull them back off of the free list + * and increase the zone's RSS, even though the process itself has + * them freed up. + */ + if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0) + return (0); + else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM) + return (0); + errno = 0; - res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC, - (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0); + res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0); /* * EBUSY indicates none of the pages have backing store allocated, or @@ -423,7 +399,7 @@ static int64_t pageout_process(pid_t pid, int64_t excess) { int psfd; - void *praddr; + prxmap_t *pxmap; proc_map_t cur; struct ps_prochandle *ph = NULL; int unpageable_mappings; @@ -433,7 +409,6 @@ pageout_process(pid_t pid, int64_t excess) int incr_rss_check = 0; char pathbuf[MAXPATHLEN]; - cur.pr_pghp = NULL; (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc, pid); if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0) @@ -459,11 +434,11 @@ pageout_process(pid_t pid, int64_t excess) } /* Get segment residency information. */ - praddr = init_map(&cur, pid); + pxmap = init_map(&cur, pid); /* Skip process if it has no mappings. 
*/ - if (cur.pr_pghp == NULL) { - debug("%ld: pagedata unreadable; ignoring\n", pid); + if (pxmap == NULL) { + debug("%ld: xmap unreadable; ignoring\n", pid); goto done; } @@ -489,15 +464,15 @@ pageout_process(pid_t pid, int64_t excess) */ sum_att = sum_d_rss = 0; unpageable_mappings = 0; - while (excess > 0 && praddr != NULL && !shutting_down) { + while (excess > 0 && pxmap != NULL && !shutting_down) { /* Try to page out the mapping. */ - if (pageout_mapping(ph, &cur) < 0) { + if (pageout_mapping(ph, pxmap) < 0) { debug("pid %ld: exited or unpageable\n", pid); break; } /* attempted is the size of the mapping */ - sum_att += (cur.pr_size / 1024); + sum_att += pxmap->pr_size / 1024; /* * This processes RSS is potentially enough to clear the @@ -519,11 +494,10 @@ pageout_process(pid_t pid, int64_t excess) } else { excess += d_rss; sum_d_rss += d_rss; - sum_pageout += (-d_rss * 1024); } } - praddr = (void *)nextmapping(&cur); + pxmap = nextmapping(&cur); } if (!incr_rss_check) { @@ -531,12 +505,11 @@ pageout_process(pid_t pid, int64_t excess) if (d_rss < 0) { excess += d_rss; sum_d_rss += d_rss; - sum_pageout += (-d_rss * 1024); } } - debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n", - pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att, + debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n", + pid, unpageable_mappings, (unsigned long long)sum_att, (unsigned long long)sum_d_rss, (long long)excess); done: @@ -546,8 +519,8 @@ done: (void) Prelease(ph, 0); } - if (cur.pr_pghp != NULL) - free(cur.pr_pghp); + if (cur.pr_xmapp != NULL) + free(cur.pr_xmapp); (void) close(psfd); @@ -680,12 +653,13 @@ get_zone_cap() * is important considering that each zone will be monitoring its rss. 
*/ static int64_t -check_suspend(int age) +check_suspend(int age, boolean_t new_cycle) { static hrtime_t last_cap_read = 0; static uint64_t addon; static uint64_t lo_thresh; /* Thresholds for how long to sleep */ static uint64_t hi_thresh; /* when under the cap (80% & 90%). */ + static uint64_t prev_zone_rss = 0; /* Wait a second to give the async pageout a chance to catch up. */ (void) sleep_shutdown(1); @@ -742,16 +716,6 @@ check_suspend(int age) continue; } - /* - * If we did some paging out since our last invocation then - * update the kstat so we can track how much was paged out. - */ - if (sum_pageout != 0) { - (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, - &sum_pageout, 0); - sum_pageout = 0; - } - zone_rss = get_mem_info(age); /* calculate excess */ @@ -760,18 +724,41 @@ check_suspend(int age) debug("rss %lluKB, cap %lluKB, excess %lldKB\n", zone_rss, zone_rss_cap, new_excess); + /* + * If necessary, update stats. + */ + + /* + * If it looks like we did some paging out since last over the + * cap then update the kstat so we can approximate how much was + * paged out. + */ + if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) { + uint64_t diff; + + /* assume diff is num bytes we paged out */ + diff = (prev_zone_rss - zone_rss) * 1024; + + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, + &diff, 0); + } + prev_zone_rss = zone_rss; + if (new_excess > 0) { - uint64_t n = 1; + if (new_cycle) { + uint64_t n = 1; - /* Increment "nover" kstat. */ - (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0); + /* Increment "nover" kstat. */ + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, + &n, 0); + } /* - * Once we go over the cap, then we want to page out a - * little extra instead of stopping right at the cap. - * To do this we add 5% to the excess so that - * pageout_proces will work a little longer before - * stopping. + * Once we go over the cap, then we want to + * page out a little extra instead of stopping + * right at the cap.
To do this we add 5% to + * the excess so that pageout_process will work + * a little longer before stopping. */ return ((int64_t)(new_excess + addon)); } @@ -845,7 +832,7 @@ mcap_zone() struct dirent *dirent; /* Wait until we've gone over the cap. */ - excess = check_suspend(age); + excess = check_suspend(age, B_TRUE); debug("starting to scan, excess %lldk\n", (long long)excess); @@ -885,10 +872,10 @@ mcap_zone() excess = pageout_process(pid, excess); if (excess <= 0) { - debug("done scanning; excess %lld\n", + debug("apparently under; excess %lld\n", (long long)excess); /* Double check the current excess */ - excess = check_suspend(1); + excess = check_suspend(1, B_FALSE); } } |