summaryrefslogtreecommitdiff
path: root/usr/src/cmd
diff options
context:
space:
mode:
authorJohn Sonnenschein <johns@joyent.com>2012-01-26 22:45:08 +0000
committerJohn Sonnenschein <johns@joyent.com>2012-01-26 22:45:08 +0000
commit92fb61a58dea0e097df79e73d3cda1bef52c4339 (patch)
tree8a9727b7a3600c29b261de4580520de19ec57e56 /usr/src/cmd
parentd6f90348d6442b80b8e77a4aaf217cbf86c54bdb (diff)
parent6a5fc1386817167c00f9ee99bebd9ce35d434593 (diff)
downloadillumos-joyent-gcc4.tar.gz
Merge branch 'gcc4' of git.joyent.com:illumos-joyent into gcc4gcc4
Diffstat (limited to 'usr/src/cmd')
-rw-r--r--usr/src/cmd/fs.d/nfs/mountd/mountd.c8
-rw-r--r--usr/src/cmd/fs.d/nfs/nfsd/nfsd.c8
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_scanner.c3
-rw-r--r--usr/src/cmd/savecore/savecore.c17
-rw-r--r--usr/src/cmd/stat/Makefile5
-rw-r--r--usr/src/cmd/stat/zschedstat/Makefile50
-rw-r--r--usr/src/cmd/stat/zschedstat/zschedstat.c335
-rw-r--r--usr/src/cmd/truss/print.c9
-rw-r--r--usr/src/cmd/zoneadmd/mcap.c183
9 files changed, 513 insertions, 105 deletions
diff --git a/usr/src/cmd/fs.d/nfs/mountd/mountd.c b/usr/src/cmd/fs.d/nfs/mountd/mountd.c
index dbf4c11ea1..04f8ff1def 100644
--- a/usr/src/cmd/fs.d/nfs/mountd/mountd.c
+++ b/usr/src/cmd/fs.d/nfs/mountd/mountd.c
@@ -20,6 +20,7 @@
*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -390,6 +391,13 @@ main(int argc, char *argv[])
exit(1);
}
+ /* Mountd cannot run in a non-global zone. */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ (void) fprintf(stderr, "%s: can only run in the global zone\n",
+ argv[0]);
+ exit(1);
+ }
+
maxthreads = 0;
while ((c = getopt(argc, argv, "vrm:")) != EOF) {
diff --git a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
index 6c0e0bda5e..c34c39a13e 100644
--- a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
+++ b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -176,6 +177,13 @@ main(int ac, char *av[])
exit(1);
}
+ /* Nfsd cannot run in a non-global zone. */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ (void) fprintf(stderr, "%s: can only run in the global zone\n",
+ av[0]);
+ exit(1);
+ }
+
(void) enable_extended_FILE_stdio(-1, -1);
/*
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
index b39811b552..254bb9e922 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -551,7 +552,7 @@ pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
errno = 0;
res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
/*
diff --git a/usr/src/cmd/savecore/savecore.c b/usr/src/cmd/savecore/savecore.c
index 1315893f0d..4042d8f3ab 100644
--- a/usr/src/cmd/savecore/savecore.c
+++ b/usr/src/cmd/savecore/savecore.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <stdio.h>
@@ -1501,7 +1502,14 @@ getbounds(const char *f)
long b = -1;
const char *p = strrchr(f, '/');
- (void) sscanf(p ? p + 1 : f, "vmdump.%ld", &b);
+ if (p == NULL || strncmp(p, "vmdump", 6) != 0)
+ p = strstr(f, "vmdump");
+
+ if (p != NULL && *p == '/')
+ p++;
+
+ (void) sscanf(p ? p : f, "vmdump.%ld", &b);
+
return (b);
}
@@ -1635,6 +1643,7 @@ main(int argc, char *argv[])
struct rlimit rl;
long filebounds = -1;
char namelist[30], corefile[30], boundstr[30];
+ dumpfile = NULL;
startts = gethrtime();
@@ -1675,7 +1684,11 @@ main(int argc, char *argv[])
}
}
- if (geteuid() != 0 && filebounds < 0) {
+ /*
+ * Extracting an existing dump (i.e. dumpfile has been provided as an
+ * option) is allowed for any user; for anything else the user must be root.
+ */
+ if (geteuid() != 0 && dumpfile == NULL) {
(void) fprintf(stderr, "%s: %s %s\n", progname,
gettext("you must be root to use"), progname);
exit(1);
diff --git a/usr/src/cmd/stat/Makefile b/usr/src/cmd/stat/Makefile
index faaa19f42c..01b96d14d2 100644
--- a/usr/src/cmd/stat/Makefile
+++ b/usr/src/cmd/stat/Makefile
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2011 Joyent, Inc. All rights reserved.
+# Copyright 2011, 2012, Joyent, Inc. All rights reserved.
# Use is subject to license terms.
#
# cmd/stat/Makefile
@@ -33,7 +33,8 @@ SUBDIRS= arcstat \
mpstat \
vfsstat \
vmstat \
- ziostat
+ ziostat \
+ zschedstat
all := TARGET = all
install := TARGET = install
diff --git a/usr/src/cmd/stat/zschedstat/Makefile b/usr/src/cmd/stat/zschedstat/Makefile
new file mode 100644
index 0000000000..b8654d0ba4
--- /dev/null
+++ b/usr/src/cmd/stat/zschedstat/Makefile
@@ -0,0 +1,50 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+#
+
+#
+# Build description for the zschedstat command (source: zschedstat.c),
+# linked against libkstat.
+#
+include $(SRC)/cmd/Makefile.cmd
+
+PROG= zschedstat
+OBJS = zschedstat.o
+# NOTE(review): COMMON_SRCS is not set in this file; presumably it expands
+# to nothing unless an included makefile defines it -- confirm.
+SRCS =$(OBJS:%.o=%.c) $(COMMON_SRCS)
+
+LDLIBS += -lkstat
+
+lint := LINTFLAGS = -muxs
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+# NOTE(review): install depends on $(ROOTPROG) while the install rule below
+# is written for $(ROOTBINPROG); confirm both names are provided by
+# Makefile.cmd and refer to the same path.
+install: all .WAIT $(ROOTPROG)
+
+# clean has no recipe in this file.
+clean:
+
+$(ROOTBINPROG): $(PROG)
+ $(INS.file)
+
+lint: lint_SRCS
+
+# Run the C style checker over the sources.
+check:
+ $(CSTYLE) -pP $(SRCS:%=%)
+
+include $(SRC)/cmd/Makefile.targ
diff --git a/usr/src/cmd/stat/zschedstat/zschedstat.c b/usr/src/cmd/stat/zschedstat/zschedstat.c
new file mode 100644
index 0000000000..ba89e2403f
--- /dev/null
+++ b/usr/src/cmd/stat/zschedstat/zschedstat.c
@@ -0,0 +1,335 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <kstat.h>
+#include <errno.h>
+#include <sys/zone.h>
+
+typedef struct { /* per-zone statistics; one zinfo[] slot per zone id */
+ boolean_t valid; /* kstat read this cycle; cleared after printing */
+ uint64_t rqueue; /* zone_misc "run_queue" */
+ uint64_t rticks; /* zone_misc "run_ticks" (cumulative) */
+ uint32_t fss_share_pct; /* zone_misc "fss_share_percent"; printed / 10 */
+ uint64_t fss_pri_hi; /* zone_misc "fss_pri_hi" */
+ uint64_t fss_pri_avg; /* zone_misc "fss_pri_avg" */
+ double avrun1; /* zone_misc "avenrun_1min" divided by FSCALE */
+ uint64_t ns_usr; /* zone_misc "nsec_user" (cumulative) */
+ uint64_t ns_sys; /* zone_misc "nsec_sys" (cumulative) */
+ uint64_t ns_wt; /* zone_misc "nsec_waitrq" (cumulative) */
+ uint64_t cpu_cap; /* zone_caps "value" */
+ uint64_t cpu_baseline; /* zone_caps "baseline" */
+ uint64_t cpu_cap_usage; /* zone_caps "usage" */
+ uint64_t above_base_sec; /* zone_caps "above_base_sec" (cumulative) */
+ uint64_t delay_cnt; /* zone_vfs "delay_cnt" (cumulative) */
+ uint64_t delay_time; /* zone_vfs "delay_time" (cumulative) */
+ /* Values from the previous cycle so we can diff */
+ uint64_t prv_rticks;
+ uint64_t prv_ns_usr;
+ uint64_t prv_ns_sys;
+ uint64_t prv_ns_wt;
+ uint64_t prv_above_base_sec;
+ uint64_t prv_delay_cnt;
+ uint64_t prv_delay_time;
+} zinfo_t;
+
+/*
+ * Per-zone state, indexed directly by the kstat instance number, which this
+ * program treats as the zone id.
+ *
+ * MAX_ZONEID is only 10000, so it is a lot faster to go direct to the entry
+ * we want, even though valid entries in this array will be sparse.
+ */
+
+static zinfo_t zinfo[MAX_ZONEID];
+static uint32_t nsec_per_tick = 0; /* 0 until read from unix system_misc */
+
+/*
+ * Print the command synopsis to stderr and exit with status 1.
+ * This function does not return.
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr, "zschedstat [-r] [interval [count]]\n");
+	exit(1);
+}
+
+/*
+ * Copy the 64-bit unsigned value of the named statistic `name' from the
+ * already-read kstat `ksp' into *valp.
+ *
+ * kstat_data_lookup() returns NULL when the kstat has no statistic of that
+ * name.  In that case *valp is left untouched (so cumulative counters simply
+ * show no change) and B_FALSE is returned; otherwise B_TRUE is returned.
+ */
+static boolean_t
+fetch_ui64(kstat_t *ksp, char *name, uint64_t *valp)
+{
+	kstat_named_t *kp;
+
+	kp = (kstat_named_t *)kstat_data_lookup(ksp, name);
+	if (kp == NULL)
+		return (B_FALSE);
+
+	*valp = kp->value.ui64;
+	return (B_TRUE);
+}
+
+/*
+ * Same as fetch_ui64() but for a 32-bit unsigned statistic.
+ */
+static boolean_t
+fetch_ui32(kstat_t *ksp, char *name, uint32_t *valp)
+{
+	kstat_named_t *kp;
+
+	kp = (kstat_named_t *)kstat_data_lookup(ksp, name);
+	if (kp == NULL)
+		return (B_FALSE);
+
+	*valp = kp->value.ui32;
+	return (B_TRUE);
+}
+
+/*
+ * Record the scheduling statistics of a "zone_misc" kstat in zinfo[zid].
+ *
+ * `ksp' must already have been read with kstat_read() and `zid' must be a
+ * valid index into zinfo[].  Statistics the kstat does not provide are
+ * skipped rather than dereferencing a NULL lookup result.
+ */
+static void
+get_zone_misc(int zid, kstat_t *ksp)
+{
+	zinfo_t *zp = &zinfo[zid];
+	uint32_t avenrun;
+
+	(void) fetch_ui64(ksp, "run_queue", &zp->rqueue);
+	(void) fetch_ui64(ksp, "run_ticks", &zp->rticks);
+	(void) fetch_ui32(ksp, "fss_share_percent", &zp->fss_share_pct);
+	(void) fetch_ui64(ksp, "fss_pri_hi", &zp->fss_pri_hi);
+	(void) fetch_ui64(ksp, "fss_pri_avg", &zp->fss_pri_avg);
+
+	/* The load average is exported as an integer scaled by FSCALE. */
+	if (fetch_ui32(ksp, "avenrun_1min", &avenrun))
+		zp->avrun1 = (double)avenrun / FSCALE;
+
+	(void) fetch_ui64(ksp, "nsec_user", &zp->ns_usr);
+	(void) fetch_ui64(ksp, "nsec_sys", &zp->ns_sys);
+	(void) fetch_ui64(ksp, "nsec_waitrq", &zp->ns_wt);
+}
+
+/*
+ * Record the CPU cap statistics of a "zone_caps" kstat in zinfo[zid].
+ * Same preconditions and missing-statistic handling as get_zone_misc().
+ */
+static void
+get_zone_caps(int zid, kstat_t *ksp)
+{
+	zinfo_t *zp = &zinfo[zid];
+
+	(void) fetch_ui64(ksp, "value", &zp->cpu_cap);
+	(void) fetch_ui64(ksp, "baseline", &zp->cpu_baseline);
+	(void) fetch_ui64(ksp, "usage", &zp->cpu_cap_usage);
+	(void) fetch_ui64(ksp, "above_base_sec", &zp->above_base_sec);
+}
+
+/*
+ * Record the delay statistics of a "zone_vfs" kstat in zinfo[zid].
+ * Same preconditions and missing-statistic handling as get_zone_misc().
+ */
+static void
+get_zone_vfs(int zid, kstat_t *ksp)
+{
+	zinfo_t *zp = &zinfo[zid];
+
+	(void) fetch_ui64(ksp, "delay_cnt", &zp->delay_cnt);
+	(void) fetch_ui64(ksp, "delay_time", &zp->delay_time);
+}
+
+/*
+ * Read the data for `ksp', printing a message and exiting with status 1 if
+ * the read fails.
+ */
+static void
+read_or_exit(kstat_ctl_t *kc, kstat_t *ksp)
+{
+	if (kstat_read(kc, ksp, NULL) == -1) {
+		(void) fprintf(stderr, "read failed\n");
+		exit(1);
+	}
+}
+
+/*
+ * Read a per-zone kstat and mark the matching zinfo[] slot valid.
+ *
+ * The kstat instance number is used as the index into zinfo[], so an
+ * instance outside [0, MAX_ZONEID) is rejected (B_FALSE, nothing read)
+ * instead of being written out of bounds.  Returns B_TRUE when the caller
+ * may go on to extract statistics from `ksp'.
+ */
+static boolean_t
+claim_zone_slot(kstat_ctl_t *kc, kstat_t *ksp)
+{
+	if (ksp->ks_instance < 0 || ksp->ks_instance >= MAX_ZONEID)
+		return (B_FALSE);
+
+	read_or_exit(kc, ksp);
+	zinfo[ksp->ks_instance].valid = B_TRUE;
+
+	return (B_TRUE);
+}
+
+/*
+ * Walk the whole kstat chain once and refresh zinfo[] from every
+ * zones/zone_misc, caps/zone_caps ("cpucaps_zone*") and zone_vfs kstat.
+ * The first time through, nsec_per_tick is also picked up from the
+ * unix "system_misc" kstat.
+ *
+ * Exits with status 1 if the kstat device cannot be opened or a kstat
+ * cannot be read.
+ */
+static void
+read_kstats(void)
+{
+	kstat_ctl_t *kc;
+	kstat_t *ksp;
+
+	if ((kc = kstat_open()) == NULL) {
+		(void) fprintf(stderr, "open failed\n");
+		exit(1);
+	}
+
+	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		if (strcmp("zones", ksp->ks_module) == 0 &&
+		    strcmp("zone_misc", ksp->ks_class) == 0) {
+			if (claim_zone_slot(kc, ksp))
+				get_zone_misc(ksp->ks_instance, ksp);
+
+		} else if (strcmp("caps", ksp->ks_module) == 0 &&
+		    strcmp("zone_caps", ksp->ks_class) == 0 &&
+		    strncmp("cpucaps_zone", ksp->ks_name, 12) == 0) {
+			if (claim_zone_slot(kc, ksp))
+				get_zone_caps(ksp->ks_instance, ksp);
+
+		} else if (strcmp("zone_vfs", ksp->ks_module) == 0) {
+			if (claim_zone_slot(kc, ksp))
+				get_zone_vfs(ksp->ks_instance, ksp);
+
+		} else if (nsec_per_tick == 0 &&
+		    strcmp("unix", ksp->ks_module) == 0 &&
+		    strcmp("system_misc", ksp->ks_name) == 0) {
+			kstat_named_t *kp;
+
+			read_or_exit(kc, ksp);
+
+			/*
+			 * If the statistic is missing leave nsec_per_tick at
+			 * 0 so the lookup is retried on the next pass.
+			 */
+			kp = (kstat_named_t *)kstat_data_lookup(ksp,
+			    "nsec_per_tick");
+			if (kp != NULL)
+				nsec_per_tick = kp->value.ui32;
+		}
+	}
+
+	(void) kstat_close(kc);
+}
+
+/*
+ * Return the difference between two nanosecond counters, expressed in
+ * seconds.
+ */
+static float
+fmt_nsec(uint64_t curr, uint64_t prv)
+{
+	uint64_t delta = curr - prv;
+	float secs = (float)delta / (long)NANOSEC;
+
+	return (secs);
+}
+
+/*
+ * Return the difference between two microsecond counters, converted from
+ * usecs to msecs.
+ */
+static float
+fmt_usec(uint64_t curr, uint64_t prv)
+{
+	uint64_t delta = curr - prv;
+	float msecs = (float)delta / (long)MILLISEC;
+
+	return (msecs);
+}
+
+/*
+ * Return the difference between two tick counters, expressed in seconds.
+ * Ticks are scaled to nanoseconds with the global nsec_per_tick first.
+ */
+static float
+fmt_ticks(uint64_t curr, uint64_t prv)
+{
+	uint64_t elapsed_ns = (curr - prv) * nsec_per_tick;
+	float secs = (float)elapsed_ns / (long)NANOSEC;
+
+	return (secs);
+}
+
+/*
+ * Print one line of statistics for every zone whose slot was marked valid by
+ * the most recent read_kstats() pass.  After printing, the slot's valid flag
+ * is cleared and its cumulative counters are saved as the "previous" values
+ * so that the next pass prints deltas.
+ *
+ * If `parse' is true the output is comma-separated with no header; otherwise
+ * a column header is printed first, followed by fixed-width columns.
+ *
+ * The integer statistics are unsigned 64-bit values, so they are printed
+ * with %llu and cast to unsigned long long to match the conversion exactly.
+ */
+static void
+print_data(boolean_t parse)
+{
+	int i;
+	const char *fmt;
+
+	if (parse) {
+		fmt = "%d,%llu,%.2f,%.1f,%llu,%llu,%llu,%llu,%llu,"
+		    "%.2f,%llu,%.2f,%.2f,%.2f,%.2f\n";
+	} else {
+		fmt = "%4d %2llu %6.2f %5.1f %2llu %2llu %5llu %5llu %2llu "
+		    "%5.2f %4llu %6.2f %6.2f %6.2f %6.2f\n";
+
+		(void) printf("%4s %2s %6s %5s %2s %2s %5s %5s %2s "
+		    "%5s %4s %6s %6s %6s %6s\n",
+		    "zid", "rq", "rsec", "sh%", "ph", "pa", "cap", "usage",
+		    "bs", "1mla", "dcnt", "dms", "user", "sys", "wtrq");
+	}
+
+	for (i = 0; i < MAX_ZONEID; i++) {
+		zinfo_t *zp = &zinfo[i];
+
+		if (zp->valid == B_FALSE)
+			continue;
+
+		/*LINTED E_SEC_PRINTF_VAR_FMT*/
+		(void) printf(fmt,
+		    i,
+		    (unsigned long long)zp->rqueue,
+		    fmt_ticks(zp->rticks, zp->prv_rticks),
+		    (float)zp->fss_share_pct / (float)10,
+		    (unsigned long long)zp->fss_pri_hi,
+		    (unsigned long long)zp->fss_pri_avg,
+		    (unsigned long long)zp->cpu_cap,
+		    (unsigned long long)zp->cpu_cap_usage,
+		    (unsigned long long)(zp->above_base_sec -
+		    zp->prv_above_base_sec),
+		    zp->avrun1,
+		    (unsigned long long)(zp->delay_cnt - zp->prv_delay_cnt),
+		    fmt_usec(zp->delay_time, zp->prv_delay_time),
+		    fmt_nsec(zp->ns_usr, zp->prv_ns_usr),
+		    fmt_nsec(zp->ns_sys, zp->prv_ns_sys),
+		    fmt_nsec(zp->ns_wt, zp->prv_ns_wt));
+
+		/* Require a fresh kstat read before this zone prints again. */
+		zp->valid = B_FALSE;
+		zp->prv_rticks = zp->rticks;
+		zp->prv_ns_usr = zp->ns_usr;
+		zp->prv_ns_sys = zp->ns_sys;
+		zp->prv_ns_wt = zp->ns_wt;
+		zp->prv_above_base_sec = zp->above_base_sec;
+		zp->prv_delay_cnt = zp->delay_cnt;
+		zp->prv_delay_time = zp->delay_time;
+	}
+}
+
+/*
+ * Parse `str' as a strictly positive decimal integer that fits in an int.
+ * Anything else (empty, trailing junk, zero, negative, out of range) is a
+ * usage error, and usage() exits.
+ */
+static int
+parse_positive(const char *str)
+{
+	/* Largest int, computed here so no extra header is required. */
+	const long max_int = (long)(~0U >> 1);
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (errno != 0 || end == str || *end != '\0' ||
+	    val <= 0 || val > max_int)
+		usage();
+
+	return ((int)val);
+}
+
+/*
+ * zschedstat [-r] [interval [count]]
+ *
+ * Print per-zone scheduling statistics every `interval' seconds (default 5).
+ * With `count', stop after that many reports; otherwise run until killed.
+ * -r selects the comma-separated output of print_data().
+ *
+ * interval and count must be positive integers: an interval of 0 would make
+ * the loop spin without sleeping, and a count of 0 or less would never reach
+ * the termination test.
+ */
+int
+main(int argc, char **argv)
+{
+	int interval = 5;
+	int count = 0;
+	boolean_t forever = B_TRUE;
+	boolean_t do_parse = B_FALSE;
+	int arg;
+	extern int optind;
+
+	while ((arg = getopt(argc, argv, "r")) != EOF) {
+		switch (arg) {
+		case 'r':
+			do_parse = B_TRUE;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (argc > optind) {
+		interval = parse_positive(argv[optind]);
+		optind++;
+
+		if (argc > optind) {
+			count = parse_positive(argv[optind]);
+			forever = B_FALSE;
+			optind++;
+		}
+	}
+	if (argc > optind)
+		usage();
+
+	for (;;) {
+		read_kstats();
+		print_data(do_parse);
+		if (!forever && --count == 0)
+			break;
+		(void) sleep(interval);
+	}
+
+	return (0);
+}
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 49d6da39f9..a8c923fae2 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -825,7 +825,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
return;
case MC_SYNC:
- if ((val & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) == 0) {
+ if ((val &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC))
+ == 0) {
*(s = pri->code_buf) = '\0';
if (val & MS_SYNC)
(void) strlcat(s, "|MS_SYNC", CBSIZE);
@@ -834,6 +836,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
if (val & MS_INVALIDATE)
(void) strlcat(s, "|MS_INVALIDATE",
CBSIZE);
+ if (val & MS_INVALCURPROC)
+ (void) strlcat(s, "|MS_INVALCURPROC",
+ CBSIZE);
}
break;
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 9e0fba65e2..d52eec9c97 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -40,7 +40,7 @@
* checks that against the zone's zone.max-physical-memory rctl. Once the
* zone goes over its cap, then this thread will work through the zone's
* /proc process list, Pgrab-bing each process and stepping through the
- * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...)
+ * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
* to pageout pages, until the zone is again under its cap.
*
* Although zone memory capping is implemented as a soft cap by this user-level
@@ -56,21 +56,14 @@
* the thread will work to pageout until the zone is under the cap, as shown
* by updated vm_usage data.
*
- * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be
- * used to examine a processes mapped segments while we are trying to pageout.
- * The observed xmap segement size data is frequently smaller than the
- * pagedata segement size data, so it is less effective in practice. Thus we
- * use pagedata to determine the size of each segment.
- *
- * The pagedata page maps (at least on x86) are not useful. Those flags
+ * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
* are set by hrm_setbits() and on x86 that code path is only executed by
* segvn_pagelock -> hat_setstat -> hrm_setbits
* segvn_softunlock -^
* On SPARC there is an additional code path which may make this data
* useful (sfmmu_ttesync), but since it is not generic, we ignore the page
- * maps and only use the segement info from pagedata. If we ever fix this
- * issue, then we could generalize this mcap code to do more with the data on
- * active pages.
+ * maps. If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
*
* For debugging, touch the file {zonepath}/mcap_debug.log. This will
* cause the thread to start logging its actions into that file (it may take
@@ -124,7 +117,6 @@ static cond_t shutdown_cv;
static int shutting_down = 0;
static thread_t mcap_tid;
static FILE *debug_log_fp = NULL;
-static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */
static uint64_t zone_rss_cap; /* RSS cap(KB) */
static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
@@ -135,13 +127,7 @@ static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
typedef struct {
int pr_curr; /* the # of the mapping we're working on */
int pr_nmap; /* number of mappings in address space */
- int pr_cnt; /* number of mappings processed */
-
- prpageheader_t *pr_pghp; /* process's complete pagedata */
- prasmap_t *pr_asp; /* current address space pointer */
-
- uintptr_t pr_addr; /* base of mapping */
- uint64_t pr_size; /* size of mapping */
+ prxmap_t *pr_xmapp; /* process's xmap array */
} proc_map_t;
typedef struct zsd_vmusage64 {
@@ -293,40 +279,21 @@ control_proc(pid_t pid)
}
/*
- * Get data from the current prasmap_t and advance pr_asp to the next
- * asmap in the pagedata.
+ * Get the next mapping.
*/
-static uintptr_t
+static prxmap_t *
nextmapping(proc_map_t *pmp)
{
- prasmap_t *pap;
- void *pdp; /* per-page data pointer */
-
- pmp->pr_curr++;
- if (pmp->pr_curr > pmp->pr_nmap)
+ if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
return (NULL);
- pap = pmp->pr_asp;
-
- pmp->pr_addr = pap->pr_vaddr;
- pmp->pr_size = pap->pr_npage * pap->pr_pagesize;
- pmp->pr_cnt++;
-
- /* Advance the pr_asp pointer to the next asmap */
- pdp = pap + 1;
- pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage);
-
- /* Skip to next 64-bit-aligned address to get the next prasmap_t. */
- pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7);
- pmp->pr_asp = (prasmap_t *)pdp;
-
- return (pmp->pr_addr);
+ return (&pmp->pr_xmapp[pmp->pr_curr++]);
}
/*
* Initialize the proc_map_t to access the first mapping of an address space.
*/
-static void *
+static prxmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
int fd;
@@ -337,39 +304,37 @@ init_map(proc_map_t *pmp, pid_t pid)
bzero(pmp, sizeof (proc_map_t));
pmp->pr_nmap = -1;
- (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc,
- pid);
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc, pid);
if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
return (NULL);
redo:
errno = 0;
if (fstat(fd, &st) != 0)
- return (NULL);
+ goto done;
- if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) {
- debug("cannot malloc() %ld bytes for pagedata", st.st_size);
- return (NULL);
+ if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) {
+ debug("cannot malloc() %ld bytes for xmap", st.st_size);
+ goto done;
}
- (void) bzero(pmp->pr_pghp, st.st_size);
+ (void) bzero(pmp->pr_xmapp, st.st_size);
errno = 0;
- if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) {
- free(pmp->pr_pghp);
- pmp->pr_pghp = NULL;
+ if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) {
+ free(pmp->pr_xmapp);
+ pmp->pr_xmapp = NULL;
if (res > 0 || errno == E2BIG) {
goto redo;
} else {
- debug("pid %ld cannot read pagedata\n", pid);
- return (NULL);
+ debug("pid %ld cannot read xmap\n", pid);
+ goto done;
}
}
- pmp->pr_nmap = pmp->pr_pghp->pr_nmap;
- pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1);
+ pmp->pr_nmap = st.st_size / sizeof (prxmap_t);
done:
(void) close(fd);
- return ((void *)nextmapping(pmp));
+ return (nextmapping(pmp));
}
/*
@@ -377,13 +342,24 @@ done:
* return nonzero if not all of the pages may are pageable, for any reason.
*/
static int
-pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp)
+pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp)
{
int res;
+ /*
+ * We particularly want to avoid the pr_memcntl on anonymous mappings
+ * which show 0 since that will pull them back off of the free list
+ * and increase the zone's RSS, even though the process itself has
+ * them freed up.
+ */
+ if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0)
+ return (0);
+ else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
+ return (0);
+
errno = 0;
- res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC,
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
/*
* EBUSY indicates none of the pages have backing store allocated, or
@@ -423,7 +399,7 @@ static int64_t
pageout_process(pid_t pid, int64_t excess)
{
int psfd;
- void *praddr;
+ prxmap_t *pxmap;
proc_map_t cur;
struct ps_prochandle *ph = NULL;
int unpageable_mappings;
@@ -433,7 +409,6 @@ pageout_process(pid_t pid, int64_t excess)
int incr_rss_check = 0;
char pathbuf[MAXPATHLEN];
- cur.pr_pghp = NULL;
(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
pid);
if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
@@ -459,11 +434,11 @@ pageout_process(pid_t pid, int64_t excess)
}
/* Get segment residency information. */
- praddr = init_map(&cur, pid);
+ pxmap = init_map(&cur, pid);
/* Skip process if it has no mappings. */
- if (cur.pr_pghp == NULL) {
- debug("%ld: pagedata unreadable; ignoring\n", pid);
+ if (pxmap == NULL) {
+ debug("%ld: xmap unreadable; ignoring\n", pid);
goto done;
}
@@ -489,15 +464,15 @@ pageout_process(pid_t pid, int64_t excess)
*/
sum_att = sum_d_rss = 0;
unpageable_mappings = 0;
- while (excess > 0 && praddr != NULL && !shutting_down) {
+ while (excess > 0 && pxmap != NULL && !shutting_down) {
/* Try to page out the mapping. */
- if (pageout_mapping(ph, &cur) < 0) {
+ if (pageout_mapping(ph, pxmap) < 0) {
debug("pid %ld: exited or unpageable\n", pid);
break;
}
/* attempted is the size of the mapping */
- sum_att += (cur.pr_size / 1024);
+ sum_att += pxmap->pr_size / 1024;
/*
* This processes RSS is potentially enough to clear the
@@ -519,11 +494,10 @@ pageout_process(pid_t pid, int64_t excess)
} else {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- praddr = (void *)nextmapping(&cur);
+ pxmap = nextmapping(&cur);
}
if (!incr_rss_check) {
@@ -531,12 +505,11 @@ pageout_process(pid_t pid, int64_t excess)
if (d_rss < 0) {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n",
- pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att,
+ debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n",
+ pid, unpageable_mappings, (unsigned long long)sum_att,
(unsigned long long)sum_d_rss, (long long)excess);
done:
@@ -546,8 +519,8 @@ done:
(void) Prelease(ph, 0);
}
- if (cur.pr_pghp != NULL)
- free(cur.pr_pghp);
+ if (cur.pr_xmapp != NULL)
+ free(cur.pr_xmapp);
(void) close(psfd);
@@ -680,12 +653,13 @@ get_zone_cap()
* is important considering that each zone will be monitoring its rss.
*/
static int64_t
-check_suspend(int age)
+check_suspend(int age, boolean_t new_cycle)
{
static hrtime_t last_cap_read = 0;
static uint64_t addon;
static uint64_t lo_thresh; /* Thresholds for how long to sleep */
static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
+ static uint64_t prev_zone_rss = 0;
/* Wait a second to give the async pageout a chance to catch up. */
(void) sleep_shutdown(1);
@@ -742,16 +716,6 @@ check_suspend(int age)
continue;
}
- /*
- * If we did some paging out since our last invocation then
- * update the kstat so we can track how much was paged out.
- */
- if (sum_pageout != 0) {
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
- &sum_pageout, 0);
- sum_pageout = 0;
- }
-
zone_rss = get_mem_info(age);
/* calculate excess */
@@ -760,18 +724,41 @@ check_suspend(int age)
debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
zone_rss, zone_rss_cap, new_excess);
+ /*
+ * If necessary, update stats.
+ */
+
+ /*
+ * If it looks like we did some paging out since last over the
+ * cap then update the kstat so we can approximate how much was
+ * paged out.
+ */
+ if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+ uint64_t diff;
+
+ /* assume diff is num bytes we paged out */
+ diff = (prev_zone_rss - zone_rss) * 1024;
+
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+ &diff, 0);
+ }
+ prev_zone_rss = zone_rss;
+
if (new_excess > 0) {
- uint64_t n = 1;
+ if (new_cycle) {
+ uint64_t n = 1;
- /* Increment "nover" kstat. */
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+ /* Increment "nover" kstat. */
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER,
+ &n, 0);
+ }
/*
- * Once we go over the cap, then we want to page out a
- * little extra instead of stopping right at the cap.
- * To do this we add 5% to the excess so that
- * pageout_proces will work a little longer before
- * stopping.
+ * Once we go over the cap, then we want to
+ * page out a little extra instead of stopping
+ * right at the cap. To do this we add 5% to
+ * the excess so that pageout_process will work
+ * a little longer before stopping.
*/
return ((int64_t)(new_excess + addon));
}
@@ -845,7 +832,7 @@ mcap_zone()
struct dirent *dirent;
/* Wait until we've gone over the cap. */
- excess = check_suspend(age);
+ excess = check_suspend(age, B_TRUE);
debug("starting to scan, excess %lldk\n", (long long)excess);
@@ -885,10 +872,10 @@ mcap_zone()
excess = pageout_process(pid, excess);
if (excess <= 0) {
- debug("done scanning; excess %lld\n",
+ debug("apparently under; excess %lld\n",
(long long)excess);
/* Double check the current excess */
- excess = check_suspend(1);
+ excess = check_suspend(1, B_FALSE);
}
}