author	John Sonnenschein <johns@joyent.com>	2012-01-26 22:45:08 +0000
committer	John Sonnenschein <johns@joyent.com>	2012-01-26 22:45:08 +0000
commit	92fb61a58dea0e097df79e73d3cda1bef52c4339 (patch)
tree	8a9727b7a3600c29b261de4580520de19ec57e56
parent	d6f90348d6442b80b8e77a4aaf217cbf86c54bdb (diff)
parent	6a5fc1386817167c00f9ee99bebd9ce35d434593 (diff)
download	illumos-joyent-92fb61a58dea0e097df79e73d3cda1bef52c4339.tar.gz
Merge branch 'gcc4' of git.joyent.com:illumos-joyent into gcc4
-rw-r--r--	manifest	2
-rw-r--r--	usr/src/cmd/fs.d/nfs/mountd/mountd.c	8
-rw-r--r--	usr/src/cmd/fs.d/nfs/nfsd/nfsd.c	8
-rw-r--r--	usr/src/cmd/rcap/rcapd/rcapd_scanner.c	3
-rw-r--r--	usr/src/cmd/savecore/savecore.c	17
-rw-r--r--	usr/src/cmd/stat/Makefile	5
-rw-r--r--	usr/src/cmd/stat/zschedstat/Makefile	50
-rw-r--r--	usr/src/cmd/stat/zschedstat/zschedstat.c	335
-rw-r--r--	usr/src/cmd/truss/print.c	9
-rw-r--r--	usr/src/cmd/zoneadmd/mcap.c	183
-rw-r--r--	usr/src/lib/brand/joyent/zone/statechange.ksh	3
-rwxr-xr-x	usr/src/lib/brand/kvm/zone/kinstall.ksh	8
-rwxr-xr-x	usr/src/lib/brand/kvm/zone/statechange.ksh	3
-rw-r--r--	usr/src/lib/libumem/common/envvar.c	38
-rw-r--r--	usr/src/lib/libumem/common/umem.c	32
-rw-r--r--	usr/src/lib/libumem/common/umem_impl.h	5
-rw-r--r--	usr/src/lib/libumem/common/vmem.c	2
-rw-r--r--	usr/src/lib/libumem/common/vmem_base.c	4
-rw-r--r--	usr/src/lib/libumem/common/vmem_base.h	5
-rw-r--r--	usr/src/man/man1m/Makefile	2
-rw-r--r--	usr/src/man/man1m/zschedstat.1m	202
-rw-r--r--	usr/src/uts/common/disp/fss.c	60
-rw-r--r--	usr/src/uts/common/fs/nfs/nfs_server.c	4
-rw-r--r--	usr/src/uts/common/os/kstat_fr.c	6
-rw-r--r--	usr/src/uts/common/os/zone.c	13
-rw-r--r--	usr/src/uts/common/sys/buf.h	8
-rw-r--r--	usr/src/uts/common/sys/fss.h	4
-rw-r--r--	usr/src/uts/common/sys/mman.h	2
-rw-r--r--	usr/src/uts/common/sys/zone.h	21
-rw-r--r--	usr/src/uts/common/syscall/memcntl.c	9
-rw-r--r--	usr/src/uts/common/vm/hat.h	2
-rw-r--r--	usr/src/uts/common/vm/seg_vn.c	12
-rw-r--r--	usr/src/uts/common/vm/vm_pvn.c	28
-rw-r--r--	usr/src/uts/common/vm/vm_usage.c	16
-rw-r--r--	usr/src/uts/i86pc/vm/hat_i86.c	44
35 files changed, 1002 insertions, 151 deletions
diff --git a/manifest b/manifest
index b36a0f527e..9846f06a7f 100644
--- a/manifest
+++ b/manifest
@@ -2351,6 +2351,7 @@ f usr/bin/zcat 0555 root bin
f usr/bin/ziostat 0555 root bin
s usr/bin/zonename=../../sbin/zonename
f usr/bin/zonestat 0555 root bin
+f usr/bin/zschedstat 0555 root bin
d usr/ccs 0755 root bin
d usr/ccs/bin 0755 root bin
d usr/ccs/bin/amd64 0755 root bin
@@ -11983,6 +11984,7 @@ f usr/share/man/man1m/zic.1m 0444 root bin
f usr/share/man/man1m/zoneadm.1m 0444 root bin
f usr/share/man/man1m/zonecfg.1m 0444 root bin
f usr/share/man/man1m/zpool.1m 0444 root bin
+f usr/share/man/man1m/zschedstat.1m 0444 root bin
f usr/share/man/man1m/zstreamdump.1m 0444 root bin
d usr/share/man/man2 0755 root bin
f usr/share/man/man2/Intro.2 0444 root bin
diff --git a/usr/src/cmd/fs.d/nfs/mountd/mountd.c b/usr/src/cmd/fs.d/nfs/mountd/mountd.c
index dbf4c11ea1..04f8ff1def 100644
--- a/usr/src/cmd/fs.d/nfs/mountd/mountd.c
+++ b/usr/src/cmd/fs.d/nfs/mountd/mountd.c
@@ -20,6 +20,7 @@
*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -390,6 +391,13 @@ main(int argc, char *argv[])
exit(1);
}
+ /* Mountd cannot run in a non-global zone. */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ (void) fprintf(stderr, "%s: can only run in the global zone\n",
+ argv[0]);
+ exit(1);
+ }
+
maxthreads = 0;
while ((c = getopt(argc, argv, "vrm:")) != EOF) {
diff --git a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
index 6c0e0bda5e..c34c39a13e 100644
--- a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
+++ b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -176,6 +177,13 @@ main(int ac, char *av[])
exit(1);
}
+ /* Nfsd cannot run in a non-global zone. */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ (void) fprintf(stderr, "%s: can only run in the global zone\n",
+ av[0]);
+ exit(1);
+ }
+
(void) enable_extended_FILE_stdio(-1, -1);
/*
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
index b39811b552..254bb9e922 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -551,7 +552,7 @@ pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
errno = 0;
res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
/*
diff --git a/usr/src/cmd/savecore/savecore.c b/usr/src/cmd/savecore/savecore.c
index 1315893f0d..4042d8f3ab 100644
--- a/usr/src/cmd/savecore/savecore.c
+++ b/usr/src/cmd/savecore/savecore.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <stdio.h>
@@ -1501,7 +1502,14 @@ getbounds(const char *f)
long b = -1;
const char *p = strrchr(f, '/');
- (void) sscanf(p ? p + 1 : f, "vmdump.%ld", &b);
+ if (p == NULL || strncmp(p, "vmdump", 6) != 0)
+ p = strstr(f, "vmdump");
+
+ if (p != NULL && *p == '/')
+ p++;
+
+ (void) sscanf(p ? p : f, "vmdump.%ld", &b);
+
return (b);
}
@@ -1635,6 +1643,7 @@ main(int argc, char *argv[])
struct rlimit rl;
long filebounds = -1;
char namelist[30], corefile[30], boundstr[30];
+ dumpfile = NULL;
startts = gethrtime();
@@ -1675,7 +1684,11 @@ main(int argc, char *argv[])
}
}
- if (geteuid() != 0 && filebounds < 0) {
+ /*
+ * If doing something other than extracting an existing dump (i.e.
+ * dumpfile has been provided as an option), the user must be root.
+ */
+ if (geteuid() != 0 && dumpfile == NULL) {
(void) fprintf(stderr, "%s: %s %s\n", progname,
gettext("you must be root to use"), progname);
exit(1);
diff --git a/usr/src/cmd/stat/Makefile b/usr/src/cmd/stat/Makefile
index faaa19f42c..01b96d14d2 100644
--- a/usr/src/cmd/stat/Makefile
+++ b/usr/src/cmd/stat/Makefile
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2011 Joyent, Inc. All rights reserved.
+# Copyright 2011, 2012, Joyent, Inc. All rights reserved.
# Use is subject to license terms.
#
# cmd/stat/Makefile
@@ -33,7 +33,8 @@ SUBDIRS= arcstat \
mpstat \
vfsstat \
vmstat \
- ziostat
+ ziostat \
+ zschedstat
all := TARGET = all
install := TARGET = install
diff --git a/usr/src/cmd/stat/zschedstat/Makefile b/usr/src/cmd/stat/zschedstat/Makefile
new file mode 100644
index 0000000000..b8654d0ba4
--- /dev/null
+++ b/usr/src/cmd/stat/zschedstat/Makefile
@@ -0,0 +1,50 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+#
+
+include $(SRC)/cmd/Makefile.cmd
+
+PROG= zschedstat
+OBJS = zschedstat.o
+SRCS =$(OBJS:%.o=%.c) $(COMMON_SRCS)
+
+LDLIBS += -lkstat
+
+lint := LINTFLAGS = -muxs
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+install: all .WAIT $(ROOTPROG)
+
+clean:
+
+$(ROOTBINPROG): $(PROG)
+ $(INS.file)
+
+lint: lint_SRCS
+
+check:
+ $(CSTYLE) -pP $(SRCS:%=%)
+
+include $(SRC)/cmd/Makefile.targ
diff --git a/usr/src/cmd/stat/zschedstat/zschedstat.c b/usr/src/cmd/stat/zschedstat/zschedstat.c
new file mode 100644
index 0000000000..ba89e2403f
--- /dev/null
+++ b/usr/src/cmd/stat/zschedstat/zschedstat.c
@@ -0,0 +1,335 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <kstat.h>
+#include <errno.h>
+#include <sys/zone.h>
+
+typedef struct {
+ boolean_t valid;
+ uint64_t rqueue;
+ uint64_t rticks;
+ uint32_t fss_share_pct;
+ uint64_t fss_pri_hi;
+ uint64_t fss_pri_avg;
+ double avrun1;
+ uint64_t ns_usr;
+ uint64_t ns_sys;
+ uint64_t ns_wt;
+ uint64_t cpu_cap;
+ uint64_t cpu_baseline;
+ uint64_t cpu_cap_usage;
+ uint64_t above_base_sec;
+ uint64_t delay_cnt;
+ uint64_t delay_time;
+ /* Values from the previous cycle so we can diff */
+ uint64_t prv_rticks;
+ uint64_t prv_ns_usr;
+ uint64_t prv_ns_sys;
+ uint64_t prv_ns_wt;
+ uint64_t prv_above_base_sec;
+ uint64_t prv_delay_cnt;
+ uint64_t prv_delay_time;
+} zinfo_t;
+
+/*
+ * MAX_ZONEID is only 10000, so it is a lot faster to go direct to the entry
+ * we want, even though valid entries in this array will be sparse.
+ */
+
+static zinfo_t zinfo[MAX_ZONEID];
+static uint32_t nsec_per_tick = 0;
+
+static void
+usage()
+{
+ (void) fprintf(stderr, "zschedstat [-r] [interval [count]]\n");
+ exit(1);
+}
+
+static void
+get_zone_misc(int zid, kstat_t *ksp)
+{
+ kstat_named_t *kp;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "run_queue");
+ zinfo[zid].rqueue = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "run_ticks");
+ zinfo[zid].rticks = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_share_percent");
+ zinfo[zid].fss_share_pct = kp->value.ui32;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_pri_hi");
+ zinfo[zid].fss_pri_hi = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "fss_pri_avg");
+ zinfo[zid].fss_pri_avg = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "avenrun_1min");
+ zinfo[zid].avrun1 = (double)kp->value.ui32 / FSCALE;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "nsec_user");
+ zinfo[zid].ns_usr = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "nsec_sys");
+ zinfo[zid].ns_sys = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "nsec_waitrq");
+ zinfo[zid].ns_wt = kp->value.ui64;
+}
+
+static void
+get_zone_caps(int zid, kstat_t *ksp)
+{
+ kstat_named_t *kp;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "value");
+ zinfo[zid].cpu_cap = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "baseline");
+ zinfo[zid].cpu_baseline = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "usage");
+ zinfo[zid].cpu_cap_usage = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "above_base_sec");
+ zinfo[zid].above_base_sec = kp->value.ui64;
+}
+
+static void
+get_zone_vfs(int zid, kstat_t *ksp)
+{
+ kstat_named_t *kp;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "delay_cnt");
+ zinfo[zid].delay_cnt = kp->value.ui64;
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp, "delay_time");
+ zinfo[zid].delay_time = kp->value.ui64;
+}
+
+static void
+read_kstats()
+{
+ kstat_ctl_t *kc;
+ kstat_t *ksp;
+
+ if ((kc = kstat_open()) == NULL) {
+ (void) fprintf(stderr, "open failed\n");
+ exit(1);
+ }
+
+ for (ksp = kc->kc_chain; ksp; ksp = ksp->ks_next) {
+ if (strcmp("zones", ksp->ks_module) == 0 &&
+ strcmp("zone_misc", ksp->ks_class) == 0) {
+ if (kstat_read(kc, ksp, NULL) == -1) {
+ (void) fprintf(stderr, "read failed\n");
+ exit(1);
+ }
+ zinfo[ksp->ks_instance].valid = B_TRUE;
+
+ get_zone_misc(ksp->ks_instance, ksp);
+
+ } else if (strcmp("caps", ksp->ks_module) == 0 &&
+ strcmp("zone_caps", ksp->ks_class) == 0 &&
+ strncmp("cpucaps_zone", ksp->ks_name, 12) == 0) {
+ if (kstat_read(kc, ksp, NULL) == -1) {
+ (void) fprintf(stderr, "read failed\n");
+ exit(1);
+ }
+ zinfo[ksp->ks_instance].valid = B_TRUE;
+
+ get_zone_caps(ksp->ks_instance, ksp);
+
+ } else if (strcmp("zone_vfs", ksp->ks_module) == 0) {
+ if (kstat_read(kc, ksp, NULL) == -1) {
+ (void) fprintf(stderr, "read failed\n");
+ exit(1);
+ }
+ zinfo[ksp->ks_instance].valid = B_TRUE;
+
+ get_zone_vfs(ksp->ks_instance, ksp);
+
+ } else if (nsec_per_tick == 0 &&
+ strcmp("unix", ksp->ks_module) == 0 &&
+ strcmp("system_misc", ksp->ks_name) == 0) {
+ kstat_named_t *kp;
+
+ if (kstat_read(kc, ksp, NULL) == -1) {
+ (void) fprintf(stderr, "read failed\n");
+ exit(1);
+ }
+
+ kp = (kstat_named_t *)kstat_data_lookup(ksp,
+ "nsec_per_tick");
+ nsec_per_tick = kp->value.ui32;
+ }
+ }
+
+ (void) kstat_close(kc);
+}
+
+static float
+fmt_nsec(uint64_t curr, uint64_t prv)
+{
+ float s;
+ uint64_t nsec;
+
+ nsec = curr - prv;
+ s = (float)nsec / (long)NANOSEC;
+
+ return (s);
+}
+
+/* convert usecs to msecs */
+static float
+fmt_usec(uint64_t curr, uint64_t prv)
+{
+ float s;
+ uint64_t usec;
+
+ usec = curr - prv;
+ s = (float)usec / (long)MILLISEC;
+
+ return (s);
+}
+
+static float
+fmt_ticks(uint64_t curr, uint64_t prv)
+{
+ float s;
+ uint64_t ticks, nsec;
+
+ ticks = curr - prv;
+ nsec = ticks * nsec_per_tick;
+
+ s = (float)nsec / (long)NANOSEC;
+
+ return (s);
+}
+
+static void
+print_data(boolean_t parse)
+{
+ int i;
+ char *fmt;
+
+ if (parse) {
+ fmt = "%d,%lld,%.2f,%.1f,%lld,%lld,%lld,%lld,%lld,"
+ "%.2f,%lld,%.2f,%.2f,%.2f,%.2f\n";
+ } else {
+ fmt = "%4d %2lld %6.2f %5.1f %2lld %2lld %5lld %5lld %2lld "
+ "%5.2f %4lld %6.2f %6.2f %6.2f %6.2f\n";
+
+ (void) printf("%4s %2s %6s %5s %2s %2s %5s %5s %2s "
+ "%5s %4s %6s %6s %6s %6s\n",
+ "zid", "rq", "rsec", "sh%", "ph", "pa", "cap", "usage",
+ "bs", "1mla", "dcnt", "dms", "user", "sys", "wtrq");
+ }
+
+ for (i = 0; i < MAX_ZONEID; i++) {
+ if (zinfo[i].valid == B_FALSE)
+ continue;
+
+ /*LINTED E_SEC_PRINTF_VAR_FMT*/
+ (void) printf(fmt,
+ i,
+ zinfo[i].rqueue,
+ fmt_ticks(zinfo[i].rticks, zinfo[i].prv_rticks),
+ (float)zinfo[i].fss_share_pct / (float)10,
+ zinfo[i].fss_pri_hi,
+ zinfo[i].fss_pri_avg,
+ zinfo[i].cpu_cap,
+ zinfo[i].cpu_cap_usage,
+ zinfo[i].above_base_sec - zinfo[i].prv_above_base_sec,
+ zinfo[i].avrun1,
+ zinfo[i].delay_cnt - zinfo[i].prv_delay_cnt,
+ fmt_usec(zinfo[i].delay_time, zinfo[i].prv_delay_time),
+ fmt_nsec(zinfo[i].ns_usr, zinfo[i].prv_ns_usr),
+ fmt_nsec(zinfo[i].ns_sys, zinfo[i].prv_ns_sys),
+ fmt_nsec(zinfo[i].ns_wt, zinfo[i].prv_ns_wt));
+
+ zinfo[i].valid = B_FALSE;
+ zinfo[i].prv_rticks = zinfo[i].rticks;
+ zinfo[i].prv_ns_usr = zinfo[i].ns_usr;
+ zinfo[i].prv_ns_sys = zinfo[i].ns_sys;
+ zinfo[i].prv_ns_wt = zinfo[i].ns_wt;
+ zinfo[i].prv_above_base_sec = zinfo[i].above_base_sec;
+ zinfo[i].prv_delay_cnt = zinfo[i].delay_cnt;
+ zinfo[i].prv_delay_time = zinfo[i].delay_time;
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ int interval = 5;
+ int count;
+ int forever = 1;
+ int arg;
+ extern int optind;
+ boolean_t do_parse = B_FALSE;
+
+ while ((arg = getopt(argc, argv, "r")) != EOF) {
+ switch (arg) {
+ case 'r':
+ do_parse = B_TRUE;
+ break;
+ default:
+ usage();
+ }
+ }
+
+ if (argc > optind) {
+ interval = atoi(argv[optind]);
+ optind++;
+
+ if (argc > optind) {
+ count = atoi(argv[optind]);
+ forever = 0;
+ optind++;
+ }
+ }
+ if (argc > optind)
+ usage();
+
+ for (;;) {
+ read_kstats();
+ print_data(do_parse);
+ if (forever == 0 && --count == 0)
+ break;
+ (void) sleep(interval);
+ }
+
+ return (0);
+}
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 49d6da39f9..a8c923fae2 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -825,7 +825,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
return;
case MC_SYNC:
- if ((val & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) == 0) {
+ if ((val &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC))
+ == 0) {
*(s = pri->code_buf) = '\0';
if (val & MS_SYNC)
(void) strlcat(s, "|MS_SYNC", CBSIZE);
@@ -834,6 +836,9 @@ prt_mc4(private_t *pri, int raw, long val) /* print memcntl() (4th) argument */
if (val & MS_INVALIDATE)
(void) strlcat(s, "|MS_INVALIDATE",
CBSIZE);
+ if (val & MS_INVALCURPROC)
+ (void) strlcat(s, "|MS_INVALCURPROC",
+ CBSIZE);
}
break;
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 9e0fba65e2..d52eec9c97 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -40,7 +40,7 @@
* checks that against the zone's zone.max-physical-memory rctl. Once the
* zone goes over its cap, then this thread will work through the zone's
* /proc process list, Pgrab-bing each process and stepping through the
- * address space segments attempting to use pr_memcntl(...MS_INVALIDATE...)
+ * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
* to pageout pages, until the zone is again under its cap.
*
* Although zone memory capping is implemented as a soft cap by this user-level
@@ -56,21 +56,14 @@
* the thread will work to pageout until the zone is under the cap, as shown
* by updated vm_usage data.
*
- * There are a couple of interfaces (xmap, pagedata) in proc(4) that can be
- * used to examine a processes mapped segments while we are trying to pageout.
- * The observed xmap segement size data is frequently smaller than the
- * pagedata segement size data, so it is less effective in practice. Thus we
- * use pagedata to determine the size of each segment.
- *
- * The pagedata page maps (at least on x86) are not useful. Those flags
+ * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
* are set by hrm_setbits() and on x86 that code path is only executed by
* segvn_pagelock -> hat_setstat -> hrm_setbits
* segvn_softunlock -^
* On SPARC there is an additional code path which may make this data
* useful (sfmmu_ttesync), but since it is not generic, we ignore the page
- * maps and only use the segement info from pagedata. If we ever fix this
- * issue, then we could generalize this mcap code to do more with the data on
- * active pages.
+ * maps. If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
*
* For debugging, touch the file {zonepath}/mcap_debug.log. This will
* cause the thread to start logging its actions into that file (it may take
@@ -124,7 +117,6 @@ static cond_t shutdown_cv;
static int shutting_down = 0;
static thread_t mcap_tid;
static FILE *debug_log_fp = NULL;
-static uint64_t sum_pageout = 0; /* total bytes paged out in a pass */
static uint64_t zone_rss_cap; /* RSS cap(KB) */
static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
@@ -135,13 +127,7 @@ static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
typedef struct {
int pr_curr; /* the # of the mapping we're working on */
int pr_nmap; /* number of mappings in address space */
- int pr_cnt; /* number of mappings processed */
-
- prpageheader_t *pr_pghp; /* process's complete pagedata */
- prasmap_t *pr_asp; /* current address space pointer */
-
- uintptr_t pr_addr; /* base of mapping */
- uint64_t pr_size; /* size of mapping */
+ prxmap_t *pr_xmapp; /* process's xmap array */
} proc_map_t;
typedef struct zsd_vmusage64 {
@@ -293,40 +279,21 @@ control_proc(pid_t pid)
}
/*
- * Get data from the current prasmap_t and advance pr_asp to the next
- * asmap in the pagedata.
+ * Get the next mapping.
*/
-static uintptr_t
+static prxmap_t *
nextmapping(proc_map_t *pmp)
{
- prasmap_t *pap;
- void *pdp; /* per-page data pointer */
-
- pmp->pr_curr++;
- if (pmp->pr_curr > pmp->pr_nmap)
+ if (pmp->pr_xmapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
return (NULL);
- pap = pmp->pr_asp;
-
- pmp->pr_addr = pap->pr_vaddr;
- pmp->pr_size = pap->pr_npage * pap->pr_pagesize;
- pmp->pr_cnt++;
-
- /* Advance the pr_asp pointer to the next asmap */
- pdp = pap + 1;
- pdp = (caddr_t)(uintptr_t)((uintptr_t)pdp + pap->pr_npage);
-
- /* Skip to next 64-bit-aligned address to get the next prasmap_t. */
- pdp = (caddr_t)(((uintptr_t)pdp + 7) & ~7);
- pmp->pr_asp = (prasmap_t *)pdp;
-
- return (pmp->pr_addr);
+ return (&pmp->pr_xmapp[pmp->pr_curr++]);
}
/*
* Initialize the proc_map_t to access the first mapping of an address space.
*/
-static void *
+static prxmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
int fd;
@@ -337,39 +304,37 @@ init_map(proc_map_t *pmp, pid_t pid)
bzero(pmp, sizeof (proc_map_t));
pmp->pr_nmap = -1;
- (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/pagedata", zoneproc,
- pid);
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/xmap", zoneproc, pid);
if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
return (NULL);
redo:
errno = 0;
if (fstat(fd, &st) != 0)
- return (NULL);
+ goto done;
- if ((pmp->pr_pghp = malloc(st.st_size)) == NULL) {
- debug("cannot malloc() %ld bytes for pagedata", st.st_size);
- return (NULL);
+ if ((pmp->pr_xmapp = malloc(st.st_size)) == NULL) {
+ debug("cannot malloc() %ld bytes for xmap", st.st_size);
+ goto done;
}
- (void) bzero(pmp->pr_pghp, st.st_size);
+ (void) bzero(pmp->pr_xmapp, st.st_size);
errno = 0;
- if ((res = read(fd, pmp->pr_pghp, st.st_size)) != st.st_size) {
- free(pmp->pr_pghp);
- pmp->pr_pghp = NULL;
+ if ((res = read(fd, pmp->pr_xmapp, st.st_size)) != st.st_size) {
+ free(pmp->pr_xmapp);
+ pmp->pr_xmapp = NULL;
if (res > 0 || errno == E2BIG) {
goto redo;
} else {
- debug("pid %ld cannot read pagedata\n", pid);
- return (NULL);
+ debug("pid %ld cannot read xmap\n", pid);
+ goto done;
}
}
- pmp->pr_nmap = pmp->pr_pghp->pr_nmap;
- pmp->pr_asp = (prasmap_t *)(pmp->pr_pghp + 1);
+ pmp->pr_nmap = st.st_size / sizeof (prxmap_t);
done:
(void) close(fd);
- return ((void *)nextmapping(pmp));
+ return (nextmapping(pmp));
}
/*
@@ -377,13 +342,24 @@ done:
 * return nonzero if not all of the pages may be pageable, for any reason.
*/
static int
-pageout_mapping(struct ps_prochandle *Pr, proc_map_t *pmp)
+pageout_mapping(struct ps_prochandle *Pr, prxmap_t *pmp)
{
int res;
+ /*
+ * We particularly want to avoid the pr_memcntl on anonymous mappings
+ * which show 0 since that will pull them back off of the free list
+ * and increase the zone's RSS, even though the process itself has
+ * them freed up.
+ */
+ if (pmp->pr_mflags & MA_ANON && pmp->pr_anon == 0)
+ return (0);
+ else if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
+ return (0);
+
errno = 0;
- res = pr_memcntl(Pr, (caddr_t)pmp->pr_addr, pmp->pr_size, MC_SYNC,
- (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
+ res = pr_memcntl(Pr, (caddr_t)pmp->pr_vaddr, pmp->pr_size, MC_SYNC,
+ (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
/*
* EBUSY indicates none of the pages have backing store allocated, or
@@ -423,7 +399,7 @@ static int64_t
pageout_process(pid_t pid, int64_t excess)
{
int psfd;
- void *praddr;
+ prxmap_t *pxmap;
proc_map_t cur;
struct ps_prochandle *ph = NULL;
int unpageable_mappings;
@@ -433,7 +409,6 @@ pageout_process(pid_t pid, int64_t excess)
int incr_rss_check = 0;
char pathbuf[MAXPATHLEN];
- cur.pr_pghp = NULL;
(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
pid);
if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
@@ -459,11 +434,11 @@ pageout_process(pid_t pid, int64_t excess)
}
/* Get segment residency information. */
- praddr = init_map(&cur, pid);
+ pxmap = init_map(&cur, pid);
/* Skip process if it has no mappings. */
- if (cur.pr_pghp == NULL) {
- debug("%ld: pagedata unreadable; ignoring\n", pid);
+ if (pxmap == NULL) {
+ debug("%ld: xmap unreadable; ignoring\n", pid);
goto done;
}
@@ -489,15 +464,15 @@ pageout_process(pid_t pid, int64_t excess)
*/
sum_att = sum_d_rss = 0;
unpageable_mappings = 0;
- while (excess > 0 && praddr != NULL && !shutting_down) {
+ while (excess > 0 && pxmap != NULL && !shutting_down) {
/* Try to page out the mapping. */
- if (pageout_mapping(ph, &cur) < 0) {
+ if (pageout_mapping(ph, pxmap) < 0) {
debug("pid %ld: exited or unpageable\n", pid);
break;
}
/* attempted is the size of the mapping */
- sum_att += (cur.pr_size / 1024);
+ sum_att += pxmap->pr_size / 1024;
/*
 * This process's RSS is potentially enough to clear the
@@ -519,11 +494,10 @@ pageout_process(pid_t pid, int64_t excess)
} else {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- praddr = (void *)nextmapping(&cur);
+ pxmap = nextmapping(&cur);
}
if (!incr_rss_check) {
@@ -531,12 +505,11 @@ pageout_process(pid_t pid, int64_t excess)
if (d_rss < 0) {
excess += d_rss;
sum_d_rss += d_rss;
- sum_pageout += (-d_rss * 1024);
}
}
- debug("pid %ld: map %d unp %d att %lluKB drss %lldKB excess %lldKB\n",
- pid, cur.pr_cnt, unpageable_mappings, (unsigned long long)sum_att,
+ debug("pid %ld: unp %d att %lluKB drss %lldKB excess %lldKB\n",
+ pid, unpageable_mappings, (unsigned long long)sum_att,
(unsigned long long)sum_d_rss, (long long)excess);
done:
@@ -546,8 +519,8 @@ done:
(void) Prelease(ph, 0);
}
- if (cur.pr_pghp != NULL)
- free(cur.pr_pghp);
+ if (cur.pr_xmapp != NULL)
+ free(cur.pr_xmapp);
(void) close(psfd);
@@ -680,12 +653,13 @@ get_zone_cap()
* is important considering that each zone will be monitoring its rss.
*/
static int64_t
-check_suspend(int age)
+check_suspend(int age, boolean_t new_cycle)
{
static hrtime_t last_cap_read = 0;
static uint64_t addon;
static uint64_t lo_thresh; /* Thresholds for how long to sleep */
static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
+ static uint64_t prev_zone_rss = 0;
/* Wait a second to give the async pageout a chance to catch up. */
(void) sleep_shutdown(1);
@@ -742,16 +716,6 @@ check_suspend(int age)
continue;
}
- /*
- * If we did some paging out since our last invocation then
- * update the kstat so we can track how much was paged out.
- */
- if (sum_pageout != 0) {
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
- &sum_pageout, 0);
- sum_pageout = 0;
- }
-
zone_rss = get_mem_info(age);
/* calculate excess */
@@ -760,18 +724,41 @@ check_suspend(int age)
debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
zone_rss, zone_rss_cap, new_excess);
+ /*
+	 * If necessary, update stats.
+ */
+
+ /*
+ * If it looks like we did some paging out since last over the
+ * cap then update the kstat so we can approximate how much was
+ * paged out.
+ */
+ if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+ uint64_t diff;
+
+ /* assume diff is num bytes we paged out */
+ diff = (prev_zone_rss - zone_rss) * 1024;
+
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+ &diff, 0);
+ }
+ prev_zone_rss = zone_rss;
+
if (new_excess > 0) {
- uint64_t n = 1;
+ if (new_cycle) {
+ uint64_t n = 1;
- /* Increment "nover" kstat. */
- (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+ /* Increment "nover" kstat. */
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER,
+ &n, 0);
+ }
/*
- * Once we go over the cap, then we want to page out a
- * little extra instead of stopping right at the cap.
- * To do this we add 5% to the excess so that
- * pageout_proces will work a little longer before
- * stopping.
+ * Once we go over the cap, then we want to
+ * page out a little extra instead of stopping
+ * right at the cap. To do this we add 5% to
+			 * the excess so that pageout_process will work
+ * a little longer before stopping.
*/
return ((int64_t)(new_excess + addon));
}
@@ -845,7 +832,7 @@ mcap_zone()
struct dirent *dirent;
/* Wait until we've gone over the cap. */
- excess = check_suspend(age);
+ excess = check_suspend(age, B_TRUE);
debug("starting to scan, excess %lldk\n", (long long)excess);
@@ -885,10 +872,10 @@ mcap_zone()
excess = pageout_process(pid, excess);
if (excess <= 0) {
- debug("done scanning; excess %lld\n",
+ debug("apparently under; excess %lld\n",
(long long)excess);
/* Double check the current excess */
- excess = check_suspend(1);
+ excess = check_suspend(1, B_FALSE);
}
}
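
A minimal standalone sketch of the new xmap-based walk that mcap.c now performs (read /proc/<pid>/xmap into a prxmap_t array and visit each mapping). This is illustrative only and not part of the patch; read_xmap() is a hypothetical helper, and the E2BIG re-read loop that init_map() does is omitted.

	#include <sys/types.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <procfs.h>

	static int
	read_xmap(pid_t pid)
	{
		char path[64];
		struct stat st;
		prxmap_t *xmap;
		int fd, i, nmap;

		(void) snprintf(path, sizeof (path), "/proc/%d/xmap", (int)pid);
		if ((fd = open(path, O_RDONLY)) < 0)
			return (-1);
		if (fstat(fd, &st) != 0 || (xmap = malloc(st.st_size)) == NULL) {
			(void) close(fd);
			return (-1);
		}
		if (read(fd, xmap, st.st_size) != st.st_size) {
			free(xmap);
			(void) close(fd);
			return (-1);
		}
		nmap = st.st_size / sizeof (prxmap_t);
		for (i = 0; i < nmap; i++) {
			/* mcap.c skips anon mappings with no resident pages */
			(void) printf("%p size %lluKB flags %x\n",
			    (void *)xmap[i].pr_vaddr,
			    (unsigned long long)(xmap[i].pr_size / 1024),
			    xmap[i].pr_mflags);
		}
		free(xmap);
		(void) close(fd);
		return (0);
	}
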
diff --git a/usr/src/lib/brand/joyent/zone/statechange.ksh b/usr/src/lib/brand/joyent/zone/statechange.ksh
index 70be7819b4..97b041528a 100644
--- a/usr/src/lib/brand/joyent/zone/statechange.ksh
+++ b/usr/src/lib/brand/joyent/zone/statechange.ksh
@@ -248,7 +248,6 @@ setup_net()
fi
if [[ $allow_ip_spoof != "1" ]]; then
spoof_opts="${spoof_opts}${comma}ip-nospoof"
- ip_spoof_enabled="true"
comma=","
fi
if [[ $allow_restricted_traffic != "1" ]]; then
@@ -288,7 +287,7 @@ setup_net()
fi
fi
- if [[ -n "${zone_ip}" ]] && [[ -n "${ip_spoof_enabled}" ]] && \
+ if [[ -n "${zone_ip}" ]] && [[ $allow_ip_spoof != "1" ]] && \
[[ "${zone_ip}" != "dhcp" ]]; then
dladm set-linkprop -t -z $ZONENAME \
-p "allowed-ips=${zone_ip}" ${nic}
diff --git a/usr/src/lib/brand/kvm/zone/kinstall.ksh b/usr/src/lib/brand/kvm/zone/kinstall.ksh
index abd60a17b7..43b0c953d1 100755
--- a/usr/src/lib/brand/kvm/zone/kinstall.ksh
+++ b/usr/src/lib/brand/kvm/zone/kinstall.ksh
@@ -66,6 +66,14 @@ PDS_NAME=`mount | nawk -v p=$dname '{if ($1 == p) print $3}'`
[ -z "$PDS_NAME" ] && \
print -u2 "Brand error: missing parent ZFS dataset for $dname"
+# it's possible to specify a zone root here if you specified the
+# '-x nodataset' when installing the zone.
+if [[ -n ${TMPLZONE} ]]; then
+ zfs snapshot $PDS_NAME/${TMPLZONE}@${bname}
+ zfs clone -o quota=${ZQUOTA}g $PDS_NAME/${TMPLZONE}@${bname} \
+ $PDS_NAME/$bname
+fi
+
if [ ! -d ${ZONEPATH}/config ]; then
mkdir -p ${ZONEPATH}/config
chmod 755 ${ZONEPATH}/config
diff --git a/usr/src/lib/brand/kvm/zone/statechange.ksh b/usr/src/lib/brand/kvm/zone/statechange.ksh
index 22f647775a..efb10be50e 100755
--- a/usr/src/lib/brand/kvm/zone/statechange.ksh
+++ b/usr/src/lib/brand/kvm/zone/statechange.ksh
@@ -221,7 +221,6 @@ setup_net()
fi
if [[ $allow_ip_spoof != "1" ]]; then
spoof_opts="${spoof_opts}${comma}ip-nospoof"
- ip_spoof_enabled="true"
comma=","
fi
if [[ $allow_restricted_traffic != "1" ]]; then
@@ -244,7 +243,7 @@ setup_net()
fi
fi
- if [[ -n "${zone_ip}" ]] && [[ -n "${ip_spoof_enabled}" ]] && \
+ if [[ -n "${zone_ip}" ]] && [[ $allow_ip_spoof != "1" ]] && \
[[ "${zone_ip}" != "dhcp" ]]; then
dladm set-linkprop -t -z $ZONENAME \
-p "allowed-ips=${zone_ip}" ${nic}
diff --git a/usr/src/lib/libumem/common/envvar.c b/usr/src/lib/libumem/common/envvar.c
index 949d33ce16..fc3d490a01 100644
--- a/usr/src/lib/libumem/common/envvar.c
+++ b/usr/src/lib/libumem/common/envvar.c
@@ -22,10 +22,9 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <ctype.h>
#include <errno.h>
#include <limits.h>
@@ -84,6 +83,7 @@ typedef struct umem_env_item {
#ifndef UMEM_STANDALONE
static arg_process_t umem_backend_process;
+static arg_process_t umem_allocator_process;
#endif
static arg_process_t umem_log_process;
@@ -100,6 +100,11 @@ static umem_env_item_t umem_options_items[] = {
NULL, 0, NULL, NULL,
&umem_backend_process
},
+ { "allocator", "Evolving", ITEM_SPECIAL,
+ "=best, =first, =next, or =instant",
+ NULL, 0, NULL, NULL,
+ &umem_allocator_process
+ },
#endif
{ "concurrency", "Private", ITEM_UINT,
@@ -473,6 +478,35 @@ fail:
CURRENT, name, name, name);
return (ARG_BAD);
}
+
+
+static int
+umem_allocator_process(const umem_env_item_t *item, const char *item_arg)
+{
+ const char *name = item->item_name;
+
+ if (item_arg == NULL)
+ goto fail;
+
+ if (strcmp(item_arg, "best") == 0)
+ vmem_allocator = VM_BESTFIT;
+ else if (strcmp(item_arg, "next") == 0)
+ vmem_allocator = VM_NEXTFIT;
+ else if (strcmp(item_arg, "first") == 0)
+ vmem_allocator = VM_FIRSTFIT;
+ else if (strcmp(item_arg, "instant") == 0)
+ vmem_allocator = 0;
+ else
+ goto fail;
+
+ return (ARG_SUCCESS);
+
+fail:
+ log_message("%s: %s: must be %s=best, %s=next or %s=first\n",
+ CURRENT, name, name, name, name);
+ return (ARG_BAD);
+
+}
#endif
static int
diff --git a/usr/src/lib/libumem/common/umem.c b/usr/src/lib/libumem/common/umem.c
index a3eb0b8e6c..9ee030dd47 100644
--- a/usr/src/lib/libumem/common/umem.c
+++ b/usr/src/lib/libumem/common/umem.c
@@ -21,11 +21,10 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
*
@@ -355,6 +354,17 @@
* umem_log_header_t's:
* lh_cpu[*].clh_lock
* lh_lock
+ *
+ * 7. Changing UMEM_MAXBUF
+ * -----------------------
+ *
+ * When changing UMEM_MAXBUF extra care has to be taken. It is not sufficient to
+ * simply increase this number. First, one must update the umem_alloc_table to
+ * have the appropriate number of entries based upon the new size. If this is
+ * not done, this will lead to libumem blowing an assertion.
+ *
+ * The second place to update, which is not required, is the umem_alloc_sizes.
+ * These determine the default cache sizes that we're going to support.
*/
#include <umem_impl.h>
@@ -420,7 +430,9 @@ static int umem_alloc_sizes[] = {
P2ALIGN(8192 / 2, 64), 4544,
P2ALIGN(8192 / 1, 64), 9216,
4096 * 3,
- UMEM_MAXBUF, /* = 8192 * 2 */
+ 8192 * 2, /* = 8192 * 2 */
+ 24576, 32768, 40960, 49152, 57344, 65536, 73728, 81920,
+ 90112, 98304, 106496, 114688, 122880, UMEM_MAXBUF, /* 128k */
/* 24 slots for user expansion */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@@ -592,6 +604,20 @@ umem_cache_t umem_null_cache = {
static umem_cache_t *umem_alloc_table[UMEM_MAXBUF >> UMEM_ALIGN_SHIFT] = {
ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
ALLOC_TABLE_1024
};
diff --git a/usr/src/lib/libumem/common/umem_impl.h b/usr/src/lib/libumem/common/umem_impl.h
index c6481d9751..84313c32ed 100644
--- a/usr/src/lib/libumem/common/umem_impl.h
+++ b/usr/src/lib/libumem/common/umem_impl.h
@@ -21,14 +21,13 @@
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _UMEM_IMPL_H
#define _UMEM_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <umem.h>
#include <sys/sysmacros.h>
@@ -353,7 +352,7 @@ typedef struct umem_cpu {
uint32_t cpu_number;
} umem_cpu_t;
-#define UMEM_MAXBUF 16384
+#define UMEM_MAXBUF 131072
#define UMEM_ALIGN 8 /* min guaranteed alignment */
#define UMEM_ALIGN_SHIFT 3 /* log2(UMEM_ALIGN) */
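
The table sizing above follows directly from the new UMEM_MAXBUF: umem_alloc_table must have UMEM_MAXBUF >> UMEM_ALIGN_SHIFT entries, i.e. 131072 >> 3 = 16384 = 16 * 1024, which is why the table in umem.c grows from two to sixteen ALLOC_TABLE_1024 blocks. A hypothetical compile-time check (not in the commit, names made up) would look like:

	/* Illustrative only; not part of the patch. */
	#define	UMEM_MAXBUF_NEW		131072
	#define	UMEM_ALIGN_SHIFT_NEW	3
	typedef char umem_alloc_table_check
		[((UMEM_MAXBUF_NEW >> UMEM_ALIGN_SHIFT_NEW) == 16 * 1024) ? 1 : -1];
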
diff --git a/usr/src/lib/libumem/common/vmem.c b/usr/src/lib/libumem/common/vmem.c
index 040517a78f..c868e42977 100644
--- a/usr/src/lib/libumem/common/vmem.c
+++ b/usr/src/lib/libumem/common/vmem.c
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -1069,6 +1070,7 @@ vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
int hb;
int flist = 0;
uint32_t mtbf;
+ vmflag |= vmem_allocator;
if (size - 1 < vmp->vm_qcache_max) {
ASSERT(vmflag & VM_NOSLEEP);
diff --git a/usr/src/lib/libumem/common/vmem_base.c b/usr/src/lib/libumem/common/vmem_base.c
index 6b1c07e1ba..dcd83ddf31 100644
--- a/usr/src/lib/libumem/common/vmem_base.c
+++ b/usr/src/lib/libumem/common/vmem_base.c
@@ -22,14 +22,14 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "vmem_base.h"
#include "umem_base.h"
uint_t vmem_backend = 0;
+uint_t vmem_allocator = VM_BESTFIT;
vmem_t *
vmem_heap_arena(vmem_alloc_t **allocp, vmem_free_t **freep)
diff --git a/usr/src/lib/libumem/common/vmem_base.h b/usr/src/lib/libumem/common/vmem_base.h
index 46ed397343..a585520e0b 100644
--- a/usr/src/lib/libumem/common/vmem_base.h
+++ b/usr/src/lib/libumem/common/vmem_base.h
@@ -21,13 +21,12 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#ifndef _VMEM_BASE_H
#define _VMEM_BASE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vmem.h>
#include <umem.h>
@@ -66,6 +65,8 @@ extern uint_t vmem_backend;
#define VMEM_BACKEND_MMAP 0x0000002
#define VMEM_BACKEND_STAND 0x0000003
+extern uint_t vmem_allocator;
+
extern vmem_t *vmem_heap;
extern vmem_alloc_t *vmem_heap_alloc;
extern vmem_free_t *vmem_heap_free;
diff --git a/usr/src/man/man1m/Makefile b/usr/src/man/man1m/Makefile
index cd1ec0efb3..6af88a31a9 100644
--- a/usr/src/man/man1m/Makefile
+++ b/usr/src/man/man1m/Makefile
@@ -12,6 +12,7 @@
#
# Copyright 2011, Richard Lowe
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2012 Joyent, Inc. All rights reserved.
#
include ../../Makefile.master
@@ -597,6 +598,7 @@ COMMON_MANFILES = 6to4relay.1m \
zoneadmd.1m \
zonecfg.1m \
zpool.1m \
+ zschedstat.1m \
zstreamdump.1m
i386_MANFILES = lms.1m \
diff --git a/usr/src/man/man1m/zschedstat.1m b/usr/src/man/man1m/zschedstat.1m
new file mode 100644
index 0000000000..61ea8353bf
--- /dev/null
+++ b/usr/src/man/man1m/zschedstat.1m
@@ -0,0 +1,202 @@
+'\" te
+.\" Copyright (c) 2012, Joyent, Inc. All Rights reserved
+.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with
+.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
+.TH ZSCHEDSTAT 1M "Jan 16, 2012"
+.SH NAME
+zschedstat \- report per-zone CPU scheduling statistics
+.SH SYNOPSIS
+.LP
+.nf
+\fB/usr/bin/zschedstat\fR [\fB-r\fR] [\fIinterval\fR [\fIcount\fR]]
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The \fBzschedstat\fR utility iteratively reports per-zone CPU-scheduling
+activity. The first iteration of output is for all time since boot; each
+subsequent iteration is for the prior interval only.
+.sp
+.LP
+The output of the \fBzschedstat\fR utility shows the following information.
+.sp
+.ne 2
+.na
+\fB\fBzid\fR\fR
+.ad
+.RS 10n
+zone ID
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBrq\fR\fR
+.ad
+.RS 10n
+The number of threads FSS saw in the run queue in the last second.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBrsec\fR\fR
+.ad
+.RS 10n
+The total number of seconds that FSS counted processes running for this
+zone during the sampling interval.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBsh%\fR\fR
+.ad
+.RS 10n
+The percent of the active shares FSS calculated for this zone. This column
+may not sum exactly to 100% due to rounding.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBph\fR\fR
+.ad
+.RS 10n
+The highest priority FSS calculated for a process during the last second
+(range 0-59). This will be 0 if FSS saw no runnable processes for the zone
+in the last second.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBpa\fR\fR
+.ad
+.RS 10n
+The average priority FSS calculated for all runnable processes during the last
+second that there were runnable processes for this zone (range 0-59).
+This data might be several seconds old if there were no runnable processes
+in the interval.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBcap\fR\fR
+.ad
+.RS 10n
+The current cpu-cap for the zone (in percent of a CPU).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBusage\fR\fR
+.ad
+.RS 10n
+The cpu-cap calculated usage for the zone in the interval (in percent of a CPU).
+The usage will be 0 if there is no cpu-cap.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBbs\fR\fR
+.ad
+.RS 10n
+The number of seconds during the interval that the zone was bursting.
+This will be 0 if there is no cpu-cap.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB1mla\fR\fR
+.ad
+.RS 10n
+The one minute load average for the zone.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBdcnt\fR\fR
+.ad
+.RS 10n
+The number of times that the ZFS I/O throttle delayed a process in the zone.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBdms\fR\fR
+.ad
+.RS 10n
+The total time, in milliseconds, of ZFS I/O throttle delay for processes in the
+zone.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBuser\fR\fR
+.ad
+.RS 10n
+The total number of seconds processes were running in user-level code.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBsys\fR\fR
+.ad
+.RS 10n
+The total number of seconds processes were running in the kernel.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBwtrq\fR\fR
+.ad
+.RS 10n
+The total number of seconds processes were waiting in the run queue to run.
+.RE
+
+.SH OPTIONS
+.sp
+.LP
+The following options are supported:
+.sp
+.ne 2
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 12n
+Display data in a comma-separated format.
+.RE
+
+.SH OPERANDS
+.sp
+.LP
+The following operands are supported:
+.sp
+.ne 2
+.na
+\fB\fIcount\fR\fR
+.ad
+.RS 12n
+Display only \fIcount\fR reports.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fIinterval\fR\fR
+.ad
+.RS 12n
+Report once each \fIinterval\fR seconds.
+.RE
diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c
index 62301d65d8..af8826780c 100644
--- a/usr/src/uts/common/disp/fss.c
+++ b/usr/src/uts/common/disp/fss.c
@@ -814,6 +814,7 @@ fss_decay_usage()
fsszone_t *fsszone;
fsspri_t maxfsspri;
int psetid;
+ struct zone *zp;
mutex_enter(&fsspsets_lock);
/*
@@ -824,6 +825,8 @@ fss_decay_usage()
fsspset = &fsspsets[psetid];
mutex_enter(&fsspset->fssps_lock);
+ fsspset->fssps_gen++;
+
if (fsspset->fssps_cpupart == NULL ||
(fssproj = fsspset->fssps_list) == NULL) {
mutex_exit(&fsspset->fssps_lock);
@@ -843,6 +846,21 @@ fss_decay_usage()
fsspset->fssps_maxfsspri = maxfsspri;
do {
+ fsszone = fssproj->fssp_fsszone;
+ zp = fsszone->fssz_zone;
+
+ /*
+ * Reset zone's FSS kstats if they are from a
+ * previous cycle.
+ */
+ if (fsspset->fssps_gen != zp->zone_fss_gen) {
+ zp->zone_fss_gen = fsspset->fssps_gen;
+ zp->zone_fss_pri_hi = 0;
+ zp->zone_runq_cntr = 0;
+ zp->zone_fss_shr_pct = 0;
+ zp->zone_proc_cnt = 0;
+ }
+
/*
* Decay usage for each project running on
* this cpu partition.
@@ -850,9 +868,18 @@ fss_decay_usage()
fssproj->fssp_usage =
(fssproj->fssp_usage * FSS_DECAY_USG) /
FSS_DECAY_BASE + fssproj->fssp_ticks;
+
fssproj->fssp_ticks = 0;
- fsszone = fssproj->fssp_fsszone;
+ zp->zone_run_ticks += fssproj->fssp_zone_ticks;
+ /*
+ * This is the count for this one second cycle only,
+ * and not cumulative.
+ */
+ zp->zone_runq_cntr += fssproj->fssp_runnable;
+
+ fssproj->fssp_zone_ticks = 0;
+
/*
* Readjust the project's number of shares if it has
* changed since we checked it last time.
@@ -871,7 +898,7 @@ fss_decay_usage()
* Readjust the zone's number of shares if it
* has changed since we checked it last time.
*/
- zone_ext_shares = fsszone->fssz_zone->zone_shares;
+ zone_ext_shares = zp->zone_shares;
if (fsszone->fssz_rshares != zone_ext_shares) {
if (fsszone->fssz_runnable != 0) {
fsspset->fssps_shares -=
@@ -883,6 +910,12 @@ fss_decay_usage()
}
zone_int_shares = fsszone->fssz_shares;
pset_shares = fsspset->fssps_shares;
+
+ if (zp->zone_runq_cntr > 0 && pset_shares > 0)
+ /* in tenths of a pct */
+ zp->zone_fss_shr_pct =
+ (zone_ext_shares * 1000) / pset_shares;
+
/*
* Calculate fssp_shusage value to be used
* for fsspri increments for the next second.
@@ -1050,6 +1083,8 @@ fss_update_list(int i)
fssproc_t *fssproc;
fssproj_t *fssproj;
fsspri_t fsspri;
+ struct zone *zp;
+ pri_t fss_umdpri;
kthread_t *t;
int updated = 0;
@@ -1073,6 +1108,7 @@ fss_update_list(int i)
fssproj = FSSPROC2FSSPROJ(fssproc);
if (fssproj == NULL)
goto next;
+
if (fssproj->fssp_shares != 0) {
/*
* Decay fsspri value.
@@ -1096,11 +1132,28 @@ fss_update_list(int i)
fss_newpri(fssproc);
updated = 1;
+ fss_umdpri = fssproc->fss_umdpri;
+
+ /*
+ * Summarize a zone's process priorities for runnable
+ * procs.
+ */
+ zp = fssproj->fssp_fsszone->fssz_zone;
+
+ if (fss_umdpri > zp->zone_fss_pri_hi)
+ zp->zone_fss_pri_hi = fss_umdpri;
+
+ if (zp->zone_proc_cnt++ == 0)
+ zp->zone_fss_pri_avg = fss_umdpri;
+ else
+ zp->zone_fss_pri_avg =
+ (zp->zone_fss_pri_avg + fss_umdpri) / 2;
+
/*
* Only dequeue the thread if it needs to be moved; otherwise
* it should just round-robin here.
*/
- if (t->t_pri != fssproc->fss_umdpri)
+ if (t->t_pri != fss_umdpri)
fss_change_priority(t, fssproc);
next:
thread_unlock(t);
@@ -2180,6 +2233,7 @@ fss_tick(kthread_t *t)
fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj);
disp_lock_enter_high(&fsspset->fssps_displock);
fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice];
+ fssproj->fssp_zone_ticks++;
fssproc->fss_ticks++;
disp_lock_exit_high(&fsspset->fssps_displock);
}
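
To make the new fss_share_percent scaling concrete (illustrative figures only, not from the commit): fss_decay_usage() stores the zone's share of the pset in tenths of a percent, and zschedstat divides by ten for its sh% column.

	/* Hypothetical numbers: a zone holding 20 of 50 active shares. */
	uint32_t zone_ext_shares = 20;
	uint32_t pset_shares = 50;
	uint32_t zone_fss_shr_pct = (zone_ext_shares * 1000) / pset_shares; /* 400 */
	double shown = zone_fss_shr_pct / 10.0;	/* printed as 40.0 under sh% */
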
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index ad2fed01dc..8473788d8a 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -2520,6 +2521,9 @@ nfs_srvinit(void)
{
int error;
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (EACCES);
+
error = nfs_exportinit();
if (error != 0)
return (error);
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index 93376a9edf..25afef3259 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -161,6 +161,7 @@ struct {
kstat_named_t avenrun_5min;
kstat_named_t avenrun_15min;
kstat_named_t boot_time;
+ kstat_named_t nsec_per_tick;
} system_misc_kstat = {
{ "ncpus", KSTAT_DATA_UINT32 },
{ "lbolt", KSTAT_DATA_UINT32 },
@@ -172,6 +173,7 @@ struct {
{ "avenrun_5min", KSTAT_DATA_UINT32 },
{ "avenrun_15min", KSTAT_DATA_UINT32 },
{ "boot_time", KSTAT_DATA_UINT32 },
+ { "nsec_per_tick", KSTAT_DATA_UINT32 },
};
struct {
@@ -855,6 +857,8 @@ system_misc_kstat_update(kstat_t *ksp, int rw)
system_misc_kstat.avenrun_15min.value.ui32 = (uint32_t)loadavgp[2];
system_misc_kstat.boot_time.value.ui32 = (uint32_t)
zone_boot_time;
+ system_misc_kstat.nsec_per_tick.value.ui32 = (uint32_t)
+ nsec_per_tick;
return (0);
}
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 3ea0d0fe95..79f61ddcb9 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -2222,6 +2222,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)
zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
+ zmp->zm_run_ticks.value.ui64 = zone->zone_run_ticks;
+ zmp->zm_run_wait.value.ui64 = zone->zone_runq_cntr;
+ zmp->zm_fss_shr_pct.value.ui64 = zone->zone_fss_shr_pct;
+ zmp->zm_fss_pri_hi.value.ui64 = zone->zone_fss_pri_hi;
+ zmp->zm_fss_pri_avg.value.ui64 = zone->zone_fss_pri_avg;
+
return (0);
}
@@ -2255,6 +2261,13 @@ zone_misc_kstat_create(zone_t *zone)
kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_run_ticks, "run_ticks", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_run_wait, "run_queue", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_shr_pct, "fss_share_percent",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_fss_pri_hi, "fss_pri_hi", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_pri_avg, "fss_pri_avg",
+ KSTAT_DATA_UINT64);
ksp->ks_update = zone_misc_kstat_update;
ksp->ks_private = zone;
diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h
index a9191aed7c..cb8a6012fc 100644
--- a/usr/src/uts/common/sys/buf.h
+++ b/usr/src/uts/common/sys/buf.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -186,6 +187,7 @@ struct biostats {
#define B_STARTED 0x2000000 /* io:::start probe called for buf */
#define B_ABRWRITE 0x4000000 /* Application based recovery active */
#define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */
+#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */
/*
* There is some confusion over the meaning of B_FREE and B_INVAL and what
@@ -198,6 +200,12 @@ struct biostats {
* between the sole use of these two flags. In both cases, IO will be done
* if the page is not yet committed to storage.
*
+ * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
+ * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no
+ * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then
+ * the mapping for the page is only invalidated for the current process.
+ * In this case, the page is not destroyed unless this was the final mapping.
+ *
* In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
* should be used.
*
diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h
index 583586fd75..cdb47beb7f 100644
--- a/usr/src/uts/common/sys/fss.h
+++ b/usr/src/uts/common/sys/fss.h
@@ -22,6 +22,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_FSS_H
@@ -86,6 +87,7 @@ typedef struct fsspset {
/* on the list */
struct fssproj *fssps_list; /* list of project parts */
struct fsszone *fssps_zones; /* list of fsszone_t's in pset */
+ uint32_t fssps_gen; /* generation for zone's kstats */
} fsspset_t;
/*
@@ -103,6 +105,8 @@ typedef struct fssproj {
/* protected by fssps_displock */
uint32_t fssp_ticks; /* total of all ticks */
/* protected by fssps_displock */
+ uint32_t fssp_zone_ticks; /* unscaled total of all ticks */
+ /* protected by fssps_displock */
fssusage_t fssp_usage; /* this project's decayed usage */
fssusage_t fssp_shusage; /* normalized usage */
struct fssproj *fssp_next; /* next project on this pset */
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index 6c9119e56d..82344607b0 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -353,6 +354,7 @@ struct memcntl_mha32 {
#define MS_SYNC 0x4 /* wait for msync */
#define MS_ASYNC 0x1 /* return immediately */
#define MS_INVALIDATE 0x2 /* invalidate caches */
+#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */
#if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)
/* functions to mctl */
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 08677a2f65..a2b7217fd4 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved.
*/
#ifndef _SYS_ZONE_H
@@ -441,6 +441,11 @@ typedef struct {
kstat_named_t zm_avenrun1;
kstat_named_t zm_avenrun5;
kstat_named_t zm_avenrun15;
+ kstat_named_t zm_run_ticks;
+ kstat_named_t zm_run_wait;
+ kstat_named_t zm_fss_shr_pct;
+ kstat_named_t zm_fss_pri_hi;
+ kstat_named_t zm_fss_pri_avg;
} zone_misc_kstat_t;
typedef struct zone {
@@ -671,6 +676,20 @@ typedef struct zone {
struct loadavg_s zone_loadavg; /* loadavg for this zone */
uint64_t zone_hp_avenrun[3]; /* high-precision avenrun */
int zone_avenrun[3]; /* FSCALED avg. run queue len */
+
+ /*
+ * FSS stats updated once per second by fss_decay_usage.
+ * zone_runq_cntr is an instantaneous accumulation of the number of
+ * processes in the run queue per project and is not computed over the
+ * one second interval.
+ */
+ uint32_t zone_fss_gen; /* FSS generation cntr */
+ uint32_t zone_proc_cnt; /* FSS process cntr */
+ uint64_t zone_run_ticks; /* tot # of ticks running */
+ uint64_t zone_runq_cntr; /* tot # of procs in runq */
+ uint32_t zone_fss_shr_pct; /* fss active shr % in intvl */
+ uint64_t zone_fss_pri_hi; /* fss high pri this interval */
+ uint64_t zone_fss_pri_avg; /* fss avg pri this interval */
} zone_t;
/*
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
index 1ab3a8b65e..63c8b64ad0 100644
--- a/usr/src/uts/common/syscall/memcntl.c
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -116,13 +117,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
* MS_SYNC used to be defined to be zero but is now non-zero.
* For binary compatibility we still accept zero
* (the absence of MS_ASYNC) to mean the same thing.
+ * Binary compatibility is not an issue for MS_INVALCURPROC.
*/
iarg = (uintptr_t)arg;
if ((iarg & ~MS_INVALIDATE) == 0)
iarg |= MS_SYNC;
- if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
- ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ if (((iarg &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
+ ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
+ (MS_INVALIDATE|MS_INVALCURPROC))) {
error = set_errno(EINVAL);
} else {
error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
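
With the validation above, MS_INVALIDATE and MS_INVALCURPROC are mutually exclusive, and the new flag is accepted from user level alongside the existing msync() flags. A hedged sketch (not from the commit; function name is made up) of a caller asking to invalidate only its own mappings, mirroring the MS_ASYNC | MS_INVALCURPROC request that zoneadmd's mcap.c issues via pr_memcntl():

	#include <sys/types.h>
	#include <sys/mman.h>

	/* Illustrative only: drop this process's mappings of [addr, addr + len). */
	int
	drop_my_mappings(caddr_t addr, size_t len)
	{
		return (msync(addr, len, MS_ASYNC | MS_INVALCURPROC));
	}
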
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index 1d91475e38..156b810046 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -460,6 +461,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 31c293d416..5f106f6c06 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -7254,7 +7255,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7279,11 +7281,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages and MS_INVAL* flags
+ * are not on, just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7335,7 +7337,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
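Note (illustrative): segvn_sync() now maps MS_INVALCURPROC onto B_INVALCURONLY | B_INVAL before handing the request to VOP_PUTPAGE(), so the lower layers see a normal invalidate plus the "current process only" modifier. A hedged sketch of just that flag translation, with placeholder constants rather than the real <sys/mman.h> and <sys/buf.h> values:

    #include <stdio.h>

    /* Stand-ins for the msync and buf flags involved. */
    #define XMS_ASYNC         0x1
    #define XMS_INVALIDATE    0x2
    #define XMS_INVALCURPROC  0x8

    #define XB_ASYNC          0x010
    #define XB_INVAL          0x020
    #define XB_INVALCURONLY   0x040

    /* Mirror of the bflags computation added in segvn_sync() above. */
    static unsigned int
    sync_to_bflags(unsigned int flags)
    {
            return (((flags & XMS_ASYNC) ? XB_ASYNC : 0) |
                ((flags & XMS_INVALIDATE) ? XB_INVAL : 0) |
                ((flags & XMS_INVALCURPROC) ? (XB_INVALCURONLY | XB_INVAL) : 0));
    }

    int
    main(void)
    {
            printf("0x%x\n", sync_to_bflags(XMS_INVALCURPROC));    /* 0x60 */
            return (0);
    }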
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 7233581227..39ace0b3c2 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -572,8 +580,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -627,13 +636,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -645,7 +658,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -657,6 +670,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -681,7 +697,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
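Note (illustrative): the pvn_write_done() and pvn_getdirty() changes above hinge on the predicate (flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL, i.e. the full VN_DISPOSE/invalidate path is taken only when B_INVAL is set without B_INVALCURONLY; pages invalidated for the current process alone take the B_FREE-style path instead. A tiny check of that predicate with placeholder flag values:

    #include <stdio.h>

    #define XB_INVAL         0x1    /* stand-in for B_INVAL */
    #define XB_INVALCURONLY  0x2    /* stand-in for B_INVALCURONLY */

    int
    main(void)
    {
            unsigned int cases[] = {
                    0,                              /* neither */
                    XB_INVAL,                       /* full invalidate */
                    XB_INVALCURONLY,                /* current process only */
                    XB_INVAL | XB_INVALCURONLY      /* as built by segvn_sync() */
            };
            int i;

            for (i = 0; i < 4; i++) {
                    int full = (cases[i] & (XB_INVAL | XB_INVALCURONLY)) == XB_INVAL;
                    printf("flags=0x%x full-invalidate=%d\n", cases[i], full);
            }
            return (0);
    }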
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 18e3c4c806..bbfd6013cd 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -939,7 +939,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1026,7 +1029,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1306,6 +1312,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
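Note (illustrative): all three vm_usage.c hunks apply the same rule: a page that exists but sits on the free list (PP_ISFREE) no longer counts toward a zone's resident set. A minimal sketch of that classification, with PP_ISFREE reduced to a plain flag because the real page_t internals are not reproduced here:

    #include <stdio.h>

    typedef enum {
            BOUND_NOT_INCORE,
            BOUND_INCORE
    } bound_type_t;

    /*
     * Classify a page for RSS accounting: present-but-free pages are
     * treated as not in core, mirroring the PP_ISFREE checks above.
     */
    static bound_type_t
    classify_page(int page_exists, int page_is_free)
    {
            if (!page_exists || page_is_free)
                    return (BOUND_NOT_INCORE);
            return (BOUND_INCORE);
    }

    int
    main(void)
    {
            printf("%d %d %d\n", classify_page(0, 0), classify_page(1, 1),
                classify_page(1, 0));       /* 0 0 1 */
            return (0);
    }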
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index 8da02a4c36..40b033d0e4 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -27,6 +27,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -3350,15 +3351,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
extern int vpm_enable;
/*
- * Unload all translations to a page. If the page is a subpage of a large
+ * Unload translations to a page. If the page is a subpage of a large
* page, the large page mappings are also removed.
- *
- * The forceflags are unused.
+ * If unloadflag is HAT_CURPROC_PGUNLOAD, then we only unload the translation
+ * for the current process, otherwise all translations are unloaded.
*/
-
-/*ARGSUSED*/
static int
-hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
+hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
{
page_t *cur_pp = pp;
hment_t *hm;
@@ -3366,6 +3365,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
htable_t *ht;
uint_t entry;
level_t level;
+ struct hat *curhat;
+ ulong_t cnt;
XPV_DISALLOW_MIGRATE();
@@ -3375,6 +3376,9 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
++curthread->t_hatdepth;
ASSERT(curthread->t_hatdepth < 16);
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ curhat = curthread->t_procp->p_as->a_hat;
+
#if defined(__amd64)
/*
* clear the vpm ref.
@@ -3387,6 +3391,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
* The loop with next_size handles pages with multiple pagesize mappings
*/
next_size:
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ cnt = hat_page_getshare(cur_pp);
for (;;) {
/*
@@ -3398,6 +3404,7 @@ next_size:
if (hm == NULL) {
x86_hm_exit(cur_pp);
+curproc_done:
/*
* If not part of a larger page, we're done.
*/
@@ -3424,8 +3431,21 @@ next_size:
* If this mapping size matches, remove it.
*/
level = ht->ht_level;
- if (level == pg_szcd)
- break;
+ if (level == pg_szcd) {
+ if (unloadflag != HAT_CURPROC_PGUNLOAD ||
+ ht->ht_hat == curhat)
+ break;
+ /*
+ * unloadflag == HAT_CURPROC_PGUNLOAD but it's
+ * not the hat for the current process. Leave
+ * entry in place. Also do a safety check to
+ * ensure we don't get in an infinite loop
+ */
+ if (cnt-- == 0) {
+ x86_hm_exit(cur_pp);
+ goto curproc_done;
+ }
+ }
}
/*
@@ -3435,14 +3455,18 @@ next_size:
hm = hati_page_unmap(cur_pp, ht, entry);
if (hm != NULL)
hment_free(hm);
+
+ /* Perform check above for being part of a larger page. */
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ goto curproc_done;
}
}
int
-hat_pageunload(struct page *pp, uint_t forceflag)
+hat_pageunload(struct page *pp, uint_t unloadflag)
{
ASSERT(PAGE_EXCL(pp));
- return (hati_pageunload(pp, 0, forceflag));
+ return (hati_pageunload(pp, 0, unloadflag));
}
/*
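Note (illustrative): the hati_pageunload() rework walks a page's mapping list and, when called with HAT_CURPROC_PGUNLOAD, removes only the mapping owned by the current process's HAT; a counter derived from hat_page_getshare() bounds the scan so that skipping foreign mappings cannot loop forever. A simplified, self-contained sketch of that skip-with-guard pattern (the mapping structure here is a stand-in, not the real hment/htable machinery):

    #include <stdio.h>

    /* Simplified stand-in for one mapping of a page. */
    typedef struct mapping {
            int             m_owner;        /* which address space owns it */
            struct mapping  *m_next;
    } mapping_t;

    /*
     * Unload mappings of a page.  If cur_only is set, unload just the
     * mapping owned by 'curowner' and stop; 'guard' bounds the scan so
     * skipping foreign mappings cannot loop indefinitely.
     */
    static int
    unload_mappings(mapping_t *list, int cur_only, int curowner, int guard)
    {
            int unloaded = 0;
            mapping_t *m = list;

            while (m != NULL) {
                    if (cur_only && m->m_owner != curowner) {
                            if (guard-- == 0)
                                    break;          /* safety valve, as in the diff */
                            m = m->m_next;
                            continue;
                    }
                    unloaded++;                     /* hati_page_unmap() stand-in */
                    if (cur_only)
                            break;                  /* current process's mapping only */
                    m = m->m_next;
            }
            return (unloaded);
    }

    int
    main(void)
    {
            mapping_t c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

            printf("all=%d curproc(2)=%d\n",
                unload_mappings(&a, 0, 0, 8), unload_mappings(&a, 1, 2, 8));
            return (0);
    }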