summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usr/src/cmd/prstat/prstat.c152
-rw-r--r--usr/src/cmd/prstat/prstat.h1
-rw-r--r--usr/src/cmd/rcap/common/rcapd.h35
-rw-r--r--usr/src/cmd/rcap/common/rcapd_stat.h12
-rw-r--r--usr/src/cmd/rcap/common/utils.c81
-rw-r--r--usr/src/cmd/rcap/common/utils.h14
-rw-r--r--usr/src/cmd/rcap/rcapadm/Makefile9
-rw-r--r--usr/src/cmd/rcap/rcapadm/rcapadm.c61
-rw-r--r--usr/src/cmd/rcap/rcapd/Makefile.rcapd4
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_collection.c72
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_collection_project.c53
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c99
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_main.c862
-rw-r--r--usr/src/cmd/rcap/rcapd/rcapd_scanner.c31
-rw-r--r--usr/src/cmd/rcap/rcapstat/Makefile9
-rw-r--r--usr/src/cmd/rcap/rcapstat/rcapstat.c64
-rw-r--r--usr/src/cmd/truss/print.c1
-rw-r--r--usr/src/cmd/truss/systable.c5
-rw-r--r--usr/src/cmd/zoneadm/Makefile4
-rw-r--r--usr/src/cmd/zoneadm/resource-mgmt.xml116
-rw-r--r--usr/src/cmd/zoneadm/svc-resource-mgmt54
-rw-r--r--usr/src/cmd/zoneadm/zoneadm.c186
-rw-r--r--usr/src/cmd/zoneadm/zoneadm.h3
-rw-r--r--usr/src/cmd/zoneadmd/Makefile2
-rw-r--r--usr/src/cmd/zoneadmd/vplat.c267
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.c22
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.h2
-rw-r--r--usr/src/cmd/zonecfg/zonecfg.c1505
-rw-r--r--usr/src/cmd/zonecfg/zonecfg.h54
-rw-r--r--usr/src/cmd/zonecfg/zonecfg_grammar.y141
-rw-r--r--usr/src/cmd/zonecfg/zonecfg_lex.l48
-rw-r--r--usr/src/head/libzonecfg.h80
-rw-r--r--usr/src/lib/Makefile2
-rw-r--r--usr/src/lib/libc/port/gen/getrusage.c15
-rw-r--r--usr/src/lib/libc/port/mapfile-vers1
-rw-r--r--usr/src/lib/libpool/common/pool.c126
-rw-r--r--usr/src/lib/libpool/common/pool.h9
-rw-r--r--usr/src/lib/libpool/common/pool_commit.c17
-rw-r--r--usr/src/lib/libpool/common/pool_internal.c24
-rw-r--r--usr/src/lib/libpool/common/pool_internal.h9
-rw-r--r--usr/src/lib/libpool/common/pool_kernel.c23
-rw-r--r--usr/src/lib/libproject/common/setproject.c14
-rw-r--r--usr/src/lib/libzonecfg/Makefile.com3
-rw-r--r--usr/src/lib/libzonecfg/common/libzonecfg.c1552
-rw-r--r--usr/src/lib/libzonecfg/common/mapfile-vers29
-rw-r--r--usr/src/lib/libzonecfg/dtd/zonecfg.dtd.123
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com1
-rw-r--r--usr/src/pkgdefs/SUNWrcapu/depend8
-rw-r--r--usr/src/pkgdefs/SUNWzoner/prototype_com2
-rw-r--r--usr/src/tools/scripts/bfu.sh2
-rw-r--r--usr/src/uts/common/Makefile.files1
-rw-r--r--usr/src/uts/common/disp/priocntl.c17
-rw-r--r--usr/src/uts/common/fs/tmpfs/tmp_tnode.c15
-rw-r--r--usr/src/uts/common/fs/tmpfs/tmp_vnops.c21
-rw-r--r--usr/src/uts/common/os/modhash.c24
-rw-r--r--usr/src/uts/common/os/pid.c82
-rw-r--r--usr/src/uts/common/os/pool.c4
-rw-r--r--usr/src/uts/common/os/project.c95
-rw-r--r--usr/src/uts/common/os/rctl.c122
-rw-r--r--usr/src/uts/common/os/schedctl.c13
-rw-r--r--usr/src/uts/common/os/sysent.c4
-rw-r--r--usr/src/uts/common/os/task.c11
-rw-r--r--usr/src/uts/common/os/zone.c376
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/modhash_impl.h19
-rw-r--r--usr/src/uts/common/sys/priocntl.h1
-rw-r--r--usr/src/uts/common/sys/proc.h2
-rw-r--r--usr/src/uts/common/sys/project.h14
-rw-r--r--usr/src/uts/common/sys/rctl.h9
-rw-r--r--usr/src/uts/common/sys/resource.h8
-rw-r--r--usr/src/uts/common/sys/syscall.h3
-rw-r--r--usr/src/uts/common/sys/vm_usage.h120
-rw-r--r--usr/src/uts/common/sys/zone.h30
-rw-r--r--usr/src/uts/common/syscall/processor_bind.c10
-rw-r--r--usr/src/uts/common/syscall/pset.c3
-rw-r--r--usr/src/uts/common/syscall/rusagesys.c19
-rw-r--r--usr/src/uts/common/syscall/tasksys.c9
-rw-r--r--usr/src/uts/common/vm/anon.h16
-rw-r--r--usr/src/uts/common/vm/seg.h10
-rw-r--r--usr/src/uts/common/vm/seg_kp.c22
-rw-r--r--usr/src/uts/common/vm/seg_vn.c5
-rw-r--r--usr/src/uts/common/vm/vm_anon.c20
-rw-r--r--usr/src/uts/common/vm/vm_page.c3
-rw-r--r--usr/src/uts/common/vm/vm_seg.c56
-rw-r--r--usr/src/uts/common/vm/vm_usage.c1978
85 files changed, 8180 insertions, 872 deletions
diff --git a/usr/src/cmd/prstat/prstat.c b/usr/src/cmd/prstat/prstat.c
index 743990ad2a..5a4b9185ea 100644
--- a/usr/src/cmd/prstat/prstat.c
+++ b/usr/src/cmd/prstat/prstat.c
@@ -31,6 +31,7 @@
#include <sys/loadavg.h>
#include <sys/time.h>
#include <sys/pset.h>
+#include <sys/vm_usage.h>
#include <zone.h>
#include <libzonecfg.h>
@@ -86,21 +87,21 @@
#define USAGE_HEADER_LWP \
" PID USERNAME USR SYS TRP TFL DFL LCK SLP LAT VCX ICX SCL SIG PROCESS/LWPID "
#define USER_HEADER_PROC \
-" NPROC USERNAME SIZE RSS MEMORY TIME CPU "
+" NPROC USERNAME SWAP RSS MEMORY TIME CPU "
#define USER_HEADER_LWP \
-" NLWP USERNAME SIZE RSS MEMORY TIME CPU "
+" NLWP USERNAME SWAP RSS MEMORY TIME CPU "
#define TASK_HEADER_PROC \
-"TASKID NPROC SIZE RSS MEMORY TIME CPU PROJECT "
+"TASKID NPROC SWAP RSS MEMORY TIME CPU PROJECT "
#define TASK_HEADER_LWP \
-"TASKID NLWP SIZE RSS MEMORY TIME CPU PROJECT "
+"TASKID NLWP SWAP RSS MEMORY TIME CPU PROJECT "
#define PROJECT_HEADER_PROC \
-"PROJID NPROC SIZE RSS MEMORY TIME CPU PROJECT "
+"PROJID NPROC SWAP RSS MEMORY TIME CPU PROJECT "
#define PROJECT_HEADER_LWP \
-"PROJID NLWP SIZE RSS MEMORY TIME CPU PROJECT "
+"PROJID NLWP SWAP RSS MEMORY TIME CPU PROJECT "
#define ZONE_HEADER_PROC \
-"ZONEID NPROC SIZE RSS MEMORY TIME CPU ZONE "
+"ZONEID NPROC SWAP RSS MEMORY TIME CPU ZONE "
#define ZONE_HEADER_LWP \
-"ZONEID NLWP SIZE RSS MEMORY TIME CPU ZONE "
+"ZONEID NLWP SWAP RSS MEMORY TIME CPU ZONE "
#define PSINFO_LINE \
"%6d %-8s %5s %5s %-6s %3s %3s %9s %3.3s%% %-.16s/%d"
#define PSINFO_LINE_LGRP \
@@ -160,6 +161,8 @@ static volatile uint_t sigwinch = 0;
static volatile uint_t sigtstp = 0;
static volatile uint_t sigterm = 0;
+static long pagesize;
+
/* default settings */
static optdesc_t opts = {
@@ -185,6 +188,129 @@ psetloadavg(long psetid, void *ptr)
}
/*
+ * Overrides the swap size, rss size and memory percentage of each member
+ * of a list with the values reported by getvmusage().  Members without a
+ * matching result, and every member when getvmusage() fails, keep the
+ * values computed by /proc aggregation.
+ */
+static void
+list_getsize(list_t *list)
+{
+	id_info_t *id;
+	vmusage_t *results, *next;
+	vmusage_t *match;
+	size_t nres = 0;
+	size_t i;
+	uint_t flags = 0;
+	int ret;
+	/*
+	 * Widen before multiplying so that the byte count cannot wrap when
+	 * long is 32 bits wide and the machine has 4GB or more of memory.
+	 */
+	uint64_t physmem = (uint64_t)sysconf(_SC_PHYS_PAGES) *
+	    (uint64_t)pagesize;
+
+	/*
+	 * Determine what swap/rss results to calculate.  getvmusage() will
+	 * prune results returned to non-global zones automatically, so
+	 * there is no need to pass different flags when calling from a
+	 * non-global zone.
+	 *
+	 * Currently list_getsize() is only called with a single flag.  This
+	 * is because -Z, -J, -T, and -a are mutually exclusive.  Regardless
+	 * of this, we handle multiple flags.
+	 */
+	if (opts.o_outpmode & OPT_USERS) {
+		/*
+		 * Gather rss for all users in all zones.  Treat the same
+		 * uid in different zones as the same user.
+		 */
+		flags |= VMUSAGE_COL_RUSERS;
+
+	} else if (opts.o_outpmode & OPT_TASKS) {
+		/* Gather rss for all tasks in all zones */
+		flags |= VMUSAGE_ALL_TASKS;
+
+	} else if (opts.o_outpmode & OPT_PROJECTS) {
+		/*
+		 * Gather rss for all projects in all zones.  Treat the same
+		 * projid in different zones as the same project.
+		 */
+		flags |= VMUSAGE_COL_PROJECTS;
+
+	} else if (opts.o_outpmode & OPT_ZONES) {
+		/* Gather rss for all zones */
+		flags |= VMUSAGE_ALL_ZONES;
+
+	} else {
+		Die(gettext(
+		    "Cannot determine rss flags for output options %x\n"),
+		    opts.o_outpmode);
+	}
+
+	/*
+	 * getvmusage() returns an array of result structures.  One for
+	 * each zone, project, task, or user on the system, depending on
+	 * flags.
+	 *
+	 * The first call passes no buffer and only learns the number of
+	 * results.  If getvmusage() fails, prstat will use the size already
+	 * gathered from psinfo.
+	 */
+	if (getvmusage(flags, opts.o_interval, NULL, &nres) != 0)
+		return;
+
+	results = (vmusage_t *)Malloc(sizeof (vmusage_t) * nres);
+	for (;;) {
+		ret = getvmusage(flags, opts.o_interval, results, &nres);
+		if (ret == 0)
+			break;
+		if (errno == EOVERFLOW) {
+			/* More results than the buffer holds: grow and retry */
+			results = (vmusage_t *)Realloc(results,
+			    sizeof (vmusage_t) * nres);
+			continue;
+		}
+		/*
+		 * Failure for some other reason.  Prstat will use the size
+		 * already gathered from psinfo.  Release the buffer so that
+		 * it is not leaked on every refresh.
+		 */
+		free(results);
+		return;
+	}
+	for (id = list->l_head; id != NULL; id = id->id_next) {
+
+		/* Find the result whose id matches this list member */
+		match = NULL;
+		next = results;
+		for (i = 0; i < nres; i++, next++) {
+			switch (flags) {
+			case VMUSAGE_COL_RUSERS:
+				if (next->vmu_id == id->id_uid)
+					match = next;
+				break;
+			case VMUSAGE_ALL_TASKS:
+				if (next->vmu_id == id->id_taskid)
+					match = next;
+				break;
+			case VMUSAGE_COL_PROJECTS:
+				if (next->vmu_id == id->id_projid)
+					match = next;
+				break;
+			case VMUSAGE_ALL_ZONES:
+				if (next->vmu_id == id->id_zoneid)
+					match = next;
+				break;
+			default:
+				Die(gettext(
+				    "Unknown vmusage flags %d\n"), flags);
+			}
+		}
+		if (match != NULL) {
+			id->id_size = match->vmu_swap_all / 1024;
+			id->id_rssize = match->vmu_rss_all / 1024;
+			id->id_pctmem = (100.0 * (float)match->vmu_rss_all) /
+			    (float)physmem;
+			/* Output using data from getvmusage() */
+			id->id_sizematch = B_TRUE;
+		}
+		/*
+		 * If no match is found, prstat will use the size already
+		 * gathered from psinfo.
+		 */
+	}
+
+	/* The results have been copied into the list; drop the buffer */
+	free(results);
+}
+
+
+/*
* A routine to display the contents of the list on the screen
*/
static void
@@ -282,7 +408,7 @@ list_print(list_t *list)
cpu = (100 * id->id_pctcpu) / total_cpu;
else
cpu = id->id_pctcpu;
- if (total_mem >= 100)
+ if (id->id_sizematch == B_FALSE && total_mem >= 100)
mem = (100 * id->id_pctmem) / total_mem;
else
mem = id->id_pctmem;
@@ -566,6 +692,7 @@ update:
id->id_zoneid = lwp->li_info.pr_zoneid;
id->id_lgroup = lwp->li_info.pr_lwp.pr_lgrp;
id->id_nproc++;
+ id->id_sizematch = B_FALSE;
if (lwp->li_flags & LWP_REPRESENT) {
id->id_size = lwp->li_info.pr_size;
id->id_rssize = lwp->li_info.pr_rssize;
@@ -1175,6 +1302,7 @@ Exit()
fd_exit();
}
+
int
main(int argc, char **argv)
{
@@ -1192,6 +1320,8 @@ main(int argc, char **argv)
lwpid_init();
fd_init(Setrlimit());
+ pagesize = sysconf(_SC_PAGESIZE);
+
while ((opt = getopt(argc, argv, "vcHmaRLtu:U:n:p:C:P:h:s:S:j:k:TJz:Z"))
!= (int)EOF) {
switch (opt) {
@@ -1419,21 +1549,25 @@ main(int argc, char **argv)
list_print(&lwps);
}
if (opts.o_outpmode & OPT_USERS) {
+ list_getsize(&users);
list_sort(&users);
list_print(&users);
list_clear(&users);
}
if (opts.o_outpmode & OPT_TASKS) {
+ list_getsize(&tasks);
list_sort(&tasks);
list_print(&tasks);
list_clear(&tasks);
}
if (opts.o_outpmode & OPT_PROJECTS) {
+ list_getsize(&projects);
list_sort(&projects);
list_print(&projects);
list_clear(&projects);
}
if (opts.o_outpmode & OPT_ZONES) {
+ list_getsize(&zones);
list_sort(&zones);
list_print(&zones);
list_clear(&zones);
diff --git a/usr/src/cmd/prstat/prstat.h b/usr/src/cmd/prstat/prstat.h
index 1a13329845..d130164e7d 100644
--- a/usr/src/cmd/prstat/prstat.h
+++ b/usr/src/cmd/prstat/prstat.h
@@ -122,6 +122,7 @@ typedef struct id_info {
zoneid_t id_zoneid; /* zone id */
int id_lgroup; /* lgroup id */
uint_t id_nproc; /* number of processes */
+ boolean_t id_sizematch; /* size/rssize from getvmusage() */
size_t id_size; /* memory usage */
size_t id_rssize; /* resident set size */
ulong_t id_time; /* cpu time (in secs) */
diff --git a/usr/src/cmd/rcap/common/rcapd.h b/usr/src/cmd/rcap/common/rcapd.h
index 89cf5f3d81..7a554c213b 100644
--- a/usr/src/cmd/rcap/common/rcapd.h
+++ b/usr/src/cmd/rcap/common/rcapd.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -58,7 +57,21 @@ extern "C" {
#define LCST_CAP_REMOVED (1<<1)
#define LCST_CAP_ZERO (1<<2)
-typedef int64_t rcid_t;
+/*
+ * The kind of collection an rcid_t refers to: a project or a zone.
+ */
+typedef enum {
+ RCIDT_PROJECT,
+ RCIDT_ZONE
+} rcid_type_t;
+
+/*
+ * Collection identifier: a type tag paired with an ID value.  Two
+ * identifiers refer to the same collection only when both fields match.
+ * For projects rcid_val carries the project ID; presumably it carries the
+ * zone ID for zones (the zone code is not visible here -- confirm).
+ */
+typedef struct {
+ /*
+ * The following field could just be a rcid_type_t but it gets
+ * written out to a file as binary data for communication between
+ * 64-bit rcapd & 32-bit rcapstat, so we need to force a standard size
+ * and alignment here.
+ */
+ uint64_t rcid_type;
+ int64_t rcid_val;
+} rcid_t;
typedef enum {
LCU_COMPLETE, /* an enumeration of all possible collections */
@@ -138,7 +151,6 @@ typedef struct lcollection {
uint64_t lcol_rss; /* RSS of all processes (kB) */
uint64_t lcol_image_size; /* image size of all processes (kB) */
uint64_t lcol_rss_cap; /* RSS cap (kB) */
- int lcol_stat_invalidate; /* flag to reset interval statistics */
lcollection_stat_t lcol_stat; /* statistics */
lcollection_stat_t lcol_stat_old; /* previous interval's statistics */
lprocess_t *lcol_lprocess; /* member processes */
@@ -162,12 +174,11 @@ typedef struct lcollection_report {
extern int get_psinfo(pid_t, struct psinfo *, int, int(*)(void *, int), void *,
lprocess_t *);
-extern lcollection_t *lcollection_find(id_t);
+extern lcollection_t *lcollection_find(rcid_t *);
extern void lcollection_freq_move(lprocess_t *);
-extern lcollection_t *lcollection_insert_update(rcid_t, uint64_t, char *,
+extern lcollection_t *lcollection_insert_update(rcid_t *, uint64_t, char *,
int *changes);
extern int lcollection_member(lcollection_t *, lprocess_t *);
-extern void lcollection_set_type(rctype_t);
extern void lcollection_free(lcollection_t *);
extern void lcollection_update(lcollection_update_type_t);
extern void list_walk_collection(int (*)(lcollection_t *, void *), void *);
@@ -178,12 +189,6 @@ extern void scan_abort(void);
extern void check_update_statistics(void);
/*
- * The collection-specific function determining the collection ID from a
- * process' psinfo.
- */
-extern rcid_t(*rc_getidbypsinfo)(struct psinfo *);
-
-/*
* Global (in rcapd only) variables.
*/
extern rcfg_t rcfg;
diff --git a/usr/src/cmd/rcap/common/rcapd_stat.h b/usr/src/cmd/rcap/common/rcapd_stat.h
index c34ceb36e2..fa769ba643 100644
--- a/usr/src/cmd/rcap/common/rcapd_stat.h
+++ b/usr/src/cmd/rcap/common/rcapd_stat.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,7 +43,10 @@ extern "C" {
*/
#define RC_MODE_LEN 16
typedef struct rcapd_stat_hdr {
- pid_t rs_pid; /* pid of producer */
+ /*
+ * sizeof pid_t can vary, so we use a fixed 64-bit quantity.
+ */
+ uint64_t rs_pid; /* pid of producer */
hrtime_t rs_time; /* time recorded */
/*
diff --git a/usr/src/cmd/rcap/common/utils.c b/usr/src/cmd/rcap/common/utils.c
index f9757a12f6..c01f568915 100644
--- a/usr/src/cmd/rcap/common/utils.c
+++ b/usr/src/cmd/rcap/common/utils.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -260,3 +259,77 @@ xatoi(char *p)
return (i);
}
}
+
+/*
+ * get_running_zones() builds an array describing every running zone.
+ * zone_list(2) is asked first for the number of zones and then, with a
+ * buffer of that size, for the zone IDs; the second step is repeated until
+ * the count stops changing.  Each ID is then resolved to a name and kept
+ * only if the zone's state is ZONE_STATE_RUNNING.
+ *
+ * On success *zents points to a calloc()ed array (it stays NULL when
+ * zone_list(2) reports no zones) and *nzents is the number of entries
+ * filled in.  On failure a warning is issued, *zents is NULL and E_ERROR
+ * is returned.
+ */
+int
+get_running_zones(uint_t *nzents, zone_entry_t **zents)
+{
+	zoneid_t *idlist;
+	zone_entry_t *slot;
+	zone_state_t state;
+	uint_t expected;
+	uint_t nkept = 0;
+	int idx;
+
+	*zents = NULL;
+	if (zone_list(NULL, nzents) != 0) {
+		warn(gettext("could not get zoneid list\n"));
+		return (E_ERROR);
+	}
+
+	/* Fetch the IDs, starting over whenever the zone count changes. */
+	for (;;) {
+		if (*nzents == 0)
+			return (E_SUCCESS);
+
+		expected = *nzents;
+		idlist = (zoneid_t *)calloc(expected, sizeof (zoneid_t));
+		if (idlist == NULL) {
+			warn(gettext("out of memory: zones will not be "
+			    "capped\n"));
+			return (E_ERROR);
+		}
+
+		if (zone_list(idlist, nzents) != 0) {
+			warn(gettext("could not get zone list\n"));
+			free(idlist);
+			return (E_ERROR);
+		}
+
+		if (*nzents == expected)
+			break;
+
+		/* list changed, try again */
+		free(idlist);
+	}
+
+	*zents = calloc(*nzents, sizeof (zone_entry_t));
+	if (*zents == NULL) {
+		warn(gettext("out of memory: zones will not be capped\n"));
+		free(idlist);
+		return (E_ERROR);
+	}
+
+	for (idx = 0; idx < *nzents; idx++) {
+		char zname[ZONENAME_MAX];
+
+		if (getzonenamebyid(idlist[idx], zname, sizeof (zname)) < 0) {
+			warn(gettext("could not get name for "
+			    "zoneid %d\n"), idlist[idx]);
+			continue;
+		}
+
+		/*
+		 * Write into the next free slot.  The slot is only claimed
+		 * if the zone turns out to be running; otherwise the next
+		 * candidate overwrites it.
+		 */
+		slot = &(*zents)[nkept];
+		(void) strlcpy(slot->zname, zname, sizeof (slot->zname));
+		slot->zid = idlist[idx];
+
+		if (zone_get_state(zname, &state) == Z_OK &&
+		    state == ZONE_STATE_RUNNING)
+			nkept++;
+	}
+	*nzents = nkept;
+
+	free(idlist);
+	return (E_SUCCESS);
+}
diff --git a/usr/src/cmd/rcap/common/utils.h b/usr/src/cmd/rcap/common/utils.h
index 678dee51ab..f952d59bbb 100644
--- a/usr/src/cmd/rcap/common/utils.h
+++ b/usr/src/cmd/rcap/common/utils.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +32,7 @@
#include <libintl.h>
#include <stdarg.h>
#include <time.h>
+#include <libzonecfg.h>
#ifdef __cplusplus
extern "C" {
@@ -63,6 +63,11 @@ typedef enum rcm_dst {
RCD_SYSLOG /* syslog() daemon facility */
} rcm_dst_t;
+/*
+ * One zone as returned by get_running_zones(): its numeric zone ID and
+ * its name.
+ */
+typedef struct zone_entry {
+ zoneid_t zid;
+ char zname[ZONENAME_MAX];
+} zone_entry_t;
+
#define LINELEN 256 /* max. message length */
#ifdef DEBUG
@@ -95,6 +100,7 @@ extern void vdprintfe(int, char *, va_list);
extern void dprintfe(int, char *, ...);
extern void hrt2ts(hrtime_t, timestruc_t *);
extern int xatoi(char *);
+extern int get_running_zones(uint_t *, zone_entry_t **);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/rcap/rcapadm/Makefile b/usr/src/cmd/rcap/rcapadm/Makefile
index 59c1530185..3b4de32953 100644
--- a/usr/src/cmd/rcap/rcapadm/Makefile
+++ b/usr/src/cmd/rcap/rcapadm/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
@@ -41,7 +40,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \
$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG
CPPFLAGS += -I$(COMMON_DIR)
-LDLIBS += -lumem -ll -lscf
+LDLIBS += -lumem -ll -lscf -lzonecfg
LINTFLAGS += $(LDLIBS) -mnu
diff --git a/usr/src/cmd/rcap/rcapadm/rcapadm.c b/usr/src/cmd/rcap/rcapadm/rcapadm.c
index cc9fd290a1..1951682283 100644
--- a/usr/src/cmd/rcap/rcapadm/rcapadm.c
+++ b/usr/src/cmd/rcap/rcapadm/rcapadm.c
@@ -39,6 +39,8 @@
#include <libscf_priv.h>
#include <libintl.h>
#include <locale.h>
+#include <zone.h>
+#include <libzonecfg.h>
#include "utils.h"
#include "rcapd.h"
@@ -61,7 +63,9 @@ usage()
" [-c <percent>] "
"# set memory cap\n"
" "
- "# enforcement threshold\n"));
+ "# enforcement threshold\n"
+ " [-z <zonename> -m <max-rss>] "
+ "# update zone memory cap\n"));
exit(E_USAGE);
}
@@ -135,18 +139,54 @@ out:
scf_handle_destroy(h);
}
+/*
+ * Set the in-kernel physical memory cap of a running zone.
+ *
+ * zonename names the zone and maxrss is the new cap as a string, which
+ * zonecfg_str_to_bytes() converts to a byte count.  Returns E_SUCCESS
+ * without doing anything when the caller is not in the global zone or when
+ * zonecfg_in_alt_root() is true; returns E_ERROR, after printing a message
+ * to stderr, if the zone is not running, the value does not parse or the
+ * cap cannot be set.
+ */
+static int
+update_zone_mcap(char *zonename, char *maxrss)
+{
+	uint64_t cap;
+	zoneid_t zid;
+
+	/* Only act from the global zone, and never on an alternate root. */
+	if (getzoneid() != GLOBAL_ZONEID)
+		return (E_SUCCESS);
+	if (zonecfg_in_alt_root())
+		return (E_SUCCESS);
+
+	/* get the running zone from the kernel */
+	zid = getzoneidbyname(zonename);
+	if (zid == -1) {
+		(void) fprintf(stderr, gettext("zone '%s' must be running\n"),
+		    zonename);
+		return (E_ERROR);
+	}
+
+	/* Convert the textual cap into a number */
+	if (zonecfg_str_to_bytes(maxrss, &cap) == -1) {
+		(void) fprintf(stderr, gettext("invalid max-rss value\n"));
+		return (E_ERROR);
+	}
+
+	/* Hand the new cap to the kernel */
+	if (zone_setattr(zid, ZONE_ATTR_PHYS_MCAP, &cap, 0) == -1) {
+		(void) fprintf(stderr, gettext("could not set memory "
+		    "cap for zone '%s'\n"), zonename);
+		return (E_ERROR);
+	}
+
+	return (E_SUCCESS);
+}
+
int
main(int argc, char *argv[])
{
char *subopts, *optval;
int modified = 0;
+ boolean_t refresh = B_FALSE;
int opt;
+ char *zonename;
+ char *maxrss = NULL;
(void) setprogname("rcapadm");
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
- while ((opt = getopt(argc, argv, "DEc:i:n")) != EOF) {
+ while ((opt = getopt(argc, argv, "DEc:i:m:nz:")) != EOF) {
switch (opt) {
case 'n':
no_starting_stopping = 1;
@@ -203,12 +243,24 @@ main(int argc, char *argv[])
}
modified++;
break;
+ case 'm':
+ maxrss = optarg;
+ break;
+ case 'z':
+ refresh = B_TRUE;
+ zonename = optarg;
+ break;
default:
usage();
}
}
- if (argc > optind)
+ /* the -z & -m options must be used together */
+ if (argc > optind || (refresh && maxrss == NULL) ||
+ (!refresh && maxrss != NULL))
+ usage();
+
+ if (refresh && (no_starting_stopping > 0 || modified))
usage();
if (rcfg_read(fname, -1, &conf, NULL) < 0) {
@@ -232,6 +284,9 @@ main(int argc, char *argv[])
}
}
+ if (refresh)
+ return (update_zone_mcap(zonename, maxrss));
+
if (modified) {
if (pressure >= 0)
conf.rcfg_memory_cap_enforcement_pressure = pressure;
diff --git a/usr/src/cmd/rcap/rcapd/Makefile.rcapd b/usr/src/cmd/rcap/rcapd/Makefile.rcapd
index 5fd0d01416..716ea41e38 100644
--- a/usr/src/cmd/rcap/rcapd/Makefile.rcapd
+++ b/usr/src/cmd/rcap/rcapd/Makefile.rcapd
@@ -35,6 +35,7 @@
SRCS = rcapd_main.c \
rcapd_collection.c \
rcapd_collection_project.c \
+ rcapd_collection_zone.c \
rcapd_mapping.c \
rcapd_rfd.c \
rcapd_scanner.c \
@@ -44,6 +45,7 @@ SRCS = rcapd_main.c \
LINTSRCS = ../rcapd_main.c \
../rcapd_collection.c \
../rcapd_collection_project.c \
+ ../rcapd_collection_zone.c \
../rcapd_mapping.c \
../rcapd_rfd.c \
../rcapd_scanner.c \
@@ -53,7 +55,7 @@ LINTSRCS = ../rcapd_main.c \
$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG
CPPFLAGS += -DDEBUG_MSG
CPPFLAGS += -I$(COMMON_DIR)
-LDLIBS += -lkstat -ll -lproc -lproject -lumem
+LDLIBS += -lkstat -ll -lproc -lproject -lzonecfg -lumem
LDLIBS += $(EXTRA_LDLIBS)
LINTFLAGS += -u
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection.c b/usr/src/cmd/rcap/rcapd/rcapd_collection.c
index 7dac0e8155..fdaf8dbfe0 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_collection.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_collection.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,14 +40,16 @@
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
typedef struct {
- rcid_t lfa_colid;
+ rcid_t *lfa_colidp;
lcollection_t *lfa_found;
} lcollection_find_arg_t;
extern void lcollection_update_project(lcollection_update_type_t,
- void(*)(char *, int, uint64_t, int));
-extern void lcollection_set_type_project();
-static void lcollection_update_notification_cb(char *, int, uint64_t, int);
+ void(*)(char *, char *, int, uint64_t, int));
+extern void lcollection_update_zone(lcollection_update_type_t,
+ void(*)(char *, char *, int, uint64_t, int));
+static void lcollection_update_notification_cb(char *, char *, int, uint64_t,
+ int);
rcid_t(*rc_getidbypsinfo)(psinfo_t *);
uint64_t phys_total = 0;
@@ -57,28 +58,8 @@ static lcollection_t *lcollection_head = NULL;
void
lcollection_update(lcollection_update_type_t ut)
{
- if (rcfg.rcfg_mode == rctype_project)
- lcollection_update_project(ut,
- lcollection_update_notification_cb);
- else
- die(gettext("unknown mode %s\n"), rcfg.rcfg_mode_name);
-}
-
-/*
- * Configure which collection type will be used.
- */
-void
-lcollection_set_type(rctype_t type)
-{
- switch (type) {
- case rctype_project:
- lcollection_set_type_project();
- break;
- default:
- /* can't happen */
- die(gettext("unknown mode %d\n"), type);
- /*NOTREACHED*/
- }
+ lcollection_update_zone(ut, lcollection_update_notification_cb);
+ lcollection_update_project(ut, lcollection_update_notification_cb);
}
/*
@@ -93,7 +74,7 @@ lcollection_set_type(rctype_t type)
* LCSS_CAP_ZERO
*/
lcollection_t *
-lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name,
+lcollection_insert_update(rcid_t *colidp, uint64_t rss_cap, char *name,
int *changes)
{
lcollection_t *lcol;
@@ -103,7 +84,7 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name,
if (rss_cap == 0)
*changes |= LCST_CAP_ZERO;
- lcol = lcollection_find(colid);
+ lcol = lcollection_find(colidp);
/*
* If the specified collection is capped, add it to lcollection.
@@ -120,12 +101,13 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name,
lcol = malloc(sizeof (*lcol));
if (lcol == NULL) {
debug("not enough memory to monitor %s %s",
- rcfg.rcfg_mode_name, name);
+ (colidp->rcid_type == RCIDT_PROJECT ?
+ "project" : "zone"), name);
return (NULL);
}
(void) bzero(lcol, sizeof (*lcol));
- lcol->lcol_id = colid;
+ lcol->lcol_id = *colidp;
debug("added collection %s\n", name);
lcol->lcol_prev = NULL;
lcol->lcol_next = lcollection_head;
@@ -157,8 +139,8 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name,
}
static void
-lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap,
- int mark)
+lcollection_update_notification_cb(char *col_type, char *name, int changes,
+ uint64_t rss_cap, int mark)
{
/*
* Assume the collection has been updated redundantly if its mark count
@@ -168,10 +150,10 @@ lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap,
return;
if (changes & LCST_CAP_ZERO)
- debug("%s %s: %s\n", rcfg.rcfg_mode_name, name,
+ debug("%s %s: %s\n", col_type, name,
(changes & LCST_CAP_REMOVED) ? "cap removed" : "uncapped");
else
- debug("%s %s: cap: %llukB\n", rcfg.rcfg_mode_name, name,
+ debug("%s %s: cap: %llukB\n", col_type, name,
(unsigned long long)rss_cap);
}
@@ -215,19 +197,23 @@ lcollection_member(lcollection_t *lcol, lprocess_t *lpc)
static int
lcollection_find_cb(lcollection_t *lcol, void *arg)
{
- if (lcol->lcol_id == ((lcollection_find_arg_t *)arg)->lfa_colid) {
+ rcid_t *colidp = ((lcollection_find_arg_t *)arg)->lfa_colidp;
+
+ if (lcol->lcol_id.rcid_type == colidp->rcid_type &&
+ lcol->lcol_id.rcid_val == colidp->rcid_val) {
((lcollection_find_arg_t *)arg)->lfa_found = lcol;
return (1);
- } else
- return (0);
+ }
+
+ return (0);
}
lcollection_t *
-lcollection_find(id_t colid)
+lcollection_find(rcid_t *colidp)
{
lcollection_find_arg_t lfa;
- lfa.lfa_colid = colid;
+ lfa.lfa_colidp = colidp;
lfa.lfa_found = NULL;
list_walk_collection(lcollection_find_cb, &lfa);
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c
index ba34100f05..eab6d2a94a 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,24 +37,17 @@
/* round up to next y = 2^n */
#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
-static rcid_t rc_proj_getidbypsinfo(psinfo_t *);
-
-void
-lcollection_set_type_project(void)
-{
- rc_getidbypsinfo = rc_proj_getidbypsinfo;
-}
-
static int
lcollection_update_project_cb(const struct project *proj, void *walk_data)
{
- void(*update_notification_cb)(char *, int, uint64_t, int) =
- (void(*)(char *, int, uint64_t, int))walk_data;
+ void(*update_notification_cb)(char *, char *, int, uint64_t, int) =
+ (void(*)(char *, char *, int, uint64_t, int))walk_data;
char *capattr_abs;
char *end;
int changes;
int64_t max_rss;
lcollection_t *lcol;
+ rcid_t colid;
capattr_abs = strstr(proj->pj_attr, PJ_ABS_ATTR_NAME "=");
if (capattr_abs != NULL) {
@@ -70,17 +62,19 @@ lcollection_update_project_cb(const struct project *proj, void *walk_data)
capattr_abs += strlen(PJ_ABS_ATTR_NAME "=");
max_rss = ROUNDUP(strtoll(capattr_abs, &end, 10), 1024) / 1024;
if (end == capattr_abs || *end != ';' && *end != 0)
- warn(gettext("%s %s: malformed %s value "
- "'%s'\n"), rcfg.rcfg_mode_name, proj->pj_name,
- PJ_ABS_ATTR_NAME, capattr_abs);
+ warn(gettext("project %s: malformed %s value '%s'\n"),
+ proj->pj_name, PJ_ABS_ATTR_NAME, capattr_abs);
} else
max_rss = 0;
- lcol = lcollection_insert_update(proj->pj_projid, max_rss,
- proj->pj_name, &changes);
+ colid.rcid_type = RCIDT_PROJECT;
+ colid.rcid_val = proj->pj_projid;
+
+ lcol = lcollection_insert_update(&colid, max_rss, proj->pj_name,
+ &changes);
if (update_notification_cb != NULL)
- update_notification_cb(proj->pj_name, changes, max_rss, (lcol !=
- NULL) ? lcol->lcol_mark : 0);
+ update_notification_cb("project", proj->pj_name, changes,
+ max_rss, (lcol != NULL) ? lcol->lcol_mark : 0);
return (0);
}
@@ -101,10 +95,13 @@ lcollection_update_project_byid_cb(const projid_t id, void *walk_data)
static int
lcollection_update_onceactive_cb(lcollection_t *lcol, void *walk_data)
{
- void(*update_notification_cb)(char *, int, uint64_t, int) =
- (void(*)(char *, int, uint64_t, int))walk_data;
+ void(*update_notification_cb)(char *, char *, int, uint64_t, int) =
+ (void(*)(char *, char *, int, uint64_t, int))walk_data;
+
+ if (lcol->lcol_id.rcid_type != RCIDT_PROJECT)
+ return (0);
- return (lcollection_update_project_byid_cb(lcol->lcol_id,
+ return (lcollection_update_project_byid_cb(lcol->lcol_id.rcid_val,
(void *)update_notification_cb));
}
@@ -125,7 +122,7 @@ project_walk_all(int(*cb)(const struct project *, void *), void *walk_data)
void
lcollection_update_project(lcollection_update_type_t ut,
- void(*update_notification_cb)(char *, int, uint64_t, int))
+ void(*update_notification_cb)(char *, char *, int, uint64_t, int))
{
switch (ut) {
case LCU_ACTIVE_ONLY:
@@ -154,9 +151,3 @@ lcollection_update_project(lcollection_update_type_t ut,
(void *)update_notification_cb);
}
}
-
-static rcid_t
-rc_proj_getidbypsinfo(psinfo_t *psinfo)
-{
- return (psinfo->pr_projid);
-}
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c
new file mode 100644
index 0000000000..db86aa6276
--- /dev/null
+++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <procfs.h>
+#include <project.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <zone.h>
+#include <libzonecfg.h>
+#include "rcapd.h"
+#include "utils.h"
+
+extern boolean_t gz_capped;
+
+ /*
+  * Round x up to the next multiple of y.  y must be a power of two (2^n),
+  * because the remainder is cleared with a bit mask built from y - 1.
+  */
+#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
+
+/*
+ * Insert or update the collection for one zone, using the zone's
+ * ZONE_ATTR_PHYS_MCAP attribute (rounded up and divided by 1024) as its RSS
+ * cap; a failed lookup or a zero attribute yields a cap of zero.  When the
+ * zone is the global zone, gz_capped records whether that cap is non-zero.
+ * If walk_data holds a notification callback, the outcome is reported
+ * through it.
+ */
+static void
+update_zone(zone_entry_t *zent, void *walk_data)
+{
+	void (*notify)(char *, char *, int, uint64_t, int) =
+	    (void (*)(char *, char *, int, uint64_t, int))walk_data;
+	uint64_t phys_mcap;
+	int64_t rss_cap = 0;
+	rcid_t zone_colid;
+	lcollection_t *zone_col;
+	int changes;
+
+	if (zone_getattr(zent->zid, ZONE_ATTR_PHYS_MCAP, &phys_mcap,
+	    sizeof (phys_mcap)) != -1 && phys_mcap != 0)
+		rss_cap = ROUNDUP(phys_mcap, 1024) / 1024;
+
+	if (zent->zid == GLOBAL_ZONEID)
+		gz_capped = (rss_cap > 0) ? B_TRUE : B_FALSE;
+
+	zone_colid.rcid_type = RCIDT_ZONE;
+	zone_colid.rcid_val = zent->zid;
+	zone_col = lcollection_insert_update(&zone_colid, rss_cap,
+	    zent->zname, &changes);
+
+	if (notify == NULL)
+		return;
+
+	notify("zone", zent->zname, changes, rss_cap,
+	    (zone_col != NULL) ? zone_col->lcol_mark : 0);
+}
+
+
+/*
+ * Refresh the collection of every running zone, handing the notification
+ * callback on to update_zone() for each one.  The update type is not used.
+ * Nothing is done when the running zones cannot be enumerated.
+ */
+/* ARGSUSED */
+void
+lcollection_update_zone(lcollection_update_type_t ut,
+    void(*update_notification_cb)(char *, char *, int, uint64_t, int))
+{
+	zone_entry_t *zone_list;
+	uint_t zone_cnt;
+	uint_t idx;
+
+	if (get_running_zones(&zone_cnt, &zone_list) != 0)
+		return;
+
+	for (idx = 0; idx < zone_cnt; idx++)
+		update_zone(&zone_list[idx], (void *)update_notification_cb);
+
+	free(zone_list);
+}
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_main.c b/usr/src/cmd/rcap/rcapd/rcapd_main.c
index 9c2e8b3c48..960065826e 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_main.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_main.c
@@ -61,6 +61,7 @@
#include <unistd.h>
#include <zone.h>
#include <assert.h>
+#include <sys/vm_usage.h>
#include "rcapd.h"
#include "rcapd_mapping.h"
#include "rcapd_rfd.h"
@@ -80,30 +81,42 @@
#define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */
#define DAEMON_UID 1 /* uid to use */
+#define CAPPED_PROJECT 0x01 /* at least one project collection has a cap */
+#define CAPPED_ZONE 0x02 /* at least one non-project (zone) collection has a cap */
+
typedef struct soft_scan_arg {
uint64_t ssa_sum_excess; /* sum of the collections' excesses (the denominator) */
int64_t ssa_scan_goal; /* total quantity to scan for, reported in KB */
+ boolean_t ssa_project_over_cap; /* copy of sca_project_over_cap for the global zone pass */
} soft_scan_arg_t;
+typedef struct sample_col_arg {
+ boolean_t sca_any_over_cap; /* set when any sampled collection exceeds its cap */
+ boolean_t sca_project_over_cap; /* set when a project collection exceeds its cap */
+} sample_col_arg_t;
+
+
static int debug_mode = 0; /* debug mode flag */
static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */
/* scanned */
static kstat_ctl_t *kctl; /* kstat chain */
-static uint64_t new_sp = 0, old_sp = 0; /* measure delta in page scan count */
-static int enforce_caps = 0; /* cap enforcement flag, dependent on */
- /* enforce_soft_caps and */
- /* global_scanner_running */
-static int enforce_soft_caps = 0; /* soft cap enforcement flag, */
- /* depending on memory pressure */
static int memory_pressure = 0; /* physical memory utilization (%) */
static int memory_pressure_sample = 0; /* count of samples */
-static int global_scanner_running = 0; /* global scanning flag, to avoid */
- /* interference with kernel's page */
- /* scanner */
+static long page_size_kb = 0; /* system page size in KB */
+static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */
+static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */
+static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */
static hrtime_t next_report; /* time of next report */
static int termination_signal = 0; /* terminating signal */
+static zoneid_t my_zoneid = (zoneid_t)-1; /* zone of this rcapd; set from getzoneid() in main() */
+static lcollection_t *gz_col; /* global zone collection */
rcfg_t rcfg;
+/*
+ * Updated when we re-read the collection configurations if this rcapd instance
+ * is running in the global zone and the global zone is capped.
+ */
+boolean_t gz_capped = B_FALSE;
/*
* Flags.
@@ -116,9 +129,9 @@ static int verify_statistics(void);
static int update_statistics(void);
/*
- * Checks if a process is marked 'system'. Returns zero only when it is not.
+ * Checks if a process is marked 'system'. Returns FALSE only when it is not.
*/
-static int
+static boolean_t
proc_issystem(pid_t pid)
{
char pc_clname[PC_CLNMSZ];
@@ -128,22 +141,43 @@ proc_issystem(pid_t pid)
return (strcmp(pc_clname, "SYS") == 0);
} else {
debug("cannot get class-specific scheduling parameters; "
- "assuming system process");
- return (-1);
+ "assuming system process\n");
+ return (B_TRUE);
}
}
-/*
- * fname is the process name, for debugging messages, and unscannable is a flag
- * indicating whether the process should be scanned.
- */
static void
-lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable)
+lprocess_insert_mark(psinfo_t *psinfop)
{
+ pid_t pid = psinfop->pr_pid;
+ /* flag indicating whether the process should be scanned. */
+ int unscannable = psinfop->pr_nlwp == 0;
+ rcid_t colid;
lcollection_t *lcol;
lprocess_t *lproc;
- if ((lcol = lcollection_find(colid)) == NULL)
+ /*
+ * Determine which collection to put this process into. We only have
+ * to worry about tracking both zone and project capped processes if
+ * this rcapd instance is running in the global zone, since we'll only
+ * see processes in our own projects in a non-global zone. In the
+ * global zone, if the process belongs to a non-global zone, we only
+ * need to track it for the capped non-global zone collection. For
+ * global zone processes, we first attempt to put the process into a
+ * capped project collection. On the second pass into this function
+ * the projid will be cleared so we will just track the process for the
+ * global zone collection as a whole.
+ */
+ if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
+ colid.rcid_type = RCIDT_PROJECT;
+ colid.rcid_val = psinfop->pr_projid;
+ } else {
+ /* try to add to zone collection */
+ colid.rcid_type = RCIDT_ZONE;
+ colid.rcid_val = psinfop->pr_zoneid;
+ }
+
+ if ((lcol = lcollection_find(&colid)) == NULL)
return;
/*
@@ -193,7 +227,8 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable)
if (lcollection_member(lcol, lproc)) {
lprocess_t *cur = lcol->lcol_lprocess;
debug("The collection %lld already has these members, "
- "including me, %d!\n", (long long)lcol->lcol_id,
+ "including me, %d!\n",
+ (long long)lcol->lcol_id.rcid_val,
(int)lproc->lpc_pid);
while (cur != NULL) {
debug("\t%d\n", (int)cur->lpc_pid);
@@ -209,7 +244,10 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable)
lproc->lpc_prev = NULL;
lcol->lcol_lprocess = lproc;
- debug("tracking %d %d %s%s\n", (int)colid, (int)pid, fname,
+ debug("tracking %s %ld %d %s%s\n",
+ (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
+ (long)colid.rcid_val,
+ (int)pid, psinfop->pr_psargs,
(lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
lcol->lcol_stat.lcols_proc_in++;
}
@@ -328,22 +366,28 @@ get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
}
/*
- * Retrieve the collection membership of all processes in our zone, and update
- * the psinfo of those non-system, non-zombie ones in collections.
+ * Retrieve the collection membership of all processes and update the psinfo of
+ * those non-system, non-zombie ones in collections. For global zone processes,
+ * we first attempt to put the process into a capped project collection. We
+ * also want to track the process for the global zone collection as a whole.
*/
static void
proc_cb(const pid_t pid)
{
- static zoneid_t ours = (zoneid_t)-1;
psinfo_t psinfo;
- if (ours == (zoneid_t)-1)
- ours = getzoneid();
-
- if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0 &&
- psinfo.pr_zoneid == ours)
- lprocess_insert_mark(psinfo.pr_pid, rc_getidbypsinfo(&psinfo),
- psinfo.pr_psargs, psinfo.pr_nlwp == 0);
+ if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
+ lprocess_insert_mark(&psinfo);
+ if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
+ /*
+ * We also want to track this process for the global
+ * zone as a whole so add it to the global zone
+ * collection as well.
+ *
+ * Clearing pr_projid in our local copy of the psinfo
+ * makes the second lprocess_insert_mark() call pick
+ * the zone collection rather than a project one.
+ */
+ psinfo.pr_projid = -1;
+ lprocess_insert_mark(&psinfo);
+ }
+ }
}
/*
@@ -359,57 +403,149 @@ lprocess_update_psinfo_fd_cb(void *arg, int fd)
}
/*
- * Update the RSS of processes in monitored collections.
+ * Get the system pagesize.
*/
-/*ARGSUSED*/
-static int
-mem_sample_cb(lcollection_t *lcol, lprocess_t *lpc)
+static void
+get_page_size(void)
{
- psinfo_t psinfo;
+ page_size_kb = sysconf(_SC_PAGESIZE) / 1024; /* sysconf() reports bytes; cache the size in KB */
+ debug("physical page size: %luKB\n", page_size_kb);
+}
+
+/*
+ * Emit a debug message, prefixed with msg, giving the time that elapsed
+ * between the timestamps t1 and t2.  The interval is printed in nanoseconds,
+ * microseconds, milliseconds or seconds, depending on its magnitude.
+ */
+static void
+tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
+{
+	hrtime_t elapsed = t2 - t1;
+
+	if (elapsed < MILLISEC) {
+		debug("%s: %lld nanoseconds\n", msg, elapsed);
+		return;
+	}
+
+	if (elapsed < MICROSEC) {
+		debug("%s: %.2f microseconds\n", msg,
+		    (float)elapsed / MILLISEC);
+		return;
+	}
+
+	if (elapsed < NANOSEC) {
+		debug("%s: %.2f milliseconds\n", msg,
+		    (float)elapsed / MICROSEC);
+		return;
+	}
+
+	debug("%s: %.2f seconds\n", msg, (float)elapsed / NANOSEC);
+}
+
+/*
+ * Get the zone's & project's RSS from the kernel.
+ *
+ * When my_zone_only is set, only VMUSAGE_ZONE data is requested and col_types
+ * is not consulted.  Otherwise col_types selects what is requested:
+ * CAPPED_PROJECT asks for project data and CAPPED_ZONE, when this rcapd runs
+ * in the global zone, asks for data on all zones.  If nothing ends up being
+ * requested the function returns without calling getvmusage().
+ *
+ * Results are left in vmu_vals, with nvmu_vals giving the number of valid
+ * entries.  Whenever the kernel reports more entries than vmu_vals_len, the
+ * array is reallocated to that size and the call is retried.  On a
+ * getvmusage() failure other than EOVERFLOW the function returns early and
+ * nvmu_vals is left as it was; on allocation failure both counts are zeroed.
+ */
+static void
+rss_sample(boolean_t my_zone_only, uint_t col_types)
+{
+ size_t nres;
+ size_t i;
+ uint_t flags;
+ hrtime_t t1, t2;
- if (get_psinfo(lpc->lpc_pid, &psinfo, lpc->lpc_psinfo_fd,
- lprocess_update_psinfo_fd_cb, lpc, lpc) == 0) {
- lpc->lpc_rss = psinfo.pr_rssize;
- lpc->lpc_size = psinfo.pr_size;
+ if (my_zone_only) {
+ flags = VMUSAGE_ZONE;
} else {
- if (errno == ENOENT)
- debug("process %d finished\n", (int)lpc->lpc_pid);
- else
- debug("process %d: cannot read psinfo",
- (int)lpc->lpc_pid);
- lprocess_free(lpc);
+ flags = 0;
+ if (col_types & CAPPED_PROJECT)
+ flags |= VMUSAGE_PROJECTS;
+ if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
+ flags |= VMUSAGE_ALL_ZONES;
}
- return (0);
+ debug("vmusage sample flags 0x%x\n", flags);
+ if (flags == 0)
+ return;
+
+again:
+ /* try the current buffer to see if the list will fit */
+ nres = vmu_vals_len;
+ t1 = gethrtime();
+ if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
+ vmu_vals, &nres) != 0) {
+ if (errno != EOVERFLOW) {
+ warn(gettext("can't read RSS from kernel\n"));
+ return;
+ }
+ }
+ t2 = gethrtime();
+ tm_fmt("getvmusage time", t1, t2);
+
+ debug("kernel nres %lu\n", (ulong_t)nres);
+
+ if (nres > vmu_vals_len) {
+ /* array size is now too small, increase it and try again */
+ free(vmu_vals);
+
+ if ((vmu_vals = (vmusage_t *)calloc(nres,
+ sizeof (vmusage_t))) == NULL) {
+ warn(gettext("out of memory: could not read RSS from "
+ "kernel\n"));
+ vmu_vals_len = nvmu_vals = 0;
+ return;
+ }
+ vmu_vals_len = nres;
+ goto again;
+ }
+
+ nvmu_vals = nres;
+
+ debug("vmusage_sample\n");
+ for (i = 0; i < nvmu_vals; i++) {
+ debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
+ "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
+ vmu_vals[i].vmu_type,
+ (unsigned long long)vmu_vals[i].vmu_rss_all,
+ (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
+ (unsigned long long)vmu_vals[i].vmu_swap_all);
+ }
+}
+
+/*
+ * Load a collection's RSS and image size from the latest vmu_vals sample,
+ * each divided by 1024.  The first entry whose id matches the collection and
+ * whose type does not contradict the collection's type is used; if there is
+ * no such entry both values are left at zero.
+ */
+static void
+update_col_rss(lcollection_t *lcol)
+{
+	size_t idx;
+
+	lcol->lcol_rss = 0;
+	lcol->lcol_image_size = 0;
+
+	for (idx = 0; idx < nvmu_vals; idx++) {
+		vmusage_t *vmu = &vmu_vals[idx];
+
+		if (vmu->vmu_id != lcol->lcol_id.rcid_val)
+			continue;
+
+		/* a zone entry only applies to a zone collection ... */
+		if (vmu->vmu_type == VMUSAGE_ZONE &&
+		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
+			continue;
+
+		/* ... and a project entry only to a project collection */
+		if (vmu->vmu_type == VMUSAGE_PROJECTS &&
+		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
+			continue;
+
+		lcol->lcol_rss = vmu->vmu_rss_all / 1024;
+		lcol->lcol_image_size = vmu->vmu_swap_all / 1024;
+		break;
+	}
}
/*
* Sample the collection RSS, updating the collection's statistics with the
- * results.
+ * results. Also, note in the sample_col_arg_t passed as arg whether any
+ * collection, and whether any project collection, is over its cap.
*/
-/*ARGSUSED*/
static int
rss_sample_col_cb(lcollection_t *lcol, void *arg)
{
int64_t excess;
uint64_t rss;
+ sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
- /*
- * If updating statistics for a new interval, reset the affected
- * counters.
- */
- if (lcol->lcol_stat_invalidate != 0) {
- lcol->lcol_stat_old = lcol->lcol_stat;
- lcol->lcol_stat.lcols_min_rss = (int64_t)-1;
- lcol->lcol_stat.lcols_max_rss = 0;
- lcol->lcol_stat_invalidate = 0;
- }
+ update_col_rss(lcol);
lcol->lcol_stat.lcols_rss_sample++;
- excess = lcol->lcol_rss - lcol->lcol_rss_cap;
rss = lcol->lcol_rss;
- if (excess > 0)
+ excess = rss - lcol->lcol_rss_cap;
+ if (excess > 0) {
lcol->lcol_stat.lcols_rss_act_sum += rss;
+ col_argp->sca_any_over_cap = B_TRUE;
+ if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
+ col_argp->sca_project_over_cap = B_TRUE;
+ }
lcol->lcol_stat.lcols_rss_sum += rss;
if (lcol->lcol_stat.lcols_min_rss > rss)
@@ -421,6 +557,30 @@ rss_sample_col_cb(lcollection_t *lcol, void *arg)
}
/*
+ * Determine if we have capped projects, capped zones or both.
+ */
+static int
+col_type_cb(lcollection_t *lcol, void *arg)
+{
+	/* arg points at the caller's CAPPED_PROJECT/CAPPED_ZONE bit mask */
+	uint_t *col_type = (uint_t *)arg;
+
+	/*
+	 * Skip uncapped collections.  Return zero here: a non-zero return is
+	 * what this callback uses below to stop looking, so returning it for
+	 * an uncapped collection would end the search before any capped
+	 * collections that follow it had been examined.
+	 */
+	if (lcol->lcol_rss_cap == 0)
+		return (0);
+
+	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
+		*col_type |= CAPPED_PROJECT;
+	else
+		*col_type |= CAPPED_ZONE;
+
+	/* once we know everything is capped, we can stop looking */
+	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
+		return (1);
+
+	return (0);
+}
+
+/*
* Open /proc and walk entries.
*/
static void
@@ -449,23 +609,6 @@ proc_walk_all(void (*cb)(const pid_t))
}
/*
- * Memory update callback.
- */
-static int
-memory_all_cb(lcollection_t *lcol, lprocess_t *lpc)
-{
- debug_high("%s %s, pid %d: rss += %llu/%llu\n", rcfg.rcfg_mode_name,
- lcol->lcol_name, (int)lpc->lpc_pid,
- (unsigned long long)lpc->lpc_rss,
- (unsigned long long)lpc->lpc_size);
- ASSERT(lpc->lpc_rss <= lpc->lpc_size);
- lcol->lcol_rss += lpc->lpc_rss;
- lcol->lcol_image_size += lpc->lpc_size;
-
- return (0);
-}
-
-/*
* Clear unmarked callback.
*/
/*ARGSUSED*/
@@ -483,19 +626,6 @@ sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
}
/*
- * Memory clear callback.
- */
-/*ARGSUSED*/
-static int
-collection_zero_mem_cb(lcollection_t *lcol, void *arg)
-{
- lcol->lcol_rss = 0;
- lcol->lcol_image_size = 0;
-
- return (0);
-}
-
-/*
* Print, for debugging purposes, a collection's recently-sampled RSS and
* excess.
*/
@@ -506,7 +636,8 @@ excess_print_cb(lcollection_t *lcol, void *arg)
int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
- rcfg.rcfg_mode_name, lcol->lcol_name,
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
+ lcol->lcol_name,
(unsigned long long)lcol->lcol_rss,
(unsigned long long)lcol->lcol_rss_cap,
(long long)excess);
@@ -516,6 +647,10 @@ excess_print_cb(lcollection_t *lcol, void *arg)
/*
* Scan those collections which have exceeded their caps.
+ *
+ * If we're running in the global zone it might have a cap. We don't want to
+ * do any capping for the global zone yet since we might get under the cap by
+ * just capping the projects in the global zone.
*/
/*ARGSUSED*/
static int
@@ -523,6 +658,13 @@ scan_cb(lcollection_t *lcol, void *arg)
{
int64_t excess;
+ /* skip over global zone collection for now but keep track for later */
+ if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
+ lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
+ gz_col = lcol;
+ return (0);
+ }
+
if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
scan(lcol, excess);
lcol->lcol_stat.lcols_scan++;
@@ -532,6 +674,37 @@ scan_cb(lcollection_t *lcol, void *arg)
}
/*
+ * Scan the global zone collection and see if it still exceeds its cap.
+ * We take into account the effects of capping any global zone projects here.
+ */
+static void
+scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
+{
+	int64_t over = lcol->lcol_rss - lcol->lcol_rss_cap;
+
+	/*
+	 * The rss we hold for the global zone predates any scanning of
+	 * capped projects.  If projects were over cap and the global zone
+	 * was too, scanning those projects may already have brought the
+	 * zone back under its cap, so fetch a fresh rss before deciding.
+	 * When no project was over cap the value we have is still good.
+	 */
+	if (project_over_cap && over > 0) {
+		rss_sample(B_TRUE, CAPPED_ZONE);
+		update_col_rss(lcol);
+		over = lcol->lcol_rss - lcol->lcol_rss_cap;
+	}
+
+	if (over <= 0)
+		return;
+
+	debug("global zone excess %lldKB\n", (long long)over);
+	scan(lcol, over);
+	lcol->lcol_stat.lcols_scan++;
+}
+
+/*
* Do a soft scan of those collections which have excesses. A soft scan is one
* in which the cap enforcement pressure is taken into account. The difference
* between the utilized physical memory and the cap enforcement pressure will
@@ -544,22 +717,72 @@ soft_scan_cb(lcollection_t *lcol, void *a)
int64_t excess;
soft_scan_arg_t *arg = a;
+ /* skip over global zone collection for now but keep track for later */
+ if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
+ lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
+ gz_col = lcol;
+ return (0);
+ }
+
if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
- debug("col %lld excess %lld scan_goal %lld sum_excess %llu, "
- "scanning %lld\n", (long long)lcol->lcol_id,
+ int64_t adjusted_excess =
+ excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
+
+ debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
+ "scanning %lld\n",
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
+ "project" : "zone"),
+ (long)lcol->lcol_id.rcid_val,
(long long)excess, (long long)arg->ssa_scan_goal,
(unsigned long long)arg->ssa_sum_excess,
- (long long)(excess * arg->ssa_scan_goal /
- arg->ssa_sum_excess));
+ (long long)adjusted_excess);
- scan(lcol, (int64_t)(excess * arg->ssa_scan_goal /
- arg->ssa_sum_excess));
+ scan(lcol, adjusted_excess);
lcol->lcol_stat.lcols_scan++;
}
return (0);
}
+/*
+ * Soft-scan counterpart of scan_gz(): scan the global zone collection for
+ * its share of the scan goal if it is (still) over its cap.  a points to the
+ * soft_scan_arg_t describing the scan goal and the sum of excesses.
+ */
+static void
+soft_scan_gz(lcollection_t *lcol, void *a)
+{
+	soft_scan_arg_t *ssa = a;
+	int64_t over = lcol->lcol_rss - lcol->lcol_rss_cap;
+	int64_t share;
+
+	/*
+	 * The rss we hold for the global zone predates any scanning of
+	 * capped projects.  If projects were over cap and the global zone
+	 * was too, scanning those projects may already have brought the
+	 * zone back under its cap, so fetch a fresh rss before deciding.
+	 * When no project was over cap the value we have is still good.
+	 */
+	if (ssa->ssa_project_over_cap && over > 0) {
+		rss_sample(B_TRUE, CAPPED_ZONE);
+		update_col_rss(lcol);
+		over = lcol->lcol_rss - lcol->lcol_rss_cap;
+	}
+
+	if (over <= 0)
+		return;
+
+	/* scale the excess by the ratio of scan goal to summed excess */
+	share = over * ssa->ssa_scan_goal / ssa->ssa_sum_excess;
+
+	debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
+	    "scanning %lld\n",
+	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
+	    "project" : "zone"),
+	    (long)lcol->lcol_id.rcid_val,
+	    (long long)over, (long long)ssa->ssa_scan_goal,
+	    (unsigned long long)ssa->ssa_sum_excess,
+	    (long long)share);
+
+	scan(lcol, share);
+	lcol->lcol_stat.lcols_scan++;
+}
+
/*
* When a scan could happen, but caps aren't enforced tick the
* lcols_unenforced_cap counter.
@@ -582,8 +805,7 @@ update_phys_total(void)
uint64_t old_phys_total;
old_phys_total = phys_total;
- phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE)
- / 1024;
+ phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
if (phys_total != old_phys_total)
debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
"" : " adjusted"), (unsigned long long)(phys_total / 1024));
@@ -687,7 +909,9 @@ static int
collection_sweep_cb(lcollection_t *lcol, void *arg)
{
if (lcol->lcol_mark == 0) {
- debug("freeing %s %s\n", rcfg.rcfg_mode_name, lcol->lcol_name);
+ debug("freeing %s %s\n",
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
+ "project" : "zone"), lcol->lcol_name);
lcollection_free(lcol);
}
@@ -710,8 +934,6 @@ finish_configuration(void)
rcfg.rcfg_mode_name = "project";
rcfg.rcfg_mode = rctype_project;
}
-
- lcollection_set_type(rcfg.rcfg_mode);
}
/*
@@ -754,7 +976,8 @@ reread_configuration_file(void)
* deletions to cap definitions.
*/
static void
-reconfigure(void)
+reconfigure(hrtime_t now, hrtime_t *next_configuration,
+ hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
{
debug("reconfigure...\n");
@@ -770,6 +993,31 @@ reconfigure(void)
list_walk_collection(collection_clear_cb, NULL);
lcollection_update(LCU_ACTIVE_ONLY); /* mark */
list_walk_collection(collection_sweep_cb, NULL);
+
+ *next_configuration = NEXT_EVENT_TIME(now,
+ rcfg.rcfg_reconfiguration_interval);
+
+ /*
+ * Reset each event time to the shorter of the previous and new
+ * intervals.
+ */
+ if (next_report == 0 && rcfg.rcfg_report_interval > 0)
+ next_report = now;
+ else
+ next_report = POSITIVE_MIN(next_report,
+ NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
+
+ if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
+ *next_proc_walk = now;
+ else
+ *next_proc_walk = POSITIVE_MIN(*next_proc_walk,
+ NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
+
+ if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
+ *next_rss_sample = now;
+ else
+ *next_rss_sample = POSITIVE_MIN(*next_rss_sample,
+ NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
}
/*
@@ -791,20 +1039,20 @@ static int
simple_report_collection_cb(lcollection_t *lcol, void *arg)
{
#define DELTA(field) \
- (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
+ (unsigned long long)( \
(lcol->lcol_stat.field - lcol->lcol_stat_old.field))
-#define VALID(field) \
- (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
- lcol->lcol_stat.field)
debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
"ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS "
"min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
- "%llu scans over %llu ms\n", rcfg.rcfg_mode_name, lcol->lcol_name,
+ "%llu scans over %llu ms\n",
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
+ lcol->lcol_name,
DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
- VALID(lcols_min_rss), VALID(lcols_max_rss),
+ (unsigned long long)lcol->lcol_stat.lcols_min_rss,
+ (unsigned long long)lcol->lcol_stat.lcols_max_rss,
(unsigned long long)lcol->lcol_rss_cap,
(unsigned long long)(lcol->lcol_stat.lcols_proc_in -
lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
@@ -812,7 +1060,6 @@ simple_report_collection_cb(lcollection_t *lcol, void *arg)
/ MILLISEC));
#undef DELTA
-#undef VALID
return (0);
}
@@ -838,13 +1085,11 @@ report_collection_cb(lcollection_t *lcol, void *arg)
dc.lcol_stat = lcol->lcol_stat;
if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
- /*
- * Set a flag to indicate that the exported interval snapshot
- * values should be reset at the next sample.
- */
- lcol->lcol_stat_invalidate = 1;
+ lcol->lcol_stat_old = lcol->lcol_stat;
} else {
- debug("can't write %s %s statistics", rcfg.rcfg_mode_name,
+ debug("can't write %s %s statistics",
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
+ "project" : "zone"),
lcol->lcol_name);
}
@@ -871,8 +1116,9 @@ get_globally_scanned_pages(uint64_t *scannedp)
if (kstat_read(kctl, ksp, NULL) != -1) {
scanned += ((cpu_stat_t *)
ksp->ks_data)->cpu_vminfo.scan;
- } else
+ } else {
return (-1);
+ }
}
}
@@ -881,6 +1127,59 @@ get_globally_scanned_pages(uint64_t *scannedp)
}
/*
+ * Determine if the global page scanner is running, during which no memory
+ * caps should be enforced, to prevent interference with the global page
+ * scanner.
+ */
+static boolean_t
+is_global_scanner_running()
+{
+	/* page scan counts from this call and the previous successful one */
+	static uint64_t cur_scanned = 0;
+	static uint64_t prev_scanned = 0;
+	uint64_t delta;
+	boolean_t running;
+
+	if (get_globally_scanned_pages(&cur_scanned) != 0) {
+		warn(gettext("unable to read cpu statistics"));
+		cur_scanned = prev_scanned;
+		return (B_FALSE);
+	}
+
+	/* the first successful reading only establishes a baseline */
+	delta = cur_scanned - prev_scanned;
+	running = (prev_scanned != 0 && delta > 0) ? B_TRUE : B_FALSE;
+	if (running) {
+		debug("global memory pressure detected (%llu "
+		    "pages scanned since last interval)\n",
+		    (unsigned long long)delta);
+	}
+	prev_scanned = cur_scanned;
+
+	return (running);
+}
+
+/*
+ * If soft caps are in use, determine if global memory pressure exceeds the
+ * configured maximum above which soft caps are enforced.
+ */
+static boolean_t
+must_enforce_soft_caps()
+{
+	long avail_kb;
+
+	/*
+	 * The amount of installed physical memory may have changed, so
+	 * refresh it before computing the current memory pressure.
+	 */
+	update_phys_total();
+
+	avail_kb = sysconf(_SC_AVPHYS_PAGES) * page_size_kb;
+	memory_pressure = 100 - (int)(avail_kb * 100.0 / phys_total);
+	memory_pressure_sample++;
+
+	/* a configured pressure of zero means soft caps are not in use */
+	return ((rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
+	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) ?
+	    B_TRUE : B_FALSE);
+}
+
+/*
* Update the shared statistics file with each collection's current statistics.
* Return zero on success.
*/
@@ -973,6 +1272,26 @@ sum_excess_cb(lcollection_t *lcol, void *arg)
return (0);
}
+/*
+ * Compute the quantity of memory (in kilobytes) above the cap enforcement
+ * pressure. Set the scan goal to that quantity (or at most the excess).
+ *
+ * Both ssa_sum_excess and ssa_scan_goal of *argp are overwritten; the other
+ * members are left alone.
+ */
+static void
+compute_soft_scan_goal(soft_scan_arg_t *argp)
+{
+ /*
+ * Compute the sum of the collections' excesses, which will be the
+ * denominator.
+ */
+ argp->ssa_sum_excess = 0;
+ list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
+
+ argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * /* page counts, scaled to KB by page_size_kb below */
+ (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
+ sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
+ argp->ssa_sum_excess);
+}
+
static void
rcapd_usage(void)
{
@@ -1017,6 +1336,112 @@ verify_and_set_privileges(void)
priv_freeset(required);
}
+/*
+ * This function does the top-level work to determine if we should do any
+ * memory capping, and if so, it invokes the right call-backs to do the work.
+ *
+ * now is the current time.  *next_proc_walk is the time at which the next
+ * /proc walk is due; it is only advanced when a walk is actually performed
+ * here, which happens only when caps are about to be enforced.
+ *
+ * Returns immediately, without sampling, when no collection has a cap.
+ */
+static void
+do_capping(hrtime_t now, hrtime_t *next_proc_walk)
+{
+ boolean_t enforce_caps;
+ /* soft cap enforcement flag, depending on memory pressure */
+ boolean_t enforce_soft_caps;
+ /* avoid interference with kernel's page scanner */
+ boolean_t global_scanner_running;
+ sample_col_arg_t col_arg;
+ soft_scan_arg_t arg;
+ uint_t col_types = 0;
+
+ /* check what kind of collections (project/zone) are capped */
+ list_walk_collection(col_type_cb, &col_types);
+ debug("collection types: 0x%x\n", col_types);
+
+ /* no capped collections, skip checking rss */
+ if (col_types == 0)
+ return;
+
+ /* Determine if soft caps are enforced. */
+ enforce_soft_caps = must_enforce_soft_caps();
+
+ /* Determine if the global page scanner is running. */
+ global_scanner_running = is_global_scanner_running();
+
+ /*
+ * Sample collections' member processes RSSes and recompute
+ * collections' excess.
+ */
+ rss_sample(B_FALSE, col_types);
+
+ col_arg.sca_any_over_cap = B_FALSE;
+ col_arg.sca_project_over_cap = B_FALSE;
+ list_walk_collection(rss_sample_col_cb, &col_arg);
+ list_walk_collection(excess_print_cb, NULL);
+ debug("any collection/project over cap = %d, %d\n",
+ col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
+
+ if (enforce_soft_caps)
+ debug("memory pressure %d%%\n", memory_pressure);
+
+ /*
+ * Cap enforcement is determined by the previous conditions.
+ * Caps are enforced only while the global scanner is idle, something
+ * is over its cap, and either no enforcement pressure is configured
+ * or the configured pressure has been exceeded.
+ */
+ enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
+ (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
+ enforce_soft_caps);
+
+ debug("%senforcing caps\n", enforce_caps ? "" : "not ");
+
+ /*
+ * If soft caps are in use, determine the size of the portion from each
+ * collection to scan for.
+ */
+ if (enforce_caps && enforce_soft_caps)
+ compute_soft_scan_goal(&arg);
+
+ /*
+ * Victimize offending collections.
+ *
+ * arg is only filled in when both enforce_caps and enforce_soft_caps
+ * are set; the short-circuit on !enforce_soft_caps below keeps it from
+ * being read in any other case.
+ */
+ if (enforce_caps && (!enforce_soft_caps ||
+ (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
+
+ /*
+ * Since at least one collection is over its cap & needs
+ * enforcing, check if it is at least time for a process walk
+ * (we could be well past time since we only walk /proc when
+ * we need to) and if so, update each collections process list
+ * in a single pass through /proc.
+ */
+ if (EVENT_TIME(now, *next_proc_walk)) {
+ debug("scanning process list...\n");
+ proc_walk_all(proc_cb); /* insert & mark */
+ list_walk_all(sweep_process_cb); /* free dead procs */
+ *next_proc_walk = NEXT_EVENT_TIME(now,
+ rcfg.rcfg_proc_walk_interval);
+ }
+
+ gz_col = NULL; /* set again by the scan callbacks if the global zone collection is seen */
+ if (enforce_soft_caps) {
+ debug("scan goal is %lldKB\n",
+ (long long)arg.ssa_scan_goal);
+ list_walk_collection(soft_scan_cb, &arg);
+ if (gz_capped && gz_col != NULL) {
+ /* process global zone */
+ arg.ssa_project_over_cap =
+ col_arg.sca_project_over_cap;
+ soft_scan_gz(gz_col, &arg);
+ }
+ } else {
+ list_walk_collection(scan_cb, NULL);
+ if (gz_capped && gz_col != NULL) {
+ /* process global zone */
+ scan_gz(gz_col, col_arg.sca_project_over_cap);
+ }
+ }
+ } else if (col_arg.sca_any_over_cap) {
+ list_walk_collection(unenforced_cap_cb, NULL);
+ }
+}
+
int
main(int argc, char *argv[])
{
@@ -1029,9 +1454,6 @@ main(int argc, char *argv[])
hrtime_t next_proc_walk; /* time of next /proc scan */
hrtime_t next_configuration; /* time of next configuration */
hrtime_t next_rss_sample; /* (latest) time of next RSS sample */
- int old_enforce_caps; /* track changes in enforcement */
- /* conditions */
- soft_scan_arg_t arg;
(void) set_message_priority(RCM_INFO);
(void) setprogname("rcapd");
@@ -1125,13 +1547,6 @@ main(int argc, char *argv[])
next_configuration = NEXT_EVENT_TIME(gethrtime(),
rcfg.rcfg_reconfiguration_interval);
- if (rcfg.rcfg_memory_cap_enforcement_pressure == 0) {
- /*
- * Always enforce caps when strict caps are used.
- */
- enforce_caps = 1;
- }
-
/*
* Open the kstat chain.
*/
@@ -1158,6 +1573,9 @@ main(int argc, char *argv[])
else
debug("fd limit: unknown\n");
+ get_page_size();
+ my_zoneid = getzoneid();
+
/*
* Handle those signals whose (default) exit disposition
* prevents rcapd from finishing scanning before terminating.
@@ -1194,9 +1612,9 @@ main(int argc, char *argv[])
/*
* Loop forever, monitoring collections' resident set sizes and
- * enforcing their caps. Look for changes in caps and process
- * membership, as well as responding to requests to reread the
- * configuration. Update per-collection statistics periodically.
+ * enforcing their caps. Look for changes in caps as well as
+ * responding to requests to reread the configuration. Update
+ * per-collection statistics periodically.
*/
while (should_run != 0) {
struct timespec ts;
@@ -1210,9 +1628,10 @@ main(int argc, char *argv[])
}
/*
- * Update the process list once every proc_walk_interval. The
- * condition of global memory pressure is also checked at the
- * same frequency, if strict caps are in use.
+ * Check the configuration at every next_configuration interval.
+ * Update the rss data once every next_rss_sample interval.
+ * The condition of global memory pressure is also checked at
+ * the same frequency, if strict caps are in use.
*/
now = gethrtime();
@@ -1222,178 +1641,16 @@ main(int argc, char *argv[])
*/
if (EVENT_TIME(now, next_configuration) ||
should_reconfigure == 1) {
- reconfigure();
- next_configuration = NEXT_EVENT_TIME(now,
- rcfg.rcfg_reconfiguration_interval);
-
- /*
- * Reset each event time to the shorter of the
- * previous and new intervals.
- */
- if (next_report == 0 &&
- rcfg.rcfg_report_interval > 0)
- next_report = now;
- else
- next_report = POSITIVE_MIN(next_report,
- NEXT_REPORT_EVENT_TIME(now,
- rcfg.rcfg_report_interval));
- if (next_proc_walk == 0 &&
- rcfg.rcfg_proc_walk_interval > 0)
- next_proc_walk = now;
- else
- next_proc_walk = POSITIVE_MIN(next_proc_walk,
- NEXT_EVENT_TIME(now,
- rcfg.rcfg_proc_walk_interval));
- if (next_rss_sample == 0 &&
- rcfg.rcfg_rss_sample_interval > 0)
- next_rss_sample = now;
- else
- next_rss_sample = POSITIVE_MIN(next_rss_sample,
- NEXT_EVENT_TIME(now,
- rcfg.rcfg_rss_sample_interval));
-
+ reconfigure(now, &next_configuration, &next_proc_walk,
+ &next_rss_sample);
should_reconfigure = 0;
- continue;
- }
-
- if (EVENT_TIME(now, next_proc_walk)) {
- debug("scanning process list...\n");
- proc_walk_all(proc_cb); /* mark */
- list_walk_all(sweep_process_cb);
- next_proc_walk = NEXT_EVENT_TIME(now,
- rcfg.rcfg_proc_walk_interval);
}
+ /*
+ * Do the main work for enforcing caps.
+ */
if (EVENT_TIME(now, next_rss_sample)) {
- /*
- * Check for changes to the amount of installed
- * physical memory, to compute the current memory
- * pressure.
- */
- update_phys_total();
-
- /*
- * If soft caps are in use, determine if global memory
- * pressure exceeds the configured maximum above which
- * soft caps are enforced.
- */
- memory_pressure = 100 -
- (int)((sysconf(_SC_AVPHYS_PAGES) *
- (sysconf(_SC_PAGESIZE) / 1024)) * 100.0 /
- phys_total);
- memory_pressure_sample++;
- if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) {
- if (memory_pressure >
- rcfg.rcfg_memory_cap_enforcement_pressure) {
- if (enforce_soft_caps == 0) {
- debug("memory pressure %d%%\n",
- memory_pressure);
- enforce_soft_caps = 1;
- }
- } else {
- if (enforce_soft_caps == 1)
- enforce_soft_caps = 0;
- }
- }
-
- /*
- * Determine if the global page scanner is running,
- * while which no memory caps should be enforced, to
- * prevent interference with the global page scanner.
- */
- if (get_globally_scanned_pages(&new_sp) == 0) {
- if (old_sp == 0)
- /*EMPTY*/
- ;
- else if ((new_sp - old_sp) > 0) {
- if (global_scanner_running == 0) {
- debug("global memory pressure "
- "detected (%llu pages "
- "scanned since last "
- "interval)\n",
- (unsigned long long)
- (new_sp - old_sp));
- global_scanner_running = 1;
- }
- } else if (global_scanner_running == 1) {
- debug("global memory pressure "
- "relieved\n");
- global_scanner_running = 0;
- }
- old_sp = new_sp;
- } else {
- warn(gettext("kstat_read() failed"));
- new_sp = old_sp;
- }
-
- /*
- * Cap enforcement is determined by the previous two
- * conditions.
- */
- old_enforce_caps = enforce_caps;
- enforce_caps =
- (rcfg.rcfg_memory_cap_enforcement_pressure ==
- 0 || enforce_soft_caps == 1) &&
- !global_scanner_running;
- if (old_enforce_caps != enforce_caps)
- debug("%senforcing caps\n", enforce_caps == 0 ?
- "not " : "");
-
- /*
- * Sample collections' member processes' RSSes and
- * recompute collections' excess.
- */
- list_walk_all(mem_sample_cb);
- list_walk_collection(collection_zero_mem_cb, NULL);
- list_walk_all(memory_all_cb);
- list_walk_collection(rss_sample_col_cb, NULL);
- if (rcfg.rcfg_memory_cap_enforcement_pressure > 0)
- debug("memory pressure %d%%\n",
- memory_pressure);
- list_walk_collection(excess_print_cb, NULL);
-
- /*
- * If soft caps are in use, determine the size of the
- * portion from each collection to scan for.
- */
- if (enforce_soft_caps == 1) {
- /*
- * Compute the sum of the collections'
- * excesses, which will be the denominator.
- */
- arg.ssa_sum_excess = 0;
- list_walk_collection(sum_excess_cb,
- &arg.ssa_sum_excess);
-
- /*
- * Compute the quantity of memory (in
- * kilobytes) above the cap enforcement
- * pressure. Set the scan goal to that
- * quantity (or at most the excess).
- */
- arg.ssa_scan_goal = MIN((
- sysconf(_SC_PHYS_PAGES) * (100 -
- rcfg.rcfg_memory_cap_enforcement_pressure)
- / 100 - sysconf(_SC_AVPHYS_PAGES)) *
- (sysconf(_SC_PAGESIZE) / 1024),
- arg.ssa_sum_excess);
- }
-
- /*
- * Victimize offending collections.
- */
- if (enforce_caps == 1 && ((enforce_soft_caps == 1 &&
- arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0) ||
- (enforce_soft_caps == 0)))
- if (enforce_soft_caps == 1) {
- debug("scan goal is %lldKB\n",
- (long long)arg.ssa_scan_goal);
- list_walk_collection(soft_scan_cb,
- &arg);
- } else
- list_walk_collection(scan_cb, NULL);
- else
- list_walk_collection(unenforced_cap_cb, NULL);
+ do_capping(now, &next_proc_walk);
next_rss_sample = NEXT_EVENT_TIME(now,
rcfg.rcfg_rss_sample_interval);
@@ -1409,7 +1666,6 @@ main(int argc, char *argv[])
*/
now = gethrtime();
next = next_configuration;
- next = POSITIVE_MIN(next, next_proc_walk);
next = POSITIVE_MIN(next, next_report);
next = POSITIVE_MIN(next, next_rss_sample);
if (next > now && should_run != 0) {
diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
index 15c503d1b4..b39811b552 100644
--- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -104,7 +103,8 @@ st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
buf = malloc(len);
if (buf == NULL)
return;
- (void) snprintf(buf, len, "%s %s scanner %s", rcfg.rcfg_mode_name,
+ (void) snprintf(buf, len, "%s %s scanner %s",
+ (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
lcol->lcol_name, msg);
va_start(alist, msg);
@@ -471,6 +471,7 @@ merge_current_pagedata(lprocess_t *lpc,
{
prpageheader_t *pghp;
int mappings_changed = 0;
+ uint64_t cnt;
if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
0) {
@@ -485,9 +486,12 @@ merge_current_pagedata(lprocess_t *lpc,
debug("starting/resuming pagedata collection for %d\n",
(int)lpc->lpc_pid);
}
- debug("process %d: %llu/%llukB r/m'd since last read\n",
- (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0,
- PG_MODIFIED | PG_REFERENCED, 0), (unsigned long long)lpc->lpc_rss);
+
+ cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
+ if (cnt != 0 || lpc->lpc_rss != 0)
+ debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
+ (int)lpc->lpc_pid, (unsigned long long)cnt,
+ (unsigned long long)lpc->lpc_rss);
if (lpc->lpc_prpageheader != NULL) {
/*
* OR the two snapshots.
@@ -519,10 +523,12 @@ merge_current_pagedata(lprocess_t *lpc,
} else
mappings_changed = 1;
lpc->lpc_prpageheader = pghp;
- debug("process %d: %llu/%llukB r/m'd since hand swept\n",
- (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0,
- PG_MODIFIED | PG_REFERENCED, 0),
- (unsigned long long)lpc->lpc_rss);
+
+ cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
+ if (cnt != 0 || lpc->lpc_rss != 0)
+ debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
+ (int)lpc->lpc_pid, (unsigned long long)cnt,
+ (unsigned long long)lpc->lpc_rss);
if (mappings_changed != 0) {
debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
if (mappings_changed_cb != NULL)
@@ -589,7 +595,6 @@ rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
static void
unignore_mappings(lprocess_t *lpc)
{
- debug("clearing ignored set\n");
lmapping_free(&lpc->lpc_ignore);
}
diff --git a/usr/src/cmd/rcap/rcapstat/Makefile b/usr/src/cmd/rcap/rcapstat/Makefile
index 47b9bcfb71..fb436f5684 100644
--- a/usr/src/cmd/rcap/rcapstat/Makefile
+++ b/usr/src/cmd/rcap/rcapstat/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
@@ -39,7 +38,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \
$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG
CPPFLAGS += -I$(COMMON_DIR)
-LDLIBS += -lumem -ll
+LDLIBS += -lumem -ll -lzonecfg
LINTFLAGS += $(LDLIBS) -mnu
diff --git a/usr/src/cmd/rcap/rcapstat/rcapstat.c b/usr/src/cmd/rcap/rcapstat/rcapstat.c
index 722502d05d..47eca3f2fa 100644
--- a/usr/src/cmd/rcap/rcapstat/rcapstat.c
+++ b/usr/src/cmd/rcap/rcapstat/rcapstat.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -77,7 +76,8 @@ col_find(rcid_t id)
{
col_t *col;
for (col = col_head; col != NULL; col = col->col_next)
- if (col->col_id == id)
+ if (col->col_id.rcid_type == id.rcid_type &&
+ col->col_id.rcid_val == id.rcid_val)
return (col);
return (NULL);
}
@@ -119,7 +119,7 @@ static void
usage()
{
(void) fprintf(stderr,
- gettext("usage: rcapstat [-g] [interval [count]]\n"));
+ gettext("usage: rcapstat [-g] [-p | -z] [interval [count]]\n"));
exit(E_USAGE);
}
@@ -139,12 +139,12 @@ format_size(char *str, uint64_t size, int length)
}
static int
-read_stats()
+read_stats(rcid_type_t stat_type)
{
int fd;
int proc_fd;
char procfile[20];
- pid_t pid;
+ uint64_t pid;
col_t *col, *col_next;
lcollection_report_t report;
struct stat st;
@@ -169,7 +169,7 @@ read_stats()
* Check if rcapd is running
*/
pid = hdr.rs_pid;
- (void) snprintf(procfile, 20, "/proc/%ld/psinfo", pid);
+ (void) snprintf(procfile, 20, "/proc/%lld/psinfo", pid);
if ((proc_fd = open(procfile, O_RDONLY)) < 0) {
warn(gettext("rcapd is not active\n"));
(void) close(fd);
@@ -185,6 +185,9 @@ read_stats()
}
while (read(fd, &report, sizeof (report)) == sizeof (report)) {
+ if (report.lcol_id.rcid_type != stat_type)
+ continue;
+
col = col_find(report.lcol_id);
if (col == NULL) {
col = col_insert(report.lcol_id);
@@ -291,12 +294,13 @@ print_unformatted_stats(void)
}
static void
-print_stats()
+print_stats(rcid_type_t stat_type)
{
col_t *col;
char size[6];
char limit[6];
char rss[6];
+ char nproc[6];
char paged_att[6];
char paged_eff[6];
char paged_att_avg[6];
@@ -310,12 +314,21 @@ print_stats()
*/
if (count == 0 || ncol != 1)
(void) printf("%6s %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n",
- "id", mode, "nproc", "vm", "rss", "cap",
+ "id", (stat_type == RCIDT_PROJECT ? "project" : "zone"),
+ "nproc", "vm", "rss", "cap",
"at", "avgat", "pg", "avgpg");
if (++count >= 20 || (count >= 10 && global != 0) || ncol != 1)
count = 0;
for (col = col_head; col != NULL; col = col->col_next) {
+ if (col->col_id.rcid_type != stat_type)
+ continue;
+
+ if (col->col_paged_att == 0)
+ strlcpy(nproc, "-", sizeof (nproc));
+ else
+ (void) snprintf(nproc, sizeof (nproc), "%lld",
+ col->col_nproc);
format_size(size, col->col_vmsize, 6);
format_size(rss, col->col_rsssize, 6);
format_size(limit, col->col_rsslimit, 6);
@@ -323,8 +336,9 @@ print_stats()
format_size(paged_eff, col->col_paged_eff, 6);
format_size(paged_att_avg, col->col_paged_att_avg, 6);
format_size(paged_eff_avg, col->col_paged_eff_avg, 6);
- (void) printf("%6lld %-15s %5lld %5s %5s %5s %5s %5s %5s %5s\n",
- (long long)col->col_id, col->col_name, col->col_nproc,
+ (void) printf("%6lld %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n",
+ col->col_id.rcid_val, col->col_name,
+ nproc,
size, rss, limit,
paged_att, paged_att_avg,
paged_eff, paged_eff_avg);
@@ -342,20 +356,32 @@ main(int argc, char *argv[])
int count;
int always = 1;
int opt;
+ int projects = 0;
+ int zones = 0;
+ /* project reporting is the default if no option is specified */
+ rcid_type_t stat_type = RCIDT_PROJECT;
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
(void) setprogname("rcapstat");
global = unformatted = 0;
- while ((opt = getopt(argc, argv, "gu")) != (int)EOF) {
+ while ((opt = getopt(argc, argv, "gpuz")) != (int)EOF) {
switch (opt) {
case 'g':
global = 1;
break;
+ case 'p':
+ projects = 1;
+ stat_type = RCIDT_PROJECT;
+ break;
case 'u':
unformatted = 1;
break;
+ case 'z':
+ stat_type = RCIDT_ZONE;
+ zones = 1;
+ break;
default:
usage();
}
@@ -369,22 +395,22 @@ main(int argc, char *argv[])
die(gettext("invalid count specified\n"));
always = 0;
}
- if (argc > optind)
+ if (argc > optind || (projects > 0 && zones > 0))
usage();
while (always || count-- > 0) {
- if (read_stats() != E_SUCCESS)
+ if (read_stats(stat_type) != E_SUCCESS)
return (E_ERROR);
if (!unformatted) {
- print_stats();
- fflush(stdout);
+ print_stats(stat_type);
+ (void) fflush(stdout);
if (count || always)
(void) sleep(interval);
} else {
struct stat st;
print_unformatted_stats();
- fflush(stdout);
+ (void) fflush(stdout);
while (stat(STAT_FILE_DEFAULT, &st) == 0 &&
st.st_mtime == stat_mod)
usleep((useconds_t)(0.2 * MICROSEC));
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 92739f2b1e..4dc70b0d37 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -2325,6 +2325,7 @@ prt_zga(private_t *pri, int raw, long val)
case ZONE_ATTR_INITNAME: s = "ZONE_ATTR_INITNAME"; break;
case ZONE_ATTR_BOOTARGS: s = "ZONE_ATTR_BOOTARGS"; break;
case ZONE_ATTR_BRAND: s = "ZONE_ATTR_BRAND"; break;
+ case ZONE_ATTR_PHYS_MCAP: s = "ZONE_ATTR_PHYS_MCAP"; break;
}
}
diff --git a/usr/src/cmd/truss/systable.c b/usr/src/cmd/truss/systable.c
index 695d0e28c2..f46e028bf5 100644
--- a/usr/src/cmd/truss/systable.c
+++ b/usr/src/cmd/truss/systable.c
@@ -404,7 +404,7 @@ const struct systable systable[] = {
{"kaio", 7, DEC, NOV, AIO, HEX, HEX, HEX, HEX, HEX, HEX}, /* 178 */
{"cpc", 5, DEC, NOV, CPC, DEC, HEX, HEX, HEX}, /* 179 */
{"lgrpsys", 3, DEC, NOV, DEC, DEC, HEX}, /* 180 */
-{"rusagesys", 2, DEC, NOV, DEC, HEX}, /* 181 */
+{"rusagesys", 5, DEC, NOV, DEC, HEX, DEC, HEX, HEX}, /* 181 */
{"portfs", 6, HEX, HEX, DEC, HEX, HEX, HEX, HEX, HEX}, /* 182 */
{"pollsys", 4, DEC, NOV, HEX, DEC, HEX, HEX}, /* 183 */
{"labelsys", 2, DEC, NOV, DEC, HEX}, /* 184 */
@@ -761,6 +761,7 @@ static const struct systable rusagesystable[] = {
{"getrusage", 2, DEC, NOV, HID, HEX}, /* 0 */
{"getrusage_chld", 2, DEC, NOV, HID, HEX}, /* 1 */
{"getrusage_lwp", 2, DEC, NOV, HID, HEX}, /* 2 */
+{"getvmusage", 5, DEC, NOV, HID, HEX, DEC, HEX, HEX}, /* 3 */
};
#define NRUSAGESYSCODE \
(sizeof (rusagesystable) / sizeof (struct systable))
@@ -942,6 +943,7 @@ const struct sysalias sysalias[] = {
{ "getrusage", SYS_rusagesys },
{ "getrusage_chld", SYS_rusagesys },
{ "getrusage_lwp", SYS_rusagesys },
+ { "getvmusage", SYS_rusagesys },
{ "getpeerucred", SYS_ucredsys },
{ "ucred_get", SYS_ucredsys },
{ "port_create", SYS_port },
@@ -956,6 +958,7 @@ const struct sysalias sysalias[] = {
{ "zone_create", SYS_zone },
{ "zone_destroy", SYS_zone },
{ "zone_getattr", SYS_zone },
+ { "zone_setattr", SYS_zone },
{ "zone_enter", SYS_zone },
{ "getzoneid", SYS_zone },
{ "zone_list", SYS_zone },
diff --git a/usr/src/cmd/zoneadm/Makefile b/usr/src/cmd/zoneadm/Makefile
index 4d0f91a6f3..e11609c6dd 100644
--- a/usr/src/cmd/zoneadm/Makefile
+++ b/usr/src/cmd/zoneadm/Makefile
@@ -27,8 +27,8 @@
#
PROG= zoneadm
-MANIFEST= zones.xml
-SVCMETHOD= svc-zones
+MANIFEST= zones.xml resource-mgmt.xml
+SVCMETHOD= svc-zones svc-resource-mgmt
include ../Makefile.cmd
diff --git a/usr/src/cmd/zoneadm/resource-mgmt.xml b/usr/src/cmd/zoneadm/resource-mgmt.xml
new file mode 100644
index 0000000000..264f26733f
--- /dev/null
+++ b/usr/src/cmd/zoneadm/resource-mgmt.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0"?>
+<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
+<!--
+ Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ Use is subject to license terms.
+
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+ ident "%Z%%M% %I% %E% SMI"
+
+ NOTE: This service manifest is not editable; its contents will
+ be overwritten by package or patch operations, including
+ operating system upgrade. Make customizations in a different
+ file.
+-->
+
+<service_bundle type='manifest' name='SUNWzoner:zones'>
+
+<!--
+ This service applies global zone resource management settings
+ at system startup.
+-->
+<service
+ name='system/resource-mgmt'
+ type='service'
+ version='1'>
+
+ <create_default_instance enabled='true' />
+
+ <single_instance />
+
+ <dependency
+ name='usr'
+ type='service'
+ grouping='require_all'
+ restart_on='none'>
+ <service_fmri value='svc:/system/filesystem/minimal' />
+ </dependency>
+
+ <dependency
+ name='scheduler'
+ type='service'
+ grouping='optional_all'
+ restart_on='none'>
+ <service_fmri value='svc:/system/scheduler' />
+ </dependency>
+
+ <dependency
+ name='pools'
+ type='service'
+ grouping='optional_all'
+ restart_on='none'>
+ <service_fmri value='svc:/system/pools' />
+ </dependency>
+
+ <!--
+ Declare svc:/system/rcap as an optional dependent of this service.
+ NOTE(review): presumably this orders rcapd after the global zone
+ settings have been applied; confirm against the rcap manifest.
+ -->
+ <dependent
+ name='rcap'
+ grouping='optional_all'
+ restart_on='none'>
+ <service_fmri value='svc:/system/rcap' />
+ </dependent>
+
+ <exec_method
+ type='method'
+ name='start'
+ exec='/lib/svc/method/svc-resource-mgmt %m'
+ timeout_seconds='60'>
+ </exec_method>
+
+ <exec_method
+ type='method'
+ name='stop'
+ exec=':true'
+ timeout_seconds='3'>
+ </exec_method>
+
+ <property_group name='startd' type='framework'>
+ <propval name='duration' type='astring' value='transient' />
+ </property_group>
+
+ <stability value='Unstable' />
+
+ <template>
+ <common_name>
+ <loctext xml:lang='C'>
+ Global zone resource management settings
+ </loctext>
+ </common_name>
+ <documentation>
+ <manpage title='zones' section='5' manpath='/usr/share/man' />
+ <manpage
+ title='zonecfg'
+ section='1M'
+ manpath='/usr/share/man' />
+ </documentation>
+ </template>
+</service>
+
+</service_bundle>
diff --git a/usr/src/cmd/zoneadm/svc-resource-mgmt b/usr/src/cmd/zoneadm/svc-resource-mgmt
new file mode 100644
index 0000000000..762de4c0d8
--- /dev/null
+++ b/usr/src/cmd/zoneadm/svc-resource-mgmt
@@ -0,0 +1,54 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+. /lib/svc/share/smf_include.sh
+
+[ ! -f /etc/zones/global.xml ] && exit $SMF_EXIT_OK # No global zone
+ # resource mgmt.
+ # configuration
+
+[ ! -x /usr/sbin/zoneadm ] && exit $SMF_EXIT_OK # SUNWzoneu not installed
+
+# Make sure working directory is / to prevent unmounting problems.
+cd /
+PATH=/usr/sbin:/usr/bin; export PATH
+
+# Only the 'start' method is implemented: it runs "zoneadm -z global apply"
+# and exits with SMF_EXIT_ERR_FATAL if that command fails. Any other
+# method name prints a usage message and is also treated as fatal.
+case "$1" in
+'start')
+ zoneadm -z global apply
+ if [ $? -ne 0 ]; then
+ exit $SMF_EXIT_ERR_FATAL
+ fi
+ ;;
+
+*)
+ echo "Usage: $0 start"
+ exit $SMF_EXIT_ERR_FATAL
+ ;;
+esac
+exit $SMF_EXIT_OK
diff --git a/usr/src/cmd/zoneadm/zoneadm.c b/usr/src/cmd/zoneadm/zoneadm.c
index bff26cd356..b7ae32b30d 100644
--- a/usr/src/cmd/zoneadm/zoneadm.c
+++ b/usr/src/cmd/zoneadm/zoneadm.c
@@ -74,9 +74,12 @@
#include <fnmatch.h>
#include <sys/modctl.h>
#include <libbrand.h>
+#include <libscf.h>
#include <pool.h>
#include <sys/pool.h>
+#include <sys/priocntl.h>
+#include <sys/fsspriocntl.h>
#include "zoneadm.h"
@@ -154,6 +157,7 @@ static int move_func(int argc, char *argv[]);
static int detach_func(int argc, char *argv[]);
static int attach_func(int argc, char *argv[]);
static int mark_func(int argc, char *argv[]);
+static int apply_func(int argc, char *argv[]);
static int sanity_check(char *zone, int cmd_num, boolean_t running,
boolean_t unsafe_when_running, boolean_t force);
static int cmd_match(char *cmd);
@@ -177,7 +181,8 @@ static struct cmd cmdtab[] = {
{ CMD_MOVE, "move", SHELP_MOVE, move_func },
{ CMD_DETACH, "detach", SHELP_DETACH, detach_func },
{ CMD_ATTACH, "attach", SHELP_ATTACH, attach_func },
- { CMD_MARK, "mark", SHELP_MARK, mark_func }
+ { CMD_MARK, "mark", SHELP_MARK, mark_func },
+ { CMD_APPLY, "apply", NULL, apply_func }
};
/* global variables */
@@ -1501,6 +1506,7 @@ boot_func(int argc, char *argv[])
zerror(gettext("call to %s failed"), "zoneadmd");
return (Z_ERR);
}
+
return (Z_OK);
}
@@ -4355,15 +4361,22 @@ dev_fix(zone_dochandle_t handle)
zarg.cmd = Z_READY;
if (call_zoneadmd(target_zone, &zarg) != 0) {
zerror(gettext("call to %s failed"), "zoneadmd");
+ /* attempt to restore zone to configured state */
+ (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED);
return (Z_ERR);
}
zarg.cmd = Z_HALT;
if (call_zoneadmd(target_zone, &zarg) != 0) {
zerror(gettext("call to %s failed"), "zoneadmd");
+ /* attempt to restore zone to configured state */
+ (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED);
return (Z_ERR);
}
+ /* attempt to restore zone to configured state */
+ (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED);
+
if (zonecfg_setdevperment(handle) != Z_OK) {
(void) fprintf(stderr,
gettext("unable to enumerate device entries\n"));
@@ -4845,6 +4858,177 @@ mark_func(int argc, char *argv[])
return (err);
}
+/*
+ * Check what scheduling class we're running under and print a warning if
+ * we're not using FSS.
+ *
+ * Returns Z_SYSTEM when the zone's default scheduling class was obtained
+ * but is not "FSS". Returns Z_OK otherwise; note that failing to
+ * determine the class only prints a warning and still yields Z_OK.
+ */
+static int
+check_sched_fss(zone_dochandle_t handle)
+{
+ char class_name[PC_CLNMSZ];
+
+ if (zonecfg_get_dflt_sched_class(handle, class_name,
+ sizeof (class_name)) != Z_OK) {
+ zerror(gettext("WARNING: unable to determine the zone's "
+ "scheduling class"));
+ } else if (strcmp("FSS", class_name) != 0) {
+ zerror(gettext("WARNING: The zone.cpu-shares rctl is set but\n"
+ "FSS is not the default scheduling class for this zone. "
+ "FSS will be\nused for processes in the zone but to get "
+ "the full benefit of FSS,\nit should be the default "
+ "scheduling class. See dispadmin(1M) for\nmore details."));
+ return (Z_SYSTEM);
+ }
+
+ return (Z_OK);
+}
+
+/*
+ * Walk the zone's rctl entries looking for "zone.cpu-shares". If it is
+ * found, check_sched_fss() is called once and the walk stops.
+ *
+ * Returns the zonecfg_setrctlent() error code if the walk cannot be
+ * started, Z_SYSTEM if zone.cpu-shares is present and check_sched_fss()
+ * did not return Z_OK, and Z_OK otherwise.
+ *
+ * NOTE(review): errno is loaded with the zonecfg error before zperror()
+ * is called; presumably zperror() reads the code from errno - confirm.
+ */
+static int
+check_cpu_shares_sched(zone_dochandle_t handle)
+{
+ int err;
+ int res = Z_OK;
+ struct zone_rctltab rctl;
+
+ if ((err = zonecfg_setrctlent(handle)) != Z_OK) {
+ errno = err;
+ zperror(cmd_to_str(CMD_APPLY), B_TRUE);
+ return (err);
+ }
+
+ while (zonecfg_getrctlent(handle, &rctl) == Z_OK) {
+ if (strcmp(rctl.zone_rctl_name, "zone.cpu-shares") == 0) {
+ if (check_sched_fss(handle) != Z_OK)
+ res = Z_SYSTEM;
+ break;
+ }
+ }
+
+ (void) zonecfg_endrctlent(handle);
+
+ return (res);
+}
+
+/*
+ * This is an undocumented interface which is currently only used to apply
+ * the global zone resource management settings when the system boots.
+ * This function does not yet properly handle updating a running system so
+ * any projects running in the zone would be trashed if this function
+ * were to run after the zone had booted. It also does not reset any
+ * rctl settings that were removed from zonecfg. There is still work to be
+ * done before we can properly support dynamically updating the resource
+ * management settings for a running zone (global or non-global). Thus, this
+ * functionality is undocumented for now.
+ *
+ * The command is only accepted when run in the global zone, without an
+ * alternate root, with the target zone named GLOBAL_ZONENAME, and with a
+ * full effective privilege set. If the first three conditions are not met
+ * the return value of usage() is returned; if the privilege set is not
+ * full, usage is printed and Z_ERR is returned.
+ *
+ * Once the configuration handle is open, each step (rctls, scheduling
+ * class check, temporary pool, persistent pool, memory cap) is attempted
+ * even if an earlier one failed. Returns Z_OK, or Z_ERR if any step
+ * other than the persistent pool binding failed.
+ */
+/* ARGSUSED */
+static int
+apply_func(int argc, char *argv[])
+{
+ int err;
+ int res = Z_OK;
+ priv_set_t *privset;
+ zoneid_t zoneid;
+ zone_dochandle_t handle;
+ struct zone_mcaptab mcap;
+ char pool_err[128];
+
+ zoneid = getzoneid();
+
+ if (zonecfg_in_alt_root() || zoneid != GLOBAL_ZONEID ||
+ target_zone == NULL || strcmp(target_zone, GLOBAL_ZONENAME) != 0)
+ return (usage(B_FALSE));
+
+ if ((privset = priv_allocset()) == NULL) {
+ zerror(gettext("%s failed"), "priv_allocset");
+ return (Z_ERR);
+ }
+
+ if (getppriv(PRIV_EFFECTIVE, privset) != 0) {
+ zerror(gettext("%s failed"), "getppriv");
+ priv_freeset(privset);
+ return (Z_ERR);
+ }
+
+ if (priv_isfullset(privset) == B_FALSE) {
+ (void) usage(B_FALSE);
+ priv_freeset(privset);
+ return (Z_ERR);
+ }
+ priv_freeset(privset);
+
+ if ((handle = zonecfg_init_handle()) == NULL) {
+ zperror(cmd_to_str(CMD_APPLY), B_TRUE);
+ return (Z_ERR);
+ }
+
+ if ((err = zonecfg_get_handle(target_zone, handle)) != Z_OK) {
+ errno = err;
+ zperror(cmd_to_str(CMD_APPLY), B_TRUE);
+ zonecfg_fini_handle(handle);
+ return (Z_ERR);
+ }
+
+ /* specific error msgs are printed within apply_rctls */
+ if ((err = zonecfg_apply_rctls(target_zone, handle)) != Z_OK) {
+ errno = err;
+ zperror(cmd_to_str(CMD_APPLY), B_TRUE);
+ res = Z_ERR;
+ }
+
+ if ((err = check_cpu_shares_sched(handle)) != Z_OK)
+ res = Z_ERR;
+
+ /*
+ * The next two blocks of code attempt to set up temporary pools as
+ * well as persistent pools. In both cases we call the functions
+ * unconditionally. Within each function the code will check if the
+ * zone is actually configured for a temporary pool or persistent pool
+ * and just return if there is nothing to do.
+ *
+ * NOTE(review): a temporary pool failure sets res to Z_ERR, but a
+ * persistent pool failure is only reported and leaves res untouched.
+ * Confirm that this asymmetry is intentional.
+ */
+ if ((err = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
+ sizeof (pool_err))) != Z_OK) {
+ if (err == Z_POOL || err == Z_POOL_CREATE || err == Z_POOL_BIND)
+ zerror("%s: %s", zonecfg_strerror(err), pool_err);
+ else
+ zerror(gettext("could not bind zone to temporary "
+ "pool: %s"), zonecfg_strerror(err));
+ res = Z_ERR;
+ }
+
+ if ((err = zonecfg_bind_pool(handle, zoneid, pool_err,
+ sizeof (pool_err))) != Z_OK) {
+ if (err == Z_POOL || err == Z_POOL_BIND)
+ zerror("%s: %s", zonecfg_strerror(err), pool_err);
+ else
+ zerror("%s", zonecfg_strerror(err));
+ }
+
+ /*
+ * If a memory cap is configured, set the cap in the kernel using
+ * zone_setattr() and make sure the rcapd SMF service is enabled.
+ *
+ * NOTE(review): the strtoll() conversion of zone_physmem_cap is not
+ * checked for errors here; presumably the value was validated when
+ * it was configured - confirm.
+ */
+ if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
+ uint64_t num;
+ char smf_err[128];
+
+ num = (uint64_t)strtoll(mcap.zone_physmem_cap, NULL, 10);
+ if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
+ zerror(gettext("could not set zone memory cap"));
+ res = Z_ERR;
+ }
+
+ if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
+ zerror(gettext("enabling system/rcap service failed: "
+ "%s"), smf_err);
+ res = Z_ERR;
+ }
+ }
+
+ zonecfg_fini_handle(handle);
+
+ return (res);
+}
+
static int
help_func(int argc, char *argv[])
{
diff --git a/usr/src/cmd/zoneadm/zoneadm.h b/usr/src/cmd/zoneadm/zoneadm.h
index a94053e258..a299ece135 100644
--- a/usr/src/cmd/zoneadm/zoneadm.h
+++ b/usr/src/cmd/zoneadm/zoneadm.h
@@ -45,9 +45,10 @@
#define CMD_DETACH 13
#define CMD_ATTACH 14
#define CMD_MARK 15
+#define CMD_APPLY 16
#define CMD_MIN CMD_HELP
-#define CMD_MAX CMD_MARK
+#define CMD_MAX CMD_APPLY
#if !defined(TEXT_DOMAIN) /* should be defined by cc -D */
#define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */
diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile
index 8b77f8234c..34914694a8 100644
--- a/usr/src/cmd/zoneadmd/Makefile
+++ b/usr/src/cmd/zoneadmd/Makefile
@@ -42,7 +42,7 @@ POFILES= $(OBJS:%.o=%.po)
CFLAGS += $(CCVERBOSE)
LAZYLIBS = $(ZLAZYLOAD) -ltsnet -ltsol $(ZNOLAZYLOAD)
lint := LAZYLIBS = -ltsnet -ltsol
-LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair -lpool \
+LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
-lgen -lbsm -lcontract -lzfs -luuid -lbrand $(LAZYLIBS)
XGETFLAGS += -a -x zoneadmd.xcl
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index ca93b1c696..513921e5e2 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -106,6 +106,7 @@
#include <pool.h>
#include <sys/pool.h>
+#include <sys/priocntl.h>
#include <libbrand.h>
#include <sys/brand.h>
@@ -2661,27 +2662,6 @@ out:
}
static int
-get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz)
-{
- zone_dochandle_t handle;
- int error;
-
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (Z_NOMEM);
- }
- error = zonecfg_get_snapshot_handle(zone_name, handle);
- if (error != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (error);
- }
- error = zonecfg_get_pool(handle, poolbuf, bufsz);
- zonecfg_fini_handle(handle);
- return (error);
-}
-
-static int
get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
{
zone_dochandle_t handle;
@@ -2818,75 +2798,6 @@ validate_datasets(zlog_t *zlogp)
return (0);
}
-static int
-bind_to_pool(zlog_t *zlogp, zoneid_t zoneid)
-{
- pool_conf_t *poolconf;
- pool_t *pool;
- char poolname[MAXPATHLEN];
- int status;
- int error;
-
- /*
- * Find the pool mentioned in the zone configuration, and bind to it.
- */
- error = get_zone_pool(zlogp, poolname, sizeof (poolname));
- if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) {
- /*
- * The property is not set on the zone, so the pool
- * should be bound to the default pool. But that's
- * already done by the kernel, so we can just return.
- */
- return (0);
- }
- if (error != Z_OK) {
- /*
- * Not an error, even though it shouldn't be happening.
- */
- zerror(zlogp, B_FALSE,
- "WARNING: unable to retrieve default pool.");
- return (0);
- }
- /*
- * Don't do anything if pools aren't enabled.
- */
- if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) {
- zerror(zlogp, B_FALSE, "WARNING: pools facility not active; "
- "zone will not be bound to pool '%s'.", poolname);
- return (0);
- }
- /*
- * Try to provide a sane error message if the requested pool doesn't
- * exist.
- */
- if ((poolconf = pool_conf_alloc()) == NULL) {
- zerror(zlogp, B_FALSE, "%s failed", "pool_conf_alloc");
- return (-1);
- }
- if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) !=
- PO_SUCCESS) {
- zerror(zlogp, B_FALSE, "%s failed", "pool_conf_open");
- pool_conf_free(poolconf);
- return (-1);
- }
- pool = pool_get_pool(poolconf, poolname);
- (void) pool_conf_close(poolconf);
- pool_conf_free(poolconf);
- if (pool == NULL) {
- zerror(zlogp, B_FALSE, "WARNING: pool '%s' not found; "
- "using default pool.", poolname);
- return (0);
- }
- /*
- * Bind the zone to the pool.
- */
- if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) {
- zerror(zlogp, B_FALSE, "WARNING: unable to bind to pool '%s'; "
- "using default pool.", poolname);
- }
- return (0);
-}
-
/*
* Mount lower level home directories into/from current zone
* Share exported directories specified in dfstab for zone
@@ -3482,6 +3393,149 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
return (B_FALSE);
}
+/*
+ * Apply the zone's resource management configuration to the zone
+ * identified by zoneid, using the configuration snapshot for zone_name.
+ *
+ * In order, this sets the physical memory cap (and enables the rcapd
+ * service) when a cap is configured, selects the scheduling class, binds
+ * the zone to a temporary pool, warns if poold is wanted but not enabled,
+ * and finally binds the zone to a persistent pool.
+ *
+ * Returns Z_OK on success. Failing to get a handle or snapshot, to set the
+ * memory cap, to enable rcapd, or to bind the temporary pool is fatal and
+ * yields a Z_* error code. Scheduling class and persistent pool problems
+ * are only logged as warnings.
+ */
+static int
+setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
+{
+ int res;
+ uint64_t tmp;
+ struct zone_mcaptab mcap;
+ char sched[MAXNAMELEN];
+ zone_dochandle_t handle = NULL;
+ char pool_err[128];
+
+ if ((handle = zonecfg_init_handle()) == NULL) {
+ zerror(zlogp, B_TRUE, "getting zone configuration handle");
+ return (Z_BAD_HANDLE);
+ }
+
+ if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) {
+ zerror(zlogp, B_FALSE, "invalid configuration");
+ zonecfg_fini_handle(handle);
+ return (res);
+ }
+
+ /*
+ * If a memory cap is configured, set the cap in the kernel using
+ * zone_setattr() and make sure the rcapd SMF service is enabled.
+ */
+ if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
+ uint64_t num;
+ char smf_err[128];
+
+ /* The cap string is parsed as base 10; no parse error is checked. */
+ num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10);
+ if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
+ zerror(zlogp, B_TRUE, "could not set zone memory cap");
+ zonecfg_fini_handle(handle);
+ return (Z_INVAL);
+ }
+
+ if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
+ zerror(zlogp, B_FALSE, "enabling system/rcap service "
+ "failed: %s", smf_err);
+ zonecfg_fini_handle(handle);
+ return (Z_INVAL);
+ }
+ }
+
+ /*
+ * Get the scheduling class set in the zone configuration. An
+ * explicitly configured class takes precedence over the FSS default
+ * implied by cpu-shares below.
+ */
+ if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
+ strlen(sched) > 0) {
+ if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched,
+ strlen(sched)) == -1)
+ zerror(zlogp, B_TRUE, "WARNING: unable to set the "
+ "default scheduling class");
+
+ } else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp)
+ == Z_OK) {
+ /*
+ * If the zone has the zone.cpu-shares rctl set then we want to
+ * use the Fair Share Scheduler (FSS) for processes in the
+ * zone. Check what scheduling class the zone would be running
+ * in by default so we can print a warning and modify the class
+ * if we wouldn't be using FSS.
+ */
+ char class_name[PC_CLNMSZ];
+
+ if (zonecfg_get_dflt_sched_class(handle, class_name,
+ sizeof (class_name)) != Z_OK) {
+ zerror(zlogp, B_FALSE, "WARNING: unable to determine "
+ "the zone's scheduling class");
+
+ } else if (strcmp("FSS", class_name) != 0) {
+ zerror(zlogp, B_FALSE, "WARNING: The zone.cpu-shares "
+ "rctl is set but\nFSS is not the default "
+ "scheduling class for\nthis zone. FSS will be "
+ "used for processes\nin the zone but to get the "
+ "full benefit of FSS,\nit should be the default "
+ "scheduling class.\nSee dispadmin(1M) for more "
+ "details.");
+
+ if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, "FSS",
+ strlen("FSS")) == -1)
+ zerror(zlogp, B_TRUE, "WARNING: unable to set "
+ "zone scheduling class to FSS");
+ }
+ }
+
+ /*
+ * The next few blocks of code attempt to set up temporary pools as
+ * well as persistent pools. In all cases we call the functions
+ * unconditionally. Within each function the code will check if the
+ * zone is actually configured for a temporary pool or persistent pool
+ * and just return if there is nothing to do.
+ *
+ * If we are rebooting we want to attempt to reuse any temporary pool
+ * that was previously set up. zonecfg_bind_tmp_pool() will do the
+ * right thing in all cases (reuse or create) based on the current
+ * zonecfg.
+ */
+ if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
+ sizeof (pool_err))) != Z_OK) {
+ if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND)
+ zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting "
+ "cannot be instantiated", zonecfg_strerror(res),
+ pool_err);
+ else
+ zerror(zlogp, B_FALSE, "could not bind zone to "
+ "temporary pool: %s", zonecfg_strerror(res));
+ zonecfg_fini_handle(handle);
+ /* Every temporary pool failure is reported as Z_POOL_BIND. */
+ return (Z_POOL_BIND);
+ }
+
+ /*
+ * Check if we need to warn about poold not being enabled.
+ */
+ if (zonecfg_warn_poold(handle)) {
+ zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has "
+ "been specified\nbut the dynamic pool service is not "
+ "enabled.\nThe system will not dynamically adjust the\n"
+ "processor allocation within the specified range\n"
+ "until svc:/system/pools/dynamic is enabled.\n"
+ "See poold(1M).");
+ }
+
+ /* The following is a warning, not an error. */
+ if ((res = zonecfg_bind_pool(handle, zoneid, pool_err,
+ sizeof (pool_err))) != Z_OK) {
+ if (res == Z_POOL_BIND)
+ zerror(zlogp, B_FALSE, "WARNING: unable to bind to "
+ "pool '%s'; using default pool.", pool_err);
+ else if (res == Z_POOL)
+ zerror(zlogp, B_FALSE, "WARNING: %s: %s",
+ zonecfg_strerror(res), pool_err);
+ else
+ zerror(zlogp, B_FALSE, "WARNING: %s",
+ zonecfg_strerror(res));
+ }
+
+ zonecfg_fini_handle(handle);
+ return (Z_OK);
+}
+
zoneid_t
vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
{
@@ -3668,14 +3722,18 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
}
/*
- * The following is a warning, not an error, and is not performed when
- * merely mounting a zone for administrative use.
+ * The following actions are not performed when merely mounting a zone
+ * for administrative use.
*/
- if (!mount_cmd && bind_to_pool(zlogp, zoneid) != 0)
- zerror(zlogp, B_FALSE, "WARNING: unable to bind zone to "
- "requested pool; using default pool.");
- if (!mount_cmd)
+ if (!mount_cmd) {
+ if (setup_zone_rm(zlogp, zone_name, zoneid) != Z_OK) {
+ (void) zone_shutdown(zoneid);
+ goto error;
+ }
+
set_mlps(zlogp, zoneid, zcent);
+ }
+
rval = zoneid;
zoneid = -1;
@@ -3878,10 +3936,12 @@ unmounted:
}
int
-vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd)
+vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
{
char *kzone;
zoneid_t zoneid;
+ int res;
+ char pool_err[128];
char zroot[MAXPATHLEN];
char cmdbuf[MAXPATHLEN];
char brand[MAXNAMELEN];
@@ -3972,6 +4032,19 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd)
goto error;
}
+ /*
+ * If we are rebooting then we don't want to destroy an existing
+ * temporary pool at this point so that we can just reuse it when the
+ * zone boots back up.
+ *
+ * pool_err is filled in at run time, so it is logged through a "%s"
+ * format rather than being used as the format string itself; any '%'
+ * in the message would otherwise be interpreted by zerror().
+ */
+ if (!unmount_cmd && !rebooting) {
+ if ((res = zonecfg_destroy_tmp_pool(zone_name, pool_err,
+ sizeof (pool_err))) != Z_OK) {
+ if (res == Z_POOL)
+ zerror(zlogp, B_FALSE, "%s", pool_err);
+ }
+ }
+
remove_mlps(zlogp, zoneid);
if (zone_destroy(zoneid) != 0) {
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c
index 313d24d95b..35206384b9 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.c
+++ b/usr/src/cmd/zoneadmd/zoneadmd.c
@@ -463,7 +463,7 @@ zone_ready(zlog_t *zlogp, boolean_t mount_cmd)
}
if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
bringup_failure_recovery = B_TRUE;
- (void) vplat_teardown(NULL, mount_cmd);
+ (void) vplat_teardown(NULL, mount_cmd, B_FALSE);
if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
zerror(zlogp, B_FALSE, "destroying snapshot: %s",
zonecfg_strerror(err));
@@ -738,11 +738,11 @@ zone_bootup(zlog_t *zlogp, const char *bootargs)
}
static int
-zone_halt(zlog_t *zlogp, boolean_t unmount_cmd)
+zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
{
int err;
- if (vplat_teardown(zlogp, unmount_cmd) != 0) {
+ if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) {
if (!bringup_failure_recovery)
zerror(zlogp, B_FALSE, "unable to destroy zone");
return (-1);
@@ -985,7 +985,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
audit_put_record(zlogp, uc, rval, "boot");
if (rval != 0) {
bringup_failure_recovery = B_TRUE;
- (void) zone_halt(zlogp, B_FALSE);
+ (void) zone_halt(zlogp, B_FALSE, B_FALSE);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
break;
@@ -1094,7 +1094,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
audit_put_record(zlogp, uc, rval, "boot");
if (rval != 0) {
bringup_failure_recovery = B_TRUE;
- (void) zone_halt(zlogp, B_FALSE);
+ (void) zone_halt(zlogp, B_FALSE, B_TRUE);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
boot_args[0] = '\0';
@@ -1102,7 +1102,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case Z_HALT:
if (kernelcall) /* Invalid; can't happen */
abort();
- if ((rval = zone_halt(zlogp, B_FALSE)) != 0)
+ if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE)) != 0)
break;
eventstream_write(Z_EVT_ZONE_HALTED);
break;
@@ -1125,7 +1125,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case Z_UNMOUNT:
if (kernelcall) /* Invalid; can't happen */
abort();
- rval = zone_halt(zlogp, B_TRUE);
+ rval = zone_halt(zlogp, B_TRUE, B_FALSE);
if (rval == 0) {
eventstream_write(Z_EVT_ZONE_HALTED);
(void) sema_post(&scratch_sem);
@@ -1147,7 +1147,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case ZONE_STATE_DOWN:
switch (cmd) {
case Z_READY:
- if ((rval = zone_halt(zlogp, B_FALSE)) != 0)
+ if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0)
break;
if ((rval = zone_ready(zlogp, B_FALSE)) == 0)
eventstream_write(Z_EVT_ZONE_READIED);
@@ -1165,7 +1165,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
rval = 0;
break;
case Z_HALT:
- if ((rval = zone_halt(zlogp, B_FALSE)) != 0)
+ if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE)) != 0)
break;
eventstream_write(Z_EVT_ZONE_HALTED);
break;
@@ -1173,7 +1173,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
(void) strlcpy(boot_args, zargp->bootbuf,
sizeof (boot_args));
eventstream_write(Z_EVT_ZONE_REBOOTING);
- if ((rval = zone_halt(zlogp, B_FALSE)) != 0) {
+ if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0) {
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
boot_args[0] = '\0';
break;
@@ -1186,7 +1186,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
rval = zone_bootup(zlogp, zargp->bootbuf);
audit_put_record(zlogp, uc, rval, "reboot");
if (rval != 0) {
- (void) zone_halt(zlogp, B_FALSE);
+ (void) zone_halt(zlogp, B_FALSE, B_TRUE);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
boot_args[0] = '\0';
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h
index cfb90f93f3..a4aba27b5c 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.h
+++ b/usr/src/cmd/zoneadmd/zoneadmd.h
@@ -106,7 +106,7 @@ extern void eventstream_write(zone_evt_t evt);
*/
extern zoneid_t vplat_create(zlog_t *, boolean_t);
extern int vplat_bringup(zlog_t *, boolean_t, zoneid_t);
-extern int vplat_teardown(zlog_t *, boolean_t);
+extern int vplat_teardown(zlog_t *, boolean_t, boolean_t);
/*
* Console subsystem routines.
diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c
index ea745cbb61..34d6b99480 100644
--- a/usr/src/cmd/zonecfg/zonecfg.c
+++ b/usr/src/cmd/zonecfg/zonecfg.c
@@ -101,6 +101,8 @@ extern int lex_lineno;
#define MAX_CMD_HIST 1024
#define MAX_CMD_LEN 1024
+#define ONE_MB 1048576
+
/*
* Each SHELP_ should be a simple string.
*/
@@ -108,6 +110,7 @@ extern int lex_lineno;
#define SHELP_ADD "add <resource-type>\n\t(global scope)\n" \
"add <property-name> <property-value>\n\t(resource scope)"
#define SHELP_CANCEL "cancel"
+#define SHELP_CLEAR "clear <property-name>"
#define SHELP_COMMIT "commit"
#define SHELP_CREATE "create [-F] [ -a <path> | -b | -t <template> ]"
#define SHELP_DELETE "delete [-F]"
@@ -116,9 +119,11 @@ extern int lex_lineno;
#define SHELP_EXPORT "export [-f output-file]"
#define SHELP_HELP "help [commands] [syntax] [usage] [<command-name>]"
#define SHELP_INFO "info [<resource-type> [property-name=property-value]*]"
-#define SHELP_REMOVE "remove <resource-type> { <property-name>=<property-" \
- "value> }\n\t(global scope)\nremove <property-name> <property-value>" \
- "\n\t(resource scope)"
+#define SHELP_REMOVE "remove [-F] <resource-type> " \
+ "[ <property-name>=<property-value> ]*\n" \
+ "\t(global scope)\n" \
+ "remove <property-name> <property-value>\n" \
+ "\t(resource scope)"
#define SHELP_REVERT "revert [-F]"
#define SHELP_SELECT "select <resource-type> { <property-name>=" \
"<property-value> }"
@@ -128,6 +133,7 @@ extern int lex_lineno;
static struct help helptab[] = {
{ CMD_ADD, "add", HELP_RES_PROPS, SHELP_ADD, },
{ CMD_CANCEL, "cancel", 0, SHELP_CANCEL, },
+ { CMD_CLEAR, "clear", HELP_PROPS, SHELP_CLEAR, },
{ CMD_COMMIT, "commit", 0, SHELP_COMMIT, },
{ CMD_CREATE, "create", 0, SHELP_CREATE, },
{ CMD_DELETE, "delete", 0, SHELP_DELETE, },
@@ -163,6 +169,15 @@ static char *res_types[] = {
"limitpriv",
"bootargs",
"brand",
+ "dedicated-cpu",
+ "capped-memory",
+ ALIAS_MAXLWPS,
+ ALIAS_MAXSHMMEM,
+ ALIAS_MAXSHMIDS,
+ ALIAS_MAXMSGIDS,
+ ALIAS_MAXSEMIDS,
+ ALIAS_SHARES,
+ "scheduling-class",
NULL
};
@@ -189,6 +204,19 @@ static char *prop_types[] = {
"limitpriv",
"bootargs",
"brand",
+ "ncpus",
+ "importance",
+ "swap",
+ "locked",
+ ALIAS_SHARES,
+ ALIAS_MAXLWPS,
+ ALIAS_MAXSHMMEM,
+ ALIAS_MAXSHMIDS,
+ ALIAS_MAXMSGIDS,
+ ALIAS_MAXSEMIDS,
+ ALIAS_MAXLOCKEDMEM,
+ ALIAS_MAXSWAP,
+ "scheduling-class",
NULL
};
@@ -205,11 +233,12 @@ static char *prop_val_types[] = {
/*
* remove has a space afterwards because it has qualifiers; the other commands
- * that have qualifiers (add, select and set) don't need a space here because
+ * that have qualifiers (add, select, etc.) don't need a space here because
* they have their own _cmds[] lists below.
*/
static const char *global_scope_cmds[] = {
"add",
+ "clear",
"commit",
"create",
"delete",
@@ -233,6 +262,23 @@ static const char *add_cmds[] = {
"add rctl",
"add attr",
"add dataset",
+ "add dedicated-cpu",
+ "add capped-memory",
+ NULL
+};
+
+/*
+ * Tab-completion candidates for the "clear" command; cmd_cpl_fn offers
+ * these once the input line starts with "clear ".
+ */
+static const char *clear_cmds[] = {
+ "clear autoboot",
+ "clear pool",
+ "clear limitpriv",
+ "clear bootargs",
+ "clear scheduling-class",
+ "clear " ALIAS_MAXLWPS,
+ "clear " ALIAS_MAXSHMMEM,
+ "clear " ALIAS_MAXSHMIDS,
+ "clear " ALIAS_MAXMSGIDS,
+ "clear " ALIAS_MAXSEMIDS,
+ "clear " ALIAS_SHARES,
NULL
};
@@ -244,6 +290,8 @@ static const char *remove_cmds[] = {
"remove rctl ",
"remove attr ",
"remove dataset ",
+ "remove dedicated-cpu ",
+ "remove capped-memory ",
NULL
};
@@ -255,6 +303,8 @@ static const char *select_cmds[] = {
"select rctl ",
"select attr ",
"select dataset ",
+ "select dedicated-cpu",
+ "select capped-memory",
NULL
};
@@ -266,6 +316,13 @@ static const char *set_cmds[] = {
"set pool=",
"set limitpriv=",
"set bootargs=",
+ "set scheduling-class=",
+ "set " ALIAS_MAXLWPS "=",
+ "set " ALIAS_MAXSHMMEM "=",
+ "set " ALIAS_MAXSHMIDS "=",
+ "set " ALIAS_MAXMSGIDS "=",
+ "set " ALIAS_MAXSEMIDS "=",
+ "set " ALIAS_SHARES "=",
NULL
};
@@ -277,12 +334,22 @@ static const char *info_cmds[] = {
"info rctl ",
"info attr ",
"info dataset ",
+ "info capped-memory",
+ "info dedicated-cpu",
"info zonename",
"info zonepath",
"info autoboot",
"info pool",
"info limitpriv",
"info bootargs",
+ "info brand",
+ "info scheduling-class",
+ "info max-lwps",
+ "info max-shm-memory",
+ "info max-shm-ids",
+ "info max-msg-ids",
+ "info max-sem-ids",
+ "info cpu-shares",
NULL
};
@@ -298,6 +365,7 @@ static const char *fs_res_scope_cmds[] = {
"set raw=",
"set special=",
"set type=",
+ "clear raw",
NULL
};
@@ -366,6 +434,33 @@ static const char *dataset_res_scope_cmds[] = {
NULL
};
+/*
+ * Tab-completion candidates used by cmd_cpl_fn while the current resource
+ * scope is RT_DCPU. "importance" can be cleared; "ncpus" can only be set.
+ */
+static const char *pset_res_scope_cmds[] = {
+ "cancel",
+ "end",
+ "exit",
+ "help",
+ "info",
+ "set ncpus=",
+ "set importance=",
+ "clear importance",
+ NULL
+};
+
+/*
+ * Tab-completion candidates used by cmd_cpl_fn while the current resource
+ * scope is RT_MCAP. Each of physical, swap and locked can be set or
+ * cleared.
+ */
+static const char *mcap_res_scope_cmds[] = {
+ "cancel",
+ "end",
+ "exit",
+ "help",
+ "info",
+ "set physical=",
+ "set swap=",
+ "set locked=",
+ "clear physical",
+ "clear swap",
+ "clear locked",
+ NULL
+};
+
/* Global variables */
/* set early in main(), never modified thereafter, used all over the place */
@@ -406,6 +501,9 @@ static bool got_handle = FALSE;
/* initialized in do_interactive(), checked in initialize() */
static bool interactive_mode;
+/* set if configuring the global zone */
+static bool global_zone = FALSE;
+
/* set in main(), checked in multiple places */
static bool read_only_mode;
@@ -427,9 +525,13 @@ static struct zone_devtab old_devtab, in_progress_devtab;
static struct zone_rctltab old_rctltab, in_progress_rctltab;
static struct zone_attrtab old_attrtab, in_progress_attrtab;
static struct zone_dstab old_dstab, in_progress_dstab;
+static struct zone_psettab old_psettab, in_progress_psettab;
+static struct zone_mcaptab old_mcaptab, in_progress_mcaptab;
static GetLine *gl; /* The gl_get_line() resource object */
+static void bytes_to_units(char *str, char *buf, int bufsize);
+
/* Functions begin here */
static bool
@@ -469,6 +571,8 @@ CPL_MATCH_FN(cmd_cpl_fn)
*/
if (strncmp(line, "add ", MAX(MIN(word_end, 4), 1)) == 0)
return (add_stuff(cpl, line, add_cmds, word_end));
+ if (strncmp(line, "clear ", MAX(MIN(word_end, 6), 2)) == 0)
+ return (add_stuff(cpl, line, clear_cmds, word_end));
if (strncmp(line, "select ", MAX(MIN(word_end, 7), 3)) == 0)
return (add_stuff(cpl, line, select_cmds, word_end));
if (strncmp(line, "set ", MAX(MIN(word_end, 4), 3)) == 0)
@@ -494,6 +598,10 @@ CPL_MATCH_FN(cmd_cpl_fn)
return (add_stuff(cpl, line, attr_res_scope_cmds, word_end));
case RT_DATASET:
return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end));
+ case RT_DCPU:
+ return (add_stuff(cpl, line, pset_res_scope_cmds, word_end));
+ case RT_MCAP:
+ return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end));
}
return (0);
}
@@ -669,9 +777,8 @@ long_help(int cmd_num)
"flag can be used to force the\n\taction."));
case CMD_REMOVE:
return (gettext("Remove specified resource from "
- "configuration. Note that the curly\n\tbraces "
- "('{', '}') mean one or more of whatever "
- "is between them."));
+ "configuration. The -F flag can be used\n\tto "
+ "force the action."));
case CMD_SELECT:
(void) snprintf(line, sizeof (line),
gettext("Selects a resource to modify. "
@@ -684,6 +791,8 @@ long_help(int cmd_num)
return (line);
case CMD_SET:
return (gettext("Sets property values."));
+ case CMD_CLEAR:
+ return (gettext("Clears property values."));
case CMD_INFO:
return (gettext("Displays information about the "
"current configuration. If resource\n\ttype is "
@@ -870,6 +979,37 @@ usage(bool verbose, uint_t flags)
(void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
pt_to_str(PT_NAME), gettext("<name>"));
break;
+ case RT_DCPU:
+ (void) fprintf(fp, gettext("The '%s' resource scope "
+ "configures the 'pools' facility to dedicate\na "
+ "subset of the system's processors to this zone "
+ "while it is running.\n"),
+ rt_to_str(resource_scope));
+ (void) fprintf(fp, gettext("Valid commands:\n"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_NCPUS),
+ gettext("<unsigned integer | range>"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_IMPORTANCE),
+ gettext("<unsigned integer>"));
+ break;
+ case RT_MCAP:
+ (void) fprintf(fp, gettext("The '%s' resource scope is "
+ "used to set an upper limit (a cap) on the\n"
+ "amount of physical memory, swap space and locked "
+ "memory that can be used by\nthis zone.\n"),
+ rt_to_str(resource_scope));
+ (void) fprintf(fp, gettext("Valid commands:\n"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_PHYSICAL),
+ gettext("<qualified unsigned decimal>"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_SWAP),
+ gettext("<qualified unsigned decimal>"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_LOCKED),
+ gettext("<qualified unsigned decimal>"));
+ break;
}
(void) fprintf(fp, gettext("And from any resource scope, you "
"can:\n"));
@@ -928,11 +1068,12 @@ usage(bool verbose, uint_t flags)
}
if (flags & HELP_RESOURCES) {
(void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t"
- "%s\n\n",
+ "%s | %s | %s\n\n",
gettext("resource type"), rt_to_str(RT_FS),
rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE),
rt_to_str(RT_RCTL), rt_to_str(RT_ATTR),
- rt_to_str(RT_DATASET));
+ rt_to_str(RT_DATASET), rt_to_str(RT_DCPU),
+ rt_to_str(RT_MCAP));
}
if (flags & HELP_PROPS) {
(void) fprintf(fp, gettext("For resource type ... there are "
@@ -951,6 +1092,20 @@ usage(bool verbose, uint_t flags)
pt_to_str(PT_POOL));
(void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
pt_to_str(PT_LIMITPRIV));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_SCHED));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_MAXLWPS));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_MAXSHMMEM));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_MAXSHMIDS));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_MAXMSGIDS));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_MAXSEMIDS));
+ (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"),
+ pt_to_str(PT_SHARES));
(void) fprintf(fp, "\t%s\t\t%s, %s, %s, %s\n", rt_to_str(RT_FS),
pt_to_str(PT_DIR), pt_to_str(PT_SPECIAL),
pt_to_str(PT_RAW), pt_to_str(PT_TYPE),
@@ -968,6 +1123,11 @@ usage(bool verbose, uint_t flags)
pt_to_str(PT_VALUE));
(void) fprintf(fp, "\t%s\t\t%s\n", rt_to_str(RT_DATASET),
pt_to_str(PT_NAME));
+ (void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU),
+ pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE));
+ (void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP),
+ pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP),
+ pt_to_str(PT_LOCKED));
}
if (need_to_close)
(void) pclose(fp);
@@ -1040,6 +1200,33 @@ initialize(bool handle_expected)
" Unable to continue", zone, brandname);
exit(Z_ERR);
}
+ } else if (global_zone && err == Z_NO_ZONE && !got_handle &&
+ !read_only_mode) {
+ /*
+ * We implicitly create the global zone config if it
+ * doesn't exist.
+ */
+ zone_dochandle_t tmphandle;
+
+ if ((tmphandle = zonecfg_init_handle()) == NULL) {
+ zone_perror(execname, Z_NOMEM, TRUE);
+ exit(Z_ERR);
+ }
+
+ err = zonecfg_get_template_handle("SUNWblank", zone,
+ tmphandle);
+
+ if (err != Z_OK) {
+ zonecfg_fini_handle(tmphandle);
+ zone_perror("SUNWblank", err, TRUE);
+ return (err);
+ }
+
+ need_to_commit = TRUE;
+ zonecfg_fini_handle(handle);
+ handle = tmphandle;
+ got_handle = TRUE;
+
} else {
zone_perror(zone, err, handle_expected || got_handle);
if (err == Z_NO_ZONE && !got_handle &&
@@ -1373,10 +1560,13 @@ export_func(cmd_t *cmd)
struct zone_attrtab attrtab;
struct zone_rctltab rctltab;
struct zone_dstab dstab;
+ struct zone_psettab psettab;
+ struct zone_mcaptab mcaptab;
struct zone_rctlvaltab *valptr;
int err, arg;
char zonepath[MAXPATHLEN], outfile[MAXPATHLEN], pool[MAXNAMELEN];
char bootargs[BOOTARGS_MAX];
+ char sched[MAXNAMELEN];
char brand[MAXNAMELEN];
char *limitpriv;
FILE *of;
@@ -1456,6 +1646,10 @@ export_func(cmd_t *cmd)
free(limitpriv);
}
+ if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
+ strlen(sched) > 0)
+ (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_SCHED), sched);
if ((err = zonecfg_setipdent(handle)) != Z_OK) {
zone_perror(zone, err, FALSE);
@@ -1576,6 +1770,33 @@ export_func(cmd_t *cmd)
}
(void) zonecfg_enddsent(handle);
+ if (zonecfg_getpsetent(handle, &psettab) == Z_OK) {
+ (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD),
+ rt_to_str(RT_DCPU));
+ if (strcmp(psettab.zone_ncpu_min, psettab.zone_ncpu_max) == 0)
+ (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_NCPUS), psettab.zone_ncpu_max);
+ else
+ (void) fprintf(of, "%s %s=%s-%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_NCPUS), psettab.zone_ncpu_min,
+ psettab.zone_ncpu_max);
+ if (psettab.zone_importance[0] != '\0')
+ (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_IMPORTANCE), psettab.zone_importance);
+ (void) fprintf(of, "%s\n", cmd_to_str(CMD_END));
+ }
+
+ if (zonecfg_getmcapent(handle, &mcaptab) == Z_OK) {
+ char buf[128];
+
+ (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD),
+ rt_to_str(RT_MCAP));
+ bytes_to_units(mcaptab.zone_physmem_cap, buf, sizeof (buf));
+ (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_PHYSICAL), buf);
+ (void) fprintf(of, "%s\n", cmd_to_str(CMD_END));
+ }
+
done:
if (need_to_close)
(void) fclose(of);
@@ -1641,6 +1862,10 @@ static void
add_resource(cmd_t *cmd)
{
int type;
+ struct zone_psettab tmp_psettab;
+ struct zone_mcaptab tmp_mcaptab;
+ uint64_t tmp_mcap;
+ char pool[MAXNAMELEN];
if ((type = cmd->cmd_res_type) == RT_UNKNOWN) {
long_usage(CMD_ADD, TRUE);
@@ -1667,6 +1892,12 @@ add_resource(cmd_t *cmd)
bzero(&in_progress_devtab, sizeof (in_progress_devtab));
return;
case RT_RCTL:
+ if (global_zone)
+ zerr(gettext("WARNING: Setting a global zone resource "
+ "control too low could deny\nservice "
+ "to even the root user; "
+ "this could render the system impossible\n"
+ "to administer. Please use caution."));
bzero(&in_progress_rctltab, sizeof (in_progress_rctltab));
return;
case RT_ATTR:
@@ -1675,6 +1906,48 @@ add_resource(cmd_t *cmd)
case RT_DATASET:
bzero(&in_progress_dstab, sizeof (in_progress_dstab));
return;
+ case RT_DCPU:
+ /* Make sure there isn't already a cpu-set entry. */
+ if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
+ zerr(gettext("The %s resource already exists."),
+ rt_to_str(RT_DCPU));
+ goto bad;
+ }
+
+ /* Make sure the pool property isn't set. */
+ if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK &&
+ strlen(pool) > 0) {
+ zerr(gettext("The %s property is already set. "
+ "A persistent pool is incompatible with\nthe %s "
+ "resource."),
+ pt_to_str(PT_POOL), rt_to_str(RT_DCPU));
+ goto bad;
+ }
+
+ bzero(&in_progress_psettab, sizeof (in_progress_psettab));
+ return;
+ case RT_MCAP:
+ /*
+ * Make sure there isn't already a mem-cap entry or max-swap
+ * or max-locked rctl.
+ */
+ if (zonecfg_lookup_mcap(handle, &tmp_mcaptab) == Z_OK ||
+ zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &tmp_mcap)
+ == Z_OK ||
+ zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM,
+ &tmp_mcap) == Z_OK) {
+ zerr(gettext("The %s resource or a related resource "
+ "control already exists."), rt_to_str(RT_MCAP));
+ goto bad;
+ }
+ if (global_zone)
+ zerr(gettext("WARNING: Setting a global zone memory "
+ "cap too low could deny\nservice "
+ "to even the root user; "
+ "this could render the system impossible\n"
+ "to administer. Please use caution."));
+ bzero(&in_progress_mcaptab, sizeof (in_progress_mcaptab));
+ return;
default:
zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
long_usage(CMD_ADD, TRUE);
@@ -1871,6 +2144,30 @@ add_property(cmd_t *cmd)
}
}
+/*
+ * Returns B_TRUE when the global zone is being configured and 'type' is a
+ * resource type (RT_FS, RT_IPD, RT_NET, RT_DEVICE, RT_ATTR or RT_DATASET)
+ * that add_func() refuses to add to the global zone. Always returns
+ * B_FALSE for any other zone.
+ */
+static boolean_t
+gz_invalid_resource(int type)
+{
+ return (global_zone && (type == RT_FS || type == RT_IPD ||
+ type == RT_NET || type == RT_DEVICE || type == RT_ATTR ||
+ type == RT_DATASET));
+}
+
+/*
+ * Returns B_TRUE when the global zone is being configured and 'type' is
+ * one of the RT_* codes listed below. This is the RT_* counterpart of
+ * gz_invalid_property(), which tests the matching PT_* codes.
+ * NOTE(review): presumably these are the global properties that may not be
+ * changed for the global zone -- confirm against the callers.
+ */
+static boolean_t
+gz_invalid_rt_property(int type)
+{
+ return (global_zone && (type == RT_ZONENAME || type == RT_ZONEPATH ||
+ type == RT_AUTOBOOT || type == RT_LIMITPRIV ||
+ type == RT_BOOTARGS || type == RT_BRAND || type == RT_SCHED));
+}
+
+/*
+ * Returns B_TRUE when the global zone is being configured and 'type' is
+ * one of the PT_* codes listed below. This is the PT_* counterpart of
+ * gz_invalid_rt_property(), which tests the matching RT_* codes.
+ * NOTE(review): presumably these are the properties that may not be
+ * changed for the global zone -- confirm against the callers.
+ */
+static boolean_t
+gz_invalid_property(int type)
+{
+ return (global_zone && (type == PT_ZONENAME || type == PT_ZONEPATH ||
+ type == PT_AUTOBOOT || type == PT_LIMITPRIV ||
+ type == PT_BOOTARGS || type == PT_BRAND || type == PT_SCHED));
+}
+
void
add_func(cmd_t *cmd)
{
@@ -1900,6 +2197,13 @@ add_func(cmd_t *cmd)
if (initialize(TRUE) != Z_OK)
return;
if (global_scope) {
+ if (gz_invalid_resource(cmd->cmd_res_type)) {
+ zerr(gettext("Cannot add a %s resource to the "
+ "global zone."), rt_to_str(cmd->cmd_res_type));
+ saw_error = TRUE;
+ return;
+ }
+
global_scope = FALSE;
resource_scope = cmd->cmd_res_type;
end_op = CMD_ADD;
@@ -2273,26 +2577,85 @@ fill_in_dstab(cmd_t *cmd, struct zone_dstab *dstab, bool fill_in_only)
}
static void
-remove_resource(cmd_t *cmd)
+remove_aliased_rctl(int type, char *name)
{
- int err, type;
- struct zone_fstab fstab;
- struct zone_nwiftab nwiftab;
- struct zone_devtab devtab;
- struct zone_attrtab attrtab;
- struct zone_rctltab rctltab;
- struct zone_dstab dstab;
+ int err;
+ uint64_t tmp;
- if ((type = cmd->cmd_res_type) == RT_UNKNOWN) {
- long_usage(CMD_REMOVE, TRUE);
+ if ((err = zonecfg_get_aliased_rctl(handle, name, &tmp)) != Z_OK) {
+ zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type),
+ zonecfg_strerror(err));
+ saw_error = TRUE;
return;
}
+ if ((err = zonecfg_rm_aliased_rctl(handle, name)) != Z_OK) {
+ zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type),
+ zonecfg_strerror(err));
+ saw_error = TRUE;
+ } else {
+ need_to_commit = TRUE;
+ }
+}
- if (initialize(TRUE) != Z_OK)
- return;
+/*
+ * Decide whether an unqualified removal of every 'rsrc' resource should
+ * go ahead.
+ *
+ * Scans the command's arguments for -F (force), then counts the configured
+ * resources named rsrc. Returns B_FALSE if an unrecognized option is
+ * given, if there are no such resources (Z_NO_ENTRY is reported), if there
+ * is more than one and the session is neither forced nor interactive
+ * (saw_error is set), or if the interactive confirmation is not answered
+ * with yes. Returns B_TRUE otherwise, including when there is exactly one
+ * resource, in which case no confirmation is asked for.
+ */
+static boolean_t
+prompt_remove_resource(cmd_t *cmd, char *rsrc)
+{
+ int num;
+ int answer;
+ int arg;
+ boolean_t force = B_FALSE;
+ char prompt[128];
+
+ /* Restart option scanning for this command's argument vector. */
+ optind = 0;
+ while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "F")) != EOF) {
+ switch (arg) {
+ case 'F':
+ force = B_TRUE;
+ break;
+ default:
+ return (B_FALSE);
+ }
+ }
+
+ num = zonecfg_num_resources(handle, rsrc);
+
+ if (num == 0) {
+ z_cmd_rt_perror(CMD_REMOVE, cmd->cmd_res_type, Z_NO_ENTRY,
+ TRUE);
+ return (B_FALSE);
+ }
+ if (num > 1 && !force) {
+ /* Without a user to ask, removing several resources needs -F. */
+ if (!interactive_mode) {
+ zerr(gettext("There are multiple instances of this "
+ "resource. Either qualify the resource to\n"
+ "remove a single instance or use the -F option to "
+ "remove all instances."));
+ saw_error = TRUE;
+ return (B_FALSE);
+ }
+ (void) snprintf(prompt, sizeof (prompt), gettext(
+ "Are you sure you want to remove ALL '%s' resources"),
+ rsrc);
+ answer = ask_yesno(FALSE, prompt);
+ if (answer == -1) {
+ zerr(gettext("Resource incomplete."));
+ return (B_FALSE);
+ }
+ if (answer != 1)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+static void
+remove_fs(cmd_t *cmd)
+{
+ int err;
+
+ /* traditional, qualified fs removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_fstab fstab;
- switch (type) {
- case RT_FS:
if ((err = fill_in_fstab(cmd, &fstab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE);
return;
@@ -2303,13 +2666,36 @@ remove_resource(cmd_t *cmd)
need_to_commit = TRUE;
zonecfg_free_fs_option_list(fstab.zone_fs_options);
return;
- case RT_IPD:
- if (state_atleast(ZONE_STATE_INSTALLED)) {
- zerr(gettext("Zone %s already installed; %s %s not "
- "allowed."), zone, cmd_to_str(CMD_REMOVE),
- rt_to_str(RT_IPD));
- return;
- }
+ }
+
+ /*
+ * unqualified fs removal. remove all fs's but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "fs"))
+ return;
+
+ if ((err = zonecfg_del_all_resources(handle, "fs")) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
+remove_ipd(cmd_t *cmd)
+{
+ int err;
+
+ if (state_atleast(ZONE_STATE_INSTALLED)) {
+ zerr(gettext("Zone %s already installed; %s %s not allowed."),
+ zone, cmd_to_str(CMD_REMOVE), rt_to_str(RT_IPD));
+ return;
+ }
+
+ /* traditional, qualified ipd removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_fstab fstab;
+
if ((err = fill_in_ipdtab(cmd, &fstab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE);
return;
@@ -2319,7 +2705,31 @@ remove_resource(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
- case RT_NET:
+ }
+
+ /*
+ * unqualified ipd removal. remove all ipds but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "inherit-pkg-dir"))
+ return;
+
+ if ((err = zonecfg_del_all_resources(handle, "inherit-pkg-dir"))
+ != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
+remove_net(cmd_t *cmd)
+{
+ int err;
+
+ /* traditional, qualified net removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_nwiftab nwiftab;
+
if ((err = fill_in_nwiftab(cmd, &nwiftab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE);
return;
@@ -2329,7 +2739,30 @@ remove_resource(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
- case RT_DEVICE:
+ }
+
+ /*
+ * unqualified net removal. remove all nets but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "net"))
+ return;
+
+ if ((err = zonecfg_del_all_resources(handle, "net")) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
+remove_device(cmd_t *cmd)
+{
+ int err;
+
+ /* traditional, qualified device removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_devtab devtab;
+
if ((err = fill_in_devtab(cmd, &devtab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE);
return;
@@ -2339,18 +2772,30 @@ remove_resource(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
- case RT_RCTL:
- if ((err = fill_in_rctltab(cmd, &rctltab, FALSE)) != Z_OK) {
- z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE);
- return;
- }
- if ((err = zonecfg_delete_rctl(handle, &rctltab)) != Z_OK)
- z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE);
- else
- need_to_commit = TRUE;
- zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
+ }
+
+ /*
+ * unqualified device removal. remove all devices but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "device"))
return;
- case RT_ATTR:
+
+ if ((err = zonecfg_del_all_resources(handle, "device")) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
+remove_attr(cmd_t *cmd)
+{
+ int err;
+
+ /* traditional, qualified attr removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_attrtab attrtab;
+
if ((err = fill_in_attrtab(cmd, &attrtab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE);
return;
@@ -2360,7 +2805,30 @@ remove_resource(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
- case RT_DATASET:
+ }
+
+ /*
+ * unqualified attr removal. remove all attrs but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "attr"))
+ return;
+
+ if ((err = zonecfg_del_all_resources(handle, "attr")) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
+remove_dataset(cmd_t *cmd)
+{
+ int err;
+
+ /* traditional, qualified dataset removal */
+ if (cmd->cmd_prop_nv_pairs > 0) {
+ struct zone_dstab dstab;
+
if ((err = fill_in_dstab(cmd, &dstab, FALSE)) != Z_OK) {
z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE);
return;
@@ -2370,6 +2838,177 @@ remove_resource(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
+ }
+
+ /*
+ * unqualified dataset removal. remove all datasets but prompt if more
+ * than one.
+ */
+ if (!prompt_remove_resource(cmd, "dataset"))
+ return;
+
+ if ((err = zonecfg_del_all_resources(handle, "dataset")) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+/*
+ * Remove rctl resources.  With name/value pairs on the command line the
+ * single matching rctl is deleted; with none, all rctl resources are
+ * deleted once prompt_remove_resource() approves.
+ */
+static void
+remove_rctl(cmd_t *cmd)
+{
+	struct zone_rctltab	target;
+	int			rc;
+
+	if (cmd->cmd_prop_nv_pairs <= 0) {
+		/* unqualified removal: every rctl goes, subject to prompt */
+		if (!prompt_remove_resource(cmd, "rctl"))
+			return;
+
+		rc = zonecfg_del_all_resources(handle, "rctl");
+		if (rc == Z_OK)
+			need_to_commit = TRUE;
+		else
+			z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, rc, TRUE);
+		return;
+	}
+
+	/* qualified removal: build the lookup key from the nv pairs */
+	rc = fill_in_rctltab(cmd, &target, FALSE);
+	if (rc != Z_OK) {
+		z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, rc, TRUE);
+		return;
+	}
+
+	rc = zonecfg_delete_rctl(handle, &target);
+	if (rc == Z_OK)
+		need_to_commit = TRUE;
+	else
+		z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, rc, TRUE);
+
+	/* the value list is released whether or not the delete worked */
+	zonecfg_free_rctl_value_list(target.zone_rctl_valptr);
+}
+
+/*
+ * Remove the zone's pset (RT_DCPU) entry.  The entry is looked up first
+ * so that a missing entry is reported with the lookup's error code.
+ */
+static void
+remove_pset()
+{
+	struct zone_psettab	existing;
+	int			rc;
+
+	rc = zonecfg_lookup_pset(handle, &existing);
+	if (rc == Z_OK)
+		rc = zonecfg_delete_pset(handle);
+
+	if (rc == Z_OK)
+		need_to_commit = TRUE;
+	else
+		z_cmd_rt_perror(CMD_REMOVE, RT_DCPU, rc, TRUE);
+}
+
+/*
+ * Remove the RT_MCAP resource.  It is made up of the mcap entry plus the
+ * ALIAS_MAXSWAP and ALIAS_MAXLOCKEDMEM aliased rctls; each piece that
+ * exists is removed.  If none exist an error is reported.  If any removal
+ * fails, need_to_commit is forced back to FALSE.
+ */
+static void
+remove_mcap()
+{
+	struct zone_mcaptab	cap;
+	uint64_t		ignored;
+	boolean_t		failed = B_FALSE;
+	int			phys_rc, swap_rc, locked_rc;
+	int			rc;
+
+	phys_rc = zonecfg_lookup_mcap(handle, &cap);
+	swap_rc = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &ignored);
+	locked_rc = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM,
+	    &ignored);
+
+	/* nothing to remove unless at least one of the pieces is present */
+	if (phys_rc != Z_OK && swap_rc != Z_OK && locked_rc != Z_OK) {
+		zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_MCAP),
+		    zonecfg_strerror(Z_NO_RESOURCE_TYPE));
+		saw_error = TRUE;
+		return;
+	}
+
+	if (phys_rc == Z_OK) {
+		rc = zonecfg_delete_mcap(handle);
+		if (rc == Z_OK) {
+			need_to_commit = TRUE;
+		} else {
+			z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, rc, TRUE);
+			failed = B_TRUE;
+		}
+	}
+
+	if (swap_rc == Z_OK) {
+		rc = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXSWAP);
+		if (rc == Z_OK) {
+			need_to_commit = TRUE;
+		} else {
+			z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, rc, TRUE);
+			failed = B_TRUE;
+		}
+	}
+
+	if (locked_rc == Z_OK) {
+		rc = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM);
+		if (rc == Z_OK) {
+			need_to_commit = TRUE;
+		} else {
+			z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, rc, TRUE);
+			failed = B_TRUE;
+		}
+	}
+
+	if (failed)
+		need_to_commit = FALSE;
+}
+
+static void
+remove_resource(cmd_t *cmd)
+{
+ int type;
+ int arg;
+
+ if ((type = cmd->cmd_res_type) == RT_UNKNOWN) {
+ long_usage(CMD_REMOVE, TRUE);
+ return;
+ }
+
+ optind = 0;
+ while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?F")) != EOF) {
+ switch (arg) {
+ case '?':
+ longer_usage(CMD_REMOVE);
+ return;
+ case 'F':
+ break;
+ default:
+ short_usage(CMD_REMOVE);
+ return;
+ }
+ }
+
+ if (initialize(TRUE) != Z_OK)
+ return;
+
+ switch (type) {
+ case RT_FS:
+ remove_fs(cmd);
+ return;
+ case RT_IPD:
+ remove_ipd(cmd);
+ return;
+ case RT_NET:
+ remove_net(cmd);
+ return;
+ case RT_DEVICE:
+ remove_device(cmd);
+ return;
+ case RT_RCTL:
+ remove_rctl(cmd);
+ return;
+ case RT_ATTR:
+ remove_attr(cmd);
+ return;
+ case RT_DATASET:
+ remove_dataset(cmd);
+ return;
+ case RT_DCPU:
+ remove_pset();
+ return;
+ case RT_MCAP:
+ remove_mcap();
+ return;
default:
zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
long_usage(CMD_REMOVE, TRUE);
@@ -2513,16 +3152,175 @@ remove_func(cmd_t *cmd)
assert(cmd != NULL);
- if (global_scope)
+ if (global_scope) {
+ if (gz_invalid_resource(cmd->cmd_res_type)) {
+ zerr(gettext("%s is not a valid resource for the "
+ "global zone."), rt_to_str(cmd->cmd_res_type));
+ saw_error = TRUE;
+ return;
+ }
remove_resource(cmd);
- else
+ } else {
remove_property(cmd);
+ }
+}
+
+/*
+ * Handle "clear <property>" inside a resource scope.  Only a few
+ * resource/property pairs can be cleared: fs raw, the RT_DCPU importance,
+ * and the physical, swap and locked properties of RT_MCAP.  Any other
+ * pair is rejected with Z_CLEAR_DISALLOW.
+ */
+static void
+clear_property(cmd_t *cmd)
+{
+	int	scope = resource_scope;
+	int	prop = cmd->cmd_res_type;
+
+	if (scope == RT_UNKNOWN || prop == PT_UNKNOWN) {
+		long_usage(CMD_CLEAR, TRUE);
+		return;
+	}
+
+	if (initialize(TRUE) != Z_OK)
+		return;
+
+	if (scope == RT_FS && prop == PT_RAW) {
+		in_progress_fstab.zone_fs_raw[0] = '\0';
+		need_to_commit = TRUE;
+		return;
+	}
+
+	if (scope == RT_DCPU && prop == PT_IMPORTANCE) {
+		in_progress_psettab.zone_importance[0] = '\0';
+		need_to_commit = TRUE;
+		return;
+	}
+
+	if (scope == RT_MCAP) {
+		if (prop == PT_PHYSICAL) {
+			in_progress_mcaptab.zone_physmem_cap[0] = '\0';
+			need_to_commit = TRUE;
+			return;
+		}
+		/* swap and locked are kept as aliased rctls */
+		if (prop == PT_SWAP) {
+			remove_aliased_rctl(PT_SWAP, ALIAS_MAXSWAP);
+			return;
+		}
+		if (prop == PT_LOCKED) {
+			remove_aliased_rctl(PT_LOCKED, ALIAS_MAXLOCKEDMEM);
+			return;
+		}
+	}
+
+	/* everything else falls through to the "cannot clear" error */
+	zone_perror(pt_to_str(prop), Z_CLEAR_DISALLOW, TRUE);
+}
+
+/*
+ * Handle "clear <property>" at the global scope.  zonename, zonepath and
+ * brand are refused with Z_CLEAR_DISALLOW; autoboot is reset to B_FALSE;
+ * pool, limitpriv, bootargs and sched are set to NULL; the rctl alias
+ * properties are removed through remove_aliased_rctl().
+ */
+static void
+clear_global(cmd_t *cmd)
+{
+	int	prop = cmd->cmd_res_type;
+	int	rt;
+	int	rc;
+
+	if (prop == RT_UNKNOWN) {
+		long_usage(CMD_CLEAR, TRUE);
+		return;
+	}
+
+	if (initialize(TRUE) != Z_OK)
+		return;
+
+	switch (prop) {
+	case PT_ZONENAME:
+	case PT_ZONEPATH:
+	case PT_BRAND:
+		zone_perror(pt_to_str(prop), Z_CLEAR_DISALLOW, TRUE);
+		return;
+
+	/* rctl aliases: the helper reports errors and flags the commit */
+	case PT_MAXLWPS:
+		remove_aliased_rctl(PT_MAXLWPS, ALIAS_MAXLWPS);
+		return;
+	case PT_MAXSHMMEM:
+		remove_aliased_rctl(PT_MAXSHMMEM, ALIAS_MAXSHMMEM);
+		return;
+	case PT_MAXSHMIDS:
+		remove_aliased_rctl(PT_MAXSHMIDS, ALIAS_MAXSHMIDS);
+		return;
+	case PT_MAXMSGIDS:
+		remove_aliased_rctl(PT_MAXMSGIDS, ALIAS_MAXMSGIDS);
+		return;
+	case PT_MAXSEMIDS:
+		remove_aliased_rctl(PT_MAXSEMIDS, ALIAS_MAXSEMIDS);
+		return;
+	case PT_SHARES:
+		remove_aliased_rctl(PT_SHARES, ALIAS_SHARES);
+		return;
+
+	/* simple properties: record the status, report it after the switch */
+	case PT_AUTOBOOT:
+		/* false is default; we'll treat as equivalent to clearing */
+		rt = RT_AUTOBOOT;
+		rc = zonecfg_set_autoboot(handle, B_FALSE);
+		break;
+	case PT_POOL:
+		rt = RT_POOL;
+		rc = zonecfg_set_pool(handle, NULL);
+		break;
+	case PT_LIMITPRIV:
+		rt = RT_LIMITPRIV;
+		rc = zonecfg_set_limitpriv(handle, NULL);
+		break;
+	case PT_BOOTARGS:
+		rt = RT_BOOTARGS;
+		rc = zonecfg_set_bootargs(handle, NULL);
+		break;
+	case PT_SCHED:
+		rt = RT_SCHED;
+		rc = zonecfg_set_sched(handle, NULL);
+		break;
+
+	default:
+		zone_perror(pt_to_str(prop), Z_NO_PROPERTY_TYPE, TRUE);
+		long_usage(CMD_CLEAR, TRUE);
+		usage(FALSE, HELP_PROPS);
+		return;
+	}
+
+	if (rc == Z_OK)
+		need_to_commit = TRUE;
+	else
+		z_cmd_rt_perror(CMD_CLEAR, rt, rc, TRUE);
+}
+
+/*
+ * Entry point for the "clear" command.  Does nothing when
+ * zone_is_read_only() says the zone is read-only.  Inside a resource
+ * scope the work goes to clear_property(); at the global scope the
+ * property is checked with gz_invalid_property() and then passed to
+ * clear_global().
+ */
+void
+clear_func(cmd_t *cmd)
+{
+	if (zone_is_read_only(CMD_CLEAR))
+		return;
+
+	assert(cmd != NULL);
+
+	if (!global_scope) {
+		clear_property(cmd);
+		return;
+	}
+
+	if (gz_invalid_property(cmd->cmd_res_type)) {
+		zerr(gettext("%s is not a valid property for the "
+		    "global zone."), pt_to_str(cmd->cmd_res_type));
+		saw_error = TRUE;
+		return;
+	}
+
+	clear_global(cmd);
}
void
select_func(cmd_t *cmd)
{
- int type, err;
+ int type, err, res;
+ uint64_t limit;
if (zone_is_read_only(CMD_SELECT))
return;
@@ -2612,6 +3410,32 @@ select_func(cmd_t *cmd)
bcopy(&old_dstab, &in_progress_dstab,
sizeof (struct zone_dstab));
return;
+ case RT_DCPU:
+ if ((err = zonecfg_lookup_pset(handle, &old_psettab)) != Z_OK) {
+ z_cmd_rt_perror(CMD_SELECT, RT_DCPU, err, TRUE);
+ global_scope = TRUE;
+ }
+ bcopy(&old_psettab, &in_progress_psettab,
+ sizeof (struct zone_psettab));
+ return;
+ case RT_MCAP:
+ /* if none of these exist, there is no resource to select */
+ if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK &&
+ zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &limit)
+ != Z_OK &&
+ zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, &limit)
+ != Z_OK) {
+ z_cmd_rt_perror(CMD_SELECT, RT_MCAP, Z_NO_RESOURCE_TYPE,
+ TRUE);
+ global_scope = TRUE;
+ }
+ if (res == Z_OK)
+ bcopy(&old_mcaptab, &in_progress_mcaptab,
+ sizeof (struct zone_mcaptab));
+ else
+ bzero(&in_progress_mcaptab,
+ sizeof (in_progress_mcaptab));
+ return;
default:
zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
long_usage(CMD_SELECT, TRUE);
@@ -2731,6 +3555,49 @@ valid_fs_type(const char *type)
return (B_TRUE);
}
+/*
+ * Set the rctl behind 'alias' from the string 's' for property
+ * 'prop_type'.  PT_MAXSHMMEM values are first converted from a scaled
+ * memory size to a plain number.  Each failure is reported and sets
+ * saw_error; success sets need_to_commit.
+ */
+static void
+set_aliased_rctl(char *alias, int prop_type, char *s)
+{
+	char		numeric[128];
+	uint64_t	limit;
+	int		rc;
+
+	/* every alias except shares gets the warning when global_zone is set */
+	if (global_zone && strcmp(alias, ALIAS_SHARES) != 0)
+		zerr(gettext("WARNING: Setting a global zone resource "
+		    "control too low could deny\nservice "
+		    "to even the root user; "
+		    "this could render the system impossible\n"
+		    "to administer. Please use caution."));
+
+	/* convert memory based properties */
+	if (prop_type == PT_MAXSHMMEM) {
+		if (!zonecfg_valid_memlimit(s, &limit)) {
+			zerr(gettext("A non-negative number with a required "
+			    "scale suffix (K, M, G or T) was expected\nhere."));
+			saw_error = TRUE;
+			return;
+		}
+
+		(void) snprintf(numeric, sizeof (numeric), "%llu", limit);
+		s = numeric;
+	}
+
+	if (!zonecfg_aliased_rctl_ok(handle, alias)) {
+		zone_perror(pt_to_str(prop_type), Z_ALIAS_DISALLOW, FALSE);
+		saw_error = TRUE;
+		return;
+	}
+
+	if (!zonecfg_valid_alias_limit(alias, s, &limit)) {
+		zerr(gettext("%s property is out of range."),
+		    pt_to_str(prop_type));
+		saw_error = TRUE;
+		return;
+	}
+
+	rc = zonecfg_set_aliased_rctl(handle, alias, limit);
+	if (rc != Z_OK) {
+		zone_perror(zone, rc, TRUE);
+		saw_error = TRUE;
+		return;
+	}
+
+	need_to_commit = TRUE;
+}
+
void
set_func(cmd_t *cmd)
{
@@ -2739,6 +3606,9 @@ set_func(cmd_t *cmd)
property_value_ptr_t pp;
boolean_t autoboot;
boolean_t force_set = FALSE;
+ size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap);
+ uint64_t mem_cap, mem_limit;
+ struct zone_psettab tmp_psettab;
if (zone_is_read_only(CMD_SET))
return;
@@ -2762,6 +3632,13 @@ set_func(cmd_t *cmd)
prop_type = cmd->cmd_prop_name[0];
if (global_scope) {
+ if (gz_invalid_property(prop_type)) {
+ zerr(gettext("%s is not a valid property for the "
+ "global zone."), pt_to_str(prop_type));
+ saw_error = TRUE;
+ return;
+ }
+
if (prop_type == PT_ZONENAME) {
res_type = RT_ZONENAME;
} else if (prop_type == PT_ZONEPATH) {
@@ -2776,6 +3653,20 @@ set_func(cmd_t *cmd)
res_type = RT_LIMITPRIV;
} else if (prop_type == PT_BOOTARGS) {
res_type = RT_BOOTARGS;
+ } else if (prop_type == PT_SCHED) {
+ res_type = RT_SCHED;
+ } else if (prop_type == PT_MAXLWPS) {
+ res_type = RT_MAXLWPS;
+ } else if (prop_type == PT_MAXSHMMEM) {
+ res_type = RT_MAXSHMMEM;
+ } else if (prop_type == PT_MAXSHMIDS) {
+ res_type = RT_MAXSHMIDS;
+ } else if (prop_type == PT_MAXMSGIDS) {
+ res_type = RT_MAXMSGIDS;
+ } else if (prop_type == PT_MAXSEMIDS) {
+ res_type = RT_MAXSEMIDS;
+ } else if (prop_type == PT_SHARES) {
+ res_type = RT_SHARES;
} else {
zerr(gettext("Cannot set a resource-specific property "
"from the global scope."));
@@ -2899,6 +3790,24 @@ set_func(cmd_t *cmd)
need_to_commit = TRUE;
return;
case RT_POOL:
+ /* don't allow use of the reserved temporary pool names */
+ if (strncmp("SUNW", prop_id, 4) == 0) {
+ zerr(gettext("pool names starting with SUNW are "
+ "reserved."));
+ saw_error = TRUE;
+ return;
+ }
+
+ /* can't set pool if dedicated-cpu exists */
+ if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
+ zerr(gettext("The %s resource already exists. "
+ "A persistent pool is incompatible\nwith the %s "
+ "resource."), rt_to_str(RT_DCPU),
+ rt_to_str(RT_DCPU));
+ saw_error = TRUE;
+ return;
+ }
+
if ((err = zonecfg_set_pool(handle, prop_id)) != Z_OK)
zone_perror(zone, err, TRUE);
else
@@ -2916,6 +3825,30 @@ set_func(cmd_t *cmd)
else
need_to_commit = TRUE;
return;
+ case RT_SCHED:
+ if ((err = zonecfg_set_sched(handle, prop_id)) != Z_OK)
+ zone_perror(zone, err, TRUE);
+ else
+ need_to_commit = TRUE;
+ return;
+ case RT_MAXLWPS:
+ set_aliased_rctl(ALIAS_MAXLWPS, prop_type, prop_id);
+ return;
+ case RT_MAXSHMMEM:
+ set_aliased_rctl(ALIAS_MAXSHMMEM, prop_type, prop_id);
+ return;
+ case RT_MAXSHMIDS:
+ set_aliased_rctl(ALIAS_MAXSHMIDS, prop_type, prop_id);
+ return;
+ case RT_MAXMSGIDS:
+ set_aliased_rctl(ALIAS_MAXMSGIDS, prop_type, prop_id);
+ return;
+ case RT_MAXSEMIDS:
+ set_aliased_rctl(ALIAS_MAXSEMIDS, prop_type, prop_id);
+ return;
+ case RT_SHARES:
+ set_aliased_rctl(ALIAS_SHARES, prop_type, prop_id);
+ return;
case RT_FS:
switch (prop_type) {
case PT_DIR:
@@ -3095,6 +4028,146 @@ set_func(cmd_t *cmd)
long_usage(CMD_SET, TRUE);
usage(FALSE, HELP_PROPS);
return;
+ case RT_DCPU:
+ switch (prop_type) {
+ char *lowp, *highp;
+
+ case PT_NCPUS:
+ lowp = prop_id;
+ if ((highp = strchr(prop_id, '-')) != NULL)
+ *highp++ = '\0';
+ else
+ highp = lowp;
+
+ /* Make sure the input makes sense. */
+ if (!zonecfg_valid_ncpus(lowp, highp)) {
+ zerr(gettext("%s property is out of range."),
+ pt_to_str(PT_NCPUS));
+ saw_error = TRUE;
+ return;
+ }
+
+ (void) strlcpy(
+ in_progress_psettab.zone_ncpu_min, lowp,
+ sizeof (in_progress_psettab.zone_ncpu_min));
+ (void) strlcpy(
+ in_progress_psettab.zone_ncpu_max, highp,
+ sizeof (in_progress_psettab.zone_ncpu_max));
+ return;
+ case PT_IMPORTANCE:
+ /* Make sure the value makes sense. */
+ if (!zonecfg_valid_importance(prop_id)) {
+ zerr(gettext("%s property is out of range."),
+ pt_to_str(PT_IMPORTANCE));
+ saw_error = TRUE;
+ return;
+ }
+
+ (void) strlcpy(in_progress_psettab.zone_importance,
+ prop_id,
+ sizeof (in_progress_psettab.zone_importance));
+ return;
+ default:
+ break;
+ }
+ zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, TRUE);
+ long_usage(CMD_SET, TRUE);
+ usage(FALSE, HELP_PROPS);
+ return;
+ case RT_MCAP:
+ switch (prop_type) {
+ case PT_PHYSICAL:
+ if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) {
+ zerr(gettext("A positive number with a "
+ "required scale suffix (K, M, G or T) was "
+ "expected here."));
+ saw_error = TRUE;
+ } else if (mem_cap < ONE_MB) {
+ zerr(gettext("%s value is too small. It must "
+ "be at least 1M."), pt_to_str(PT_PHYSICAL));
+ saw_error = TRUE;
+ } else {
+ snprintf(in_progress_mcaptab.zone_physmem_cap,
+ physmem_size, "%llu", mem_cap);
+ }
+ break;
+ case PT_SWAP:
+ /*
+ * We have to check if an rctl is allowed here since
+ * there might already be a rctl defined that blocks
+ * the alias.
+ */
+ if (!zonecfg_aliased_rctl_ok(handle, ALIAS_MAXSWAP)) {
+ zone_perror(pt_to_str(PT_MAXSWAP),
+ Z_ALIAS_DISALLOW, FALSE);
+ saw_error = TRUE;
+ return;
+ }
+
+ if (global_zone)
+ mem_limit = ONE_MB * 100;
+ else
+ mem_limit = ONE_MB * 50;
+
+ if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) {
+ zerr(gettext("A positive number with a "
+ "required scale suffix (K, M, G or T) was "
+ "expected here."));
+ saw_error = TRUE;
+ } else if (mem_cap < mem_limit) {
+ char buf[128];
+
+ (void) snprintf(buf, sizeof (buf), "%llu",
+ mem_limit);
+ bytes_to_units(buf, buf, sizeof (buf));
+ zerr(gettext("%s value is too small. It must "
+ "be at least %s."), pt_to_str(PT_SWAP),
+ buf);
+ saw_error = TRUE;
+ } else {
+ if ((err = zonecfg_set_aliased_rctl(handle,
+ ALIAS_MAXSWAP, mem_cap)) != Z_OK)
+ zone_perror(zone, err, TRUE);
+ else
+ need_to_commit = TRUE;
+ }
+ break;
+ case PT_LOCKED:
+ /*
+ * We have to check if an rctl is allowed here since
+ * there might already be a rctl defined that blocks
+ * the alias.
+ */
+ if (!zonecfg_aliased_rctl_ok(handle,
+ ALIAS_MAXLOCKEDMEM)) {
+ zone_perror(pt_to_str(PT_LOCKED),
+ Z_ALIAS_DISALLOW, FALSE);
+ saw_error = TRUE;
+ return;
+ }
+
+ if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) {
+ zerr(gettext("A non-negative number with a "
+ "required scale suffix (K, M, G or T) was "
+ "expected\nhere."));
+ saw_error = TRUE;
+ } else {
+ if ((err = zonecfg_set_aliased_rctl(handle,
+ ALIAS_MAXLOCKEDMEM, mem_cap)) != Z_OK)
+ zone_perror(zone, err, TRUE);
+ else
+ need_to_commit = TRUE;
+ }
+ break;
+ default:
+ zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE,
+ TRUE);
+ long_usage(CMD_SET, TRUE);
+ usage(FALSE, HELP_PROPS);
+ return;
+ }
+
+ return;
default:
zone_perror(rt_to_str(res_type), Z_NO_RESOURCE_TYPE, TRUE);
long_usage(CMD_SET, TRUE);
@@ -3110,7 +4183,11 @@ output_prop(FILE *fp, int pnum, char *pval, bool print_notspec)
if (*pval != '\0') {
qstr = quoteit(pval);
- (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr);
+ if (pnum == PT_SWAP || pnum == PT_LOCKED)
+ (void) fprintf(fp, "\t[%s: %s]\n", pt_to_str(pnum),
+ qstr);
+ else
+ (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr);
free(qstr);
} else if (print_notspec)
(void) fprintf(fp, gettext("\t%s not specified\n"),
@@ -3213,6 +4290,20 @@ info_bootargs(zone_dochandle_t handle, FILE *fp)
}
static void
+info_sched(zone_dochandle_t handle, FILE *fp)
+{
+	char	class_name[MAXNAMELEN];
+	int	rc;
+
+	/* Print "<sched property>: <class>", or report the lookup error. */
+	rc = zonecfg_get_sched_class(handle, class_name, sizeof (class_name));
+	if (rc != Z_OK) {
+		zone_perror(zone, rc, TRUE);
+		return;
+	}
+
+	(void) fprintf(fp, "%s: %s\n", pt_to_str(PT_SCHED), class_name);
+}
+
+static void
output_fs(FILE *fp, struct zone_fstab *fstab)
{
zone_fsopt_t *this;
@@ -3499,7 +4590,7 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd)
struct zone_dstab lookup, user;
bool output = FALSE;
- if (zonecfg_setdevent(handle) != Z_OK)
+ if (zonecfg_setdsent(handle) != Z_OK)
return;
while (zonecfg_getdsent(handle, &lookup) == Z_OK) {
if (cmd->cmd_prop_nv_pairs == 0) {
@@ -3525,12 +4616,132 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd)
rt_to_str(RT_DATASET));
}
+/*
+ * Print a pset entry to 'fp': the resource name, the ncpus value (a
+ * single number when min and max are the same string, otherwise
+ * "min-max"), and the importance when it is not empty.
+ */
+static void
+output_pset(FILE *fp, struct zone_psettab *psettab)
+{
+	const char	*lo = psettab->zone_ncpu_min;
+	const char	*hi = psettab->zone_ncpu_max;
+
+	(void) fprintf(fp, "%s:\n", rt_to_str(RT_DCPU));
+
+	if (strcmp(lo, hi) != 0) {
+		(void) fprintf(fp, "\t%s: %s-%s\n", pt_to_str(PT_NCPUS),
+		    lo, hi);
+	} else {
+		(void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_NCPUS), hi);
+	}
+
+	if (psettab->zone_importance[0] == '\0')
+		return;
+
+	(void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_IMPORTANCE),
+	    psettab->zone_importance);
+}
+
+/*
+ * Print the zone's pset entry, if zonecfg_getpsetent() finds one.
+ */
+static void
+info_pset(zone_dochandle_t handle, FILE *fp)
+{
+	struct zone_psettab	found;
+
+	if (zonecfg_getpsetent(handle, &found) != Z_OK)
+		return;
+
+	output_pset(fp, &found);
+}
+
+/*
+ * Print "[alias: value]" for an aliased rctl when it is set; print
+ * nothing otherwise.  ALIAS_MAXSHMMEM is shown with a unit suffix via
+ * bytes_to_units(); other aliases are shown as a plain number.
+ */
+static void
+info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias)
+{
+	uint64_t	value;
+	char		pretty[128];
+
+	if (zonecfg_get_aliased_rctl(handle, alias, &value) != Z_OK)
+		return;
+
+	if (strcmp(alias, ALIAS_MAXSHMMEM) != 0) {
+		(void) fprintf(fp, "[%s: %llu]\n", alias, value);
+		return;
+	}
+
+	/* convert memory based properties */
+	(void) snprintf(pretty, sizeof (pretty), "%llu", value);
+	bytes_to_units(pretty, pretty, sizeof (pretty));
+	(void) fprintf(fp, "[%s: %s]\n", alias, pretty);
+}
+
+/*
+ * Convert the decimal byte count in 'str' into a short human-readable
+ * form in 'buf' (at most 'bufsize' bytes, always terminated by snprintf).
+ *
+ * Values below 1024 are printed as a bare number.  Larger values are
+ * scaled by 1024 up to the 'T' unit, rounding to nearest.  One decimal
+ * place is shown when the scaled value is below 10 and not exact
+ * (e.g. "1.5K").
+ *
+ * 'str' and 'buf' may be the same buffer: the input is fully parsed
+ * before anything is written.
+ */
+static void
+bytes_to_units(char *str, char *buf, int bufsize)
+{
+	unsigned long long num;
+	unsigned long long save = 0;
+	const char *units = "BKMGT";
+	const char *up = units;
+
+	/*
+	 * The quantity is an unsigned 64-bit byte count, so it must be
+	 * parsed as unsigned; strtoll() would saturate at LLONG_MAX.
+	 */
+	num = strtoull(str, NULL, 10);
+
+	if (num < 1024) {
+		(void) snprintf(buf, bufsize, "%llu", num);
+		return;
+	}
+
+	while ((num >= 1024) && (*up != 'T')) {
+		up++;	/* next unit of measurement */
+		save = num;
+		/*
+		 * Round to nearest.  Same result as (num + 512) >> 10 but
+		 * cannot wrap when num is within 512 of ULLONG_MAX.
+		 */
+		num = (num >> 10) + ((num & 1023) >= 512 ? 1 : 0);
+	}
+
+	/* check if we should output a fraction.  snprintf will round for us */
+	if (save % 1024 != 0 && ((save >> 10) < 10))
+		(void) snprintf(buf, bufsize, "%2.1f%c", ((float)save / 1024),
+		    *up);
+	else
+		(void) snprintf(buf, bufsize, "%llu%c", num, *up);
+}
+
+/*
+ * Print the RT_MCAP resource to 'fp'.  The physical cap comes from
+ * 'mcaptab' and is printed when not empty.  'showswap' and 'showlocked'
+ * are status codes: the matching limit is printed only when the code is
+ * Z_OK.  All values are shown in scaled units via bytes_to_units().
+ */
+static void
+output_mcap(FILE *fp, struct zone_mcaptab *mcaptab, int showswap,
+    uint64_t maxswap, int showlocked, uint64_t maxlocked)
+{
+	char	text[128];
+
+	(void) fprintf(fp, "%s:\n", rt_to_str(RT_MCAP));
+
+	if (mcaptab->zone_physmem_cap[0] != '\0') {
+		bytes_to_units(mcaptab->zone_physmem_cap, text,
+		    sizeof (text));
+		output_prop(fp, PT_PHYSICAL, text, B_TRUE);
+	}
+
+	if (showswap == Z_OK) {
+		/* format the number, then rescale it in place */
+		(void) snprintf(text, sizeof (text), "%llu", maxswap);
+		bytes_to_units(text, text, sizeof (text));
+		output_prop(fp, PT_SWAP, text, B_TRUE);
+	}
+
+	if (showlocked == Z_OK) {
+		(void) snprintf(text, sizeof (text), "%llu", maxlocked);
+		bytes_to_units(text, text, sizeof (text));
+		output_prop(fp, PT_LOCKED, text, B_TRUE);
+	}
+}
+
+/*
+ * Print the RT_MCAP resource if any part of it exists: the mcap entry,
+ * the ALIAS_MAXSWAP rctl or the ALIAS_MAXLOCKEDMEM rctl.
+ */
+static void
+info_mcap(zone_dochandle_t handle, FILE *fp)
+{
+	struct zone_mcaptab	cap;
+	uint64_t		swap_max;
+	uint64_t		locked_max;
+	int			cap_rc, swap_rc, locked_rc;
+
+	/* start from an empty entry so a failed lookup prints no cap */
+	bzero(&cap, sizeof (cap));
+	cap_rc = zonecfg_getmcapent(handle, &cap);
+	swap_rc = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &swap_max);
+	locked_rc = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM,
+	    &locked_max);
+
+	if (cap_rc != Z_OK && swap_rc != Z_OK && locked_rc != Z_OK)
+		return;
+
+	output_mcap(fp, &cap, swap_rc, swap_max, locked_rc, locked_max);
+}
+
void
info_func(cmd_t *cmd)
{
FILE *fp = stdout;
bool need_to_close = FALSE;
char *pager;
+ int type;
+ int res1, res2;
+ uint64_t swap_limit;
+ uint64_t locked_limit;
assert(cmd != NULL);
@@ -3569,26 +4780,68 @@ info_func(cmd_t *cmd)
case RT_DATASET:
output_ds(fp, &in_progress_dstab);
break;
+ case RT_DCPU:
+ output_pset(fp, &in_progress_psettab);
+ break;
+ case RT_MCAP:
+ res1 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP,
+ &swap_limit);
+ res2 = zonecfg_get_aliased_rctl(handle,
+ ALIAS_MAXLOCKEDMEM, &locked_limit);
+ output_mcap(fp, &in_progress_mcaptab, res1, swap_limit,
+ res2, locked_limit);
+ break;
}
goto cleanup;
}
+ type = cmd->cmd_res_type;
+
+ if (gz_invalid_rt_property(type)) {
+ zerr(gettext("%s is not a valid property for the global zone."),
+ rt_to_str(type));
+ goto cleanup;
+ }
+
+ if (gz_invalid_resource(type)) {
+ zerr(gettext("%s is not a valid resource for the global zone."),
+ rt_to_str(type));
+ goto cleanup;
+ }
+
switch (cmd->cmd_res_type) {
case RT_UNKNOWN:
info_zonename(handle, fp);
- info_zonepath(handle, fp);
- info_brand(handle, fp);
- info_autoboot(handle, fp);
- info_bootargs(handle, fp);
+ if (!global_zone) {
+ info_zonepath(handle, fp);
+ info_brand(handle, fp);
+ info_autoboot(handle, fp);
+ info_bootargs(handle, fp);
+ }
info_pool(handle, fp);
- info_limitpriv(handle, fp);
- info_ipd(handle, fp, cmd);
- info_fs(handle, fp, cmd);
- info_net(handle, fp, cmd);
- info_dev(handle, fp, cmd);
+ if (!global_zone) {
+ info_limitpriv(handle, fp);
+ info_sched(handle, fp);
+ }
+ info_aliased_rctl(handle, fp, ALIAS_MAXLWPS);
+ info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM);
+ info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS);
+ info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS);
+ info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS);
+ info_aliased_rctl(handle, fp, ALIAS_SHARES);
+ if (!global_zone) {
+ info_ipd(handle, fp, cmd);
+ info_fs(handle, fp, cmd);
+ info_net(handle, fp, cmd);
+ info_dev(handle, fp, cmd);
+ }
+ info_pset(handle, fp);
+ info_mcap(handle, fp);
+ if (!global_zone) {
+ info_attr(handle, fp, cmd);
+ info_ds(handle, fp, cmd);
+ }
info_rctl(handle, fp, cmd);
- info_attr(handle, fp, cmd);
- info_ds(handle, fp, cmd);
break;
case RT_ZONENAME:
info_zonename(handle, fp);
@@ -3611,6 +4864,27 @@ info_func(cmd_t *cmd)
case RT_BOOTARGS:
info_bootargs(handle, fp);
break;
+ case RT_SCHED:
+ info_sched(handle, fp);
+ break;
+ case RT_MAXLWPS:
+ info_aliased_rctl(handle, fp, ALIAS_MAXLWPS);
+ break;
+ case RT_MAXSHMMEM:
+ info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM);
+ break;
+ case RT_MAXSHMIDS:
+ info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS);
+ break;
+ case RT_MAXMSGIDS:
+ info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS);
+ break;
+ case RT_MAXSEMIDS:
+ info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS);
+ break;
+ case RT_SHARES:
+ info_aliased_rctl(handle, fp, ALIAS_SHARES);
+ break;
case RT_FS:
info_fs(handle, fp, cmd);
break;
@@ -3632,6 +4906,12 @@ info_func(cmd_t *cmd)
case RT_DATASET:
info_ds(handle, fp, cmd);
break;
+ case RT_DCPU:
+ info_pset(handle, fp);
+ break;
+ case RT_MCAP:
+ info_mcap(handle, fp);
+ break;
default:
zone_perror(rt_to_str(cmd->cmd_res_type), Z_NO_RESOURCE_TYPE,
TRUE);
@@ -3765,10 +5045,13 @@ verify_func(cmd_t *cmd)
struct zone_attrtab attrtab;
struct zone_rctltab rctltab;
struct zone_dstab dstab;
+ struct zone_psettab psettab;
char zonepath[MAXPATHLEN];
+ char sched[MAXNAMELEN];
char brand[MAXNAMELEN];
int err, ret_val = Z_OK, arg;
bool save = FALSE;
+ boolean_t has_cpu_shares = B_FALSE;
optind = 0;
if ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) {
@@ -3796,12 +5079,13 @@ verify_func(cmd_t *cmd)
if (initialize(TRUE) != Z_OK)
return;
- if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK) {
+ if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK &&
+ !global_zone) {
zerr(gettext("%s not specified"), pt_to_str(PT_ZONEPATH));
ret_val = Z_REQD_RESOURCE_MISSING;
saw_error = TRUE;
}
- if (strlen(zonepath) == 0) {
+ if (strlen(zonepath) == 0 && !global_zone) {
zerr(gettext("%s cannot be empty."), pt_to_str(PT_ZONEPATH));
ret_val = Z_REQD_RESOURCE_MISSING;
saw_error = TRUE;
@@ -3861,6 +5145,9 @@ verify_func(cmd_t *cmd)
check_reqd_prop(rctltab.zone_rctl_name, RT_RCTL, PT_NAME,
&ret_val);
+ if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0)
+ has_cpu_shares = B_TRUE;
+
if (rctltab.zone_rctl_valptr == NULL) {
zerr(gettext("%s: no %s specified"),
rt_to_str(RT_RCTL), pt_to_str(PT_VALUE));
@@ -3873,6 +5160,25 @@ verify_func(cmd_t *cmd)
}
(void) zonecfg_endrctlent(handle);
+ if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) {
+ zerr(gettext("%s zone.cpu-shares and %s are incompatible."),
+ rt_to_str(RT_RCTL), rt_to_str(RT_DCPU));
+ saw_error = TRUE;
+ if (ret_val == Z_OK)
+ ret_val = Z_INCOMPATIBLE;
+ }
+
+ if (has_cpu_shares && zonecfg_get_sched_class(handle, sched,
+ sizeof (sched)) == Z_OK && strlen(sched) > 0 &&
+ strcmp(sched, "FSS") != 0) {
+ zerr(gettext("WARNING: %s zone.cpu-shares and %s=%s are "
+ "incompatible"),
+ rt_to_str(RT_RCTL), rt_to_str(RT_SCHED), sched);
+ saw_error = TRUE;
+ if (ret_val == Z_OK)
+ ret_val = Z_INCOMPATIBLE;
+ }
+
if ((err = zonecfg_setattrent(handle)) != Z_OK) {
zone_perror(zone, err, TRUE);
return;
@@ -4061,7 +5367,9 @@ end_func(cmd_t *cmd)
struct zone_rctltab tmp_rctltab;
struct zone_attrtab tmp_attrtab;
struct zone_dstab tmp_dstab;
- int err, arg;
+ int err, arg, res1, res2, res3;
+ uint64_t swap_limit;
+ uint64_t locked_limit;
assert(cmd != NULL);
@@ -4361,6 +5669,73 @@ end_func(cmd_t *cmd)
&in_progress_dstab);
}
break;
+ case RT_DCPU:
+ /* Make sure everything was filled in. */
+ if (end_check_reqd(in_progress_psettab.zone_ncpu_min,
+ PT_NCPUS, &validation_failed) != Z_OK) {
+ saw_error = TRUE;
+ return;
+ }
+
+ if (end_op == CMD_ADD) {
+ err = zonecfg_add_pset(handle, &in_progress_psettab);
+ } else {
+ err = zonecfg_modify_pset(handle, &in_progress_psettab);
+ }
+ break;
+ case RT_MCAP:
+ /* Make sure everything was filled in. */
+ res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ?
+ Z_ERR : Z_OK;
+ res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP,
+ &swap_limit);
+ res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM,
+ &locked_limit);
+
+ if (res1 != Z_OK && res2 != Z_OK && res3 != Z_OK) {
+ zerr(gettext("No property was specified. One of %s, "
+ "%s or %s is required."), pt_to_str(PT_PHYSICAL),
+ pt_to_str(PT_SWAP), pt_to_str(PT_LOCKED));
+ saw_error = TRUE;
+ return;
+ }
+
+ /* if phys & locked are both set, verify locked <= phys */
+ if (res1 == Z_OK && res3 == Z_OK) {
+ uint64_t phys_limit;
+ char *endp;
+
+ phys_limit = strtoull(
+ in_progress_mcaptab.zone_physmem_cap, &endp, 10);
+ if (phys_limit < locked_limit) {
+ zerr(gettext("The %s cap must be less than or "
+ "equal to the %s cap."),
+ pt_to_str(PT_LOCKED),
+ pt_to_str(PT_PHYSICAL));
+ saw_error = TRUE;
+ return;
+ }
+ }
+
+ err = Z_OK;
+ if (res1 == Z_OK) {
+ /*
+ * We could be ending from either an add operation
+ * or a select operation. Since all of the properties
+ * within this resource are optional, we always use
+ * modify on the mcap entry. zonecfg_modify_mcap()
+ * will handle both adding and modifying a memory cap.
+ */
+ err = zonecfg_modify_mcap(handle, &in_progress_mcaptab);
+ } else if (end_op == CMD_SELECT) {
+ /*
+ * If we're ending from a select and the physical
+ * memory cap is empty then the user could have cleared
+ * the physical cap value, so try to delete the entry.
+ */
+ (void) zonecfg_delete_mcap(handle);
+ }
+ break;
default:
zone_perror(rt_to_str(resource_scope), Z_NO_RESOURCE_TYPE,
TRUE);
@@ -4885,7 +6260,9 @@ main(int argc, char *argv[])
zonecfg_set_root(optarg);
break;
case 'z':
- if (zonecfg_validate_zonename(optarg) != Z_OK) {
+ if (strcmp(optarg, GLOBAL_ZONENAME) == 0) {
+ global_zone = TRUE;
+ } else if (zonecfg_validate_zonename(optarg) != Z_OK) {
zone_perror(optarg, Z_BOGUS_ZONE_NAME, TRUE);
usage(FALSE, HELP_SYNTAX);
exit(Z_USAGE);
diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h
index 6e153d40c1..64808e9623 100644
--- a/usr/src/cmd/zonecfg/zonecfg.h
+++ b/usr/src/cmd/zonecfg/zonecfg.h
@@ -50,19 +50,20 @@ typedef int bool;
#define CMD_ADD 0
#define CMD_CANCEL 1
-#define CMD_COMMIT 2
-#define CMD_CREATE 3
-#define CMD_DELETE 4
-#define CMD_END 5
-#define CMD_EXIT 6
-#define CMD_EXPORT 7
-#define CMD_HELP 8
-#define CMD_INFO 9
-#define CMD_REMOVE 10
-#define CMD_REVERT 11
-#define CMD_SELECT 12
-#define CMD_SET 13
-#define CMD_VERIFY 14
+#define CMD_CLEAR 2
+#define CMD_COMMIT 3
+#define CMD_CREATE 4
+#define CMD_DELETE 5
+#define CMD_END 6
+#define CMD_EXIT 7
+#define CMD_EXPORT 8
+#define CMD_HELP 9
+#define CMD_INFO 10
+#define CMD_REMOVE 11
+#define CMD_REVERT 12
+#define CMD_SELECT 13
+#define CMD_SET 14
+#define CMD_VERIFY 15
#define CMD_MIN CMD_ADD
#define CMD_MAX CMD_VERIFY
@@ -83,9 +84,18 @@ typedef int bool;
#define RT_LIMITPRIV 12 /* really a property, but for info ... */
#define RT_BOOTARGS 13 /* really a property, but for info ... */
#define RT_BRAND 14 /* really a property, but for info ... */
+#define RT_DCPU 15
+#define RT_MCAP 16
+#define RT_MAXLWPS 17 /* really a rctl alias property, but for info */
+#define RT_MAXSHMMEM 18 /* really a rctl alias property, but for info */
+#define RT_MAXSHMIDS 19 /* really a rctl alias property, but for info */
+#define RT_MAXMSGIDS 20 /* really a rctl alias property, but for info */
+#define RT_MAXSEMIDS 21 /* really a rctl alias property, but for info */
+#define RT_SHARES 22 /* really a rctl alias property, but for info */
+#define RT_SCHED 23 /* really a property, but for info ... */
#define RT_MIN RT_UNKNOWN
-#define RT_MAX RT_BRAND
+#define RT_MAX RT_SCHED
/* property types: increment PT_MAX when expanding this list */
#define PT_UNKNOWN 0
@@ -109,9 +119,22 @@ typedef int bool;
#define PT_LIMITPRIV 18
#define PT_BOOTARGS 19
#define PT_BRAND 20
+#define PT_NCPUS 21
+#define PT_IMPORTANCE 22
+#define PT_SWAP 23
+#define PT_LOCKED 24
+#define PT_SHARES 25
+#define PT_MAXLWPS 26
+#define PT_MAXSHMMEM 27
+#define PT_MAXSHMIDS 28
+#define PT_MAXMSGIDS 29
+#define PT_MAXSEMIDS 30
+#define PT_MAXLOCKEDMEM 31
+#define PT_MAXSWAP 32
+#define PT_SCHED 33
#define PT_MIN PT_UNKNOWN
-#define PT_MAX PT_BRAND
+#define PT_MAX PT_SCHED
#define MAX_EQ_PROP_PAIRS 3
@@ -184,6 +207,7 @@ extern void revert_func(cmd_t *);
extern void select_func(cmd_t *);
extern void set_func(cmd_t *);
extern void verify_func(cmd_t *);
+extern void clear_func(cmd_t *);
extern cmd_t *alloc_cmd(void);
extern complex_property_ptr_t alloc_complex(void);
diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y
index dc391da0b9..5c0dc2263e 100644
--- a/usr/src/cmd/zonecfg/zonecfg_grammar.y
+++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y
@@ -60,15 +60,17 @@ extern void yyerror(char *s);
%token COMMIT REVERT EXIT SEMICOLON TOKEN ZONENAME ZONEPATH AUTOBOOT POOL NET
%token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL
%token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
-%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND
+%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET
+%token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS
+%token MAXSEMIDS LOCKED SWAP SCHED CLEAR
%type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val
%type <complex> complex_piece complex_prop_val
-%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR
+%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP
%type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME
MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT
- ACTION BRAND
+ ACTION BRAND SCHED
%type <cmd> command
%type <cmd> add_command ADD
%type <cmd> cancel_command CANCEL
@@ -84,6 +86,7 @@ extern void yyerror(char *s);
%type <cmd> revert_command REVERT
%type <cmd> select_command SELECT
%type <cmd> set_command SET
+%type <cmd> clear_command CLEAR
%type <cmd> verify_command VERIFY
%type <cmd> terminator
@@ -126,6 +129,7 @@ commands: command terminator
command: add_command
| cancel_command
+ | clear_command
| create_command
| commit_command
| delete_command
@@ -465,6 +469,69 @@ info_command: INFO
$$->cmd_res_type = RT_BOOTARGS;
$$->cmd_prop_nv_pairs = 0;
}
+ | INFO SCHED
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_SCHED;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO SHARES
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_SHARES;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO MAXLWPS
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_MAXLWPS;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO MAXSHMMEM
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_MAXSHMMEM;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO MAXSHMIDS
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_MAXSHMIDS;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO MAXMSGIDS
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_MAXMSGIDS;
+ $$->cmd_prop_nv_pairs = 0;
+ }
+ | INFO MAXSEMIDS
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &info_func;
+ $$->cmd_res_type = RT_MAXSEMIDS;
+ $$->cmd_prop_nv_pairs = 0;
+ }
| INFO resource_type property_name EQUAL property_value
{
if (($$ = alloc_cmd()) == NULL)
@@ -512,11 +579,32 @@ remove_command: REMOVE
usage(FALSE, HELP_RES_PROPS);
YYERROR;
}
- | REMOVE resource_type
+ | REMOVE TOKEN
{
short_usage(CMD_REMOVE);
+ (void) fputs("\n", stderr);
+ usage(FALSE, HELP_RES_PROPS);
YYERROR;
}
+ | REMOVE resource_type
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &remove_func;
+ $$->cmd_res_type = $2;
+ }
+ | REMOVE TOKEN resource_type
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &remove_func;
+ $$->cmd_res_type = $3;
+ $$->cmd_argc = 1;
+ $$->cmd_argv[0] = $2;
+ $$->cmd_argv[1] = NULL;
+ }
| REMOVE property_name property_value
{
if (($$ = alloc_cmd()) == NULL)
@@ -594,6 +682,22 @@ select_command: SELECT
usage(FALSE, HELP_RES_PROPS);
YYERROR;
}
+ | SELECT PSET
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &select_func;
+ $$->cmd_res_type = RT_DCPU;
+ }
+ | SELECT MCAP
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &select_func;
+ $$->cmd_res_type = RT_MCAP;
+ }
| SELECT resource_type
{
short_usage(CMD_SELECT);
@@ -682,6 +786,22 @@ set_command: SET
$$->cmd_property_ptr[0] = &property[0];
}
+clear_command: CLEAR /* bare "clear" with no property: print usage, then fail */
+ {
+ short_usage(CMD_CLEAR);
+ (void) fputs("\n", stderr);
+ usage(FALSE, HELP_PROPS);
+ YYERROR;
+ }
+ | CLEAR property_name /* "clear <property>": build a command run by clear_func */
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &clear_func;
+ $$->cmd_res_type = $2; /* $2 is the PT_* code yielded by property_name */
+ }
+
verify_command: VERIFY
{
if (($$ = alloc_cmd()) == NULL)
@@ -709,6 +829,8 @@ resource_type: NET { $$ = RT_NET; }
| RCTL { $$ = RT_RCTL; }
| ATTR { $$ = RT_ATTR; }
| DATASET { $$ = RT_DATASET; }
+ | PSET { $$ = RT_DCPU; }
+ | MCAP { $$ = RT_MCAP; }
property_name: SPECIAL { $$ = PT_SPECIAL; }
| RAW { $$ = PT_RAW; }
@@ -730,6 +852,17 @@ property_name: SPECIAL { $$ = PT_SPECIAL; }
| LIMIT { $$ = PT_LIMIT; }
| ACTION { $$ = PT_ACTION; }
| BRAND { $$ = PT_BRAND; }
+ | NCPUS { $$ = PT_NCPUS; }
+ | LOCKED { $$ = PT_LOCKED; }
+ | SWAP { $$ = PT_SWAP; }
+ | IMPORTANCE { $$ = PT_IMPORTANCE; }
+ | SHARES { $$ = PT_SHARES; }
+ | MAXLWPS { $$ = PT_MAXLWPS; }
+ | MAXSHMMEM { $$ = PT_MAXSHMMEM; }
+ | MAXSHMIDS { $$ = PT_MAXSHMIDS; }
+ | MAXMSGIDS { $$ = PT_MAXMSGIDS; }
+ | MAXSEMIDS { $$ = PT_MAXSEMIDS; }
+ | SCHED { $$ = PT_SCHED; }
/*
* The grammar builds data structures from the bottom up. Thus various
diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l
index aef16edbcb..53f726ca2e 100644
--- a/usr/src/cmd/zonecfg/zonecfg_lex.l
+++ b/usr/src/cmd/zonecfg/zonecfg_lex.l
@@ -40,7 +40,10 @@ extern void yyerror(char *s);
char *safe_strdup(char *s);
%}
-%a 4000
+%a 6000
+%p 4000
+%e 2000
+%n 1000
%{
/*
@@ -139,6 +142,12 @@ char *safe_strdup(char *s);
return SET;
}
+<INITIAL>clear {
+ BEGIN TSTATE;
+ state = TSTATE;
+ return CLEAR;
+ }
+
<INITIAL>verify {
BEGIN TSTATE;
state = TSTATE;
@@ -162,6 +171,10 @@ char *safe_strdup(char *s);
<TSTATE>dataset { return DATASET; }
+<TSTATE>dedicated-cpu { return PSET; }
+
+<TSTATE>capped-memory { return MCAP; }
+
<TSTATE>zonepath { return ZONEPATH; }
<CSTATE>zonepath { return ZONEPATH; }
@@ -219,6 +232,39 @@ char *safe_strdup(char *s);
<TSTATE>action { return ACTION; }
<CSTATE>action { return ACTION; }
+<TSTATE>ncpus { return NCPUS; }
+<CSTATE>ncpus { return NCPUS; }
+
+<TSTATE>locked { return LOCKED; }
+<CSTATE>locked { return LOCKED; }
+
+<TSTATE>swap { return SWAP; }
+<CSTATE>swap { return SWAP; }
+
+<TSTATE>importance { return IMPORTANCE; }
+<CSTATE>importance { return IMPORTANCE; }
+
+<TSTATE>cpu-shares { return SHARES; }
+<CSTATE>cpu-shares { return SHARES; }
+
+<TSTATE>max-lwps { return MAXLWPS; }
+<CSTATE>max-lwps { return MAXLWPS; }
+
+<TSTATE>max-shm-memory { return MAXSHMMEM; }
+<CSTATE>max-shm-memory { return MAXSHMMEM; }
+
+<TSTATE>max-shm-ids { return MAXSHMIDS; }
+<CSTATE>max-shm-ids { return MAXSHMIDS; }
+
+<TSTATE>max-msg-ids { return MAXMSGIDS; }
+<CSTATE>max-msg-ids { return MAXMSGIDS; }
+
+<TSTATE>max-sem-ids { return MAXSEMIDS; }
+<CSTATE>max-sem-ids { return MAXSEMIDS; }
+
+<TSTATE>scheduling-class { return SCHED; }
+<CSTATE>scheduling-class { return SCHED; }
+
<TSTATE>= { return EQUAL; }
<LSTATE>= { return EQUAL; }
<CSTATE>= { return EQUAL; }
diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h
index 3af98c1a6b..10ee4a2bb4 100644
--- a/usr/src/head/libzonecfg.h
+++ b/usr/src/head/libzonecfg.h
@@ -90,6 +90,15 @@ extern "C" {
#define Z_PRIV_REQUIRED 38 /* required privilege is missing */
#define Z_PRIV_UNKNOWN 39 /* specified privilege is unknown */
#define Z_BRAND_ERROR 40 /* brand-specific error */
+#define Z_INCOMPATIBLE 41 /* incompatible settings */
+#define Z_ALIAS_DISALLOW 42 /* rctl alias disallowed */
+#define Z_CLEAR_DISALLOW 43 /* clear property disallowed */
+#define Z_POOL 44 /* generic libpool error */
+#define Z_POOLS_NOT_ACTIVE 45 /* pool service not enabled */
+#define Z_POOL_ENABLE 46 /* pools enable failed */
+#define Z_NO_POOL 47 /* no such pool configured */
+#define Z_POOL_CREATE 48 /* pool create failed */
+#define Z_POOL_BIND 49 /* pool bind failed */
/*
* Warning: these are shared with the admin/install consolidation.
@@ -126,6 +135,18 @@ extern "C" {
#define ZONE_PKG_VERSMAX 256
/*
+ * Shortened alias names for the zones rctls.
+ */
+#define ALIAS_MAXLWPS "max-lwps"
+#define ALIAS_MAXSHMMEM "max-shm-memory"
+#define ALIAS_MAXSHMIDS "max-shm-ids"
+#define ALIAS_MAXMSGIDS "max-msg-ids"
+#define ALIAS_MAXSEMIDS "max-sem-ids"
+#define ALIAS_MAXLOCKEDMEM "locked"
+#define ALIAS_MAXSWAP "swap"
+#define ALIAS_SHARES "cpu-shares"
+
+/*
* Bit flag definitions for passing into libzonecfg functions.
*/
#define ZONE_DRY_RUN 0x01
@@ -190,6 +211,16 @@ struct zone_dstab {
char zone_dataset_name[MAXNAMELEN];
};
+struct zone_psettab { /* cpu-set (pset) settings; every value is kept as a string */
+ char zone_ncpu_min[MAXNAMELEN]; /* zonecfg checks this as the required PT_NCPUS value */
+ char zone_ncpu_max[MAXNAMELEN]; /* NOTE(review): presumably the upper ncpus bound - confirm */
+ char zone_importance[MAXNAMELEN]; /* NOTE(review): presumably the "importance" property - confirm */
+};
+
+struct zone_mcaptab { /* memory-cap settings */
+ char zone_physmem_cap[MAXNAMELEN]; /* "" = no physical cap; zonecfg parses it with strtoull(), base 10 */
+};
+
struct zone_pkgtab {
char zone_pkg_name[MAXNAMELEN];
char zone_pkg_version[ZONE_PKG_VERSMAX];
@@ -227,10 +258,17 @@ extern int zonecfg_access(const char *, int);
extern void zonecfg_set_root(const char *);
extern const char *zonecfg_get_root(void);
extern boolean_t zonecfg_in_alt_root(void);
+extern int zonecfg_num_resources(zone_dochandle_t, char *);
+extern int zonecfg_del_all_resources(zone_dochandle_t, char *);
+extern boolean_t zonecfg_valid_ncpus(char *, char *);
+extern boolean_t zonecfg_valid_importance(char *);
+extern int zonecfg_str_to_bytes(char *, uint64_t *);
+extern boolean_t zonecfg_valid_memlimit(char *, uint64_t *);
+extern boolean_t zonecfg_valid_alias_limit(char *, char *, uint64_t *);
/*
- * Zone name, path to zone directory, autoboot setting, pool and boot
- * arguments.
+ * Zone name, path to zone directory, autoboot setting, pool, boot
+ * arguments, and scheduling-class.
*/
extern int zonecfg_validate_zonename(const char *);
extern int zonecfg_get_name(zone_dochandle_t, char *, size_t);
@@ -243,6 +281,9 @@ extern int zonecfg_get_pool(zone_dochandle_t, char *, size_t);
extern int zonecfg_set_pool(zone_dochandle_t, char *);
extern int zonecfg_get_bootargs(zone_dochandle_t, char *, size_t);
extern int zonecfg_set_bootargs(zone_dochandle_t, char *);
+extern int zonecfg_get_sched_class(zone_dochandle_t, char *, size_t);
+extern int zonecfg_set_sched(zone_dochandle_t, char *);
+extern int zonecfg_get_dflt_sched_class(zone_dochandle_t, char *, int);
/*
* Set/retrieve the brand for the zone
@@ -302,6 +343,11 @@ extern int zonecfg_add_rctl_value(struct zone_rctltab *,
extern int zonecfg_remove_rctl_value(struct zone_rctltab *,
struct zone_rctlvaltab *);
extern void zonecfg_free_rctl_value_list(struct zone_rctlvaltab *);
+extern boolean_t zonecfg_aliased_rctl_ok(zone_dochandle_t, char *);
+extern int zonecfg_set_aliased_rctl(zone_dochandle_t, char *, uint64_t);
+extern int zonecfg_get_aliased_rctl(zone_dochandle_t, char *, uint64_t *);
+extern int zonecfg_rm_aliased_rctl(zone_dochandle_t, char *);
+extern int zonecfg_apply_rctls(char *, zone_dochandle_t);
/*
* Generic attribute configuration and type/value extraction.
@@ -328,6 +374,34 @@ extern int zonecfg_modify_ds(zone_dochandle_t, struct zone_dstab *,
extern int zonecfg_lookup_ds(zone_dochandle_t, struct zone_dstab *);
/*
+ * cpu-set configuration.
+ */
+extern int zonecfg_add_pset(zone_dochandle_t, struct zone_psettab *);
+extern int zonecfg_delete_pset(zone_dochandle_t);
+extern int zonecfg_modify_pset(zone_dochandle_t, struct zone_psettab *);
+extern int zonecfg_lookup_pset(zone_dochandle_t, struct zone_psettab *);
+
+/*
+ * mem-cap configuration.
+ */
+extern int zonecfg_delete_mcap(zone_dochandle_t);
+extern int zonecfg_modify_mcap(zone_dochandle_t, struct zone_mcaptab *);
+extern int zonecfg_lookup_mcap(zone_dochandle_t, struct zone_mcaptab *);
+
+/*
+ * Temporary pool support functions.
+ */
+extern int zonecfg_destroy_tmp_pool(char *, char *, int);
+extern int zonecfg_bind_tmp_pool(zone_dochandle_t, zoneid_t, char *, int);
+extern int zonecfg_bind_pool(zone_dochandle_t, zoneid_t, char *, int);
+extern boolean_t zonecfg_warn_poold(zone_dochandle_t);
+
+/*
+ * Miscellaneous utility functions.
+ */
+extern int zonecfg_enable_rcapd(char *, int);
+
+/*
* attach/detach support.
*/
extern int zonecfg_get_attach_handle(const char *, const char *,
@@ -373,6 +447,8 @@ extern int zonecfg_endrctlent(zone_dochandle_t);
extern int zonecfg_setdsent(zone_dochandle_t);
extern int zonecfg_getdsent(zone_dochandle_t, struct zone_dstab *);
extern int zonecfg_enddsent(zone_dochandle_t);
+extern int zonecfg_getpsetent(zone_dochandle_t, struct zone_psettab *);
+extern int zonecfg_getmcapent(zone_dochandle_t, struct zone_mcaptab *);
extern int zonecfg_setpkgent(zone_dochandle_t);
extern int zonecfg_getpkgent(zone_dochandle_t, struct zone_pkgtab *);
extern int zonecfg_endpkgent(zone_dochandle_t);
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index da3bdb3844..c541fcb01c 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -489,7 +489,7 @@ libldap5: libsasl libsocket libnsl libmd
libsldap: libldap5 libtsol
libpool: libnvpair libexacct
libzonecfg: libc libsocket libnsl libuuid libnvpair libsysevent libsec \
- libbrand
+ libbrand libpool libscf
libproc: ../cmd/sgs/librtld_db ../cmd/sgs/libelf libctf
libproject: libpool libproc libsecdb
libtsnet: libnsl libtsol libsecdb
diff --git a/usr/src/lib/libc/port/gen/getrusage.c b/usr/src/lib/libc/port/gen/getrusage.c
index c1f1b92188..efeaf0be24 100644
--- a/usr/src/lib/libc/port/gen/getrusage.c
+++ b/usr/src/lib/libc/port/gen/getrusage.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -49,6 +48,7 @@
#include <sys/param.h>
#include <errno.h>
#include <sys/resource.h>
+#include <sys/vm_usage.h>
#include <fcntl.h>
#include <sys/fcntl.h>
#include <procfs.h>
@@ -76,3 +76,10 @@ getrusage(int who, struct rusage *rusage)
return (-1);
}
}
+
+/*
+ * getvmusage() hands its arguments straight to the rusagesys system call,
+ * selecting the _RUSAGESYS_GETVMUSAGE sub-operation, and returns whatever
+ * syscall() reports.
+ */
+int
+getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+	int rv;
+
+	rv = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, flags, age, buf,
+	    nres);
+
+	return (rv);
+}
diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers
index 22227a6413..8e1b399567 100644
--- a/usr/src/lib/libc/port/mapfile-vers
+++ b/usr/src/lib/libc/port/mapfile-vers
@@ -59,6 +59,7 @@ SUNW_1.23 { # SunOS 5.11 (Solaris 11)
fdatasync;
forkallx;
forkx;
+ getvmusage;
lio_listio;
mkdtemp;
_mkdtemp;
diff --git a/usr/src/lib/libpool/common/pool.c b/usr/src/lib/libpool/common/pool.c
index 167cd8be5b..6fbd7b34d3 100644
--- a/usr/src/lib/libpool/common/pool.c
+++ b/usr/src/lib/libpool/common/pool.c
@@ -914,10 +914,34 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name,
return (NULL);
}
- if (!is_valid_prop_name(name)) {
+ /* Don't allow (re)setting of the "temporary" property */
+ if (!is_valid_prop_name(name) || strstr(name, ".temporary") != NULL) {
pool_seterror(POE_BADPARAM);
return (PO_FAIL);
}
+
+ /* Don't allow rename of temporary pools/resources */
+ if (strstr(name, ".name") != NULL && elem_is_tmp(pe)) {
+ boolean_t rename = B_TRUE;
+ pool_value_t *pv = pool_value_alloc();
+
+ if (pe->pe_get_prop(pe, name, pv) != POC_INVAL) {
+ const char *s1 = NULL;
+ const char *s2 = NULL;
+
+ (void) pool_value_get_string(pv, &s1);
+ (void) pool_value_get_string(val, &s2);
+ if (s1 != NULL && s2 != NULL && strcmp(s1, s2) == 0)
+ rename = B_FALSE;
+ }
+ pool_value_free(pv);
+
+ if (rename) {
+ pool_seterror(POE_BADPARAM);
+ return (PO_FAIL);
+ }
+ }
+
/*
* Check to see if this is a property we are managing. If it is,
* ensure that we are happy with what the user is doing.
@@ -936,6 +960,46 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name,
}
/*
+ * Flag the supplied element as temporary by setting its boolean
+ * "<class>.temporary" property, where <class> is the string returned by
+ * pool_elem_class_string() for the element.
+ *
+ * PO_FAIL is returned if the configuration check fails, if the element does
+ * not belong to conf (POE_BADPARAM), if the property name cannot be built
+ * (POE_SYSTEM) or if no value can be allocated. Otherwise the result of the
+ * element's pe_put_prop operation is returned.
+ */
+int
+pool_set_temporary(pool_conf_t *conf, pool_elem_t *pe)
+{
+	int res;
+	int len;
+	char name[128];
+	pool_value_t *val;
+
+	if (pool_conf_check(conf) != PO_SUCCESS)
+		return (PO_FAIL);
+
+	if (TO_CONF(pe) != conf) {
+		pool_seterror(POE_BADPARAM);
+		return (PO_FAIL);
+	}
+
+	/*
+	 * Create the property name based on the element type. snprintf()
+	 * returns the length the complete string would have had, so a result
+	 * equal to the buffer size already means the name was truncated; a
+	 * negative result means the formatting itself failed.
+	 */
+	len = snprintf(name, sizeof (name), "%s.temporary",
+	    pool_elem_class_string(pe));
+	if (len < 0 || (size_t)len >= sizeof (name)) {
+		pool_seterror(POE_SYSTEM);
+		return (PO_FAIL);
+	}
+
+	if ((val = pool_value_alloc()) == NULL)
+		return (PO_FAIL);
+
+	pool_value_set_bool(val, (uchar_t)1);
+
+	/* Go through the element's own put operation with the new value. */
+	res = pe->pe_put_prop(pe, name, val);
+
+	pool_value_free(val);
+
+	return (res);
+}
+
+/*
* Update the specified property value with the namespace prepended.
* e.g. If this function is used to update the property "name" on a pool, it
* will attempt to update "pool.name".
@@ -1030,6 +1094,12 @@ pool_rm_property(pool_conf_t *conf, pool_elem_t *pe, const char *name)
return (NULL);
}
+ /* Don't allow removal of the "temporary" property */
+ if (strstr(name, ".temporary") != NULL) {
+ pool_seterror(POE_BADPARAM);
+ return (PO_FAIL);
+ }
+
/*
* Check to see if this is a property we are managing. If it is,
* ensure that we are happy with what the user is doing.
@@ -1122,6 +1192,17 @@ pool_create(pool_conf_t *conf, const char *name)
pool_seterror(POE_PUTPROP);
return (NULL);
}
+
+ /*
+ * If we are creating a temporary pool configuration, flag the pool.
+ */
+ if (conf->pc_prov->pc_oflags & PO_TEMP) {
+ if (pool_set_temporary(conf, pe) == PO_FAIL) {
+ (void) pool_destroy(conf, pool_elem_pool(pe));
+ return (NULL);
+ }
+ }
+
return (pool_elem_pool(pe));
}
@@ -1227,6 +1308,17 @@ pool_resource_create(pool_conf_t *conf, const char *sz_type, const char *name)
return (NULL);
}
}
+
+ /*
+ * If we are creating a temporary pool configuration, flag the resource.
+ */
+ if (conf->pc_prov->pc_oflags & PO_TEMP) {
+ if (pool_set_temporary(conf, pe) != PO_SUCCESS) {
+ (void) pool_resource_destroy(conf, pool_elem_res(pe));
+ return (NULL);
+ }
+ }
+
return (pool_elem_res(pe));
}
@@ -1396,7 +1488,8 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags)
pool_seterror(POE_BADPARAM);
return (PO_FAIL);
}
- if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE)) {
+ if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE |
+ PO_TEMP)) {
pool_seterror(POE_BADPARAM);
return (PO_FAIL);
}
@@ -1408,6 +1501,10 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags)
if (oflags & PO_CREAT)
oflags |= PO_RDWR;
+ /* location is ignored when creating a temporary configuration */
+ if (oflags & PO_TEMP)
+ location = "";
+
if ((conf->pc_location = strdup(location)) == NULL) {
pool_seterror(POE_SYSTEM);
return (PO_FAIL);
@@ -1415,14 +1512,25 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags)
/*
* This is the crossover point into the actual data provider
* implementation, allocate a data provider of the appropriate
- * type for your data storage medium. In this case it's a kernel
- * data provider. To use a different data provider, write some
- * code to implement all the required interfaces and then
- * change the next line to allocate a data provider which uses your
- * new code. All data provider routines can be static, apart from
- * the allocation routine.
+ * type for your data storage medium. In this case it's either a kernel
+ * or xml data provider. To use a different data provider, write some
+ * code to implement all the required interfaces and then change the
+ * following code to allocate a data provider which uses your new code.
+ * All data provider routines can be static, apart from the allocation
+ * routine.
+ *
+ * For temporary pools (PO_TEMP) we start with a copy of the current
+ * dynamic configuration and do all of the updates in-memory.
*/
- if (strcmp(location, pool_dynamic_location()) == 0) {
+ if (oflags & PO_TEMP) {
+ if (pool_knl_connection_alloc(conf, PO_TEMP) != PO_SUCCESS) {
+ conf->pc_state = POF_INVALID;
+ return (PO_FAIL);
+ }
+ /* set rdwr flag so we can update the in-memory config. */
+ conf->pc_prov->pc_oflags |= PO_RDWR;
+
+ } else if (strcmp(location, pool_dynamic_location()) == 0) {
if (pool_knl_connection_alloc(conf, oflags) != PO_SUCCESS) {
conf->pc_state = POF_INVALID;
return (PO_FAIL);
diff --git a/usr/src/lib/libpool/common/pool.h b/usr/src/lib/libpool/common/pool.h
index d38e9902e6..ee11aadb7b 100644
--- a/usr/src/lib/libpool/common/pool.h
+++ b/usr/src/lib/libpool/common/pool.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -96,6 +95,7 @@ extern uint_t pool_version(uint_t ver);
#define PO_CREAT 0x2
#define PO_DISCO 0x4
#define PO_UPDATE 0x8
+#define PO_TEMP 0x10
/* Allocation policy */
#define POA_IMPORTANCE "importance based"
@@ -218,6 +218,7 @@ extern pool_value_class_t pool_get_property(const pool_conf_t *,
extern int pool_put_property(pool_conf_t *, pool_elem_t *, const char *,
const pool_value_t *);
extern int pool_rm_property(pool_conf_t *, pool_elem_t *, const char *);
+
/*
* Walk the associated properties of the supplied element calling the supplied
* function for each property in turn. There is no implied order in the walk.
diff --git a/usr/src/lib/libpool/common/pool_commit.c b/usr/src/lib/libpool/common/pool_commit.c
index 1ea4808377..b996524b98 100644
--- a/usr/src/lib/libpool/common/pool_commit.c
+++ b/usr/src/lib/libpool/common/pool_commit.c
@@ -245,6 +245,9 @@ commit_delete(pool_elem_t *pe)
pool_t *pool;
int ret = 0;
+ if (elem_is_tmp(pe))
+ return (PO_SUCCESS);
+
switch (pool_elem_class(pe)) {
case PEC_SYSTEM: /* NO-OP */
break;
@@ -1306,7 +1309,14 @@ clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name,
if ((prop = provider_get_prop(pe, name)) != NULL &&
prop_is_readonly(prop) == PO_TRUE)
return (PO_SUCCESS);
- return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == PO_FAIL);
+
+ /* The temporary property needs special handling */
+ if (strstr(name, ".temporary") != NULL)
+ return (pool_set_temporary(TO_CONF(tgt), tgt) ==
+ PO_FAIL ? PO_FAIL : PO_SUCCESS);
+ else
+ return (pool_put_property(TO_CONF(tgt), tgt, name, pv) ==
+ PO_FAIL ? PO_FAIL : PO_SUCCESS);
}
/*
@@ -1322,8 +1332,9 @@ clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name,
/*
* Some properties should be ignored
*/
- if ((prop = provider_get_prop(pe, name)) != NULL &&
- prop_is_optional(prop) == PO_FALSE)
+ if (strstr(name, ".temporary") != NULL ||
+ ((prop = provider_get_prop(pe, name)) != NULL &&
+ prop_is_optional(prop) == PO_FALSE))
return (PO_SUCCESS);
return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL);
}
diff --git a/usr/src/lib/libpool/common/pool_internal.c b/usr/src/lib/libpool/common/pool_internal.c
index 210e63d620..5e572f6eaf 100644
--- a/usr/src/lib/libpool/common/pool_internal.c
+++ b/usr/src/lib/libpool/common/pool_internal.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1143,6 +1142,23 @@ elem_is_default(const pool_elem_t *res)
}
/*
+ * elem_is_tmp() reports whether the element carries a boolean "temporary"
+ * property whose value is non-zero. B_FALSE is returned when the property
+ * lookup does not yield a boolean.
+ */
+boolean_t
+elem_is_tmp(const pool_elem_t *elem)
+{
+	uchar_t is_set;
+	pool_value_t prop = POOL_VALUE_INITIALIZER;
+
+	if (pool_get_ns_property(elem, "temporary", &prop) == POC_BOOL) {
+		(void) pool_value_get_bool(&prop, &is_set);
+		return (is_set != 0);
+	}
+
+	return (B_FALSE);
+}
+
+/*
* get_default_elem() returns the default elem for type of the supplied
* elem.
*
diff --git a/usr/src/lib/libpool/common/pool_internal.h b/usr/src/lib/libpool/common/pool_internal.h
index 592c98d11d..e172d23af4 100644
--- a/usr/src/lib/libpool/common/pool_internal.h
+++ b/usr/src/lib/libpool/common/pool_internal.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -256,6 +255,7 @@ extern int resource_get_pinned(const pool_resource_t *,
extern char *elem_get_name(const pool_elem_t *);
extern id_t elem_get_sysid(const pool_elem_t *);
extern int elem_is_default(const pool_elem_t *);
+extern boolean_t elem_is_tmp(const pool_elem_t *);
extern const pool_elem_t *get_default_elem(const pool_elem_t *);
extern int qsort_elem_compare(const void *, const void *);
@@ -371,6 +371,7 @@ extern pool_value_class_t pool_get_ns_property(const pool_elem_t *,
extern int pool_walk_any_properties(pool_conf_t *, pool_elem_t *,
void *, int (*)(pool_conf_t *, pool_elem_t *, const char *,
pool_value_t *, void *), int);
+extern int pool_set_temporary(pool_conf_t *, pool_elem_t *);
/*
* Namespace aware utility functions.
diff --git a/usr/src/lib/libpool/common/pool_kernel.c b/usr/src/lib/libpool/common/pool_kernel.c
index f84d6f2ba5..3da4f0263c 100644
--- a/usr/src/lib/libpool/common/pool_kernel.c
+++ b/usr/src/lib/libpool/common/pool_kernel.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -646,10 +645,14 @@ pool_knl_close(pool_conf_t *conf)
}
/*
* Rollback any pending changes before freeing the prov. This
- * ensures there are no memory leaks from pending
- * transactions.
+ * ensures there are no memory leaks from pending transactions.
+ * However, don't rollback when we've done a temporary pool since the
+ * pool/resources haven't really been committed in this case.
+ * They will all be freed in pool_knl_connection_free and we don't
+ * want to double free them.
*/
- (void) pool_knl_rollback(conf);
+ if (!(conf->pc_prov->pc_oflags & PO_TEMP))
+ (void) pool_knl_rollback(conf);
pool_knl_connection_free(prov);
return (PO_SUCCESS);
}
@@ -997,6 +1000,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location,
const char *sep = "";
int j;
+ if (elem_is_tmp(elem))
+ continue;
+
if ((info.ktx_node = node_create(system,
BAD_CAST element_class_tags
[pool_elem_class(elem)])) == NULL) {
@@ -1072,6 +1078,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location,
uint_t ncompelem;
int j;
+ if (elem_is_tmp(elem))
+ continue;
+
if ((info.ktx_node = node_create(system,
BAD_CAST element_class_tags
[pool_elem_class(elem)])) == NULL) {
diff --git a/usr/src/lib/libproject/common/setproject.c b/usr/src/lib/libproject/common/setproject.c
index 2303576d32..d22878a36f 100644
--- a/usr/src/lib/libproject/common/setproject.c
+++ b/usr/src/lib/libproject/common/setproject.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -536,7 +535,12 @@ setproject_proc(const char *project_name, const char *user_name, int flags,
}
old_pool_name = pool_get_binding(pid);
- if (bind_to_pool(pool_name, pid, 0) != 0) {
+
+ /*
+ * If parent is not bound to the default pool, then we want
+ * to preserve same binding as parent.
+ */
+ if (pool_name != NULL && bind_to_pool(pool_name, pid, 0) != 0) {
if (old_pool_name)
free(old_pool_name);
_kva_free(kv_array);
diff --git a/usr/src/lib/libzonecfg/Makefile.com b/usr/src/lib/libzonecfg/Makefile.com
index b89a44fce3..b64df94527 100644
--- a/usr/src/lib/libzonecfg/Makefile.com
+++ b/usr/src/lib/libzonecfg/Makefile.com
@@ -32,7 +32,8 @@ OBJECTS= libzonecfg.o getzoneent.o scratchops.o
include ../../Makefile.lib
LIBS = $(DYNLIB) $(LINTLIB)
-LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand
+LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand \
+ -lpool -lscf -lproc
# DYNLIB libraries do not have lint libs and are not linted
$(DYNLIB) := LDLIBS += -lxml2
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index f4fbcde368..1a3fb37c8c 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -46,6 +46,10 @@
#include <sys/nvpair.h>
#include <sys/types.h>
#include <ftw.h>
+#include <pool.h>
+#include <libscf.h>
+#include <libproc.h>
+#include <sys/priocntl.h>
#include <arpa/inet.h>
#include <netdb.h>
@@ -79,6 +83,9 @@
#define DTD_ELEM_RCTLVALUE (const xmlChar *) "rctl-value"
#define DTD_ELEM_ZONE (const xmlChar *) "zone"
#define DTD_ELEM_DATASET (const xmlChar *) "dataset"
+#define DTD_ELEM_TMPPOOL (const xmlChar *) "tmp_pool"
+#define DTD_ELEM_PSET (const xmlChar *) "pset"
+#define DTD_ELEM_MCAP (const xmlChar *) "mcap"
#define DTD_ELEM_PACKAGE (const xmlChar *) "package"
#define DTD_ELEM_PATCH (const xmlChar *) "patch"
#define DTD_ELEM_OBSOLETES (const xmlChar *) "obsoletes"
@@ -92,6 +99,7 @@
#define DTD_ATTR_LIMIT (const xmlChar *) "limit"
#define DTD_ATTR_LIMITPRIV (const xmlChar *) "limitpriv"
#define DTD_ATTR_BOOTARGS (const xmlChar *) "bootargs"
+#define DTD_ATTR_SCHED (const xmlChar *) "scheduling-class"
#define DTD_ATTR_MATCH (const xmlChar *) "match"
#define DTD_ATTR_NAME (const xmlChar *) "name"
#define DTD_ATTR_PHYSICAL (const xmlChar *) "physical"
@@ -102,6 +110,10 @@
#define DTD_ATTR_TYPE (const xmlChar *) "type"
#define DTD_ATTR_VALUE (const xmlChar *) "value"
#define DTD_ATTR_ZONEPATH (const xmlChar *) "zonepath"
+#define DTD_ATTR_NCPU_MIN (const xmlChar *) "ncpu_min"
+#define DTD_ATTR_NCPU_MAX (const xmlChar *) "ncpu_max"
+#define DTD_ATTR_IMPORTANCE (const xmlChar *) "importance"
+#define DTD_ATTR_PHYSCAP (const xmlChar *) "physcap"
#define DTD_ATTR_VERSION (const xmlChar *) "version"
#define DTD_ATTR_ID (const xmlChar *) "id"
#define DTD_ATTR_UID (const xmlChar *) "uid"
@@ -133,6 +145,46 @@
#define PATCHINFO "PATCH_INFO_"
#define PKGINFO_RD_LEN 128
+#define TMP_POOL_NAME "SUNWtmp_%s"
+#define MAX_TMP_POOL_NAME (ZONENAME_MAX + 9)
+#define RCAP_SERVICE "system/rcap:default"
+#define POOLD_SERVICE "system/pools/dynamic:default"
+
+/*
+ * rctl alias definitions
+ *
+ * This holds the alias, the full rctl name, the default priv value, action
+ * and lower limit. The functions that handle rctl aliases step through
+ * this table, matching on the alias, and using the full values for setting
+ * the rctl entry as well as the limit for validation.
+ *
+ * The table is terminated by an entry whose shortname is NULL; lookups
+ * rely on that sentinel to stop.
+ */
+static struct alias {
+ char *shortname; /* alias name; lookups strcmp() against this */
+ char *realname; /* full rctl name, e.g. "zone.max-lwps" */
+ char *priv; /* default privilege string for the rctl value */
+ char *action; /* default action string for the rctl value */
+ uint64_t low_limit; /* smallest limit accepted during validation */
+} aliases[] = {
+ {ALIAS_MAXLWPS, "zone.max-lwps", "privileged", "deny", 100},
+ {ALIAS_MAXSHMMEM, "zone.max-shm-memory", "privileged", "deny", 0},
+ {ALIAS_MAXSHMIDS, "zone.max-shm-ids", "privileged", "deny", 0},
+ {ALIAS_MAXMSGIDS, "zone.max-msg-ids", "privileged", "deny", 0},
+ {ALIAS_MAXSEMIDS, "zone.max-sem-ids", "privileged", "deny", 0},
+ {ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0},
+ {ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0},
+ {ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0},
+ {NULL, NULL, NULL, NULL, 0}
+};
+
+/*
+ * Structure for applying rctls to a running zone. It allows important
+ * process values to be passed together easily.
+ */
+typedef struct pr_info_handle {
+ struct ps_prochandle *pr; /* libproc handle, filled in by grab_process() */
+ pid_t pid; /* pid of the process that grab_process() should grab */
+} pr_info_handle_t;
+
struct zone_dochandle {
char *zone_dh_rootdir;
xmlDocPtr zone_dh_doc;
@@ -446,14 +498,20 @@ setrootattr(zone_dochandle_t handle, const xmlChar *propname,
int err;
xmlNodePtr root;
- if (propval == NULL)
- return (Z_INVAL);
-
if ((err = getroot(handle, &root)) != Z_OK)
return (err);
- if (xmlSetProp(root, propname, (const xmlChar *) propval) == NULL)
- return (Z_INVAL);
+ /*
+ * If we get a null propval remove the property (ignore return since it
+ * may not be set to begin with).
+ */
+ if (propval == NULL) {
+ (void) xmlUnsetProp(root, propname);
+ } else {
+ if (xmlSetProp(root, propname, (const xmlChar *) propval)
+ == NULL)
+ return (Z_INVAL);
+ }
return (Z_OK);
}
@@ -947,6 +1005,18 @@ zonecfg_set_bootargs(zone_dochandle_t handle, char *bargs)
return (setrootattr(handle, DTD_ATTR_BOOTARGS, bargs));
}
+/*
+ * Fetch the zone's "scheduling-class" root attribute into sched (a buffer of
+ * schedsize bytes). The return value is whatever getrootattr() returns.
+ */
+int
+zonecfg_get_sched_class(zone_dochandle_t handle, char *sched, size_t schedsize)
+{
+ return (getrootattr(handle, DTD_ATTR_SCHED, sched, schedsize));
+}
+
+/*
+ * Set the zone's "scheduling-class" root attribute to sched. A NULL sched
+ * removes the attribute, since setrootattr() unsets the property when it is
+ * handed a NULL value. The return value is whatever setrootattr() returns.
+ */
+int
+zonecfg_set_sched(zone_dochandle_t handle, char *sched)
+{
+ return (setrootattr(handle, DTD_ATTR_SCHED, sched));
+}
+
/*
* /etc/zones/index caches a vital piece of information which is also
* in the <zonename>.xml file: the path to the zone. This is for performance,
@@ -3047,6 +3117,30 @@ zonecfg_strerror(int errnum)
case Z_BRAND_ERROR:
return (dgettext(TEXT_DOMAIN,
"Brand-specific error"));
+ case Z_INCOMPATIBLE:
+ return (dgettext(TEXT_DOMAIN, "Incompatible settings"));
+ case Z_ALIAS_DISALLOW:
+ return (dgettext(TEXT_DOMAIN,
+ "An incompatible rctl already exists for this property"));
+ case Z_CLEAR_DISALLOW:
+ return (dgettext(TEXT_DOMAIN,
+ "Clearing this property is not allowed"));
+ case Z_POOL:
+ return (dgettext(TEXT_DOMAIN, "libpool(3LIB) error"));
+ case Z_POOLS_NOT_ACTIVE:
+ return (dgettext(TEXT_DOMAIN, "Pools facility not active; "
+ "zone will not be bound to pool"));
+ case Z_POOL_ENABLE:
+ return (dgettext(TEXT_DOMAIN,
+ "Could not enable pools facility"));
+ case Z_NO_POOL:
+ return (dgettext(TEXT_DOMAIN,
+ "Pool not found; using default pool"));
+ case Z_POOL_CREATE:
+ return (dgettext(TEXT_DOMAIN,
+ "Could not create a temporary pool"));
+ case Z_POOL_BIND:
+ return (dgettext(TEXT_DOMAIN, "Could not bind zone to pool"));
default:
return (dgettext(TEXT_DOMAIN, "Unknown error"));
}
@@ -3086,6 +3180,951 @@ zonecfg_endent(zone_dochandle_t handle)
return (Z_OK);
}
+/*
+ * Prepare the process named by p->pid for manipulation through libproc:
+ * grab it, mark it run-on-last-close and create an agent in it. The libproc
+ * handle is stored in p->pr.
+ *
+ * When 0 is returned the caller owns the grabbed process and must
+ * eventually hand p->pr to release_process(). On any failure the process
+ * has already been released here.
+ *
+ * Return values:
+ *	0	Successful creation of agent thread
+ *	1	Error grabbing
+ *	2	Error creating agent
+ */
+static int
+grab_process(pr_info_handle_t *p)
+{
+	int gerr;
+
+	p->pr = Pgrab(p->pid, 0, &gerr);
+	if (p->pr == NULL)
+		return (1);
+
+	if (Psetflags(p->pr, PR_RLC) != 0) {
+		Prelease(p->pr, 0);
+		return (1);
+	}
+
+	if (Pcreate_agent(p->pr) != 0) {
+		Prelease(p->pr, 0);
+		return (2);
+	}
+
+	return (0);
+}
+
+/*
+ * Undo a successful grab_process(): destroy the agent and release the
+ * process. A NULL handle is accepted and ignored. Only call this for a
+ * handle that grab_process() returned success for.
+ */
+static void
+release_process(struct ps_prochandle *Pr)
+{
+	if (Pr != NULL) {
+		Pdestroy_agent(Pr);
+		Prelease(Pr, 0);
+	}
+}
+
+/*
+ * Find and grab some process that belongs to the named zone.
+ *
+ * Walks the entries of /proc, skipping the calling process and every process
+ * whose psinfo reports a different zone id, and tries grab_process() on each
+ * remaining candidate until one succeeds.
+ *
+ * On success B_TRUE is returned, p->pid and p->pr describe the grabbed
+ * process, and the caller must eventually call release_process(p->pr).
+ * B_FALSE is returned if the zone id cannot be looked up, /proc cannot be
+ * opened, or no process could be grabbed; p->pid is overwritten while
+ * scanning even in that case.
+ */
+static boolean_t
+grab_zone_proc(char *zonename, pr_info_handle_t *p)
+{
+ DIR *dirp;
+ struct dirent *dentp;
+ zoneid_t zoneid;
+ int pid_self;
+ psinfo_t psinfo;
+
+ if (zone_get_id(zonename, &zoneid) != 0)
+ return (B_FALSE);
+
+ pid_self = getpid();
+
+ if ((dirp = opendir("/proc")) == NULL)
+ return (B_FALSE);
+
+ while (dentp = readdir(dirp)) {
+ /* entry names are converted with atoi(); no validation is done */
+ p->pid = atoi(dentp->d_name);
+
+ /* Skip self */
+ if (p->pid == pid_self)
+ continue;
+
+ if (proc_get_psinfo(p->pid, &psinfo) != 0)
+ continue;
+
+ if (psinfo.pr_zoneid != zoneid)
+ continue;
+
+ /* attempt to grab process */
+ if (grab_process(p) != 0)
+ continue;
+
+ /* re-check the zone id now that the process is held */
+ if (pr_getzoneid(p->pr) != zoneid) {
+ release_process(p->pr);
+ continue;
+ }
+
+ (void) closedir(dirp);
+ return (B_TRUE);
+ }
+
+ (void) closedir(dirp);
+ return (B_FALSE);
+}
+
+/*
+ * Walk the values of the rctl called name, as seen through the grabbed
+ * process pr, looking for a privileged one. Returns B_TRUE with that value
+ * left in rblk when one is found, and B_FALSE when the walk ends (or the
+ * first lookup fails) without finding any.
+ */
+static boolean_t
+get_priv_rctl(struct ps_prochandle *pr, char *name, rctlblk_t *rblk)
+{
+	int rv;
+
+	for (rv = pr_getrctl(pr, name, NULL, rblk, RCTL_FIRST); rv == 0;
+	    rv = pr_getrctl(pr, name, rblk, rblk, RCTL_NEXT)) {
+		if (rctlblk_get_privilege(rblk) == RCPRIV_PRIVILEGED)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Apply the current rctl settings to the specified, running zone.
+ *
+ * A process inside the zone is grabbed with grab_zone_proc() and, for every
+ * rctl entry in the configuration, the existing privileged values are deleted
+ * and the configured values are inserted through that process.
+ *
+ * Returns Z_OK on success; the zonecfg_setrctlent() error if iteration over
+ * the rctl entries cannot be started; Z_NOMEM if the rctl block cannot be
+ * allocated; Z_SYSTEM if no zone process can be grabbed or a pr_setrctl()
+ * call fails; or the error reported by zonecfg_construct_rctlblk(). The
+ * first failure stops processing, so earlier rctls may already be changed.
+ */
+int
+zonecfg_apply_rctls(char *zone_name, zone_dochandle_t handle)
+{
+ int err;
+ int res = Z_OK;
+ rctlblk_t *rblk;
+ pr_info_handle_t p;
+ struct zone_rctltab rctl;
+
+ if ((err = zonecfg_setrctlent(handle)) != Z_OK)
+ return (err);
+
+ if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) {
+ (void) zonecfg_endrctlent(handle);
+ return (Z_NOMEM);
+ }
+
+ if (!grab_zone_proc(zone_name, &p)) {
+ (void) zonecfg_endrctlent(handle);
+ free(rblk);
+ return (Z_SYSTEM);
+ }
+
+ while (zonecfg_getrctlent(handle, &rctl) == Z_OK) {
+ char *rname;
+ struct zone_rctlvaltab *valptr;
+
+ rname = rctl.zone_rctl_name;
+
+ /* first delete all current privileged settings for this rctl */
+ while (get_priv_rctl(p.pr, rname, rblk)) {
+ if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_DELETE) !=
+ 0) {
+ res = Z_SYSTEM;
+ goto done;
+ }
+ }
+
+ /* now set each new value for the rctl */
+ for (valptr = rctl.zone_rctl_valptr; valptr != NULL;
+ valptr = valptr->zone_rctlval_next) {
+ if ((err = zonecfg_construct_rctlblk(valptr, rblk))
+ != Z_OK) {
+ /* the error is also stored in errno */
+ res = errno = err;
+ goto done;
+ }
+
+ if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_INSERT)) {
+ res = Z_SYSTEM;
+ goto done;
+ }
+ }
+ }
+
+done:
+ /* common exit once the process has been grabbed */
+ release_process(p.pr);
+ free(rblk);
+ (void) zonecfg_endrctlent(handle);
+
+ return (res);
+}
+
+/*
+ * Translate a resource type name into the matching DTD element name.
+ * Returns NULL when the name is not one of the known resource types.
+ */
+static const xmlChar *
+nm_to_dtd(char *nm)
+{
+	const struct {
+		const char	*rsrc_name;
+		const xmlChar	*dtd_elem;
+	} map[] = {
+		{ "device",		DTD_ELEM_DEVICE },
+		{ "fs",			DTD_ELEM_FS },
+		{ "inherit-pkg-dir",	DTD_ELEM_IPD },
+		{ "net",		DTD_ELEM_NET },
+		{ "attr",		DTD_ELEM_ATTR },
+		{ "rctl",		DTD_ELEM_RCTL },
+		{ "dataset",		DTD_ELEM_DATASET }
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof (map) / sizeof (map[0]); i++) {
+		if (strcmp(nm, map[i].rsrc_name) == 0)
+			return (map[i].dtd_elem);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Count the resources of type rsrc in the configuration. An unknown
+ * resource type, or a failure to start iterating, yields 0.
+ */
+int
+zonecfg_num_resources(zone_dochandle_t handle, char *rsrc)
+{
+	int count = 0;
+	const xmlChar *elem = nm_to_dtd(rsrc);
+	xmlNodePtr node;
+
+	if (elem == NULL || zonecfg_setent(handle) != Z_OK)
+		return (0);
+
+	node = handle->zone_dh_cur;
+	while (node != NULL) {
+		if (xmlStrcmp(node->name, elem) == 0)
+			count++;
+		node = node->next;
+	}
+
+	(void) zonecfg_endent(handle);
+
+	return (count);
+}
+
+/*
+ * Remove every resource of type rsrc from the configuration. Returns
+ * Z_NO_RESOURCE_TYPE for an unknown type, the zonecfg_setent() error if
+ * iteration cannot be started, and Z_OK otherwise (including when there was
+ * nothing to remove).
+ */
+int
+zonecfg_del_all_resources(zone_dochandle_t handle, char *rsrc)
+{
+	int rv;
+	const xmlChar *elem;
+	xmlNodePtr node, following;
+
+	elem = nm_to_dtd(rsrc);
+	if (elem == NULL)
+		return (Z_NO_RESOURCE_TYPE);
+
+	rv = zonecfg_setent(handle);
+	if (rv != Z_OK)
+		return (rv);
+
+	for (node = handle->zone_dh_cur; node != NULL; node = following) {
+		/* remember the successor before the node can be freed */
+		following = node->next;
+
+		if (xmlStrcmp(node->name, elem) == 0) {
+			xmlUnlinkNode(node);
+			xmlFreeNode(node);
+		}
+	}
+
+	(void) zonecfg_endent(handle);
+	return (Z_OK);
+}
+
+/*
+ * Parse s as an unsigned base-10 integer into *n. Returns B_TRUE when the
+ * whole string was consumed without strtoull() reporting an error, B_FALSE
+ * otherwise. Any '-' in the string is rejected up front because strtoull()
+ * would otherwise accept a negated value.
+ */
+static boolean_t
+valid_uint(char *s, uint64_t *n)
+{
+	char *rest;
+
+	if (strchr(s, '-') != NULL)
+		return (B_FALSE);
+
+	errno = 0;
+	*n = strtoull(s, &rest, 10);
+
+	return ((errno == 0 && *rest == '\0') ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Convert a string representing a number (possibly a fraction) into an integer.
+ * The string can have a modifier (K, M, G or T). The modifiers are treated
+ * as powers of two (not 10). A string without a modifier is taken as a
+ * plain byte count. White space between the number and the modifier is
+ * allowed, and the modifier is matched without regard to case.
+ *
+ * Returns 0 and stores the result in *bytes on success. Returns -1, leaving
+ * *bytes untouched, if the string does not start with a number, the number
+ * is negative or not a number, the modifier is not recognized, or the
+ * scaled value does not fit in a uint64_t.
+ */
+int
+zonecfg_str_to_bytes(char *str, uint64_t *bytes)
+{
+	long double val;
+	long double scaled;
+	char *unitp;
+	uint64_t scale;
+
+	val = strtold(str, &unitp);
+
+	/* nothing was parsed as a number (e.g. "" or a bare "K") */
+	if (unitp == str)
+		return (-1);
+
+	/* phrased so that NaN is rejected together with negative values */
+	if (!(val >= 0))
+		return (-1);
+
+	/* remove any leading white space from units string */
+	while (isspace((unsigned char)*unitp))
+		++unitp;
+
+	if (*unitp == '\0') {
+		/* no units explicitly set, the value is already in bytes */
+		scale = 1;
+	} else {
+		int i;
+		char *units[] = {"K", "M", "G", "T", NULL};
+
+		scale = 1024;
+
+		/* update scale based on units */
+		for (i = 0; units[i] != NULL; i++) {
+			if (strcasecmp(unitp, units[i]) == 0)
+				break;
+			scale <<= 10;
+		}
+
+		if (units[i] == NULL)
+			return (-1);
+	}
+
+	/*
+	 * Converting a floating value that is out of range for the target
+	 * type is undefined, so refuse anything at or above 2^64 (this also
+	 * catches infinity).
+	 */
+	scaled = val * scale;
+	if (scaled >= 18446744073709551616.0L)
+		return (-1);
+
+	*bytes = (uint64_t)scaled;
+	return (0);
+}
+
+/*
+ * Validate a cpu count range given as two strings: both must parse as
+ * unsigned integers, the low end must be at least 1 and must not exceed the
+ * high end.
+ */
+boolean_t
+zonecfg_valid_ncpus(char *lowstr, char *highstr)
+{
+	uint64_t lo, hi;
+
+	if (!valid_uint(lowstr, &lo))
+		return (B_FALSE);
+
+	if (!valid_uint(highstr, &hi))
+		return (B_FALSE);
+
+	return ((lo >= 1 && lo <= hi) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * An importance value is valid when valid_uint() accepts it; the parsed
+ * number itself is not used.
+ */
+boolean_t
+zonecfg_valid_importance(char *impstr)
+{
+	uint64_t ignored;
+
+	return (valid_uint(impstr, &ignored) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Validate limitstr as a limit for the rctl alias called name. The alias
+ * must appear in the aliases table, the string must parse as an unsigned
+ * integer (stored in *limit) and the value must not be below the alias's
+ * low_limit.
+ */
+boolean_t
+zonecfg_valid_alias_limit(char *name, char *limitstr, uint64_t *limit)
+{
+	const struct alias *ap;
+
+	for (ap = aliases; ap->shortname != NULL; ap++) {
+		if (strcmp(name, ap->shortname) == 0)
+			break;
+	}
+
+	/* ran into the terminating entry: not a known alias */
+	if (ap->shortname == NULL)
+		return (B_FALSE);
+
+	if (!valid_uint(limitstr, limit))
+		return (B_FALSE);
+
+	return ((*limit < ap->low_limit) ? B_FALSE : B_TRUE);
+}
+
+/*
+ * A memory limit string is valid when zonecfg_str_to_bytes() can convert
+ * it; the converted byte count is returned through mem_val.
+ */
+boolean_t
+zonecfg_valid_memlimit(char *memstr, uint64_t *mem_val)
+{
+	return ((zonecfg_str_to_bytes(memstr, mem_val) == 0) ?
+	    B_TRUE : B_FALSE);
+}
+
+/*
+ * Copy the message for the current libpool error (pool_error()) into
+ * pool_err, truncated to err_size bytes, and return res unchanged. This lets
+ * callers record the libpool error text and return their own error code in
+ * a single statement.
+ */
+static int
+zerr_pool(char *pool_err, int err_size, int res)
+{
+ (void) strlcpy(pool_err, pool_strerror(pool_error()), err_size);
+ return (res);
+}
+
+/*
+ * Create a pset resource called name in pconf, associate it with pool and
+ * set its "pset.max" and "pset.min" properties from max and min.
+ *
+ * Returns Z_OK on success. On any libpool failure the libpool error text is
+ * copied into pool_err (at most err_size bytes) and Z_POOL is returned.
+ */
+static int
+create_tmp_pset(char *pool_err, int err_size, pool_conf_t *pconf, pool_t *pool,
+    char *name, int min, int max)
+{
+	pool_resource_t *pset;
+	pool_elem_t *pset_elem;
+	pool_value_t *pv;
+	boolean_t ok;
+
+	pset = pool_resource_create(pconf, "pset", name);
+	if (pset == NULL)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	if (pool_associate(pconf, pool, pset) != PO_SUCCESS)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	pset_elem = pool_resource_to_elem(pconf, pset);
+	if (pset_elem == NULL)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	pv = pool_value_alloc();
+	if (pv == NULL)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	/* the maximum number of cpus goes in first, then the minimum */
+	pool_value_set_uint64(pv, (uint64_t)max);
+	ok = (pool_put_property(pconf, pset_elem, "pset.max", pv) ==
+	    PO_SUCCESS) ? B_TRUE : B_FALSE;
+
+	if (ok) {
+		pool_value_set_uint64(pv, (uint64_t)min);
+		ok = (pool_put_property(pconf, pset_elem, "pset.min", pv) ==
+		    PO_SUCCESS) ? B_TRUE : B_FALSE;
+	}
+
+	/* the value is released on success and failure alike */
+	pool_value_free(pv);
+
+	if (!ok)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	return (Z_OK);
+}
+
+/*
+ * Create the temporary pool called name, together with a pset of the same
+ * name, from the settings in pset_tab, and commit it.
+ *
+ * pconf must be an allocated, unopened configuration; it is opened here with
+ * PO_TEMP and always closed again before returning (it is not freed).
+ *
+ * Returns Z_OK on success. On failure the libpool error text is copied into
+ * pool_err (at most err_size bytes) and Z_POOL or Z_POOL_CREATE is returned;
+ * Z_POOL_CREATE is used when the pool cannot be created or fails runtime
+ * validation.
+ */
+static int
+create_tmp_pool(char *pool_err, int err_size, pool_conf_t *pconf, char *name,
+ struct zone_psettab *pset_tab)
+{
+ pool_t *pool;
+ int res = Z_OK;
+
+ /* create a temporary pool configuration */
+ if (pool_conf_open(pconf, NULL, PO_TEMP) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ return (res);
+ }
+
+ if ((pool = pool_create(pconf, name)) == NULL) {
+ res = zerr_pool(pool_err, err_size, Z_POOL_CREATE);
+ goto done;
+ }
+
+ /* set pool importance */
+ if (pset_tab->zone_importance[0] != '\0') {
+ pool_elem_t *elem;
+ pool_value_t *val;
+
+ if ((elem = pool_to_elem(pconf, pool)) == NULL) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ if ((val = pool_value_alloc()) == NULL) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ pool_value_set_int64(val,
+ (int64_t)atoi(pset_tab->zone_importance));
+
+ if (pool_put_property(pconf, elem, "pool.importance", val)
+ != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ pool_value_free(val);
+ goto done;
+ }
+
+ pool_value_free(val);
+ }
+
+ /* the pset takes the pool's name and the configured cpu range */
+ if ((res = create_tmp_pset(pool_err, err_size, pconf, pool, name,
+ atoi(pset_tab->zone_ncpu_min),
+ atoi(pset_tab->zone_ncpu_max))) != Z_OK)
+ goto done;
+
+ /* validation */
+ if (pool_conf_status(pconf) == POF_INVALID) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ /*
+ * This validation is the one we expect to fail if the user specified
+ * an invalid configuration (too many cpus) for this system.
+ */
+ if (pool_conf_validate(pconf, POV_RUNTIME) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL_CREATE);
+ goto done;
+ }
+
+ /*
+ * Commit the dynamic configuration but not the pool configuration
+ * file.
+ */
+ if (pool_conf_commit(pconf, 1) != PO_SUCCESS)
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+
+done:
+ /* the configuration is closed on every path once it has been opened */
+ (void) pool_conf_close(pconf);
+ return (res);
+}
+
+/*
+ * Read the settings of a running tmp pool/pset back into pset_tab as
+ * strings: "pool.importance" from pool, and "pset.min" / "pset.max" from
+ * pset. Either of pool and pset may be NULL, in which case its properties
+ * are simply not found.
+ *
+ * Returns PO_SUCCESS only when all three properties were read, and PO_FAIL
+ * otherwise (including when the scratch value cannot be allocated). Fields
+ * of pset_tab whose property was not read are left untouched.
+ */
+static int
+get_running_tmp_pset(pool_conf_t *pconf, pool_t *pool, pool_resource_t *pset,
+    struct zone_psettab *pset_tab)
+{
+	int nfound = 0;
+	pool_elem_t *pe;
+	pool_value_t *pv;
+	uint64_t val_uint;
+	int64_t val_int;
+
+	if ((pv = pool_value_alloc()) == NULL)
+		return (PO_FAIL);
+
+	/*
+	 * A property only counts as found when its value could also be
+	 * extracted with the expected type; otherwise an uninitialized
+	 * number would be formatted. The values are 64 bits wide, so they
+	 * are printed with the long long conversions.
+	 */
+	if (pool != NULL) {
+		pe = pool_to_elem(pconf, pool);
+		if (pool_get_property(pconf, pe, "pool.importance", pv)
+		    != POC_INVAL &&
+		    pool_value_get_int64(pv, &val_int) == PO_SUCCESS) {
+			(void) snprintf(pset_tab->zone_importance,
+			    sizeof (pset_tab->zone_importance), "%lld",
+			    (long long)val_int);
+			nfound++;
+		}
+	}
+
+	if (pset != NULL) {
+		pe = pool_resource_to_elem(pconf, pset);
+		if (pool_get_property(pconf, pe, "pset.min", pv)
+		    != POC_INVAL &&
+		    pool_value_get_uint64(pv, &val_uint) == PO_SUCCESS) {
+			(void) snprintf(pset_tab->zone_ncpu_min,
+			    sizeof (pset_tab->zone_ncpu_min), "%llu",
+			    (unsigned long long)val_uint);
+			nfound++;
+		}
+
+		if (pool_get_property(pconf, pe, "pset.max", pv)
+		    != POC_INVAL &&
+		    pool_value_get_uint64(pv, &val_uint) == PO_SUCCESS) {
+			(void) snprintf(pset_tab->zone_ncpu_max,
+			    sizeof (pset_tab->zone_ncpu_max), "%llu",
+			    (unsigned long long)val_uint);
+			nfound++;
+		}
+	}
+
+	pool_value_free(pv);
+
+	if (nfound == 3)
+		return (PO_SUCCESS);
+
+	return (PO_FAIL);
+}
+
+/*
+ * Determine if a tmp pool is configured and if so, if the configuration is
+ * still valid or if it has been changed since the tmp pool was created.
+ * If the tmp pool configuration is no longer valid, delete the tmp pool.
+ *
+ * Set *exists=B_TRUE if there is an existing, valid tmp pool configuration;
+ * it is B_FALSE in every other case, including errors.
+ *
+ * pconf must be an allocated, unopened configuration; the dynamic
+ * configuration is opened read/write here and always closed again before
+ * returning. Returns Z_OK, or Z_POOL with the libpool error text copied into
+ * pool_err (at most err_size bytes).
+ */
+static int
+verify_del_tmp_pool(pool_conf_t *pconf, char *tmp_name, char *pool_err,
+ int err_size, struct zone_psettab *pset_tab, boolean_t *exists)
+{
+ int res = Z_OK;
+ pool_t *pool;
+ pool_resource_t *pset;
+ struct zone_psettab pset_current;
+
+ *exists = B_FALSE;
+
+ if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR)
+ != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ return (res);
+ }
+
+ /* the tmp pool and its pset share the same name */
+ pool = pool_get_pool(pconf, tmp_name);
+ pset = pool_get_resource(pconf, "pset", tmp_name);
+
+ if (pool == NULL && pset == NULL) {
+ /* no tmp pool configured */
+ goto done;
+ }
+
+ /*
+ * If an existing tmp pool for this zone is configured with the proper
+ * settings, then the tmp pool is valid.
+ */
+ if (get_running_tmp_pset(pconf, pool, pset, &pset_current)
+ == PO_SUCCESS &&
+ strcmp(pset_tab->zone_ncpu_min,
+ pset_current.zone_ncpu_min) == 0 &&
+ strcmp(pset_tab->zone_ncpu_max,
+ pset_current.zone_ncpu_max) == 0 &&
+ strcmp(pset_tab->zone_importance,
+ pset_current.zone_importance) == 0) {
+ *exists = B_TRUE;
+
+ } else {
+ /*
+ * An out-of-date tmp pool configuration exists. Delete it
+ * so that we can create the correct tmp pool config.
+ */
+ if (pset != NULL &&
+ pool_resource_destroy(pconf, pset) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ if (pool != NULL &&
+ pool_destroy(pconf, pool) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ /* commit dynamic config */
+ if (pool_conf_commit(pconf, 0) != PO_SUCCESS)
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ }
+
+done:
+ (void) pool_conf_close(pconf);
+
+ return (res);
+}
+
+/*
+ * Destroy any existing tmp pool.
+ *
+ * The tmp pool and pset for zone_name (both named after TMP_POOL_NAME) are
+ * looked up in the dynamic pool configuration, destroyed if present, and the
+ * change is committed.
+ *
+ * Returns Z_OK if the pools facility is not enabled, if there is nothing to
+ * destroy, or if the destroy was committed. Otherwise the libpool error text
+ * is copied into pool_err (at most err_size bytes) and Z_POOL is returned.
+ */
+int
+zonecfg_destroy_tmp_pool(char *zone_name, char *pool_err, int err_size)
+{
+ int status;
+ int res = Z_OK;
+ pool_conf_t *pconf;
+ pool_t *pool;
+ pool_resource_t *pset;
+ char tmp_name[MAX_TMP_POOL_NAME];
+
+ /* if pools not enabled then nothing to do */
+ if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED)
+ return (Z_OK);
+
+ if ((pconf = pool_conf_alloc()) == NULL)
+ return (zerr_pool(pool_err, err_size, Z_POOL));
+
+ (void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name);
+
+ if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR)
+ != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ pool_conf_free(pconf);
+ return (res);
+ }
+
+ /* the tmp pool and its pset share the same name */
+ pool = pool_get_pool(pconf, tmp_name);
+ pset = pool_get_resource(pconf, "pset", tmp_name);
+
+ if (pool == NULL && pset == NULL) {
+ /* nothing to destroy, we're done */
+ goto done;
+ }
+
+ if (pset != NULL && pool_resource_destroy(pconf, pset) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ if (pool != NULL && pool_destroy(pconf, pool) != PO_SUCCESS) {
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+ goto done;
+ }
+
+ /* commit dynamic config */
+ if (pool_conf_commit(pconf, 0) != PO_SUCCESS)
+ res = zerr_pool(pool_err, err_size, Z_POOL);
+
+done:
+ /* close and free the configuration on every path after a successful open */
+ (void) pool_conf_close(pconf);
+ pool_conf_free(pconf);
+
+ return (res);
+}
+
+/*
+ * Attempt to bind to a tmp pool for this zone. If there is no tmp pool
+ * configured, we just return Z_OK.
+ *
+ * We either attempt to create the tmp pool for this zone or rebind to an
+ * existing tmp pool for this zone.
+ *
+ * Rebinding is used when a zone with a tmp pool reboots so that we don't have
+ * to recreate the tmp pool. To do this we need to be sure we work correctly
+ * for the following cases:
+ *
+ *	- there is an existing, properly configured tmp pool.
+ *	- zonecfg added tmp pool after zone was booted, must now create.
+ *	- zonecfg updated tmp pool config after zone was booted, in this case
+ *	  we destroy the old tmp pool and create a new one.
+ *
+ * Return values:
+ *	Z_OK		no tmp pool is configured, or the zone was bound
+ *	Z_SYSTEM	the zone name for zoneid could not be determined
+ *	Z_POOL_ENABLE	the pools facility could not be enabled
+ *	Z_POOL, Z_POOL_CREATE, Z_POOL_BIND
+ *			libpool failure; the libpool error text is copied
+ *			into pool_err (at most err_size bytes)
+ * Any other error from zonecfg_lookup_pset() is returned unchanged.
+ */
+int
+zonecfg_bind_tmp_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err,
+    int err_size)
+{
+	struct zone_psettab pset_tab;
+	int err;
+	int status;
+	ssize_t namelen;
+	pool_conf_t *pconf;
+	boolean_t exists;
+	char zone_name[ZONENAME_MAX];
+	char tmp_name[MAX_TMP_POOL_NAME];
+
+	namelen = getzonenamebyid(zoneid, zone_name, sizeof (zone_name));
+
+	err = zonecfg_lookup_pset(handle, &pset_tab);
+
+	/* if no temporary pool configured, we're done */
+	if (err == Z_NO_ENTRY)
+		return (Z_OK);
+
+	/* any other failure leaves pset_tab unusable, so stop here */
+	if (err != Z_OK)
+		return (err);
+
+	/*
+	 * The zone name is needed to build the tmp pool name; it is only
+	 * checked now so that a zone without a tmp pool is unaffected.
+	 */
+	if (namelen < 0)
+		return (Z_SYSTEM);
+
+	/*
+	 * importance might not have a value but we need to validate it here,
+	 * so set the default.
+	 */
+	if (pset_tab.zone_importance[0] == '\0')
+		(void) strlcpy(pset_tab.zone_importance, "1",
+		    sizeof (pset_tab.zone_importance));
+
+	/* if pools not enabled, enable them now */
+	if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) {
+		if (pool_set_status(POOL_ENABLED) != PO_SUCCESS)
+			return (Z_POOL_ENABLE);
+	}
+
+	if ((pconf = pool_conf_alloc()) == NULL)
+		return (zerr_pool(pool_err, err_size, Z_POOL));
+
+	(void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name);
+
+	/*
+	 * Check if a valid tmp pool/pset already exists. If so, we just
+	 * reuse it.
+	 */
+	if ((err = verify_del_tmp_pool(pconf, tmp_name, pool_err, err_size,
+	    &pset_tab, &exists)) != Z_OK) {
+		pool_conf_free(pconf);
+		return (err);
+	}
+
+	if (!exists)
+		err = create_tmp_pool(pool_err, err_size, pconf, tmp_name,
+		    &pset_tab);
+
+	pool_conf_free(pconf);
+
+	if (err != Z_OK)
+		return (err);
+
+	/* Bind the zone to the pool. */
+	if (pool_set_binding(tmp_name, P_ZONEID, zoneid) != PO_SUCCESS)
+		return (zerr_pool(pool_err, err_size, Z_POOL_BIND));
+
+	return (Z_OK);
+}
+
+/*
+ * Attempt to bind to a permanent pool for this zone. If there is no
+ * permanent pool configured, we just return Z_OK.
+ *
+ * Return values:
+ * Z_OK no pool is configured (or the pool name could not be
+ * read), or the zone was bound to its pool
+ * Z_POOLS_NOT_ACTIVE the pools facility is not enabled
+ * Z_POOL libpool failure; error text is copied into pool_err
+ * Z_NO_POOL the configured pool is not in the dynamic configuration
+ * Z_POOL_BIND binding failed; pool_err holds the pool name
+ */
+int
+zonecfg_bind_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err,
+ int err_size)
+{
+ pool_conf_t *poolconf;
+ pool_t *pool;
+ char poolname[MAXPATHLEN];
+ int status;
+ int error;
+
+ /*
+ * Find the pool mentioned in the zone configuration, and bind to it.
+ */
+ error = zonecfg_get_pool(handle, poolname, sizeof (poolname));
+ if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) {
+ /*
+ * The property is not set on the zone, so the pool
+ * should be bound to the default pool. But that's
+ * already done by the kernel, so we can just return.
+ */
+ return (Z_OK);
+ }
+ if (error != Z_OK) {
+ /*
+ * Not an error, even though it shouldn't be happening.
+ */
+ return (Z_OK);
+ }
+ /*
+ * Don't do anything if pools aren't enabled.
+ */
+ if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED)
+ return (Z_POOLS_NOT_ACTIVE);
+
+ /*
+ * Try to provide a sane error message if the requested pool doesn't
+ * exist.
+ */
+ if ((poolconf = pool_conf_alloc()) == NULL)
+ return (zerr_pool(pool_err, err_size, Z_POOL));
+
+ if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) !=
+ PO_SUCCESS) {
+ pool_conf_free(poolconf);
+ return (zerr_pool(pool_err, err_size, Z_POOL));
+ }
+ pool = pool_get_pool(poolconf, poolname);
+ (void) pool_conf_close(poolconf);
+ pool_conf_free(poolconf);
+ /* pool is only compared against NULL after the configuration is closed */
+ if (pool == NULL)
+ return (Z_NO_POOL);
+
+ /*
+ * Bind the zone to the pool.
+ */
+ if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) {
+ /* if bind fails, return poolname for the error msg */
+ (void) strlcpy(pool_err, poolname, err_size);
+ return (Z_POOL_BIND);
+ }
+
+ return (Z_OK);
+}
+
+
+/*
+ * Report whether the SMF service instance svc_name has its general/enabled
+ * property set. Returns B_TRUE only when the property can be read, holds
+ * exactly one value and that value is a true boolean; any lookup failure is
+ * reported as B_FALSE.
+ */
+static boolean_t
+svc_enabled(char *svc_name)
+{
+	scf_simple_prop_t *prop;
+	uint8_t *valp;
+	boolean_t found = B_FALSE;
+
+	prop = scf_simple_prop_get(NULL, svc_name, SCF_PG_GENERAL,
+	    SCF_PROPERTY_ENABLED);
+
+	/* the property could not be fetched (e.g. no such service) */
+	if (prop == NULL)
+		return (B_FALSE);
+
+	/*
+	 * scf_simple_prop_next_boolean() returns NULL when the property is
+	 * not a boolean, so the result is checked before it is dereferenced.
+	 */
+	if (scf_simple_prop_numvalues(prop) == 1 &&
+	    (valp = scf_simple_prop_next_boolean(prop)) != NULL &&
+	    *valp != 0)
+		found = B_TRUE;
+
+	scf_simple_prop_free(prop);
+
+	return (found);
+}
+
+/*
+ * Make sure the rcap service is enabled; this is needed when the zone has
+ * capped-memory. Nothing is done if the service is already enabled. If
+ * enabling fails, the SCF error text is copied into err (at most size
+ * bytes) and Z_SYSTEM is returned; otherwise Z_OK is returned.
+ */
+int
+zonecfg_enable_rcapd(char *err, int size)
+{
+	if (svc_enabled(RCAP_SERVICE))
+		return (Z_OK);
+
+	if (smf_enable_instance(RCAP_SERVICE, 0) != -1)
+		return (Z_OK);
+
+	(void) strlcpy(err, scf_strerror(scf_error()), size);
+	return (Z_SYSTEM);
+}
+
+/*
+ * Return true if pset has cpu range specified and poold is not enabled.
+ *
+ * B_FALSE is returned when the zone has no pset configured, when the pset
+ * cannot be looked up, when ncpu_min equals ncpu_max (no range), or when the
+ * poold service is enabled.
+ */
+boolean_t
+zonecfg_warn_poold(zone_dochandle_t handle)
+{
+	struct zone_psettab pset_tab;
+	int min, max;
+	int err;
+
+	err = zonecfg_lookup_pset(handle, &pset_tab);
+
+	/*
+	 * If no temporary pool is configured, we're done. Any other lookup
+	 * failure leaves pset_tab unusable, so there is nothing to warn
+	 * about in that case either.
+	 */
+	if (err != Z_OK)
+		return (B_FALSE);
+
+	min = atoi(pset_tab.zone_ncpu_min);
+	max = atoi(pset_tab.zone_ncpu_max);
+
+	/* range not specified, no need for poold */
+	if (min == max)
+		return (B_FALSE);
+
+	/* we have a range, check if poold service is enabled */
+	if (svc_enabled(POOLD_SERVICE))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Look up the scheduling class ("pool.scheduler" property) of the pool
+ * called poolname in the dynamic pool configuration and copy it into class,
+ * a buffer of clsize bytes.
+ *
+ * Return values:
+ *	Z_OK		class holds the pool's scheduling class
+ *	Z_NO_POOL	pools are not enabled, the configuration could not be
+ *			opened, or the pool does not exist
+ *	Z_NO_ENTRY	the pool exists but specifies no scheduling class;
+ *			class is not modified
+ *	Z_TOO_BIG	the class name does not fit in clsize bytes
+ *	Z_NOMEM		the scratch pool value could not be allocated
+ *
+ * The pool value and the configuration are released on every path.
+ */
+static int
+get_pool_sched_class(char *poolname, char *class, int clsize)
+{
+	int status;
+	int res = Z_NO_POOL;
+	pool_conf_t *poolconf;
+	pool_t *pool;
+	pool_elem_t *pe;
+	pool_value_t *pv;
+	const char *sched_str;
+
+	if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED)
+		return (Z_NO_POOL);
+
+	if ((pv = pool_value_alloc()) == NULL)
+		return (Z_NOMEM);
+
+	if ((poolconf = pool_conf_alloc()) == NULL) {
+		pool_value_free(pv);
+		return (Z_NO_POOL);
+	}
+
+	if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) !=
+	    PO_SUCCESS)
+		goto out_free;
+
+	if ((pool = pool_get_pool(poolconf, poolname)) == NULL)
+		goto out_close;
+
+	/*
+	 * A pool without a scheduler property must not be reported as
+	 * success: class would be left unset while the caller believes it
+	 * was filled in.
+	 */
+	res = Z_NO_ENTRY;
+	pe = pool_to_elem(poolconf, pool);
+	if (pool_get_property(poolconf, pe, "pool.scheduler", pv)
+	    != POC_INVAL &&
+	    pool_value_get_string(pv, &sched_str) == PO_SUCCESS) {
+		/* copy before the value and configuration are released */
+		if (strlcpy(class, sched_str, clsize) >= (size_t)clsize)
+			res = Z_TOO_BIG;
+		else
+			res = Z_OK;
+	}
+
+out_close:
+	(void) pool_conf_close(poolconf);
+out_free:
+	pool_conf_free(poolconf);
+	pool_value_free(pv);
+	return (res);
+}
+
+/*
+ * Get the default scheduling class for the zone into class (clsize bytes).
+ * The class set on the zone's pool is preferred; when the zone has no pool,
+ * or get_pool_sched_class() does not return Z_OK for it, the system default
+ * scheduling class is fetched with priocntl(PC_GETDFLCL) instead.
+ *
+ * Returns Z_OK on success and Z_TOO_BIG if the priocntl() call fails.
+ */
+int
+zonecfg_get_dflt_sched_class(zone_dochandle_t handle, char *class, int clsize)
+{
+	char poolname[MAXPATHLEN];
+
+	/* check if the zone's pool specified a sched class */
+	if (zonecfg_get_pool(handle, poolname, sizeof (poolname)) == Z_OK &&
+	    get_pool_sched_class(poolname, class, clsize) == Z_OK)
+		return (Z_OK);
+
+	return ((priocntl(0, 0, PC_GETDFLCL, class, (uint64_t)clsize) == -1) ?
+	    Z_TOO_BIG : Z_OK);
+}
+
int
zonecfg_setfsent(zone_dochandle_t handle)
{
@@ -4825,6 +5864,509 @@ zonecfg_enddsent(zone_dochandle_t handle)
return (zonecfg_endent(handle));
}
+/*
+ * Support for aliased rctls; that is, rctls that have simplified names in
+ * zonecfg.  For example, max-lwps is an alias for a well defined zone.max-lwps
+ * rctl.  If there are multiple existing values for one of these rctls or if
+ * there is a single value that does not match the well defined template (i.e.
+ * it has a different action) then we cannot treat the rctl as having an alias
+ * so we return Z_ALIAS_DISALLOW.  That means that the rctl cannot be
+ * managed in zonecfg via an alias and that the standard rctl syntax must be
+ * used.
+ *
+ * An rctl entry that has no value at all, or whose value is missing one of
+ * its priv/limit/action attributes, is likewise treated as incompatible
+ * (Z_ALIAS_DISALLOW) instead of being compared with unset data.
+ *
+ * The possible return values are:
+ *	Z_NO_PROPERTY_ID - invalid alias name
+ *	Z_ALIAS_DISALLOW - pre-existing, incompatible rctl definition
+ *	Z_NO_ENTRY - no rctl is configured for this alias
+ *	Z_OK - we got a valid rctl for the specified alias
+ * Errors from operation_prep() are returned unchanged.  *rval is written
+ * only when Z_OK is returned.
+ */
+int
+zonecfg_get_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t *rval)
+{
+	boolean_t found = B_FALSE;
+	boolean_t found_val = B_FALSE;
+	xmlNodePtr cur, val;
+	char savedname[MAXNAMELEN];
+	struct zone_rctlvaltab rctl;
+	int i;
+	int err;
+
+	for (i = 0; aliases[i].shortname != NULL; i++)
+		if (strcmp(name, aliases[i].shortname) == 0)
+			break;
+
+	if (aliases[i].shortname == NULL)
+		return (Z_NO_PROPERTY_ID);
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	cur = handle->zone_dh_cur;
+	for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
+		if (xmlStrcmp(cur->name, DTD_ELEM_RCTL) != 0)
+			continue;
+		if (fetchprop(cur, DTD_ATTR_NAME, savedname,
+		    sizeof (savedname)) != Z_OK ||
+		    strcmp(savedname, aliases[i].realname) != 0)
+			continue;
+
+		/*
+		 * If we already saw one of these, we can't have an
+		 * alias since we just found another.
+		 */
+		if (found)
+			return (Z_ALIAS_DISALLOW);
+		found = B_TRUE;
+
+		for (val = cur->xmlChildrenNode; val != NULL;
+		    val = val->next) {
+			/*
+			 * If we already have one value, we can't have
+			 * an alias since we just found another.
+			 */
+			if (found_val)
+				return (Z_ALIAS_DISALLOW);
+			found_val = B_TRUE;
+
+			/*
+			 * A value we cannot read in full can never match
+			 * the template, so reject it right away rather
+			 * than comparing partially filled-in fields.
+			 */
+			if (fetchprop(val, DTD_ATTR_PRIV,
+			    rctl.zone_rctlval_priv,
+			    sizeof (rctl.zone_rctlval_priv)) != Z_OK ||
+			    fetchprop(val, DTD_ATTR_LIMIT,
+			    rctl.zone_rctlval_limit,
+			    sizeof (rctl.zone_rctlval_limit)) != Z_OK ||
+			    fetchprop(val, DTD_ATTR_ACTION,
+			    rctl.zone_rctlval_action,
+			    sizeof (rctl.zone_rctlval_action)) != Z_OK)
+				return (Z_ALIAS_DISALLOW);
+		}
+
+		/* an rctl without any value does not fit the template */
+		if (!found_val)
+			return (Z_ALIAS_DISALLOW);
+
+		/* check priv and action match the expected vals */
+		if (strcmp(rctl.zone_rctlval_priv, aliases[i].priv) != 0 ||
+		    strcmp(rctl.zone_rctlval_action, aliases[i].action) != 0)
+			return (Z_ALIAS_DISALLOW);
+	}
+
+	if (found) {
+		*rval = strtoull(rctl.zone_rctlval_limit, NULL, 10);
+		return (Z_OK);
+	}
+
+	return (Z_NO_ENTRY);
+}
+
+/*
+ * Delete the rctl that backs the alias `name`.  Nothing is removed, and
+ * Z_OK is returned, unless zonecfg_get_aliased_rctl() reports a valid
+ * aliased rctl; entries with non-standard or multiple values are thereby
+ * left alone.  Otherwise the result of zonecfg_delete_rctl() is returned.
+ */
+int
+zonecfg_rm_aliased_rctl(zone_dochandle_t handle, char *name)
+{
+	struct zone_rctltab target;
+	uint64_t unused;
+	int idx = 0;
+
+	/* Only a well-formed aliased rctl is eligible for removal. */
+	if (zonecfg_get_aliased_rctl(handle, name, &unused) != Z_OK)
+		return (Z_OK);
+
+	/* Map the short alias name to its table slot. */
+	while (aliases[idx].shortname != NULL &&
+	    strcmp(name, aliases[idx].shortname) != 0)
+		idx++;
+
+	if (aliases[idx].shortname == NULL)
+		return (Z_NO_RESOURCE_ID);
+
+	(void) strlcpy(target.zone_rctl_name, aliases[idx].realname,
+	    sizeof (target.zone_rctl_name));
+
+	return (zonecfg_delete_rctl(handle, &target));
+}
+
+/*
+ * Report whether the alias `name` may be used: B_TRUE when
+ * zonecfg_get_aliased_rctl() returns Z_OK or Z_NO_ENTRY, B_FALSE for any
+ * other result.
+ */
+boolean_t
+zonecfg_aliased_rctl_ok(zone_dochandle_t handle, char *name)
+{
+	uint64_t ignored;
+	int rv;
+
+	rv = zonecfg_get_aliased_rctl(handle, name, &ignored);
+	if (rv == Z_OK || rv == Z_NO_ENTRY)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Set the rctl behind the alias `name` to the single value `val`, using the
+ * priv and action recorded for that alias in the aliases table.  Any
+ * existing aliased definition is removed first.
+ *
+ * Returns Z_ALIAS_DISALLOW when the alias cannot be used (see
+ * zonecfg_aliased_rctl_ok()), Z_NO_RESOURCE_ID for an unknown alias,
+ * Z_NOMEM on allocation failure, or the result of
+ * zonecfg_add_rctl_value()/zonecfg_add_rctl().
+ */
+int
+zonecfg_set_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t val)
+{
+	int i;
+	int err;
+	struct zone_rctltab rctltab;
+	struct zone_rctlvaltab *rctlvaltab;
+	char buf[128];
+
+	if (!zonecfg_aliased_rctl_ok(handle, name))
+		return (Z_ALIAS_DISALLOW);
+
+	for (i = 0; aliases[i].shortname != NULL; i++)
+		if (strcmp(name, aliases[i].shortname) == 0)
+			break;
+
+	if (aliases[i].shortname == NULL)
+		return (Z_NO_RESOURCE_ID);
+
+	/* remove any pre-existing definition for this rctl */
+	(void) zonecfg_rm_aliased_rctl(handle, name);
+
+	(void) strlcpy(rctltab.zone_rctl_name, aliases[i].realname,
+	    sizeof (rctltab.zone_rctl_name));
+
+	rctltab.zone_rctl_valptr = NULL;
+
+	if ((rctlvaltab = calloc(1, sizeof (struct zone_rctlvaltab))) == NULL)
+		return (Z_NOMEM);
+
+	/* the cast matches the unsigned conversion used in the format */
+	(void) snprintf(buf, sizeof (buf), "%llu", (unsigned long long)val);
+
+	(void) strlcpy(rctlvaltab->zone_rctlval_priv, aliases[i].priv,
+	    sizeof (rctlvaltab->zone_rctlval_priv));
+	(void) strlcpy(rctlvaltab->zone_rctlval_limit, buf,
+	    sizeof (rctlvaltab->zone_rctlval_limit));
+	(void) strlcpy(rctlvaltab->zone_rctlval_action, aliases[i].action,
+	    sizeof (rctlvaltab->zone_rctlval_action));
+
+	rctlvaltab->zone_rctlval_next = NULL;
+
+	/* the value was never linked into rctltab, so release it here */
+	if ((err = zonecfg_add_rctl_value(&rctltab, rctlvaltab)) != Z_OK) {
+		free(rctlvaltab);
+		return (err);
+	}
+
+	/*
+	 * The value list was only scratch space for building the entry;
+	 * release it once the rctl has been handed to zonecfg_add_rctl().
+	 * NOTE(review): assumes zonecfg_add_rctl() copies the values into
+	 * the document rather than retaining the list -- confirm.
+	 */
+	err = zonecfg_add_rctl(handle, &rctltab);
+	zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
+
+	return (err);
+}
+
+/*
+ * Remove the first tmp_pool child element, if any, from the node the
+ * handle currently points at.
+ *
+ * Returns Z_OK when an element was unlinked and freed, Z_NO_RESOURCE_ID
+ * when there was none, or the error from operation_prep().
+ */
+static int
+delete_tmp_pool(zone_dochandle_t handle)
+{
+	int err;
+	xmlNodePtr cur;
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	/*
+	 * Read the current node only after operation_prep() has succeeded,
+	 * as the lookup routines do, so the handle is never dereferenced
+	 * before it has been prepared.
+	 */
+	cur = handle->zone_dh_cur;
+	for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
+		if (xmlStrcmp(cur->name, DTD_ELEM_TMPPOOL) == 0) {
+			/* return at once: cur is invalid after the free */
+			xmlUnlinkNode(cur);
+			xmlFreeNode(cur);
+			return (Z_OK);
+		}
+	}
+
+	return (Z_NO_RESOURCE_ID);
+}
+
+/*
+ * Replace the tmp_pool element: any existing one is deleted and, when
+ * pool_importance is a non-empty string, a new tmp_pool element carrying
+ * that importance attribute is created.
+ *
+ * Returns Z_OK on success, or the error from delete_tmp_pool() (other than
+ * Z_NO_RESOURCE_ID, which is not an error here), operation_prep() or
+ * newprop().
+ */
+static int
+modify_tmp_pool(zone_dochandle_t handle, char *pool_importance)
+{
+	int err;
+	xmlNodePtr cur;
+	xmlNodePtr newnode;
+
+	err = delete_tmp_pool(handle);
+	if (err != Z_OK && err != Z_NO_RESOURCE_ID)
+		return (err);
+
+	if (*pool_importance != '\0') {
+		if ((err = operation_prep(handle)) != Z_OK)
+			return (err);
+
+		/*
+		 * Fetch the parent node only after operation_prep(), as the
+		 * lookup routines do, rather than capturing it on entry.
+		 */
+		cur = handle->zone_dh_cur;
+		newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_TMPPOOL, NULL);
+		if ((err = newprop(newnode, DTD_ATTR_IMPORTANCE,
+		    pool_importance)) != Z_OK)
+			return (err);
+	}
+
+	return (Z_OK);
+}
+
+/*
+ * Create a pset element under the handle's current node, set its ncpu_min
+ * and ncpu_max attributes from tabptr, then update the tmp_pool element
+ * with tabptr's importance.  Returns Z_OK or the first error encountered.
+ */
+static int
+add_pset_core(zone_dochandle_t handle, struct zone_psettab *tabptr)
+{
+	xmlNodePtr parent = handle->zone_dh_cur;
+	xmlNodePtr pset;
+	int err;
+
+	pset = xmlNewTextChild(parent, NULL, DTD_ELEM_PSET, NULL);
+
+	err = newprop(pset, DTD_ATTR_NCPU_MIN, tabptr->zone_ncpu_min);
+	if (err != Z_OK)
+		return (err);
+
+	err = newprop(pset, DTD_ATTR_NCPU_MAX, tabptr->zone_ncpu_max);
+	if (err != Z_OK)
+		return (err);
+
+	/* the importance is kept on the tmp_pool element, not the pset */
+	return (modify_tmp_pool(handle, tabptr->zone_importance));
+}
+
+/*
+ * Add the pset described by tabptr to the configuration.  Returns Z_INVAL
+ * for a NULL tabptr, otherwise the result of operation_prep() followed by
+ * add_pset_core().
+ */
+int
+zonecfg_add_pset(zone_dochandle_t handle, struct zone_psettab *tabptr)
+{
+	int err;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	err = operation_prep(handle);
+	if (err == Z_OK)
+		err = add_pset_core(handle, tabptr);
+
+	return (err);
+}
+
+/*
+ * Remove the first pset element from the configuration, together with the
+ * tmp_pool element.
+ *
+ * Returns Z_OK when a pset was removed, Z_NO_RESOURCE_ID when there was
+ * none, or an error from operation_prep() or delete_tmp_pool().
+ */
+int
+zonecfg_delete_pset(zone_dochandle_t handle)
+{
+	int err;
+	int res = Z_NO_RESOURCE_ID;
+	xmlNodePtr cur;
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	/*
+	 * Read the current node only after operation_prep() has succeeded,
+	 * as the lookup routines do, so the handle is never dereferenced
+	 * before it has been prepared.
+	 */
+	cur = handle->zone_dh_cur;
+	for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
+		if (xmlStrcmp(cur->name, DTD_ELEM_PSET) == 0) {
+			/* leave the loop at once: cur is freed here */
+			xmlUnlinkNode(cur);
+			xmlFreeNode(cur);
+			res = Z_OK;
+			break;
+		}
+	}
+
+	/*
+	 * Once we have msets, we should check that an mset
+	 * does not exist before we delete the tmp_pool data.
+	 */
+	err = delete_tmp_pool(handle);
+	if (err != Z_OK && err != Z_NO_RESOURCE_ID)
+		return (err);
+
+	return (res);
+}
+
+/*
+ * Replace the configured pset with the one described by tabptr: the
+ * existing pset is deleted and a new one is added.  Returns Z_INVAL for a
+ * NULL tabptr; any failure of zonecfg_delete_pset() (including "nothing to
+ * delete") or of add_pset_core() is returned as is.
+ */
+int
+zonecfg_modify_pset(zone_dochandle_t handle, struct zone_psettab *tabptr)
+{
+	int err;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	err = zonecfg_delete_pset(handle);
+	if (err == Z_OK)
+		err = add_pset_core(handle, tabptr);
+
+	return (err);
+}
+
+/*
+ * Fill tabptr from the configuration: ncpu_min/ncpu_max come from the pset
+ * element and the importance from the tmp_pool element.  The importance is
+ * optional and is left as an empty string when no tmp_pool element exists.
+ *
+ * Returns Z_OK when a pset element was read, Z_NO_ENTRY when there is none,
+ * Z_INVAL for a NULL tabptr, or an error from operation_prep()/fetchprop().
+ * On a fetchprop() failure the handle's current node is reset to the top.
+ */
+int
+zonecfg_lookup_pset(zone_dochandle_t handle, struct zone_psettab *tabptr)
+{
+	xmlNodePtr node;
+	int err;
+	int res = Z_NO_ENTRY;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	/* this is an optional component */
+	tabptr->zone_importance[0] = '\0';
+
+	for (node = handle->zone_dh_cur->xmlChildrenNode; node != NULL;
+	    node = node->next) {
+		if (xmlStrcmp(node->name, DTD_ELEM_PSET) == 0) {
+			err = fetchprop(node, DTD_ATTR_NCPU_MIN,
+			    tabptr->zone_ncpu_min,
+			    sizeof (tabptr->zone_ncpu_min));
+			if (err != Z_OK)
+				goto fail;
+
+			err = fetchprop(node, DTD_ATTR_NCPU_MAX,
+			    tabptr->zone_ncpu_max,
+			    sizeof (tabptr->zone_ncpu_max));
+			if (err != Z_OK)
+				goto fail;
+
+			res = Z_OK;
+			continue;
+		}
+
+		if (xmlStrcmp(node->name, DTD_ELEM_TMPPOOL) == 0) {
+			err = fetchprop(node, DTD_ATTR_IMPORTANCE,
+			    tabptr->zone_importance,
+			    sizeof (tabptr->zone_importance));
+			if (err != Z_OK)
+				goto fail;
+		}
+	}
+
+	return (res);
+
+fail:
+	/* common failure path: rewind the handle before reporting */
+	handle->zone_dh_cur = handle->zone_dh_top;
+	return (err);
+}
+
+/*
+ * Convenience wrapper: bracket zonecfg_lookup_pset() with
+ * zonecfg_setent()/zonecfg_endent().  Returns the zonecfg_setent() error
+ * if that fails, otherwise the lookup result.
+ */
+int
+zonecfg_getpsetent(zone_dochandle_t handle, struct zone_psettab *tabptr)
+{
+	int err = zonecfg_setent(handle);
+
+	if (err == Z_OK) {
+		err = zonecfg_lookup_pset(handle, tabptr);
+		(void) zonecfg_endent(handle);
+	}
+
+	return (err);
+}
+
+/*
+ * Create an mcap element under the handle's current node and set its
+ * physcap attribute from tabptr.  Returns the result of newprop().
+ */
+static int
+add_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr)
+{
+	xmlNodePtr parent = handle->zone_dh_cur;
+	xmlNodePtr mcap;
+
+	mcap = xmlNewTextChild(parent, NULL, DTD_ELEM_MCAP, NULL);
+
+	return (newprop(mcap, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap));
+}
+
+/*
+ * Remove the first mcap element from the configuration.
+ *
+ * Returns Z_OK when an element was unlinked and freed, Z_NO_RESOURCE_ID
+ * when there was none, or the error from operation_prep().
+ */
+int
+zonecfg_delete_mcap(zone_dochandle_t handle)
+{
+	int err;
+	xmlNodePtr cur;
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	/*
+	 * Read the current node only after operation_prep() has succeeded,
+	 * as the lookup routines do, so the handle is never dereferenced
+	 * before it has been prepared.
+	 */
+	cur = handle->zone_dh_cur;
+	for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
+		if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) != 0)
+			continue;
+
+		/* return at once: cur is invalid after the free */
+		xmlUnlinkNode(cur);
+		xmlFreeNode(cur);
+		return (Z_OK);
+	}
+	return (Z_NO_RESOURCE_ID);
+}
+
+/*
+ * Set the memory cap to the one described by tabptr: any existing mcap
+ * element is deleted and a new one is added.  A missing mcap entry is not
+ * an error.  Returns Z_INVAL for a NULL tabptr, otherwise Z_OK or the
+ * first error from zonecfg_delete_mcap()/add_mcap().
+ */
+int
+zonecfg_modify_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr)
+{
+	int err;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	/* it is ok if there is no mcap entry */
+	err = zonecfg_delete_mcap(handle);
+	if (err == Z_OK || err == Z_NO_RESOURCE_ID)
+		err = add_mcap(handle, tabptr);
+
+	return (err);
+}
+
+/*
+ * Read the physcap attribute of the first mcap element into tabptr.
+ *
+ * Returns Z_OK on success, Z_NO_ENTRY when no mcap element exists, Z_INVAL
+ * for a NULL tabptr, or an error from operation_prep()/fetchprop().  On a
+ * fetchprop() failure the handle's current node is reset to the top.
+ */
+int
+zonecfg_lookup_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr)
+{
+	xmlNodePtr node;
+	int err;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	if ((err = operation_prep(handle)) != Z_OK)
+		return (err);
+
+	/* locate the first mcap child, if there is one */
+	for (node = handle->zone_dh_cur->xmlChildrenNode; node != NULL;
+	    node = node->next) {
+		if (xmlStrcmp(node->name, DTD_ELEM_MCAP) == 0)
+			break;
+	}
+
+	if (node == NULL)
+		return (Z_NO_ENTRY);
+
+	err = fetchprop(node, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap,
+	    sizeof (tabptr->zone_physmem_cap));
+	if (err != Z_OK) {
+		handle->zone_dh_cur = handle->zone_dh_top;
+		return (err);
+	}
+
+	return (Z_OK);
+}
+
+/*
+ * Starting at the handle's current node, scan forward through its siblings
+ * for an mcap element and read its physcap attribute into tabptr.
+ *
+ * On success the handle's current node is advanced past the element and
+ * Z_OK is returned.  Z_INVAL is returned for a NULL handle and Z_NO_ENTRY
+ * when the current node is NULL or no mcap element is found; when the scan
+ * or the fetchprop() call fails, the current node is reset to the top.
+ */
+static int
+getmcapent_core(zone_dochandle_t handle, struct zone_mcaptab *tabptr)
+{
+	xmlNodePtr node;
+	int err;
+
+	if (handle == NULL)
+		return (Z_INVAL);
+
+	node = handle->zone_dh_cur;
+	if (node == NULL)
+		return (Z_NO_ENTRY);
+
+	while (node != NULL && xmlStrcmp(node->name, DTD_ELEM_MCAP) != 0)
+		node = node->next;
+
+	if (node == NULL) {
+		handle->zone_dh_cur = handle->zone_dh_top;
+		return (Z_NO_ENTRY);
+	}
+
+	err = fetchprop(node, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap,
+	    sizeof (tabptr->zone_physmem_cap));
+	if (err != Z_OK) {
+		handle->zone_dh_cur = handle->zone_dh_top;
+		return (err);
+	}
+
+	/* resume after this element on the next call */
+	handle->zone_dh_cur = node->next;
+	return (Z_OK);
+}
+
+/*
+ * Convenience wrapper: bracket getmcapent_core() with
+ * zonecfg_setent()/zonecfg_endent().  Returns the zonecfg_setent() error
+ * if that fails, otherwise the result of getmcapent_core().
+ */
+int
+zonecfg_getmcapent(zone_dochandle_t handle, struct zone_mcaptab *tabptr)
+{
+	int err = zonecfg_setent(handle);
+
+	if (err == Z_OK) {
+		err = getmcapent_core(handle, tabptr);
+		(void) zonecfg_endent(handle);
+	}
+
+	return (err);
+}
+
int
zonecfg_setpkgent(zone_dochandle_t handle)
{
diff --git a/usr/src/lib/libzonecfg/common/mapfile-vers b/usr/src/lib/libzonecfg/common/mapfile-vers
index a9d59548d3..e2bb782688 100644
--- a/usr/src/lib/libzonecfg/common/mapfile-vers
+++ b/usr/src/lib/libzonecfg/common/mapfile-vers
@@ -40,10 +40,15 @@ SUNWprivate_1.1 {
zonecfg_add_fs_option;
zonecfg_add_ipd;
zonecfg_add_nwif;
+ zonecfg_add_pset;
zonecfg_add_rctl;
zonecfg_add_rctl_value;
zonecfg_add_scratch;
+ zonecfg_aliased_rctl_ok;
+ zonecfg_apply_rctls;
zonecfg_attach_manifest;
+ zonecfg_bind_pool;
+ zonecfg_bind_tmp_pool;
zonecfg_check_handle;
zonecfg_close_scratch;
zonecfg_construct_rctlblk;
@@ -54,15 +59,20 @@ SUNWprivate_1.1 {
zonecfg_delete_ds;
zonecfg_delete_filesystem;
zonecfg_delete_ipd;
+ zonecfg_delete_mcap;
zonecfg_delete_nwif;
+ zonecfg_delete_pset;
zonecfg_delete_rctl;
zonecfg_delete_scratch;
+ zonecfg_del_all_resources;
zonecfg_destroy;
zonecfg_destroy_snapshot;
+ zonecfg_destroy_tmp_pool;
zonecfg_detached;
zonecfg_detach_save;
zonecfg_devperms_apply;
zonecfg_devwalk;
+ zonecfg_enable_rcapd;
zonecfg_endattrent;
zonecfg_enddevent;
zonecfg_enddevperment;
@@ -78,6 +88,7 @@ SUNWprivate_1.1 {
zonecfg_fini_handle;
zonecfg_free_fs_option_list;
zonecfg_free_rctl_value_list;
+ zonecfg_get_aliased_rctl;
zonecfg_get_attach_handle;
zonecfg_get_attr_boolean;
zonecfg_getattrent;
@@ -88,6 +99,7 @@ SUNWprivate_1.1 {
zonecfg_get_bootargs;
zonecfg_get_brand;
zonecfg_get_detach_info;
+ zonecfg_get_dflt_sched_class;
zonecfg_getdevent;
zonecfg_getdevperment;
zonecfg_getdsent;
@@ -95,6 +107,7 @@ SUNWprivate_1.1 {
zonecfg_get_handle;
zonecfg_getipdent;
zonecfg_get_limitpriv;
+ zonecfg_getmcapent;
zonecfg_get_name;
zonecfg_get_name_by_uuid;
zonecfg_getnwifent;
@@ -102,8 +115,10 @@ SUNWprivate_1.1 {
zonecfg_getpkgent;
zonecfg_get_pool;
zonecfg_get_privset;
+ zonecfg_getpsetent;
zonecfg_getrctlent;
zonecfg_get_root;
+ zonecfg_get_sched_class;
zonecfg_get_scratch;
zonecfg_get_snapshot_handle;
zonecfg_get_template_handle;
@@ -120,28 +135,35 @@ SUNWprivate_1.1 {
zonecfg_lookup_ds;
zonecfg_lookup_filesystem;
zonecfg_lookup_ipd;
+ zonecfg_lookup_mcap;
zonecfg_lookup_nwif;
+ zonecfg_lookup_pset;
zonecfg_lookup_rctl;
zonecfg_modify_attr;
zonecfg_modify_dev;
zonecfg_modify_ds;
zonecfg_modify_filesystem;
zonecfg_modify_ipd;
+ zonecfg_modify_mcap;
zonecfg_modify_nwif;
+ zonecfg_modify_pset;
zonecfg_modify_rctl;
zonecfg_notify_bind;
zonecfg_notify_critical_abort;
zonecfg_notify_critical_enter;
zonecfg_notify_critical_exit;
zonecfg_notify_unbind;
+ zonecfg_num_resources;
zonecfg_open_scratch;
zonecfg_remove_fs_option;
zonecfg_remove_rctl_value;
zonecfg_reverse_scratch;
+ zonecfg_rm_aliased_rctl;
zonecfg_rm_detached;
zonecfg_same_net_address;
zonecfg_save;
zonecfg_setattrent;
+ zonecfg_set_aliased_rctl;
zonecfg_set_autoboot;
zonecfg_set_bootargs;
zonecfg_set_brand;
@@ -158,15 +180,22 @@ SUNWprivate_1.1 {
zonecfg_set_pool;
zonecfg_setrctlent;
zonecfg_set_root;
+ zonecfg_set_sched;
zonecfg_set_zonepath;
zonecfg_strerror;
+ zonecfg_str_to_bytes;
zonecfg_validate_zonename;
+ zonecfg_valid_alias_limit;
zonecfg_valid_fs_type;
+ zonecfg_valid_importance;
+ zonecfg_valid_memlimit;
+ zonecfg_valid_ncpus;
zonecfg_valid_net_address;
zonecfg_valid_rctl;
zonecfg_valid_rctlblk;
zonecfg_valid_rctlname;
zonecfg_verify_save;
+ zonecfg_warn_poold;
zone_get_brand;
zone_get_devroot;
zone_get_id;
diff --git a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
index 3208af7a79..c51e89add3 100644
--- a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
+++ b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
@@ -111,7 +111,27 @@
mode CDATA #REQUIRED
acl CDATA #REQUIRED>
-<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm)*>
+<!--
+ The tmp_pool element is separate from the pset element so that
+ we can track the importance value at the pool level, where it
+ belongs, instead of at the pset level. Once we have msets this
+ will be important since tmp psets and tmp msets will share a common
+ pool-level importance.
+-->
+<!ELEMENT tmp_pool EMPTY>
+
+<!ATTLIST tmp_pool importance CDATA #REQUIRED>
+
+<!ELEMENT pset EMPTY>
+
+<!ATTLIST pset ncpu_min CDATA #REQUIRED
+ ncpu_max CDATA #REQUIRED>
+
+<!ELEMENT mcap EMPTY>
+
+<!ATTLIST mcap physcap CDATA #REQUIRED>
+
+<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm | tmp_pool | pset | mcap)*>
<!ATTLIST zone name CDATA #REQUIRED
zonepath CDATA #REQUIRED
@@ -120,4 +140,5 @@
limitpriv CDATA ""
bootargs CDATA ""
brand CDATA ""
+ scheduling-class CDATA ""
version NMTOKEN #FIXED '1'>
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index c52316079d..6ac2e461ab 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -1178,6 +1178,7 @@ f none usr/include/sys/varargs.h 644 root bin
f none usr/include/sys/vfs.h 644 root bin
f none usr/include/sys/vfstab.h 644 root bin
f none usr/include/sys/vm.h 644 root bin
+f none usr/include/sys/vm_usage.h 644 root bin
f none usr/include/sys/vmem.h 644 root bin
f none usr/include/sys/vmem_impl.h 644 root bin
f none usr/include/sys/vmem_impl_user.h 644 root bin
diff --git a/usr/src/pkgdefs/SUNWrcapu/depend b/usr/src/pkgdefs/SUNWrcapu/depend
index 9aaa446bca..a7375758b0 100644
--- a/usr/src/pkgdefs/SUNWrcapu/depend
+++ b/usr/src/pkgdefs/SUNWrcapu/depend
@@ -1,13 +1,12 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -43,3 +42,4 @@
P SUNWrcapr Solaris Resource Capping Daemon (Root)
P SUNWcsu Core Solaris, (Usr)
P SUNWcsl Core Solaris, (Shared Libs)
+P SUNWzoneu Solaris Zones (Usr)
diff --git a/usr/src/pkgdefs/SUNWzoner/prototype_com b/usr/src/pkgdefs/SUNWzoner/prototype_com
index 009de7fb9f..15661840ea 100644
--- a/usr/src/pkgdefs/SUNWzoner/prototype_com
+++ b/usr/src/pkgdefs/SUNWzoner/prototype_com
@@ -56,9 +56,11 @@ f none etc/zones/SUNWblank.xml 444 root bin
d none lib 755 root bin
d none lib/svc 0755 root bin
d none lib/svc/method 0755 root bin
+f none lib/svc/method/svc-resource-mgmt 0555 root bin
f none lib/svc/method/svc-zones 0555 root bin
d none var 755 root sys
d none var/svc 755 root sys
d none var/svc/manifest 755 root sys
d none var/svc/manifest/system 755 root sys
+f manifest var/svc/manifest/system/resource-mgmt.xml 0444 root sys
f manifest var/svc/manifest/system/zones.xml 0444 root sys
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index 21d5a7eb67..b10d453c7b 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -332,6 +332,7 @@ superfluous_local_zone_files="
lib/svc/method/svc-poold
lib/svc/method/svc-pools
lib/svc/method/svc-power
+ lib/svc/method/svc-resource-mgmt
lib/svc/method/svc-rmvolmgr
lib/svc/method/svc-scheduler
lib/svc/method/svc-sckmd
@@ -401,6 +402,7 @@ superfluous_local_zone_files="
var/svc/manifest/system/poold.xml
var/svc/manifest/system/pools.xml
var/svc/manifest/system/power.xml
+ var/svc/manifest/system/resource-mgmt.xml
var/svc/manifest/system/scheduler.xml
var/svc/manifest/system/sysevent.xml
var/svc/manifest/system/zones.xml
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 32a63d6c22..b2bbcbc8c3 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -334,6 +334,7 @@ GENUNIX_OBJS += \
vm_seg.o \
vm_subr.o \
vm_swap.o \
+ vm_usage.o \
vnode.o \
vuid_queue.o \
vuid_store.o \
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 3bb90cf1fa..9197dc815b 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
struct pcmpargs pcmpargs;
pc_vaparms_t vaparms;
char clname[PC_CLNMSZ];
+ char *outstr;
int count;
kthread_id_t retthreadp;
proc_t *initpp;
@@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
int rv = 0;
pid_t saved_pid;
id_t classid;
+ int size;
int (*copyinfn)(const void *, void *, size_t);
int (*copyoutfn)(const void *, void *, size_t);
@@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg,
ASSERT(defaultcid > 0 && defaultcid < loaded_classes);
break;
+ case PC_GETDFLCL:
+ mutex_enter(&class_lock);
+
+ if (defaultcid >= loaded_classes)
+ outstr = "";
+ else
+ outstr = sclass[defaultcid].cl_name;
+ size = strlen(outstr) + 1;
+ if (arg != NULL)
+ if ((*copyoutfn)(outstr, arg, size) != 0)
+ error = EFAULT;
+
+ mutex_exit(&class_lock);
+ break;
+
default:
error = EINVAL;
break;
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
index 5a7000c242..c5145cccf0 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,6 +66,7 @@ tmp_resv(
int pagecreate) /* call anon_resv if set */
{
pgcnt_t pages = btopr(delta);
+ zone_t *zone;
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
@@ -79,9 +79,10 @@ tmp_resv(
*
* Deny if trying to reserve more than tmpfs can allocate
*/
+ zone = tm->tm_vfsp->vfs_zone;
if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) ||
- (!anon_checkspace(ptob(pages + tmpfs_minfree))) ||
- (anon_resv(delta) == 0))) {
+ (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) ||
+ (anon_resv_zone(delta, zone) == 0))) {
return (1);
}
@@ -114,7 +115,7 @@ tmp_unresv(
ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
ASSERT(tp->tn_type == VREG);
- anon_unresv(delta);
+ anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone);
mutex_enter(&tm->tm_contents);
tm->tm_anonmem -= btopr(delta);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index d623dce3f7..aa870b124a 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -215,9 +215,26 @@ wrtmp(
if (delta > 0) {
pagecreate = 1;
if (tmp_resv(tm, tp, delta, pagecreate)) {
- cmn_err(CE_WARN,
- "%s: File system full, swap space limit exceeded",
+ /*
+ * Log file system full in the zone that owns
+ * the tmpfs mount, as well as in the global
+ * zone if necessary.
+ */
+ zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
+ CE_WARN, "%s: File system full, "
+ "swap space limit exceeded",
tm->tm_mntpath);
+
+ if (tm->tm_vfsp->vfs_zone->zone_id !=
+ GLOBAL_ZONEID) {
+
+ vfs_t *vfs = tm->tm_vfsp;
+
+ zcmn_err(GLOBAL_ZONEID,
+ CE_WARN, "%s: File system full, "
+ "swap space limit exceeded",
+ vfs->vfs_vnodecovered->v_path);
+ }
error = ENOSPC;
break;
}
diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c
index 19700ce685..3c63231253 100644
--- a/usr/src/uts/common/os/modhash.c
+++ b/usr/src/uts/common/os/modhash.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -165,15 +164,6 @@
*/
#define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2))
-static void i_mod_hash_clear_nosync(mod_hash_t *);
-static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t *);
-static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t, mod_hash_hndl_t);
-static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t,
- mod_hash_val_t *);
-static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
-
/*
* Cache for struct mod_hash_entry
*/
@@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash)
* i_mod_hash()
* Call the hashing algorithm for this hash table, with the given key.
*/
-static uint_t
+uint_t
i_mod_hash(mod_hash_t *hash, mod_hash_key_t key)
{
uint_t h;
@@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key)
* mod_hash_find()
* Find a value in the hash table corresponding to the given key.
*/
-static int
+int
i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key,
mod_hash_val_t *val)
{
@@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
return (res);
}
-static void
+void
i_mod_hash_walk_nosync(mod_hash_t *hash,
uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
{
@@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash,
* Clears the given hash table by calling the destructor of every hash
* element and freeing up all mod_hash_entry's.
*/
-static void
+void
i_mod_hash_clear_nosync(mod_hash_t *hash)
{
int i;
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index 88b0258afe..fecc4a6c45 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -385,6 +385,56 @@ pgfind(pid_t pgid)
}
/*
+ * Sets P_PR_LOCK on a non-system process. Process must be fully created
+ * and not exiting to succeed.
+ *
+ * Returns 0 on success.
+ * Returns 1 if P_PR_LOCK is set.
+ * Returns -1 if proc is in invalid state.
+ */
+int
+sprtrylock_proc(proc_t *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+
+	/* processes still being created, or already zombies, are skipped */
+	if (p->p_stat == SIDL || p->p_stat == SZOMB)
+		return (-1);
+
+	/* so are system processes and processes that are exiting */
+	if (p->p_flag & (SSYS | SEXITING | SEXITLWPS))
+		return (-1);
+
+	/* already marked P_PR_LOCK: the caller has to wait */
+	if (p->p_proc_flag & P_PR_LOCK)
+		return (1);
+
+	/* mark the process P_PR_LOCK */
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+
+	return (0);
+}
+
+/*
+ * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped,
+ * and the proc pointer no longer valid, as the proc may have exited.
+ */
+void
+sprwaitlock_proc(proc_t *p)
+{
+ kmutex_t *mp;
+
+ /* the caller must hold p_lock, and P_PR_LOCK must currently be set */
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+
+ /*
+ * p_lock is persistent, but p itself is not -- it could
+ * vanish during cv_wait(). Load p->p_lock now so we can
+ * drop it after cv_wait() without referencing p.
+ */
+ mp = &p->p_lock;
+ /* wait on the condition variable for this process's pid slot */
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ /* drop the lock through the saved pointer; p is not touched again */
+ mutex_exit(mp);
+}
+
+/*
* If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
* Returns the proc pointer on success, NULL on failure. sprlock() is
* really just a stripped-down version of pr_p_lock() to allow practive
@@ -394,7 +444,7 @@ proc_t *
sprlock_zone(pid_t pid, zoneid_t zoneid)
{
proc_t *p;
- kmutex_t *mp;
+ int ret;
for (;;) {
mutex_enter(&pidlock);
@@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid)
mutex_exit(&pidlock);
return (NULL);
}
- /*
- * p_lock is persistent, but p itself is not -- it could
- * vanish during cv_wait(). Load p->p_lock now so we can
- * drop it after cv_wait() without referencing p.
- */
- mp = &p->p_lock;
- mutex_enter(mp);
+ mutex_enter(&p->p_lock);
mutex_exit(&pidlock);
- /*
- * If the process is in some half-baked state, fail.
- */
- if (p->p_stat == SZOMB || p->p_stat == SIDL ||
- (p->p_flag & (SEXITING | SEXITLWPS))) {
- mutex_exit(mp);
- return (NULL);
- }
+
if (panicstr)
return (p);
- if (!(p->p_proc_flag & P_PR_LOCK))
+
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ mutex_exit(&p->p_lock);
+ return (NULL);
+ } else if (ret == 0) {
break;
- cv_wait(&pr_pid_cv[p->p_slot], mp);
- mutex_exit(mp);
+ }
+ sprwaitlock_proc(p);
}
- p->p_proc_flag |= P_PR_LOCK;
- THREAD_KPRI_REQUEST();
return (p);
}
diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c
index ceb90850fa..818bb54701 100644
--- a/usr/src/uts/common/os/pool.c
+++ b/usr/src/uts/common/os/pool.c
@@ -293,6 +293,8 @@ pool_enable(void)
(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
+ (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
+ "wt-load");
(void) nvlist_alloc(&pool_default->pool_props,
NV_UNIQUE_NAME, KM_SLEEP);
@@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
}
if (idtype == P_PROJID) {
- kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND);
+ kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
if (kpj == NULL)
return (ESRCH);
mutex_enter(&kpj->kpj_poolbind);
diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c
index 6c266c0ca3..d75b60f6e9 100644
--- a/usr/src/uts/common/os/project.c
+++ b/usr/src/uts/common/os/project.c
@@ -29,6 +29,7 @@
#include <sys/modhash.h>
#include <sys/modctl.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/proc.h>
@@ -103,6 +104,8 @@ struct project_zone {
* acquired, the hash lock is to be acquired first.
*/
+static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone);
+static void project_kstat_delete(kproject_t *pj);
static void
project_data_init(kproject_data_t *data)
@@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data)
data->kpd_locked_mem_ctl = UINT64_MAX;
data->kpd_contract = 0;
data->kpd_crypto_mem = 0;
+ data->kpd_lockedmem_kstat = NULL;
}
/*ARGSUSED*/
@@ -179,11 +183,11 @@ project_hold(kproject_t *p)
}
/*
- * kproject_t *project_hold_by_id(projid_t, zoneid_t, int)
+ * kproject_t *project_hold_by_id(projid_t, zone_t *, int)
*
* Overview
* project_hold_by_id() performs a look-up in the dictionary of projects
- * active on the system by specified project ID + zone ID and puts a hold on
+ * active on the system by specified project ID + zone and puts a hold on
* it. The third argument defines the desired behavior in the case when
* project with given project ID cannot be found:
*
@@ -202,7 +206,7 @@ project_hold(kproject_t *p)
* Caller must be in a context suitable for KM_SLEEP allocations.
*/
kproject_t *
-project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
+project_hold_by_id(projid_t id, zone_t *zone, int flag)
{
kproject_t *spare_p;
kproject_t *p;
@@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
rctl_alloc_gp_t *gp;
rctl_entity_p_t e;
struct project_zone pz;
+ boolean_t create = B_FALSE;
+ kstat_t *ksp;
pz.kpj_id = id;
- pz.kpj_zoneid = zoneid;
+ pz.kpj_zoneid = zone->zone_id;
if (flag == PROJECT_HOLD_FIND) {
mutex_enter(&project_hash_lock);
@@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
mutex_enter(&project_hash_lock);
if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz,
(mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) {
+
p = spare_p;
p->kpj_id = id;
- p->kpj_zoneid = zoneid;
+ p->kpj_zoneid = zone->zone_id;
p->kpj_count = 0;
p->kpj_shares = 1;
p->kpj_nlwps = 0;
@@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
* Insert project into global project list.
*/
mutex_enter(&projects_list_lock);
- if (id != 0 || zoneid != GLOBAL_ZONEID) {
+ if (id != 0 || zone != &zone0) {
p->kpj_next = projects_list;
p->kpj_prev = projects_list->kpj_prev;
p->kpj_prev->kpj_next = p;
@@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
projects_list = p;
}
mutex_exit(&projects_list_lock);
+ create = B_TRUE;
} else {
mutex_exit(&curproc->p_lock);
mod_hash_cancel(projects_hash, &hndl);
@@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag)
p->kpj_count++;
mutex_exit(&project_hash_lock);
+ /*
+ * The kstat stores the project's zone name, as zone IDs may change
+ * across reboots.
+ */
+ if (create == B_TRUE) {
+ ksp = project_kstat_create(p, zone);
+ mutex_enter(&project_hash_lock);
+ ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL);
+ p->kpj_data.kpd_lockedmem_kstat = ksp;
+ mutex_exit(&project_hash_lock);
+ }
return (p);
}
-
/*
* void project_rele(kproject_t *)
*
@@ -325,6 +343,7 @@ project_rele(kproject_t *p)
mutex_exit(&projects_list_lock);
rctl_set_free(p->kpj_rctls);
+ project_kstat_delete(p);
if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p))
panic("unable to delete project %d zone %d", p->kpj_id,
@@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- mutex_enter(&p->p_zone->zone_rctl_lock);
+ mutex_enter(&p->p_zone->zone_mem_lock);
q = p->p_task->tk_proj->kpj_data.kpd_locked_mem;
- mutex_exit(&p->p_zone->zone_rctl_lock);
+ mutex_exit(&p->p_zone->zone_mem_lock);
return (q);
}
@@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock));
+ ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock));
q = p->p_task->tk_proj->kpj_data.kpd_locked_mem;
if (q + inc > rval->rcv_value)
return (1);
@@ -868,7 +887,7 @@ project_init(void)
rctl_add_default_limit("project.max-contracts", 10000,
RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
- t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID,
+ t0.t_proj = proj0p = project_hold_by_id(0, &zone0,
PROJECT_HOLD_INSERT);
mutex_enter(&p0.p_lock);
@@ -876,3 +895,57 @@ project_init(void)
mutex_exit(&p0.p_lock);
proj0p->kpj_ntasks = 1;
}
+
+static int
+project_lockedmem_kstat_update(kstat_t *ksp, int rw)
+{
+ kproject_t *pj = ksp->ks_private;
+ kproject_kstat_t *kpk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem;
+ kpk->kpk_value.value.ui64 = pj->kpj_data.kpd_locked_mem_ctl;
+ return (0);
+}
+
+static kstat_t *
+project_kstat_create(kproject_t *pj, zone_t *zone)
+{
+ kstat_t *ksp;
+ kproject_kstat_t *kpk;
+ char *zonename = zone->zone_name;
+
+ ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED,
+ sizeof (kproject_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return (NULL);
+
+ kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zonename) + 1;
+ kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&kpk->kpk_zonename, zonename);
+ kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = project_lockedmem_kstat_update;
+ ksp->ks_private = pj;
+ kstat_install(ksp);
+
+ return (ksp);
+}
+
+static void
+project_kstat_delete(kproject_t *pj)
+{
+ void *data;
+
+ if (pj->kpj_data.kpd_lockedmem_kstat != NULL) {
+ data = pj->kpj_data.kpd_lockedmem_kstat->ks_data;
+ kstat_delete(pj->kpj_data.kpd_lockedmem_kstat);
+ kmem_free(data, sizeof (kproject_kstat_t));
+ }
+ pj->kpj_data.kpd_lockedmem_kstat = NULL;
+}
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index 4de4c74fe8..c0479005ea 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -29,6 +29,7 @@
#include <sys/cmn_err.h>
#include <sys/id_space.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/log.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
@@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
zonep = p->p_zone;
}
- mutex_enter(&zonep->zone_rctl_lock);
+ mutex_enter(&zonep->zone_mem_lock);
e.rcep_p.proj = projp;
e.rcep_t = RCENTITY_PROJECT;
@@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
p->p_locked_mem += inc;
}
out:
- mutex_exit(&zonep->zone_rctl_lock);
+ mutex_exit(&zonep->zone_mem_lock);
if (proj != NULL)
zone_rele(zonep);
return (ret);
@@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
zonep = p->p_zone;
}
- mutex_enter(&zonep->zone_rctl_lock);
+ mutex_enter(&zonep->zone_mem_lock);
zonep->zone_locked_mem -= inc;
projp->kpj_data.kpd_locked_mem -= inc;
if (creditproc != 0) {
@@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
ASSERT(MUTEX_HELD(&p->p_lock));
p->p_locked_mem -= inc;
}
- mutex_exit(&zonep->zone_rctl_lock);
+ mutex_exit(&zonep->zone_mem_lock);
if (proj != NULL)
zone_rele(zonep);
}
+
+/*
+ * rctl_incr_swap(proc_t *, zone_t *, size_t)
+ *
+ * Overview
+ * Increments the swap charge on the specified zone.
+ *
+ * Return values
+ * 0 on success. EAGAIN if swap increment fails due to an rctl value
+ * on the zone.
+ *
+ * Callers context
+ * p_lock held on specified proc.
+ * swap must be an exact multiple of PAGESIZE
+ */
+int
+rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
+{
+ rctl_entity_p_t e;
+
+ ASSERT(MUTEX_HELD(&proc->p_lock));
+ ASSERT((swap & PAGEOFFSET) == 0);
+ e.rcep_p.zone = zone;
+ e.rcep_t = RCENTITY_ZONE;
+
+ mutex_enter(&zone->zone_mem_lock);
+
+ if ((zone->zone_max_swap + swap) >
+ zone->zone_max_swap_ctl) {
+
+ if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
+ proc, &e, swap, 0) & RCT_DENY) {
+ mutex_exit(&zone->zone_mem_lock);
+ return (EAGAIN);
+ }
+ }
+ zone->zone_max_swap += swap;
+ mutex_exit(&zone->zone_mem_lock);
+ return (0);
+}
+
+/*
+ * rctl_decr_swap(zone_t *, size_t)
+ *
+ * Overview
+ * Decrements the swap charge on the specified zone.
+ *
+ * Return values
+ * None
+ *
+ * Callers context
+ * swap must be an exact multiple of PAGESIZE
+ */
+void
+rctl_decr_swap(zone_t *zone, size_t swap)
+{
+ ASSERT((swap & PAGEOFFSET) == 0);
+ mutex_enter(&zone->zone_mem_lock);
+ ASSERT(zone->zone_max_swap >= swap);
+ zone->zone_max_swap -= swap;
+ mutex_exit(&zone->zone_mem_lock);
+}
+
+/*
+ * Create resource kstat
+ */
+static kstat_t *
+rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
+ uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
+{
+ kstat_t *ksp = NULL;
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
+
+ if ((ksp = kstat_create_zone("caps", ks_zoneid,
+ name, ks_class, ks_type,
+ ks_ndata, ks_flags, ks_zoneid)) != NULL) {
+ if (ks_zoneid != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+ }
+ return (ksp);
+}
+
+/*
+ * Create zone-specific resource kstat
+ */
+kstat_t *
+rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags)
+{
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
+
+ return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
+ ks_type, ks_ndata, ks_flags, zone->zone_id));
+}
+
+/*
+ * Create project-specific resource kstat
+ */
+kstat_t *
+rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags)
+{
+ char name[KSTAT_STRLEN];
+
+ (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
+
+ return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
+ ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
+}
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 66aae7d2bc..62279e0777 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
* Set up anonymous memory struct. No swap reservation is
* needed since the page will be locked into memory.
*/
- amp = anonmap_alloc(PAGESIZE, PAGESIZE);
+ amp = anonmap_alloc(PAGESIZE, 0);
/*
* Allocate the page.
*/
- kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO,
- amp);
+ kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
+ KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
if (kaddr == NULL) {
amp->refcnt--;
anonmap_free(amp);
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index 9ada0aac18..a7ef99fddb 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] =
/* 178 */ SYSENT_LOADABLE(), /* kaio */
/* 179 */ SYSENT_LOADABLE(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
- /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2),
+ /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5),
/* 182 */ SYSENT_LOADABLE(), /* portfs */
/* 183 */ SYSENT_CI("pollsys", pollsys, 4),
/* 184 */ SYSENT_CI("labelsys", labelsys, 5),
@@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] =
/* 178 */ SYSENT_LOADABLE32(), /* kaio */
/* 179 */ SYSENT_LOADABLE32(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
- /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2),
+ /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5),
/* 182 */ SYSENT_LOADABLE32(), /* portfs */
/* 183 */ SYSENT_CI("pollsys", pollsys, 4),
/* 184 */ SYSENT_CI("labelsys", labelsys, 5),
diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c
index 562e3596b5..785f74c145 100644
--- a/usr/src/uts/common/os/task.c
+++ b/usr/src/uts/common/os/task.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone)
tk->tk_nlwps = 0;
tk->tk_nlwps_ctl = INT_MAX;
tk->tk_usage = tu;
- tk->tk_proj = project_hold_by_id(projid, zone->zone_id,
+ tk->tk_proj = project_hold_by_id(projid, zone,
PROJECT_HOLD_INSERT);
tk->tk_flags = TASK_NORMAL;
@@ -848,7 +847,7 @@ task_init(void)
task0p->tk_tkid = id_alloc(taskid_space);
task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
- task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID,
+ task0p->tk_proj = project_hold_by_id(0, &zone0,
PROJECT_HOLD_INSERT);
task0p->tk_flags = TASK_NORMAL;
task0p->tk_nlwps = p->p_lwpcnt;
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 0fb2c2be55..19ea8b31f1 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -154,6 +154,10 @@
* zone_lock: This is a per-zone lock used to protect several fields of
* the zone_t (see <sys/zone.h> for details). In addition, holding
* this lock means that the zone cannot go away.
+ * zone_nlwps_lock: This is a per-zone lock used to protect the fields
+ * related to the zone.max-lwps rctl.
+ * zone_mem_lock: This is a per-zone lock used to protect the fields
+ * related to the zone.max-locked-memory and zone.max-swap rctls.
* zsd_key_lock: This is a global lock protecting the key state for ZSD.
* zone_deathrow_lock: This is a global lock protecting the "deathrow"
* list (a list of zones in the ZONE_IS_DEAD state).
@@ -162,6 +166,10 @@
* pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
* zone_lock --> zsd_key_lock --> pidlock --> p_lock
*
+ * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
+ * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
+ * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
+ *
* Blocking memory allocations are permitted while holding any of the
* zone locks.
*
@@ -190,6 +198,7 @@
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
+#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
@@ -232,6 +241,8 @@
#include <sys/zone.h>
#include <sys/tsol/label.h>
+#include <vm/seg.h>
+
/*
* cv used to signal that all references to the zone have been released. This
* needs to be global since there may be multiple waiters, and the first to
@@ -317,6 +328,7 @@ const char *zone_status_table[] = {
*/
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
+rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
@@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
ASSERT(MUTEX_HELD(&p->p_lock));
- mutex_enter(&p->p_zone->zone_rctl_lock);
+ mutex_enter(&p->p_zone->zone_mem_lock);
q = p->p_zone->zone_locked_mem;
- mutex_exit(&p->p_zone->zone_rctl_lock);
+ mutex_exit(&p->p_zone->zone_mem_lock);
return (q);
}
@@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
rctl_qty_t q;
+ zone_t *z;
+
+ z = e->rcep_p.zone;
ASSERT(MUTEX_HELD(&p->p_lock));
- ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock));
- q = p->p_zone->zone_locked_mem;
+ ASSERT(MUTEX_HELD(&z->zone_mem_lock));
+ q = z->zone_locked_mem;
if (q + incr > rcntl->rcv_value)
return (1);
return (0);
@@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = {
zone_locked_mem_test
};
+/*ARGSUSED*/
+static rctl_qty_t
+zone_max_swap_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ mutex_enter(&z->zone_mem_lock);
+ q = z->zone_max_swap;
+ mutex_exit(&z->zone_mem_lock);
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
+ rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
+{
+ rctl_qty_t q;
+ zone_t *z;
+
+ z = e->rcep_p.zone;
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(MUTEX_HELD(&z->zone_mem_lock));
+ q = z->zone_max_swap;
+ if (q + incr > rcntl->rcv_value)
+ return (1);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_max_swap_ctl = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_max_swap_ops = {
+ rcop_no_action,
+ zone_max_swap_usage,
+ zone_max_swap_set,
+ zone_max_swap_test
+};
+
/*
* Helper function to brand the zone with a unique ID.
*/
@@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid)
return (cr);
}
+static int
+zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_locked_mem;
+ zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
+ return (0);
+}
+
+static int
+zone_swapresv_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_max_swap;
+ zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
+ return (0);
+}
+
+static void
+zone_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_kstat_t *zk;
+
+ ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
+ sizeof (zone_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return;
+
+ zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
+ kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = zone_lockedmem_kstat_update;
+ ksp->ks_private = zone;
+ kstat_install(ksp);
+
+ zone->zone_lockedmem_kstat = ksp;
+
+ ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
+ sizeof (zone_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL)
+ return;
+
+ zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
+ kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
+ kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
+ ksp->ks_update = zone_swapresv_kstat_update;
+ ksp->ks_private = zone;
+ kstat_install(ksp);
+
+ zone->zone_swapresv_kstat = ksp;
+}
+
+static void
+zone_kstat_delete(zone_t *zone)
+{
+ void *data;
+
+ if (zone->zone_lockedmem_kstat != NULL) {
+ data = zone->zone_lockedmem_kstat->ks_data;
+ kstat_delete(zone->zone_lockedmem_kstat);
+ kmem_free(data, sizeof (zone_kstat_t));
+ }
+ if (zone->zone_swapresv_kstat != NULL) {
+ data = zone->zone_swapresv_kstat->ks_data;
+ kstat_delete(zone->zone_swapresv_kstat);
+ kmem_free(data, sizeof (zone_kstat_t));
+ }
+}
+
/*
* Called very early on in boot to initialize the ZSD list so that
* zone_key_create() can be called before zone_init(). It also initializes
@@ -1101,8 +1257,14 @@ zone_zsd_init(void)
mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
zone0.zone_shares = 1;
+ zone0.zone_nlwps = 0;
zone0.zone_nlwps_ctl = INT_MAX;
+ zone0.zone_locked_mem = 0;
+ zone0.zone_locked_mem_ctl = UINT64_MAX;
+ ASSERT(zone0.zone_max_swap == 0);
+ zone0.zone_max_swap_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
zone0.zone_ipc.ipcq_shmmni = 0;
zone0.zone_ipc.ipcq_semmni = 0;
@@ -1120,6 +1282,8 @@ zone_zsd_init(void)
zone0.zone_ncpus_online = 0;
zone0.zone_proc_initpid = 1;
zone0.zone_initname = initname;
+ zone0.zone_lockedmem_kstat = NULL;
+ zone0.zone_swapresv_kstat = NULL;
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
list_insert_head(&zone_active, &zone0);
@@ -1259,6 +1423,12 @@ zone_init(void)
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_locked_mem_ops);
+
+ rc_zone_max_swap = rctl_register("zone.max-swap",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_max_swap_ops);
+
/*
* Initialize the ``global zone''.
*/
@@ -1277,9 +1447,14 @@ zone_init(void)
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
- * pool_default hasn't been initialized yet, so we let pool_init() take
- * care of making the global zone is in the default pool.
+ * pool_default hasn't been initialized yet, so we let pool_init()
+ * take care of making sure the global zone is in the default pool.
+ */
+
+ /*
+ * Initialize global zone kstats
*/
+ zone_kstat_create(&zone0);
/*
* Initialize zone label.
@@ -1337,6 +1512,7 @@ zone_init(void)
if (res)
panic("Sysevent_evc_bind failed during zone setup.\n");
+
}
static void
@@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
return (0);
}
+static int
+zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+{
+ uint64_t mcap;
+ int err = 0;
+
+ if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mcap = mcap;
+
+ return (err);
+}
+
+static int
+zone_set_sched_class(zone_t *zone, const char *new_class)
+{
+ char sched_class[PC_CLNMSZ];
+ id_t classid;
+ int err;
+
+ ASSERT(zone != global_zone);
+ if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
+ return (err); /* EFAULT or ENAMETOOLONG */
+
+ if (getcid(sched_class, &classid) != 0 || classid == syscid)
+ return (set_errno(EINVAL));
+ zone->zone_defaultcid = classid;
+ ASSERT(zone->zone_defaultcid > 0 &&
+ zone->zone_defaultcid < loaded_classes);
+
+ return (0);
+}
+
/*
* Block indefinitely waiting for (zone_status >= status)
*/
@@ -2510,10 +2718,10 @@ zsched(void *arg)
/*
* Decrement locked memory counts on old zone and project.
*/
- mutex_enter(&global_zone->zone_rctl_lock);
+ mutex_enter(&global_zone->zone_mem_lock);
global_zone->zone_locked_mem -= pp->p_locked_mem;
pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
- mutex_exit(&global_zone->zone_rctl_lock);
+ mutex_exit(&global_zone->zone_mem_lock);
/*
* Create and join a new task in project '0' of this zone.
@@ -2529,10 +2737,10 @@ zsched(void *arg)
pj = pp->p_task->tk_proj;
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
zone->zone_locked_mem += pp->p_locked_mem;
pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
/*
* add lwp counts to zsched's zone, and increment project's task count
@@ -2689,7 +2897,10 @@ zsched(void *arg)
* classid 'cid'.
*/
pool_lock();
- cid = pool_get_class(zone->zone_pool);
+ if (zone->zone_defaultcid > 0)
+ cid = zone->zone_defaultcid;
+ else
+ cid = pool_get_class(zone->zone_pool);
if (cid == -1)
cid = defaultcid;
@@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
@@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_initname =
kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
(void) strcpy(zone->zone_initname, zone_default_initname);
+ zone->zone_nlwps = 0;
+ zone->zone_nlwps_ctl = INT_MAX;
zone->zone_locked_mem = 0;
zone->zone_locked_mem_ctl = UINT64_MAX;
+ zone->zone_max_swap = 0;
+ zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
/*
* Zsched initializes the rctls.
@@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root,
*/
/*
+ * Create zone kstats
+ */
+ zone_kstat_create(zone);
+
+ /*
* Let the other lwps continue.
*/
mutex_enter(&pp->p_lock);
@@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid)
}
+ /* Get rid of the zone's kstats */
+ zone_kstat_delete(zone);
+
/*
* It is now safe to let the zone be recreated; remove it from the
* lists. The memory will not be freed until the last cred
@@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
+ case ZONE_ATTR_PHYS_MCAP:
+ size = sizeof (zone->zone_phys_mcap);
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL &&
+ copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_CLASS:
+ mutex_enter(&class_lock);
+
+ if (zone->zone_defaultcid >= loaded_classes)
+ outstr = "";
+ else
+ outstr = sclass[zone->zone_defaultcid].cl_name;
+ size = strlen(outstr) + 1;
+ if (bufsize > size)
+ bufsize = size;
+ if (buf != NULL) {
+ err = copyoutstr(outstr, buf, bufsize, NULL);
+ if (err != 0 && err != ENAMETOOLONG)
+ error = EFAULT;
+ }
+
+ mutex_exit(&class_lock);
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * At present, attributes can only be set on non-running,
- * non-global zones.
+ * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
+ * global zone.
*/
- if (zoneid == GLOBAL_ZONEID) {
+ if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
return (set_errno(EINVAL));
}
@@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ /*
+ * At present most attributes can only be set on non-running,
+ * non-global zones.
+ */
zone_status = zone_status_get(zone);
- if (zone_status > ZONE_IS_READY)
+ if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
goto done;
switch (attr) {
@@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
if (zone->zone_brand == NULL)
err = EINVAL;
break;
+ case ZONE_ATTR_PHYS_MCAP:
+ err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_SCHED_CLASS:
+ err = zone_set_sched_class(zone, (const char *)buf);
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -3986,6 +4247,11 @@ done:
/*
* Return zero if the process has at least one vnode mapped in to its
* address space which shouldn't be allowed to change zones.
+ *
+ * Also return zero if the process has any shared mappings which reserve
+ * swap. This is because the counting for zone.max-swap does not allow swap
+ * reservation to be shared between zones. Zone swap reservation is counted
+ * on zone->zone_max_swap.
*/
static int
as_can_change_zones(void)
@@ -3997,8 +4263,17 @@ as_can_change_zones(void)
int allow = 1;
ASSERT(pp->p_as != &kas);
- AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+
+ /*
+ * Cannot enter zone with shared anon memory which
+ * reserves swap. See comment above.
+ */
+ if (seg_can_change_zones(seg) == B_FALSE) {
+ allow = 0;
+ break;
+ }
/*
* if we can't get a backing vnode for this segment then skip
* it.
@@ -4011,11 +4286,30 @@ as_can_change_zones(void)
break;
}
}
- AS_LOCK_EXIT(&as, &as->a_lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
return (allow);
}
/*
+ * Count swap reserved by curproc's address space
+ */
+static size_t
+as_swresv(void)
+{
+ proc_t *pp = curproc;
+ struct seg *seg;
+ struct as *as = pp->p_as;
+ size_t swap = 0;
+
+ ASSERT(pp->p_as != &kas);
+ ASSERT(AS_WRITE_HELD(as, &as->a_lock));
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
+ swap += seg_swresv(seg);
+
+ return (swap);
+}
+
+/*
* Systemcall entry point for zone_enter().
*
* The current process is injected into said zone. In the process
@@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid)
zone_status_t status;
int err = 0;
rctl_entity_p_t e;
+ size_t swap;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid)
goto out;
}
+ /*
+ * a_lock must be held while transferring locked memory and swap
+ * reservation from the global zone to the non-global zone because
+ * asynchronous faults on the process's address space can lock
+ * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
+ * segments respectively.
+ */
+ AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER);
+ swap = as_swresv();
mutex_enter(&pp->p_lock);
zone_proj0 = zone->zone_zsched->p_task->tk_proj;
/* verify that we do not exceed and task or lwp limits */
@@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid)
zone_proj0->kpj_ntasks += 1;
mutex_exit(&zone->zone_nlwps_lock);
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
zone->zone_locked_mem += pp->p_locked_mem;
zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
- mutex_exit(&zone->zone_rctl_lock);
+ zone->zone_max_swap += swap;
+ mutex_exit(&zone->zone_mem_lock);
/* remove lwps from proc's old zone and old project */
mutex_enter(&pp->p_zone->zone_nlwps_lock);
@@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid)
pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
mutex_exit(&pp->p_zone->zone_nlwps_lock);
- mutex_enter(&pp->p_zone->zone_rctl_lock);
+ mutex_enter(&pp->p_zone->zone_mem_lock);
pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
- mutex_exit(&pp->p_zone->zone_rctl_lock);
+ pp->p_zone->zone_max_swap -= swap;
+ mutex_exit(&pp->p_zone->zone_mem_lock);
mutex_exit(&pp->p_lock);
+ AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
/*
* Joining the zone cannot fail from now on.
@@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid)
sess_rele(pp->p_sessp, B_TRUE);
pp->p_sessp = sp;
pgjoin(pp, zone->zone_zsched->p_pidp);
+
+ /*
+ * If there is a default scheduling class for the zone and it is not
+ * the class we are currently in, change all of the threads in the
+ * process to the new class. We need to be holding pidlock & p_lock
+ * when we call parmsset so this is a good place to do it.
+ */
+ if (zone->zone_defaultcid > 0 &&
+ zone->zone_defaultcid != curthread->t_cid) {
+ pcparms_t pcparms;
+ kthread_id_t t;
+
+ pcparms.pc_cid = zone->zone_defaultcid;
+ pcparms.pc_clparms[0] = 0;
+
+ /*
+ * If setting the class fails, we still want to enter the zone.
+ */
+ if ((t = pp->p_tlist) != NULL) {
+ do {
+ (void) parmsset(&pcparms, t);
+ } while ((t = t->t_forw) != pp->p_tlist);
+ }
+ }
+
mutex_exit(&pp->p_lock);
mutex_exit(&pidlock);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index ab103ef4c7..4493f99454 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -544,6 +544,7 @@ CHKHDRS= \
visual_io.h \
vlan.h \
vm.h \
+ vm_usage.h \
vmem.h \
vmem_impl.h \
vmmeter.h \
diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h
index 25e45cec23..a187eb68ee 100644
--- a/usr/src/uts/common/sys/modhash_impl.h
+++ b/usr/src/uts/common/sys/modhash_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -93,6 +92,18 @@ struct mod_hash {
*/
void mod_hash_init(void);
+/*
+ * Internal routines. Use directly with care.
+ */
+uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
+int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
+int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t,
+ mod_hash_val_t *, void *), void *);
+void i_mod_hash_clear_nosync(mod_hash_t *hash);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h
index ca1a92400a..6475ed0a4c 100644
--- a/usr/src/uts/common/sys/priocntl.h
+++ b/usr/src/uts/common/sys/priocntl.h
@@ -65,6 +65,7 @@ extern long priocntl(), priocntlset();
#define PC_SETXPARMS 7 /* Set extended scheduling parameters */
#define PC_GETXPARMS 8 /* Get extended scheduling parameters */
#define PC_SETDFLCL 9 /* Set default class, not for general use */
+#define PC_GETDFLCL 10 /* Get default class, not for general use */
#define PC_CLNULL -1
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index fcf953262c..9a0ba2cc37 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t);
extern proc_t *pgfind_zone(pid_t, zoneid_t);
extern proc_t *sprlock(pid_t);
extern proc_t *sprlock_zone(pid_t, zoneid_t);
+extern int sprtrylock_proc(proc_t *);
+extern void sprwaitlock_proc(proc_t *);
extern void sprlock_proc(proc_t *);
extern void sprunlock(proc_t *);
extern void pid_init(void);
diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h
index 679c1eddc2..5018df8499 100644
--- a/usr/src/uts/common/sys/project.h
+++ b/usr/src/uts/common/sys/project.h
@@ -28,15 +28,24 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+
#ifdef __cplusplus
extern "C" {
#endif
+
+#include <sys/kstat.h>
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/rctl.h>
#include <sys/ipc_rctl.h>
+typedef struct kproject_kstat {
+ kstat_named_t kpk_zonename;
+ kstat_named_t kpk_usage;
+ kstat_named_t kpk_value;
+} kproject_kstat_t;
+
typedef struct kproject_data { /* Datum protected by: */
rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */
ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */
@@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */
rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */
rctl_qty_t kpd_contract; /* contract_lock */
rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */
+ kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */
} kproject_data_t;
@@ -76,9 +86,11 @@ typedef struct kproject {
#define PROJECT_HOLD_FIND 1
#define PROJECT_HOLD_INSERT 2
+struct zone;
+
void project_init(void);
kproject_t *project_hold(kproject_t *);
-kproject_t *project_hold_by_id(projid_t, zoneid_t, int);
+kproject_t *project_hold_by_id(projid_t, struct zone *, int);
void project_rele(kproject_t *);
int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *);
projid_t curprojid(void);
diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h
index eb56fff9e5..a8480c2768 100644
--- a/usr/src/uts/common/sys/rctl.h
+++ b/usr/src/uts/common/sys/rctl.h
@@ -168,6 +168,7 @@ struct proc;
struct task;
struct kproject;
struct zone;
+struct kstat;
typedef struct rctl_entity_p_struct {
rctl_entity_t rcep_t;
@@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t,
int);
void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t,
int);
+int rctl_incr_swap(struct proc *, struct zone *, size_t);
+void rctl_decr_swap(struct zone *, size_t);
+
+struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t,
+ uchar_t);
+
+struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t,
+ uint_t, uchar_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h
index 86cc716d56..bf02808d4b 100644
--- a/usr/src/uts/common/sys/resource.h
+++ b/usr/src/uts/common/sys/resource.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -191,6 +190,7 @@ struct rusage {
#define _RUSAGESYS_GETRUSAGE 0 /* rusage process */
#define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */
#define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */
+#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */
#if defined(_SYSCALL32)
diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h
index 96cb967023..eedadfa0c0 100644
--- a/usr/src/uts/common/sys/syscall.h
+++ b/usr/src/uts/common/sys/syscall.h
@@ -384,7 +384,8 @@ extern "C" {
#define SYS_rusagesys 181
/*
* subcodes:
- * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...)
+ * getrusage(...) :: rusagesys(_RUSAGESYS_GETRUSAGE, ...)
+ * getvmusage(...) :: rusagesys(_RUSAGESYS_GETVMUSAGE, ...)
*/
#define SYS_port 182
/*
diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h
new file mode 100644
index 0000000000..5f8c8b8fe5
--- /dev/null
+++ b/usr/src/uts/common/sys/vm_usage.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VM_USAGE_H
+#define _SYS_VM_USAGE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The flags passed to getvmusage() request how to aggregate rss/swap results.
+ * Results can be aggregated by zone, project, task, ruser, and/or euser.
+ *
+ * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global zone, the
+ * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be
+ * treated as VMUSAGE_ZONE.
+ *
+ * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type
+ * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage
+ * of the calling zone.
+ *
+ * VMUSAGE_* requests results for the calling zone.
+ * VMUSAGE_ALL_* requests results for all zones.
+ * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid.
+ * For example, VMUSAGE_COL_PROJECTS requests results for all
+ * projects in all zones, and project N in ANY zone is treated
+ * as the same project.
+ */
+#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */
+#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */
+#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */
+ /* caller's zone */
+#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */
+ /* caller's zone */
+#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */
+ /* ruser) in the caller's zone */
+#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */
+#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */
+ /* all zones */
+#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */
+ /* zones */
+#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */
+ /* ruser) in all zones */
+#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */
+ /* all zones. Collapse zoneid. */
+#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */
+ /* ruser), in all zones. Collapse */
+ /* zoneid */
+#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */
+ /* euser */
+
+#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */
+
+typedef struct vmusage {
+ id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */
+ /* VMUSAGE_COL_* results */
+ /* ALL_ZONES means that the result */
+ /* reflects swap and rss usage for */
+ /* a projid/uid across all zones */
+ uint_t vmu_type; /* Entity type of result. One of: */
+ /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */
+ /* TASKS|RUSERS|EUSERS) */
+ id_t vmu_id; /* zoneid, projid, taskid, ... */
+ size_t vmu_rss_all; /* total resident memory of entity */
+ /* in bytes */
+ size_t vmu_rss_private; /* total resident private memory */
+ size_t vmu_rss_shared; /* total resident shared memory */
+ size_t vmu_swap_all; /* total swap reserved, in bytes */
+ size_t vmu_swap_private; /* swap reserved for private mappings */
+ size_t vmu_swap_shared; /* swap reserved for shared mappings */
+
+} vmusage_t;
+
+extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);
+
+#ifdef _KERNEL
+
+int vm_getusage(uint_t, time_t, vmusage_t *, size_t *);
+void vm_usage_init();
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VM_USAGE_H */
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index daccd16bdf..94646bc976 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -88,6 +88,8 @@ extern "C" {
#define ZONE_ATTR_INITNAME 9
#define ZONE_ATTR_BOOTARGS 10
#define ZONE_ATTR_BRAND 11
+#define ZONE_ATTR_PHYS_MCAP 12
+#define ZONE_ATTR_SCHED_CLASS 13
/* Start of the brand-specific attribute namespace */
#define ZONE_ATTR_BRAND_ATTRS 32768
@@ -280,6 +282,15 @@ typedef struct zone_dataset {
list_node_t zd_linkage;
} zone_dataset_t;
+/*
+ * structure for zone kstats
+ */
+typedef struct zone_kstat {
+ kstat_named_t zk_zonename;
+ kstat_named_t zk_usage;
+ kstat_named_t zk_value;
+} zone_kstat_t;
+
typedef struct zone {
/*
* zone_name is never modified once set.
@@ -326,14 +337,20 @@ typedef struct zone {
uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */
uint32_t zone_shares; /* FSS shares allocated to zone */
rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */
- kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */
+ kmutex_t zone_mem_lock; /* protects zone_locked_mem and */
/* kpd_locked_mem for all */
- /* projects in zone */
+ /* projects in zone. */
+ /* Also protects zone_max_swap */
/* grab after p_lock, before rcs_lock */
- rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */
- rctl_qty_t zone_locked_mem_ctl; /* current locked memory */
+ rctl_qty_t zone_locked_mem; /* bytes of locked memory in */
+ /* zone */
+ rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */
/* limit. Protected by */
/* zone_rctls->rcs_lock */
+ rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */
+ rctl_qty_t zone_max_swap_ctl; /* current swap limit. */
+ /* Protected by */
+ /* zone_rctls->rcs_lock */
list_t zone_zsd; /* list of Zone-Specific Data values */
kcondvar_t zone_cv; /* used to signal state changes */
struct proc *zone_zsched; /* Dummy kernel "zsched" process */
@@ -341,6 +358,7 @@ typedef struct zone {
char *zone_initname; /* fs path to 'init' */
int zone_boot_err; /* for zone_boot() if boot fails */
char *zone_bootargs; /* arguments passed via zone_boot() */
+ uint64_t zone_phys_mcap; /* physical memory cap */
/*
* zone_kthreads is protected by zone_status_lock.
*/
@@ -376,6 +394,9 @@ typedef struct zone {
boolean_t zone_restart_init; /* Restart init if it dies? */
struct brand *zone_brand; /* zone's brand */
+ id_t zone_defaultcid; /* dflt scheduling class id */
+ kstat_t *zone_swapresv_kstat;
+ kstat_t *zone_lockedmem_kstat;
} zone_t;
/*
@@ -553,6 +574,7 @@ extern void mount_completed(void);
extern int zone_walk(int (*)(zone_t *, void *), void *);
extern rctl_hndl_t rc_zone_locked_mem;
+extern rctl_hndl_t rc_zone_max_swap;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c
index 10ca1178d5..bd416e43e6 100644
--- a/usr/src/uts/common/syscall/processor_bind.c
+++ b/usr/src/uts/common/syscall/processor_bind.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind,
break;
case P_PROJID:
+ pp = curproc;
if (id == P_MYID)
id = curprojid();
- if ((kpj = project_hold_by_id(id, getzoneid(),
+ if ((kpj = project_hold_by_id(id, pp->p_zone,
PROJECT_HOLD_FIND)) == NULL) {
ret = ESRCH;
} else {
diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c
index 5d3b7e6233..767529fc5d 100644
--- a/usr/src/uts/common/syscall/pset.c
+++ b/usr/src/uts/common/syscall/pset.c
@@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset)
break;
case P_PROJID:
+ pp = curproc;
if (id == P_MYID)
id = curprojid();
- if ((kpj = project_hold_by_id(id, getzoneid(),
+ if ((kpj = project_hold_by_id(id, pp->p_zone,
PROJECT_HOLD_FIND)) == NULL) {
error = ESRCH;
break;
diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c
index 3e09643981..036500932f 100644
--- a/usr/src/uts/common/syscall/rusagesys.c
+++ b/usr/src/uts/common/syscall/rusagesys.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +34,7 @@
#include <sys/time.h>
#include <sys/errno.h>
#include <sys/resource.h>
+#include <sys/vm_usage.h>
static int
getrusage(void *user_rusage)
@@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage)
}
int
-rusagesys(int code, void * arg)
+rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4)
{
switch (code) {
case _RUSAGESYS_GETRUSAGE:
- return (getrusage(arg));
+ return (getrusage(arg1));
case _RUSAGESYS_GETRUSAGE_CHLD:
- return (getrusage_chld(arg));
+ return (getrusage_chld(arg1));
case _RUSAGESYS_GETRUSAGE_LWP:
- return (getrusage_lwp(arg));
+ return (getrusage_lwp(arg1));
+ case _RUSAGESYS_GETVMUSAGE:
+ return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2,
+ (vmusage_t *)arg3, (size_t *)arg4));
default:
return (set_errno(EINVAL));
}
diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c
index 705b543a37..bec091e61c 100644
--- a/usr/src/uts/common/syscall/tasksys.c
+++ b/usr/src/uts/common/syscall/tasksys.c
@@ -25,6 +25,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+
/*
* System calls for creating and inquiring about tasks and projects
*/
@@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
* Put a hold on our new project and make sure that nobody is
* trying to bind it to a pool while we're joining.
*/
- kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT);
+ kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT);
e.rcep_p.proj = kpj;
e.rcep_t = RCENTITY_PROJECT;
@@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
zone = p->p_zone;
mutex_enter(&zone->zone_nlwps_lock);
- mutex_enter(&zone->zone_rctl_lock);
+ mutex_enter(&zone->zone_mem_lock);
if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl)
if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e,
@@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
rctlfail = 1;
if (rctlfail) {
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
mutex_exit(&zone->zone_nlwps_lock);
if (curthread != p->p_agenttp)
continuelwps(p);
@@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags)
oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem;
oldpj->kpj_nlwps -= p->p_lwpcnt;
- mutex_exit(&zone->zone_rctl_lock);
+ mutex_exit(&zone->zone_mem_lock);
mutex_exit(&zone->zone_nlwps_lock);
mutex_exit(&p->p_lock);
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
index 90f6e1e661..ed59ec590b 100644
--- a/usr/src/uts/common/vm/anon.h
+++ b/usr/src/uts/common/vm/anon.h
@@ -42,6 +42,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/cred.h>
+#include <sys/zone.h>
#include <vm/seg.h>
#include <vm/vpage.h>
@@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t,
struct seg *, caddr_t, uint_t,
struct vpage [], struct cred *);
extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
-extern int anon_resvmem(size_t, uint_t);
-extern void anon_unresv(size_t);
+extern int anon_resvmem(size_t, boolean_t, zone_t *);
+extern void anon_unresvmem(size_t, zone_t *);
extern struct anon_map *anonmap_alloc(size_t, size_t);
extern void anonmap_free(struct anon_map *);
extern void anon_decref(struct anon *);
@@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *);
* request and if so, reserves the appropriate anonymous memory resources.
* anon_checkspace just checks to see if there is space to fulfill the request,
* without taking any resources. Both return 1 if successful and 0 if not.
+ *
+ * Macros are provided as anon reservation is usually charged to the zone of
+ * the current process. In some cases (such as anon reserved by tmpfs), a
+ * zone pointer is needed to charge the appropriate zone.
*/
-#define anon_resv(size) anon_resvmem((size), 1)
-#define anon_checkspace(size) anon_resvmem((size), 0)
+#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone)
+#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone)
+#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone)
+#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone)
+#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone)
/*
* Flags to anon_private
diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h
index 0ee7d62ce1..a9683c0e54 100644
--- a/usr/src/uts/common/vm/seg.h
+++ b/usr/src/uts/common/vm/seg.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *);
#endif /* VMDEBUG */
+boolean_t seg_can_change_zones(struct seg *);
+size_t seg_swresv(struct seg *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c
index ff9c47e0ff..d58e873a19 100644
--- a/usr/src/uts/common/vm/seg_kp.c
+++ b/usr/src/uts/common/vm/seg_kp.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX;
uint32_t red_ndoubles;
pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */
+pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */
static struct seg_ops segkp_ops = {
SEGKP_BADOP(int), /* dup */
@@ -448,8 +448,10 @@ segkp_get_internal(
* Note that we don't need swap space for the red zone page.
*/
if (amp != NULL) {
- ASSERT((flags & KPD_NO_ANON) == 0);
- /* The reserve has been done and the anon_hdr is separate. */
+ /*
+ * The swap reservation has been done, if required, and the
+ * anon_hdr is separate.
+ */
anon_idx = 0;
kpd->kp_anon_idx = anon_idx;
kpd->kp_anon = amp->ahp;
@@ -458,7 +460,7 @@ segkp_get_internal(
kpd, vbase, len, flags, 1);
} else if ((flags & KPD_NO_ANON) == 0) {
- if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) {
+ if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
if (flags & KPD_LOCKED) {
atomic_add_long(&anon_segkp_pages_locked,
-pages);
@@ -468,6 +470,8 @@ segkp_get_internal(
kmem_free(kpd, sizeof (struct segkp_data));
return (NULL);
}
+ atomic_add_long(&anon_segkp_pages_resv,
+ btop(SEGKP_MAPLEN(len, flags)));
anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
kpd->kp_anon_idx = anon_idx;
kpd->kp_anon = kpsd->kpsd_anon;
@@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
if ((kpd->kp_flags & KPD_HASAMP) == 0) {
anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
PAGESIZE);
- anon_unresv(PAGESIZE);
+ anon_unresv_zone(PAGESIZE, NULL);
+ atomic_add_long(&anon_segkp_pages_resv,
+ -1);
}
TRACE_5(TR_FAC_VM,
TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index f48db44acc..e2069b27c6 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -2323,8 +2323,9 @@ segvn_faultpage(
* zeroes. If no advance reservations, reserve now.
*/
if (svd->flags & MAP_NORESERVE) {
- if (anon_resv(ptob(1))) {
- svd->swresv += ptob(1);
+ if (anon_resv_zone(ptob(1),
+ seg->s_as->a_proc->p_zone)) {
+ atomic_add_long(&svd->swresv, ptob(1));
} else {
err = ENOMEM;
goto out;
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 0cad34257c..3f225a345a 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -113,6 +113,7 @@
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>
+#include <sys/rctl.h>
#include <vm/as.h>
#include <vm/hat.h>
@@ -729,12 +730,22 @@ set_anoninfo(void)
* Return non-zero on success.
*/
int
-anon_resvmem(size_t size, uint_t takemem)
+anon_resvmem(size_t size, boolean_t takemem, zone_t *zone)
{
pgcnt_t npages = btopr(size);
pgcnt_t mswap_pages = 0;
pgcnt_t pswap_pages = 0;
+ proc_t *p = curproc;
+ if (zone != NULL && takemem) {
+ /* test zone.max-swap resource control */
+ mutex_enter(&p->p_lock);
+ if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+ mutex_exit(&p->p_lock);
+ }
mutex_enter(&anoninfo_lock);
/*
@@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem)
mutex_exit(&anoninfo_lock);
ANON_PRINT(A_RESV,
("anon_resvmem: not enough space from swapfs\n"));
+ if (zone != NULL && takemem)
+ rctl_decr_swap(zone, ptob(npages));
return (0);
}
}
-
/*
* Give back an anon reservation.
*/
void
-anon_unresv(size_t size)
+anon_unresvmem(size_t size, zone_t *zone)
{
pgcnt_t npages = btopr(size);
spgcnt_t mem_free_pages = 0;
@@ -851,6 +863,8 @@ anon_unresv(size_t size)
#ifdef ANON_DEBUG
pgcnt_t mem_resv;
#endif
+ if (zone != NULL)
+ rctl_decr_swap(zone, size);
mutex_enter(&anoninfo_lock);
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 05bfe662be..adac07b766 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -77,7 +77,7 @@
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>
-
+#include <sys/vm_usage.h>
#include <fs/fs_subr.h>
static int nopageage = 0;
@@ -343,6 +343,7 @@ vm_init(void)
(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
page_init_mem_config();
page_retire_init();
+ vm_usage_init();
}
/*
diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c
index 50cc21cdf7..aed892969d 100644
--- a/usr/src/uts/common/vm/vm_seg.c
+++ b/usr/src/uts/common/vm/vm_seg.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,12 +53,14 @@
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/mem_config.h>
+#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
-
+#include <vm/seg_spt.h>
+#include <vm/seg_vn.h>
/*
* kstats for segment advise
*/
@@ -950,3 +951,48 @@ seg_pinit_mem_config(void)
*/
ASSERT(ret == 0);
}
+
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+/*
+ * Verify that segment is not a shared anonymous segment which reserves
+ * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
+ * from one zone to another if any segments are shared. This is because the
+ * last process to exit will credit the swap reservation. This could lead
+ * to the swap being reserved by one zone, and credited to another.
+ */
+boolean_t
+seg_can_change_zones(struct seg *seg)
+{
+ struct segvn_data *svd;
+
+ if (seg->s_ops == &segspt_shmops)
+ return (B_FALSE);
+
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ if (svd->type == MAP_SHARED &&
+ svd->amp != NULL &&
+ svd->amp->swresv > 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Return swap reserved by a segment backing a private mapping.
+ */
+size_t
+seg_swresv(struct seg *seg)
+{
+ struct segvn_data *svd;
+ size_t swap = 0;
+
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ if (svd->type == MAP_PRIVATE && svd->swresv > 0)
+ swap = svd->swresv;
+ }
+ return (swap);
+}
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
new file mode 100644
index 0000000000..32a8811e10
--- /dev/null
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -0,0 +1,1978 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * vm_usage
+ *
+ * This file implements the getvmusage() private system call.
+ * getvmusage() counts the amount of resident memory pages and swap
+ * reserved by the specified process collective. A "process collective" is
+ * the set of processes owned by a particular zone, project, task, or user.
+ *
+ * rss and swap are counted so that for a given process collective, a page is
+ * only counted once. For example, this means that if multiple processes in
+ * the same project map the same page, then the project will only be charged
+ * once for that page. On the other hand, if two processes in different
+ * projects map the same page, then both projects will be charged
+ * for the page.
+ *
+ * The vm_getusage() calculation is implemented so that the first thread
+ * performs the rss/swap counting. Other callers will wait for that thread to
+ * finish, copying the results. This enables multiple rcapds and prstats to
+ * consume data from the same calculation. The results are also cached so that
+ * a caller interested in recent results can just copy them instead of starting
+ * a new calculation. The caller passes the maximum age (in seconds) of the
+ * data. If the cached data is young enough, the cache is copied, otherwise,
+ * a new calculation is executed and the cache is replaced with the new
+ * data.
+ *
+ * The rss calculation for each process collective is as follows:
+ *
+ * - Inspect flags, determine if counting rss for zones, projects, tasks,
+ * and/or users.
+ * - For each proc:
+ * - Figure out proc's collectives (zone, project, task, and/or user).
+ * - For each seg in proc's address space:
+ * - If seg is private:
+ * - Lookup anons in the amp.
+ * - For incore pages not previously visited for each of the
+ * proc's collectives, add incore pagesize to each
+ * collective.
+ * Anons with a refcnt of 1 can be assumed to be not
+ * previously visited.
+ * - For address ranges without anons in the amp:
+ * - Lookup pages in underlying vnode.
+ * - For incore pages not previously visited for
+ * each of the proc's collectives, add incore
+ * pagesize to each collective.
+ * - If seg is shared:
+ * - Lookup pages in the shared amp or vnode.
+ * - For incore pages not previously visited for each of
+ * the proc's collectives, add incore pagesize to each
+ * collective.
+ *
+ * Swap is reserved by private segments, and shared anonymous segments.
+ * The only shared anon segments which do not reserve swap are ISM segments
+ * and schedctl segments, both of which can be identified by having
+ * amp->swresv == 0.
+ *
+ * The swap calculation for each collective is as follows:
+ *
+ * - Inspect flags, determine if counting swap for zones, projects, tasks,
+ * and/or users.
+ * - For each proc:
+ * - Figure out proc's collectives (zone, project, task, and/or user).
+ * - For each seg in proc's address space:
+ * - If seg is private:
+ * - Add svd->swresv pages to swap count for each of the
+ * proc's collectives.
+ * - If seg is anon, shared, and amp->swresv != 0
+ * - For address ranges in amp not previously visited for
+ * each of the proc's collectives, add size of address
+ * range to the swap count for each collective.
+ *
+ * These two calculations are done simultaneously, with most of the work
+ * being done in vmu_calculate_seg(). The results of the calculation are
+ * copied into "vmu_data.vmu_cache_results".
+ *
+ * To perform the calculation, various things are tracked and cached:
+ *
+ * - incore/not-incore page ranges for all vnodes.
+ * (vmu_data.vmu_all_vnodes_hash)
+ * This eliminates looking up the same page more than once.
+ *
+ * - incore/not-incore page ranges for all shared amps.
+ * (vmu_data.vmu_all_amps_hash)
+ * This eliminates looking up the same page more than once.
+ *
+ * - visited page ranges for each collective.
+ * - per vnode (entity->vme_vnode_hash)
+ * - per shared amp (entity->vme_amp_hash)
+ * For accurate counting of map-shared and cow-shared pages.
+ *
+ * - visited private anons (refcnt > 1) for each collective.
+ * (entity->vme_anon_hash)
+ * For accurate counting of cow-shared pages.
+ *
+ * The common accounting structure is the vmu_entity_t, which represents
+ * collectives:
+ *
+ * - A zone.
+ * - A project, task, or user within a zone.
+ * - The entire system (vmu_data.vmu_system).
+ * - Each collapsed (col) project and user. This means a given projid or
+ * uid, regardless of which zone the process is in. For instance,
+ * project 0 in the global zone and project 0 in a non global zone are
+ * the same collapsed project.
+ *
+ * Each entity structure tracks which pages have been already visited for
+ * that entity (via previously inspected processes) so that these pages are
+ * not double counted.
+ */
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/zone.h>
+#include <sys/proc.h>
+#include <sys/project.h>
+#include <sys/task.h>
+#include <sys/thread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/modhash.h>
+#include <sys/modhash_impl.h>
+#include <sys/shm.h>
+#include <sys/swap.h>
+#include <sys/synch.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vm_usage.h>
+#include <sys/zone.h>
+#include <vm/anon.h>
+#include <vm/as.h>
+#include <vm/seg_vn.h>
+#include <vm/seg_spt.h>
+
+/* Size argument passed to every mod_hash_create_*() call in this file. */
+#define VMUSAGE_HASH_SIZE 512
+
+/* Kind of vm structure tracked; stored in vmu_object_t.vmo_type. */
+#define VMUSAGE_TYPE_VNODE 1
+#define VMUSAGE_TYPE_AMP 2
+#define VMUSAGE_TYPE_ANON 3
+
+/* Residency state of a page range; stored in vmu_bound_t.vmb_type. */
+#define VMUSAGE_BOUND_UNKNOWN 0
+#define VMUSAGE_BOUND_INCORE 1
+#define VMUSAGE_BOUND_NOT_INCORE 2
+
+/*
+ * bounds for vnodes and shared amps
+ * Each bound is either entirely incore, entirely not in core, or
+ * entirely unknown. bounds are stored in order by offset.
+ * vmb_start and vmb_end are both inclusive, so a bound covers
+ * (vmb_end - vmb_start + 1) pages.
+ */
+typedef struct vmu_bound {
+ /* next bound of the same object, or next entry on the free list */
+ struct vmu_bound *vmb_next;
+ pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
+ pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
+ char vmb_type; /* One of VMUSAGE_BOUND_* */
+} vmu_bound_t;
+
+/*
+ * hash of visited objects (vnodes or shared amps)
+ * key is address of vnode or amp. Bounds lists known incore/non-incore
+ * bounds for vnode/amp.
+ */
+typedef struct vmu_object {
+ struct vmu_object *vmo_next; /* free list */
+ caddr_t vmo_key; /* address of the vnode or amp */
+ short vmo_type; /* VMUSAGE_TYPE_* given at allocation */
+ vmu_bound_t *vmo_bounds; /* head of the bound list, or NULL */
+} vmu_object_t;
+
+/*
+ * Entity by which to count results.
+ *
+ * The entity structure keeps the current rss/swap counts for each entity
+ * (zone, project, etc), and hashes of vm structures that have already
+ * been visited for the entity.
+ *
+ * vme_next: links the list of all entities currently being counted by
+ * vmu_calculate(). Freed entities are also chained through
+ * vme_next on vmu_data.vmu_free_entities.
+ *
+ * vme_next_calc: links the list of entities related to the current process
+ * being counted by vmu_calculate_proc().
+ *
+ * vmu_calculate_proc() walks all processes. For each process, it makes a
+ * list of the entities related to that process using vme_next_calc. This
+ * list changes each time vmu_calculate_proc() is called.
+ *
+ * The three hashes are created on first allocation and kept (cleared, not
+ * destroyed) while the entity sits on the free list.
+ *
+ */
+typedef struct vmu_entity {
+ struct vmu_entity *vme_next;
+ struct vmu_entity *vme_next_calc;
+ mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
+ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
+ mod_hash_t *vme_anon_hash; /* cow anons visited for entity */
+ vmusage_t vme_result; /* identifies entity and results */
+} vmu_entity_t;
+
+/*
+ * Hash of entities visited within a zone, and an entity for the zone
+ * itself.
+ *
+ * vmz_zone and each of the hashes are only set up by vmu_alloc_zone() when
+ * the matching VMUSAGE_* flag is present in vmu_data.vmu_calc_flags, so any
+ * of them may be NULL.
+ */
+typedef struct vmu_zone {
+ struct vmu_zone *vmz_next; /* free list */
+ id_t vmz_id; /* id the zone was allocated for */
+ vmu_entity_t *vmz_zone; /* entity for the zone itself */
+ mod_hash_t *vmz_projects_hash; /* project entities in this zone */
+ mod_hash_t *vmz_tasks_hash; /* task entities in this zone */
+ mod_hash_t *vmz_rusers_hash; /* ruser entities in this zone */
+ mod_hash_t *vmz_eusers_hash; /* euser entities in this zone */
+} vmu_zone_t;
+
+/*
+ * Cache of results from last calculation.
+ * A pointer to the current cache is kept in vmu_data.vmu_cache.
+ */
+typedef struct vmu_cache {
+ vmusage_t *vmc_results; /* Results from last call to */
+ /* vm_getusage(). */
+ uint64_t vmc_nresults; /* Count of cached results */
+ uint64_t vmc_refcnt; /* refcnt for free */
+ uint_t vmc_flags; /* Flags for vm_getusage() */
+ hrtime_t vmc_timestamp; /* when cache was created */
+} vmu_cache_t;
+
+/*
+ * top level rss info for the system
+ *
+ * The vmu_free_* members are singly linked free lists. The vmu_free_*()
+ * functions push onto them and the vmu_alloc_*() functions pop from them
+ * before falling back to a fresh allocation.
+ */
+typedef struct vmu_data {
+ kmutex_t vmu_lock; /* Protects vmu_data */
+ kcondvar_t vmu_cv; /* Used to signal threads */
+ /* waiting for the */
+ /* calc thread to finish */
+ vmu_entity_t *vmu_system; /* Entity for tracking */
+ /* rss/swap for all processes */
+ /* in all zones */
+ mod_hash_t *vmu_zones_hash; /* Zones visited */
+ mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
+ mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
+ mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
+ /* to implement VMUSAGE_COL_* */
+ /* flags, which aggregate by */
+ /* project or user regardless */
+ /* of zoneid. */
+ mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
+ /* to track incore/not-incore */
+ mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
+ /* amps to track incore/not- */
+ /* incore */
+ vmu_entity_t *vmu_entities; /* Linked list of entities */
+ size_t vmu_nentities; /* Count of entities in list */
+ vmu_cache_t *vmu_cache; /* Cached results */
+ kthread_t *vmu_calc_thread; /* NULL, or thread running */
+ /* vmu_calculate() */
+ uint_t vmu_calc_flags; /* Flags being used by */
+ /* currently running calc */
+ /* thread */
+ uint_t vmu_pending_flags; /* Flags of vm_getusage() */
+ /* threads waiting for */
+ /* calc thread to finish */
+ uint_t vmu_pending_waiters; /* Number of threads waiting */
+ /* for calc thread */
+ vmu_bound_t *vmu_free_bounds; /* free list of bounds */
+ vmu_object_t *vmu_free_objects; /* free list of objects */
+ vmu_entity_t *vmu_free_entities; /* free list of entities */
+ vmu_zone_t *vmu_free_zones; /* free list of zones */
+} vmu_data_t;
+
+extern struct as kas;
+extern proc_t *practive;
+extern zone_t *global_zone;
+extern struct seg_ops segvn_ops;
+extern struct seg_ops segspt_shmops;
+
+/* Global state of the calculation; set up by vm_usage_init(). */
+static vmu_data_t vmu_data;
+/* kmem caches backing vmu_alloc_bound() and vmu_alloc_object(). */
+static kmem_cache_t *vmu_bound_cache;
+static kmem_cache_t *vmu_object_cache;
+
+/*
+ * Push a bound onto the head of vmu_data.vmu_free_bounds so that a later
+ * vmu_alloc_bound() can reuse it.  The bound's memory is not released.
+ */
+static void
+vmu_free_bound(vmu_bound_t *bound)
+{
+	vmu_bound_t **headp = &vmu_data.vmu_free_bounds;
+
+	bound->vmb_next = *headp;
+	*headp = bound;
+}
+
+/*
+ * Value destructor for the vnode/amp hashes: move every bound of the
+ * object to the bound free list, then put the object itself on
+ * vmu_data.vmu_free_objects.
+ */
+static void
+vmu_free_object(mod_hash_val_t val)
+{
+	vmu_object_t *obj = (vmu_object_t *)val;
+	vmu_bound_t *cur;
+	vmu_bound_t *following;
+
+	for (cur = obj->vmo_bounds; cur != NULL; cur = following) {
+		/* vmu_free_bound() overwrites vmb_next, so save it first */
+		following = cur->vmb_next;
+		vmu_free_bound(cur);
+	}
+
+	obj->vmo_next = vmu_data.vmu_free_objects;
+	vmu_data.vmu_free_objects = obj;
+}
+
+/*
+ * Value destructor for the entity hashes: empty the entity's hashes of
+ * visited vnodes, shared amps and anons (the hashes themselves are kept
+ * for reuse), then put the entity on vmu_data.vmu_free_entities.
+ */
+static void
+vmu_free_entity(mod_hash_val_t val)
+{
+	vmu_entity_t *entity = (vmu_entity_t *)val;
+	mod_hash_t *visited[3];
+	uint_t i;
+
+	visited[0] = entity->vme_vnode_hash;
+	visited[1] = entity->vme_amp_hash;
+	visited[2] = entity->vme_anon_hash;
+
+	for (i = 0; i < sizeof (visited) / sizeof (visited[0]); i++) {
+		if (visited[i] != NULL)
+			i_mod_hash_clear_nosync(visited[i]);
+	}
+
+	entity->vme_next = vmu_data.vmu_free_entities;
+	vmu_data.vmu_free_entities = entity;
+}
+
+/*
+ * Value destructor for vmu_data.vmu_zones_hash: free the zone's own
+ * entity, empty the per-zone hashes of project, task and user entities
+ * (the hashes themselves are kept for reuse), then put the zone on
+ * vmu_data.vmu_free_zones.
+ */
+static void
+vmu_free_zone(mod_hash_val_t val)
+{
+	vmu_zone_t *zone = (vmu_zone_t *)val;
+	mod_hash_t *members[4];
+	uint_t i;
+
+	if (zone->vmz_zone != NULL) {
+		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
+		zone->vmz_zone = NULL;
+	}
+
+	members[0] = zone->vmz_projects_hash;
+	members[1] = zone->vmz_tasks_hash;
+	members[2] = zone->vmz_rusers_hash;
+	members[3] = zone->vmz_eusers_hash;
+
+	for (i = 0; i < sizeof (members) / sizeof (members[0]); i++) {
+		if (members[i] != NULL)
+			i_mod_hash_clear_nosync(members[i]);
+	}
+
+	zone->vmz_next = vmu_data.vmu_free_zones;
+	vmu_data.vmu_free_zones = zone;
+}
+
+/*
+ * One-time setup of vmu_data: the lock and condition variable, the empty
+ * free lists, the system-wide vnode/amp hashes, the collapsed project and
+ * user hashes, the zone hash, the kmem caches for bounds and objects, and
+ * an empty entity list and results cache.
+ */
+void
+vm_usage_init()
+{
+	vmu_data_t *vd = &vmu_data;
+
+	mutex_init(&vd->vmu_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vmu_cv, NULL, CV_DEFAULT, NULL);
+
+	vd->vmu_system = NULL;
+	vd->vmu_zones_hash = NULL;
+	vd->vmu_projects_col_hash = NULL;
+	vd->vmu_rusers_col_hash = NULL;
+	vd->vmu_eusers_col_hash = NULL;
+
+	/* All free lists start out empty. */
+	vd->vmu_free_bounds = NULL;
+	vd->vmu_free_objects = NULL;
+	vd->vmu_free_entities = NULL;
+	vd->vmu_free_zones = NULL;
+
+	/* Pointer-keyed hashes of every vnode and shared amp visited. */
+	vd->vmu_all_vnodes_hash = mod_hash_create_ptrhash(
+	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+	    sizeof (vnode_t));
+	vd->vmu_all_amps_hash = mod_hash_create_ptrhash(
+	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+	    sizeof (struct anon_map));
+
+	/* Id-keyed hashes of collapsed entities and of zones. */
+	vd->vmu_projects_col_hash = mod_hash_create_idhash(
+	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vd->vmu_rusers_col_hash = mod_hash_create_idhash(
+	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vd->vmu_eusers_col_hash = mod_hash_create_idhash(
+	    "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
+	    vmu_free_entity);
+	vd->vmu_zones_hash = mod_hash_create_idhash(
+	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
+
+	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
+	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	vmu_object_cache = kmem_cache_create("vmu_object_cache",
+	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	vd->vmu_entities = NULL;
+	vd->vmu_nentities = 0;
+
+	/* No cached results and no calculation in progress. */
+	vd->vmu_cache = NULL;
+	vd->vmu_calc_thread = NULL;
+	vd->vmu_calc_flags = 0;
+	vd->vmu_pending_flags = 0;
+	vd->vmu_pending_waiters = 0;
+}
+
+/*
+ * Get an entity for (id, type, zoneid), reusing one from the free list
+ * when possible, and link it onto vmu_data.vmu_entities.
+ *
+ * A recycled entity keeps its (already cleared) visited-object hashes and
+ * only has its result zeroed.  A newly allocated entity is zero filled,
+ * so its hashes are created here.
+ */
+static vmu_entity_t *
+vmu_alloc_entity(id_t id, int type, id_t zoneid)
+{
+	vmu_entity_t *ent = vmu_data.vmu_free_entities;
+	vmusage_t *res;
+
+	if (ent == NULL) {
+		ent = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
+	} else {
+		vmu_data.vmu_free_entities = ent->vme_next;
+		bzero(&ent->vme_result, sizeof (vmusage_t));
+	}
+
+	res = &ent->vme_result;
+	res->vmu_id = id;
+	res->vmu_zoneid = zoneid;
+	res->vmu_type = type;
+
+	if (ent->vme_vnode_hash == NULL) {
+		ent->vme_vnode_hash = mod_hash_create_ptrhash(
+		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+		    sizeof (vnode_t));
+	}
+
+	if (ent->vme_amp_hash == NULL) {
+		ent->vme_amp_hash = mod_hash_create_ptrhash(
+		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
+		    sizeof (struct anon_map));
+	}
+
+	if (ent->vme_anon_hash == NULL) {
+		ent->vme_anon_hash = mod_hash_create_ptrhash(
+		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
+		    mod_hash_null_valdtor, sizeof (struct anon));
+	}
+
+	/* Prepend to the list of entities being counted. */
+	ent->vme_next = vmu_data.vmu_entities;
+	vmu_data.vmu_entities = ent;
+	vmu_data.vmu_nentities++;
+
+	return (ent);
+}
+
+/*
+ * Get a zone structure for zone id, reusing one from the free list when
+ * possible.  Depending on vmu_data.vmu_calc_flags, allocate an entity for
+ * the zone itself and create any missing hashes for the projects, tasks,
+ * real users and effective users seen inside the zone.
+ */
+static vmu_zone_t *
+vmu_alloc_zone(id_t id)
+{
+	const uint_t flags = vmu_data.vmu_calc_flags;
+	vmu_zone_t *zp = vmu_data.vmu_free_zones;
+
+	if (zp == NULL) {
+		zp = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
+	} else {
+		vmu_data.vmu_free_zones = zp->vmz_next;
+		zp->vmz_next = NULL;
+		zp->vmz_zone = NULL;
+	}
+
+	zp->vmz_id = id;
+
+	if ((flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+		zp->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
+
+	/* A recycled zone may already own some of these hashes. */
+	if (zp->vmz_projects_hash == NULL &&
+	    (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) != 0) {
+		zp->vmz_projects_hash = mod_hash_create_idhash(
+		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+	}
+
+	if (zp->vmz_tasks_hash == NULL &&
+	    (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) != 0) {
+		zp->vmz_tasks_hash = mod_hash_create_idhash(
+		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+	}
+
+	if (zp->vmz_rusers_hash == NULL &&
+	    (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) != 0) {
+		zp->vmz_rusers_hash = mod_hash_create_idhash(
+		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+	}
+
+	if (zp->vmz_eusers_hash == NULL &&
+	    (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) != 0) {
+		zp->vmz_eusers_hash = mod_hash_create_idhash(
+		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
+	}
+
+	return (zp);
+}
+
+/*
+ * Get a structure for tracking the visited bounds of one vm object,
+ * taken from the free list when possible and from vmu_object_cache
+ * otherwise.  The object starts with the given key and type and no
+ * bounds.
+ */
+static vmu_object_t *
+vmu_alloc_object(caddr_t key, int type)
+{
+	vmu_object_t *obj = vmu_data.vmu_free_objects;
+
+	if (obj == NULL)
+		obj = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
+	else
+		vmu_data.vmu_free_objects = obj->vmo_next;
+
+	obj->vmo_key = key;
+	obj->vmo_type = type;
+	obj->vmo_bounds = NULL;
+
+	return (obj);
+}
+
+/*
+ * Return a zero-filled bound structure, taken from the free list when
+ * possible and from vmu_bound_cache otherwise.
+ */
+static vmu_bound_t *
+vmu_alloc_bound()
+{
+	vmu_bound_t *bnd = vmu_data.vmu_free_bounds;
+
+	if (bnd == NULL)
+		bnd = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
+	else
+		vmu_data.vmu_free_bounds = bnd->vmb_next;
+
+	bzero(bnd, sizeof (vmu_bound_t));
+	return (bnd);
+}
+
+/*
+ * vmu_find_insert_* functions implement hash lookup or allocate and
+ * insert operations.
+ *
+ * vmu_find_insert_object() returns the object stored under key in hash.
+ * If there is none, it allocates one of the given type, inserts it and
+ * returns it.
+ */
+static vmu_object_t *
+vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
+{
+	int ret;
+	vmu_object_t *found;
+
+	if (i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
+	    (mod_hash_val_t *)&found) == 0)
+		return (found);
+
+	found = vmu_alloc_object(key, type);
+	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
+	    (mod_hash_val_t)found, (mod_hash_hndl_t)0);
+	ASSERT(ret == 0);
+
+	return (found);
+}
+
+/*
+ * Record key in hash if it is not already there.
+ * Returns 0 if key was already present, 1 if it was inserted by this call.
+ */
+static int
+vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+{
+	int ret;
+	caddr_t existing;
+
+	if (i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
+	    (mod_hash_val_t *)&existing) == 0)
+		return (0);
+
+	/* The key doubles as the stored value. */
+	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
+	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+	ASSERT(ret == 0);
+
+	return (1);
+}
+
+/*
+ * Return the entity stored under id in hash.  If there is none, allocate
+ * one for (id, type, zoneid), insert it and return it.
+ */
+static vmu_entity_t *
+vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
+{
+	int ret;
+	vmu_entity_t *found;
+	mod_hash_key_t hkey = (mod_hash_key_t)(uintptr_t)id;
+
+	if (i_mod_hash_find_nosync(hash, hkey, (mod_hash_val_t *)&found) == 0)
+		return (found);
+
+	found = vmu_alloc_entity(id, type, zoneid);
+	ret = i_mod_hash_insert_nosync(hash, hkey, (mod_hash_val_t)found,
+	    (mod_hash_hndl_t)0);
+	ASSERT(ret == 0);
+
+	return (found);
+}
+
+
+
+
+/*
+ * Returns list of object bounds between start and end.  New bounds inserted
+ * by this call are given type.
+ *
+ * start and end are inclusive page offsets.  On return, *first and *last
+ * are the first and last bounds of ro that overlap [start, end].  Existing
+ * bounds are left as they are, so *first may begin before start and *last
+ * may end after end.  Every gap inside the range is filled with a new
+ * bound of the given type.
+ *
+ * Returns the number of pages covered if new bounds are created.  Returns 0
+ * if region between start/end consists of all existing bounds.
+ */
+static pgcnt_t
+vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
+    end, char type, vmu_bound_t **first, vmu_bound_t **last)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *prev = NULL;
+	vmu_bound_t *tmp = NULL;
+	pgcnt_t ret = 0;
+
+	*first = *last = NULL;
+
+	for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
+		/*
+		 * Find bounds overlapping or overlapped by range [start,end].
+		 */
+		if (start > next->vmb_end) {
+			/* bound is before new bound */
+			prev = next;
+			continue;
+		}
+		if (next->vmb_start > end) {
+			/* bound is after new bound */
+			break;
+		}
+		if (*first == NULL)
+			*first = next;
+		*last = next;
+	}
+
+	if (*first == NULL) {
+		ASSERT(*last == NULL);
+		/*
+		 * No bounds overlapping range [start,end], so create new
+		 * bound
+		 */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_start = start;
+		tmp->vmb_end = end;
+		tmp->vmb_type = type;
+		if (prev == NULL) {
+			tmp->vmb_next = ro->vmo_bounds;
+			ro->vmo_bounds = tmp;
+		} else {
+			tmp->vmb_next = prev->vmb_next;
+			prev->vmb_next = tmp;
+		}
+		*first = tmp;
+		*last = tmp;
+		ASSERT(tmp->vmb_end >= tmp->vmb_start);
+		ret = tmp->vmb_end - tmp->vmb_start + 1;
+		return (ret);
+	}
+
+	/*
+	 * At least one existing bound overlaps the range.  Assert on the
+	 * bounds themselves; the first/last pointers were already
+	 * dereferenced above, so testing them would prove nothing.
+	 */
+	ASSERT(*first != NULL && *last != NULL);
+
+	/* Check to see if start is before first known bound */
+	next = (*first);
+	if (start < (*first)->vmb_start) {
+		/* Create new bound before first bound */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_start = start;
+		tmp->vmb_end = (*first)->vmb_start - 1;
+		tmp->vmb_type = type;
+		tmp->vmb_next = *first;
+		if (*first == ro->vmo_bounds)
+			ro->vmo_bounds = tmp;
+		if (prev != NULL)
+			prev->vmb_next = tmp;
+		ASSERT(tmp->vmb_end >= tmp->vmb_start);
+		ret += tmp->vmb_end - tmp->vmb_start + 1;
+		*first = tmp;
+	}
+	/*
+	 * Between start and end, search for gaps between and after existing
+	 * bounds.  Create new bounds to fill gaps if they exist.
+	 */
+	while (end > next->vmb_end) {
+		/*
+		 * Check for gap between bound and next bound. if no gap,
+		 * continue.
+		 */
+		if ((next != *last) &&
+		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
+			next = next->vmb_next;
+			continue;
+		}
+		/*
+		 * Insert new bound in gap after bound, and before next
+		 * bound if next bound exists.
+		 */
+		tmp = vmu_alloc_bound();
+		tmp->vmb_type = type;
+		tmp->vmb_next = next->vmb_next;
+		tmp->vmb_start = next->vmb_end + 1;
+
+		if (next != *last) {
+			/* Gap between two existing bounds. */
+			tmp->vmb_end = next->vmb_next->vmb_start - 1;
+			ASSERT(tmp->vmb_end >= tmp->vmb_start);
+			ret += tmp->vmb_end - tmp->vmb_start + 1;
+			next->vmb_next = tmp;
+			next = tmp->vmb_next;
+		} else {
+			/* Gap after the last overlapping bound, up to end. */
+			tmp->vmb_end = end;
+			ASSERT(tmp->vmb_end >= tmp->vmb_start);
+			ret += tmp->vmb_end - tmp->vmb_start + 1;
+			next->vmb_next = tmp;
+			*last = tmp;
+			break;
+		}
+	}
+	return (ret);
+}
+
+/*
+ * vmu_update_bounds()
+ *
+ * first, last: list of continuous bounds, of which zero or more are of
+ * type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * new_first, new_last: list of continuous bounds, of which none are of
+ * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
+ * update the types of bounds in (first,last) with
+ * type VMUSAGE_BOUND_UNKNOWN.
+ *
+ * For the list of bounds (first,last), this function updates any bounds
+ * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
+ * the list (new_first, new_last).
+ *
+ * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
+ * (new_first, new_last), it will be split into multiple bounds. If the
+ * bound that is split is *last, *last is moved to the new trailing bound.
+ *
+ * Return value:
+ * The number of pages in the list of bounds (first,last) that were of
+ * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
+ * VMUSAGE_BOUND_INCORE.
+ *
+ */
+static pgcnt_t
+vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
+ vmu_bound_t *new_first, vmu_bound_t *new_last)
+{
+ vmu_bound_t *next, *new_next, *tmp;
+ pgcnt_t rss = 0;
+
+ next = *first;
+ new_next = new_first;
+
+ /* verify bounds span same pages */
+ ASSERT((*first)->vmb_start >= new_next->vmb_start);
+ ASSERT((*last)->vmb_end <= new_last->vmb_end);
+ for (;;) {
+ /* If bound already has type, proceed to next bound */
+ if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ continue;
+ }
+ /* Skip new bounds that end before this unknown bound starts */
+ while (new_next->vmb_end < next->vmb_start)
+ new_next = new_next->vmb_next;
+ ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+ next->vmb_type = new_next->vmb_type;
+ if (new_next->vmb_end < next->vmb_end) {
+ /* need to split bound */
+ /* tmp keeps the still-unknown tail past new_next */
+ tmp = vmu_alloc_bound();
+ tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ tmp->vmb_start = new_next->vmb_end + 1;
+ tmp->vmb_end = next->vmb_end;
+ tmp->vmb_next = next->vmb_next;
+ next->vmb_end = new_next->vmb_end;
+ next->vmb_next = tmp;
+ if (*last == next)
+ *last = tmp;
+ if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+ rss += next->vmb_end - next->vmb_start + 1;
+ /* resolve the unknown tail on the next iteration */
+ next = tmp;
+ } else {
+ if (next->vmb_type == VMUSAGE_BOUND_INCORE)
+ rss += next->vmb_end - next->vmb_start + 1;
+ if (next == *last)
+ break;
+ next = next->vmb_next;
+ }
+ }
+ return (rss);
+}
+
+/*
+ * Merge adjacent bounds of the same type between *first and *last.
+ * A bound that is merged into its predecessor goes to the bound free
+ * list.  If that bound was *last, *last is moved to the bound that
+ * absorbed it.
+ */
+static void
+vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
+{
+	vmu_bound_t *cur;
+	vmu_bound_t *absorbed;
+
+	ASSERT(*first != NULL);
+	ASSERT(*last != NULL);
+
+	cur = *first;
+	while (cur != *last) {
+		absorbed = cur->vmb_next;
+
+		/* Nothing to merge unless contiguous and of the same type */
+		if ((cur->vmb_end + 1) != absorbed->vmb_start ||
+		    cur->vmb_type != absorbed->vmb_type) {
+			cur = absorbed;
+			continue;
+		}
+
+		/* Extend cur over its successor and unlink the successor */
+		cur->vmb_end = absorbed->vmb_end;
+		cur->vmb_next = absorbed->vmb_next;
+		vmu_free_bound(absorbed);
+		if (absorbed == *last)
+			*last = cur;
+	}
+}
+
+/*
+ * Given an amp and a list of bounds, updates each bound's type with
+ * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
+ *
+ * If a bound is partially incore, it will be split into two bounds.
+ * first and last may be modified, as bounds may be split into multiple
+ * bounds if they are partially incore/not-incore.
+ *
+ * Set incore to non-zero if bounds are already known to be incore
+ *
+ * Bounds whose type is already known are skipped.  The amp's a_rwlock is
+ * held as reader for the whole walk.
+ */
+static void
+vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
+    vmu_bound_t **last, boolean_t incore)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *tmp;
+	pgcnt_t index;
+	short bound_type;
+	short page_type;
+	vnode_t *vn;
+	anoff_t off;
+	struct anon *ap;
+
+	next = *first;
+	/* Shared anon slots don't change once set */
+	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
+	for (;;) {
+		if (incore == B_TRUE)
+			next->vmb_type = VMUSAGE_BOUND_INCORE;
+
+		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+			continue;
+		}
+		bound_type = next->vmb_type;
+		index = next->vmb_start;
+		while (index <= next->vmb_end) {
+
+			/*
+			 * These are used to determine how much to increment
+			 * index when a large page is found.
+			 */
+			page_t *page;
+			pgcnt_t pgcnt = 1;
+			uint_t pgshft;
+			pgcnt_t pgmsk;
+
+			ap = anon_get_ptr(amp->ahp, index);
+			if (ap != NULL)
+				swap_xlate(ap, &vn, &off);
+
+			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
+			    (page = page_exists(vn, off)) != NULL) {
+				page_type = VMUSAGE_BOUND_INCORE;
+				if (page->p_szc > 0) {
+					pgcnt = page_get_pagecnt(page->p_szc);
+					pgshft = page_get_shift(page->p_szc);
+					pgmsk = (0x1 << (pgshft - PAGESHIFT))
+					    - 1;
+				}
+			} else {
+				page_type = VMUSAGE_BOUND_NOT_INCORE;
+			}
+			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+				/*
+				 * First page looked up for this bound: it
+				 * decides the bound's type.  Record that the
+				 * type is now known, so that a later page of
+				 * the other type splits the bound instead of
+				 * overwriting the type of the whole bound.
+				 */
+				next->vmb_type = page_type;
+				bound_type = page_type;
+			} else if (next->vmb_type != page_type) {
+				/*
+				 * if current bound type does not match page
+				 * type, need to split off new bound.
+				 */
+				tmp = vmu_alloc_bound();
+				tmp->vmb_type = page_type;
+				tmp->vmb_start = index;
+				tmp->vmb_end = next->vmb_end;
+				tmp->vmb_next = next->vmb_next;
+				next->vmb_end = index - 1;
+				next->vmb_next = tmp;
+				if (*last == next)
+					*last = tmp;
+				next = tmp;
+			}
+			if (pgcnt > 1) {
+				/*
+				 * If inside large page, jump to next large
+				 * page
+				 */
+				index = (index & ~pgmsk) + pgcnt;
+			} else {
+				index++;
+			}
+		}
+		if (next == *last) {
+			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+			break;
+		} else
+			next = next->vmb_next;
+	}
+	ANON_LOCK_EXIT(&amp->a_rwlock);
+}
+
+/*
+ * Same as vmu_amp_update_incore_bounds(), except for tracking
+ * incore-/not-incore for vnodes.
+ *
+ * Given a vnode and a list of bounds, updates each bound of type
+ * VMUSAGE_BOUND_UNKNOWN to VMUSAGE_BOUND_INCORE or
+ * VMUSAGE_BOUND_NOT_INCORE by looking up each page with page_exists().
+ * A bound that is partially incore is split, so *last may be modified.
+ * While the vnode has no pages (v_pages == NULL), each bound visited is
+ * simply marked VMUSAGE_BOUND_NOT_INCORE.
+ */
+static void
+vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
+    vmu_bound_t **last)
+{
+	vmu_bound_t *next;
+	vmu_bound_t *tmp;
+	pgcnt_t index;
+	short bound_type;
+	short page_type;
+
+	next = *first;
+	for (;;) {
+		if (vnode->v_pages == NULL)
+			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
+
+		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
+			if (next == *last)
+				break;
+			next = next->vmb_next;
+			continue;
+		}
+
+		bound_type = next->vmb_type;
+		index = next->vmb_start;
+		while (index <= next->vmb_end) {
+
+			/*
+			 * These are used to determine how much to increment
+			 * index when a large page is found.
+			 */
+			page_t *page;
+			pgcnt_t pgcnt = 1;
+			uint_t pgshft;
+			pgcnt_t pgmsk;
+
+			if (vnode->v_pages != NULL &&
+			    (page = page_exists(vnode, ptob(index))) != NULL) {
+				page_type = VMUSAGE_BOUND_INCORE;
+				if (page->p_szc > 0) {
+					pgcnt = page_get_pagecnt(page->p_szc);
+					pgshft = page_get_shift(page->p_szc);
+					pgmsk = (0x1 << (pgshft - PAGESHIFT))
+					    - 1;
+				}
+			} else {
+				page_type = VMUSAGE_BOUND_NOT_INCORE;
+			}
+			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
+				/*
+				 * First page looked up for this bound: it
+				 * decides the bound's type.  Record that the
+				 * type is now known, so that a later page of
+				 * the other type splits the bound instead of
+				 * overwriting the type of the whole bound.
+				 */
+				next->vmb_type = page_type;
+				bound_type = page_type;
+			} else if (next->vmb_type != page_type) {
+				/*
+				 * if current bound type does not match page
+				 * type, need to split off new bound.
+				 */
+				tmp = vmu_alloc_bound();
+				tmp->vmb_type = page_type;
+				tmp->vmb_start = index;
+				tmp->vmb_end = next->vmb_end;
+				tmp->vmb_next = next->vmb_next;
+				next->vmb_end = index - 1;
+				next->vmb_next = tmp;
+				if (*last == next)
+					*last = tmp;
+				next = tmp;
+			}
+			if (pgcnt > 1) {
+				/*
+				 * If inside large page, jump to next large
+				 * page
+				 */
+				index = (index & ~pgmsk) + pgcnt;
+			} else {
+				index++;
+			}
+		}
+		if (next == *last) {
+			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
+			break;
+		} else
+			next = next->vmb_next;
+	}
+}
+
+/*
+ * Calculate the rss and swap consumed by a segment. vmu_entities is the
+ * list of entities to visit. For shared segments, the vnode or amp
+ * is looked up in each entity to see if has been already counted. Private
+ * anon pages are checked per entity to ensure that cow pages are not
+ * double counted.
+ *
+ * For private mapped files, first the amp is checked for private pages.
+ * Bounds not backed by the amp are looked up in the vnode for each entity
+ * to avoid double counting of private COW vnode pages.
+ */
+static void
+vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
+{
+ struct segvn_data *svd;
+ struct shm_data *shmd;
+ struct spt_data *sptd;
+ vmu_object_t *shared_object = NULL;
+ vmu_object_t *entity_object = NULL;
+ vmu_entity_t *entity;
+ vmusage_t *result;
+ vmu_bound_t *first = NULL;
+ vmu_bound_t *last = NULL;
+ vmu_bound_t *cur = NULL;
+ vmu_bound_t *e_first = NULL;
+ vmu_bound_t *e_last = NULL;
+ vmu_bound_t *tmp;
+ pgcnt_t p_index, s_index, rss, virt;
+ /*
+ * The shared (s_*) and private (p_*) page ranges are only assigned on
+ * some of the segment-type paths below, yet s_start is read
+ * unconditionally when a private amp is walked (a private anonymous
+ * mapping has a private amp but no shared object). Start them at
+ * zero so that read never sees an indeterminate value.
+ */
+ pgcnt_t p_start = 0, p_end = 0, s_start = 0, s_end = 0;
+ struct anon_map *private_amp = NULL;
+ boolean_t incore = B_FALSE;
+ boolean_t shared = B_FALSE;
+ int file = 0;
+ pgcnt_t swresv = 0;
+ pgcnt_t panon = 0;
+
+ /* Can zero-length segments exist? Not sure, so paranoia */
+ if (seg->s_size <= 0)
+ return;
+
+ /*
+ * Figure out if there is a shared object (such as a named vnode or
+ * a shared amp), then figure out if there is a private amp, which
+ * identifies private pages.
+ */
+ if (seg->s_ops == &segvn_ops) {
+ svd = (struct segvn_data *)seg->s_data;
+ /* Only non-shared segvn segments contribute their own swresv */
+ if (svd->type == MAP_SHARED)
+ shared = B_TRUE;
+ else
+ swresv = svd->swresv;
+
+ if (svd->vp != NULL) {
+ file = 1;
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
+ VMUSAGE_TYPE_VNODE);
+ s_start = btop(svd->offset);
+ s_end = btop(svd->offset + seg->s_size) - 1;
+ }
+ if (svd->amp != NULL && svd->type == MAP_SHARED) {
+ ASSERT(shared_object == NULL);
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
+ VMUSAGE_TYPE_AMP);
+ s_start = svd->anon_index;
+ s_end = svd->anon_index + btop(seg->s_size) - 1;
+ /* schedctl mappings are always in core */
+ if (svd->amp->swresv == 0)
+ incore = B_TRUE;
+ }
+ if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
+ private_amp = svd->amp;
+ p_start = svd->anon_index;
+ p_end = svd->anon_index + btop(seg->s_size) - 1;
+ }
+ } else if (seg->s_ops == &segspt_shmops) {
+ shared = B_TRUE;
+ shmd = (struct shm_data *)seg->s_data;
+ shared_object = vmu_find_insert_object(
+ vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
+ VMUSAGE_TYPE_AMP);
+ s_start = 0;
+ s_end = btop(seg->s_size) - 1;
+ sptd = shmd->shm_sptseg->s_data;
+
+ /* ism segments are always incore and do not reserve swap */
+ if (sptd->spt_flags & SHM_SHARE_MMU)
+ incore = B_TRUE;
+
+ } else {
+ /* Any other segment driver is not accounted for here */
+ return;
+ }
+
+ /*
+ * If there is a private amp, count anon pages that exist. If an
+ * anon has a refcnt > 1 (cow sharing), then save the anon in a
+ * hash so that it is not double counted.
+ *
+ * If there is also a shared object, then figure out the bounds
+ * which are not mapped by the private amp.
+ */
+ if (private_amp != NULL) {
+
+ /* Enter as writer to prevent cow anons from being freed */
+ ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
+
+ /*
+ * p_index walks the private amp and s_index walks the shared
+ * object in lock step. s_index is only consumed when
+ * shared_object is non-NULL.
+ */
+ p_index = p_start;
+ s_index = s_start;
+
+ while (p_index <= p_end) {
+
+ pgcnt_t p_index_next;
+ pgcnt_t p_bound_size;
+ int cnt;
+ anoff_t off;
+ struct vnode *vn;
+ struct anon *ap;
+ page_t *page; /* For handling of large */
+ pgcnt_t pgcnt = 1; /* pages */
+ pgcnt_t pgstart;
+ pgcnt_t pgend;
+ uint_t pgshft;
+ pgcnt_t pgmsk;
+
+ p_index_next = p_index;
+ ap = anon_get_next_ptr(private_amp->ahp,
+ &p_index_next);
+
+ /*
+ * If next anon is past end of mapping, simulate
+ * end of anon so loop terminates.
+ */
+ if (p_index_next > p_end) {
+ p_index_next = p_end + 1;
+ ap = NULL;
+ }
+ /*
+ * For cow segments, keep track of bounds not
+ * backed by private amp so they can be looked
+ * up in the backing vnode
+ */
+ if (p_index_next != p_index) {
+
+ /*
+ * Compute index difference between anon and
+ * previous anon.
+ */
+ p_bound_size = p_index_next - p_index - 1;
+
+ if (shared_object != NULL) {
+ /* Append the gap to the first/last list */
+ cur = vmu_alloc_bound();
+ cur->vmb_next = NULL;
+ cur->vmb_start = s_index;
+ cur->vmb_end = s_index + p_bound_size;
+ cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ if (first == NULL) {
+ first = cur;
+ last = cur;
+ } else {
+ last->vmb_next = cur;
+ last = cur;
+ }
+ }
+ p_index = p_index + p_bound_size + 1;
+ s_index = s_index + p_bound_size + 1;
+ }
+
+ /* Detect end of anons in amp */
+ if (ap == NULL)
+ break;
+
+ cnt = ap->an_refcnt;
+ swap_xlate(ap, &vn, &off);
+
+ /* Anon slot with no page found: step past it, count nothing */
+ if (vn == NULL || vn->v_pages == NULL ||
+ (page = page_exists(vn, off)) == NULL) {
+ p_index++;
+ s_index++;
+ continue;
+ }
+
+ /*
+ * If large page is found, compute portion of large
+ * page in mapping, and increment indices to the next
+ * large page.
+ */
+ if (page->p_szc > 0) {
+
+ pgcnt = page_get_pagecnt(page->p_szc);
+ pgshft = page_get_shift(page->p_szc);
+ pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
+
+ /* First page in large page */
+ pgstart = p_index & ~pgmsk;
+ /* Last page in large page */
+ pgend = pgstart + pgcnt - 1;
+ /*
+ * Artificially end page if page extends past
+ * end of mapping.
+ */
+ if (pgend > p_end)
+ pgend = p_end;
+
+ /*
+ * Compute number of pages from large page
+ * which are mapped.
+ */
+ pgcnt = pgend - p_index + 1;
+
+ /*
+ * Point indices at page after large page,
+ * or at page after end of mapping.
+ */
+ p_index += pgcnt;
+ s_index += pgcnt;
+ } else {
+ p_index++;
+ s_index++;
+ }
+
+ /*
+ * Assume anon structs with a refcnt
+ * of 1 are not cow shared, so there
+ * is no reason to track them per entity.
+ */
+ if (cnt == 1) {
+ panon += pgcnt;
+ continue;
+ }
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+
+ result = &entity->vme_result;
+ /*
+ * Track cow anons per entity so
+ * they are not double counted.
+ */
+ if (vmu_find_insert_anon(entity->vme_anon_hash,
+ (caddr_t)ap) == 0)
+ continue;
+
+ result->vmu_rss_all += (pgcnt << PAGESHIFT);
+ result->vmu_rss_private +=
+ (pgcnt << PAGESHIFT);
+ }
+ }
+ ANON_LOCK_EXIT(&private_amp->a_rwlock);
+ }
+
+ /* Add up resident anon and swap reserved for private mappings */
+ if (swresv > 0 || panon > 0) {
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+ result = &entity->vme_result;
+ result->vmu_swap_all += swresv;
+ result->vmu_swap_private += swresv;
+ result->vmu_rss_all += (panon << PAGESHIFT);
+ result->vmu_rss_private += (panon << PAGESHIFT);
+ }
+ }
+
+ /* Compute resident pages backing shared amp or named vnode */
+ if (shared_object != NULL) {
+ if (first == NULL) {
+ /*
+ * No private amp, or private amp has no anon
+ * structs. This means entire segment is backed by
+ * the shared object.
+ */
+ first = vmu_alloc_bound();
+ first->vmb_next = NULL;
+ first->vmb_start = s_start;
+ first->vmb_end = s_end;
+ first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
+ }
+ /*
+ * Iterate bounds not backed by private amp, and compute
+ * resident pages.
+ *
+ * NOTE(review): first and last are handed by address to the
+ * bound helpers below while cur keeps walking the list built
+ * above -- presumably the helpers repoint them at the shared
+ * object's bounds; confirm against those helpers.
+ */
+ cur = first;
+ while (cur != NULL) {
+
+ if (vmu_insert_lookup_object_bounds(shared_object,
+ cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
+ &first, &last) > 0) {
+ /* new bounds, find incore/not-incore */
+ if (shared_object->vmo_type ==
+ VMUSAGE_TYPE_VNODE)
+ vmu_vnode_update_incore_bounds(
+ (vnode_t *)
+ shared_object->vmo_key, &first,
+ &last);
+ else
+ vmu_amp_update_incore_bounds(
+ (struct anon_map *)
+ shared_object->vmo_key, &first,
+ &last, incore);
+ vmu_merge_bounds(&first, &last);
+ }
+ for (entity = vmu_entities; entity != NULL;
+ entity = entity->vme_next_calc) {
+
+ result = &entity->vme_result;
+
+ /* Per-entity record of this vnode or amp */
+ entity_object = vmu_find_insert_object(
+ shared_object->vmo_type ==
+ VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
+ entity->vme_amp_hash,
+ shared_object->vmo_key,
+ shared_object->vmo_type);
+
+ virt = vmu_insert_lookup_object_bounds(
+ entity_object, cur->vmb_start, cur->vmb_end,
+ VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
+
+ /* Nothing new in this range for this entity */
+ if (virt == 0)
+ continue;
+ /*
+ * Range visited for this entity
+ */
+ rss = vmu_update_bounds(&e_first,
+ &e_last, first, last);
+ result->vmu_rss_all += (rss << PAGESHIFT);
+ if (shared == B_TRUE && file == B_FALSE) {
+ /* shared anon mapping */
+ result->vmu_swap_all +=
+ (virt << PAGESHIFT);
+ result->vmu_swap_shared +=
+ (virt << PAGESHIFT);
+ result->vmu_rss_shared +=
+ (rss << PAGESHIFT);
+ } else if (shared == B_TRUE && file == B_TRUE) {
+ /* shared file mapping */
+ result->vmu_rss_shared +=
+ (rss << PAGESHIFT);
+ } else if (shared == B_FALSE &&
+ file == B_TRUE) {
+ /* private file mapping */
+ result->vmu_rss_private +=
+ (rss << PAGESHIFT);
+ }
+ vmu_merge_bounds(&e_first, &e_last);
+ }
+ /* Advance before releasing the bound just processed */
+ tmp = cur;
+ cur = cur->vmb_next;
+ vmu_free_bound(tmp);
+ }
+ }
+}
+
+/*
+ * Based on the current calculation flags, find the relevant entities
+ * which are relative to the process. Then calculate each segment
+ * in the process's address space for each relevant entity.
+ */
+static void
+vmu_calculate_proc(proc_t *p)
+{
+ vmu_entity_t *calc_list = NULL;
+ vmu_entity_t *ent;
+ vmu_zone_t *zone;
+ struct as *as;
+ struct seg *seg;
+ int ret;
+
+ /*
+ * Build the list of entities this process is charged to. Each
+ * entity is pushed onto the head of calc_list through its
+ * vme_next_calc link.
+ */
+ ent = vmu_data.vmu_system;
+ if (ent != NULL) {
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ /* Per-zone entities: zone, project, task, real and effective user */
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
+ VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
+ VMUSAGE_ALL_EUSERS)) != 0) {
+
+ /* Look up the process's zone record, creating it if absent */
+ ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
+ (mod_hash_val_t *)&zone);
+ if (ret != 0) {
+ zone = vmu_alloc_zone(p->p_zone->zone_id);
+ ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
+ (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
+ (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
+ ASSERT(ret == 0);
+ }
+
+ ent = zone->vmz_zone;
+ if (ent != NULL) {
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) != 0) {
+ ent = vmu_find_insert_entity(zone->vmz_projects_hash,
+ p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
+ zone->vmz_id);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) != 0) {
+ ent = vmu_find_insert_entity(zone->vmz_tasks_hash,
+ p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) != 0) {
+ ent = vmu_find_insert_entity(zone->vmz_rusers_hash,
+ crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) != 0) {
+ ent = vmu_find_insert_entity(zone->vmz_eusers_hash,
+ crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+ }
+
+ /* Collapsed entities: one project/user record spanning all zones */
+ if ((vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) != 0) {
+ ent = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
+ p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+ if ((vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) != 0) {
+ ent = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
+ crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+ if ((vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) != 0) {
+ ent = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
+ crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
+ ent->vme_next_calc = calc_list;
+ calc_list = ent;
+ }
+
+ ASSERT(calc_list != NULL);
+
+ /* Charge every segment of the address space to the listed entities */
+ as = p->p_as;
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ seg = AS_SEGFIRST(as);
+ while (seg != NULL) {
+ vmu_calculate_seg(calc_list, seg);
+ seg = AS_SEGNEXT(as, seg);
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+}
+
+/*
+ * Free data created by previous call to vmu_calculate().
+ */
+static void
+vmu_clear_calc()
+{
+ /* Release the system-wide entity left over from the last run */
+ if (vmu_data.vmu_system != NULL) {
+ vmu_free_entity(vmu_data.vmu_system);
+ vmu_data.vmu_system = NULL;
+ }
+
+ /* These hashes may not exist yet, so each is checked before use */
+ if (vmu_data.vmu_zones_hash != NULL) {
+ i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
+ }
+ if (vmu_data.vmu_projects_col_hash != NULL) {
+ i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
+ }
+ if (vmu_data.vmu_rusers_col_hash != NULL) {
+ i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
+ }
+ if (vmu_data.vmu_eusers_col_hash != NULL) {
+ i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
+ }
+
+ /* The global vnode and amp hashes are cleared unconditionally */
+ i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
+ i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
+}
+
+/*
+ * Free unused data structures. These can result if the system workload
+ * decreases between calculations.
+ */
+static void
+vmu_free_extra()
+{
+ vmu_bound_t *bound;
+ vmu_object_t *object;
+ vmu_entity_t *entity;
+ vmu_zone_t *zone;
+
+ /* Drain the bound free list back into its kmem cache */
+ while ((bound = vmu_data.vmu_free_bounds) != NULL) {
+ vmu_data.vmu_free_bounds = bound->vmb_next;
+ kmem_cache_free(vmu_bound_cache, bound);
+ }
+
+ /* Drain the object free list back into its kmem cache */
+ while ((object = vmu_data.vmu_free_objects) != NULL) {
+ vmu_data.vmu_free_objects = object->vmo_next;
+ kmem_cache_free(vmu_object_cache, object);
+ }
+
+ /* Free unused entities along with whichever hashes they own */
+ while ((entity = vmu_data.vmu_free_entities) != NULL) {
+ vmu_data.vmu_free_entities = entity->vme_next;
+ if (entity->vme_vnode_hash != NULL)
+ mod_hash_destroy_hash(entity->vme_vnode_hash);
+ if (entity->vme_amp_hash != NULL)
+ mod_hash_destroy_hash(entity->vme_amp_hash);
+ if (entity->vme_anon_hash != NULL)
+ mod_hash_destroy_hash(entity->vme_anon_hash);
+ kmem_free(entity, sizeof (vmu_entity_t));
+ }
+
+ /* Free unused zones along with whichever hashes they own */
+ while ((zone = vmu_data.vmu_free_zones) != NULL) {
+ vmu_data.vmu_free_zones = zone->vmz_next;
+ if (zone->vmz_projects_hash != NULL)
+ mod_hash_destroy_hash(zone->vmz_projects_hash);
+ if (zone->vmz_tasks_hash != NULL)
+ mod_hash_destroy_hash(zone->vmz_tasks_hash);
+ if (zone->vmz_rusers_hash != NULL)
+ mod_hash_destroy_hash(zone->vmz_rusers_hash);
+ if (zone->vmz_eusers_hash != NULL)
+ mod_hash_destroy_hash(zone->vmz_eusers_hash);
+ kmem_free(zone, sizeof (vmu_zone_t));
+ }
+}
+
+extern kcondvar_t *pr_pid_cv;
+
+/*
+ * Determine which entity types are relevant and allocate the hashes to
+ * track them. Then walk the process table and count rss and swap
+ * for each process's address space. Address space objects such as
+ * vnodes, amps and anons are tracked per entity, so that they are
+ * not double counted in the results.
+ *
+ */
+static void
+vmu_calculate()
+{
+ int i = 0;
+ int ret;
+ proc_t *p;
+
+ /* Discard whatever the previous calculation left behind */
+ vmu_clear_calc();
+
+ /* The system-wide entity only exists when it was asked for */
+ if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
+ vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
+ ALL_ZONES);
+
+ /*
+ * Walk process table and calculate rss of each proc.
+ *
+ * Pidlock and p_lock cannot be held while doing the rss calculation.
+ * This is because:
+ * 1. The calculation allocates using KM_SLEEP.
+ * 2. The calculation grabs a_lock, which cannot be grabbed
+ * after p_lock.
+ *
+ * Since pidlock must be dropped, we cannot simply just walk the
+ * practive list. Instead, we walk the process table, and sprlock
+ * each process to ensure that it does not exit during the
+ * calculation.
+ */
+
+ /*
+ * Loop invariant: pidlock is held at the top of every iteration
+ * (including re-entry through "again"), so every path that
+ * continues or jumps back re-acquires it first.
+ */
+ mutex_enter(&pidlock);
+ for (i = 0; i < v.v_proc; i++) {
+again:
+ p = pid_entry(i);
+ if (p == NULL)
+ continue;
+
+ /* Take p_lock before letting go of pidlock */
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ /*
+ * On panic, give up immediately. pidlock is already dropped
+ * here, and vmu_free_extra() is not run on this path.
+ */
+ if (panicstr) {
+ mutex_exit(&p->p_lock);
+ return;
+ }
+
+ /* Try to set P_PR_LOCK */
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ /* Process in invalid state */
+ mutex_exit(&p->p_lock);
+ mutex_enter(&pidlock);
+ continue;
+ } else if (ret == 1) {
+ /*
+ * P_PR_LOCK is already set. Wait and try again.
+ * This also drops p_lock.
+ */
+ sprwaitlock_proc(p);
+ mutex_enter(&pidlock);
+ /* Slot i is looked up afresh; its occupant may differ */
+ goto again;
+ }
+ /* P_PR_LOCK is ours; the calculation runs without p_lock */
+ mutex_exit(&p->p_lock);
+
+ vmu_calculate_proc(p);
+
+ /* sprunlock() is entered with p_lock held */
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ mutex_enter(&pidlock);
+ }
+ mutex_exit(&pidlock);
+
+ /* Give back structures left unused on the free lists */
+ vmu_free_extra();
+}
+
+/*
+ * allocate a new cache for N results satisfying flags
+ */
+vmu_cache_t *
+vmu_cache_alloc(size_t nres, uint_t flags)
+{
+ vmu_cache_t *newcache;
+
+ /* Zeroed header, then a zeroed array of nres result slots */
+ newcache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
+ newcache->vmc_results =
+ kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
+
+ /* The caller receives the cache holding a single reference */
+ newcache->vmc_refcnt = 1;
+ newcache->vmc_flags = flags;
+ newcache->vmc_nresults = nres;
+
+ return (newcache);
+}
+
+/*
+ * Make sure cached results are not freed
+ */
+static void
+vmu_cache_hold(vmu_cache_t *cache)
+{
+ /* The reference count is only changed under vmu_lock */
+ ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+ cache->vmc_refcnt += 1;
+}
+
+/*
+ * free cache data
+ */
+static void
+vmu_cache_rele(vmu_cache_t *cache)
+{
+ ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
+ ASSERT(cache->vmc_refcnt > 0);
+
+ /* Drop one reference; only the final release frees anything */
+ cache->vmc_refcnt -= 1;
+ if (cache->vmc_refcnt != 0)
+ return;
+
+ /* Results array first, then the cache header that points to it */
+ kmem_free(cache->vmc_results,
+ sizeof (vmusage_t) * cache->vmc_nresults);
+ kmem_free(cache, sizeof (vmu_cache_t));
+}
+
+/*
+ * Copy out the cached results to a caller. Inspect the callers flags
+ * and zone to determine which cached results should be copied.
+ */
+static int
+vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
+ uint_t flags)
+{
+ vmusage_t *res;
+ vmusage_t *dst = buf;
+ vmusage_t fake;
+ size_t idx;
+ size_t nfound = 0;
+ size_t capacity = 0;
+ int error = 0;
+ uint_t want = 0;
+ int nonglobal = (curproc->p_zone != global_zone);
+
+ /* Without nres the caller has declared room for zero results */
+ if (nres != NULL &&
+ copyin((caddr_t)nres, &capacity, sizeof (size_t)))
+ return (set_errno(EFAULT));
+
+ /* Reduce the requested flags to the set of result types wanted */
+ if ((flags & VMUSAGE_SYSTEM) && !nonglobal)
+ want |= VMUSAGE_SYSTEM;
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ want |= VMUSAGE_ZONE;
+ if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS))
+ want |= VMUSAGE_PROJECTS;
+ if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
+ want |= VMUSAGE_TASKS;
+ if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
+ want |= VMUSAGE_RUSERS;
+ if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
+ want |= VMUSAGE_EUSERS;
+
+ /*
+ * Visit every cached result. Matches are always counted; they are
+ * copied out only while the caller's buffer still has room.
+ */
+ for (idx = 0; idx < cache->vmc_nresults; idx++) {
+ res = &cache->vmc_results[idx];
+
+ /* A non-global zone never sees another zone's results */
+ if (nonglobal &&
+ curproc->p_zone->zone_id != res->vmu_zoneid)
+ continue;
+
+ /*
+ * A non-global zone asking for VMUSAGE_SYSTEM is handed its
+ * own VMUSAGE_ZONE result relabelled as the system result.
+ */
+ if (nonglobal && (flags & VMUSAGE_SYSTEM) != 0 &&
+ res->vmu_type == VMUSAGE_ZONE) {
+ nfound++;
+ if (dst != NULL && capacity < nfound) {
+ error = set_errno(EOVERFLOW);
+ } else if (dst != NULL) {
+ fake = *res;
+ fake.vmu_zoneid = ALL_ZONES;
+ fake.vmu_id = 0;
+ fake.vmu_type = VMUSAGE_SYSTEM;
+ if (copyout(&fake, dst, sizeof (vmusage_t)))
+ return (set_errno(EFAULT));
+ dst++;
+ }
+ }
+
+ /* Drop result types that were not requested */
+ if ((res->vmu_type & want) == 0)
+ continue;
+
+ /* Collated results need the matching VMUSAGE_COL_* flag */
+ if (res->vmu_zoneid == ALL_ZONES &&
+ ((res->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & VMUSAGE_COL_PROJECTS) == 0) ||
+ (res->vmu_type == VMUSAGE_EUSERS &&
+ (flags & VMUSAGE_COL_EUSERS) == 0) ||
+ (res->vmu_type == VMUSAGE_RUSERS &&
+ (flags & VMUSAGE_COL_RUSERS) == 0)))
+ continue;
+
+ /* Other-zone results need an ALL_* or COL_* flag */
+ if (res->vmu_zoneid != curproc->p_zone->zone_id &&
+ ((res->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0) ||
+ (res->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0) ||
+ (res->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0) ||
+ (res->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0) ||
+ (res->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)))
+ continue;
+
+ /* Count the match, and copy it out if it still fits */
+ nfound++;
+ if (dst != NULL && capacity < nfound) {
+ error = set_errno(EOVERFLOW);
+ } else if (dst != NULL) {
+ if (copyout(res, dst, sizeof (vmusage_t)))
+ return (set_errno(EFAULT));
+ dst++;
+ }
+ }
+
+ /* Report the total number of matches, including any that overflowed */
+ if (nres != NULL &&
+ copyout(&nfound, (void *)nres, sizeof (size_t)))
+ return (set_errno(EFAULT));
+
+ return (error);
+}
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user. The flags argument
+ * determines the type of results structures returned. Flags requesting
+ * results from more than one zone are "flattened" to the local zone if the
+ * caller is not the global zone.
+ *
+ * args:
+ * flags: bitmap consisting of one or more of VMUSAGE_*.
+ * age: maximum allowable age (time since counting was done) in
+ * seconds of the results. Results from previous callers are
+ * cached in kernel.
+ * buf: pointer to buffer array of vmusage_t. If NULL, then only nres
+ * set on success.
+ * nres: Set to number of vmusage_t structures pointed to by buf
+ * before calling vm_getusage().
+ * On return 0 (success) or EOVERFLOW, is set to the number of result
+ * structures returned or attempted to return.
+ *
+ * returns 0 on success, -1 on failure:
+ * EINTR (interrupted)
+ * EOVERFLOW (nres too small for results, nres set to needed value for success)
+ * EINVAL (flags invalid)
+ * EFAULT (bad address for buf or nres)
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+ vmu_entity_t *entity;
+ vmusage_t *result;
+ int ret = 0;
+ int cacherecent = 0;
+ hrtime_t now;
+ uint_t flags_orig;
+
+ /*
+ * Non-global zones cannot request system wide and/or collated
+ * results, or the system result, so munge the flags accordingly.
+ */
+ /*
+ * The caller's unmodified flags are kept in flags_orig and are what
+ * gets passed to vmu_copyout_results(); the munged flags only decide
+ * what has to be calculated.
+ */
+ flags_orig = flags;
+ if (curproc->p_zone != global_zone) {
+ if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
+ flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
+ flags |= VMUSAGE_PROJECTS;
+ }
+ if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
+ flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
+ flags |= VMUSAGE_RUSERS;
+ }
+ if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
+ flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
+ flags |= VMUSAGE_EUSERS;
+ }
+ if (flags & VMUSAGE_SYSTEM) {
+ flags &= ~VMUSAGE_SYSTEM;
+ flags |= VMUSAGE_ZONE;
+ }
+ }
+
+ /* Check for unknown flags */
+ if ((flags & (~VMUSAGE_MASK)) != 0)
+ return (set_errno(EINVAL));
+
+ /* Check for no flags */
+ if ((flags & VMUSAGE_MASK) == 0)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&vmu_data.vmu_lock);
+ /*
+ * now is sampled once, before the start label, and is not refreshed
+ * when a waiter loops back to start after a calculation completes.
+ */
+ now = gethrtime();
+
+start:
+ /* Case 1: a cache exists; use it if it is recent and covers flags */
+ if (vmu_data.vmu_cache != NULL) {
+
+ vmu_cache_t *cache;
+
+ if ((vmu_data.vmu_cache->vmc_timestamp +
+ ((hrtime_t)age * NANOSEC)) > now)
+ cacherecent = 1;
+
+ if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
+ cacherecent == 1) {
+ cache = vmu_data.vmu_cache;
+ /* Hold the cache so it survives while vmu_lock is dropped */
+ vmu_cache_hold(cache);
+ mutex_exit(&vmu_data.vmu_lock);
+
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+ mutex_enter(&vmu_data.vmu_lock);
+ vmu_cache_rele(cache);
+ if (vmu_data.vmu_pending_waiters > 0)
+ cv_broadcast(&vmu_data.vmu_cv);
+ mutex_exit(&vmu_data.vmu_lock);
+ return (ret);
+ }
+ /*
+ * If the cache is recent, it is likely that there are other
+ * consumers of vm_getusage running, so add their flags to the
+ * desired flags for the calculation.
+ */
+ if (cacherecent == 1)
+ flags = vmu_data.vmu_cache->vmc_flags | flags;
+ }
+ /* Case 2: no calculation is running, so this thread performs it */
+ if (vmu_data.vmu_calc_thread == NULL) {
+
+ vmu_cache_t *cache;
+
+ vmu_data.vmu_calc_thread = curthread;
+ vmu_data.vmu_calc_flags = flags;
+ vmu_data.vmu_entities = NULL;
+ vmu_data.vmu_nentities = 0;
+ /* Fold in the flags requested by threads already waiting */
+ if (vmu_data.vmu_pending_waiters > 0)
+ vmu_data.vmu_calc_flags |=
+ vmu_data.vmu_pending_flags;
+
+ vmu_data.vmu_pending_flags = 0;
+ /* The calculation itself runs without vmu_lock held */
+ mutex_exit(&vmu_data.vmu_lock);
+ vmu_calculate();
+ mutex_enter(&vmu_data.vmu_lock);
+ /* copy results to cache */
+ if (vmu_data.vmu_cache != NULL)
+ vmu_cache_rele(vmu_data.vmu_cache);
+ cache = vmu_data.vmu_cache =
+ vmu_cache_alloc(vmu_data.vmu_nentities,
+ vmu_data.vmu_calc_flags);
+
+ result = cache->vmc_results;
+ for (entity = vmu_data.vmu_entities; entity != NULL;
+ entity = entity->vme_next) {
+ *result = entity->vme_result;
+ result++;
+ }
+ cache->vmc_timestamp = gethrtime();
+ /*
+ * Second reference (vmu_cache_alloc() returned the first, now
+ * owned by vmu_data.vmu_cache): this one covers the copyout
+ * below, which runs without vmu_lock.
+ */
+ vmu_cache_hold(cache);
+
+ vmu_data.vmu_calc_flags = 0;
+ vmu_data.vmu_calc_thread = NULL;
+
+ /* Wake waiters so they can re-evaluate against the new cache */
+ if (vmu_data.vmu_pending_waiters > 0)
+ cv_broadcast(&vmu_data.vmu_cv);
+
+ mutex_exit(&vmu_data.vmu_lock);
+
+ /* copy cache */
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+ mutex_enter(&vmu_data.vmu_lock);
+ vmu_cache_rele(cache);
+ mutex_exit(&vmu_data.vmu_lock);
+
+ return (ret);
+ }
+ /*
+ * Case 3: another thread is calculating. Register this request's
+ * flags and wait for that calculation to finish, then start over.
+ */
+ vmu_data.vmu_pending_flags |= flags;
+ vmu_data.vmu_pending_waiters++;
+ while (vmu_data.vmu_calc_thread != NULL) {
+ if (cv_wait_sig(&vmu_data.vmu_cv,
+ &vmu_data.vmu_lock) == 0) {
+ /* Interrupted by a signal: leave the wait and fail */
+ vmu_data.vmu_pending_waiters--;
+ mutex_exit(&vmu_data.vmu_lock);
+ return (set_errno(EINTR));
+ }
+ }
+ vmu_data.vmu_pending_waiters--;
+ goto start;
+}