diff options
| author | gjelinek <none@none> | 2006-12-14 13:35:17 -0800 |
|---|---|---|
| committer | gjelinek <none@none> | 2006-12-14 13:35:17 -0800 |
| commit | 0209230bf1261579beab4f55226bb509e6b850cb (patch) | |
| tree | c605b4105191d5a10962c524ad08019742cd52cb | |
| parent | 780774645a5b1b0176916fc66312dc1d9b4d14b4 (diff) | |
| download | illumos-joyent-0209230bf1261579beab4f55226bb509e6b850cb.tar.gz | |
PSARC 2006/496 Improved Zones/RM Integration
PSARC 2006/598 Swap resource control; locked memory RM improvements
PSARC 2006/660 rcapadm zone option
4754856 *prstat* prstat -atJTZ should count shared segments only once
4970603 RFE: should be able to persistently specify global zone's cpu shares
5026227 RFE: ability to rcap zones from global zone
5103071 RFE: local zones can run the global zone out of swap
6222025 RFE: simplify rctl syntax and improve cpu-shares/FSS interaction
6420985 rcapstat is broken on amd64
6421202 RFE: simplify and improve zones/pool integration
6442252 zonecfg's "unset" syntax is not documented and confusing
6490516 schedctl pages should not reserve swap
6490938 setproject can bind to the wrong pool
6498635 zone attach failure leaves zone in installed state
6500877 tmpfs syslogs incorrect path when non-global zone tmpfs mounts become full
85 files changed, 8180 insertions, 872 deletions
diff --git a/usr/src/cmd/prstat/prstat.c b/usr/src/cmd/prstat/prstat.c index 743990ad2a..5a4b9185ea 100644 --- a/usr/src/cmd/prstat/prstat.c +++ b/usr/src/cmd/prstat/prstat.c @@ -31,6 +31,7 @@ #include <sys/loadavg.h> #include <sys/time.h> #include <sys/pset.h> +#include <sys/vm_usage.h> #include <zone.h> #include <libzonecfg.h> @@ -86,21 +87,21 @@ #define USAGE_HEADER_LWP \ " PID USERNAME USR SYS TRP TFL DFL LCK SLP LAT VCX ICX SCL SIG PROCESS/LWPID " #define USER_HEADER_PROC \ -" NPROC USERNAME SIZE RSS MEMORY TIME CPU " +" NPROC USERNAME SWAP RSS MEMORY TIME CPU " #define USER_HEADER_LWP \ -" NLWP USERNAME SIZE RSS MEMORY TIME CPU " +" NLWP USERNAME SWAP RSS MEMORY TIME CPU " #define TASK_HEADER_PROC \ -"TASKID NPROC SIZE RSS MEMORY TIME CPU PROJECT " +"TASKID NPROC SWAP RSS MEMORY TIME CPU PROJECT " #define TASK_HEADER_LWP \ -"TASKID NLWP SIZE RSS MEMORY TIME CPU PROJECT " +"TASKID NLWP SWAP RSS MEMORY TIME CPU PROJECT " #define PROJECT_HEADER_PROC \ -"PROJID NPROC SIZE RSS MEMORY TIME CPU PROJECT " +"PROJID NPROC SWAP RSS MEMORY TIME CPU PROJECT " #define PROJECT_HEADER_LWP \ -"PROJID NLWP SIZE RSS MEMORY TIME CPU PROJECT " +"PROJID NLWP SWAP RSS MEMORY TIME CPU PROJECT " #define ZONE_HEADER_PROC \ -"ZONEID NPROC SIZE RSS MEMORY TIME CPU ZONE " +"ZONEID NPROC SWAP RSS MEMORY TIME CPU ZONE " #define ZONE_HEADER_LWP \ -"ZONEID NLWP SIZE RSS MEMORY TIME CPU ZONE " +"ZONEID NLWP SWAP RSS MEMORY TIME CPU ZONE " #define PSINFO_LINE \ "%6d %-8s %5s %5s %-6s %3s %3s %9s %3.3s%% %-.16s/%d" #define PSINFO_LINE_LGRP \ @@ -160,6 +161,8 @@ static volatile uint_t sigwinch = 0; static volatile uint_t sigtstp = 0; static volatile uint_t sigterm = 0; +static long pagesize; + /* default settings */ static optdesc_t opts = { @@ -185,6 +188,129 @@ psetloadavg(long psetid, void *ptr) } /* + * Queries the memory virtual and rss size for each member of a list. + * This will override the values computed by /proc aggregation. 
+ */ +static void +list_getsize(list_t *list) +{ + id_info_t *id; + vmusage_t *results, *next; + vmusage_t *match; + size_t nres = 0; + size_t i; + uint_t flags = 0; + int ret; + size_t physmem = sysconf(_SC_PHYS_PAGES) * pagesize; + + /* + * Determine what swap/rss results to calculate. getvmusage() will + * prune results returned to non-global zones automatically, so + * there is no need to pass different flags when calling from a + * non-global zone. + * + * Currently list_getsize() is only called with a single flag. This + * is because -Z, -J, -T, and -a are mutually exclusive. Regardless + * of this, we handle multiple flags. + */ + if (opts.o_outpmode & OPT_USERS) { + /* + * Gather rss for all users in all zones. Treat the same + * uid in different zones as the same user. + */ + flags |= VMUSAGE_COL_RUSERS; + + } else if (opts.o_outpmode & OPT_TASKS) { + /* Gather rss for all tasks in all zones */ + flags |= VMUSAGE_ALL_TASKS; + + } else if (opts.o_outpmode & OPT_PROJECTS) { + /* + * Gather rss for all projects in all zones. Treat the same + * projid in different zones as the same project. + */ + flags |= VMUSAGE_COL_PROJECTS; + + } else if (opts.o_outpmode & OPT_ZONES) { + /* Gather rss for all zones */ + flags |= VMUSAGE_ALL_ZONES; + + } else { + Die(gettext( + "Cannot determine rss flags for output options %x\n"), + opts.o_outpmode); + } + + /* + * getvmusage() returns an array of result structures. One for + * each zone, project, task, or user on the system, depending on + * flags. + * + * If getvmusage() fails, prstat will use the size already gathered + * from psinfo + */ + if (getvmusage(flags, opts.o_interval, NULL, &nres) != 0) + return; + + results = (vmusage_t *)Malloc(sizeof (vmusage_t) * nres); + for (;;) { + ret = getvmusage(flags, opts.o_interval, results, &nres); + if (ret == 0) + break; + if (errno == EOVERFLOW) { + results = (vmusage_t *)Realloc(results, + sizeof (vmusage_t) * nres); + continue; + } + /* + * Failure for some other reason. 
Prstat will use the size + * already gathered from psinfo. + */ + return; + } + for (id = list->l_head; id != NULL; id = id->id_next) { + + match = NULL; + next = results; + for (i = 0; i < nres; i++, next++) { + switch (flags) { + case VMUSAGE_COL_RUSERS: + if (next->vmu_id == id->id_uid) + match = next; + break; + case VMUSAGE_ALL_TASKS: + if (next->vmu_id == id->id_taskid) + match = next; + break; + case VMUSAGE_COL_PROJECTS: + if (next->vmu_id == id->id_projid) + match = next; + break; + case VMUSAGE_ALL_ZONES: + if (next->vmu_id == id->id_zoneid) + match = next; + break; + default: + Die(gettext( + "Unknown vmusage flags %d\n"), flags); + } + } + if (match != NULL) { + id->id_size = match->vmu_swap_all / 1024; + id->id_rssize = match->vmu_rss_all / 1024; + id->id_pctmem = (100.0 * (float)match->vmu_rss_all) / + (float)physmem; + /* Output using data from getvmusage() */ + id->id_sizematch = B_TRUE; + } + /* + * If no match is found, prstat will use the size already + * gathered from psinfo. 
+ */ + } +} + +/* * A routine to display the contents of the list on the screen */ static void @@ -282,7 +408,7 @@ list_print(list_t *list) cpu = (100 * id->id_pctcpu) / total_cpu; else cpu = id->id_pctcpu; - if (total_mem >= 100) + if (id->id_sizematch == B_FALSE && total_mem >= 100) mem = (100 * id->id_pctmem) / total_mem; else mem = id->id_pctmem; @@ -566,6 +692,7 @@ update: id->id_zoneid = lwp->li_info.pr_zoneid; id->id_lgroup = lwp->li_info.pr_lwp.pr_lgrp; id->id_nproc++; + id->id_sizematch = B_FALSE; if (lwp->li_flags & LWP_REPRESENT) { id->id_size = lwp->li_info.pr_size; id->id_rssize = lwp->li_info.pr_rssize; @@ -1175,6 +1302,7 @@ Exit() fd_exit(); } + int main(int argc, char **argv) { @@ -1192,6 +1320,8 @@ main(int argc, char **argv) lwpid_init(); fd_init(Setrlimit()); + pagesize = sysconf(_SC_PAGESIZE); + while ((opt = getopt(argc, argv, "vcHmaRLtu:U:n:p:C:P:h:s:S:j:k:TJz:Z")) != (int)EOF) { switch (opt) { @@ -1419,21 +1549,25 @@ main(int argc, char **argv) list_print(&lwps); } if (opts.o_outpmode & OPT_USERS) { + list_getsize(&users); list_sort(&users); list_print(&users); list_clear(&users); } if (opts.o_outpmode & OPT_TASKS) { + list_getsize(&tasks); list_sort(&tasks); list_print(&tasks); list_clear(&tasks); } if (opts.o_outpmode & OPT_PROJECTS) { + list_getsize(&projects); list_sort(&projects); list_print(&projects); list_clear(&projects); } if (opts.o_outpmode & OPT_ZONES) { + list_getsize(&zones); list_sort(&zones); list_print(&zones); list_clear(&zones); diff --git a/usr/src/cmd/prstat/prstat.h b/usr/src/cmd/prstat/prstat.h index 1a13329845..d130164e7d 100644 --- a/usr/src/cmd/prstat/prstat.h +++ b/usr/src/cmd/prstat/prstat.h @@ -122,6 +122,7 @@ typedef struct id_info { zoneid_t id_zoneid; /* zone id */ int id_lgroup; /* lgroup id */ uint_t id_nproc; /* number of processes */ + boolean_t id_sizematch; /* size/rssize from getvmusage() */ size_t id_size; /* memory usage */ size_t id_rssize; /* resident set size */ ulong_t id_time; /* cpu time (in 
secs) */ diff --git a/usr/src/cmd/rcap/common/rcapd.h b/usr/src/cmd/rcap/common/rcapd.h index 89cf5f3d81..7a554c213b 100644 --- a/usr/src/cmd/rcap/common/rcapd.h +++ b/usr/src/cmd/rcap/common/rcapd.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,7 +57,21 @@ extern "C" { #define LCST_CAP_REMOVED (1<<1) #define LCST_CAP_ZERO (1<<2) -typedef int64_t rcid_t; +typedef enum { + RCIDT_PROJECT, + RCIDT_ZONE +} rcid_type_t; + +typedef struct { + /* + * The following field could just be a rcid_type_t but it gets + * written out to a file as binary data for communication between + * 64-bit rcapd & 32-bit rcapstat, so we need to force a standard size + * and alignment here. 
+ */ + uint64_t rcid_type; + int64_t rcid_val; +} rcid_t; typedef enum { LCU_COMPLETE, /* an enumeration of all possible collections */ @@ -138,7 +151,6 @@ typedef struct lcollection { uint64_t lcol_rss; /* RSS of all processes (kB) */ uint64_t lcol_image_size; /* image size of all processes (kB) */ uint64_t lcol_rss_cap; /* RSS cap (kB) */ - int lcol_stat_invalidate; /* flag to reset interval statistics */ lcollection_stat_t lcol_stat; /* statistics */ lcollection_stat_t lcol_stat_old; /* previous interval's statistics */ lprocess_t *lcol_lprocess; /* member processes */ @@ -162,12 +174,11 @@ typedef struct lcollection_report { extern int get_psinfo(pid_t, struct psinfo *, int, int(*)(void *, int), void *, lprocess_t *); -extern lcollection_t *lcollection_find(id_t); +extern lcollection_t *lcollection_find(rcid_t *); extern void lcollection_freq_move(lprocess_t *); -extern lcollection_t *lcollection_insert_update(rcid_t, uint64_t, char *, +extern lcollection_t *lcollection_insert_update(rcid_t *, uint64_t, char *, int *changes); extern int lcollection_member(lcollection_t *, lprocess_t *); -extern void lcollection_set_type(rctype_t); extern void lcollection_free(lcollection_t *); extern void lcollection_update(lcollection_update_type_t); extern void list_walk_collection(int (*)(lcollection_t *, void *), void *); @@ -178,12 +189,6 @@ extern void scan_abort(void); extern void check_update_statistics(void); /* - * The collection-specific function determining the collection ID from a - * process' psinfo. - */ -extern rcid_t(*rc_getidbypsinfo)(struct psinfo *); - -/* * Global (in rcapd only) variables. 
*/ extern rcfg_t rcfg; diff --git a/usr/src/cmd/rcap/common/rcapd_stat.h b/usr/src/cmd/rcap/common/rcapd_stat.h index c34ceb36e2..fa769ba643 100644 --- a/usr/src/cmd/rcap/common/rcapd_stat.h +++ b/usr/src/cmd/rcap/common/rcapd_stat.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,7 +43,10 @@ extern "C" { */ #define RC_MODE_LEN 16 typedef struct rcapd_stat_hdr { - pid_t rs_pid; /* pid of producer */ + /* + * sizeof pid_t can vary, so we use a fixed 64-bit quantity. + */ + uint64_t rs_pid; /* pid of producer */ hrtime_t rs_time; /* time recorded */ /* diff --git a/usr/src/cmd/rcap/common/utils.c b/usr/src/cmd/rcap/common/utils.c index f9757a12f6..c01f568915 100644 --- a/usr/src/cmd/rcap/common/utils.c +++ b/usr/src/cmd/rcap/common/utils.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -260,3 +259,77 @@ xatoi(char *p) return (i); } } + +/* + * get_running_zones() calls zone_list(2) to find out how many zones are + * running. It then calls zone_list(2) again to fetch the list of running + * zones (stored in *zents). + */ +int +get_running_zones(uint_t *nzents, zone_entry_t **zents) +{ + zoneid_t *zids; + uint_t nzents_saved; + int i; + zone_entry_t *zentp; + zone_state_t zstate; + + *zents = NULL; + if (zone_list(NULL, nzents) != 0) { + warn(gettext("could not get zoneid list\n")); + return (E_ERROR); + } + +again: + if (*nzents == 0) + return (E_SUCCESS); + + if ((zids = (zoneid_t *)calloc(*nzents, sizeof (zoneid_t))) == NULL) { + warn(gettext("out of memory: zones will not be capped\n")); + return (E_ERROR); + } + + nzents_saved = *nzents; + + if (zone_list(zids, nzents) != 0) { + warn(gettext("could not get zone list\n")); + free(zids); + return (E_ERROR); + } + if (*nzents != nzents_saved) { + /* list changed, try again */ + free(zids); + goto again; + } + + *zents = calloc(*nzents, sizeof (zone_entry_t)); + if (*zents == NULL) { + warn(gettext("out of memory: zones will not be capped\n")); + free(zids); + return (E_ERROR); + } + + zentp = *zents; + for (i = 0; i < *nzents; i++) { + char name[ZONENAME_MAX]; + + if (getzonenamebyid(zids[i], name, sizeof (name)) < 0) { + warn(gettext("could not get name for " + "zoneid %d\n"), zids[i]); + continue; + } + + (void) strlcpy(zentp->zname, name, sizeof (zentp->zname)); + zentp->zid = zids[i]; + if (zone_get_state(name, &zstate) != Z_OK || + zstate != ZONE_STATE_RUNNING) + continue; + + + zentp++; + } + *nzents = zentp - *zents; + + free(zids); + return (E_SUCCESS); +} diff --git a/usr/src/cmd/rcap/common/utils.h b/usr/src/cmd/rcap/common/utils.h index 678dee51ab..f952d59bbb 100644 --- 
a/usr/src/cmd/rcap/common/utils.h +++ b/usr/src/cmd/rcap/common/utils.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +32,7 @@ #include <libintl.h> #include <stdarg.h> #include <time.h> +#include <libzonecfg.h> #ifdef __cplusplus extern "C" { @@ -63,6 +63,11 @@ typedef enum rcm_dst { RCD_SYSLOG /* syslog() daemon facility */ } rcm_dst_t; +typedef struct zone_entry { + zoneid_t zid; + char zname[ZONENAME_MAX]; +} zone_entry_t; + #define LINELEN 256 /* max. message length */ #ifdef DEBUG @@ -95,6 +100,7 @@ extern void vdprintfe(int, char *, va_list); extern void dprintfe(int, char *, ...); extern void hrt2ts(hrtime_t, timestruc_t *); extern int xatoi(char *); +extern int get_running_zones(uint_t *, zone_entry_t **); #ifdef __cplusplus } diff --git a/usr/src/cmd/rcap/rcapadm/Makefile b/usr/src/cmd/rcap/rcapadm/Makefile index 59c1530185..3b4de32953 100644 --- a/usr/src/cmd/rcap/rcapadm/Makefile +++ b/usr/src/cmd/rcap/rcapadm/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -41,7 +40,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lumem -ll -lscf +LDLIBS += -lumem -ll -lscf -lzonecfg LINTFLAGS += $(LDLIBS) -mnu diff --git a/usr/src/cmd/rcap/rcapadm/rcapadm.c b/usr/src/cmd/rcap/rcapadm/rcapadm.c index cc9fd290a1..1951682283 100644 --- a/usr/src/cmd/rcap/rcapadm/rcapadm.c +++ b/usr/src/cmd/rcap/rcapadm/rcapadm.c @@ -39,6 +39,8 @@ #include <libscf_priv.h> #include <libintl.h> #include <locale.h> +#include <zone.h> +#include <libzonecfg.h> #include "utils.h" #include "rcapd.h" @@ -61,7 +63,9 @@ usage() " [-c <percent>] " "# set memory cap\n" " " - "# enforcement threshold\n")); + "# enforcement threshold\n" + " [-z <zonename> -m <max-rss>] " + "# update zone memory cap\n")); exit(E_USAGE); } @@ -135,18 +139,54 @@ out: scf_handle_destroy(h); } +/* + * Update the in-kernel memory cap for the specified zone. 
+ */ +static int +update_zone_mcap(char *zonename, char *maxrss) +{ + zoneid_t zone_id; + uint64_t num; + + if (getzoneid() != GLOBAL_ZONEID || zonecfg_in_alt_root()) + return (E_SUCCESS); + + /* get the running zone from the kernel */ + if ((zone_id = getzoneidbyname(zonename)) == -1) { + (void) fprintf(stderr, gettext("zone '%s' must be running\n"), + zonename); + return (E_ERROR); + } + + if (zonecfg_str_to_bytes(maxrss, &num) == -1) { + (void) fprintf(stderr, gettext("invalid max-rss value\n")); + return (E_ERROR); + } + + if (zone_setattr(zone_id, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + (void) fprintf(stderr, gettext("could not set memory " + "cap for zone '%s'\n"), zonename); + return (E_ERROR); + } + + return (E_SUCCESS); +} + int main(int argc, char *argv[]) { char *subopts, *optval; int modified = 0; + boolean_t refresh = B_FALSE; int opt; + char *zonename; + char *maxrss = NULL; (void) setprogname("rcapadm"); (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); - while ((opt = getopt(argc, argv, "DEc:i:n")) != EOF) { + while ((opt = getopt(argc, argv, "DEc:i:m:nz:")) != EOF) { switch (opt) { case 'n': no_starting_stopping = 1; @@ -203,12 +243,24 @@ main(int argc, char *argv[]) } modified++; break; + case 'm': + maxrss = optarg; + break; + case 'z': + refresh = B_TRUE; + zonename = optarg; + break; default: usage(); } } - if (argc > optind) + /* the -z & -m options must be used together */ + if (argc > optind || (refresh && maxrss == NULL) || + (!refresh && maxrss != NULL)) + usage(); + + if (refresh && (no_starting_stopping > 0 || modified)) usage(); if (rcfg_read(fname, -1, &conf, NULL) < 0) { @@ -232,6 +284,9 @@ main(int argc, char *argv[]) } } + if (refresh) + return (update_zone_mcap(zonename, maxrss)); + if (modified) { if (pressure >= 0) conf.rcfg_memory_cap_enforcement_pressure = pressure; diff --git a/usr/src/cmd/rcap/rcapd/Makefile.rcapd b/usr/src/cmd/rcap/rcapd/Makefile.rcapd index 5fd0d01416..716ea41e38 100644 --- 
a/usr/src/cmd/rcap/rcapd/Makefile.rcapd +++ b/usr/src/cmd/rcap/rcapd/Makefile.rcapd @@ -35,6 +35,7 @@ SRCS = rcapd_main.c \ rcapd_collection.c \ rcapd_collection_project.c \ + rcapd_collection_zone.c \ rcapd_mapping.c \ rcapd_rfd.c \ rcapd_scanner.c \ @@ -44,6 +45,7 @@ SRCS = rcapd_main.c \ LINTSRCS = ../rcapd_main.c \ ../rcapd_collection.c \ ../rcapd_collection_project.c \ + ../rcapd_collection_zone.c \ ../rcapd_mapping.c \ ../rcapd_rfd.c \ ../rcapd_scanner.c \ @@ -53,7 +55,7 @@ LINTSRCS = ../rcapd_main.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -DDEBUG_MSG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lkstat -ll -lproc -lproject -lumem +LDLIBS += -lkstat -ll -lproc -lproject -lzonecfg -lumem LDLIBS += $(EXTRA_LDLIBS) LINTFLAGS += -u diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection.c b/usr/src/cmd/rcap/rcapd/rcapd_collection.c index 7dac0e8155..fdaf8dbfe0 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,14 +40,16 @@ #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) typedef struct { - rcid_t lfa_colid; + rcid_t *lfa_colidp; lcollection_t *lfa_found; } lcollection_find_arg_t; extern void lcollection_update_project(lcollection_update_type_t, - void(*)(char *, int, uint64_t, int)); -extern void lcollection_set_type_project(); -static void lcollection_update_notification_cb(char *, int, uint64_t, int); + void(*)(char *, char *, int, uint64_t, int)); +extern void lcollection_update_zone(lcollection_update_type_t, + void(*)(char *, char *, int, uint64_t, int)); +static void lcollection_update_notification_cb(char *, char *, int, uint64_t, + int); rcid_t(*rc_getidbypsinfo)(psinfo_t *); uint64_t phys_total = 0; @@ -57,28 +58,8 @@ static lcollection_t *lcollection_head = NULL; void lcollection_update(lcollection_update_type_t ut) { - if (rcfg.rcfg_mode == rctype_project) - lcollection_update_project(ut, - lcollection_update_notification_cb); - else - die(gettext("unknown mode %s\n"), rcfg.rcfg_mode_name); -} - -/* - * Configure which collection type will be used. - */ -void -lcollection_set_type(rctype_t type) -{ - switch (type) { - case rctype_project: - lcollection_set_type_project(); - break; - default: - /* can't happen */ - die(gettext("unknown mode %d\n"), type); - /*NOTREACHED*/ - } + lcollection_update_zone(ut, lcollection_update_notification_cb); + lcollection_update_project(ut, lcollection_update_notification_cb); } /* @@ -93,7 +74,7 @@ lcollection_set_type(rctype_t type) * LCSS_CAP_ZERO */ lcollection_t * -lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, +lcollection_insert_update(rcid_t *colidp, uint64_t rss_cap, char *name, int *changes) { lcollection_t *lcol; @@ -103,7 +84,7 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, if (rss_cap == 0) *changes |= LCST_CAP_ZERO; - lcol = lcollection_find(colid); + lcol = lcollection_find(colidp); /* * If the specified collection is capped, add it to lcollection. 
@@ -120,12 +101,13 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, lcol = malloc(sizeof (*lcol)); if (lcol == NULL) { debug("not enough memory to monitor %s %s", - rcfg.rcfg_mode_name, name); + (colidp->rcid_type == RCIDT_PROJECT ? + "project" : "zone"), name); return (NULL); } (void) bzero(lcol, sizeof (*lcol)); - lcol->lcol_id = colid; + lcol->lcol_id = *colidp; debug("added collection %s\n", name); lcol->lcol_prev = NULL; lcol->lcol_next = lcollection_head; @@ -157,8 +139,8 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, } static void -lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap, - int mark) +lcollection_update_notification_cb(char *col_type, char *name, int changes, + uint64_t rss_cap, int mark) { /* * Assume the collection has been updated redundantly if its mark count @@ -168,10 +150,10 @@ lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap, return; if (changes & LCST_CAP_ZERO) - debug("%s %s: %s\n", rcfg.rcfg_mode_name, name, + debug("%s %s: %s\n", col_type, name, (changes & LCST_CAP_REMOVED) ? 
"cap removed" : "uncapped"); else - debug("%s %s: cap: %llukB\n", rcfg.rcfg_mode_name, name, + debug("%s %s: cap: %llukB\n", col_type, name, (unsigned long long)rss_cap); } @@ -215,19 +197,23 @@ lcollection_member(lcollection_t *lcol, lprocess_t *lpc) static int lcollection_find_cb(lcollection_t *lcol, void *arg) { - if (lcol->lcol_id == ((lcollection_find_arg_t *)arg)->lfa_colid) { + rcid_t *colidp = ((lcollection_find_arg_t *)arg)->lfa_colidp; + + if (lcol->lcol_id.rcid_type == colidp->rcid_type && + lcol->lcol_id.rcid_val == colidp->rcid_val) { ((lcollection_find_arg_t *)arg)->lfa_found = lcol; return (1); - } else - return (0); + } + + return (0); } lcollection_t * -lcollection_find(id_t colid) +lcollection_find(rcid_t *colidp) { lcollection_find_arg_t lfa; - lfa.lfa_colid = colid; + lfa.lfa_colidp = colidp; lfa.lfa_found = NULL; list_walk_collection(lcollection_find_cb, &lfa); diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c index ba34100f05..eab6d2a94a 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,24 +37,17 @@ /* round up to next y = 2^n */ #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) -static rcid_t rc_proj_getidbypsinfo(psinfo_t *); - -void -lcollection_set_type_project(void) -{ - rc_getidbypsinfo = rc_proj_getidbypsinfo; -} - static int lcollection_update_project_cb(const struct project *proj, void *walk_data) { - void(*update_notification_cb)(char *, int, uint64_t, int) = - (void(*)(char *, int, uint64_t, int))walk_data; + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; char *capattr_abs; char *end; int changes; int64_t max_rss; lcollection_t *lcol; + rcid_t colid; capattr_abs = strstr(proj->pj_attr, PJ_ABS_ATTR_NAME "="); if (capattr_abs != NULL) { @@ -70,17 +62,19 @@ lcollection_update_project_cb(const struct project *proj, void *walk_data) capattr_abs += strlen(PJ_ABS_ATTR_NAME "="); max_rss = ROUNDUP(strtoll(capattr_abs, &end, 10), 1024) / 1024; if (end == capattr_abs || *end != ';' && *end != 0) - warn(gettext("%s %s: malformed %s value " - "'%s'\n"), rcfg.rcfg_mode_name, proj->pj_name, - PJ_ABS_ATTR_NAME, capattr_abs); + warn(gettext("project %s: malformed %s value '%s'\n"), + proj->pj_name, PJ_ABS_ATTR_NAME, capattr_abs); } else max_rss = 0; - lcol = lcollection_insert_update(proj->pj_projid, max_rss, - proj->pj_name, &changes); + colid.rcid_type = RCIDT_PROJECT; + colid.rcid_val = proj->pj_projid; + + lcol = lcollection_insert_update(&colid, max_rss, proj->pj_name, + &changes); if (update_notification_cb != NULL) - update_notification_cb(proj->pj_name, changes, max_rss, (lcol != - NULL) ? lcol->lcol_mark : 0); + update_notification_cb("project", proj->pj_name, changes, + max_rss, (lcol != NULL) ? 
lcol->lcol_mark : 0); return (0); } @@ -101,10 +95,13 @@ lcollection_update_project_byid_cb(const projid_t id, void *walk_data) static int lcollection_update_onceactive_cb(lcollection_t *lcol, void *walk_data) { - void(*update_notification_cb)(char *, int, uint64_t, int) = - (void(*)(char *, int, uint64_t, int))walk_data; + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; + + if (lcol->lcol_id.rcid_type != RCIDT_PROJECT) + return (0); - return (lcollection_update_project_byid_cb(lcol->lcol_id, + return (lcollection_update_project_byid_cb(lcol->lcol_id.rcid_val, (void *)update_notification_cb)); } @@ -125,7 +122,7 @@ project_walk_all(int(*cb)(const struct project *, void *), void *walk_data) void lcollection_update_project(lcollection_update_type_t ut, - void(*update_notification_cb)(char *, int, uint64_t, int)) + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) { switch (ut) { case LCU_ACTIVE_ONLY: @@ -154,9 +151,3 @@ lcollection_update_project(lcollection_update_type_t ut, (void *)update_notification_cb); } } - -static rcid_t -rc_proj_getidbypsinfo(psinfo_t *psinfo) -{ - return (psinfo->pr_projid); -} diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c new file mode 100644 index 0000000000..db86aa6276 --- /dev/null +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <procfs.h> +#include <project.h> +#include <stdlib.h> +#include <strings.h> +#include <zone.h> +#include <libzonecfg.h> +#include "rcapd.h" +#include "utils.h" + +extern boolean_t gz_capped; + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +static void +update_zone(zone_entry_t *zent, void *walk_data) +{ + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; + int changes; + int64_t max_rss; + uint64_t mcap; + lcollection_t *lcol; + rcid_t colid; + + if (zone_getattr(zent->zid, ZONE_ATTR_PHYS_MCAP, &mcap, + sizeof (mcap)) != -1 && mcap != 0) + max_rss = ROUNDUP(mcap, 1024) / 1024; + else + max_rss = 0; + + if (zent->zid == GLOBAL_ZONEID) { + if (max_rss > 0) + gz_capped = B_TRUE; + else + gz_capped = B_FALSE; + } + + + colid.rcid_type = RCIDT_ZONE; + colid.rcid_val = zent->zid; + + lcol = lcollection_insert_update(&colid, max_rss, zent->zname, + &changes); + if (update_notification_cb != NULL) + update_notification_cb("zone", zent->zname, changes, max_rss, + (lcol != NULL) ? lcol->lcol_mark : 0); +} + + +/* ARGSUSED */ +void +lcollection_update_zone(lcollection_update_type_t ut, + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) +{ + int i; + uint_t nzents; + zone_entry_t *zents; + + /* + * Enumerate running zones. 
+ */ + if (get_running_zones(&nzents, &zents) != 0) + return; + + for (i = 0; i < nzents; i++) { + update_zone(&zents[i], (void *)update_notification_cb); + + } + + free(zents); +} diff --git a/usr/src/cmd/rcap/rcapd/rcapd_main.c b/usr/src/cmd/rcap/rcapd/rcapd_main.c index 9c2e8b3c48..960065826e 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_main.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_main.c @@ -61,6 +61,7 @@ #include <unistd.h> #include <zone.h> #include <assert.h> +#include <sys/vm_usage.h> #include "rcapd.h" #include "rcapd_mapping.h" #include "rcapd_rfd.h" @@ -80,30 +81,42 @@ #define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */ #define DAEMON_UID 1 /* uid to use */ +#define CAPPED_PROJECT 0x01 +#define CAPPED_ZONE 0x02 + typedef struct soft_scan_arg { uint64_t ssa_sum_excess; int64_t ssa_scan_goal; + boolean_t ssa_project_over_cap; } soft_scan_arg_t; +typedef struct sample_col_arg { + boolean_t sca_any_over_cap; + boolean_t sca_project_over_cap; +} sample_col_arg_t; + + static int debug_mode = 0; /* debug mode flag */ static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */ /* scanned */ static kstat_ctl_t *kctl; /* kstat chain */ -static uint64_t new_sp = 0, old_sp = 0; /* measure delta in page scan count */ -static int enforce_caps = 0; /* cap enforcement flag, dependent on */ - /* enforce_soft_caps and */ - /* global_scanner_running */ -static int enforce_soft_caps = 0; /* soft cap enforcement flag, */ - /* depending on memory pressure */ static int memory_pressure = 0; /* physical memory utilization (%) */ static int memory_pressure_sample = 0; /* count of samples */ -static int global_scanner_running = 0; /* global scanning flag, to avoid */ - /* interference with kernel's page */ - /* scanner */ +static long page_size_kb = 0; /* system page size in KB */ +static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */ +static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */ +static vmusage_t *vmu_vals = NULL; /* snapshot of 
kernel RSS/swap values */ static hrtime_t next_report; /* time of next report */ static int termination_signal = 0; /* terminating signal */ +static zoneid_t my_zoneid = (zoneid_t)-1; +static lcollection_t *gz_col; /* global zone collection */ rcfg_t rcfg; +/* + * Updated when we re-read the collection configurations if this rcapd instance + * is running in the global zone and the global zone is capped. + */ +boolean_t gz_capped = B_FALSE; /* * Flags. @@ -116,9 +129,9 @@ static int verify_statistics(void); static int update_statistics(void); /* - * Checks if a process is marked 'system'. Returns zero only when it is not. + * Checks if a process is marked 'system'. Returns FALSE only when it is not. */ -static int +static boolean_t proc_issystem(pid_t pid) { char pc_clname[PC_CLNMSZ]; @@ -128,22 +141,43 @@ proc_issystem(pid_t pid) return (strcmp(pc_clname, "SYS") == 0); } else { debug("cannot get class-specific scheduling parameters; " - "assuming system process"); - return (-1); + "assuming system process\n"); + return (B_TRUE); } } -/* - * fname is the process name, for debugging messages, and unscannable is a flag - * indicating whether the process should be scanned. - */ static void -lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) +lprocess_insert_mark(psinfo_t *psinfop) { + pid_t pid = psinfop->pr_pid; + /* flag indicating whether the process should be scanned. */ + int unscannable = psinfop->pr_nlwp == 0; + rcid_t colid; lcollection_t *lcol; lprocess_t *lproc; - if ((lcol = lcollection_find(colid)) == NULL) + /* + * Determine which collection to put this process into. We only have + * to worry about tracking both zone and project capped processes if + * this rcapd instance is running in the global zone, since we'll only + * see processes in our own projects in a non-global zone. In the + * global zone, if the process belongs to a non-global zone, we only + * need to track it for the capped non-global zone collection. 
For + * global zone processes, we first attempt to put the process into a + * capped project collection. On the second pass into this function + * the projid will be cleared so we will just track the process for the + * global zone collection as a whole. + */ + if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) { + colid.rcid_type = RCIDT_PROJECT; + colid.rcid_val = psinfop->pr_projid; + } else { + /* try to add to zone collection */ + colid.rcid_type = RCIDT_ZONE; + colid.rcid_val = psinfop->pr_zoneid; + } + + if ((lcol = lcollection_find(&colid)) == NULL) return; /* @@ -193,7 +227,8 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) if (lcollection_member(lcol, lproc)) { lprocess_t *cur = lcol->lcol_lprocess; debug("The collection %lld already has these members, " - "including me, %d!\n", (long long)lcol->lcol_id, + "including me, %d!\n", + (long long)lcol->lcol_id.rcid_val, (int)lproc->lpc_pid); while (cur != NULL) { debug("\t%d\n", (int)cur->lpc_pid); @@ -209,7 +244,10 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) lproc->lpc_prev = NULL; lcol->lcol_lprocess = lproc; - debug("tracking %d %d %s%s\n", (int)colid, (int)pid, fname, + debug("tracking %s %ld %d %s%s\n", + (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"), + (long)colid.rcid_val, + (int)pid, psinfop->pr_psargs, (lproc->lpc_unscannable != 0) ? " (not scannable)" : ""); lcol->lcol_stat.lcols_proc_in++; } @@ -328,22 +366,28 @@ get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd, } /* - * Retrieve the collection membership of all processes in our zone, and update - * the psinfo of those non-system, non-zombie ones in collections. + * Retrieve the collection membership of all processes and update the psinfo of + * those non-system, non-zombie ones in collections. For global zone processes, + * we first attempt to put the process into a capped project collection. 
We + * also want to track the process for the global zone collection as a whole. */ static void proc_cb(const pid_t pid) { - static zoneid_t ours = (zoneid_t)-1; psinfo_t psinfo; - if (ours == (zoneid_t)-1) - ours = getzoneid(); - - if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0 && - psinfo.pr_zoneid == ours) - lprocess_insert_mark(psinfo.pr_pid, rc_getidbypsinfo(&psinfo), - psinfo.pr_psargs, psinfo.pr_nlwp == 0); + if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) { + lprocess_insert_mark(&psinfo); + if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) { + /* + * We also want to track this process for the global + * zone as a whole so add it to the global zone + * collection as well. + */ + psinfo.pr_projid = -1; + lprocess_insert_mark(&psinfo); + } + } } /* @@ -359,57 +403,149 @@ lprocess_update_psinfo_fd_cb(void *arg, int fd) } /* - * Update the RSS of processes in monitored collections. + * Get the system pagesize. */ -/*ARGSUSED*/ -static int -mem_sample_cb(lcollection_t *lcol, lprocess_t *lpc) +static void +get_page_size(void) { - psinfo_t psinfo; + page_size_kb = sysconf(_SC_PAGESIZE) / 1024; + debug("physical page size: %luKB\n", page_size_kb); +} + +static void +tm_fmt(char *msg, hrtime_t t1, hrtime_t t2) +{ + hrtime_t diff = t2 - t1; + + if (diff < MILLISEC) + debug("%s: %lld nanoseconds\n", msg, diff); + else if (diff < MICROSEC) + debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC); + else if (diff < NANOSEC) + debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC); + else + debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC); +} + +/* + * Get the zone's & project's RSS from the kernel. 
+ */ +static void +rss_sample(boolean_t my_zone_only, uint_t col_types) +{ + size_t nres; + size_t i; + uint_t flags; + hrtime_t t1, t2; - if (get_psinfo(lpc->lpc_pid, &psinfo, lpc->lpc_psinfo_fd, - lprocess_update_psinfo_fd_cb, lpc, lpc) == 0) { - lpc->lpc_rss = psinfo.pr_rssize; - lpc->lpc_size = psinfo.pr_size; + if (my_zone_only) { + flags = VMUSAGE_ZONE; } else { - if (errno == ENOENT) - debug("process %d finished\n", (int)lpc->lpc_pid); - else - debug("process %d: cannot read psinfo", - (int)lpc->lpc_pid); - lprocess_free(lpc); + flags = 0; + if (col_types & CAPPED_PROJECT) + flags |= VMUSAGE_PROJECTS; + if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID) + flags |= VMUSAGE_ALL_ZONES; } - return (0); + debug("vmusage sample flags 0x%x\n", flags); + if (flags == 0) + return; + +again: + /* try the current buffer to see if the list will fit */ + nres = vmu_vals_len; + t1 = gethrtime(); + if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval, + vmu_vals, &nres) != 0) { + if (errno != EOVERFLOW) { + warn(gettext("can't read RSS from kernel\n")); + return; + } + } + t2 = gethrtime(); + tm_fmt("getvmusage time", t1, t2); + + debug("kernel nres %lu\n", (ulong_t)nres); + + if (nres > vmu_vals_len) { + /* array size is now too small, increase it and try again */ + free(vmu_vals); + + if ((vmu_vals = (vmusage_t *)calloc(nres, + sizeof (vmusage_t))) == NULL) { + warn(gettext("out of memory: could not read RSS from " + "kernel\n")); + vmu_vals_len = nvmu_vals = 0; + return; + } + vmu_vals_len = nres; + goto again; + } + + nvmu_vals = nres; + + debug("vmusage_sample\n"); + for (i = 0; i < nvmu_vals; i++) { + debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), " + "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id, + vmu_vals[i].vmu_type, + (unsigned long long)vmu_vals[i].vmu_rss_all, + (unsigned long long)vmu_vals[i].vmu_rss_all / 1024, + (unsigned long long)vmu_vals[i].vmu_swap_all); + } +} + +static void +update_col_rss(lcollection_t *lcol) +{ + 
int i; + + lcol->lcol_rss = 0; + lcol->lcol_image_size = 0; + + for (i = 0; i < nvmu_vals; i++) { + if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val) + continue; + + if (vmu_vals[i].vmu_type == VMUSAGE_ZONE && + lcol->lcol_id.rcid_type != RCIDT_ZONE) + continue; + + if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS && + lcol->lcol_id.rcid_type != RCIDT_PROJECT) + continue; + + /* we found the right RSS entry, update the collection vals */ + lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024; + lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024; + break; + } } /* * Sample the collection RSS, updating the collection's statistics with the - * results. + * results. Also, sum the rss of all capped projects & return true if + * the collection is over cap. */ -/*ARGSUSED*/ static int rss_sample_col_cb(lcollection_t *lcol, void *arg) { int64_t excess; uint64_t rss; + sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; - /* - * If updating statistics for a new interval, reset the affected - * counters. - */ - if (lcol->lcol_stat_invalidate != 0) { - lcol->lcol_stat_old = lcol->lcol_stat; - lcol->lcol_stat.lcols_min_rss = (int64_t)-1; - lcol->lcol_stat.lcols_max_rss = 0; - lcol->lcol_stat_invalidate = 0; - } + update_col_rss(lcol); lcol->lcol_stat.lcols_rss_sample++; - excess = lcol->lcol_rss - lcol->lcol_rss_cap; rss = lcol->lcol_rss; - if (excess > 0) + excess = rss - lcol->lcol_rss_cap; + if (excess > 0) { lcol->lcol_stat.lcols_rss_act_sum += rss; + col_argp->sca_any_over_cap = B_TRUE; + if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) + col_argp->sca_project_over_cap = B_TRUE; + } lcol->lcol_stat.lcols_rss_sum += rss; if (lcol->lcol_stat.lcols_min_rss > rss) @@ -421,6 +557,30 @@ rss_sample_col_cb(lcollection_t *lcol, void *arg) } /* + * Determine if we have capped projects, capped zones or both. 
+ */ +static int +col_type_cb(lcollection_t *lcol, void *arg) +{ + uint_t *col_type = (uint_t *)arg; + + /* skip uncapped collections */ + if (lcol->lcol_rss_cap == 0) + return (1); + + if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) + *col_type |= CAPPED_PROJECT; + else + *col_type |= CAPPED_ZONE; + + /* once we know everything is capped, we can stop looking */ + if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT)) + return (1); + + return (0); +} + +/* * Open /proc and walk entries. */ static void @@ -449,23 +609,6 @@ proc_walk_all(void (*cb)(const pid_t)) } /* - * Memory update callback. - */ -static int -memory_all_cb(lcollection_t *lcol, lprocess_t *lpc) -{ - debug_high("%s %s, pid %d: rss += %llu/%llu\n", rcfg.rcfg_mode_name, - lcol->lcol_name, (int)lpc->lpc_pid, - (unsigned long long)lpc->lpc_rss, - (unsigned long long)lpc->lpc_size); - ASSERT(lpc->lpc_rss <= lpc->lpc_size); - lcol->lcol_rss += lpc->lpc_rss; - lcol->lcol_image_size += lpc->lpc_size; - - return (0); -} - -/* * Clear unmarked callback. */ /*ARGSUSED*/ @@ -483,19 +626,6 @@ sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc) } /* - * Memory clear callback. - */ -/*ARGSUSED*/ -static int -collection_zero_mem_cb(lcollection_t *lcol, void *arg) -{ - lcol->lcol_rss = 0; - lcol->lcol_image_size = 0; - - return (0); -} - -/* * Print, for debugging purposes, a collection's recently-sampled RSS and * excess. */ @@ -506,7 +636,8 @@ excess_print_cb(lcollection_t *lcol, void *arg) int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap; debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n", - rcfg.rcfg_mode_name, lcol->lcol_name, + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), + lcol->lcol_name, (unsigned long long)lcol->lcol_rss, (unsigned long long)lcol->lcol_rss_cap, (long long)excess); @@ -516,6 +647,10 @@ excess_print_cb(lcollection_t *lcol, void *arg) /* * Scan those collections which have exceeded their caps. + * + * If we're running in the global zone it might have a cap. 
We don't want to + * do any capping for the global zone yet since we might get under the cap by + * just capping the projects in the global zone. */ /*ARGSUSED*/ static int @@ -523,6 +658,13 @@ scan_cb(lcollection_t *lcol, void *arg) { int64_t excess; + /* skip over global zone collection for now but keep track for later */ + if (lcol->lcol_id.rcid_type == RCIDT_ZONE && + lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { + gz_col = lcol; + return (0); + } + if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { scan(lcol, excess); lcol->lcol_stat.lcols_scan++; @@ -532,6 +674,37 @@ scan_cb(lcollection_t *lcol, void *arg) } /* + * Scan the global zone collection and see if it still exceeds its cap. + * We take into account the effects of capping any global zone projects here. + */ +static void +scan_gz(lcollection_t *lcol, boolean_t project_over_cap) +{ + int64_t excess; + + /* + * If we had projects over their cap and the global zone was also over + * its cap then we need to get the up-to-date global zone rss to + * determine if we are still over the global zone cap. We might have + * gone under while we scanned the capped projects. If there were no + * projects over cap then we can use the rss value we already have for + * the global zone. + */ + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + if (project_over_cap && excess > 0) { + rss_sample(B_TRUE, CAPPED_ZONE); + update_col_rss(lcol); + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + } + + if (excess > 0) { + debug("global zone excess %lldKB\n", (long long)excess); + scan(lcol, excess); + lcol->lcol_stat.lcols_scan++; + } +} + +/* * Do a soft scan of those collections which have excesses. A soft scan is one * in which the cap enforcement pressure is taken into account. 
The difference * between the utilized physical memory and the cap enforcement pressure will @@ -544,22 +717,72 @@ soft_scan_cb(lcollection_t *lcol, void *a) int64_t excess; soft_scan_arg_t *arg = a; + /* skip over global zone collection for now but keep track for later */ + if (lcol->lcol_id.rcid_type == RCIDT_ZONE && + lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { + gz_col = lcol; + return (0); + } + if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { - debug("col %lld excess %lld scan_goal %lld sum_excess %llu, " - "scanning %lld\n", (long long)lcol->lcol_id, + int64_t adjusted_excess = + excess * arg->ssa_scan_goal / arg->ssa_sum_excess; + + debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " + "scanning %lld\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), + (long)lcol->lcol_id.rcid_val, (long long)excess, (long long)arg->ssa_scan_goal, (unsigned long long)arg->ssa_sum_excess, - (long long)(excess * arg->ssa_scan_goal / - arg->ssa_sum_excess)); + (long long)adjusted_excess); - scan(lcol, (int64_t)(excess * arg->ssa_scan_goal / - arg->ssa_sum_excess)); + scan(lcol, adjusted_excess); lcol->lcol_stat.lcols_scan++; } return (0); } +static void +soft_scan_gz(lcollection_t *lcol, void *a) +{ + int64_t excess; + soft_scan_arg_t *arg = a; + + /* + * If we had projects over their cap and the global zone was also over + * its cap then we need to get the up-to-date global zone rss to + * determine if we are still over the global zone cap. We might have + * gone under while we scanned the capped projects. If there were no + * projects over cap then we can use the rss value we already have for + * the global zone. 
+ */ + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + if (arg->ssa_project_over_cap && excess > 0) { + rss_sample(B_TRUE, CAPPED_ZONE); + update_col_rss(lcol); + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + } + + if (excess > 0) { + int64_t adjusted_excess = + excess * arg->ssa_scan_goal / arg->ssa_sum_excess; + + debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " + "scanning %lld\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), + (long)lcol->lcol_id.rcid_val, + (long long)excess, (long long)arg->ssa_scan_goal, + (unsigned long long)arg->ssa_sum_excess, + (long long)adjusted_excess); + + scan(lcol, adjusted_excess); + lcol->lcol_stat.lcols_scan++; + } +} + /* * When a scan could happen, but caps aren't enforced tick the * lcols_unenforced_cap counter. @@ -582,8 +805,7 @@ update_phys_total(void) uint64_t old_phys_total; old_phys_total = phys_total; - phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE) - / 1024; + phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb; if (phys_total != old_phys_total) debug("physical memory%s: %lluM\n", (old_phys_total == 0 ? "" : " adjusted"), (unsigned long long)(phys_total / 1024)); @@ -687,7 +909,9 @@ static int collection_sweep_cb(lcollection_t *lcol, void *arg) { if (lcol->lcol_mark == 0) { - debug("freeing %s %s\n", rcfg.rcfg_mode_name, lcol->lcol_name); + debug("freeing %s %s\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), lcol->lcol_name); lcollection_free(lcol); } @@ -710,8 +934,6 @@ finish_configuration(void) rcfg.rcfg_mode_name = "project"; rcfg.rcfg_mode = rctype_project; } - - lcollection_set_type(rcfg.rcfg_mode); } /* @@ -754,7 +976,8 @@ reread_configuration_file(void) * deletions to cap definitions. 
*/ static void -reconfigure(void) +reconfigure(hrtime_t now, hrtime_t *next_configuration, + hrtime_t *next_proc_walk, hrtime_t *next_rss_sample) { debug("reconfigure...\n"); @@ -770,6 +993,31 @@ reconfigure(void) list_walk_collection(collection_clear_cb, NULL); lcollection_update(LCU_ACTIVE_ONLY); /* mark */ list_walk_collection(collection_sweep_cb, NULL); + + *next_configuration = NEXT_EVENT_TIME(now, + rcfg.rcfg_reconfiguration_interval); + + /* + * Reset each event time to the shorter of the previous and new + * intervals. + */ + if (next_report == 0 && rcfg.rcfg_report_interval > 0) + next_report = now; + else + next_report = POSITIVE_MIN(next_report, + NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval)); + + if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0) + *next_proc_walk = now; + else + *next_proc_walk = POSITIVE_MIN(*next_proc_walk, + NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval)); + + if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0) + *next_rss_sample = now; + else + *next_rss_sample = POSITIVE_MIN(*next_rss_sample, + NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval)); } /* @@ -791,20 +1039,20 @@ static int simple_report_collection_cb(lcollection_t *lcol, void *arg) { #define DELTA(field) \ - (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \ + (unsigned long long)( \ (lcol->lcol_stat.field - lcol->lcol_stat_old.field)) -#define VALID(field) \ - (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \ - lcol->lcol_stat.field) debug("%s %s status: succeeded/attempted (k): %llu/%llu, " "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS " "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, " - "%llu scans over %llu ms\n", rcfg.rcfg_mode_name, lcol->lcol_name, + "%llu scans over %llu ms\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
"project" : "zone"), + lcol->lcol_name, DELTA(lcols_pg_eff), DELTA(lcols_pg_att), DELTA(lcols_scan_ineffective), DELTA(lcols_scan), DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample), - VALID(lcols_min_rss), VALID(lcols_max_rss), + (unsigned long long)lcol->lcol_stat.lcols_min_rss, + (unsigned long long)lcol->lcol_stat.lcols_max_rss, (unsigned long long)lcol->lcol_rss_cap, (unsigned long long)(lcol->lcol_stat.lcols_proc_in - lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out), @@ -812,7 +1060,6 @@ simple_report_collection_cb(lcollection_t *lcol, void *arg) / MILLISEC)); #undef DELTA -#undef VALID return (0); } @@ -838,13 +1085,11 @@ report_collection_cb(lcollection_t *lcol, void *arg) dc.lcol_stat = lcol->lcol_stat; if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { - /* - * Set a flag to indicate that the exported interval snapshot - * values should be reset at the next sample. - */ - lcol->lcol_stat_invalidate = 1; + lcol->lcol_stat_old = lcol->lcol_stat; } else { - debug("can't write %s %s statistics", rcfg.rcfg_mode_name, + debug("can't write %s %s statistics", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), lcol->lcol_name); } @@ -871,8 +1116,9 @@ get_globally_scanned_pages(uint64_t *scannedp) if (kstat_read(kctl, ksp, NULL) != -1) { scanned += ((cpu_stat_t *) ksp->ks_data)->cpu_vminfo.scan; - } else + } else { return (-1); + } } } @@ -881,6 +1127,59 @@ get_globally_scanned_pages(uint64_t *scannedp) } /* + * Determine if the global page scanner is running, during which no memory + * caps should be enforced, to prevent interference with the global page + * scanner. 
+ */ +static boolean_t +is_global_scanner_running() +{ + /* measure delta in page scan count */ + static uint64_t new_sp = 0; + static uint64_t old_sp = 0; + boolean_t res = B_FALSE; + + if (get_globally_scanned_pages(&new_sp) == 0) { + if (old_sp != 0 && (new_sp - old_sp) > 0) { + debug("global memory pressure detected (%llu " + "pages scanned since last interval)\n", + (unsigned long long)(new_sp - old_sp)); + res = B_TRUE; + } + old_sp = new_sp; + } else { + warn(gettext("unable to read cpu statistics")); + new_sp = old_sp; + } + + return (res); +} + +/* + * If soft caps are in use, determine if global memory pressure exceeds the + * configured maximum above which soft caps are enforced. + */ +static boolean_t +must_enforce_soft_caps() +{ + /* + * Check for changes to the amount of installed physical memory, to + * compute the current memory pressure. + */ + update_phys_total(); + + memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb) + * 100.0 / phys_total); + memory_pressure_sample++; + if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 && + memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Update the shared statistics file with each collection's current statistics. * Return zero on success. */ @@ -973,6 +1272,26 @@ sum_excess_cb(lcollection_t *lcol, void *arg) return (0); } +/* + * Compute the quantity of memory (in kilobytes) above the cap enforcement + * pressure. Set the scan goal to that quantity (or at most the excess). + */ +static void +compute_soft_scan_goal(soft_scan_arg_t *argp) +{ + /* + * Compute the sum of the collections' excesses, which will be the + * denominator. 
+ */ + argp->ssa_sum_excess = 0; + list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess)); + + argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * + (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 - + sysconf(_SC_AVPHYS_PAGES)) * page_size_kb, + argp->ssa_sum_excess); +} + static void rcapd_usage(void) { @@ -1017,6 +1336,112 @@ verify_and_set_privileges(void) priv_freeset(required); } +/* + * This function does the top-level work to determine if we should do any + * memory capping, and if so, it invokes the right call-backs to do the work. + */ +static void +do_capping(hrtime_t now, hrtime_t *next_proc_walk) +{ + boolean_t enforce_caps; + /* soft cap enforcement flag, depending on memory pressure */ + boolean_t enforce_soft_caps; + /* avoid interference with kernel's page scanner */ + boolean_t global_scanner_running; + sample_col_arg_t col_arg; + soft_scan_arg_t arg; + uint_t col_types = 0; + + /* check what kind of collections (project/zone) are capped */ + list_walk_collection(col_type_cb, &col_types); + debug("collection types: 0x%x\n", col_types); + + /* no capped collections, skip checking rss */ + if (col_types == 0) + return; + + /* Determine if soft caps are enforced. */ + enforce_soft_caps = must_enforce_soft_caps(); + + /* Determine if the global page scanner is running. */ + global_scanner_running = is_global_scanner_running(); + + /* + * Sample collections' member processes RSSes and recompute + * collections' excess. + */ + rss_sample(B_FALSE, col_types); + + col_arg.sca_any_over_cap = B_FALSE; + col_arg.sca_project_over_cap = B_FALSE; + list_walk_collection(rss_sample_col_cb, &col_arg); + list_walk_collection(excess_print_cb, NULL); + debug("any collection/project over cap = %d, %d\n", + col_arg.sca_any_over_cap, col_arg.sca_project_over_cap); + + if (enforce_soft_caps) + debug("memory pressure %d%%\n", memory_pressure); + + /* + * Cap enforcement is determined by the previous conditions. 
+ */ + enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap && + (rcfg.rcfg_memory_cap_enforcement_pressure == 0 || + enforce_soft_caps); + + debug("%senforcing caps\n", enforce_caps ? "" : "not "); + + /* + * If soft caps are in use, determine the size of the portion from each + * collection to scan for. + */ + if (enforce_caps && enforce_soft_caps) + compute_soft_scan_goal(&arg); + + /* + * Victimize offending collections. + */ + if (enforce_caps && (!enforce_soft_caps || + (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) { + + /* + * Since at least one collection is over its cap & needs + * enforcing, check if it is at least time for a process walk + * (we could be well past time since we only walk /proc when + * we need to) and if so, update each collections process list + * in a single pass through /proc. + */ + if (EVENT_TIME(now, *next_proc_walk)) { + debug("scanning process list...\n"); + proc_walk_all(proc_cb); /* insert & mark */ + list_walk_all(sweep_process_cb); /* free dead procs */ + *next_proc_walk = NEXT_EVENT_TIME(now, + rcfg.rcfg_proc_walk_interval); + } + + gz_col = NULL; + if (enforce_soft_caps) { + debug("scan goal is %lldKB\n", + (long long)arg.ssa_scan_goal); + list_walk_collection(soft_scan_cb, &arg); + if (gz_capped && gz_col != NULL) { + /* process global zone */ + arg.ssa_project_over_cap = + col_arg.sca_project_over_cap; + soft_scan_gz(gz_col, &arg); + } + } else { + list_walk_collection(scan_cb, NULL); + if (gz_capped && gz_col != NULL) { + /* process global zone */ + scan_gz(gz_col, col_arg.sca_project_over_cap); + } + } + } else if (col_arg.sca_any_over_cap) { + list_walk_collection(unenforced_cap_cb, NULL); + } +} + int main(int argc, char *argv[]) { @@ -1029,9 +1454,6 @@ main(int argc, char *argv[]) hrtime_t next_proc_walk; /* time of next /proc scan */ hrtime_t next_configuration; /* time of next configuration */ hrtime_t next_rss_sample; /* (latest) time of next RSS sample */ - int old_enforce_caps; /* track 
changes in enforcement */ - /* conditions */ - soft_scan_arg_t arg; (void) set_message_priority(RCM_INFO); (void) setprogname("rcapd"); @@ -1125,13 +1547,6 @@ main(int argc, char *argv[]) next_configuration = NEXT_EVENT_TIME(gethrtime(), rcfg.rcfg_reconfiguration_interval); - if (rcfg.rcfg_memory_cap_enforcement_pressure == 0) { - /* - * Always enforce caps when strict caps are used. - */ - enforce_caps = 1; - } - /* * Open the kstat chain. */ @@ -1158,6 +1573,9 @@ main(int argc, char *argv[]) else debug("fd limit: unknown\n"); + get_page_size(); + my_zoneid = getzoneid(); + /* * Handle those signals whose (default) exit disposition * prevents rcapd from finishing scanning before terminating. @@ -1194,9 +1612,9 @@ main(int argc, char *argv[]) /* * Loop forever, monitoring collections' resident set sizes and - * enforcing their caps. Look for changes in caps and process - * membership, as well as responding to requests to reread the - * configuration. Update per-collection statistics periodically. + * enforcing their caps. Look for changes in caps as well as + * responding to requests to reread the configuration. Update + * per-collection statistics periodically. */ while (should_run != 0) { struct timespec ts; @@ -1210,9 +1628,10 @@ main(int argc, char *argv[]) } /* - * Update the process list once every proc_walk_interval. The - * condition of global memory pressure is also checked at the - * same frequency, if strict caps are in use. + * Check the configuration at every next_configuration interval. + * Update the rss data once every next_rss_sample interval. + * The condition of global memory pressure is also checked at + * the same frequency, if strict caps are in use. 
*/ now = gethrtime(); @@ -1222,178 +1641,16 @@ main(int argc, char *argv[]) */ if (EVENT_TIME(now, next_configuration) || should_reconfigure == 1) { - reconfigure(); - next_configuration = NEXT_EVENT_TIME(now, - rcfg.rcfg_reconfiguration_interval); - - /* - * Reset each event time to the shorter of the - * previous and new intervals. - */ - if (next_report == 0 && - rcfg.rcfg_report_interval > 0) - next_report = now; - else - next_report = POSITIVE_MIN(next_report, - NEXT_REPORT_EVENT_TIME(now, - rcfg.rcfg_report_interval)); - if (next_proc_walk == 0 && - rcfg.rcfg_proc_walk_interval > 0) - next_proc_walk = now; - else - next_proc_walk = POSITIVE_MIN(next_proc_walk, - NEXT_EVENT_TIME(now, - rcfg.rcfg_proc_walk_interval)); - if (next_rss_sample == 0 && - rcfg.rcfg_rss_sample_interval > 0) - next_rss_sample = now; - else - next_rss_sample = POSITIVE_MIN(next_rss_sample, - NEXT_EVENT_TIME(now, - rcfg.rcfg_rss_sample_interval)); - + reconfigure(now, &next_configuration, &next_proc_walk, + &next_rss_sample); should_reconfigure = 0; - continue; - } - - if (EVENT_TIME(now, next_proc_walk)) { - debug("scanning process list...\n"); - proc_walk_all(proc_cb); /* mark */ - list_walk_all(sweep_process_cb); - next_proc_walk = NEXT_EVENT_TIME(now, - rcfg.rcfg_proc_walk_interval); } + /* + * Do the main work for enforcing caps. + */ if (EVENT_TIME(now, next_rss_sample)) { - /* - * Check for changes to the amount of installed - * physical memory, to compute the current memory - * pressure. - */ - update_phys_total(); - - /* - * If soft caps are in use, determine if global memory - * pressure exceeds the configured maximum above which - * soft caps are enforced. 
- */ - memory_pressure = 100 - - (int)((sysconf(_SC_AVPHYS_PAGES) * - (sysconf(_SC_PAGESIZE) / 1024)) * 100.0 / - phys_total); - memory_pressure_sample++; - if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) { - if (memory_pressure > - rcfg.rcfg_memory_cap_enforcement_pressure) { - if (enforce_soft_caps == 0) { - debug("memory pressure %d%%\n", - memory_pressure); - enforce_soft_caps = 1; - } - } else { - if (enforce_soft_caps == 1) - enforce_soft_caps = 0; - } - } - - /* - * Determine if the global page scanner is running, - * while which no memory caps should be enforced, to - * prevent interference with the global page scanner. - */ - if (get_globally_scanned_pages(&new_sp) == 0) { - if (old_sp == 0) - /*EMPTY*/ - ; - else if ((new_sp - old_sp) > 0) { - if (global_scanner_running == 0) { - debug("global memory pressure " - "detected (%llu pages " - "scanned since last " - "interval)\n", - (unsigned long long) - (new_sp - old_sp)); - global_scanner_running = 1; - } - } else if (global_scanner_running == 1) { - debug("global memory pressure " - "relieved\n"); - global_scanner_running = 0; - } - old_sp = new_sp; - } else { - warn(gettext("kstat_read() failed")); - new_sp = old_sp; - } - - /* - * Cap enforcement is determined by the previous two - * conditions. - */ - old_enforce_caps = enforce_caps; - enforce_caps = - (rcfg.rcfg_memory_cap_enforcement_pressure == - 0 || enforce_soft_caps == 1) && - !global_scanner_running; - if (old_enforce_caps != enforce_caps) - debug("%senforcing caps\n", enforce_caps == 0 ? - "not " : ""); - - /* - * Sample collections' member processes' RSSes and - * recompute collections' excess. 
- */ - list_walk_all(mem_sample_cb); - list_walk_collection(collection_zero_mem_cb, NULL); - list_walk_all(memory_all_cb); - list_walk_collection(rss_sample_col_cb, NULL); - if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) - debug("memory pressure %d%%\n", - memory_pressure); - list_walk_collection(excess_print_cb, NULL); - - /* - * If soft caps are in use, determine the size of the - * portion from each collection to scan for. - */ - if (enforce_soft_caps == 1) { - /* - * Compute the sum of the collections' - * excesses, which will be the denominator. - */ - arg.ssa_sum_excess = 0; - list_walk_collection(sum_excess_cb, - &arg.ssa_sum_excess); - - /* - * Compute the quantity of memory (in - * kilobytes) above the cap enforcement - * pressure. Set the scan goal to that - * quantity (or at most the excess). - */ - arg.ssa_scan_goal = MIN(( - sysconf(_SC_PHYS_PAGES) * (100 - - rcfg.rcfg_memory_cap_enforcement_pressure) - / 100 - sysconf(_SC_AVPHYS_PAGES)) * - (sysconf(_SC_PAGESIZE) / 1024), - arg.ssa_sum_excess); - } - - /* - * Victimize offending collections. 
- */ - if (enforce_caps == 1 && ((enforce_soft_caps == 1 && - arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0) || - (enforce_soft_caps == 0))) - if (enforce_soft_caps == 1) { - debug("scan goal is %lldKB\n", - (long long)arg.ssa_scan_goal); - list_walk_collection(soft_scan_cb, - &arg); - } else - list_walk_collection(scan_cb, NULL); - else - list_walk_collection(unenforced_cap_cb, NULL); + do_capping(now, &next_proc_walk); next_rss_sample = NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval); @@ -1409,7 +1666,6 @@ main(int argc, char *argv[]) */ now = gethrtime(); next = next_configuration; - next = POSITIVE_MIN(next, next_proc_walk); next = POSITIVE_MIN(next, next_report); next = POSITIVE_MIN(next, next_rss_sample); if (next > now && should_run != 0) { diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c index 15c503d1b4..b39811b552 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -104,7 +103,8 @@ st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...) buf = malloc(len); if (buf == NULL) return; - (void) snprintf(buf, len, "%s %s scanner %s", rcfg.rcfg_mode_name, + (void) snprintf(buf, len, "%s %s scanner %s", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
"project" : "zone"), lcol->lcol_name, msg); va_start(alist, msg); @@ -471,6 +471,7 @@ merge_current_pagedata(lprocess_t *lpc, { prpageheader_t *pghp; int mappings_changed = 0; + uint64_t cnt; if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0) { @@ -485,9 +486,12 @@ merge_current_pagedata(lprocess_t *lpc, debug("starting/resuming pagedata collection for %d\n", (int)lpc->lpc_pid); } - debug("process %d: %llu/%llukB r/m'd since last read\n", - (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0, - PG_MODIFIED | PG_REFERENCED, 0), (unsigned long long)lpc->lpc_rss); + + cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); + if (cnt != 0 || lpc->lpc_rss != 0) + debug("process %d: %llu/%llukB rfd/mdfd since last read\n", + (int)lpc->lpc_pid, (unsigned long long)cnt, + (unsigned long long)lpc->lpc_rss); if (lpc->lpc_prpageheader != NULL) { /* * OR the two snapshots. @@ -519,10 +523,12 @@ merge_current_pagedata(lprocess_t *lpc, } else mappings_changed = 1; lpc->lpc_prpageheader = pghp; - debug("process %d: %llu/%llukB r/m'd since hand swept\n", - (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0, - PG_MODIFIED | PG_REFERENCED, 0), - (unsigned long long)lpc->lpc_rss); + + cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); + if (cnt != 0 || lpc->lpc_rss != 0) + debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n", + (int)lpc->lpc_pid, (unsigned long long)cnt, + (unsigned long long)lpc->lpc_rss); if (mappings_changed != 0) { debug("process %d: mappings changed\n", (int)lpc->lpc_pid); if (mappings_changed_cb != NULL) @@ -589,7 +595,6 @@ rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic) static void unignore_mappings(lprocess_t *lpc) { - debug("clearing ignored set\n"); lmapping_free(&lpc->lpc_ignore); } diff --git a/usr/src/cmd/rcap/rcapstat/Makefile b/usr/src/cmd/rcap/rcapstat/Makefile index 47b9bcfb71..fb436f5684 100644 --- a/usr/src/cmd/rcap/rcapstat/Makefile +++ 
b/usr/src/cmd/rcap/rcapstat/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -39,7 +38,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lumem -ll +LDLIBS += -lumem -ll -lzonecfg LINTFLAGS += $(LDLIBS) -mnu diff --git a/usr/src/cmd/rcap/rcapstat/rcapstat.c b/usr/src/cmd/rcap/rcapstat/rcapstat.c index 722502d05d..47eca3f2fa 100644 --- a/usr/src/cmd/rcap/rcapstat/rcapstat.c +++ b/usr/src/cmd/rcap/rcapstat/rcapstat.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -77,7 +76,8 @@ col_find(rcid_t id) { col_t *col; for (col = col_head; col != NULL; col = col->col_next) - if (col->col_id == id) + if (col->col_id.rcid_type == id.rcid_type && + col->col_id.rcid_val == id.rcid_val) return (col); return (NULL); } @@ -119,7 +119,7 @@ static void usage() { (void) fprintf(stderr, - gettext("usage: rcapstat [-g] [interval [count]]\n")); + gettext("usage: rcapstat [-g] [-p | -z] [interval [count]]\n")); exit(E_USAGE); } @@ -139,12 +139,12 @@ format_size(char *str, uint64_t size, int length) } static int -read_stats() +read_stats(rcid_type_t stat_type) { int fd; int proc_fd; char procfile[20]; - pid_t pid; + uint64_t pid; col_t *col, *col_next; lcollection_report_t report; struct stat st; @@ -169,7 +169,7 @@ read_stats() * Check if rcapd is running */ pid = hdr.rs_pid; - (void) snprintf(procfile, 20, "/proc/%ld/psinfo", pid); + (void) snprintf(procfile, 20, "/proc/%lld/psinfo", pid); if ((proc_fd = open(procfile, O_RDONLY)) < 0) { warn(gettext("rcapd is not active\n")); (void) close(fd); @@ -185,6 +185,9 @@ read_stats() } while (read(fd, &report, sizeof (report)) == sizeof (report)) { + if (report.lcol_id.rcid_type != stat_type) + continue; + col = col_find(report.lcol_id); if (col == NULL) { col = col_insert(report.lcol_id); @@ -291,12 +294,13 @@ print_unformatted_stats(void) } static void -print_stats() +print_stats(rcid_type_t stat_type) { col_t *col; char size[6]; char limit[6]; char rss[6]; + char nproc[6]; char paged_att[6]; char paged_eff[6]; char paged_att_avg[6]; @@ -310,12 +314,21 @@ print_stats() */ if (count == 0 || ncol != 1) (void) printf("%6s %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n", - "id", mode, "nproc", "vm", "rss", "cap", + "id", (stat_type == RCIDT_PROJECT ? 
"project" : "zone"), + "nproc", "vm", "rss", "cap", "at", "avgat", "pg", "avgpg"); if (++count >= 20 || (count >= 10 && global != 0) || ncol != 1) count = 0; for (col = col_head; col != NULL; col = col->col_next) { + if (col->col_id.rcid_type != stat_type) + continue; + + if (col->col_paged_att == 0) + strlcpy(nproc, "-", sizeof (nproc)); + else + (void) snprintf(nproc, sizeof (nproc), "%lld", + col->col_nproc); format_size(size, col->col_vmsize, 6); format_size(rss, col->col_rsssize, 6); format_size(limit, col->col_rsslimit, 6); @@ -323,8 +336,9 @@ print_stats() format_size(paged_eff, col->col_paged_eff, 6); format_size(paged_att_avg, col->col_paged_att_avg, 6); format_size(paged_eff_avg, col->col_paged_eff_avg, 6); - (void) printf("%6lld %-15s %5lld %5s %5s %5s %5s %5s %5s %5s\n", - (long long)col->col_id, col->col_name, col->col_nproc, + (void) printf("%6lld %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n", + col->col_id.rcid_val, col->col_name, + nproc, size, rss, limit, paged_att, paged_att_avg, paged_eff, paged_eff_avg); @@ -342,20 +356,32 @@ main(int argc, char *argv[]) int count; int always = 1; int opt; + int projects = 0; + int zones = 0; + /* project reporting is the default if no option is specified */ + rcid_type_t stat_type = RCIDT_PROJECT; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); (void) setprogname("rcapstat"); global = unformatted = 0; - while ((opt = getopt(argc, argv, "gu")) != (int)EOF) { + while ((opt = getopt(argc, argv, "gpuz")) != (int)EOF) { switch (opt) { case 'g': global = 1; break; + case 'p': + projects = 1; + stat_type = RCIDT_PROJECT; + break; case 'u': unformatted = 1; break; + case 'z': + stat_type = RCIDT_ZONE; + zones = 1; + break; default: usage(); } @@ -369,22 +395,22 @@ main(int argc, char *argv[]) die(gettext("invalid count specified\n")); always = 0; } - if (argc > optind) + if (argc > optind || (projects > 0 && zones > 0)) usage(); while (always || count-- > 0) { - if (read_stats() != E_SUCCESS) + if 
(read_stats(stat_type) != E_SUCCESS) return (E_ERROR); if (!unformatted) { - print_stats(); - fflush(stdout); + print_stats(stat_type); + (void) fflush(stdout); if (count || always) (void) sleep(interval); } else { struct stat st; print_unformatted_stats(); - fflush(stdout); + (void) fflush(stdout); while (stat(STAT_FILE_DEFAULT, &st) == 0 && st.st_mtime == stat_mod) usleep((useconds_t)(0.2 * MICROSEC)); diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index 92739f2b1e..4dc70b0d37 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -2325,6 +2325,7 @@ prt_zga(private_t *pri, int raw, long val) case ZONE_ATTR_INITNAME: s = "ZONE_ATTR_INITNAME"; break; case ZONE_ATTR_BOOTARGS: s = "ZONE_ATTR_BOOTARGS"; break; case ZONE_ATTR_BRAND: s = "ZONE_ATTR_BRAND"; break; + case ZONE_ATTR_PHYS_MCAP: s = "ZONE_ATTR_PHYS_MCAP"; break; } } diff --git a/usr/src/cmd/truss/systable.c b/usr/src/cmd/truss/systable.c index 695d0e28c2..f46e028bf5 100644 --- a/usr/src/cmd/truss/systable.c +++ b/usr/src/cmd/truss/systable.c @@ -404,7 +404,7 @@ const struct systable systable[] = { {"kaio", 7, DEC, NOV, AIO, HEX, HEX, HEX, HEX, HEX, HEX}, /* 178 */ {"cpc", 5, DEC, NOV, CPC, DEC, HEX, HEX, HEX}, /* 179 */ {"lgrpsys", 3, DEC, NOV, DEC, DEC, HEX}, /* 180 */ -{"rusagesys", 2, DEC, NOV, DEC, HEX}, /* 181 */ +{"rusagesys", 5, DEC, NOV, DEC, HEX, DEC, HEX, HEX}, /* 181 */ {"portfs", 6, HEX, HEX, DEC, HEX, HEX, HEX, HEX, HEX}, /* 182 */ {"pollsys", 4, DEC, NOV, HEX, DEC, HEX, HEX}, /* 183 */ {"labelsys", 2, DEC, NOV, DEC, HEX}, /* 184 */ @@ -761,6 +761,7 @@ static const struct systable rusagesystable[] = { {"getrusage", 2, DEC, NOV, HID, HEX}, /* 0 */ {"getrusage_chld", 2, DEC, NOV, HID, HEX}, /* 1 */ {"getrusage_lwp", 2, DEC, NOV, HID, HEX}, /* 2 */ +{"getvmusage", 5, DEC, NOV, HID, HEX, DEC, HEX, HEX}, /* 3 */ }; #define NRUSAGESYSCODE \ (sizeof (rusagesystable) / sizeof (struct systable)) @@ -942,6 +943,7 @@ const struct sysalias sysalias[] = { { "getrusage", 
SYS_rusagesys }, { "getrusage_chld", SYS_rusagesys }, { "getrusage_lwp", SYS_rusagesys }, + { "getvmusage", SYS_rusagesys }, { "getpeerucred", SYS_ucredsys }, { "ucred_get", SYS_ucredsys }, { "port_create", SYS_port }, @@ -956,6 +958,7 @@ const struct sysalias sysalias[] = { { "zone_create", SYS_zone }, { "zone_destroy", SYS_zone }, { "zone_getattr", SYS_zone }, + { "zone_setattr", SYS_zone }, { "zone_enter", SYS_zone }, { "getzoneid", SYS_zone }, { "zone_list", SYS_zone }, diff --git a/usr/src/cmd/zoneadm/Makefile b/usr/src/cmd/zoneadm/Makefile index 4d0f91a6f3..e11609c6dd 100644 --- a/usr/src/cmd/zoneadm/Makefile +++ b/usr/src/cmd/zoneadm/Makefile @@ -27,8 +27,8 @@ # PROG= zoneadm -MANIFEST= zones.xml -SVCMETHOD= svc-zones +MANIFEST= zones.xml resource-mgmt.xml +SVCMETHOD= svc-zones svc-resource-mgmt include ../Makefile.cmd diff --git a/usr/src/cmd/zoneadm/resource-mgmt.xml b/usr/src/cmd/zoneadm/resource-mgmt.xml new file mode 100644 index 0000000000..264f26733f --- /dev/null +++ b/usr/src/cmd/zoneadm/resource-mgmt.xml @@ -0,0 +1,116 @@ +<?xml version="1.0"?> +<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1"> +<!-- + Copyright 2006 Sun Microsystems, Inc. All rights reserved. + Use is subject to license terms. + + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. + See the License for the specific language governing permissions + and limitations under the License. + + When distributing Covered Code, include this CDDL HEADER in each + file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ If applicable, add the following below this CDDL HEADER, with the + fields enclosed by brackets "[]" replaced with your own identifying + information: Portions Copyright [yyyy] [name of copyright owner] + + CDDL HEADER END + + ident "%Z%%M% %I% %E% SMI" + + NOTE: This service manifest is not editable; its contents will + be overwritten by package or patch operations, including + operating system upgrade. Make customizations in a different + file. +--> + +<service_bundle type='manifest' name='SUNWzoner:zones'> + +<!-- + This service applies global zone resource management settings + at system startup. +--> +<service + name='system/resource-mgmt' + type='service' + version='1'> + + <create_default_instance enabled='true' /> + + <single_instance /> + + <dependency + name='usr' + type='service' + grouping='require_all' + restart_on='none'> + <service_fmri value='svc:/system/filesystem/minimal' /> + </dependency> + + <dependency + name='scheduler' + type='service' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/scheduler' /> + </dependency> + + <dependency + name='pools' + type='service' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/pools' /> + </dependency> + + <dependent + name='rcap' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/rcap' /> + </dependent> + + <exec_method + type='method' + name='start' + exec='/lib/svc/method/svc-resource-mgmt %m' + timeout_seconds='60'> + </exec_method> + + <exec_method + type='method' + name='stop' + exec=':true' + timeout_seconds='3'> + </exec_method> + + <property_group name='startd' type='framework'> + <propval name='duration' type='astring' value='transient' /> + </property_group> + + <stability value='Unstable' /> + + <template> + <common_name> + <loctext xml:lang='C'> + Global zone resource management settings + </loctext> + </common_name> + <documentation> + <manpage title='zones' section='5' manpath='/usr/share/man' /> + 
<manpage + title='zonecfg' + section='1M' + manpath='/usr/share/man' /> + </documentation> + </template> +</service> + +</service_bundle> diff --git a/usr/src/cmd/zoneadm/svc-resource-mgmt b/usr/src/cmd/zoneadm/svc-resource-mgmt new file mode 100644 index 0000000000..762de4c0d8 --- /dev/null +++ b/usr/src/cmd/zoneadm/svc-resource-mgmt @@ -0,0 +1,54 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. + +# ident "%Z%%M% %I% %E% SMI" +# + +. /lib/svc/share/smf_include.sh + +[ ! -f /etc/zones/global.xml ] && exit $SMF_EXIT_OK # No global zone + # resource mgmt. + # configuration + +[ ! -x /usr/sbin/zoneadm ] && exit $SMF_EXIT_OK # SUNWzoneu not installed + +# Make sure working directory is / to prevent unmounting problems. +cd / +PATH=/usr/sbin:/usr/bin; export PATH + +case "$1" in +'start') + zoneadm -z global apply + if [ $? 
-ne 0 ]; then + exit $SMF_EXIT_ERR_FATAL + fi + ;; + +*) + echo "Usage: $0 start" + exit $SMF_EXIT_ERR_FATAL + ;; +esac +exit $SMF_EXIT_OK diff --git a/usr/src/cmd/zoneadm/zoneadm.c b/usr/src/cmd/zoneadm/zoneadm.c index bff26cd356..b7ae32b30d 100644 --- a/usr/src/cmd/zoneadm/zoneadm.c +++ b/usr/src/cmd/zoneadm/zoneadm.c @@ -74,9 +74,12 @@ #include <fnmatch.h> #include <sys/modctl.h> #include <libbrand.h> +#include <libscf.h> #include <pool.h> #include <sys/pool.h> +#include <sys/priocntl.h> +#include <sys/fsspriocntl.h> #include "zoneadm.h" @@ -154,6 +157,7 @@ static int move_func(int argc, char *argv[]); static int detach_func(int argc, char *argv[]); static int attach_func(int argc, char *argv[]); static int mark_func(int argc, char *argv[]); +static int apply_func(int argc, char *argv[]); static int sanity_check(char *zone, int cmd_num, boolean_t running, boolean_t unsafe_when_running, boolean_t force); static int cmd_match(char *cmd); @@ -177,7 +181,8 @@ static struct cmd cmdtab[] = { { CMD_MOVE, "move", SHELP_MOVE, move_func }, { CMD_DETACH, "detach", SHELP_DETACH, detach_func }, { CMD_ATTACH, "attach", SHELP_ATTACH, attach_func }, - { CMD_MARK, "mark", SHELP_MARK, mark_func } + { CMD_MARK, "mark", SHELP_MARK, mark_func }, + { CMD_APPLY, "apply", NULL, apply_func } }; /* global variables */ @@ -1501,6 +1506,7 @@ boot_func(int argc, char *argv[]) zerror(gettext("call to %s failed"), "zoneadmd"); return (Z_ERR); } + return (Z_OK); } @@ -4355,15 +4361,22 @@ dev_fix(zone_dochandle_t handle) zarg.cmd = Z_READY; if (call_zoneadmd(target_zone, &zarg) != 0) { zerror(gettext("call to %s failed"), "zoneadmd"); + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); return (Z_ERR); } zarg.cmd = Z_HALT; if (call_zoneadmd(target_zone, &zarg) != 0) { zerror(gettext("call to %s failed"), "zoneadmd"); + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); return 
(Z_ERR); } + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); + if (zonecfg_setdevperment(handle) != Z_OK) { (void) fprintf(stderr, gettext("unable to enumerate device entries\n")); @@ -4845,6 +4858,177 @@ mark_func(int argc, char *argv[]) return (err); } +/* + * Check what scheduling class we're running under and print a warning if + * we're not using FSS. + */ +static int +check_sched_fss(zone_dochandle_t handle) +{ + char class_name[PC_CLNMSZ]; + + if (zonecfg_get_dflt_sched_class(handle, class_name, + sizeof (class_name)) != Z_OK) { + zerror(gettext("WARNING: unable to determine the zone's " + "scheduling class")); + } else if (strcmp("FSS", class_name) != 0) { + zerror(gettext("WARNING: The zone.cpu-shares rctl is set but\n" + "FSS is not the default scheduling class for this zone. " + "FSS will be\nused for processes in the zone but to get " + "the full benefit of FSS,\nit should be the default " + "scheduling class. See dispadmin(1M) for\nmore details.")); + return (Z_SYSTEM); + } + + return (Z_OK); +} + +static int +check_cpu_shares_sched(zone_dochandle_t handle) +{ + int err; + int res = Z_OK; + struct zone_rctltab rctl; + + if ((err = zonecfg_setrctlent(handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + return (err); + } + + while (zonecfg_getrctlent(handle, &rctl) == Z_OK) { + if (strcmp(rctl.zone_rctl_name, "zone.cpu-shares") == 0) { + if (check_sched_fss(handle) != Z_OK) + res = Z_SYSTEM; + break; + } + } + + (void) zonecfg_endrctlent(handle); + + return (res); +} + +/* + * This is an undocumented interface which is currently only used to apply + * the global zone resource management settings when the system boots. + * This function does not yet properly handle updating a running system so + * any projects running in the zone would be trashed if this function + * were to run after the zone had booted. 
It also does not reset any + * rctl settings that were removed from zonecfg. There is still work to be + * done before we can properly support dynamically updating the resource + * management settings for a running zone (global or non-global). Thus, this + * functionality is undocumented for now. + */ +/* ARGSUSED */ +static int +apply_func(int argc, char *argv[]) +{ + int err; + int res = Z_OK; + priv_set_t *privset; + zoneid_t zoneid; + zone_dochandle_t handle; + struct zone_mcaptab mcap; + char pool_err[128]; + + zoneid = getzoneid(); + + if (zonecfg_in_alt_root() || zoneid != GLOBAL_ZONEID || + target_zone == NULL || strcmp(target_zone, GLOBAL_ZONENAME) != 0) + return (usage(B_FALSE)); + + if ((privset = priv_allocset()) == NULL) { + zerror(gettext("%s failed"), "priv_allocset"); + return (Z_ERR); + } + + if (getppriv(PRIV_EFFECTIVE, privset) != 0) { + zerror(gettext("%s failed"), "getppriv"); + priv_freeset(privset); + return (Z_ERR); + } + + if (priv_isfullset(privset) == B_FALSE) { + (void) usage(B_FALSE); + priv_freeset(privset); + return (Z_ERR); + } + priv_freeset(privset); + + if ((handle = zonecfg_init_handle()) == NULL) { + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + return (Z_ERR); + } + + if ((err = zonecfg_get_handle(target_zone, handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + zonecfg_fini_handle(handle); + return (Z_ERR); + } + + /* specific error msgs are printed within apply_rctls */ + if ((err = zonecfg_apply_rctls(target_zone, handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + res = Z_ERR; + } + + if ((err = check_cpu_shares_sched(handle)) != Z_OK) + res = Z_ERR; + + /* + * The next two blocks of code attempt to set up temporary pools as + * well as persistent pools. In both cases we call the functions + * unconditionally. Within each funtion the code will check if the + * zone is actually configured for a temporary pool or persistent pool + * and just return if there is nothing to do. 
+ */ + if ((err = zonecfg_bind_tmp_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (err == Z_POOL || err == Z_POOL_CREATE || err == Z_POOL_BIND) + zerror("%s: %s", zonecfg_strerror(err), pool_err); + else + zerror(gettext("could not bind zone to temporary " + "pool: %s"), zonecfg_strerror(err)); + res = Z_ERR; + } + + if ((err = zonecfg_bind_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (err == Z_POOL || err == Z_POOL_BIND) + zerror("%s: %s", zonecfg_strerror(err), pool_err); + else + zerror("%s", zonecfg_strerror(err)); + } + + /* + * If a memory cap is configured, set the cap in the kernel using + * zone_setattr() and make sure the rcapd SMF service is enabled. + */ + if (zonecfg_getmcapent(handle, &mcap) == Z_OK) { + uint64_t num; + char smf_err[128]; + + num = (uint64_t)strtoll(mcap.zone_physmem_cap, NULL, 10); + if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + zerror(gettext("could not set zone memory cap")); + res = Z_ERR; + } + + if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) { + zerror(gettext("enabling system/rcap service failed: " + "%s"), smf_err); + res = Z_ERR; + } + } + + zonecfg_fini_handle(handle); + + return (res); +} + static int help_func(int argc, char *argv[]) { diff --git a/usr/src/cmd/zoneadm/zoneadm.h b/usr/src/cmd/zoneadm/zoneadm.h index a94053e258..a299ece135 100644 --- a/usr/src/cmd/zoneadm/zoneadm.h +++ b/usr/src/cmd/zoneadm/zoneadm.h @@ -45,9 +45,10 @@ #define CMD_DETACH 13 #define CMD_ATTACH 14 #define CMD_MARK 15 +#define CMD_APPLY 16 #define CMD_MIN CMD_HELP -#define CMD_MAX CMD_MARK +#define CMD_MAX CMD_APPLY #if !defined(TEXT_DOMAIN) /* should be defined by cc -D */ #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */ diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile index 8b77f8234c..34914694a8 100644 --- a/usr/src/cmd/zoneadmd/Makefile +++ b/usr/src/cmd/zoneadmd/Makefile @@ -42,7 +42,7 @@ POFILES= $(OBJS:%.o=%.po) CFLAGS 
+= $(CCVERBOSE) LAZYLIBS = $(ZLAZYLOAD) -ltsnet -ltsol $(ZNOLAZYLOAD) lint := LAZYLIBS = -ltsnet -ltsol -LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair -lpool \ +LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ -lgen -lbsm -lcontract -lzfs -luuid -lbrand $(LAZYLIBS) XGETFLAGS += -a -x zoneadmd.xcl diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index ca93b1c696..513921e5e2 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -106,6 +106,7 @@ #include <pool.h> #include <sys/pool.h> +#include <sys/priocntl.h> #include <libbrand.h> #include <sys/brand.h> @@ -2661,27 +2662,6 @@ out: } static int -get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz) -{ - zone_dochandle_t handle; - int error; - - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (Z_NOMEM); - } - error = zonecfg_get_snapshot_handle(zone_name, handle); - if (error != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (error); - } - error = zonecfg_get_pool(handle, poolbuf, bufsz); - zonecfg_fini_handle(handle); - return (error); -} - -static int get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) { zone_dochandle_t handle; @@ -2818,75 +2798,6 @@ validate_datasets(zlog_t *zlogp) return (0); } -static int -bind_to_pool(zlog_t *zlogp, zoneid_t zoneid) -{ - pool_conf_t *poolconf; - pool_t *pool; - char poolname[MAXPATHLEN]; - int status; - int error; - - /* - * Find the pool mentioned in the zone configuration, and bind to it. - */ - error = get_zone_pool(zlogp, poolname, sizeof (poolname)); - if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) { - /* - * The property is not set on the zone, so the pool - * should be bound to the default pool. But that's - * already done by the kernel, so we can just return. 
- */ - return (0); - } - if (error != Z_OK) { - /* - * Not an error, even though it shouldn't be happening. - */ - zerror(zlogp, B_FALSE, - "WARNING: unable to retrieve default pool."); - return (0); - } - /* - * Don't do anything if pools aren't enabled. - */ - if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) { - zerror(zlogp, B_FALSE, "WARNING: pools facility not active; " - "zone will not be bound to pool '%s'.", poolname); - return (0); - } - /* - * Try to provide a sane error message if the requested pool doesn't - * exist. - */ - if ((poolconf = pool_conf_alloc()) == NULL) { - zerror(zlogp, B_FALSE, "%s failed", "pool_conf_alloc"); - return (-1); - } - if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != - PO_SUCCESS) { - zerror(zlogp, B_FALSE, "%s failed", "pool_conf_open"); - pool_conf_free(poolconf); - return (-1); - } - pool = pool_get_pool(poolconf, poolname); - (void) pool_conf_close(poolconf); - pool_conf_free(poolconf); - if (pool == NULL) { - zerror(zlogp, B_FALSE, "WARNING: pool '%s' not found; " - "using default pool.", poolname); - return (0); - } - /* - * Bind the zone to the pool. - */ - if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) { - zerror(zlogp, B_FALSE, "WARNING: unable to bind to pool '%s'; " - "using default pool.", poolname); - } - return (0); -} - /* * Mount lower level home directories into/from current zone * Share exported directories specified in dfstab for zone @@ -3482,6 +3393,149 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath) return (B_FALSE); } +/* + * Set memory cap and pool info for the zone's resource management + * configuration. 
+ */ +static int +setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) +{ + int res; + uint64_t tmp; + struct zone_mcaptab mcap; + char sched[MAXNAMELEN]; + zone_dochandle_t handle = NULL; + char pool_err[128]; + + if ((handle = zonecfg_init_handle()) == NULL) { + zerror(zlogp, B_TRUE, "getting zone configuration handle"); + return (Z_BAD_HANDLE); + } + + if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) { + zerror(zlogp, B_FALSE, "invalid configuration"); + zonecfg_fini_handle(handle); + return (res); + } + + /* + * If a memory cap is configured, set the cap in the kernel using + * zone_setattr() and make sure the rcapd SMF service is enabled. + */ + if (zonecfg_getmcapent(handle, &mcap) == Z_OK) { + uint64_t num; + char smf_err[128]; + + num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10); + if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + zerror(zlogp, B_TRUE, "could not set zone memory cap"); + zonecfg_fini_handle(handle); + return (Z_INVAL); + } + + if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) { + zerror(zlogp, B_FALSE, "enabling system/rcap service " + "failed: %s", smf_err); + zonecfg_fini_handle(handle); + return (Z_INVAL); + } + } + + /* Get the scheduling class set in the zone configuration. */ + if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK && + strlen(sched) > 0) { + if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched, + strlen(sched)) == -1) + zerror(zlogp, B_TRUE, "WARNING: unable to set the " + "default scheduling class"); + + } else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp) + == Z_OK) { + /* + * If the zone has the zone.cpu-shares rctl set then we want to + * use the Fair Share Scheduler (FSS) for processes in the + * zone. Check what scheduling class the zone would be running + * in by default so we can print a warning and modify the class + * if we wouldn't be using FSS. 
+ */ + char class_name[PC_CLNMSZ]; + + if (zonecfg_get_dflt_sched_class(handle, class_name, + sizeof (class_name)) != Z_OK) { + zerror(zlogp, B_FALSE, "WARNING: unable to determine " + "the zone's scheduling class"); + + } else if (strcmp("FSS", class_name) != 0) { + zerror(zlogp, B_FALSE, "WARNING: The zone.cpu-shares " + "rctl is set but\nFSS is not the default " + "scheduling class for\nthis zone. FSS will be " + "used for processes\nin the zone but to get the " + "full benefit of FSS,\nit should be the default " + "scheduling class.\nSee dispadmin(1M) for more " + "details."); + + if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, "FSS", + strlen("FSS")) == -1) + zerror(zlogp, B_TRUE, "WARNING: unable to set " + "zone scheduling class to FSS"); + } + } + + /* + * The next few blocks of code attempt to set up temporary pools as + * well as persistent pools. In all cases we call the functions + * unconditionally. Within each funtion the code will check if the + * zone is actually configured for a temporary pool or persistent pool + * and just return if there is nothing to do. + * + * If we are rebooting we want to attempt to reuse any temporary pool + * that was previously set up. zonecfg_bind_tmp_pool() will do the + * right thing in all cases (reuse or create) based on the current + * zonecfg. + */ + if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND) + zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting " + "cannot be instantiated", zonecfg_strerror(res), + pool_err); + else + zerror(zlogp, B_FALSE, "could not bind zone to " + "temporary pool: %s", zonecfg_strerror(res)); + zonecfg_fini_handle(handle); + return (Z_POOL_BIND); + } + + /* + * Check if we need to warn about poold not being enabled. 
+ */ + if (zonecfg_warn_poold(handle)) { + zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has " + "been specified\nbut the dynamic pool service is not " + "enabled.\nThe system will not dynamically adjust the\n" + "processor allocation within the specified range\n" + "until svc:/system/pools/dynamic is enabled.\n" + "See poold(1M)."); + } + + /* The following is a warning, not an error. */ + if ((res = zonecfg_bind_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL_BIND) + zerror(zlogp, B_FALSE, "WARNING: unable to bind to " + "pool '%s'; using default pool.", pool_err); + else if (res == Z_POOL) + zerror(zlogp, B_FALSE, "WARNING: %s: %s", + zonecfg_strerror(res), pool_err); + else + zerror(zlogp, B_FALSE, "WARNING: %s", + zonecfg_strerror(res)); + } + + zonecfg_fini_handle(handle); + return (Z_OK); +} + zoneid_t vplat_create(zlog_t *zlogp, boolean_t mount_cmd) { @@ -3668,14 +3722,18 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd) } /* - * The following is a warning, not an error, and is not performed when - * merely mounting a zone for administrative use. + * The following actions are not performed when merely mounting a zone + * for administrative use. 
*/ - if (!mount_cmd && bind_to_pool(zlogp, zoneid) != 0) - zerror(zlogp, B_FALSE, "WARNING: unable to bind zone to " - "requested pool; using default pool."); - if (!mount_cmd) + if (!mount_cmd) { + if (setup_zone_rm(zlogp, zone_name, zoneid) != Z_OK) { + (void) zone_shutdown(zoneid); + goto error; + } + set_mlps(zlogp, zoneid, zcent); + } + rval = zoneid; zoneid = -1; @@ -3878,10 +3936,12 @@ unmounted: } int -vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd) +vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) { char *kzone; zoneid_t zoneid; + int res; + char pool_err[128]; char zroot[MAXPATHLEN]; char cmdbuf[MAXPATHLEN]; char brand[MAXNAMELEN]; @@ -3972,6 +4032,19 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd) goto error; } + /* + * If we are rebooting then we don't want to destroy an existing + * temporary pool at this point so that we can just reuse it when the + * zone boots back up. + */ + if (!unmount_cmd && !rebooting) { + if ((res = zonecfg_destroy_tmp_pool(zone_name, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL) + zerror(zlogp, B_FALSE, pool_err); + } + } + remove_mlps(zlogp, zoneid); if (zone_destroy(zoneid) != 0) { diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c index 313d24d95b..35206384b9 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.c +++ b/usr/src/cmd/zoneadmd/zoneadmd.c @@ -463,7 +463,7 @@ zone_ready(zlog_t *zlogp, boolean_t mount_cmd) } if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) { bringup_failure_recovery = B_TRUE; - (void) vplat_teardown(NULL, mount_cmd); + (void) vplat_teardown(NULL, mount_cmd, B_FALSE); if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) zerror(zlogp, B_FALSE, "destroying snapshot: %s", zonecfg_strerror(err)); @@ -738,11 +738,11 @@ zone_bootup(zlog_t *zlogp, const char *bootargs) } static int -zone_halt(zlog_t *zlogp, boolean_t unmount_cmd) +zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) { int err; - if 
(vplat_teardown(zlogp, unmount_cmd) != 0) { + if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) { if (!bringup_failure_recovery) zerror(zlogp, B_FALSE, "unable to destroy zone"); return (-1); @@ -985,7 +985,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, audit_put_record(zlogp, uc, rval, "boot"); if (rval != 0) { bringup_failure_recovery = B_TRUE; - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_FALSE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } break; @@ -1094,7 +1094,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, audit_put_record(zlogp, uc, rval, "boot"); if (rval != 0) { bringup_failure_recovery = B_TRUE; - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_TRUE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; @@ -1102,7 +1102,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_HALT: if (kernelcall) /* Invalid; can't happen */ abort(); - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE)) != 0) break; eventstream_write(Z_EVT_ZONE_HALTED); break; @@ -1125,7 +1125,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_UNMOUNT: if (kernelcall) /* Invalid; can't happen */ abort(); - rval = zone_halt(zlogp, B_TRUE); + rval = zone_halt(zlogp, B_TRUE, B_FALSE); if (rval == 0) { eventstream_write(Z_EVT_ZONE_HALTED); (void) sema_post(&scratch_sem); @@ -1147,7 +1147,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case ZONE_STATE_DOWN: switch (cmd) { case Z_READY: - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0) break; if ((rval = zone_ready(zlogp, B_FALSE)) == 0) eventstream_write(Z_EVT_ZONE_READIED); @@ -1165,7 +1165,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = 0; break; case Z_HALT: - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, 
B_FALSE)) != 0) break; eventstream_write(Z_EVT_ZONE_HALTED); break; @@ -1173,7 +1173,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, (void) strlcpy(boot_args, zargp->bootbuf, sizeof (boot_args)); eventstream_write(Z_EVT_ZONE_REBOOTING); - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) { + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0) { eventstream_write(Z_EVT_ZONE_BOOTFAILED); boot_args[0] = '\0'; break; @@ -1186,7 +1186,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = zone_bootup(zlogp, zargp->bootbuf); audit_put_record(zlogp, uc, rval, "reboot"); if (rval != 0) { - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_TRUE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h index cfb90f93f3..a4aba27b5c 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.h +++ b/usr/src/cmd/zoneadmd/zoneadmd.h @@ -106,7 +106,7 @@ extern void eventstream_write(zone_evt_t evt); */ extern zoneid_t vplat_create(zlog_t *, boolean_t); extern int vplat_bringup(zlog_t *, boolean_t, zoneid_t); -extern int vplat_teardown(zlog_t *, boolean_t); +extern int vplat_teardown(zlog_t *, boolean_t, boolean_t); /* * Console subsystem routines. diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c index ea745cbb61..34d6b99480 100644 --- a/usr/src/cmd/zonecfg/zonecfg.c +++ b/usr/src/cmd/zonecfg/zonecfg.c @@ -101,6 +101,8 @@ extern int lex_lineno; #define MAX_CMD_HIST 1024 #define MAX_CMD_LEN 1024 +#define ONE_MB 1048576 + /* * Each SHELP_ should be a simple string. 
*/ @@ -108,6 +110,7 @@ extern int lex_lineno; #define SHELP_ADD "add <resource-type>\n\t(global scope)\n" \ "add <property-name> <property-value>\n\t(resource scope)" #define SHELP_CANCEL "cancel" +#define SHELP_CLEAR "clear <property-name>" #define SHELP_COMMIT "commit" #define SHELP_CREATE "create [-F] [ -a <path> | -b | -t <template> ]" #define SHELP_DELETE "delete [-F]" @@ -116,9 +119,11 @@ extern int lex_lineno; #define SHELP_EXPORT "export [-f output-file]" #define SHELP_HELP "help [commands] [syntax] [usage] [<command-name>]" #define SHELP_INFO "info [<resource-type> [property-name=property-value]*]" -#define SHELP_REMOVE "remove <resource-type> { <property-name>=<property-" \ - "value> }\n\t(global scope)\nremove <property-name> <property-value>" \ - "\n\t(resource scope)" +#define SHELP_REMOVE "remove [-F] <resource-type> " \ + "[ <property-name>=<property-value> ]*\n" \ + "\t(global scope)\n" \ + "remove <property-name> <property-value>\n" \ + "\t(resource scope)" #define SHELP_REVERT "revert [-F]" #define SHELP_SELECT "select <resource-type> { <property-name>=" \ "<property-value> }" @@ -128,6 +133,7 @@ extern int lex_lineno; static struct help helptab[] = { { CMD_ADD, "add", HELP_RES_PROPS, SHELP_ADD, }, { CMD_CANCEL, "cancel", 0, SHELP_CANCEL, }, + { CMD_CLEAR, "clear", HELP_PROPS, SHELP_CLEAR, }, { CMD_COMMIT, "commit", 0, SHELP_COMMIT, }, { CMD_CREATE, "create", 0, SHELP_CREATE, }, { CMD_DELETE, "delete", 0, SHELP_DELETE, }, @@ -163,6 +169,15 @@ static char *res_types[] = { "limitpriv", "bootargs", "brand", + "dedicated-cpu", + "capped-memory", + ALIAS_MAXLWPS, + ALIAS_MAXSHMMEM, + ALIAS_MAXSHMIDS, + ALIAS_MAXMSGIDS, + ALIAS_MAXSEMIDS, + ALIAS_SHARES, + "scheduling-class", NULL }; @@ -189,6 +204,19 @@ static char *prop_types[] = { "limitpriv", "bootargs", "brand", + "ncpus", + "importance", + "swap", + "locked", + ALIAS_SHARES, + ALIAS_MAXLWPS, + ALIAS_MAXSHMMEM, + ALIAS_MAXSHMIDS, + ALIAS_MAXMSGIDS, + ALIAS_MAXSEMIDS, + ALIAS_MAXLOCKEDMEM, + 
ALIAS_MAXSWAP, + "scheduling-class", NULL }; @@ -205,11 +233,12 @@ static char *prop_val_types[] = { /* * remove has a space afterwards because it has qualifiers; the other commands - * that have qualifiers (add, select and set) don't need a space here because + * that have qualifiers (add, select, etc.) don't need a space here because * they have their own _cmds[] lists below. */ static const char *global_scope_cmds[] = { "add", + "clear", "commit", "create", "delete", @@ -233,6 +262,23 @@ static const char *add_cmds[] = { "add rctl", "add attr", "add dataset", + "add dedicated-cpu", + "add capped-memory", + NULL +}; + +static const char *clear_cmds[] = { + "clear autoboot", + "clear pool", + "clear limitpriv", + "clear bootargs", + "clear scheduling-class", + "clear " ALIAS_MAXLWPS, + "clear " ALIAS_MAXSHMMEM, + "clear " ALIAS_MAXSHMIDS, + "clear " ALIAS_MAXMSGIDS, + "clear " ALIAS_MAXSEMIDS, + "clear " ALIAS_SHARES, NULL }; @@ -244,6 +290,8 @@ static const char *remove_cmds[] = { "remove rctl ", "remove attr ", "remove dataset ", + "remove dedicated-cpu ", + "remove capped-memory ", NULL }; @@ -255,6 +303,8 @@ static const char *select_cmds[] = { "select rctl ", "select attr ", "select dataset ", + "select dedicated-cpu", + "select capped-memory", NULL }; @@ -266,6 +316,13 @@ static const char *set_cmds[] = { "set pool=", "set limitpriv=", "set bootargs=", + "set scheduling-class=", + "set " ALIAS_MAXLWPS "=", + "set " ALIAS_MAXSHMMEM "=", + "set " ALIAS_MAXSHMIDS "=", + "set " ALIAS_MAXMSGIDS "=", + "set " ALIAS_MAXSEMIDS "=", + "set " ALIAS_SHARES "=", NULL }; @@ -277,12 +334,22 @@ static const char *info_cmds[] = { "info rctl ", "info attr ", "info dataset ", + "info capped-memory", + "info dedicated-cpu", "info zonename", "info zonepath", "info autoboot", "info pool", "info limitpriv", "info bootargs", + "info brand", + "info scheduling-class", + "info max-lwps", + "info max-shm-memory", + "info max-shm-ids", + "info max-msg-ids", + "info max-sem-ids", + 
"info cpu-shares", NULL }; @@ -298,6 +365,7 @@ static const char *fs_res_scope_cmds[] = { "set raw=", "set special=", "set type=", + "clear raw", NULL }; @@ -366,6 +434,33 @@ static const char *dataset_res_scope_cmds[] = { NULL }; +static const char *pset_res_scope_cmds[] = { + "cancel", + "end", + "exit", + "help", + "info", + "set ncpus=", + "set importance=", + "clear importance", + NULL +}; + +static const char *mcap_res_scope_cmds[] = { + "cancel", + "end", + "exit", + "help", + "info", + "set physical=", + "set swap=", + "set locked=", + "clear physical", + "clear swap", + "clear locked", + NULL +}; + /* Global variables */ /* set early in main(), never modified thereafter, used all over the place */ @@ -406,6 +501,9 @@ static bool got_handle = FALSE; /* initialized in do_interactive(), checked in initialize() */ static bool interactive_mode; +/* set if configuring the global zone */ +static bool global_zone = FALSE; + /* set in main(), checked in multiple places */ static bool read_only_mode; @@ -427,9 +525,13 @@ static struct zone_devtab old_devtab, in_progress_devtab; static struct zone_rctltab old_rctltab, in_progress_rctltab; static struct zone_attrtab old_attrtab, in_progress_attrtab; static struct zone_dstab old_dstab, in_progress_dstab; +static struct zone_psettab old_psettab, in_progress_psettab; +static struct zone_mcaptab old_mcaptab, in_progress_mcaptab; static GetLine *gl; /* The gl_get_line() resource object */ +static void bytes_to_units(char *str, char *buf, int bufsize); + /* Functions begin here */ static bool @@ -469,6 +571,8 @@ CPL_MATCH_FN(cmd_cpl_fn) */ if (strncmp(line, "add ", MAX(MIN(word_end, 4), 1)) == 0) return (add_stuff(cpl, line, add_cmds, word_end)); + if (strncmp(line, "clear ", MAX(MIN(word_end, 6), 2)) == 0) + return (add_stuff(cpl, line, clear_cmds, word_end)); if (strncmp(line, "select ", MAX(MIN(word_end, 7), 3)) == 0) return (add_stuff(cpl, line, select_cmds, word_end)); if (strncmp(line, "set ", MAX(MIN(word_end, 4), 
3)) == 0) @@ -494,6 +598,10 @@ CPL_MATCH_FN(cmd_cpl_fn) return (add_stuff(cpl, line, attr_res_scope_cmds, word_end)); case RT_DATASET: return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end)); + case RT_DCPU: + return (add_stuff(cpl, line, pset_res_scope_cmds, word_end)); + case RT_MCAP: + return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end)); } return (0); } @@ -669,9 +777,8 @@ long_help(int cmd_num) "flag can be used to force the\n\taction.")); case CMD_REMOVE: return (gettext("Remove specified resource from " - "configuration. Note that the curly\n\tbraces " - "('{', '}') mean one or more of whatever " - "is between them.")); + "configuration. The -F flag can be used\n\tto " + "force the action.")); case CMD_SELECT: (void) snprintf(line, sizeof (line), gettext("Selects a resource to modify. " @@ -684,6 +791,8 @@ long_help(int cmd_num) return (line); case CMD_SET: return (gettext("Sets property values.")); + case CMD_CLEAR: + return (gettext("Clears property values.")); case CMD_INFO: return (gettext("Displays information about the " "current configuration. 
If resource\n\ttype is " @@ -870,6 +979,37 @@ usage(bool verbose, uint_t flags) (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), pt_to_str(PT_NAME), gettext("<name>")); break; + case RT_DCPU: + (void) fprintf(fp, gettext("The '%s' resource scope " + "configures the 'pools' facility to dedicate\na " + "subset of the system's processors to this zone " + "while it is running.\n"), + rt_to_str(resource_scope)); + (void) fprintf(fp, gettext("Valid commands:\n")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), + gettext("<unsigned integer | range>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_IMPORTANCE), + gettext("<unsigned integer>")); + break; + case RT_MCAP: + (void) fprintf(fp, gettext("The '%s' resource scope is " + "used to set an upper limit (a cap) on the\n" + "amount of physical memory, swap space and locked " + "memory that can be used by\nthis zone.\n"), + rt_to_str(resource_scope)); + (void) fprintf(fp, gettext("Valid commands:\n")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_PHYSICAL), + gettext("<qualified unsigned decimal>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_SWAP), + gettext("<qualified unsigned decimal>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_LOCKED), + gettext("<qualified unsigned decimal>")); + break; } (void) fprintf(fp, gettext("And from any resource scope, you " "can:\n")); @@ -928,11 +1068,12 @@ usage(bool verbose, uint_t flags) } if (flags & HELP_RESOURCES) { (void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t" - "%s\n\n", + "%s | %s | %s\n\n", gettext("resource type"), rt_to_str(RT_FS), rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE), rt_to_str(RT_RCTL), rt_to_str(RT_ATTR), - rt_to_str(RT_DATASET)); + rt_to_str(RT_DATASET), rt_to_str(RT_DCPU), + rt_to_str(RT_MCAP)); } if (flags & HELP_PROPS) { (void) fprintf(fp, gettext("For resource type ... 
there are " @@ -951,6 +1092,20 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_POOL)); (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), pt_to_str(PT_LIMITPRIV)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_SCHED)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXLWPS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSHMMEM)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSHMIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXMSGIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSEMIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_SHARES)); (void) fprintf(fp, "\t%s\t\t%s, %s, %s, %s\n", rt_to_str(RT_FS), pt_to_str(PT_DIR), pt_to_str(PT_SPECIAL), pt_to_str(PT_RAW), pt_to_str(PT_TYPE), @@ -968,6 +1123,11 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_VALUE)); (void) fprintf(fp, "\t%s\t\t%s\n", rt_to_str(RT_DATASET), pt_to_str(PT_NAME)); + (void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU), + pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE)); + (void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP), + pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP), + pt_to_str(PT_LOCKED)); } if (need_to_close) (void) pclose(fp); @@ -1040,6 +1200,33 @@ initialize(bool handle_expected) " Unable to continue", zone, brandname); exit(Z_ERR); } + } else if (global_zone && err == Z_NO_ZONE && !got_handle && + !read_only_mode) { + /* + * We implicitly create the global zone config if it + * doesn't exist. 
+ */ + zone_dochandle_t tmphandle; + + if ((tmphandle = zonecfg_init_handle()) == NULL) { + zone_perror(execname, Z_NOMEM, TRUE); + exit(Z_ERR); + } + + err = zonecfg_get_template_handle("SUNWblank", zone, + tmphandle); + + if (err != Z_OK) { + zonecfg_fini_handle(tmphandle); + zone_perror("SUNWblank", err, TRUE); + return (err); + } + + need_to_commit = TRUE; + zonecfg_fini_handle(handle); + handle = tmphandle; + got_handle = TRUE; + } else { zone_perror(zone, err, handle_expected || got_handle); if (err == Z_NO_ZONE && !got_handle && @@ -1373,10 +1560,13 @@ export_func(cmd_t *cmd) struct zone_attrtab attrtab; struct zone_rctltab rctltab; struct zone_dstab dstab; + struct zone_psettab psettab; + struct zone_mcaptab mcaptab; struct zone_rctlvaltab *valptr; int err, arg; char zonepath[MAXPATHLEN], outfile[MAXPATHLEN], pool[MAXNAMELEN]; char bootargs[BOOTARGS_MAX]; + char sched[MAXNAMELEN]; char brand[MAXNAMELEN]; char *limitpriv; FILE *of; @@ -1456,6 +1646,10 @@ export_func(cmd_t *cmd) free(limitpriv); } + if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK && + strlen(sched) > 0) + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_SCHED), sched); if ((err = zonecfg_setipdent(handle)) != Z_OK) { zone_perror(zone, err, FALSE); @@ -1576,6 +1770,33 @@ export_func(cmd_t *cmd) } (void) zonecfg_enddsent(handle); + if (zonecfg_getpsetent(handle, &psettab) == Z_OK) { + (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD), + rt_to_str(RT_DCPU)); + if (strcmp(psettab.zone_ncpu_min, psettab.zone_ncpu_max) == 0) + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), psettab.zone_ncpu_max); + else + (void) fprintf(of, "%s %s=%s-%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), psettab.zone_ncpu_min, + psettab.zone_ncpu_max); + if (psettab.zone_importance[0] != '\0') + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_IMPORTANCE), psettab.zone_importance); + (void) fprintf(of, "%s\n", cmd_to_str(CMD_END)); 
+ } + + if (zonecfg_getmcapent(handle, &mcaptab) == Z_OK) { + char buf[128]; + + (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD), + rt_to_str(RT_MCAP)); + bytes_to_units(mcaptab.zone_physmem_cap, buf, sizeof (buf)); + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_PHYSICAL), buf); + (void) fprintf(of, "%s\n", cmd_to_str(CMD_END)); + } + done: if (need_to_close) (void) fclose(of); @@ -1641,6 +1862,10 @@ static void add_resource(cmd_t *cmd) { int type; + struct zone_psettab tmp_psettab; + struct zone_mcaptab tmp_mcaptab; + uint64_t tmp_mcap; + char pool[MAXNAMELEN]; if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { long_usage(CMD_ADD, TRUE); @@ -1667,6 +1892,12 @@ add_resource(cmd_t *cmd) bzero(&in_progress_devtab, sizeof (in_progress_devtab)); return; case RT_RCTL: + if (global_zone) + zerr(gettext("WARNING: Setting a global zone resource " + "control too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); bzero(&in_progress_rctltab, sizeof (in_progress_rctltab)); return; case RT_ATTR: @@ -1675,6 +1906,48 @@ add_resource(cmd_t *cmd) case RT_DATASET: bzero(&in_progress_dstab, sizeof (in_progress_dstab)); return; + case RT_DCPU: + /* Make sure there isn't already a cpu-set entry. */ + if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { + zerr(gettext("The %s resource already exists."), + rt_to_str(RT_DCPU)); + goto bad; + } + + /* Make sure the pool property isn't set. */ + if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK && + strlen(pool) > 0) { + zerr(gettext("The %s property is already set. " + "A persistent pool is incompatible with\nthe %s " + "resource."), + pt_to_str(PT_POOL), rt_to_str(RT_DCPU)); + goto bad; + } + + bzero(&in_progress_psettab, sizeof (in_progress_psettab)); + return; + case RT_MCAP: + /* + * Make sure there isn't already a mem-cap entry or max-swap + * or max-locked rctl. 
+ */ + if (zonecfg_lookup_mcap(handle, &tmp_mcaptab) == Z_OK || + zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &tmp_mcap) + == Z_OK || + zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &tmp_mcap) == Z_OK) { + zerr(gettext("The %s resource or a related resource " + "control already exists."), rt_to_str(RT_MCAP)); + goto bad; + } + if (global_zone) + zerr(gettext("WARNING: Setting a global zone memory " + "cap too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); + bzero(&in_progress_mcaptab, sizeof (in_progress_mcaptab)); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_ADD, TRUE); @@ -1871,6 +2144,30 @@ add_property(cmd_t *cmd) } } +static boolean_t +gz_invalid_resource(int type) +{ + return (global_zone && (type == RT_FS || type == RT_IPD || + type == RT_NET || type == RT_DEVICE || type == RT_ATTR || + type == RT_DATASET)); +} + +static boolean_t +gz_invalid_rt_property(int type) +{ + return (global_zone && (type == RT_ZONENAME || type == RT_ZONEPATH || + type == RT_AUTOBOOT || type == RT_LIMITPRIV || + type == RT_BOOTARGS || type == RT_BRAND || type == RT_SCHED)); +} + +static boolean_t +gz_invalid_property(int type) +{ + return (global_zone && (type == PT_ZONENAME || type == PT_ZONEPATH || + type == PT_AUTOBOOT || type == PT_LIMITPRIV || + type == PT_BOOTARGS || type == PT_BRAND || type == PT_SCHED)); +} + void add_func(cmd_t *cmd) { @@ -1900,6 +2197,13 @@ add_func(cmd_t *cmd) if (initialize(TRUE) != Z_OK) return; if (global_scope) { + if (gz_invalid_resource(cmd->cmd_res_type)) { + zerr(gettext("Cannot add a %s resource to the " + "global zone."), rt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } + global_scope = FALSE; resource_scope = cmd->cmd_res_type; end_op = CMD_ADD; @@ -2273,26 +2577,85 @@ fill_in_dstab(cmd_t *cmd, struct zone_dstab *dstab, bool fill_in_only) } static void -remove_resource(cmd_t 
*cmd) +remove_aliased_rctl(int type, char *name) { - int err, type; - struct zone_fstab fstab; - struct zone_nwiftab nwiftab; - struct zone_devtab devtab; - struct zone_attrtab attrtab; - struct zone_rctltab rctltab; - struct zone_dstab dstab; + int err; + uint64_t tmp; - if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { - long_usage(CMD_REMOVE, TRUE); + if ((err = zonecfg_get_aliased_rctl(handle, name, &tmp)) != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type), + zonecfg_strerror(err)); + saw_error = TRUE; return; } + if ((err = zonecfg_rm_aliased_rctl(handle, name)) != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type), + zonecfg_strerror(err)); + saw_error = TRUE; + } else { + need_to_commit = TRUE; + } +} - if (initialize(TRUE) != Z_OK) - return; +static boolean_t +prompt_remove_resource(cmd_t *cmd, char *rsrc) +{ + int num; + int answer; + int arg; + boolean_t force = B_FALSE; + char prompt[128]; + + optind = 0; + while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "F")) != EOF) { + switch (arg) { + case 'F': + force = B_TRUE; + break; + default: + return (B_FALSE); + } + } + + num = zonecfg_num_resources(handle, rsrc); + + if (num == 0) { + z_cmd_rt_perror(CMD_REMOVE, cmd->cmd_res_type, Z_NO_ENTRY, + TRUE); + return (B_FALSE); + } + if (num > 1 && !force) { + if (!interactive_mode) { + zerr(gettext("There are multiple instances of this " + "resource. 
Either qualify the resource to\n" + "remove a single instance or use the -F option to " + "remove all instances.")); + saw_error = TRUE; + return (B_FALSE); + } + (void) snprintf(prompt, sizeof (prompt), gettext( + "Are you sure you want to remove ALL '%s' resources"), + rsrc); + answer = ask_yesno(FALSE, prompt); + if (answer == -1) { + zerr(gettext("Resource incomplete.")); + return (B_FALSE); + } + if (answer != 1) + return (B_FALSE); + } + return (B_TRUE); +} + +static void +remove_fs(cmd_t *cmd) +{ + int err; + + /* traditional, qualified fs removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_fstab fstab; - switch (type) { - case RT_FS: if ((err = fill_in_fstab(cmd, &fstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE); return; @@ -2303,13 +2666,36 @@ remove_resource(cmd_t *cmd) need_to_commit = TRUE; zonecfg_free_fs_option_list(fstab.zone_fs_options); return; - case RT_IPD: - if (state_atleast(ZONE_STATE_INSTALLED)) { - zerr(gettext("Zone %s already installed; %s %s not " - "allowed."), zone, cmd_to_str(CMD_REMOVE), - rt_to_str(RT_IPD)); - return; - } + } + + /* + * unqualified fs removal. remove all fs's but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "fs")) + return; + + if ((err = zonecfg_del_all_resources(handle, "fs")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_ipd(cmd_t *cmd) +{ + int err; + + if (state_atleast(ZONE_STATE_INSTALLED)) { + zerr(gettext("Zone %s already installed; %s %s not allowed."), + zone, cmd_to_str(CMD_REMOVE), rt_to_str(RT_IPD)); + return; + } + + /* traditional, qualified ipd removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_fstab fstab; + if ((err = fill_in_ipdtab(cmd, &fstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE); return; @@ -2319,7 +2705,31 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_NET: + } + + /* + * unqualified ipd removal. 
remove all ipds but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "inherit-pkg-dir")) + return; + + if ((err = zonecfg_del_all_resources(handle, "inherit-pkg-dir")) + != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_net(cmd_t *cmd) +{ + int err; + + /* traditional, qualified net removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_nwiftab nwiftab; + if ((err = fill_in_nwiftab(cmd, &nwiftab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE); return; @@ -2329,7 +2739,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_DEVICE: + } + + /* + * unqualified net removal. remove all nets but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "net")) + return; + + if ((err = zonecfg_del_all_resources(handle, "net")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_device(cmd_t *cmd) +{ + int err; + + /* traditional, qualified device removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_devtab devtab; + if ((err = fill_in_devtab(cmd, &devtab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE); return; @@ -2339,18 +2772,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_RCTL: - if ((err = fill_in_rctltab(cmd, &rctltab, FALSE)) != Z_OK) { - z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); - return; - } - if ((err = zonecfg_delete_rctl(handle, &rctltab)) != Z_OK) - z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); - else - need_to_commit = TRUE; - zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr); + } + + /* + * unqualified device removal. remove all devices but prompt if more + * than one. 
+ */ + if (!prompt_remove_resource(cmd, "device")) return; - case RT_ATTR: + + if ((err = zonecfg_del_all_resources(handle, "device")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_attr(cmd_t *cmd) +{ + int err; + + /* traditional, qualified attr removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_attrtab attrtab; + if ((err = fill_in_attrtab(cmd, &attrtab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE); return; @@ -2360,7 +2805,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_DATASET: + } + + /* + * unqualified attr removal. remove all attrs but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "attr")) + return; + + if ((err = zonecfg_del_all_resources(handle, "attr")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_dataset(cmd_t *cmd) +{ + int err; + + /* traditional, qualified dataset removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_dstab dstab; + if ((err = fill_in_dstab(cmd, &dstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE); return; @@ -2370,6 +2838,177 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; + } + + /* + * unqualified dataset removal. remove all datasets but prompt if more + * than one. 
+ */ + if (!prompt_remove_resource(cmd, "dataset")) + return; + + if ((err = zonecfg_del_all_resources(handle, "dataset")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_rctl(cmd_t *cmd) +{ + int err; + + /* traditional, qualified rctl removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_rctltab rctltab; + + if ((err = fill_in_rctltab(cmd, &rctltab, FALSE)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + return; + } + if ((err = zonecfg_delete_rctl(handle, &rctltab)) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + else + need_to_commit = TRUE; + zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr); + return; + } + + /* + * unqualified rctl removal. remove all rctls but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "rctl")) + return; + + if ((err = zonecfg_del_all_resources(handle, "rctl")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_pset() +{ + int err; + struct zone_psettab psettab; + + if ((err = zonecfg_lookup_pset(handle, &psettab)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_DCPU, err, TRUE); + return; + } + if ((err = zonecfg_delete_pset(handle)) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DCPU, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_mcap() +{ + int err, res1, res2, res3; + uint64_t tmp; + struct zone_mcaptab mcaptab; + boolean_t revert = B_FALSE; + + res1 = zonecfg_lookup_mcap(handle, &mcaptab); + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &tmp); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, &tmp); + + /* if none of these exist, there is no resource to remove */ + if (res1 != Z_OK && res2 != Z_OK && res3 != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_MCAP), + zonecfg_strerror(Z_NO_RESOURCE_TYPE)); + saw_error = TRUE; + return; + } + if (res1 == Z_OK) { + if ((err = 
zonecfg_delete_mcap(handle)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + if (res2 == Z_OK) { + if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXSWAP)) + != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + if (res3 == Z_OK) { + if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM)) + != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + + if (revert) + need_to_commit = FALSE; +} + +static void +remove_resource(cmd_t *cmd) +{ + int type; + int arg; + + if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { + long_usage(CMD_REMOVE, TRUE); + return; + } + + optind = 0; + while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?F")) != EOF) { + switch (arg) { + case '?': + longer_usage(CMD_REMOVE); + return; + case 'F': + break; + default: + short_usage(CMD_REMOVE); + return; + } + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (type) { + case RT_FS: + remove_fs(cmd); + return; + case RT_IPD: + remove_ipd(cmd); + return; + case RT_NET: + remove_net(cmd); + return; + case RT_DEVICE: + remove_device(cmd); + return; + case RT_RCTL: + remove_rctl(cmd); + return; + case RT_ATTR: + remove_attr(cmd); + return; + case RT_DATASET: + remove_dataset(cmd); + return; + case RT_DCPU: + remove_pset(); + return; + case RT_MCAP: + remove_mcap(); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_REMOVE, TRUE); @@ -2513,16 +3152,175 @@ remove_func(cmd_t *cmd) assert(cmd != NULL); - if (global_scope) + if (global_scope) { + if (gz_invalid_resource(cmd->cmd_res_type)) { + zerr(gettext("%s is not a valid resource for the " + "global zone."), rt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } remove_resource(cmd); - else + } else { remove_property(cmd); + } +} + +static void +clear_property(cmd_t *cmd) 
+{ + int res_type, prop_type; + + res_type = resource_scope; + prop_type = cmd->cmd_res_type; + if (res_type == RT_UNKNOWN || prop_type == PT_UNKNOWN) { + long_usage(CMD_CLEAR, TRUE); + return; + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (res_type) { + case RT_FS: + if (prop_type == PT_RAW) { + in_progress_fstab.zone_fs_raw[0] = '\0'; + need_to_commit = TRUE; + return; + } + break; + case RT_DCPU: + if (prop_type == PT_IMPORTANCE) { + in_progress_psettab.zone_importance[0] = '\0'; + need_to_commit = TRUE; + return; + } + break; + case RT_MCAP: + switch (prop_type) { + case PT_PHYSICAL: + in_progress_mcaptab.zone_physmem_cap[0] = '\0'; + need_to_commit = TRUE; + return; + case PT_SWAP: + remove_aliased_rctl(PT_SWAP, ALIAS_MAXSWAP); + return; + case PT_LOCKED: + remove_aliased_rctl(PT_LOCKED, ALIAS_MAXLOCKEDMEM); + return; + } + break; + default: + break; + } + + zone_perror(pt_to_str(prop_type), Z_CLEAR_DISALLOW, TRUE); +} + +static void +clear_global(cmd_t *cmd) +{ + int err, type; + + if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { + long_usage(CMD_CLEAR, TRUE); + return; + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (type) { + case PT_ZONENAME: + /* FALLTHRU */ + case PT_ZONEPATH: + /* FALLTHRU */ + case PT_BRAND: + zone_perror(pt_to_str(type), Z_CLEAR_DISALLOW, TRUE); + return; + case PT_AUTOBOOT: + /* false is default; we'll treat as equivalent to clearing */ + if ((err = zonecfg_set_autoboot(handle, B_FALSE)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_AUTOBOOT, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_POOL: + if ((err = zonecfg_set_pool(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_POOL, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_LIMITPRIV: + if ((err = zonecfg_set_limitpriv(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_LIMITPRIV, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_BOOTARGS: + if ((err = zonecfg_set_bootargs(handle, NULL)) != Z_OK) + 
z_cmd_rt_perror(CMD_CLEAR, RT_BOOTARGS, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_SCHED: + if ((err = zonecfg_set_sched(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_SCHED, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_MAXLWPS: + remove_aliased_rctl(PT_MAXLWPS, ALIAS_MAXLWPS); + return; + case PT_MAXSHMMEM: + remove_aliased_rctl(PT_MAXSHMMEM, ALIAS_MAXSHMMEM); + return; + case PT_MAXSHMIDS: + remove_aliased_rctl(PT_MAXSHMIDS, ALIAS_MAXSHMIDS); + return; + case PT_MAXMSGIDS: + remove_aliased_rctl(PT_MAXMSGIDS, ALIAS_MAXMSGIDS); + return; + case PT_MAXSEMIDS: + remove_aliased_rctl(PT_MAXSEMIDS, ALIAS_MAXSEMIDS); + return; + case PT_SHARES: + remove_aliased_rctl(PT_SHARES, ALIAS_SHARES); + return; + default: + zone_perror(pt_to_str(type), Z_NO_PROPERTY_TYPE, TRUE); + long_usage(CMD_CLEAR, TRUE); + usage(FALSE, HELP_PROPS); + return; + } +} + +void +clear_func(cmd_t *cmd) +{ + if (zone_is_read_only(CMD_CLEAR)) + return; + + assert(cmd != NULL); + + if (global_scope) { + if (gz_invalid_property(cmd->cmd_res_type)) { + zerr(gettext("%s is not a valid property for the " + "global zone."), pt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } + + clear_global(cmd); + } else { + clear_property(cmd); + } } void select_func(cmd_t *cmd) { - int type, err; + int type, err, res; + uint64_t limit; if (zone_is_read_only(CMD_SELECT)) return; @@ -2612,6 +3410,32 @@ select_func(cmd_t *cmd) bcopy(&old_dstab, &in_progress_dstab, sizeof (struct zone_dstab)); return; + case RT_DCPU: + if ((err = zonecfg_lookup_pset(handle, &old_psettab)) != Z_OK) { + z_cmd_rt_perror(CMD_SELECT, RT_DCPU, err, TRUE); + global_scope = TRUE; + } + bcopy(&old_psettab, &in_progress_psettab, + sizeof (struct zone_psettab)); + return; + case RT_MCAP: + /* if none of these exist, there is no resource to select */ + if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK && + zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &limit) + != Z_OK && + 
zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, &limit) + != Z_OK) { + z_cmd_rt_perror(CMD_SELECT, RT_MCAP, Z_NO_RESOURCE_TYPE, + TRUE); + global_scope = TRUE; + } + if (res == Z_OK) + bcopy(&old_mcaptab, &in_progress_mcaptab, + sizeof (struct zone_mcaptab)); + else + bzero(&in_progress_mcaptab, + sizeof (in_progress_mcaptab)); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_SELECT, TRUE); @@ -2731,6 +3555,49 @@ valid_fs_type(const char *type) return (B_TRUE); } +static void +set_aliased_rctl(char *alias, int prop_type, char *s) +{ + uint64_t limit; + int err; + char tmp[128]; + + if (global_zone && strcmp(alias, ALIAS_SHARES) != 0) + zerr(gettext("WARNING: Setting a global zone resource " + "control too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); + + /* convert memory based properties */ + if (prop_type == PT_MAXSHMMEM) { + if (!zonecfg_valid_memlimit(s, &limit)) { + zerr(gettext("A non-negative number with a required " + "scale suffix (K, M, G or T) was expected\nhere.")); + saw_error = TRUE; + return; + } + + (void) snprintf(tmp, sizeof (tmp), "%llu", limit); + s = tmp; + } + + if (!zonecfg_aliased_rctl_ok(handle, alias)) { + zone_perror(pt_to_str(prop_type), Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + } else if (!zonecfg_valid_alias_limit(alias, s, &limit)) { + zerr(gettext("%s property is out of range."), + pt_to_str(prop_type)); + saw_error = TRUE; + } else if ((err = zonecfg_set_aliased_rctl(handle, alias, limit)) + != Z_OK) { + zone_perror(zone, err, TRUE); + saw_error = TRUE; + } else { + need_to_commit = TRUE; + } +} + void set_func(cmd_t *cmd) { @@ -2739,6 +3606,9 @@ set_func(cmd_t *cmd) property_value_ptr_t pp; boolean_t autoboot; boolean_t force_set = FALSE; + size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap); + uint64_t mem_cap, mem_limit; + struct zone_psettab tmp_psettab; if 
(zone_is_read_only(CMD_SET)) return; @@ -2762,6 +3632,13 @@ set_func(cmd_t *cmd) prop_type = cmd->cmd_prop_name[0]; if (global_scope) { + if (gz_invalid_property(prop_type)) { + zerr(gettext("%s is not a valid property for the " + "global zone."), pt_to_str(prop_type)); + saw_error = TRUE; + return; + } + if (prop_type == PT_ZONENAME) { res_type = RT_ZONENAME; } else if (prop_type == PT_ZONEPATH) { @@ -2776,6 +3653,20 @@ set_func(cmd_t *cmd) res_type = RT_LIMITPRIV; } else if (prop_type == PT_BOOTARGS) { res_type = RT_BOOTARGS; + } else if (prop_type == PT_SCHED) { + res_type = RT_SCHED; + } else if (prop_type == PT_MAXLWPS) { + res_type = RT_MAXLWPS; + } else if (prop_type == PT_MAXSHMMEM) { + res_type = RT_MAXSHMMEM; + } else if (prop_type == PT_MAXSHMIDS) { + res_type = RT_MAXSHMIDS; + } else if (prop_type == PT_MAXMSGIDS) { + res_type = RT_MAXMSGIDS; + } else if (prop_type == PT_MAXSEMIDS) { + res_type = RT_MAXSEMIDS; + } else if (prop_type == PT_SHARES) { + res_type = RT_SHARES; } else { zerr(gettext("Cannot set a resource-specific property " "from the global scope.")); @@ -2899,6 +3790,24 @@ set_func(cmd_t *cmd) need_to_commit = TRUE; return; case RT_POOL: + /* don't allow use of the reserved temporary pool names */ + if (strncmp("SUNW", prop_id, 4) == 0) { + zerr(gettext("pool names starting with SUNW are " + "reserved.")); + saw_error = TRUE; + return; + } + + /* can't set pool if dedicated-cpu exists */ + if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { + zerr(gettext("The %s resource already exists. 
" + "A persistent pool is incompatible\nwith the %s " + "resource."), rt_to_str(RT_DCPU), + rt_to_str(RT_DCPU)); + saw_error = TRUE; + return; + } + if ((err = zonecfg_set_pool(handle, prop_id)) != Z_OK) zone_perror(zone, err, TRUE); else @@ -2916,6 +3825,30 @@ set_func(cmd_t *cmd) else need_to_commit = TRUE; return; + case RT_SCHED: + if ((err = zonecfg_set_sched(handle, prop_id)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + return; + case RT_MAXLWPS: + set_aliased_rctl(ALIAS_MAXLWPS, prop_type, prop_id); + return; + case RT_MAXSHMMEM: + set_aliased_rctl(ALIAS_MAXSHMMEM, prop_type, prop_id); + return; + case RT_MAXSHMIDS: + set_aliased_rctl(ALIAS_MAXSHMIDS, prop_type, prop_id); + return; + case RT_MAXMSGIDS: + set_aliased_rctl(ALIAS_MAXMSGIDS, prop_type, prop_id); + return; + case RT_MAXSEMIDS: + set_aliased_rctl(ALIAS_MAXSEMIDS, prop_type, prop_id); + return; + case RT_SHARES: + set_aliased_rctl(ALIAS_SHARES, prop_type, prop_id); + return; case RT_FS: switch (prop_type) { case PT_DIR: @@ -3095,6 +4028,146 @@ set_func(cmd_t *cmd) long_usage(CMD_SET, TRUE); usage(FALSE, HELP_PROPS); return; + case RT_DCPU: + switch (prop_type) { + char *lowp, *highp; + + case PT_NCPUS: + lowp = prop_id; + if ((highp = strchr(prop_id, '-')) != NULL) + *highp++ = '\0'; + else + highp = lowp; + + /* Make sure the input makes sense. */ + if (!zonecfg_valid_ncpus(lowp, highp)) { + zerr(gettext("%s property is out of range."), + pt_to_str(PT_NCPUS)); + saw_error = TRUE; + return; + } + + (void) strlcpy( + in_progress_psettab.zone_ncpu_min, lowp, + sizeof (in_progress_psettab.zone_ncpu_min)); + (void) strlcpy( + in_progress_psettab.zone_ncpu_max, highp, + sizeof (in_progress_psettab.zone_ncpu_max)); + return; + case PT_IMPORTANCE: + /* Make sure the value makes sense. 
*/ + if (!zonecfg_valid_importance(prop_id)) { + zerr(gettext("%s property is out of range."), + pt_to_str(PT_IMPORTANCE)); + saw_error = TRUE; + return; + } + + (void) strlcpy(in_progress_psettab.zone_importance, + prop_id, + sizeof (in_progress_psettab.zone_importance)); + return; + default: + break; + } + zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, TRUE); + long_usage(CMD_SET, TRUE); + usage(FALSE, HELP_PROPS); + return; + case RT_MCAP: + switch (prop_type) { + case PT_PHYSICAL: + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A positive number with a " + "required scale suffix (K, M, G or T) was " + "expected here.")); + saw_error = TRUE; + } else if (mem_cap < ONE_MB) { + zerr(gettext("%s value is too small. It must " + "be at least 1M."), pt_to_str(PT_PHYSICAL)); + saw_error = TRUE; + } else { + snprintf(in_progress_mcaptab.zone_physmem_cap, + physmem_size, "%llu", mem_cap); + } + break; + case PT_SWAP: + /* + * We have to check if an rctl is allowed here since + * there might already be a rctl defined that blocks + * the alias. + */ + if (!zonecfg_aliased_rctl_ok(handle, ALIAS_MAXSWAP)) { + zone_perror(pt_to_str(PT_MAXSWAP), + Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + return; + } + + if (global_zone) + mem_limit = ONE_MB * 100; + else + mem_limit = ONE_MB * 50; + + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A positive number with a " + "required scale suffix (K, M, G or T) was " + "expected here.")); + saw_error = TRUE; + } else if (mem_cap < mem_limit) { + char buf[128]; + + (void) snprintf(buf, sizeof (buf), "%llu", + mem_limit); + bytes_to_units(buf, buf, sizeof (buf)); + zerr(gettext("%s value is too small. 
It must " + "be at least %s."), pt_to_str(PT_SWAP), + buf); + saw_error = TRUE; + } else { + if ((err = zonecfg_set_aliased_rctl(handle, + ALIAS_MAXSWAP, mem_cap)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + } + break; + case PT_LOCKED: + /* + * We have to check if an rctl is allowed here since + * there might already be a rctl defined that blocks + * the alias. + */ + if (!zonecfg_aliased_rctl_ok(handle, + ALIAS_MAXLOCKEDMEM)) { + zone_perror(pt_to_str(PT_LOCKED), + Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + return; + } + + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A non-negative number with a " + "required scale suffix (K, M, G or T) was " + "expected\nhere.")); + saw_error = TRUE; + } else { + if ((err = zonecfg_set_aliased_rctl(handle, + ALIAS_MAXLOCKEDMEM, mem_cap)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + } + break; + default: + zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, + TRUE); + long_usage(CMD_SET, TRUE); + usage(FALSE, HELP_PROPS); + return; + } + + return; default: zone_perror(rt_to_str(res_type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_SET, TRUE); @@ -3110,7 +4183,11 @@ output_prop(FILE *fp, int pnum, char *pval, bool print_notspec) if (*pval != '\0') { qstr = quoteit(pval); - (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr); + if (pnum == PT_SWAP || pnum == PT_LOCKED) + (void) fprintf(fp, "\t[%s: %s]\n", pt_to_str(pnum), + qstr); + else + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr); free(qstr); } else if (print_notspec) (void) fprintf(fp, gettext("\t%s not specified\n"), @@ -3213,6 +4290,20 @@ info_bootargs(zone_dochandle_t handle, FILE *fp) } static void +info_sched(zone_dochandle_t handle, FILE *fp) +{ + char sched[MAXNAMELEN]; + int err; + + if ((err = zonecfg_get_sched_class(handle, sched, sizeof (sched))) + == Z_OK) { + (void) fprintf(fp, "%s: %s\n", pt_to_str(PT_SCHED), sched); + } else { + zone_perror(zone, err, TRUE); + 
} +} + +static void output_fs(FILE *fp, struct zone_fstab *fstab) { zone_fsopt_t *this; @@ -3499,7 +4590,7 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd) struct zone_dstab lookup, user; bool output = FALSE; - if (zonecfg_setdevent(handle) != Z_OK) + if (zonecfg_setdsent(handle) != Z_OK) return; while (zonecfg_getdsent(handle, &lookup) == Z_OK) { if (cmd->cmd_prop_nv_pairs == 0) { @@ -3525,12 +4616,132 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd) rt_to_str(RT_DATASET)); } +static void +output_pset(FILE *fp, struct zone_psettab *psettab) +{ + (void) fprintf(fp, "%s:\n", rt_to_str(RT_DCPU)); + if (strcmp(psettab->zone_ncpu_min, psettab->zone_ncpu_max) == 0) + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_NCPUS), + psettab->zone_ncpu_max); + else + (void) fprintf(fp, "\t%s: %s-%s\n", pt_to_str(PT_NCPUS), + psettab->zone_ncpu_min, psettab->zone_ncpu_max); + if (psettab->zone_importance[0] != '\0') + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_IMPORTANCE), + psettab->zone_importance); +} + +static void +info_pset(zone_dochandle_t handle, FILE *fp) +{ + struct zone_psettab lookup; + + if (zonecfg_getpsetent(handle, &lookup) == Z_OK) + output_pset(fp, &lookup); +} + +static void +info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias) +{ + uint64_t limit; + + if (zonecfg_get_aliased_rctl(handle, alias, &limit) == Z_OK) { + /* convert memory based properties */ + if (strcmp(alias, ALIAS_MAXSHMMEM) == 0) { + char buf[128]; + + (void) snprintf(buf, sizeof (buf), "%llu", limit); + bytes_to_units(buf, buf, sizeof (buf)); + (void) fprintf(fp, "[%s: %s]\n", alias, buf); + return; + } + + (void) fprintf(fp, "[%s: %llu]\n", alias, limit); + } +} + +static void +bytes_to_units(char *str, char *buf, int bufsize) +{ + unsigned long long num; + unsigned long long save = 0; + char *units = "BKMGT"; + char *up = units; + + num = strtoll(str, NULL, 10); + + if (num < 1024) { + (void) snprintf(buf, bufsize, "%llu", num); + return; + } + + while ((num >= 
1024) && (*up != 'T')) { + up++; /* next unit of measurement */ + save = num; + num = (num + 512) >> 10; + } + + /* check if we should output a fraction. snprintf will round for us */ + if (save % 1024 != 0 && ((save >> 10) < 10)) + (void) snprintf(buf, bufsize, "%2.1f%c", ((float)save / 1024), + *up); + else + (void) snprintf(buf, bufsize, "%llu%c", num, *up); +} + +static void +output_mcap(FILE *fp, struct zone_mcaptab *mcaptab, int showswap, + uint64_t maxswap, int showlocked, uint64_t maxlocked) +{ + char buf[128]; + + (void) fprintf(fp, "%s:\n", rt_to_str(RT_MCAP)); + if (mcaptab->zone_physmem_cap[0] != '\0') { + bytes_to_units(mcaptab->zone_physmem_cap, buf, sizeof (buf)); + output_prop(fp, PT_PHYSICAL, buf, B_TRUE); + } + + if (showswap == Z_OK) { + (void) snprintf(buf, sizeof (buf), "%llu", maxswap); + bytes_to_units(buf, buf, sizeof (buf)); + output_prop(fp, PT_SWAP, buf, B_TRUE); + } + + if (showlocked == Z_OK) { + (void) snprintf(buf, sizeof (buf), "%llu", maxlocked); + bytes_to_units(buf, buf, sizeof (buf)); + output_prop(fp, PT_LOCKED, buf, B_TRUE); + } +} + +static void +info_mcap(zone_dochandle_t handle, FILE *fp) +{ + int res1, res2, res3; + uint64_t swap_limit; + uint64_t locked_limit; + struct zone_mcaptab lookup; + + bzero(&lookup, sizeof (lookup)); + res1 = zonecfg_getmcapent(handle, &lookup); + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &swap_limit); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &locked_limit); + + if (res1 == Z_OK || res2 == Z_OK || res3 == Z_OK) + output_mcap(fp, &lookup, res2, swap_limit, res3, locked_limit); +} + void info_func(cmd_t *cmd) { FILE *fp = stdout; bool need_to_close = FALSE; char *pager; + int type; + int res1, res2; + uint64_t swap_limit; + uint64_t locked_limit; assert(cmd != NULL); @@ -3569,26 +4780,68 @@ info_func(cmd_t *cmd) case RT_DATASET: output_ds(fp, &in_progress_dstab); break; + case RT_DCPU: + output_pset(fp, &in_progress_psettab); + break; + case RT_MCAP: + res1 = 
zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, + &swap_limit); + res2 = zonecfg_get_aliased_rctl(handle, + ALIAS_MAXLOCKEDMEM, &locked_limit); + output_mcap(fp, &in_progress_mcaptab, res1, swap_limit, + res2, locked_limit); + break; } goto cleanup; } + type = cmd->cmd_res_type; + + if (gz_invalid_rt_property(type)) { + zerr(gettext("%s is not a valid property for the global zone."), + rt_to_str(type)); + goto cleanup; + } + + if (gz_invalid_resource(type)) { + zerr(gettext("%s is not a valid resource for the global zone."), + rt_to_str(type)); + goto cleanup; + } + switch (cmd->cmd_res_type) { case RT_UNKNOWN: info_zonename(handle, fp); - info_zonepath(handle, fp); - info_brand(handle, fp); - info_autoboot(handle, fp); - info_bootargs(handle, fp); + if (!global_zone) { + info_zonepath(handle, fp); + info_brand(handle, fp); + info_autoboot(handle, fp); + info_bootargs(handle, fp); + } info_pool(handle, fp); - info_limitpriv(handle, fp); - info_ipd(handle, fp, cmd); - info_fs(handle, fp, cmd); - info_net(handle, fp, cmd); - info_dev(handle, fp, cmd); + if (!global_zone) { + info_limitpriv(handle, fp); + info_sched(handle, fp); + } + info_aliased_rctl(handle, fp, ALIAS_MAXLWPS); + info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM); + info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS); + info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS); + info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS); + info_aliased_rctl(handle, fp, ALIAS_SHARES); + if (!global_zone) { + info_ipd(handle, fp, cmd); + info_fs(handle, fp, cmd); + info_net(handle, fp, cmd); + info_dev(handle, fp, cmd); + } + info_pset(handle, fp); + info_mcap(handle, fp); + if (!global_zone) { + info_attr(handle, fp, cmd); + info_ds(handle, fp, cmd); + } info_rctl(handle, fp, cmd); - info_attr(handle, fp, cmd); - info_ds(handle, fp, cmd); break; case RT_ZONENAME: info_zonename(handle, fp); @@ -3611,6 +4864,27 @@ info_func(cmd_t *cmd) case RT_BOOTARGS: info_bootargs(handle, fp); break; + case RT_SCHED: + info_sched(handle, fp); + 
break; + case RT_MAXLWPS: + info_aliased_rctl(handle, fp, ALIAS_MAXLWPS); + break; + case RT_MAXSHMMEM: + info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM); + break; + case RT_MAXSHMIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS); + break; + case RT_MAXMSGIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS); + break; + case RT_MAXSEMIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS); + break; + case RT_SHARES: + info_aliased_rctl(handle, fp, ALIAS_SHARES); + break; case RT_FS: info_fs(handle, fp, cmd); break; @@ -3632,6 +4906,12 @@ info_func(cmd_t *cmd) case RT_DATASET: info_ds(handle, fp, cmd); break; + case RT_DCPU: + info_pset(handle, fp); + break; + case RT_MCAP: + info_mcap(handle, fp); + break; default: zone_perror(rt_to_str(cmd->cmd_res_type), Z_NO_RESOURCE_TYPE, TRUE); @@ -3765,10 +5045,13 @@ verify_func(cmd_t *cmd) struct zone_attrtab attrtab; struct zone_rctltab rctltab; struct zone_dstab dstab; + struct zone_psettab psettab; char zonepath[MAXPATHLEN]; + char sched[MAXNAMELEN]; char brand[MAXNAMELEN]; int err, ret_val = Z_OK, arg; bool save = FALSE; + boolean_t has_cpu_shares = B_FALSE; optind = 0; if ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) { @@ -3796,12 +5079,13 @@ verify_func(cmd_t *cmd) if (initialize(TRUE) != Z_OK) return; - if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK) { + if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK && + !global_zone) { zerr(gettext("%s not specified"), pt_to_str(PT_ZONEPATH)); ret_val = Z_REQD_RESOURCE_MISSING; saw_error = TRUE; } - if (strlen(zonepath) == 0) { + if (strlen(zonepath) == 0 && !global_zone) { zerr(gettext("%s cannot be empty."), pt_to_str(PT_ZONEPATH)); ret_val = Z_REQD_RESOURCE_MISSING; saw_error = TRUE; @@ -3861,6 +5145,9 @@ verify_func(cmd_t *cmd) check_reqd_prop(rctltab.zone_rctl_name, RT_RCTL, PT_NAME, &ret_val); + if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0) + has_cpu_shares = B_TRUE; + if (rctltab.zone_rctl_valptr == 
NULL) { zerr(gettext("%s: no %s specified"), rt_to_str(RT_RCTL), pt_to_str(PT_VALUE)); @@ -3873,6 +5160,25 @@ verify_func(cmd_t *cmd) } (void) zonecfg_endrctlent(handle); + if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) { + zerr(gettext("%s zone.cpu-shares and %s are incompatible."), + rt_to_str(RT_RCTL), rt_to_str(RT_DCPU)); + saw_error = TRUE; + if (ret_val == Z_OK) + ret_val = Z_INCOMPATIBLE; + } + + if (has_cpu_shares && zonecfg_get_sched_class(handle, sched, + sizeof (sched)) == Z_OK && strlen(sched) > 0 && + strcmp(sched, "FSS") != 0) { + zerr(gettext("WARNING: %s zone.cpu-shares and %s=%s are " + "incompatible"), + rt_to_str(RT_RCTL), rt_to_str(RT_SCHED), sched); + saw_error = TRUE; + if (ret_val == Z_OK) + ret_val = Z_INCOMPATIBLE; + } + if ((err = zonecfg_setattrent(handle)) != Z_OK) { zone_perror(zone, err, TRUE); return; @@ -4061,7 +5367,9 @@ end_func(cmd_t *cmd) struct zone_rctltab tmp_rctltab; struct zone_attrtab tmp_attrtab; struct zone_dstab tmp_dstab; - int err, arg; + int err, arg, res1, res2, res3; + uint64_t swap_limit; + uint64_t locked_limit; assert(cmd != NULL); @@ -4361,6 +5669,73 @@ end_func(cmd_t *cmd) &in_progress_dstab); } break; + case RT_DCPU: + /* Make sure everything was filled in. */ + if (end_check_reqd(in_progress_psettab.zone_ncpu_min, + PT_NCPUS, &validation_failed) != Z_OK) { + saw_error = TRUE; + return; + } + + if (end_op == CMD_ADD) { + err = zonecfg_add_pset(handle, &in_progress_psettab); + } else { + err = zonecfg_modify_pset(handle, &in_progress_psettab); + } + break; + case RT_MCAP: + /* Make sure everything was filled in. */ + res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ? + Z_ERR : Z_OK; + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, + &swap_limit); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &locked_limit); + + if (res1 != Z_OK && res2 != Z_OK && res3 != Z_OK) { + zerr(gettext("No property was specified. 
One of %s, " + "%s or %s is required."), pt_to_str(PT_PHYSICAL), + pt_to_str(PT_SWAP), pt_to_str(PT_LOCKED)); + saw_error = TRUE; + return; + } + + /* if phys & locked are both set, verify locked <= phys */ + if (res1 == Z_OK && res3 == Z_OK) { + uint64_t phys_limit; + char *endp; + + phys_limit = strtoull( + in_progress_mcaptab.zone_physmem_cap, &endp, 10); + if (phys_limit < locked_limit) { + zerr(gettext("The %s cap must be less than or " + "equal to the %s cap."), + pt_to_str(PT_LOCKED), + pt_to_str(PT_PHYSICAL)); + saw_error = TRUE; + return; + } + } + + err = Z_OK; + if (res1 == Z_OK) { + /* + * We could be ending from either an add operation + * or a select operation. Since all of the properties + * within this resource are optional, we always use + * modify on the mcap entry. zonecfg_modify_mcap() + * will handle both adding and modifying a memory cap. + */ + err = zonecfg_modify_mcap(handle, &in_progress_mcaptab); + } else if (end_op == CMD_SELECT) { + /* + * If we're ending from a select and the physical + * memory cap is empty then the user could have cleared + * the physical cap value, so try to delete the entry. 
+ */ + (void) zonecfg_delete_mcap(handle); + } + break; default: zone_perror(rt_to_str(resource_scope), Z_NO_RESOURCE_TYPE, TRUE); @@ -4885,7 +6260,9 @@ main(int argc, char *argv[]) zonecfg_set_root(optarg); break; case 'z': - if (zonecfg_validate_zonename(optarg) != Z_OK) { + if (strcmp(optarg, GLOBAL_ZONENAME) == 0) { + global_zone = TRUE; + } else if (zonecfg_validate_zonename(optarg) != Z_OK) { zone_perror(optarg, Z_BOGUS_ZONE_NAME, TRUE); usage(FALSE, HELP_SYNTAX); exit(Z_USAGE); diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h index 6e153d40c1..64808e9623 100644 --- a/usr/src/cmd/zonecfg/zonecfg.h +++ b/usr/src/cmd/zonecfg/zonecfg.h @@ -50,19 +50,20 @@ typedef int bool; #define CMD_ADD 0 #define CMD_CANCEL 1 -#define CMD_COMMIT 2 -#define CMD_CREATE 3 -#define CMD_DELETE 4 -#define CMD_END 5 -#define CMD_EXIT 6 -#define CMD_EXPORT 7 -#define CMD_HELP 8 -#define CMD_INFO 9 -#define CMD_REMOVE 10 -#define CMD_REVERT 11 -#define CMD_SELECT 12 -#define CMD_SET 13 -#define CMD_VERIFY 14 +#define CMD_CLEAR 2 +#define CMD_COMMIT 3 +#define CMD_CREATE 4 +#define CMD_DELETE 5 +#define CMD_END 6 +#define CMD_EXIT 7 +#define CMD_EXPORT 8 +#define CMD_HELP 9 +#define CMD_INFO 10 +#define CMD_REMOVE 11 +#define CMD_REVERT 12 +#define CMD_SELECT 13 +#define CMD_SET 14 +#define CMD_VERIFY 15 #define CMD_MIN CMD_ADD #define CMD_MAX CMD_VERIFY @@ -83,9 +84,18 @@ typedef int bool; #define RT_LIMITPRIV 12 /* really a property, but for info ... */ #define RT_BOOTARGS 13 /* really a property, but for info ... */ #define RT_BRAND 14 /* really a property, but for info ... 
*/ +#define RT_DCPU 15 +#define RT_MCAP 16 +#define RT_MAXLWPS 17 /* really a rctl alias property, but for info */ +#define RT_MAXSHMMEM 18 /* really a rctl alias property, but for info */ +#define RT_MAXSHMIDS 19 /* really a rctl alias property, but for info */ +#define RT_MAXMSGIDS 20 /* really a rctl alias property, but for info */ +#define RT_MAXSEMIDS 21 /* really a rctl alias property, but for info */ +#define RT_SHARES 22 /* really a rctl alias property, but for info */ +#define RT_SCHED 23 /* really a property, but for info ... */ #define RT_MIN RT_UNKNOWN -#define RT_MAX RT_BRAND +#define RT_MAX RT_SCHED /* property types: increment PT_MAX when expanding this list */ #define PT_UNKNOWN 0 @@ -109,9 +119,22 @@ typedef int bool; #define PT_LIMITPRIV 18 #define PT_BOOTARGS 19 #define PT_BRAND 20 +#define PT_NCPUS 21 +#define PT_IMPORTANCE 22 +#define PT_SWAP 23 +#define PT_LOCKED 24 +#define PT_SHARES 25 +#define PT_MAXLWPS 26 +#define PT_MAXSHMMEM 27 +#define PT_MAXSHMIDS 28 +#define PT_MAXMSGIDS 29 +#define PT_MAXSEMIDS 30 +#define PT_MAXLOCKEDMEM 31 +#define PT_MAXSWAP 32 +#define PT_SCHED 33 #define PT_MIN PT_UNKNOWN -#define PT_MAX PT_BRAND +#define PT_MAX PT_SCHED #define MAX_EQ_PROP_PAIRS 3 @@ -184,6 +207,7 @@ extern void revert_func(cmd_t *); extern void select_func(cmd_t *); extern void set_func(cmd_t *); extern void verify_func(cmd_t *); +extern void clear_func(cmd_t *); extern cmd_t *alloc_cmd(void); extern complex_property_ptr_t alloc_complex(void); diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y index dc391da0b9..5c0dc2263e 100644 --- a/usr/src/cmd/zonecfg/zonecfg_grammar.y +++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y @@ -60,15 +60,17 @@ extern void yyerror(char *s); %token COMMIT REVERT EXIT SEMICOLON TOKEN ZONENAME ZONEPATH AUTOBOOT POOL NET %token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL %token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET -%token 
OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND +%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET +%token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS +%token MAXSEMIDS LOCKED SWAP SCHED CLEAR %type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val %type <complex> complex_piece complex_prop_val -%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR +%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP %type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT - ACTION BRAND + ACTION BRAND SCHED %type <cmd> command %type <cmd> add_command ADD %type <cmd> cancel_command CANCEL @@ -84,6 +86,7 @@ extern void yyerror(char *s); %type <cmd> revert_command REVERT %type <cmd> select_command SELECT %type <cmd> set_command SET +%type <cmd> clear_command CLEAR %type <cmd> verify_command VERIFY %type <cmd> terminator @@ -126,6 +129,7 @@ commands: command terminator command: add_command | cancel_command + | clear_command | create_command | commit_command | delete_command @@ -465,6 +469,69 @@ info_command: INFO $$->cmd_res_type = RT_BOOTARGS; $$->cmd_prop_nv_pairs = 0; } + | INFO SCHED + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_SCHED; + $$->cmd_prop_nv_pairs = 0; + } + | INFO SHARES + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_SHARES; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXLWPS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXLWPS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSHMMEM + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSHMMEM; + 
$$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSHMIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSHMIDS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXMSGIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXMSGIDS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSEMIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSEMIDS; + $$->cmd_prop_nv_pairs = 0; + } | INFO resource_type property_name EQUAL property_value { if (($$ = alloc_cmd()) == NULL) @@ -512,11 +579,32 @@ remove_command: REMOVE usage(FALSE, HELP_RES_PROPS); YYERROR; } - | REMOVE resource_type + | REMOVE TOKEN { short_usage(CMD_REMOVE); + (void) fputs("\n", stderr); + usage(FALSE, HELP_RES_PROPS); YYERROR; } + | REMOVE resource_type + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &remove_func; + $$->cmd_res_type = $2; + } + | REMOVE TOKEN resource_type + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &remove_func; + $$->cmd_res_type = $3; + $$->cmd_argc = 1; + $$->cmd_argv[0] = $2; + $$->cmd_argv[1] = NULL; + } | REMOVE property_name property_value { if (($$ = alloc_cmd()) == NULL) @@ -594,6 +682,22 @@ select_command: SELECT usage(FALSE, HELP_RES_PROPS); YYERROR; } + | SELECT PSET + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &select_func; + $$->cmd_res_type = RT_DCPU; + } + | SELECT MCAP + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &select_func; + $$->cmd_res_type = RT_MCAP; + } | SELECT resource_type { short_usage(CMD_SELECT); @@ -682,6 +786,22 @@ set_command: SET $$->cmd_property_ptr[0] = &property[0]; } +clear_command: CLEAR + { + short_usage(CMD_CLEAR); + (void) fputs("\n", stderr); + usage(FALSE, HELP_PROPS); + YYERROR; + } + | CLEAR property_name + { + if 
(($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &clear_func; + $$->cmd_res_type = $2; + } + verify_command: VERIFY { if (($$ = alloc_cmd()) == NULL) @@ -709,6 +829,8 @@ resource_type: NET { $$ = RT_NET; } | RCTL { $$ = RT_RCTL; } | ATTR { $$ = RT_ATTR; } | DATASET { $$ = RT_DATASET; } + | PSET { $$ = RT_DCPU; } + | MCAP { $$ = RT_MCAP; } property_name: SPECIAL { $$ = PT_SPECIAL; } | RAW { $$ = PT_RAW; } @@ -730,6 +852,17 @@ property_name: SPECIAL { $$ = PT_SPECIAL; } | LIMIT { $$ = PT_LIMIT; } | ACTION { $$ = PT_ACTION; } | BRAND { $$ = PT_BRAND; } + | NCPUS { $$ = PT_NCPUS; } + | LOCKED { $$ = PT_LOCKED; } + | SWAP { $$ = PT_SWAP; } + | IMPORTANCE { $$ = PT_IMPORTANCE; } + | SHARES { $$ = PT_SHARES; } + | MAXLWPS { $$ = PT_MAXLWPS; } + | MAXSHMMEM { $$ = PT_MAXSHMMEM; } + | MAXSHMIDS { $$ = PT_MAXSHMIDS; } + | MAXMSGIDS { $$ = PT_MAXMSGIDS; } + | MAXSEMIDS { $$ = PT_MAXSEMIDS; } + | SCHED { $$ = PT_SCHED; } /* * The grammar builds data structures from the bottom up. 
Thus various diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l index aef16edbcb..53f726ca2e 100644 --- a/usr/src/cmd/zonecfg/zonecfg_lex.l +++ b/usr/src/cmd/zonecfg/zonecfg_lex.l @@ -40,7 +40,10 @@ extern void yyerror(char *s); char *safe_strdup(char *s); %} -%a 4000 +%a 6000 +%p 4000 +%e 2000 +%n 1000 %{ /* @@ -139,6 +142,12 @@ char *safe_strdup(char *s); return SET; } +<INITIAL>clear { + BEGIN TSTATE; + state = TSTATE; + return CLEAR; + } + <INITIAL>verify { BEGIN TSTATE; state = TSTATE; @@ -162,6 +171,10 @@ char *safe_strdup(char *s); <TSTATE>dataset { return DATASET; } +<TSTATE>dedicated-cpu { return PSET; } + +<TSTATE>capped-memory { return MCAP; } + <TSTATE>zonepath { return ZONEPATH; } <CSTATE>zonepath { return ZONEPATH; } @@ -219,6 +232,39 @@ char *safe_strdup(char *s); <TSTATE>action { return ACTION; } <CSTATE>action { return ACTION; } +<TSTATE>ncpus { return NCPUS; } +<CSTATE>ncpus { return NCPUS; } + +<TSTATE>locked { return LOCKED; } +<CSTATE>locked { return LOCKED; } + +<TSTATE>swap { return SWAP; } +<CSTATE>swap { return SWAP; } + +<TSTATE>importance { return IMPORTANCE; } +<CSTATE>importance { return IMPORTANCE; } + +<TSTATE>cpu-shares { return SHARES; } +<CSTATE>cpu-shares { return SHARES; } + +<TSTATE>max-lwps { return MAXLWPS; } +<CSTATE>max-lwps { return MAXLWPS; } + +<TSTATE>max-shm-memory { return MAXSHMMEM; } +<CSTATE>max-shm-memory { return MAXSHMMEM; } + +<TSTATE>max-shm-ids { return MAXSHMIDS; } +<CSTATE>max-shm-ids { return MAXSHMIDS; } + +<TSTATE>max-msg-ids { return MAXMSGIDS; } +<CSTATE>max-msg-ids { return MAXMSGIDS; } + +<TSTATE>max-sem-ids { return MAXSEMIDS; } +<CSTATE>max-sem-ids { return MAXSEMIDS; } + +<TSTATE>scheduling-class { return SCHED; } +<CSTATE>scheduling-class { return SCHED; } + <TSTATE>= { return EQUAL; } <LSTATE>= { return EQUAL; } <CSTATE>= { return EQUAL; } diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h index 3af98c1a6b..10ee4a2bb4 100644 --- 
a/usr/src/head/libzonecfg.h +++ b/usr/src/head/libzonecfg.h @@ -90,6 +90,15 @@ extern "C" { #define Z_PRIV_REQUIRED 38 /* required privilege is missing */ #define Z_PRIV_UNKNOWN 39 /* specified privilege is unknown */ #define Z_BRAND_ERROR 40 /* brand-specific error */ +#define Z_INCOMPATIBLE 41 /* incompatible settings */ +#define Z_ALIAS_DISALLOW 42 /* rctl alias disallowed */ +#define Z_CLEAR_DISALLOW 43 /* clear property disallowed */ +#define Z_POOL 44 /* generic libpool error */ +#define Z_POOLS_NOT_ACTIVE 45 /* pool service not enabled */ +#define Z_POOL_ENABLE 46 /* pools enable failed */ +#define Z_NO_POOL 47 /* no such pool configured */ +#define Z_POOL_CREATE 48 /* pool create failed */ +#define Z_POOL_BIND 49 /* pool bind failed */ /* * Warning: these are shared with the admin/install consolidation. @@ -126,6 +135,18 @@ extern "C" { #define ZONE_PKG_VERSMAX 256 /* + * Shortened alias names for the zones rctls. + */ +#define ALIAS_MAXLWPS "max-lwps" +#define ALIAS_MAXSHMMEM "max-shm-memory" +#define ALIAS_MAXSHMIDS "max-shm-ids" +#define ALIAS_MAXMSGIDS "max-msg-ids" +#define ALIAS_MAXSEMIDS "max-sem-ids" +#define ALIAS_MAXLOCKEDMEM "locked" +#define ALIAS_MAXSWAP "swap" +#define ALIAS_SHARES "cpu-shares" + +/* * Bit flag definitions for passing into libzonecfg functions. 
*/ #define ZONE_DRY_RUN 0x01 @@ -190,6 +211,16 @@ struct zone_dstab { char zone_dataset_name[MAXNAMELEN]; }; +struct zone_psettab { + char zone_ncpu_min[MAXNAMELEN]; + char zone_ncpu_max[MAXNAMELEN]; + char zone_importance[MAXNAMELEN]; +}; + +struct zone_mcaptab { + char zone_physmem_cap[MAXNAMELEN]; +}; + struct zone_pkgtab { char zone_pkg_name[MAXNAMELEN]; char zone_pkg_version[ZONE_PKG_VERSMAX]; @@ -227,10 +258,17 @@ extern int zonecfg_access(const char *, int); extern void zonecfg_set_root(const char *); extern const char *zonecfg_get_root(void); extern boolean_t zonecfg_in_alt_root(void); +extern int zonecfg_num_resources(zone_dochandle_t, char *); +extern int zonecfg_del_all_resources(zone_dochandle_t, char *); +extern boolean_t zonecfg_valid_ncpus(char *, char *); +extern boolean_t zonecfg_valid_importance(char *); +extern int zonecfg_str_to_bytes(char *, uint64_t *); +extern boolean_t zonecfg_valid_memlimit(char *, uint64_t *); +extern boolean_t zonecfg_valid_alias_limit(char *, char *, uint64_t *); /* - * Zone name, path to zone directory, autoboot setting, pool and boot - * arguments. + * Zone name, path to zone directory, autoboot setting, pool, boot + * arguments, and scheduling-class. 
*/ extern int zonecfg_validate_zonename(const char *); extern int zonecfg_get_name(zone_dochandle_t, char *, size_t); @@ -243,6 +281,9 @@ extern int zonecfg_get_pool(zone_dochandle_t, char *, size_t); extern int zonecfg_set_pool(zone_dochandle_t, char *); extern int zonecfg_get_bootargs(zone_dochandle_t, char *, size_t); extern int zonecfg_set_bootargs(zone_dochandle_t, char *); +extern int zonecfg_get_sched_class(zone_dochandle_t, char *, size_t); +extern int zonecfg_set_sched(zone_dochandle_t, char *); +extern int zonecfg_get_dflt_sched_class(zone_dochandle_t, char *, int); /* * Set/retrieve the brand for the zone @@ -302,6 +343,11 @@ extern int zonecfg_add_rctl_value(struct zone_rctltab *, extern int zonecfg_remove_rctl_value(struct zone_rctltab *, struct zone_rctlvaltab *); extern void zonecfg_free_rctl_value_list(struct zone_rctlvaltab *); +extern boolean_t zonecfg_aliased_rctl_ok(zone_dochandle_t, char *); +extern int zonecfg_set_aliased_rctl(zone_dochandle_t, char *, uint64_t); +extern int zonecfg_get_aliased_rctl(zone_dochandle_t, char *, uint64_t *); +extern int zonecfg_rm_aliased_rctl(zone_dochandle_t, char *); +extern int zonecfg_apply_rctls(char *, zone_dochandle_t); /* * Generic attribute configuration and type/value extraction. @@ -328,6 +374,34 @@ extern int zonecfg_modify_ds(zone_dochandle_t, struct zone_dstab *, extern int zonecfg_lookup_ds(zone_dochandle_t, struct zone_dstab *); /* + * cpu-set configuration. + */ +extern int zonecfg_add_pset(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_delete_pset(zone_dochandle_t); +extern int zonecfg_modify_pset(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_lookup_pset(zone_dochandle_t, struct zone_psettab *); + +/* + * mem-cap configuration. 
+ */ +extern int zonecfg_delete_mcap(zone_dochandle_t); +extern int zonecfg_modify_mcap(zone_dochandle_t, struct zone_mcaptab *); +extern int zonecfg_lookup_mcap(zone_dochandle_t, struct zone_mcaptab *); + +/* + * Temporary pool support functions. + */ +extern int zonecfg_destroy_tmp_pool(char *, char *, int); +extern int zonecfg_bind_tmp_pool(zone_dochandle_t, zoneid_t, char *, int); +extern int zonecfg_bind_pool(zone_dochandle_t, zoneid_t, char *, int); +extern boolean_t zonecfg_warn_poold(zone_dochandle_t); + +/* + * Miscellaneous utility functions. + */ +extern int zonecfg_enable_rcapd(char *, int); + +/* * attach/detach support. */ extern int zonecfg_get_attach_handle(const char *, const char *, @@ -373,6 +447,8 @@ extern int zonecfg_endrctlent(zone_dochandle_t); extern int zonecfg_setdsent(zone_dochandle_t); extern int zonecfg_getdsent(zone_dochandle_t, struct zone_dstab *); extern int zonecfg_enddsent(zone_dochandle_t); +extern int zonecfg_getpsetent(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_getmcapent(zone_dochandle_t, struct zone_mcaptab *); extern int zonecfg_setpkgent(zone_dochandle_t); extern int zonecfg_getpkgent(zone_dochandle_t, struct zone_pkgtab *); extern int zonecfg_endpkgent(zone_dochandle_t); diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index da3bdb3844..c541fcb01c 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -489,7 +489,7 @@ libldap5: libsasl libsocket libnsl libmd libsldap: libldap5 libtsol libpool: libnvpair libexacct libzonecfg: libc libsocket libnsl libuuid libnvpair libsysevent libsec \ - libbrand + libbrand libpool libscf libproc: ../cmd/sgs/librtld_db ../cmd/sgs/libelf libctf libproject: libpool libproc libsecdb libtsnet: libnsl libtsol libsecdb diff --git a/usr/src/lib/libc/port/gen/getrusage.c b/usr/src/lib/libc/port/gen/getrusage.c index c1f1b92188..efeaf0be24 100644 --- a/usr/src/lib/libc/port/gen/getrusage.c +++ b/usr/src/lib/libc/port/gen/getrusage.c @@ -2,9 +2,8 @@ * CDDL HEADER 
START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -49,6 +48,7 @@ #include <sys/param.h> #include <errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> #include <fcntl.h> #include <sys/fcntl.h> #include <procfs.h> @@ -76,3 +76,10 @@ getrusage(int who, struct rusage *rusage) return (-1); } } + +int +getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres) +{ + return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, flags, age, + buf, nres)); +} diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 22227a6413..8e1b399567 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -59,6 +59,7 @@ SUNW_1.23 { # SunOS 5.11 (Solaris 11) fdatasync; forkallx; forkx; + getvmusage; lio_listio; mkdtemp; _mkdtemp; diff --git a/usr/src/lib/libpool/common/pool.c b/usr/src/lib/libpool/common/pool.c index 167cd8be5b..6fbd7b34d3 100644 --- a/usr/src/lib/libpool/common/pool.c +++ b/usr/src/lib/libpool/common/pool.c @@ -914,10 +914,34 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name, return (NULL); } - if (!is_valid_prop_name(name)) { + /* Don't allow (re)setting of the "temporary" property */ + if (!is_valid_prop_name(name) || strstr(name, ".temporary") != NULL) { pool_seterror(POE_BADPARAM); return (PO_FAIL); } + + /* Don't allow rename of temporary 
pools/resources */ + if (strstr(name, ".name") != NULL && elem_is_tmp(pe)) { + boolean_t rename = B_TRUE; + pool_value_t *pv = pool_value_alloc(); + + if (pe->pe_get_prop(pe, name, pv) != POC_INVAL) { + const char *s1 = NULL; + const char *s2 = NULL; + + (void) pool_value_get_string(pv, &s1); + (void) pool_value_get_string(val, &s2); + if (s1 != NULL && s2 != NULL && strcmp(s1, s2) == 0) + rename = B_FALSE; + } + pool_value_free(pv); + + if (rename) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + } + /* * Check to see if this is a property we are managing. If it is, * ensure that we are happy with what the user is doing. @@ -936,6 +960,46 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name, } /* + * Set temporary property to flag as a temporary element. + * + * PO_FAIL is returned if an error is detected and the error code is updated + * to indicate the cause of the error. + */ +int +pool_set_temporary(pool_conf_t *conf, pool_elem_t *pe) +{ + int res; + char name[128]; + pool_value_t *val; + + if (pool_conf_check(conf) != PO_SUCCESS) + return (PO_FAIL); + + if (TO_CONF(pe) != conf) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + + /* create property name based on element type */ + if (snprintf(name, sizeof (name), "%s.temporary", + pool_elem_class_string(pe)) > sizeof (name)) { + pool_seterror(POE_SYSTEM); + return (PO_FAIL); + } + + if ((val = pool_value_alloc()) == NULL) + return (PO_FAIL); + + pool_value_set_bool(val, (uchar_t)1); + + res = pe->pe_put_prop(pe, name, val); + + pool_value_free(val); + + return (res); +} + +/* * Update the specified property value with the namespace prepended. * e.g. If this function is used to update the property "name" on a pool, it * will attempt to update "pool.name". 
@@ -1030,6 +1094,12 @@ pool_rm_property(pool_conf_t *conf, pool_elem_t *pe, const char *name) return (NULL); } + /* Don't allow removal of the "temporary" property */ + if (strstr(name, ".temporary") != NULL) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + /* * Check to see if this is a property we are managing. If it is, * ensure that we are happy with what the user is doing. @@ -1122,6 +1192,17 @@ pool_create(pool_conf_t *conf, const char *name) pool_seterror(POE_PUTPROP); return (NULL); } + + /* + * If we are creating a temporary pool configuration, flag the pool. + */ + if (conf->pc_prov->pc_oflags & PO_TEMP) { + if (pool_set_temporary(conf, pe) == PO_FAIL) { + (void) pool_destroy(conf, pool_elem_pool(pe)); + return (NULL); + } + } + return (pool_elem_pool(pe)); } @@ -1227,6 +1308,17 @@ pool_resource_create(pool_conf_t *conf, const char *sz_type, const char *name) return (NULL); } } + + /* + * If we are creating a temporary pool configuration, flag the resource. + */ + if (conf->pc_prov->pc_oflags & PO_TEMP) { + if (pool_set_temporary(conf, pe) != PO_SUCCESS) { + (void) pool_resource_destroy(conf, pool_elem_res(pe)); + return (NULL); + } + } + return (pool_elem_res(pe)); } @@ -1396,7 +1488,8 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) pool_seterror(POE_BADPARAM); return (PO_FAIL); } - if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE)) { + if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE | + PO_TEMP)) { pool_seterror(POE_BADPARAM); return (PO_FAIL); } @@ -1408,6 +1501,10 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) if (oflags & PO_CREAT) oflags |= PO_RDWR; + /* location is ignored when creating a temporary configuration */ + if (oflags & PO_TEMP) + location = ""; + if ((conf->pc_location = strdup(location)) == NULL) { pool_seterror(POE_SYSTEM); return (PO_FAIL); @@ -1415,14 +1512,25 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) /* * 
This is the crossover point into the actual data provider * implementation, allocate a data provider of the appropriate - * type for your data storage medium. In this case it's a kernel - * data provider. To use a different data provider, write some - * code to implement all the required interfaces and then - * change the next line to allocate a data provider which uses your - * new code. All data provider routines can be static, apart from - * the allocation routine. + * type for your data storage medium. In this case it's either a kernel + * or xml data provider. To use a different data provider, write some + * code to implement all the required interfaces and then change the + * following code to allocate a data provider which uses your new code. + * All data provider routines can be static, apart from the allocation + * routine. + * + * For temporary pools (PO_TEMP) we start with a copy of the current + * dynamic configuration and do all of the updates in-memory. */ - if (strcmp(location, pool_dynamic_location()) == 0) { + if (oflags & PO_TEMP) { + if (pool_knl_connection_alloc(conf, PO_TEMP) != PO_SUCCESS) { + conf->pc_state = POF_INVALID; + return (PO_FAIL); + } + /* set rdwr flag so we can updated the in-memory config. */ + conf->pc_prov->pc_oflags |= PO_RDWR; + + } else if (strcmp(location, pool_dynamic_location()) == 0) { if (pool_knl_connection_alloc(conf, oflags) != PO_SUCCESS) { conf->pc_state = POF_INVALID; return (PO_FAIL); diff --git a/usr/src/lib/libpool/common/pool.h b/usr/src/lib/libpool/common/pool.h index d38e9902e6..ee11aadb7b 100644 --- a/usr/src/lib/libpool/common/pool.h +++ b/usr/src/lib/libpool/common/pool.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -96,6 +95,7 @@ extern uint_t pool_version(uint_t ver); #define PO_CREAT 0x2 #define PO_DISCO 0x4 #define PO_UPDATE 0x8 +#define PO_TEMP 0x10 /* Allocation policy */ #define POA_IMPORTANCE "importance based" @@ -218,6 +218,7 @@ extern pool_value_class_t pool_get_property(const pool_conf_t *, extern int pool_put_property(pool_conf_t *, pool_elem_t *, const char *, const pool_value_t *); extern int pool_rm_property(pool_conf_t *, pool_elem_t *, const char *); + /* * Walk the associated properties of the supplied element calling the supplied * function for each property in turn. There is no implied order in the walk. diff --git a/usr/src/lib/libpool/common/pool_commit.c b/usr/src/lib/libpool/common/pool_commit.c index 1ea4808377..b996524b98 100644 --- a/usr/src/lib/libpool/common/pool_commit.c +++ b/usr/src/lib/libpool/common/pool_commit.c @@ -245,6 +245,9 @@ commit_delete(pool_elem_t *pe) pool_t *pool; int ret = 0; + if (elem_is_tmp(pe)) + return (PO_SUCCESS); + switch (pool_elem_class(pe)) { case PEC_SYSTEM: /* NO-OP */ break; @@ -1306,7 +1309,14 @@ clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, if ((prop = provider_get_prop(pe, name)) != NULL && prop_is_readonly(prop) == PO_TRUE) return (PO_SUCCESS); - return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == PO_FAIL); + + /* The temporary property needs special handling */ + if (strstr(name, ".temporary") != NULL) + return (pool_set_temporary(TO_CONF(tgt), tgt) == + PO_FAIL ? PO_FAIL : PO_SUCCESS); + else + return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == + PO_FAIL ? 
PO_FAIL : PO_SUCCESS); } /* @@ -1322,8 +1332,9 @@ clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, /* * Some properties should be ignored */ - if ((prop = provider_get_prop(pe, name)) != NULL && - prop_is_optional(prop) == PO_FALSE) + if (strstr(name, ".temporary") != NULL || + ((prop = provider_get_prop(pe, name)) != NULL && + prop_is_optional(prop) == PO_FALSE)) return (PO_SUCCESS); return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL); } diff --git a/usr/src/lib/libpool/common/pool_internal.c b/usr/src/lib/libpool/common/pool_internal.c index 210e63d620..5e572f6eaf 100644 --- a/usr/src/lib/libpool/common/pool_internal.c +++ b/usr/src/lib/libpool/common/pool_internal.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1143,6 +1142,23 @@ elem_is_default(const pool_elem_t *res) } /* + * Return B_TRUE if the element has the 'temporary' property set. + */ +boolean_t +elem_is_tmp(const pool_elem_t *elem) +{ + pool_value_t val = POOL_VALUE_INITIALIZER; + uchar_t bval; + + if (pool_get_ns_property(elem, "temporary", &val) != POC_BOOL) + return (B_FALSE); + + (void) pool_value_get_bool(&val, &bval); + + return (bval != 0); +} + +/* * get_default_elem() returns the default elem for type of the supplied * elem. 
* diff --git a/usr/src/lib/libpool/common/pool_internal.h b/usr/src/lib/libpool/common/pool_internal.h index 592c98d11d..e172d23af4 100644 --- a/usr/src/lib/libpool/common/pool_internal.h +++ b/usr/src/lib/libpool/common/pool_internal.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -256,6 +255,7 @@ extern int resource_get_pinned(const pool_resource_t *, extern char *elem_get_name(const pool_elem_t *); extern id_t elem_get_sysid(const pool_elem_t *); extern int elem_is_default(const pool_elem_t *); +extern boolean_t elem_is_tmp(const pool_elem_t *); extern const pool_elem_t *get_default_elem(const pool_elem_t *); extern int qsort_elem_compare(const void *, const void *); @@ -371,6 +371,7 @@ extern pool_value_class_t pool_get_ns_property(const pool_elem_t *, extern int pool_walk_any_properties(pool_conf_t *, pool_elem_t *, void *, int (*)(pool_conf_t *, pool_elem_t *, const char *, pool_value_t *, void *), int); +extern int pool_set_temporary(pool_conf_t *, pool_elem_t *); /* * Namespace aware utility functions. 
diff --git a/usr/src/lib/libpool/common/pool_kernel.c b/usr/src/lib/libpool/common/pool_kernel.c index f84d6f2ba5..3da4f0263c 100644 --- a/usr/src/lib/libpool/common/pool_kernel.c +++ b/usr/src/lib/libpool/common/pool_kernel.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -646,10 +645,14 @@ pool_knl_close(pool_conf_t *conf) } /* * Rollback any pending changes before freeing the prov. This - * ensures there are no memory leaks from pending - * transactions. + * ensures there are no memory leaks from pending transactions. + * However, don't rollback when we've done a temporary pool since the + * pool/resources haven't really been committed in this case. + * They will all be freed in pool_knl_connection_free and we don't + * want to double free them. 
*/ - (void) pool_knl_rollback(conf); + if (!(conf->pc_prov->pc_oflags & PO_TEMP)) + (void) pool_knl_rollback(conf); pool_knl_connection_free(prov); return (PO_SUCCESS); } @@ -997,6 +1000,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location, const char *sep = ""; int j; + if (elem_is_tmp(elem)) + continue; + if ((info.ktx_node = node_create(system, BAD_CAST element_class_tags [pool_elem_class(elem)])) == NULL) { @@ -1072,6 +1078,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location, uint_t ncompelem; int j; + if (elem_is_tmp(elem)) + continue; + if ((info.ktx_node = node_create(system, BAD_CAST element_class_tags [pool_elem_class(elem)])) == NULL) { diff --git a/usr/src/lib/libproject/common/setproject.c b/usr/src/lib/libproject/common/setproject.c index 2303576d32..d22878a36f 100644 --- a/usr/src/lib/libproject/common/setproject.c +++ b/usr/src/lib/libproject/common/setproject.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -536,7 +535,12 @@ setproject_proc(const char *project_name, const char *user_name, int flags, } old_pool_name = pool_get_binding(pid); - if (bind_to_pool(pool_name, pid, 0) != 0) { + + /* + * If parent is not bound to the default pool, then we want + * to preserve same binding as parent. 
+ */ + if (pool_name != NULL && bind_to_pool(pool_name, pid, 0) != 0) { if (old_pool_name) free(old_pool_name); _kva_free(kv_array); diff --git a/usr/src/lib/libzonecfg/Makefile.com b/usr/src/lib/libzonecfg/Makefile.com index b89a44fce3..b64df94527 100644 --- a/usr/src/lib/libzonecfg/Makefile.com +++ b/usr/src/lib/libzonecfg/Makefile.com @@ -32,7 +32,8 @@ OBJECTS= libzonecfg.o getzoneent.o scratchops.o include ../../Makefile.lib LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand +LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand \ + -lpool -lscf -lproc # DYNLIB libraries do not have lint libs and are not linted $(DYNLIB) := LDLIBS += -lxml2 diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c index f4fbcde368..1a3fb37c8c 100644 --- a/usr/src/lib/libzonecfg/common/libzonecfg.c +++ b/usr/src/lib/libzonecfg/common/libzonecfg.c @@ -46,6 +46,10 @@ #include <sys/nvpair.h> #include <sys/types.h> #include <ftw.h> +#include <pool.h> +#include <libscf.h> +#include <libproc.h> +#include <sys/priocntl.h> #include <arpa/inet.h> #include <netdb.h> @@ -79,6 +83,9 @@ #define DTD_ELEM_RCTLVALUE (const xmlChar *) "rctl-value" #define DTD_ELEM_ZONE (const xmlChar *) "zone" #define DTD_ELEM_DATASET (const xmlChar *) "dataset" +#define DTD_ELEM_TMPPOOL (const xmlChar *) "tmp_pool" +#define DTD_ELEM_PSET (const xmlChar *) "pset" +#define DTD_ELEM_MCAP (const xmlChar *) "mcap" #define DTD_ELEM_PACKAGE (const xmlChar *) "package" #define DTD_ELEM_PATCH (const xmlChar *) "patch" #define DTD_ELEM_OBSOLETES (const xmlChar *) "obsoletes" @@ -92,6 +99,7 @@ #define DTD_ATTR_LIMIT (const xmlChar *) "limit" #define DTD_ATTR_LIMITPRIV (const xmlChar *) "limitpriv" #define DTD_ATTR_BOOTARGS (const xmlChar *) "bootargs" +#define DTD_ATTR_SCHED (const xmlChar *) "scheduling-class" #define DTD_ATTR_MATCH (const xmlChar *) "match" #define DTD_ATTR_NAME (const xmlChar *) "name" #define 
DTD_ATTR_PHYSICAL (const xmlChar *) "physical" @@ -102,6 +110,10 @@ #define DTD_ATTR_TYPE (const xmlChar *) "type" #define DTD_ATTR_VALUE (const xmlChar *) "value" #define DTD_ATTR_ZONEPATH (const xmlChar *) "zonepath" +#define DTD_ATTR_NCPU_MIN (const xmlChar *) "ncpu_min" +#define DTD_ATTR_NCPU_MAX (const xmlChar *) "ncpu_max" +#define DTD_ATTR_IMPORTANCE (const xmlChar *) "importance" +#define DTD_ATTR_PHYSCAP (const xmlChar *) "physcap" #define DTD_ATTR_VERSION (const xmlChar *) "version" #define DTD_ATTR_ID (const xmlChar *) "id" #define DTD_ATTR_UID (const xmlChar *) "uid" @@ -133,6 +145,46 @@ #define PATCHINFO "PATCH_INFO_" #define PKGINFO_RD_LEN 128 +#define TMP_POOL_NAME "SUNWtmp_%s" +#define MAX_TMP_POOL_NAME (ZONENAME_MAX + 9) +#define RCAP_SERVICE "system/rcap:default" +#define POOLD_SERVICE "system/pools/dynamic:default" + +/* + * rctl alias definitions + * + * This holds the alias, the full rctl name, the default priv value, action + * and lower limit. The functions that handle rctl aliases step through + * this table, matching on the alias, and using the full values for setting + * the rctl entry as well the limit for validation. + */ +static struct alias { + char *shortname; + char *realname; + char *priv; + char *action; + uint64_t low_limit; +} aliases[] = { + {ALIAS_MAXLWPS, "zone.max-lwps", "privileged", "deny", 100}, + {ALIAS_MAXSHMMEM, "zone.max-shm-memory", "privileged", "deny", 0}, + {ALIAS_MAXSHMIDS, "zone.max-shm-ids", "privileged", "deny", 0}, + {ALIAS_MAXMSGIDS, "zone.max-msg-ids", "privileged", "deny", 0}, + {ALIAS_MAXSEMIDS, "zone.max-sem-ids", "privileged", "deny", 0}, + {ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0}, + {ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0}, + {ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0}, + {NULL, NULL, NULL, NULL, 0} +}; + +/* + * Structure for applying rctls to a running zone. It allows important + * process values to be passed together easily. 
+ */ +typedef struct pr_info_handle { + struct ps_prochandle *pr; + pid_t pid; +} pr_info_handle_t; + struct zone_dochandle { char *zone_dh_rootdir; xmlDocPtr zone_dh_doc; @@ -446,14 +498,20 @@ setrootattr(zone_dochandle_t handle, const xmlChar *propname, int err; xmlNodePtr root; - if (propval == NULL) - return (Z_INVAL); - if ((err = getroot(handle, &root)) != Z_OK) return (err); - if (xmlSetProp(root, propname, (const xmlChar *) propval) == NULL) - return (Z_INVAL); + /* + * If we get a null propval remove the property (ignore return since it + * may not be set to begin with). + */ + if (propval == NULL) { + (void) xmlUnsetProp(root, propname); + } else { + if (xmlSetProp(root, propname, (const xmlChar *) propval) + == NULL) + return (Z_INVAL); + } return (Z_OK); } @@ -947,6 +1005,18 @@ zonecfg_set_bootargs(zone_dochandle_t handle, char *bargs) return (setrootattr(handle, DTD_ATTR_BOOTARGS, bargs)); } +int +zonecfg_get_sched_class(zone_dochandle_t handle, char *sched, size_t schedsize) +{ + return (getrootattr(handle, DTD_ATTR_SCHED, sched, schedsize)); +} + +int +zonecfg_set_sched(zone_dochandle_t handle, char *sched) +{ + return (setrootattr(handle, DTD_ATTR_SCHED, sched)); +} + /* * /etc/zones/index caches a vital piece of information which is also * in the <zonename>.xml file: the path to the zone. 
This is for performance, @@ -3047,6 +3117,30 @@ zonecfg_strerror(int errnum) case Z_BRAND_ERROR: return (dgettext(TEXT_DOMAIN, "Brand-specific error")); + case Z_INCOMPATIBLE: + return (dgettext(TEXT_DOMAIN, "Incompatible settings")); + case Z_ALIAS_DISALLOW: + return (dgettext(TEXT_DOMAIN, + "An incompatible rctl already exists for this property")); + case Z_CLEAR_DISALLOW: + return (dgettext(TEXT_DOMAIN, + "Clearing this property is not allowed")); + case Z_POOL: + return (dgettext(TEXT_DOMAIN, "libpool(3LIB) error")); + case Z_POOLS_NOT_ACTIVE: + return (dgettext(TEXT_DOMAIN, "Pools facility not active; " + "zone will not be bound to pool")); + case Z_POOL_ENABLE: + return (dgettext(TEXT_DOMAIN, + "Could not enable pools facility")); + case Z_NO_POOL: + return (dgettext(TEXT_DOMAIN, + "Pool not found; using default pool")); + case Z_POOL_CREATE: + return (dgettext(TEXT_DOMAIN, + "Could not create a temporary pool")); + case Z_POOL_BIND: + return (dgettext(TEXT_DOMAIN, "Could not bind zone to pool")); default: return (dgettext(TEXT_DOMAIN, "Unknown error")); } @@ -3086,6 +3180,951 @@ zonecfg_endent(zone_dochandle_t handle) return (Z_OK); } +/* + * Do the work required to manipulate a process through libproc. + * If grab_process() returns no errors (0), then release_process() + * must eventually be called. + * + * Return values: + * 0 Successful creation of agent thread + * 1 Error grabbing + * 2 Error creating agent + */ +static int +grab_process(pr_info_handle_t *p) +{ + int ret; + + if ((p->pr = Pgrab(p->pid, 0, &ret)) != NULL) { + + if (Psetflags(p->pr, PR_RLC) != 0) { + Prelease(p->pr, 0); + return (1); + } + if (Pcreate_agent(p->pr) == 0) { + return (0); + + } else { + Prelease(p->pr, 0); + return (2); + } + } else { + return (1); + } +} + +/* + * Release the specified process. This destroys the agent + * and releases the process. If the process is NULL, nothing + * is done. 
This function should only be called if grab_process() + * has previously been called and returned success. + * + * This function is Pgrab-safe. + */ +static void +release_process(struct ps_prochandle *Pr) +{ + if (Pr == NULL) + return; + + Pdestroy_agent(Pr); + Prelease(Pr, 0); +} + +static boolean_t +grab_zone_proc(char *zonename, pr_info_handle_t *p) +{ + DIR *dirp; + struct dirent *dentp; + zoneid_t zoneid; + int pid_self; + psinfo_t psinfo; + + if (zone_get_id(zonename, &zoneid) != 0) + return (B_FALSE); + + pid_self = getpid(); + + if ((dirp = opendir("/proc")) == NULL) + return (B_FALSE); + + while (dentp = readdir(dirp)) { + p->pid = atoi(dentp->d_name); + + /* Skip self */ + if (p->pid == pid_self) + continue; + + if (proc_get_psinfo(p->pid, &psinfo) != 0) + continue; + + if (psinfo.pr_zoneid != zoneid) + continue; + + /* attempt to grab process */ + if (grab_process(p) != 0) + continue; + + if (pr_getzoneid(p->pr) != zoneid) { + release_process(p->pr); + continue; + } + + (void) closedir(dirp); + return (B_TRUE); + } + + (void) closedir(dirp); + return (B_FALSE); +} + +static boolean_t +get_priv_rctl(struct ps_prochandle *pr, char *name, rctlblk_t *rblk) +{ + if (pr_getrctl(pr, name, NULL, rblk, RCTL_FIRST)) + return (B_FALSE); + + if (rctlblk_get_privilege(rblk) == RCPRIV_PRIVILEGED) + return (B_TRUE); + + while (pr_getrctl(pr, name, rblk, rblk, RCTL_NEXT) == 0) { + if (rctlblk_get_privilege(rblk) == RCPRIV_PRIVILEGED) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Apply the current rctl settings to the specified, running zone. 
+ */ +int +zonecfg_apply_rctls(char *zone_name, zone_dochandle_t handle) +{ + int err; + int res = Z_OK; + rctlblk_t *rblk; + pr_info_handle_t p; + struct zone_rctltab rctl; + + if ((err = zonecfg_setrctlent(handle)) != Z_OK) + return (err); + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) { + (void) zonecfg_endrctlent(handle); + return (Z_NOMEM); + } + + if (!grab_zone_proc(zone_name, &p)) { + (void) zonecfg_endrctlent(handle); + free(rblk); + return (Z_SYSTEM); + } + + while (zonecfg_getrctlent(handle, &rctl) == Z_OK) { + char *rname; + struct zone_rctlvaltab *valptr; + + rname = rctl.zone_rctl_name; + + /* first delete all current privileged settings for this rctl */ + while (get_priv_rctl(p.pr, rname, rblk)) { + if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_DELETE) != + 0) { + res = Z_SYSTEM; + goto done; + } + } + + /* now set each new value for the rctl */ + for (valptr = rctl.zone_rctl_valptr; valptr != NULL; + valptr = valptr->zone_rctlval_next) { + if ((err = zonecfg_construct_rctlblk(valptr, rblk)) + != Z_OK) { + res = errno = err; + goto done; + } + + if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_INSERT)) { + res = Z_SYSTEM; + goto done; + } + } + } + +done: + release_process(p.pr); + free(rblk); + (void) zonecfg_endrctlent(handle); + + return (res); +} + +static const xmlChar * +nm_to_dtd(char *nm) +{ + if (strcmp(nm, "device") == 0) + return (DTD_ELEM_DEVICE); + if (strcmp(nm, "fs") == 0) + return (DTD_ELEM_FS); + if (strcmp(nm, "inherit-pkg-dir") == 0) + return (DTD_ELEM_IPD); + if (strcmp(nm, "net") == 0) + return (DTD_ELEM_NET); + if (strcmp(nm, "attr") == 0) + return (DTD_ELEM_ATTR); + if (strcmp(nm, "rctl") == 0) + return (DTD_ELEM_RCTL); + if (strcmp(nm, "dataset") == 0) + return (DTD_ELEM_DATASET); + + return (NULL); +} + +int +zonecfg_num_resources(zone_dochandle_t handle, char *rsrc) +{ + int num = 0; + const xmlChar *dtd; + xmlNodePtr cur; + + if ((dtd = nm_to_dtd(rsrc)) == NULL) + return (num); + + if (zonecfg_setent(handle) != 
Z_OK) + return (num); + + for (cur = handle->zone_dh_cur; cur != NULL; cur = cur->next) + if (xmlStrcmp(cur->name, dtd) == 0) + num++; + + (void) zonecfg_endent(handle); + + return (num); +} + +int +zonecfg_del_all_resources(zone_dochandle_t handle, char *rsrc) +{ + int err; + const xmlChar *dtd; + xmlNodePtr cur; + + if ((dtd = nm_to_dtd(rsrc)) == NULL) + return (Z_NO_RESOURCE_TYPE); + + if ((err = zonecfg_setent(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + while (cur != NULL) { + xmlNodePtr tmp; + + if (xmlStrcmp(cur->name, dtd)) { + cur = cur->next; + continue; + } + + tmp = cur->next; + xmlUnlinkNode(cur); + xmlFreeNode(cur); + cur = tmp; + } + + (void) zonecfg_endent(handle); + return (Z_OK); +} + +static boolean_t +valid_uint(char *s, uint64_t *n) +{ + char *endp; + + /* strtoull accepts '-'?! so we want to flag that as an error */ + if (strchr(s, '-') != NULL) + return (B_FALSE); + + errno = 0; + *n = strtoull(s, &endp, 10); + + if (errno != 0 || *endp != '\0') + return (B_FALSE); + return (B_TRUE); +} + +/* + * Convert a string representing a number (possibly a fraction) into an integer. + * The string can have a modifier (K, M, G or T). The modifiers are treated + * as powers of two (not 10). 
+ */ +int +zonecfg_str_to_bytes(char *str, uint64_t *bytes) +{ + long double val; + char *unitp; + uint64_t scale; + + if ((val = strtold(str, &unitp)) < 0) + return (-1); + + /* remove any leading white space from units string */ + while (isspace(*unitp) != 0) + ++unitp; + + /* if no units explicitly set, error */ + if (unitp == NULL || *unitp == '\0') { + scale = 1; + } else { + int i; + char *units[] = {"K", "M", "G", "T", NULL}; + + scale = 1024; + + /* update scale based on units */ + for (i = 0; units[i] != NULL; i++) { + if (strcasecmp(unitp, units[i]) == 0) + break; + scale <<= 10; + } + + if (units[i] == NULL) + return (-1); + } + + *bytes = (uint64_t)(val * scale); + return (0); +} + +boolean_t +zonecfg_valid_ncpus(char *lowstr, char *highstr) +{ + uint64_t low, high; + + if (!valid_uint(lowstr, &low) || !valid_uint(highstr, &high) || + low < 1 || low > high) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_importance(char *impstr) +{ + uint64_t num; + + if (!valid_uint(impstr, &num)) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_alias_limit(char *name, char *limitstr, uint64_t *limit) +{ + int i; + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (B_FALSE); + + if (!valid_uint(limitstr, limit) || *limit < aliases[i].low_limit) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_memlimit(char *memstr, uint64_t *mem_val) +{ + if (zonecfg_str_to_bytes(memstr, mem_val) != 0) + return (B_FALSE); + + return (B_TRUE); +} + +static int +zerr_pool(char *pool_err, int err_size, int res) +{ + (void) strlcpy(pool_err, pool_strerror(pool_error()), err_size); + return (res); +} + +static int +create_tmp_pset(char *pool_err, int err_size, pool_conf_t *pconf, pool_t *pool, + char *name, int min, int max) +{ + pool_resource_t *res; + pool_elem_t *elem; + pool_value_t *val; + + if ((res = 
pool_resource_create(pconf, "pset", name)) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if (pool_associate(pconf, pool, res) != PO_SUCCESS) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if ((elem = pool_resource_to_elem(pconf, res)) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if ((val = pool_value_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + /* set the maximum number of cpus for the pset */ + pool_value_set_uint64(val, (uint64_t)max); + + if (pool_put_property(pconf, elem, "pset.max", val) != PO_SUCCESS) { + pool_value_free(val); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + + /* set the minimum number of cpus for the pset */ + pool_value_set_uint64(val, (uint64_t)min); + + if (pool_put_property(pconf, elem, "pset.min", val) != PO_SUCCESS) { + pool_value_free(val); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + + pool_value_free(val); + + return (Z_OK); +} + +static int +create_tmp_pool(char *pool_err, int err_size, pool_conf_t *pconf, char *name, + struct zone_psettab *pset_tab) +{ + pool_t *pool; + int res = Z_OK; + + /* create a temporary pool configuration */ + if (pool_conf_open(pconf, NULL, PO_TEMP) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + return (res); + } + + if ((pool = pool_create(pconf, name)) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL_CREATE); + goto done; + } + + /* set pool importance */ + if (pset_tab->zone_importance[0] != '\0') { + pool_elem_t *elem; + pool_value_t *val; + + if ((elem = pool_to_elem(pconf, pool)) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if ((val = pool_value_alloc()) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + pool_value_set_int64(val, + (int64_t)atoi(pset_tab->zone_importance)); + + if (pool_put_property(pconf, elem, "pool.importance", val) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + pool_value_free(val); + 
goto done; + } + + pool_value_free(val); + } + + if ((res = create_tmp_pset(pool_err, err_size, pconf, pool, name, + atoi(pset_tab->zone_ncpu_min), + atoi(pset_tab->zone_ncpu_max))) != Z_OK) + goto done; + + /* validation */ + if (pool_conf_status(pconf) == POF_INVALID) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* + * This validation is the one we expect to fail if the user specified + * an invalid configuration (too many cpus) for this system. + */ + if (pool_conf_validate(pconf, POV_RUNTIME) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL_CREATE); + goto done; + } + + /* + * Commit the dynamic configuration but not the pool configuration + * file. + */ + if (pool_conf_commit(pconf, 1) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + +done: + (void) pool_conf_close(pconf); + return (res); +} + +static int +get_running_tmp_pset(pool_conf_t *pconf, pool_t *pool, pool_resource_t *pset, + struct zone_psettab *pset_tab) +{ + int nfound = 0; + pool_elem_t *pe; + pool_value_t *pv = pool_value_alloc(); + uint64_t val_uint; + + if (pool != NULL) { + pe = pool_to_elem(pconf, pool); + if (pool_get_property(pconf, pe, "pool.importance", pv) + != POC_INVAL) { + int64_t val_int; + + (void) pool_value_get_int64(pv, &val_int); + (void) snprintf(pset_tab->zone_importance, + sizeof (pset_tab->zone_importance), "%d", val_int); + nfound++; + } + } + + if (pset != NULL) { + pe = pool_resource_to_elem(pconf, pset); + if (pool_get_property(pconf, pe, "pset.min", pv) != POC_INVAL) { + (void) pool_value_get_uint64(pv, &val_uint); + (void) snprintf(pset_tab->zone_ncpu_min, + sizeof (pset_tab->zone_ncpu_min), "%u", val_uint); + nfound++; + } + + if (pool_get_property(pconf, pe, "pset.max", pv) != POC_INVAL) { + (void) pool_value_get_uint64(pv, &val_uint); + (void) snprintf(pset_tab->zone_ncpu_max, + sizeof (pset_tab->zone_ncpu_max), "%u", val_uint); + nfound++; + } + } + + pool_value_free(pv); + + if (nfound == 3) + return 
(PO_SUCCESS); + + return (PO_FAIL); +} + +/* + * Determine if a tmp pool is configured and if so, if the configuration is + * still valid or if it has been changed since the tmp pool was created. + * If the tmp pool configuration is no longer valid, delete the tmp pool. + * + * Set *valid=B_TRUE if there is an existing, valid tmp pool configuration. + */ +static int +verify_del_tmp_pool(pool_conf_t *pconf, char *tmp_name, char *pool_err, + int err_size, struct zone_psettab *pset_tab, boolean_t *exists) +{ + int res = Z_OK; + pool_t *pool; + pool_resource_t *pset; + struct zone_psettab pset_current; + + *exists = B_FALSE; + + if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + return (res); + } + + pool = pool_get_pool(pconf, tmp_name); + pset = pool_get_resource(pconf, "pset", tmp_name); + + if (pool == NULL && pset == NULL) { + /* no tmp pool configured */ + goto done; + } + + /* + * If an existing tmp pool for this zone is configured with the proper + * settings, then the tmp pool is valid. + */ + if (get_running_tmp_pset(pconf, pool, pset, &pset_current) + == PO_SUCCESS && + strcmp(pset_tab->zone_ncpu_min, + pset_current.zone_ncpu_min) == 0 && + strcmp(pset_tab->zone_ncpu_max, + pset_current.zone_ncpu_max) == 0 && + strcmp(pset_tab->zone_importance, + pset_current.zone_importance) == 0) { + *exists = B_TRUE; + + } else { + /* + * An out-of-date tmp pool configuration exists. Delete it + * so that we can create the correct tmp pool config. 
+ */ + if (pset != NULL && + pool_resource_destroy(pconf, pset) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if (pool != NULL && + pool_destroy(pconf, pool) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* commit dynamic config */ + if (pool_conf_commit(pconf, 0) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + } + +done: + (void) pool_conf_close(pconf); + + return (res); +} + +/* + * Destroy any existing tmp pool. + */ +int +zonecfg_destroy_tmp_pool(char *zone_name, char *pool_err, int err_size) +{ + int status; + int res = Z_OK; + pool_conf_t *pconf; + pool_t *pool; + pool_resource_t *pset; + char tmp_name[MAX_TMP_POOL_NAME]; + + /* if pools not enabled then nothing to do */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_OK); + + if ((pconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + (void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name); + + if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + pool_conf_free(pconf); + return (res); + } + + pool = pool_get_pool(pconf, tmp_name); + pset = pool_get_resource(pconf, "pset", tmp_name); + + if (pool == NULL && pset == NULL) { + /* nothing to destroy, we're done */ + goto done; + } + + if (pset != NULL && pool_resource_destroy(pconf, pset) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if (pool != NULL && pool_destroy(pconf, pool) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* commit dynamic config */ + if (pool_conf_commit(pconf, 0) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + +done: + (void) pool_conf_close(pconf); + pool_conf_free(pconf); + + return (res); +} + +/* + * Attempt to bind to a tmp pool for this zone. 
If there is no tmp pool + * configured, we just return Z_OK. + * + * We either attempt to create the tmp pool for this zone or rebind to an + * existing tmp pool for this zone. + * + * Rebinding is used when a zone with a tmp pool reboots so that we don't have + * to recreate the tmp pool. To do this we need to be sure we work correctly + * for the following cases: + * + * - there is an existing, properly configured tmp pool. + * - zonecfg added tmp pool after zone was booted, must now create. + * - zonecfg updated tmp pool config after zone was booted, in this case + * we destroy the old tmp pool and create a new one. + */ +int +zonecfg_bind_tmp_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err, + int err_size) +{ + struct zone_psettab pset_tab; + int err; + int status; + pool_conf_t *pconf; + boolean_t exists; + char zone_name[ZONENAME_MAX]; + char tmp_name[MAX_TMP_POOL_NAME]; + + (void) getzonenamebyid(zoneid, zone_name, sizeof (zone_name)); + + err = zonecfg_lookup_pset(handle, &pset_tab); + + /* if no temporary pool configured, we're done */ + if (err == Z_NO_ENTRY) + return (Z_OK); + + /* + * importance might not have a value but we need to validate it here, + * so set the default. + */ + if (pset_tab.zone_importance[0] == '\0') + (void) strlcpy(pset_tab.zone_importance, "1", + sizeof (pset_tab.zone_importance)); + + /* if pools not enabled, enable them now */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) { + if (pool_set_status(POOL_ENABLED) != PO_SUCCESS) + return (Z_POOL_ENABLE); + } + + if ((pconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + (void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name); + + /* + * Check if a valid tmp pool/pset already exists. If so, we just + * reuse it. 
+ */ + if ((err = verify_del_tmp_pool(pconf, tmp_name, pool_err, err_size, + &pset_tab, &exists)) != Z_OK) { + pool_conf_free(pconf); + return (err); + } + + if (!exists) + err = create_tmp_pool(pool_err, err_size, pconf, tmp_name, + &pset_tab); + + pool_conf_free(pconf); + + if (err != Z_OK) + return (err); + + /* Bind the zone to the pool. */ + if (pool_set_binding(tmp_name, P_ZONEID, zoneid) != PO_SUCCESS) + return (zerr_pool(pool_err, err_size, Z_POOL_BIND)); + + return (Z_OK); +} + +/* + * Attempt to bind to a permanent pool for this zone. If there is no + * permanent pool configured, we just return Z_OK. + */ +int +zonecfg_bind_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err, + int err_size) +{ + pool_conf_t *poolconf; + pool_t *pool; + char poolname[MAXPATHLEN]; + int status; + int error; + + /* + * Find the pool mentioned in the zone configuration, and bind to it. + */ + error = zonecfg_get_pool(handle, poolname, sizeof (poolname)); + if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) { + /* + * The property is not set on the zone, so the pool + * should be bound to the default pool. But that's + * already done by the kernel, so we can just return. + */ + return (Z_OK); + } + if (error != Z_OK) { + /* + * Not an error, even though it shouldn't be happening. + */ + return (Z_OK); + } + /* + * Don't do anything if pools aren't enabled. + */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_POOLS_NOT_ACTIVE); + + /* + * Try to provide a sane error message if the requested pool doesn't + * exist. 
+ */ + if ((poolconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != + PO_SUCCESS) { + pool_conf_free(poolconf); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + pool = pool_get_pool(poolconf, poolname); + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + if (pool == NULL) + return (Z_NO_POOL); + + /* + * Bind the zone to the pool. + */ + if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) { + /* if bind fails, return poolname for the error msg */ + (void) strlcpy(pool_err, poolname, err_size); + return (Z_POOL_BIND); + } + + return (Z_OK); +} + + +static boolean_t +svc_enabled(char *svc_name) +{ + scf_simple_prop_t *prop; + boolean_t found = B_FALSE; + + prop = scf_simple_prop_get(NULL, svc_name, SCF_PG_GENERAL, + SCF_PROPERTY_ENABLED); + + if (scf_simple_prop_numvalues(prop) == 1 && + *scf_simple_prop_next_boolean(prop) != 0) + found = B_TRUE; + + scf_simple_prop_free(prop); + + return (found); +} + +/* + * If the zone has capped-memory, make sure the rcap service is enabled. + */ +int +zonecfg_enable_rcapd(char *err, int size) +{ + if (!svc_enabled(RCAP_SERVICE) && + smf_enable_instance(RCAP_SERVICE, 0) == -1) { + (void) strlcpy(err, scf_strerror(scf_error()), size); + return (Z_SYSTEM); + } + + return (Z_OK); +} + +/* + * Return true if pset has cpu range specified and poold is not enabled. 
+ */ +boolean_t +zonecfg_warn_poold(zone_dochandle_t handle) +{ + struct zone_psettab pset_tab; + int min, max; + int err; + + err = zonecfg_lookup_pset(handle, &pset_tab); + + /* if no temporary pool configured, we're done */ + if (err == Z_NO_ENTRY) + return (B_FALSE); + + min = atoi(pset_tab.zone_ncpu_min); + max = atoi(pset_tab.zone_ncpu_max); + + /* range not specified, no need for poold */ + if (min == max) + return (B_FALSE); + + /* we have a range, check if poold service is enabled */ + if (svc_enabled(POOLD_SERVICE)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +get_pool_sched_class(char *poolname, char *class, int clsize) +{ + int status; + pool_conf_t *poolconf; + pool_t *pool; + pool_elem_t *pe; + pool_value_t *pv = pool_value_alloc(); + const char *sched_str; + + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_NO_POOL); + + if ((poolconf = pool_conf_alloc()) == NULL) + return (Z_NO_POOL); + + if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != + PO_SUCCESS) { + pool_conf_free(poolconf); + return (Z_NO_POOL); + } + + if ((pool = pool_get_pool(poolconf, poolname)) == NULL) { + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + return (Z_NO_POOL); + } + + pe = pool_to_elem(poolconf, pool); + if (pool_get_property(poolconf, pe, "pool.scheduler", pv) + != POC_INVAL) { + (void) pool_value_get_string(pv, &sched_str); + if (strlcpy(class, sched_str, clsize) >= clsize) + return (Z_TOO_BIG); + } + + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + return (Z_OK); +} + +/* + * Get the default scheduling class for the zone. This will either be the + * class set on the zone's pool or the system default scheduling class. 
+ */ +int +zonecfg_get_dflt_sched_class(zone_dochandle_t handle, char *class, int clsize) +{ + char poolname[MAXPATHLEN]; + + if (zonecfg_get_pool(handle, poolname, sizeof (poolname)) == Z_OK) { + /* check if the zone's pool specified a sched class */ + if (get_pool_sched_class(poolname, class, clsize) == Z_OK) + return (Z_OK); + } + + if (priocntl(0, 0, PC_GETDFLCL, class, (uint64_t)clsize) == -1) + return (Z_TOO_BIG); + + return (Z_OK); +} + int zonecfg_setfsent(zone_dochandle_t handle) { @@ -4825,6 +5864,509 @@ zonecfg_enddsent(zone_dochandle_t handle) return (zonecfg_endent(handle)); } +/* + * Support for aliased rctls; that is, rctls that have simplified names in + * zonecfg. For example, max-lwps is an alias for a well defined zone.max-lwps + * rctl. If there are multiple existing values for one of these rctls or if + * there is a single value that does not match the well defined template (i.e. + * it has a different action) then we cannot treat the rctl as having an alias + * so we return Z_ALIAS_DISALLOW. That means that the rctl cannot be + * managed in zonecfg via an alias and that the standard rctl syntax must be + * used. 
+ * + * The possible return values are: + * Z_NO_PROPERTY_ID - invalid alias name + * Z_ALIAS_DISALLOW - pre-existing, incompatible rctl definition + * Z_NO_ENTRY - no rctl is configured for this alias + * Z_OK - we got a valid rctl for the specified alias + */ +int +zonecfg_get_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t *rval) +{ + boolean_t found = B_FALSE; + boolean_t found_val = B_FALSE; + xmlNodePtr cur, val; + char savedname[MAXNAMELEN]; + struct zone_rctlvaltab rctl; + int i; + int err; + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_PROPERTY_ID); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_RCTL) != 0) + continue; + if ((fetchprop(cur, DTD_ATTR_NAME, savedname, + sizeof (savedname)) == Z_OK) && + (strcmp(savedname, aliases[i].realname) == 0)) { + + /* + * If we already saw one of these, we can't have an + * alias since we just found another. + */ + if (found) + return (Z_ALIAS_DISALLOW); + found = B_TRUE; + + for (val = cur->xmlChildrenNode; val != NULL; + val = val->next) { + /* + * If we already have one value, we can't have + * an alias since we just found another. 
+ */ + if (found_val) + return (Z_ALIAS_DISALLOW); + found_val = B_TRUE; + + if ((fetchprop(val, DTD_ATTR_PRIV, + rctl.zone_rctlval_priv, + sizeof (rctl.zone_rctlval_priv)) != Z_OK)) + break; + if ((fetchprop(val, DTD_ATTR_LIMIT, + rctl.zone_rctlval_limit, + sizeof (rctl.zone_rctlval_limit)) != Z_OK)) + break; + if ((fetchprop(val, DTD_ATTR_ACTION, + rctl.zone_rctlval_action, + sizeof (rctl.zone_rctlval_action)) != Z_OK)) + break; + } + + /* check priv and action match the expected vals */ + if (strcmp(rctl.zone_rctlval_priv, + aliases[i].priv) != 0 || + strcmp(rctl.zone_rctlval_action, + aliases[i].action) != 0) + return (Z_ALIAS_DISALLOW); + } + } + + if (found) { + *rval = strtoull(rctl.zone_rctlval_limit, NULL, 10); + return (Z_OK); + } + + return (Z_NO_ENTRY); +} + +int +zonecfg_rm_aliased_rctl(zone_dochandle_t handle, char *name) +{ + int i; + uint64_t val; + struct zone_rctltab rctltab; + + /* + * First check that we have a valid aliased rctl to remove. + * This will catch an rctl entry with non-standard values or + * multiple rctl values for this name. We need to ignore those + * rctl entries. 
+ */ + if (zonecfg_get_aliased_rctl(handle, name, &val) != Z_OK) + return (Z_OK); + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_RESOURCE_ID); + + (void) strlcpy(rctltab.zone_rctl_name, aliases[i].realname, + sizeof (rctltab.zone_rctl_name)); + + return (zonecfg_delete_rctl(handle, &rctltab)); +} + +boolean_t +zonecfg_aliased_rctl_ok(zone_dochandle_t handle, char *name) +{ + uint64_t tmp_val; + + switch (zonecfg_get_aliased_rctl(handle, name, &tmp_val)) { + case Z_OK: + /*FALLTHRU*/ + case Z_NO_ENTRY: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +int +zonecfg_set_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t val) +{ + int i; + int err; + struct zone_rctltab rctltab; + struct zone_rctlvaltab *rctlvaltab; + char buf[128]; + + if (!zonecfg_aliased_rctl_ok(handle, name)) + return (Z_ALIAS_DISALLOW); + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_RESOURCE_ID); + + /* remove any pre-existing definition for this rctl */ + (void) zonecfg_rm_aliased_rctl(handle, name); + + (void) strlcpy(rctltab.zone_rctl_name, aliases[i].realname, + sizeof (rctltab.zone_rctl_name)); + + rctltab.zone_rctl_valptr = NULL; + + if ((rctlvaltab = calloc(1, sizeof (struct zone_rctlvaltab))) == NULL) + return (Z_NOMEM); + + (void) snprintf(buf, sizeof (buf), "%llu", (long long)val); + + (void) strlcpy(rctlvaltab->zone_rctlval_priv, aliases[i].priv, + sizeof (rctlvaltab->zone_rctlval_priv)); + (void) strlcpy(rctlvaltab->zone_rctlval_limit, buf, + sizeof (rctlvaltab->zone_rctlval_limit)); + (void) strlcpy(rctlvaltab->zone_rctlval_action, aliases[i].action, + sizeof (rctlvaltab->zone_rctlval_action)); + + rctlvaltab->zone_rctlval_next = NULL; + + if ((err = zonecfg_add_rctl_value(&rctltab, rctlvaltab)) != Z_OK) + return (err); + + return 
(zonecfg_add_rctl(handle, &rctltab)); +} + +static int +delete_tmp_pool(zone_dochandle_t handle) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_TMPPOOL) == 0) { + xmlUnlinkNode(cur); + xmlFreeNode(cur); + return (Z_OK); + } + } + + return (Z_NO_RESOURCE_ID); +} + +static int +modify_tmp_pool(zone_dochandle_t handle, char *pool_importance) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + xmlNodePtr newnode; + + err = delete_tmp_pool(handle); + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + if (*pool_importance != '\0') { + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_TMPPOOL, NULL); + if ((err = newprop(newnode, DTD_ATTR_IMPORTANCE, + pool_importance)) != Z_OK) + return (err); + } + + return (Z_OK); +} + +static int +add_pset_core(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + xmlNodePtr newnode, cur = handle->zone_dh_cur; + int err; + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_PSET, NULL); + if ((err = newprop(newnode, DTD_ATTR_NCPU_MIN, + tabptr->zone_ncpu_min)) != Z_OK) + return (err); + if ((err = newprop(newnode, DTD_ATTR_NCPU_MAX, + tabptr->zone_ncpu_max)) != Z_OK) + return (err); + + if ((err = modify_tmp_pool(handle, tabptr->zone_importance)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_add_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + if ((err = add_pset_core(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_delete_pset(zone_dochandle_t handle) +{ + int err; + int res = Z_NO_RESOURCE_ID; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for 
(cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_PSET) == 0) { + xmlUnlinkNode(cur); + xmlFreeNode(cur); + res = Z_OK; + break; + } + } + + /* + * Once we have msets, we should check that a mset + * do not exist before we delete the tmp_pool data. + */ + err = delete_tmp_pool(handle); + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + return (res); +} + +int +zonecfg_modify_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = zonecfg_delete_pset(handle)) != Z_OK) + return (err); + + if ((err = add_pset_core(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_lookup_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + xmlNodePtr cur; + int err; + int res = Z_NO_ENTRY; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + /* this is an optional component */ + tabptr->zone_importance[0] = '\0'; + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_PSET) == 0) { + if ((err = fetchprop(cur, DTD_ATTR_NCPU_MIN, + tabptr->zone_ncpu_min, + sizeof (tabptr->zone_ncpu_min))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + if ((err = fetchprop(cur, DTD_ATTR_NCPU_MAX, + tabptr->zone_ncpu_max, + sizeof (tabptr->zone_ncpu_max))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + res = Z_OK; + + } else if (xmlStrcmp(cur->name, DTD_ELEM_TMPPOOL) == 0) { + if ((err = fetchprop(cur, DTD_ATTR_IMPORTANCE, + tabptr->zone_importance, + sizeof (tabptr->zone_importance))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + } + } + + return (res); +} + +int +zonecfg_getpsetent(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if ((err = zonecfg_setent(handle)) != Z_OK) + 
return (err); + + err = zonecfg_lookup_pset(handle, tabptr); + + (void) zonecfg_endent(handle); + + return (err); +} + +static int +add_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr newnode, cur = handle->zone_dh_cur; + int err; + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_MCAP, NULL); + if ((err = newprop(newnode, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap)) + != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_delete_mcap(zone_dochandle_t handle) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) != 0) + continue; + + xmlUnlinkNode(cur); + xmlFreeNode(cur); + return (Z_OK); + } + return (Z_NO_RESOURCE_ID); +} + +int +zonecfg_modify_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + err = zonecfg_delete_mcap(handle); + /* it is ok if there is no mcap entry */ + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + if ((err = add_mcap(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_lookup_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr cur; + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) != 0) + continue; + if ((err = fetchprop(cur, DTD_ATTR_PHYSCAP, + tabptr->zone_physmem_cap, + sizeof (tabptr->zone_physmem_cap))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + return (Z_OK); + } + + return (Z_NO_ENTRY); +} + +static int +getmcapent_core(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr cur; + int err; + + if (handle == NULL) + return (Z_INVAL); + + 
if ((cur = handle->zone_dh_cur) == NULL) + return (Z_NO_ENTRY); + + for (; cur != NULL; cur = cur->next) + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) == 0) + break; + if (cur == NULL) { + handle->zone_dh_cur = handle->zone_dh_top; + return (Z_NO_ENTRY); + } + + if ((err = fetchprop(cur, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap, + sizeof (tabptr->zone_physmem_cap))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + handle->zone_dh_cur = cur->next; + return (Z_OK); +} + +int +zonecfg_getmcapent(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + int err; + + if ((err = zonecfg_setent(handle)) != Z_OK) + return (err); + + err = getmcapent_core(handle, tabptr); + + (void) zonecfg_endent(handle); + + return (err); +} + int zonecfg_setpkgent(zone_dochandle_t handle) { diff --git a/usr/src/lib/libzonecfg/common/mapfile-vers b/usr/src/lib/libzonecfg/common/mapfile-vers index a9d59548d3..e2bb782688 100644 --- a/usr/src/lib/libzonecfg/common/mapfile-vers +++ b/usr/src/lib/libzonecfg/common/mapfile-vers @@ -40,10 +40,15 @@ SUNWprivate_1.1 { zonecfg_add_fs_option; zonecfg_add_ipd; zonecfg_add_nwif; + zonecfg_add_pset; zonecfg_add_rctl; zonecfg_add_rctl_value; zonecfg_add_scratch; + zonecfg_aliased_rctl_ok; + zonecfg_apply_rctls; zonecfg_attach_manifest; + zonecfg_bind_pool; + zonecfg_bind_tmp_pool; zonecfg_check_handle; zonecfg_close_scratch; zonecfg_construct_rctlblk; @@ -54,15 +59,20 @@ SUNWprivate_1.1 { zonecfg_delete_ds; zonecfg_delete_filesystem; zonecfg_delete_ipd; + zonecfg_delete_mcap; zonecfg_delete_nwif; + zonecfg_delete_pset; zonecfg_delete_rctl; zonecfg_delete_scratch; + zonecfg_del_all_resources; zonecfg_destroy; zonecfg_destroy_snapshot; + zonecfg_destroy_tmp_pool; zonecfg_detached; zonecfg_detach_save; zonecfg_devperms_apply; zonecfg_devwalk; + zonecfg_enable_rcapd; zonecfg_endattrent; zonecfg_enddevent; zonecfg_enddevperment; @@ -78,6 +88,7 @@ SUNWprivate_1.1 { zonecfg_fini_handle; zonecfg_free_fs_option_list; 
zonecfg_free_rctl_value_list; + zonecfg_get_aliased_rctl; zonecfg_get_attach_handle; zonecfg_get_attr_boolean; zonecfg_getattrent; @@ -88,6 +99,7 @@ SUNWprivate_1.1 { zonecfg_get_bootargs; zonecfg_get_brand; zonecfg_get_detach_info; + zonecfg_get_dflt_sched_class; zonecfg_getdevent; zonecfg_getdevperment; zonecfg_getdsent; @@ -95,6 +107,7 @@ SUNWprivate_1.1 { zonecfg_get_handle; zonecfg_getipdent; zonecfg_get_limitpriv; + zonecfg_getmcapent; zonecfg_get_name; zonecfg_get_name_by_uuid; zonecfg_getnwifent; @@ -102,8 +115,10 @@ SUNWprivate_1.1 { zonecfg_getpkgent; zonecfg_get_pool; zonecfg_get_privset; + zonecfg_getpsetent; zonecfg_getrctlent; zonecfg_get_root; + zonecfg_get_sched_class; zonecfg_get_scratch; zonecfg_get_snapshot_handle; zonecfg_get_template_handle; @@ -120,28 +135,35 @@ SUNWprivate_1.1 { zonecfg_lookup_ds; zonecfg_lookup_filesystem; zonecfg_lookup_ipd; + zonecfg_lookup_mcap; zonecfg_lookup_nwif; + zonecfg_lookup_pset; zonecfg_lookup_rctl; zonecfg_modify_attr; zonecfg_modify_dev; zonecfg_modify_ds; zonecfg_modify_filesystem; zonecfg_modify_ipd; + zonecfg_modify_mcap; zonecfg_modify_nwif; + zonecfg_modify_pset; zonecfg_modify_rctl; zonecfg_notify_bind; zonecfg_notify_critical_abort; zonecfg_notify_critical_enter; zonecfg_notify_critical_exit; zonecfg_notify_unbind; + zonecfg_num_resources; zonecfg_open_scratch; zonecfg_remove_fs_option; zonecfg_remove_rctl_value; zonecfg_reverse_scratch; + zonecfg_rm_aliased_rctl; zonecfg_rm_detached; zonecfg_same_net_address; zonecfg_save; zonecfg_setattrent; + zonecfg_set_aliased_rctl; zonecfg_set_autoboot; zonecfg_set_bootargs; zonecfg_set_brand; @@ -158,15 +180,22 @@ SUNWprivate_1.1 { zonecfg_set_pool; zonecfg_setrctlent; zonecfg_set_root; + zonecfg_set_sched; zonecfg_set_zonepath; zonecfg_strerror; + zonecfg_str_to_bytes; zonecfg_validate_zonename; + zonecfg_valid_alias_limit; zonecfg_valid_fs_type; + zonecfg_valid_importance; + zonecfg_valid_memlimit; + zonecfg_valid_ncpus; zonecfg_valid_net_address; 
zonecfg_valid_rctl; zonecfg_valid_rctlblk; zonecfg_valid_rctlname; zonecfg_verify_save; + zonecfg_warn_poold; zone_get_brand; zone_get_devroot; zone_get_id; diff --git a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 index 3208af7a79..c51e89add3 100644 --- a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 +++ b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 @@ -111,7 +111,27 @@ mode CDATA #REQUIRED acl CDATA #REQUIRED> -<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm)*> +<!-- + The tmp_pool element is separate from the pset element so that + we can track the importance value at the pool level, where it + belongs, instead of at the pset level. Once we have msets this + will be important since tmp psets and tmp msets will share a common + pool-level importance. +--> +<!ELEMENT tmp_pool EMPTY> + +<!ATTLIST tmp_pool importance CDATA #REQUIRED> + +<!ELEMENT pset EMPTY> + +<!ATTLIST pset ncpu_min CDATA #REQUIRED + ncpu_max CDATA #REQUIRED> + +<!ELEMENT mcap EMPTY> + +<!ATTLIST mcap physcap CDATA #REQUIRED> + +<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm | tmp_pool | pset | mcap)*> <!ATTLIST zone name CDATA #REQUIRED zonepath CDATA #REQUIRED @@ -120,4 +140,5 @@ limitpriv CDATA "" bootargs CDATA "" brand CDATA "" + scheduling-class CDATA "" version NMTOKEN #FIXED '1'> diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index c52316079d..6ac2e461ab 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -1178,6 +1178,7 @@ f none usr/include/sys/varargs.h 644 root bin f none usr/include/sys/vfs.h 644 root bin f none usr/include/sys/vfstab.h 644 root bin f none usr/include/sys/vm.h 644 root bin +f none usr/include/sys/vm_usage.h 644 root bin f none usr/include/sys/vmem.h 644 root bin f none 
usr/include/sys/vmem_impl.h 644 root bin f none usr/include/sys/vmem_impl_user.h 644 root bin diff --git a/usr/src/pkgdefs/SUNWrcapu/depend b/usr/src/pkgdefs/SUNWrcapu/depend index 9aaa446bca..a7375758b0 100644 --- a/usr/src/pkgdefs/SUNWrcapu/depend +++ b/usr/src/pkgdefs/SUNWrcapu/depend @@ -1,13 +1,12 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -43,3 +42,4 @@ P SUNWrcapr Solaris Resource Capping Daemon (Root) P SUNWcsu Core Solaris, (Usr) P SUNWcsl Core Solaris, (Shared Libs) +P SUNWzoneu Solaris Zones (Usr) diff --git a/usr/src/pkgdefs/SUNWzoner/prototype_com b/usr/src/pkgdefs/SUNWzoner/prototype_com index 009de7fb9f..15661840ea 100644 --- a/usr/src/pkgdefs/SUNWzoner/prototype_com +++ b/usr/src/pkgdefs/SUNWzoner/prototype_com @@ -56,9 +56,11 @@ f none etc/zones/SUNWblank.xml 444 root bin d none lib 755 root bin d none lib/svc 0755 root bin d none lib/svc/method 0755 root bin +f none lib/svc/method/svc-resource-mgmt 0555 root bin f none lib/svc/method/svc-zones 0555 root bin d none var 755 root sys d none var/svc 755 root sys d none var/svc/manifest 755 root sys d none var/svc/manifest/system 755 root sys +f manifest var/svc/manifest/system/resource-mgmt.xml 0444 root sys f manifest var/svc/manifest/system/zones.xml 0444 root sys diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 21d5a7eb67..b10d453c7b 100644 --- 
a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -332,6 +332,7 @@ superfluous_local_zone_files=" lib/svc/method/svc-poold lib/svc/method/svc-pools lib/svc/method/svc-power + lib/svc/method/svc-resource-mgmt lib/svc/method/svc-rmvolmgr lib/svc/method/svc-scheduler lib/svc/method/svc-sckmd @@ -401,6 +402,7 @@ superfluous_local_zone_files=" var/svc/manifest/system/poold.xml var/svc/manifest/system/pools.xml var/svc/manifest/system/power.xml + var/svc/manifest/system/resource-mgmt.xml var/svc/manifest/system/scheduler.xml var/svc/manifest/system/sysevent.xml var/svc/manifest/system/zones.xml diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 32a63d6c22..b2bbcbc8c3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -334,6 +334,7 @@ GENUNIX_OBJS += \ vm_seg.o \ vm_subr.o \ vm_swap.o \ + vm_usage.o \ vnode.o \ vuid_queue.o \ vuid_store.o \ diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3bb90cf1fa..9197dc815b 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, struct pcmpargs pcmpargs; pc_vaparms_t vaparms; char clname[PC_CLNMSZ]; + char *outstr; int count; kthread_id_t retthreadp; proc_t *initpp; @@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, int rv = 0; pid_t saved_pid; id_t classid; + int size; int (*copyinfn)(const void *, void *, size_t); int (*copyoutfn)(const void *, void *, size_t); @@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, ASSERT(defaultcid > 0 && defaultcid < loaded_classes); break; + case PC_GETDFLCL: + mutex_enter(&class_lock); + + if (defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[defaultcid].cl_name; + size = strlen(outstr) + 1; + if (arg != NULL) + if ((*copyoutfn)(outstr, arg, 
size) != 0) + error = EFAULT; + + mutex_exit(&class_lock); + break; + default: error = EINVAL; break; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 5a7000c242..c5145cccf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +66,7 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); @@ -79,9 +79,10 @@ tmp_resv( * * Deny if trying to reserve more than tmpfs can allocate */ + zone = tm->tm_vfsp->vfs_zone; if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || - (!anon_checkspace(ptob(pages + tmpfs_minfree))) || - (anon_resv(delta) == 0))) { + (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || + (anon_resv_zone(delta, zone) == 0))) { return (1); } @@ -114,7 +115,7 @@ tmp_unresv( ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); - anon_unresv(delta); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); tm->tm_anonmem -= btopr(delta); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index d623dce3f7..aa870b124a 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -215,9 +215,26 @@ wrtmp( if (delta > 0) { pagecreate = 1; if (tmp_resv(tm, tp, delta, pagecreate)) { - cmn_err(CE_WARN, - "%s: File system full, swap space limit exceeded", + /* + * Log file system full in the zone that owns + * the tmpfs mount, as well as in the global + * zone if necessary. 
+ */ + zcmn_err(tm->tm_vfsp->vfs_zone->zone_id, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", tm->tm_mntpath); + + if (tm->tm_vfsp->vfs_zone->zone_id != + GLOBAL_ZONEID) { + + vfs_t *vfs = tm->tm_vfsp; + + zcmn_err(GLOBAL_ZONEID, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", + vfs->vfs_vnodecovered->v_path); + } error = ENOSPC; break; } diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 19700ce685..3c63231253 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -165,15 +164,6 @@ */ #define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) -static void i_mod_hash_clear_nosync(mod_hash_t *); -static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t, mod_hash_hndl_t); -static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); - /* * Cache for struct mod_hash_entry */ @@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash) * i_mod_hash() * Call the hashing algorithm for this hash table, with the given key. 
*/ -static uint_t +uint_t i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) { uint_t h; @@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) * mod_hash_find() * Find a value in the hash table corresponding to the given key. */ -static int +int i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) { @@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } -static void +void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) { @@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash, * Clears the given hash table by calling the destructor of every hash * element and freeing up all mod_hash_entry's. */ -static void +void i_mod_hash_clear_nosync(mod_hash_t *hash) { int i; diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 88b0258afe..fecc4a6c45 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -385,6 +385,56 @@ pgfind(pid_t pgid) } /* + * Sets P_PR_LOCK on a non-system process. Process must be fully created + * and not exiting to succeed. + * + * Returns 0 on success. + * Returns 1 if P_PR_LOCK is set. + * Returns -1 if proc is in invalid state. + */ +int +sprtrylock_proc(proc_t *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* skip system and incomplete processes */ + if (p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { + return (-1); + } + + if (p->p_proc_flag & P_PR_LOCK) + return (1); + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + + return (0); +} + +/* + * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, + * and the proc pointer no longer valid, as the proc may have exited. 
+ */ +void +sprwaitlock_proc(proc_t *p) +{ + kmutex_t *mp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); +} + +/* * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. * Returns the proc pointer on success, NULL on failure. sprlock() is * really just a stripped-down version of pr_p_lock() to allow practive @@ -394,7 +444,7 @@ proc_t * sprlock_zone(pid_t pid, zoneid_t zoneid) { proc_t *p; - kmutex_t *mp; + int ret; for (;;) { mutex_enter(&pidlock); @@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid) mutex_exit(&pidlock); return (NULL); } - /* - * p_lock is persistent, but p itself is not -- it could - * vanish during cv_wait(). Load p->p_lock now so we can - * drop it after cv_wait() without referencing p. - */ - mp = &p->p_lock; - mutex_enter(mp); + mutex_enter(&p->p_lock); mutex_exit(&pidlock); - /* - * If the process is in some half-baked state, fail. 
- */ - if (p->p_stat == SZOMB || p->p_stat == SIDL || - (p->p_flag & (SEXITING | SEXITLWPS))) { - mutex_exit(mp); - return (NULL); - } + if (panicstr) return (p); - if (!(p->p_proc_flag & P_PR_LOCK)) + + ret = sprtrylock_proc(p); + if (ret == -1) { + mutex_exit(&p->p_lock); + return (NULL); + } else if (ret == 0) { break; - cv_wait(&pr_pid_cv[p->p_slot], mp); - mutex_exit(mp); + } + sprwaitlock_proc(p); } - p->p_proc_flag |= P_PR_LOCK; - THREAD_KPRI_REQUEST(); return (p); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index ceb90850fa..818bb54701 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -293,6 +293,8 @@ pool_enable(void) (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); + (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", + "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); @@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) } if (idtype == P_PROJID) { - kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); + kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 6c266c0ca3..d75b60f6e9 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -29,6 +29,7 @@ #include <sys/modhash.h> #include <sys/modctl.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/proc.h> @@ -103,6 +104,8 @@ struct project_zone { * acquired, the hash lock is to be acquired first. 
*/ +static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone); +static void project_kstat_delete(kproject_t *pj); static void project_data_init(kproject_data_t *data) @@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data) data->kpd_locked_mem_ctl = UINT64_MAX; data->kpd_contract = 0; data->kpd_crypto_mem = 0; + data->kpd_lockedmem_kstat = NULL; } /*ARGSUSED*/ @@ -179,11 +183,11 @@ project_hold(kproject_t *p) } /* - * kproject_t *project_hold_by_id(projid_t, zoneid_t, int) + * kproject_t *project_hold_by_id(projid_t, zone_t *, int) * * Overview * project_hold_by_id() performs a look-up in the dictionary of projects - * active on the system by specified project ID + zone ID and puts a hold on + * active on the system by specified project ID + zone and puts a hold on * it. The third argument defines the desired behavior in the case when * project with given project ID cannot be found: * @@ -202,7 +206,7 @@ project_hold(kproject_t *p) * Caller must be in a context suitable for KM_SLEEP allocations. 
*/ kproject_t * -project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) +project_hold_by_id(projid_t id, zone_t *zone, int flag) { kproject_t *spare_p; kproject_t *p; @@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) rctl_alloc_gp_t *gp; rctl_entity_p_t e; struct project_zone pz; + boolean_t create = B_FALSE; + kstat_t *ksp; pz.kpj_id = id; - pz.kpj_zoneid = zoneid; + pz.kpj_zoneid = zone->zone_id; if (flag == PROJECT_HOLD_FIND) { mutex_enter(&project_hash_lock); @@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) mutex_enter(&project_hash_lock); if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz, (mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) { + p = spare_p; p->kpj_id = id; - p->kpj_zoneid = zoneid; + p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; p->kpj_nlwps = 0; @@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) * Insert project into global project list. */ mutex_enter(&projects_list_lock); - if (id != 0 || zoneid != GLOBAL_ZONEID) { + if (id != 0 || zone != &zone0) { p->kpj_next = projects_list; p->kpj_prev = projects_list->kpj_prev; p->kpj_prev->kpj_next = p; @@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) projects_list = p; } mutex_exit(&projects_list_lock); + create = B_TRUE; } else { mutex_exit(&curproc->p_lock); mod_hash_cancel(projects_hash, &hndl); @@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) p->kpj_count++; mutex_exit(&project_hash_lock); + /* + * The kstat stores the project's zone name, as zoneid's may change + * across reboots. 
+ */ + if (create == B_TRUE) { + ksp = project_kstat_create(p, zone); + mutex_enter(&project_hash_lock); + ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); + p->kpj_data.kpd_lockedmem_kstat = ksp; + mutex_exit(&project_hash_lock); + } return (p); } - /* * void project_rele(kproject_t *) * @@ -325,6 +343,7 @@ project_rele(kproject_t *p) mutex_exit(&projects_list_lock); rctl_set_free(p->kpj_rctls); + project_kstat_delete(p); if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p)) panic("unable to delete project %d zone %d", p->kpj_id, @@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); + ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock)); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; if (q + inc > rval->rcv_value) return (1); @@ -868,7 +887,7 @@ project_init(void) rctl_add_default_limit("project.max-contracts", 10000, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); - t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID, + t0.t_proj = proj0p = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); mutex_enter(&p0.p_lock); @@ -876,3 +895,57 @@ project_init(void) mutex_exit(&p0.p_lock); proj0p->kpj_ntasks = 1; } + +static int +project_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + kproject_t *pj = ksp->ks_private; + kproject_kstat_t *kpk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem; + kpk->kpk_value.value.ui64 = pj->kpj_data.kpd_locked_mem_ctl; + return (0); +} + +static kstat_t * +project_kstat_create(kproject_t 
*pj, zone_t *zone) +{ + kstat_t *ksp; + kproject_kstat_t *kpk; + char *zonename = zone->zone_name; + + ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (kproject_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zonename) + 1; + kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&kpk->kpk_zonename, zonename); + kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = project_lockedmem_kstat_update; + ksp->ks_private = pj; + kstat_install(ksp); + + return (ksp); +} + +/* + * Delete the project's lockedmem kstat, if one was created, and free its + * data buffer. Always leaves kpd_lockedmem_kstat set to NULL. + */ +static void +project_kstat_delete(kproject_t *pj) +{ + void *data; + + if (pj->kpj_data.kpd_lockedmem_kstat != NULL) { + data = pj->kpj_data.kpd_lockedmem_kstat->ks_data; + kstat_delete(pj->kpj_data.kpd_lockedmem_kstat); + /* ks_data was allocated as a kproject_kstat_t; free that size */ + kmem_free(data, sizeof (kproject_kstat_t)); + } + pj->kpj_data.kpd_lockedmem_kstat = NULL; +} diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 4de4c74fe8..c0479005ea 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -29,6 +29,7 @@ #include <sys/cmn_err.h> #include <sys/id_space.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/log.h> #include <sys/modctl.h> #include <sys/modhash.h> @@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); e.rcep_p.proj = projp; e.rcep_t = RCENTITY_PROJECT; @@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, p->p_locked_mem += inc; } out: - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); return (ret); @@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, 
kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); zonep->zone_locked_mem -= inc; projp->kpj_data.kpd_locked_mem -= inc; if (creditproc != 0) { @@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(MUTEX_HELD(&p->p_lock)); p->p_locked_mem -= inc; } - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); } + +/* + * rctl_incr_swap(proc_t *, zone_t *, size_t) + * + * Overview + * Increments the swap charge on the specified zone. + * + * Return values + * 0 on success. EAGAIN if swap increment fails due an rctl value + * on the zone. + * + * Callers context + * p_lock held on specified proc. + * swap must be even multiple of PAGESIZE + */ +int +rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap) +{ + rctl_entity_p_t e; + + ASSERT(MUTEX_HELD(&proc->p_lock)); + ASSERT((swap & PAGEOFFSET) == 0); + e.rcep_p.zone = zone; + e.rcep_t = RCENTITY_ZONE; + + mutex_enter(&zone->zone_mem_lock); + + if ((zone->zone_max_swap + swap) > + zone->zone_max_swap_ctl) { + + if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls, + proc, &e, swap, 0) & RCT_DENY) { + mutex_exit(&zone->zone_mem_lock); + return (EAGAIN); + } + } + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); + return (0); +} + +/* + * rctl_decr_swap(zone_t *, size_t) + * + * Overview + * Decrements the swap charge on the specified zone. 
+ * + * Return values + * None + * + * Callers context + * swap must be even multiple of PAGESIZE + */ +void +rctl_decr_swap(zone_t *zone, size_t swap) +{ + ASSERT((swap & PAGEOFFSET) == 0); + mutex_enter(&zone->zone_mem_lock); + ASSERT(zone->zone_max_swap >= swap); + zone->zone_max_swap -= swap; + mutex_exit(&zone->zone_mem_lock); +} + +/* + * Create resource kstat + */ +static kstat_t * +rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class, + uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid) +{ + kstat_t *ksp = NULL; + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance); + + if ((ksp = kstat_create_zone("caps", ks_zoneid, + name, ks_class, ks_type, + ks_ndata, ks_flags, ks_zoneid)) != NULL) { + if (ks_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + } + return (ksp); +} + +/* + * Create zone-specific resource kstat + */ +kstat_t * +rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name); + + return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps", + ks_type, ks_ndata, ks_flags, zone->zone_id)); +} + +/* + * Create project-specific resource kstat + */ +kstat_t * +rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name); + + return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps", + ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid)); +} diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 66aae7d2bc..62279e0777 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, 
Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) * Set up anonymous memory struct. No swap reservation is * needed since the page will be locked into memory. */ - amp = anonmap_alloc(PAGESIZE, PAGESIZE); + amp = anonmap_alloc(PAGESIZE, 0); /* * Allocate the page. */ - kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO, - amp); + kaddr = segkp_get_withanonmap(segkp, PAGESIZE, + KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); if (kaddr == NULL) { amp->refcnt--; anonmap_free(amp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 9ada0aac18..a7ef99fddb 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] = /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), @@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] = /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ 
SYSENT_LOADABLE32(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 562e3596b5..785f74c145 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone->zone_id, + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; @@ -848,7 +847,7 @@ task_init(void) task0p->tk_tkid = id_alloc(taskid_space); task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); - task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID, + task0p->tk_proj = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); task0p->tk_flags = TASK_NORMAL; task0p->tk_nlwps = p->p_lwpcnt; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 0fb2c2be55..19ea8b31f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -154,6 +154,10 @@ * zone_lock: This is a per-zone lock used to protect several fields of * the zone_t (see <sys/zone.h> for details). 
In addition, holding * this lock means that the zone cannot go away. + * zone_nlwps_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-lwps rctl. + * zone_mem_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-locked-memory and zone.max-swap rctls. * zsd_key_lock: This is a global lock protecting the key state for ZSD. * zone_deathrow_lock: This is a global lock protecting the "deathrow" * list (a list of zones in the ZONE_IS_DEAD state). @@ -162,6 +166,10 @@ * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> * zone_lock --> zsd_key_lock --> pidlock --> p_lock * + * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock + * * Blocking memory allocations are permitted while holding any of the * zone locks. * @@ -190,6 +198,7 @@ #include <sys/debug.h> #include <sys/file.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/mutex.h> #include <sys/note.h> #include <sys/pathname.h> @@ -232,6 +241,8 @@ #include <sys/zone.h> #include <sys/tsol/label.h> +#include <vm/seg.h> + /* * cv used to signal that all references to the zone have been released.
This * needs to be global since there may be multiple waiters, and the first to @@ -317,6 +328,7 @@ const char *zone_status_table[] = { */ rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; +rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_zone->zone_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) { rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); - q = p->p_zone->zone_locked_mem; + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_locked_mem; if (q + incr > rcntl->rcv_value) return (1); return (0); @@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = { zone_locked_mem_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_max_swap_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&z->zone_mem_lock); + q = z->zone_max_swap; + mutex_exit(&z->zone_mem_lock); + return (q); +} + +/*ARGSUSED*/ +static int +zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, + rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) +{ + rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_max_swap; + if (q + incr > rcntl->rcv_value) + return (1); + return (0); +} + +/*ARGSUSED*/ +static int +zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if 
(e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_max_swap_ctl = nv; + return (0); +} + +static rctl_ops_t zone_max_swap_ops = { + rcop_no_action, + zone_max_swap_usage, + zone_max_swap_set, + zone_max_swap_test +}; + /* * Helper function to brand the zone with a unique ID. */ @@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid) return (cr); } +static int +zone_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_locked_mem; + zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; + return (0); +} + +static int +zone_swapresv_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_max_swap; + zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; + return (0); +} + +static void +zone_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_kstat_t *zk; + + ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_lockedmem_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_lockedmem_kstat = ksp; + + ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size 
+= strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_swapresv_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_swapresv_kstat = ksp; +} + +static void +zone_kstat_delete(zone_t *zone) +{ + void *data; + + if (zone->zone_lockedmem_kstat != NULL) { + data = zone->zone_lockedmem_kstat->ks_data; + kstat_delete(zone->zone_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + if (zone->zone_swapresv_kstat != NULL) { + data = zone->zone_swapresv_kstat->ks_data; + kstat_delete(zone->zone_swapresv_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } +} + /* * Called very early on in boot to initialize the ZSD list so that * zone_key_create() can be called before zone_init(). It also initializes @@ -1101,8 +1257,14 @@ zone_zsd_init(void) mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); zone0.zone_shares = 1; + zone0.zone_nlwps = 0; zone0.zone_nlwps_ctl = INT_MAX; + zone0.zone_locked_mem = 0; + zone0.zone_locked_mem_ctl = UINT64_MAX; + ASSERT(zone0.zone_max_swap == 0); + zone0.zone_max_swap_ctl = UINT64_MAX; zone0.zone_shmmax = 0; zone0.zone_ipc.ipcq_shmmni = 0; zone0.zone_ipc.ipcq_semmni = 0; @@ -1120,6 +1282,8 @@ zone_zsd_init(void) zone0.zone_ncpus_online = 0; zone0.zone_proc_initpid = 1; zone0.zone_initname = initname; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -1259,6 +1423,12 @@ zone_init(void) RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, 
UINT64_MAX, UINT64_MAX, &zone_locked_mem_ops); + + rc_zone_max_swap = rctl_register("zone.max-swap", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_max_swap_ops); + /* * Initialize the ``global zone''. */ @@ -1277,9 +1447,14 @@ zone_init(void) zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* - * pool_default hasn't been initialized yet, so we let pool_init() take - * care of making the global zone is in the default pool. + * pool_default hasn't been initialized yet, so we let pool_init() + * take care of making sure the global zone is in the default pool. + */ + + /* + * Initialize global zone kstats */ + zone_kstat_create(&zone0); /* * Initialize zone label. @@ -1337,6 +1512,7 @@ zone_init(void) if (res) panic("Sysevent_evc_bind failed during zone setup.\n"); + } static void @@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +static int +zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +{ + uint64_t mcap; + int err = 0; + + if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) + zone->zone_phys_mcap = mcap; + + return (err); +} + +static int +zone_set_sched_class(zone_t *zone, const char *new_class) +{ + char sched_class[PC_CLNMSZ]; + id_t classid; + int err; + + ASSERT(zone != global_zone); + if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) + return (err); /* EFAULT or ENAMETOOLONG */ + + if (getcid(sched_class, &classid) != 0 || classid == syscid) + return (set_errno(EINVAL)); + zone->zone_defaultcid = classid; + ASSERT(zone->zone_defaultcid > 0 && + zone->zone_defaultcid < loaded_classes); + + return (0); +} + /* * Block indefinitely waiting for (zone_status >= status) */ @@ -2510,10 +2718,10 @@ zsched(void *arg) /* * Decrement locked memory counts on old zone and project. 
*/ - mutex_enter(&global_zone->zone_rctl_lock); + mutex_enter(&global_zone->zone_mem_lock); global_zone->zone_locked_mem -= pp->p_locked_mem; pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&global_zone->zone_rctl_lock); + mutex_exit(&global_zone->zone_mem_lock); /* * Create and join a new task in project '0' of this zone. @@ -2529,10 +2737,10 @@ zsched(void *arg) pj = pp->p_task->tk_proj; - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); /* * add lwp counts to zsched's zone, and increment project's task count @@ -2689,7 +2897,10 @@ zsched(void *arg) * classid 'cid'. */ pool_lock(); - cid = pool_get_class(zone->zone_pool); + if (zone->zone_defaultcid > 0) + cid = zone->zone_defaultcid; + else + cid = pool_get_class(zone->zone_pool); if (cid == -1) cid = defaultcid; @@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); @@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); (void) strcpy(zone->zone_initname, zone_default_initname); + zone->zone_nlwps = 0; + zone->zone_nlwps_ctl = INT_MAX; zone->zone_locked_mem = 0; zone->zone_locked_mem_ctl = UINT64_MAX; + zone->zone_max_swap = 0; + zone->zone_max_swap_ctl = UINT64_MAX; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; /* * Zsched initializes the 
rctls. @@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root, */ /* + * Create zone kstats + */ + zone_kstat_create(zone); + + /* * Let the other lwps continue. */ mutex_enter(&pp->p_lock); @@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid) } + /* Get rid of the zone's kstats */ + zone_kstat_delete(zone); + /* * It is now safe to let the zone be recreated; remove it from the * lists. The memory will not be freed until the last cred @@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; + case ZONE_ATTR_PHYS_MCAP: + size = sizeof (zone->zone_phys_mcap); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_CLASS: + mutex_enter(&class_lock); + + if (zone->zone_defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[zone->zone_defaultcid].cl_name; + size = strlen(outstr) + 1; + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(outstr, buf, bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + + mutex_exit(&class_lock); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * At present, attributes can only be set on non-running, - * non-global zones. + * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the + * global zone. */ - if (zoneid == GLOBAL_ZONEID) { + if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { return (set_errno(EINVAL)); } @@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone_hold(zone); mutex_exit(&zonehash_lock); + /* + * At present most attributes can only be set on non-running, + * non-global zones. 
+ */ zone_status = zone_status_get(zone); - if (zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) goto done; switch (attr) { @@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) if (zone->zone_brand == NULL) err = EINVAL; break; + case ZONE_ATTR_PHYS_MCAP: + err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_SCHED_CLASS: + err = zone_set_sched_class(zone, (const char *)buf); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -3986,6 +4247,11 @@ done: /* * Return zero if the process has at least one vnode mapped in to its * address space which shouldn't be allowed to change zones. + * + * Also return zero if the process has any shared mappings which reserve + * swap. This is because the counting for zone.max-swap does not allow swap + * reservation to be shared between zones. Zone swap reservation is counted + * on zone->zone_max_swap. */ static int as_can_change_zones(void) @@ -3997,8 +4263,17 @@ as_can_change_zones(void) int allow = 1; ASSERT(pp->p_as != &kas); - AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + /* + * Cannot enter zone with shared anon memory which + * reserves swap. See comment above. + */ + if (seg_can_change_zones(seg) == B_FALSE) { + allow = 0; + break; + } /* * if we can't get a backing vnode for this segment then skip * it.
@@ -4011,11 +4286,30 @@ as_can_change_zones(void) break; } } - AS_LOCK_EXIT(&as, &as->a_lock); + AS_LOCK_EXIT(as, &as->a_lock); return (allow); } /* + * Count swap reserved by curproc's address space + */ +static size_t +as_swresv(void) +{ + proc_t *pp = curproc; + struct seg *seg; + struct as *as = pp->p_as; + size_t swap = 0; + + ASSERT(pp->p_as != &kas); + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) + swap += seg_swresv(seg); + + return (swap); +} + +/* * Systemcall entry point for zone_enter(). * * The current process is injected into said zone. In the process @@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid) zone_status_t status; int err = 0; rctl_entity_p_t e; + size_t swap; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid) goto out; } + /* + * a_lock must be held while transferring locked memory and swap + * reservation from the global zone to the non-global zone because + * asynchronous faults on the process's address space can lock + * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE + * segments respectively.
+ */ + AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); + swap = as_swresv(); mutex_enter(&pp->p_lock); zone_proj0 = zone->zone_zsched->p_task->tk_proj; /* verify that we do not exceed and task or lwp limits */ @@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid) zone_proj0->kpj_ntasks += 1; mutex_exit(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); /* remove lwps from proc's old zone and old project */ mutex_enter(&pp->p_zone->zone_nlwps_lock); @@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid) pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; mutex_exit(&pp->p_zone->zone_nlwps_lock); - mutex_enter(&pp->p_zone->zone_rctl_lock); + mutex_enter(&pp->p_zone->zone_mem_lock); pp->p_zone->zone_locked_mem -= pp->p_locked_mem; pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&pp->p_zone->zone_rctl_lock); + pp->p_zone->zone_max_swap -= swap; + mutex_exit(&pp->p_zone->zone_mem_lock); mutex_exit(&pp->p_lock); + AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); /* * Joining the zone cannot fail from now on. @@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid) sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); + + /* + * If there is a default scheduling class for the zone and it is not + * the class we are currently in, change all of the threads in the + * process to the new class. We need to be holding pidlock & p_lock + * when we call parmsset so this is a good place to do it. + */ + if (zone->zone_defaultcid > 0 && + zone->zone_defaultcid != curthread->t_cid) { + pcparms_t pcparms; + kthread_id_t t; + + pcparms.pc_cid = zone->zone_defaultcid; + pcparms.pc_clparms[0] = 0; + + /* + * If setting the class fails, we still want to enter the zone. 
+ */ + if ((t = pp->p_tlist) != NULL) { + do { + (void) parmsset(&pcparms, t); + } while ((t = t->t_forw) != pp->p_tlist); + } + } + mutex_exit(&pp->p_lock); mutex_exit(&pidlock); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ab103ef4c7..4493f99454 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -544,6 +544,7 @@ CHKHDRS= \ visual_io.h \ vlan.h \ vm.h \ + vm_usage.h \ vmem.h \ vmem_impl.h \ vmmeter.h \ diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h index 25e45cec23..a187eb68ee 100644 --- a/usr/src/uts/common/sys/modhash_impl.h +++ b/usr/src/uts/common/sys/modhash_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,6 +92,18 @@ struct mod_hash { */ void mod_hash_init(void); +/* + * Internal routines. Use directly with care. 
+ */ +uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); +int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t, + mod_hash_hndl_t); +int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t, + mod_hash_val_t *, void *), void *); +void i_mod_hash_clear_nosync(mod_hash_t *hash); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h index ca1a92400a..6475ed0a4c 100644 --- a/usr/src/uts/common/sys/priocntl.h +++ b/usr/src/uts/common/sys/priocntl.h @@ -65,6 +65,7 @@ extern long priocntl(), priocntlset(); #define PC_SETXPARMS 7 /* Set extended scheduling parameters */ #define PC_GETXPARMS 8 /* Get extended scheduling parameters */ #define PC_SETDFLCL 9 /* Set default class, not for general use */ +#define PC_GETDFLCL 10 /* Get default class, not for general use */ #define PC_CLNULL -1 diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fcf953262c..9a0ba2cc37 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t); extern proc_t *pgfind_zone(pid_t, zoneid_t); extern proc_t *sprlock(pid_t); extern proc_t *sprlock_zone(pid_t, zoneid_t); +extern int sprtrylock_proc(proc_t *); +extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); extern void pid_init(void); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 679c1eddc2..5018df8499 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -28,15 +28,24 @@ #pragma ident "%Z%%M% %I% %E% SMI" + #ifdef __cplusplus extern "C" { #endif + +#include <sys/kstat.h> #include <sys/types.h> #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +typedef 
struct kproject_kstat { + kstat_named_t kpk_zonename; + kstat_named_t kpk_usage; + kstat_named_t kpk_value; +} kproject_kstat_t; + typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */ ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */ @@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */ rctl_qty_t kpd_contract; /* contract_lock */ rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */ + kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */ } kproject_data_t; @@ -76,9 +86,11 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 +struct zone; + void project_init(void); kproject_t *project_hold(kproject_t *); -kproject_t *project_hold_by_id(projid_t, zoneid_t, int); +kproject_t *project_hold_by_id(projid_t, struct zone *, int); void project_rele(kproject_t *); int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *); projid_t curprojid(void); diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h index eb56fff9e5..a8480c2768 100644 --- a/usr/src/uts/common/sys/rctl.h +++ b/usr/src/uts/common/sys/rctl.h @@ -168,6 +168,7 @@ struct proc; struct task; struct kproject; struct zone; +struct kstat; typedef struct rctl_entity_p_struct { rctl_entity_t rcep_t; @@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); +int rctl_incr_swap(struct proc *, struct zone *, size_t); +void rctl_decr_swap(struct zone *, size_t); + +struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t, + uchar_t); + +struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t, + uint_t, uchar_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 86cc716d56..bf02808d4b 100644 --- 
a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -191,6 +190,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE 0 /* rusage process */ #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ +#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 96cb967023..eedadfa0c0 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -384,7 +384,8 @@ extern "C" { #define SYS_rusagesys 181 /* * subcodes: - * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...) + * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...) + * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...) */ #define SYS_port 182 /* diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h new file mode 100644 index 0000000000..5f8c8b8fe5 --- /dev/null +++ b/usr/src/uts/common/sys/vm_usage.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VM_USAGE_H +#define _SYS_VM_USAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The flags passed to getvmusage() request how to aggregate rss/swap results. + * Results can be aggregated by zone, project, task, ruser, and/or euser. + * + * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the + * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be + * treated as VMUSAGE_ZONE. + * + * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type + * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage + * of the calling zone. + * + * VMUSAGE_* requests results for the calling zone. + * VMUSAGE_ALL_* requests results for all zones. + * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid. + * For example, VMUSAGE_COL_PROJECTS requests results for all + * projects in all zones, and project N in ANY zone is treated + * as the same project. 
+ */ +#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */ +#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */ +#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */ + /* caller's zone */ +#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */ + /* caller's zones */ +#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */ + /* ruser) in the caller's zone */ +#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */ +#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */ + /* all zones */ +#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */ + /* zones */ +#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */ + /* ruser) in all zones */ +#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */ + /* all zones. Collapse zoneid. */ +#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */ + /* ruser), in all zones. Collapse */ + /* zoneid */ +#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ + +typedef struct vmusage { + id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ + /* VMUSAGE_COL_* results */ + /* ALL_ZONES means that the result */ + /* reflects swap and rss usage for */ + /* a projid/uid across all zones */ + uint_t vmu_type; /* Entity type of result. One of: */ + /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */ + /* TASKS|RUSERS|EUSERS) */ + id_t vmu_id; /* zoneid, projid, taskid, ... 
*/ + size_t vmu_rss_all; /* total resident memory of entity */ + /* in bytes */ + size_t vmu_rss_private; /* total resident private memory */ + size_t vmu_rss_shared; /* total resident shared memory */ + size_t vmu_swap_all; /* total swap reserved, in bytes */ + size_t vmu_swap_private; /* swap reserved for private mappings */ + size_t vmu_swap_shared; /* swap reserved for shared mappings */ + +} vmusage_t; + +extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); + +#ifdef _KERNEL + +int vm_getusage(uint_t, time_t, vmusage_t *, size_t *); +void vm_usage_init(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VM_USAGE_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index daccd16bdf..94646bc976 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -88,6 +88,8 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 +#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_SCHED_CLASS 13 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -280,6 +282,15 @@ typedef struct zone_dataset { list_node_t zd_linkage; } zone_dataset_t; +/* + * structure for zone kstats + */ +typedef struct zone_kstat { + kstat_named_t zk_zonename; + kstat_named_t zk_usage; + kstat_named_t zk_value; +} zone_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -326,14 +337,20 @@ typedef struct zone { uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */ uint32_t zone_shares; /* FSS shares allocated to zone */ rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */ - kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */ + kmutex_t zone_mem_lock; /* protects zone_locked_mem and */ /* kpd_locked_mem for all */ - /* projects in zone */ + /* projects in zone. 
*/ + /* Also protects zone_max_swap */ /* grab after p_lock, before rcs_lock */ - rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */ - rctl_qty_t zone_locked_mem_ctl; /* current locked memory */ + rctl_qty_t zone_locked_mem; /* bytes of locked memory in */ + /* zone */ + rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */ /* limit. Protected by */ /* zone_rctls->rcs_lock */ + rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */ + rctl_qty_t zone_max_swap_ctl; /* current swap limit. */ + /* Protected by */ + /* zone_rctls->rcs_lock */ list_t zone_zsd; /* list of Zone-Specific Data values */ kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ @@ -341,6 +358,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ + uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -376,6 +394,9 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? */ struct brand *zone_brand; /* zone's brand */ + id_t zone_defaultcid; /* dflt scheduling class id */ + kstat_t *zone_swapresv_kstat; + kstat_t *zone_lockedmem_kstat; } zone_t; /* @@ -553,6 +574,7 @@ extern void mount_completed(void); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; +extern rctl_hndl_t rc_zone_max_swap; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c index 10ca1178d5..bd416e43e6 100644 --- a/usr/src/uts/common/syscall/processor_bind.c +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind, break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { ret = ESRCH; } else { diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c index 5d3b7e6233..767529fc5d 100644 --- a/usr/src/uts/common/syscall/pset.c +++ b/usr/src/uts/common/syscall/pset.c @@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { error = ESRCH; break; diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e09643981..036500932f 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +34,7 @@ #include <sys/time.h> #include <sys/errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> static int getrusage(void *user_rusage) @@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage) } int -rusagesys(int code, void * arg) +rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) { switch (code) { case _RUSAGESYS_GETRUSAGE: - return (getrusage(arg)); + return (getrusage(arg1)); case _RUSAGESYS_GETRUSAGE_CHLD: - return (getrusage_chld(arg)); + return (getrusage_chld(arg1)); case _RUSAGESYS_GETRUSAGE_LWP: - return (getrusage_lwp(arg)); + return (getrusage_lwp(arg1)); + case _RUSAGESYS_GETVMUSAGE: + return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, + (vmusage_t *)arg3, (size_t *)arg4)); default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c index 705b543a37..bec091e61c 100644 --- a/usr/src/uts/common/syscall/tasksys.c +++ b/usr/src/uts/common/syscall/tasksys.c @@ -25,6 +25,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" + /* * System calls for creating and inquiring about tasks and projects */ @@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) * Put a hold on our new project and make sure that nobody is * trying to bind it to a pool while we're joining. 
*/ - kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT); e.rcep_p.proj = kpj; e.rcep_t = RCENTITY_PROJECT; @@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) zone = p->p_zone; mutex_enter(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, @@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) rctlfail = 1; if (rctlfail) { - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); if (curthread != p->p_agenttp) continuelwps(p); @@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem; oldpj->kpj_nlwps -= p->p_lwpcnt; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 90f6e1e661..ed59ec590b 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -42,6 +42,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/cred.h> +#include <sys/zone.h> #include <vm/seg.h> #include <vm/vpage.h> @@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t, struct seg *, caddr_t, uint_t, struct vpage [], struct cred *); extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t); -extern int anon_resvmem(size_t, uint_t); -extern void anon_unresv(size_t); +extern int anon_resvmem(size_t, boolean_t, zone_t *); +extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t); extern void anonmap_free(struct anon_map *); extern void anon_decref(struct anon *); @@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *); * request and if so, reserves 
the appropriate anonymous memory resources. * anon_checkspace just checks to see if there is space to fulfill the request, * without taking any resources. Both return 1 if successful and 0 if not. + * + * Macros are provided as anon reservation is usually charged to the zone of + * the current process. In some cases (such as anon reserved by tmpfs), a + * zone pointer is needed to charge the appropriate zone. */ -#define anon_resv(size) anon_resvmem((size), 1) -#define anon_checkspace(size) anon_resvmem((size), 0) +#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone) +#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone) +#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone) +#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone) +#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone) /* * Flags to anon_private diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index 0ee7d62ce1..a9683c0e54 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *); #endif /* VMDEBUG */ +boolean_t seg_can_change_zones(struct seg *); +size_t seg_swresv(struct seg *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c index ff9c47e0ff..d58e873a19 100644 --- a/usr/src/uts/common/vm/seg_kp.c +++ b/usr/src/uts/common/vm/seg_kp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX; uint32_t red_ndoubles; pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ +pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */ static struct seg_ops segkp_ops = { SEGKP_BADOP(int), /* dup */ @@ -448,8 +448,10 @@ segkp_get_internal( * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { - ASSERT((flags & KPD_NO_ANON) == 0); - /* The reserve has been done and the anon_hdr is separate. */ + /* + * The swap reservation has been done, if required, and the + * anon_hdr is separate. 
+ */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; @@ -458,7 +460,7 @@ segkp_get_internal( kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { - if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); @@ -468,6 +470,8 @@ segkp_get_internal( kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } + atomic_add_long(&anon_segkp_pages_resv, + btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; @@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) if ((kpd->kp_flags & KPD_HASAMP) == 0) { anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, PAGESIZE); - anon_unresv(PAGESIZE); + anon_unresv_zone(PAGESIZE, NULL); + atomic_add_long(&anon_segkp_pages_resv, + -1); } TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index f48db44acc..e2069b27c6 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -2323,8 +2323,9 @@ segvn_faultpage( * zeroes. If no advance reservations, reserve now. */ if (svd->flags & MAP_NORESERVE) { - if (anon_resv(ptob(1))) { - svd->swresv += ptob(1); + if (anon_resv_zone(ptob(1), + seg->s_as->a_proc->p_zone)) { + atomic_add_long(&svd->swresv, ptob(1)); } else { err = ENOMEM; goto out; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 0cad34257c..3f225a345a 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -113,6 +113,7 @@ #include <sys/policy.h> #include <sys/condvar_impl.h> #include <sys/mutex_impl.h> +#include <sys/rctl.h> #include <vm/as.h> #include <vm/hat.h> @@ -729,12 +730,22 @@ set_anoninfo(void) * Return non-zero on success. 
*/ int -anon_resvmem(size_t size, uint_t takemem) +anon_resvmem(size_t size, boolean_t takemem, zone_t *zone) { pgcnt_t npages = btopr(size); pgcnt_t mswap_pages = 0; pgcnt_t pswap_pages = 0; + proc_t *p = curproc; + if (zone != NULL && takemem) { + /* test zone.max-swap resource control */ + mutex_enter(&p->p_lock); + if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { + mutex_exit(&p->p_lock); + return (0); + } + mutex_exit(&p->p_lock); + } mutex_enter(&anoninfo_lock); /* @@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem) mutex_exit(&anoninfo_lock); ANON_PRINT(A_RESV, ("anon_resvmem: not enough space from swapfs\n")); + if (zone != NULL && takemem) + rctl_decr_swap(zone, ptob(npages)); return (0); } } - /* * Give back an anon reservation. */ void -anon_unresv(size_t size) +anon_unresvmem(size_t size, zone_t *zone) { pgcnt_t npages = btopr(size); spgcnt_t mem_free_pages = 0; @@ -851,6 +863,8 @@ anon_unresv(size_t size) #ifdef ANON_DEBUG pgcnt_t mem_resv; #endif + if (zone != NULL) + rctl_decr_swap(zone, size); mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 05bfe662be..adac07b766 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -77,7 +77,7 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> #include <vm/vm_dep.h> - +#include <sys/vm_usage.h> #include <fs/fs_subr.h> static int nopageage = 0; @@ -343,6 +343,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); page_retire_init(); + vm_usage_init(); } /* diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c index 50cc21cdf7..aed892969d 100644 --- a/usr/src/uts/common/vm/vm_seg.c +++ b/usr/src/uts/common/vm/vm_seg.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,12 +53,14 @@ #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> +#include <sys/mman.h> #include <vm/hat.h> #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_kmem.h> - +#include <vm/seg_spt.h> +#include <vm/seg_vn.h> /* * kstats for segment advise */ @@ -950,3 +951,48 @@ seg_pinit_mem_config(void) */ ASSERT(ret == 0); } + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +/* + * Verify that segment is not a shared anonymous segment which reserves + * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered + * from one zone to another if any segments are shared. This is because the + * last process to exit will credit the swap reservation. This could lead + * to the swap being reserved by one zone, and credited to another. + */ +boolean_t +seg_can_change_zones(struct seg *seg) +{ + struct segvn_data *svd; + + if (seg->s_ops == &segspt_shmops) + return (B_FALSE); + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED && + svd->amp != NULL && + svd->amp->swresv > 0) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Return swap reserved by a segment backing a private mapping. 
+ */ +size_t +seg_swresv(struct seg *seg) +{ + struct segvn_data *svd; + size_t swap = 0; + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_PRIVATE && svd->swresv > 0) + swap = svd->swresv; + } + return (swap); +} diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c new file mode 100644 index 0000000000..32a8811e10 --- /dev/null +++ b/usr/src/uts/common/vm/vm_usage.c @@ -0,0 +1,1978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * vm_usage + * + * This file implements the getvmusage() private system call. + * getvmusage() counts the amount of resident memory pages and swap + * reserved by the specified process collective. A "process collective" is + * the set of processes owned by a particular, zone, project, task, or user. + * + * rss and swap are counted so that for a given process collective, a page is + * only counted once. 
For example, this means that if multiple processes in + * the same project map the same page, then the project will only be charged + * once for that page. On the other hand, if two processes in different + * projects map the same page, then both projects will be charged + * for the page. + * + * The vm_getusage() calculation is implemented so that the first thread + * performs the rss/swap counting. Other callers will wait for that thread to + * finish, copying the results. This enables multiple rcapds and prstats to + * consume data from the same calculation. The results are also cached so that + * a caller interested in recent results can just copy them instead of starting + * a new calculation. The caller passes the maximium age (in seconds) of the + * data. If the cached data is young enough, the cache is copied, otherwise, + * a new calculation is executed and the cache is replaced with the new + * data. + * + * The rss calculation for each process collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Lookup anons in the amp. + * - For incore pages not previously visited each of the + * proc's collectives, add incore pagesize to each. + * collective. + * Anon's with a refcnt of 1 can be assummed to be not + * previously visited. + * - For address ranges without anons in the amp: + * - Lookup pages in underlying vnode. + * - For incore pages not previously visiting for + * each of the proc's collectives, add incore + * pagesize to each collective. + * - If seg is shared: + * - Lookup pages in the shared amp or vnode. + * - For incore pages not previously visited for each of + * the proc's collectives, add incore pagesize to each + * collective. + * + * Swap is reserved by private segments, and shared anonymous segments. 
+ * The only shared anon segments which do not reserve swap are ISM segments + * and schedctl segments, both of which can be identified by having + * amp->swresv == 0. + * + * The swap calculation for each collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Add svd->swresv pages to swap count for each of the + * proc's collectives. + * - If seg is anon, shared, and amp->swresv != 0 + * - For address ranges in amp not previously visited for + * each of the proc's collectives, add size of address + * range to the swap count for each collective. + * + * These two calculations are done simultaneously, with most of the work + * being done in vmu_calculate_seg(). The results of the calculation are + * copied into "vmu_data.vmu_cache_results". + * + * To perform the calculation, various things are tracked and cached: + * + * - incore/not-incore page ranges for all vnodes. + * (vmu_data.vmu_all_vnodes_hash) + * This eliminates looking up the same page more than once. + * + * - incore/not-incore page ranges for all shared amps. + * (vmu_data.vmu_all_amps_hash) + * This eliminates looking up the same page more than once. + * + * - visited page ranges for each collective. + * - per vnode (entity->vme_vnode_hash) + * - per shared amp (entity->vme_amp_hash) + * For accurate counting of map-shared and cow-shared pages. + * + * - visited private anons (refcnt > 1) for each collective. + * (entity->vme_anon_hash) + * For accurate counting of cow-shared pages. + * + * The common accounting structure is the vmu_entity_t, which represents + * collectives: + * + * - A zone. + * - A project, task, or user within a zone. + * - The entire system (vmu_data.vmu_system). + * - Each collapsed (col) project and user. 
This means a given projid or + * uid, regardless of which zone the process is in. For instance, + * project 0 in the global zone and project 0 in a non global zone are + * the same collapsed project. + * + * Each entity structure tracks which pages have been already visited for + * that entity (via previously inspected processes) so that these pages are + * not double counted. + */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/proc.h> +#include <sys/project.h> +#include <sys/task.h> +#include <sys/thread.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/modhash.h> +#include <sys/modhash_impl.h> +#include <sys/shm.h> +#include <sys/swap.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vm_usage.h> +#include <sys/zone.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> + +#define VMUSAGE_HASH_SIZE 512 + +#define VMUSAGE_TYPE_VNODE 1 +#define VMUSAGE_TYPE_AMP 2 +#define VMUSAGE_TYPE_ANON 3 + +#define VMUSAGE_BOUND_UNKNOWN 0 +#define VMUSAGE_BOUND_INCORE 1 +#define VMUSAGE_BOUND_NOT_INCORE 2 + +/* + * bounds for vnodes and shared amps + * Each bound is either entirely incore, entirely not in core, or + * entirely unknown. bounds are stored in order by offset. + */ +typedef struct vmu_bound { + struct vmu_bound *vmb_next; + pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ + pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ + char vmb_type; /* One of VMUSAGE_BOUND_* */ +} vmu_bound_t; + +/* + * hash of visited objects (vnodes or shared amps) + * key is address of vnode or amp. Bounds lists known incore/non-incore + * bounds for vnode/amp. + */ +typedef struct vmu_object { + struct vmu_object *vmo_next; /* free list */ + caddr_t vmo_key; + short vmo_type; + vmu_bound_t *vmo_bounds; +} vmu_object_t; + +/* + * Entity by which to count results. 
+ * + * The entity structure keeps the current rss/swap counts for each entity + * (zone, project, etc), and hashes of vm structures that have already + * been visited for the entity. + * + * vme_next: links the list of all entities currently being counted by + * vmu_calculate(). + * + * vme_next_calc: links the list of entities related to the current process + * being counted by vmu_calculate_proc(). + * + * vmu_calculate_proc() walks all processes. For each process, it makes a + * list of the entities related to that process using vme_next_calc. This + * list changes each time vmu_calculate_proc() is called. + * + */ +typedef struct vmu_entity { + struct vmu_entity *vme_next; + struct vmu_entity *vme_next_calc; + mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ + mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ + mod_hash_t *vme_anon_hash; /* cow anons visited for entity */ + vmusage_t vme_result; /* identifies entity and results */ +} vmu_entity_t; + +/* + * Hash of entities visited within a zone, and an entity for the zone + * itself. + */ +typedef struct vmu_zone { + struct vmu_zone *vmz_next; /* free list */ + id_t vmz_id; + vmu_entity_t *vmz_zone; + mod_hash_t *vmz_projects_hash; + mod_hash_t *vmz_tasks_hash; + mod_hash_t *vmz_rusers_hash; + mod_hash_t *vmz_eusers_hash; +} vmu_zone_t; + +/* + * Cache of results from last calculation + */ +typedef struct vmu_cache { + vmusage_t *vmc_results; /* Results from last call to */ + /* vm_getusage(). 
*/ + uint64_t vmc_nresults; /* Count of cached results */ + uint64_t vmc_refcnt; /* refcnt for free */ + uint_t vmc_flags; /* Flags for vm_getusage() */ + hrtime_t vmc_timestamp; /* when cache was created */ +} vmu_cache_t; + +/* + * top level rss info for the system + */ +typedef struct vmu_data { + kmutex_t vmu_lock; /* Protects vmu_data */ + kcondvar_t vmu_cv; /* Used to signal threads */ + /* Waiting for */ + /* Rss_calc_thread to finish */ + vmu_entity_t *vmu_system; /* Entity for tracking */ + /* rss/swap for all processes */ + /* in all zones */ + mod_hash_t *vmu_zones_hash; /* Zones visited */ + mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ + mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ + mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ + /* to implement VMUSAGE_COL_* */ + /* flags, which aggregate by */ + /* project or user regardless */ + /* of zoneid. */ + mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ + /* to track incore/not-incore */ + mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ + /* amps to track incore/not- */ + /* incore */ + vmu_entity_t *vmu_entities; /* Linked list of entities */ + size_t vmu_nentities; /* Count of entities in list */ + vmu_cache_t *vmu_cache; /* Cached results */ + kthread_t *vmu_calc_thread; /* NULL, or thread running */ + /* vmu_calculate() */ + uint_t vmu_calc_flags; /* Flags being using by */ + /* currently running calc */ + /* thread */ + uint_t vmu_pending_flags; /* Flags of vm_getusage() */ + /* threads waiting for */ + /* calc thread to finish */ + uint_t vmu_pending_waiters; /* Number of threads waiting */ + /* for calc thread */ + vmu_bound_t *vmu_free_bounds; + vmu_object_t *vmu_free_objects; + vmu_entity_t *vmu_free_entities; + vmu_zone_t *vmu_free_zones; +} vmu_data_t; + +extern struct as kas; +extern proc_t *practive; +extern zone_t *global_zone; +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + 
+static vmu_data_t vmu_data; +static kmem_cache_t *vmu_bound_cache; +static kmem_cache_t *vmu_object_cache; + +/* + * Save a bound on the free list + */ +static void +vmu_free_bound(vmu_bound_t *bound) +{ + bound->vmb_next = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = bound; +} + +/* + * Free an object, and all visited bound info. + */ +static void +vmu_free_object(mod_hash_val_t val) +{ + vmu_object_t *obj = (vmu_object_t *)val; + vmu_bound_t *bound = obj->vmo_bounds; + vmu_bound_t *tmp; + + while (bound != NULL) { + tmp = bound; + bound = bound->vmb_next; + vmu_free_bound(tmp); + } + obj->vmo_next = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = obj; +} + +/* + * Free an entity, and hashes of visited objects for that entity. + */ +static void +vmu_free_entity(mod_hash_val_t val) +{ + vmu_entity_t *entity = (vmu_entity_t *)val; + + if (entity->vme_vnode_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_vnode_hash); + if (entity->vme_amp_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_amp_hash); + if (entity->vme_anon_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_anon_hash); + + entity->vme_next = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = entity; +} + +/* + * Free zone entity, and all hashes of entities inside that zone, + * which are projects, tasks, and users. 
+ */ +static void +vmu_free_zone(mod_hash_val_t val) +{ + vmu_zone_t *zone = (vmu_zone_t *)val; + + if (zone->vmz_zone != NULL) { + vmu_free_entity((mod_hash_val_t)zone->vmz_zone); + zone->vmz_zone = NULL; + } + if (zone->vmz_projects_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_projects_hash); + if (zone->vmz_tasks_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_tasks_hash); + if (zone->vmz_rusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_rusers_hash); + if (zone->vmz_eusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_eusers_hash); + zone->vmz_next = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = zone; +} + +/* + * Initialize synchronization primitives and hashes for system-wide tracking + * of visited vnodes and shared amps. Initialize results cache. + */ +void +vm_usage_init() +{ + mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); + + vmu_data.vmu_system = NULL; + vmu_data.vmu_zones_hash = NULL; + vmu_data.vmu_projects_col_hash = NULL; + vmu_data.vmu_rusers_col_hash = NULL; + vmu_data.vmu_eusers_col_hash = NULL; + + vmu_data.vmu_free_bounds = NULL; + vmu_data.vmu_free_objects = NULL; + vmu_data.vmu_free_entities = NULL; + vmu_data.vmu_free_zones = NULL; + + vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( + "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( + "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( + "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_zones_hash = mod_hash_create_idhash( + 
"vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); + + vmu_bound_cache = kmem_cache_create("vmu_bound_cache", + sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + vmu_object_cache = kmem_cache_create("vmu_object_cache", + sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + + vmu_data.vmu_cache = NULL; + vmu_data.vmu_calc_thread = NULL; + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_pending_flags = 0; + vmu_data.vmu_pending_waiters = 0; +} + +/* + * Allocate hashes for tracking vm objects visited for an entity. + * Update list of entities. + */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. 
+ */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. 
+ */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. + */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. 
Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. + */ + if (start > next->vmb_end) { + /* bound is before new bound */ + prev = next; + continue; + } + if (next->vmb_start > end) { + /* bound is after new bound */ + break; + } + if (*first == NULL) + *first = next; + *last = next; + } + + if (*first == NULL) { + ASSERT(*last == NULL); + /* + * No bounds overlapping range [start,end], so create new + * bound + */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = end; + tmp->vmb_type = type; + if (prev == NULL) { + tmp->vmb_next = ro->vmo_bounds; + ro->vmo_bounds = tmp; + } else { + tmp->vmb_next = prev->vmb_next; + prev->vmb_next = tmp; + } + *first = tmp; + *last = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret = tmp->vmb_end - tmp->vmb_start + 1; + return (ret); + } + + /* Check to see if start is before first known bound */ + ASSERT(first != NULL && last != NULL); + next = (*first); + if (start < (*first)->vmb_start) { + /* Create new bound before first bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = (*first)->vmb_start - 1; + tmp->vmb_type = type; + tmp->vmb_next = *first; + if (*first == ro->vmo_bounds) + ro->vmo_bounds = tmp; + if (prev != NULL) + prev->vmb_next = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + *first = tmp; + } + /* + * Between start and end, search for gaps between and after existing + * bounds. Create new bounds to fill gaps if they exist. 
+ */ + while (end > next->vmb_end) { + /* + * Check for gap between bound and next bound. if no gap, + * continue. + */ + if ((next != *last) && + ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { + next = next->vmb_next; + continue; + } + /* + * Insert new bound in gap after bound, and before next + * bound if next bound exists. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = type; + tmp->vmb_next = next->vmb_next; + tmp->vmb_start = next->vmb_end + 1; + + if (next != *last) { + tmp->vmb_end = next->vmb_next->vmb_start - 1; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + next = tmp->vmb_next; + } else { + tmp->vmb_end = end; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + *last = tmp; + break; + } + } + return (ret); +} + +/* + * vmu_update_bounds() + * + * first, last: list of continuous bounds, of which zero or more are of + * type VMUSAGE_BOUND_UNKNOWN. + * + * new_first, new_last: list of continuous bounds, of which none are of + * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to + * update the types of bounds in (first,last) with + * type VMUSAGE_BOUND_UNKNOWN. + * + * For the list of bounds (first,last), this function updates any bounds + * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in + * the list (new_first, new_last). + * + * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list + * (new_first, new_last), it will be split into multiple bounds. + * + * Return value: + * The number of pages in the list of bounds (first,last) that were of + * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type + * VMUSAGE_BOUND_INCORE. 
+ * + */ +static pgcnt_t +vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, + vmu_bound_t *new_first, vmu_bound_t *new_last) +{ + vmu_bound_t *next, *new_next, *tmp; + pgcnt_t rss = 0; + + next = *first; + new_next = new_first; + + /* verify bounds span same pages */ + ASSERT((*first)->vmb_start >= new_next->vmb_start); + ASSERT((*last)->vmb_end <= new_last->vmb_end); + for (;;) { + /* If bound already has type, proceed to next bound */ + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + while (new_next->vmb_end < next->vmb_start) + new_next = new_next->vmb_next; + ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + next->vmb_type = new_next->vmb_type; + if (new_next->vmb_end < next->vmb_end) { + /* need to split bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; + tmp->vmb_start = new_next->vmb_end + 1; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = new_next->vmb_end; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + next = tmp; + } else { + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + if (next == *last) + break; + next = next->vmb_next; + } + } + return (rss); +} + +/* + * merges adjacent bounds with same type between first and last bound. + * After merge, last pointer is no longer valid, as last bound may be + * merged away. 
+ */ +static void +vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + + ASSERT(*first != NULL); + ASSERT(*last != NULL); + + next = *first; + while (next != *last) { + + /* If bounds are adjacent and have same type, merge them */ + if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && + (next->vmb_type == next->vmb_next->vmb_type)) { + tmp = next->vmb_next; + next->vmb_end = tmp->vmb_end; + next->vmb_next = tmp->vmb_next; + vmu_free_bound(tmp); + if (tmp == *last) + *last = next; + } else { + next = next->vmb_next; + } + } +} + +/* + * Given an amp and a list of bounds, updates each bound's type with + * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. + * + * If a bound is partially incore, it will be split into two bounds. + * first and last may be modified, as bounds may be split into multiple + * bounds if the are partially incore/not-incore. + * + * Set incore to non-zero if bounds are already known to be incore + * + */ +static void +vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, + vmu_bound_t **last, boolean_t incore) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + vnode_t *vn; + anoff_t off; + struct anon *ap; + + next = *first; + /* Shared anon slots don't change once set */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + if (incore == B_TRUE) + next->vmb_type = VMUSAGE_BOUND_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. 
+ */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + ap = anon_get_ptr(amp->ahp, index); + if (ap != NULL) + swap_xlate(ap, &vn, &off); + + if (ap != NULL && vn != NULL && vn->v_pages != NULL && + (page = page_exists(vn, off)) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * Same as vmu_amp_update_incore_bounds(), except for tracking + * incore-/not-incore for vnodes. 
+ */ +static void +vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, + vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + + next = *first; + for (;;) { + if (vnode->v_pages == NULL) + next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + if (vnode->v_pages != NULL && + (page = page_exists(vnode, ptob(index))) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } +} + +/* + * Calculate the rss and swap consumed by a segment. vmu_entities is the + * list of entities to visit. 
For shared segments, the vnode or amp + * is looked up in each entity to see if has been already counted. Private + * anon pages are checked per entity to ensure that cow pages are not + * double counted. + * + * For private mapped files, first the amp is checked for private pages. + * Bounds not backed by the amp are looked up in the vnode for each entity + * to avoid double counting of private COW vnode pages. + */ +static void +vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) +{ + struct segvn_data *svd; + struct shm_data *shmd; + struct spt_data *sptd; + vmu_object_t *shared_object = NULL; + vmu_object_t *entity_object = NULL; + vmu_entity_t *entity; + vmusage_t *result; + vmu_bound_t *first = NULL; + vmu_bound_t *last = NULL; + vmu_bound_t *cur = NULL; + vmu_bound_t *e_first = NULL; + vmu_bound_t *e_last = NULL; + vmu_bound_t *tmp; + pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; + struct anon_map *private_amp = NULL; + boolean_t incore = B_FALSE; + boolean_t shared = B_FALSE; + int file = 0; + pgcnt_t swresv = 0; + pgcnt_t panon = 0; + + /* Can zero-length segments exist? Not sure, so parenoia */ + if (seg->s_size <= 0) + return; + + /* + * Figure out if there is a shared object (such as a named vnode or + * a shared amp, then figure out if there is a private amp, which + * identifies private pages. 
+ */ + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED) + shared = B_TRUE; + else + swresv = svd->swresv; + + if (svd->vp != NULL) { + file = 1; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, + VMUSAGE_TYPE_VNODE); + s_start = btop(svd->offset); + s_end = btop(svd->offset + seg->s_size) - 1; + } + if (svd->amp != NULL && svd->type == MAP_SHARED) { + ASSERT(shared_object == NULL); + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, + VMUSAGE_TYPE_AMP); + s_start = svd->anon_index; + s_end = svd->anon_index + btop(seg->s_size) - 1; + /* schedctl mappings are always in core */ + if (svd->amp->swresv == 0) + incore = B_TRUE; + } + if (svd->amp != NULL && svd->type == MAP_PRIVATE) { + private_amp = svd->amp; + p_start = svd->anon_index; + p_end = svd->anon_index + btop(seg->s_size) - 1; + } + } else if (seg->s_ops == &segspt_shmops) { + shared = B_TRUE; + shmd = (struct shm_data *)seg->s_data; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, + VMUSAGE_TYPE_AMP); + s_start = 0; + s_end = btop(seg->s_size) - 1; + sptd = shmd->shm_sptseg->s_data; + + /* ism segments are always incore and do not reserve swap */ + if (sptd->spt_flags & SHM_SHARE_MMU) + incore = B_TRUE; + + } else { + return; + } + + /* + * If there is a private amp, count anon pages that exist. If an + * anon has a refcnt > 1 (cow sharing), then save the anon in a + * hash so that it is not double counted. + * + * If there is also a shared object, they figure out the bounds + * which are not mapped by the private amp. 
+ */ + if (private_amp != NULL) { + + /* Enter as writer to prevent cow anons from being freed */ + ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); + + p_index = p_start; + s_index = s_start; + + while (p_index <= p_end) { + + pgcnt_t p_index_next; + pgcnt_t p_bound_size; + int cnt; + anoff_t off; + struct vnode *vn; + struct anon *ap; + page_t *page; /* For handling of large */ + pgcnt_t pgcnt = 1; /* pages */ + pgcnt_t pgstart; + pgcnt_t pgend; + uint_t pgshft; + pgcnt_t pgmsk; + + p_index_next = p_index; + ap = anon_get_next_ptr(private_amp->ahp, + &p_index_next); + + /* + * If next anon is past end of mapping, simulate + * end of anon so loop terminates. + */ + if (p_index_next > p_end) { + p_index_next = p_end + 1; + ap = NULL; + } + /* + * For cow segments, keep track of bounds not + * backed by private amp so they can be looked + * up in the backing vnode + */ + if (p_index_next != p_index) { + + /* + * Compute index difference between anon and + * previous anon. + */ + p_bound_size = p_index_next - p_index - 1; + + if (shared_object != NULL) { + cur = vmu_alloc_bound(); + cur->vmb_next = NULL; + cur->vmb_start = s_index; + cur->vmb_end = s_index + p_bound_size; + cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; + if (first == NULL) { + first = cur; + last = cur; + } else { + last->vmb_next = cur; + last = cur; + } + } + p_index = p_index + p_bound_size + 1; + s_index = s_index + p_bound_size + 1; + } + + /* Detect end of anons in amp */ + if (ap == NULL) + break; + + cnt = ap->an_refcnt; + swap_xlate(ap, &vn, &off); + + if (vn == NULL || vn->v_pages == NULL || + (page = page_exists(vn, off)) == NULL) { + p_index++; + s_index++; + continue; + } + + /* + * If large page is found, compute portion of large + * page in mapping, and increment indicies to the next + * large page. 
+ */ + if (page->p_szc > 0) { + + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; + + /* First page in large page */ + pgstart = p_index & ~pgmsk; + /* Last page in large page */ + pgend = pgstart + pgcnt - 1; + /* + * Artifically end page if page extends past + * end of mapping. + */ + if (pgend > p_end) + pgend = p_end; + + /* + * Compute number of pages from large page + * which are mapped. + */ + pgcnt = pgend - p_index + 1; + + /* + * Point indicies at page after large page, + * or at page after end of mapping. + */ + p_index += pgcnt; + s_index += pgcnt; + } else { + p_index++; + s_index++; + } + + /* + * Assume anon structs with a refcnt + * of 1 are not cow shared, so there + * is no reason to track them per entity. + */ + if (cnt == 1) { + panon += pgcnt; + continue; + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + /* + * Track cow anons per entity so + * they are not double counted. + */ + if (vmu_find_insert_anon(entity->vme_anon_hash, + (caddr_t)ap) == 0) + continue; + + result->vmu_rss_all += (pgcnt << PAGESHIFT); + result->vmu_rss_private += + (pgcnt << PAGESHIFT); + } + } + ANON_LOCK_EXIT(&private_amp->a_rwlock); + } + + /* Add up resident anon and swap reserved for private mappings */ + if (swresv > 0 || panon > 0) { + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + result = &entity->vme_result; + result->vmu_swap_all += swresv; + result->vmu_swap_private += swresv; + result->vmu_rss_all += (panon << PAGESHIFT); + result->vmu_rss_private += (panon << PAGESHIFT); + } + } + + /* Compute resident pages backing shared amp or named vnode */ + if (shared_object != NULL) { + if (first == NULL) { + /* + * No private amp, or private amp has no anon + * structs. This means entire segment is backed by + * the shared object. 
+ */ + first = vmu_alloc_bound(); + first->vmb_next = NULL; + first->vmb_start = s_start; + first->vmb_end = s_end; + first->vmb_type = VMUSAGE_BOUND_UNKNOWN; + } + /* + * Iterate bounds not backed by private amp, and compute + * resident pages. + */ + cur = first; + while (cur != NULL) { + + if (vmu_insert_lookup_object_bounds(shared_object, + cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, + &first, &last) > 0) { + /* new bounds, find incore/not-incore */ + if (shared_object->vmo_type == + VMUSAGE_TYPE_VNODE) + vmu_vnode_update_incore_bounds( + (vnode_t *) + shared_object->vmo_key, &first, + &last); + else + vmu_amp_update_incore_bounds( + (struct anon_map *) + shared_object->vmo_key, &first, + &last, incore); + vmu_merge_bounds(&first, &last); + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + + entity_object = vmu_find_insert_object( + shared_object->vmo_type == + VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: + entity->vme_amp_hash, + shared_object->vmo_key, + shared_object->vmo_type); + + virt = vmu_insert_lookup_object_bounds( + entity_object, cur->vmb_start, cur->vmb_end, + VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); + + if (virt == 0) + continue; + /* + * Range visited for this entity + */ + rss = vmu_update_bounds(&e_first, + &e_last, first, last); + result->vmu_rss_all += (rss << PAGESHIFT); + if (shared == B_TRUE && file == B_FALSE) { + /* shared anon mapping */ + result->vmu_swap_all += + (virt << PAGESHIFT); + result->vmu_swap_shared += + (virt << PAGESHIFT); + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_TRUE && file == B_TRUE) { + /* shared file mapping */ + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_FALSE && + file == B_TRUE) { + /* private file mapping */ + result->vmu_rss_private += + (rss << PAGESHIFT); + } + vmu_merge_bounds(&e_first, &e_last); + } + tmp = cur; + cur = cur->vmb_next; + vmu_free_bound(tmp); + } + } 
+} + +/* + * Based on the current calculation flags, find the relevant entities + * which are relative to the process. Then calculate each segment + * in the process'es address space for each relevant entity. + */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + 
} + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). 
+ */ +static void +vmu_clear_calc() +{ + if (vmu_data.vmu_system != NULL) + vmu_free_entity(vmu_data.vmu_system); + vmu_data.vmu_system = NULL; + if (vmu_data.vmu_zones_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); + if (vmu_data.vmu_projects_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); + if (vmu_data.vmu_rusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); + if (vmu_data.vmu_eusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); + + i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); + i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); +} + +/* + * Free unused data structures. These can result if the system workload + * decreases between calculations. + */ +static void +vmu_free_extra() +{ + vmu_bound_t *tb; + vmu_object_t *to; + vmu_entity_t *te; + vmu_zone_t *tz; + + while (vmu_data.vmu_free_bounds != NULL) { + tb = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; + kmem_cache_free(vmu_bound_cache, tb); + } + while (vmu_data.vmu_free_objects != NULL) { + to = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + kmem_cache_free(vmu_object_cache, to); + } + while (vmu_data.vmu_free_entities != NULL) { + te = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + if (te->vme_vnode_hash != NULL) + mod_hash_destroy_hash(te->vme_vnode_hash); + if (te->vme_amp_hash != NULL) + mod_hash_destroy_hash(te->vme_amp_hash); + if (te->vme_anon_hash != NULL) + mod_hash_destroy_hash(te->vme_anon_hash); + kmem_free(te, sizeof (vmu_entity_t)); + } + while (vmu_data.vmu_free_zones != NULL) { + tz = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + if (tz->vmz_projects_hash != NULL) + mod_hash_destroy_hash(tz->vmz_projects_hash); + if (tz->vmz_tasks_hash != NULL) + 
mod_hash_destroy_hash(tz->vmz_tasks_hash); + if (tz->vmz_rusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_rusers_hash); + if (tz->vmz_eusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_eusers_hash); + kmem_free(tz, sizeof (vmu_zone_t)); + } +} + +extern kcondvar_t *pr_pid_cv; + +/* + * Determine which entity types are relevant and allocate the hashes to + * track them. Then walk the process table and count rss and swap + * for each process'es address space. Address space object such as + * vnodes, amps and anons are tracked per entity, so that they are + * not double counted in the results. + * + */ +static void +vmu_calculate() +{ + int i = 0; + int ret; + proc_t *p; + + vmu_clear_calc(); + + if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) + vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, + ALL_ZONES); + + /* + * Walk process table and calculate rss of each proc. + * + * Pidlock and p_lock cannot be held while doing the rss calculation. + * This is because: + * 1. The calculation allocates using KM_SLEEP. + * 2. The calculation grabs a_lock, which cannot be grabbed + * after p_lock. + * + * Since pidlock must be dropped, we cannot simply just walk the + * practive list. Instead, we walk the process table, and sprlock + * each process to ensure that it does not exit during the + * calculation. + */ + + mutex_enter(&pidlock); + for (i = 0; i < v.v_proc; i++) { +again: + p = pid_entry(i); + if (p == NULL) + continue; + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr) { + mutex_exit(&p->p_lock); + return; + } + + /* Try to set P_PR_LOCK */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + mutex_enter(&pidlock); + continue; + } else if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. + * This also drops p_lock. 
+ */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + goto again; + } + mutex_exit(&p->p_lock); + + vmu_calculate_proc(p); + + mutex_enter(&p->p_lock); + sprunlock(p); + mutex_enter(&pidlock); + } + mutex_exit(&pidlock); + + vmu_free_extra(); +} + +/* + * allocate a new cache for N results satisfying flags + */ +vmu_cache_t * +vmu_cache_alloc(size_t nres, uint_t flags) +{ + vmu_cache_t *cache; + + cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); + cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); + cache->vmc_nresults = nres; + cache->vmc_flags = flags; + cache->vmc_refcnt = 1; + return (cache); +} + +/* + * Make sure cached results are not freed + */ +static void +vmu_cache_hold(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + cache->vmc_refcnt++; +} + +/* + * free cache data + */ +static void +vmu_cache_rele(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + ASSERT(cache->vmc_refcnt > 0); + cache->vmc_refcnt--; + if (cache->vmc_refcnt == 0) { + kmem_free(cache->vmc_results, sizeof (vmusage_t) * + cache->vmc_nresults); + kmem_free(cache, sizeof (vmu_cache_t)); + } +} + +/* + * Copy out the cached results to a caller. Inspect the callers flags + * and zone to determine which cached results should be copied. + */ +static int +vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, + uint_t flags) +{ + vmusage_t *result, *out_result; + vmusage_t dummy; + size_t i, count = 0; + size_t bufsize; + int ret = 0; + uint_t types = 0; + + if (nres != NULL) { + if (copyin((caddr_t)nres, &bufsize, sizeof (size_t))) + return (set_errno(EFAULT)); + } else { + bufsize = 0; + } + + /* figure out what results the caller is interested in. 
*/ + if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) + types |= VMUSAGE_SYSTEM; + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + types |= VMUSAGE_ZONE; + if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) + types |= VMUSAGE_PROJECTS; + if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + types |= VMUSAGE_TASKS; + if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) + types |= VMUSAGE_RUSERS; + if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) + types |= VMUSAGE_EUSERS; + + /* count results for current zone */ + out_result = buf; + for (result = cache->vmc_results, i = 0; + i < cache->vmc_nresults; result++, i++) { + + /* Do not return "other-zone" results to non-global zones */ + if (curproc->p_zone != global_zone && + curproc->p_zone->zone_id != result->vmu_zoneid) + continue; + + /* + * If non-global zone requests VMUSAGE_SYSTEM, fake + * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result. + */ + if (curproc->p_zone != global_zone && + (flags & VMUSAGE_SYSTEM) != 0 && + result->vmu_type == VMUSAGE_ZONE) { + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + dummy = *result; + dummy.vmu_zoneid = ALL_ZONES; + dummy.vmu_id = 0; + dummy.vmu_type = VMUSAGE_SYSTEM; + if (copyout(&dummy, out_result, + sizeof (vmusage_t))) + return (set_errno( + EFAULT)); + out_result++; + } + } + } + + /* Skip results that do not match requested type */ + if ((result->vmu_type & types) == 0) + continue; + + /* Skip collated results if not requested */ + if (result->vmu_zoneid == ALL_ZONES) { + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & VMUSAGE_COL_PROJECTS) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & VMUSAGE_COL_EUSERS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & VMUSAGE_COL_RUSERS) == 0) + continue; + } + + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid 
!= curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + if (copyout(result, out_result, + sizeof (vmusage_t))) + return (set_errno(EFAULT)); + out_result++; + } + } + } + if (nres != NULL) + if (copyout(&count, (void *)nres, sizeof (size_t))) + return (set_errno(EFAULT)); + + return (ret); +}
+
+/*
+ * vm_getusage()
+ *
+ * Counts rss and swap by zone, project, task, and/or user.  The flags
+ * argument determines the type of results structures returned.  Flags
+ * requesting results from more than one zone are "flattened" to the local
+ * zone if the caller is not in the global zone.
+ *
+ * Results are cached in vmu_data.vmu_cache and shared between callers:
+ *
+ *   - If a cache exists, covers every requested flag, and is younger than
+ *     "age" seconds, it is copied out directly.
+ *   - Otherwise, if no thread is currently calculating
+ *     (vmu_data.vmu_calc_thread == NULL), the caller becomes the
+ *     calculating thread, runs vmu_calculate() with vmu_lock dropped,
+ *     installs a new cache and copies it out.
+ *   - Otherwise the caller records its flags in vmu_pending_flags, waits on
+ *     vmu_cv until the calculating thread is done, and starts over.
+ *
+ * args:
+ *	flags:	bitmap consisting of one or more of VMUSAGE_*.
+ *	age:	maximum allowable age (time since counting was done) in
+ *		seconds of the results.  Results from previous callers are
+ *		cached in kernel.
+ *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only
+ *		nres is set on success.
+ *	nres:	Set to number of vmusage_t structures pointed to by buf
+ *		before calling vm_getusage().
+ *		On return 0 (success) or EOVERFLOW, is set to the number of
+ *		result structures returned or attempted to return.
+ *
+ * returns 0 on success, -1 on failure:
+ *	EINTR (interrupted)
+ *	EOVERFLOW (nres too small for results, nres set to needed value for
+ *	    success)
+ *	EINVAL (flags invalid)
+ *	EFAULT (bad address for buf or nres)
+ *
+ * NOTE(review): the too-small-buffer error is listed as EOVERFLOW because
+ * that is what the visible part of vmu_copyout_results() sets; nothing in
+ * this function sets ENOSPC.  Confirm against the getvmusage(2) man page.
+ */
+int
+vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
+{
+	vmu_entity_t *entity;
+	vmusage_t *result;
+	int ret = 0;
+	int cacherecent = 0;	/* set once; never cleared on "goto start" */
+	hrtime_t now;
+	uint_t flags_orig;
+
+	/*
+	 * Non-global zones cannot request system wide and/or collated
+	 * results, or the system result, so munge the flags accordingly:
+	 * each ALL_ and COL_ variant is replaced by the corresponding
+	 * per-zone flag, and VMUSAGE_SYSTEM is replaced by VMUSAGE_ZONE.
+	 * The caller's unmodified flags are kept in flags_orig, which is
+	 * what gets passed to vmu_copyout_results() below; the munged
+	 * flags only decide what has to be calculated or found in cache.
+	 */
+	flags_orig = flags;
+	if (curproc->p_zone != global_zone) {
+		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
+			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
+			flags |= VMUSAGE_PROJECTS;
+		}
+		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
+			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
+			flags |= VMUSAGE_RUSERS;
+		}
+		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
+			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
+			flags |= VMUSAGE_EUSERS;
+		}
+		if (flags & VMUSAGE_SYSTEM) {
+			flags &= ~VMUSAGE_SYSTEM;
+			flags |= VMUSAGE_ZONE;
+		}
+	}
+
+	/* Reject any flag bit that lies outside VMUSAGE_MASK */
+	if ((flags & (~VMUSAGE_MASK)) != 0)
+		return (set_errno(EINVAL));
+
+	/* Reject a request that asks for no result type at all */
+	if ((flags & VMUSAGE_MASK) == 0)
+		return (set_errno(EINVAL));
+
+	mutex_enter(&vmu_data.vmu_lock);
+	now = gethrtime();	/* sampled once; not refreshed by "goto start" */
+
+start:
+	if (vmu_data.vmu_cache != NULL) {
+
+		vmu_cache_t *cache;
+
+		if ((vmu_data.vmu_cache->vmc_timestamp +
+		    ((hrtime_t)age * NANOSEC)) > now)
+			cacherecent = 1;	/* younger than age seconds */
+
+		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
+		    cacherecent == 1) {
+			cache = vmu_data.vmu_cache;
+			vmu_cache_hold(cache);	/* paired with the rele below */
+			mutex_exit(&vmu_data.vmu_lock);
+
+			ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+			mutex_enter(&vmu_data.vmu_lock);
+			vmu_cache_rele(cache);
+			if (vmu_data.vmu_pending_waiters > 0)
+				cv_broadcast(&vmu_data.vmu_cv);
+			mutex_exit(&vmu_data.vmu_lock);
+			return (ret);
+		}
+		/*
+		 * If the cache is recent, it is likely that there are other
+		 * consumers of vm_getusage running, so add their flags to the
+		 * desired flags for the calculation.  (Reached only when the
+		 * cache does not cover all of this caller's flags.)
+		 */
+		if (cacherecent == 1)
+			flags = vmu_data.vmu_cache->vmc_flags | flags;
+	}
+	if (vmu_data.vmu_calc_thread == NULL) {
+
+		vmu_cache_t *cache;
+
+		vmu_data.vmu_calc_thread = curthread;	/* others now wait */
+		vmu_data.vmu_calc_flags = flags;
+		vmu_data.vmu_entities = NULL;
+		vmu_data.vmu_nentities = 0;
+		if (vmu_data.vmu_pending_waiters > 0)
+			vmu_data.vmu_calc_flags |=
+			    vmu_data.vmu_pending_flags;
+
+		vmu_data.vmu_pending_flags = 0;
+		mutex_exit(&vmu_data.vmu_lock);	/* calculate without the lock */
+		vmu_calculate();
+		mutex_enter(&vmu_data.vmu_lock);
+		/*
+		 * Copy results to cache: release the previous cache, if any,
+		 * allocate a new one for vmu_nentities results tagged with
+		 * the flags that were calculated, and copy every entity's
+		 * result into it.
+		 */
+		if (vmu_data.vmu_cache != NULL)
+			vmu_cache_rele(vmu_data.vmu_cache);
+		cache = vmu_data.vmu_cache =
+		    vmu_cache_alloc(vmu_data.vmu_nentities,
+			vmu_data.vmu_calc_flags);
+
+		result = cache->vmc_results;
+		for (entity = vmu_data.vmu_entities; entity != NULL;
+		    entity = entity->vme_next) {
+			*result = entity->vme_result;
+			result++;
+		}
+		cache->vmc_timestamp = gethrtime();
+		vmu_cache_hold(cache);	/* paired with the rele after copyout */
+
+		vmu_data.vmu_calc_flags = 0;
+		vmu_data.vmu_calc_thread = NULL;	/* calculation finished */
+
+		if (vmu_data.vmu_pending_waiters > 0)
+			cv_broadcast(&vmu_data.vmu_cv);
+
+		mutex_exit(&vmu_data.vmu_lock);
+
+		/*
+		 * Copy the cache out to the caller with vmu_lock dropped,
+		 * then retake the lock only to release the hold taken above.
+		 */
+		ret = vmu_copyout_results(cache, buf, nres, flags_orig);
+		mutex_enter(&vmu_data.vmu_lock);
+		vmu_cache_rele(cache);
+		mutex_exit(&vmu_data.vmu_lock);
+
+		return (ret);
+	}
+	vmu_data.vmu_pending_flags |= flags;	/* picked up by next calculator */
+	vmu_data.vmu_pending_waiters++;
+	while (vmu_data.vmu_calc_thread != NULL) {
+		if (cv_wait_sig(&vmu_data.vmu_cv,
+		    &vmu_data.vmu_lock) == 0) {	/* 0 is reported as EINTR */
+			vmu_data.vmu_pending_waiters--;
+			mutex_exit(&vmu_data.vmu_lock);
+			return (set_errno(EINTR));
+		}
+	}
+	vmu_data.vmu_pending_waiters--;
+	goto start;	/* re-check the cache with vmu_lock still held */
+} |
