diff options
85 files changed, 8180 insertions, 872 deletions
diff --git a/usr/src/cmd/prstat/prstat.c b/usr/src/cmd/prstat/prstat.c index 743990ad2a..5a4b9185ea 100644 --- a/usr/src/cmd/prstat/prstat.c +++ b/usr/src/cmd/prstat/prstat.c @@ -31,6 +31,7 @@ #include <sys/loadavg.h> #include <sys/time.h> #include <sys/pset.h> +#include <sys/vm_usage.h> #include <zone.h> #include <libzonecfg.h> @@ -86,21 +87,21 @@ #define USAGE_HEADER_LWP \ " PID USERNAME USR SYS TRP TFL DFL LCK SLP LAT VCX ICX SCL SIG PROCESS/LWPID " #define USER_HEADER_PROC \ -" NPROC USERNAME SIZE RSS MEMORY TIME CPU " +" NPROC USERNAME SWAP RSS MEMORY TIME CPU " #define USER_HEADER_LWP \ -" NLWP USERNAME SIZE RSS MEMORY TIME CPU " +" NLWP USERNAME SWAP RSS MEMORY TIME CPU " #define TASK_HEADER_PROC \ -"TASKID NPROC SIZE RSS MEMORY TIME CPU PROJECT " +"TASKID NPROC SWAP RSS MEMORY TIME CPU PROJECT " #define TASK_HEADER_LWP \ -"TASKID NLWP SIZE RSS MEMORY TIME CPU PROJECT " +"TASKID NLWP SWAP RSS MEMORY TIME CPU PROJECT " #define PROJECT_HEADER_PROC \ -"PROJID NPROC SIZE RSS MEMORY TIME CPU PROJECT " +"PROJID NPROC SWAP RSS MEMORY TIME CPU PROJECT " #define PROJECT_HEADER_LWP \ -"PROJID NLWP SIZE RSS MEMORY TIME CPU PROJECT " +"PROJID NLWP SWAP RSS MEMORY TIME CPU PROJECT " #define ZONE_HEADER_PROC \ -"ZONEID NPROC SIZE RSS MEMORY TIME CPU ZONE " +"ZONEID NPROC SWAP RSS MEMORY TIME CPU ZONE " #define ZONE_HEADER_LWP \ -"ZONEID NLWP SIZE RSS MEMORY TIME CPU ZONE " +"ZONEID NLWP SWAP RSS MEMORY TIME CPU ZONE " #define PSINFO_LINE \ "%6d %-8s %5s %5s %-6s %3s %3s %9s %3.3s%% %-.16s/%d" #define PSINFO_LINE_LGRP \ @@ -160,6 +161,8 @@ static volatile uint_t sigwinch = 0; static volatile uint_t sigtstp = 0; static volatile uint_t sigterm = 0; +static long pagesize; + /* default settings */ static optdesc_t opts = { @@ -185,6 +188,129 @@ psetloadavg(long psetid, void *ptr) } /* + * Queries the memory virtual and rss size for each member of a list. + * This will override the values computed by /proc aggregation. 
+ */ +static void +list_getsize(list_t *list) +{ + id_info_t *id; + vmusage_t *results, *next; + vmusage_t *match; + size_t nres = 0; + size_t i; + uint_t flags = 0; + int ret; + size_t physmem = sysconf(_SC_PHYS_PAGES) * pagesize; + + /* + * Determine what swap/rss results to calculate. getvmusage() will + * prune results returned to non-global zones automatically, so + * there is no need to pass different flags when calling from a + * non-global zone. + * + * Currently list_getsize() is only called with a single flag. This + * is because -Z, -J, -T, and -a are mutually exclusive. If several + * were set, only the first match below would be honored. + */ + if (opts.o_outpmode & OPT_USERS) { + /* + * Gather rss for all users in all zones. Treat the same + * uid in different zones as the same user. + */ + flags |= VMUSAGE_COL_RUSERS; + + } else if (opts.o_outpmode & OPT_TASKS) { + /* Gather rss for all tasks in all zones */ + flags |= VMUSAGE_ALL_TASKS; + + } else if (opts.o_outpmode & OPT_PROJECTS) { + /* + * Gather rss for all projects in all zones. Treat the same + * projid in different zones as the same project. + */ + flags |= VMUSAGE_COL_PROJECTS; + + } else if (opts.o_outpmode & OPT_ZONES) { + /* Gather rss for all zones */ + flags |= VMUSAGE_ALL_ZONES; + + } else { + Die(gettext( + "Cannot determine rss flags for output options %x\n"), + opts.o_outpmode); + } + + /* + * getvmusage() returns an array of result structures. One for + * each zone, project, task, or user on the system, depending on + * flags. + * + * If getvmusage() fails, prstat will use the size already gathered + * from psinfo. + */ + if (getvmusage(flags, opts.o_interval, NULL, &nres) != 0) + return; + + results = (vmusage_t *)Malloc(sizeof (vmusage_t) * nres); + for (;;) { + ret = getvmusage(flags, opts.o_interval, results, &nres); + if (ret == 0) + break; + if (errno == EOVERFLOW) { + results = (vmusage_t *)Realloc(results, + sizeof (vmusage_t) * nres); + continue; + } + /* + * Failure for some other reason. 
Prstat will use the size + * already gathered from psinfo. + */ + free(results); + return; + } + for (id = list->l_head; id != NULL; id = id->id_next) { + + match = NULL; + next = results; + for (i = 0; i < nres; i++, next++) { + switch (flags) { + case VMUSAGE_COL_RUSERS: + if (next->vmu_id == id->id_uid) + match = next; + break; + case VMUSAGE_ALL_TASKS: + if (next->vmu_id == id->id_taskid) + match = next; + break; + case VMUSAGE_COL_PROJECTS: + if (next->vmu_id == id->id_projid) + match = next; + break; + case VMUSAGE_ALL_ZONES: + if (next->vmu_id == id->id_zoneid) + match = next; + break; + default: + Die(gettext( + "Unknown vmusage flags %d\n"), flags); + } + } + if (match != NULL) { + id->id_size = match->vmu_swap_all / 1024; + id->id_rssize = match->vmu_rss_all / 1024; + id->id_pctmem = (100.0 * (float)match->vmu_rss_all) / + (float)physmem; + /* Output using data from getvmusage() */ + id->id_sizematch = B_TRUE; + } + /* + * If no match is found, prstat will use the size already + * gathered from psinfo. 
+ */ + } + free(results); +} + +/* * A routine to display the contents of the list on the screen */ static void @@ -282,7 +408,7 @@ list_print(list_t *list) cpu = (100 * id->id_pctcpu) / total_cpu; else cpu = id->id_pctcpu; - if (total_mem >= 100) + if (id->id_sizematch == B_FALSE && total_mem >= 100) mem = (100 * id->id_pctmem) / total_mem; else mem = id->id_pctmem; @@ -566,6 +692,7 @@ update: id->id_zoneid = lwp->li_info.pr_zoneid; id->id_lgroup = lwp->li_info.pr_lwp.pr_lgrp; id->id_nproc++; + id->id_sizematch = B_FALSE; if (lwp->li_flags & LWP_REPRESENT) { id->id_size = lwp->li_info.pr_size; id->id_rssize = lwp->li_info.pr_rssize; @@ -1175,6 +1302,7 @@ Exit() fd_exit(); } + int main(int argc, char **argv) { @@ -1192,6 +1320,8 @@ main(int argc, char **argv) lwpid_init(); fd_init(Setrlimit()); + pagesize = sysconf(_SC_PAGESIZE); + while ((opt = getopt(argc, argv, "vcHmaRLtu:U:n:p:C:P:h:s:S:j:k:TJz:Z")) != (int)EOF) { switch (opt) { @@ -1419,21 +1549,25 @@ main(int argc, char **argv) list_print(&lwps); } if (opts.o_outpmode & OPT_USERS) { + list_getsize(&users); list_sort(&users); list_print(&users); list_clear(&users); } if (opts.o_outpmode & OPT_TASKS) { + list_getsize(&tasks); list_sort(&tasks); list_print(&tasks); list_clear(&tasks); } if (opts.o_outpmode & OPT_PROJECTS) { + list_getsize(&projects); list_sort(&projects); list_print(&projects); list_clear(&projects); } if (opts.o_outpmode & OPT_ZONES) { + list_getsize(&zones); list_sort(&zones); list_print(&zones); list_clear(&zones); diff --git a/usr/src/cmd/prstat/prstat.h b/usr/src/cmd/prstat/prstat.h index 1a13329845..d130164e7d 100644 --- a/usr/src/cmd/prstat/prstat.h +++ b/usr/src/cmd/prstat/prstat.h @@ -122,6 +122,7 @@ typedef struct id_info { zoneid_t id_zoneid; /* zone id */ int id_lgroup; /* lgroup id */ uint_t id_nproc; /* number of processes */ + boolean_t id_sizematch; /* size/rssize from getvmusage() */ size_t id_size; /* memory usage */ size_t id_rssize; /* resident set size */ ulong_t 
id_time; /* cpu time (in 
secs) */ diff --git a/usr/src/cmd/rcap/common/rcapd.h b/usr/src/cmd/rcap/common/rcapd.h index 89cf5f3d81..7a554c213b 100644 --- a/usr/src/cmd/rcap/common/rcapd.h +++ b/usr/src/cmd/rcap/common/rcapd.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,7 +57,21 @@ extern "C" { #define LCST_CAP_REMOVED (1<<1) #define LCST_CAP_ZERO (1<<2) -typedef int64_t rcid_t; +typedef enum { + RCIDT_PROJECT, + RCIDT_ZONE +} rcid_type_t; + +typedef struct { + /* + * The following field could just be a rcid_type_t but it gets + * written out to a file as binary data for communication between + * 64-bit rcapd & 32-bit rcapstat, so we need to force a standard size + * and alignment here. 
+ */ + uint64_t rcid_type; + int64_t rcid_val; +} rcid_t; typedef enum { LCU_COMPLETE, /* an enumeration of all possible collections */ @@ -138,7 +151,6 @@ typedef struct lcollection { uint64_t lcol_rss; /* RSS of all processes (kB) */ uint64_t lcol_image_size; /* image size of all processes (kB) */ uint64_t lcol_rss_cap; /* RSS cap (kB) */ - int lcol_stat_invalidate; /* flag to reset interval statistics */ lcollection_stat_t lcol_stat; /* statistics */ lcollection_stat_t lcol_stat_old; /* previous interval's statistics */ lprocess_t *lcol_lprocess; /* member processes */ @@ -162,12 +174,11 @@ typedef struct lcollection_report { extern int get_psinfo(pid_t, struct psinfo *, int, int(*)(void *, int), void *, lprocess_t *); -extern lcollection_t *lcollection_find(id_t); +extern lcollection_t *lcollection_find(rcid_t *); extern void lcollection_freq_move(lprocess_t *); -extern lcollection_t *lcollection_insert_update(rcid_t, uint64_t, char *, +extern lcollection_t *lcollection_insert_update(rcid_t *, uint64_t, char *, int *changes); extern int lcollection_member(lcollection_t *, lprocess_t *); -extern void lcollection_set_type(rctype_t); extern void lcollection_free(lcollection_t *); extern void lcollection_update(lcollection_update_type_t); extern void list_walk_collection(int (*)(lcollection_t *, void *), void *); @@ -178,12 +189,6 @@ extern void scan_abort(void); extern void check_update_statistics(void); /* - * The collection-specific function determining the collection ID from a - * process' psinfo. - */ -extern rcid_t(*rc_getidbypsinfo)(struct psinfo *); - -/* * Global (in rcapd only) variables. 
*/ extern rcfg_t rcfg; diff --git a/usr/src/cmd/rcap/common/rcapd_stat.h b/usr/src/cmd/rcap/common/rcapd_stat.h index c34ceb36e2..fa769ba643 100644 --- a/usr/src/cmd/rcap/common/rcapd_stat.h +++ b/usr/src/cmd/rcap/common/rcapd_stat.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,7 +43,10 @@ extern "C" { */ #define RC_MODE_LEN 16 typedef struct rcapd_stat_hdr { - pid_t rs_pid; /* pid of producer */ + /* + * sizeof pid_t can vary, so we use a fixed 64-bit quantity. + */ + uint64_t rs_pid; /* pid of producer */ hrtime_t rs_time; /* time recorded */ /* diff --git a/usr/src/cmd/rcap/common/utils.c b/usr/src/cmd/rcap/common/utils.c index f9757a12f6..c01f568915 100644 --- a/usr/src/cmd/rcap/common/utils.c +++ b/usr/src/cmd/rcap/common/utils.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -260,3 +259,77 @@ xatoi(char *p) return (i); } } + +/* + * get_running_zones() calls zone_list(2) to find out how many zones are + * running. It then calls zone_list(2) again to fetch the list of running + * zones (stored in *zents). + */ +int +get_running_zones(uint_t *nzents, zone_entry_t **zents) +{ + zoneid_t *zids; + uint_t nzents_saved; + int i; + zone_entry_t *zentp; + zone_state_t zstate; + + *zents = NULL; + if (zone_list(NULL, nzents) != 0) { + warn(gettext("could not get zoneid list\n")); + return (E_ERROR); + } + +again: + if (*nzents == 0) + return (E_SUCCESS); + + if ((zids = (zoneid_t *)calloc(*nzents, sizeof (zoneid_t))) == NULL) { + warn(gettext("out of memory: zones will not be capped\n")); + return (E_ERROR); + } + + nzents_saved = *nzents; + + if (zone_list(zids, nzents) != 0) { + warn(gettext("could not get zone list\n")); + free(zids); + return (E_ERROR); + } + if (*nzents != nzents_saved) { + /* list changed, try again */ + free(zids); + goto again; + } + + *zents = calloc(*nzents, sizeof (zone_entry_t)); + if (*zents == NULL) { + warn(gettext("out of memory: zones will not be capped\n")); + free(zids); + return (E_ERROR); + } + + zentp = *zents; + for (i = 0; i < *nzents; i++) { + char name[ZONENAME_MAX]; + + if (getzonenamebyid(zids[i], name, sizeof (name)) < 0) { + warn(gettext("could not get name for " + "zoneid %d\n"), zids[i]); + continue; + } + + (void) strlcpy(zentp->zname, name, sizeof (zentp->zname)); + zentp->zid = zids[i]; + if (zone_get_state(name, &zstate) != Z_OK || + zstate != ZONE_STATE_RUNNING) + continue; + + + zentp++; + } + *nzents = zentp - *zents; + + free(zids); + return (E_SUCCESS); +} diff --git a/usr/src/cmd/rcap/common/utils.h b/usr/src/cmd/rcap/common/utils.h index 678dee51ab..f952d59bbb 100644 --- 
a/usr/src/cmd/rcap/common/utils.h +++ b/usr/src/cmd/rcap/common/utils.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +32,7 @@ #include <libintl.h> #include <stdarg.h> #include <time.h> +#include <libzonecfg.h> #ifdef __cplusplus extern "C" { @@ -63,6 +63,11 @@ typedef enum rcm_dst { RCD_SYSLOG /* syslog() daemon facility */ } rcm_dst_t; +typedef struct zone_entry { + zoneid_t zid; + char zname[ZONENAME_MAX]; +} zone_entry_t; + #define LINELEN 256 /* max. message length */ #ifdef DEBUG @@ -95,6 +100,7 @@ extern void vdprintfe(int, char *, va_list); extern void dprintfe(int, char *, ...); extern void hrt2ts(hrtime_t, timestruc_t *); extern int xatoi(char *); +extern int get_running_zones(uint_t *, zone_entry_t **); #ifdef __cplusplus } diff --git a/usr/src/cmd/rcap/rcapadm/Makefile b/usr/src/cmd/rcap/rcapadm/Makefile index 59c1530185..3b4de32953 100644 --- a/usr/src/cmd/rcap/rcapadm/Makefile +++ b/usr/src/cmd/rcap/rcapadm/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -41,7 +40,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lumem -ll -lscf +LDLIBS += -lumem -ll -lscf -lzonecfg LINTFLAGS += $(LDLIBS) -mnu diff --git a/usr/src/cmd/rcap/rcapadm/rcapadm.c b/usr/src/cmd/rcap/rcapadm/rcapadm.c index cc9fd290a1..1951682283 100644 --- a/usr/src/cmd/rcap/rcapadm/rcapadm.c +++ b/usr/src/cmd/rcap/rcapadm/rcapadm.c @@ -39,6 +39,8 @@ #include <libscf_priv.h> #include <libintl.h> #include <locale.h> +#include <zone.h> +#include <libzonecfg.h> #include "utils.h" #include "rcapd.h" @@ -61,7 +63,9 @@ usage() " [-c <percent>] " "# set memory cap\n" " " - "# enforcement threshold\n")); + "# enforcement threshold\n" + " [-z <zonename> -m <max-rss>] " + "# update zone memory cap\n")); exit(E_USAGE); } @@ -135,18 +139,54 @@ out: scf_handle_destroy(h); } +/* + * Update the in-kernel memory cap for the specified zone. 
+ */ +static int +update_zone_mcap(char *zonename, char *maxrss) +{ + zoneid_t zone_id; + uint64_t num; + + if (getzoneid() != GLOBAL_ZONEID || zonecfg_in_alt_root()) + return (E_SUCCESS); + + /* get the running zone from the kernel */ + if ((zone_id = getzoneidbyname(zonename)) == -1) { + (void) fprintf(stderr, gettext("zone '%s' must be running\n"), + zonename); + return (E_ERROR); + } + + if (zonecfg_str_to_bytes(maxrss, &num) == -1) { + (void) fprintf(stderr, gettext("invalid max-rss value\n")); + return (E_ERROR); + } + + if (zone_setattr(zone_id, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + (void) fprintf(stderr, gettext("could not set memory " + "cap for zone '%s'\n"), zonename); + return (E_ERROR); + } + + return (E_SUCCESS); +} + int main(int argc, char *argv[]) { char *subopts, *optval; int modified = 0; + boolean_t refresh = B_FALSE; int opt; + char *zonename; + char *maxrss = NULL; (void) setprogname("rcapadm"); (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); - while ((opt = getopt(argc, argv, "DEc:i:n")) != EOF) { + while ((opt = getopt(argc, argv, "DEc:i:m:nz:")) != EOF) { switch (opt) { case 'n': no_starting_stopping = 1; @@ -203,12 +243,24 @@ main(int argc, char *argv[]) } modified++; break; + case 'm': + maxrss = optarg; + break; + case 'z': + refresh = B_TRUE; + zonename = optarg; + break; default: usage(); } } - if (argc > optind) + /* the -z & -m options must be used together */ + if (argc > optind || (refresh && maxrss == NULL) || + (!refresh && maxrss != NULL)) + usage(); + + if (refresh && (no_starting_stopping > 0 || modified)) usage(); if (rcfg_read(fname, -1, &conf, NULL) < 0) { @@ -232,6 +284,9 @@ main(int argc, char *argv[]) } } + if (refresh) + return (update_zone_mcap(zonename, maxrss)); + if (modified) { if (pressure >= 0) conf.rcfg_memory_cap_enforcement_pressure = pressure; diff --git a/usr/src/cmd/rcap/rcapd/Makefile.rcapd b/usr/src/cmd/rcap/rcapd/Makefile.rcapd index 5fd0d01416..716ea41e38 100644 --- 
a/usr/src/cmd/rcap/rcapd/Makefile.rcapd +++ b/usr/src/cmd/rcap/rcapd/Makefile.rcapd @@ -35,6 +35,7 @@ SRCS = rcapd_main.c \ rcapd_collection.c \ rcapd_collection_project.c \ + rcapd_collection_zone.c \ rcapd_mapping.c \ rcapd_rfd.c \ rcapd_scanner.c \ @@ -44,6 +45,7 @@ SRCS = rcapd_main.c \ LINTSRCS = ../rcapd_main.c \ ../rcapd_collection.c \ ../rcapd_collection_project.c \ + ../rcapd_collection_zone.c \ ../rcapd_mapping.c \ ../rcapd_rfd.c \ ../rcapd_scanner.c \ @@ -53,7 +55,7 @@ LINTSRCS = ../rcapd_main.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -DDEBUG_MSG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lkstat -ll -lproc -lproject -lumem +LDLIBS += -lkstat -ll -lproc -lproject -lzonecfg -lumem LDLIBS += $(EXTRA_LDLIBS) LINTFLAGS += -u diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection.c b/usr/src/cmd/rcap/rcapd/rcapd_collection.c index 7dac0e8155..fdaf8dbfe0 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,14 +40,16 @@ #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) typedef struct { - rcid_t lfa_colid; + rcid_t *lfa_colidp; lcollection_t *lfa_found; } lcollection_find_arg_t; extern void lcollection_update_project(lcollection_update_type_t, - void(*)(char *, int, uint64_t, int)); -extern void lcollection_set_type_project(); -static void lcollection_update_notification_cb(char *, int, uint64_t, int); + void(*)(char *, char *, int, uint64_t, int)); +extern void lcollection_update_zone(lcollection_update_type_t, + void(*)(char *, char *, int, uint64_t, int)); +static void lcollection_update_notification_cb(char *, char *, int, uint64_t, + int); rcid_t(*rc_getidbypsinfo)(psinfo_t *); uint64_t phys_total = 0; @@ -57,28 +58,8 @@ static lcollection_t *lcollection_head = NULL; void lcollection_update(lcollection_update_type_t ut) { - if (rcfg.rcfg_mode == rctype_project) - lcollection_update_project(ut, - lcollection_update_notification_cb); - else - die(gettext("unknown mode %s\n"), rcfg.rcfg_mode_name); -} - -/* - * Configure which collection type will be used. - */ -void -lcollection_set_type(rctype_t type) -{ - switch (type) { - case rctype_project: - lcollection_set_type_project(); - break; - default: - /* can't happen */ - die(gettext("unknown mode %d\n"), type); - /*NOTREACHED*/ - } + lcollection_update_zone(ut, lcollection_update_notification_cb); + lcollection_update_project(ut, lcollection_update_notification_cb); } /* @@ -93,7 +74,7 @@ lcollection_set_type(rctype_t type) * LCSS_CAP_ZERO */ lcollection_t * -lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, +lcollection_insert_update(rcid_t *colidp, uint64_t rss_cap, char *name, int *changes) { lcollection_t *lcol; @@ -103,7 +84,7 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, if (rss_cap == 0) *changes |= LCST_CAP_ZERO; - lcol = lcollection_find(colid); + lcol = lcollection_find(colidp); /* * If the specified collection is capped, add it to lcollection. 
@@ -120,12 +101,13 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, lcol = malloc(sizeof (*lcol)); if (lcol == NULL) { debug("not enough memory to monitor %s %s", - rcfg.rcfg_mode_name, name); + (colidp->rcid_type == RCIDT_PROJECT ? + "project" : "zone"), name); return (NULL); } (void) bzero(lcol, sizeof (*lcol)); - lcol->lcol_id = colid; + lcol->lcol_id = *colidp; debug("added collection %s\n", name); lcol->lcol_prev = NULL; lcol->lcol_next = lcollection_head; @@ -157,8 +139,8 @@ lcollection_insert_update(rcid_t colid, uint64_t rss_cap, char *name, } static void -lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap, - int mark) +lcollection_update_notification_cb(char *col_type, char *name, int changes, + uint64_t rss_cap, int mark) { /* * Assume the collection has been updated redundantly if its mark count @@ -168,10 +150,10 @@ lcollection_update_notification_cb(char *name, int changes, uint64_t rss_cap, return; if (changes & LCST_CAP_ZERO) - debug("%s %s: %s\n", rcfg.rcfg_mode_name, name, + debug("%s %s: %s\n", col_type, name, (changes & LCST_CAP_REMOVED) ? 
"cap removed" : "uncapped"); else - debug("%s %s: cap: %llukB\n", rcfg.rcfg_mode_name, name, + debug("%s %s: cap: %llukB\n", col_type, name, (unsigned long long)rss_cap); } @@ -215,19 +197,23 @@ lcollection_member(lcollection_t *lcol, lprocess_t *lpc) static int lcollection_find_cb(lcollection_t *lcol, void *arg) { - if (lcol->lcol_id == ((lcollection_find_arg_t *)arg)->lfa_colid) { + rcid_t *colidp = ((lcollection_find_arg_t *)arg)->lfa_colidp; + + if (lcol->lcol_id.rcid_type == colidp->rcid_type && + lcol->lcol_id.rcid_val == colidp->rcid_val) { ((lcollection_find_arg_t *)arg)->lfa_found = lcol; return (1); - } else - return (0); + } + + return (0); } lcollection_t * -lcollection_find(id_t colid) +lcollection_find(rcid_t *colidp) { lcollection_find_arg_t lfa; - lfa.lfa_colid = colid; + lfa.lfa_colidp = colidp; lfa.lfa_found = NULL; list_walk_collection(lcollection_find_cb, &lfa); diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c index ba34100f05..eab6d2a94a 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_project.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,24 +37,17 @@ /* round up to next y = 2^n */ #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) -static rcid_t rc_proj_getidbypsinfo(psinfo_t *); - -void -lcollection_set_type_project(void) -{ - rc_getidbypsinfo = rc_proj_getidbypsinfo; -} - static int lcollection_update_project_cb(const struct project *proj, void *walk_data) { - void(*update_notification_cb)(char *, int, uint64_t, int) = - (void(*)(char *, int, uint64_t, int))walk_data; + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; char *capattr_abs; char *end; int changes; int64_t max_rss; lcollection_t *lcol; + rcid_t colid; capattr_abs = strstr(proj->pj_attr, PJ_ABS_ATTR_NAME "="); if (capattr_abs != NULL) { @@ -70,17 +62,19 @@ lcollection_update_project_cb(const struct project *proj, void *walk_data) capattr_abs += strlen(PJ_ABS_ATTR_NAME "="); max_rss = ROUNDUP(strtoll(capattr_abs, &end, 10), 1024) / 1024; if (end == capattr_abs || *end != ';' && *end != 0) - warn(gettext("%s %s: malformed %s value " - "'%s'\n"), rcfg.rcfg_mode_name, proj->pj_name, - PJ_ABS_ATTR_NAME, capattr_abs); + warn(gettext("project %s: malformed %s value '%s'\n"), + proj->pj_name, PJ_ABS_ATTR_NAME, capattr_abs); } else max_rss = 0; - lcol = lcollection_insert_update(proj->pj_projid, max_rss, - proj->pj_name, &changes); + colid.rcid_type = RCIDT_PROJECT; + colid.rcid_val = proj->pj_projid; + + lcol = lcollection_insert_update(&colid, max_rss, proj->pj_name, + &changes); if (update_notification_cb != NULL) - update_notification_cb(proj->pj_name, changes, max_rss, (lcol != - NULL) ? lcol->lcol_mark : 0); + update_notification_cb("project", proj->pj_name, changes, + max_rss, (lcol != NULL) ? 
lcol->lcol_mark : 0); return (0); } @@ -101,10 +95,13 @@ lcollection_update_project_byid_cb(const projid_t id, void *walk_data) static int lcollection_update_onceactive_cb(lcollection_t *lcol, void *walk_data) { - void(*update_notification_cb)(char *, int, uint64_t, int) = - (void(*)(char *, int, uint64_t, int))walk_data; + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; + + if (lcol->lcol_id.rcid_type != RCIDT_PROJECT) + return (0); - return (lcollection_update_project_byid_cb(lcol->lcol_id, + return (lcollection_update_project_byid_cb(lcol->lcol_id.rcid_val, (void *)update_notification_cb)); } @@ -125,7 +122,7 @@ project_walk_all(int(*cb)(const struct project *, void *), void *walk_data) void lcollection_update_project(lcollection_update_type_t ut, - void(*update_notification_cb)(char *, int, uint64_t, int)) + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) { switch (ut) { case LCU_ACTIVE_ONLY: @@ -154,9 +151,3 @@ lcollection_update_project(lcollection_update_type_t ut, (void *)update_notification_cb); } } - -static rcid_t -rc_proj_getidbypsinfo(psinfo_t *psinfo) -{ - return (psinfo->pr_projid); -} diff --git a/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c new file mode 100644 index 0000000000..db86aa6276 --- /dev/null +++ b/usr/src/cmd/rcap/rcapd/rcapd_collection_zone.c @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <procfs.h> +#include <project.h> +#include <stdlib.h> +#include <strings.h> +#include <zone.h> +#include <libzonecfg.h> +#include "rcapd.h" +#include "utils.h" + +extern boolean_t gz_capped; + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +static void +update_zone(zone_entry_t *zent, void *walk_data) +{ + void(*update_notification_cb)(char *, char *, int, uint64_t, int) = + (void(*)(char *, char *, int, uint64_t, int))walk_data; + int changes; + int64_t max_rss; + uint64_t mcap; + lcollection_t *lcol; + rcid_t colid; + + if (zone_getattr(zent->zid, ZONE_ATTR_PHYS_MCAP, &mcap, + sizeof (mcap)) != -1 && mcap != 0) + max_rss = ROUNDUP(mcap, 1024) / 1024; + else + max_rss = 0; + + if (zent->zid == GLOBAL_ZONEID) { + if (max_rss > 0) + gz_capped = B_TRUE; + else + gz_capped = B_FALSE; + } + + + colid.rcid_type = RCIDT_ZONE; + colid.rcid_val = zent->zid; + + lcol = lcollection_insert_update(&colid, max_rss, zent->zname, + &changes); + if (update_notification_cb != NULL) + update_notification_cb("zone", zent->zname, changes, max_rss, + (lcol != NULL) ? lcol->lcol_mark : 0); +} + + +/* ARGSUSED */ +void +lcollection_update_zone(lcollection_update_type_t ut, + void(*update_notification_cb)(char *, char *, int, uint64_t, int)) +{ + int i; + uint_t nzents; + zone_entry_t *zents; + + /* + * Enumerate running zones. 
+ */ + if (get_running_zones(&nzents, &zents) != 0) + return; + + for (i = 0; i < nzents; i++) { + update_zone(&zents[i], (void *)update_notification_cb); + + } + + free(zents); +} diff --git a/usr/src/cmd/rcap/rcapd/rcapd_main.c b/usr/src/cmd/rcap/rcapd/rcapd_main.c index 9c2e8b3c48..960065826e 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_main.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_main.c @@ -61,6 +61,7 @@ #include <unistd.h> #include <zone.h> #include <assert.h> +#include <sys/vm_usage.h> #include "rcapd.h" #include "rcapd_mapping.h" #include "rcapd_rfd.h" @@ -80,30 +81,42 @@ #define STAT_TEMPLATE_SUFFIX ".XXXXXX" /* suffix of mkstemp() arg */ #define DAEMON_UID 1 /* uid to use */ +#define CAPPED_PROJECT 0x01 +#define CAPPED_ZONE 0x02 + typedef struct soft_scan_arg { uint64_t ssa_sum_excess; int64_t ssa_scan_goal; + boolean_t ssa_project_over_cap; } soft_scan_arg_t; +typedef struct sample_col_arg { + boolean_t sca_any_over_cap; + boolean_t sca_project_over_cap; +} sample_col_arg_t; + + static int debug_mode = 0; /* debug mode flag */ static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */ /* scanned */ static kstat_ctl_t *kctl; /* kstat chain */ -static uint64_t new_sp = 0, old_sp = 0; /* measure delta in page scan count */ -static int enforce_caps = 0; /* cap enforcement flag, dependent on */ - /* enforce_soft_caps and */ - /* global_scanner_running */ -static int enforce_soft_caps = 0; /* soft cap enforcement flag, */ - /* depending on memory pressure */ static int memory_pressure = 0; /* physical memory utilization (%) */ static int memory_pressure_sample = 0; /* count of samples */ -static int global_scanner_running = 0; /* global scanning flag, to avoid */ - /* interference with kernel's page */ - /* scanner */ +static long page_size_kb = 0; /* system page size in KB */ +static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */ +static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */ +static vmusage_t *vmu_vals = NULL; /* snapshot of 
kernel RSS/swap values */ static hrtime_t next_report; /* time of next report */ static int termination_signal = 0; /* terminating signal */ +static zoneid_t my_zoneid = (zoneid_t)-1; +static lcollection_t *gz_col; /* global zone collection */ rcfg_t rcfg; +/* + * Updated when we re-read the collection configurations if this rcapd instance + * is running in the global zone and the global zone is capped. + */ +boolean_t gz_capped = B_FALSE; /* * Flags. @@ -116,9 +129,9 @@ static int verify_statistics(void); static int update_statistics(void); /* - * Checks if a process is marked 'system'. Returns zero only when it is not. + * Checks if a process is marked 'system'. Returns FALSE only when it is not. */ -static int +static boolean_t proc_issystem(pid_t pid) { char pc_clname[PC_CLNMSZ]; @@ -128,22 +141,43 @@ proc_issystem(pid_t pid) return (strcmp(pc_clname, "SYS") == 0); } else { debug("cannot get class-specific scheduling parameters; " - "assuming system process"); - return (-1); + "assuming system process\n"); + return (B_TRUE); } } -/* - * fname is the process name, for debugging messages, and unscannable is a flag - * indicating whether the process should be scanned. - */ static void -lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) +lprocess_insert_mark(psinfo_t *psinfop) { + pid_t pid = psinfop->pr_pid; + /* flag indicating whether the process should be scanned. */ + int unscannable = psinfop->pr_nlwp == 0; + rcid_t colid; lcollection_t *lcol; lprocess_t *lproc; - if ((lcol = lcollection_find(colid)) == NULL) + /* + * Determine which collection to put this process into. We only have + * to worry about tracking both zone and project capped processes if + * this rcapd instance is running in the global zone, since we'll only + * see processes in our own projects in a non-global zone. In the + * global zone, if the process belongs to a non-global zone, we only + * need to track it for the capped non-global zone collection. 
For + * global zone processes, we first attempt to put the process into a + * capped project collection. On the second pass into this function + * the projid will be cleared so we will just track the process for the + * global zone collection as a whole. + */ + if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) { + colid.rcid_type = RCIDT_PROJECT; + colid.rcid_val = psinfop->pr_projid; + } else { + /* try to add to zone collection */ + colid.rcid_type = RCIDT_ZONE; + colid.rcid_val = psinfop->pr_zoneid; + } + + if ((lcol = lcollection_find(&colid)) == NULL) return; /* @@ -193,7 +227,8 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) if (lcollection_member(lcol, lproc)) { lprocess_t *cur = lcol->lcol_lprocess; debug("The collection %lld already has these members, " - "including me, %d!\n", (long long)lcol->lcol_id, + "including me, %d!\n", + (long long)lcol->lcol_id.rcid_val, (int)lproc->lpc_pid); while (cur != NULL) { debug("\t%d\n", (int)cur->lpc_pid); @@ -209,7 +244,10 @@ lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable) lproc->lpc_prev = NULL; lcol->lcol_lprocess = lproc; - debug("tracking %d %d %s%s\n", (int)colid, (int)pid, fname, + debug("tracking %s %ld %d %s%s\n", + (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"), + (long)colid.rcid_val, + (int)pid, psinfop->pr_psargs, (lproc->lpc_unscannable != 0) ? " (not scannable)" : ""); lcol->lcol_stat.lcols_proc_in++; } @@ -328,22 +366,28 @@ get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd, } /* - * Retrieve the collection membership of all processes in our zone, and update - * the psinfo of those non-system, non-zombie ones in collections. + * Retrieve the collection membership of all processes and update the psinfo of + * those non-system, non-zombie ones in collections. For global zone processes, + * we first attempt to put the process into a capped project collection. 
We + * also want to track the process for the global zone collection as a whole. */ static void proc_cb(const pid_t pid) { - static zoneid_t ours = (zoneid_t)-1; psinfo_t psinfo; - if (ours == (zoneid_t)-1) - ours = getzoneid(); - - if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0 && - psinfo.pr_zoneid == ours) - lprocess_insert_mark(psinfo.pr_pid, rc_getidbypsinfo(&psinfo), - psinfo.pr_psargs, psinfo.pr_nlwp == 0); + if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) { + lprocess_insert_mark(&psinfo); + if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) { + /* + * We also want to track this process for the global + * zone as a whole so add it to the global zone + * collection as well. + */ + psinfo.pr_projid = -1; + lprocess_insert_mark(&psinfo); + } + } } /* @@ -359,57 +403,149 @@ lprocess_update_psinfo_fd_cb(void *arg, int fd) } /* - * Update the RSS of processes in monitored collections. + * Get the system pagesize. */ -/*ARGSUSED*/ -static int -mem_sample_cb(lcollection_t *lcol, lprocess_t *lpc) +static void +get_page_size(void) { - psinfo_t psinfo; + page_size_kb = sysconf(_SC_PAGESIZE) / 1024; + debug("physical page size: %luKB\n", page_size_kb); +} + +static void +tm_fmt(char *msg, hrtime_t t1, hrtime_t t2) +{ + hrtime_t diff = t2 - t1; + + if (diff < MILLISEC) + debug("%s: %lld nanoseconds\n", msg, diff); + else if (diff < MICROSEC) + debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC); + else if (diff < NANOSEC) + debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC); + else + debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC); +} + +/* + * Get the zone's & project's RSS from the kernel. 
+ */ +static void +rss_sample(boolean_t my_zone_only, uint_t col_types) +{ + size_t nres; + size_t i; + uint_t flags; + hrtime_t t1, t2; - if (get_psinfo(lpc->lpc_pid, &psinfo, lpc->lpc_psinfo_fd, - lprocess_update_psinfo_fd_cb, lpc, lpc) == 0) { - lpc->lpc_rss = psinfo.pr_rssize; - lpc->lpc_size = psinfo.pr_size; + if (my_zone_only) { + flags = VMUSAGE_ZONE; } else { - if (errno == ENOENT) - debug("process %d finished\n", (int)lpc->lpc_pid); - else - debug("process %d: cannot read psinfo", - (int)lpc->lpc_pid); - lprocess_free(lpc); + flags = 0; + if (col_types & CAPPED_PROJECT) + flags |= VMUSAGE_PROJECTS; + if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID) + flags |= VMUSAGE_ALL_ZONES; } - return (0); + debug("vmusage sample flags 0x%x\n", flags); + if (flags == 0) + return; + +again: + /* try the current buffer to see if the list will fit */ + nres = vmu_vals_len; + t1 = gethrtime(); + if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval, + vmu_vals, &nres) != 0) { + if (errno != EOVERFLOW) { + warn(gettext("can't read RSS from kernel\n")); + return; + } + } + t2 = gethrtime(); + tm_fmt("getvmusage time", t1, t2); + + debug("kernel nres %lu\n", (ulong_t)nres); + + if (nres > vmu_vals_len) { + /* array size is now too small, increase it and try again */ + free(vmu_vals); + + if ((vmu_vals = (vmusage_t *)calloc(nres, + sizeof (vmusage_t))) == NULL) { + warn(gettext("out of memory: could not read RSS from " + "kernel\n")); + vmu_vals_len = nvmu_vals = 0; + return; + } + vmu_vals_len = nres; + goto again; + } + + nvmu_vals = nres; + + debug("vmusage_sample\n"); + for (i = 0; i < nvmu_vals; i++) { + debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), " + "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id, + vmu_vals[i].vmu_type, + (unsigned long long)vmu_vals[i].vmu_rss_all, + (unsigned long long)vmu_vals[i].vmu_rss_all / 1024, + (unsigned long long)vmu_vals[i].vmu_swap_all); + } +} + +static void +update_col_rss(lcollection_t *lcol) +{ + 
int i; + + lcol->lcol_rss = 0; + lcol->lcol_image_size = 0; + + for (i = 0; i < nvmu_vals; i++) { + if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val) + continue; + + if (vmu_vals[i].vmu_type == VMUSAGE_ZONE && + lcol->lcol_id.rcid_type != RCIDT_ZONE) + continue; + + if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS && + lcol->lcol_id.rcid_type != RCIDT_PROJECT) + continue; + + /* we found the right RSS entry, update the collection vals */ + lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024; + lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024; + break; + } } /* * Sample the collection RSS, updating the collection's statistics with the - * results. + * results. Also, sum the rss of all capped projects & return true if + * the collection is over cap. */ -/*ARGSUSED*/ static int rss_sample_col_cb(lcollection_t *lcol, void *arg) { int64_t excess; uint64_t rss; + sample_col_arg_t *col_argp = (sample_col_arg_t *)arg; - /* - * If updating statistics for a new interval, reset the affected - * counters. - */ - if (lcol->lcol_stat_invalidate != 0) { - lcol->lcol_stat_old = lcol->lcol_stat; - lcol->lcol_stat.lcols_min_rss = (int64_t)-1; - lcol->lcol_stat.lcols_max_rss = 0; - lcol->lcol_stat_invalidate = 0; - } + update_col_rss(lcol); lcol->lcol_stat.lcols_rss_sample++; - excess = lcol->lcol_rss - lcol->lcol_rss_cap; rss = lcol->lcol_rss; - if (excess > 0) + excess = rss - lcol->lcol_rss_cap; + if (excess > 0) { lcol->lcol_stat.lcols_rss_act_sum += rss; + col_argp->sca_any_over_cap = B_TRUE; + if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) + col_argp->sca_project_over_cap = B_TRUE; + } lcol->lcol_stat.lcols_rss_sum += rss; if (lcol->lcol_stat.lcols_min_rss > rss) @@ -421,6 +557,30 @@ rss_sample_col_cb(lcollection_t *lcol, void *arg) } /* + * Determine if we have capped projects, capped zones or both. 
+ */ +static int +col_type_cb(lcollection_t *lcol, void *arg) +{ + uint_t *col_type = (uint_t *)arg; + + /* skip uncapped collections */ + if (lcol->lcol_rss_cap == 0) + return (1); + + if (lcol->lcol_id.rcid_type == RCIDT_PROJECT) + *col_type |= CAPPED_PROJECT; + else + *col_type |= CAPPED_ZONE; + + /* once we know everything is capped, we can stop looking */ + if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT)) + return (1); + + return (0); +} + +/* * Open /proc and walk entries. */ static void @@ -449,23 +609,6 @@ proc_walk_all(void (*cb)(const pid_t)) } /* - * Memory update callback. - */ -static int -memory_all_cb(lcollection_t *lcol, lprocess_t *lpc) -{ - debug_high("%s %s, pid %d: rss += %llu/%llu\n", rcfg.rcfg_mode_name, - lcol->lcol_name, (int)lpc->lpc_pid, - (unsigned long long)lpc->lpc_rss, - (unsigned long long)lpc->lpc_size); - ASSERT(lpc->lpc_rss <= lpc->lpc_size); - lcol->lcol_rss += lpc->lpc_rss; - lcol->lcol_image_size += lpc->lpc_size; - - return (0); -} - -/* * Clear unmarked callback. */ /*ARGSUSED*/ @@ -483,19 +626,6 @@ sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc) } /* - * Memory clear callback. - */ -/*ARGSUSED*/ -static int -collection_zero_mem_cb(lcollection_t *lcol, void *arg) -{ - lcol->lcol_rss = 0; - lcol->lcol_image_size = 0; - - return (0); -} - -/* * Print, for debugging purposes, a collection's recently-sampled RSS and * excess. */ @@ -506,7 +636,8 @@ excess_print_cb(lcollection_t *lcol, void *arg) int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap; debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n", - rcfg.rcfg_mode_name, lcol->lcol_name, + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"), + lcol->lcol_name, (unsigned long long)lcol->lcol_rss, (unsigned long long)lcol->lcol_rss_cap, (long long)excess); @@ -516,6 +647,10 @@ excess_print_cb(lcollection_t *lcol, void *arg) /* * Scan those collections which have exceeded their caps. + * + * If we're running in the global zone it might have a cap. 
We don't want to + * do any capping for the global zone yet since we might get under the cap by + * just capping the projects in the global zone. */ /*ARGSUSED*/ static int @@ -523,6 +658,13 @@ scan_cb(lcollection_t *lcol, void *arg) { int64_t excess; + /* skip over global zone collection for now but keep track for later */ + if (lcol->lcol_id.rcid_type == RCIDT_ZONE && + lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { + gz_col = lcol; + return (0); + } + if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { scan(lcol, excess); lcol->lcol_stat.lcols_scan++; @@ -532,6 +674,37 @@ scan_cb(lcollection_t *lcol, void *arg) } /* + * Scan the global zone collection and see if it still exceeds its cap. + * We take into account the effects of capping any global zone projects here. + */ +static void +scan_gz(lcollection_t *lcol, boolean_t project_over_cap) +{ + int64_t excess; + + /* + * If we had projects over their cap and the global zone was also over + * its cap then we need to get the up-to-date global zone rss to + * determine if we are still over the global zone cap. We might have + * gone under while we scanned the capped projects. If there were no + * projects over cap then we can use the rss value we already have for + * the global zone. + */ + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + if (project_over_cap && excess > 0) { + rss_sample(B_TRUE, CAPPED_ZONE); + update_col_rss(lcol); + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + } + + if (excess > 0) { + debug("global zone excess %lldKB\n", (long long)excess); + scan(lcol, excess); + lcol->lcol_stat.lcols_scan++; + } +} + +/* * Do a soft scan of those collections which have excesses. A soft scan is one * in which the cap enforcement pressure is taken into account. 
The difference * between the utilized physical memory and the cap enforcement pressure will @@ -544,22 +717,72 @@ soft_scan_cb(lcollection_t *lcol, void *a) int64_t excess; soft_scan_arg_t *arg = a; + /* skip over global zone collection for now but keep track for later */ + if (lcol->lcol_id.rcid_type == RCIDT_ZONE && + lcol->lcol_id.rcid_val == GLOBAL_ZONEID) { + gz_col = lcol; + return (0); + } + if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) { - debug("col %lld excess %lld scan_goal %lld sum_excess %llu, " - "scanning %lld\n", (long long)lcol->lcol_id, + int64_t adjusted_excess = + excess * arg->ssa_scan_goal / arg->ssa_sum_excess; + + debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " + "scanning %lld\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), + (long)lcol->lcol_id.rcid_val, (long long)excess, (long long)arg->ssa_scan_goal, (unsigned long long)arg->ssa_sum_excess, - (long long)(excess * arg->ssa_scan_goal / - arg->ssa_sum_excess)); + (long long)adjusted_excess); - scan(lcol, (int64_t)(excess * arg->ssa_scan_goal / - arg->ssa_sum_excess)); + scan(lcol, adjusted_excess); lcol->lcol_stat.lcols_scan++; } return (0); } +static void +soft_scan_gz(lcollection_t *lcol, void *a) +{ + int64_t excess; + soft_scan_arg_t *arg = a; + + /* + * If we had projects over their cap and the global zone was also over + * its cap then we need to get the up-to-date global zone rss to + * determine if we are still over the global zone cap. We might have + * gone under while we scanned the capped projects. If there were no + * projects over cap then we can use the rss value we already have for + * the global zone. 
+ */ + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + if (arg->ssa_project_over_cap && excess > 0) { + rss_sample(B_TRUE, CAPPED_ZONE); + update_col_rss(lcol); + excess = lcol->lcol_rss - lcol->lcol_rss_cap; + } + + if (excess > 0) { + int64_t adjusted_excess = + excess * arg->ssa_scan_goal / arg->ssa_sum_excess; + + debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, " + "scanning %lld\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), + (long)lcol->lcol_id.rcid_val, + (long long)excess, (long long)arg->ssa_scan_goal, + (unsigned long long)arg->ssa_sum_excess, + (long long)adjusted_excess); + + scan(lcol, adjusted_excess); + lcol->lcol_stat.lcols_scan++; + } +} + /* * When a scan could happen, but caps aren't enforced tick the * lcols_unenforced_cap counter. @@ -582,8 +805,7 @@ update_phys_total(void) uint64_t old_phys_total; old_phys_total = phys_total; - phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE) - / 1024; + phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb; if (phys_total != old_phys_total) debug("physical memory%s: %lluM\n", (old_phys_total == 0 ? "" : " adjusted"), (unsigned long long)(phys_total / 1024)); @@ -687,7 +909,9 @@ static int collection_sweep_cb(lcollection_t *lcol, void *arg) { if (lcol->lcol_mark == 0) { - debug("freeing %s %s\n", rcfg.rcfg_mode_name, lcol->lcol_name); + debug("freeing %s %s\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), lcol->lcol_name); lcollection_free(lcol); } @@ -710,8 +934,6 @@ finish_configuration(void) rcfg.rcfg_mode_name = "project"; rcfg.rcfg_mode = rctype_project; } - - lcollection_set_type(rcfg.rcfg_mode); } /* @@ -754,7 +976,8 @@ reread_configuration_file(void) * deletions to cap definitions. 
*/ static void -reconfigure(void) +reconfigure(hrtime_t now, hrtime_t *next_configuration, + hrtime_t *next_proc_walk, hrtime_t *next_rss_sample) { debug("reconfigure...\n"); @@ -770,6 +993,31 @@ reconfigure(void) list_walk_collection(collection_clear_cb, NULL); lcollection_update(LCU_ACTIVE_ONLY); /* mark */ list_walk_collection(collection_sweep_cb, NULL); + + *next_configuration = NEXT_EVENT_TIME(now, + rcfg.rcfg_reconfiguration_interval); + + /* + * Reset each event time to the shorter of the previous and new + * intervals. + */ + if (next_report == 0 && rcfg.rcfg_report_interval > 0) + next_report = now; + else + next_report = POSITIVE_MIN(next_report, + NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval)); + + if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0) + *next_proc_walk = now; + else + *next_proc_walk = POSITIVE_MIN(*next_proc_walk, + NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval)); + + if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0) + *next_rss_sample = now; + else + *next_rss_sample = POSITIVE_MIN(*next_rss_sample, + NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval)); } /* @@ -791,20 +1039,20 @@ static int simple_report_collection_cb(lcollection_t *lcol, void *arg) { #define DELTA(field) \ - (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \ + (unsigned long long)( \ (lcol->lcol_stat.field - lcol->lcol_stat_old.field)) -#define VALID(field) \ - (unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \ - lcol->lcol_stat.field) debug("%s %s status: succeeded/attempted (k): %llu/%llu, " "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS " "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, " - "%llu scans over %llu ms\n", rcfg.rcfg_mode_name, lcol->lcol_name, + "%llu scans over %llu ms\n", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
"project" : "zone"), + lcol->lcol_name, DELTA(lcols_pg_eff), DELTA(lcols_pg_att), DELTA(lcols_scan_ineffective), DELTA(lcols_scan), DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample), - VALID(lcols_min_rss), VALID(lcols_max_rss), + (unsigned long long)lcol->lcol_stat.lcols_min_rss, + (unsigned long long)lcol->lcol_stat.lcols_max_rss, (unsigned long long)lcol->lcol_rss_cap, (unsigned long long)(lcol->lcol_stat.lcols_proc_in - lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out), @@ -812,7 +1060,6 @@ simple_report_collection_cb(lcollection_t *lcol, void *arg) / MILLISEC)); #undef DELTA -#undef VALID return (0); } @@ -838,13 +1085,11 @@ report_collection_cb(lcollection_t *lcol, void *arg) dc.lcol_stat = lcol->lcol_stat; if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) { - /* - * Set a flag to indicate that the exported interval snapshot - * values should be reset at the next sample. - */ - lcol->lcol_stat_invalidate = 1; + lcol->lcol_stat_old = lcol->lcol_stat; } else { - debug("can't write %s %s statistics", rcfg.rcfg_mode_name, + debug("can't write %s %s statistics", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? + "project" : "zone"), lcol->lcol_name); } @@ -871,8 +1116,9 @@ get_globally_scanned_pages(uint64_t *scannedp) if (kstat_read(kctl, ksp, NULL) != -1) { scanned += ((cpu_stat_t *) ksp->ks_data)->cpu_vminfo.scan; - } else + } else { return (-1); + } } } @@ -881,6 +1127,59 @@ get_globally_scanned_pages(uint64_t *scannedp) } /* + * Determine if the global page scanner is running, during which no memory + * caps should be enforced, to prevent interference with the global page + * scanner. 
+ */ +static boolean_t +is_global_scanner_running() +{ + /* measure delta in page scan count */ + static uint64_t new_sp = 0; + static uint64_t old_sp = 0; + boolean_t res = B_FALSE; + + if (get_globally_scanned_pages(&new_sp) == 0) { + if (old_sp != 0 && (new_sp - old_sp) > 0) { + debug("global memory pressure detected (%llu " + "pages scanned since last interval)\n", + (unsigned long long)(new_sp - old_sp)); + res = B_TRUE; + } + old_sp = new_sp; + } else { + warn(gettext("unable to read cpu statistics")); + new_sp = old_sp; + } + + return (res); +} + +/* + * If soft caps are in use, determine if global memory pressure exceeds the + * configured maximum above which soft caps are enforced. + */ +static boolean_t +must_enforce_soft_caps() +{ + /* + * Check for changes to the amount of installed physical memory, to + * compute the current memory pressure. + */ + update_phys_total(); + + memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb) + * 100.0 / phys_total); + memory_pressure_sample++; + if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 && + memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Update the shared statistics file with each collection's current statistics. * Return zero on success. */ @@ -973,6 +1272,26 @@ sum_excess_cb(lcollection_t *lcol, void *arg) return (0); } +/* + * Compute the quantity of memory (in kilobytes) above the cap enforcement + * pressure. Set the scan goal to that quantity (or at most the excess). + */ +static void +compute_soft_scan_goal(soft_scan_arg_t *argp) +{ + /* + * Compute the sum of the collections' excesses, which will be the + * denominator. 
+ */ + argp->ssa_sum_excess = 0; + list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess)); + + argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) * + (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 - + sysconf(_SC_AVPHYS_PAGES)) * page_size_kb, + argp->ssa_sum_excess); +} + static void rcapd_usage(void) { @@ -1017,6 +1336,112 @@ verify_and_set_privileges(void) priv_freeset(required); } +/* + * This function does the top-level work to determine if we should do any + * memory capping, and if so, it invokes the right call-backs to do the work. + */ +static void +do_capping(hrtime_t now, hrtime_t *next_proc_walk) +{ + boolean_t enforce_caps; + /* soft cap enforcement flag, depending on memory pressure */ + boolean_t enforce_soft_caps; + /* avoid interference with kernel's page scanner */ + boolean_t global_scanner_running; + sample_col_arg_t col_arg; + soft_scan_arg_t arg; + uint_t col_types = 0; + + /* check what kind of collections (project/zone) are capped */ + list_walk_collection(col_type_cb, &col_types); + debug("collection types: 0x%x\n", col_types); + + /* no capped collections, skip checking rss */ + if (col_types == 0) + return; + + /* Determine if soft caps are enforced. */ + enforce_soft_caps = must_enforce_soft_caps(); + + /* Determine if the global page scanner is running. */ + global_scanner_running = is_global_scanner_running(); + + /* + * Sample collections' member processes RSSes and recompute + * collections' excess. + */ + rss_sample(B_FALSE, col_types); + + col_arg.sca_any_over_cap = B_FALSE; + col_arg.sca_project_over_cap = B_FALSE; + list_walk_collection(rss_sample_col_cb, &col_arg); + list_walk_collection(excess_print_cb, NULL); + debug("any collection/project over cap = %d, %d\n", + col_arg.sca_any_over_cap, col_arg.sca_project_over_cap); + + if (enforce_soft_caps) + debug("memory pressure %d%%\n", memory_pressure); + + /* + * Cap enforcement is determined by the previous conditions. 
+ */ + enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap && + (rcfg.rcfg_memory_cap_enforcement_pressure == 0 || + enforce_soft_caps); + + debug("%senforcing caps\n", enforce_caps ? "" : "not "); + + /* + * If soft caps are in use, determine the size of the portion from each + * collection to scan for. + */ + if (enforce_caps && enforce_soft_caps) + compute_soft_scan_goal(&arg); + + /* + * Victimize offending collections. + */ + if (enforce_caps && (!enforce_soft_caps || + (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) { + + /* + * Since at least one collection is over its cap & needs + * enforcing, check if it is at least time for a process walk + * (we could be well past time since we only walk /proc when + * we need to) and if so, update each collections process list + * in a single pass through /proc. + */ + if (EVENT_TIME(now, *next_proc_walk)) { + debug("scanning process list...\n"); + proc_walk_all(proc_cb); /* insert & mark */ + list_walk_all(sweep_process_cb); /* free dead procs */ + *next_proc_walk = NEXT_EVENT_TIME(now, + rcfg.rcfg_proc_walk_interval); + } + + gz_col = NULL; + if (enforce_soft_caps) { + debug("scan goal is %lldKB\n", + (long long)arg.ssa_scan_goal); + list_walk_collection(soft_scan_cb, &arg); + if (gz_capped && gz_col != NULL) { + /* process global zone */ + arg.ssa_project_over_cap = + col_arg.sca_project_over_cap; + soft_scan_gz(gz_col, &arg); + } + } else { + list_walk_collection(scan_cb, NULL); + if (gz_capped && gz_col != NULL) { + /* process global zone */ + scan_gz(gz_col, col_arg.sca_project_over_cap); + } + } + } else if (col_arg.sca_any_over_cap) { + list_walk_collection(unenforced_cap_cb, NULL); + } +} + int main(int argc, char *argv[]) { @@ -1029,9 +1454,6 @@ main(int argc, char *argv[]) hrtime_t next_proc_walk; /* time of next /proc scan */ hrtime_t next_configuration; /* time of next configuration */ hrtime_t next_rss_sample; /* (latest) time of next RSS sample */ - int old_enforce_caps; /* track 
changes in enforcement */ - /* conditions */ - soft_scan_arg_t arg; (void) set_message_priority(RCM_INFO); (void) setprogname("rcapd"); @@ -1125,13 +1547,6 @@ main(int argc, char *argv[]) next_configuration = NEXT_EVENT_TIME(gethrtime(), rcfg.rcfg_reconfiguration_interval); - if (rcfg.rcfg_memory_cap_enforcement_pressure == 0) { - /* - * Always enforce caps when strict caps are used. - */ - enforce_caps = 1; - } - /* * Open the kstat chain. */ @@ -1158,6 +1573,9 @@ main(int argc, char *argv[]) else debug("fd limit: unknown\n"); + get_page_size(); + my_zoneid = getzoneid(); + /* * Handle those signals whose (default) exit disposition * prevents rcapd from finishing scanning before terminating. @@ -1194,9 +1612,9 @@ main(int argc, char *argv[]) /* * Loop forever, monitoring collections' resident set sizes and - * enforcing their caps. Look for changes in caps and process - * membership, as well as responding to requests to reread the - * configuration. Update per-collection statistics periodically. + * enforcing their caps. Look for changes in caps as well as + * responding to requests to reread the configuration. Update + * per-collection statistics periodically. */ while (should_run != 0) { struct timespec ts; @@ -1210,9 +1628,10 @@ main(int argc, char *argv[]) } /* - * Update the process list once every proc_walk_interval. The - * condition of global memory pressure is also checked at the - * same frequency, if strict caps are in use. + * Check the configuration at every next_configuration interval. + * Update the rss data once every next_rss_sample interval. + * The condition of global memory pressure is also checked at + * the same frequency, if strict caps are in use. 
*/ now = gethrtime(); @@ -1222,178 +1641,16 @@ main(int argc, char *argv[]) */ if (EVENT_TIME(now, next_configuration) || should_reconfigure == 1) { - reconfigure(); - next_configuration = NEXT_EVENT_TIME(now, - rcfg.rcfg_reconfiguration_interval); - - /* - * Reset each event time to the shorter of the - * previous and new intervals. - */ - if (next_report == 0 && - rcfg.rcfg_report_interval > 0) - next_report = now; - else - next_report = POSITIVE_MIN(next_report, - NEXT_REPORT_EVENT_TIME(now, - rcfg.rcfg_report_interval)); - if (next_proc_walk == 0 && - rcfg.rcfg_proc_walk_interval > 0) - next_proc_walk = now; - else - next_proc_walk = POSITIVE_MIN(next_proc_walk, - NEXT_EVENT_TIME(now, - rcfg.rcfg_proc_walk_interval)); - if (next_rss_sample == 0 && - rcfg.rcfg_rss_sample_interval > 0) - next_rss_sample = now; - else - next_rss_sample = POSITIVE_MIN(next_rss_sample, - NEXT_EVENT_TIME(now, - rcfg.rcfg_rss_sample_interval)); - + reconfigure(now, &next_configuration, &next_proc_walk, + &next_rss_sample); should_reconfigure = 0; - continue; - } - - if (EVENT_TIME(now, next_proc_walk)) { - debug("scanning process list...\n"); - proc_walk_all(proc_cb); /* mark */ - list_walk_all(sweep_process_cb); - next_proc_walk = NEXT_EVENT_TIME(now, - rcfg.rcfg_proc_walk_interval); } + /* + * Do the main work for enforcing caps. + */ if (EVENT_TIME(now, next_rss_sample)) { - /* - * Check for changes to the amount of installed - * physical memory, to compute the current memory - * pressure. - */ - update_phys_total(); - - /* - * If soft caps are in use, determine if global memory - * pressure exceeds the configured maximum above which - * soft caps are enforced. 
- */ - memory_pressure = 100 - - (int)((sysconf(_SC_AVPHYS_PAGES) * - (sysconf(_SC_PAGESIZE) / 1024)) * 100.0 / - phys_total); - memory_pressure_sample++; - if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) { - if (memory_pressure > - rcfg.rcfg_memory_cap_enforcement_pressure) { - if (enforce_soft_caps == 0) { - debug("memory pressure %d%%\n", - memory_pressure); - enforce_soft_caps = 1; - } - } else { - if (enforce_soft_caps == 1) - enforce_soft_caps = 0; - } - } - - /* - * Determine if the global page scanner is running, - * while which no memory caps should be enforced, to - * prevent interference with the global page scanner. - */ - if (get_globally_scanned_pages(&new_sp) == 0) { - if (old_sp == 0) - /*EMPTY*/ - ; - else if ((new_sp - old_sp) > 0) { - if (global_scanner_running == 0) { - debug("global memory pressure " - "detected (%llu pages " - "scanned since last " - "interval)\n", - (unsigned long long) - (new_sp - old_sp)); - global_scanner_running = 1; - } - } else if (global_scanner_running == 1) { - debug("global memory pressure " - "relieved\n"); - global_scanner_running = 0; - } - old_sp = new_sp; - } else { - warn(gettext("kstat_read() failed")); - new_sp = old_sp; - } - - /* - * Cap enforcement is determined by the previous two - * conditions. - */ - old_enforce_caps = enforce_caps; - enforce_caps = - (rcfg.rcfg_memory_cap_enforcement_pressure == - 0 || enforce_soft_caps == 1) && - !global_scanner_running; - if (old_enforce_caps != enforce_caps) - debug("%senforcing caps\n", enforce_caps == 0 ? - "not " : ""); - - /* - * Sample collections' member processes' RSSes and - * recompute collections' excess. 
- */ - list_walk_all(mem_sample_cb); - list_walk_collection(collection_zero_mem_cb, NULL); - list_walk_all(memory_all_cb); - list_walk_collection(rss_sample_col_cb, NULL); - if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) - debug("memory pressure %d%%\n", - memory_pressure); - list_walk_collection(excess_print_cb, NULL); - - /* - * If soft caps are in use, determine the size of the - * portion from each collection to scan for. - */ - if (enforce_soft_caps == 1) { - /* - * Compute the sum of the collections' - * excesses, which will be the denominator. - */ - arg.ssa_sum_excess = 0; - list_walk_collection(sum_excess_cb, - &arg.ssa_sum_excess); - - /* - * Compute the quantity of memory (in - * kilobytes) above the cap enforcement - * pressure. Set the scan goal to that - * quantity (or at most the excess). - */ - arg.ssa_scan_goal = MIN(( - sysconf(_SC_PHYS_PAGES) * (100 - - rcfg.rcfg_memory_cap_enforcement_pressure) - / 100 - sysconf(_SC_AVPHYS_PAGES)) * - (sysconf(_SC_PAGESIZE) / 1024), - arg.ssa_sum_excess); - } - - /* - * Victimize offending collections. 
- */ - if (enforce_caps == 1 && ((enforce_soft_caps == 1 && - arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0) || - (enforce_soft_caps == 0))) - if (enforce_soft_caps == 1) { - debug("scan goal is %lldKB\n", - (long long)arg.ssa_scan_goal); - list_walk_collection(soft_scan_cb, - &arg); - } else - list_walk_collection(scan_cb, NULL); - else - list_walk_collection(unenforced_cap_cb, NULL); + do_capping(now, &next_proc_walk); next_rss_sample = NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval); @@ -1409,7 +1666,6 @@ main(int argc, char *argv[]) */ now = gethrtime(); next = next_configuration; - next = POSITIVE_MIN(next, next_proc_walk); next = POSITIVE_MIN(next, next_report); next = POSITIVE_MIN(next, next_rss_sample); if (next > now && should_run != 0) { diff --git a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c index 15c503d1b4..b39811b552 100644 --- a/usr/src/cmd/rcap/rcapd/rcapd_scanner.c +++ b/usr/src/cmd/rcap/rcapd/rcapd_scanner.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -104,7 +103,8 @@ st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...) buf = malloc(len); if (buf == NULL) return; - (void) snprintf(buf, len, "%s %s scanner %s", rcfg.rcfg_mode_name, + (void) snprintf(buf, len, "%s %s scanner %s", + (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? 
"project" : "zone"), lcol->lcol_name, msg); va_start(alist, msg); @@ -471,6 +471,7 @@ merge_current_pagedata(lprocess_t *lpc, { prpageheader_t *pghp; int mappings_changed = 0; + uint64_t cnt; if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0) { @@ -485,9 +486,12 @@ merge_current_pagedata(lprocess_t *lpc, debug("starting/resuming pagedata collection for %d\n", (int)lpc->lpc_pid); } - debug("process %d: %llu/%llukB r/m'd since last read\n", - (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0, - PG_MODIFIED | PG_REFERENCED, 0), (unsigned long long)lpc->lpc_rss); + + cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); + if (cnt != 0 || lpc->lpc_rss != 0) + debug("process %d: %llu/%llukB rfd/mdfd since last read\n", + (int)lpc->lpc_pid, (unsigned long long)cnt, + (unsigned long long)lpc->lpc_rss); if (lpc->lpc_prpageheader != NULL) { /* * OR the two snapshots. @@ -519,10 +523,12 @@ merge_current_pagedata(lprocess_t *lpc, } else mappings_changed = 1; lpc->lpc_prpageheader = pghp; - debug("process %d: %llu/%llukB r/m'd since hand swept\n", - (int)lpc->lpc_pid, (unsigned long long)count_pages(pghp, 0, - PG_MODIFIED | PG_REFERENCED, 0), - (unsigned long long)lpc->lpc_rss); + + cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0); + if (cnt != 0 || lpc->lpc_rss != 0) + debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n", + (int)lpc->lpc_pid, (unsigned long long)cnt, + (unsigned long long)lpc->lpc_rss); if (mappings_changed != 0) { debug("process %d: mappings changed\n", (int)lpc->lpc_pid); if (mappings_changed_cb != NULL) @@ -589,7 +595,6 @@ rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic) static void unignore_mappings(lprocess_t *lpc) { - debug("clearing ignored set\n"); lmapping_free(&lpc->lpc_ignore); } diff --git a/usr/src/cmd/rcap/rcapstat/Makefile b/usr/src/cmd/rcap/rcapstat/Makefile index 47b9bcfb71..fb436f5684 100644 --- a/usr/src/cmd/rcap/rcapstat/Makefile +++ 
b/usr/src/cmd/rcap/rcapstat/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -39,7 +38,7 @@ LINTSRCS = $(COMMON_DIR)/utils.c \ $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG CPPFLAGS += -I$(COMMON_DIR) -LDLIBS += -lumem -ll +LDLIBS += -lumem -ll -lzonecfg LINTFLAGS += $(LDLIBS) -mnu diff --git a/usr/src/cmd/rcap/rcapstat/rcapstat.c b/usr/src/cmd/rcap/rcapstat/rcapstat.c index 722502d05d..47eca3f2fa 100644 --- a/usr/src/cmd/rcap/rcapstat/rcapstat.c +++ b/usr/src/cmd/rcap/rcapstat/rcapstat.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -77,7 +76,8 @@ col_find(rcid_t id) { col_t *col; for (col = col_head; col != NULL; col = col->col_next) - if (col->col_id == id) + if (col->col_id.rcid_type == id.rcid_type && + col->col_id.rcid_val == id.rcid_val) return (col); return (NULL); } @@ -119,7 +119,7 @@ static void usage() { (void) fprintf(stderr, - gettext("usage: rcapstat [-g] [interval [count]]\n")); + gettext("usage: rcapstat [-g] [-p | -z] [interval [count]]\n")); exit(E_USAGE); } @@ -139,12 +139,12 @@ format_size(char *str, uint64_t size, int length) } static int -read_stats() +read_stats(rcid_type_t stat_type) { int fd; int proc_fd; char procfile[20]; - pid_t pid; + uint64_t pid; col_t *col, *col_next; lcollection_report_t report; struct stat st; @@ -169,7 +169,7 @@ read_stats() * Check if rcapd is running */ pid = hdr.rs_pid; - (void) snprintf(procfile, 20, "/proc/%ld/psinfo", pid); + (void) snprintf(procfile, 20, "/proc/%lld/psinfo", pid); if ((proc_fd = open(procfile, O_RDONLY)) < 0) { warn(gettext("rcapd is not active\n")); (void) close(fd); @@ -185,6 +185,9 @@ read_stats() } while (read(fd, &report, sizeof (report)) == sizeof (report)) { + if (report.lcol_id.rcid_type != stat_type) + continue; + col = col_find(report.lcol_id); if (col == NULL) { col = col_insert(report.lcol_id); @@ -291,12 +294,13 @@ print_unformatted_stats(void) } static void -print_stats() +print_stats(rcid_type_t stat_type) { col_t *col; char size[6]; char limit[6]; char rss[6]; + char nproc[6]; char paged_att[6]; char paged_eff[6]; char paged_att_avg[6]; @@ -310,12 +314,21 @@ print_stats() */ if (count == 0 || ncol != 1) (void) printf("%6s %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n", - "id", mode, "nproc", "vm", "rss", "cap", + "id", (stat_type == RCIDT_PROJECT ? 
"project" : "zone"), + "nproc", "vm", "rss", "cap", "at", "avgat", "pg", "avgpg"); if (++count >= 20 || (count >= 10 && global != 0) || ncol != 1) count = 0; for (col = col_head; col != NULL; col = col->col_next) { + if (col->col_id.rcid_type != stat_type) + continue; + + if (col->col_paged_att == 0) + strlcpy(nproc, "-", sizeof (nproc)); + else + (void) snprintf(nproc, sizeof (nproc), "%lld", + col->col_nproc); format_size(size, col->col_vmsize, 6); format_size(rss, col->col_rsssize, 6); format_size(limit, col->col_rsslimit, 6); @@ -323,8 +336,9 @@ print_stats() format_size(paged_eff, col->col_paged_eff, 6); format_size(paged_att_avg, col->col_paged_att_avg, 6); format_size(paged_eff_avg, col->col_paged_eff_avg, 6); - (void) printf("%6lld %-15s %5lld %5s %5s %5s %5s %5s %5s %5s\n", - (long long)col->col_id, col->col_name, col->col_nproc, + (void) printf("%6lld %-15s %5s %5s %5s %5s %5s %5s %5s %5s\n", + col->col_id.rcid_val, col->col_name, + nproc, size, rss, limit, paged_att, paged_att_avg, paged_eff, paged_eff_avg); @@ -342,20 +356,32 @@ main(int argc, char *argv[]) int count; int always = 1; int opt; + int projects = 0; + int zones = 0; + /* project reporting is the default if no option is specified */ + rcid_type_t stat_type = RCIDT_PROJECT; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); (void) setprogname("rcapstat"); global = unformatted = 0; - while ((opt = getopt(argc, argv, "gu")) != (int)EOF) { + while ((opt = getopt(argc, argv, "gpuz")) != (int)EOF) { switch (opt) { case 'g': global = 1; break; + case 'p': + projects = 1; + stat_type = RCIDT_PROJECT; + break; case 'u': unformatted = 1; break; + case 'z': + stat_type = RCIDT_ZONE; + zones = 1; + break; default: usage(); } @@ -369,22 +395,22 @@ main(int argc, char *argv[]) die(gettext("invalid count specified\n")); always = 0; } - if (argc > optind) + if (argc > optind || (projects > 0 && zones > 0)) usage(); while (always || count-- > 0) { - if (read_stats() != E_SUCCESS) + if 
(read_stats(stat_type) != E_SUCCESS) return (E_ERROR); if (!unformatted) { - print_stats(); - fflush(stdout); + print_stats(stat_type); + (void) fflush(stdout); if (count || always) (void) sleep(interval); } else { struct stat st; print_unformatted_stats(); - fflush(stdout); + (void) fflush(stdout); while (stat(STAT_FILE_DEFAULT, &st) == 0 && st.st_mtime == stat_mod) usleep((useconds_t)(0.2 * MICROSEC)); diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index 92739f2b1e..4dc70b0d37 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -2325,6 +2325,7 @@ prt_zga(private_t *pri, int raw, long val) case ZONE_ATTR_INITNAME: s = "ZONE_ATTR_INITNAME"; break; case ZONE_ATTR_BOOTARGS: s = "ZONE_ATTR_BOOTARGS"; break; case ZONE_ATTR_BRAND: s = "ZONE_ATTR_BRAND"; break; + case ZONE_ATTR_PHYS_MCAP: s = "ZONE_ATTR_PHYS_MCAP"; break; } } diff --git a/usr/src/cmd/truss/systable.c b/usr/src/cmd/truss/systable.c index 695d0e28c2..f46e028bf5 100644 --- a/usr/src/cmd/truss/systable.c +++ b/usr/src/cmd/truss/systable.c @@ -404,7 +404,7 @@ const struct systable systable[] = { {"kaio", 7, DEC, NOV, AIO, HEX, HEX, HEX, HEX, HEX, HEX}, /* 178 */ {"cpc", 5, DEC, NOV, CPC, DEC, HEX, HEX, HEX}, /* 179 */ {"lgrpsys", 3, DEC, NOV, DEC, DEC, HEX}, /* 180 */ -{"rusagesys", 2, DEC, NOV, DEC, HEX}, /* 181 */ +{"rusagesys", 5, DEC, NOV, DEC, HEX, DEC, HEX, HEX}, /* 181 */ {"portfs", 6, HEX, HEX, DEC, HEX, HEX, HEX, HEX, HEX}, /* 182 */ {"pollsys", 4, DEC, NOV, HEX, DEC, HEX, HEX}, /* 183 */ {"labelsys", 2, DEC, NOV, DEC, HEX}, /* 184 */ @@ -761,6 +761,7 @@ static const struct systable rusagesystable[] = { {"getrusage", 2, DEC, NOV, HID, HEX}, /* 0 */ {"getrusage_chld", 2, DEC, NOV, HID, HEX}, /* 1 */ {"getrusage_lwp", 2, DEC, NOV, HID, HEX}, /* 2 */ +{"getvmusage", 5, DEC, NOV, HID, HEX, DEC, HEX, HEX}, /* 3 */ }; #define NRUSAGESYSCODE \ (sizeof (rusagesystable) / sizeof (struct systable)) @@ -942,6 +943,7 @@ const struct sysalias sysalias[] = { { "getrusage", 
SYS_rusagesys }, { "getrusage_chld", SYS_rusagesys }, { "getrusage_lwp", SYS_rusagesys }, + { "getvmusage", SYS_rusagesys }, { "getpeerucred", SYS_ucredsys }, { "ucred_get", SYS_ucredsys }, { "port_create", SYS_port }, @@ -956,6 +958,7 @@ const struct sysalias sysalias[] = { { "zone_create", SYS_zone }, { "zone_destroy", SYS_zone }, { "zone_getattr", SYS_zone }, + { "zone_setattr", SYS_zone }, { "zone_enter", SYS_zone }, { "getzoneid", SYS_zone }, { "zone_list", SYS_zone }, diff --git a/usr/src/cmd/zoneadm/Makefile b/usr/src/cmd/zoneadm/Makefile index 4d0f91a6f3..e11609c6dd 100644 --- a/usr/src/cmd/zoneadm/Makefile +++ b/usr/src/cmd/zoneadm/Makefile @@ -27,8 +27,8 @@ # PROG= zoneadm -MANIFEST= zones.xml -SVCMETHOD= svc-zones +MANIFEST= zones.xml resource-mgmt.xml +SVCMETHOD= svc-zones svc-resource-mgmt include ../Makefile.cmd diff --git a/usr/src/cmd/zoneadm/resource-mgmt.xml b/usr/src/cmd/zoneadm/resource-mgmt.xml new file mode 100644 index 0000000000..264f26733f --- /dev/null +++ b/usr/src/cmd/zoneadm/resource-mgmt.xml @@ -0,0 +1,116 @@ +<?xml version="1.0"?> +<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1"> +<!-- + Copyright 2006 Sun Microsystems, Inc. All rights reserved. + Use is subject to license terms. + + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. + See the License for the specific language governing permissions + and limitations under the License. + + When distributing Covered Code, include this CDDL HEADER in each + file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ If applicable, add the following below this CDDL HEADER, with the + fields enclosed by brackets "[]" replaced with your own identifying + information: Portions Copyright [yyyy] [name of copyright owner] + + CDDL HEADER END + + ident "%Z%%M% %I% %E% SMI" + + NOTE: This service manifest is not editable; its contents will + be overwritten by package or patch operations, including + operating system upgrade. Make customizations in a different + file. +--> + +<service_bundle type='manifest' name='SUNWzoner:zones'> + +<!-- + This service applies global zone resource management settings + at system startup. +--> +<service + name='system/resource-mgmt' + type='service' + version='1'> + + <create_default_instance enabled='true' /> + + <single_instance /> + + <dependency + name='usr' + type='service' + grouping='require_all' + restart_on='none'> + <service_fmri value='svc:/system/filesystem/minimal' /> + </dependency> + + <dependency + name='scheduler' + type='service' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/scheduler' /> + </dependency> + + <dependency + name='pools' + type='service' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/pools' /> + </dependency> + + <dependent + name='rcap' + grouping='optional_all' + restart_on='none'> + <service_fmri value='svc:/system/rcap' /> + </dependent> + + <exec_method + type='method' + name='start' + exec='/lib/svc/method/svc-resource-mgmt %m' + timeout_seconds='60'> + </exec_method> + + <exec_method + type='method' + name='stop' + exec=':true' + timeout_seconds='3'> + </exec_method> + + <property_group name='startd' type='framework'> + <propval name='duration' type='astring' value='transient' /> + </property_group> + + <stability value='Unstable' /> + + <template> + <common_name> + <loctext xml:lang='C'> + Global zone resource management settings + </loctext> + </common_name> + <documentation> + <manpage title='zones' section='5' manpath='/usr/share/man' /> + 
<manpage + title='zonecfg' + section='1M' + manpath='/usr/share/man' /> + </documentation> + </template> +</service> + +</service_bundle> diff --git a/usr/src/cmd/zoneadm/svc-resource-mgmt b/usr/src/cmd/zoneadm/svc-resource-mgmt new file mode 100644 index 0000000000..762de4c0d8 --- /dev/null +++ b/usr/src/cmd/zoneadm/svc-resource-mgmt @@ -0,0 +1,54 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. + +# ident "%Z%%M% %I% %E% SMI" +# + +. /lib/svc/share/smf_include.sh + +[ ! -f /etc/zones/global.xml ] && exit $SMF_EXIT_OK # No global zone + # resource mgmt. + # configuration + +[ ! -x /usr/sbin/zoneadm ] && exit $SMF_EXIT_OK # SUNWzoneu not installed + +# Make sure working directory is / to prevent unmounting problems. +cd / +PATH=/usr/sbin:/usr/bin; export PATH + +case "$1" in +'start') + zoneadm -z global apply + if [ $? 
-ne 0 ]; then + exit $SMF_EXIT_ERR_FATAL + fi + ;; + +*) + echo "Usage: $0 start" + exit $SMF_EXIT_ERR_FATAL + ;; +esac +exit $SMF_EXIT_OK diff --git a/usr/src/cmd/zoneadm/zoneadm.c b/usr/src/cmd/zoneadm/zoneadm.c index bff26cd356..b7ae32b30d 100644 --- a/usr/src/cmd/zoneadm/zoneadm.c +++ b/usr/src/cmd/zoneadm/zoneadm.c @@ -74,9 +74,12 @@ #include <fnmatch.h> #include <sys/modctl.h> #include <libbrand.h> +#include <libscf.h> #include <pool.h> #include <sys/pool.h> +#include <sys/priocntl.h> +#include <sys/fsspriocntl.h> #include "zoneadm.h" @@ -154,6 +157,7 @@ static int move_func(int argc, char *argv[]); static int detach_func(int argc, char *argv[]); static int attach_func(int argc, char *argv[]); static int mark_func(int argc, char *argv[]); +static int apply_func(int argc, char *argv[]); static int sanity_check(char *zone, int cmd_num, boolean_t running, boolean_t unsafe_when_running, boolean_t force); static int cmd_match(char *cmd); @@ -177,7 +181,8 @@ static struct cmd cmdtab[] = { { CMD_MOVE, "move", SHELP_MOVE, move_func }, { CMD_DETACH, "detach", SHELP_DETACH, detach_func }, { CMD_ATTACH, "attach", SHELP_ATTACH, attach_func }, - { CMD_MARK, "mark", SHELP_MARK, mark_func } + { CMD_MARK, "mark", SHELP_MARK, mark_func }, + { CMD_APPLY, "apply", NULL, apply_func } }; /* global variables */ @@ -1501,6 +1506,7 @@ boot_func(int argc, char *argv[]) zerror(gettext("call to %s failed"), "zoneadmd"); return (Z_ERR); } + return (Z_OK); } @@ -4355,15 +4361,22 @@ dev_fix(zone_dochandle_t handle) zarg.cmd = Z_READY; if (call_zoneadmd(target_zone, &zarg) != 0) { zerror(gettext("call to %s failed"), "zoneadmd"); + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); return (Z_ERR); } zarg.cmd = Z_HALT; if (call_zoneadmd(target_zone, &zarg) != 0) { zerror(gettext("call to %s failed"), "zoneadmd"); + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); return 
(Z_ERR); } + /* attempt to restore zone to configured state */ + (void) zone_set_state(target_zone, ZONE_STATE_CONFIGURED); + if (zonecfg_setdevperment(handle) != Z_OK) { (void) fprintf(stderr, gettext("unable to enumerate device entries\n")); @@ -4845,6 +4858,177 @@ mark_func(int argc, char *argv[]) return (err); } +/* + * Check what scheduling class we're running under and print a warning if + * we're not using FSS. + */ +static int +check_sched_fss(zone_dochandle_t handle) +{ + char class_name[PC_CLNMSZ]; + + if (zonecfg_get_dflt_sched_class(handle, class_name, + sizeof (class_name)) != Z_OK) { + zerror(gettext("WARNING: unable to determine the zone's " + "scheduling class")); + } else if (strcmp("FSS", class_name) != 0) { + zerror(gettext("WARNING: The zone.cpu-shares rctl is set but\n" + "FSS is not the default scheduling class for this zone. " + "FSS will be\nused for processes in the zone but to get " + "the full benefit of FSS,\nit should be the default " + "scheduling class. See dispadmin(1M) for\nmore details.")); + return (Z_SYSTEM); + } + + return (Z_OK); +} + +static int +check_cpu_shares_sched(zone_dochandle_t handle) +{ + int err; + int res = Z_OK; + struct zone_rctltab rctl; + + if ((err = zonecfg_setrctlent(handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + return (err); + } + + while (zonecfg_getrctlent(handle, &rctl) == Z_OK) { + if (strcmp(rctl.zone_rctl_name, "zone.cpu-shares") == 0) { + if (check_sched_fss(handle) != Z_OK) + res = Z_SYSTEM; + break; + } + } + + (void) zonecfg_endrctlent(handle); + + return (res); +} + +/* + * This is an undocumented interface which is currently only used to apply + * the global zone resource management settings when the system boots. + * This function does not yet properly handle updating a running system so + * any projects running in the zone would be trashed if this function + * were to run after the zone had booted. 
It also does not reset any + * rctl settings that were removed from zonecfg. There is still work to be + * done before we can properly support dynamically updating the resource + * management settings for a running zone (global or non-global). Thus, this + * functionality is undocumented for now. + */ +/* ARGSUSED */ +static int +apply_func(int argc, char *argv[]) +{ + int err; + int res = Z_OK; + priv_set_t *privset; + zoneid_t zoneid; + zone_dochandle_t handle; + struct zone_mcaptab mcap; + char pool_err[128]; + + zoneid = getzoneid(); + + if (zonecfg_in_alt_root() || zoneid != GLOBAL_ZONEID || + target_zone == NULL || strcmp(target_zone, GLOBAL_ZONENAME) != 0) + return (usage(B_FALSE)); + + if ((privset = priv_allocset()) == NULL) { + zerror(gettext("%s failed"), "priv_allocset"); + return (Z_ERR); + } + + if (getppriv(PRIV_EFFECTIVE, privset) != 0) { + zerror(gettext("%s failed"), "getppriv"); + priv_freeset(privset); + return (Z_ERR); + } + + if (priv_isfullset(privset) == B_FALSE) { + (void) usage(B_FALSE); + priv_freeset(privset); + return (Z_ERR); + } + priv_freeset(privset); + + if ((handle = zonecfg_init_handle()) == NULL) { + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + return (Z_ERR); + } + + if ((err = zonecfg_get_handle(target_zone, handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + zonecfg_fini_handle(handle); + return (Z_ERR); + } + + /* specific error msgs are printed within apply_rctls */ + if ((err = zonecfg_apply_rctls(target_zone, handle)) != Z_OK) { + errno = err; + zperror(cmd_to_str(CMD_APPLY), B_TRUE); + res = Z_ERR; + } + + if ((err = check_cpu_shares_sched(handle)) != Z_OK) + res = Z_ERR; + + /* + * The next two blocks of code attempt to set up temporary pools as + * well as persistent pools. In both cases we call the functions + * unconditionally. Within each funtion the code will check if the + * zone is actually configured for a temporary pool or persistent pool + * and just return if there is nothing to do. 
+ */ + if ((err = zonecfg_bind_tmp_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (err == Z_POOL || err == Z_POOL_CREATE || err == Z_POOL_BIND) + zerror("%s: %s", zonecfg_strerror(err), pool_err); + else + zerror(gettext("could not bind zone to temporary " + "pool: %s"), zonecfg_strerror(err)); + res = Z_ERR; + } + + if ((err = zonecfg_bind_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (err == Z_POOL || err == Z_POOL_BIND) + zerror("%s: %s", zonecfg_strerror(err), pool_err); + else + zerror("%s", zonecfg_strerror(err)); + } + + /* + * If a memory cap is configured, set the cap in the kernel using + * zone_setattr() and make sure the rcapd SMF service is enabled. + */ + if (zonecfg_getmcapent(handle, &mcap) == Z_OK) { + uint64_t num; + char smf_err[128]; + + num = (uint64_t)strtoll(mcap.zone_physmem_cap, NULL, 10); + if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + zerror(gettext("could not set zone memory cap")); + res = Z_ERR; + } + + if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) { + zerror(gettext("enabling system/rcap service failed: " + "%s"), smf_err); + res = Z_ERR; + } + } + + zonecfg_fini_handle(handle); + + return (res); +} + static int help_func(int argc, char *argv[]) { diff --git a/usr/src/cmd/zoneadm/zoneadm.h b/usr/src/cmd/zoneadm/zoneadm.h index a94053e258..a299ece135 100644 --- a/usr/src/cmd/zoneadm/zoneadm.h +++ b/usr/src/cmd/zoneadm/zoneadm.h @@ -45,9 +45,10 @@ #define CMD_DETACH 13 #define CMD_ATTACH 14 #define CMD_MARK 15 +#define CMD_APPLY 16 #define CMD_MIN CMD_HELP -#define CMD_MAX CMD_MARK +#define CMD_MAX CMD_APPLY #if !defined(TEXT_DOMAIN) /* should be defined by cc -D */ #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */ diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile index 8b77f8234c..34914694a8 100644 --- a/usr/src/cmd/zoneadmd/Makefile +++ b/usr/src/cmd/zoneadmd/Makefile @@ -42,7 +42,7 @@ POFILES= $(OBJS:%.o=%.po) CFLAGS 
+= $(CCVERBOSE) LAZYLIBS = $(ZLAZYLOAD) -ltsnet -ltsol $(ZNOLAZYLOAD) lint := LAZYLIBS = -ltsnet -ltsol -LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair -lpool \ +LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ -lgen -lbsm -lcontract -lzfs -luuid -lbrand $(LAZYLIBS) XGETFLAGS += -a -x zoneadmd.xcl diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index ca93b1c696..513921e5e2 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -106,6 +106,7 @@ #include <pool.h> #include <sys/pool.h> +#include <sys/priocntl.h> #include <libbrand.h> #include <sys/brand.h> @@ -2661,27 +2662,6 @@ out: } static int -get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz) -{ - zone_dochandle_t handle; - int error; - - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (Z_NOMEM); - } - error = zonecfg_get_snapshot_handle(zone_name, handle); - if (error != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (error); - } - error = zonecfg_get_pool(handle, poolbuf, bufsz); - zonecfg_fini_handle(handle); - return (error); -} - -static int get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) { zone_dochandle_t handle; @@ -2818,75 +2798,6 @@ validate_datasets(zlog_t *zlogp) return (0); } -static int -bind_to_pool(zlog_t *zlogp, zoneid_t zoneid) -{ - pool_conf_t *poolconf; - pool_t *pool; - char poolname[MAXPATHLEN]; - int status; - int error; - - /* - * Find the pool mentioned in the zone configuration, and bind to it. - */ - error = get_zone_pool(zlogp, poolname, sizeof (poolname)); - if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) { - /* - * The property is not set on the zone, so the pool - * should be bound to the default pool. But that's - * already done by the kernel, so we can just return. 
- */ - return (0); - } - if (error != Z_OK) { - /* - * Not an error, even though it shouldn't be happening. - */ - zerror(zlogp, B_FALSE, - "WARNING: unable to retrieve default pool."); - return (0); - } - /* - * Don't do anything if pools aren't enabled. - */ - if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) { - zerror(zlogp, B_FALSE, "WARNING: pools facility not active; " - "zone will not be bound to pool '%s'.", poolname); - return (0); - } - /* - * Try to provide a sane error message if the requested pool doesn't - * exist. - */ - if ((poolconf = pool_conf_alloc()) == NULL) { - zerror(zlogp, B_FALSE, "%s failed", "pool_conf_alloc"); - return (-1); - } - if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != - PO_SUCCESS) { - zerror(zlogp, B_FALSE, "%s failed", "pool_conf_open"); - pool_conf_free(poolconf); - return (-1); - } - pool = pool_get_pool(poolconf, poolname); - (void) pool_conf_close(poolconf); - pool_conf_free(poolconf); - if (pool == NULL) { - zerror(zlogp, B_FALSE, "WARNING: pool '%s' not found; " - "using default pool.", poolname); - return (0); - } - /* - * Bind the zone to the pool. - */ - if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) { - zerror(zlogp, B_FALSE, "WARNING: unable to bind to pool '%s'; " - "using default pool.", poolname); - } - return (0); -} - /* * Mount lower level home directories into/from current zone * Share exported directories specified in dfstab for zone @@ -3482,6 +3393,149 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath) return (B_FALSE); } +/* + * Set memory cap and pool info for the zone's resource management + * configuration. 
+ */ +static int +setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) +{ + int res; + uint64_t tmp; + struct zone_mcaptab mcap; + char sched[MAXNAMELEN]; + zone_dochandle_t handle = NULL; + char pool_err[128]; + + if ((handle = zonecfg_init_handle()) == NULL) { + zerror(zlogp, B_TRUE, "getting zone configuration handle"); + return (Z_BAD_HANDLE); + } + + if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) { + zerror(zlogp, B_FALSE, "invalid configuration"); + zonecfg_fini_handle(handle); + return (res); + } + + /* + * If a memory cap is configured, set the cap in the kernel using + * zone_setattr() and make sure the rcapd SMF service is enabled. + */ + if (zonecfg_getmcapent(handle, &mcap) == Z_OK) { + uint64_t num; + char smf_err[128]; + + num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10); + if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { + zerror(zlogp, B_TRUE, "could not set zone memory cap"); + zonecfg_fini_handle(handle); + return (Z_INVAL); + } + + if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) { + zerror(zlogp, B_FALSE, "enabling system/rcap service " + "failed: %s", smf_err); + zonecfg_fini_handle(handle); + return (Z_INVAL); + } + } + + /* Get the scheduling class set in the zone configuration. */ + if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK && + strlen(sched) > 0) { + if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched, + strlen(sched)) == -1) + zerror(zlogp, B_TRUE, "WARNING: unable to set the " + "default scheduling class"); + + } else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp) + == Z_OK) { + /* + * If the zone has the zone.cpu-shares rctl set then we want to + * use the Fair Share Scheduler (FSS) for processes in the + * zone. Check what scheduling class the zone would be running + * in by default so we can print a warning and modify the class + * if we wouldn't be using FSS. 
+ */ + char class_name[PC_CLNMSZ]; + + if (zonecfg_get_dflt_sched_class(handle, class_name, + sizeof (class_name)) != Z_OK) { + zerror(zlogp, B_FALSE, "WARNING: unable to determine " + "the zone's scheduling class"); + + } else if (strcmp("FSS", class_name) != 0) { + zerror(zlogp, B_FALSE, "WARNING: The zone.cpu-shares " + "rctl is set but\nFSS is not the default " + "scheduling class for\nthis zone. FSS will be " + "used for processes\nin the zone but to get the " + "full benefit of FSS,\nit should be the default " + "scheduling class.\nSee dispadmin(1M) for more " + "details."); + + if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, "FSS", + strlen("FSS")) == -1) + zerror(zlogp, B_TRUE, "WARNING: unable to set " + "zone scheduling class to FSS"); + } + } + + /* + * The next few blocks of code attempt to set up temporary pools as + * well as persistent pools. In all cases we call the functions + * unconditionally. Within each funtion the code will check if the + * zone is actually configured for a temporary pool or persistent pool + * and just return if there is nothing to do. + * + * If we are rebooting we want to attempt to reuse any temporary pool + * that was previously set up. zonecfg_bind_tmp_pool() will do the + * right thing in all cases (reuse or create) based on the current + * zonecfg. + */ + if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND) + zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting " + "cannot be instantiated", zonecfg_strerror(res), + pool_err); + else + zerror(zlogp, B_FALSE, "could not bind zone to " + "temporary pool: %s", zonecfg_strerror(res)); + zonecfg_fini_handle(handle); + return (Z_POOL_BIND); + } + + /* + * Check if we need to warn about poold not being enabled. 
+ */ + if (zonecfg_warn_poold(handle)) { + zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has " + "been specified\nbut the dynamic pool service is not " + "enabled.\nThe system will not dynamically adjust the\n" + "processor allocation within the specified range\n" + "until svc:/system/pools/dynamic is enabled.\n" + "See poold(1M)."); + } + + /* The following is a warning, not an error. */ + if ((res = zonecfg_bind_pool(handle, zoneid, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL_BIND) + zerror(zlogp, B_FALSE, "WARNING: unable to bind to " + "pool '%s'; using default pool.", pool_err); + else if (res == Z_POOL) + zerror(zlogp, B_FALSE, "WARNING: %s: %s", + zonecfg_strerror(res), pool_err); + else + zerror(zlogp, B_FALSE, "WARNING: %s", + zonecfg_strerror(res)); + } + + zonecfg_fini_handle(handle); + return (Z_OK); +} + zoneid_t vplat_create(zlog_t *zlogp, boolean_t mount_cmd) { @@ -3668,14 +3722,18 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd) } /* - * The following is a warning, not an error, and is not performed when - * merely mounting a zone for administrative use. + * The following actions are not performed when merely mounting a zone + * for administrative use. 
*/ - if (!mount_cmd && bind_to_pool(zlogp, zoneid) != 0) - zerror(zlogp, B_FALSE, "WARNING: unable to bind zone to " - "requested pool; using default pool."); - if (!mount_cmd) + if (!mount_cmd) { + if (setup_zone_rm(zlogp, zone_name, zoneid) != Z_OK) { + (void) zone_shutdown(zoneid); + goto error; + } + set_mlps(zlogp, zoneid, zcent); + } + rval = zoneid; zoneid = -1; @@ -3878,10 +3936,12 @@ unmounted: } int -vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd) +vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) { char *kzone; zoneid_t zoneid; + int res; + char pool_err[128]; char zroot[MAXPATHLEN]; char cmdbuf[MAXPATHLEN]; char brand[MAXNAMELEN]; @@ -3972,6 +4032,19 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd) goto error; } + /* + * If we are rebooting then we don't want to destroy an existing + * temporary pool at this point so that we can just reuse it when the + * zone boots back up. + */ + if (!unmount_cmd && !rebooting) { + if ((res = zonecfg_destroy_tmp_pool(zone_name, pool_err, + sizeof (pool_err))) != Z_OK) { + if (res == Z_POOL) + zerror(zlogp, B_FALSE, pool_err); + } + } + remove_mlps(zlogp, zoneid); if (zone_destroy(zoneid) != 0) { diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c index 313d24d95b..35206384b9 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.c +++ b/usr/src/cmd/zoneadmd/zoneadmd.c @@ -463,7 +463,7 @@ zone_ready(zlog_t *zlogp, boolean_t mount_cmd) } if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) { bringup_failure_recovery = B_TRUE; - (void) vplat_teardown(NULL, mount_cmd); + (void) vplat_teardown(NULL, mount_cmd, B_FALSE); if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) zerror(zlogp, B_FALSE, "destroying snapshot: %s", zonecfg_strerror(err)); @@ -738,11 +738,11 @@ zone_bootup(zlog_t *zlogp, const char *bootargs) } static int -zone_halt(zlog_t *zlogp, boolean_t unmount_cmd) +zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) { int err; - if 
(vplat_teardown(zlogp, unmount_cmd) != 0) { + if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) { if (!bringup_failure_recovery) zerror(zlogp, B_FALSE, "unable to destroy zone"); return (-1); @@ -985,7 +985,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, audit_put_record(zlogp, uc, rval, "boot"); if (rval != 0) { bringup_failure_recovery = B_TRUE; - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_FALSE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } break; @@ -1094,7 +1094,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, audit_put_record(zlogp, uc, rval, "boot"); if (rval != 0) { bringup_failure_recovery = B_TRUE; - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_TRUE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; @@ -1102,7 +1102,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_HALT: if (kernelcall) /* Invalid; can't happen */ abort(); - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE)) != 0) break; eventstream_write(Z_EVT_ZONE_HALTED); break; @@ -1125,7 +1125,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_UNMOUNT: if (kernelcall) /* Invalid; can't happen */ abort(); - rval = zone_halt(zlogp, B_TRUE); + rval = zone_halt(zlogp, B_TRUE, B_FALSE); if (rval == 0) { eventstream_write(Z_EVT_ZONE_HALTED); (void) sema_post(&scratch_sem); @@ -1147,7 +1147,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case ZONE_STATE_DOWN: switch (cmd) { case Z_READY: - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0) break; if ((rval = zone_ready(zlogp, B_FALSE)) == 0) eventstream_write(Z_EVT_ZONE_READIED); @@ -1165,7 +1165,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = 0; break; case Z_HALT: - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) + if ((rval = zone_halt(zlogp, B_FALSE, 
B_FALSE)) != 0) break; eventstream_write(Z_EVT_ZONE_HALTED); break; @@ -1173,7 +1173,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, (void) strlcpy(boot_args, zargp->bootbuf, sizeof (boot_args)); eventstream_write(Z_EVT_ZONE_REBOOTING); - if ((rval = zone_halt(zlogp, B_FALSE)) != 0) { + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE)) != 0) { eventstream_write(Z_EVT_ZONE_BOOTFAILED); boot_args[0] = '\0'; break; @@ -1186,7 +1186,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = zone_bootup(zlogp, zargp->bootbuf); audit_put_record(zlogp, uc, rval, "reboot"); if (rval != 0) { - (void) zone_halt(zlogp, B_FALSE); + (void) zone_halt(zlogp, B_FALSE, B_TRUE); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h index cfb90f93f3..a4aba27b5c 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.h +++ b/usr/src/cmd/zoneadmd/zoneadmd.h @@ -106,7 +106,7 @@ extern void eventstream_write(zone_evt_t evt); */ extern zoneid_t vplat_create(zlog_t *, boolean_t); extern int vplat_bringup(zlog_t *, boolean_t, zoneid_t); -extern int vplat_teardown(zlog_t *, boolean_t); +extern int vplat_teardown(zlog_t *, boolean_t, boolean_t); /* * Console subsystem routines. diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c index ea745cbb61..34d6b99480 100644 --- a/usr/src/cmd/zonecfg/zonecfg.c +++ b/usr/src/cmd/zonecfg/zonecfg.c @@ -101,6 +101,8 @@ extern int lex_lineno; #define MAX_CMD_HIST 1024 #define MAX_CMD_LEN 1024 +#define ONE_MB 1048576 + /* * Each SHELP_ should be a simple string. 
*/ @@ -108,6 +110,7 @@ extern int lex_lineno; #define SHELP_ADD "add <resource-type>\n\t(global scope)\n" \ "add <property-name> <property-value>\n\t(resource scope)" #define SHELP_CANCEL "cancel" +#define SHELP_CLEAR "clear <property-name>" #define SHELP_COMMIT "commit" #define SHELP_CREATE "create [-F] [ -a <path> | -b | -t <template> ]" #define SHELP_DELETE "delete [-F]" @@ -116,9 +119,11 @@ extern int lex_lineno; #define SHELP_EXPORT "export [-f output-file]" #define SHELP_HELP "help [commands] [syntax] [usage] [<command-name>]" #define SHELP_INFO "info [<resource-type> [property-name=property-value]*]" -#define SHELP_REMOVE "remove <resource-type> { <property-name>=<property-" \ - "value> }\n\t(global scope)\nremove <property-name> <property-value>" \ - "\n\t(resource scope)" +#define SHELP_REMOVE "remove [-F] <resource-type> " \ + "[ <property-name>=<property-value> ]*\n" \ + "\t(global scope)\n" \ + "remove <property-name> <property-value>\n" \ + "\t(resource scope)" #define SHELP_REVERT "revert [-F]" #define SHELP_SELECT "select <resource-type> { <property-name>=" \ "<property-value> }" @@ -128,6 +133,7 @@ extern int lex_lineno; static struct help helptab[] = { { CMD_ADD, "add", HELP_RES_PROPS, SHELP_ADD, }, { CMD_CANCEL, "cancel", 0, SHELP_CANCEL, }, + { CMD_CLEAR, "clear", HELP_PROPS, SHELP_CLEAR, }, { CMD_COMMIT, "commit", 0, SHELP_COMMIT, }, { CMD_CREATE, "create", 0, SHELP_CREATE, }, { CMD_DELETE, "delete", 0, SHELP_DELETE, }, @@ -163,6 +169,15 @@ static char *res_types[] = { "limitpriv", "bootargs", "brand", + "dedicated-cpu", + "capped-memory", + ALIAS_MAXLWPS, + ALIAS_MAXSHMMEM, + ALIAS_MAXSHMIDS, + ALIAS_MAXMSGIDS, + ALIAS_MAXSEMIDS, + ALIAS_SHARES, + "scheduling-class", NULL }; @@ -189,6 +204,19 @@ static char *prop_types[] = { "limitpriv", "bootargs", "brand", + "ncpus", + "importance", + "swap", + "locked", + ALIAS_SHARES, + ALIAS_MAXLWPS, + ALIAS_MAXSHMMEM, + ALIAS_MAXSHMIDS, + ALIAS_MAXMSGIDS, + ALIAS_MAXSEMIDS, + ALIAS_MAXLOCKEDMEM, + 
ALIAS_MAXSWAP, + "scheduling-class", NULL }; @@ -205,11 +233,12 @@ static char *prop_val_types[] = { /* * remove has a space afterwards because it has qualifiers; the other commands - * that have qualifiers (add, select and set) don't need a space here because + * that have qualifiers (add, select, etc.) don't need a space here because * they have their own _cmds[] lists below. */ static const char *global_scope_cmds[] = { "add", + "clear", "commit", "create", "delete", @@ -233,6 +262,23 @@ static const char *add_cmds[] = { "add rctl", "add attr", "add dataset", + "add dedicated-cpu", + "add capped-memory", + NULL +}; + +static const char *clear_cmds[] = { + "clear autoboot", + "clear pool", + "clear limitpriv", + "clear bootargs", + "clear scheduling-class", + "clear " ALIAS_MAXLWPS, + "clear " ALIAS_MAXSHMMEM, + "clear " ALIAS_MAXSHMIDS, + "clear " ALIAS_MAXMSGIDS, + "clear " ALIAS_MAXSEMIDS, + "clear " ALIAS_SHARES, NULL }; @@ -244,6 +290,8 @@ static const char *remove_cmds[] = { "remove rctl ", "remove attr ", "remove dataset ", + "remove dedicated-cpu ", + "remove capped-memory ", NULL }; @@ -255,6 +303,8 @@ static const char *select_cmds[] = { "select rctl ", "select attr ", "select dataset ", + "select dedicated-cpu", + "select capped-memory", NULL }; @@ -266,6 +316,13 @@ static const char *set_cmds[] = { "set pool=", "set limitpriv=", "set bootargs=", + "set scheduling-class=", + "set " ALIAS_MAXLWPS "=", + "set " ALIAS_MAXSHMMEM "=", + "set " ALIAS_MAXSHMIDS "=", + "set " ALIAS_MAXMSGIDS "=", + "set " ALIAS_MAXSEMIDS "=", + "set " ALIAS_SHARES "=", NULL }; @@ -277,12 +334,22 @@ static const char *info_cmds[] = { "info rctl ", "info attr ", "info dataset ", + "info capped-memory", + "info dedicated-cpu", "info zonename", "info zonepath", "info autoboot", "info pool", "info limitpriv", "info bootargs", + "info brand", + "info scheduling-class", + "info max-lwps", + "info max-shm-memory", + "info max-shm-ids", + "info max-msg-ids", + "info max-sem-ids", + 
"info cpu-shares", NULL }; @@ -298,6 +365,7 @@ static const char *fs_res_scope_cmds[] = { "set raw=", "set special=", "set type=", + "clear raw", NULL }; @@ -366,6 +434,33 @@ static const char *dataset_res_scope_cmds[] = { NULL }; +static const char *pset_res_scope_cmds[] = { + "cancel", + "end", + "exit", + "help", + "info", + "set ncpus=", + "set importance=", + "clear importance", + NULL +}; + +static const char *mcap_res_scope_cmds[] = { + "cancel", + "end", + "exit", + "help", + "info", + "set physical=", + "set swap=", + "set locked=", + "clear physical", + "clear swap", + "clear locked", + NULL +}; + /* Global variables */ /* set early in main(), never modified thereafter, used all over the place */ @@ -406,6 +501,9 @@ static bool got_handle = FALSE; /* initialized in do_interactive(), checked in initialize() */ static bool interactive_mode; +/* set if configuring the global zone */ +static bool global_zone = FALSE; + /* set in main(), checked in multiple places */ static bool read_only_mode; @@ -427,9 +525,13 @@ static struct zone_devtab old_devtab, in_progress_devtab; static struct zone_rctltab old_rctltab, in_progress_rctltab; static struct zone_attrtab old_attrtab, in_progress_attrtab; static struct zone_dstab old_dstab, in_progress_dstab; +static struct zone_psettab old_psettab, in_progress_psettab; +static struct zone_mcaptab old_mcaptab, in_progress_mcaptab; static GetLine *gl; /* The gl_get_line() resource object */ +static void bytes_to_units(char *str, char *buf, int bufsize); + /* Functions begin here */ static bool @@ -469,6 +571,8 @@ CPL_MATCH_FN(cmd_cpl_fn) */ if (strncmp(line, "add ", MAX(MIN(word_end, 4), 1)) == 0) return (add_stuff(cpl, line, add_cmds, word_end)); + if (strncmp(line, "clear ", MAX(MIN(word_end, 6), 2)) == 0) + return (add_stuff(cpl, line, clear_cmds, word_end)); if (strncmp(line, "select ", MAX(MIN(word_end, 7), 3)) == 0) return (add_stuff(cpl, line, select_cmds, word_end)); if (strncmp(line, "set ", MAX(MIN(word_end, 4), 
3)) == 0) @@ -494,6 +598,10 @@ CPL_MATCH_FN(cmd_cpl_fn) return (add_stuff(cpl, line, attr_res_scope_cmds, word_end)); case RT_DATASET: return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end)); + case RT_DCPU: + return (add_stuff(cpl, line, pset_res_scope_cmds, word_end)); + case RT_MCAP: + return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end)); } return (0); } @@ -669,9 +777,8 @@ long_help(int cmd_num) "flag can be used to force the\n\taction.")); case CMD_REMOVE: return (gettext("Remove specified resource from " - "configuration. Note that the curly\n\tbraces " - "('{', '}') mean one or more of whatever " - "is between them.")); + "configuration. The -F flag can be used\n\tto " + "force the action.")); case CMD_SELECT: (void) snprintf(line, sizeof (line), gettext("Selects a resource to modify. " @@ -684,6 +791,8 @@ long_help(int cmd_num) return (line); case CMD_SET: return (gettext("Sets property values.")); + case CMD_CLEAR: + return (gettext("Clears property values.")); case CMD_INFO: return (gettext("Displays information about the " "current configuration. 
If resource\n\ttype is " @@ -870,6 +979,37 @@ usage(bool verbose, uint_t flags) (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), pt_to_str(PT_NAME), gettext("<name>")); break; + case RT_DCPU: + (void) fprintf(fp, gettext("The '%s' resource scope " + "configures the 'pools' facility to dedicate\na " + "subset of the system's processors to this zone " + "while it is running.\n"), + rt_to_str(resource_scope)); + (void) fprintf(fp, gettext("Valid commands:\n")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), + gettext("<unsigned integer | range>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_IMPORTANCE), + gettext("<unsigned integer>")); + break; + case RT_MCAP: + (void) fprintf(fp, gettext("The '%s' resource scope is " + "used to set an upper limit (a cap) on the\n" + "amount of physical memory, swap space and locked " + "memory that can be used by\nthis zone.\n"), + rt_to_str(resource_scope)); + (void) fprintf(fp, gettext("Valid commands:\n")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_PHYSICAL), + gettext("<qualified unsigned decimal>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_SWAP), + gettext("<qualified unsigned decimal>")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_LOCKED), + gettext("<qualified unsigned decimal>")); + break; } (void) fprintf(fp, gettext("And from any resource scope, you " "can:\n")); @@ -928,11 +1068,12 @@ usage(bool verbose, uint_t flags) } if (flags & HELP_RESOURCES) { (void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t" - "%s\n\n", + "%s | %s | %s\n\n", gettext("resource type"), rt_to_str(RT_FS), rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE), rt_to_str(RT_RCTL), rt_to_str(RT_ATTR), - rt_to_str(RT_DATASET)); + rt_to_str(RT_DATASET), rt_to_str(RT_DCPU), + rt_to_str(RT_MCAP)); } if (flags & HELP_PROPS) { (void) fprintf(fp, gettext("For resource type ... 
there are " @@ -951,6 +1092,20 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_POOL)); (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), pt_to_str(PT_LIMITPRIV)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_SCHED)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXLWPS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSHMMEM)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSHMIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXMSGIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_MAXSEMIDS)); + (void) fprintf(fp, "\t%s\t%s\n", gettext("(global)"), + pt_to_str(PT_SHARES)); (void) fprintf(fp, "\t%s\t\t%s, %s, %s, %s\n", rt_to_str(RT_FS), pt_to_str(PT_DIR), pt_to_str(PT_SPECIAL), pt_to_str(PT_RAW), pt_to_str(PT_TYPE), @@ -968,6 +1123,11 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_VALUE)); (void) fprintf(fp, "\t%s\t\t%s\n", rt_to_str(RT_DATASET), pt_to_str(PT_NAME)); + (void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU), + pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE)); + (void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP), + pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP), + pt_to_str(PT_LOCKED)); } if (need_to_close) (void) pclose(fp); @@ -1040,6 +1200,33 @@ initialize(bool handle_expected) " Unable to continue", zone, brandname); exit(Z_ERR); } + } else if (global_zone && err == Z_NO_ZONE && !got_handle && + !read_only_mode) { + /* + * We implicitly create the global zone config if it + * doesn't exist. 
+ */ + zone_dochandle_t tmphandle; + + if ((tmphandle = zonecfg_init_handle()) == NULL) { + zone_perror(execname, Z_NOMEM, TRUE); + exit(Z_ERR); + } + + err = zonecfg_get_template_handle("SUNWblank", zone, + tmphandle); + + if (err != Z_OK) { + zonecfg_fini_handle(tmphandle); + zone_perror("SUNWblank", err, TRUE); + return (err); + } + + need_to_commit = TRUE; + zonecfg_fini_handle(handle); + handle = tmphandle; + got_handle = TRUE; + } else { zone_perror(zone, err, handle_expected || got_handle); if (err == Z_NO_ZONE && !got_handle && @@ -1373,10 +1560,13 @@ export_func(cmd_t *cmd) struct zone_attrtab attrtab; struct zone_rctltab rctltab; struct zone_dstab dstab; + struct zone_psettab psettab; + struct zone_mcaptab mcaptab; struct zone_rctlvaltab *valptr; int err, arg; char zonepath[MAXPATHLEN], outfile[MAXPATHLEN], pool[MAXNAMELEN]; char bootargs[BOOTARGS_MAX]; + char sched[MAXNAMELEN]; char brand[MAXNAMELEN]; char *limitpriv; FILE *of; @@ -1456,6 +1646,10 @@ export_func(cmd_t *cmd) free(limitpriv); } + if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK && + strlen(sched) > 0) + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_SCHED), sched); if ((err = zonecfg_setipdent(handle)) != Z_OK) { zone_perror(zone, err, FALSE); @@ -1576,6 +1770,33 @@ export_func(cmd_t *cmd) } (void) zonecfg_enddsent(handle); + if (zonecfg_getpsetent(handle, &psettab) == Z_OK) { + (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD), + rt_to_str(RT_DCPU)); + if (strcmp(psettab.zone_ncpu_min, psettab.zone_ncpu_max) == 0) + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), psettab.zone_ncpu_max); + else + (void) fprintf(of, "%s %s=%s-%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), psettab.zone_ncpu_min, + psettab.zone_ncpu_max); + if (psettab.zone_importance[0] != '\0') + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_IMPORTANCE), psettab.zone_importance); + (void) fprintf(of, "%s\n", cmd_to_str(CMD_END)); 
+ } + + if (zonecfg_getmcapent(handle, &mcaptab) == Z_OK) { + char buf[128]; + + (void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD), + rt_to_str(RT_MCAP)); + bytes_to_units(mcaptab.zone_physmem_cap, buf, sizeof (buf)); + (void) fprintf(of, "%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_PHYSICAL), buf); + (void) fprintf(of, "%s\n", cmd_to_str(CMD_END)); + } + done: if (need_to_close) (void) fclose(of); @@ -1641,6 +1862,10 @@ static void add_resource(cmd_t *cmd) { int type; + struct zone_psettab tmp_psettab; + struct zone_mcaptab tmp_mcaptab; + uint64_t tmp_mcap; + char pool[MAXNAMELEN]; if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { long_usage(CMD_ADD, TRUE); @@ -1667,6 +1892,12 @@ add_resource(cmd_t *cmd) bzero(&in_progress_devtab, sizeof (in_progress_devtab)); return; case RT_RCTL: + if (global_zone) + zerr(gettext("WARNING: Setting a global zone resource " + "control too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); bzero(&in_progress_rctltab, sizeof (in_progress_rctltab)); return; case RT_ATTR: @@ -1675,6 +1906,48 @@ add_resource(cmd_t *cmd) case RT_DATASET: bzero(&in_progress_dstab, sizeof (in_progress_dstab)); return; + case RT_DCPU: + /* Make sure there isn't already a cpu-set entry. */ + if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { + zerr(gettext("The %s resource already exists."), + rt_to_str(RT_DCPU)); + goto bad; + } + + /* Make sure the pool property isn't set. */ + if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK && + strlen(pool) > 0) { + zerr(gettext("The %s property is already set. " + "A persistent pool is incompatible with\nthe %s " + "resource."), + pt_to_str(PT_POOL), rt_to_str(RT_DCPU)); + goto bad; + } + + bzero(&in_progress_psettab, sizeof (in_progress_psettab)); + return; + case RT_MCAP: + /* + * Make sure there isn't already a mem-cap entry or max-swap + * or max-locked rctl. 
+ */ + if (zonecfg_lookup_mcap(handle, &tmp_mcaptab) == Z_OK || + zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &tmp_mcap) + == Z_OK || + zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &tmp_mcap) == Z_OK) { + zerr(gettext("The %s resource or a related resource " + "control already exists."), rt_to_str(RT_MCAP)); + goto bad; + } + if (global_zone) + zerr(gettext("WARNING: Setting a global zone memory " + "cap too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); + bzero(&in_progress_mcaptab, sizeof (in_progress_mcaptab)); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_ADD, TRUE); @@ -1871,6 +2144,30 @@ add_property(cmd_t *cmd) } } +static boolean_t +gz_invalid_resource(int type) +{ + return (global_zone && (type == RT_FS || type == RT_IPD || + type == RT_NET || type == RT_DEVICE || type == RT_ATTR || + type == RT_DATASET)); +} + +static boolean_t +gz_invalid_rt_property(int type) +{ + return (global_zone && (type == RT_ZONENAME || type == RT_ZONEPATH || + type == RT_AUTOBOOT || type == RT_LIMITPRIV || + type == RT_BOOTARGS || type == RT_BRAND || type == RT_SCHED)); +} + +static boolean_t +gz_invalid_property(int type) +{ + return (global_zone && (type == PT_ZONENAME || type == PT_ZONEPATH || + type == PT_AUTOBOOT || type == PT_LIMITPRIV || + type == PT_BOOTARGS || type == PT_BRAND || type == PT_SCHED)); +} + void add_func(cmd_t *cmd) { @@ -1900,6 +2197,13 @@ add_func(cmd_t *cmd) if (initialize(TRUE) != Z_OK) return; if (global_scope) { + if (gz_invalid_resource(cmd->cmd_res_type)) { + zerr(gettext("Cannot add a %s resource to the " + "global zone."), rt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } + global_scope = FALSE; resource_scope = cmd->cmd_res_type; end_op = CMD_ADD; @@ -2273,26 +2577,85 @@ fill_in_dstab(cmd_t *cmd, struct zone_dstab *dstab, bool fill_in_only) } static void -remove_resource(cmd_t 
*cmd) +remove_aliased_rctl(int type, char *name) { - int err, type; - struct zone_fstab fstab; - struct zone_nwiftab nwiftab; - struct zone_devtab devtab; - struct zone_attrtab attrtab; - struct zone_rctltab rctltab; - struct zone_dstab dstab; + int err; + uint64_t tmp; - if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { - long_usage(CMD_REMOVE, TRUE); + if ((err = zonecfg_get_aliased_rctl(handle, name, &tmp)) != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type), + zonecfg_strerror(err)); + saw_error = TRUE; return; } + if ((err = zonecfg_rm_aliased_rctl(handle, name)) != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_CLEAR), pt_to_str(type), + zonecfg_strerror(err)); + saw_error = TRUE; + } else { + need_to_commit = TRUE; + } +} - if (initialize(TRUE) != Z_OK) - return; +static boolean_t +prompt_remove_resource(cmd_t *cmd, char *rsrc) +{ + int num; + int answer; + int arg; + boolean_t force = B_FALSE; + char prompt[128]; + + optind = 0; + while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "F")) != EOF) { + switch (arg) { + case 'F': + force = B_TRUE; + break; + default: + return (B_FALSE); + } + } + + num = zonecfg_num_resources(handle, rsrc); + + if (num == 0) { + z_cmd_rt_perror(CMD_REMOVE, cmd->cmd_res_type, Z_NO_ENTRY, + TRUE); + return (B_FALSE); + } + if (num > 1 && !force) { + if (!interactive_mode) { + zerr(gettext("There are multiple instances of this " + "resource. 
Either qualify the resource to\n" + "remove a single instance or use the -F option to " + "remove all instances.")); + saw_error = TRUE; + return (B_FALSE); + } + (void) snprintf(prompt, sizeof (prompt), gettext( + "Are you sure you want to remove ALL '%s' resources"), + rsrc); + answer = ask_yesno(FALSE, prompt); + if (answer == -1) { + zerr(gettext("Resource incomplete.")); + return (B_FALSE); + } + if (answer != 1) + return (B_FALSE); + } + return (B_TRUE); +} + +static void +remove_fs(cmd_t *cmd) +{ + int err; + + /* traditional, qualified fs removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_fstab fstab; - switch (type) { - case RT_FS: if ((err = fill_in_fstab(cmd, &fstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE); return; @@ -2303,13 +2666,36 @@ remove_resource(cmd_t *cmd) need_to_commit = TRUE; zonecfg_free_fs_option_list(fstab.zone_fs_options); return; - case RT_IPD: - if (state_atleast(ZONE_STATE_INSTALLED)) { - zerr(gettext("Zone %s already installed; %s %s not " - "allowed."), zone, cmd_to_str(CMD_REMOVE), - rt_to_str(RT_IPD)); - return; - } + } + + /* + * unqualified fs removal. remove all fs's but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "fs")) + return; + + if ((err = zonecfg_del_all_resources(handle, "fs")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_FS, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_ipd(cmd_t *cmd) +{ + int err; + + if (state_atleast(ZONE_STATE_INSTALLED)) { + zerr(gettext("Zone %s already installed; %s %s not allowed."), + zone, cmd_to_str(CMD_REMOVE), rt_to_str(RT_IPD)); + return; + } + + /* traditional, qualified ipd removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_fstab fstab; + if ((err = fill_in_ipdtab(cmd, &fstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE); return; @@ -2319,7 +2705,31 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_NET: + } + + /* + * unqualified ipd removal. 
remove all ipds but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "inherit-pkg-dir")) + return; + + if ((err = zonecfg_del_all_resources(handle, "inherit-pkg-dir")) + != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_IPD, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_net(cmd_t *cmd) +{ + int err; + + /* traditional, qualified net removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_nwiftab nwiftab; + if ((err = fill_in_nwiftab(cmd, &nwiftab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE); return; @@ -2329,7 +2739,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_DEVICE: + } + + /* + * unqualified net removal. remove all nets but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "net")) + return; + + if ((err = zonecfg_del_all_resources(handle, "net")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_NET, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_device(cmd_t *cmd) +{ + int err; + + /* traditional, qualified device removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_devtab devtab; + if ((err = fill_in_devtab(cmd, &devtab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE); return; @@ -2339,18 +2772,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_RCTL: - if ((err = fill_in_rctltab(cmd, &rctltab, FALSE)) != Z_OK) { - z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); - return; - } - if ((err = zonecfg_delete_rctl(handle, &rctltab)) != Z_OK) - z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); - else - need_to_commit = TRUE; - zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr); + } + + /* + * unqualified device removal. remove all devices but prompt if more + * than one. 
+ */ + if (!prompt_remove_resource(cmd, "device")) return; - case RT_ATTR: + + if ((err = zonecfg_del_all_resources(handle, "device")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DEVICE, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_attr(cmd_t *cmd) +{ + int err; + + /* traditional, qualified attr removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_attrtab attrtab; + if ((err = fill_in_attrtab(cmd, &attrtab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE); return; @@ -2360,7 +2805,30 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; - case RT_DATASET: + } + + /* + * unqualified attr removal. remove all attrs but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "attr")) + return; + + if ((err = zonecfg_del_all_resources(handle, "attr")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_ATTR, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_dataset(cmd_t *cmd) +{ + int err; + + /* traditional, qualified dataset removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_dstab dstab; + if ((err = fill_in_dstab(cmd, &dstab, FALSE)) != Z_OK) { z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE); return; @@ -2370,6 +2838,177 @@ remove_resource(cmd_t *cmd) else need_to_commit = TRUE; return; + } + + /* + * unqualified dataset removal. remove all datasets but prompt if more + * than one. 
+ */ + if (!prompt_remove_resource(cmd, "dataset")) + return; + + if ((err = zonecfg_del_all_resources(handle, "dataset")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_rctl(cmd_t *cmd) +{ + int err; + + /* traditional, qualified rctl removal */ + if (cmd->cmd_prop_nv_pairs > 0) { + struct zone_rctltab rctltab; + + if ((err = fill_in_rctltab(cmd, &rctltab, FALSE)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + return; + } + if ((err = zonecfg_delete_rctl(handle, &rctltab)) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + else + need_to_commit = TRUE; + zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr); + return; + } + + /* + * unqualified rctl removal. remove all rctls but prompt if more + * than one. + */ + if (!prompt_remove_resource(cmd, "rctl")) + return; + + if ((err = zonecfg_del_all_resources(handle, "rctl")) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_RCTL, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_pset() +{ + int err; + struct zone_psettab psettab; + + if ((err = zonecfg_lookup_pset(handle, &psettab)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_DCPU, err, TRUE); + return; + } + if ((err = zonecfg_delete_pset(handle)) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_DCPU, err, TRUE); + else + need_to_commit = TRUE; +} + +static void +remove_mcap() +{ + int err, res1, res2, res3; + uint64_t tmp; + struct zone_mcaptab mcaptab; + boolean_t revert = B_FALSE; + + res1 = zonecfg_lookup_mcap(handle, &mcaptab); + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &tmp); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, &tmp); + + /* if none of these exist, there is no resource to remove */ + if (res1 != Z_OK && res2 != Z_OK && res3 != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_MCAP), + zonecfg_strerror(Z_NO_RESOURCE_TYPE)); + saw_error = TRUE; + return; + } + if (res1 == Z_OK) { + if ((err = 
zonecfg_delete_mcap(handle)) != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + if (res2 == Z_OK) { + if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXSWAP)) + != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + if (res3 == Z_OK) { + if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM)) + != Z_OK) { + z_cmd_rt_perror(CMD_REMOVE, RT_MCAP, err, TRUE); + revert = B_TRUE; + } else { + need_to_commit = TRUE; + } + } + + if (revert) + need_to_commit = FALSE; +} + +static void +remove_resource(cmd_t *cmd) +{ + int type; + int arg; + + if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { + long_usage(CMD_REMOVE, TRUE); + return; + } + + optind = 0; + while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?F")) != EOF) { + switch (arg) { + case '?': + longer_usage(CMD_REMOVE); + return; + case 'F': + break; + default: + short_usage(CMD_REMOVE); + return; + } + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (type) { + case RT_FS: + remove_fs(cmd); + return; + case RT_IPD: + remove_ipd(cmd); + return; + case RT_NET: + remove_net(cmd); + return; + case RT_DEVICE: + remove_device(cmd); + return; + case RT_RCTL: + remove_rctl(cmd); + return; + case RT_ATTR: + remove_attr(cmd); + return; + case RT_DATASET: + remove_dataset(cmd); + return; + case RT_DCPU: + remove_pset(); + return; + case RT_MCAP: + remove_mcap(); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_REMOVE, TRUE); @@ -2513,16 +3152,175 @@ remove_func(cmd_t *cmd) assert(cmd != NULL); - if (global_scope) + if (global_scope) { + if (gz_invalid_resource(cmd->cmd_res_type)) { + zerr(gettext("%s is not a valid resource for the " + "global zone."), rt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } remove_resource(cmd); - else + } else { remove_property(cmd); + } +} + +static void +clear_property(cmd_t *cmd) 
+{ + int res_type, prop_type; + + res_type = resource_scope; + prop_type = cmd->cmd_res_type; + if (res_type == RT_UNKNOWN || prop_type == PT_UNKNOWN) { + long_usage(CMD_CLEAR, TRUE); + return; + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (res_type) { + case RT_FS: + if (prop_type == PT_RAW) { + in_progress_fstab.zone_fs_raw[0] = '\0'; + need_to_commit = TRUE; + return; + } + break; + case RT_DCPU: + if (prop_type == PT_IMPORTANCE) { + in_progress_psettab.zone_importance[0] = '\0'; + need_to_commit = TRUE; + return; + } + break; + case RT_MCAP: + switch (prop_type) { + case PT_PHYSICAL: + in_progress_mcaptab.zone_physmem_cap[0] = '\0'; + need_to_commit = TRUE; + return; + case PT_SWAP: + remove_aliased_rctl(PT_SWAP, ALIAS_MAXSWAP); + return; + case PT_LOCKED: + remove_aliased_rctl(PT_LOCKED, ALIAS_MAXLOCKEDMEM); + return; + } + break; + default: + break; + } + + zone_perror(pt_to_str(prop_type), Z_CLEAR_DISALLOW, TRUE); +} + +static void +clear_global(cmd_t *cmd) +{ + int err, type; + + if ((type = cmd->cmd_res_type) == RT_UNKNOWN) { + long_usage(CMD_CLEAR, TRUE); + return; + } + + if (initialize(TRUE) != Z_OK) + return; + + switch (type) { + case PT_ZONENAME: + /* FALLTHRU */ + case PT_ZONEPATH: + /* FALLTHRU */ + case PT_BRAND: + zone_perror(pt_to_str(type), Z_CLEAR_DISALLOW, TRUE); + return; + case PT_AUTOBOOT: + /* false is default; we'll treat as equivalent to clearing */ + if ((err = zonecfg_set_autoboot(handle, B_FALSE)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_AUTOBOOT, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_POOL: + if ((err = zonecfg_set_pool(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_POOL, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_LIMITPRIV: + if ((err = zonecfg_set_limitpriv(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_LIMITPRIV, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_BOOTARGS: + if ((err = zonecfg_set_bootargs(handle, NULL)) != Z_OK) + 
z_cmd_rt_perror(CMD_CLEAR, RT_BOOTARGS, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_SCHED: + if ((err = zonecfg_set_sched(handle, NULL)) != Z_OK) + z_cmd_rt_perror(CMD_CLEAR, RT_SCHED, err, TRUE); + else + need_to_commit = TRUE; + return; + case PT_MAXLWPS: + remove_aliased_rctl(PT_MAXLWPS, ALIAS_MAXLWPS); + return; + case PT_MAXSHMMEM: + remove_aliased_rctl(PT_MAXSHMMEM, ALIAS_MAXSHMMEM); + return; + case PT_MAXSHMIDS: + remove_aliased_rctl(PT_MAXSHMIDS, ALIAS_MAXSHMIDS); + return; + case PT_MAXMSGIDS: + remove_aliased_rctl(PT_MAXMSGIDS, ALIAS_MAXMSGIDS); + return; + case PT_MAXSEMIDS: + remove_aliased_rctl(PT_MAXSEMIDS, ALIAS_MAXSEMIDS); + return; + case PT_SHARES: + remove_aliased_rctl(PT_SHARES, ALIAS_SHARES); + return; + default: + zone_perror(pt_to_str(type), Z_NO_PROPERTY_TYPE, TRUE); + long_usage(CMD_CLEAR, TRUE); + usage(FALSE, HELP_PROPS); + return; + } +} + +void +clear_func(cmd_t *cmd) +{ + if (zone_is_read_only(CMD_CLEAR)) + return; + + assert(cmd != NULL); + + if (global_scope) { + if (gz_invalid_property(cmd->cmd_res_type)) { + zerr(gettext("%s is not a valid property for the " + "global zone."), pt_to_str(cmd->cmd_res_type)); + saw_error = TRUE; + return; + } + + clear_global(cmd); + } else { + clear_property(cmd); + } } void select_func(cmd_t *cmd) { - int type, err; + int type, err, res; + uint64_t limit; if (zone_is_read_only(CMD_SELECT)) return; @@ -2612,6 +3410,32 @@ select_func(cmd_t *cmd) bcopy(&old_dstab, &in_progress_dstab, sizeof (struct zone_dstab)); return; + case RT_DCPU: + if ((err = zonecfg_lookup_pset(handle, &old_psettab)) != Z_OK) { + z_cmd_rt_perror(CMD_SELECT, RT_DCPU, err, TRUE); + global_scope = TRUE; + } + bcopy(&old_psettab, &in_progress_psettab, + sizeof (struct zone_psettab)); + return; + case RT_MCAP: + /* if none of these exist, there is no resource to select */ + if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK && + zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &limit) + != Z_OK && + 
zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, &limit) + != Z_OK) { + z_cmd_rt_perror(CMD_SELECT, RT_MCAP, Z_NO_RESOURCE_TYPE, + TRUE); + global_scope = TRUE; + } + if (res == Z_OK) + bcopy(&old_mcaptab, &in_progress_mcaptab, + sizeof (struct zone_mcaptab)); + else + bzero(&in_progress_mcaptab, + sizeof (in_progress_mcaptab)); + return; default: zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_SELECT, TRUE); @@ -2731,6 +3555,49 @@ valid_fs_type(const char *type) return (B_TRUE); } +static void +set_aliased_rctl(char *alias, int prop_type, char *s) +{ + uint64_t limit; + int err; + char tmp[128]; + + if (global_zone && strcmp(alias, ALIAS_SHARES) != 0) + zerr(gettext("WARNING: Setting a global zone resource " + "control too low could deny\nservice " + "to even the root user; " + "this could render the system impossible\n" + "to administer. Please use caution.")); + + /* convert memory based properties */ + if (prop_type == PT_MAXSHMMEM) { + if (!zonecfg_valid_memlimit(s, &limit)) { + zerr(gettext("A non-negative number with a required " + "scale suffix (K, M, G or T) was expected\nhere.")); + saw_error = TRUE; + return; + } + + (void) snprintf(tmp, sizeof (tmp), "%llu", limit); + s = tmp; + } + + if (!zonecfg_aliased_rctl_ok(handle, alias)) { + zone_perror(pt_to_str(prop_type), Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + } else if (!zonecfg_valid_alias_limit(alias, s, &limit)) { + zerr(gettext("%s property is out of range."), + pt_to_str(prop_type)); + saw_error = TRUE; + } else if ((err = zonecfg_set_aliased_rctl(handle, alias, limit)) + != Z_OK) { + zone_perror(zone, err, TRUE); + saw_error = TRUE; + } else { + need_to_commit = TRUE; + } +} + void set_func(cmd_t *cmd) { @@ -2739,6 +3606,9 @@ set_func(cmd_t *cmd) property_value_ptr_t pp; boolean_t autoboot; boolean_t force_set = FALSE; + size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap); + uint64_t mem_cap, mem_limit; + struct zone_psettab tmp_psettab; if 
(zone_is_read_only(CMD_SET)) return; @@ -2762,6 +3632,13 @@ set_func(cmd_t *cmd) prop_type = cmd->cmd_prop_name[0]; if (global_scope) { + if (gz_invalid_property(prop_type)) { + zerr(gettext("%s is not a valid property for the " + "global zone."), pt_to_str(prop_type)); + saw_error = TRUE; + return; + } + if (prop_type == PT_ZONENAME) { res_type = RT_ZONENAME; } else if (prop_type == PT_ZONEPATH) { @@ -2776,6 +3653,20 @@ set_func(cmd_t *cmd) res_type = RT_LIMITPRIV; } else if (prop_type == PT_BOOTARGS) { res_type = RT_BOOTARGS; + } else if (prop_type == PT_SCHED) { + res_type = RT_SCHED; + } else if (prop_type == PT_MAXLWPS) { + res_type = RT_MAXLWPS; + } else if (prop_type == PT_MAXSHMMEM) { + res_type = RT_MAXSHMMEM; + } else if (prop_type == PT_MAXSHMIDS) { + res_type = RT_MAXSHMIDS; + } else if (prop_type == PT_MAXMSGIDS) { + res_type = RT_MAXMSGIDS; + } else if (prop_type == PT_MAXSEMIDS) { + res_type = RT_MAXSEMIDS; + } else if (prop_type == PT_SHARES) { + res_type = RT_SHARES; } else { zerr(gettext("Cannot set a resource-specific property " "from the global scope.")); @@ -2899,6 +3790,24 @@ set_func(cmd_t *cmd) need_to_commit = TRUE; return; case RT_POOL: + /* don't allow use of the reserved temporary pool names */ + if (strncmp("SUNW", prop_id, 4) == 0) { + zerr(gettext("pool names starting with SUNW are " + "reserved.")); + saw_error = TRUE; + return; + } + + /* can't set pool if dedicated-cpu exists */ + if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { + zerr(gettext("The %s resource already exists. 
" + "A persistent pool is incompatible\nwith the %s " + "resource."), rt_to_str(RT_DCPU), + rt_to_str(RT_DCPU)); + saw_error = TRUE; + return; + } + if ((err = zonecfg_set_pool(handle, prop_id)) != Z_OK) zone_perror(zone, err, TRUE); else @@ -2916,6 +3825,30 @@ set_func(cmd_t *cmd) else need_to_commit = TRUE; return; + case RT_SCHED: + if ((err = zonecfg_set_sched(handle, prop_id)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + return; + case RT_MAXLWPS: + set_aliased_rctl(ALIAS_MAXLWPS, prop_type, prop_id); + return; + case RT_MAXSHMMEM: + set_aliased_rctl(ALIAS_MAXSHMMEM, prop_type, prop_id); + return; + case RT_MAXSHMIDS: + set_aliased_rctl(ALIAS_MAXSHMIDS, prop_type, prop_id); + return; + case RT_MAXMSGIDS: + set_aliased_rctl(ALIAS_MAXMSGIDS, prop_type, prop_id); + return; + case RT_MAXSEMIDS: + set_aliased_rctl(ALIAS_MAXSEMIDS, prop_type, prop_id); + return; + case RT_SHARES: + set_aliased_rctl(ALIAS_SHARES, prop_type, prop_id); + return; case RT_FS: switch (prop_type) { case PT_DIR: @@ -3095,6 +4028,146 @@ set_func(cmd_t *cmd) long_usage(CMD_SET, TRUE); usage(FALSE, HELP_PROPS); return; + case RT_DCPU: + switch (prop_type) { + char *lowp, *highp; + + case PT_NCPUS: + lowp = prop_id; + if ((highp = strchr(prop_id, '-')) != NULL) + *highp++ = '\0'; + else + highp = lowp; + + /* Make sure the input makes sense. */ + if (!zonecfg_valid_ncpus(lowp, highp)) { + zerr(gettext("%s property is out of range."), + pt_to_str(PT_NCPUS)); + saw_error = TRUE; + return; + } + + (void) strlcpy( + in_progress_psettab.zone_ncpu_min, lowp, + sizeof (in_progress_psettab.zone_ncpu_min)); + (void) strlcpy( + in_progress_psettab.zone_ncpu_max, highp, + sizeof (in_progress_psettab.zone_ncpu_max)); + return; + case PT_IMPORTANCE: + /* Make sure the value makes sense. 
*/ + if (!zonecfg_valid_importance(prop_id)) { + zerr(gettext("%s property is out of range."), + pt_to_str(PT_IMPORTANCE)); + saw_error = TRUE; + return; + } + + (void) strlcpy(in_progress_psettab.zone_importance, + prop_id, + sizeof (in_progress_psettab.zone_importance)); + return; + default: + break; + } + zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, TRUE); + long_usage(CMD_SET, TRUE); + usage(FALSE, HELP_PROPS); + return; + case RT_MCAP: + switch (prop_type) { + case PT_PHYSICAL: + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A positive number with a " + "required scale suffix (K, M, G or T) was " + "expected here.")); + saw_error = TRUE; + } else if (mem_cap < ONE_MB) { + zerr(gettext("%s value is too small. It must " + "be at least 1M."), pt_to_str(PT_PHYSICAL)); + saw_error = TRUE; + } else { + snprintf(in_progress_mcaptab.zone_physmem_cap, + physmem_size, "%llu", mem_cap); + } + break; + case PT_SWAP: + /* + * We have to check if an rctl is allowed here since + * there might already be a rctl defined that blocks + * the alias. + */ + if (!zonecfg_aliased_rctl_ok(handle, ALIAS_MAXSWAP)) { + zone_perror(pt_to_str(PT_MAXSWAP), + Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + return; + } + + if (global_zone) + mem_limit = ONE_MB * 100; + else + mem_limit = ONE_MB * 50; + + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A positive number with a " + "required scale suffix (K, M, G or T) was " + "expected here.")); + saw_error = TRUE; + } else if (mem_cap < mem_limit) { + char buf[128]; + + (void) snprintf(buf, sizeof (buf), "%llu", + mem_limit); + bytes_to_units(buf, buf, sizeof (buf)); + zerr(gettext("%s value is too small. 
It must " + "be at least %s."), pt_to_str(PT_SWAP), + buf); + saw_error = TRUE; + } else { + if ((err = zonecfg_set_aliased_rctl(handle, + ALIAS_MAXSWAP, mem_cap)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + } + break; + case PT_LOCKED: + /* + * We have to check if an rctl is allowed here since + * there might already be a rctl defined that blocks + * the alias. + */ + if (!zonecfg_aliased_rctl_ok(handle, + ALIAS_MAXLOCKEDMEM)) { + zone_perror(pt_to_str(PT_LOCKED), + Z_ALIAS_DISALLOW, FALSE); + saw_error = TRUE; + return; + } + + if (!zonecfg_valid_memlimit(prop_id, &mem_cap)) { + zerr(gettext("A non-negative number with a " + "required scale suffix (K, M, G or T) was " + "expected\nhere.")); + saw_error = TRUE; + } else { + if ((err = zonecfg_set_aliased_rctl(handle, + ALIAS_MAXLOCKEDMEM, mem_cap)) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + } + break; + default: + zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, + TRUE); + long_usage(CMD_SET, TRUE); + usage(FALSE, HELP_PROPS); + return; + } + + return; default: zone_perror(rt_to_str(res_type), Z_NO_RESOURCE_TYPE, TRUE); long_usage(CMD_SET, TRUE); @@ -3110,7 +4183,11 @@ output_prop(FILE *fp, int pnum, char *pval, bool print_notspec) if (*pval != '\0') { qstr = quoteit(pval); - (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr); + if (pnum == PT_SWAP || pnum == PT_LOCKED) + (void) fprintf(fp, "\t[%s: %s]\n", pt_to_str(pnum), + qstr); + else + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(pnum), qstr); free(qstr); } else if (print_notspec) (void) fprintf(fp, gettext("\t%s not specified\n"), @@ -3213,6 +4290,20 @@ info_bootargs(zone_dochandle_t handle, FILE *fp) } static void +info_sched(zone_dochandle_t handle, FILE *fp) +{ + char sched[MAXNAMELEN]; + int err; + + if ((err = zonecfg_get_sched_class(handle, sched, sizeof (sched))) + == Z_OK) { + (void) fprintf(fp, "%s: %s\n", pt_to_str(PT_SCHED), sched); + } else { + zone_perror(zone, err, TRUE); + 
} +} + +static void output_fs(FILE *fp, struct zone_fstab *fstab) { zone_fsopt_t *this; @@ -3499,7 +4590,7 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd) struct zone_dstab lookup, user; bool output = FALSE; - if (zonecfg_setdevent(handle) != Z_OK) + if (zonecfg_setdsent(handle) != Z_OK) return; while (zonecfg_getdsent(handle, &lookup) == Z_OK) { if (cmd->cmd_prop_nv_pairs == 0) { @@ -3525,12 +4616,132 @@ info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd) rt_to_str(RT_DATASET)); } +static void +output_pset(FILE *fp, struct zone_psettab *psettab) +{ + (void) fprintf(fp, "%s:\n", rt_to_str(RT_DCPU)); + if (strcmp(psettab->zone_ncpu_min, psettab->zone_ncpu_max) == 0) + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_NCPUS), + psettab->zone_ncpu_max); + else + (void) fprintf(fp, "\t%s: %s-%s\n", pt_to_str(PT_NCPUS), + psettab->zone_ncpu_min, psettab->zone_ncpu_max); + if (psettab->zone_importance[0] != '\0') + (void) fprintf(fp, "\t%s: %s\n", pt_to_str(PT_IMPORTANCE), + psettab->zone_importance); +} + +static void +info_pset(zone_dochandle_t handle, FILE *fp) +{ + struct zone_psettab lookup; + + if (zonecfg_getpsetent(handle, &lookup) == Z_OK) + output_pset(fp, &lookup); +} + +static void +info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias) +{ + uint64_t limit; + + if (zonecfg_get_aliased_rctl(handle, alias, &limit) == Z_OK) { + /* convert memory based properties */ + if (strcmp(alias, ALIAS_MAXSHMMEM) == 0) { + char buf[128]; + + (void) snprintf(buf, sizeof (buf), "%llu", limit); + bytes_to_units(buf, buf, sizeof (buf)); + (void) fprintf(fp, "[%s: %s]\n", alias, buf); + return; + } + + (void) fprintf(fp, "[%s: %llu]\n", alias, limit); + } +} + +static void +bytes_to_units(char *str, char *buf, int bufsize) +{ + unsigned long long num; + unsigned long long save = 0; + char *units = "BKMGT"; + char *up = units; + + num = strtoll(str, NULL, 10); + + if (num < 1024) { + (void) snprintf(buf, bufsize, "%llu", num); + return; + } + + while ((num >= 
1024) && (*up != 'T')) { + up++; /* next unit of measurement */ + save = num; + num = (num + 512) >> 10; + } + + /* check if we should output a fraction. snprintf will round for us */ + if (save % 1024 != 0 && ((save >> 10) < 10)) + (void) snprintf(buf, bufsize, "%2.1f%c", ((float)save / 1024), + *up); + else + (void) snprintf(buf, bufsize, "%llu%c", num, *up); +} + +static void +output_mcap(FILE *fp, struct zone_mcaptab *mcaptab, int showswap, + uint64_t maxswap, int showlocked, uint64_t maxlocked) +{ + char buf[128]; + + (void) fprintf(fp, "%s:\n", rt_to_str(RT_MCAP)); + if (mcaptab->zone_physmem_cap[0] != '\0') { + bytes_to_units(mcaptab->zone_physmem_cap, buf, sizeof (buf)); + output_prop(fp, PT_PHYSICAL, buf, B_TRUE); + } + + if (showswap == Z_OK) { + (void) snprintf(buf, sizeof (buf), "%llu", maxswap); + bytes_to_units(buf, buf, sizeof (buf)); + output_prop(fp, PT_SWAP, buf, B_TRUE); + } + + if (showlocked == Z_OK) { + (void) snprintf(buf, sizeof (buf), "%llu", maxlocked); + bytes_to_units(buf, buf, sizeof (buf)); + output_prop(fp, PT_LOCKED, buf, B_TRUE); + } +} + +static void +info_mcap(zone_dochandle_t handle, FILE *fp) +{ + int res1, res2, res3; + uint64_t swap_limit; + uint64_t locked_limit; + struct zone_mcaptab lookup; + + bzero(&lookup, sizeof (lookup)); + res1 = zonecfg_getmcapent(handle, &lookup); + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &swap_limit); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &locked_limit); + + if (res1 == Z_OK || res2 == Z_OK || res3 == Z_OK) + output_mcap(fp, &lookup, res2, swap_limit, res3, locked_limit); +} + void info_func(cmd_t *cmd) { FILE *fp = stdout; bool need_to_close = FALSE; char *pager; + int type; + int res1, res2; + uint64_t swap_limit; + uint64_t locked_limit; assert(cmd != NULL); @@ -3569,26 +4780,68 @@ info_func(cmd_t *cmd) case RT_DATASET: output_ds(fp, &in_progress_dstab); break; + case RT_DCPU: + output_pset(fp, &in_progress_psettab); + break; + case RT_MCAP: + res1 = 
zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, + &swap_limit); + res2 = zonecfg_get_aliased_rctl(handle, + ALIAS_MAXLOCKEDMEM, &locked_limit); + output_mcap(fp, &in_progress_mcaptab, res1, swap_limit, + res2, locked_limit); + break; } goto cleanup; } + type = cmd->cmd_res_type; + + if (gz_invalid_rt_property(type)) { + zerr(gettext("%s is not a valid property for the global zone."), + rt_to_str(type)); + goto cleanup; + } + + if (gz_invalid_resource(type)) { + zerr(gettext("%s is not a valid resource for the global zone."), + rt_to_str(type)); + goto cleanup; + } + switch (cmd->cmd_res_type) { case RT_UNKNOWN: info_zonename(handle, fp); - info_zonepath(handle, fp); - info_brand(handle, fp); - info_autoboot(handle, fp); - info_bootargs(handle, fp); + if (!global_zone) { + info_zonepath(handle, fp); + info_brand(handle, fp); + info_autoboot(handle, fp); + info_bootargs(handle, fp); + } info_pool(handle, fp); - info_limitpriv(handle, fp); - info_ipd(handle, fp, cmd); - info_fs(handle, fp, cmd); - info_net(handle, fp, cmd); - info_dev(handle, fp, cmd); + if (!global_zone) { + info_limitpriv(handle, fp); + info_sched(handle, fp); + } + info_aliased_rctl(handle, fp, ALIAS_MAXLWPS); + info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM); + info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS); + info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS); + info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS); + info_aliased_rctl(handle, fp, ALIAS_SHARES); + if (!global_zone) { + info_ipd(handle, fp, cmd); + info_fs(handle, fp, cmd); + info_net(handle, fp, cmd); + info_dev(handle, fp, cmd); + } + info_pset(handle, fp); + info_mcap(handle, fp); + if (!global_zone) { + info_attr(handle, fp, cmd); + info_ds(handle, fp, cmd); + } info_rctl(handle, fp, cmd); - info_attr(handle, fp, cmd); - info_ds(handle, fp, cmd); break; case RT_ZONENAME: info_zonename(handle, fp); @@ -3611,6 +4864,27 @@ info_func(cmd_t *cmd) case RT_BOOTARGS: info_bootargs(handle, fp); break; + case RT_SCHED: + info_sched(handle, fp); + 
break; + case RT_MAXLWPS: + info_aliased_rctl(handle, fp, ALIAS_MAXLWPS); + break; + case RT_MAXSHMMEM: + info_aliased_rctl(handle, fp, ALIAS_MAXSHMMEM); + break; + case RT_MAXSHMIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXSHMIDS); + break; + case RT_MAXMSGIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXMSGIDS); + break; + case RT_MAXSEMIDS: + info_aliased_rctl(handle, fp, ALIAS_MAXSEMIDS); + break; + case RT_SHARES: + info_aliased_rctl(handle, fp, ALIAS_SHARES); + break; case RT_FS: info_fs(handle, fp, cmd); break; @@ -3632,6 +4906,12 @@ info_func(cmd_t *cmd) case RT_DATASET: info_ds(handle, fp, cmd); break; + case RT_DCPU: + info_pset(handle, fp); + break; + case RT_MCAP: + info_mcap(handle, fp); + break; default: zone_perror(rt_to_str(cmd->cmd_res_type), Z_NO_RESOURCE_TYPE, TRUE); @@ -3765,10 +5045,13 @@ verify_func(cmd_t *cmd) struct zone_attrtab attrtab; struct zone_rctltab rctltab; struct zone_dstab dstab; + struct zone_psettab psettab; char zonepath[MAXPATHLEN]; + char sched[MAXNAMELEN]; char brand[MAXNAMELEN]; int err, ret_val = Z_OK, arg; bool save = FALSE; + boolean_t has_cpu_shares = B_FALSE; optind = 0; if ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) { @@ -3796,12 +5079,13 @@ verify_func(cmd_t *cmd) if (initialize(TRUE) != Z_OK) return; - if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK) { + if (zonecfg_get_zonepath(handle, zonepath, sizeof (zonepath)) != Z_OK && + !global_zone) { zerr(gettext("%s not specified"), pt_to_str(PT_ZONEPATH)); ret_val = Z_REQD_RESOURCE_MISSING; saw_error = TRUE; } - if (strlen(zonepath) == 0) { + if (strlen(zonepath) == 0 && !global_zone) { zerr(gettext("%s cannot be empty."), pt_to_str(PT_ZONEPATH)); ret_val = Z_REQD_RESOURCE_MISSING; saw_error = TRUE; @@ -3861,6 +5145,9 @@ verify_func(cmd_t *cmd) check_reqd_prop(rctltab.zone_rctl_name, RT_RCTL, PT_NAME, &ret_val); + if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0) + has_cpu_shares = B_TRUE; + if (rctltab.zone_rctl_valptr == 
NULL) { zerr(gettext("%s: no %s specified"), rt_to_str(RT_RCTL), pt_to_str(PT_VALUE)); @@ -3873,6 +5160,25 @@ verify_func(cmd_t *cmd) } (void) zonecfg_endrctlent(handle); + if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) { + zerr(gettext("%s zone.cpu-shares and %s are incompatible."), + rt_to_str(RT_RCTL), rt_to_str(RT_DCPU)); + saw_error = TRUE; + if (ret_val == Z_OK) + ret_val = Z_INCOMPATIBLE; + } + + if (has_cpu_shares && zonecfg_get_sched_class(handle, sched, + sizeof (sched)) == Z_OK && strlen(sched) > 0 && + strcmp(sched, "FSS") != 0) { + zerr(gettext("WARNING: %s zone.cpu-shares and %s=%s are " + "incompatible"), + rt_to_str(RT_RCTL), rt_to_str(RT_SCHED), sched); + saw_error = TRUE; + if (ret_val == Z_OK) + ret_val = Z_INCOMPATIBLE; + } + if ((err = zonecfg_setattrent(handle)) != Z_OK) { zone_perror(zone, err, TRUE); return; @@ -4061,7 +5367,9 @@ end_func(cmd_t *cmd) struct zone_rctltab tmp_rctltab; struct zone_attrtab tmp_attrtab; struct zone_dstab tmp_dstab; - int err, arg; + int err, arg, res1, res2, res3; + uint64_t swap_limit; + uint64_t locked_limit; assert(cmd != NULL); @@ -4361,6 +5669,73 @@ end_func(cmd_t *cmd) &in_progress_dstab); } break; + case RT_DCPU: + /* Make sure everything was filled in. */ + if (end_check_reqd(in_progress_psettab.zone_ncpu_min, + PT_NCPUS, &validation_failed) != Z_OK) { + saw_error = TRUE; + return; + } + + if (end_op == CMD_ADD) { + err = zonecfg_add_pset(handle, &in_progress_psettab); + } else { + err = zonecfg_modify_pset(handle, &in_progress_psettab); + } + break; + case RT_MCAP: + /* Make sure everything was filled in. */ + res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ? + Z_ERR : Z_OK; + res2 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, + &swap_limit); + res3 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXLOCKEDMEM, + &locked_limit); + + if (res1 != Z_OK && res2 != Z_OK && res3 != Z_OK) { + zerr(gettext("No property was specified. 
One of %s, " + "%s or %s is required."), pt_to_str(PT_PHYSICAL), + pt_to_str(PT_SWAP), pt_to_str(PT_LOCKED)); + saw_error = TRUE; + return; + } + + /* if phys & locked are both set, verify locked <= phys */ + if (res1 == Z_OK && res3 == Z_OK) { + uint64_t phys_limit; + char *endp; + + phys_limit = strtoull( + in_progress_mcaptab.zone_physmem_cap, &endp, 10); + if (phys_limit < locked_limit) { + zerr(gettext("The %s cap must be less than or " + "equal to the %s cap."), + pt_to_str(PT_LOCKED), + pt_to_str(PT_PHYSICAL)); + saw_error = TRUE; + return; + } + } + + err = Z_OK; + if (res1 == Z_OK) { + /* + * We could be ending from either an add operation + * or a select operation. Since all of the properties + * within this resource are optional, we always use + * modify on the mcap entry. zonecfg_modify_mcap() + * will handle both adding and modifying a memory cap. + */ + err = zonecfg_modify_mcap(handle, &in_progress_mcaptab); + } else if (end_op == CMD_SELECT) { + /* + * If we're ending from a select and the physical + * memory cap is empty then the user could have cleared + * the physical cap value, so try to delete the entry. 
+ */ + (void) zonecfg_delete_mcap(handle); + } + break; default: zone_perror(rt_to_str(resource_scope), Z_NO_RESOURCE_TYPE, TRUE); @@ -4885,7 +6260,9 @@ main(int argc, char *argv[]) zonecfg_set_root(optarg); break; case 'z': - if (zonecfg_validate_zonename(optarg) != Z_OK) { + if (strcmp(optarg, GLOBAL_ZONENAME) == 0) { + global_zone = TRUE; + } else if (zonecfg_validate_zonename(optarg) != Z_OK) { zone_perror(optarg, Z_BOGUS_ZONE_NAME, TRUE); usage(FALSE, HELP_SYNTAX); exit(Z_USAGE); diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h index 6e153d40c1..64808e9623 100644 --- a/usr/src/cmd/zonecfg/zonecfg.h +++ b/usr/src/cmd/zonecfg/zonecfg.h @@ -50,19 +50,20 @@ typedef int bool; #define CMD_ADD 0 #define CMD_CANCEL 1 -#define CMD_COMMIT 2 -#define CMD_CREATE 3 -#define CMD_DELETE 4 -#define CMD_END 5 -#define CMD_EXIT 6 -#define CMD_EXPORT 7 -#define CMD_HELP 8 -#define CMD_INFO 9 -#define CMD_REMOVE 10 -#define CMD_REVERT 11 -#define CMD_SELECT 12 -#define CMD_SET 13 -#define CMD_VERIFY 14 +#define CMD_CLEAR 2 +#define CMD_COMMIT 3 +#define CMD_CREATE 4 +#define CMD_DELETE 5 +#define CMD_END 6 +#define CMD_EXIT 7 +#define CMD_EXPORT 8 +#define CMD_HELP 9 +#define CMD_INFO 10 +#define CMD_REMOVE 11 +#define CMD_REVERT 12 +#define CMD_SELECT 13 +#define CMD_SET 14 +#define CMD_VERIFY 15 #define CMD_MIN CMD_ADD #define CMD_MAX CMD_VERIFY @@ -83,9 +84,18 @@ typedef int bool; #define RT_LIMITPRIV 12 /* really a property, but for info ... */ #define RT_BOOTARGS 13 /* really a property, but for info ... */ #define RT_BRAND 14 /* really a property, but for info ... 
*/ +#define RT_DCPU 15 +#define RT_MCAP 16 +#define RT_MAXLWPS 17 /* really a rctl alias property, but for info */ +#define RT_MAXSHMMEM 18 /* really a rctl alias property, but for info */ +#define RT_MAXSHMIDS 19 /* really a rctl alias property, but for info */ +#define RT_MAXMSGIDS 20 /* really a rctl alias property, but for info */ +#define RT_MAXSEMIDS 21 /* really a rctl alias property, but for info */ +#define RT_SHARES 22 /* really a rctl alias property, but for info */ +#define RT_SCHED 23 /* really a property, but for info ... */ #define RT_MIN RT_UNKNOWN -#define RT_MAX RT_BRAND +#define RT_MAX RT_SCHED /* property types: increment PT_MAX when expanding this list */ #define PT_UNKNOWN 0 @@ -109,9 +119,22 @@ typedef int bool; #define PT_LIMITPRIV 18 #define PT_BOOTARGS 19 #define PT_BRAND 20 +#define PT_NCPUS 21 +#define PT_IMPORTANCE 22 +#define PT_SWAP 23 +#define PT_LOCKED 24 +#define PT_SHARES 25 +#define PT_MAXLWPS 26 +#define PT_MAXSHMMEM 27 +#define PT_MAXSHMIDS 28 +#define PT_MAXMSGIDS 29 +#define PT_MAXSEMIDS 30 +#define PT_MAXLOCKEDMEM 31 +#define PT_MAXSWAP 32 +#define PT_SCHED 33 #define PT_MIN PT_UNKNOWN -#define PT_MAX PT_BRAND +#define PT_MAX PT_SCHED #define MAX_EQ_PROP_PAIRS 3 @@ -184,6 +207,7 @@ extern void revert_func(cmd_t *); extern void select_func(cmd_t *); extern void set_func(cmd_t *); extern void verify_func(cmd_t *); +extern void clear_func(cmd_t *); extern cmd_t *alloc_cmd(void); extern complex_property_ptr_t alloc_complex(void); diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y index dc391da0b9..5c0dc2263e 100644 --- a/usr/src/cmd/zonecfg/zonecfg_grammar.y +++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y @@ -60,15 +60,17 @@ extern void yyerror(char *s); %token COMMIT REVERT EXIT SEMICOLON TOKEN ZONENAME ZONEPATH AUTOBOOT POOL NET %token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL %token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET -%token 
OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND +%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET +%token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS +%token MAXSEMIDS LOCKED SWAP SCHED CLEAR %type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val %type <complex> complex_piece complex_prop_val -%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR +%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP %type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT - ACTION BRAND + ACTION BRAND SCHED %type <cmd> command %type <cmd> add_command ADD %type <cmd> cancel_command CANCEL @@ -84,6 +86,7 @@ extern void yyerror(char *s); %type <cmd> revert_command REVERT %type <cmd> select_command SELECT %type <cmd> set_command SET +%type <cmd> clear_command CLEAR %type <cmd> verify_command VERIFY %type <cmd> terminator @@ -126,6 +129,7 @@ commands: command terminator command: add_command | cancel_command + | clear_command | create_command | commit_command | delete_command @@ -465,6 +469,69 @@ info_command: INFO $$->cmd_res_type = RT_BOOTARGS; $$->cmd_prop_nv_pairs = 0; } + | INFO SCHED + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_SCHED; + $$->cmd_prop_nv_pairs = 0; + } + | INFO SHARES + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_SHARES; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXLWPS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXLWPS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSHMMEM + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSHMMEM; + 
$$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSHMIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSHMIDS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXMSGIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXMSGIDS; + $$->cmd_prop_nv_pairs = 0; + } + | INFO MAXSEMIDS + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &info_func; + $$->cmd_res_type = RT_MAXSEMIDS; + $$->cmd_prop_nv_pairs = 0; + } | INFO resource_type property_name EQUAL property_value { if (($$ = alloc_cmd()) == NULL) @@ -512,11 +579,32 @@ remove_command: REMOVE usage(FALSE, HELP_RES_PROPS); YYERROR; } - | REMOVE resource_type + | REMOVE TOKEN { short_usage(CMD_REMOVE); + (void) fputs("\n", stderr); + usage(FALSE, HELP_RES_PROPS); YYERROR; } + | REMOVE resource_type + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &remove_func; + $$->cmd_res_type = $2; + } + | REMOVE TOKEN resource_type + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &remove_func; + $$->cmd_res_type = $3; + $$->cmd_argc = 1; + $$->cmd_argv[0] = $2; + $$->cmd_argv[1] = NULL; + } | REMOVE property_name property_value { if (($$ = alloc_cmd()) == NULL) @@ -594,6 +682,22 @@ select_command: SELECT usage(FALSE, HELP_RES_PROPS); YYERROR; } + | SELECT PSET + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &select_func; + $$->cmd_res_type = RT_DCPU; + } + | SELECT MCAP + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &select_func; + $$->cmd_res_type = RT_MCAP; + } | SELECT resource_type { short_usage(CMD_SELECT); @@ -682,6 +786,22 @@ set_command: SET $$->cmd_property_ptr[0] = &property[0]; } +clear_command: CLEAR + { + short_usage(CMD_CLEAR); + (void) fputs("\n", stderr); + usage(FALSE, HELP_PROPS); + YYERROR; + } + | CLEAR property_name + { + if 
(($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &clear_func; + $$->cmd_res_type = $2; + } + verify_command: VERIFY { if (($$ = alloc_cmd()) == NULL) @@ -709,6 +829,8 @@ resource_type: NET { $$ = RT_NET; } | RCTL { $$ = RT_RCTL; } | ATTR { $$ = RT_ATTR; } | DATASET { $$ = RT_DATASET; } + | PSET { $$ = RT_DCPU; } + | MCAP { $$ = RT_MCAP; } property_name: SPECIAL { $$ = PT_SPECIAL; } | RAW { $$ = PT_RAW; } @@ -730,6 +852,17 @@ property_name: SPECIAL { $$ = PT_SPECIAL; } | LIMIT { $$ = PT_LIMIT; } | ACTION { $$ = PT_ACTION; } | BRAND { $$ = PT_BRAND; } + | NCPUS { $$ = PT_NCPUS; } + | LOCKED { $$ = PT_LOCKED; } + | SWAP { $$ = PT_SWAP; } + | IMPORTANCE { $$ = PT_IMPORTANCE; } + | SHARES { $$ = PT_SHARES; } + | MAXLWPS { $$ = PT_MAXLWPS; } + | MAXSHMMEM { $$ = PT_MAXSHMMEM; } + | MAXSHMIDS { $$ = PT_MAXSHMIDS; } + | MAXMSGIDS { $$ = PT_MAXMSGIDS; } + | MAXSEMIDS { $$ = PT_MAXSEMIDS; } + | SCHED { $$ = PT_SCHED; } /* * The grammar builds data structures from the bottom up. 
Thus various diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l index aef16edbcb..53f726ca2e 100644 --- a/usr/src/cmd/zonecfg/zonecfg_lex.l +++ b/usr/src/cmd/zonecfg/zonecfg_lex.l @@ -40,7 +40,10 @@ extern void yyerror(char *s); char *safe_strdup(char *s); %} -%a 4000 +%a 6000 +%p 4000 +%e 2000 +%n 1000 %{ /* @@ -139,6 +142,12 @@ char *safe_strdup(char *s); return SET; } +<INITIAL>clear { + BEGIN TSTATE; + state = TSTATE; + return CLEAR; + } + <INITIAL>verify { BEGIN TSTATE; state = TSTATE; @@ -162,6 +171,10 @@ char *safe_strdup(char *s); <TSTATE>dataset { return DATASET; } +<TSTATE>dedicated-cpu { return PSET; } + +<TSTATE>capped-memory { return MCAP; } + <TSTATE>zonepath { return ZONEPATH; } <CSTATE>zonepath { return ZONEPATH; } @@ -219,6 +232,39 @@ char *safe_strdup(char *s); <TSTATE>action { return ACTION; } <CSTATE>action { return ACTION; } +<TSTATE>ncpus { return NCPUS; } +<CSTATE>ncpus { return NCPUS; } + +<TSTATE>locked { return LOCKED; } +<CSTATE>locked { return LOCKED; } + +<TSTATE>swap { return SWAP; } +<CSTATE>swap { return SWAP; } + +<TSTATE>importance { return IMPORTANCE; } +<CSTATE>importance { return IMPORTANCE; } + +<TSTATE>cpu-shares { return SHARES; } +<CSTATE>cpu-shares { return SHARES; } + +<TSTATE>max-lwps { return MAXLWPS; } +<CSTATE>max-lwps { return MAXLWPS; } + +<TSTATE>max-shm-memory { return MAXSHMMEM; } +<CSTATE>max-shm-memory { return MAXSHMMEM; } + +<TSTATE>max-shm-ids { return MAXSHMIDS; } +<CSTATE>max-shm-ids { return MAXSHMIDS; } + +<TSTATE>max-msg-ids { return MAXMSGIDS; } +<CSTATE>max-msg-ids { return MAXMSGIDS; } + +<TSTATE>max-sem-ids { return MAXSEMIDS; } +<CSTATE>max-sem-ids { return MAXSEMIDS; } + +<TSTATE>scheduling-class { return SCHED; } +<CSTATE>scheduling-class { return SCHED; } + <TSTATE>= { return EQUAL; } <LSTATE>= { return EQUAL; } <CSTATE>= { return EQUAL; } diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h index 3af98c1a6b..10ee4a2bb4 100644 --- 
a/usr/src/head/libzonecfg.h +++ b/usr/src/head/libzonecfg.h @@ -90,6 +90,15 @@ extern "C" { #define Z_PRIV_REQUIRED 38 /* required privilege is missing */ #define Z_PRIV_UNKNOWN 39 /* specified privilege is unknown */ #define Z_BRAND_ERROR 40 /* brand-specific error */ +#define Z_INCOMPATIBLE 41 /* incompatible settings */ +#define Z_ALIAS_DISALLOW 42 /* rctl alias disallowed */ +#define Z_CLEAR_DISALLOW 43 /* clear property disallowed */ +#define Z_POOL 44 /* generic libpool error */ +#define Z_POOLS_NOT_ACTIVE 45 /* pool service not enabled */ +#define Z_POOL_ENABLE 46 /* pools enable failed */ +#define Z_NO_POOL 47 /* no such pool configured */ +#define Z_POOL_CREATE 48 /* pool create failed */ +#define Z_POOL_BIND 49 /* pool bind failed */ /* * Warning: these are shared with the admin/install consolidation. @@ -126,6 +135,18 @@ extern "C" { #define ZONE_PKG_VERSMAX 256 /* + * Shortened alias names for the zones rctls. + */ +#define ALIAS_MAXLWPS "max-lwps" +#define ALIAS_MAXSHMMEM "max-shm-memory" +#define ALIAS_MAXSHMIDS "max-shm-ids" +#define ALIAS_MAXMSGIDS "max-msg-ids" +#define ALIAS_MAXSEMIDS "max-sem-ids" +#define ALIAS_MAXLOCKEDMEM "locked" +#define ALIAS_MAXSWAP "swap" +#define ALIAS_SHARES "cpu-shares" + +/* * Bit flag definitions for passing into libzonecfg functions. 
*/ #define ZONE_DRY_RUN 0x01 @@ -190,6 +211,16 @@ struct zone_dstab { char zone_dataset_name[MAXNAMELEN]; }; +struct zone_psettab { + char zone_ncpu_min[MAXNAMELEN]; + char zone_ncpu_max[MAXNAMELEN]; + char zone_importance[MAXNAMELEN]; +}; + +struct zone_mcaptab { + char zone_physmem_cap[MAXNAMELEN]; +}; + struct zone_pkgtab { char zone_pkg_name[MAXNAMELEN]; char zone_pkg_version[ZONE_PKG_VERSMAX]; @@ -227,10 +258,17 @@ extern int zonecfg_access(const char *, int); extern void zonecfg_set_root(const char *); extern const char *zonecfg_get_root(void); extern boolean_t zonecfg_in_alt_root(void); +extern int zonecfg_num_resources(zone_dochandle_t, char *); +extern int zonecfg_del_all_resources(zone_dochandle_t, char *); +extern boolean_t zonecfg_valid_ncpus(char *, char *); +extern boolean_t zonecfg_valid_importance(char *); +extern int zonecfg_str_to_bytes(char *, uint64_t *); +extern boolean_t zonecfg_valid_memlimit(char *, uint64_t *); +extern boolean_t zonecfg_valid_alias_limit(char *, char *, uint64_t *); /* - * Zone name, path to zone directory, autoboot setting, pool and boot - * arguments. + * Zone name, path to zone directory, autoboot setting, pool, boot + * arguments, and scheduling-class. 
*/ extern int zonecfg_validate_zonename(const char *); extern int zonecfg_get_name(zone_dochandle_t, char *, size_t); @@ -243,6 +281,9 @@ extern int zonecfg_get_pool(zone_dochandle_t, char *, size_t); extern int zonecfg_set_pool(zone_dochandle_t, char *); extern int zonecfg_get_bootargs(zone_dochandle_t, char *, size_t); extern int zonecfg_set_bootargs(zone_dochandle_t, char *); +extern int zonecfg_get_sched_class(zone_dochandle_t, char *, size_t); +extern int zonecfg_set_sched(zone_dochandle_t, char *); +extern int zonecfg_get_dflt_sched_class(zone_dochandle_t, char *, int); /* * Set/retrieve the brand for the zone @@ -302,6 +343,11 @@ extern int zonecfg_add_rctl_value(struct zone_rctltab *, extern int zonecfg_remove_rctl_value(struct zone_rctltab *, struct zone_rctlvaltab *); extern void zonecfg_free_rctl_value_list(struct zone_rctlvaltab *); +extern boolean_t zonecfg_aliased_rctl_ok(zone_dochandle_t, char *); +extern int zonecfg_set_aliased_rctl(zone_dochandle_t, char *, uint64_t); +extern int zonecfg_get_aliased_rctl(zone_dochandle_t, char *, uint64_t *); +extern int zonecfg_rm_aliased_rctl(zone_dochandle_t, char *); +extern int zonecfg_apply_rctls(char *, zone_dochandle_t); /* * Generic attribute configuration and type/value extraction. @@ -328,6 +374,34 @@ extern int zonecfg_modify_ds(zone_dochandle_t, struct zone_dstab *, extern int zonecfg_lookup_ds(zone_dochandle_t, struct zone_dstab *); /* + * cpu-set configuration. + */ +extern int zonecfg_add_pset(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_delete_pset(zone_dochandle_t); +extern int zonecfg_modify_pset(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_lookup_pset(zone_dochandle_t, struct zone_psettab *); + +/* + * mem-cap configuration. 
+ */ +extern int zonecfg_delete_mcap(zone_dochandle_t); +extern int zonecfg_modify_mcap(zone_dochandle_t, struct zone_mcaptab *); +extern int zonecfg_lookup_mcap(zone_dochandle_t, struct zone_mcaptab *); + +/* + * Temporary pool support functions. + */ +extern int zonecfg_destroy_tmp_pool(char *, char *, int); +extern int zonecfg_bind_tmp_pool(zone_dochandle_t, zoneid_t, char *, int); +extern int zonecfg_bind_pool(zone_dochandle_t, zoneid_t, char *, int); +extern boolean_t zonecfg_warn_poold(zone_dochandle_t); + +/* + * Miscellaneous utility functions. + */ +extern int zonecfg_enable_rcapd(char *, int); + +/* * attach/detach support. */ extern int zonecfg_get_attach_handle(const char *, const char *, @@ -373,6 +447,8 @@ extern int zonecfg_endrctlent(zone_dochandle_t); extern int zonecfg_setdsent(zone_dochandle_t); extern int zonecfg_getdsent(zone_dochandle_t, struct zone_dstab *); extern int zonecfg_enddsent(zone_dochandle_t); +extern int zonecfg_getpsetent(zone_dochandle_t, struct zone_psettab *); +extern int zonecfg_getmcapent(zone_dochandle_t, struct zone_mcaptab *); extern int zonecfg_setpkgent(zone_dochandle_t); extern int zonecfg_getpkgent(zone_dochandle_t, struct zone_pkgtab *); extern int zonecfg_endpkgent(zone_dochandle_t); diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index da3bdb3844..c541fcb01c 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -489,7 +489,7 @@ libldap5: libsasl libsocket libnsl libmd libsldap: libldap5 libtsol libpool: libnvpair libexacct libzonecfg: libc libsocket libnsl libuuid libnvpair libsysevent libsec \ - libbrand + libbrand libpool libscf libproc: ../cmd/sgs/librtld_db ../cmd/sgs/libelf libctf libproject: libpool libproc libsecdb libtsnet: libnsl libtsol libsecdb diff --git a/usr/src/lib/libc/port/gen/getrusage.c b/usr/src/lib/libc/port/gen/getrusage.c index c1f1b92188..efeaf0be24 100644 --- a/usr/src/lib/libc/port/gen/getrusage.c +++ b/usr/src/lib/libc/port/gen/getrusage.c @@ -2,9 +2,8 @@ * CDDL HEADER 
START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -49,6 +48,7 @@ #include <sys/param.h> #include <errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> #include <fcntl.h> #include <sys/fcntl.h> #include <procfs.h> @@ -76,3 +76,10 @@ getrusage(int who, struct rusage *rusage) return (-1); } } + +int +getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres) +{ + return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, flags, age, + buf, nres)); +} diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 22227a6413..8e1b399567 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -59,6 +59,7 @@ SUNW_1.23 { # SunOS 5.11 (Solaris 11) fdatasync; forkallx; forkx; + getvmusage; lio_listio; mkdtemp; _mkdtemp; diff --git a/usr/src/lib/libpool/common/pool.c b/usr/src/lib/libpool/common/pool.c index 167cd8be5b..6fbd7b34d3 100644 --- a/usr/src/lib/libpool/common/pool.c +++ b/usr/src/lib/libpool/common/pool.c @@ -914,10 +914,34 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name, return (NULL); } - if (!is_valid_prop_name(name)) { + /* Don't allow (re)setting of the "temporary" property */ + if (!is_valid_prop_name(name) || strstr(name, ".temporary") != NULL) { pool_seterror(POE_BADPARAM); return (PO_FAIL); } + + /* Don't allow rename of temporary 
pools/resources */ + if (strstr(name, ".name") != NULL && elem_is_tmp(pe)) { + boolean_t rename = B_TRUE; + pool_value_t *pv = pool_value_alloc(); + + if (pe->pe_get_prop(pe, name, pv) != POC_INVAL) { + const char *s1 = NULL; + const char *s2 = NULL; + + (void) pool_value_get_string(pv, &s1); + (void) pool_value_get_string(val, &s2); + if (s1 != NULL && s2 != NULL && strcmp(s1, s2) == 0) + rename = B_FALSE; + } + pool_value_free(pv); + + if (rename) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + } + /* * Check to see if this is a property we are managing. If it is, * ensure that we are happy with what the user is doing. @@ -936,6 +960,46 @@ pool_put_property(pool_conf_t *conf, pool_elem_t *pe, const char *name, } /* + * Set temporary property to flag as a temporary element. + * + * PO_FAIL is returned if an error is detected and the error code is updated + * to indicate the cause of the error. + */ +int +pool_set_temporary(pool_conf_t *conf, pool_elem_t *pe) +{ + int res; + char name[128]; + pool_value_t *val; + + if (pool_conf_check(conf) != PO_SUCCESS) + return (PO_FAIL); + + if (TO_CONF(pe) != conf) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + + /* create property name based on element type */ + if (snprintf(name, sizeof (name), "%s.temporary", + pool_elem_class_string(pe)) > sizeof (name)) { + pool_seterror(POE_SYSTEM); + return (PO_FAIL); + } + + if ((val = pool_value_alloc()) == NULL) + return (PO_FAIL); + + pool_value_set_bool(val, (uchar_t)1); + + res = pe->pe_put_prop(pe, name, val); + + pool_value_free(val); + + return (res); +} + +/* * Update the specified property value with the namespace prepended. * e.g. If this function is used to update the property "name" on a pool, it * will attempt to update "pool.name". 
@@ -1030,6 +1094,12 @@ pool_rm_property(pool_conf_t *conf, pool_elem_t *pe, const char *name) return (NULL); } + /* Don't allow removal of the "temporary" property */ + if (strstr(name, ".temporary") != NULL) { + pool_seterror(POE_BADPARAM); + return (PO_FAIL); + } + /* * Check to see if this is a property we are managing. If it is, * ensure that we are happy with what the user is doing. @@ -1122,6 +1192,17 @@ pool_create(pool_conf_t *conf, const char *name) pool_seterror(POE_PUTPROP); return (NULL); } + + /* + * If we are creating a temporary pool configuration, flag the pool. + */ + if (conf->pc_prov->pc_oflags & PO_TEMP) { + if (pool_set_temporary(conf, pe) == PO_FAIL) { + (void) pool_destroy(conf, pool_elem_pool(pe)); + return (NULL); + } + } + return (pool_elem_pool(pe)); } @@ -1227,6 +1308,17 @@ pool_resource_create(pool_conf_t *conf, const char *sz_type, const char *name) return (NULL); } } + + /* + * If we are creating a temporary pool configuration, flag the resource. + */ + if (conf->pc_prov->pc_oflags & PO_TEMP) { + if (pool_set_temporary(conf, pe) != PO_SUCCESS) { + (void) pool_resource_destroy(conf, pool_elem_res(pe)); + return (NULL); + } + } + return (pool_elem_res(pe)); } @@ -1396,7 +1488,8 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) pool_seterror(POE_BADPARAM); return (PO_FAIL); } - if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE)) { + if (oflags & ~(PO_RDONLY | PO_RDWR | PO_CREAT | PO_DISCO | PO_UPDATE | + PO_TEMP)) { pool_seterror(POE_BADPARAM); return (PO_FAIL); } @@ -1408,6 +1501,10 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) if (oflags & PO_CREAT) oflags |= PO_RDWR; + /* location is ignored when creating a temporary configuration */ + if (oflags & PO_TEMP) + location = ""; + if ((conf->pc_location = strdup(location)) == NULL) { pool_seterror(POE_SYSTEM); return (PO_FAIL); @@ -1415,14 +1512,25 @@ pool_conf_open(pool_conf_t *conf, const char *location, int oflags) /* * 
This is the crossover point into the actual data provider * implementation, allocate a data provider of the appropriate - * type for your data storage medium. In this case it's a kernel - * data provider. To use a different data provider, write some - * code to implement all the required interfaces and then - * change the next line to allocate a data provider which uses your - * new code. All data provider routines can be static, apart from - * the allocation routine. + * type for your data storage medium. In this case it's either a kernel + * or xml data provider. To use a different data provider, write some + * code to implement all the required interfaces and then change the + * following code to allocate a data provider which uses your new code. + * All data provider routines can be static, apart from the allocation + * routine. + * + * For temporary pools (PO_TEMP) we start with a copy of the current + * dynamic configuration and do all of the updates in-memory. */ - if (strcmp(location, pool_dynamic_location()) == 0) { + if (oflags & PO_TEMP) { + if (pool_knl_connection_alloc(conf, PO_TEMP) != PO_SUCCESS) { + conf->pc_state = POF_INVALID; + return (PO_FAIL); + } + /* set rdwr flag so we can update the in-memory config. */ + conf->pc_prov->pc_oflags |= PO_RDWR; + + } else if (strcmp(location, pool_dynamic_location()) == 0) { if (pool_knl_connection_alloc(conf, oflags) != PO_SUCCESS) { conf->pc_state = POF_INVALID; return (PO_FAIL); diff --git a/usr/src/lib/libpool/common/pool.h b/usr/src/lib/libpool/common/pool.h index d38e9902e6..ee11aadb7b 100644 --- a/usr/src/lib/libpool/common/pool.h +++ b/usr/src/lib/libpool/common/pool.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -96,6 +95,7 @@ extern uint_t pool_version(uint_t ver); #define PO_CREAT 0x2 #define PO_DISCO 0x4 #define PO_UPDATE 0x8 +#define PO_TEMP 0x10 /* Allocation policy */ #define POA_IMPORTANCE "importance based" @@ -218,6 +218,7 @@ extern pool_value_class_t pool_get_property(const pool_conf_t *, extern int pool_put_property(pool_conf_t *, pool_elem_t *, const char *, const pool_value_t *); extern int pool_rm_property(pool_conf_t *, pool_elem_t *, const char *); + /* * Walk the associated properties of the supplied element calling the supplied * function for each property in turn. There is no implied order in the walk. diff --git a/usr/src/lib/libpool/common/pool_commit.c b/usr/src/lib/libpool/common/pool_commit.c index 1ea4808377..b996524b98 100644 --- a/usr/src/lib/libpool/common/pool_commit.c +++ b/usr/src/lib/libpool/common/pool_commit.c @@ -245,6 +245,9 @@ commit_delete(pool_elem_t *pe) pool_t *pool; int ret = 0; + if (elem_is_tmp(pe)) + return (PO_SUCCESS); + switch (pool_elem_class(pe)) { case PEC_SYSTEM: /* NO-OP */ break; @@ -1306,7 +1309,14 @@ clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, if ((prop = provider_get_prop(pe, name)) != NULL && prop_is_readonly(prop) == PO_TRUE) return (PO_SUCCESS); - return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == PO_FAIL); + + /* The temporary property needs special handling */ + if (strstr(name, ".temporary") != NULL) + return (pool_set_temporary(TO_CONF(tgt), tgt) == + PO_FAIL ? PO_FAIL : PO_SUCCESS); + else + return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == + PO_FAIL ? 
PO_FAIL : PO_SUCCESS); } /* @@ -1322,8 +1332,9 @@ clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, /* * Some properties should be ignored */ - if ((prop = provider_get_prop(pe, name)) != NULL && - prop_is_optional(prop) == PO_FALSE) + if (strstr(name, ".temporary") != NULL || + ((prop = provider_get_prop(pe, name)) != NULL && + prop_is_optional(prop) == PO_FALSE)) return (PO_SUCCESS); return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL); } diff --git a/usr/src/lib/libpool/common/pool_internal.c b/usr/src/lib/libpool/common/pool_internal.c index 210e63d620..5e572f6eaf 100644 --- a/usr/src/lib/libpool/common/pool_internal.c +++ b/usr/src/lib/libpool/common/pool_internal.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1143,6 +1142,23 @@ elem_is_default(const pool_elem_t *res) } /* + * Return B_TRUE if the element has the 'temporary' property set. + */ +boolean_t +elem_is_tmp(const pool_elem_t *elem) +{ + pool_value_t val = POOL_VALUE_INITIALIZER; + uchar_t bval; + + if (pool_get_ns_property(elem, "temporary", &val) != POC_BOOL) + return (B_FALSE); + + (void) pool_value_get_bool(&val, &bval); + + return (bval != 0); +} + +/* * get_default_elem() returns the default elem for type of the supplied * elem. 
* diff --git a/usr/src/lib/libpool/common/pool_internal.h b/usr/src/lib/libpool/common/pool_internal.h index 592c98d11d..e172d23af4 100644 --- a/usr/src/lib/libpool/common/pool_internal.h +++ b/usr/src/lib/libpool/common/pool_internal.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -256,6 +255,7 @@ extern int resource_get_pinned(const pool_resource_t *, extern char *elem_get_name(const pool_elem_t *); extern id_t elem_get_sysid(const pool_elem_t *); extern int elem_is_default(const pool_elem_t *); +extern boolean_t elem_is_tmp(const pool_elem_t *); extern const pool_elem_t *get_default_elem(const pool_elem_t *); extern int qsort_elem_compare(const void *, const void *); @@ -371,6 +371,7 @@ extern pool_value_class_t pool_get_ns_property(const pool_elem_t *, extern int pool_walk_any_properties(pool_conf_t *, pool_elem_t *, void *, int (*)(pool_conf_t *, pool_elem_t *, const char *, pool_value_t *, void *), int); +extern int pool_set_temporary(pool_conf_t *, pool_elem_t *); /* * Namespace aware utility functions. 
diff --git a/usr/src/lib/libpool/common/pool_kernel.c b/usr/src/lib/libpool/common/pool_kernel.c index f84d6f2ba5..3da4f0263c 100644 --- a/usr/src/lib/libpool/common/pool_kernel.c +++ b/usr/src/lib/libpool/common/pool_kernel.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -646,10 +645,14 @@ pool_knl_close(pool_conf_t *conf) } /* * Rollback any pending changes before freeing the prov. This - * ensures there are no memory leaks from pending - * transactions. + * ensures there are no memory leaks from pending transactions. + * However, don't rollback when we've done a temporary pool since the + * pool/resources haven't really been committed in this case. + * They will all be freed in pool_knl_connection_free and we don't + * want to double free them. 
*/ - (void) pool_knl_rollback(conf); + if (!(conf->pc_prov->pc_oflags & PO_TEMP)) + (void) pool_knl_rollback(conf); pool_knl_connection_free(prov); return (PO_SUCCESS); } @@ -997,6 +1000,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location, const char *sep = ""; int j; + if (elem_is_tmp(elem)) + continue; + if ((info.ktx_node = node_create(system, BAD_CAST element_class_tags [pool_elem_class(elem)])) == NULL) { @@ -1072,6 +1078,9 @@ pool_knl_export(const pool_conf_t *conf, const char *location, uint_t ncompelem; int j; + if (elem_is_tmp(elem)) + continue; + if ((info.ktx_node = node_create(system, BAD_CAST element_class_tags [pool_elem_class(elem)])) == NULL) { diff --git a/usr/src/lib/libproject/common/setproject.c b/usr/src/lib/libproject/common/setproject.c index 2303576d32..d22878a36f 100644 --- a/usr/src/lib/libproject/common/setproject.c +++ b/usr/src/lib/libproject/common/setproject.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -536,7 +535,12 @@ setproject_proc(const char *project_name, const char *user_name, int flags, } old_pool_name = pool_get_binding(pid); - if (bind_to_pool(pool_name, pid, 0) != 0) { + + /* + * If parent is not bound to the default pool, then we want + * to preserve same binding as parent. 
+ */ + if (pool_name != NULL && bind_to_pool(pool_name, pid, 0) != 0) { if (old_pool_name) free(old_pool_name); _kva_free(kv_array); diff --git a/usr/src/lib/libzonecfg/Makefile.com b/usr/src/lib/libzonecfg/Makefile.com index b89a44fce3..b64df94527 100644 --- a/usr/src/lib/libzonecfg/Makefile.com +++ b/usr/src/lib/libzonecfg/Makefile.com @@ -32,7 +32,8 @@ OBJECTS= libzonecfg.o getzoneent.o scratchops.o include ../../Makefile.lib LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand +LDLIBS += -lc -lsocket -lnsl -luuid -lnvpair -lsysevent -lsec -lbrand \ + -lpool -lscf -lproc # DYNLIB libraries do not have lint libs and are not linted $(DYNLIB) := LDLIBS += -lxml2 diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c index f4fbcde368..1a3fb37c8c 100644 --- a/usr/src/lib/libzonecfg/common/libzonecfg.c +++ b/usr/src/lib/libzonecfg/common/libzonecfg.c @@ -46,6 +46,10 @@ #include <sys/nvpair.h> #include <sys/types.h> #include <ftw.h> +#include <pool.h> +#include <libscf.h> +#include <libproc.h> +#include <sys/priocntl.h> #include <arpa/inet.h> #include <netdb.h> @@ -79,6 +83,9 @@ #define DTD_ELEM_RCTLVALUE (const xmlChar *) "rctl-value" #define DTD_ELEM_ZONE (const xmlChar *) "zone" #define DTD_ELEM_DATASET (const xmlChar *) "dataset" +#define DTD_ELEM_TMPPOOL (const xmlChar *) "tmp_pool" +#define DTD_ELEM_PSET (const xmlChar *) "pset" +#define DTD_ELEM_MCAP (const xmlChar *) "mcap" #define DTD_ELEM_PACKAGE (const xmlChar *) "package" #define DTD_ELEM_PATCH (const xmlChar *) "patch" #define DTD_ELEM_OBSOLETES (const xmlChar *) "obsoletes" @@ -92,6 +99,7 @@ #define DTD_ATTR_LIMIT (const xmlChar *) "limit" #define DTD_ATTR_LIMITPRIV (const xmlChar *) "limitpriv" #define DTD_ATTR_BOOTARGS (const xmlChar *) "bootargs" +#define DTD_ATTR_SCHED (const xmlChar *) "scheduling-class" #define DTD_ATTR_MATCH (const xmlChar *) "match" #define DTD_ATTR_NAME (const xmlChar *) "name" #define 
DTD_ATTR_PHYSICAL (const xmlChar *) "physical" @@ -102,6 +110,10 @@ #define DTD_ATTR_TYPE (const xmlChar *) "type" #define DTD_ATTR_VALUE (const xmlChar *) "value" #define DTD_ATTR_ZONEPATH (const xmlChar *) "zonepath" +#define DTD_ATTR_NCPU_MIN (const xmlChar *) "ncpu_min" +#define DTD_ATTR_NCPU_MAX (const xmlChar *) "ncpu_max" +#define DTD_ATTR_IMPORTANCE (const xmlChar *) "importance" +#define DTD_ATTR_PHYSCAP (const xmlChar *) "physcap" #define DTD_ATTR_VERSION (const xmlChar *) "version" #define DTD_ATTR_ID (const xmlChar *) "id" #define DTD_ATTR_UID (const xmlChar *) "uid" @@ -133,6 +145,46 @@ #define PATCHINFO "PATCH_INFO_" #define PKGINFO_RD_LEN 128 +#define TMP_POOL_NAME "SUNWtmp_%s" +#define MAX_TMP_POOL_NAME (ZONENAME_MAX + 9) +#define RCAP_SERVICE "system/rcap:default" +#define POOLD_SERVICE "system/pools/dynamic:default" + +/* + * rctl alias definitions + * + * This holds the alias, the full rctl name, the default priv value, action + * and lower limit. The functions that handle rctl aliases step through + * this table, matching on the alias, and using the full values for setting + * the rctl entry as well as the limit for validation. + */ +static struct alias { + char *shortname; + char *realname; + char *priv; + char *action; + uint64_t low_limit; +} aliases[] = { + {ALIAS_MAXLWPS, "zone.max-lwps", "privileged", "deny", 100}, + {ALIAS_MAXSHMMEM, "zone.max-shm-memory", "privileged", "deny", 0}, + {ALIAS_MAXSHMIDS, "zone.max-shm-ids", "privileged", "deny", 0}, + {ALIAS_MAXMSGIDS, "zone.max-msg-ids", "privileged", "deny", 0}, + {ALIAS_MAXSEMIDS, "zone.max-sem-ids", "privileged", "deny", 0}, + {ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0}, + {ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0}, + {ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0}, + {NULL, NULL, NULL, NULL, 0} +}; + +/* + * Structure for applying rctls to a running zone. It allows important + * process values to be passed together easily.
+ */ +typedef struct pr_info_handle { + struct ps_prochandle *pr; + pid_t pid; +} pr_info_handle_t; + struct zone_dochandle { char *zone_dh_rootdir; xmlDocPtr zone_dh_doc; @@ -446,14 +498,20 @@ setrootattr(zone_dochandle_t handle, const xmlChar *propname, int err; xmlNodePtr root; - if (propval == NULL) - return (Z_INVAL); - if ((err = getroot(handle, &root)) != Z_OK) return (err); - if (xmlSetProp(root, propname, (const xmlChar *) propval) == NULL) - return (Z_INVAL); + /* + * If we get a null propval remove the property (ignore return since it + * may not be set to begin with). + */ + if (propval == NULL) { + (void) xmlUnsetProp(root, propname); + } else { + if (xmlSetProp(root, propname, (const xmlChar *) propval) + == NULL) + return (Z_INVAL); + } return (Z_OK); } @@ -947,6 +1005,18 @@ zonecfg_set_bootargs(zone_dochandle_t handle, char *bargs) return (setrootattr(handle, DTD_ATTR_BOOTARGS, bargs)); } +int +zonecfg_get_sched_class(zone_dochandle_t handle, char *sched, size_t schedsize) +{ + return (getrootattr(handle, DTD_ATTR_SCHED, sched, schedsize)); +} + +int +zonecfg_set_sched(zone_dochandle_t handle, char *sched) +{ + return (setrootattr(handle, DTD_ATTR_SCHED, sched)); +} + /* * /etc/zones/index caches a vital piece of information which is also * in the <zonename>.xml file: the path to the zone. 
This is for performance, @@ -3047,6 +3117,30 @@ zonecfg_strerror(int errnum) case Z_BRAND_ERROR: return (dgettext(TEXT_DOMAIN, "Brand-specific error")); + case Z_INCOMPATIBLE: + return (dgettext(TEXT_DOMAIN, "Incompatible settings")); + case Z_ALIAS_DISALLOW: + return (dgettext(TEXT_DOMAIN, + "An incompatible rctl already exists for this property")); + case Z_CLEAR_DISALLOW: + return (dgettext(TEXT_DOMAIN, + "Clearing this property is not allowed")); + case Z_POOL: + return (dgettext(TEXT_DOMAIN, "libpool(3LIB) error")); + case Z_POOLS_NOT_ACTIVE: + return (dgettext(TEXT_DOMAIN, "Pools facility not active; " + "zone will not be bound to pool")); + case Z_POOL_ENABLE: + return (dgettext(TEXT_DOMAIN, + "Could not enable pools facility")); + case Z_NO_POOL: + return (dgettext(TEXT_DOMAIN, + "Pool not found; using default pool")); + case Z_POOL_CREATE: + return (dgettext(TEXT_DOMAIN, + "Could not create a temporary pool")); + case Z_POOL_BIND: + return (dgettext(TEXT_DOMAIN, "Could not bind zone to pool")); default: return (dgettext(TEXT_DOMAIN, "Unknown error")); } @@ -3086,6 +3180,951 @@ zonecfg_endent(zone_dochandle_t handle) return (Z_OK); } +/* + * Do the work required to manipulate a process through libproc. + * If grab_process() returns no errors (0), then release_process() + * must eventually be called. + * + * Return values: + * 0 Successful creation of agent thread + * 1 Error grabbing + * 2 Error creating agent + */ +static int +grab_process(pr_info_handle_t *p) +{ + int ret; + + if ((p->pr = Pgrab(p->pid, 0, &ret)) != NULL) { + + if (Psetflags(p->pr, PR_RLC) != 0) { + Prelease(p->pr, 0); + return (1); + } + if (Pcreate_agent(p->pr) == 0) { + return (0); + + } else { + Prelease(p->pr, 0); + return (2); + } + } else { + return (1); + } +} + +/* + * Release the specified process. This destroys the agent + * and releases the process. If the process is NULL, nothing + * is done. 
This function should only be called if grab_process() + * has previously been called and returned success. + * + * This function is Pgrab-safe. + */ +static void +release_process(struct ps_prochandle *Pr) +{ + if (Pr == NULL) + return; + + Pdestroy_agent(Pr); + Prelease(Pr, 0); +} + +static boolean_t +grab_zone_proc(char *zonename, pr_info_handle_t *p) +{ + DIR *dirp; + struct dirent *dentp; + zoneid_t zoneid; + int pid_self; + psinfo_t psinfo; + + if (zone_get_id(zonename, &zoneid) != 0) + return (B_FALSE); + + pid_self = getpid(); + + if ((dirp = opendir("/proc")) == NULL) + return (B_FALSE); + + while (dentp = readdir(dirp)) { + p->pid = atoi(dentp->d_name); + + /* Skip self */ + if (p->pid == pid_self) + continue; + + if (proc_get_psinfo(p->pid, &psinfo) != 0) + continue; + + if (psinfo.pr_zoneid != zoneid) + continue; + + /* attempt to grab process */ + if (grab_process(p) != 0) + continue; + + if (pr_getzoneid(p->pr) != zoneid) { + release_process(p->pr); + continue; + } + + (void) closedir(dirp); + return (B_TRUE); + } + + (void) closedir(dirp); + return (B_FALSE); +} + +static boolean_t +get_priv_rctl(struct ps_prochandle *pr, char *name, rctlblk_t *rblk) +{ + if (pr_getrctl(pr, name, NULL, rblk, RCTL_FIRST)) + return (B_FALSE); + + if (rctlblk_get_privilege(rblk) == RCPRIV_PRIVILEGED) + return (B_TRUE); + + while (pr_getrctl(pr, name, rblk, rblk, RCTL_NEXT) == 0) { + if (rctlblk_get_privilege(rblk) == RCPRIV_PRIVILEGED) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Apply the current rctl settings to the specified, running zone. 
+ */ +int +zonecfg_apply_rctls(char *zone_name, zone_dochandle_t handle) +{ + int err; + int res = Z_OK; + rctlblk_t *rblk; + pr_info_handle_t p; + struct zone_rctltab rctl; + + if ((err = zonecfg_setrctlent(handle)) != Z_OK) + return (err); + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) { + (void) zonecfg_endrctlent(handle); + return (Z_NOMEM); + } + + if (!grab_zone_proc(zone_name, &p)) { + (void) zonecfg_endrctlent(handle); + free(rblk); + return (Z_SYSTEM); + } + + while (zonecfg_getrctlent(handle, &rctl) == Z_OK) { + char *rname; + struct zone_rctlvaltab *valptr; + + rname = rctl.zone_rctl_name; + + /* first delete all current privileged settings for this rctl */ + while (get_priv_rctl(p.pr, rname, rblk)) { + if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_DELETE) != + 0) { + res = Z_SYSTEM; + goto done; + } + } + + /* now set each new value for the rctl */ + for (valptr = rctl.zone_rctl_valptr; valptr != NULL; + valptr = valptr->zone_rctlval_next) { + if ((err = zonecfg_construct_rctlblk(valptr, rblk)) + != Z_OK) { + res = errno = err; + goto done; + } + + if (pr_setrctl(p.pr, rname, NULL, rblk, RCTL_INSERT)) { + res = Z_SYSTEM; + goto done; + } + } + } + +done: + release_process(p.pr); + free(rblk); + (void) zonecfg_endrctlent(handle); + + return (res); +} + +static const xmlChar * +nm_to_dtd(char *nm) +{ + if (strcmp(nm, "device") == 0) + return (DTD_ELEM_DEVICE); + if (strcmp(nm, "fs") == 0) + return (DTD_ELEM_FS); + if (strcmp(nm, "inherit-pkg-dir") == 0) + return (DTD_ELEM_IPD); + if (strcmp(nm, "net") == 0) + return (DTD_ELEM_NET); + if (strcmp(nm, "attr") == 0) + return (DTD_ELEM_ATTR); + if (strcmp(nm, "rctl") == 0) + return (DTD_ELEM_RCTL); + if (strcmp(nm, "dataset") == 0) + return (DTD_ELEM_DATASET); + + return (NULL); +} + +int +zonecfg_num_resources(zone_dochandle_t handle, char *rsrc) +{ + int num = 0; + const xmlChar *dtd; + xmlNodePtr cur; + + if ((dtd = nm_to_dtd(rsrc)) == NULL) + return (num); + + if (zonecfg_setent(handle) != 
Z_OK) + return (num); + + for (cur = handle->zone_dh_cur; cur != NULL; cur = cur->next) + if (xmlStrcmp(cur->name, dtd) == 0) + num++; + + (void) zonecfg_endent(handle); + + return (num); +} + +int +zonecfg_del_all_resources(zone_dochandle_t handle, char *rsrc) +{ + int err; + const xmlChar *dtd; + xmlNodePtr cur; + + if ((dtd = nm_to_dtd(rsrc)) == NULL) + return (Z_NO_RESOURCE_TYPE); + + if ((err = zonecfg_setent(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + while (cur != NULL) { + xmlNodePtr tmp; + + if (xmlStrcmp(cur->name, dtd)) { + cur = cur->next; + continue; + } + + tmp = cur->next; + xmlUnlinkNode(cur); + xmlFreeNode(cur); + cur = tmp; + } + + (void) zonecfg_endent(handle); + return (Z_OK); +} + +static boolean_t +valid_uint(char *s, uint64_t *n) +{ + char *endp; + + /* strtoull accepts '-'?! so we want to flag that as an error */ + if (strchr(s, '-') != NULL) + return (B_FALSE); + + errno = 0; + *n = strtoull(s, &endp, 10); + + if (errno != 0 || *endp != '\0') + return (B_FALSE); + return (B_TRUE); +} + +/* + * Convert a string representing a number (possibly a fraction) into an integer. + * The string can have a modifier (K, M, G or T). The modifiers are treated + * as powers of two (not 10). 
+ */ +int +zonecfg_str_to_bytes(char *str, uint64_t *bytes) +{ + long double val; + char *unitp; + uint64_t scale; + + if ((val = strtold(str, &unitp)) < 0) + return (-1); + + /* remove any leading white space from units string */ + while (isspace(*unitp) != 0) + ++unitp; + + /* if no units explicitly set, error */ + if (unitp == NULL || *unitp == '\0') { + scale = 1; + } else { + int i; + char *units[] = {"K", "M", "G", "T", NULL}; + + scale = 1024; + + /* update scale based on units */ + for (i = 0; units[i] != NULL; i++) { + if (strcasecmp(unitp, units[i]) == 0) + break; + scale <<= 10; + } + + if (units[i] == NULL) + return (-1); + } + + *bytes = (uint64_t)(val * scale); + return (0); +} + +boolean_t +zonecfg_valid_ncpus(char *lowstr, char *highstr) +{ + uint64_t low, high; + + if (!valid_uint(lowstr, &low) || !valid_uint(highstr, &high) || + low < 1 || low > high) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_importance(char *impstr) +{ + uint64_t num; + + if (!valid_uint(impstr, &num)) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_alias_limit(char *name, char *limitstr, uint64_t *limit) +{ + int i; + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (B_FALSE); + + if (!valid_uint(limitstr, limit) || *limit < aliases[i].low_limit) + return (B_FALSE); + + return (B_TRUE); +} + +boolean_t +zonecfg_valid_memlimit(char *memstr, uint64_t *mem_val) +{ + if (zonecfg_str_to_bytes(memstr, mem_val) != 0) + return (B_FALSE); + + return (B_TRUE); +} + +static int +zerr_pool(char *pool_err, int err_size, int res) +{ + (void) strlcpy(pool_err, pool_strerror(pool_error()), err_size); + return (res); +} + +static int +create_tmp_pset(char *pool_err, int err_size, pool_conf_t *pconf, pool_t *pool, + char *name, int min, int max) +{ + pool_resource_t *res; + pool_elem_t *elem; + pool_value_t *val; + + if ((res = 
pool_resource_create(pconf, "pset", name)) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if (pool_associate(pconf, pool, res) != PO_SUCCESS) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if ((elem = pool_resource_to_elem(pconf, res)) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if ((val = pool_value_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + /* set the maximum number of cpus for the pset */ + pool_value_set_uint64(val, (uint64_t)max); + + if (pool_put_property(pconf, elem, "pset.max", val) != PO_SUCCESS) { + pool_value_free(val); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + + /* set the minimum number of cpus for the pset */ + pool_value_set_uint64(val, (uint64_t)min); + + if (pool_put_property(pconf, elem, "pset.min", val) != PO_SUCCESS) { + pool_value_free(val); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + + pool_value_free(val); + + return (Z_OK); +} + +static int +create_tmp_pool(char *pool_err, int err_size, pool_conf_t *pconf, char *name, + struct zone_psettab *pset_tab) +{ + pool_t *pool; + int res = Z_OK; + + /* create a temporary pool configuration */ + if (pool_conf_open(pconf, NULL, PO_TEMP) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + return (res); + } + + if ((pool = pool_create(pconf, name)) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL_CREATE); + goto done; + } + + /* set pool importance */ + if (pset_tab->zone_importance[0] != '\0') { + pool_elem_t *elem; + pool_value_t *val; + + if ((elem = pool_to_elem(pconf, pool)) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if ((val = pool_value_alloc()) == NULL) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + pool_value_set_int64(val, + (int64_t)atoi(pset_tab->zone_importance)); + + if (pool_put_property(pconf, elem, "pool.importance", val) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + pool_value_free(val); + 
goto done; + } + + pool_value_free(val); + } + + if ((res = create_tmp_pset(pool_err, err_size, pconf, pool, name, + atoi(pset_tab->zone_ncpu_min), + atoi(pset_tab->zone_ncpu_max))) != Z_OK) + goto done; + + /* validation */ + if (pool_conf_status(pconf) == POF_INVALID) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* + * This validation is the one we expect to fail if the user specified + * an invalid configuration (too many cpus) for this system. + */ + if (pool_conf_validate(pconf, POV_RUNTIME) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL_CREATE); + goto done; + } + + /* + * Commit the dynamic configuration but not the pool configuration + * file. + */ + if (pool_conf_commit(pconf, 1) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + +done: + (void) pool_conf_close(pconf); + return (res); +} + +static int +get_running_tmp_pset(pool_conf_t *pconf, pool_t *pool, pool_resource_t *pset, + struct zone_psettab *pset_tab) +{ + int nfound = 0; + pool_elem_t *pe; + pool_value_t *pv = pool_value_alloc(); + uint64_t val_uint; + + if (pool != NULL) { + pe = pool_to_elem(pconf, pool); + if (pool_get_property(pconf, pe, "pool.importance", pv) + != POC_INVAL) { + int64_t val_int; + + (void) pool_value_get_int64(pv, &val_int); + (void) snprintf(pset_tab->zone_importance, + sizeof (pset_tab->zone_importance), "%d", val_int); + nfound++; + } + } + + if (pset != NULL) { + pe = pool_resource_to_elem(pconf, pset); + if (pool_get_property(pconf, pe, "pset.min", pv) != POC_INVAL) { + (void) pool_value_get_uint64(pv, &val_uint); + (void) snprintf(pset_tab->zone_ncpu_min, + sizeof (pset_tab->zone_ncpu_min), "%u", val_uint); + nfound++; + } + + if (pool_get_property(pconf, pe, "pset.max", pv) != POC_INVAL) { + (void) pool_value_get_uint64(pv, &val_uint); + (void) snprintf(pset_tab->zone_ncpu_max, + sizeof (pset_tab->zone_ncpu_max), "%u", val_uint); + nfound++; + } + } + + pool_value_free(pv); + + if (nfound == 3) + return 
(PO_SUCCESS); + + return (PO_FAIL); +} + +/* + * Determine if a tmp pool is configured and if so, if the configuration is + * still valid or if it has been changed since the tmp pool was created. + * If the tmp pool configuration is no longer valid, delete the tmp pool. + * + * Set *valid=B_TRUE if there is an existing, valid tmp pool configuration. + */ +static int +verify_del_tmp_pool(pool_conf_t *pconf, char *tmp_name, char *pool_err, + int err_size, struct zone_psettab *pset_tab, boolean_t *exists) +{ + int res = Z_OK; + pool_t *pool; + pool_resource_t *pset; + struct zone_psettab pset_current; + + *exists = B_FALSE; + + if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + return (res); + } + + pool = pool_get_pool(pconf, tmp_name); + pset = pool_get_resource(pconf, "pset", tmp_name); + + if (pool == NULL && pset == NULL) { + /* no tmp pool configured */ + goto done; + } + + /* + * If an existing tmp pool for this zone is configured with the proper + * settings, then the tmp pool is valid. + */ + if (get_running_tmp_pset(pconf, pool, pset, &pset_current) + == PO_SUCCESS && + strcmp(pset_tab->zone_ncpu_min, + pset_current.zone_ncpu_min) == 0 && + strcmp(pset_tab->zone_ncpu_max, + pset_current.zone_ncpu_max) == 0 && + strcmp(pset_tab->zone_importance, + pset_current.zone_importance) == 0) { + *exists = B_TRUE; + + } else { + /* + * An out-of-date tmp pool configuration exists. Delete it + * so that we can create the correct tmp pool config. 
+ */ + if (pset != NULL && + pool_resource_destroy(pconf, pset) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if (pool != NULL && + pool_destroy(pconf, pool) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* commit dynamic config */ + if (pool_conf_commit(pconf, 0) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + } + +done: + (void) pool_conf_close(pconf); + + return (res); +} + +/* + * Destroy any existing tmp pool. + */ +int +zonecfg_destroy_tmp_pool(char *zone_name, char *pool_err, int err_size) +{ + int status; + int res = Z_OK; + pool_conf_t *pconf; + pool_t *pool; + pool_resource_t *pset; + char tmp_name[MAX_TMP_POOL_NAME]; + + /* if pools not enabled then nothing to do */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_OK); + + if ((pconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + (void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name); + + if (pool_conf_open(pconf, pool_dynamic_location(), PO_RDWR) + != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + pool_conf_free(pconf); + return (res); + } + + pool = pool_get_pool(pconf, tmp_name); + pset = pool_get_resource(pconf, "pset", tmp_name); + + if (pool == NULL && pset == NULL) { + /* nothing to destroy, we're done */ + goto done; + } + + if (pset != NULL && pool_resource_destroy(pconf, pset) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + if (pool != NULL && pool_destroy(pconf, pool) != PO_SUCCESS) { + res = zerr_pool(pool_err, err_size, Z_POOL); + goto done; + } + + /* commit dynamic config */ + if (pool_conf_commit(pconf, 0) != PO_SUCCESS) + res = zerr_pool(pool_err, err_size, Z_POOL); + +done: + (void) pool_conf_close(pconf); + pool_conf_free(pconf); + + return (res); +} + +/* + * Attempt to bind to a tmp pool for this zone. 
If there is no tmp pool + * configured, we just return Z_OK. + * + * We either attempt to create the tmp pool for this zone or rebind to an + * existing tmp pool for this zone. + * + * Rebinding is used when a zone with a tmp pool reboots so that we don't have + * to recreate the tmp pool. To do this we need to be sure we work correctly + * for the following cases: + * + * - there is an existing, properly configured tmp pool. + * - zonecfg added tmp pool after zone was booted, must now create. + * - zonecfg updated tmp pool config after zone was booted, in this case + * we destroy the old tmp pool and create a new one. + */ +int +zonecfg_bind_tmp_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err, + int err_size) +{ + struct zone_psettab pset_tab; + int err; + int status; + pool_conf_t *pconf; + boolean_t exists; + char zone_name[ZONENAME_MAX]; + char tmp_name[MAX_TMP_POOL_NAME]; + + (void) getzonenamebyid(zoneid, zone_name, sizeof (zone_name)); + + err = zonecfg_lookup_pset(handle, &pset_tab); + + /* if no temporary pool configured, we're done */ + if (err == Z_NO_ENTRY) + return (Z_OK); + + /* + * importance might not have a value but we need to validate it here, + * so set the default. + */ + if (pset_tab.zone_importance[0] == '\0') + (void) strlcpy(pset_tab.zone_importance, "1", + sizeof (pset_tab.zone_importance)); + + /* if pools not enabled, enable them now */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) { + if (pool_set_status(POOL_ENABLED) != PO_SUCCESS) + return (Z_POOL_ENABLE); + } + + if ((pconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + (void) snprintf(tmp_name, sizeof (tmp_name), TMP_POOL_NAME, zone_name); + + /* + * Check if a valid tmp pool/pset already exists. If so, we just + * reuse it. 
+ */ + if ((err = verify_del_tmp_pool(pconf, tmp_name, pool_err, err_size, + &pset_tab, &exists)) != Z_OK) { + pool_conf_free(pconf); + return (err); + } + + if (!exists) + err = create_tmp_pool(pool_err, err_size, pconf, tmp_name, + &pset_tab); + + pool_conf_free(pconf); + + if (err != Z_OK) + return (err); + + /* Bind the zone to the pool. */ + if (pool_set_binding(tmp_name, P_ZONEID, zoneid) != PO_SUCCESS) + return (zerr_pool(pool_err, err_size, Z_POOL_BIND)); + + return (Z_OK); +} + +/* + * Attempt to bind to a permanent pool for this zone. If there is no + * permanent pool configured, we just return Z_OK. + */ +int +zonecfg_bind_pool(zone_dochandle_t handle, zoneid_t zoneid, char *pool_err, + int err_size) +{ + pool_conf_t *poolconf; + pool_t *pool; + char poolname[MAXPATHLEN]; + int status; + int error; + + /* + * Find the pool mentioned in the zone configuration, and bind to it. + */ + error = zonecfg_get_pool(handle, poolname, sizeof (poolname)); + if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) { + /* + * The property is not set on the zone, so the pool + * should be bound to the default pool. But that's + * already done by the kernel, so we can just return. + */ + return (Z_OK); + } + if (error != Z_OK) { + /* + * Not an error, even though it shouldn't be happening. + */ + return (Z_OK); + } + /* + * Don't do anything if pools aren't enabled. + */ + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_POOLS_NOT_ACTIVE); + + /* + * Try to provide a sane error message if the requested pool doesn't + * exist. 
+ */ + if ((poolconf = pool_conf_alloc()) == NULL) + return (zerr_pool(pool_err, err_size, Z_POOL)); + + if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != + PO_SUCCESS) { + pool_conf_free(poolconf); + return (zerr_pool(pool_err, err_size, Z_POOL)); + } + pool = pool_get_pool(poolconf, poolname); + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + if (pool == NULL) + return (Z_NO_POOL); + + /* + * Bind the zone to the pool. + */ + if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) { + /* if bind fails, return poolname for the error msg */ + (void) strlcpy(pool_err, poolname, err_size); + return (Z_POOL_BIND); + } + + return (Z_OK); +} + + +static boolean_t +svc_enabled(char *svc_name) +{ + scf_simple_prop_t *prop; + boolean_t found = B_FALSE; + + prop = scf_simple_prop_get(NULL, svc_name, SCF_PG_GENERAL, + SCF_PROPERTY_ENABLED); + + if (scf_simple_prop_numvalues(prop) == 1 && + *scf_simple_prop_next_boolean(prop) != 0) + found = B_TRUE; + + scf_simple_prop_free(prop); + + return (found); +} + +/* + * If the zone has capped-memory, make sure the rcap service is enabled. + */ +int +zonecfg_enable_rcapd(char *err, int size) +{ + if (!svc_enabled(RCAP_SERVICE) && + smf_enable_instance(RCAP_SERVICE, 0) == -1) { + (void) strlcpy(err, scf_strerror(scf_error()), size); + return (Z_SYSTEM); + } + + return (Z_OK); +} + +/* + * Return true if pset has cpu range specified and poold is not enabled. 
+ */ +boolean_t +zonecfg_warn_poold(zone_dochandle_t handle) +{ + struct zone_psettab pset_tab; + int min, max; + int err; + + err = zonecfg_lookup_pset(handle, &pset_tab); + + /* if no temporary pool configured, we're done */ + if (err == Z_NO_ENTRY) + return (B_FALSE); + + min = atoi(pset_tab.zone_ncpu_min); + max = atoi(pset_tab.zone_ncpu_max); + + /* range not specified, no need for poold */ + if (min == max) + return (B_FALSE); + + /* we have a range, check if poold service is enabled */ + if (svc_enabled(POOLD_SERVICE)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +get_pool_sched_class(char *poolname, char *class, int clsize) +{ + int status; + pool_conf_t *poolconf; + pool_t *pool; + pool_elem_t *pe; + pool_value_t *pv = pool_value_alloc(); + const char *sched_str; + + if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) + return (Z_NO_POOL); + + if ((poolconf = pool_conf_alloc()) == NULL) + return (Z_NO_POOL); + + if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) != + PO_SUCCESS) { + pool_conf_free(poolconf); + return (Z_NO_POOL); + } + + if ((pool = pool_get_pool(poolconf, poolname)) == NULL) { + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + return (Z_NO_POOL); + } + + pe = pool_to_elem(poolconf, pool); + if (pool_get_property(poolconf, pe, "pool.scheduler", pv) + != POC_INVAL) { + (void) pool_value_get_string(pv, &sched_str); + if (strlcpy(class, sched_str, clsize) >= clsize) + return (Z_TOO_BIG); + } + + (void) pool_conf_close(poolconf); + pool_conf_free(poolconf); + return (Z_OK); +} + +/* + * Get the default scheduling class for the zone. This will either be the + * class set on the zone's pool or the system default scheduling class. 
+ */ +int +zonecfg_get_dflt_sched_class(zone_dochandle_t handle, char *class, int clsize) +{ + char poolname[MAXPATHLEN]; + + if (zonecfg_get_pool(handle, poolname, sizeof (poolname)) == Z_OK) { + /* check if the zone's pool specified a sched class */ + if (get_pool_sched_class(poolname, class, clsize) == Z_OK) + return (Z_OK); + } + + if (priocntl(0, 0, PC_GETDFLCL, class, (uint64_t)clsize) == -1) + return (Z_TOO_BIG); + + return (Z_OK); +} + int zonecfg_setfsent(zone_dochandle_t handle) { @@ -4825,6 +5864,509 @@ zonecfg_enddsent(zone_dochandle_t handle) return (zonecfg_endent(handle)); } +/* + * Support for aliased rctls; that is, rctls that have simplified names in + * zonecfg. For example, max-lwps is an alias for a well defined zone.max-lwps + * rctl. If there are multiple existing values for one of these rctls or if + * there is a single value that does not match the well defined template (i.e. + * it has a different action) then we cannot treat the rctl as having an alias + * so we return Z_ALIAS_DISALLOW. That means that the rctl cannot be + * managed in zonecfg via an alias and that the standard rctl syntax must be + * used. 
+ * + * The possible return values are: + * Z_NO_PROPERTY_ID - invalid alias name + * Z_ALIAS_DISALLOW - pre-existing, incompatible rctl definition + * Z_NO_ENTRY - no rctl is configured for this alias + * Z_OK - we got a valid rctl for the specified alias + */ +int +zonecfg_get_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t *rval) +{ + boolean_t found = B_FALSE; + boolean_t found_val = B_FALSE; + xmlNodePtr cur, val; + char savedname[MAXNAMELEN]; + struct zone_rctlvaltab rctl; + int i; + int err; + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_PROPERTY_ID); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_RCTL) != 0) + continue; + if ((fetchprop(cur, DTD_ATTR_NAME, savedname, + sizeof (savedname)) == Z_OK) && + (strcmp(savedname, aliases[i].realname) == 0)) { + + /* + * If we already saw one of these, we can't have an + * alias since we just found another. + */ + if (found) + return (Z_ALIAS_DISALLOW); + found = B_TRUE; + + for (val = cur->xmlChildrenNode; val != NULL; + val = val->next) { + /* + * If we already have one value, we can't have + * an alias since we just found another. 
+ */ + if (found_val) + return (Z_ALIAS_DISALLOW); + found_val = B_TRUE; + + if ((fetchprop(val, DTD_ATTR_PRIV, + rctl.zone_rctlval_priv, + sizeof (rctl.zone_rctlval_priv)) != Z_OK)) + break; + if ((fetchprop(val, DTD_ATTR_LIMIT, + rctl.zone_rctlval_limit, + sizeof (rctl.zone_rctlval_limit)) != Z_OK)) + break; + if ((fetchprop(val, DTD_ATTR_ACTION, + rctl.zone_rctlval_action, + sizeof (rctl.zone_rctlval_action)) != Z_OK)) + break; + } + + /* check priv and action match the expected vals */ + if (strcmp(rctl.zone_rctlval_priv, + aliases[i].priv) != 0 || + strcmp(rctl.zone_rctlval_action, + aliases[i].action) != 0) + return (Z_ALIAS_DISALLOW); + } + } + + if (found) { + *rval = strtoull(rctl.zone_rctlval_limit, NULL, 10); + return (Z_OK); + } + + return (Z_NO_ENTRY); +} + +int +zonecfg_rm_aliased_rctl(zone_dochandle_t handle, char *name) +{ + int i; + uint64_t val; + struct zone_rctltab rctltab; + + /* + * First check that we have a valid aliased rctl to remove. + * This will catch an rctl entry with non-standard values or + * multiple rctl values for this name. We need to ignore those + * rctl entries. 
+ */ + if (zonecfg_get_aliased_rctl(handle, name, &val) != Z_OK) + return (Z_OK); + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_RESOURCE_ID); + + (void) strlcpy(rctltab.zone_rctl_name, aliases[i].realname, + sizeof (rctltab.zone_rctl_name)); + + return (zonecfg_delete_rctl(handle, &rctltab)); +} + +boolean_t +zonecfg_aliased_rctl_ok(zone_dochandle_t handle, char *name) +{ + uint64_t tmp_val; + + switch (zonecfg_get_aliased_rctl(handle, name, &tmp_val)) { + case Z_OK: + /*FALLTHRU*/ + case Z_NO_ENTRY: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +int +zonecfg_set_aliased_rctl(zone_dochandle_t handle, char *name, uint64_t val) +{ + int i; + int err; + struct zone_rctltab rctltab; + struct zone_rctlvaltab *rctlvaltab; + char buf[128]; + + if (!zonecfg_aliased_rctl_ok(handle, name)) + return (Z_ALIAS_DISALLOW); + + for (i = 0; aliases[i].shortname != NULL; i++) + if (strcmp(name, aliases[i].shortname) == 0) + break; + + if (aliases[i].shortname == NULL) + return (Z_NO_RESOURCE_ID); + + /* remove any pre-existing definition for this rctl */ + (void) zonecfg_rm_aliased_rctl(handle, name); + + (void) strlcpy(rctltab.zone_rctl_name, aliases[i].realname, + sizeof (rctltab.zone_rctl_name)); + + rctltab.zone_rctl_valptr = NULL; + + if ((rctlvaltab = calloc(1, sizeof (struct zone_rctlvaltab))) == NULL) + return (Z_NOMEM); + + (void) snprintf(buf, sizeof (buf), "%llu", (long long)val); + + (void) strlcpy(rctlvaltab->zone_rctlval_priv, aliases[i].priv, + sizeof (rctlvaltab->zone_rctlval_priv)); + (void) strlcpy(rctlvaltab->zone_rctlval_limit, buf, + sizeof (rctlvaltab->zone_rctlval_limit)); + (void) strlcpy(rctlvaltab->zone_rctlval_action, aliases[i].action, + sizeof (rctlvaltab->zone_rctlval_action)); + + rctlvaltab->zone_rctlval_next = NULL; + + if ((err = zonecfg_add_rctl_value(&rctltab, rctlvaltab)) != Z_OK) + return (err); + + return 
(zonecfg_add_rctl(handle, &rctltab)); +} + +static int +delete_tmp_pool(zone_dochandle_t handle) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_TMPPOOL) == 0) { + xmlUnlinkNode(cur); + xmlFreeNode(cur); + return (Z_OK); + } + } + + return (Z_NO_RESOURCE_ID); +} + +static int +modify_tmp_pool(zone_dochandle_t handle, char *pool_importance) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + xmlNodePtr newnode; + + err = delete_tmp_pool(handle); + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + if (*pool_importance != '\0') { + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_TMPPOOL, NULL); + if ((err = newprop(newnode, DTD_ATTR_IMPORTANCE, + pool_importance)) != Z_OK) + return (err); + } + + return (Z_OK); +} + +static int +add_pset_core(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + xmlNodePtr newnode, cur = handle->zone_dh_cur; + int err; + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_PSET, NULL); + if ((err = newprop(newnode, DTD_ATTR_NCPU_MIN, + tabptr->zone_ncpu_min)) != Z_OK) + return (err); + if ((err = newprop(newnode, DTD_ATTR_NCPU_MAX, + tabptr->zone_ncpu_max)) != Z_OK) + return (err); + + if ((err = modify_tmp_pool(handle, tabptr->zone_importance)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_add_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + if ((err = add_pset_core(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_delete_pset(zone_dochandle_t handle) +{ + int err; + int res = Z_NO_RESOURCE_ID; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for 
(cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_PSET) == 0) { + xmlUnlinkNode(cur); + xmlFreeNode(cur); + res = Z_OK; + break; + } + } + + /* + * Once we have msets, we should check that a mset + * do not exist before we delete the tmp_pool data. + */ + err = delete_tmp_pool(handle); + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + return (res); +} + +int +zonecfg_modify_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = zonecfg_delete_pset(handle)) != Z_OK) + return (err); + + if ((err = add_pset_core(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_lookup_pset(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + xmlNodePtr cur; + int err; + int res = Z_NO_ENTRY; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + /* this is an optional component */ + tabptr->zone_importance[0] = '\0'; + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_PSET) == 0) { + if ((err = fetchprop(cur, DTD_ATTR_NCPU_MIN, + tabptr->zone_ncpu_min, + sizeof (tabptr->zone_ncpu_min))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + if ((err = fetchprop(cur, DTD_ATTR_NCPU_MAX, + tabptr->zone_ncpu_max, + sizeof (tabptr->zone_ncpu_max))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + res = Z_OK; + + } else if (xmlStrcmp(cur->name, DTD_ELEM_TMPPOOL) == 0) { + if ((err = fetchprop(cur, DTD_ATTR_IMPORTANCE, + tabptr->zone_importance, + sizeof (tabptr->zone_importance))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + } + } + + return (res); +} + +int +zonecfg_getpsetent(zone_dochandle_t handle, struct zone_psettab *tabptr) +{ + int err; + + if ((err = zonecfg_setent(handle)) != Z_OK) + 
return (err); + + err = zonecfg_lookup_pset(handle, tabptr); + + (void) zonecfg_endent(handle); + + return (err); +} + +static int +add_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr newnode, cur = handle->zone_dh_cur; + int err; + + newnode = xmlNewTextChild(cur, NULL, DTD_ELEM_MCAP, NULL); + if ((err = newprop(newnode, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap)) + != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_delete_mcap(zone_dochandle_t handle) +{ + int err; + xmlNodePtr cur = handle->zone_dh_cur; + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) != 0) + continue; + + xmlUnlinkNode(cur); + xmlFreeNode(cur); + return (Z_OK); + } + return (Z_NO_RESOURCE_ID); +} + +int +zonecfg_modify_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + err = zonecfg_delete_mcap(handle); + /* it is ok if there is no mcap entry */ + if (err != Z_OK && err != Z_NO_RESOURCE_ID) + return (err); + + if ((err = add_mcap(handle, tabptr)) != Z_OK) + return (err); + + return (Z_OK); +} + +int +zonecfg_lookup_mcap(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr cur; + int err; + + if (tabptr == NULL) + return (Z_INVAL); + + if ((err = operation_prep(handle)) != Z_OK) + return (err); + + cur = handle->zone_dh_cur; + for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) != 0) + continue; + if ((err = fetchprop(cur, DTD_ATTR_PHYSCAP, + tabptr->zone_physmem_cap, + sizeof (tabptr->zone_physmem_cap))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + return (Z_OK); + } + + return (Z_NO_ENTRY); +} + +static int +getmcapent_core(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + xmlNodePtr cur; + int err; + + if (handle == NULL) + return (Z_INVAL); + + 
if ((cur = handle->zone_dh_cur) == NULL) + return (Z_NO_ENTRY); + + for (; cur != NULL; cur = cur->next) + if (xmlStrcmp(cur->name, DTD_ELEM_MCAP) == 0) + break; + if (cur == NULL) { + handle->zone_dh_cur = handle->zone_dh_top; + return (Z_NO_ENTRY); + } + + if ((err = fetchprop(cur, DTD_ATTR_PHYSCAP, tabptr->zone_physmem_cap, + sizeof (tabptr->zone_physmem_cap))) != Z_OK) { + handle->zone_dh_cur = handle->zone_dh_top; + return (err); + } + + handle->zone_dh_cur = cur->next; + return (Z_OK); +} + +int +zonecfg_getmcapent(zone_dochandle_t handle, struct zone_mcaptab *tabptr) +{ + int err; + + if ((err = zonecfg_setent(handle)) != Z_OK) + return (err); + + err = getmcapent_core(handle, tabptr); + + (void) zonecfg_endent(handle); + + return (err); +} + int zonecfg_setpkgent(zone_dochandle_t handle) { diff --git a/usr/src/lib/libzonecfg/common/mapfile-vers b/usr/src/lib/libzonecfg/common/mapfile-vers index a9d59548d3..e2bb782688 100644 --- a/usr/src/lib/libzonecfg/common/mapfile-vers +++ b/usr/src/lib/libzonecfg/common/mapfile-vers @@ -40,10 +40,15 @@ SUNWprivate_1.1 { zonecfg_add_fs_option; zonecfg_add_ipd; zonecfg_add_nwif; + zonecfg_add_pset; zonecfg_add_rctl; zonecfg_add_rctl_value; zonecfg_add_scratch; + zonecfg_aliased_rctl_ok; + zonecfg_apply_rctls; zonecfg_attach_manifest; + zonecfg_bind_pool; + zonecfg_bind_tmp_pool; zonecfg_check_handle; zonecfg_close_scratch; zonecfg_construct_rctlblk; @@ -54,15 +59,20 @@ SUNWprivate_1.1 { zonecfg_delete_ds; zonecfg_delete_filesystem; zonecfg_delete_ipd; + zonecfg_delete_mcap; zonecfg_delete_nwif; + zonecfg_delete_pset; zonecfg_delete_rctl; zonecfg_delete_scratch; + zonecfg_del_all_resources; zonecfg_destroy; zonecfg_destroy_snapshot; + zonecfg_destroy_tmp_pool; zonecfg_detached; zonecfg_detach_save; zonecfg_devperms_apply; zonecfg_devwalk; + zonecfg_enable_rcapd; zonecfg_endattrent; zonecfg_enddevent; zonecfg_enddevperment; @@ -78,6 +88,7 @@ SUNWprivate_1.1 { zonecfg_fini_handle; zonecfg_free_fs_option_list; 
zonecfg_free_rctl_value_list; + zonecfg_get_aliased_rctl; zonecfg_get_attach_handle; zonecfg_get_attr_boolean; zonecfg_getattrent; @@ -88,6 +99,7 @@ SUNWprivate_1.1 { zonecfg_get_bootargs; zonecfg_get_brand; zonecfg_get_detach_info; + zonecfg_get_dflt_sched_class; zonecfg_getdevent; zonecfg_getdevperment; zonecfg_getdsent; @@ -95,6 +107,7 @@ SUNWprivate_1.1 { zonecfg_get_handle; zonecfg_getipdent; zonecfg_get_limitpriv; + zonecfg_getmcapent; zonecfg_get_name; zonecfg_get_name_by_uuid; zonecfg_getnwifent; @@ -102,8 +115,10 @@ SUNWprivate_1.1 { zonecfg_getpkgent; zonecfg_get_pool; zonecfg_get_privset; + zonecfg_getpsetent; zonecfg_getrctlent; zonecfg_get_root; + zonecfg_get_sched_class; zonecfg_get_scratch; zonecfg_get_snapshot_handle; zonecfg_get_template_handle; @@ -120,28 +135,35 @@ SUNWprivate_1.1 { zonecfg_lookup_ds; zonecfg_lookup_filesystem; zonecfg_lookup_ipd; + zonecfg_lookup_mcap; zonecfg_lookup_nwif; + zonecfg_lookup_pset; zonecfg_lookup_rctl; zonecfg_modify_attr; zonecfg_modify_dev; zonecfg_modify_ds; zonecfg_modify_filesystem; zonecfg_modify_ipd; + zonecfg_modify_mcap; zonecfg_modify_nwif; + zonecfg_modify_pset; zonecfg_modify_rctl; zonecfg_notify_bind; zonecfg_notify_critical_abort; zonecfg_notify_critical_enter; zonecfg_notify_critical_exit; zonecfg_notify_unbind; + zonecfg_num_resources; zonecfg_open_scratch; zonecfg_remove_fs_option; zonecfg_remove_rctl_value; zonecfg_reverse_scratch; + zonecfg_rm_aliased_rctl; zonecfg_rm_detached; zonecfg_same_net_address; zonecfg_save; zonecfg_setattrent; + zonecfg_set_aliased_rctl; zonecfg_set_autoboot; zonecfg_set_bootargs; zonecfg_set_brand; @@ -158,15 +180,22 @@ SUNWprivate_1.1 { zonecfg_set_pool; zonecfg_setrctlent; zonecfg_set_root; + zonecfg_set_sched; zonecfg_set_zonepath; zonecfg_strerror; + zonecfg_str_to_bytes; zonecfg_validate_zonename; + zonecfg_valid_alias_limit; zonecfg_valid_fs_type; + zonecfg_valid_importance; + zonecfg_valid_memlimit; + zonecfg_valid_ncpus; zonecfg_valid_net_address; 
zonecfg_valid_rctl; zonecfg_valid_rctlblk; zonecfg_valid_rctlname; zonecfg_verify_save; + zonecfg_warn_poold; zone_get_brand; zone_get_devroot; zone_get_id; diff --git a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 index 3208af7a79..c51e89add3 100644 --- a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 +++ b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 @@ -111,7 +111,27 @@ mode CDATA #REQUIRED acl CDATA #REQUIRED> -<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm)*> +<!-- + The tmp_pool element is separate from the pset element so that + we can track the importance value at the pool level, where it + belongs, instead of at the pset level. Once we have msets this + will be important since tmp psets and tmp msets will share a common + pool-level importance. +--> +<!ELEMENT tmp_pool EMPTY> + +<!ATTLIST tmp_pool importance CDATA #REQUIRED> + +<!ELEMENT pset EMPTY> + +<!ATTLIST pset ncpu_min CDATA #REQUIRED + ncpu_max CDATA #REQUIRED> + +<!ELEMENT mcap EMPTY> + +<!ATTLIST mcap physcap CDATA #REQUIRED> + +<!ELEMENT zone (filesystem | inherited-pkg-dir | network | device | deleted-device | rctl | attr | dataset | package | patch | dev-perm | tmp_pool | pset | mcap)*> <!ATTLIST zone name CDATA #REQUIRED zonepath CDATA #REQUIRED @@ -120,4 +140,5 @@ limitpriv CDATA "" bootargs CDATA "" brand CDATA "" + scheduling-class CDATA "" version NMTOKEN #FIXED '1'> diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index c52316079d..6ac2e461ab 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -1178,6 +1178,7 @@ f none usr/include/sys/varargs.h 644 root bin f none usr/include/sys/vfs.h 644 root bin f none usr/include/sys/vfstab.h 644 root bin f none usr/include/sys/vm.h 644 root bin +f none usr/include/sys/vm_usage.h 644 root bin f none usr/include/sys/vmem.h 644 root bin f none 
usr/include/sys/vmem_impl.h 644 root bin f none usr/include/sys/vmem_impl_user.h 644 root bin diff --git a/usr/src/pkgdefs/SUNWrcapu/depend b/usr/src/pkgdefs/SUNWrcapu/depend index 9aaa446bca..a7375758b0 100644 --- a/usr/src/pkgdefs/SUNWrcapu/depend +++ b/usr/src/pkgdefs/SUNWrcapu/depend @@ -1,13 +1,12 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -43,3 +42,4 @@ P SUNWrcapr Solaris Resource Capping Daemon (Root) P SUNWcsu Core Solaris, (Usr) P SUNWcsl Core Solaris, (Shared Libs) +P SUNWzoneu Solaris Zones (Usr) diff --git a/usr/src/pkgdefs/SUNWzoner/prototype_com b/usr/src/pkgdefs/SUNWzoner/prototype_com index 009de7fb9f..15661840ea 100644 --- a/usr/src/pkgdefs/SUNWzoner/prototype_com +++ b/usr/src/pkgdefs/SUNWzoner/prototype_com @@ -56,9 +56,11 @@ f none etc/zones/SUNWblank.xml 444 root bin d none lib 755 root bin d none lib/svc 0755 root bin d none lib/svc/method 0755 root bin +f none lib/svc/method/svc-resource-mgmt 0555 root bin f none lib/svc/method/svc-zones 0555 root bin d none var 755 root sys d none var/svc 755 root sys d none var/svc/manifest 755 root sys d none var/svc/manifest/system 755 root sys +f manifest var/svc/manifest/system/resource-mgmt.xml 0444 root sys f manifest var/svc/manifest/system/zones.xml 0444 root sys diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 21d5a7eb67..b10d453c7b 100644 --- 
a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -332,6 +332,7 @@ superfluous_local_zone_files=" lib/svc/method/svc-poold lib/svc/method/svc-pools lib/svc/method/svc-power + lib/svc/method/svc-resource-mgmt lib/svc/method/svc-rmvolmgr lib/svc/method/svc-scheduler lib/svc/method/svc-sckmd @@ -401,6 +402,7 @@ superfluous_local_zone_files=" var/svc/manifest/system/poold.xml var/svc/manifest/system/pools.xml var/svc/manifest/system/power.xml + var/svc/manifest/system/resource-mgmt.xml var/svc/manifest/system/scheduler.xml var/svc/manifest/system/sysevent.xml var/svc/manifest/system/zones.xml diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 32a63d6c22..b2bbcbc8c3 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -334,6 +334,7 @@ GENUNIX_OBJS += \ vm_seg.o \ vm_subr.o \ vm_swap.o \ + vm_usage.o \ vnode.o \ vuid_queue.o \ vuid_store.o \ diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3bb90cf1fa..9197dc815b 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -136,6 +136,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, struct pcmpargs pcmpargs; pc_vaparms_t vaparms; char clname[PC_CLNMSZ]; + char *outstr; int count; kthread_id_t retthreadp; proc_t *initpp; @@ -145,6 +146,7 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, int rv = 0; pid_t saved_pid; id_t classid; + int size; int (*copyinfn)(const void *, void *, size_t); int (*copyoutfn)(const void *, void *, size_t); @@ -692,6 +694,21 @@ priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, ASSERT(defaultcid > 0 && defaultcid < loaded_classes); break; + case PC_GETDFLCL: + mutex_enter(&class_lock); + + if (defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[defaultcid].cl_name; + size = strlen(outstr) + 1; + if (arg != NULL) + if ((*copyoutfn)(outstr, arg, 
size) != 0) + error = EFAULT; + + mutex_exit(&class_lock); + break; + default: error = EINVAL; break; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 5a7000c242..c5145cccf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -67,6 +66,7 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); @@ -79,9 +79,10 @@ tmp_resv( * * Deny if trying to reserve more than tmpfs can allocate */ + zone = tm->tm_vfsp->vfs_zone; if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || - (!anon_checkspace(ptob(pages + tmpfs_minfree))) || - (anon_resv(delta) == 0))) { + (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || + (anon_resv_zone(delta, zone) == 0))) { return (1); } @@ -114,7 +115,7 @@ tmp_unresv( ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); - anon_unresv(delta); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); tm->tm_anonmem -= btopr(delta); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index d623dce3f7..aa870b124a 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -215,9 +215,26 @@ wrtmp( if (delta > 0) { pagecreate = 1; if (tmp_resv(tm, tp, delta, pagecreate)) { - cmn_err(CE_WARN, - "%s: File system full, swap space limit exceeded", + /* + * Log file system full in the zone that owns + * the tmpfs mount, as well as in the global + * zone if necessary. 
+ */ + zcmn_err(tm->tm_vfsp->vfs_zone->zone_id, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", tm->tm_mntpath); + + if (tm->tm_vfsp->vfs_zone->zone_id != + GLOBAL_ZONEID) { + + vfs_t *vfs = tm->tm_vfsp; + + zcmn_err(GLOBAL_ZONEID, + CE_WARN, "%s: File system full, " + "swap space limit exceeded", + vfs->vfs_vnodecovered->v_path); + } error = ENOSPC; break; } diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 19700ce685..3c63231253 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -165,15 +164,6 @@ */ #define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) -static void i_mod_hash_clear_nosync(mod_hash_t *); -static int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t, mod_hash_hndl_t); -static int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, - mod_hash_val_t *); -static uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); - /* * Cache for struct mod_hash_entry */ @@ -522,7 +512,7 @@ mod_hash_destroy_hash(mod_hash_t *hash) * i_mod_hash() * Call the hashing algorithm for this hash table, with the given key. 
*/ -static uint_t +uint_t i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) { uint_t h; @@ -778,7 +768,7 @@ mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) * mod_hash_find() * Find a value in the hash table corresponding to the given key. */ -static int +int i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) { @@ -826,7 +816,7 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } -static void +void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) { @@ -870,7 +860,7 @@ mod_hash_walk(mod_hash_t *hash, * Clears the given hash table by calling the destructor of every hash * element and freeing up all mod_hash_entry's. */ -static void +void i_mod_hash_clear_nosync(mod_hash_t *hash) { int i; diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 88b0258afe..fecc4a6c45 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -385,6 +385,56 @@ pgfind(pid_t pgid) } /* + * Sets P_PR_LOCK on a non-system process. Process must be fully created + * and not exiting to succeed. + * + * Returns 0 on success. + * Returns 1 if P_PR_LOCK is set. + * Returns -1 if proc is in invalid state. + */ +int +sprtrylock_proc(proc_t *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* skip system and incomplete processes */ + if (p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { + return (-1); + } + + if (p->p_proc_flag & P_PR_LOCK) + return (1); + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + + return (0); +} + +/* + * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, + * and the proc pointer no longer valid, as the proc may have exited. 
+ */ +void +sprwaitlock_proc(proc_t *p) +{ + kmutex_t *mp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); +} + +/* * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. * Returns the proc pointer on success, NULL on failure. sprlock() is * really just a stripped-down version of pr_p_lock() to allow practive @@ -394,7 +444,7 @@ proc_t * sprlock_zone(pid_t pid, zoneid_t zoneid) { proc_t *p; - kmutex_t *mp; + int ret; for (;;) { mutex_enter(&pidlock); @@ -402,31 +452,21 @@ sprlock_zone(pid_t pid, zoneid_t zoneid) mutex_exit(&pidlock); return (NULL); } - /* - * p_lock is persistent, but p itself is not -- it could - * vanish during cv_wait(). Load p->p_lock now so we can - * drop it after cv_wait() without referencing p. - */ - mp = &p->p_lock; - mutex_enter(mp); + mutex_enter(&p->p_lock); mutex_exit(&pidlock); - /* - * If the process is in some half-baked state, fail. 
- */ - if (p->p_stat == SZOMB || p->p_stat == SIDL || - (p->p_flag & (SEXITING | SEXITLWPS))) { - mutex_exit(mp); - return (NULL); - } + if (panicstr) return (p); - if (!(p->p_proc_flag & P_PR_LOCK)) + + ret = sprtrylock_proc(p); + if (ret == -1) { + mutex_exit(&p->p_lock); + return (NULL); + } else if (ret == 0) { break; - cv_wait(&pr_pid_cv[p->p_slot], mp); - mutex_exit(mp); + } + sprwaitlock_proc(p); } - p->p_proc_flag |= P_PR_LOCK; - THREAD_KPRI_REQUEST(); return (p); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index ceb90850fa..818bb54701 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -293,6 +293,8 @@ pool_enable(void) (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); + (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", + "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); @@ -1309,7 +1311,7 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) } if (idtype == P_PROJID) { - kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); + kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 6c266c0ca3..d75b60f6e9 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -29,6 +29,7 @@ #include <sys/modhash.h> #include <sys/modctl.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/proc.h> @@ -103,6 +104,8 @@ struct project_zone { * acquired, the hash lock is to be acquired first. 
*/ +static kstat_t *project_kstat_create(kproject_t *pj, zone_t *zone); +static void project_kstat_delete(kproject_t *pj); static void project_data_init(kproject_data_t *data) @@ -118,6 +121,7 @@ project_data_init(kproject_data_t *data) data->kpd_locked_mem_ctl = UINT64_MAX; data->kpd_contract = 0; data->kpd_crypto_mem = 0; + data->kpd_lockedmem_kstat = NULL; } /*ARGSUSED*/ @@ -179,11 +183,11 @@ project_hold(kproject_t *p) } /* - * kproject_t *project_hold_by_id(projid_t, zoneid_t, int) + * kproject_t *project_hold_by_id(projid_t, zone_t *, int) * * Overview * project_hold_by_id() performs a look-up in the dictionary of projects - * active on the system by specified project ID + zone ID and puts a hold on + * active on the system by specified project ID + zone and puts a hold on * it. The third argument defines the desired behavior in the case when * project with given project ID cannot be found: * @@ -202,7 +206,7 @@ project_hold(kproject_t *p) * Caller must be in a context suitable for KM_SLEEP allocations. 
*/ kproject_t * -project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) +project_hold_by_id(projid_t id, zone_t *zone, int flag) { kproject_t *spare_p; kproject_t *p; @@ -211,9 +215,11 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) rctl_alloc_gp_t *gp; rctl_entity_p_t e; struct project_zone pz; + boolean_t create = B_FALSE; + kstat_t *ksp; pz.kpj_id = id; - pz.kpj_zoneid = zoneid; + pz.kpj_zoneid = zone->zone_id; if (flag == PROJECT_HOLD_FIND) { mutex_enter(&project_hash_lock); @@ -241,9 +247,10 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) mutex_enter(&project_hash_lock); if (mod_hash_find(projects_hash, (mod_hash_key_t)&pz, (mod_hash_val_t *)&p) == MH_ERR_NOTFOUND) { + p = spare_p; p->kpj_id = id; - p->kpj_zoneid = zoneid; + p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; p->kpj_nlwps = 0; @@ -265,7 +272,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) * Insert project into global project list. */ mutex_enter(&projects_list_lock); - if (id != 0 || zoneid != GLOBAL_ZONEID) { + if (id != 0 || zone != &zone0) { p->kpj_next = projects_list; p->kpj_prev = projects_list->kpj_prev; p->kpj_prev->kpj_next = p; @@ -279,6 +286,7 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) projects_list = p; } mutex_exit(&projects_list_lock); + create = B_TRUE; } else { mutex_exit(&curproc->p_lock); mod_hash_cancel(projects_hash, &hndl); @@ -290,10 +298,20 @@ project_hold_by_id(projid_t id, zoneid_t zoneid, int flag) p->kpj_count++; mutex_exit(&project_hash_lock); + /* + * The kstat stores the project's zone name, as zoneid's may change + * across reboots. 
+ */ + if (create == B_TRUE) { + ksp = project_kstat_create(p, zone); + mutex_enter(&project_hash_lock); + ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); + p->kpj_data.kpd_lockedmem_kstat = ksp; + mutex_exit(&project_hash_lock); + } return (p); } - /* * void project_rele(kproject_t *) * @@ -325,6 +343,7 @@ project_rele(kproject_t *p) mutex_exit(&projects_list_lock); rctl_set_free(p->kpj_rctls); + project_kstat_delete(p); if (mod_hash_destroy(projects_hash, (mod_hash_key_t)p)) panic("unable to delete project %d zone %d", p->kpj_id, @@ -636,9 +655,9 @@ project_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -649,7 +668,7 @@ project_locked_mem_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); + ASSERT(MUTEX_HELD(&p->p_zone->zone_mem_lock)); q = p->p_task->tk_proj->kpj_data.kpd_locked_mem; if (q + inc > rval->rcv_value) return (1); @@ -868,7 +887,7 @@ project_init(void) rctl_add_default_limit("project.max-contracts", 10000, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); - t0.t_proj = proj0p = project_hold_by_id(0, GLOBAL_ZONEID, + t0.t_proj = proj0p = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); mutex_enter(&p0.p_lock); @@ -876,3 +895,57 @@ project_init(void) mutex_exit(&p0.p_lock); proj0p->kpj_ntasks = 1; } + +static int +project_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + kproject_t *pj = ksp->ks_private; + kproject_kstat_t *kpk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + kpk->kpk_usage.value.ui64 = pj->kpj_data.kpd_locked_mem; + kpk->kpk_value.value.ui64 = pj->kpj_data.kpd_locked_mem_ctl; + return (0); +} + +static kstat_t * +project_kstat_create(kproject_t 
*pj, zone_t *zone) +{ + kstat_t *ksp; + kproject_kstat_t *kpk; + char *zonename = zone->zone_name; + + ksp = rctl_kstat_create_project(pj, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (kproject_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + kpk = ksp->ks_data = kmem_alloc(sizeof (kproject_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zonename) + 1; + kstat_named_init(&kpk->kpk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&kpk->kpk_zonename, zonename); + kstat_named_init(&kpk->kpk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&kpk->kpk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = project_lockedmem_kstat_update; + ksp->ks_private = pj; + kstat_install(ksp); + + return (ksp); +} + +static void +project_kstat_delete(kproject_t *pj) +{ + void *data; + + if (pj->kpj_data.kpd_lockedmem_kstat != NULL) { + data = pj->kpj_data.kpd_lockedmem_kstat->ks_data; + kstat_delete(pj->kpj_data.kpd_lockedmem_kstat); + kmem_free(data, sizeof (kproject_kstat_t)); /* size at alloc */ + } + pj->kpj_data.kpd_lockedmem_kstat = NULL; +} diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 4de4c74fe8..c0479005ea 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -29,6 +29,7 @@ #include <sys/cmn_err.h> #include <sys/id_space.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/log.h> #include <sys/modctl.h> #include <sys/modhash.h> @@ -2599,7 +2600,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); e.rcep_p.proj = projp; e.rcep_t = RCENTITY_PROJECT; @@ -2627,7 +2628,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, p->p_locked_mem += inc; } out: - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); return (ret); @@ -2661,7 +2662,7 @@ rctl_decr_locked_mem(proc_t *p, 
kproject_t *proj, rctl_qty_t inc, zonep = p->p_zone; } - mutex_enter(&zonep->zone_rctl_lock); + mutex_enter(&zonep->zone_mem_lock); zonep->zone_locked_mem -= inc; projp->kpj_data.kpd_locked_mem -= inc; if (creditproc != 0) { @@ -2669,7 +2670,120 @@ rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(MUTEX_HELD(&p->p_lock)); p->p_locked_mem -= inc; } - mutex_exit(&zonep->zone_rctl_lock); + mutex_exit(&zonep->zone_mem_lock); if (proj != NULL) zone_rele(zonep); } + +/* + * rctl_incr_swap(proc_t *, zone_t *, size_t) + * + * Overview + * Increments the swap charge on the specified zone. + * + * Return values + * 0 on success. EAGAIN if swap increment fails due to an rctl value + * on the zone. + * + * Callers context + * p_lock held on specified proc. + * swap must be even multiple of PAGESIZE + */ +int +rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap) +{ + rctl_entity_p_t e; + + ASSERT(MUTEX_HELD(&proc->p_lock)); + ASSERT((swap & PAGEOFFSET) == 0); + e.rcep_p.zone = zone; + e.rcep_t = RCENTITY_ZONE; + + mutex_enter(&zone->zone_mem_lock); + + if ((zone->zone_max_swap + swap) > + zone->zone_max_swap_ctl) { + + if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls, + proc, &e, swap, 0) & RCT_DENY) { + mutex_exit(&zone->zone_mem_lock); + return (EAGAIN); + } + } + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); + return (0); +} + +/* + * rctl_decr_swap(zone_t *, size_t) + * + * Overview + * Decrements the swap charge on the specified zone. 
+ * + * Return values + * None + * + * Callers context + * swap must be even multiple of PAGESIZE + */ +void +rctl_decr_swap(zone_t *zone, size_t swap) +{ + ASSERT((swap & PAGEOFFSET) == 0); + mutex_enter(&zone->zone_mem_lock); + ASSERT(zone->zone_max_swap >= swap); + zone->zone_max_swap -= swap; + mutex_exit(&zone->zone_mem_lock); +} + +/* + * Create resource kstat + */ +static kstat_t * +rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class, + uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid) +{ + kstat_t *ksp = NULL; + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance); + + if ((ksp = kstat_create_zone("caps", ks_zoneid, + name, ks_class, ks_type, + ks_ndata, ks_flags, ks_zoneid)) != NULL) { + if (ks_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + } + return (ksp); +} + +/* + * Create zone-specific resource kstat + */ +kstat_t * +rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name); + + return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps", + ks_type, ks_ndata, ks_flags, zone->zone_id)); +} + +/* + * Create project-specific resource kstat + */ +kstat_t * +rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags) +{ + char name[KSTAT_STRLEN]; + + (void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name); + + return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps", + ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid)); +} diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 66aae7d2bc..62279e0777 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, 
Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -542,13 +541,13 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) * Set up anonymous memory struct. No swap reservation is * needed since the page will be locked into memory. */ - amp = anonmap_alloc(PAGESIZE, PAGESIZE); + amp = anonmap_alloc(PAGESIZE, 0); /* * Allocate the page. */ - kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO, - amp); + kaddr = segkp_get_withanonmap(segkp, PAGESIZE, + KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); if (kaddr == NULL) { amp->refcnt--; anonmap_free(amp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 9ada0aac18..a7ef99fddb 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -666,7 +666,7 @@ struct sysent sysent[NSYSCALL] = /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ SYSENT_LOADABLE(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), @@ -1044,7 +1044,7 @@ struct sysent sysent32[NSYSCALL] = /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), - /* 181 */ SYSENT_CI("rusagesys", rusagesys, 2), + /* 181 */ SYSENT_CI("rusagesys", rusagesys, 5), /* 182 */ 
SYSENT_LOADABLE32(), /* portfs */ /* 183 */ SYSENT_CI("pollsys", pollsys, 4), /* 184 */ SYSENT_CI("labelsys", labelsys, 5), diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 562e3596b5..785f74c145 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -389,7 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone->zone_id, + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; @@ -848,7 +847,7 @@ task_init(void) task0p->tk_tkid = id_alloc(taskid_space); task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); - task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID, + task0p->tk_proj = project_hold_by_id(0, &zone0, PROJECT_HOLD_INSERT); task0p->tk_flags = TASK_NORMAL; task0p->tk_nlwps = p->p_lwpcnt; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 0fb2c2be55..19ea8b31f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -154,6 +154,10 @@ * zone_lock: This is a per-zone lock used to protect several fields of * the zone_t (see <sys/zone.h> for details). 
In addition, holding * this lock means that the zone cannot go away. + * zone_nlwps_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-lwps rctl. + * zone_mem_lock: This is a per-zone lock used to protect the fields + * related to the zone.max-locked-memory and zone.max-swap rctls. * zsd_key_lock: This is a global lock protecting the key state for ZSD. * zone_deathrow_lock: This is a global lock protecting the "deathrow" * list (a list of zones in the ZONE_IS_DEAD state). @@ -162,6 +166,10 @@ * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> * zone_lock --> zsd_key_lock --> pidlock --> p_lock * + * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock + * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock + * * Blocking memory allocations are permitted while holding any of the * zone locks. * @@ -190,6 +198,7 @@ #include <sys/debug.h> #include <sys/file.h> #include <sys/kmem.h> +#include <sys/kstat.h> #include <sys/mutex.h> #include <sys/note.h> #include <sys/pathname.h> @@ -232,6 +241,8 @@ #include <sys/zone.h> #include <sys/tsol/label.h> +#include <vm/seg.h> + /* * cv used to signal that all references to the zone have been released. 
This * needs to be global since there may be multiple waiters, and the first to @@ -317,6 +328,7 @@ const char *zone_status_table[] = { */ rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; +rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -1011,9 +1023,9 @@ zone_locked_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; ASSERT(MUTEX_HELD(&p->p_lock)); - mutex_enter(&p->p_zone->zone_rctl_lock); + mutex_enter(&p->p_zone->zone_mem_lock); q = p->p_zone->zone_locked_mem; - mutex_exit(&p->p_zone->zone_rctl_lock); + mutex_exit(&p->p_zone->zone_mem_lock); return (q); } @@ -1023,9 +1035,12 @@ zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) { rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(MUTEX_HELD(&p->p_zone->zone_rctl_lock)); - q = p->p_zone->zone_locked_mem; + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_locked_mem; if (q + incr > rcntl->rcv_value) return (1); return (0); @@ -1051,6 +1066,57 @@ static rctl_ops_t zone_locked_mem_ops = { zone_locked_mem_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_max_swap_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&z->zone_mem_lock); + q = z->zone_max_swap; + mutex_exit(&z->zone_mem_lock); + return (q); +} + +/*ARGSUSED*/ +static int +zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, + rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) +{ + rctl_qty_t q; + zone_t *z; + + z = e->rcep_p.zone; + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&z->zone_mem_lock)); + q = z->zone_max_swap; + if (q + incr > rcntl->rcv_value) + return (1); + return (0); +} + +/*ARGSUSED*/ +static int +zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if 
(e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_max_swap_ctl = nv; + return (0); +} + +static rctl_ops_t zone_max_swap_ops = { + rcop_no_action, + zone_max_swap_usage, + zone_max_swap_set, + zone_max_swap_test +}; + /* * Helper function to brand the zone with a unique ID. */ @@ -1080,6 +1146,96 @@ zone_get_kcred(zoneid_t zoneid) return (cr); } +static int +zone_lockedmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_locked_mem; + zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; + return (0); +} + +static int +zone_swapresv_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_max_swap; + zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; + return (0); +} + +static void +zone_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_kstat_t *zk; + + ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_lockedmem_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_lockedmem_kstat = ksp; + + ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, + sizeof (zone_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); + ksp->ks_data_size 
+= strlen(zone->zone_name) + 1; + kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zk->zk_zonename, zone->zone_name); + kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); + kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); + ksp->ks_update = zone_swapresv_kstat_update; + ksp->ks_private = zone; + kstat_install(ksp); + + zone->zone_swapresv_kstat = ksp; +} + +static void +zone_kstat_delete(zone_t *zone) +{ + void *data; + + if (zone->zone_lockedmem_kstat != NULL) { + data = zone->zone_lockedmem_kstat->ks_data; + kstat_delete(zone->zone_lockedmem_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } + if (zone->zone_swapresv_kstat != NULL) { + data = zone->zone_swapresv_kstat->ks_data; + kstat_delete(zone->zone_swapresv_kstat); + kmem_free(data, sizeof (zone_kstat_t)); + } +} + /* * Called very early on in boot to initialize the ZSD list so that * zone_key_create() can be called before zone_init(). It also initializes @@ -1101,8 +1257,14 @@ zone_zsd_init(void) mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); zone0.zone_shares = 1; + zone0.zone_nlwps = 0; zone0.zone_nlwps_ctl = INT_MAX; + zone0.zone_locked_mem = 0; + zone0.zone_locked_mem_ctl = UINT64_MAX; + ASSERT(zone0.zone_max_swap == 0); + zone0.zone_max_swap_ctl = UINT64_MAX; zone0.zone_shmmax = 0; zone0.zone_ipc.ipcq_shmmni = 0; zone0.zone_ipc.ipcq_semmni = 0; @@ -1120,6 +1282,8 @@ zone_zsd_init(void) zone0.zone_ncpus_online = 0; zone0.zone_proc_initpid = 1; zone0.zone_initname = initname; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -1259,6 +1423,12 @@ zone_init(void) RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, 
UINT64_MAX, UINT64_MAX, &zone_locked_mem_ops); + + rc_zone_max_swap = rctl_register("zone.max-swap", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_max_swap_ops); + /* * Initialize the ``global zone''. */ @@ -1277,9 +1447,14 @@ zone_init(void) zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* - * pool_default hasn't been initialized yet, so we let pool_init() take - * care of making the global zone is in the default pool. + * pool_default hasn't been initialized yet, so we let pool_init() + * take care of making sure the global zone is in the default pool. + */ + + /* + * Initialize global zone kstats */ + zone_kstat_create(&zone0); /* * Initialize zone label. @@ -1337,6 +1512,7 @@ zone_init(void) if (res) panic("Sysevent_evc_bind failed during zone setup.\n"); + } static void @@ -1476,6 +1652,38 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +static int +zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +{ + uint64_t mcap; + int err = 0; + + if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) + zone->zone_phys_mcap = mcap; + + return (err); +} + +static int +zone_set_sched_class(zone_t *zone, const char *new_class) +{ + char sched_class[PC_CLNMSZ]; + id_t classid; + int err; + + ASSERT(zone != global_zone); + if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) + return (err); /* EFAULT or ENAMETOOLONG */ + + if (getcid(sched_class, &classid) != 0 || classid == syscid) + return (set_errno(EINVAL)); + zone->zone_defaultcid = classid; + ASSERT(zone->zone_defaultcid > 0 && + zone->zone_defaultcid < loaded_classes); + + return (0); +} + /* * Block indefinitely waiting for (zone_status >= status) */ @@ -2510,10 +2718,10 @@ zsched(void *arg) /* * Decrement locked memory counts on old zone and project. 
*/ - mutex_enter(&global_zone->zone_rctl_lock); + mutex_enter(&global_zone->zone_mem_lock); global_zone->zone_locked_mem -= pp->p_locked_mem; pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&global_zone->zone_rctl_lock); + mutex_exit(&global_zone->zone_mem_lock); /* * Create and join a new task in project '0' of this zone. @@ -2529,10 +2737,10 @@ zsched(void *arg) pj = pp->p_task->tk_proj; - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); /* * add lwp counts to zsched's zone, and increment project's task count @@ -2689,7 +2897,10 @@ zsched(void *arg) * classid 'cid'. */ pool_lock(); - cid = pool_get_class(zone->zone_pool); + if (zone->zone_defaultcid > 0) + cid = zone->zone_defaultcid; + else + cid = pool_get_class(zone->zone_pool); if (cid == -1) cid = defaultcid; @@ -3019,7 +3230,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zone->zone_rctl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); @@ -3057,8 +3268,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_initname = kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); (void) strcpy(zone->zone_initname, zone_default_initname); + zone->zone_nlwps = 0; + zone->zone_nlwps_ctl = INT_MAX; zone->zone_locked_mem = 0; zone->zone_locked_mem_ctl = UINT64_MAX; + zone->zone_max_swap = 0; + zone->zone_max_swap_ctl = UINT64_MAX; + zone0.zone_lockedmem_kstat = NULL; + zone0.zone_swapresv_kstat = NULL; /* * Zsched initializes the 
rctls. @@ -3233,6 +3450,11 @@ zone_create(const char *zone_name, const char *zone_root, */ /* + * Create zone kstats + */ + zone_kstat_create(zone); + + /* * Let the other lwps continue. */ mutex_enter(&pp->p_lock); @@ -3643,6 +3865,9 @@ zone_destroy(zoneid_t zoneid) } + /* Get rid of the zone's kstats */ + zone_kstat_delete(zone); + /* * It is now safe to let the zone be recreated; remove it from the * lists. The memory will not be freed until the last cred @@ -3892,6 +4117,32 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; + case ZONE_ATTR_PHYS_MCAP: + size = sizeof (zone->zone_phys_mcap); + if (bufsize > size) + bufsize = size; + if (buf != NULL && + copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_CLASS: + mutex_enter(&class_lock); + + if (zone->zone_defaultcid >= loaded_classes) + outstr = ""; + else + outstr = sclass[zone->zone_defaultcid].cl_name; + size = strlen(outstr) + 1; + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(outstr, buf, bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + + mutex_exit(&class_lock); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -3923,10 +4174,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * At present, attributes can only be set on non-running, - * non-global zones. + * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the + * global zone. */ - if (zoneid == GLOBAL_ZONEID) { + if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { return (set_errno(EINVAL)); } @@ -3938,8 +4189,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone_hold(zone); mutex_exit(&zonehash_lock); + /* + * At present most attributes can only be set on non-running, + * non-global zones. 
+ */ zone_status = zone_status_get(zone); - if (zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) goto done; switch (attr) { @@ -3971,6 +4226,12 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) if (zone->zone_brand == NULL) err = EINVAL; break; + case ZONE_ATTR_PHYS_MCAP: + err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_SCHED_CLASS: + err = zone_set_sched_class(zone, (const char *)buf); + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -3986,6 +4247,11 @@ done: /* * Return zero if the process has at least one vnode mapped in to its * address space which shouldn't be allowed to change zones. + * + * Also return zero if the process has any shared mappings which reserve + * swap. This is because the counting for zone.max-swap does not allow swap + * reservation to be shared between zones. zone swap reservation is counted + * on zone->zone_max_swap. */ static int as_can_change_zones(void) @@ -3997,8 +4263,17 @@ as_can_change_zones(void) int allow = 1; ASSERT(pp->p_as != &kas); - AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + + /* + * Cannot enter zone with shared anon memory which + * reserves swap. See comment above. + */ + if (seg_can_change_zones(seg) == B_FALSE) { + allow = 0; + break; + } /* * if we can't get a backing vnode for this segment then skip * it. 
@@ -4011,11 +4286,30 @@ as_can_change_zones(void) break; } } - AS_LOCK_EXIT(&as, &as->a_lock); + AS_LOCK_EXIT(as, &as->a_lock); return (allow); } /* + * Count swap reserved by curproc's address space + */ +static size_t +as_swresv(void) +{ + proc_t *pp = curproc; + struct seg *seg; + struct as *as = pp->p_as; + size_t swap = 0; + + ASSERT(pp->p_as != &kas); + ASSERT(AS_WRITE_HELD(as, &as->a_lock)); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) + swap += seg_swresv(seg); + + return (swap); +} + +/* * Systemcall entry point for zone_enter(). * * The current process is injected into said zone. In the process @@ -4043,6 +4337,7 @@ zone_enter(zoneid_t zoneid) zone_status_t status; int err = 0; rctl_entity_p_t e; + size_t swap; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4205,6 +4500,15 @@ zone_enter(zoneid_t zoneid) goto out; } + /* + * a_lock must be held while transferring locked memory and swap + * reservation from the global zone to the non global zone because + * asynchronous faults on the process's address space can lock + * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE + * segments respectively. 
+ */ + AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); + swap = as_swresv(); mutex_enter(&pp->p_lock); zone_proj0 = zone->zone_zsched->p_task->tk_proj; /* verify that we do not exceed and task or lwp limits */ @@ -4216,10 +4520,11 @@ zone_enter(zoneid_t zoneid) zone_proj0->kpj_ntasks += 1; mutex_exit(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); zone->zone_locked_mem += pp->p_locked_mem; zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; - mutex_exit(&zone->zone_rctl_lock); + zone->zone_max_swap += swap; + mutex_exit(&zone->zone_mem_lock); /* remove lwps from proc's old zone and old project */ mutex_enter(&pp->p_zone->zone_nlwps_lock); @@ -4227,12 +4532,14 @@ zone_enter(zoneid_t zoneid) pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; mutex_exit(&pp->p_zone->zone_nlwps_lock); - mutex_enter(&pp->p_zone->zone_rctl_lock); + mutex_enter(&pp->p_zone->zone_mem_lock); pp->p_zone->zone_locked_mem -= pp->p_locked_mem; pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; - mutex_exit(&pp->p_zone->zone_rctl_lock); + pp->p_zone->zone_max_swap -= swap; + mutex_exit(&pp->p_zone->zone_mem_lock); mutex_exit(&pp->p_lock); + AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); /* * Joining the zone cannot fail from now on. @@ -4289,6 +4596,31 @@ zone_enter(zoneid_t zoneid) sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); + + /* + * If there is a default scheduling class for the zone and it is not + * the class we are currently in, change all of the threads in the + * process to the new class. We need to be holding pidlock & p_lock + * when we call parmsset so this is a good place to do it. + */ + if (zone->zone_defaultcid > 0 && + zone->zone_defaultcid != curthread->t_cid) { + pcparms_t pcparms; + kthread_id_t t; + + pcparms.pc_cid = zone->zone_defaultcid; + pcparms.pc_clparms[0] = 0; + + /* + * If setting the class fails, we still want to enter the zone. 
+ */ + if ((t = pp->p_tlist) != NULL) { + do { + (void) parmsset(&pcparms, t); + } while ((t = t->t_forw) != pp->p_tlist); + } + } + mutex_exit(&pp->p_lock); mutex_exit(&pidlock); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ab103ef4c7..4493f99454 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -544,6 +544,7 @@ CHKHDRS= \ visual_io.h \ vlan.h \ vm.h \ + vm_usage.h \ vmem.h \ vmem_impl.h \ vmmeter.h \ diff --git a/usr/src/uts/common/sys/modhash_impl.h b/usr/src/uts/common/sys/modhash_impl.h index 25e45cec23..a187eb68ee 100644 --- a/usr/src/uts/common/sys/modhash_impl.h +++ b/usr/src/uts/common/sys/modhash_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -93,6 +92,18 @@ struct mod_hash { */ void mod_hash_init(void); +/* + * Internal routines. Use directly with care. 
+ */ +uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t); +int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t, + mod_hash_hndl_t); +int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); +void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t, + mod_hash_val_t *, void *), void *); +void i_mod_hash_clear_nosync(mod_hash_t *hash); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/priocntl.h b/usr/src/uts/common/sys/priocntl.h index ca1a92400a..6475ed0a4c 100644 --- a/usr/src/uts/common/sys/priocntl.h +++ b/usr/src/uts/common/sys/priocntl.h @@ -65,6 +65,7 @@ extern long priocntl(), priocntlset(); #define PC_SETXPARMS 7 /* Set extended scheduling parameters */ #define PC_GETXPARMS 8 /* Get extended scheduling parameters */ #define PC_SETDFLCL 9 /* Set default class, not for general use */ +#define PC_GETDFLCL 10 /* Get default class, not for general use */ #define PC_CLNULL -1 diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fcf953262c..9a0ba2cc37 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -613,6 +613,8 @@ extern proc_t *pgfind(pid_t); extern proc_t *pgfind_zone(pid_t, zoneid_t); extern proc_t *sprlock(pid_t); extern proc_t *sprlock_zone(pid_t, zoneid_t); +extern int sprtrylock_proc(proc_t *); +extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); extern void pid_init(void); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 679c1eddc2..5018df8499 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -28,15 +28,24 @@ #pragma ident "%Z%%M% %I% %E% SMI" + #ifdef __cplusplus extern "C" { #endif + +#include <sys/kstat.h> #include <sys/types.h> #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +typedef 
struct kproject_kstat { + kstat_named_t kpk_zonename; + kstat_named_t kpk_usage; + kstat_named_t kpk_value; +} kproject_kstat_t; + typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_shmmax; /* shm's ipcs_lock */ ipc_rqty_t kpd_ipc; /* shm|sem|msg's ipcs lock */ @@ -44,6 +53,7 @@ typedef struct kproject_data { /* Datum protected by: */ rctl_qty_t kpd_locked_mem_ctl; /* kpj_rctls->rcs_lock */ rctl_qty_t kpd_contract; /* contract_lock */ rctl_qty_t kpd_crypto_mem; /* crypto_rctl_lock */ + kstat_t *kpd_lockedmem_kstat; /* locked memory kstat */ } kproject_data_t; @@ -76,9 +86,11 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 +struct zone; + void project_init(void); kproject_t *project_hold(kproject_t *); -kproject_t *project_hold_by_id(projid_t, zoneid_t, int); +kproject_t *project_hold_by_id(projid_t, struct zone *, int); void project_rele(kproject_t *); int project_walk_all(zoneid_t, int (*)(kproject_t *, void *), void *); projid_t curprojid(void); diff --git a/usr/src/uts/common/sys/rctl.h b/usr/src/uts/common/sys/rctl.h index eb56fff9e5..a8480c2768 100644 --- a/usr/src/uts/common/sys/rctl.h +++ b/usr/src/uts/common/sys/rctl.h @@ -168,6 +168,7 @@ struct proc; struct task; struct kproject; struct zone; +struct kstat; typedef struct rctl_entity_p_struct { rctl_entity_t rcep_t; @@ -324,6 +325,14 @@ int rctl_incr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); void rctl_decr_locked_mem(struct proc *, struct kproject *, rctl_qty_t, int); +int rctl_incr_swap(struct proc *, struct zone *, size_t); +void rctl_decr_swap(struct zone *, size_t); + +struct kstat *rctl_kstat_create_zone(struct zone *, char *, uchar_t, uint_t, + uchar_t); + +struct kstat *rctl_kstat_create_project(struct kproject *, char *, uchar_t, + uint_t, uchar_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 86cc716d56..bf02808d4b 100644 --- 
a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -191,6 +190,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE 0 /* rusage process */ #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ +#define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 96cb967023..eedadfa0c0 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -384,7 +384,8 @@ extern "C" { #define SYS_rusagesys 181 /* * subcodes: - * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE,...) + * getrusage(...) :: rusagesys(RUSAGESYS_GETRUSAGE, ...) + * getvmusage(...) :: rusagesys(RUSAGESYS_GETVMUSAGE, ...) */ #define SYS_port 182 /* diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h new file mode 100644 index 0000000000..5f8c8b8fe5 --- /dev/null +++ b/usr/src/uts/common/sys/vm_usage.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VM_USAGE_H +#define _SYS_VM_USAGE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The flags passed to getvmusage() request how to aggregate rss/swap results. + * Results can be aggregated by zone, project, task, ruser, and/or euser. + * + * If VMUSAGE_ALL_* or VMUSAGE_COL_* are passed from a non-global-zone, the + * flag is treated as VMUSAGE_*. For example, VMUSAGE_ALL_ZONES would be + * treated as VMUSAGE_ZONE. + * + * If VMUSAGE_SYSTEM is passed from a non-global zone, a result of type + * VMUSAGE_SYSTEM will be returned, but it will only reflect the usage + * of the calling zone. + * + * VMUSAGE_* requests results for the calling zone. + * VMUSAGE_ALL_* requests results for all zones. + * VMUSAGE_COL_* requests results for all zones, but collapses out the zoneid. + * For example, VMUSAGE_COL_PROJECTS requests results for all + * projects in all zones, and project N in ANY zone is treated + * as the same project. 
+ */ +#define VMUSAGE_SYSTEM 0x1 /* rss/swap for ALL processes */ +#define VMUSAGE_ZONE 0x2 /* rss/swap for caller's zone */ +#define VMUSAGE_PROJECTS 0x4 /* rss/swap for all projects in */ + /* caller's zone */ +#define VMUSAGE_TASKS 0x8 /* rss/swap for all tasks in */ + /* caller's zone */ +#define VMUSAGE_RUSERS 0x10 /* rss/swap for all users (by process */ + /* ruser) in the caller's zone */ +#define VMUSAGE_EUSERS 0x20 /* same as VMUSAGE_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_ALL_ZONES 0x40 /* rss/swap for all zones */ +#define VMUSAGE_ALL_PROJECTS 0x80 /* rss/swap for all projects in */ + /* all zones */ +#define VMUSAGE_ALL_TASKS 0x100 /* rss/swap for all tasks in all */ + /* zones */ +#define VMUSAGE_ALL_RUSERS 0x200 /* rss/swap for all users (by process */ + /* ruser) in all zones */ +#define VMUSAGE_ALL_EUSERS 0x400 /* same as VMUSAGE_ALL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_COL_PROJECTS 0x800 /* rss/swap for all projects in */ + /* all zones. Collapse zoneid. */ +#define VMUSAGE_COL_RUSERS 0x1000 /* rss/swap for all users (by process */ + /* ruser), in all zones. Collapse */ + /* zoneid */ +#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ + /* euser */ + +#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ + +typedef struct vmusage { + id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ + /* VMUSAGE_COL_* results */ + /* ALL_ZONES means that the result */ + /* reflects swap and rss usage for */ + /* a projid/uid across all zones */ + uint_t vmu_type; /* Entity type of result. One of: */ + /* VMUSAGE_(SYSTEM|ZONE|PROJECTS| */ + /* TASKS|RUSERS|EUSERS) */ + id_t vmu_id; /* zoneid, projid, taskid, ... 
*/ + size_t vmu_rss_all; /* total resident memory of entity */ + /* in bytes */ + size_t vmu_rss_private; /* total resident private memory */ + size_t vmu_rss_shared; /* total resident shared memory */ + size_t vmu_swap_all; /* total swap reserved, in bytes */ + size_t vmu_swap_private; /* swap reserved for private mappings */ + size_t vmu_swap_shared; /* swap reserved for shared mappings */ + +} vmusage_t; + +extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); + +#ifdef _KERNEL + +int vm_getusage(uint_t, time_t, vmusage_t *, size_t *); +void vm_usage_init(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VM_USAGE_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index daccd16bdf..94646bc976 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -88,6 +88,8 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 +#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_SCHED_CLASS 13 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -280,6 +282,15 @@ typedef struct zone_dataset { list_node_t zd_linkage; } zone_dataset_t; +/* + * structure for zone kstats + */ +typedef struct zone_kstat { + kstat_named_t zk_zonename; + kstat_named_t zk_usage; + kstat_named_t zk_value; +} zone_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -326,14 +337,20 @@ typedef struct zone { uint_t zone_rootpathlen; /* strlen(zone_rootpath) + 1 */ uint32_t zone_shares; /* FSS shares allocated to zone */ rctl_set_t *zone_rctls; /* zone-wide (zone.*) rctls */ - kmutex_t zone_rctl_lock; /* protects zone_locked_mem and */ + kmutex_t zone_mem_lock; /* protects zone_locked_mem and */ /* kpd_locked_mem for all */ - /* projects in zone */ + /* projects in zone. 
*/ + /* Also protects zone_max_swap */ /* grab after p_lock, before rcs_lock */ - rctl_qty_t zone_locked_mem; /* bytes of locked memory in zone */ - rctl_qty_t zone_locked_mem_ctl; /* current locked memory */ + rctl_qty_t zone_locked_mem; /* bytes of locked memory in */ + /* zone */ + rctl_qty_t zone_locked_mem_ctl; /* Current locked memory */ /* limit. Protected by */ /* zone_rctls->rcs_lock */ + rctl_qty_t zone_max_swap; /* bytes of swap reserved by zone */ + rctl_qty_t zone_max_swap_ctl; /* current swap limit. */ + /* Protected by */ + /* zone_rctls->rcs_lock */ list_t zone_zsd; /* list of Zone-Specific Data values */ kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ @@ -341,6 +358,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ + uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -376,6 +394,9 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? */ struct brand *zone_brand; /* zone's brand */ + id_t zone_defaultcid; /* dflt scheduling class id */ + kstat_t *zone_swapresv_kstat; + kstat_t *zone_lockedmem_kstat; } zone_t; /* @@ -553,6 +574,7 @@ extern void mount_completed(void); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; +extern rctl_hndl_t rc_zone_max_swap; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c index 10ca1178d5..bd416e43e6 100644 --- a/usr/src/uts/common/syscall/processor_bind.c +++ b/usr/src/uts/common/syscall/processor_bind.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -285,9 +284,10 @@ processor_bind(idtype_t idtype, id_t id, processorid_t bind, break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { ret = ESRCH; } else { diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c index 5d3b7e6233..767529fc5d 100644 --- a/usr/src/uts/common/syscall/pset.c +++ b/usr/src/uts/common/syscall/pset.c @@ -542,9 +542,10 @@ pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset) break; case P_PROJID: + pp = curproc; if (id == P_MYID) id = curprojid(); - if ((kpj = project_hold_by_id(id, getzoneid(), + if ((kpj = project_hold_by_id(id, pp->p_zone, PROJECT_HOLD_FIND)) == NULL) { error = ESRCH; break; diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e09643981..036500932f 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +34,7 @@ #include <sys/time.h> #include <sys/errno.h> #include <sys/resource.h> +#include <sys/vm_usage.h> static int getrusage(void *user_rusage) @@ -246,16 +246,19 @@ getrusage_lwp(void *user_rusage) } int -rusagesys(int code, void * arg) +rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) { switch (code) { case _RUSAGESYS_GETRUSAGE: - return (getrusage(arg)); + return (getrusage(arg1)); case _RUSAGESYS_GETRUSAGE_CHLD: - return (getrusage_chld(arg)); + return (getrusage_chld(arg1)); case _RUSAGESYS_GETRUSAGE_LWP: - return (getrusage_lwp(arg)); + return (getrusage_lwp(arg1)); + case _RUSAGESYS_GETVMUSAGE: + return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, + (vmusage_t *)arg3, (size_t *)arg4)); default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c index 705b543a37..bec091e61c 100644 --- a/usr/src/uts/common/syscall/tasksys.c +++ b/usr/src/uts/common/syscall/tasksys.c @@ -25,6 +25,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" + /* * System calls for creating and inquiring about tasks and projects */ @@ -102,7 +103,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) * Put a hold on our new project and make sure that nobody is * trying to bind it to a pool while we're joining. 
*/ - kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT); + kpj = project_hold_by_id(projid, p->p_zone, PROJECT_HOLD_INSERT); e.rcep_p.proj = kpj; e.rcep_t = RCENTITY_PROJECT; @@ -111,7 +112,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) zone = p->p_zone; mutex_enter(&zone->zone_nlwps_lock); - mutex_enter(&zone->zone_rctl_lock); + mutex_enter(&zone->zone_mem_lock); if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl) if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e, @@ -130,7 +131,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) rctlfail = 1; if (rctlfail) { - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); if (curthread != p->p_agenttp) continuelwps(p); @@ -144,7 +145,7 @@ tasksys_settaskid(projid_t projid, uint_t flags) oldpj->kpj_data.kpd_locked_mem -= p->p_locked_mem; oldpj->kpj_nlwps -= p->p_lwpcnt; - mutex_exit(&zone->zone_rctl_lock); + mutex_exit(&zone->zone_mem_lock); mutex_exit(&zone->zone_nlwps_lock); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 90f6e1e661..ed59ec590b 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -42,6 +42,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/cred.h> +#include <sys/zone.h> #include <vm/seg.h> #include <vm/vpage.h> @@ -387,8 +388,8 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t, struct seg *, caddr_t, uint_t, struct vpage [], struct cred *); extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t); -extern int anon_resvmem(size_t, uint_t); -extern void anon_unresv(size_t); +extern int anon_resvmem(size_t, boolean_t, zone_t *); +extern void anon_unresvmem(size_t, zone_t *); extern struct anon_map *anonmap_alloc(size_t, size_t); extern void anonmap_free(struct anon_map *); extern void anon_decref(struct anon *); @@ -416,9 +417,16 @@ extern void anon_array_exit(anon_sync_obj_t *); * request and if so, reserves 
the appropriate anonymous memory resources. * anon_checkspace just checks to see if there is space to fulfill the request, * without taking any resources. Both return 1 if successful and 0 if not. + * + * Macros are provided as anon reservation is usually charged to the zone of + * the current process. In some cases (such as anon reserved by tmpfs), a + * zone pointer is needed to charge the appropriate zone. */ -#define anon_resv(size) anon_resvmem((size), 1) -#define anon_checkspace(size) anon_resvmem((size), 0) +#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone) +#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone) +#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone) +#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone) +#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone) /* * Flags to anon_private diff --git a/usr/src/uts/common/vm/seg.h b/usr/src/uts/common/vm/seg.h index 0ee7d62ce1..a9683c0e54 100644 --- a/usr/src/uts/common/vm/seg.h +++ b/usr/src/uts/common/vm/seg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -245,6 +244,9 @@ uint_t seg_pages(struct seg *); #endif /* VMDEBUG */ +boolean_t seg_can_change_zones(struct seg *); +size_t seg_swresv(struct seg *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_kp.c b/usr/src/uts/common/vm/seg_kp.c index ff9c47e0ff..d58e873a19 100644 --- a/usr/src/uts/common/vm/seg_kp.c +++ b/usr/src/uts/common/vm/seg_kp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -147,6 +146,7 @@ uint32_t red_closest = UINT_MAX; uint32_t red_ndoubles; pgcnt_t anon_segkp_pages_locked; /* See vm/anon.h */ +pgcnt_t anon_segkp_pages_resv; /* anon reserved by seg_kp */ static struct seg_ops segkp_ops = { SEGKP_BADOP(int), /* dup */ @@ -448,8 +448,10 @@ segkp_get_internal( * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { - ASSERT((flags & KPD_NO_ANON) == 0); - /* The reserve has been done and the anon_hdr is separate. */ + /* + * The swap reservation has been done, if required, and the + * anon_hdr is separate. 
+ */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; @@ -458,7 +460,7 @@ segkp_get_internal( kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { - if (anon_resv(SEGKP_MAPLEN(len, flags)) == 0) { + if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); @@ -468,6 +470,8 @@ segkp_get_internal( kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } + atomic_add_long(&anon_segkp_pages_resv, + btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; @@ -704,7 +708,9 @@ segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len) if ((kpd->kp_flags & KPD_HASAMP) == 0) { anon_free(kpd->kp_anon, kpd->kp_anon_idx + i, PAGESIZE); - anon_unresv(PAGESIZE); + anon_unresv_zone(PAGESIZE, NULL); + atomic_add_long(&anon_segkp_pages_resv, + -1); } TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index f48db44acc..e2069b27c6 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -2323,8 +2323,9 @@ segvn_faultpage( * zeroes. If no advance reservations, reserve now. */ if (svd->flags & MAP_NORESERVE) { - if (anon_resv(ptob(1))) { - svd->swresv += ptob(1); + if (anon_resv_zone(ptob(1), + seg->s_as->a_proc->p_zone)) { + atomic_add_long(&svd->swresv, ptob(1)); } else { err = ENOMEM; goto out; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 0cad34257c..3f225a345a 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -113,6 +113,7 @@ #include <sys/policy.h> #include <sys/condvar_impl.h> #include <sys/mutex_impl.h> +#include <sys/rctl.h> #include <vm/as.h> #include <vm/hat.h> @@ -729,12 +730,22 @@ set_anoninfo(void) * Return non-zero on success. 
*/ int -anon_resvmem(size_t size, uint_t takemem) +anon_resvmem(size_t size, boolean_t takemem, zone_t *zone) { pgcnt_t npages = btopr(size); pgcnt_t mswap_pages = 0; pgcnt_t pswap_pages = 0; + proc_t *p = curproc; + if (zone != NULL && takemem) { + /* test zone.max-swap resource control */ + mutex_enter(&p->p_lock); + if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { + mutex_exit(&p->p_lock); + return (0); + } + mutex_exit(&p->p_lock); + } mutex_enter(&anoninfo_lock); /* @@ -834,16 +845,17 @@ anon_resvmem(size_t size, uint_t takemem) mutex_exit(&anoninfo_lock); ANON_PRINT(A_RESV, ("anon_resvmem: not enough space from swapfs\n")); + if (zone != NULL && takemem) + rctl_decr_swap(zone, ptob(npages)); return (0); } } - /* * Give back an anon reservation. */ void -anon_unresv(size_t size) +anon_unresvmem(size_t size, zone_t *zone) { pgcnt_t npages = btopr(size); spgcnt_t mem_free_pages = 0; @@ -851,6 +863,8 @@ anon_unresv(size_t size) #ifdef ANON_DEBUG pgcnt_t mem_resv; #endif + if (zone != NULL) + rctl_decr_swap(zone, size); mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 05bfe662be..adac07b766 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -77,7 +77,7 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> #include <vm/vm_dep.h> - +#include <sys/vm_usage.h> #include <fs/fs_subr.h> static int nopageage = 0; @@ -343,6 +343,7 @@ vm_init(void) (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); page_init_mem_config(); page_retire_init(); + vm_usage_init(); } /* diff --git a/usr/src/uts/common/vm/vm_seg.c b/usr/src/uts/common/vm/vm_seg.c index 50cc21cdf7..aed892969d 100644 --- a/usr/src/uts/common/vm/vm_seg.c +++ b/usr/src/uts/common/vm/vm_seg.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,12 +53,14 @@ #include <sys/cmn_err.h> #include <sys/callb.h> #include <sys/mem_config.h> +#include <sys/mman.h> #include <vm/hat.h> #include <vm/as.h> #include <vm/seg.h> #include <vm/seg_kmem.h> - +#include <vm/seg_spt.h> +#include <vm/seg_vn.h> /* * kstats for segment advise */ @@ -950,3 +951,48 @@ seg_pinit_mem_config(void) */ ASSERT(ret == 0); } + +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + +/* + * Verify that segment is not a shared anonymous segment which reserves + * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transfered + * from one zone to another if any segments are shared. This is because the + * last process to exit will credit the swap reservation. This could lead + * to the swap being reserved by one zone, and credited to another. + */ +boolean_t +seg_can_change_zones(struct seg *seg) +{ + struct segvn_data *svd; + + if (seg->s_ops == &segspt_shmops) + return (B_FALSE); + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED && + svd->amp != NULL && + svd->amp->swresv > 0) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Return swap reserved by a segment backing a private mapping. 
+ */ +size_t +seg_swresv(struct seg *seg) +{ + struct segvn_data *svd; + size_t swap = 0; + + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_PRIVATE && svd->swresv > 0) + swap = svd->swresv; + } + return (swap); +} diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c new file mode 100644 index 0000000000..32a8811e10 --- /dev/null +++ b/usr/src/uts/common/vm/vm_usage.c @@ -0,0 +1,1978 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * vm_usage + * + * This file implements the getvmusage() private system call. + * getvmusage() counts the amount of resident memory pages and swap + * reserved by the specified process collective. A "process collective" is + * the set of processes owned by a particular, zone, project, task, or user. + * + * rss and swap are counted so that for a given process collective, a page is + * only counted once. 
For example, this means that if multiple processes in + * the same project map the same page, then the project will only be charged + * once for that page. On the other hand, if two processes in different + * projects map the same page, then both projects will be charged + * for the page. + * + * The vm_getusage() calculation is implemented so that the first thread + * performs the rss/swap counting. Other callers will wait for that thread to + * finish, copying the results. This enables multiple rcapds and prstats to + * consume data from the same calculation. The results are also cached so that + * a caller interested in recent results can just copy them instead of starting + * a new calculation. The caller passes the maximum age (in seconds) of the + * data. If the cached data is young enough, the cache is copied; otherwise, + * a new calculation is executed and the cache is replaced with the new + * data. + * + * The rss calculation for each process collective is as follows: + * + * - Inspect flags, determine if counting rss for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Lookup anons in the amp. + * - For incore pages not previously visited for each of the + * proc's collectives, add incore pagesize to each + * collective. + * Anons with a refcnt of 1 can be assumed to be not + * previously visited. + * - For address ranges without anons in the amp: + * - Lookup pages in underlying vnode. + * - For incore pages not previously visited for + * each of the proc's collectives, add incore + * pagesize to each collective. + * - If seg is shared: + * - Lookup pages in the shared amp or vnode. + * - For incore pages not previously visited for each of + * the proc's collectives, add incore pagesize to each + * collective. + * + * Swap is reserved by private segments, and shared anonymous segments.
+ * The only shared anon segments which do not reserve swap are ISM segments + * and schedctl segments, both of which can be identified by having + * amp->swresv == 0. + * + * The swap calculation for each collective is as follows: + * + * - Inspect flags, determine if counting swap for zones, projects, tasks, + * and/or users. + * - For each proc: + * - Figure out proc's collectives (zone, project, task, and/or user). + * - For each seg in proc's address space: + * - If seg is private: + * - Add svd->swresv pages to swap count for each of the + * proc's collectives. + * - If seg is anon, shared, and amp->swresv != 0: + * - For address ranges in amp not previously visited for + * each of the proc's collectives, add size of address + * range to the swap count for each collective. + * + * These two calculations are done simultaneously, with most of the work + * being done in vmu_calculate_seg(). The results of the calculation are + * copied into "vmu_data.vmu_cache->vmc_results". + * + * To perform the calculation, various things are tracked and cached: + * + * - incore/not-incore page ranges for all vnodes. + * (vmu_data.vmu_all_vnodes_hash) + * This eliminates looking up the same page more than once. + * + * - incore/not-incore page ranges for all shared amps. + * (vmu_data.vmu_all_amps_hash) + * This eliminates looking up the same page more than once. + * + * - visited page ranges for each collective. + * - per vnode (entity->vme_vnode_hash) + * - per shared amp (entity->vme_amp_hash) + * For accurate counting of map-shared and cow-shared pages. + * + * - visited private anons (refcnt > 1) for each collective. + * (entity->vme_anon_hash) + * For accurate counting of cow-shared pages. + * + * The common accounting structure is the vmu_entity_t, which represents + * collectives: + * + * - A zone. + * - A project, task, or user within a zone. + * - The entire system (vmu_data.vmu_system). + * - Each collapsed (col) project and user.
This means a given projid or + * uid, regardless of which zone the process is in. For instance, + * project 0 in the global zone and project 0 in a non global zone are + * the same collapsed project. + * + * Each entity structure tracks which pages have been already visited for + * that entity (via previously inspected processes) so that these pages are + * not double counted. + */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/proc.h> +#include <sys/project.h> +#include <sys/task.h> +#include <sys/thread.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/modhash.h> +#include <sys/modhash_impl.h> +#include <sys/shm.h> +#include <sys/swap.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vm_usage.h> +#include <sys/zone.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> + +#define VMUSAGE_HASH_SIZE 512 + +#define VMUSAGE_TYPE_VNODE 1 +#define VMUSAGE_TYPE_AMP 2 +#define VMUSAGE_TYPE_ANON 3 + +#define VMUSAGE_BOUND_UNKNOWN 0 +#define VMUSAGE_BOUND_INCORE 1 +#define VMUSAGE_BOUND_NOT_INCORE 2 + +/* + * bounds for vnodes and shared amps + * Each bound is either entirely incore, entirely not in core, or + * entirely unknown. bounds are stored in order by offset. + */ +typedef struct vmu_bound { + struct vmu_bound *vmb_next; + pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ + pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ + char vmb_type; /* One of VMUSAGE_BOUND_* */ +} vmu_bound_t; + +/* + * hash of visited objects (vnodes or shared amps) + * key is address of vnode or amp. Bounds lists known incore/non-incore + * bounds for vnode/amp. + */ +typedef struct vmu_object { + struct vmu_object *vmo_next; /* free list */ + caddr_t vmo_key; + short vmo_type; + vmu_bound_t *vmo_bounds; +} vmu_object_t; + +/* + * Entity by which to count results. 
+ * + * The entity structure keeps the current rss/swap counts for each entity + * (zone, project, etc), and hashes of vm structures that have already + * been visited for the entity. + * + * vme_next: links the list of all entities currently being counted by + * vmu_calculate(). + * + * vme_next_calc: links the list of entities related to the current process + * being counted by vmu_calculate_proc(). + * + * vmu_calculate_proc() walks all processes. For each process, it makes a + * list of the entities related to that process using vme_next_calc. This + * list changes each time vmu_calculate_proc() is called. + * + */ +typedef struct vmu_entity { + struct vmu_entity *vme_next; + struct vmu_entity *vme_next_calc; + mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ + mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ + mod_hash_t *vme_anon_hash; /* cow anons visited for entity */ + vmusage_t vme_result; /* identifies entity and results */ +} vmu_entity_t; + +/* + * Hash of entities visited within a zone, and an entity for the zone + * itself. + */ +typedef struct vmu_zone { + struct vmu_zone *vmz_next; /* free list */ + id_t vmz_id; + vmu_entity_t *vmz_zone; + mod_hash_t *vmz_projects_hash; + mod_hash_t *vmz_tasks_hash; + mod_hash_t *vmz_rusers_hash; + mod_hash_t *vmz_eusers_hash; +} vmu_zone_t; + +/* + * Cache of results from last calculation + */ +typedef struct vmu_cache { + vmusage_t *vmc_results; /* Results from last call to */ + /* vm_getusage(). 
*/ + uint64_t vmc_nresults; /* Count of cached results */ + uint64_t vmc_refcnt; /* refcnt for free */ + uint_t vmc_flags; /* Flags for vm_getusage() */ + hrtime_t vmc_timestamp; /* when cache was created */ +} vmu_cache_t; + +/* + * top level rss info for the system + */ +typedef struct vmu_data { + kmutex_t vmu_lock; /* Protects vmu_data */ + kcondvar_t vmu_cv; /* Used to signal threads */ + /* Waiting for */ + /* Rss_calc_thread to finish */ + vmu_entity_t *vmu_system; /* Entity for tracking */ + /* rss/swap for all processes */ + /* in all zones */ + mod_hash_t *vmu_zones_hash; /* Zones visited */ + mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ + mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ + mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ + /* to implement VMUSAGE_COL_* */ + /* flags, which aggregate by */ + /* project or user regardless */ + /* of zoneid. */ + mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ + /* to track incore/not-incore */ + mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ + /* amps to track incore/not- */ + /* incore */ + vmu_entity_t *vmu_entities; /* Linked list of entities */ + size_t vmu_nentities; /* Count of entities in list */ + vmu_cache_t *vmu_cache; /* Cached results */ + kthread_t *vmu_calc_thread; /* NULL, or thread running */ + /* vmu_calculate() */ + uint_t vmu_calc_flags; /* Flags being using by */ + /* currently running calc */ + /* thread */ + uint_t vmu_pending_flags; /* Flags of vm_getusage() */ + /* threads waiting for */ + /* calc thread to finish */ + uint_t vmu_pending_waiters; /* Number of threads waiting */ + /* for calc thread */ + vmu_bound_t *vmu_free_bounds; + vmu_object_t *vmu_free_objects; + vmu_entity_t *vmu_free_entities; + vmu_zone_t *vmu_free_zones; +} vmu_data_t; + +extern struct as kas; +extern proc_t *practive; +extern zone_t *global_zone; +extern struct seg_ops segvn_ops; +extern struct seg_ops segspt_shmops; + 
+static vmu_data_t vmu_data; +static kmem_cache_t *vmu_bound_cache; +static kmem_cache_t *vmu_object_cache; + +/* + * Save a bound on the free list + */ +static void +vmu_free_bound(vmu_bound_t *bound) +{ + bound->vmb_next = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = bound; +} + +/* + * Free an object, and all visited bound info. + */ +static void +vmu_free_object(mod_hash_val_t val) +{ + vmu_object_t *obj = (vmu_object_t *)val; + vmu_bound_t *bound = obj->vmo_bounds; + vmu_bound_t *tmp; + + while (bound != NULL) { + tmp = bound; + bound = bound->vmb_next; + vmu_free_bound(tmp); + } + obj->vmo_next = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = obj; +} + +/* + * Free an entity, and hashes of visited objects for that entity. + */ +static void +vmu_free_entity(mod_hash_val_t val) +{ + vmu_entity_t *entity = (vmu_entity_t *)val; + + if (entity->vme_vnode_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_vnode_hash); + if (entity->vme_amp_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_amp_hash); + if (entity->vme_anon_hash != NULL) + i_mod_hash_clear_nosync(entity->vme_anon_hash); + + entity->vme_next = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = entity; +} + +/* + * Free zone entity, and all hashes of entities inside that zone, + * which are projects, tasks, and users. 
+ */ +static void +vmu_free_zone(mod_hash_val_t val) +{ + vmu_zone_t *zone = (vmu_zone_t *)val; + + if (zone->vmz_zone != NULL) { + vmu_free_entity((mod_hash_val_t)zone->vmz_zone); + zone->vmz_zone = NULL; + } + if (zone->vmz_projects_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_projects_hash); + if (zone->vmz_tasks_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_tasks_hash); + if (zone->vmz_rusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_rusers_hash); + if (zone->vmz_eusers_hash != NULL) + i_mod_hash_clear_nosync(zone->vmz_eusers_hash); + zone->vmz_next = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = zone; +} + +/* + * Initialize synchronization primitives and hashes for system-wide tracking + * of visited vnodes and shared amps. Initialize results cache. + */ +void +vm_usage_init() +{ + mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); + + vmu_data.vmu_system = NULL; + vmu_data.vmu_zones_hash = NULL; + vmu_data.vmu_projects_col_hash = NULL; + vmu_data.vmu_rusers_col_hash = NULL; + vmu_data.vmu_eusers_col_hash = NULL; + + vmu_data.vmu_free_bounds = NULL; + vmu_data.vmu_free_objects = NULL; + vmu_data.vmu_free_entities = NULL; + vmu_data.vmu_free_zones = NULL; + + vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( + "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( + "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( + "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, + vmu_free_entity); + vmu_data.vmu_zones_hash = mod_hash_create_idhash( + 
"vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); + + vmu_bound_cache = kmem_cache_create("vmu_bound_cache", + sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + vmu_object_cache = kmem_cache_create("vmu_object_cache", + sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + + vmu_data.vmu_cache = NULL; + vmu_data.vmu_calc_thread = NULL; + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_pending_flags = 0; + vmu_data.vmu_pending_waiters = 0; +} + +/* + * Allocate hashes for tracking vm objects visited for an entity. + * Update list of entities. + */ +static vmu_entity_t * +vmu_alloc_entity(id_t id, int type, id_t zoneid) +{ + vmu_entity_t *entity; + + if (vmu_data.vmu_free_entities != NULL) { + entity = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + bzero(&entity->vme_result, sizeof (vmusage_t)); + } else { + entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); + } + entity->vme_result.vmu_id = id; + entity->vme_result.vmu_zoneid = zoneid; + entity->vme_result.vmu_type = type; + + if (entity->vme_vnode_hash == NULL) + entity->vme_vnode_hash = mod_hash_create_ptrhash( + "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (vnode_t)); + + if (entity->vme_amp_hash == NULL) + entity->vme_amp_hash = mod_hash_create_ptrhash( + "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, + sizeof (struct anon_map)); + + if (entity->vme_anon_hash == NULL) + entity->vme_anon_hash = mod_hash_create_ptrhash( + "vmusage anon hash", VMUSAGE_HASH_SIZE, + mod_hash_null_valdtor, sizeof (struct anon)); + + entity->vme_next = vmu_data.vmu_entities; + vmu_data.vmu_entities = entity; + vmu_data.vmu_nentities++; + + return (entity); +} + +/* + * Allocate a zone entity, and hashes for tracking visited vm objects + * for projects, tasks, and users within that zone. 
+ */ +static vmu_zone_t * +vmu_alloc_zone(id_t id) +{ + vmu_zone_t *zone; + + if (vmu_data.vmu_free_zones != NULL) { + zone = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + zone->vmz_next = NULL; + zone->vmz_zone = NULL; + } else { + zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); + } + + zone->vmz_id = id; + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) + zone->vmz_projects_hash = mod_hash_create_idhash( + "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + != 0 && zone->vmz_tasks_hash == NULL) + zone->vmz_tasks_hash = mod_hash_create_idhash( + "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) + != 0 && zone->vmz_rusers_hash == NULL) + zone->vmz_rusers_hash = mod_hash_create_idhash( + "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) + != 0 && zone->vmz_eusers_hash == NULL) + zone->vmz_eusers_hash = mod_hash_create_idhash( + "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); + + return (zone); +} + +/* + * Allocate a structure for tracking visited bounds for a vm object. + */ +static vmu_object_t * +vmu_alloc_object(caddr_t key, int type) +{ + vmu_object_t *object; + + if (vmu_data.vmu_free_objects != NULL) { + object = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + } else { + object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); + } + + object->vmo_key = key; + object->vmo_type = type; + object->vmo_bounds = NULL; + + return (object); +} + +/* + * Allocate and return a bound structure. 
+ */ +static vmu_bound_t * +vmu_alloc_bound() +{ + vmu_bound_t *bound; + + if (vmu_data.vmu_free_bounds != NULL) { + bound = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = + vmu_data.vmu_free_bounds->vmb_next; + bzero(bound, sizeof (vmu_bound_t)); + } else { + bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); + bzero(bound, sizeof (vmu_bound_t)); + } + return (bound); +} + +/* + * vmu_find_insert_* functions implement hash lookup or allocate and + * insert operations. + */ +static vmu_object_t * +vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) +{ + int ret; + vmu_object_t *object; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&object); + if (ret != 0) { + object = vmu_alloc_object(key, type); + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)object, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (object); +} + +static int +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +{ + int ret; + caddr_t val; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t *)&val); + + if (ret == 0) + return (0); + + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, + (mod_hash_val_t)key, (mod_hash_hndl_t)0); + + ASSERT(ret == 0); + + return (1); +} + +static vmu_entity_t * +vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) +{ + int ret; + vmu_entity_t *entity; + + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&entity); + if (ret != 0) { + entity = vmu_alloc_entity(id, type, zoneid); + ret = i_mod_hash_insert_nosync(hash, + (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, + (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + return (entity); +} + + + + +/* + * Returns list of object bounds between start and end. New bounds inserted + * by this call are given type. + * + * Returns the number of pages covered if new bounds are created. 
Returns 0 + * if region between start/end consists of all existing bounds. + */ +static pgcnt_t +vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t + end, char type, vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *prev = NULL; + vmu_bound_t *tmp = NULL; + pgcnt_t ret = 0; + + *first = *last = NULL; + + for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { + /* + * Find bounds overlapping or overlapped by range [start,end]. + */ + if (start > next->vmb_end) { + /* bound is before new bound */ + prev = next; + continue; + } + if (next->vmb_start > end) { + /* bound is after new bound */ + break; + } + if (*first == NULL) + *first = next; + *last = next; + } + + if (*first == NULL) { + ASSERT(*last == NULL); + /* + * No bounds overlapping range [start,end], so create new + * bound + */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = end; + tmp->vmb_type = type; + if (prev == NULL) { + tmp->vmb_next = ro->vmo_bounds; + ro->vmo_bounds = tmp; + } else { + tmp->vmb_next = prev->vmb_next; + prev->vmb_next = tmp; + } + *first = tmp; + *last = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret = tmp->vmb_end - tmp->vmb_start + 1; + return (ret); + } + + /* Check to see if start is before first known bound */ + ASSERT(first != NULL && last != NULL); + next = (*first); + if (start < (*first)->vmb_start) { + /* Create new bound before first bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_start = start; + tmp->vmb_end = (*first)->vmb_start - 1; + tmp->vmb_type = type; + tmp->vmb_next = *first; + if (*first == ro->vmo_bounds) + ro->vmo_bounds = tmp; + if (prev != NULL) + prev->vmb_next = tmp; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + *first = tmp; + } + /* + * Between start and end, search for gaps between and after existing + * bounds. Create new bounds to fill gaps if they exist. 
+ */ + while (end > next->vmb_end) { + /* + * Check for gap between bound and next bound. if no gap, + * continue. + */ + if ((next != *last) && + ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { + next = next->vmb_next; + continue; + } + /* + * Insert new bound in gap after bound, and before next + * bound if next bound exists. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = type; + tmp->vmb_next = next->vmb_next; + tmp->vmb_start = next->vmb_end + 1; + + if (next != *last) { + tmp->vmb_end = next->vmb_next->vmb_start - 1; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + next = tmp->vmb_next; + } else { + tmp->vmb_end = end; + ASSERT(tmp->vmb_end >= tmp->vmb_start); + ret += tmp->vmb_end - tmp->vmb_start + 1; + next->vmb_next = tmp; + *last = tmp; + break; + } + } + return (ret); +} + +/* + * vmu_update_bounds() + * + * first, last: list of continuous bounds, of which zero or more are of + * type VMUSAGE_BOUND_UNKNOWN. + * + * new_first, new_last: list of continuous bounds, of which none are of + * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to + * update the types of bounds in (first,last) with + * type VMUSAGE_BOUND_UNKNOWN. + * + * For the list of bounds (first,last), this function updates any bounds + * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in + * the list (new_first, new_last). + * + * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list + * (new_first, new_last), it will be split into multiple bounds. + * + * Return value: + * The number of pages in the list of bounds (first,last) that were of + * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type + * VMUSAGE_BOUND_INCORE. 
+ * + */ +static pgcnt_t +vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, + vmu_bound_t *new_first, vmu_bound_t *new_last) +{ + vmu_bound_t *next, *new_next, *tmp; + pgcnt_t rss = 0; + + next = *first; + new_next = new_first; + + /* verify bounds span same pages */ + ASSERT((*first)->vmb_start >= new_next->vmb_start); + ASSERT((*last)->vmb_end <= new_last->vmb_end); + for (;;) { + /* If bound already has type, proceed to next bound */ + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + while (new_next->vmb_end < next->vmb_start) + new_next = new_next->vmb_next; + ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + next->vmb_type = new_next->vmb_type; + if (new_next->vmb_end < next->vmb_end) { + /* need to split bound */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; + tmp->vmb_start = new_next->vmb_end + 1; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = new_next->vmb_end; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + next = tmp; + } else { + if (next->vmb_type == VMUSAGE_BOUND_INCORE) + rss += next->vmb_end - next->vmb_start + 1; + if (next == *last) + break; + next = next->vmb_next; + } + } + return (rss); +} + +/* + * merges adjacent bounds with same type between first and last bound. + * After merge, last pointer is no longer valid, as last bound may be + * merged away. 
+ */ +static void +vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + + ASSERT(*first != NULL); + ASSERT(*last != NULL); + + next = *first; + while (next != *last) { + + /* If bounds are adjacent and have same type, merge them */ + if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && + (next->vmb_type == next->vmb_next->vmb_type)) { + tmp = next->vmb_next; + next->vmb_end = tmp->vmb_end; + next->vmb_next = tmp->vmb_next; + vmu_free_bound(tmp); + if (tmp == *last) + *last = next; + } else { + next = next->vmb_next; + } + } +} + +/* + * Given an amp and a list of bounds, updates each bound's type with + * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. + * + * If a bound is partially incore, it will be split into two bounds. + * first and last may be modified, as bounds may be split into multiple + * bounds if the are partially incore/not-incore. + * + * Set incore to non-zero if bounds are already known to be incore + * + */ +static void +vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, + vmu_bound_t **last, boolean_t incore) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + vnode_t *vn; + anoff_t off; + struct anon *ap; + + next = *first; + /* Shared anon slots don't change once set */ + ANON_LOCK_ENTER(&->a_rwlock, RW_READER); + for (;;) { + if (incore == B_TRUE) + next->vmb_type = VMUSAGE_BOUND_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. 
+ */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + ap = anon_get_ptr(amp->ahp, index); + if (ap != NULL) + swap_xlate(ap, &vn, &off); + + if (ap != NULL && vn != NULL && vn->v_pages != NULL && + (page = page_exists(vn, off)) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } + ANON_LOCK_EXIT(&->a_rwlock); +} + +/* + * Same as vmu_amp_update_incore_bounds(), except for tracking + * incore-/not-incore for vnodes. 
+ */ +static void +vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, + vmu_bound_t **last) +{ + vmu_bound_t *next; + vmu_bound_t *tmp; + pgcnt_t index; + short bound_type; + short page_type; + + next = *first; + for (;;) { + if (vnode->v_pages == NULL) + next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; + + if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { + if (next == *last) + break; + next = next->vmb_next; + continue; + } + + bound_type = next->vmb_type; + index = next->vmb_start; + while (index <= next->vmb_end) { + + /* + * These are used to determine how much to increment + * index when a large page is found. + */ + page_t *page; + pgcnt_t pgcnt = 1; + uint_t pgshft; + pgcnt_t pgmsk; + + if (vnode->v_pages != NULL && + (page = page_exists(vnode, ptob(index))) != NULL) { + page_type = VMUSAGE_BOUND_INCORE; + if (page->p_szc > 0) { + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) + - 1; + } + } else { + page_type = VMUSAGE_BOUND_NOT_INCORE; + } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { + next->vmb_type = page_type; + } else if (next->vmb_type != page_type) { + /* + * if current bound type does not match page + * type, need to split off new bound. + */ + tmp = vmu_alloc_bound(); + tmp->vmb_type = page_type; + tmp->vmb_start = index; + tmp->vmb_end = next->vmb_end; + tmp->vmb_next = next->vmb_next; + next->vmb_end = index - 1; + next->vmb_next = tmp; + if (*last == next) + *last = tmp; + next = tmp; + } + if (pgcnt > 1) { + /* + * If inside large page, jump to next large + * page + */ + index = (index & ~pgmsk) + pgcnt; + } else { + index++; + } + } + if (next == *last) { + ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); + break; + } else + next = next->vmb_next; + } +} + +/* + * Calculate the rss and swap consumed by a segment. vmu_entities is the + * list of entities to visit. 
For shared segments, the vnode or amp + * is looked up in each entity to see if has been already counted. Private + * anon pages are checked per entity to ensure that cow pages are not + * double counted. + * + * For private mapped files, first the amp is checked for private pages. + * Bounds not backed by the amp are looked up in the vnode for each entity + * to avoid double counting of private COW vnode pages. + */ +static void +vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) +{ + struct segvn_data *svd; + struct shm_data *shmd; + struct spt_data *sptd; + vmu_object_t *shared_object = NULL; + vmu_object_t *entity_object = NULL; + vmu_entity_t *entity; + vmusage_t *result; + vmu_bound_t *first = NULL; + vmu_bound_t *last = NULL; + vmu_bound_t *cur = NULL; + vmu_bound_t *e_first = NULL; + vmu_bound_t *e_last = NULL; + vmu_bound_t *tmp; + pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; + struct anon_map *private_amp = NULL; + boolean_t incore = B_FALSE; + boolean_t shared = B_FALSE; + int file = 0; + pgcnt_t swresv = 0; + pgcnt_t panon = 0; + + /* Can zero-length segments exist? Not sure, so parenoia */ + if (seg->s_size <= 0) + return; + + /* + * Figure out if there is a shared object (such as a named vnode or + * a shared amp, then figure out if there is a private amp, which + * identifies private pages. 
+ */ + if (seg->s_ops == &segvn_ops) { + svd = (struct segvn_data *)seg->s_data; + if (svd->type == MAP_SHARED) + shared = B_TRUE; + else + swresv = svd->swresv; + + if (svd->vp != NULL) { + file = 1; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, + VMUSAGE_TYPE_VNODE); + s_start = btop(svd->offset); + s_end = btop(svd->offset + seg->s_size) - 1; + } + if (svd->amp != NULL && svd->type == MAP_SHARED) { + ASSERT(shared_object == NULL); + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, + VMUSAGE_TYPE_AMP); + s_start = svd->anon_index; + s_end = svd->anon_index + btop(seg->s_size) - 1; + /* schedctl mappings are always in core */ + if (svd->amp->swresv == 0) + incore = B_TRUE; + } + if (svd->amp != NULL && svd->type == MAP_PRIVATE) { + private_amp = svd->amp; + p_start = svd->anon_index; + p_end = svd->anon_index + btop(seg->s_size) - 1; + } + } else if (seg->s_ops == &segspt_shmops) { + shared = B_TRUE; + shmd = (struct shm_data *)seg->s_data; + shared_object = vmu_find_insert_object( + vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, + VMUSAGE_TYPE_AMP); + s_start = 0; + s_end = btop(seg->s_size) - 1; + sptd = shmd->shm_sptseg->s_data; + + /* ism segments are always incore and do not reserve swap */ + if (sptd->spt_flags & SHM_SHARE_MMU) + incore = B_TRUE; + + } else { + return; + } + + /* + * If there is a private amp, count anon pages that exist. If an + * anon has a refcnt > 1 (cow sharing), then save the anon in a + * hash so that it is not double counted. + * + * If there is also a shared object, they figure out the bounds + * which are not mapped by the private amp. 
+ */ + if (private_amp != NULL) { + + /* Enter as writer to prevent cow anons from being freed */ + ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); + + p_index = p_start; + s_index = s_start; + + while (p_index <= p_end) { + + pgcnt_t p_index_next; + pgcnt_t p_bound_size; + int cnt; + anoff_t off; + struct vnode *vn; + struct anon *ap; + page_t *page; /* For handling of large */ + pgcnt_t pgcnt = 1; /* pages */ + pgcnt_t pgstart; + pgcnt_t pgend; + uint_t pgshft; + pgcnt_t pgmsk; + + p_index_next = p_index; + ap = anon_get_next_ptr(private_amp->ahp, + &p_index_next); + + /* + * If next anon is past end of mapping, simulate + * end of anon so loop terminates. + */ + if (p_index_next > p_end) { + p_index_next = p_end + 1; + ap = NULL; + } + /* + * For cow segments, keep track of bounds not + * backed by private amp so they can be looked + * up in the backing vnode + */ + if (p_index_next != p_index) { + + /* + * Compute index difference between anon and + * previous anon. + */ + p_bound_size = p_index_next - p_index - 1; + + if (shared_object != NULL) { + cur = vmu_alloc_bound(); + cur->vmb_next = NULL; + cur->vmb_start = s_index; + cur->vmb_end = s_index + p_bound_size; + cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; + if (first == NULL) { + first = cur; + last = cur; + } else { + last->vmb_next = cur; + last = cur; + } + } + p_index = p_index + p_bound_size + 1; + s_index = s_index + p_bound_size + 1; + } + + /* Detect end of anons in amp */ + if (ap == NULL) + break; + + cnt = ap->an_refcnt; + swap_xlate(ap, &vn, &off); + + if (vn == NULL || vn->v_pages == NULL || + (page = page_exists(vn, off)) == NULL) { + p_index++; + s_index++; + continue; + } + + /* + * If large page is found, compute portion of large + * page in mapping, and increment indicies to the next + * large page. 
+ */ + if (page->p_szc > 0) { + + pgcnt = page_get_pagecnt(page->p_szc); + pgshft = page_get_shift(page->p_szc); + pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; + + /* First page in large page */ + pgstart = p_index & ~pgmsk; + /* Last page in large page */ + pgend = pgstart + pgcnt - 1; + /* + * Artifically end page if page extends past + * end of mapping. + */ + if (pgend > p_end) + pgend = p_end; + + /* + * Compute number of pages from large page + * which are mapped. + */ + pgcnt = pgend - p_index + 1; + + /* + * Point indicies at page after large page, + * or at page after end of mapping. + */ + p_index += pgcnt; + s_index += pgcnt; + } else { + p_index++; + s_index++; + } + + /* + * Assume anon structs with a refcnt + * of 1 are not cow shared, so there + * is no reason to track them per entity. + */ + if (cnt == 1) { + panon += pgcnt; + continue; + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + /* + * Track cow anons per entity so + * they are not double counted. + */ + if (vmu_find_insert_anon(entity->vme_anon_hash, + (caddr_t)ap) == 0) + continue; + + result->vmu_rss_all += (pgcnt << PAGESHIFT); + result->vmu_rss_private += + (pgcnt << PAGESHIFT); + } + } + ANON_LOCK_EXIT(&private_amp->a_rwlock); + } + + /* Add up resident anon and swap reserved for private mappings */ + if (swresv > 0 || panon > 0) { + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + result = &entity->vme_result; + result->vmu_swap_all += swresv; + result->vmu_swap_private += swresv; + result->vmu_rss_all += (panon << PAGESHIFT); + result->vmu_rss_private += (panon << PAGESHIFT); + } + } + + /* Compute resident pages backing shared amp or named vnode */ + if (shared_object != NULL) { + if (first == NULL) { + /* + * No private amp, or private amp has no anon + * structs. This means entire segment is backed by + * the shared object. 
+ */ + first = vmu_alloc_bound(); + first->vmb_next = NULL; + first->vmb_start = s_start; + first->vmb_end = s_end; + first->vmb_type = VMUSAGE_BOUND_UNKNOWN; + } + /* + * Iterate bounds not backed by private amp, and compute + * resident pages. + */ + cur = first; + while (cur != NULL) { + + if (vmu_insert_lookup_object_bounds(shared_object, + cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, + &first, &last) > 0) { + /* new bounds, find incore/not-incore */ + if (shared_object->vmo_type == + VMUSAGE_TYPE_VNODE) + vmu_vnode_update_incore_bounds( + (vnode_t *) + shared_object->vmo_key, &first, + &last); + else + vmu_amp_update_incore_bounds( + (struct anon_map *) + shared_object->vmo_key, &first, + &last, incore); + vmu_merge_bounds(&first, &last); + } + for (entity = vmu_entities; entity != NULL; + entity = entity->vme_next_calc) { + + result = &entity->vme_result; + + entity_object = vmu_find_insert_object( + shared_object->vmo_type == + VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: + entity->vme_amp_hash, + shared_object->vmo_key, + shared_object->vmo_type); + + virt = vmu_insert_lookup_object_bounds( + entity_object, cur->vmb_start, cur->vmb_end, + VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); + + if (virt == 0) + continue; + /* + * Range visited for this entity + */ + rss = vmu_update_bounds(&e_first, + &e_last, first, last); + result->vmu_rss_all += (rss << PAGESHIFT); + if (shared == B_TRUE && file == B_FALSE) { + /* shared anon mapping */ + result->vmu_swap_all += + (virt << PAGESHIFT); + result->vmu_swap_shared += + (virt << PAGESHIFT); + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_TRUE && file == B_TRUE) { + /* shared file mapping */ + result->vmu_rss_shared += + (rss << PAGESHIFT); + } else if (shared == B_FALSE && + file == B_TRUE) { + /* private file mapping */ + result->vmu_rss_private += + (rss << PAGESHIFT); + } + vmu_merge_bounds(&e_first, &e_last); + } + tmp = cur; + cur = cur->vmb_next; + vmu_free_bound(tmp); + } + } 
+} + +/* + * Based on the current calculation flags, find the relevant entities + * which are relative to the process. Then calculate each segment + * in the process'es address space for each relevant entity. + */ +static void +vmu_calculate_proc(proc_t *p) +{ + vmu_entity_t *entities = NULL; + vmu_zone_t *zone; + vmu_entity_t *tmp; + struct as *as; + struct seg *seg; + int ret; + + /* Figure out which entities are being computed */ + if ((vmu_data.vmu_system) != NULL) { + tmp = vmu_data.vmu_system; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | + VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | + VMUSAGE_ALL_EUSERS)) { + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(p->p_zone->zone_id); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + if (zone->vmz_zone != NULL) { + tmp = zone->vmz_zone; + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { + tmp = vmu_find_insert_entity(zone->vmz_projects_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, + zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { + tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, + p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & + (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + 
} + if (vmu_data.vmu_calc_flags & + (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { + tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); + tmp->vme_next_calc = entities; + entities = tmp; + } + } + /* Entities which collapse projects and users for all zones */ + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, + p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, + crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { + tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, + crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); + tmp->vme_next_calc = entities; + entities = tmp; + } + + ASSERT(entities != NULL); + /* process all segs in process's address space */ + as = p->p_as; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; + seg = AS_SEGNEXT(as, seg)) { + vmu_calculate_seg(entities, seg); + } + AS_LOCK_EXIT(as, &as->a_lock); +} + +/* + * Free data created by previous call to vmu_calculate(). 
+ */ +static void +vmu_clear_calc() +{ + if (vmu_data.vmu_system != NULL) + vmu_free_entity(vmu_data.vmu_system); + vmu_data.vmu_system = NULL; + if (vmu_data.vmu_zones_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); + if (vmu_data.vmu_projects_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); + if (vmu_data.vmu_rusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); + if (vmu_data.vmu_eusers_col_hash != NULL) + i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); + + i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); + i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); +} + +/* + * Free unused data structures. These can result if the system workload + * decreases between calculations. + */ +static void +vmu_free_extra() +{ + vmu_bound_t *tb; + vmu_object_t *to; + vmu_entity_t *te; + vmu_zone_t *tz; + + while (vmu_data.vmu_free_bounds != NULL) { + tb = vmu_data.vmu_free_bounds; + vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; + kmem_cache_free(vmu_bound_cache, tb); + } + while (vmu_data.vmu_free_objects != NULL) { + to = vmu_data.vmu_free_objects; + vmu_data.vmu_free_objects = + vmu_data.vmu_free_objects->vmo_next; + kmem_cache_free(vmu_object_cache, to); + } + while (vmu_data.vmu_free_entities != NULL) { + te = vmu_data.vmu_free_entities; + vmu_data.vmu_free_entities = + vmu_data.vmu_free_entities->vme_next; + if (te->vme_vnode_hash != NULL) + mod_hash_destroy_hash(te->vme_vnode_hash); + if (te->vme_amp_hash != NULL) + mod_hash_destroy_hash(te->vme_amp_hash); + if (te->vme_anon_hash != NULL) + mod_hash_destroy_hash(te->vme_anon_hash); + kmem_free(te, sizeof (vmu_entity_t)); + } + while (vmu_data.vmu_free_zones != NULL) { + tz = vmu_data.vmu_free_zones; + vmu_data.vmu_free_zones = + vmu_data.vmu_free_zones->vmz_next; + if (tz->vmz_projects_hash != NULL) + mod_hash_destroy_hash(tz->vmz_projects_hash); + if (tz->vmz_tasks_hash != NULL) + 
mod_hash_destroy_hash(tz->vmz_tasks_hash); + if (tz->vmz_rusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_rusers_hash); + if (tz->vmz_eusers_hash != NULL) + mod_hash_destroy_hash(tz->vmz_eusers_hash); + kmem_free(tz, sizeof (vmu_zone_t)); + } +} + +extern kcondvar_t *pr_pid_cv; + +/* + * Determine which entity types are relevant and allocate the hashes to + * track them. Then walk the process table and count rss and swap + * for each process'es address space. Address space object such as + * vnodes, amps and anons are tracked per entity, so that they are + * not double counted in the results. + * + */ +static void +vmu_calculate() +{ + int i = 0; + int ret; + proc_t *p; + + vmu_clear_calc(); + + if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) + vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, + ALL_ZONES); + + /* + * Walk process table and calculate rss of each proc. + * + * Pidlock and p_lock cannot be held while doing the rss calculation. + * This is because: + * 1. The calculation allocates using KM_SLEEP. + * 2. The calculation grabs a_lock, which cannot be grabbed + * after p_lock. + * + * Since pidlock must be dropped, we cannot simply just walk the + * practive list. Instead, we walk the process table, and sprlock + * each process to ensure that it does not exit during the + * calculation. + */ + + mutex_enter(&pidlock); + for (i = 0; i < v.v_proc; i++) { +again: + p = pid_entry(i); + if (p == NULL) + continue; + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr) { + mutex_exit(&p->p_lock); + return; + } + + /* Try to set P_PR_LOCK */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + mutex_enter(&pidlock); + continue; + } else if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. + * This also drops p_lock. 
+ */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + goto again; + } + mutex_exit(&p->p_lock); + + vmu_calculate_proc(p); + + mutex_enter(&p->p_lock); + sprunlock(p); + mutex_enter(&pidlock); + } + mutex_exit(&pidlock); + + vmu_free_extra(); +} + +/* + * allocate a new cache for N results satisfying flags + */ +vmu_cache_t * +vmu_cache_alloc(size_t nres, uint_t flags) +{ + vmu_cache_t *cache; + + cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); + cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); + cache->vmc_nresults = nres; + cache->vmc_flags = flags; + cache->vmc_refcnt = 1; + return (cache); +} + +/* + * Make sure cached results are not freed + */ +static void +vmu_cache_hold(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + cache->vmc_refcnt++; +} + +/* + * free cache data + */ +static void +vmu_cache_rele(vmu_cache_t *cache) +{ + ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); + ASSERT(cache->vmc_refcnt > 0); + cache->vmc_refcnt--; + if (cache->vmc_refcnt == 0) { + kmem_free(cache->vmc_results, sizeof (vmusage_t) * + cache->vmc_nresults); + kmem_free(cache, sizeof (vmu_cache_t)); + } +} + +/* + * Copy out the cached results to a caller. Inspect the callers flags + * and zone to determine which cached results should be copied. + */ +static int +vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, + uint_t flags) +{ + vmusage_t *result, *out_result; + vmusage_t dummy; + size_t i, count = 0; + size_t bufsize; + int ret = 0; + uint_t types = 0; + + if (nres != NULL) { + if (copyin((caddr_t)nres, &bufsize, sizeof (size_t))) + return (set_errno(EFAULT)); + } else { + bufsize = 0; + } + + /* figure out what results the caller is interested in. 
*/ + if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) + types |= VMUSAGE_SYSTEM; + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + types |= VMUSAGE_ZONE; + if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) + types |= VMUSAGE_PROJECTS; + if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) + types |= VMUSAGE_TASKS; + if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) + types |= VMUSAGE_RUSERS; + if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) + types |= VMUSAGE_EUSERS; + + /* count results for current zone */ + out_result = buf; + for (result = cache->vmc_results, i = 0; + i < cache->vmc_nresults; result++, i++) { + + /* Do not return "other-zone" results to non-global zones */ + if (curproc->p_zone != global_zone && + curproc->p_zone->zone_id != result->vmu_zoneid) + continue; + + /* + * If non-global zone requests VMUSAGE_SYSTEM, fake + * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result. + */ + if (curproc->p_zone != global_zone && + (flags & VMUSAGE_SYSTEM) != 0 && + result->vmu_type == VMUSAGE_ZONE) { + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + dummy = *result; + dummy.vmu_zoneid = ALL_ZONES; + dummy.vmu_id = 0; + dummy.vmu_type = VMUSAGE_SYSTEM; + if (copyout(&dummy, out_result, + sizeof (vmusage_t))) + return (set_errno( + EFAULT)); + out_result++; + } + } + } + + /* Skip results that do not match requested type */ + if ((result->vmu_type & types) == 0) + continue; + + /* Skip collated results if not requested */ + if (result->vmu_zoneid == ALL_ZONES) { + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & VMUSAGE_COL_PROJECTS) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & VMUSAGE_COL_EUSERS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & VMUSAGE_COL_RUSERS) == 0) + continue; + } + + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid 
!= curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } + count++; + if (out_result != NULL) { + if (bufsize < count) { + ret = set_errno(EOVERFLOW); + } else { + if (copyout(result, out_result, + sizeof (vmusage_t))) + return (set_errno(EFAULT)); + out_result++; + } + } + } + if (nres != NULL) + if (copyout(&count, (void *)nres, sizeof (size_t))) + return (set_errno(EFAULT)); + + return (ret); +} + +/* + * vm_getusage() + * + * Counts rss and swap by zone, project, task, and/or user. The flags argument + * determines the type of results structures returned. Flags requesting + * results from more than one zone are "flattened" to the local zone if the + * caller is not the global zone. + * + * args: + * flags: bitmap consisting of one or more of VMUSAGE_*. + * age: maximum allowable age (time since counting was done) in + * seconds of the results. Results from previous callers are + * cached in kernel. + * buf: pointer to buffer array of vmusage_t. If NULL, then only nres + * set on success. + * nres: Set to number of vmusage_t structures pointed to by buf + * before calling vm_getusage(). + * On return 0 (success) or ENOSPC, is set to the number of result + * structures returned or attempted to return. 
+ * + * returns 0 on success, -1 on failure: + * EINTR (interrupted) + * ENOSPC (nres to small for results, nres set to needed value for success) + * EINVAL (flags invalid) + * EFAULT (bad address for buf or nres) + */ +int +vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres) +{ + vmu_entity_t *entity; + vmusage_t *result; + int ret = 0; + int cacherecent = 0; + hrtime_t now; + uint_t flags_orig; + + /* + * Non-global zones cannot request system wide and/or collated + * results, or the system result, so munge the flags accordingly. + */ + flags_orig = flags; + if (curproc->p_zone != global_zone) { + if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) { + flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS); + flags |= VMUSAGE_PROJECTS; + } + if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) { + flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS); + flags |= VMUSAGE_RUSERS; + } + if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) { + flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS); + flags |= VMUSAGE_EUSERS; + } + if (flags & VMUSAGE_SYSTEM) { + flags &= ~VMUSAGE_SYSTEM; + flags |= VMUSAGE_ZONE; + } + } + + /* Check for unknown flags */ + if ((flags & (~VMUSAGE_MASK)) != 0) + return (set_errno(EINVAL)); + + /* Check for no flags */ + if ((flags & VMUSAGE_MASK) == 0) + return (set_errno(EINVAL)); + + mutex_enter(&vmu_data.vmu_lock); + now = gethrtime(); + +start: + if (vmu_data.vmu_cache != NULL) { + + vmu_cache_t *cache; + + if ((vmu_data.vmu_cache->vmc_timestamp + + ((hrtime_t)age * NANOSEC)) > now) + cacherecent = 1; + + if ((vmu_data.vmu_cache->vmc_flags & flags) == flags && + cacherecent == 1) { + cache = vmu_data.vmu_cache; + vmu_cache_hold(cache); + mutex_exit(&vmu_data.vmu_lock); + + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + mutex_exit(&vmu_data.vmu_lock); + return (ret); 
+ } + /* + * If the cache is recent, it is likely that there are other + * consumers of vm_getusage running, so add their flags to the + * desired flags for the calculation. + */ + if (cacherecent == 1) + flags = vmu_data.vmu_cache->vmc_flags | flags; + } + if (vmu_data.vmu_calc_thread == NULL) { + + vmu_cache_t *cache; + + vmu_data.vmu_calc_thread = curthread; + vmu_data.vmu_calc_flags = flags; + vmu_data.vmu_entities = NULL; + vmu_data.vmu_nentities = 0; + if (vmu_data.vmu_pending_waiters > 0) + vmu_data.vmu_calc_flags |= + vmu_data.vmu_pending_flags; + + vmu_data.vmu_pending_flags = 0; + mutex_exit(&vmu_data.vmu_lock); + vmu_calculate(); + mutex_enter(&vmu_data.vmu_lock); + /* copy results to cache */ + if (vmu_data.vmu_cache != NULL) + vmu_cache_rele(vmu_data.vmu_cache); + cache = vmu_data.vmu_cache = + vmu_cache_alloc(vmu_data.vmu_nentities, + vmu_data.vmu_calc_flags); + + result = cache->vmc_results; + for (entity = vmu_data.vmu_entities; entity != NULL; + entity = entity->vme_next) { + *result = entity->vme_result; + result++; + } + cache->vmc_timestamp = gethrtime(); + vmu_cache_hold(cache); + + vmu_data.vmu_calc_flags = 0; + vmu_data.vmu_calc_thread = NULL; + + if (vmu_data.vmu_pending_waiters > 0) + cv_broadcast(&vmu_data.vmu_cv); + + mutex_exit(&vmu_data.vmu_lock); + + /* copy cache */ + ret = vmu_copyout_results(cache, buf, nres, flags_orig); + mutex_enter(&vmu_data.vmu_lock); + vmu_cache_rele(cache); + mutex_exit(&vmu_data.vmu_lock); + + return (ret); + } + vmu_data.vmu_pending_flags |= flags; + vmu_data.vmu_pending_waiters++; + while (vmu_data.vmu_calc_thread != NULL) { + if (cv_wait_sig(&vmu_data.vmu_cv, + &vmu_data.vmu_lock) == 0) { + vmu_data.vmu_pending_waiters--; + mutex_exit(&vmu_data.vmu_lock); + return (set_errno(EINTR)); + } + } + vmu_data.vmu_pending_waiters--; + goto start; +} |
