summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorakolb <none@none>2007-03-09 15:55:28 -0800
committerakolb <none@none>2007-03-09 15:55:28 -0800
commitc97ad5cdc75eb73e3cc38542ca3ba783574b0a7a (patch)
tree5ba1653d892978d87d6061c8c7f3821f4b3e354c
parent68d3ac02fc9db49ae9dccaecff999963114930a7 (diff)
downloadillumos-joyent-c97ad5cdc75eb73e3cc38542ca3ba783574b0a7a.tar.gz
PSARC/2004/402 CPU Caps
6327235 PSARC/2004/402 CPU caps 6464161 Dead KSLICE code should be removed 6514387 FX class contains dead code to keep list of member threads 6518395 kstat_zone_add performs KM_SLEEP allocation when it should not
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/genunix.c2
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/thread.c10
-rw-r--r--usr/src/cmd/prstat/prutil.c6
-rw-r--r--usr/src/cmd/zonecfg/zonecfg.c192
-rw-r--r--usr/src/cmd/zonecfg/zonecfg.h3
-rw-r--r--usr/src/cmd/zonecfg/zonecfg_grammar.y13
-rw-r--r--usr/src/cmd/zonecfg/zonecfg_lex.l2
-rw-r--r--usr/src/head/libzonecfg.h1
-rw-r--r--usr/src/lib/libdtrace/common/procfs.d.in8
-rw-r--r--usr/src/lib/libdtrace/common/procfs.sed.in2
-rw-r--r--usr/src/lib/libzonecfg/common/libzonecfg.c1
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com3
-rw-r--r--usr/src/uts/common/Makefile.files2
-rw-r--r--usr/src/uts/common/cpr/cpr_uthread.c3
-rw-r--r--usr/src/uts/common/disp/cpucaps.c1133
-rw-r--r--usr/src/uts/common/disp/fss.c117
-rw-r--r--usr/src/uts/common/disp/fx.c111
-rw-r--r--usr/src/uts/common/disp/sysclass.c31
-rw-r--r--usr/src/uts/common/disp/thread.c37
-rw-r--r--usr/src/uts/common/disp/ts.c100
-rw-r--r--usr/src/uts/common/dtrace/sdt_subr.c4
-rw-r--r--usr/src/uts/common/fs/proc/prcontrol.c23
-rw-r--r--usr/src/uts/common/fs/proc/prsubr.c5
-rw-r--r--usr/src/uts/common/os/clock.c32
-rw-r--r--usr/src/uts/common/os/cpu.c7
-rw-r--r--usr/src/uts/common/os/kstat_fr.c6
-rw-r--r--usr/src/uts/common/os/lwp.c12
-rw-r--r--usr/src/uts/common/os/msacct.c41
-rw-r--r--usr/src/uts/common/os/project.c58
-rw-r--r--usr/src/uts/common/os/sig.c7
-rw-r--r--usr/src/uts/common/os/task.c20
-rw-r--r--usr/src/uts/common/os/timers.c1
-rw-r--r--usr/src/uts/common/os/waitq.c386
-rw-r--r--usr/src/uts/common/os/zone.c102
-rw-r--r--usr/src/uts/common/sys/Makefile3
-rw-r--r--usr/src/uts/common/sys/cpucaps.h157
-rw-r--r--usr/src/uts/common/sys/cpucaps_impl.h102
-rw-r--r--usr/src/uts/common/sys/cpuvar.h1
-rw-r--r--usr/src/uts/common/sys/fss.h37
-rw-r--r--usr/src/uts/common/sys/fx.h14
-rw-r--r--usr/src/uts/common/sys/proc.h6
-rw-r--r--usr/src/uts/common/sys/project.h9
-rw-r--r--usr/src/uts/common/sys/schedctl.h8
-rw-r--r--usr/src/uts/common/sys/thread.h27
-rw-r--r--usr/src/uts/common/sys/ts.h52
-rw-r--r--usr/src/uts/common/sys/waitq.h90
-rw-r--r--usr/src/uts/common/sys/zone.h3
-rw-r--r--usr/src/uts/i86pc/os/trap.c2
-rw-r--r--usr/src/uts/intel/ia32/os/syscall.c4
-rw-r--r--usr/src/uts/sparc/os/syscall.c4
-rw-r--r--usr/src/uts/sun4/os/trap.c2
-rw-r--r--usr/src/uts/sun4u/ngdr/io/dr_quiesce.c5
-rw-r--r--usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c6
-rw-r--r--usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c5
54 files changed, 2728 insertions, 290 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index decf655500..51bebaedf5 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -124,6 +124,7 @@ pstat2ch(uchar_t state)
case SIDL: return ('I');
case SONPROC: return ('O');
case SSTOP: return ('T');
+ case SWAIT: return ('W');
default: return ('?');
}
}
@@ -148,6 +149,7 @@ ps_threadprint(uintptr_t addr, const void *data, void *private)
{ "TS_ONPROC", TS_ONPROC, TS_ONPROC },
{ "TS_ZOMB", TS_ZOMB, TS_ZOMB },
{ "TS_STOPPED", TS_STOPPED, TS_STOPPED },
+ { "TS_WAIT", TS_WAIT, TS_WAIT },
{ NULL, 0, 0 }
};
diff --git a/usr/src/cmd/mdb/common/modules/genunix/thread.c b/usr/src/cmd/mdb/common/modules/genunix/thread.c
index 704c27f42e..d552bd381f 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/thread.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/thread.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -455,6 +454,9 @@ thread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
case TS_STOPPED:
state = "stopped";
break;
+ case TS_WAIT:
+ state = "wait";
+ break;
default:
(void) mdb_snprintf(stbuf, 11, "inval/%02x", t.t_state);
state = stbuf;
diff --git a/usr/src/cmd/prstat/prutil.c b/usr/src/cmd/prstat/prutil.c
index 638dc8086f..8c0f99f138 100644
--- a/usr/src/cmd/prstat/prutil.c
+++ b/usr/src/cmd/prstat/prutil.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -197,8 +197,8 @@ Format_state(char *str, char state, processorid_t pr_id, int length)
case 'I':
(void) strncpy(str, "idle", length);
break;
- case 'X':
- (void) strncpy(str, "xbrk", length);
+ case 'W':
+ (void) strncpy(str, "wait", length);
break;
case 'O':
(void) snprintf(str, length, "cpu%-3d", (int)pr_id);
diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c
index 8a532faa49..ac5525604e 100644
--- a/usr/src/cmd/zonecfg/zonecfg.c
+++ b/usr/src/cmd/zonecfg/zonecfg.c
@@ -179,6 +179,7 @@ static char *res_types[] = {
ALIAS_SHARES,
"scheduling-class",
"ip-type",
+ "capped-cpu",
NULL
};
@@ -265,6 +266,7 @@ static const char *add_cmds[] = {
"add attr",
"add dataset",
"add dedicated-cpu",
+ "add capped-cpu",
"add capped-memory",
NULL
};
@@ -294,6 +296,7 @@ static const char *remove_cmds[] = {
"remove attr ",
"remove dataset ",
"remove dedicated-cpu ",
+ "remove capped-cpu ",
"remove capped-memory ",
NULL
};
@@ -307,6 +310,7 @@ static const char *select_cmds[] = {
"select attr ",
"select dataset ",
"select dedicated-cpu",
+ "select capped-cpu",
"select capped-memory",
NULL
};
@@ -340,6 +344,7 @@ static const char *info_cmds[] = {
"info dataset ",
"info capped-memory",
"info dedicated-cpu",
+ "info capped-cpu",
"info zonename",
"info zonepath",
"info autoboot",
@@ -451,6 +456,16 @@ static const char *pset_res_scope_cmds[] = {
NULL
};
+static const char *pcap_res_scope_cmds[] = {
+ "cancel",
+ "end",
+ "exit",
+ "help",
+ "info",
+ "set ncpus=",
+ NULL
+};
+
static const char *mcap_res_scope_cmds[] = {
"cancel",
"end",
@@ -605,6 +620,8 @@ CPL_MATCH_FN(cmd_cpl_fn)
return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end));
case RT_DCPU:
return (add_stuff(cpl, line, pset_res_scope_cmds, word_end));
+ case RT_PCAP:
+ return (add_stuff(cpl, line, pcap_res_scope_cmds, word_end));
case RT_MCAP:
return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end));
}
@@ -1003,6 +1020,20 @@ usage(bool verbose, uint_t flags)
pt_to_str(PT_IMPORTANCE),
gettext("<unsigned integer>"));
break;
+ case RT_PCAP:
+ (void) fprintf(fp, gettext("The '%s' resource scope is "
+ "used to set an upper limit (a cap) on the\n"
+ "percentage of CPU that can be used by this zone. "
+ "A '%s' value of 1\ncorresponds to one cpu. The "
+ "value can be set higher than 1, up to the total\n"
+ "number of CPUs on the system. The value can "
+ "also be less than 1,\nrepresenting a fraction of "
+ "a cpu.\n"),
+ rt_to_str(resource_scope), pt_to_str(PT_NCPUS));
+ (void) fprintf(fp, gettext("Valid commands:\n"));
+ (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+ pt_to_str(PT_NCPUS), gettext("<unsigned decimal>"));
+ break;
case RT_MCAP:
(void) fprintf(fp, gettext("The '%s' resource scope is "
"used to set an upper limit (a cap) on the\n"
@@ -1078,12 +1109,12 @@ usage(bool verbose, uint_t flags)
}
if (flags & HELP_RESOURCES) {
(void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t"
- "%s | %s | %s\n\n",
+ "%s | %s | %s | %s\n\n",
gettext("resource type"), rt_to_str(RT_FS),
rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE),
rt_to_str(RT_RCTL), rt_to_str(RT_ATTR),
rt_to_str(RT_DATASET), rt_to_str(RT_DCPU),
- rt_to_str(RT_MCAP));
+ rt_to_str(RT_PCAP), rt_to_str(RT_MCAP));
}
if (flags & HELP_PROPS) {
(void) fprintf(fp, gettext("For resource type ... there are "
@@ -1137,6 +1168,8 @@ usage(bool verbose, uint_t flags)
pt_to_str(PT_NAME));
(void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU),
pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE));
+ (void) fprintf(fp, "\t%s\t%s\n", rt_to_str(RT_PCAP),
+ pt_to_str(PT_NCPUS));
(void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP),
pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP),
pt_to_str(PT_LOCKED));
@@ -1835,6 +1868,11 @@ export_func(cmd_t *cmd)
(void) fprintf(of, "%s\n", cmd_to_str(CMD_END));
}
+ /*
+ * There is nothing to export for pcap since this resource is just
+ * a container for an rctl alias.
+ */
+
done:
if (need_to_close)
(void) fclose(of);
@@ -1908,6 +1946,7 @@ add_resource(cmd_t *cmd)
int type;
struct zone_psettab tmp_psettab;
struct zone_mcaptab tmp_mcaptab;
+ uint64_t tmp;
uint64_t tmp_mcap;
char pool[MAXNAMELEN];
@@ -1951,12 +1990,18 @@ add_resource(cmd_t *cmd)
bzero(&in_progress_dstab, sizeof (in_progress_dstab));
return;
case RT_DCPU:
- /* Make sure there isn't already a cpu-set entry. */
+ /* Make sure there isn't already a cpu-set or cpu-cap entry. */
if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
zerr(gettext("The %s resource already exists."),
rt_to_str(RT_DCPU));
goto bad;
}
+ if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) !=
+ Z_NO_ENTRY) {
+ zerr(gettext("The %s resource already exists."),
+ rt_to_str(RT_PCAP));
+ goto bad;
+ }
/* Make sure the pool property isn't set. */
if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK &&
@@ -1970,6 +2015,32 @@ add_resource(cmd_t *cmd)
bzero(&in_progress_psettab, sizeof (in_progress_psettab));
return;
+ case RT_PCAP:
+ /*
+ * Make sure there isn't already a cpu-set or incompatible
+ * cpu-cap rctls.
+ */
+ if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
+ zerr(gettext("The %s resource already exists."),
+ rt_to_str(RT_DCPU));
+ goto bad;
+ }
+
+ switch (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp)) {
+ case Z_ALIAS_DISALLOW:
+ zone_perror(rt_to_str(RT_PCAP), Z_ALIAS_DISALLOW,
+ FALSE);
+ goto bad;
+
+ case Z_OK:
+ zerr(gettext("The %s resource already exists."),
+ rt_to_str(RT_PCAP));
+ goto bad;
+
+ default:
+ break;
+ }
+ return;
case RT_MCAP:
/*
* Make sure there isn't already a mem-cap entry or max-swap
@@ -2967,6 +3038,25 @@ remove_pset()
}
static void
+remove_pcap()
+{
+ int err;
+ uint64_t tmp;
+
+ if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) != Z_OK) {
+ zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_PCAP),
+ zonecfg_strerror(Z_NO_RESOURCE_TYPE));
+ saw_error = TRUE;
+ return;
+ }
+
+ if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_CPUCAP)) != Z_OK)
+ z_cmd_rt_perror(CMD_REMOVE, RT_PCAP, err, TRUE);
+ else
+ need_to_commit = TRUE;
+}
+
+static void
remove_mcap()
{
int err, res1, res2, res3;
@@ -3074,6 +3164,9 @@ remove_resource(cmd_t *cmd)
case RT_DCPU:
remove_pset();
return;
+ case RT_PCAP:
+ remove_pcap();
+ return;
case RT_MCAP:
remove_mcap();
return;
@@ -3396,6 +3489,7 @@ select_func(cmd_t *cmd)
{
int type, err, res;
uint64_t limit;
+ uint64_t tmp;
if (zone_is_read_only(CMD_SELECT))
return;
@@ -3493,6 +3587,13 @@ select_func(cmd_t *cmd)
bcopy(&old_psettab, &in_progress_psettab,
sizeof (struct zone_psettab));
return;
+ case RT_PCAP:
+ if ((err = zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp))
+ != Z_OK) {
+ z_cmd_rt_perror(CMD_SELECT, RT_PCAP, err, TRUE);
+ global_scope = TRUE;
+ }
+ return;
case RT_MCAP:
/* if none of these exist, there is no resource to select */
if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK &&
@@ -3708,6 +3809,8 @@ set_func(cmd_t *cmd)
boolean_t force_set = FALSE;
size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap);
uint64_t mem_cap, mem_limit;
+ float cap;
+ char *unitp;
struct zone_psettab tmp_psettab;
bool arg_err = FALSE;
@@ -4200,6 +4303,34 @@ set_func(cmd_t *cmd)
long_usage(CMD_SET, TRUE);
usage(FALSE, HELP_PROPS);
return;
+ case RT_PCAP:
+ if (prop_type != PT_NCPUS) {
+ zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE,
+ TRUE);
+ long_usage(CMD_SET, TRUE);
+ usage(FALSE, HELP_PROPS);
+ return;
+ }
+
+ /*
+ * We already checked that an rctl alias is allowed in
+ * the add_resource() function.
+ */
+
+ if ((cap = strtof(prop_id, &unitp)) <= 0 || *unitp != '\0' ||
+ (int)(cap * 100) < 1) {
+ zerr(gettext("%s property is out of range."),
+ pt_to_str(PT_NCPUS));
+ saw_error = TRUE;
+ return;
+ }
+
+ if ((err = zonecfg_set_aliased_rctl(handle, ALIAS_CPUCAP,
+ (int)(cap * 100))) != Z_OK)
+ zone_perror(zone, err, TRUE);
+ else
+ need_to_commit = TRUE;
+ return;
case RT_MCAP:
switch (prop_type) {
case PT_PHYSICAL:
@@ -4790,6 +4921,26 @@ info_pset(zone_dochandle_t handle, FILE *fp)
}
static void
+output_pcap(FILE *fp)
+{
+ uint64_t cap;
+
+ if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &cap) == Z_OK) {
+ float scaled = (float)cap / 100;
+ (void) fprintf(fp, "%s:\n", rt_to_str(RT_PCAP));
+ (void) fprintf(fp, "\t[%s: %.2f]\n", pt_to_str(PT_NCPUS),
+ scaled);
+ }
+}
+
+static void
+info_pcap(FILE *fp)
+{
+ output_pcap(fp);
+}
+
+
+static void
info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias)
{
uint64_t limit;
@@ -4932,6 +5083,9 @@ info_func(cmd_t *cmd)
case RT_DCPU:
output_pset(fp, &in_progress_psettab);
break;
+ case RT_PCAP:
+ output_pcap(fp);
+ break;
case RT_MCAP:
res1 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP,
&swap_limit);
@@ -4986,6 +5140,7 @@ info_func(cmd_t *cmd)
info_dev(handle, fp, cmd);
}
info_pset(handle, fp);
+ info_pcap(fp);
info_mcap(handle, fp);
if (!global_zone) {
info_attr(handle, fp, cmd);
@@ -5062,6 +5217,9 @@ info_func(cmd_t *cmd)
case RT_DCPU:
info_pset(handle, fp);
break;
+ case RT_PCAP:
+ info_pcap(fp);
+ break;
case RT_MCAP:
info_mcap(handle, fp);
break;
@@ -5203,10 +5361,12 @@ verify_func(cmd_t *cmd)
char sched[MAXNAMELEN];
char brand[MAXNAMELEN];
int err, ret_val = Z_OK, arg;
+ int pset_res;
bool save = FALSE;
bool arg_err = FALSE;
zone_iptype_t iptype;
boolean_t has_cpu_shares = B_FALSE;
+ boolean_t has_cpu_cap = B_FALSE;
optind = 0;
while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) {
@@ -5333,6 +5493,9 @@ verify_func(cmd_t *cmd)
if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0)
has_cpu_shares = B_TRUE;
+ if (strcmp(rctltab.zone_rctl_name, "zone.cpu-cap") == 0)
+ has_cpu_cap = B_TRUE;
+
if (rctltab.zone_rctl_valptr == NULL) {
zerr(gettext("%s: no %s specified"),
rt_to_str(RT_RCTL), pt_to_str(PT_VALUE));
@@ -5345,7 +5508,8 @@ verify_func(cmd_t *cmd)
}
(void) zonecfg_endrctlent(handle);
- if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) {
+ if ((pset_res = zonecfg_lookup_pset(handle, &psettab)) == Z_OK &&
+ has_cpu_shares) {
zerr(gettext("%s zone.cpu-shares and %s are incompatible."),
rt_to_str(RT_RCTL), rt_to_str(RT_DCPU));
saw_error = TRUE;
@@ -5364,6 +5528,14 @@ verify_func(cmd_t *cmd)
ret_val = Z_INCOMPATIBLE;
}
+ if (pset_res == Z_OK && has_cpu_cap) {
+ zerr(gettext("%s zone.cpu-cap and the %s are incompatible."),
+ rt_to_str(RT_RCTL), rt_to_str(RT_DCPU));
+ saw_error = TRUE;
+ if (ret_val == Z_OK)
+ ret_val = Z_INCOMPATIBLE;
+ }
+
if ((err = zonecfg_setattrent(handle)) != Z_OK) {
zone_perror(zone, err, TRUE);
return;
@@ -5562,6 +5734,7 @@ end_func(cmd_t *cmd)
int err, arg, res1, res2, res3;
uint64_t swap_limit;
uint64_t locked_limit;
+ uint64_t proc_cap;
assert(cmd != NULL);
@@ -5888,6 +6061,17 @@ end_func(cmd_t *cmd)
err = zonecfg_modify_pset(handle, &in_progress_psettab);
}
break;
+ case RT_PCAP:
+ /* Make sure everything was filled in. */
+ if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &proc_cap)
+ != Z_OK) {
+ zerr(gettext("%s not specified"), pt_to_str(PT_NCPUS));
+ saw_error = TRUE;
+ validation_failed = TRUE;
+ return;
+ }
+ err = Z_OK;
+ break;
case RT_MCAP:
/* Make sure everything was filled in. */
res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ?
diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h
index 4f960b56d1..3369012438 100644
--- a/usr/src/cmd/zonecfg/zonecfg.h
+++ b/usr/src/cmd/zonecfg/zonecfg.h
@@ -94,9 +94,10 @@ typedef int bool;
#define RT_SHARES 22 /* really a rctl alias property, but for info */
#define RT_SCHED 23 /* really a property, but for info ... */
#define RT_IPTYPE 24 /* really a property, but for info ... */
+#define RT_PCAP 25
#define RT_MIN RT_UNKNOWN
-#define RT_MAX RT_IPTYPE
+#define RT_MAX RT_PCAP
/* property types: increment PT_MAX when expanding this list */
#define PT_UNKNOWN 0
diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y
index 7ee9b4d612..69fc6d98bd 100644
--- a/usr/src/cmd/zonecfg/zonecfg_grammar.y
+++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y
@@ -61,14 +61,14 @@ extern void yyerror(char *s);
%token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL
%token IPTYPE
%token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
-%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET
+%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET PCAP
%token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS
%token MAXSEMIDS LOCKED SWAP SCHED CLEAR
%type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val
%type <complex> complex_piece complex_prop_val
-%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP
+%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET PCAP MCAP
%type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME
MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT
ACTION BRAND SCHED IPTYPE
@@ -700,6 +700,14 @@ select_command: SELECT
$$->cmd_handler = &select_func;
$$->cmd_res_type = RT_DCPU;
}
+ | SELECT PCAP
+ {
+ if (($$ = alloc_cmd()) == NULL)
+ YYERROR;
+ cmd = $$;
+ $$->cmd_handler = &select_func;
+ $$->cmd_res_type = RT_PCAP;
+ }
| SELECT MCAP
{
if (($$ = alloc_cmd()) == NULL)
@@ -840,6 +848,7 @@ resource_type: NET { $$ = RT_NET; }
| ATTR { $$ = RT_ATTR; }
| DATASET { $$ = RT_DATASET; }
| PSET { $$ = RT_DCPU; }
+ | PCAP { $$ = RT_PCAP; }
| MCAP { $$ = RT_MCAP; }
property_name: SPECIAL { $$ = PT_SPECIAL; }
diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l
index 81a0594c22..9b8bc81295 100644
--- a/usr/src/cmd/zonecfg/zonecfg_lex.l
+++ b/usr/src/cmd/zonecfg/zonecfg_lex.l
@@ -173,6 +173,8 @@ char *safe_strdup(char *s);
<TSTATE>dedicated-cpu { return PSET; }
+<TSTATE>capped-cpu { return PCAP; }
+
<TSTATE>capped-memory { return MCAP; }
<TSTATE>zonepath { return ZONEPATH; }
diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h
index 83b70bc8e5..2eaf2e218a 100644
--- a/usr/src/head/libzonecfg.h
+++ b/usr/src/head/libzonecfg.h
@@ -145,6 +145,7 @@ extern "C" {
#define ALIAS_MAXLOCKEDMEM "locked"
#define ALIAS_MAXSWAP "swap"
#define ALIAS_SHARES "cpu-shares"
+#define ALIAS_CPUCAP "cpu-cap"
/*
* Bit flag definitions for passing into libzonecfg functions.
diff --git a/usr/src/lib/libdtrace/common/procfs.d.in b/usr/src/lib/libdtrace/common/procfs.d.in
index 915d754d88..0215f0d986 100644
--- a/usr/src/lib/libdtrace/common/procfs.d.in
+++ b/usr/src/lib/libdtrace/common/procfs.d.in
@@ -49,6 +49,8 @@ inline char SIDL = @SIDL@;
#pragma D binding "1.0" SIDL
inline char SONPROC = @SONPROC@;
#pragma D binding "1.0" SONPROC
+inline char SWAIT = @SWAIT@;
+#pragma D binding "1.0" SWAIT
inline int PR_STOPPED = @PR_STOPPED@;
#pragma D binding "1.0" PR_STOPPED
@@ -322,14 +324,16 @@ translator lwpsinfo_t < kthread_t *T > {
(T->t_state == @TS_RUN@) ? SRUN :
(T->t_state == @TS_ONPROC@) ? SONPROC :
(T->t_state == @TS_ZOMB@) ? SZOMB :
- (T->t_state == @TS_STOPPED@) ? SSTOP : 0;
+ (T->t_state == @TS_STOPPED@) ? SSTOP :
+ (T->t_state == @TS_WAIT@) ? SWAIT : 0;
pr_sname = (T->t_proc_flag & @TP_PRVSTOP@) ? 'T' :
(T->t_state == @TS_SLEEP@) ? 'S' :
(T->t_state == @TS_RUN@) ? 'R' :
(T->t_state == @TS_ONPROC@) ? 'O' :
(T->t_state == @TS_ZOMB@) ? 'Z' :
- (T->t_state == @TS_STOPPED@) ? 'T' : '?';
+ (T->t_state == @TS_STOPPED@) ? 'T' :
+ (T->t_state == @TS_WAIT@) ? 'W' : '?';
pr_syscall = T->t_sysnum;
pr_pri = T->t_pri;
diff --git a/usr/src/lib/libdtrace/common/procfs.sed.in b/usr/src/lib/libdtrace/common/procfs.sed.in
index f889f6333c..b4a7087a44 100644
--- a/usr/src/lib/libdtrace/common/procfs.sed.in
+++ b/usr/src/lib/libdtrace/common/procfs.sed.in
@@ -56,6 +56,7 @@ SED_REPLACE(TS_RUN)
SED_REPLACE(TS_ONPROC)
SED_REPLACE(TS_ZOMB)
SED_REPLACE(TS_STOPPED)
+SED_REPLACE(TS_WAIT)
SED_REPLACE(P_PR_FORK)
SED_REPLACE(P_PR_RUNLCL)
@@ -75,6 +76,7 @@ SED_REPLACE(SZOMB)
SED_REPLACE(SSTOP)
SED_REPLACE(SIDL)
SED_REPLACE(SONPROC)
+SED_REPLACE(SWAIT)
SED_REPLACE(CLDNOSIGCHLD)
SED_REPLACE(CLDWAITPID)
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index f2c5570861..33cc965e25 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -174,6 +174,7 @@ static struct alias {
{ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0},
{ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0},
{ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0},
+ {ALIAS_CPUCAP, "zone.cpu-cap", "privileged", "deny", 0},
{NULL, NULL, NULL, NULL, 0}
};
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index c0cbbb57a0..fc55931f62 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -546,6 +546,8 @@ f none usr/include/sys/bustypes.h 644 root bin
f none usr/include/sys/byteorder.h 644 root bin
f none usr/include/sys/callb.h 644 root bin
f none usr/include/sys/callo.h 644 root bin
+f none usr/include/sys/cpucaps.h 644 root bin
+f none usr/include/sys/cpucaps_impl.h 644 root bin
f none usr/include/sys/ccompile.h 644 root bin
f none usr/include/sys/cdio.h 644 root bin
f none usr/include/sys/cis.h 644 root bin
@@ -1205,6 +1207,7 @@ f none usr/include/sys/vuid_queue.h 644 root bin
f none usr/include/sys/vuid_state.h 644 root bin
f none usr/include/sys/vuid_store.h 644 root bin
f none usr/include/sys/wait.h 644 root bin
+f none usr/include/sys/waitq.h 644 root bin
f none usr/include/sys/watchpoint.h 644 root bin
f none usr/include/sys/xti_inet.h 644 root bin
f none usr/include/sys/xti_osi.h 644 root bin
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index b3433fc075..ced7d5c654 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -40,6 +40,7 @@ COMMON_CORE_OBJS += \
bitset.o \
bp_map.o \
brand.o \
+ cpucaps.o \
cmt.o \
cpu.o \
cpu_intr.o \
@@ -341,6 +342,7 @@ GENUNIX_OBJS += \
vnode.o \
vuid_queue.o \
vuid_store.o \
+ waitq.o \
watchpoint.o \
yield.o \
scsi_confdata.o \
diff --git a/usr/src/uts/common/cpr/cpr_uthread.c b/usr/src/uts/common/cpr/cpr_uthread.c
index 00d5e0e80b..49ea1dfb1f 100644
--- a/usr/src/uts/common/cpr/cpr_uthread.c
+++ b/usr/src/uts/common/cpr/cpr_uthread.c
@@ -148,8 +148,7 @@ cpr_stop_user(int wait)
aston(tp);
- if (tp->t_state == TS_SLEEP &&
- (tp->t_flag & T_WAKEABLE)) {
+ if (ISWAKEABLE(tp) || ISWAITING(tp)) {
setrun_locked(tp);
}
}
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
new file mode 100644
index 0000000000..b290c5ecc4
--- /dev/null
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -0,0 +1,1133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/disp.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/atomic.h>
+#include <sys/cpucaps_impl.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/rctl.h>
+#include <sys/errno.h>
+
+/*
+ * CPU Caps implementation
+ * =======================
+ *
+ * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
+ * usage for all projects running inside the zone. If the zone CPU cap is set
+ * below the project CPU cap, the latter will have no effect.
+ *
+ * When CPU usage of projects and/or zones reaches specified caps, threads in
+ * them do not get scheduled and instead are placed on wait queues associated
+ * with a cap. Such threads will start running again only when CPU usage drops
+ * below the cap level. Each zone and each project has its own wait queue.
+ *
+ * When CPU cap is set, the kernel continuously keeps track of CPU time used by
+ * capped zones and/or projects over a short time interval and calculates their
+ * current CPU usage as a percentage. When the accumulated usage reaches the CPU
+ * cap, LWPs running in the user-land (when they are not holding any critical
+ * kernel locks) are placed on special wait queues until their project's or
+ * zone's CPU usage drops below the cap.
+ *
+ * The system maintains a list of all capped projects and all capped zones. On
+ * every clock tick every active thread belonging to a capped project adds its
+ * CPU usage to its project. Usage from all projects belonging to a capped zone
+ * is aggregated to get the zone usage.
+ *
+ * When the current CPU usage is above the cap, a project or zone is considered
+ * over-capped. Every user thread caught running in an over-capped project or
+ * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
+ * is requested to surrender its CPU. This causes scheduling class specific
+ * CL_PREEMPT() callback to be invoked. The callback function places threads
+ * marked as TS_PROJWAITQ on a wait queue and calls switch().
+ *
+ * Threads are only placed on wait queues after trapping from user-land
+ * (they could be holding some user locks, but no kernel locks) and while
+ * returning from the trap back to the user-land when no kernel locks are held.
+ * Putting threads on wait queues in random places while running in the
+ * kernel might lead to all kinds of locking problems.
+ *
+ * Accounting
+ * ==========
+ *
+ * Accounting of CPU usage is based on per-thread micro-state accounting data.
+ * On every clock tick clock() adds new on-CPU time for every thread found on
+ * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
+ * New time means time since it was last accounted for. On-CPU times greater
+ * than 1 tick are truncated to 1 tick.
+ *
+ * Project CPU usage is aggregated from all threads within the project.
+ * Zone CPU usage is the sum of usages for all projects within the zone. Zone
+ * CPU usage is calculated on every clock tick by walking list of projects and
+ * adding their usage together.
+ *
+ * Decay
+ * =====
+ *
+ * CPU usage is decayed by the caps_update() routine which is called once per
+ * every clock tick. It walks lists of project caps and decays their usages by
+ * one per cent. If CPU usage drops below cap levels, threads on the wait queue
+ * are made runnable again, one thread per clock tick.
+ *
+ * Interfaces
+ * ==========
+ *
+ * The CPU Caps facility provides the following interfaces to the rest of the
+ * system:
+ *
+ * cpucaps_project_add(kproject_t *)
+ *
+ * Notifies the framework of a new project. It should be put on the
+ * capped_projects list if its zone has a cap.
+ *
+ * cpucaps_project_remove(kproject_t *)
+ *
+ * Remove the association between the specified project and its cap.
+ * Called right before the project is destroyed.
+ *
+ * cpucaps_project_set(kproject_t *, rctl_qty_t)
+ *
+ * Set project cap of the specified project to the specified value. Setting the
+ * value to NOCAP is equivalent to removing the cap.
+ *
+ * cpucaps_zone_set(zone_t *, rctl_qty_t)
+ *
+ * Set zone cap of the specified zone to the specified value. Setting the value
+ * to NOCAP is equivalent to removing the cap.
+ *
+ * cpucaps_zone_remove(zone_t *)
+ *
+ * Remove the association between the zone and its cap.
+ *
+ * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
+ *
+ * Charges specified thread's project the amount of on-CPU time that it used.
+ * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
+ * Otherwise returns True if project or zone should be penalized because its
+ * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
+ * bits in t_schedflag in this case.
+ *
+ * CPUCAPS_ENFORCE(kthread_id_t *)
+ *
+ * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
+ * state on project or zone wait queues, as requested by TS_PROJWAITQ or
+ * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
+ * wait queue or False otherwise.
+ *
+ * cpucaps_sc_init(caps_sc_t *)
+ *
+ * Initializes the scheduling-class specific CPU Caps data for a thread.
+ *
+ * LOCKS
+ * =====
+ *
+ * all the individual caps structures and their lists are protected by a global
+ * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
+ * caps, so it is usually uncontended. We avoid all blocking memory allocations
+ * while holding caps_lock to prevent clock() from blocking.
+ *
+ * Thread state is protected by the thread lock. It protects the association
+ * between a thread and its project and, as a consequence, to its zone. The
+ * association can not break while thread lock is held, so the project or zone
+ * cap are not going to disappear while thread lock is held.
+ *
+ * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
+ * grabbed by scheduling classes already holding thread lock at high PIL and by
+ * clock thread performing usage decay. We should do as little work as possible
+ * while holding the lock since it may be very hot. All threads in the project
+ * contend for the same cache line doing cap usage updates.
+ */
+
+/*
+ * caps_lock protects list of capped projects and zones, changes in the cap
+ * state and changes of the global cpucaps_enabled flag.
+ *
+ * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
+ * modified in parallel. This can be per-zone cap flag, but we don't keep any
+ * cap state for now.
+ */
+static kmutex_t caps_lock; /* lock to protect: */
+static list_t capped_zones; /* - list of zones with caps */
+static list_t capped_projects; /* - list of projects with caps */
+boolean_t cpucaps_enabled; /* - are there any caps defined? */
+boolean_t cpucaps_busy; /* - is framework busy? */
+
+/*
+ * The accounting is based on the number of nanoseconds threads spend running
+ * during a tick which is kept in the cap_tick_cost variable.
+ */
+static hrtime_t cap_tick_cost;
+
+/*
+ * How much of the usage value is decayed every clock tick
+ * Decay one per cent of value per tick
+ */
+#define CAP_DECAY_FACTOR 100
+
+/*
+ * Scale the value and round it to the closest integer value
+ */
+#define ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
+
+static void caps_update();
+
+/*
+ * CAP kstats.
+ */
+struct cap_kstat {
+ kstat_named_t cap_value;
+ kstat_named_t cap_usage;
+ kstat_named_t cap_nwait;
+ kstat_named_t cap_below;
+ kstat_named_t cap_above;
+ kstat_named_t cap_maxusage;
+ kstat_named_t cap_zonename;
+} cap_kstat = {
+ { "value", KSTAT_DATA_UINT64 },
+ { "usage", KSTAT_DATA_UINT64 },
+ { "nwait", KSTAT_DATA_UINT64 },
+ { "below_sec", KSTAT_DATA_UINT64 },
+ { "above_sec", KSTAT_DATA_UINT64 },
+ { "maxusage", KSTAT_DATA_UINT64 },
+ { "zonename", KSTAT_DATA_STRING },
+};
+
+
+static kmutex_t cap_kstat_lock;
+static int cap_kstat_update(kstat_t *, int);
+
+/*
+ * Initialize CPU caps infrastructure.
+ * - Initialize lists of capped zones and capped projects
+ * - Set cpucaps_clock_callout to NULL
+ */
+void
+cpucaps_init()
+{
+ /*
+ * Initialize global variables
+ */
+ cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
+
+ list_create(&capped_zones, sizeof (cpucap_t),
+ offsetof(cpucap_t, cap_link));
+ list_create(&capped_projects, sizeof (cpucap_t),
+ offsetof(cpucap_t, cap_link));
+
+ cpucaps_enabled = B_FALSE;
+ cpucaps_busy = B_FALSE;
+ cpucaps_clock_callout = NULL;
+}
+
+/*
+ * Initialize scheduling-class specific CPU Caps data.
+ */
+void
+cpucaps_sc_init(caps_sc_t *csc)
+{
+ csc->csc_cputime = 0;
+}
+
+/*
+ * Allocate and initialize cpucap structure
+ */
+static cpucap_t *
+cap_alloc(void)
+{
+ cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
+
+ DISP_LOCK_INIT(&cap->cap_usagelock);
+ waitq_init(&cap->cap_waitq);
+
+ return (cap);
+}
+
+/*
+ * Free cpucap structure
+ */
+static void
+cap_free(cpucap_t *cap)
+{
+ if (cap == NULL)
+ return;
+
+ /*
+ * This cap should not be active
+ */
+ ASSERT(!list_link_active(&cap->cap_link));
+ ASSERT(cap->cap_value == 0);
+ ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
+
+ waitq_fini(&cap->cap_waitq);
+ DISP_LOCK_DESTROY(&cap->cap_usagelock);
+
+ kmem_free(cap, sizeof (cpucap_t));
+}
+
+/*
+ * Activate cap - insert into active list and unblock its
+ * wait queue. Should be called with caps_lock held.
+ * The cap_value field is set to the value supplied.
+ */
+static void
+cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
+{
+ ASSERT(MUTEX_HELD(&caps_lock));
+
+ /*
+ * Cap can not be already enabled
+ */
+ ASSERT(!CAP_ENABLED(cap));
+ ASSERT(!list_link_active(&cap->cap_link));
+
+ list_insert_tail(l, cap);
+ cap->cap_below = cap->cap_above = 0;
+ cap->cap_maxusage = 0;
+ cap->cap_usage = 0;
+ cap->cap_value = value;
+ waitq_unblock(&cap->cap_waitq);
+ if (CPUCAPS_OFF()) {
+ cpucaps_enabled = B_TRUE;
+ cpucaps_clock_callout = caps_update;
+ }
+}
+
+/*
+ * Deactivate cap
+ * - Block its wait queue. This prevents any new threads from being
+ * enqueued there and moves all enqueued threads to the run queue.
+ * - Remove cap from list l.
+ * - Disable CPU caps globally if there are no capped projects or zones
+ *
+ * Should be called with caps_lock held.
+ */
+static void
+cap_disable(list_t *l, cpucap_t *cap)
+{
+ ASSERT(MUTEX_HELD(&caps_lock));
+ /*
+ * Cap should be currently active
+ */
+ ASSERT(CPUCAPS_ON());
+ ASSERT(list_link_active(&cap->cap_link));
+ ASSERT(CAP_ENABLED(cap));
+
+ waitq_block(&cap->cap_waitq);
+ list_remove(l, cap);
+ if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
+ cpucaps_enabled = B_FALSE;
+ cpucaps_clock_callout = NULL;
+ }
+ cap->cap_value = 0;
+ cap->cap_project = NULL;
+ cap->cap_zone = NULL;
+ if (cap->cap_kstat != NULL) {
+ kstat_delete(cap->cap_kstat);
+ cap->cap_kstat = NULL;
+ }
+
+}
+
+/*
+ * Enable cap for a project kpj
+ * It is safe to enable already enabled project cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_project_enable(kproject_t *kpj, hrtime_t value)
+{
+ cpucap_t *cap = kpj->kpj_cpucap;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(cap != NULL);
+
+ if (CAP_DISABLED(cap)) {
+ ASSERT(cap->cap_kstat == NULL);
+ cap_enable(&capped_projects, cap, value);
+ cap->cap_project = kpj;
+ cap->cap_zone = kpj->kpj_zone;
+
+ /*
+ * Create cap kstats
+ */
+ if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
+ KSTAT_TYPE_NAMED,
+ sizeof (cap_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL)) != NULL) {
+ cap->cap_kstat->ks_data_size +=
+ strlen(cap->cap_zone->zone_name) + 1;
+ cap->cap_kstat->ks_lock = &cap_kstat_lock;
+ cap->cap_kstat->ks_data = &cap_kstat;
+ cap->cap_kstat->ks_update = cap_kstat_update;
+ cap->cap_kstat->ks_private = cap;
+ kstat_install(cap->cap_kstat);
+ }
+ }
+}
+
+/*
+ * Disable project cap.
+ * It is safe to disable already disabled project cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_project_disable(kproject_t *kpj)
+{
+ cpucap_t *cap = kpj->kpj_cpucap;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(cap != NULL);
+ ASSERT(cap->cap_project == kpj);
+
+ if (CAP_ENABLED(cap))
+ cap_disable(&capped_projects, cap);
+}
+
+/*
+ * Enable cap for a zone
+ * It is safe to enable already enabled zone cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_zone_enable(zone_t *zone, hrtime_t value)
+{
+ cpucap_t *cap = zone->zone_cpucap;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(cap != NULL);
+
+ if (CAP_DISABLED(cap)) {
+ ASSERT(cap->cap_kstat == NULL);
+ cap_enable(&capped_zones, cap, value);
+ cap->cap_zone = zone;
+
+ /*
+ * Create cap kstats
+ */
+ if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
+ KSTAT_TYPE_NAMED,
+ sizeof (cap_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL)) != NULL) {
+ cap->cap_kstat->ks_data_size +=
+ strlen(cap->cap_zone->zone_name) + 1;
+ cap->cap_kstat->ks_lock = &cap_kstat_lock;
+ cap->cap_kstat->ks_data = &cap_kstat;
+ cap->cap_kstat->ks_update = cap_kstat_update;
+ cap->cap_kstat->ks_private = cap;
+ kstat_install(cap->cap_kstat);
+ }
+ }
+}
+
+/*
+ * Disable zone cap.
+ * It is safe to disable already disabled zone cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_zone_disable(zone_t *zone)
+{
+ cpucap_t *cap = zone->zone_cpucap;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(cap != NULL);
+ ASSERT(cap->cap_zone == zone);
+
+ if (CAP_ENABLED(cap))
+ cap_disable(&capped_zones, cap);
+}
+
+/*
+ * Apply specified callback to all caps contained in the list `l'.
+ */
+static void
+cap_walk(list_t *l, void (*cb)(cpucap_t *))
+{
+ cpucap_t *cap;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+
+ for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
+ (*cb)(cap);
+ }
+}
+
+/*
+ * If cap limit is not reached, make one thread from wait queue runnable.
+ * The waitq_isempty check is performed without the waitq lock. If a new thread
+ * is placed on the waitq right after the check, it will be picked up during the
+ * next invocation of cap_poke_waitq().
+ */
+static void
+cap_poke_waitq(cpucap_t *cap)
+{
+ ASSERT(MUTEX_HELD(&caps_lock));
+
+ if (cap->cap_usage >= cap->cap_value) {
+ cap->cap_above++;
+ } else {
+ waitq_t *wq = &cap->cap_waitq;
+
+ cap->cap_below++;
+
+ if (!waitq_isempty(wq))
+ waitq_runone(wq);
+ }
+}
+
+/*
+ * The callback function called for every cap on capped_projects list.
+ * Decay cap usage by CAP_DECAY_FACTOR
+ * Add this cap project usage to its zone usage.
+ * Kick off a thread from the cap waitq if cap is not reached.
+ */
+static void
+cap_project_usage_walker(cpucap_t *cap)
+{
+ zone_t *zone = cap->cap_zone;
+ hrtime_t cap_usage = cap->cap_usage;
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(cap->cap_project->kpj_cpucap == cap);
+ ASSERT(zone == cap->cap_project->kpj_zone);
+ ASSERT(CAP_ENABLED(cap));
+
+ /*
+ * Set or clear the CAP_REACHED flag based on the current usage.
+ * Only projects having their own caps are ever marked as CAP_REACHED.
+ */
+ cap_poke_waitq(cap);
+
+ /*
+ * Add project's CPU usage to our zone's CPU usage.
+ */
+ if (ZONE_IS_CAPPED(zone)) {
+ cpucap_t *zcap = zone->zone_cpucap;
+
+ ASSERT(zcap->cap_zone == zone);
+
+ /*
+ * If we haven't reset this zone's usage during this clock tick
+ * yet, then do it now. The cap_lbolt field is used to check
+ * whether this is the first zone's project we see during this
+ * tick or a subsequent one.
+ */
+ if (zcap->cap_lbolt != lbolt64) {
+ if (zcap->cap_usage > zcap->cap_maxusage)
+ zcap->cap_maxusage = zcap->cap_usage;
+ zcap->cap_usage = 0;
+ }
+ DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
+ hrtime_t, cap_usage);
+ zcap->cap_usage += cap_usage;
+ /* Check for overflows */
+ if (zcap->cap_usage < 0)
+ zcap->cap_usage = MAX_USAGE - 1;
+ }
+
+ /*
+ * Decay project usage.
+ */
+ disp_lock_enter(&cap->cap_usagelock);
+ cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
+ disp_lock_exit(&cap->cap_usagelock);
+}
+
+/*
+ * On every clock tick walk the list of project caps and update the CPU usage.
+ * Also walk the list of zone caps checking whether any threads should
+ * transition from wait queue to run queue.
+ *
+ * This function gets called by the clock thread directly when there are any
+ * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
+ * caps_lock for long periods of time, so there should be almost no contention
+ * for it.
+ */
+static void
+caps_update()
+{
+ mutex_enter(&caps_lock);
+ cap_walk(&capped_projects, cap_project_usage_walker);
+ cap_walk(&capped_zones, cap_poke_waitq);
+ mutex_exit(&caps_lock);
+}
+
+/*
+ * The function is called for each project in a zone when the zone cap is
+ * modified. It enables project caps if zone cap is enabled and disables if the
+ * zone cap is disabled and project doesn't have its own cap.
+ *
+ * For each project that does not have cpucap structure allocated it allocates a
+ * new structure and assigns to kpj->kpj_cpucap. The allocation is performed
+ * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
+ * held.
+ */
+static int
+cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
+{
+ cpucap_t *project_cap = NULL;
+ cpucap_t *zone_cap = (cpucap_t *)arg;
+
+ ASSERT(zone_cap != NULL);
+
+ if (kpj->kpj_cpucap == NULL) {
+ /*
+ * This is the first time any cap was established for this
+ * project. Allocate a new cpucap structure for it.
+ */
+ project_cap = cap_alloc();
+ }
+
+ mutex_enter(&caps_lock);
+
+ /*
+ * Double-check that kpj_cpucap is still NULL - now with caps_lock held
+ * and assign the newly allocated cpucap structure to it.
+ */
+ if (kpj->kpj_cpucap == NULL) {
+ kpj->kpj_cpucap = project_cap;
+ } else if (project_cap != NULL) {
+ cap_free(project_cap);
+ }
+
+ project_cap = kpj->kpj_cpucap;
+
+ if (CAP_DISABLED(zone_cap)) {
+ /*
+ * Remove all projects in this zone without caps
+ * from the capped_projects list.
+ */
+ if (project_cap->cap_value == MAX_USAGE) {
+ cap_project_disable(kpj);
+ }
+ } else if (CAP_DISABLED(project_cap)) {
+ /*
+ * Add the project to capped_projects list.
+ */
+ ASSERT(project_cap->cap_value == 0);
+ cap_project_enable(kpj, MAX_USAGE);
+ }
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone cap to cap_val
+ * If cap_val is equal to NOCAP, disable zone cap.
+ *
+ * If this is the first time a cap is set on a zone, allocate cpucap structure
+ * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
+ */
+int
+cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ if (cap_val == 0)
+ return (EINVAL);
+
+ ASSERT(cap_val <= MAXCAP);
+ if (cap_val > MAXCAP)
+ cap_val = MAXCAP;
+
+ /*
+ * Nothing to do if trying to disable a cap on a zone when caps are off
+ * or a zone which does not have a cap yet.
+ */
+ if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+ value = cap_val * cap_tick_cost;
+ if (value < 0)
+ value = MAX_USAGE;
+
+ /* Nothing to do if the value is staying the same */
+ if (value == cap->cap_value) {
+ mutex_exit(&caps_lock);
+ return (0);
+ }
+
+ /*
+ * Clear cap statistics since the cap value itself changes.
+ */
+ cap->cap_above = cap->cap_below = 0;
+
+
+ if (cap_val == NOCAP) {
+ if (CAP_ENABLED(cap)) {
+ /*
+ * Remove cap for the zone
+ */
+ cap_zone_disable(zone);
+ cpucaps_busy = B_TRUE;
+ mutex_exit(&caps_lock);
+ /*
+ * Disable caps for all projects belonging to this zone
+ * unless they have their own cap.
+ */
+ (void) project_walk_all(zone->zone_id,
+ cap_project_zone_modify_walker, cap);
+
+ mutex_enter(&caps_lock);
+ cpucaps_busy = B_FALSE;
+ }
+ } else if (CAP_DISABLED(cap)) {
+ /*
+ * Set a cap on a zone which previously was not capped.
+ */
+ cap_zone_enable(zone, value);
+ cpucaps_busy = B_TRUE;
+ mutex_exit(&caps_lock);
+
+ /*
+ * Enable cap for all projects belonging to this zone.
+ */
+ (void) project_walk_all(zone->zone_id,
+ cap_project_zone_modify_walker, cap);
+
+ mutex_enter(&caps_lock);
+ cpucaps_busy = B_FALSE;
+ } else {
+ /*
+ * No state transitions, just change the value
+ */
+ cap->cap_value = value;
+ }
+
+ ASSERT(MUTEX_HELD(&caps_lock));
+ ASSERT(!cpucaps_busy);
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * The project is going away so disable its cap.
+ */
+void
+cpucaps_project_remove(kproject_t *kpj)
+{
+ mutex_enter(&caps_lock);
+ if (PROJECT_IS_CAPPED(kpj))
+ cap_project_disable(kpj);
+ if (kpj->kpj_cpucap != NULL) {
+ cap_free(kpj->kpj_cpucap);
+ kpj->kpj_cpucap = NULL;
+ }
+ mutex_exit(&caps_lock);
+}
+
+/*
+ * The zone is going away, so disable its cap.
+ */
+void
+cpucaps_zone_remove(zone_t *zone)
+{
+ mutex_enter(&caps_lock);
+ while (ZONE_IS_CAPPED(zone)) {
+ mutex_exit(&caps_lock);
+ (void) cpucaps_zone_set(zone, NOCAP);
+ mutex_enter(&caps_lock);
+ }
+ if (zone->zone_cpucap != NULL) {
+ cap_free(zone->zone_cpucap);
+ zone->zone_cpucap = NULL;
+ }
+ mutex_exit(&caps_lock);
+}
+
+/*
+ * New project was created. It should be put on the capped_projects list if
+ * its zone has a cap.
+ */
+void
+cpucaps_project_add(kproject_t *kpj)
+{
+ cpucap_t *cap = NULL;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
+ return;
+
+ /*
+ * This project was never capped before, so allocate its cap structure.
+ */
+ if (kpj->kpj_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+ /*
+ * Double-check with caps_lock held
+ */
+ if (kpj->kpj_cpucap == NULL) {
+ kpj->kpj_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ if (ZONE_IS_CAPPED(kpj->kpj_zone))
+ cap_project_enable(kpj, MAX_USAGE);
+
+ mutex_exit(&caps_lock);
+}
+
+/*
+ * Set project cap to cap_val
+ * If cap_val is equal to NOCAP, disable project cap.
+ *
+ * If this is the first time a cap is set on a project, allocate cpucap
+ * structure without holding caps_lock to avoid KM_SLEEP allocation with
+ * caps_lock held.
+ */
+int
+cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ if (cap_val == 0)
+ return (EINVAL);
+
+ ASSERT(cap_val <= MAXCAP);
+ if (cap_val > MAXCAP)
+ cap_val = MAXCAP;
+
+ /*
+ * Nothing to do if trying to disable project cap and caps are not
+ * enabled or if trying to disable cap on a project that does not have
+ * cap enabled.
+ */
+ if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
+ return (0);
+
+ if (kpj->kpj_cpucap == NULL) {
+ /*
+ * This project was never capped before, so allocate its cap
+ * structure.
+ */
+ cap = cap_alloc();
+ }
+
+ mutex_enter(&caps_lock);
+
+ /*
+ * Double-check with caps_lock held.
+ */
+ if (kpj->kpj_cpucap == NULL) {
+ kpj->kpj_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ /*
+ * Get the actual pointer to the project cap.
+ */
+ cap = kpj->kpj_cpucap;
+ value = cap_val * cap_tick_cost;
+ if (value < 0)
+ value = MAX_USAGE;
+
+ /*
+ * Nothing to do if the value is not changing
+ */
+ if (value == cap->cap_value) {
+ mutex_exit(&caps_lock);
+ return (0);
+ }
+
+ /*
+ * Clear cap statistics since the cap value itself changes.
+ */
+ cap->cap_above = cap->cap_below = 0;
+ cap->cap_maxusage = 0;
+
+ if (cap_val != NOCAP) {
+ /*
+ * Enable this cap if it is not already enabled.
+ */
+ if (CAP_DISABLED(cap))
+ cap_project_enable(kpj, value);
+ else
+ cap->cap_value = value;
+ } else if (CAP_ENABLED(cap)) {
+ /*
+ * User requested to drop a cap on the project. If it is part of
+ * capped zone, keep the cap and set the value to MAX_USAGE,
+ * otherwise disable the cap.
+ */
+ if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
+ cap->cap_value = MAX_USAGE;
+ } else {
+ cap_project_disable(kpj);
+ }
+ }
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Get cap usage.
+ */
+static rctl_qty_t
+cap_get(cpucap_t *cap)
+{
+ return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current project usage.
+ */
+rctl_qty_t
+cpucaps_project_get(kproject_t *kpj)
+{
+ return (cap_get(kpj->kpj_cpucap));
+}
+
+/*
+ * Get current zone usage.
+ */
+rctl_qty_t
+cpucaps_zone_get(zone_t *zone)
+{
+ return (cap_get(zone->zone_cpucap));
+}
+
+/*
+ * Charge project of thread t the time thread t spent on CPU since previously
+ * adjusted.
+ *
+ * Record the current on-CPU time in the csc structure.
+ *
+ * Do not adjust for more than one tick worth of time.
+ *
+ */
+static void
+caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
+{
+ kproject_t *kpj = ttoproj(t);
+ hrtime_t new_usage;
+ hrtime_t usage_delta;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+ ASSERT(PROJECT_IS_CAPPED(kpj));
+
+ /* Get on-CPU time since birth of a thread */
+ new_usage = mstate_thread_onproc_time(t);
+
+ /* Time spent on CPU since last checked */
+ usage_delta = new_usage - csc->csc_cputime;
+
+ /* Save the accumulated on-CPU time */
+ csc->csc_cputime = new_usage;
+
+ /* Charge at most one tick worth of on-CPU time */
+ if (usage_delta > cap_tick_cost)
+ usage_delta = cap_tick_cost;
+
+ /* Add usage_delta to the project usage value. */
+ if (usage_delta > 0) {
+ cpucap_t *cap = kpj->kpj_cpucap;
+
+ DTRACE_PROBE2(cpucaps__project__charge,
+ kthread_id_t, t, hrtime_t, usage_delta);
+
+ disp_lock_enter_high(&cap->cap_usagelock);
+ cap->cap_usage += usage_delta;
+
+ /* Check for overflows */
+ if (cap->cap_usage < 0)
+ cap->cap_usage = MAX_USAGE - 1;
+
+ disp_lock_exit_high(&cap->cap_usagelock);
+
+ /*
+ * cap_maxusage is only kept for observability. Move it outside
+ * the lock to reduce the time spent while holding the lock.
+ */
+ if (cap->cap_usage > cap->cap_maxusage)
+ cap->cap_maxusage = cap->cap_usage;
+ }
+}
+
+/*
+ * Charge thread's project and return True if project or zone should be
+ * penalized because its project or zone is exceeding its cap. Also sets
+ * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
+ */
+boolean_t
+cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
+{
+ kproject_t *kpj = ttoproj(t);
+ klwp_t *lwp = t->t_lwp;
+ zone_t *zone;
+ cpucap_t *project_cap;
+ boolean_t rc = B_FALSE;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+
+ /* Nothing to do for projects that are not capped. */
+ if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
+ return (B_FALSE);
+
+ caps_charge_adjust(t, csc);
+
+ /*
+ * The caller only requested to charge the project usage, no enforcement
+ * part.
+ */
+ if (charge_type == CPUCAPS_CHARGE_ONLY)
+ return (B_FALSE);
+
+ project_cap = kpj->kpj_cpucap;
+
+ if (project_cap->cap_usage >= project_cap->cap_value) {
+ t->t_schedflag |= TS_PROJWAITQ;
+ rc = B_TRUE;
+ } else if (t->t_schedflag & TS_PROJWAITQ) {
+ t->t_schedflag &= ~TS_PROJWAITQ;
+ }
+
+ zone = ttozone(t);
+ if (!ZONE_IS_CAPPED(zone)) {
+ if (t->t_schedflag & TS_ZONEWAITQ)
+ t->t_schedflag &= ~TS_ZONEWAITQ;
+ } else {
+ cpucap_t *zone_cap = zone->zone_cpucap;
+
+ if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ t->t_schedflag |= TS_ZONEWAITQ;
+ rc = B_TRUE;
+ } else if (t->t_schedflag & TS_ZONEWAITQ) {
+ t->t_schedflag &= ~TS_ZONEWAITQ;
+ }
+ }
+
+
+ return (rc);
+}
+
+/*
+ * Enforce CPU caps. If got preempted in the user-land, we know that thread does
+ * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
+ *
+ * CPU Caps are only enforced for user threads.
+ *
+ * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
+ * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
+ *
+ * It is possible that by the time we enter cpucaps_enforce() the cap is already
+ * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
+ * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
+ * apply.
+ */
+boolean_t
+cpucaps_enforce(kthread_t *t)
+{
+ klwp_t *lwp = t->t_lwp;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+
+ if (lwp != NULL && lwp->lwp_state == LWP_USER) {
+ if (t->t_schedflag & TS_PROJWAITQ) {
+ ASSERT(ttoproj(t)->kpj_cpucap != NULL);
+ t->t_schedflag &= ~TS_ANYWAITQ;
+ if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
+ t)) {
+ return (B_TRUE);
+ }
+ }
+ if (t->t_schedflag & TS_ZONEWAITQ) {
+ ASSERT(ttozone(t)->zone_cpucap != NULL);
+ t->t_schedflag &= ~TS_ZONEWAITQ;
+ if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
+ t)) {
+ return (B_TRUE);
+ }
+ }
+ }
+
+ /*
+ * The thread is not enqueued on the wait queue.
+ */
+ return (B_FALSE);
+}
+
+/*
+ * Convert internal cap statistics into values exported by cap kstat.
+ */
+static int
+cap_kstat_update(kstat_t *ksp, int rw)
+{
+ struct cap_kstat *capsp = &cap_kstat;
+ cpucap_t *cap = ksp->ks_private;
+ clock_t tick_sec = SEC_TO_TICK(1);
+ char *zonename = cap->cap_zone->zone_name;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ capsp->cap_value.value.ui64 =
+ ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_usage.value.ui64 =
+ ROUND_SCALE(cap->cap_usage, cap_tick_cost);
+ capsp->cap_maxusage.value.ui64 =
+ ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
+ capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
+ capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
+ capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ kstat_named_setstr(&capsp->cap_zonename, zonename);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c
index af2b04f11b..a409ebc800 100644
--- a/usr/src/uts/common/disp/fss.c
+++ b/usr/src/uts/common/disp/fss.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,6 +54,7 @@
#include <sys/tnf_probe.h>
#include <sys/policy.h>
#include <sys/sdt.h>
+#include <sys/cpucaps.h>
/*
* FSS Data Structures:
@@ -1069,6 +1069,7 @@ fss_update_list(int i)
goto next;
if ((fssproc->fss_flags & FSSKPRI) != 0)
goto next;
+
fssproj = FSSPROC2FSSPROJ(fssproc);
if (fssproj == NULL)
goto next;
@@ -1084,7 +1085,7 @@ fss_update_list(int i)
if (t->t_schedctl && schedctl_get_nopreempt(t))
goto next;
- if (t->t_state != TS_RUN) {
+ if (t->t_state != TS_RUN && t->t_state != TS_WAIT) {
/*
* Make next syscall/trap call fss_trapret
*/
@@ -1373,6 +1374,7 @@ fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
fssproc->fss_timeleft = fss_quantum;
fssproc->fss_tp = t;
+ cpucaps_sc_init(&fssproc->fss_caps);
/*
* Put a lock on our fsspset structure.
@@ -1420,7 +1422,8 @@ fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
t->t_cldata = (void *)fssproc;
t->t_schedflag |= TS_RUNQMATCH;
fss_change_priority(t, fssproc);
- if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+ if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+ t->t_state == TS_WAIT)
fss_active(t);
thread_unlock(t);
@@ -1568,6 +1571,8 @@ fss_fork(kthread_t *pt, kthread_t *ct, void *bufp)
cfssproc->fss_upri = pfssproc->fss_upri;
cfssproc->fss_tp = ct;
cfssproc->fss_nice = pfssproc->fss_nice;
+ cpucaps_sc_init(&cfssproc->fss_caps);
+
cfssproc->fss_flags =
pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
ct->t_cldata = (void *)cfssproc;
@@ -1793,6 +1798,14 @@ fss_exit(kthread_t *t)
}
mutex_exit(&fsspset->fssps_lock);
mutex_exit(&fsspsets_lock);
+
+ if (CPUCAPS_ON()) {
+ thread_lock(t);
+ fssproc = FSSPROC(t);
+ (void) cpucaps_charge(t, &fssproc->fss_caps,
+ CPUCAPS_CHARGE_ONLY);
+ thread_unlock(t);
+ }
}
static void
@@ -1861,7 +1874,8 @@ fss_swapout(kthread_t *t, int flags)
if (INHERITED(t) ||
(fssproc->fss_flags & FSSKPRI) ||
(t->t_proc_flag & TP_LWPEXIT) ||
- (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) ||
+ (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
+ TS_ONPROC | TS_WAIT)) ||
!(t->t_schedflag & TS_LOAD) ||
!(SWAP_OK(t)))
return (-1);
@@ -1971,6 +1985,20 @@ fss_preempt(kthread_t *t)
t->t_trapret = 1; /* so that fss_trapret will run */
aston(t);
}
+
+ /*
+ * This thread may be placed on wait queue by CPU Caps. In this case we
+ * do not need to do anything until it is removed from the wait queue.
+ * Do not enforce CPU caps on threads running at a kernel priority
+ */
+ if (CPUCAPS_ON()) {
+ (void) cpucaps_charge(t, &fssproc->fss_caps,
+ CPUCAPS_CHARGE_ONLY);
+
+ if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
+ return;
+ }
+
/*
* If preempted in user-land mark the thread as swappable because it
* cannot be holding any kernel locks.
@@ -2077,6 +2105,12 @@ fss_sleep(kthread_t *t)
ASSERT(THREAD_LOCK_HELD(t));
ASSERT(t->t_state == TS_ONPROC);
+
+ /*
+ * Account for time spent on CPU before going to sleep.
+ */
+ (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY);
+
fss_inactive(t);
/*
@@ -2117,6 +2151,8 @@ fss_tick(kthread_t *t)
fssproc_t *fssproc;
fssproj_t *fssproj;
klwp_t *lwp;
+ boolean_t call_cpu_surrender = B_FALSE;
+ boolean_t cpucaps_enforce = B_FALSE;
ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
@@ -2136,6 +2172,17 @@ fss_tick(kthread_t *t)
}
/*
+ * Keep track of thread's project CPU usage. Note that projects
+ * get charged even when threads are running in the kernel.
+ * Do not surrender CPU if running in the SYS class.
+ */
+ if (CPUCAPS_ON()) {
+ cpucaps_enforce = cpucaps_charge(t,
+ &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
+ !(fssproc->fss_flags & FSSKPRI);
+ }
+
+ /*
* A thread's execution time for threads running in the SYS class
* is not tracked.
*/
@@ -2180,8 +2227,7 @@ fss_tick(kthread_t *t)
t->t_schedflag &= ~TS_DONT_SWAP;
fssproc->fss_timeleft = fss_quantum;
} else {
- fssproc->fss_flags |= FSSBACKQ;
- cpu_surrender(t);
+ call_cpu_surrender = B_TRUE;
}
} else if (t->t_state == TS_ONPROC &&
t->t_pri < t->t_disp_queue->disp_maxrunpri) {
@@ -2190,10 +2236,38 @@ fss_tick(kthread_t *t)
* waiting for a processor, then thread surrenders
* the processor.
*/
- fssproc->fss_flags |= FSSBACKQ;
- cpu_surrender(t);
+ call_cpu_surrender = B_TRUE;
}
}
+
+ if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
+ /*
+ * The thread used more than half of its quantum, so assume that
+ * it used the whole quantum.
+ *
+ * Update thread's priority just before putting it on the wait
+ * queue so that it gets charged for the CPU time from its
+ * quantum even before that quantum expires.
+ */
+ fss_newpri(fssproc);
+ if (t->t_pri != fssproc->fss_umdpri)
+ fss_change_priority(t, fssproc);
+
+ /*
+ * We need to call cpu_surrender for this thread due to cpucaps
+ * enforcement, but fss_change_priority may have already done
+ * so. In this case FSSBACKQ is set and there is no need to call
+ * cpu-surrender again.
+ */
+ if (!(fssproc->fss_flags & FSSBACKQ))
+ call_cpu_surrender = B_TRUE;
+ }
+
+ if (call_cpu_surrender) {
+ fssproc->fss_flags |= FSSBACKQ;
+ cpu_surrender(t);
+ }
+
thread_unlock_nopreempt(t); /* clock thread can't be preempted */
}
@@ -2336,6 +2410,11 @@ fss_yield(kthread_t *t)
ASSERT(THREAD_LOCK_HELD(t));
/*
+ * Collect CPU usage spent before yielding
+ */
+ (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY);
+
+ /*
* Clear the preemption control "yield" bit since the user is
* doing a yield.
*/
@@ -2439,7 +2518,8 @@ fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf,
ASSERT(fssproj_new != NULL);
thread_lock(t);
- if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+ if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+ t->t_state == TS_WAIT)
fss_inactive(t);
ASSERT(fssproj_old->fssp_threads > 0);
if (--fssproj_old->fssp_threads == 0) {
@@ -2449,7 +2529,8 @@ fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf,
fssproc->fss_proj = fssproj_new;
fssproc->fss_fsspri = 0;
fssproj_new->fssp_threads++;
- if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+ if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+ t->t_state == TS_WAIT)
fss_active(t);
thread_unlock(t);
if (free) {
@@ -2528,12 +2609,14 @@ fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf,
fssproj_new->fssp_threads++;
thread_lock(t);
- if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
- fss_inactive(t);
+ if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+ t->t_state == TS_WAIT)
+ fss_inactive(t);
fssproc->fss_proj = fssproj_new;
fssproc->fss_fsspri = 0;
- if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
- fss_active(t);
+ if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+ t->t_state == TS_WAIT)
+ fss_active(t);
thread_unlock(t);
mutex_exit(&fsspset_new->fssps_lock);
diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c
index 62b5eb1da2..53bfb46e2a 100644
--- a/usr/src/uts/common/disp/fx.c
+++ b/usr/src/uts/common/disp/fx.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,6 +53,7 @@
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/cpupart.h>
+#include <sys/cpucaps.h>
static pri_t fx_init(id_t, int, classfuncs_t **);
@@ -85,40 +85,6 @@ static struct modlinkage modlinkage = {
#define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */
/*
- * The fxproc_t structures are kept in an array of circular doubly linked
- * lists. A hash on the thread pointer is used to determine which list
- * each fxproc structure should be placed. Each list has a dummy "head" which
- * is never removed, so the list is never empty.
- */
-
-#define FX_LISTS 16 /* number of lists, must be power of 2 */
-#define FX_LIST_HASH(tp) (((uintptr_t)(tp) >> 9) & (FX_LISTS - 1))
-
-#define FX_LIST_INSERT(fxpp) \
-{ \
- int index = FX_LIST_HASH(fxpp->fx_tp); \
- kmutex_t *lockp = &fx_list_lock[index]; \
- fxproc_t *headp = &fx_plisthead[index]; \
- mutex_enter(lockp); \
- fxpp->fx_next = headp->fx_next; \
- fxpp->fx_prev = headp; \
- headp->fx_next->fx_prev = fxpp; \
- headp->fx_next = fxpp; \
- mutex_exit(lockp); \
-}
-
-#define FX_LIST_DELETE(fxpp) \
-{ \
- int index = FX_LIST_HASH(fxpp->fx_tp); \
- kmutex_t *lockp = &fx_list_lock[index]; \
- mutex_enter(lockp); \
- fxpp->fx_prev->fx_next = fxpp->fx_next; \
- fxpp->fx_next->fx_prev = fxpp->fx_prev; \
- mutex_exit(lockp); \
-}
-
-
-/*
* The fxproc_t structures that have a registered callback vector,
* are also kept in an array of circular doubly linked lists. A hash on
* the thread id (from ddi_get_kt_did()) is used to determine which list
@@ -192,10 +158,6 @@ static pri_t fx_maxglobpri; /* maximum global priority used by fx class */
static kmutex_t fx_dptblock; /* protects fixed priority dispatch table */
-static kmutex_t fx_list_lock[FX_LISTS]; /* protects fxproc lists */
-static fxproc_t fx_plisthead[FX_LISTS]; /* dummy fxproc at head of lists */
-
-
static kmutex_t fx_cb_list_lock[FX_CB_LISTS]; /* protects list of fxprocs */
/* that have callbacks */
static fxproc_t fx_cb_plisthead[FX_CB_LISTS]; /* dummy fxproc at head of */
@@ -316,14 +278,6 @@ fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
fx_cid = cid; /* Record our class ID */
/*
- * Initialize the fxproc hash table
- */
- for (i = 0; i < FX_LISTS; i++) {
- fx_plisthead[i].fx_next = fx_plisthead[i].fx_prev =
- &fx_plisthead[i];
- }
-
- /*
* Initialize the hash table for fxprocs with callbacks
*/
for (i = 0; i < FX_CB_LISTS; i++) {
@@ -477,7 +431,6 @@ fx_admin(caddr_t uaddr, cred_t *reqpcredp)
return (0);
}
-
/*
* Allocate a fixed priority class specific thread structure and
* initialize it with the parameters supplied. Also move the thread
@@ -565,6 +518,7 @@ fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
}
fxpp->fx_timeleft = fxpp->fx_pquantum;
+ cpucaps_sc_init(&fxpp->fx_caps);
fxpp->fx_tp = t;
thread_lock(t); /* get dispatcher lock on thread */
@@ -575,8 +529,6 @@ fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
fx_change_priority(t, fxpp);
thread_unlock(t);
- FX_LIST_INSERT(fxpp);
-
return (0);
}
@@ -591,6 +543,8 @@ fx_exit(kthread_t *t)
thread_lock(t);
fxpp = (fxproc_t *)(t->t_cldata);
+ (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
if (FX_HAS_CB(fxpp)) {
FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
fxpp->fx_callback = NULL;
@@ -599,6 +553,7 @@ fx_exit(kthread_t *t)
FX_CB_LIST_DELETE(fxpp);
return;
}
+
thread_unlock(t);
}
@@ -621,7 +576,6 @@ fx_exitclass(void *procp)
FX_CB_LIST_DELETE(fxpp);
} else
thread_unlock(fxpp->fx_tp);
- FX_LIST_DELETE(fxpp);
kmem_free(fxpp, sizeof (fxproc_t));
}
@@ -662,6 +616,7 @@ fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
cfxpp->fx_callback = NULL;
cfxpp->fx_cookie = NULL;
cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
+ cpucaps_sc_init(&cfxpp->fx_caps);
cfxpp->fx_tp = ct;
ct->t_cldata = (void *)cfxpp;
@@ -670,7 +625,6 @@ fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
/*
* Link new structure into fxproc list.
*/
- FX_LIST_INSERT(cfxpp);
return (0);
}
@@ -1157,13 +1111,12 @@ static void
fx_preempt(kthread_t *t)
{
fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
-#ifdef KSLICE
- extern int kslice;
-#endif
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(curthread));
+ (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
/*
* Check to see if we're doing "preemption control" here. If
* we are, and if the user has requested that this thread not
@@ -1209,17 +1162,20 @@ fx_preempt(kthread_t *t)
THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
}
+ /*
+ * This thread may be placed on wait queue by CPU Caps. In this case we
+ * do not need to do anything until it is removed from the wait queue.
+ */
+ if (CPUCAPS_ENFORCE(t)) {
+ return;
+ }
+
if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
fxpp->fx_timeleft = fxpp->fx_pquantum;
fxpp->fx_flags &= ~FXBACKQ;
setbackdq(t);
} else {
-#ifdef KSLICE
- if (kslice)
- setbackdq(t);
- else
-#endif
- setfrontdq(t);
+ setfrontdq(t);
}
}
@@ -1250,6 +1206,11 @@ fx_sleep(kthread_t *t)
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(t));
+ /*
+ * Account for time spent on CPU before going to sleep.
+ */
+ (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
if (FX_HAS_CB(fxpp)) {
FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
}
@@ -1318,6 +1279,7 @@ fx_stop(kthread_t *t, int why, int what)
static void
fx_tick(kthread_t *t)
{
+ boolean_t call_cpu_surrender = B_FALSE;
fxproc_t *fxpp;
ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
@@ -1342,6 +1304,14 @@ fx_tick(kthread_t *t)
fx_change_priority(t, fxpp);
}
}
+
+ /*
+ * Keep track of thread's project CPU usage. Note that projects
+ * get charged even when threads are running in the kernel.
+ */
+ call_cpu_surrender = CPUCAPS_CHARGE(t, &fxpp->fx_caps,
+ CPUCAPS_CHARGE_ENFORCE);
+
if ((fxpp->fx_pquantum != FX_TQINF) &&
(--fxpp->fx_timeleft <= 0)) {
pri_t new_pri;
@@ -1379,15 +1349,17 @@ fx_tick(kthread_t *t)
if (thread_change_pri(t, new_pri, 0)) {
fxpp->fx_timeleft = fxpp->fx_pquantum;
} else {
- fxpp->fx_flags |= FXBACKQ;
- cpu_surrender(t);
+ call_cpu_surrender = B_TRUE;
}
} else if (t->t_state == TS_ONPROC &&
t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+ call_cpu_surrender = B_TRUE;
+ }
+
+ if (call_cpu_surrender) {
fxpp->fx_flags |= FXBACKQ;
cpu_surrender(t);
}
-
thread_unlock_nopreempt(t); /* clock thread can't be preempted */
}
@@ -1453,6 +1425,11 @@ fx_yield(kthread_t *t)
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(t));
+ /*
+ * Collect CPU usage spent before yielding CPU.
+ */
+ (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
if (FX_HAS_CB(fxpp)) {
clock_t new_quantum = (clock_t)fxpp->fx_pquantum;
pri_t newpri = fxpp->fx_pri;
diff --git a/usr/src/uts/common/disp/sysclass.c b/usr/src/uts/common/disp/sysclass.c
index 9f266dd7e7..d48cc3145e 100644
--- a/usr/src/uts/common/disp/sysclass.c
+++ b/usr/src/uts/common/disp/sysclass.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,9 +18,10 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright (c) 1996-2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -98,11 +98,7 @@ struct classfuncs sys_classfuncs = {
sys_swappri, /* swapin */
sys_swappri, /* swapout */
sys_nullsys, /* trapret */
-#ifdef KSLICE
- sys_preempt,
-#else
- setfrontdq,
-#endif
+ setfrontdq, /* preempt */
setbackdq, /* setrun */
sys_nullsys, /* sleep */
sys_nullsys, /* tick */
@@ -218,21 +214,6 @@ sys_nullsys()
{
}
-#ifdef KSLICE
-static void
-sys_preempt(t)
- kthread_id_t t;
-{
- extern int kslice;
-
- if (kslice)
- setbackdq(t);
- else
- setfrontdq(t);
-}
-#endif
-
-
/* ARGSUSED */
static int
sys_donice(t, cr, incr, retvalp)
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 5f352b2203..77f0663f12 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,6 +73,8 @@
#include <sys/sdt.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
+#include <sys/waitq.h>
+#include <sys/cpucaps.h>
struct kmem_cache *thread_cache; /* cache of free threads */
struct kmem_cache *lwp_cache; /* cache of free lwps */
@@ -185,10 +187,18 @@ thread_init(void)
label_init();
cred_init();
+ /*
+ * Initialize various resource management facilities.
+ */
rctl_init();
+ cpucaps_init();
+ /*
+ * Zone_init() should be called before project_init() so that project ID
+ * for the first project is initialized correctly.
+ */
+ zone_init();
project_init();
brand_init();
- zone_init();
task_init();
tcache_init();
pool_init();
@@ -1070,6 +1080,8 @@ setrun_locked(kthread_t *t)
* Already on dispatcher queue.
*/
return;
+ } else if (t->t_state == TS_WAIT) {
+ waitq_setrun(t);
} else if (t->t_state == TS_STOPPED) {
/*
* All of the sending of SIGCONT (TC_XSTART) and /proc
@@ -1111,8 +1123,6 @@ setrun_locked(kthread_t *t)
*/
CL_SETRUN(t);
}
-
-
}
void
@@ -1623,7 +1633,7 @@ thread_change_epri(kthread_t *t, pri_t disp_pri)
* If it's not on a queue, change the priority with
* impunity.
*/
- if ((state & (TS_SLEEP | TS_RUN)) == 0) {
+ if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
t->t_epri = disp_pri;
if (state == TS_ONPROC) {
@@ -1639,7 +1649,6 @@ thread_change_epri(kthread_t *t, pri_t disp_pri)
* It's either on a sleep queue or a run queue.
*/
if (state == TS_SLEEP) {
-
/*
* Take the thread out of its sleep queue.
* Change the inherited priority.
@@ -1648,6 +1657,13 @@ thread_change_epri(kthread_t *t, pri_t disp_pri)
* to do this in an appropriate manner.
*/
SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
+ } else if (state == TS_WAIT) {
+ /*
+ * Re-enqueue a thread on the wait queue if its
+ * effective priority needs to change.
+ */
+ if (disp_pri != t->t_epri)
+ waitq_change_pri(t, disp_pri);
} else {
/*
* The thread is on a run queue.
@@ -1682,7 +1698,7 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
* If it's not on a queue, change the priority with
* impunity.
*/
- if ((state & (TS_SLEEP | TS_RUN)) == 0) {
+ if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
t->t_pri = disp_pri;
if (state == TS_ONPROC) {
@@ -1707,6 +1723,13 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
*/
if (disp_pri != t->t_pri)
SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
+ } else if (state == TS_WAIT) {
+ /*
+ * Re-enqueue a thread on the wait queue if its
+ * priority needs to change.
+ */
+ if (disp_pri != t->t_pri)
+ waitq_change_pri(t, disp_pri);
} else {
/*
* The thread is on a run queue.
diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c
index 738a2e47b4..a55b890e83 100644
--- a/usr/src/uts/common/disp/ts.c
+++ b/usr/src/uts/common/disp/ts.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -59,10 +59,10 @@
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/cpupart.h>
-
#include <vm/rm.h>
#include <vm/seg_kmem.h>
#include <sys/modctl.h>
+#include <sys/cpucaps.h>
static pri_t ts_init(id_t, int, classfuncs_t **);
@@ -194,6 +194,7 @@ static int ts_parmsout(void *, pc_vaparms_t *);
static int ts_vaparmsin(void *, pc_vaparms_t *);
static int ts_vaparmsout(void *, pc_vaparms_t *);
static int ts_parmsset(kthread_t *, void *, id_t, cred_t *);
+static void ts_exit(kthread_t *);
static int ts_donice(kthread_t *, cred_t *, int, int *);
static void ts_exitclass(void *);
static int ts_canexit(kthread_t *, cred_t *);
@@ -258,7 +259,7 @@ static struct classfuncs ts_classfuncs = {
ts_parmsget,
ts_parmsset,
ts_nullsys, /* stop */
- ts_nullsys, /* exit */
+ ts_exit,
ts_nullsys, /* active */
ts_nullsys, /* inactive */
ts_swapin,
@@ -302,7 +303,7 @@ static struct classfuncs ia_classfuncs = {
ia_parmsget,
ia_parmsset,
ts_nullsys, /* stop */
- ts_nullsys, /* exit */
+ ts_exit,
ts_nullsys, /* active */
ts_nullsys, /* inactive */
ts_swapin,
@@ -622,6 +623,7 @@ ts_enterclass(kthread_t *t, id_t cid, void *parmsp,
tspp->ts_dispwait = 0;
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
tspp->ts_tp = t;
+ cpucaps_sc_init(&tspp->ts_caps);
/*
* Reset priority. Process goes to a "user mode" priority
@@ -703,6 +705,7 @@ ts_fork(kthread_t *t, kthread_t *ct, void *bufp)
ctspp->ts_dispwait = 0;
ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE);
ctspp->ts_tp = ct;
+ cpucaps_sc_init(&ctspp->ts_caps);
thread_unlock(t);
/*
@@ -1307,6 +1310,24 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
return (ts_parmsset(tx, parmsp, reqpcid, reqpcredp));
}
+static void
+ts_exit(kthread_t *t)
+{
+ tsproc_t *tspp;
+
+ if (CPUCAPS_ON()) {
+ /*
+ * A thread could be exiting in between clock ticks,
+ * so we need to calculate how much CPU time it used
+ * since it was charged last time.
+ */
+ thread_lock(t);
+ tspp = (tsproc_t *)t->t_cldata;
+ (void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+ thread_unlock(t);
+ }
+}
+
/*
* Return the global scheduling priority that would be assigned
* to a thread entering the time-sharing class with the ts_upri.
@@ -1337,10 +1358,7 @@ static void
ts_preempt(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
- klwp_t *lwp;
-#ifdef KSLICE
- extern int kslice;
-#endif
+ klwp_t *lwp = curthread->t_lwp;
pri_t oldpri = t->t_pri;
ASSERT(t == curthread);
@@ -1350,7 +1368,6 @@ ts_preempt(kthread_t *t)
* If preempted in the kernel, make sure the thread has
* a kernel priority if needed.
*/
- lwp = curthread->t_lwp;
if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) {
tspp->ts_flags |= TSKPRI;
THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
@@ -1358,9 +1375,21 @@ ts_preempt(kthread_t *t)
t->t_trapret = 1; /* so ts_trapret will run */
aston(t);
}
+
/*
- * If preempted in user-land mark the thread
- * as swappable because I know it isn't holding any locks.
+ * This thread may be placed on wait queue by CPU Caps. In this case we
+ * do not need to do anything until it is removed from the wait queue.
+ * Do not enforce CPU caps on threads running at a kernel priority
+ */
+ if (CPUCAPS_ON()) {
+ (void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+ if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t))
+ return;
+ }
+
+ /*
+ * If thread got preempted in the user-land then we know
+ * it isn't holding any locks. Mark it as swappable.
*/
ASSERT(t->t_schedflag & TS_DONT_SWAP);
if (lwp != NULL && lwp->lwp_state == LWP_USER)
@@ -1420,12 +1449,7 @@ ts_preempt(kthread_t *t)
tspp->ts_flags &= ~TSBACKQ;
setbackdq(t);
} else {
-#ifdef KSLICE
- if (kslice)
- setbackdq(t);
- else
-#endif
- setfrontdq(t);
+ setfrontdq(t);
}
done:
@@ -1482,6 +1506,11 @@ ts_sleep(kthread_t *t)
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(t));
+ /*
+ * Account for time spent on CPU before going to sleep.
+ */
+ (void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+
flags = tspp->ts_flags;
if (t->t_kpri_req) {
tspp->ts_flags = flags | TSKPRI;
@@ -1605,7 +1634,8 @@ ts_swapout(kthread_t *t, int flags)
if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) ||
(t->t_proc_flag & TP_LWPEXIT) ||
- (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) ||
+ (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
+ TS_ONPROC | TS_WAIT)) ||
!(t->t_schedflag & TS_LOAD) || !SWAP_OK(t))
return (-1);
@@ -1653,17 +1683,27 @@ ts_swapout(kthread_t *t, int flags)
* move thread to priority specified in tsdptbl for time slice expiration
* and set runrun to cause preemption.
*/
-
static void
ts_tick(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
klwp_t *lwp;
+ boolean_t call_cpu_surrender = B_FALSE;
pri_t oldpri = t->t_pri;
ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
thread_lock(t);
+
+ /*
+ * Keep track of thread's project CPU usage. Note that projects
+ * get charged even when threads are running in the kernel.
+ */
+ if (CPUCAPS_ON()) {
+ call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps,
+ CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI);
+ }
+
if ((tspp->ts_flags & TSKPRI) == 0) {
if (--tspp->ts_timeleft <= 0) {
pri_t new_pri;
@@ -1709,17 +1749,21 @@ ts_tick(kthread_t *t)
tspp->ts_timeleft =
ts_dptbl[tspp->ts_cpupri].ts_quantum;
} else {
- tspp->ts_flags |= TSBACKQ;
- cpu_surrender(t);
+ call_cpu_surrender = B_TRUE;
}
TRACE_2(TR_FAC_DISP, TR_TICK,
"tick:tid %p old pri %d", t, oldpri);
} else if (t->t_state == TS_ONPROC &&
t->t_pri < t->t_disp_queue->disp_maxrunpri) {
- tspp->ts_flags |= TSBACKQ;
- cpu_surrender(t);
+ call_cpu_surrender = B_TRUE;
}
}
+
+ if (call_cpu_surrender) {
+ tspp->ts_flags |= TSBACKQ;
+ cpu_surrender(t);
+ }
+
thread_unlock_nopreempt(t); /* clock thread can't be preempted */
}
@@ -1877,8 +1921,8 @@ ts_update_list(int i)
goto next;
if (tx->t_schedctl && schedctl_get_nopreempt(tx))
goto next;
- if (tx->t_state != TS_RUN && (tx->t_state != TS_SLEEP ||
- !ts_sleep_promote)) {
+ if (tx->t_state != TS_RUN && tx->t_state != TS_WAIT &&
+ (tx->t_state != TS_SLEEP || !ts_sleep_promote)) {
/* make next syscall/trap do CL_TRAPRET */
tx->t_trapret = 1;
aston(tx);
@@ -1907,7 +1951,6 @@ next:
return (updated);
}
-
/*
* Processes waking up go to the back of their queue. We don't
* need to assign a time quantum here because thread is still
@@ -1981,6 +2024,11 @@ ts_yield(kthread_t *t)
ASSERT(THREAD_LOCK_HELD(t));
/*
+ * Collect CPU usage spent before yielding
+ */
+ (void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+
+ /*
* Clear the preemption control "yield" bit since the user is
* doing a yield.
*/
diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c
index cf3ceeb384..0c4a6c31a9 100644
--- a/usr/src/uts/common/dtrace/sdt_subr.c
+++ b/usr/src/uts/common/dtrace/sdt_subr.c
@@ -115,6 +115,10 @@ sdt_argdesc_t sdt_args[] = {
{ "sched", "schedctl-yield", 0, 0, "int" },
{ "sched", "surrender", 0, 0, "kthread_t *", "lwpsinfo_t *" },
{ "sched", "surrender", 1, 0, "kthread_t *", "psinfo_t *" },
+ { "sched", "cpucaps-sleep", 0, 0, "kthread_t *", "lwpsinfo_t *" },
+ { "sched", "cpucaps-sleep", 1, 0, "kthread_t *", "psinfo_t *" },
+ { "sched", "cpucaps-wakeup", 0, 0, "kthread_t *", "lwpsinfo_t *" },
+ { "sched", "cpucaps-wakeup", 1, 0, "kthread_t *", "psinfo_t *" },
{ "proc", "create", 0, 0, "proc_t *", "psinfo_t *" },
{ "proc", "exec", 0, 0, "string" },
{ "proc", "exec-failure", 0, 0, "int" },
diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c
index 8308237e5b..97ba369083 100644
--- a/usr/src/uts/common/fs/proc/prcontrol.c
+++ b/usr/src/uts/common/fs/proc/prcontrol.c
@@ -1034,8 +1034,12 @@ pr_stop(prnode_t *pnp)
t->t_proc_flag |= TP_PRSTOP;
t->t_sig_check = 1; /* do ISSIG */
}
- if (t->t_state == TS_SLEEP &&
- (t->t_flag & T_WAKEABLE)) {
+
+ /* Move the thread from wait queue to run queue */
+ if (ISWAITING(t))
+ setrun_locked(t);
+
+ if (ISWAKEABLE(t)) {
if (t->t_wchan0 == NULL)
setrun_locked(t);
else if (!VSTOPPED(t)) {
@@ -1452,9 +1456,8 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip)
}
}
thread_lock(t);
- if (t->t_state == TS_SLEEP &&
- (t->t_flag & T_WAKEABLE)) {
- /* Set signalled sleeping lwp running */
+ if (ISWAKEABLE(t) || ISWAITING(t)) {
+ /* Set signalled sleeping/waiting lwp running */
setrun_locked(t);
} else if (t->t_state == TS_STOPPED && sig == SIGKILL) {
/* If SIGKILL, set stopped lwp running */
@@ -1759,8 +1762,7 @@ pr_sethold(prnode_t *pnp, sigset_t *sp)
schedctl_finish_sigblock(t);
sigutok(sp, &t->t_hold);
- if (t->t_state == TS_SLEEP &&
- (t->t_flag & T_WAKEABLE) &&
+ if (ISWAKEABLE(t) &&
(fsig(&p->p_sig, t) || fsig(&t->t_sig, t)))
setrun_locked(t);
t->t_sig_check = 1; /* so thread will see new holdmask */
@@ -2363,10 +2365,9 @@ pauselwps(proc_t *p)
thread_lock(t);
t->t_proc_flag |= TP_PAUSE;
aston(t);
- if (t->t_state == TS_SLEEP &&
- (t->t_flag & T_WAKEABLE)) {
- if (t->t_wchan0 == NULL)
- setrun_locked(t);
+ if ((ISWAKEABLE(t) && (t->t_wchan0 == NULL)) ||
+ ISWAITING(t)) {
+ setrun_locked(t);
}
prpokethread(t);
thread_unlock(t);
diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c
index e1c33384f7..60e541bb03 100644
--- a/usr/src/uts/common/fs/proc/prsubr.c
+++ b/usr/src/uts/common/fs/proc/prsubr.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -166,6 +166,7 @@ prchoose(proc_t *p)
}
break;
case TS_RUN:
+ case TS_WAIT:
if (t_run == NULL)
t_run = t;
break;
@@ -2507,6 +2508,7 @@ prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp)
case TS_ONPROC: state = SONPROC; c = 'O'; break;
case TS_ZOMB: state = SZOMB; c = 'Z'; break;
case TS_STOPPED: state = SSTOP; c = 'T'; break;
+ case TS_WAIT: state = SWAIT; c = 'W'; break;
default: state = 0; c = '?'; break;
}
psp->pr_state = state;
@@ -2573,6 +2575,7 @@ prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp)
case TS_ONPROC: state = SONPROC; c = 'O'; break;
case TS_ZOMB: state = SZOMB; c = 'Z'; break;
case TS_STOPPED: state = SSTOP; c = 'T'; break;
+ case TS_WAIT: state = SWAIT; c = 'W'; break;
default: state = 0; c = '?'; break;
}
psp->pr_state = state;
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 0a8a99f0af..06babeb55c 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -79,8 +79,7 @@
#include <sys/inttypes.h>
/*
- * clock is called straight from
- * the real time clock interrupt.
+ * clock() is called straight from the clock cyclic; see clock_init().
*
* Functions:
* reprime clock
@@ -314,10 +313,7 @@ static int genloadavg(struct loadavg_s *);
static void loadavg_update();
void (*cmm_clock_callout)() = NULL;
-
-#ifdef KSLICE
-int kslice = KSLICE;
-#endif
+void (*cpucaps_clock_callout)() = NULL;
static void
clock(void)
@@ -513,9 +509,10 @@ clock(void)
/*
* Do tick processing for all the active threads running in
- * the system.
+ * the system. We're trying to be more fair by walking the
+ * list of CPUs starting from a different CPUs each time.
*/
- cp = cpu_list;
+ cp = clock_cpu_list;
nrunning = 0;
do {
klwp_id_t lwp;
@@ -649,21 +646,11 @@ clock(void)
clock_tick(t);
}
-#ifdef KSLICE
- /*
- * Ah what the heck, give this kid a taste of the real
- * world and yank the rug out from under it.
- * But, only if we are running UniProcessor.
- */
- if ((kslice) && (ncpus == 1)) {
- aston(t);
- cp->cpu_runrun = 1;
- cp->cpu_kprunrun = 1;
- }
-#endif
if (!exiting)
mutex_exit(plockp);
- } while ((cp = cp->cpu_next) != cpu_list);
+ } while ((cp = cp->cpu_next) != clock_cpu_list);
+
+ clock_cpu_list = clock_cpu_list->cpu_next;
/*
* bump time in ticks
@@ -683,6 +670,9 @@ clock(void)
if ((funcp = cmm_clock_callout) != NULL)
(*funcp)();
+ if ((funcp = cpucaps_clock_callout) != NULL)
+ (*funcp)();
+
/*
* Wakeup the cageout thread waiters once per second.
*/
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 7e28e4f1a1..ec553d1947 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -105,6 +105,7 @@ static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t);
*/
kmutex_t cpu_lock;
cpu_t *cpu_list; /* list of all CPUs */
+cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
cpu_t *cpu_active; /* list of active CPUs */
static cpuset_t cpu_available; /* set of available CPUs */
cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
@@ -1618,6 +1619,7 @@ cpu_list_init(cpu_t *cp)
cp->cpu_next = cp;
cp->cpu_prev = cp;
cpu_list = cp;
+ clock_cpu_list = cp;
cp->cpu_next_onln = cp;
cp->cpu_prev_onln = cp;
@@ -1763,7 +1765,10 @@ cpu_del_unit(int cpuid)
cp->cpu_prev->cpu_next = cp->cpu_next;
cp->cpu_next->cpu_prev = cp->cpu_prev;
if (cp == cpu_list)
- cpu_list = cpnext;
+ cpu_list = cpnext;
+ if (cp == clock_cpu_list)
+ clock_cpu_list = cpnext;
+
/*
* Signals that the cpu has been deleted (see above).
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index 0036b1cd3d..c97b1621cb 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -289,7 +289,9 @@ kstat_zone_add(kstat_t *k, zoneid_t zoneid)
ekstat_t *e = (ekstat_t *)k;
kstat_zone_t *kz;
- kz = kmem_alloc(sizeof (*kz), KM_SLEEP);
+ kz = kmem_alloc(sizeof (*kz), KM_NOSLEEP);
+ if (kz == NULL)
+ return;
mutex_enter(&kstat_chain_lock);
kz->zoneid = zoneid;
kz->next = e->e_zone.next;
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index 7533fb0028..c1d1a870e0 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -781,6 +781,7 @@ lwp_exit(void)
t->t_proc_flag |= TP_LWPEXIT;
term_mstate(t);
+
#ifndef NPROBE
/* Kernel probe */
if (t->t_tnf_tpdp)
@@ -916,10 +917,12 @@ top:
* XXX Should use virtual stop like /proc does instead of
* XXX waking the thread to get it to stop.
*/
- if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE))
+ if (ISWAKEABLE(t) || ISWAITING(t)) {
setrun_locked(t);
- else if (t->t_state == TS_ONPROC && t->t_cpu != CPU)
+ } else if (t->t_state == TS_ONPROC && t->t_cpu != CPU) {
poke_cpu(t->t_cpu->cpu_id);
+ }
+
tid = t->t_tid; /* remember thread ID */
/*
* Wait for lwp to stop
@@ -1360,9 +1363,8 @@ pokelwps(proc_t *p)
continue;
thread_lock(t);
aston(t); /* make thread trap or do post_syscall */
- if (t->t_state == TS_SLEEP) {
- if (t->t_flag & T_WAKEABLE)
- setrun_locked(t);
+ if (ISWAKEABLE(t) || ISWAITING(t)) {
+ setrun_locked(t);
} else if (t->t_state == TS_STOPPED) {
/*
* Ensure that proc_exit() is not blocked by lwps
diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c
index 134de5a513..57c56cef8f 100644
--- a/usr/src/uts/common/os/msacct.c
+++ b/usr/src/uts/common/os/msacct.c
@@ -236,6 +236,47 @@ new_cpu_mstate(int cmstate, hrtime_t curtime)
}
/*
+ * Return an aggregation of user and system CPU time consumed by
+ * the specified thread in scaled nanoseconds.
+ */
+hrtime_t
+mstate_thread_onproc_time(kthread_t *t)
+{
+ hrtime_t aggr_time;
+ hrtime_t now;
+ hrtime_t state_start;
+ struct mstate *ms;
+ klwp_t *lwp;
+ int mstate;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+
+ if ((lwp = ttolwp(t)) == NULL)
+ return (0);
+
+ mstate = t->t_mstate;
+ ms = &lwp->lwp_mstate;
+ state_start = ms->ms_state_start;
+
+ aggr_time = ms->ms_acct[LMS_USER] +
+ ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
+
+ now = gethrtime_unscaled();
+
+ /*
+ * NOTE: gethrtime_unscaled on X86 taken on different CPUs is
+ * inconsistent, so it is possible that now < state_start.
+ */
+ if ((mstate == LMS_USER || mstate == LMS_SYSTEM ||
+ mstate == LMS_TRAP) && (now > state_start)) {
+ aggr_time += now - state_start;
+ }
+
+ scalehrtime(&aggr_time);
+ return (aggr_time);
+}
+
+/*
* Return an aggregation of microstate times in scaled nanoseconds (high-res
* time). This keeps in mind that p_acct is already scaled, and ms_acct is
* not.
diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c
index 3c17a0dc38..b5f96a8dd7 100644
--- a/usr/src/uts/common/os/project.c
+++ b/usr/src/uts/common/os/project.c
@@ -41,6 +41,7 @@
#include <sys/port_kernel.h>
#include <sys/task.h>
#include <sys/zone.h>
+#include <sys/cpucaps.h>
int project_hash_size = 64;
static kmutex_t project_hash_lock;
@@ -49,6 +50,7 @@ static mod_hash_t *projects_hash;
static kproject_t *projects_list;
rctl_hndl_t rc_project_cpu_shares;
+rctl_hndl_t rc_project_cpu_cap;
rctl_hndl_t rc_project_nlwps;
rctl_hndl_t rc_project_ntasks;
rctl_hndl_t rc_project_msgmni;
@@ -156,6 +158,7 @@ project_hash_val_dtor(mod_hash_val_t val)
kproject_t *kp = (kproject_t *)val;
ASSERT(kp->kpj_count == 0);
+ ASSERT(kp->kpj_cpucap == NULL);
kmem_free(kp, sizeof (kproject_t));
}
@@ -251,6 +254,7 @@ project_hold_by_id(projid_t id, zone_t *zone, int flag)
p = spare_p;
p->kpj_id = id;
+ p->kpj_zone = zone;
p->kpj_zoneid = zone->zone_id;
p->kpj_count = 0;
p->kpj_shares = 1;
@@ -304,6 +308,13 @@ project_hold_by_id(projid_t id, zone_t *zone, int flag)
* across reboots.
*/
if (create == B_TRUE) {
+ /*
+ * Inform CPU caps framework of the new project
+ */
+ cpucaps_project_add(p);
+ /*
+ * Set up project kstats
+ */
ksp = project_kstat_create(p, zone);
mutex_enter(&project_hash_lock);
ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL);
@@ -343,6 +354,8 @@ project_rele(kproject_t *p)
projects_list = p->kpj_next;
mutex_exit(&projects_list_lock);
+ cpucaps_project_remove(p);
+
rctl_set_free(p->kpj_rctls);
project_kstat_delete(p);
@@ -431,7 +444,6 @@ project_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
return (0);
}
-
static rctl_ops_t project_cpu_shares_ops = {
rcop_no_action,
project_cpu_shares_usage,
@@ -439,6 +451,43 @@ static rctl_ops_t project_cpu_shares_ops = {
rcop_no_test
};
+
+/*
+ * project.cpu-cap resource control support.
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+project_cpu_cap_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_project_get(p->p_task->tk_proj));
+}
+
+/*ARGSUSED*/
+static int
+project_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ kproject_t *kpj = e->rcep_p.proj;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_PROJECT);
+ if (kpj == NULL)
+ return (0);
+
+ /*
+ * set cap to the new value.
+ */
+ return (cpucaps_project_set(kpj, nv));
+}
+
+static rctl_ops_t project_cpu_cap_ops = {
+ rcop_no_action,
+ project_cpu_cap_get,
+ project_cpu_cap_set,
+ rcop_no_test
+};
+
/*ARGSUSED*/
static rctl_qty_t
project_lwps_usage(rctl_t *r, proc_t *p)
@@ -804,6 +853,13 @@ project_init(void)
rctl_add_default_limit("project.cpu-shares", 1, RCPRIV_PRIVILEGED,
RCTL_LOCAL_NOACTION);
+ rc_project_cpu_cap = rctl_register("project.cpu-cap",
+ RCENTITY_PROJECT, RCTL_GLOBAL_SIGNAL_NEVER |
+ RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
+ RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
+ RCTL_GLOBAL_INFINITE,
+ MAXCAP, MAXCAP, &project_cpu_cap_ops);
+
rc_project_nlwps = rctl_register("project.max-lwps", RCENTITY_PROJECT,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &project_lwps_ops);
diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c
index 808e5d2095..5c72fb749b 100644
--- a/usr/src/uts/common/os/sig.c
+++ b/usr/src/uts/common/os/sig.c
@@ -181,7 +181,7 @@ eat_signal(kthread_t *t, int sig)
*/
if (!signal_is_blocked(t, sig)) {
t->t_sig_check = 1; /* have thread do an issig */
- if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE)) {
+ if (ISWAKEABLE(t) || ISWAITING(t)) {
setrun_locked(t);
rval = 1;
} else if (t->t_state == TS_STOPPED && sig == SIGKILL &&
@@ -974,6 +974,11 @@ stop(int why, int what)
notify = 1;
}
}
+
+ /* Move waiting thread to run queue */
+ if (ISWAITING(tx))
+ setrun_locked(tx);
+
/*
* force the thread into the kernel
* if it is not already there.
diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c
index 785f74c145..5e4ae1aefe 100644
--- a/usr/src/uts/common/os/task.c
+++ b/usr/src/uts/common/os/task.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -388,8 +388,7 @@ task_create(projid_t projid, zone_t *zone)
tk->tk_nlwps = 0;
tk->tk_nlwps_ctl = INT_MAX;
tk->tk_usage = tu;
- tk->tk_proj = project_hold_by_id(projid, zone,
- PROJECT_HOLD_INSERT);
+ tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT);
tk->tk_flags = TASK_NORMAL;
/*
@@ -670,6 +669,21 @@ changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf,
thread_lock(t);
oldkpj = ttoproj(t);
+
+ /*
+ * Kick this thread so that it doesn't sit
+ * on a wrong wait queue.
+ */
+ if (ISWAITING(t))
+ setrun_locked(t);
+
+ /*
+ * The thread wants to go on the project wait queue, but
+ * the waitq is changing.
+ */
+ if (t->t_schedflag & TS_PROJWAITQ)
+ t->t_schedflag &= ~ TS_PROJWAITQ;
+
t->t_proj = kpj;
t->t_pre_sys = 1; /* For cred update */
thread_unlock(t);
diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c
index ddaa2adff4..e656a17088 100644
--- a/usr/src/uts/common/os/timers.c
+++ b/usr/src/uts/common/os/timers.c
@@ -589,6 +589,7 @@ realprofexpire(void *arg)
}
break;
case TS_RUN:
+ case TS_WAIT:
mstate = LMS_WAIT_CPU;
break;
case TS_ONPROC:
diff --git a/usr/src/uts/common/os/waitq.c b/usr/src/uts/common/os/waitq.c
new file mode 100644
index 0000000000..802d7afdc4
--- /dev/null
+++ b/usr/src/uts/common/os/waitq.c
@@ -0,0 +1,386 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/class.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
+#include <sys/waitq.h>
+#include <sys/cmn_err.h>
+#include <sys/time.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/zone.h>
+
+/*
+ * Wait queue implementation.
+ */
+
+void
+waitq_init(waitq_t *wq)
+{
+ DISP_LOCK_INIT(&wq->wq_lock);
+ wq->wq_first = NULL;
+ wq->wq_count = 0;
+ wq->wq_blocked = B_TRUE;
+}
+
+void
+waitq_fini(waitq_t *wq)
+{
+ ASSERT(wq->wq_count == 0);
+ ASSERT(wq->wq_first == NULL);
+ ASSERT(wq->wq_blocked == B_TRUE);
+ ASSERT(!DISP_LOCK_HELD(&wq->wq_lock));
+
+ DISP_LOCK_DESTROY(&wq->wq_lock);
+}
+
+/*
+ * Operations on waitq_t structures.
+ *
+ * A wait queue is a singly linked NULL-terminated list with doubly
+ * linked circular sublists. The singly linked list is in descending
+ * priority order and FIFO for threads of the same priority. It links
+ * through the t_link field of the thread structure. The doubly linked
+ * sublists link threads of the same priority. They use the t_priforw
+ * and t_priback fields of the thread structure.
+ *
+ * Graphically (with priorities in parens):
+ *
+ * ________________ _______ _______
+ * / \ / \ / \
+ * | | | | | |
+ * v v v v v v
+ * t1(60)-->t2(60)-->t3(60)-->t4(50)-->t5(50)-->t6(30)-->t7(0)-->t8(0)
+ * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ * | | | | | | | | | |
+ * \______/ \______/ \_______/ \__/ \_______/
+ *
+ * There are three interesting operations on a waitq list: inserting
+ * a thread into the proper position according to priority; removing a
+ * thread given a pointer to it; and walking the list, possibly
+ * removing threads along the way. This design allows all three
+ * operations to be performed efficiently and easily.
+ *
+ * To insert a thread, traverse the list looking for the sublist of
+ * the same priority as the thread (or one of a lower priority,
+ * meaning there are no other threads in the list of the same
+ * priority). This can be done without touching all threads in the
+ * list by following the links between the first threads in each
+ * sublist. Given a thread t that is the head of a sublist (the first
+ * thread of that priority found when following the t_link pointers),
+ * t->t_priback->t_link points to the head of the next sublist. It's
+ * important to do this since a waitq may contain thousands of
+ * threads.
+ *
+ * Removing a thread from the list is also efficient. First, the
+ * t_waitq field contains a pointer to the waitq on which a thread
+ * is waiting (or NULL if it's not on a waitq). This is used to
+ * determine if the given thread is on the given waitq without
+ * searching the list. Assuming it is, if it's not the head of a
+ * sublist, just remove it from the sublist and use the t_priback
+ * pointer to find the thread that points to it with t_link. If it is
+ * the head of a sublist, search for it by walking the sublist heads,
+ * similar to searching for a given priority level when inserting a
+ * thread.
+ *
+ * To walk the list, simply follow the t_link pointers. Removing
+ * threads along the way can be done easily if the code maintains a
+ * pointer to the t_link field that pointed to the thread being
+ * removed.
+ */
+
+static void
+waitq_link(waitq_t *wq, kthread_t *t)
+{
+ kthread_t *next_tp;
+ kthread_t *last_tp;
+ kthread_t **tpp;
+ pri_t tpri, next_pri, last_pri = -1;
+
+ ASSERT(DISP_LOCK_HELD(&wq->wq_lock));
+
+ tpri = DISP_PRIO(t);
+ tpp = &wq->wq_first;
+ while ((next_tp = *tpp) != NULL) {
+ next_pri = DISP_PRIO(next_tp);
+ if (tpri > next_pri)
+ break;
+ last_tp = next_tp->t_priback;
+ last_pri = next_pri;
+ tpp = &last_tp->t_link;
+ }
+ *tpp = t;
+ t->t_link = next_tp;
+ if (last_pri == tpri) {
+ /* last_tp points to the last thread of this priority */
+ t->t_priback = last_tp;
+ t->t_priforw = last_tp->t_priforw;
+ last_tp->t_priforw->t_priback = t;
+ last_tp->t_priforw = t;
+ } else {
+ t->t_priback = t->t_priforw = t;
+ }
+ wq->wq_count++;
+ t->t_waitq = wq;
+}
+
+static void
+waitq_unlink(waitq_t *wq, kthread_t *t)
+{
+ kthread_t *nt;
+ kthread_t **ptl;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+ ASSERT(DISP_LOCK_HELD(&wq->wq_lock));
+ ASSERT(t->t_waitq == wq);
+
+ ptl = &t->t_priback->t_link;
+ /*
+ * Is it the head of a priority sublist? If so, need to walk
+ * the priorities to find the t_link pointer that points to it.
+ */
+ if (*ptl != t) {
+ /*
+ * Find the right priority level.
+ */
+ ptl = &t->t_waitq->wq_first;
+ while ((nt = *ptl) != t)
+ ptl = &nt->t_priback->t_link;
+ }
+ /*
+ * Remove thread from the t_link list.
+ */
+ *ptl = t->t_link;
+
+ /*
+ * Take it off the priority sublist if there's more than one
+ * thread there.
+ */
+ if (t->t_priforw != t) {
+ t->t_priback->t_priforw = t->t_priforw;
+ t->t_priforw->t_priback = t->t_priback;
+ }
+ t->t_link = NULL;
+
+ wq->wq_count--;
+ t->t_waitq = NULL;
+ t->t_priforw = NULL;
+ t->t_priback = NULL;
+}
+
+/*
+ * Put specified thread to specified wait queue without dropping thread's lock.
+ * Returns 1 if thread was successfully placed on the wait queue, or
+ * 0 if wait queue is blocked.
+ */
+int
+waitq_enqueue(waitq_t *wq, kthread_t *t)
+{
+ ASSERT(THREAD_LOCK_HELD(t));
+ ASSERT(t->t_sleepq == NULL);
+ ASSERT(t->t_waitq == NULL);
+ ASSERT(t->t_link == NULL);
+
+ disp_lock_enter_high(&wq->wq_lock);
+
+ /*
+ * Can't enqueue anything on a blocked wait queue
+ */
+ if (wq->wq_blocked) {
+ disp_lock_exit_high(&wq->wq_lock);
+ return (0);
+ }
+
+ /*
+ * Mark the time when thread is placed on wait queue. The microstate
+ * accounting code uses this timestamp to determine wait times.
+ */
+ t->t_waitrq = gethrtime_unscaled();
+
+ /*
+ * Mark thread as not swappable. If necessary, it will get
+ * swapped out when it returns to the userland.
+ */
+ t->t_schedflag |= TS_DONT_SWAP;
+ DTRACE_SCHED1(cpucaps__sleep, kthread_t *, t);
+ waitq_link(wq, t);
+
+ THREAD_WAIT(t, &wq->wq_lock);
+ return (1);
+}
+
+/*
+ * Change thread's priority while on the wait queue.
+ * Dequeue and enqueue it again so that it gets placed in the right place.
+ */
+void
+waitq_change_pri(kthread_t *t, pri_t new_pri)
+{
+ waitq_t *wq = t->t_waitq;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+ ASSERT(ISWAITING(t));
+ ASSERT(wq != NULL);
+
+ waitq_unlink(wq, t);
+ t->t_pri = new_pri;
+ waitq_link(wq, t);
+}
+
+static void
+waitq_dequeue(waitq_t *wq, kthread_t *t)
+{
+ ASSERT(THREAD_LOCK_HELD(t));
+ ASSERT(t->t_waitq == wq);
+ ASSERT(ISWAITING(t));
+
+ waitq_unlink(wq, t);
+ DTRACE_SCHED1(cpucaps__wakeup, kthread_t *, t);
+
+ /*
+ * Change thread to transition state without dropping
+ * the wait queue lock.
+ */
+ THREAD_TRANSITION_NOLOCK(t);
+}
+
+/*
+ * Return True iff there are any threads on the specified wait queue.
+ * The check is done **without holding any locks**.
+ */
+boolean_t
+waitq_isempty(waitq_t *wq)
+{
+ return (wq->wq_count == 0);
+}
+
+/*
+ * Take thread off its wait queue and make it runnable.
+ * Returns with thread lock held.
+ */
+void
+waitq_setrun(kthread_t *t)
+{
+ waitq_t *wq = t->t_waitq;
+
+ ASSERT(THREAD_LOCK_HELD(t));
+
+ ASSERT(ISWAITING(t));
+ if (wq == NULL)
+ panic("waitq_setrun: thread %p is not on waitq", t);
+ waitq_dequeue(wq, t);
+
+ disp_lock_exit_high(&wq->wq_lock);
+ CL_SETRUN(t);
+}
+
+/*
+ * Take the first thread off the wait queue and return pointer to it.
+ */
+static kthread_t *
+waitq_takeone(waitq_t *wq)
+{
+ kthread_t *t;
+
+ disp_lock_enter(&wq->wq_lock);
+ if ((t = wq->wq_first) != NULL)
+ waitq_dequeue(wq, wq->wq_first);
+ disp_lock_exit(&wq->wq_lock);
+ return (t);
+}
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ * Return the pointer to the thread or NULL if waitq is empty
+ */
+static kthread_t *
+waitq_runfirst(waitq_t *wq)
+{
+ kthread_t *t;
+
+ t = waitq_takeone(wq);
+ if (t != NULL) {
+ CL_SETRUN(t);
+ thread_unlock(t); /* drops dispq lock */
+ }
+ return (t);
+}
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ */
+void
+waitq_runone(waitq_t *wq)
+{
+ (void) waitq_runfirst(wq);
+}
+
+/*
+ * Take all threads off the wait queue and make them runnable.
+ */
+static void
+waitq_runall(waitq_t *wq)
+{
+ while (waitq_runfirst(wq) != NULL)
+ ;
+}
+
+/*
+ * Prevent any new threads from entering wait queue and make all threads
+ * currently on the wait queue runnable. After waitq_block() completion, no
+ * threads should ever appear on the wait queue until it is unblocked.
+ */
+void
+waitq_block(waitq_t *wq)
+{
+ ASSERT(!wq->wq_blocked);
+ disp_lock_enter(&wq->wq_lock);
+ wq->wq_blocked = B_TRUE;
+ disp_lock_exit(&wq->wq_lock);
+ waitq_runall(wq);
+ ASSERT(waitq_isempty(wq));
+}
+
+/*
+ * Allow threads to be placed on the wait queue.
+ */
+void
+waitq_unblock(waitq_t *wq)
+{
+ disp_lock_enter(&wq->wq_lock);
+
+ ASSERT(waitq_isempty(wq));
+ ASSERT(wq->wq_blocked);
+
+ wq->wq_blocked = B_FALSE;
+
+ disp_lock_exit(&wq->wq_lock);
+}
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index d163dbc5cd..3838c07cfa 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -240,6 +240,7 @@
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
+#include <sys/cpucaps.h>
#include <vm/seg.h>
/*
@@ -328,6 +329,7 @@ const char *zone_status_table[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
@@ -882,6 +884,43 @@ static rctl_ops_t zone_cpu_shares_ops = {
rcop_no_test
};
+/*
+ * zone.cpu-cap resource control support.
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get(p->p_zone));
+}
+
+/*ARGSUSED*/
+static int
+zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+ * set cap to the new value.
+ */
+ return (cpucaps_zone_set(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_cap_ops = {
+ rcop_no_action,
+ zone_cpu_cap_get,
+ zone_cpu_cap_set,
+ rcop_no_test
+};
+
/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
@@ -1384,8 +1423,13 @@ zone_init(void)
rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
- FSS_MAXSHARES, FSS_MAXSHARES,
- &zone_cpu_shares_ops);
+ FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
+
+ rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
+ RCTL_GLOBAL_INFINITE,
+ MAXCAP, MAXCAP, &zone_cpu_cap_ops);
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
@@ -1530,6 +1574,13 @@ zone_free(zone_t *zone)
ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
+ /*
+ * Remove any zone caps.
+ */
+ cpucaps_zone_remove(zone);
+
+ ASSERT(zone->zone_cpucap == NULL);
+
/* remove from deathrow list */
if (zone_status_get(zone) == ZONE_IS_DEAD) {
ASSERT(zone->zone_ref == 0);
@@ -2501,6 +2552,10 @@ zthread_exit(void)
zone->zone_kthreads = NULL;
if (zone_status_get(zone) == ZONE_IS_EMPTY) {
zone_status_set(zone, ZONE_IS_DOWN);
+ /*
+ * Remove any CPU caps on this zone.
+ */
+ cpucaps_zone_remove(zone);
}
} else {
t->t_forw->t_back = t->t_back;
@@ -2616,8 +2671,9 @@ zone_start_init(void)
* Make sure we are still in the booting state-- we could have
* raced and already be shutting down, or even further along.
*/
- if (zone_status_get(z) == ZONE_IS_BOOTING)
+ if (zone_status_get(z) == ZONE_IS_BOOTING) {
zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
+ }
mutex_exit(&zone_status_lock);
/* It's gone bad, dispose of the process */
if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
@@ -3879,7 +3935,13 @@ zone_destroy(zoneid_t zoneid)
}
- /* Get rid of the zone's kstats. */
+ /*
+ * Remove CPU cap for this zone now since we're not going to
+ * fail below this point.
+ */
+ cpucaps_zone_remove(zone);
+
+ /* Get rid of the zone's kstats */
zone_kstat_delete(zone);
/* Say goodbye to brand framework. */
@@ -3938,8 +4000,8 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
char *outstr;
zone_status_t zone_status;
pid_t initpid;
- boolean_t global = (curproc->p_zone == global_zone);
- boolean_t curzone = (curproc->p_zone->zone_id == zoneid);
+ boolean_t global = (curzone == global_zone);
+ boolean_t inzone = (curzone->zone_id == zoneid);
ushort_t flags;
mutex_enter(&zonehash_lock);
@@ -3980,7 +4042,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
bcopy(zone->zone_rootpath, zonepath, size);
zonepath[size - 1] = '\0';
} else {
- if (curzone || !is_system_labeled()) {
+ if (inzone || !is_system_labeled()) {
/*
* Caller is not in the global zone.
* if the query is on the current zone
@@ -4011,7 +4073,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
if (err != 0 && err != ENAMETOOLONG)
error = EFAULT;
}
- if (global || (is_system_labeled() && !curzone))
+ if (global || (is_system_labeled() && !inzone))
kmem_free(zonepath, size);
break;
@@ -4365,6 +4427,7 @@ zone_enter(zoneid_t zoneid)
int err = 0;
rctl_entity_p_t e;
size_t swap;
+ kthread_id_t t;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -4625,6 +4688,28 @@ zone_enter(zoneid_t zoneid)
pgjoin(pp, zone->zone_zsched->p_pidp);
/*
+ * If any threads are scheduled to be placed on zone wait queue they
+ * should abandon the idea since the wait queue is changing.
+ * We need to be holding pidlock & p_lock to do this.
+ */
+ if ((t = pp->p_tlist) != NULL) {
+ do {
+ thread_lock(t);
+ /*
+ * Kick this thread so that it doesn't sit
+ * on a wrong wait queue.
+ */
+ if (ISWAITING(t))
+ setrun_locked(t);
+
+ if (t->t_schedflag & TS_ANYWAITQ)
+ t->t_schedflag &= ~ TS_ANYWAITQ;
+
+ thread_unlock(t);
+ } while ((t = t->t_forw) != pp->p_tlist);
+ }
+
+ /*
* If there is a default scheduling class for the zone and it is not
* the class we are currently in, change all of the threads in the
* process to the new class. We need to be holding pidlock & p_lock
@@ -4633,7 +4718,6 @@ zone_enter(zoneid_t zoneid)
if (zone->zone_defaultcid > 0 &&
zone->zone_defaultcid != curthread->t_cid) {
pcparms_t pcparms;
- kthread_id_t t;
pcparms.pc_cid = zone->zone_defaultcid;
pcparms.pc_clparms[0] = 0;
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 62f1efac65..84f5253412 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -110,6 +110,8 @@ CHKHDRS= \
byteorder.h \
callb.h \
callo.h \
+ cpucaps.h \
+ cpucaps_impl.h \
ccompile.h \
cdio.h \
cladm.h \
@@ -564,6 +566,7 @@ CHKHDRS= \
vuid_state.h \
vuid_store.h \
wait.h \
+ waitq.h \
wanboot_impl.h \
watchpoint.h \
winlockio.h \
diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h
new file mode 100644
index 0000000000..6063ff4380
--- /dev/null
+++ b/usr/src/uts/common/sys/cpucaps.h
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CPUCAPS_H
+#define _SYS_CPUCAPS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/zone.h>
+#include <sys/project.h>
+#include <sys/time.h>
+#include <sys/rctl.h>
+
+/*
+ * CPU caps provide an absolute hard CPU usage limit which is enforced even if
+ * some CPUs are idle. It can be enforced at project or zone level.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * Valid caps values go from 1 to MAXCAP - 1. Specifying the MAXCAP as the cap
+ * value is equivalent to disabling the cap.
+ */
+#define MAXCAP UINT_MAX
+
+/*
+ * cpucaps_enabled is used to quickly check whether any CPU caps specific code
+ * should be invoked. Users outside CPU Caps framework should use CPUCAPS_ON()
+ * and CPUCAPS_OFF() macros.
+ */
+extern boolean_t cpucaps_enabled;
+
+#define CPUCAPS_ON() cpucaps_enabled
+#define CPUCAPS_OFF() (!cpucaps_enabled)
+
+/*
+ * Initialize the CPU caps framework.
+ */
+extern void cpucaps_init(void);
+
+/*
+ * Notify caps framework of a new project coming in or existing project
+ * going away
+ */
+extern void cpucaps_project_add(kproject_t *);
+extern void cpucaps_project_remove(kproject_t *);
+
+/*
+ * Notify caps framework when a zone is going away.
+ */
+extern void cpucaps_zone_remove(zone_t *);
+
+/*
+ * Set project/zone cap to specified value. Value of MAXCAP should disable caps.
+ */
+extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
+extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
+
+/*
+ * Get current CPU usage for a project/zone.
+ */
+extern rctl_qty_t cpucaps_project_get(kproject_t *);
+extern rctl_qty_t cpucaps_zone_get(zone_t *);
+
+/*
+ * Scheduling class hooks into CPU caps framework.
+ */
+
+/*
+ * CPU caps specific data for each scheduling class.
+ *
+ * There is a small amount of accounting data that should be kept by each
+ * scheduling class for each thread which is only used by CPU caps code. This
+ * data is kept in the caps_sc structure which is transparent for all scheduling
+ * classes. The fields in the structure are:
+ *
+ * csc_cputime - Total time spent on CPU during thread lifetime, obtained
+ * as the sum of user, system and trap time, reported by
+ * microstate accounting.
+ */
+typedef struct caps_sc {
+ hrtime_t csc_cputime;
+} caps_sc_t;
+
+/*
+ * Initialize per-thread cpu-caps specific data.
+ */
+extern void cpucaps_sc_init(caps_sc_t *);
+
+/*
+ * Modus operandi for cpucaps_charge() function.
+ *
+ * CPUCAPS_CHARGE_ENFORCE - charge a thread for its CPU time and
+ * flag it to be placed on wait queue.
+ *
+ * CPUCAPS_CHARGE_ONLY - charge a thread for its CPU time.
+ */
+typedef enum {
+ CPUCAPS_CHARGE_ENFORCE,
+ CPUCAPS_CHARGE_ONLY
+} cpucaps_charge_t;
+
+/*
+ * Add accumulated CPU usage of a thread to its cap.
+ * Return True if thread should be placed on waitq.
+ */
+extern boolean_t cpucaps_charge(kthread_t *, caps_sc_t *, cpucaps_charge_t);
+#define CPUCAPS_CHARGE(t, scp, flag) \
+ (CPUCAPS_ON() && cpucaps_charge(t, scp, flag))
+
+/*
+ * Request a thread to be placed on a wait queue because the cap is exceeded
+ */
+extern boolean_t cpucaps_enforce(kthread_t *);
+#define CPUCAPS_ENFORCE(t) (CPUCAPS_ON() && cpucaps_enforce(t))
+
+/*
+ * CPU Caps hook into clock().
+ */
+extern void (*cpucaps_clock_callout)(void);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CPUCAPS_H */
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
new file mode 100644
index 0000000000..ba4132993f
--- /dev/null
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CPUCAPS_IMPL_H
+#define _SYS_CPUCAPS_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/kstat.h>
+#include <sys/cpucaps.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/waitq.h>
+
+/*
+ * When resource control framework sets the cap to NOCAP value the cap
+ * is disabled.
+ */
+#define NOCAP MAXCAP
+
+/*
+ * Maximum value for the cap usage. Should be the maximum value for hrtime_t
+ */
+#if defined(_LP64)
+#define MAX_USAGE LONG_MAX
+#else
+#define MAX_USAGE 9223372036854775807LL
+#endif
+
+
+/*
+ * Most of the per-project or per-zone state related to CPU caps is kept in the
+ * cpucap_t structure.
+ */
+typedef struct cpucap {
+ list_node_t cap_link; /* next/prev capped entity */
+ struct kproject *cap_project; /* project for the cap */
+ struct zone *cap_zone; /* zone for the cap */
+ waitq_t cap_waitq; /* waitq for capped threads */
+ kstat_t *cap_kstat; /* cpucaps specific kstat */
+ int64_t cap_lbolt; /* zone cap specific */
+ hrtime_t cap_value; /* scaled CPU usage cap */
+ hrtime_t cap_usage; /* current CPU usage */
+ disp_lock_t cap_usagelock; /* protects cap_usage above */
+ /*
+ * Per cap statistics.
+ */
+ hrtime_t cap_maxusage; /* maximum cap usage */
+ u_longlong_t cap_below; /* # of ticks spent below the cap */
+ u_longlong_t cap_above; /* # of ticks spent above the cap */
+} cpucap_t;
+
+/*
+ * Wrapper macros for checking cap state.
+ */
+#define CAP_ENABLED(cap) ((cap)->cap_value != 0)
+#define CAP_DISABLED(cap) (!CAP_ENABLED(cap))
+
+#define PROJECT_IS_CAPPED(project) \
+ (((project)->kpj_cpucap != NULL) && \
+ CAP_ENABLED((project)->kpj_cpucap))
+
+#define ZONE_IS_CAPPED(zone) \
+ (((zone)->zone_cpucap != NULL) && \
+ CAP_ENABLED((zone)->zone_cpucap))
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CPUCAPS_IMPL_H */
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index c211b63185..b0493774e0 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -508,6 +508,7 @@ extern int max_ncpus; /* max present before ncpus is known */
extern int boot_max_ncpus; /* like max_ncpus but for real */
extern processorid_t max_cpuid; /* maximum CPU number */
extern struct cpu *cpu_inmotion; /* offline or partition move target */
+extern cpu_t *clock_cpu_list;
#if defined(__i386) || defined(__amd64)
extern struct cpu *curcpup(void);
diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h
index 21323fcc1a..583586fd75 100644
--- a/usr/src/uts/common/sys/fss.h
+++ b/usr/src/uts/common/sys/fss.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,11 +32,14 @@
#include <sys/types.h>
#include <sys/thread.h>
#include <sys/project.h>
+#include <sys/cpucaps.h>
#ifdef __cplusplus
extern "C" {
#endif
+#ifdef _KERNEL
+
typedef uint64_t fsspri_t;
typedef uint64_t fssusage_t;
struct cpupart;
@@ -74,7 +76,7 @@ void fss_changepset(kthread_id_t, void *, fssbuf_t *, fssbuf_t *);
typedef struct fsspset {
kmutex_t fssps_lock; /* lock to protect per-pset */
/* list of fssproj structures */
- disp_lock_t fssps_displock; /* lock for fsps_maxfspri */
+ disp_lock_t fssps_displock; /* lock for fsps_maxfsspri */
struct cpupart *fssps_cpupart; /* ptr to our cpu partition */
/* protected by fsspsets_lock */
fsspri_t fssps_maxfsspri; /* maximum fsspri value among */
@@ -113,7 +115,7 @@ typedef struct fssproj {
*/
typedef struct fssproc {
kthread_t *fss_tp; /* pointer back to our thread */
- fssproj_t *fss_proj; /* pointer to our project FS data */
+ fssproj_t *fss_proj; /* pointer to our project FSS data */
uchar_t fss_flags; /* flags defined below */
int fss_timeleft; /* time remaining in procs quantum */
uint32_t fss_ticks; /* ticks accumulated by this thread */
@@ -126,20 +128,22 @@ typedef struct fssproc {
int fss_runnable; /* to indicate runnable/sleeping thread */
struct fssproc *fss_next; /* pointer to next fssproc_t struct */
struct fssproc *fss_prev; /* pointer to prev fssproc_t sturct */
+ caps_sc_t fss_caps; /* CPU caps specific data */
} fssproc_t;
/*
- * One of these structures is allocated to each zone running within each active
- * cpu partition.
+ * One of these structures is allocated to each zone running within
+ * each active cpu partition. This means that if a zone spans more
+ * than one cpu partition then it will have a few of these structures.
*/
typedef struct fsszone {
- struct zone *fssz_zone; /* ptr to our zone structure */
- struct fsszone *fssz_next; /* ptr to next fsszone in fsspset */
- struct fsszone *fssz_prev; /* ptr to prev fsszone in fsspset */
- uint32_t fssz_shares; /* total #shares for projs in zone */
- uint32_t fssz_nproj; /* # fssproj_t's in this fsszone */
- uint32_t fssz_rshares; /* "real" shares given to zone */
- uint32_t fssz_runnable; /* # projects with runnable threads */
+ struct zone *fssz_zone; /* ptr to our zone structure */
+ struct fsszone *fssz_next; /* next fsszone_t in fsspset_t */
+ struct fsszone *fssz_prev; /* prev fsszone_t in fsspset_t */
+ uint32_t fssz_shares; /* sum of all project shares */
+ uint32_t fssz_nproj; /* # of projects */
+ uint32_t fssz_rshares; /* "real" shares given to zone */
+ uint32_t fssz_runnable; /* # of runnable projects */
} fsszone_t;
#define FSSPROC(tx) ((fssproc_t *)(tx->t_cldata))
@@ -158,6 +162,9 @@ typedef struct fsszone {
/* the dispatch queue if preempted */
#define FSSRESTORE 0x04 /* thread was not preempted, due to schedctl */
/* restore priority from fss_scpri */
+
+#endif /* _KERNEL */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h
index 166bc0a06c..2d4e1aa7fb 100644
--- a/usr/src/uts/common/sys/fx.h
+++ b/usr/src/uts/common/sys/fx.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,8 +19,8 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
#ifndef _SYS_FX_H
@@ -33,6 +32,7 @@
#include <sys/thread.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <sys/cpucaps.h>
#ifdef __cplusplus
extern "C" {
@@ -114,9 +114,6 @@ typedef struct fxproc {
char fx_nice; /* nice value for compatibility */
uchar_t fx_flags; /* flags defined below */
kthread_t *fx_tp; /* pointer to thread */
- struct fxproc *fx_next; /* pointer to next fxproc */
-
- struct fxproc *fx_prev; /* pointer to previous fxproc */
/* the following are used only when we have callbacks registered */
kt_did_t fx_ktid;
@@ -128,6 +125,7 @@ typedef struct fxproc {
fx_cookie_t fx_cookie; /* cookie with which callback */
/* was registered */
fx_callbacks_t *fx_callback; /* pointer to callback structure */
+ caps_sc_t fx_caps; /* CPU caps specific data */
} fxproc_t;
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index 9a0ba2cc37..02c414b4fc 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -424,11 +424,12 @@ extern struct pid pid0; /* p0's pid */
/* stat codes */
#define SSLEEP 1 /* awaiting an event */
-#define SRUN 2 /* running */
+#define SRUN 2 /* runnable */
#define SZOMB 3 /* process terminated but not waited for */
#define SSTOP 4 /* process stopped by debugger */
#define SIDL 5 /* intermediate state in process creation */
#define SONPROC 6 /* process is being run on a processor */
+#define SWAIT 7 /* process is waiting to become runnable */
/* p_pidflag codes */
#define CLDPEND 0x0001 /* have yet to post a SIGCHLD to the parent */
@@ -639,6 +640,7 @@ extern void term_mstate(kthread_t *);
extern void estimate_msacct(kthread_t *, hrtime_t);
extern void disable_msacct(proc_t *);
extern hrtime_t mstate_aggr_state(proc_t *, int);
+extern hrtime_t mstate_thread_onproc_time(kthread_t *);
extern void syscall_mstate(int, int);
extern uint_t cpu_update_pct(kthread_t *, hrtime_t);
diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h
index 68aaef7730..15a0bf2193 100644
--- a/usr/src/uts/common/sys/project.h
+++ b/usr/src/uts/common/sys/project.h
@@ -39,6 +39,7 @@ extern "C" {
#include <sys/mutex.h>
#include <sys/rctl.h>
#include <sys/ipc_rctl.h>
+#include <sys/zone.h>
typedef struct kproject_kstat {
kstat_named_t kpk_zonename;
@@ -58,25 +59,29 @@ typedef struct kproject_data { /* Datum protected by: */
} kproject_data_t;
+struct cpucap;
+
/*
* The first two fields of this structure must not be reordered.
*/
typedef struct kproject {
projid_t kpj_id; /* project ID */
zoneid_t kpj_zoneid; /* zone ID */
+ struct zone *kpj_zone; /* zone pointer */
uint_t kpj_count; /* reference counter */
uint32_t kpj_shares; /* number of shares */
rctl_set_t *kpj_rctls; /* resource control set */
struct kproject *kpj_prev; /* previous project */
struct kproject *kpj_next; /* next project */
kproject_data_t kpj_data; /* subsystem-specfic data */
- kmutex_t kpj_poolbind; /* synch. with pools */
+ kmutex_t kpj_poolbind; /* synchronization with pools */
rctl_qty_t kpj_nlwps; /* protected by project's zone's */
/* zone_nlwps_lock */
rctl_qty_t kpj_nlwps_ctl; /* protected by kpj_rctls->rcs_lock */
rctl_qty_t kpj_ntasks; /* protected by project's zone's */
/* zone_nlwps_lock */
rctl_qty_t kpj_ntasks_ctl; /* protected by kpj_rctls->rcs_lock */
+ struct cpucap *kpj_cpucap; /* CPU cap data */
} kproject_t;
#ifdef _KERNEL
@@ -87,8 +92,6 @@ typedef struct kproject {
#define PROJECT_HOLD_FIND 1
#define PROJECT_HOLD_INSERT 2
-struct zone;
-
void project_init(void);
kproject_t *project_hold(kproject_t *);
kproject_t *project_hold_by_id(projid_t, struct zone *, int);
diff --git a/usr/src/uts/common/sys/schedctl.h b/usr/src/uts/common/sys/schedctl.h
index c6546e607e..165ff3f171 100644
--- a/usr/src/uts/common/sys/schedctl.h
+++ b/usr/src/uts/common/sys/schedctl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1997-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -81,6 +80,7 @@ typedef struct sc_shared {
#define SC_RUN 0x02
#define SC_ONPROC 0x04
#define SC_STOPPED 0x10
+#define SC_WAIT 0x20
/* preemption control settings */
#define SC_MAX_TICKS 2 /* max time preemption can be blocked */
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index c0ee411715..acfebbfb88 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,7 @@ extern "C" {
#define TS_ONPROC 0x04 /* Thread is being run on a processor */
#define TS_ZOMB 0x08 /* Thread has died but hasn't been reaped */
#define TS_STOPPED 0x10 /* Stopped, initial state */
+#define TS_WAIT 0x20 /* Waiting to become runnable */
typedef struct ctxop {
void (*save_op)(void *); /* function to invoke to save context */
@@ -98,6 +99,7 @@ struct trap_info;
struct upimutex;
struct kproject;
struct on_trap_data;
+struct waitq;
/* Definition for kernel thread identifier type */
typedef uint64_t kt_did_t;
@@ -333,6 +335,7 @@ typedef struct _kthread {
#endif
hrtime_t t_hrtime; /* high-res last time on cpu */
kmutex_t t_ctx_lock; /* protects t_ctx in removectx() */
+ struct waitq *t_waitq; /* wait queue */
} kthread_t;
/*
@@ -391,6 +394,8 @@ typedef struct _kthread {
#define TS_SWAPENQ 0x0004 /* swap thread when it reaches a safe point */
#define TS_ON_SWAPQ 0x0008 /* thread is on the swap queue */
#define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */
+#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */
+#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */
#define TS_CSTART 0x0100 /* setrun() by continuelwps() */
#define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */
#define TS_XSTART 0x0400 /* setrun() by SIGCONT */
@@ -400,6 +405,7 @@ typedef struct _kthread {
#define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */
#define TS_ALLSTART \
(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE)
+#define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ)
/*
* No locking needed for AST field.
@@ -411,6 +417,13 @@ typedef struct _kthread {
#define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
!((t)->t_schedflag & TS_PSTART))
+/* True if thread is asleep and wakeable */
+#define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \
+ ((t)->t_flag & T_WAKEABLE)))
+
+/* True if thread is on the wait queue */
+#define ISWAITING(t) ((t)->t_state == TS_WAIT)
+
/* similar to ISTOPPED except the event of interest is CPR */
#define CPR_ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
!((t)->t_schedflag & TS_RESUME))
@@ -465,6 +478,9 @@ typedef struct _kthread {
* ttoproj(x)
* convert a thread pointer to its project pointer.
*
+ * ttozone(x)
+ * convert a thread pointer to its zone pointer.
+ *
* lwptot(x)
* convert a lwp pointer to its thread pointer.
*
@@ -476,6 +492,7 @@ typedef struct _kthread {
#define ttolwp(x) ((x)->t_lwp)
#define ttoproc(x) ((x)->t_procp)
#define ttoproj(x) ((x)->t_proj)
+#define ttozone(x) ((x)->t_procp->p_zone)
#define lwptot(x) ((x)->lwp_thread)
#define lwptoproc(x) ((x)->lwp_procp)
@@ -488,6 +505,7 @@ extern kthread_t *threadp(void); /* inline, returns thread pointer */
#define curthread (threadp()) /* current thread pointer */
#define curproc (ttoproc(curthread)) /* current process pointer */
#define curproj (ttoproj(curthread)) /* current project pointer */
+#define curzone (curproc->p_zone) /* current zone pointer */
extern struct _kthread t0; /* the scheduler thread */
extern kmutex_t pidlock; /* global process lock */
@@ -583,6 +601,12 @@ caddr_t thread_stk_init(caddr_t); /* init thread stack */
#define THREAD_RUN(tp, lp) THREAD_SET_STATE(tp, TS_RUN, lp)
/*
+ * Put thread in wait state, and set the lock pointer to the wait queue
+ * lock pointer provided. This lock should be held.
+ */
+#define THREAD_WAIT(tp, lp) THREAD_SET_STATE(tp, TS_WAIT, lp)
+
+/*
* Put thread in run state, and set the lock pointer to the dispatcher queue
* lock pointer provided (i.e., the "swapped_lock"). This lock should be held.
*/
@@ -620,7 +644,6 @@ caddr_t thread_stk_init(caddr_t); /* init thread stack */
#define THREAD_FREEINTR(tp, cpu) \
THREAD_SET_STATE(tp, TS_FREE, &(cpu)->cpu_thread_lock)
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/ts.h b/usr/src/uts/common/sys/ts.h
index 982c594977..971bbaded9 100644
--- a/usr/src/uts/common/sys/ts.h
+++ b/usr/src/uts/common/sys/ts.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,6 +35,7 @@
#include <sys/types.h>
#include <sys/thread.h>
+#include <sys/cpucaps.h>
#ifdef __cplusplus
extern "C" {
@@ -57,36 +57,38 @@ typedef struct tsdpent {
/* ts_maxwait */
} tsdpent_t;
-
+#ifdef _KERNEL
/*
* time-sharing class specific thread structure
*/
typedef struct tsproc {
- int ts_timeleft; /* time remaining in procs quantum */
- uint_t ts_dispwait; /* wall clock seconds since start */
- /* of quantum (not reset upon preemption */
+ int ts_timeleft; /* time remaining in procs quantum */
+ uint_t ts_dispwait; /* wall clock seconds since start */
+ /* of quantum (not reset upon preempt) */
pri_t ts_cpupri; /* system controlled component of ts_umdpri */
- pri_t ts_uprilim; /* user priority limit */
- pri_t ts_upri; /* user priority */
- pri_t ts_umdpri; /* user mode priority within ts class */
- pri_t ts_scpri; /* remembered priority, for schedctl */
- char ts_nice; /* nice value for compatibility */
- char ts_boost; /* interactive priority offset */
- uchar_t ts_flags; /* flags defined below */
- kthread_t *ts_tp; /* pointer to thread */
- struct tsproc *ts_next; /* link to next tsproc on list */
- struct tsproc *ts_prev; /* link to previous tsproc on list */
+ pri_t ts_uprilim; /* user priority limit */
+ pri_t ts_upri; /* user priority */
+ pri_t ts_umdpri; /* user mode priority within ts class */
+ pri_t ts_scpri; /* remembered priority, for schedctl */
+ char ts_nice; /* nice value for compatibility */
+ char ts_boost; /* interactive priority offset */
+ uchar_t ts_flags; /* flags defined below */
+ kthread_t *ts_tp; /* pointer to thread */
+ struct tsproc *ts_next; /* link to next tsproc on list */
+ struct tsproc *ts_prev; /* link to previous tsproc on list */
+ caps_sc_t ts_caps; /* CPU caps specific data */
} tsproc_t;
-
/* flags */
-#define TSKPRI 0x01 /* thread at kernel mode priority */
-#define TSBACKQ 0x02 /* thread goes to back of disp q when preempted */
-#define TSIA 0x04 /* thread is interactive */
-#define TSIASET 0x08 /* interactive thread is "on" */
-#define TSIANICED 0x10 /* interactive thread has been niced */
+#define TSKPRI 0x01 /* thread at kernel mode priority */
+#define TSBACKQ 0x02 /* thread goes to back of dispq if preempted */
+#define TSIA 0x04 /* thread is interactive */
+#define TSIASET 0x08 /* interactive thread is "on" */
+#define TSIANICED 0x10 /* interactive thread has been niced */
#define TSRESTORE 0x20 /* thread was not preempted, due to schedctl */
- /* restore priority from ts_scpri */
+ /* restore priority from ts_scpri */
+
+#endif /* _KERNEL */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/waitq.h b/usr/src/uts/common/sys/waitq.h
new file mode 100644
index 0000000000..3b925884d0
--- /dev/null
+++ b/usr/src/uts/common/sys/waitq.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_WAITQ_H
+#define _SYS_WAITQ_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/machlock.h>
+#include <sys/thread.h>
+
+typedef struct waitq {
+ disp_lock_t wq_lock; /* protects all fields */
+ kthread_t *wq_first; /* first thread on the queue */
+ int wq_count; /* number of threads on the queue */
+ boolean_t wq_blocked; /* True if threads can't be enqueued */
+} waitq_t;
+
+extern void waitq_init(waitq_t *);
+extern void waitq_fini(waitq_t *);
+
+/*
+ * Place the thread on the wait queue. An attempt to enqueue a thread onto a
+ * blocked queue fails and returns zero. Successful enqueue returns non-zero
+ * value.
+ */
+extern int waitq_enqueue(waitq_t *, kthread_t *);
+
+/*
+ * Take thread off its wait queue and make it runnable.
+ */
+extern void waitq_setrun(kthread_t *t);
+
+/*
+ * Change priority for the thread on wait queue.
+ */
+extern void waitq_change_pri(kthread_t *, pri_t);
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ */
+extern void waitq_runone(waitq_t *);
+
+/*
+ * Return True if there are no threads on the queue.
+ */
+extern boolean_t waitq_isempty(waitq_t *);
+
+/*
+ * Prevent and allow placing new threads on wait queue.
+ */
+extern void waitq_block(waitq_t *);
+extern void waitq_unblock(waitq_t *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_WAITQ_H */
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 9983e8ec85..615228921a 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -305,6 +305,8 @@ typedef struct zone_kstat {
kstat_named_t zk_value;
} zone_kstat_t;
+struct cpucap;
+
typedef struct zone {
/*
* zone_name is never modified once set.
@@ -416,6 +418,7 @@ typedef struct zone {
*/
struct dlnamelist *zone_dl_list;
netstack_t *zone_netstack;
+ struct cpucap *zone_cpucap; /* CPU caps data */
} zone_t;
/*
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index b6c7621409..38451ba8ad 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -1408,7 +1408,7 @@ out: /* We can't get here from a system trap */
CL_TRAPRET(ct);
thread_unlock(ct);
}
- if (CPU->cpu_runrun)
+ if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
preempt();
(void) new_mstate(ct, mstate);
diff --git a/usr/src/uts/intel/ia32/os/syscall.c b/usr/src/uts/intel/ia32/os/syscall.c
index a78fbc62a6..f40ab4a175 100644
--- a/usr/src/uts/intel/ia32/os/syscall.c
+++ b/usr/src/uts/intel/ia32/os/syscall.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -761,7 +761,7 @@ sig_check:
CL_TRAPRET(t);
thread_unlock(t);
}
- if (CPU->cpu_runrun)
+ if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
preempt();
lwp->lwp_errno = 0; /* clear error for next time */
diff --git a/usr/src/uts/sparc/os/syscall.c b/usr/src/uts/sparc/os/syscall.c
index 3d11c7f88a..51db208f41 100644
--- a/usr/src/uts/sparc/os/syscall.c
+++ b/usr/src/uts/sparc/os/syscall.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -857,7 +857,7 @@ sig_check:
CL_TRAPRET(t);
thread_unlock(t);
}
- if (CPU->cpu_runrun)
+ if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
preempt();
/*
diff --git a/usr/src/uts/sun4/os/trap.c b/usr/src/uts/sun4/os/trap.c
index 797d3ff839..493696f046 100644
--- a/usr/src/uts/sun4/os/trap.c
+++ b/usr/src/uts/sun4/os/trap.c
@@ -1564,7 +1564,7 @@ trap_rtt(void)
CL_TRAPRET(curthread);
thread_unlock(curthread);
}
- if (CPU->cpu_runrun)
+ if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
preempt();
if (lwp->lwp_pcb.pcb_step != STEP_NONE)
prdostep();
diff --git a/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c b/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c
index fb61524dfb..3ef5e2a167 100644
--- a/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c
+++ b/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -566,8 +566,7 @@ dr_stop_user_threads(dr_sr_handle_t *srh)
aston(tp);
- if (tp->t_state == TS_SLEEP &&
- (tp->t_flag & T_WAKEABLE)) {
+ if (ISWAKEABLE(tp) || ISWAITING(tp)) {
setrun_locked(tp);
}
diff --git a/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c b/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c
index ab078974e3..7b42c3e905 100644
--- a/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c
+++ b/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -541,11 +541,9 @@ sbdp_stop_user_threads(sbdp_sr_handle_t *srh)
aston(tp);
- if (tp->t_state == TS_SLEEP &&
- (tp->t_flag & T_WAKEABLE)) {
+ if (ISWAKEABLE(tp) || ISWAITING(tp)) {
setrun_locked(tp);
}
-
}
/* grab thread if needed */
diff --git a/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c b/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c
index 93d229fb3e..ff72a5e344 100644
--- a/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c
+++ b/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -372,8 +372,7 @@ sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
aston(tp);
- if (tp->t_state == TS_SLEEP &&
- (tp->t_flag & T_WAKEABLE)) {
+ if (ISWAKEABLE(tp) || ISWAITING(tp)) {
setrun_locked(tp);
}