author:    akolb <none@none>  2007-03-09 15:55:28 -0800
committer: akolb <none@none>  2007-03-09 15:55:28 -0800
commit:    c97ad5cdc75eb73e3cc38542ca3ba783574b0a7a (patch)
tree:      5ba1653d892978d87d6061c8c7f3821f4b3e354c
parent:    68d3ac02fc9db49ae9dccaecff999963114930a7 (diff)
download:  illumos-joyent-c97ad5cdc75eb73e3cc38542ca3ba783574b0a7a.tar.gz
PSARC/2004/402 CPU Caps
6327235 PSARC/2004/402 CPU caps
6464161 Dead KSLICE code should be removed
6514387 FX class contains dead code to keep list of member threads
6518395 kstat_zone_add performs KM_SLEEP allocation when it should not
54 files changed, 2728 insertions(+), 290 deletions(-)
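The user-visible half of this change is the new capped-cpu resource in zonecfg(1M). A hypothetical session (the zone name is illustrative) using only the subcommands this patch adds would look like:

    # zonecfg -z myzone
    zonecfg:myzone> add capped-cpu
    zonecfg:myzone:capped-cpu> set ncpus=1.5
    zonecfg:myzone:capped-cpu> end
    zonecfg:myzone> commit
    zonecfg:myzone> exit

Per the set_func() hunk below, ncpus is parsed with strtof() and stored through the ALIAS_CPUCAP rctl alias as (int)(cap * 100), so ncpus=1.5 becomes a zone.cpu-cap rctl value of 150.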
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index decf655500..51bebaedf5 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -124,6 +124,7 @@ pstat2ch(uchar_t state) case SIDL: return ('I'); case SONPROC: return ('O'); case SSTOP: return ('T'); + case SWAIT: return ('W'); default: return ('?'); } } @@ -148,6 +149,7 @@ ps_threadprint(uintptr_t addr, const void *data, void *private) { "TS_ONPROC", TS_ONPROC, TS_ONPROC }, { "TS_ZOMB", TS_ZOMB, TS_ZOMB }, { "TS_STOPPED", TS_STOPPED, TS_STOPPED }, + { "TS_WAIT", TS_WAIT, TS_WAIT }, { NULL, 0, 0 } }; diff --git a/usr/src/cmd/mdb/common/modules/genunix/thread.c b/usr/src/cmd/mdb/common/modules/genunix/thread.c index 704c27f42e..d552bd381f 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/thread.c +++ b/usr/src/cmd/mdb/common/modules/genunix/thread.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -455,6 +454,9 @@ thread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) case TS_STOPPED: state = "stopped"; break; + case TS_WAIT: + state = "wait"; + break; default: (void) mdb_snprintf(stbuf, 11, "inval/%02x", t.t_state); state = stbuf; diff --git a/usr/src/cmd/prstat/prutil.c b/usr/src/cmd/prstat/prutil.c index 638dc8086f..8c0f99f138 100644 --- a/usr/src/cmd/prstat/prutil.c +++ b/usr/src/cmd/prstat/prutil.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -197,8 +197,8 @@ Format_state(char *str, char state, processorid_t pr_id, int length) case 'I': (void) strncpy(str, "idle", length); break; - case 'X': - (void) strncpy(str, "xbrk", length); + case 'W': + (void) strncpy(str, "wait", length); break; case 'O': (void) snprintf(str, length, "cpu%-3d", (int)pr_id); diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c index 8a532faa49..ac5525604e 100644 --- a/usr/src/cmd/zonecfg/zonecfg.c +++ b/usr/src/cmd/zonecfg/zonecfg.c @@ -179,6 +179,7 @@ static char *res_types[] = { ALIAS_SHARES, "scheduling-class", "ip-type", + "capped-cpu", NULL }; @@ -265,6 +266,7 @@ static const char *add_cmds[] = { "add attr", "add dataset", "add dedicated-cpu", + "add capped-cpu", "add capped-memory", NULL }; @@ -294,6 +296,7 @@ static const char *remove_cmds[] = { "remove attr ", "remove dataset ", "remove dedicated-cpu ", + "remove capped-cpu ", "remove capped-memory ", NULL }; @@ -307,6 +310,7 @@ static const char *select_cmds[] = { "select attr ", "select dataset ", "select dedicated-cpu", + "select capped-cpu", "select capped-memory", NULL }; @@ -340,6 +344,7 @@ static const char *info_cmds[] = { "info dataset ", "info capped-memory", "info dedicated-cpu", + "info capped-cpu", "info zonename", "info zonepath", "info autoboot", @@ -451,6 +456,16 @@ static const char *pset_res_scope_cmds[] = { NULL }; +static const char *pcap_res_scope_cmds[] = { + "cancel", + "end", + "exit", + "help", + "info", + "set ncpus=", + NULL +}; + static const char *mcap_res_scope_cmds[] = { "cancel", "end", @@ -605,6 +620,8 @@ CPL_MATCH_FN(cmd_cpl_fn) return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end)); case RT_DCPU: return (add_stuff(cpl, line, pset_res_scope_cmds, word_end)); + case RT_PCAP: + return (add_stuff(cpl, line, pcap_res_scope_cmds, word_end)); case RT_MCAP: return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end)); } @@ -1003,6 +1020,20 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_IMPORTANCE), gettext("<unsigned integer>")); break; + case RT_PCAP: + (void) fprintf(fp, gettext("The '%s' resource scope is " + "used to set an upper limit (a cap) on the\n" + "percentage of CPU that can be used by this zone. " + "A '%s' value of 1\ncorresponds to one cpu. The " + "value can be set higher than 1, up to the total\n" + "number of CPUs on the system. The value can " + "also be less than 1,\nrepresenting a fraction of " + "a cpu.\n"), + rt_to_str(resource_scope), pt_to_str(PT_NCPUS)); + (void) fprintf(fp, gettext("Valid commands:\n")); + (void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET), + pt_to_str(PT_NCPUS), gettext("<unsigned decimal>")); + break; case RT_MCAP: (void) fprintf(fp, gettext("The '%s' resource scope is " "used to set an upper limit (a cap) on the\n" @@ -1078,12 +1109,12 @@ usage(bool verbose, uint_t flags) } if (flags & HELP_RESOURCES) { (void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t" - "%s | %s | %s\n\n", + "%s | %s | %s | %s\n\n", gettext("resource type"), rt_to_str(RT_FS), rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE), rt_to_str(RT_RCTL), rt_to_str(RT_ATTR), rt_to_str(RT_DATASET), rt_to_str(RT_DCPU), - rt_to_str(RT_MCAP)); + rt_to_str(RT_PCAP), rt_to_str(RT_MCAP)); } if (flags & HELP_PROPS) { (void) fprintf(fp, gettext("For resource type ... 
there are " @@ -1137,6 +1168,8 @@ usage(bool verbose, uint_t flags) pt_to_str(PT_NAME)); (void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU), pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE)); + (void) fprintf(fp, "\t%s\t%s\n", rt_to_str(RT_PCAP), + pt_to_str(PT_NCPUS)); (void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP), pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP), pt_to_str(PT_LOCKED)); @@ -1835,6 +1868,11 @@ export_func(cmd_t *cmd) (void) fprintf(of, "%s\n", cmd_to_str(CMD_END)); } + /* + * There is nothing to export for pcap since this resource is just + * a container for an rctl alias. + */ + done: if (need_to_close) (void) fclose(of); @@ -1908,6 +1946,7 @@ add_resource(cmd_t *cmd) int type; struct zone_psettab tmp_psettab; struct zone_mcaptab tmp_mcaptab; + uint64_t tmp; uint64_t tmp_mcap; char pool[MAXNAMELEN]; @@ -1951,12 +1990,18 @@ add_resource(cmd_t *cmd) bzero(&in_progress_dstab, sizeof (in_progress_dstab)); return; case RT_DCPU: - /* Make sure there isn't already a cpu-set entry. */ + /* Make sure there isn't already a cpu-set or cpu-cap entry. */ if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { zerr(gettext("The %s resource already exists."), rt_to_str(RT_DCPU)); goto bad; } + if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) != + Z_NO_ENTRY) { + zerr(gettext("The %s resource already exists."), + rt_to_str(RT_PCAP)); + goto bad; + } /* Make sure the pool property isn't set. */ if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK && @@ -1970,6 +2015,32 @@ add_resource(cmd_t *cmd) bzero(&in_progress_psettab, sizeof (in_progress_psettab)); return; + case RT_PCAP: + /* + * Make sure there isn't already a cpu-set or incompatible + * cpu-cap rctls. + */ + if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) { + zerr(gettext("The %s resource already exists."), + rt_to_str(RT_DCPU)); + goto bad; + } + + switch (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp)) { + case Z_ALIAS_DISALLOW: + zone_perror(rt_to_str(RT_PCAP), Z_ALIAS_DISALLOW, + FALSE); + goto bad; + + case Z_OK: + zerr(gettext("The %s resource already exists."), + rt_to_str(RT_PCAP)); + goto bad; + + default: + break; + } + return; case RT_MCAP: /* * Make sure there isn't already a mem-cap entry or max-swap @@ -2967,6 +3038,25 @@ remove_pset() } static void +remove_pcap() +{ + int err; + uint64_t tmp; + + if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) != Z_OK) { + zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_PCAP), + zonecfg_strerror(Z_NO_RESOURCE_TYPE)); + saw_error = TRUE; + return; + } + + if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_CPUCAP)) != Z_OK) + z_cmd_rt_perror(CMD_REMOVE, RT_PCAP, err, TRUE); + else + need_to_commit = TRUE; +} + +static void remove_mcap() { int err, res1, res2, res3; @@ -3074,6 +3164,9 @@ remove_resource(cmd_t *cmd) case RT_DCPU: remove_pset(); return; + case RT_PCAP: + remove_pcap(); + return; case RT_MCAP: remove_mcap(); return; @@ -3396,6 +3489,7 @@ select_func(cmd_t *cmd) { int type, err, res; uint64_t limit; + uint64_t tmp; if (zone_is_read_only(CMD_SELECT)) return; @@ -3493,6 +3587,13 @@ select_func(cmd_t *cmd) bcopy(&old_psettab, &in_progress_psettab, sizeof (struct zone_psettab)); return; + case RT_PCAP: + if ((err = zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp)) + != Z_OK) { + z_cmd_rt_perror(CMD_SELECT, RT_PCAP, err, TRUE); + global_scope = TRUE; + } + return; case RT_MCAP: /* if none of these exist, there is no resource to select */ if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK && @@ -3708,6 +3809,8 @@ 
set_func(cmd_t *cmd) boolean_t force_set = FALSE; size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap); uint64_t mem_cap, mem_limit; + float cap; + char *unitp; struct zone_psettab tmp_psettab; bool arg_err = FALSE; @@ -4200,6 +4303,34 @@ set_func(cmd_t *cmd) long_usage(CMD_SET, TRUE); usage(FALSE, HELP_PROPS); return; + case RT_PCAP: + if (prop_type != PT_NCPUS) { + zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, + TRUE); + long_usage(CMD_SET, TRUE); + usage(FALSE, HELP_PROPS); + return; + } + + /* + * We already checked that an rctl alias is allowed in + * the add_resource() function. + */ + + if ((cap = strtof(prop_id, &unitp)) <= 0 || *unitp != '\0' || + (int)(cap * 100) < 1) { + zerr(gettext("%s property is out of range."), + pt_to_str(PT_NCPUS)); + saw_error = TRUE; + return; + } + + if ((err = zonecfg_set_aliased_rctl(handle, ALIAS_CPUCAP, + (int)(cap * 100))) != Z_OK) + zone_perror(zone, err, TRUE); + else + need_to_commit = TRUE; + return; case RT_MCAP: switch (prop_type) { case PT_PHYSICAL: @@ -4790,6 +4921,26 @@ info_pset(zone_dochandle_t handle, FILE *fp) } static void +output_pcap(FILE *fp) +{ + uint64_t cap; + + if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &cap) == Z_OK) { + float scaled = (float)cap / 100; + (void) fprintf(fp, "%s:\n", rt_to_str(RT_PCAP)); + (void) fprintf(fp, "\t[%s: %.2f]\n", pt_to_str(PT_NCPUS), + scaled); + } +} + +static void +info_pcap(FILE *fp) +{ + output_pcap(fp); +} + + +static void info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias) { uint64_t limit; @@ -4932,6 +5083,9 @@ info_func(cmd_t *cmd) case RT_DCPU: output_pset(fp, &in_progress_psettab); break; + case RT_PCAP: + output_pcap(fp); + break; case RT_MCAP: res1 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP, &swap_limit); @@ -4986,6 +5140,7 @@ info_func(cmd_t *cmd) info_dev(handle, fp, cmd); } info_pset(handle, fp); + info_pcap(fp); info_mcap(handle, fp); if (!global_zone) { info_attr(handle, fp, cmd); @@ -5062,6 +5217,9 @@ info_func(cmd_t *cmd) case RT_DCPU: info_pset(handle, fp); break; + case RT_PCAP: + info_pcap(fp); + break; case RT_MCAP: info_mcap(handle, fp); break; @@ -5203,10 +5361,12 @@ verify_func(cmd_t *cmd) char sched[MAXNAMELEN]; char brand[MAXNAMELEN]; int err, ret_val = Z_OK, arg; + int pset_res; bool save = FALSE; bool arg_err = FALSE; zone_iptype_t iptype; boolean_t has_cpu_shares = B_FALSE; + boolean_t has_cpu_cap = B_FALSE; optind = 0; while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) { @@ -5333,6 +5493,9 @@ verify_func(cmd_t *cmd) if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0) has_cpu_shares = B_TRUE; + if (strcmp(rctltab.zone_rctl_name, "zone.cpu-cap") == 0) + has_cpu_cap = B_TRUE; + if (rctltab.zone_rctl_valptr == NULL) { zerr(gettext("%s: no %s specified"), rt_to_str(RT_RCTL), pt_to_str(PT_VALUE)); @@ -5345,7 +5508,8 @@ verify_func(cmd_t *cmd) } (void) zonecfg_endrctlent(handle); - if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) { + if ((pset_res = zonecfg_lookup_pset(handle, &psettab)) == Z_OK && + has_cpu_shares) { zerr(gettext("%s zone.cpu-shares and %s are incompatible."), rt_to_str(RT_RCTL), rt_to_str(RT_DCPU)); saw_error = TRUE; @@ -5364,6 +5528,14 @@ verify_func(cmd_t *cmd) ret_val = Z_INCOMPATIBLE; } + if (pset_res == Z_OK && has_cpu_cap) { + zerr(gettext("%s zone.cpu-cap and the %s are incompatible."), + rt_to_str(RT_RCTL), rt_to_str(RT_DCPU)); + saw_error = TRUE; + if (ret_val == Z_OK) + ret_val = Z_INCOMPATIBLE; + } + if ((err = zonecfg_setattrent(handle)) != Z_OK) { 
zone_perror(zone, err, TRUE); return; @@ -5562,6 +5734,7 @@ end_func(cmd_t *cmd) int err, arg, res1, res2, res3; uint64_t swap_limit; uint64_t locked_limit; + uint64_t proc_cap; assert(cmd != NULL); @@ -5888,6 +6061,17 @@ end_func(cmd_t *cmd) err = zonecfg_modify_pset(handle, &in_progress_psettab); } break; + case RT_PCAP: + /* Make sure everything was filled in. */ + if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &proc_cap) + != Z_OK) { + zerr(gettext("%s not specified"), pt_to_str(PT_NCPUS)); + saw_error = TRUE; + validation_failed = TRUE; + return; + } + err = Z_OK; + break; case RT_MCAP: /* Make sure everything was filled in. */ res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ? diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h index 4f960b56d1..3369012438 100644 --- a/usr/src/cmd/zonecfg/zonecfg.h +++ b/usr/src/cmd/zonecfg/zonecfg.h @@ -94,9 +94,10 @@ typedef int bool; #define RT_SHARES 22 /* really a rctl alias property, but for info */ #define RT_SCHED 23 /* really a property, but for info ... */ #define RT_IPTYPE 24 /* really a property, but for info ... */ +#define RT_PCAP 25 #define RT_MIN RT_UNKNOWN -#define RT_MAX RT_IPTYPE +#define RT_MAX RT_PCAP /* property types: increment PT_MAX when expanding this list */ #define PT_UNKNOWN 0 diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y index 7ee9b4d612..69fc6d98bd 100644 --- a/usr/src/cmd/zonecfg/zonecfg_grammar.y +++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y @@ -61,14 +61,14 @@ extern void yyerror(char *s); %token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL %token IPTYPE %token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET -%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET +%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET PCAP %token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS %token MAXSEMIDS LOCKED SWAP SCHED CLEAR %type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val %type <complex> complex_piece complex_prop_val -%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP +%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET PCAP MCAP %type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT ACTION BRAND SCHED IPTYPE @@ -700,6 +700,14 @@ select_command: SELECT $$->cmd_handler = &select_func; $$->cmd_res_type = RT_DCPU; } + | SELECT PCAP + { + if (($$ = alloc_cmd()) == NULL) + YYERROR; + cmd = $$; + $$->cmd_handler = &select_func; + $$->cmd_res_type = RT_PCAP; + } | SELECT MCAP { if (($$ = alloc_cmd()) == NULL) @@ -840,6 +848,7 @@ resource_type: NET { $$ = RT_NET; } | ATTR { $$ = RT_ATTR; } | DATASET { $$ = RT_DATASET; } | PSET { $$ = RT_DCPU; } + | PCAP { $$ = RT_PCAP; } | MCAP { $$ = RT_MCAP; } property_name: SPECIAL { $$ = PT_SPECIAL; } diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l index 81a0594c22..9b8bc81295 100644 --- a/usr/src/cmd/zonecfg/zonecfg_lex.l +++ b/usr/src/cmd/zonecfg/zonecfg_lex.l @@ -173,6 +173,8 @@ char *safe_strdup(char *s); <TSTATE>dedicated-cpu { return PSET; } +<TSTATE>capped-cpu { return PCAP; } + <TSTATE>capped-memory { return MCAP; } <TSTATE>zonepath { return ZONEPATH; } diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h index 83b70bc8e5..2eaf2e218a 100644 --- 
a/usr/src/head/libzonecfg.h +++ b/usr/src/head/libzonecfg.h @@ -145,6 +145,7 @@ extern "C" { #define ALIAS_MAXLOCKEDMEM "locked" #define ALIAS_MAXSWAP "swap" #define ALIAS_SHARES "cpu-shares" +#define ALIAS_CPUCAP "cpu-cap" /* * Bit flag definitions for passing into libzonecfg functions. diff --git a/usr/src/lib/libdtrace/common/procfs.d.in b/usr/src/lib/libdtrace/common/procfs.d.in index 915d754d88..0215f0d986 100644 --- a/usr/src/lib/libdtrace/common/procfs.d.in +++ b/usr/src/lib/libdtrace/common/procfs.d.in @@ -49,6 +49,8 @@ inline char SIDL = @SIDL@; #pragma D binding "1.0" SIDL inline char SONPROC = @SONPROC@; #pragma D binding "1.0" SONPROC +inline char SWAIT = @SWAIT@; +#pragma D binding "1.0" SWAIT inline int PR_STOPPED = @PR_STOPPED@; #pragma D binding "1.0" PR_STOPPED @@ -322,14 +324,16 @@ translator lwpsinfo_t < kthread_t *T > { (T->t_state == @TS_RUN@) ? SRUN : (T->t_state == @TS_ONPROC@) ? SONPROC : (T->t_state == @TS_ZOMB@) ? SZOMB : - (T->t_state == @TS_STOPPED@) ? SSTOP : 0; + (T->t_state == @TS_STOPPED@) ? SSTOP : + (T->t_state == @TS_WAIT@) ? SWAIT : 0; pr_sname = (T->t_proc_flag & @TP_PRVSTOP@) ? 'T' : (T->t_state == @TS_SLEEP@) ? 'S' : (T->t_state == @TS_RUN@) ? 'R' : (T->t_state == @TS_ONPROC@) ? 'O' : (T->t_state == @TS_ZOMB@) ? 'Z' : - (T->t_state == @TS_STOPPED@) ? 'T' : '?'; + (T->t_state == @TS_STOPPED@) ? 'T' : + (T->t_state == @TS_WAIT@) ? 'W' : '?'; pr_syscall = T->t_sysnum; pr_pri = T->t_pri; diff --git a/usr/src/lib/libdtrace/common/procfs.sed.in b/usr/src/lib/libdtrace/common/procfs.sed.in index f889f6333c..b4a7087a44 100644 --- a/usr/src/lib/libdtrace/common/procfs.sed.in +++ b/usr/src/lib/libdtrace/common/procfs.sed.in @@ -56,6 +56,7 @@ SED_REPLACE(TS_RUN) SED_REPLACE(TS_ONPROC) SED_REPLACE(TS_ZOMB) SED_REPLACE(TS_STOPPED) +SED_REPLACE(TS_WAIT) SED_REPLACE(P_PR_FORK) SED_REPLACE(P_PR_RUNLCL) @@ -75,6 +76,7 @@ SED_REPLACE(SZOMB) SED_REPLACE(SSTOP) SED_REPLACE(SIDL) SED_REPLACE(SONPROC) +SED_REPLACE(SWAIT) SED_REPLACE(CLDNOSIGCHLD) SED_REPLACE(CLDWAITPID) diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c index f2c5570861..33cc965e25 100644 --- a/usr/src/lib/libzonecfg/common/libzonecfg.c +++ b/usr/src/lib/libzonecfg/common/libzonecfg.c @@ -174,6 +174,7 @@ static struct alias { {ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0}, {ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0}, {ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0}, + {ALIAS_CPUCAP, "zone.cpu-cap", "privileged", "deny", 0}, {NULL, NULL, NULL, NULL, 0} }; diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index c0cbbb57a0..fc55931f62 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -546,6 +546,8 @@ f none usr/include/sys/bustypes.h 644 root bin f none usr/include/sys/byteorder.h 644 root bin f none usr/include/sys/callb.h 644 root bin f none usr/include/sys/callo.h 644 root bin +f none usr/include/sys/cpucaps.h 644 root bin +f none usr/include/sys/cpucaps_impl.h 644 root bin f none usr/include/sys/ccompile.h 644 root bin f none usr/include/sys/cdio.h 644 root bin f none usr/include/sys/cis.h 644 root bin @@ -1205,6 +1207,7 @@ f none usr/include/sys/vuid_queue.h 644 root bin f none usr/include/sys/vuid_state.h 644 root bin f none usr/include/sys/vuid_store.h 644 root bin f none usr/include/sys/wait.h 644 root bin +f none usr/include/sys/waitq.h 644 root bin f none usr/include/sys/watchpoint.h 644 root bin f none 
usr/include/sys/xti_inet.h 644 root bin f none usr/include/sys/xti_osi.h 644 root bin diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index b3433fc075..ced7d5c654 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -40,6 +40,7 @@ COMMON_CORE_OBJS += \ bitset.o \ bp_map.o \ brand.o \ + cpucaps.o \ cmt.o \ cpu.o \ cpu_intr.o \ @@ -341,6 +342,7 @@ GENUNIX_OBJS += \ vnode.o \ vuid_queue.o \ vuid_store.o \ + waitq.o \ watchpoint.o \ yield.o \ scsi_confdata.o \ diff --git a/usr/src/uts/common/cpr/cpr_uthread.c b/usr/src/uts/common/cpr/cpr_uthread.c index 00d5e0e80b..49ea1dfb1f 100644 --- a/usr/src/uts/common/cpr/cpr_uthread.c +++ b/usr/src/uts/common/cpr/cpr_uthread.c @@ -148,8 +148,7 @@ cpr_stop_user(int wait) aston(tp); - if (tp->t_state == TS_SLEEP && - (tp->t_flag & T_WAKEABLE)) { + if (ISWAKEABLE(tp) || ISWAITING(tp)) { setrun_locked(tp); } } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c new file mode 100644 index 0000000000..b290c5ecc4 --- /dev/null +++ b/usr/src/uts/common/disp/cpucaps.c @@ -0,0 +1,1133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/disp.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/atomic.h> +#include <sys/cpucaps_impl.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> +#include <sys/debug.h> +#include <sys/rctl.h> +#include <sys/errno.h> + +/* + * CPU Caps implementation + * ======================= + * + * A CPU cap can be set on any project or any zone. A zone CPU cap limits the CPU + * usage for all projects running inside the zone. If the zone CPU cap is set + * below the project CPU cap, the latter will have no effect. + * + * When CPU usage of projects and/or zones reaches specified caps, threads in + * them do not get scheduled and instead are placed on wait queues associated + * with a cap. Such threads will start running again only when CPU usage drops + * below the cap level. Each zone and each project has its own wait queue. + * + * When a CPU cap is set, the kernel continuously keeps track of CPU time used by + * capped zones and/or projects over a short time interval and calculates their + * current CPU usage as a percentage. When the accumulated usage reaches the CPU + * cap, LWPs running in user-land (when they are not holding any critical + * kernel locks) are placed on special wait queues until their project's or + * zone's CPU usage drops below the cap. + * + * The system maintains a list of all capped projects and all capped zones.
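An illustrative aside, not part of the patch: the cap value handed to the kernel is in units of one percent of a CPU, and cpucaps_init() below converts a tick to nanoseconds via TICK_TO_NSEC. Assuming a 100 Hz clock, the two conversions reduce to the following sketch (all constants invented for illustration):

    #include <stdio.h>

    int
    main(void)
    {
    	/* zonecfg side: ncpus parsed as a float, stored as a percentage */
    	float ncpus = 1.5f;
    	int cap_val = (int)(ncpus * 100);	/* zone.cpu-cap = 150 */

    	/*
    	 * Kernel side: cap_tick_cost is one tick of CPU time in
    	 * nanoseconds; 10000000 assumes a 100 Hz clock.
    	 */
    	long long cap_tick_cost = 10000000LL;
    	long long value = (long long)cap_val * cap_tick_cost;

    	printf("zone.cpu-cap rctl value: %d\n", cap_val);
    	printf("internal cap value: %lld ns\n", value);
    	return (0);
    }

Because caps_update() decays usage by one percent per tick, accumulated usage settles at this internal value roughly when the workload sustains ncpus worth of CPU time, which is where enforcement begins.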
On + * every clock tick every active thread belonging to a capped project adds its + * CPU usage to its project. Usage from all projects belonging to a capped zone + * is aggregated to get the zone usage. + * + * When the current CPU usage is above the cap, a project or zone is considered + * over-capped. Every user thread caught running in an over-capped project or + * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and + * is requested to surrender its CPU. This causes scheduling class specific + * CL_PREEMPT() callback to be invoked. The callback function places threads + * marked as TS_PROJWAIT on a wait queue and calls switch(). + * + * Threads are only placed on wait queues after trapping from user-land + * (they could be holding some user locks, but no kernel locks) and while + * returning from the trap back to the user-land when no kernel locks are held. + * Putting threads on wait queues in random places while running in the + * kernel might lead to all kinds of locking problems. + * + * Accounting + * ========== + * + * Accounting of CPU usage is based on per-thread micro-state accounting data. + * On every clock tick clock() adds new on-CPU time for every thread found on + * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU. + * New times means time since it was last accounted for. On-CPU times greater + * than 1 tick are truncated to 1 tick. + * + * Project CPU usage is aggregated from all threads within the project. + * Zone CPU usage is the sum of usages for all projects within the zone. Zone + * CPU usage is calculated on every clock tick by walking list of projects and + * adding their usage together. + * + * Decay + * ===== + * + * CPU usage is decayed by the caps_update() routine which is called once per + * every clock tick. It walks lists of project caps and decays their usages by + * one per cent. If CPU usage drops below cap levels, threads on the wait queue + * are made runnable again, one thread per clock tick. + * + * Interfaces + * ========== + * + * The CPU Caps facility provides the following interfaces to the rest of the + * system: + * + * cpucaps_project_add(kproject_t *) + * + * Notifies the framework of a new project. It should be put on the + * capped_projects list if its zone has a cap. + * + * cpucaps_project_remove(kproject_t *) + * + * Remove the association between the specified project and its cap. + * Called right before the project is destroyed. + * + * cpucaps_project_set(kproject_t *, rctl_qty_t) + * + * Set project cap of the specified project to the specified value. Setting the + * value to NOCAP is equivalent to removing the cap. + * + * cpucaps_zone_set(zone_t *, rctl_qty_t) + * + * Set zone cap of the specified zone to the specified value. Setting the value + * to NOCAP is equivalent to removing the cap. + * + * cpucaps_zone_remove(zone_t *) + * + * Remove the association between the zone and its cap. + * + * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t) + * + * Charges specified thread's project the amount of on-CPU time that it used. + * If the third argument is CPUCAPS_CHARGE_ONLY returns False. + * Otherwise returns True if project or zone should be penalized because its + * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ + * bits in t_schedflag in this case. + * + * CPUCAPS_ENFORCE(kthread_id_t *) + * + * Enforces CPU caps for a specified thread. 
Places LWPs running in LWP_USER + * state on project or zone wait queues, as requested by TS_PROJWAITQ or + * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a + * wait queue or False otherwise. + * + * cpucaps_sc_init(caps_sc_t *) + * + * Initializes the scheduling-class specific CPU Caps data for a thread. + * + * LOCKS + * ===== + * + * all the individual caps structures and their lists are protected by a global + * caps_lock mutex. The lock is grabbed either by clock() or by events modifying + * caps, so it is usually uncontended. We avoid all blocking memory allocations + * while holding caps_lock to prevent clock() from blocking. + * + * Thread state is protected by the thread lock. It protects the association + * between a thread and its project and, as a consequence, to its zone. The + * association can not break while thread lock is held, so the project or zone + * cap are not going to disappear while thread lock is held. + * + * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is + * grabbed by scheduling classes already holding thread lock at high PIL and by + * clock thread performing usage decay. We should do as little work as possible + * while holding the lock since it may be very hot. All threads in the project + * contend for the same cache line doing cap usage updates. + */ + +/* + * caps_lock protects list of capped projects and zones, changes in the cap + * state and changes of the global cpucaps_enabled flag. + * + * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is + * modified in parallel. This can be per-zone cap flag, but we don't keep any + * cap state for now. + */ +static kmutex_t caps_lock; /* lock to protect: */ +static list_t capped_zones; /* - list of zones with caps */ +static list_t capped_projects; /* - list of projects with caps */ +boolean_t cpucaps_enabled; /* - are there any caps defined? */ +boolean_t cpucaps_busy; /* - is framework busy? */ + +/* + * The accounting is based on the number of nanoseconds threads spend running + * during a tick which is kept in the cap_tick_cost variable. + */ +static hrtime_t cap_tick_cost; + +/* + * How much of the usage value is decayed every clock tick + * Decay one per cent of value per tick + */ +#define CAP_DECAY_FACTOR 100 + +/* + * Scale the value and round it to the closest integer value + */ +#define ROUND_SCALE(x, y) (((x) + (y) / 2) / (y)) + +static void caps_update(); + +/* + * CAP kstats. + */ +struct cap_kstat { + kstat_named_t cap_value; + kstat_named_t cap_usage; + kstat_named_t cap_nwait; + kstat_named_t cap_below; + kstat_named_t cap_above; + kstat_named_t cap_maxusage; + kstat_named_t cap_zonename; +} cap_kstat = { + { "value", KSTAT_DATA_UINT64 }, + { "usage", KSTAT_DATA_UINT64 }, + { "nwait", KSTAT_DATA_UINT64 }, + { "below_sec", KSTAT_DATA_UINT64 }, + { "above_sec", KSTAT_DATA_UINT64 }, + { "maxusage", KSTAT_DATA_UINT64 }, + { "zonename", KSTAT_DATA_STRING }, +}; + + +static kmutex_t cap_kstat_lock; +static int cap_kstat_update(kstat_t *, int); + +/* + * Initialize CPU caps infrastructure. 
+ * - Initialize lists of capped zones and capped projects + * - Set cpucaps_clock_callout to NULL + */ +void +cpucaps_init() +{ + /* + * Initialize global variables + */ + cap_tick_cost = TICK_TO_NSEC((hrtime_t)1); + + list_create(&capped_zones, sizeof (cpucap_t), + offsetof(cpucap_t, cap_link)); + list_create(&capped_projects, sizeof (cpucap_t), + offsetof(cpucap_t, cap_link)); + + cpucaps_enabled = B_FALSE; + cpucaps_busy = B_FALSE; + cpucaps_clock_callout = NULL; +} + +/* + * Initialize scheduling-class specific CPU Caps data. + */ +void +cpucaps_sc_init(caps_sc_t *csc) +{ + csc->csc_cputime = 0; +} + +/* + * Allocate and initialize cpucap structure + */ +static cpucap_t * +cap_alloc(void) +{ + cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP); + + DISP_LOCK_INIT(&cap->cap_usagelock); + waitq_init(&cap->cap_waitq); + + return (cap); +} + +/* + * Free cpucap structure + */ +static void +cap_free(cpucap_t *cap) +{ + if (cap == NULL) + return; + + /* + * This cap should not be active + */ + ASSERT(!list_link_active(&cap->cap_link)); + ASSERT(cap->cap_value == 0); + ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock)); + + waitq_fini(&cap->cap_waitq); + DISP_LOCK_DESTROY(&cap->cap_usagelock); + + kmem_free(cap, sizeof (cpucap_t)); +} + +/* + * Activate cap - insert into active list and unblock its + * wait queue. Should be called with caps_lock held. + * The cap_value field is set to the value supplied. + */ +static void +cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) +{ + ASSERT(MUTEX_HELD(&caps_lock)); + + /* + * Cap can not be already enabled + */ + ASSERT(!CAP_ENABLED(cap)); + ASSERT(!list_link_active(&cap->cap_link)); + + list_insert_tail(l, cap); + cap->cap_below = cap->cap_above = 0; + cap->cap_maxusage = 0; + cap->cap_usage = 0; + cap->cap_value = value; + waitq_unblock(&cap->cap_waitq); + if (CPUCAPS_OFF()) { + cpucaps_enabled = B_TRUE; + cpucaps_clock_callout = caps_update; + } +} + +/* + * Deactivate cap + * - Block its wait queue. This prevents any new threads from being + * enqueued there and moves all enqueued threads to the run queue. + * - Remove cap from list l. + * - Disable CPU caps globally if there are no capped projects or zones + * + * Should be called with caps_lock held. + */ +static void +cap_disable(list_t *l, cpucap_t *cap) +{ + ASSERT(MUTEX_HELD(&caps_lock)); + /* + * Cap should be currently active + */ + ASSERT(CPUCAPS_ON()); + ASSERT(list_link_active(&cap->cap_link)); + ASSERT(CAP_ENABLED(cap)); + + waitq_block(&cap->cap_waitq); + list_remove(l, cap); + if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { + cpucaps_enabled = B_FALSE; + cpucaps_clock_callout = NULL; + } + cap->cap_value = 0; + cap->cap_project = NULL; + cap->cap_zone = NULL; + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + +} + +/* + * Enable cap for a project kpj + * It is safe to enable already enabled project cap. + * Should be called with caps_lock held. 
+ */ +static void +cap_project_enable(kproject_t *kpj, hrtime_t value) +{ + cpucap_t *cap = kpj->kpj_cpucap; + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(cap != NULL); + + if (CAP_DISABLED(cap)) { + ASSERT(cap->cap_kstat == NULL); + cap_enable(&capped_projects, cap, value); + cap->cap_project = kpj; + cap->cap_zone = kpj->kpj_zone; + + /* + * Create cap kstats + */ + if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps", + KSTAT_TYPE_NAMED, + sizeof (cap_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + cap->cap_kstat->ks_data_size += + strlen(cap->cap_zone->zone_name) + 1; + cap->cap_kstat->ks_lock = &cap_kstat_lock; + cap->cap_kstat->ks_data = &cap_kstat; + cap->cap_kstat->ks_update = cap_kstat_update; + cap->cap_kstat->ks_private = cap; + kstat_install(cap->cap_kstat); + } + } +} + +/* + * Disable project cap. + * It is safe to disable already disabled project cap. + * Should be called with caps_lock held. + */ +static void +cap_project_disable(kproject_t *kpj) +{ + cpucap_t *cap = kpj->kpj_cpucap; + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(cap != NULL); + ASSERT(cap->cap_project == kpj); + + if (CAP_ENABLED(cap)) + cap_disable(&capped_projects, cap); +} + +/* + * Enable cap for a zone + * It is safe to enable already enabled zone cap. + * Should be called with caps_lock held. + */ +static void +cap_zone_enable(zone_t *zone, hrtime_t value) +{ + cpucap_t *cap = zone->zone_cpucap; + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(cap != NULL); + + if (CAP_DISABLED(cap)) { + ASSERT(cap->cap_kstat == NULL); + cap_enable(&capped_zones, cap, value); + cap->cap_zone = zone; + + /* + * Create cap kstats + */ + if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps", + KSTAT_TYPE_NAMED, + sizeof (cap_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + cap->cap_kstat->ks_data_size += + strlen(cap->cap_zone->zone_name) + 1; + cap->cap_kstat->ks_lock = &cap_kstat_lock; + cap->cap_kstat->ks_data = &cap_kstat; + cap->cap_kstat->ks_update = cap_kstat_update; + cap->cap_kstat->ks_private = cap; + kstat_install(cap->cap_kstat); + } + } +} + +/* + * Disable zone cap. + * It is safe to disable already disabled zone cap. + * Should be called with caps_lock held. + */ +static void +cap_zone_disable(zone_t *zone) +{ + cpucap_t *cap = zone->zone_cpucap; + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(cap != NULL); + ASSERT(cap->cap_zone == zone); + + if (CAP_ENABLED(cap)) + cap_disable(&capped_zones, cap); +} + +/* + * Apply specified callback to all caps contained in the list `l'. + */ +static void +cap_walk(list_t *l, void (*cb)(cpucap_t *)) +{ + cpucap_t *cap; + + ASSERT(MUTEX_HELD(&caps_lock)); + + for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) { + (*cb)(cap); + } +} + +/* + * If cap limit is not reached, make one thread from wait queue runnable. + * The waitq_isempty check is performed without the waitq lock. If a new thread + * is placed on the waitq right after the check, it will be picked up during the + * next invocation of cap_poke_waitq(). + */ +static void +cap_poke_waitq(cpucap_t *cap) +{ + ASSERT(MUTEX_HELD(&caps_lock)); + + if (cap->cap_usage >= cap->cap_value) { + cap->cap_above++; + } else { + waitq_t *wq = &cap->cap_waitq; + + cap->cap_below++; + + if (!waitq_isempty(wq)) + waitq_runone(wq); + } +} + +/* + * The callback function called for every cap on capped_projects list. + * Decay cap usage by CAP_DECAY_FACTOR + * Add this cap project usage to its zone usage. + * Kick off a thread from the cap waitq if cap is not reached. 
+ */ +static void +cap_project_usage_walker(cpucap_t *cap) +{ + zone_t *zone = cap->cap_zone; + hrtime_t cap_usage = cap->cap_usage; + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(cap->cap_project->kpj_cpucap == cap); + ASSERT(zone == cap->cap_project->kpj_zone); + ASSERT(CAP_ENABLED(cap)); + + /* + * Set or clear the CAP_REACHED flag based on the current usage. + * Only projects having their own caps are ever marked as CAP_REACHED. + */ + cap_poke_waitq(cap); + + /* + * Add the project's CPU usage to our zone's CPU usage. + */ + if (ZONE_IS_CAPPED(zone)) { + cpucap_t *zcap = zone->zone_cpucap; + + ASSERT(zcap->cap_zone == zone); + + /* + * If we haven't reset this zone's usage during this clock tick + * yet, then do it now. The cap_lbolt field is used to check + * whether this is the first zone's project we see during this + * tick or a subsequent one. + */ + if (zcap->cap_lbolt != lbolt64) { + if (zcap->cap_usage > zcap->cap_maxusage) + zcap->cap_maxusage = zcap->cap_usage; + zcap->cap_usage = 0; + } + DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap, + hrtime_t, cap_usage); + zcap->cap_usage += cap_usage; + /* Check for overflows */ + if (zcap->cap_usage < 0) + zcap->cap_usage = MAX_USAGE - 1; + } + + /* + * Decay project usage. + */ + disp_lock_enter(&cap->cap_usagelock); + cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR); + disp_lock_exit(&cap->cap_usagelock); +} + +/* + * On every clock tick walk the list of project caps and update the CPU usage. + * Also walk the list of zone caps checking whether any threads should + * transition from wait queue to run queue. + * + * This function gets called by the clock thread directly when there are any + * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs + * caps_lock for long periods of time, so there should be almost no contention + * for it. + */ +static void +caps_update() +{ + mutex_enter(&caps_lock); + cap_walk(&capped_projects, cap_project_usage_walker); + cap_walk(&capped_zones, cap_poke_waitq); + mutex_exit(&caps_lock); +} + +/* + * The function is called for each project in a zone when the zone cap is + * modified. It enables project caps if the zone cap is enabled and disables them if the + * zone cap is disabled and the project doesn't have its own cap. + * + * For each project that does not have a cpucap structure allocated, it allocates a + * new structure and assigns it to kpj->kpj_cpucap. The allocation is performed + * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock + * held. + */ +static int +cap_project_zone_modify_walker(kproject_t *kpj, void *arg) +{ + cpucap_t *project_cap = NULL; + cpucap_t *zone_cap = (cpucap_t *)arg; + + ASSERT(zone_cap != NULL); + + if (kpj->kpj_cpucap == NULL) { + /* + * This is the first time any cap was established for this + * project. Allocate a new cpucap structure for it. + */ + project_cap = cap_alloc(); + } + + mutex_enter(&caps_lock); + + /* + * Double-check that kpj_cpucap is still NULL, now with caps_lock held, + * and assign the newly allocated cpucap structure to it. + */ + if (kpj->kpj_cpucap == NULL) { + kpj->kpj_cpucap = project_cap; + } else if (project_cap != NULL) { + cap_free(project_cap); + } + + project_cap = kpj->kpj_cpucap; + + if (CAP_DISABLED(zone_cap)) { + /* + * Remove all projects in this zone without caps + * from the capped_projects list. + */ + if (project_cap->cap_value == MAX_USAGE) { + cap_project_disable(kpj); + } + } else if (CAP_DISABLED(project_cap)) { + /* + * Add the project to the capped_projects list.
+ */ + ASSERT(project_cap->cap_value == 0); + cap_project_enable(kpj, MAX_USAGE); + } + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone cap to cap_val + * If cap_val is equal to NOCAP, disable zone cap. + * + * If this is the first time a cap is set on a zone, allocate cpucap structure + * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held. + */ +int +cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + if (cap_val == 0) + return (EINVAL); + + ASSERT(cap_val <= MAXCAP); + if (cap_val > MAXCAP) + cap_val = MAXCAP; + + /* + * Nothing to do if trying to disable a cap on a zone when caps are off, + * or on a zone which does not have a cap yet. + */ + if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + value = cap_val * cap_tick_cost; + if (value < 0) + value = MAX_USAGE; + + /* Nothing to do if the value is staying the same */ + if (value == cap->cap_value) { + mutex_exit(&caps_lock); + return (0); + } + + /* + * Clear cap statistics since the cap value itself changes. + */ + cap->cap_above = cap->cap_below = 0; + + + if (cap_val == NOCAP) { + if (CAP_ENABLED(cap)) { + /* + * Remove cap for the zone + */ + cap_zone_disable(zone); + cpucaps_busy = B_TRUE; + mutex_exit(&caps_lock); + /* + * Disable caps for all projects belonging to this zone + * unless they have their own cap. + */ + (void) project_walk_all(zone->zone_id, + cap_project_zone_modify_walker, cap); + + mutex_enter(&caps_lock); + cpucaps_busy = B_FALSE; + } + } else if (CAP_DISABLED(cap)) { + /* + * Set a cap on a zone which previously was not capped. + */ + cap_zone_enable(zone, value); + cpucaps_busy = B_TRUE; + mutex_exit(&caps_lock); + + /* + * Enable cap for all projects belonging to this zone. + */ + (void) project_walk_all(zone->zone_id, + cap_project_zone_modify_walker, cap); + + mutex_enter(&caps_lock); + cpucaps_busy = B_FALSE; + } else { + /* + * No state transitions, just change the value + */ + cap->cap_value = value; + } + + ASSERT(MUTEX_HELD(&caps_lock)); + ASSERT(!cpucaps_busy); + mutex_exit(&caps_lock); + + return (0); +} + +/* + * The project is going away, so disable its cap. + */ +void +cpucaps_project_remove(kproject_t *kpj) +{ + mutex_enter(&caps_lock); + if (PROJECT_IS_CAPPED(kpj)) + cap_project_disable(kpj); + if (kpj->kpj_cpucap != NULL) { + cap_free(kpj->kpj_cpucap); + kpj->kpj_cpucap = NULL; + } + mutex_exit(&caps_lock); +} + +/* + * The zone is going away, so disable its cap. + */ +void +cpucaps_zone_remove(zone_t *zone) +{ + mutex_enter(&caps_lock); + while (ZONE_IS_CAPPED(zone)) { + mutex_exit(&caps_lock); + (void) cpucaps_zone_set(zone, NOCAP); + mutex_enter(&caps_lock); + } + if (zone->zone_cpucap != NULL) { + cap_free(zone->zone_cpucap); + zone->zone_cpucap = NULL; + } + mutex_exit(&caps_lock); +} + +/* + * A new project was created. It should be put on the capped_projects list if + * its zone has a cap.
+ */ +void +cpucaps_project_add(kproject_t *kpj) +{ + cpucap_t *cap = NULL; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone)) + return; + + /* + * This project was never capped before, so allocate its cap structure. + */ + if (kpj->kpj_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + /* + * Double-check with caps_lock held + */ + if (kpj->kpj_cpucap == NULL) { + kpj->kpj_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + if (ZONE_IS_CAPPED(kpj->kpj_zone)) + cap_project_enable(kpj, MAX_USAGE); + + mutex_exit(&caps_lock); +} + +/* + * Set project cap to cap_val + * If cap_val is equal to NOCAP, disable project cap. + * + * If this is the first time a cap is set on a project, allocate cpucap + * structure without holding caps_lock to avoid KM_SLEEP allocation with + * caps_lock held. + */ +int +cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + if (cap_val == 0) + return (EINVAL); + + ASSERT(cap_val <= MAXCAP); + if (cap_val > MAXCAP) + cap_val = MAXCAP; + + /* + * Nothing to do if trying to disable project cap and caps are not + * enabled or if trying to disable cap on a project that does not have + * cap enabled. + */ + if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj))) + return (0); + + if (kpj->kpj_cpucap == NULL) { + /* + * This project was never capped before, so allocate its cap + * structure. + */ + cap = cap_alloc(); + } + + mutex_enter(&caps_lock); + + /* + * Double-check with caps_lock held. + */ + if (kpj->kpj_cpucap == NULL) { + kpj->kpj_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + /* + * Get the actual pointer to the project cap. + */ + cap = kpj->kpj_cpucap; + value = cap_val * cap_tick_cost; + if (value < 0) + value = MAX_USAGE; + + /* + * Nothing to do if the value is not changing + */ + if (value == cap->cap_value) { + mutex_exit(&caps_lock); + return (0); + } + + /* + * Clear cap statistics since the cap value itself changes. + */ + cap->cap_above = cap->cap_below = 0; + cap->cap_maxusage = 0; + + if (cap_val != NOCAP) { + /* + * Enable this cap if it is not already enabled. + */ + if (CAP_DISABLED(cap)) + cap_project_enable(kpj, value); + else + cap->cap_value = value; + } else if (CAP_ENABLED(cap)) { + /* + * User requested to drop a cap on the project. If it is part of + * capped zone, keep the cap and set the value to MAX_USAGE, + * otherwise disable the cap. + */ + if (ZONE_IS_CAPPED(kpj->kpj_zone)) { + cap->cap_value = MAX_USAGE; + } else { + cap_project_disable(kpj); + } + } + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Get cap usage. + */ +static rctl_qty_t +cap_get(cpucap_t *cap) +{ + return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0); +} + +/* + * Get current project usage. + */ +rctl_qty_t +cpucaps_project_get(kproject_t *kpj) +{ + return (cap_get(kpj->kpj_cpucap)); +} + +/* + * Get current zone usage. + */ +rctl_qty_t +cpucaps_zone_get(zone_t *zone) +{ + return (cap_get(zone->zone_cpucap)); +} + +/* + * Charge project of thread t the time thread t spent on CPU since previously + * adjusted. + * + * Record the current on-CPU time in the csc structure. + * + * Do not adjust for more than one tick worth of time. 
+ * + */ +static void +caps_charge_adjust(kthread_id_t t, caps_sc_t *csc) +{ + kproject_t *kpj = ttoproj(t); + hrtime_t new_usage; + hrtime_t usage_delta; + + ASSERT(THREAD_LOCK_HELD(t)); + ASSERT(PROJECT_IS_CAPPED(kpj)); + + /* Get on-CPU time since birth of a thread */ + new_usage = mstate_thread_onproc_time(t); + + /* Time spent on CPU since last checked */ + usage_delta = new_usage - csc->csc_cputime; + + /* Save the accumulated on-CPU time */ + csc->csc_cputime = new_usage; + + /* Charge at most one tick worth of on-CPU time */ + if (usage_delta > cap_tick_cost) + usage_delta = cap_tick_cost; + + /* Add usage_delta to the project usage value. */ + if (usage_delta > 0) { + cpucap_t *cap = kpj->kpj_cpucap; + + DTRACE_PROBE2(cpucaps__project__charge, + kthread_id_t, t, hrtime_t, usage_delta); + + disp_lock_enter_high(&cap->cap_usagelock); + cap->cap_usage += usage_delta; + + /* Check for overflows */ + if (cap->cap_usage < 0) + cap->cap_usage = MAX_USAGE - 1; + + disp_lock_exit_high(&cap->cap_usagelock); + + /* + * cap_maxusage is only kept for observability. Move it outside + * the lock to reduce the time spent while holding the lock. + */ + if (cap->cap_usage > cap->cap_maxusage) + cap->cap_maxusage = cap->cap_usage; + } +} + +/* + * Charge the thread's project and return True if the thread should be + * penalized because its project or zone is exceeding its cap. Also sets + * TS_PROJWAITQ or TS_ZONEWAITQ in this case. + */ +boolean_t +cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) +{ + kproject_t *kpj = ttoproj(t); + klwp_t *lwp = t->t_lwp; + zone_t *zone; + cpucap_t *project_cap; + boolean_t rc = B_FALSE; + + ASSERT(THREAD_LOCK_HELD(t)); + + /* Nothing to do for projects that are not capped. */ + if (lwp == NULL || !PROJECT_IS_CAPPED(kpj)) + return (B_FALSE); + + caps_charge_adjust(t, csc); + + /* + * The caller only requested to charge the project usage; skip the + * enforcement part. + */ + if (charge_type == CPUCAPS_CHARGE_ONLY) + return (B_FALSE); + + project_cap = kpj->kpj_cpucap; + + if (project_cap->cap_usage >= project_cap->cap_value) { + t->t_schedflag |= TS_PROJWAITQ; + rc = B_TRUE; + } else if (t->t_schedflag & TS_PROJWAITQ) { + t->t_schedflag &= ~TS_PROJWAITQ; + } + + zone = ttozone(t); + if (!ZONE_IS_CAPPED(zone)) { + if (t->t_schedflag & TS_ZONEWAITQ) + t->t_schedflag &= ~TS_ZONEWAITQ; + } else { + cpucap_t *zone_cap = zone->zone_cpucap; + + if (zone_cap->cap_usage >= zone_cap->cap_value) { + t->t_schedflag |= TS_ZONEWAITQ; + rc = B_TRUE; + } else if (t->t_schedflag & TS_ZONEWAITQ) { + t->t_schedflag &= ~TS_ZONEWAITQ; + } + } + + + return (rc); +} + +/* + * Enforce CPU caps. If the thread was preempted in user-land, we know that it does + * not hold any kernel locks, so enqueue it on the waitq, if needed. + * + * CPU Caps are only enforced for user threads. + * + * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and + * threads marked with TS_ZONEWAITQ are placed on their zone wait queue. + * + * It is possible that by the time we enter cpucaps_enforce() the cap is already + * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We + * still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer + * apply.
+ */ +boolean_t +cpucaps_enforce(kthread_t *t) +{ + klwp_t *lwp = t->t_lwp; + + ASSERT(THREAD_LOCK_HELD(t)); + + if (lwp != NULL && lwp->lwp_state == LWP_USER) { + if (t->t_schedflag & TS_PROJWAITQ) { + ASSERT(ttoproj(t)->kpj_cpucap != NULL); + t->t_schedflag &= ~TS_ANYWAITQ; + if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq), + t)) { + return (B_TRUE); + } + } + if (t->t_schedflag & TS_ZONEWAITQ) { + ASSERT(ttozone(t)->zone_cpucap != NULL); + t->t_schedflag &= ~TS_ZONEWAITQ; + if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq), + t)) { + return (B_TRUE); + } + } + } + + /* + * The thread is not enqueued on the wait queue. + */ + return (B_FALSE); +} + +/* + * Convert internal cap statistics into values exported by cap kstat. + */ +static int +cap_kstat_update(kstat_t *ksp, int rw) +{ + struct cap_kstat *capsp = &cap_kstat; + cpucap_t *cap = ksp->ks_private; + clock_t tick_sec = SEC_TO_TICK(1); + char *zonename = cap->cap_zone->zone_name; + + if (rw == KSTAT_WRITE) + return (EACCES); + + capsp->cap_value.value.ui64 = + ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_usage.value.ui64 = + ROUND_SCALE(cap->cap_usage, cap_tick_cost); + capsp->cap_maxusage.value.ui64 = + ROUND_SCALE(cap->cap_maxusage, cap_tick_cost); + capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; + capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); + capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + kstat_named_setstr(&capsp->cap_zonename, zonename); + + return (0); +} diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c index af2b04f11b..a409ebc800 100644 --- a/usr/src/uts/common/disp/fss.c +++ b/usr/src/uts/common/disp/fss.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,6 +54,7 @@ #include <sys/tnf_probe.h> #include <sys/policy.h> #include <sys/sdt.h> +#include <sys/cpucaps.h> /* * FSS Data Structures: @@ -1069,6 +1069,7 @@ fss_update_list(int i) goto next; if ((fssproc->fss_flags & FSSKPRI) != 0) goto next; + fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) goto next; @@ -1084,7 +1085,7 @@ fss_update_list(int i) if (t->t_schedctl && schedctl_get_nopreempt(t)) goto next; - if (t->t_state != TS_RUN) { + if (t->t_state != TS_RUN && t->t_state != TS_WAIT) { /* * Make next syscall/trap call fss_trapret */ @@ -1373,6 +1374,7 @@ fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, fssproc->fss_timeleft = fss_quantum; fssproc->fss_tp = t; + cpucaps_sc_init(&fssproc->fss_caps); /* * Put a lock on our fsspset structure. 
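The FSS hunks that follow wire cpucaps_charge() and CPUCAPS_ENFORCE() into fss_tick(), fss_preempt() and the other class callbacks. As a rough sketch of the charge/decay/enforce cycle those hooks drive (a user-space toy with invented numbers, not kernel code):

    #include <stdio.h>
    #include <stdbool.h>

    /* Same rounding as the patch's ROUND_SCALE()/CAP_DECAY_FACTOR. */
    #define ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))
    #define DECAY_FACTOR	100	/* decay 1% of usage per tick */

    int
    main(void)
    {
    	long long usage = 0;
    	long long value = 50;	/* cap of 50% of a CPU; tick cost = 1 unit */
    	int tick;

    	for (tick = 1; tick <= 200; tick++) {
    		usage += 1;		/* cpucaps_charge(): ran a full tick */
    		bool over = (usage >= value);	/* TS_*WAITQ decision */
    		usage -= ROUND_SCALE(usage, DECAY_FACTOR); /* caps_update() */
    		if (over) {
    			printf("parked on waitq at tick %d (usage=%lld)\n",
    			    tick, usage);
    			break;
    		}
    	}
    	return (0);
    }

In this toy, a thread monopolizing a CPU under a 50% cap trips enforcement after 50 ticks; once parked, cap_poke_waitq() releases one waiter per tick as decay pulls usage back under the cap.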
@@ -1420,7 +1422,8 @@ fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, t->t_cldata = (void *)fssproc; t->t_schedflag |= TS_RUNQMATCH; fss_change_priority(t, fssproc); - if (t->t_state == TS_RUN || t->t_state == TS_ONPROC) + if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || + t->t_state == TS_WAIT) fss_active(t); thread_unlock(t); @@ -1568,6 +1571,8 @@ fss_fork(kthread_t *pt, kthread_t *ct, void *bufp) cfssproc->fss_upri = pfssproc->fss_upri; cfssproc->fss_tp = ct; cfssproc->fss_nice = pfssproc->fss_nice; + cpucaps_sc_init(&cfssproc->fss_caps); + cfssproc->fss_flags = pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE); ct->t_cldata = (void *)cfssproc; @@ -1793,6 +1798,14 @@ fss_exit(kthread_t *t) } mutex_exit(&fsspset->fssps_lock); mutex_exit(&fsspsets_lock); + + if (CPUCAPS_ON()) { + thread_lock(t); + fssproc = FSSPROC(t); + (void) cpucaps_charge(t, &fssproc->fss_caps, + CPUCAPS_CHARGE_ONLY); + thread_unlock(t); + } } static void @@ -1861,7 +1874,8 @@ fss_swapout(kthread_t *t, int flags) if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI) || (t->t_proc_flag & TP_LWPEXIT) || - (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) || + (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | + TS_ONPROC | TS_WAIT)) || !(t->t_schedflag & TS_LOAD) || !(SWAP_OK(t))) return (-1); @@ -1971,6 +1985,20 @@ fss_preempt(kthread_t *t) t->t_trapret = 1; /* so that fss_trapret will run */ aston(t); } + + /* + * This thread may be placed on wait queue by CPU Caps. In this case we + * do not need to do anything until it is removed from the wait queue. + * Do not enforce CPU caps on threads running at a kernel priority + */ + if (CPUCAPS_ON()) { + (void) cpucaps_charge(t, &fssproc->fss_caps, + CPUCAPS_CHARGE_ONLY); + + if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t)) + return; + } + /* * If preempted in user-land mark the thread as swappable because it * cannot be holding any kernel locks. @@ -2077,6 +2105,12 @@ fss_sleep(kthread_t *t) ASSERT(THREAD_LOCK_HELD(t)); ASSERT(t->t_state == TS_ONPROC); + + /* + * Account for time spent on CPU before going to sleep. + */ + (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY); + fss_inactive(t); /* @@ -2117,6 +2151,8 @@ fss_tick(kthread_t *t) fssproc_t *fssproc; fssproj_t *fssproj; klwp_t *lwp; + boolean_t call_cpu_surrender = B_FALSE; + boolean_t cpucaps_enforce = B_FALSE; ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); @@ -2136,6 +2172,17 @@ fss_tick(kthread_t *t) } /* + * Keep track of thread's project CPU usage. Note that projects + * get charged even when threads are running in the kernel. + * Do not surrender CPU if running in the SYS class. + */ + if (CPUCAPS_ON()) { + cpucaps_enforce = cpucaps_charge(t, + &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) && + !(fssproc->fss_flags & FSSKPRI); + } + + /* * A thread's execution time for threads running in the SYS class * is not tracked. */ @@ -2180,8 +2227,7 @@ fss_tick(kthread_t *t) t->t_schedflag &= ~TS_DONT_SWAP; fssproc->fss_timeleft = fss_quantum; } else { - fssproc->fss_flags |= FSSBACKQ; - cpu_surrender(t); + call_cpu_surrender = B_TRUE; } } else if (t->t_state == TS_ONPROC && t->t_pri < t->t_disp_queue->disp_maxrunpri) { @@ -2190,10 +2236,38 @@ fss_tick(kthread_t *t) * waiting for a processor, then thread surrenders * the processor. 
*/ - fssproc->fss_flags |= FSSBACKQ; - cpu_surrender(t); + call_cpu_surrender = B_TRUE; } } + + if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) { + /* + * The thread used more than half of its quantum, so assume that + * it used the whole quantum. + * + * Update thread's priority just before putting it on the wait + * queue so that it gets charged for the CPU time from its + * quantum even before that quantum expires. + */ + fss_newpri(fssproc); + if (t->t_pri != fssproc->fss_umdpri) + fss_change_priority(t, fssproc); + + /* + * We need to call cpu_surrender for this thread due to cpucaps + * enforcement, but fss_change_priority may have already done + * so. In this case FSSBACKQ is set and there is no need to call + * cpu-surrender again. + */ + if (!(fssproc->fss_flags & FSSBACKQ)) + call_cpu_surrender = B_TRUE; + } + + if (call_cpu_surrender) { + fssproc->fss_flags |= FSSBACKQ; + cpu_surrender(t); + } + thread_unlock_nopreempt(t); /* clock thread can't be preempted */ } @@ -2336,6 +2410,11 @@ fss_yield(kthread_t *t) ASSERT(THREAD_LOCK_HELD(t)); /* + * Collect CPU usage spent before yielding + */ + (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY); + + /* * Clear the preemption control "yield" bit since the user is * doing a yield. */ @@ -2439,7 +2518,8 @@ fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf, ASSERT(fssproj_new != NULL); thread_lock(t); - if (t->t_state == TS_RUN || t->t_state == TS_ONPROC) + if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || + t->t_state == TS_WAIT) fss_inactive(t); ASSERT(fssproj_old->fssp_threads > 0); if (--fssproj_old->fssp_threads == 0) { @@ -2449,7 +2529,8 @@ fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf, fssproc->fss_proj = fssproj_new; fssproc->fss_fsspri = 0; fssproj_new->fssp_threads++; - if (t->t_state == TS_RUN || t->t_state == TS_ONPROC) + if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || + t->t_state == TS_WAIT) fss_active(t); thread_unlock(t); if (free) { @@ -2528,12 +2609,14 @@ fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf, fssproj_new->fssp_threads++; thread_lock(t); - if (t->t_state == TS_RUN || t->t_state == TS_ONPROC) - fss_inactive(t); + if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || + t->t_state == TS_WAIT) + fss_inactive(t); fssproc->fss_proj = fssproj_new; fssproc->fss_fsspri = 0; - if (t->t_state == TS_RUN || t->t_state == TS_ONPROC) - fss_active(t); + if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || + t->t_state == TS_WAIT) + fss_active(t); thread_unlock(t); mutex_exit(&fsspset_new->fssps_lock); diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c index 62b5eb1da2..53bfb46e2a 100644 --- a/usr/src/uts/common/disp/fx.c +++ b/usr/src/uts/common/disp/fx.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -54,6 +53,7 @@ #include <sys/policy.h> #include <sys/sdt.h> #include <sys/cpupart.h> +#include <sys/cpucaps.h> static pri_t fx_init(id_t, int, classfuncs_t **); @@ -85,40 +85,6 @@ static struct modlinkage modlinkage = { #define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */ /* - * The fxproc_t structures are kept in an array of circular doubly linked - * lists. A hash on the thread pointer is used to determine which list - * each fxproc structure should be placed. Each list has a dummy "head" which - * is never removed, so the list is never empty. - */ - -#define FX_LISTS 16 /* number of lists, must be power of 2 */ -#define FX_LIST_HASH(tp) (((uintptr_t)(tp) >> 9) & (FX_LISTS - 1)) - -#define FX_LIST_INSERT(fxpp) \ -{ \ - int index = FX_LIST_HASH(fxpp->fx_tp); \ - kmutex_t *lockp = &fx_list_lock[index]; \ - fxproc_t *headp = &fx_plisthead[index]; \ - mutex_enter(lockp); \ - fxpp->fx_next = headp->fx_next; \ - fxpp->fx_prev = headp; \ - headp->fx_next->fx_prev = fxpp; \ - headp->fx_next = fxpp; \ - mutex_exit(lockp); \ -} - -#define FX_LIST_DELETE(fxpp) \ -{ \ - int index = FX_LIST_HASH(fxpp->fx_tp); \ - kmutex_t *lockp = &fx_list_lock[index]; \ - mutex_enter(lockp); \ - fxpp->fx_prev->fx_next = fxpp->fx_next; \ - fxpp->fx_next->fx_prev = fxpp->fx_prev; \ - mutex_exit(lockp); \ -} - - -/* * The fxproc_t structures that have a registered callback vector, * are also kept in an array of circular doubly linked lists. A hash on * the thread id (from ddi_get_kt_did()) is used to determine which list @@ -192,10 +158,6 @@ static pri_t fx_maxglobpri; /* maximum global priority used by fx class */ static kmutex_t fx_dptblock; /* protects fixed priority dispatch table */ -static kmutex_t fx_list_lock[FX_LISTS]; /* protects fxproc lists */ -static fxproc_t fx_plisthead[FX_LISTS]; /* dummy fxproc at head of lists */ - - static kmutex_t fx_cb_list_lock[FX_CB_LISTS]; /* protects list of fxprocs */ /* that have callbacks */ static fxproc_t fx_cb_plisthead[FX_CB_LISTS]; /* dummy fxproc at head of */ @@ -316,14 +278,6 @@ fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) fx_cid = cid; /* Record our class ID */ /* - * Initialize the fxproc hash table - */ - for (i = 0; i < FX_LISTS; i++) { - fx_plisthead[i].fx_next = fx_plisthead[i].fx_prev = - &fx_plisthead[i]; - } - - /* * Initialize the hash table for fxprocs with callbacks */ for (i = 0; i < FX_CB_LISTS; i++) { @@ -477,7 +431,6 @@ fx_admin(caddr_t uaddr, cred_t *reqpcredp) return (0); } - /* * Allocate a fixed priority class specific thread structure and * initialize it with the parameters supplied. 
Also move the thread @@ -565,6 +518,7 @@ fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, } fxpp->fx_timeleft = fxpp->fx_pquantum; + cpucaps_sc_init(&fxpp->fx_caps); fxpp->fx_tp = t; thread_lock(t); /* get dispatcher lock on thread */ @@ -575,8 +529,6 @@ fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, fx_change_priority(t, fxpp); thread_unlock(t); - FX_LIST_INSERT(fxpp); - return (0); } @@ -591,6 +543,8 @@ fx_exit(kthread_t *t) thread_lock(t); fxpp = (fxproc_t *)(t->t_cldata); + (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY); + if (FX_HAS_CB(fxpp)) { FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie); fxpp->fx_callback = NULL; @@ -599,6 +553,7 @@ fx_exit(kthread_t *t) FX_CB_LIST_DELETE(fxpp); return; } + thread_unlock(t); } @@ -621,7 +576,6 @@ fx_exitclass(void *procp) FX_CB_LIST_DELETE(fxpp); } else thread_unlock(fxpp->fx_tp); - FX_LIST_DELETE(fxpp); kmem_free(fxpp, sizeof (fxproc_t)); } @@ -662,6 +616,7 @@ fx_fork(kthread_t *t, kthread_t *ct, void *bufp) cfxpp->fx_callback = NULL; cfxpp->fx_cookie = NULL; cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ); + cpucaps_sc_init(&cfxpp->fx_caps); cfxpp->fx_tp = ct; ct->t_cldata = (void *)cfxpp; @@ -670,7 +625,6 @@ fx_fork(kthread_t *t, kthread_t *ct, void *bufp) /* * Link new structure into fxproc list. */ - FX_LIST_INSERT(cfxpp); return (0); } @@ -1157,13 +1111,12 @@ static void fx_preempt(kthread_t *t) { fxproc_t *fxpp = (fxproc_t *)(t->t_cldata); -#ifdef KSLICE - extern int kslice; -#endif ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(curthread)); + (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY); + /* * Check to see if we're doing "preemption control" here. If * we are, and if the user has requested that this thread not @@ -1209,17 +1162,20 @@ fx_preempt(kthread_t *t) THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri); } + /* + * This thread may be placed on wait queue by CPU Caps. In this case we + * do not need to do anything until it is removed from the wait queue. + */ + if (CPUCAPS_ENFORCE(t)) { + return; + } + if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) { fxpp->fx_timeleft = fxpp->fx_pquantum; fxpp->fx_flags &= ~FXBACKQ; setbackdq(t); } else { -#ifdef KSLICE - if (kslice) - setbackdq(t); - else -#endif - setfrontdq(t); + setfrontdq(t); } } @@ -1250,6 +1206,11 @@ fx_sleep(kthread_t *t) ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(t)); + /* + * Account for time spent on CPU before going to sleep. + */ + (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY); + if (FX_HAS_CB(fxpp)) { FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie); } @@ -1318,6 +1279,7 @@ fx_stop(kthread_t *t, int why, int what) static void fx_tick(kthread_t *t) { + boolean_t call_cpu_surrender = B_FALSE; fxproc_t *fxpp; ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); @@ -1342,6 +1304,14 @@ fx_tick(kthread_t *t) fx_change_priority(t, fxpp); } } + + /* + * Keep track of thread's project CPU usage. Note that projects + * get charged even when threads are running in the kernel. 
+ */ + call_cpu_surrender = CPUCAPS_CHARGE(t, &fxpp->fx_caps, + CPUCAPS_CHARGE_ENFORCE); + if ((fxpp->fx_pquantum != FX_TQINF) && (--fxpp->fx_timeleft <= 0)) { pri_t new_pri; @@ -1379,15 +1349,17 @@ fx_tick(kthread_t *t) if (thread_change_pri(t, new_pri, 0)) { fxpp->fx_timeleft = fxpp->fx_pquantum; } else { - fxpp->fx_flags |= FXBACKQ; - cpu_surrender(t); + call_cpu_surrender = B_TRUE; } } else if (t->t_state == TS_ONPROC && t->t_pri < t->t_disp_queue->disp_maxrunpri) { + call_cpu_surrender = B_TRUE; + } + + if (call_cpu_surrender) { fxpp->fx_flags |= FXBACKQ; cpu_surrender(t); } - thread_unlock_nopreempt(t); /* clock thread can't be preempted */ } @@ -1453,6 +1425,11 @@ fx_yield(kthread_t *t) ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(t)); + /* + * Collect CPU usage spent before yielding CPU. + */ + (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY); + if (FX_HAS_CB(fxpp)) { clock_t new_quantum = (clock_t)fxpp->fx_pquantum; pri_t newpri = fxpp->fx_pri; diff --git a/usr/src/uts/common/disp/sysclass.c b/usr/src/uts/common/disp/sysclass.c index 9f266dd7e7..d48cc3145e 100644 --- a/usr/src/uts/common/disp/sysclass.c +++ b/usr/src/uts/common/disp/sysclass.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,9 +18,10 @@ * * CDDL HEADER END */ + /* - * Copyright (c) 1996-2001 by Sun Microsystems, Inc. - * All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -98,11 +98,7 @@ struct classfuncs sys_classfuncs = { sys_swappri, /* swapin */ sys_swappri, /* swapout */ sys_nullsys, /* trapret */ -#ifdef KSLICE - sys_preempt, -#else - setfrontdq, -#endif + setfrontdq, /* preempt */ setbackdq, /* setrun */ sys_nullsys, /* sleep */ sys_nullsys, /* tick */ @@ -218,21 +214,6 @@ sys_nullsys() { } -#ifdef KSLICE -static void -sys_preempt(t) - kthread_id_t t; -{ - extern int kslice; - - if (kslice) - setbackdq(t); - else - setfrontdq(t); -} -#endif - - /* ARGSUSED */ static int sys_donice(t, cr, incr, retvalp) diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 5f352b2203..77f0663f12 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,6 +73,8 @@ #include <sys/sdt.h> #include <sys/reboot.h> #include <sys/kdi.h> +#include <sys/waitq.h> +#include <sys/cpucaps.h> struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ @@ -185,10 +187,18 @@ thread_init(void) label_init(); cred_init(); + /* + * Initialize various resource management facilities. + */ rctl_init(); + cpucaps_init(); + /* + * Zone_init() should be called before project_init() so that project ID + * for the first project is initialized correctly. 
+ */ + zone_init(); project_init(); brand_init(); - zone_init(); task_init(); tcache_init(); pool_init(); @@ -1070,6 +1080,8 @@ setrun_locked(kthread_t *t) * Already on dispatcher queue. */ return; + } else if (t->t_state == TS_WAIT) { + waitq_setrun(t); } else if (t->t_state == TS_STOPPED) { /* * All of the sending of SIGCONT (TC_XSTART) and /proc @@ -1111,8 +1123,6 @@ setrun_locked(kthread_t *t) */ CL_SETRUN(t); } - - } void @@ -1623,7 +1633,7 @@ thread_change_epri(kthread_t *t, pri_t disp_pri) * If it's not on a queue, change the priority with * impunity. */ - if ((state & (TS_SLEEP | TS_RUN)) == 0) { + if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) { t->t_epri = disp_pri; if (state == TS_ONPROC) { @@ -1639,7 +1649,6 @@ thread_change_epri(kthread_t *t, pri_t disp_pri) * It's either on a sleep queue or a run queue. */ if (state == TS_SLEEP) { - /* * Take the thread out of its sleep queue. * Change the inherited priority. @@ -1648,6 +1657,13 @@ thread_change_epri(kthread_t *t, pri_t disp_pri) * to do this in an appropriate manner. */ SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri); + } else if (state == TS_WAIT) { + /* + * Re-enqueue a thread on the wait queue if its + * effective priority needs to change. + */ + if (disp_pri != t->t_epri) + waitq_change_pri(t, disp_pri); } else { /* * The thread is on a run queue. @@ -1682,7 +1698,7 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) * If it's not on a queue, change the priority with * impunity. */ - if ((state & (TS_SLEEP | TS_RUN)) == 0) { + if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) { t->t_pri = disp_pri; if (state == TS_ONPROC) { @@ -1707,6 +1723,13 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) */ if (disp_pri != t->t_pri) SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri); + } else if (state == TS_WAIT) { + /* + * Re-enqueue a thread on the wait queue if its + * priority needs to change. + */ + if (disp_pri != t->t_pri) + waitq_change_pri(t, disp_pri); } else { /* * The thread is on a run queue. diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c index 738a2e47b4..a55b890e83 100644 --- a/usr/src/uts/common/disp/ts.c +++ b/usr/src/uts/common/disp/ts.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -59,10 +59,10 @@ #include <sys/policy.h> #include <sys/sdt.h> #include <sys/cpupart.h> - #include <vm/rm.h> #include <vm/seg_kmem.h> #include <sys/modctl.h> +#include <sys/cpucaps.h> static pri_t ts_init(id_t, int, classfuncs_t **); @@ -194,6 +194,7 @@ static int ts_parmsout(void *, pc_vaparms_t *); static int ts_vaparmsin(void *, pc_vaparms_t *); static int ts_vaparmsout(void *, pc_vaparms_t *); static int ts_parmsset(kthread_t *, void *, id_t, cred_t *); +static void ts_exit(kthread_t *); static int ts_donice(kthread_t *, cred_t *, int, int *); static void ts_exitclass(void *); static int ts_canexit(kthread_t *, cred_t *); @@ -258,7 +259,7 @@ static struct classfuncs ts_classfuncs = { ts_parmsget, ts_parmsset, ts_nullsys, /* stop */ - ts_nullsys, /* exit */ + ts_exit, ts_nullsys, /* active */ ts_nullsys, /* inactive */ ts_swapin, @@ -302,7 +303,7 @@ static struct classfuncs ia_classfuncs = { ia_parmsget, ia_parmsset, ts_nullsys, /* stop */ - ts_nullsys, /* exit */ + ts_exit, ts_nullsys, /* active */ ts_nullsys, /* inactive */ ts_swapin, @@ -622,6 +623,7 @@ ts_enterclass(kthread_t *t, id_t cid, void *parmsp, tspp->ts_dispwait = 0; tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; tspp->ts_tp = t; + cpucaps_sc_init(&tspp->ts_caps); /* * Reset priority. Process goes to a "user mode" priority @@ -703,6 +705,7 @@ ts_fork(kthread_t *t, kthread_t *ct, void *bufp) ctspp->ts_dispwait = 0; ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE); ctspp->ts_tp = ct; + cpucaps_sc_init(&ctspp->ts_caps); thread_unlock(t); /* @@ -1307,6 +1310,24 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp) return (ts_parmsset(tx, parmsp, reqpcid, reqpcredp)); } +static void +ts_exit(kthread_t *t) +{ + tsproc_t *tspp; + + if (CPUCAPS_ON()) { + /* + * A thread could be exiting in between clock ticks, + * so we need to calculate how much CPU time it used + * since it was charged last time. + */ + thread_lock(t); + tspp = (tsproc_t *)t->t_cldata; + (void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY); + thread_unlock(t); + } +} + /* * Return the global scheduling priority that would be assigned * to a thread entering the time-sharing class with the ts_upri. @@ -1337,10 +1358,7 @@ static void ts_preempt(kthread_t *t) { tsproc_t *tspp = (tsproc_t *)(t->t_cldata); - klwp_t *lwp; -#ifdef KSLICE - extern int kslice; -#endif + klwp_t *lwp = curthread->t_lwp; pri_t oldpri = t->t_pri; ASSERT(t == curthread); @@ -1350,7 +1368,6 @@ ts_preempt(kthread_t *t) * If preempted in the kernel, make sure the thread has * a kernel priority if needed. */ - lwp = curthread->t_lwp; if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) { tspp->ts_flags |= TSKPRI; THREAD_CHANGE_PRI(t, ts_kmdpris[0]); @@ -1358,9 +1375,21 @@ ts_preempt(kthread_t *t) t->t_trapret = 1; /* so ts_trapret will run */ aston(t); } + /* - * If preempted in user-land mark the thread - * as swappable because I know it isn't holding any locks. + * This thread may be placed on wait queue by CPU Caps. In this case we + * do not need to do anything until it is removed from the wait queue. + * Do not enforce CPU caps on threads running at a kernel priority + */ + if (CPUCAPS_ON()) { + (void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY); + if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t)) + return; + } + + /* + * If thread got preempted in the user-land then we know + * it isn't holding any locks. Mark it as swappable. 
*/ ASSERT(t->t_schedflag & TS_DONT_SWAP); if (lwp != NULL && lwp->lwp_state == LWP_USER) @@ -1420,12 +1449,7 @@ ts_preempt(kthread_t *t) tspp->ts_flags &= ~TSBACKQ; setbackdq(t); } else { -#ifdef KSLICE - if (kslice) - setbackdq(t); - else -#endif - setfrontdq(t); + setfrontdq(t); } done: @@ -1482,6 +1506,11 @@ ts_sleep(kthread_t *t) ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(t)); + /* + * Account for time spent on CPU before going to sleep. + */ + (void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY); + flags = tspp->ts_flags; if (t->t_kpri_req) { tspp->ts_flags = flags | TSKPRI; @@ -1605,7 +1634,8 @@ ts_swapout(kthread_t *t, int flags) if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) || (t->t_proc_flag & TP_LWPEXIT) || - (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) || + (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | + TS_ONPROC | TS_WAIT)) || !(t->t_schedflag & TS_LOAD) || !SWAP_OK(t)) return (-1); @@ -1653,17 +1683,27 @@ ts_swapout(kthread_t *t, int flags) * move thread to priority specified in tsdptbl for time slice expiration * and set runrun to cause preemption. */ - static void ts_tick(kthread_t *t) { tsproc_t *tspp = (tsproc_t *)(t->t_cldata); klwp_t *lwp; + boolean_t call_cpu_surrender = B_FALSE; pri_t oldpri = t->t_pri; ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); thread_lock(t); + + /* + * Keep track of thread's project CPU usage. Note that projects + * get charged even when threads are running in the kernel. + */ + if (CPUCAPS_ON()) { + call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps, + CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI); + } + if ((tspp->ts_flags & TSKPRI) == 0) { if (--tspp->ts_timeleft <= 0) { pri_t new_pri; @@ -1709,17 +1749,21 @@ ts_tick(kthread_t *t) tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; } else { - tspp->ts_flags |= TSBACKQ; - cpu_surrender(t); + call_cpu_surrender = B_TRUE; } TRACE_2(TR_FAC_DISP, TR_TICK, "tick:tid %p old pri %d", t, oldpri); } else if (t->t_state == TS_ONPROC && t->t_pri < t->t_disp_queue->disp_maxrunpri) { - tspp->ts_flags |= TSBACKQ; - cpu_surrender(t); + call_cpu_surrender = B_TRUE; } } + + if (call_cpu_surrender) { + tspp->ts_flags |= TSBACKQ; + cpu_surrender(t); + } + thread_unlock_nopreempt(t); /* clock thread can't be preempted */ } @@ -1877,8 +1921,8 @@ ts_update_list(int i) goto next; if (tx->t_schedctl && schedctl_get_nopreempt(tx)) goto next; - if (tx->t_state != TS_RUN && (tx->t_state != TS_SLEEP || - !ts_sleep_promote)) { + if (tx->t_state != TS_RUN && tx->t_state != TS_WAIT && + (tx->t_state != TS_SLEEP || !ts_sleep_promote)) { /* make next syscall/trap do CL_TRAPRET */ tx->t_trapret = 1; aston(tx); @@ -1907,7 +1951,6 @@ next: return (updated); } - /* * Processes waking up go to the back of their queue. We don't * need to assign a time quantum here because thread is still @@ -1981,6 +2024,11 @@ ts_yield(kthread_t *t) ASSERT(THREAD_LOCK_HELD(t)); /* + * Collect CPU usage spent before yielding + */ + (void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY); + + /* * Clear the preemption control "yield" bit since the user is * doing a yield. 
*/ diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index cf3ceeb384..0c4a6c31a9 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -115,6 +115,10 @@ sdt_argdesc_t sdt_args[] = { { "sched", "schedctl-yield", 0, 0, "int" }, { "sched", "surrender", 0, 0, "kthread_t *", "lwpsinfo_t *" }, { "sched", "surrender", 1, 0, "kthread_t *", "psinfo_t *" }, + { "sched", "cpucaps-sleep", 0, 0, "kthread_t *", "lwpsinfo_t *" }, + { "sched", "cpucaps-sleep", 1, 0, "kthread_t *", "psinfo_t *" }, + { "sched", "cpucaps-wakeup", 0, 0, "kthread_t *", "lwpsinfo_t *" }, + { "sched", "cpucaps-wakeup", 1, 0, "kthread_t *", "psinfo_t *" }, { "proc", "create", 0, 0, "proc_t *", "psinfo_t *" }, { "proc", "exec", 0, 0, "string" }, { "proc", "exec-failure", 0, 0, "int" }, diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 8308237e5b..97ba369083 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -1034,8 +1034,12 @@ pr_stop(prnode_t *pnp) t->t_proc_flag |= TP_PRSTOP; t->t_sig_check = 1; /* do ISSIG */ } - if (t->t_state == TS_SLEEP && - (t->t_flag & T_WAKEABLE)) { + + /* Move the thread from wait queue to run queue */ + if (ISWAITING(t)) + setrun_locked(t); + + if (ISWAKEABLE(t)) { if (t->t_wchan0 == NULL) setrun_locked(t); else if (!VSTOPPED(t)) { @@ -1452,9 +1456,8 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } } thread_lock(t); - if (t->t_state == TS_SLEEP && - (t->t_flag & T_WAKEABLE)) { - /* Set signalled sleeping lwp running */ + if (ISWAKEABLE(t) || ISWAITING(t)) { + /* Set signalled sleeping/waiting lwp running */ setrun_locked(t); } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ @@ -1759,8 +1762,7 @@ pr_sethold(prnode_t *pnp, sigset_t *sp) schedctl_finish_sigblock(t); sigutok(sp, &t->t_hold); - if (t->t_state == TS_SLEEP && - (t->t_flag & T_WAKEABLE) && + if (ISWAKEABLE(t) && (fsig(&p->p_sig, t) || fsig(&t->t_sig, t))) setrun_locked(t); t->t_sig_check = 1; /* so thread will see new holdmask */ @@ -2363,10 +2365,9 @@ pauselwps(proc_t *p) thread_lock(t); t->t_proc_flag |= TP_PAUSE; aston(t); - if (t->t_state == TS_SLEEP && - (t->t_flag & T_WAKEABLE)) { - if (t->t_wchan0 == NULL) - setrun_locked(t); + if ((ISWAKEABLE(t) && (t->t_wchan0 == NULL)) || + ISWAITING(t)) { + setrun_locked(t); } prpokethread(t); thread_unlock(t); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index e1c33384f7..60e541bb03 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -166,6 +166,7 @@ prchoose(proc_t *p) } break; case TS_RUN: + case TS_WAIT: if (t_run == NULL) t_run = t; break; @@ -2507,6 +2508,7 @@ prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp) case TS_ONPROC: state = SONPROC; c = 'O'; break; case TS_ZOMB: state = SZOMB; c = 'Z'; break; case TS_STOPPED: state = SSTOP; c = 'T'; break; + case TS_WAIT: state = SWAIT; c = 'W'; break; default: state = 0; c = '?'; break; } psp->pr_state = state; @@ -2573,6 +2575,7 @@ prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp) case TS_ONPROC: state = SONPROC; c = 'O'; break; case TS_ZOMB: state = SZOMB; c = 'Z'; break; case TS_STOPPED: state = SSTOP; c = 'T'; break; + case TS_WAIT: state = SWAIT; c = 'W'; break; default: state = 0; c = '?'; break; } psp->pr_state = state; diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 0a8a99f0af..06babeb55c 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -79,8 +79,7 @@ #include <sys/inttypes.h> /* - * clock is called straight from - * the real time clock interrupt. + * clock() is called straight from the clock cyclic; see clock_init(). * * Functions: * reprime clock @@ -314,10 +313,7 @@ static int genloadavg(struct loadavg_s *); static void loadavg_update(); void (*cmm_clock_callout)() = NULL; - -#ifdef KSLICE -int kslice = KSLICE; -#endif +void (*cpucaps_clock_callout)() = NULL; static void clock(void) @@ -513,9 +509,10 @@ clock(void) /* * Do tick processing for all the active threads running in - * the system. + * the system. We're trying to be more fair by walking the + * list of CPUs starting from a different CPUs each time. */ - cp = cpu_list; + cp = clock_cpu_list; nrunning = 0; do { klwp_id_t lwp; @@ -649,21 +646,11 @@ clock(void) clock_tick(t); } -#ifdef KSLICE - /* - * Ah what the heck, give this kid a taste of the real - * world and yank the rug out from under it. - * But, only if we are running UniProcessor. - */ - if ((kslice) && (ncpus == 1)) { - aston(t); - cp->cpu_runrun = 1; - cp->cpu_kprunrun = 1; - } -#endif if (!exiting) mutex_exit(plockp); - } while ((cp = cp->cpu_next) != cpu_list); + } while ((cp = cp->cpu_next) != clock_cpu_list); + + clock_cpu_list = clock_cpu_list->cpu_next; /* * bump time in ticks @@ -683,6 +670,9 @@ clock(void) if ((funcp = cmm_clock_callout) != NULL) (*funcp)(); + if ((funcp = cpucaps_clock_callout) != NULL) + (*funcp)(); + /* * Wakeup the cageout thread waiters once per second. */ diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 7e28e4f1a1..ec553d1947 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -105,6 +105,7 @@ static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t); */ kmutex_t cpu_lock; cpu_t *cpu_list; /* list of all CPUs */ +cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ static cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ @@ -1618,6 +1619,7 @@ cpu_list_init(cpu_t *cp) cp->cpu_next = cp; cp->cpu_prev = cp; cpu_list = cp; + clock_cpu_list = cp; cp->cpu_next_onln = cp; cp->cpu_prev_onln = cp; @@ -1763,7 +1765,10 @@ cpu_del_unit(int cpuid) cp->cpu_prev->cpu_next = cp->cpu_next; cp->cpu_next->cpu_prev = cp->cpu_prev; if (cp == cpu_list) - cpu_list = cpnext; + cpu_list = cpnext; + if (cp == clock_cpu_list) + clock_cpu_list = cpnext; + /* * Signals that the cpu has been deleted (see above). 
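For reference, the clock.c change above makes the per-tick walk start from a rotating cursor rather than always from the head of cpu_list. A minimal sketch of the resulting pattern, using only names from the patch (cpu_t, cpu_next, clock_cpu_list), with the per-CPU tick work elided:

	cpu_t *cp = clock_cpu_list;

	do {
		/* per-thread tick processing for cp's threads, elided */
	} while ((cp = cp->cpu_next) != clock_cpu_list);

	/* Rotate the starting point so the next tick begins at another CPU. */
	clock_cpu_list = clock_cpu_list->cpu_next;

Because clock_cpu_list advances by one CPU per tick, no single CPU's threads are consistently processed first or last, which is the fairness goal stated in the comment above.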
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 0036b1cd3d..c97b1621cb 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -289,7 +289,9 @@ kstat_zone_add(kstat_t *k, zoneid_t zoneid) ekstat_t *e = (ekstat_t *)k; kstat_zone_t *kz; - kz = kmem_alloc(sizeof (*kz), KM_SLEEP); + kz = kmem_alloc(sizeof (*kz), KM_NOSLEEP); + if (kz == NULL) + return; mutex_enter(&kstat_chain_lock); kz->zoneid = zoneid; kz->next = e->e_zone.next; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index 7533fb0028..c1d1a870e0 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -781,6 +781,7 @@ lwp_exit(void) t->t_proc_flag |= TP_LWPEXIT; term_mstate(t); + #ifndef NPROBE /* Kernel probe */ if (t->t_tnf_tpdp) @@ -916,10 +917,12 @@ top: * XXX Should use virtual stop like /proc does instead of * XXX waking the thread to get it to stop. */ - if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE)) + if (ISWAKEABLE(t) || ISWAITING(t)) { setrun_locked(t); - else if (t->t_state == TS_ONPROC && t->t_cpu != CPU) + } else if (t->t_state == TS_ONPROC && t->t_cpu != CPU) { poke_cpu(t->t_cpu->cpu_id); + } + tid = t->t_tid; /* remember thread ID */ /* * Wait for lwp to stop @@ -1360,9 +1363,8 @@ pokelwps(proc_t *p) continue; thread_lock(t); aston(t); /* make thread trap or do post_syscall */ - if (t->t_state == TS_SLEEP) { - if (t->t_flag & T_WAKEABLE) - setrun_locked(t); + if (ISWAKEABLE(t) || ISWAITING(t)) { + setrun_locked(t); } else if (t->t_state == TS_STOPPED) { /* * Ensure that proc_exit() is not blocked by lwps diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index 134de5a513..57c56cef8f 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -236,6 +236,47 @@ new_cpu_mstate(int cmstate, hrtime_t curtime) } /* + * Return an aggregation of user and system CPU time consumed by + * the specified thread in scaled nanoseconds. + */ +hrtime_t +mstate_thread_onproc_time(kthread_t *t) +{ + hrtime_t aggr_time; + hrtime_t now; + hrtime_t state_start; + struct mstate *ms; + klwp_t *lwp; + int mstate; + + ASSERT(THREAD_LOCK_HELD(t)); + + if ((lwp = ttolwp(t)) == NULL) + return (0); + + mstate = t->t_mstate; + ms = &lwp->lwp_mstate; + state_start = ms->ms_state_start; + + aggr_time = ms->ms_acct[LMS_USER] + + ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP]; + + now = gethrtime_unscaled(); + + /* + * NOTE: gethrtime_unscaled on X86 taken on different CPUs is + * inconsistent, so it is possible that now < state_start. + */ + if ((mstate == LMS_USER || mstate == LMS_SYSTEM || + mstate == LMS_TRAP) && (now > state_start)) { + aggr_time += now - state_start; + } + + scalehrtime(&aggr_time); + return (aggr_time); +} + +/* * Return an aggregation of microstate times in scaled nanoseconds (high-res * time). This keeps in mind that p_acct is already scaled, and ms_acct is * not. 
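mstate_thread_onproc_time() returns a cumulative, scaled total of user, system and trap time for a thread. The cpucaps code (its implementation is not shown in this hunk) presumably charges only the increment since the previous reading; a hypothetical sketch of that usage, where caps_charge_delta() is an illustrative helper, not part of the patch, and caps_sc_t/csc_cputime come from the new sys/cpucaps.h below:

	/*
	 * Illustrative only: compute the CPU time this thread accumulated
	 * since the previous reading and remember the new total.  Caller
	 * holds the thread lock, as mstate_thread_onproc_time() requires.
	 */
	static hrtime_t
	caps_charge_delta(kthread_t *t, caps_sc_t *csc)
	{
		hrtime_t new_usage = mstate_thread_onproc_time(t);
		hrtime_t delta = new_usage - csc->csc_cputime;

		csc->csc_cputime = new_usage;
		return (delta);
	}

Since the returned total only grows, charging deltas this way also tolerates the skipped in-progress interval mentioned in the x86 gethrtime_unscaled() note above.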
diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 3c17a0dc38..b5f96a8dd7 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -41,6 +41,7 @@ #include <sys/port_kernel.h> #include <sys/task.h> #include <sys/zone.h> +#include <sys/cpucaps.h> int project_hash_size = 64; static kmutex_t project_hash_lock; @@ -49,6 +50,7 @@ static mod_hash_t *projects_hash; static kproject_t *projects_list; rctl_hndl_t rc_project_cpu_shares; +rctl_hndl_t rc_project_cpu_cap; rctl_hndl_t rc_project_nlwps; rctl_hndl_t rc_project_ntasks; rctl_hndl_t rc_project_msgmni; @@ -156,6 +158,7 @@ project_hash_val_dtor(mod_hash_val_t val) kproject_t *kp = (kproject_t *)val; ASSERT(kp->kpj_count == 0); + ASSERT(kp->kpj_cpucap == NULL); kmem_free(kp, sizeof (kproject_t)); } @@ -251,6 +254,7 @@ project_hold_by_id(projid_t id, zone_t *zone, int flag) p = spare_p; p->kpj_id = id; + p->kpj_zone = zone; p->kpj_zoneid = zone->zone_id; p->kpj_count = 0; p->kpj_shares = 1; @@ -304,6 +308,13 @@ project_hold_by_id(projid_t id, zone_t *zone, int flag) * across reboots. */ if (create == B_TRUE) { + /* + * Inform CPU caps framework of the new project + */ + cpucaps_project_add(p); + /* + * Set up project kstats + */ ksp = project_kstat_create(p, zone); mutex_enter(&project_hash_lock); ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL); @@ -343,6 +354,8 @@ project_rele(kproject_t *p) projects_list = p->kpj_next; mutex_exit(&projects_list_lock); + cpucaps_project_remove(p); + rctl_set_free(p->kpj_rctls); project_kstat_delete(p); @@ -431,7 +444,6 @@ project_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, return (0); } - static rctl_ops_t project_cpu_shares_ops = { rcop_no_action, project_cpu_shares_usage, @@ -439,6 +451,43 @@ static rctl_ops_t project_cpu_shares_ops = { rcop_no_test }; + +/* + * project.cpu-cap resource control support. + */ +/*ARGSUSED*/ +static rctl_qty_t +project_cpu_cap_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_project_get(p->p_task->tk_proj)); +} + +/*ARGSUSED*/ +static int +project_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + kproject_t *kpj = e->rcep_p.proj; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_PROJECT); + if (kpj == NULL) + return (0); + + /* + * set cap to the new value. 
+ */ + return (cpucaps_project_set(kpj, nv)); +} + +static rctl_ops_t project_cpu_cap_ops = { + rcop_no_action, + project_cpu_cap_get, + project_cpu_cap_set, + rcop_no_test +}; + /*ARGSUSED*/ static rctl_qty_t project_lwps_usage(rctl_t *r, proc_t *p) @@ -804,6 +853,13 @@ project_init(void) rctl_add_default_limit("project.cpu-shares", 1, RCPRIV_PRIVILEGED, RCTL_LOCAL_NOACTION); + rc_project_cpu_cap = rctl_register("project.cpu-cap", + RCENTITY_PROJECT, RCTL_GLOBAL_SIGNAL_NEVER | + RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | + RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER | + RCTL_GLOBAL_INFINITE, + MAXCAP, MAXCAP, &project_cpu_cap_ops); + rc_project_nlwps = rctl_register("project.max-lwps", RCENTITY_PROJECT, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &project_lwps_ops); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 808e5d2095..5c72fb749b 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -181,7 +181,7 @@ eat_signal(kthread_t *t, int sig) */ if (!signal_is_blocked(t, sig)) { t->t_sig_check = 1; /* have thread do an issig */ - if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE)) { + if (ISWAKEABLE(t) || ISWAITING(t)) { setrun_locked(t); rval = 1; } else if (t->t_state == TS_STOPPED && sig == SIGKILL && @@ -974,6 +974,11 @@ stop(int why, int what) notify = 1; } } + + /* Move waiting thread to run queue */ + if (ISWAITING(tx)) + setrun_locked(tx); + /* * force the thread into the kernel * if it is not already there. diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index 785f74c145..5e4ae1aefe 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -388,8 +388,7 @@ task_create(projid_t projid, zone_t *zone) tk->tk_nlwps = 0; tk->tk_nlwps_ctl = INT_MAX; tk->tk_usage = tu; - tk->tk_proj = project_hold_by_id(projid, zone, - PROJECT_HOLD_INSERT); + tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT); tk->tk_flags = TASK_NORMAL; /* @@ -670,6 +669,21 @@ changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf, thread_lock(t); oldkpj = ttoproj(t); + + /* + * Kick this thread so that it doesn't sit + * on the wrong wait queue. + */ + if (ISWAITING(t)) + setrun_locked(t); + + /* + * The thread wants to go on the project wait queue, but + * the waitq is changing. + */ + if (t->t_schedflag & TS_PROJWAITQ) + t->t_schedflag &= ~TS_PROJWAITQ; + t->t_proj = kpj; t->t_pre_sys = 1; /* For cred update */ thread_unlock(t); diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index ddaa2adff4..e656a17088 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -589,6 +589,7 @@ realprofexpire(void *arg) } break; case TS_RUN: + case TS_WAIT: mstate = LMS_WAIT_CPU; break; case TS_ONPROC: diff --git a/usr/src/uts/common/os/waitq.c b/usr/src/uts/common/os/waitq.c new file mode 100644 index 0000000000..802d7afdc4 --- /dev/null +++ b/usr/src/uts/common/os/waitq.c @@ -0,0 +1,386 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/class.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> +#include <sys/waitq.h> +#include <sys/cmn_err.h> +#include <sys/time.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> +#include <sys/zone.h> + +/* + * Wait queue implementation. + */ + +void +waitq_init(waitq_t *wq) +{ + DISP_LOCK_INIT(&wq->wq_lock); + wq->wq_first = NULL; + wq->wq_count = 0; + wq->wq_blocked = B_TRUE; +} + +void +waitq_fini(waitq_t *wq) +{ + ASSERT(wq->wq_count == 0); + ASSERT(wq->wq_first == NULL); + ASSERT(wq->wq_blocked == B_TRUE); + ASSERT(!DISP_LOCK_HELD(&wq->wq_lock)); + + DISP_LOCK_DESTROY(&wq->wq_lock); +} + +/* + * Operations on waitq_t structures. + * + * A wait queue is a singly linked NULL-terminated list with doubly + * linked circular sublists. The singly linked list is in descending + * priority order and FIFO for threads of the same priority. It links + * through the t_link field of the thread structure. The doubly linked + * sublists link threads of the same priority. They use the t_priforw + * and t_priback fields of the thread structure. + * + * Graphically (with priorities in parens): + * + * ________________ _______ _______ + * / \ / \ / \ + * | | | | | | + * v v v v v v + * t1(60)-->t2(60)-->t3(60)-->t4(50)-->t5(50)-->t6(30)-->t7(0)-->t8(0) + * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + * | | | | | | | | | | + * \______/ \______/ \_______/ \__/ \_______/ + * + * There are three interesting operations on a waitq list: inserting + * a thread into the proper position according to priority; removing a + * thread given a pointer to it; and walking the list, possibly + * removing threads along the way. This design allows all three + * operations to be performed efficiently and easily. + * + * To insert a thread, traverse the list looking for the sublist of + * the same priority as the thread (or one of a lower priority, + * meaning there are no other threads in the list of the same + * priority). This can be done without touching all threads in the + * list by following the links between the first threads in each + * sublist. Given a thread t that is the head of a sublist (the first + * thread of that priority found when following the t_link pointers), + * t->t_priback->t_link points to the head of the next sublist. It's + * important to do this since a waitq may contain thousands of + * threads. + * + * Removing a thread from the list is also efficient. First, the + * t_waitq field contains a pointer to the waitq on which a thread + * is waiting (or NULL if it's not on a waitq). This is used to + * determine if the given thread is on the given waitq without + * searching the list. 
Assuming it is, if it's not the head of a + * sublist, just remove it from the sublist and use the t_priback + * pointer to find the thread that points to it with t_link. If it is + * the head of a sublist, search for it by walking the sublist heads, + * similar to searching for a given priority level when inserting a + * thread. + * + * To walk the list, simply follow the t_link pointers. Removing + * threads along the way can be done easily if the code maintains a + * pointer to the t_link field that pointed to the thread being + * removed. + */ + +static void +waitq_link(waitq_t *wq, kthread_t *t) +{ + kthread_t *next_tp; + kthread_t *last_tp; + kthread_t **tpp; + pri_t tpri, next_pri, last_pri = -1; + + ASSERT(DISP_LOCK_HELD(&wq->wq_lock)); + + tpri = DISP_PRIO(t); + tpp = &wq->wq_first; + while ((next_tp = *tpp) != NULL) { + next_pri = DISP_PRIO(next_tp); + if (tpri > next_pri) + break; + last_tp = next_tp->t_priback; + last_pri = next_pri; + tpp = &last_tp->t_link; + } + *tpp = t; + t->t_link = next_tp; + if (last_pri == tpri) { + /* last_tp points to the last thread of this priority */ + t->t_priback = last_tp; + t->t_priforw = last_tp->t_priforw; + last_tp->t_priforw->t_priback = t; + last_tp->t_priforw = t; + } else { + t->t_priback = t->t_priforw = t; + } + wq->wq_count++; + t->t_waitq = wq; +} + +static void +waitq_unlink(waitq_t *wq, kthread_t *t) +{ + kthread_t *nt; + kthread_t **ptl; + + ASSERT(THREAD_LOCK_HELD(t)); + ASSERT(DISP_LOCK_HELD(&wq->wq_lock)); + ASSERT(t->t_waitq == wq); + + ptl = &t->t_priback->t_link; + /* + * Is it the head of a priority sublist? If so, need to walk + * the priorities to find the t_link pointer that points to it. + */ + if (*ptl != t) { + /* + * Find the right priority level. + */ + ptl = &t->t_waitq->wq_first; + while ((nt = *ptl) != t) + ptl = &nt->t_priback->t_link; + } + /* + * Remove thread from the t_link list. + */ + *ptl = t->t_link; + + /* + * Take it off the priority sublist if there's more than one + * thread there. + */ + if (t->t_priforw != t) { + t->t_priback->t_priforw = t->t_priforw; + t->t_priforw->t_priback = t->t_priback; + } + t->t_link = NULL; + + wq->wq_count--; + t->t_waitq = NULL; + t->t_priforw = NULL; + t->t_priback = NULL; +} + +/* + * Put the specified thread on the specified wait queue without dropping the + * thread's lock. Returns 1 if the thread was successfully placed on the + * wait queue, or 0 if the wait queue is blocked. + */ +int +waitq_enqueue(waitq_t *wq, kthread_t *t) +{ + ASSERT(THREAD_LOCK_HELD(t)); + ASSERT(t->t_sleepq == NULL); + ASSERT(t->t_waitq == NULL); + ASSERT(t->t_link == NULL); + + disp_lock_enter_high(&wq->wq_lock); + + /* + * Can't enqueue anything on a blocked wait queue. + */ + if (wq->wq_blocked) { + disp_lock_exit_high(&wq->wq_lock); + return (0); + } + + /* + * Mark the time when the thread is placed on the wait queue. The + * microstate accounting code uses this timestamp to determine wait + * times. + */ + t->t_waitrq = gethrtime_unscaled(); + + /* + * Mark the thread as not swappable. If necessary, it will get + * swapped out when it returns to userland. + */ + t->t_schedflag |= TS_DONT_SWAP; + DTRACE_SCHED1(cpucaps__sleep, kthread_t *, t); + waitq_link(wq, t); + + THREAD_WAIT(t, &wq->wq_lock); + return (1); +} + +/* + * Change the thread's priority while it is on the wait queue. + * Dequeue and enqueue it again so that it gets placed in the right place.
+ */ +void +waitq_change_pri(kthread_t *t, pri_t new_pri) +{ + waitq_t *wq = t->t_waitq; + + ASSERT(THREAD_LOCK_HELD(t)); + ASSERT(ISWAITING(t)); + ASSERT(wq != NULL); + + waitq_unlink(wq, t); + t->t_pri = new_pri; + waitq_link(wq, t); +} + +static void +waitq_dequeue(waitq_t *wq, kthread_t *t) +{ + ASSERT(THREAD_LOCK_HELD(t)); + ASSERT(t->t_waitq == wq); + ASSERT(ISWAITING(t)); + + waitq_unlink(wq, t); + DTRACE_SCHED1(cpucaps__wakeup, kthread_t *, t); + + /* + * Change thread to transition state without dropping + * the wait queue lock. + */ + THREAD_TRANSITION_NOLOCK(t); +} + +/* + * Return B_TRUE if there are no threads on the specified wait queue. + * The check is done without holding any locks. + */ +boolean_t +waitq_isempty(waitq_t *wq) +{ + return (wq->wq_count == 0); +} + +/* + * Take the thread off its wait queue and make it runnable. + * Returns with the thread lock held. + */ +void +waitq_setrun(kthread_t *t) +{ + waitq_t *wq = t->t_waitq; + + ASSERT(THREAD_LOCK_HELD(t)); + + ASSERT(ISWAITING(t)); + if (wq == NULL) + panic("waitq_setrun: thread %p is not on waitq", t); + waitq_dequeue(wq, t); + + disp_lock_exit_high(&wq->wq_lock); + CL_SETRUN(t); +} + +/* + * Take the first thread off the wait queue and return a pointer to it. + */ +static kthread_t * +waitq_takeone(waitq_t *wq) +{ + kthread_t *t; + + disp_lock_enter(&wq->wq_lock); + if ((t = wq->wq_first) != NULL) + waitq_dequeue(wq, wq->wq_first); + disp_lock_exit(&wq->wq_lock); + return (t); +} + +/* + * Take the first thread off the wait queue and make it runnable. + * Return a pointer to the thread, or NULL if the waitq is empty. + */ +static kthread_t * +waitq_runfirst(waitq_t *wq) +{ + kthread_t *t; + + t = waitq_takeone(wq); + if (t != NULL) { + CL_SETRUN(t); + thread_unlock(t); /* drops dispq lock */ + } + return (t); } + +/* + * Take the first thread off the wait queue and make it runnable. + */ +void +waitq_runone(waitq_t *wq) +{ + (void) waitq_runfirst(wq); +} + +/* + * Take all threads off the wait queue and make them runnable. + */ +static void +waitq_runall(waitq_t *wq) +{ + while (waitq_runfirst(wq) != NULL) + ; +} + +/* + * Prevent any new threads from entering the wait queue and make all threads + * currently on the wait queue runnable. After waitq_block() completes, no + * threads should appear on the wait queue until it is unblocked. + */ +void +waitq_block(waitq_t *wq) +{ + ASSERT(!wq->wq_blocked); + disp_lock_enter(&wq->wq_lock); + wq->wq_blocked = B_TRUE; + disp_lock_exit(&wq->wq_lock); + waitq_runall(wq); + ASSERT(waitq_isempty(wq)); +} + +/* + * Allow threads to be placed on the wait queue. + */ +void +waitq_unblock(waitq_t *wq) +{ + disp_lock_enter(&wq->wq_lock); + + ASSERT(waitq_isempty(wq)); + ASSERT(wq->wq_blocked); + + wq->wq_blocked = B_FALSE; + + disp_lock_exit(&wq->wq_lock); +} diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index d163dbc5cd..3838c07cfa 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -240,6 +240,7 @@ #include <sys/brand.h> #include <sys/zone.h> #include <net/if.h> +#include <sys/cpucaps.h> #include <vm/seg.h> /* @@ -328,6 +329,7 @@ const char *zone_status_table[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_cpu_cap; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; @@ -882,6 +884,43 @@ static rctl_ops_t zone_cpu_shares_ops = { rcop_no_test }; +/* + * zone.cpu-cap resource control support.
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_cap_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get(p->p_zone)); +} + +/*ARGSUSED*/ +static int +zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set cap to the new value. + */ + return (cpucaps_zone_set(zone, nv)); +} + +static rctl_ops_t zone_cpu_cap_ops = { + rcop_no_action, + zone_cpu_cap_get, + zone_cpu_cap_set, + rcop_no_test +}; + /*ARGSUSED*/ static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) @@ -1384,8 +1423,13 @@ zone_init(void) rc_zone_cpu_shares = rctl_register("zone.cpu-shares", RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, - FSS_MAXSHARES, FSS_MAXSHARES, - &zone_cpu_shares_ops); + FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops); + + rc_zone_cpu_cap = rctl_register("zone.cpu-cap", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER | + RCTL_GLOBAL_INFINITE, + MAXCAP, MAXCAP, &zone_cpu_cap_ops); rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, @@ -1530,6 +1574,13 @@ zone_free(zone_t *zone) ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || zone_status_get(zone) == ZONE_IS_UNINITIALIZED); + /* + * Remove any zone caps. + */ + cpucaps_zone_remove(zone); + + ASSERT(zone->zone_cpucap == NULL); + /* remove from deathrow list */ if (zone_status_get(zone) == ZONE_IS_DEAD) { ASSERT(zone->zone_ref == 0); @@ -2501,6 +2552,10 @@ zthread_exit(void) zone->zone_kthreads = NULL; if (zone_status_get(zone) == ZONE_IS_EMPTY) { zone_status_set(zone, ZONE_IS_DOWN); + /* + * Remove any CPU caps on this zone. + */ + cpucaps_zone_remove(zone); } } else { t->t_forw->t_back = t->t_back; @@ -2616,8 +2671,9 @@ zone_start_init(void) * Make sure we are still in the booting state-- we could have * raced and already be shutting down, or even further along. */ - if (zone_status_get(z) == ZONE_IS_BOOTING) + if (zone_status_get(z) == ZONE_IS_BOOTING) { zone_status_set(z, ZONE_IS_SHUTTING_DOWN); + } mutex_exit(&zone_status_lock); /* It's gone bad, dispose of the process */ if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) { @@ -3879,7 +3935,13 @@ zone_destroy(zoneid_t zoneid) } - /* Get rid of the zone's kstats. */ + /* + * Remove CPU cap for this zone now since we're not going to + * fail below this point. + */ + cpucaps_zone_remove(zone); + + /* Get rid of the zone's kstats */ zone_kstat_delete(zone); /* Say goodbye to brand framework. */ @@ -3938,8 +4000,8 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) char *outstr; zone_status_t zone_status; pid_t initpid; - boolean_t global = (curproc->p_zone == global_zone); - boolean_t curzone = (curproc->p_zone->zone_id == zoneid); + boolean_t global = (curzone == global_zone); + boolean_t inzone = (curzone->zone_id == zoneid); ushort_t flags; mutex_enter(&zonehash_lock); @@ -3980,7 +4042,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) bcopy(zone->zone_rootpath, zonepath, size); zonepath[size - 1] = '\0'; } else { - if (curzone || !is_system_labeled()) { + if (inzone || !is_system_labeled()) { /* * Caller is not in the global zone. 
* If the query is on the current zone @@ -4011,7 +4073,7 @@ if (err != 0 && err != ENAMETOOLONG) error = EFAULT; } - if (global || (is_system_labeled() && !curzone)) + if (global || (is_system_labeled() && !inzone)) kmem_free(zonepath, size); break; @@ -4365,6 +4427,7 @@ zone_enter(zoneid_t zoneid) int err = 0; rctl_entity_p_t e; size_t swap; + kthread_id_t t; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4625,6 +4688,28 @@ zone_enter(zoneid_t zoneid) pgjoin(pp, zone->zone_zsched->p_pidp); /* + * If any threads are scheduled to be placed on the zone wait queue + * they should abandon the idea since the wait queue is changing. + * We need to be holding pidlock & p_lock to do this. + */ + if ((t = pp->p_tlist) != NULL) { + do { + thread_lock(t); + /* + * Kick this thread so that it doesn't sit + * on the wrong wait queue. + */ + if (ISWAITING(t)) + setrun_locked(t); + + if (t->t_schedflag & TS_ANYWAITQ) + t->t_schedflag &= ~TS_ANYWAITQ; + + thread_unlock(t); + } while ((t = t->t_forw) != pp->p_tlist); + } + + /* * If there is a default scheduling class for the zone and it is not * the class we are currently in, change all of the threads in the * process to the new class. We need to be holding pidlock & p_lock @@ -4633,7 +4718,6 @@ zone_enter(zoneid_t zoneid) if (zone->zone_defaultcid > 0 && zone->zone_defaultcid != curthread->t_cid) { pcparms_t pcparms; - kthread_id_t t; pcparms.pc_cid = zone->zone_defaultcid; pcparms.pc_clparms[0] = 0; diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 62f1efac65..84f5253412 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -110,6 +110,8 @@ CHKHDRS= \ byteorder.h \ callb.h \ callo.h \ + cpucaps.h \ + cpucaps_impl.h \ ccompile.h \ cdio.h \ cladm.h \ @@ -564,6 +566,7 @@ CHKHDRS= \ vuid_state.h \ vuid_store.h \ wait.h \ + waitq.h \ wanboot_impl.h \ watchpoint.h \ winlockio.h \ diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h new file mode 100644 index 0000000000..6063ff4380 --- /dev/null +++ b/usr/src/uts/common/sys/cpucaps.h @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CPUCAPS_H +#define _SYS_CPUCAPS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus extern "C" { #endif + +#include <sys/types.h> +#include <sys/zone.h> +#include <sys/project.h> +#include <sys/time.h> +#include <sys/rctl.h> + +/* + * CPU caps provide an absolute hard CPU usage limit which is enforced even if + * some CPUs are idle.
Caps can be enforced at the project or zone level. + */ + +#ifdef _KERNEL + +/* + * Valid cap values range from 1 to MAXCAP - 1. Specifying MAXCAP as the cap + * value is equivalent to disabling the cap. + */ +#define MAXCAP UINT_MAX + +/* + * cpucaps_enabled is used to quickly check whether any CPU caps specific code + * should be invoked. Users outside the CPU caps framework should use the + * CPUCAPS_ON() and CPUCAPS_OFF() macros. + */ +extern boolean_t cpucaps_enabled; + +#define CPUCAPS_ON() cpucaps_enabled +#define CPUCAPS_OFF() (!cpucaps_enabled) + +/* + * Initialize the CPU caps framework. + */ +extern void cpucaps_init(void); + +/* + * Notify the caps framework of a new project arriving or an existing + * project going away. + */ +extern void cpucaps_project_add(kproject_t *); +extern void cpucaps_project_remove(kproject_t *); + +/* + * Notify the caps framework when a zone is going away. + */ +extern void cpucaps_zone_remove(zone_t *); + +/* + * Set the project/zone cap to the specified value. A value of MAXCAP + * disables the cap. + */ +extern int cpucaps_project_set(kproject_t *, rctl_qty_t); +extern int cpucaps_zone_set(zone_t *, rctl_qty_t); + +/* + * Get the current CPU usage for a project/zone. + */ +extern rctl_qty_t cpucaps_project_get(kproject_t *); +extern rctl_qty_t cpucaps_zone_get(zone_t *); + +/* + * Scheduling class hooks into the CPU caps framework. + */ + +/* + * CPU caps specific data for each scheduling class. + * + * There is a small amount of accounting data that must be kept by each + * scheduling class for each thread and that is used only by the CPU caps + * code. This data is kept in the caps_sc structure, which is opaque to the + * scheduling classes. The fields in the structure are: + * + * csc_cputime - Total time spent on CPU during thread lifetime, obtained + * as the sum of user, system and trap time, reported by + * microstate accounting. + */ +typedef struct caps_sc { + hrtime_t csc_cputime; +} caps_sc_t; + +/* + * Initialize per-thread cpu-caps specific data. + */ +extern void cpucaps_sc_init(caps_sc_t *); + +/* + * Modus operandi for the cpucaps_charge() function. + * + * CPUCAPS_CHARGE_ENFORCE - charge a thread for its CPU time and + * flag it to be placed on the wait queue. + * + * CPUCAPS_CHARGE_ONLY - charge a thread for its CPU time. + */ +typedef enum { + CPUCAPS_CHARGE_ENFORCE, + CPUCAPS_CHARGE_ONLY +} cpucaps_charge_t; + +/* + * Charge a thread's accumulated CPU usage against its cap. + * Return B_TRUE if the thread should be placed on the wait queue. + */ +extern boolean_t cpucaps_charge(kthread_t *, caps_sc_t *, cpucaps_charge_t); +#define CPUCAPS_CHARGE(t, scp, flag) \ + (CPUCAPS_ON() && cpucaps_charge(t, scp, flag)) + +/* + * Request a thread to be placed on a wait queue because its cap is exceeded. + */ +extern boolean_t cpucaps_enforce(kthread_t *); +#define CPUCAPS_ENFORCE(t) (CPUCAPS_ON() && cpucaps_enforce(t)) + +/* + * CPU caps hook into clock(). + */ +extern void (*cpucaps_clock_callout)(void); + +#endif /* _KERNEL */ + +#ifdef __cplusplus } #endif + +#endif /* _SYS_CPUCAPS_H */ diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h new file mode 100644 index 0000000000..ba4132993f --- /dev/null +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CPUCAPS_IMPL_H +#define _SYS_CPUCAPS_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus extern "C" { #endif + +#ifdef _KERNEL + +#include <sys/kstat.h> +#include <sys/cpucaps.h> +#include <sys/list.h> +#include <sys/time.h> +#include <sys/waitq.h> + +/* + * When the resource control framework sets the cap to the NOCAP value, the + * cap is disabled. + */ +#define NOCAP MAXCAP + +/* + * Maximum value for the cap usage. Should be the maximum value for hrtime_t. + */ +#if defined(_LP64) +#define MAX_USAGE LONG_MAX +#else +#define MAX_USAGE 9223372036854775807LL +#endif + + +/* + * Most of the per-project or per-zone state related to CPU caps is kept in the + * cpucap_t structure. + */ +typedef struct cpucap { + list_node_t cap_link; /* next/prev capped entity */ + struct kproject *cap_project; /* project for the cap */ + struct zone *cap_zone; /* zone for the cap */ + waitq_t cap_waitq; /* waitq for capped threads */ + kstat_t *cap_kstat; /* cpucaps specific kstat */ + int64_t cap_lbolt; /* zone cap specific */ + hrtime_t cap_value; /* scaled CPU usage cap */ + hrtime_t cap_usage; /* current CPU usage */ + disp_lock_t cap_usagelock; /* protects cap_usage above */ + /* + * Per cap statistics. + */ + hrtime_t cap_maxusage; /* maximum cap usage */ + u_longlong_t cap_below; /* # of ticks spent below the cap */ + u_longlong_t cap_above; /* # of ticks spent above the cap */ +} cpucap_t; + +/* + * Wrapper macros for checking cap state. + */ +#define CAP_ENABLED(cap) ((cap)->cap_value != 0) +#define CAP_DISABLED(cap) (!CAP_ENABLED(cap)) + +#define PROJECT_IS_CAPPED(project) \ + (((project)->kpj_cpucap != NULL) && \ + CAP_ENABLED((project)->kpj_cpucap)) + +#define ZONE_IS_CAPPED(zone) \ + (((zone)->zone_cpucap != NULL) && \ + CAP_ENABLED((zone)->zone_cpucap)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus } #endif + +#endif /* _SYS_CPUCAPS_IMPL_H */ diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index c211b63185..b0493774e0 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -508,6 +508,7 @@ extern int max_ncpus; /* max present before ncpus is known */ extern int boot_max_ncpus; /* like max_ncpus but for real */ extern processorid_t max_cpuid; /* maximum CPU number */ extern struct cpu *cpu_inmotion; /* offline or partition move target */ +extern cpu_t *clock_cpu_list; #if defined(__i386) || defined(__amd64) extern struct cpu *curcpup(void); diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h index 21323fcc1a..583586fd75 100644 --- a/usr/src/uts/common/sys/fss.h +++ b/usr/src/uts/common/sys/fss.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License").
You may not use this file except in compliance - with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,11 +32,14 @@ #include <sys/types.h> #include <sys/thread.h> #include <sys/project.h> +#include <sys/cpucaps.h> #ifdef __cplusplus extern "C" { #endif +#ifdef _KERNEL + typedef uint64_t fsspri_t; typedef uint64_t fssusage_t; struct cpupart; @@ -74,7 +76,7 @@ void fss_changepset(kthread_id_t, void *, fssbuf_t *, fssbuf_t *); typedef struct fsspset { kmutex_t fssps_lock; /* lock to protect per-pset */ /* list of fssproj structures */ - disp_lock_t fssps_displock; /* lock for fsps_maxfspri */ + disp_lock_t fssps_displock; /* lock for fsps_maxfsspri */ struct cpupart *fssps_cpupart; /* ptr to our cpu partition */ /* protected by fsspsets_lock */ fsspri_t fssps_maxfsspri; /* maximum fsspri value among */ @@ -113,7 +115,7 @@ typedef struct fssproj { */ typedef struct fssproc { kthread_t *fss_tp; /* pointer back to our thread */ - fssproj_t *fss_proj; /* pointer to our project FS data */ + fssproj_t *fss_proj; /* pointer to our project FSS data */ uchar_t fss_flags; /* flags defined below */ int fss_timeleft; /* time remaining in proc's quantum */ uint32_t fss_ticks; /* ticks accumulated by this thread */ @@ -126,20 +128,22 @@ typedef struct fssproc { int fss_runnable; /* to indicate runnable/sleeping thread */ struct fssproc *fss_next; /* pointer to next fssproc_t struct */ struct fssproc *fss_prev; /* pointer to prev fssproc_t struct */ + caps_sc_t fss_caps; /* CPU caps specific data */ } fssproc_t; /* - * One of these structures is allocated to each zone running within each active - * cpu partition. + * One of these structures is allocated to each zone running within + * each active cpu partition. This means that if a zone spans more + * than one cpu partition then it will have one of these structures + * for each such partition.
*/ typedef struct fsszone { - struct zone *fssz_zone; /* ptr to our zone structure */ - struct fsszone *fssz_next; /* ptr to next fsszone in fsspset */ - struct fsszone *fssz_prev; /* ptr to prev fsszone in fsspset */ - uint32_t fssz_shares; /* total #shares for projs in zone */ - uint32_t fssz_nproj; /* # fssproj_t's in this fsszone */ - uint32_t fssz_rshares; /* "real" shares given to zone */ - uint32_t fssz_runnable; /* # projects with runnable threads */ + struct zone *fssz_zone; /* ptr to our zone structure */ + struct fsszone *fssz_next; /* next fsszone_t in fsspset_t */ + struct fsszone *fssz_prev; /* prev fsszone_t in fsspset_t */ + uint32_t fssz_shares; /* sum of all project shares */ + uint32_t fssz_nproj; /* # of projects */ + uint32_t fssz_rshares; /* "real" shares given to zone */ + uint32_t fssz_runnable; /* # of runnable projects */ } fsszone_t; #define FSSPROC(tx) ((fssproc_t *)(tx->t_cldata)) @@ -158,6 +162,9 @@ typedef struct fsszone { /* the dispatch queue if preempted */ #define FSSRESTORE 0x04 /* thread was not preempted, due to schedctl */ /* restore priority from fss_scpri */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 166bc0a06c..2d4e1aa7fb 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright (c) 2001 by Sun Microsystems, Inc. - * All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ #ifndef _SYS_FX_H @@ -33,6 +32,7 @@ #include <sys/thread.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/cpucaps.h> #ifdef __cplusplus extern "C" { @@ -114,9 +114,6 @@ typedef struct fxproc { char fx_nice; /* nice value for compatibility */ uchar_t fx_flags; /* flags defined below */ kthread_t *fx_tp; /* pointer to thread */ - struct fxproc *fx_next; /* pointer to next fxproc */ - - struct fxproc *fx_prev; /* pointer to previous fxproc */ /* the following are used only when we have callbacks registered */ kt_did_t fx_ktid; @@ -128,6 +125,7 @@ typedef struct fxproc { fx_cookie_t fx_cookie; /* cookie with which callback */ /* was registered */ fx_callbacks_t *fx_callback; /* pointer to callback structure */ + caps_sc_t fx_caps; /* CPU caps specific data */ } fxproc_t; diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 9a0ba2cc37..02c414b4fc 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -424,11 +424,12 @@ extern struct pid pid0; /* p0's pid */ /* stat codes */ #define SSLEEP 1 /* awaiting an event */ -#define SRUN 2 /* running */ +#define SRUN 2 /* runnable */ #define SZOMB 3 /* process terminated but not waited for */ #define SSTOP 4 /* process stopped by debugger */ #define SIDL 5 /* intermediate state in process creation */ #define SONPROC 6 /* process is being run on a processor */ +#define SWAIT 7 /* process is waiting to become runnable */ /* p_pidflag codes */ #define CLDPEND 0x0001 /* have yet to post a SIGCHLD to the parent */ @@ -639,6 +640,7 @@ extern void term_mstate(kthread_t *); extern void estimate_msacct(kthread_t *, hrtime_t); extern void disable_msacct(proc_t *); extern hrtime_t mstate_aggr_state(proc_t *, int); +extern hrtime_t mstate_thread_onproc_time(kthread_t *); extern void syscall_mstate(int, int); extern uint_t cpu_update_pct(kthread_t *, hrtime_t); diff --git a/usr/src/uts/common/sys/project.h b/usr/src/uts/common/sys/project.h index 68aaef7730..15a0bf2193 100644 --- a/usr/src/uts/common/sys/project.h +++ b/usr/src/uts/common/sys/project.h @@ -39,6 +39,7 @@ extern "C" { #include <sys/mutex.h> #include <sys/rctl.h> #include <sys/ipc_rctl.h> +#include <sys/zone.h> typedef struct kproject_kstat { kstat_named_t kpk_zonename; @@ -58,25 +59,29 @@ typedef struct kproject_data { /* Datum protected by: */ } kproject_data_t; +struct cpucap; + /* * The first two fields of this structure must not be reordered. */ typedef struct kproject { projid_t kpj_id; /* project ID */ zoneid_t kpj_zoneid; /* zone ID */ + struct zone *kpj_zone; /* zone pointer */ uint_t kpj_count; /* reference counter */ uint32_t kpj_shares; /* number of shares */ rctl_set_t *kpj_rctls; /* resource control set */ struct kproject *kpj_prev; /* previous project */ struct kproject *kpj_next; /* next project */ kproject_data_t kpj_data; /* subsystem-specific data */ - kmutex_t kpj_poolbind; /* synch. with pools */ + kmutex_t kpj_poolbind; /* synchronization with pools */ rctl_qty_t kpj_nlwps; /* protected by project's zone's */ /* zone_nlwps_lock */ rctl_qty_t kpj_nlwps_ctl; /* protected by kpj_rctls->rcs_lock */ rctl_qty_t kpj_ntasks; /* protected by project's zone's */ /* zone_nlwps_lock */ rctl_qty_t kpj_ntasks_ctl; /* protected by kpj_rctls->rcs_lock */ + struct cpucap *kpj_cpucap; /* CPU cap data */ } kproject_t; #ifdef _KERNEL @@ -87,8 +92,6 @@ typedef struct kproject { #define PROJECT_HOLD_FIND 1 #define PROJECT_HOLD_INSERT 2 -struct zone; - void project_init(void); kproject_t *project_hold(kproject_t *); kproject_t *project_hold_by_id(projid_t, struct zone *, int); diff --git a/usr/src/uts/common/sys/schedctl.h b/usr/src/uts/common/sys/schedctl.h index c6546e607e..165ff3f171 100644 --- a/usr/src/uts/common/sys/schedctl.h +++ b/usr/src/uts/common/sys/schedctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 1997-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -81,6 +80,7 @@ typedef struct sc_shared { #define SC_RUN 0x02 #define SC_ONPROC 0x04 #define SC_STOPPED 0x10 +#define SC_WAIT 0x20 /* preemption control settings */ #define SC_MAX_TICKS 2 /* max time preemption can be blocked */ diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index c0ee411715..acfebbfb88 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,6 +57,7 @@ extern "C" { #define TS_ONPROC 0x04 /* Thread is being run on a processor */ #define TS_ZOMB 0x08 /* Thread has died but hasn't been reaped */ #define TS_STOPPED 0x10 /* Stopped, initial state */ +#define TS_WAIT 0x20 /* Waiting to become runnable */ typedef struct ctxop { void (*save_op)(void *); /* function to invoke to save context */ @@ -98,6 +99,7 @@ struct trap_info; struct upimutex; struct kproject; struct on_trap_data; +struct waitq; /* Definition for kernel thread identifier type */ typedef uint64_t kt_did_t; @@ -333,6 +335,7 @@ typedef struct _kthread { #endif hrtime_t t_hrtime; /* high-res last time on cpu */ kmutex_t t_ctx_lock; /* protects t_ctx in removectx() */ + struct waitq *t_waitq; /* wait queue */ } kthread_t; /* @@ -391,6 +394,8 @@ typedef struct _kthread { #define TS_SWAPENQ 0x0004 /* swap thread when it reaches a safe point */ #define TS_ON_SWAPQ 0x0008 /* thread is on the swap queue */ #define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */ +#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */ +#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */ #define TS_CSTART 0x0100 /* setrun() by continuelwps() */ #define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */ #define TS_XSTART 0x0400 /* setrun() by SIGCONT */ @@ -400,6 +405,7 @@ typedef struct _kthread { #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ #define TS_ALLSTART \ (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) +#define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* * No locking needed for AST field. @@ -411,6 +417,13 @@ typedef struct _kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is asleep and wakeable */ +#define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ + ((t)->t_flag & T_WAKEABLE))) + +/* True if thread is on the wait queue */ +#define ISWAITING(t) ((t)->t_state == TS_WAIT) + /* similar to ISTOPPED except the event of interest is CPR */ #define CPR_ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_RESUME)) @@ -465,6 +478,9 @@ typedef struct _kthread { * ttoproj(x) * convert a thread pointer to its project pointer. * + * ttozone(x) + * convert a thread pointer to its zone pointer. + * * lwptot(x) * convert a lwp pointer to its thread pointer. 
* @@ -476,6 +492,7 @@ typedef struct _kthread { #define ttolwp(x) ((x)->t_lwp) #define ttoproc(x) ((x)->t_procp) #define ttoproj(x) ((x)->t_proj) +#define ttozone(x) ((x)->t_procp->p_zone) #define lwptot(x) ((x)->lwp_thread) #define lwptoproc(x) ((x)->lwp_procp) @@ -488,6 +505,7 @@ extern kthread_t *threadp(void); /* inline, returns thread pointer */ #define curthread (threadp()) /* current thread pointer */ #define curproc (ttoproc(curthread)) /* current process pointer */ #define curproj (ttoproj(curthread)) /* current project pointer */ +#define curzone (curproc->p_zone) /* current zone pointer */ extern struct _kthread t0; /* the scheduler thread */ extern kmutex_t pidlock; /* global process lock */ @@ -583,6 +601,12 @@ caddr_t thread_stk_init(caddr_t); /* init thread stack */ #define THREAD_RUN(tp, lp) THREAD_SET_STATE(tp, TS_RUN, lp) /* + * Put thread in wait state, and set the lock pointer to the wait queue + * lock pointer provided. This lock should be held. + */ +#define THREAD_WAIT(tp, lp) THREAD_SET_STATE(tp, TS_WAIT, lp) + +/* * Put thread in run state, and set the lock pointer to the dispatcher queue * lock pointer provided (i.e., the "swapped_lock"). This lock should be held. */ @@ -620,7 +644,6 @@ caddr_t thread_stk_init(caddr_t); /* init thread stack */ #define THREAD_FREEINTR(tp, cpu) \ THREAD_SET_STATE(tp, TS_FREE, &(cpu)->cpu_thread_lock) - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/ts.h b/usr/src/uts/common/sys/ts.h index 982c594977..971bbaded9 100644 --- a/usr/src/uts/common/sys/ts.h +++ b/usr/src/uts/common/sys/ts.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -36,6 +35,7 @@ #include <sys/types.h> #include <sys/thread.h> +#include <sys/cpucaps.h> #ifdef __cplusplus extern "C" { #endif @@ -57,36 +57,38 @@ typedef struct tsdpent { /* ts_maxwait */ } tsdpent_t; - +#ifdef _KERNEL /* * time-sharing class specific thread structure */ typedef struct tsproc { - int ts_timeleft; /* time remaining in procs quantum */ - uint_t ts_dispwait; /* wall clock seconds since start */ - /* of quantum (not reset upon preemption */ pri_t ts_cpupri; /* system controlled component of ts_umdpri */ - pri_t ts_uprilim; /* user priority limit */ - pri_t ts_upri; /* user priority */ - pri_t ts_umdpri; /* user mode priority within ts class */ - pri_t ts_scpri; /* remembered priority, for schedctl */ - char ts_nice; /* nice value for compatibility */ - char ts_boost; /* interactive priority offset */ - uchar_t ts_flags; /* flags defined below */ - kthread_t *ts_tp; /* pointer to thread */ - struct tsproc *ts_next; /* link to next tsproc on list */ - struct tsproc *ts_prev; /* link to previous tsproc on list */ + int ts_timeleft; /* time remaining in proc's quantum */ + uint_t ts_dispwait; /* wall clock seconds since start */ + /* of quantum (not reset upon preempt) */ + pri_t ts_uprilim; /* user priority limit */ + pri_t ts_upri; /* user priority */ + pri_t ts_umdpri; /* user mode priority within ts class */ + pri_t ts_scpri; /* remembered priority, for schedctl */ + char ts_nice; /* nice value for compatibility */ + char ts_boost; /* interactive priority offset */ + uchar_t ts_flags; /* flags defined below */ + kthread_t *ts_tp; /* pointer to thread */ + struct tsproc *ts_next; /* link to next tsproc on list */ + struct tsproc *ts_prev; /* link to previous tsproc on list */ + caps_sc_t ts_caps; /* CPU caps specific data */ } tsproc_t; - /* flags */ -#define TSKPRI 0x01 /* thread at kernel mode priority */ -#define TSBACKQ 0x02 /* thread goes to back of disp q when preempted */ -#define TSIA 0x04 /* thread is interactive */ -#define TSIASET 0x08 /* interactive thread is "on" */ -#define TSIANICED 0x10 /* interactive thread has been niced */ +#define TSKPRI 0x01 /* thread at kernel mode priority */ +#define TSBACKQ 0x02 /* thread goes to back of dispq if preempted */ +#define TSIA 0x04 /* thread is interactive */ +#define TSIASET 0x08 /* interactive thread is "on" */ +#define TSIANICED 0x10 /* interactive thread has been niced */ #define TSRESTORE 0x20 /* thread was not preempted, due to schedctl */ - /* restore priority from ts_scpri */ + /* restore priority from ts_scpri */ + +#endif /* _KERNEL */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/waitq.h b/usr/src/uts/common/sys/waitq.h new file mode 100644 index 0000000000..3b925884d0 --- /dev/null +++ b/usr/src/uts/common/sys/waitq.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_WAITQ_H +#define _SYS_WAITQ_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus extern "C" { #endif + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/machlock.h> +#include <sys/thread.h> + +typedef struct waitq { + disp_lock_t wq_lock; /* protects all fields */ + kthread_t *wq_first; /* first thread on the queue */ + int wq_count; /* number of threads on the queue */ + boolean_t wq_blocked; /* True if threads can't be enqueued */ +} waitq_t; + +extern void waitq_init(waitq_t *); +extern void waitq_fini(waitq_t *); + +/* + * Place a thread on the wait queue. An attempt to enqueue a thread onto a + * blocked queue fails and returns zero; a successful enqueue returns a + * non-zero value. + */ +extern int waitq_enqueue(waitq_t *, kthread_t *); + +/* + * Take a thread off its wait queue and make it runnable. + */ +extern void waitq_setrun(kthread_t *t); + +/* + * Change the priority of a thread on a wait queue. + */ +extern void waitq_change_pri(kthread_t *, pri_t); + +/* + * Take the first thread off the wait queue and make it runnable. + */ +extern void waitq_runone(waitq_t *); + +/* + * Return True if there are no threads on the queue. + */ +extern boolean_t waitq_isempty(waitq_t *); + +/* + * Prevent and allow placing new threads on the wait queue. + */ +extern void waitq_block(waitq_t *); +extern void waitq_unblock(waitq_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus } #endif + +#endif /* _SYS_WAITQ_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 9983e8ec85..615228921a 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -305,6 +305,8 @@ typedef struct zone_kstat { kstat_named_t zk_value; } zone_kstat_t; +struct cpucap; + typedef struct zone { /* * zone_name is never modified once set. @@ -416,6 +418,7 @@ typedef struct zone { */ struct dlnamelist *zone_dl_list; netstack_t *zone_netstack; + struct cpucap *zone_cpucap; /* CPU caps data */ } zone_t; /* diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index b6c7621409..38451ba8ad 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -1408,7 +1408,7 @@ out: /* We can't get here from a system trap */ CL_TRAPRET(ct); thread_unlock(ct); } - if (CPU->cpu_runrun) + if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ) preempt(); (void) new_mstate(ct, mstate); diff --git a/usr/src/uts/intel/ia32/os/syscall.c b/usr/src/uts/intel/ia32/os/syscall.c index a78fbc62a6..f40ab4a175 100644 --- a/usr/src/uts/intel/ia32/os/syscall.c +++ b/usr/src/uts/intel/ia32/os/syscall.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -761,7 +761,7 @@ sig_check: CL_TRAPRET(t); thread_unlock(t); } - if (CPU->cpu_runrun) + if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ) preempt(); lwp->lwp_errno = 0; /* clear error for next time */ diff --git a/usr/src/uts/sparc/os/syscall.c b/usr/src/uts/sparc/os/syscall.c index 3d11c7f88a..51db208f41 100644 --- a/usr/src/uts/sparc/os/syscall.c +++ b/usr/src/uts/sparc/os/syscall.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -857,7 +857,7 @@ sig_check: CL_TRAPRET(t); thread_unlock(t); } - if (CPU->cpu_runrun) + if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ) preempt(); /* diff --git a/usr/src/uts/sun4/os/trap.c b/usr/src/uts/sun4/os/trap.c index 797d3ff839..493696f046 100644 --- a/usr/src/uts/sun4/os/trap.c +++ b/usr/src/uts/sun4/os/trap.c @@ -1564,7 +1564,7 @@ trap_rtt(void) CL_TRAPRET(curthread); thread_unlock(curthread); } - if (CPU->cpu_runrun) + if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ) preempt(); if (lwp->lwp_pcb.pcb_step != STEP_NONE) prdostep(); diff --git a/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c b/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c index fb61524dfb..3ef5e2a167 100644 --- a/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c +++ b/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -566,8 +566,7 @@ dr_stop_user_threads(dr_sr_handle_t *srh) aston(tp); - if (tp->t_state == TS_SLEEP && - (tp->t_flag & T_WAKEABLE)) { + if (ISWAKEABLE(tp) || ISWAITING(tp)) { setrun_locked(tp); } diff --git a/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c b/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c index ab078974e3..7b42c3e905 100644 --- a/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c +++ b/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -541,11 +541,9 @@ sbdp_stop_user_threads(sbdp_sr_handle_t *srh) aston(tp); - if (tp->t_state == TS_SLEEP && - (tp->t_flag & T_WAKEABLE)) { + if (ISWAKEABLE(tp) || ISWAITING(tp)) { setrun_locked(tp); } - } /* grab thread if needed */ diff --git a/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c b/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c index 93d229fb3e..ff72a5e344 100644 --- a/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c +++ b/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -372,8 +372,7 @@ sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt) aston(tp); - if (tp->t_state == TS_SLEEP && - (tp->t_flag & T_WAKEABLE)) { + if (ISWAKEABLE(tp) || ISWAITING(tp)) { setrun_locked(tp); }
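Taken together, the pieces above form a single enforcement loop: clock() drives the cpucaps callout, the scheduling classes charge their threads from their per-tick hooks, and an over-cap thread is diverted to its cap's wait queue (the new TS_WAIT/SWAIT state) the next time it passes through preempt(). The sketch below is not part of the changeset; it only illustrates how a hypothetical scheduling class would consume the cpucaps interfaces declared above. The names my_clproc_t, my_caps, my_init, my_tick and my_preempt are invented, but the charge-then-enforce pattern mirrors the one the fss, ts and fx hooks in this commit follow.

/*
 * Sketch only: minimal scheduling-class integration with CPU caps.
 * Everything prefixed with my_ is hypothetical.
 */
#include <sys/cpucaps.h>
#include <sys/thread.h>
#include <sys/disp.h>

typedef struct my_clproc {
	kthread_t	*my_tp;		/* back pointer to our thread */
	caps_sc_t	my_caps;	/* per-thread CPU caps data */
} my_clproc_t;

/* Called when a thread enters the class. */
static void
my_init(my_clproc_t *cp, kthread_t *t)
{
	cp->my_tp = t;
	cpucaps_sc_init(&cp->my_caps);	/* reset accumulated CPU time */
}

/* Clock-tick hook, called with the thread locked. */
static void
my_tick(kthread_t *t)
{
	my_clproc_t *cp = (my_clproc_t *)t->t_cldata;

	/*
	 * Charge the thread for the CPU time it has accumulated.  With
	 * CPUCAPS_CHARGE_ENFORCE the thread is also flagged
	 * (TS_PROJWAITQ/TS_ZONEWAITQ) when its project or zone is over
	 * its cap, so the trap/syscall return paths above, which now
	 * test TS_ANYWAITQ as well as cpu_runrun, will call preempt().
	 */
	(void) CPUCAPS_CHARGE(t, &cp->my_caps, CPUCAPS_CHARGE_ENFORCE);
}

/* Preemption hook. */
static void
my_preempt(kthread_t *t)
{
	/*
	 * For a flagged thread, CPUCAPS_ENFORCE() is expected to move it
	 * onto its cap's wait queue (TS_WAIT) instead of a dispatch
	 * queue; for an unflagged thread it returns B_FALSE and the
	 * class queues the thread for dispatch as usual.
	 */
	if (!CPUCAPS_ENFORCE(t))
		setbackdq(t);
}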