diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2012-01-02 23:14:09 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2012-01-02 23:14:09 +0000 |
commit | e469c0531a4ed9f2ebea5a1bb397b3cf6b48a361 (patch) | |
tree | 825b0a1de6db63e1b9bb42a7a5b2fe16dc0f4de1 /usr/src | |
parent | 12ee55d75bc01e9832f994ea28daf6d428cdcff6 (diff) | |
download | illumos-joyent-e469c0531a4ed9f2ebea5a1bb397b3cf6b48a361.tar.gz |
OS-164 Need better process scheduling for true bursting
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/lib/brand/joyent/zone/statechange.ksh | 8 | ||||
-rw-r--r-- | usr/src/uts/common/disp/cpucaps.c | 158 | ||||
-rw-r--r-- | usr/src/uts/common/os/zone.c | 48 | ||||
-rw-r--r-- | usr/src/uts/common/sys/cpucaps.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/sys/cpucaps_impl.h | 5 |
5 files changed, 202 insertions, 21 deletions
diff --git a/usr/src/lib/brand/joyent/zone/statechange.ksh b/usr/src/lib/brand/joyent/zone/statechange.ksh index 242a0a8181..70be7819b4 100644 --- a/usr/src/lib/brand/joyent/zone/statechange.ksh +++ b/usr/src/lib/brand/joyent/zone/statechange.ksh @@ -19,7 +19,7 @@ # # CDDL HEADER END # -# Copyright 2010, 2011 Joyent, Inc. All rights reserved. +# Copyright 2010, 2012 Joyent, Inc. All rights reserved. # Use is subject to license terms. # @@ -406,6 +406,12 @@ setup_snapshots() # setup_cpu_baseline() { + # If there is already a baseline, don't set one heuristically + curr_base=`prctl -P -n zone.cpu-baseline -i zone $ZONENAME | nawk '{ + if ($2 == "privileged") print $3 + }'` + [ -n "$curr_base" ] && return + # Get current cap and convert from zonecfg format into rctl format cap=`zonecfg -z $ZONENAME info capped-cpu | nawk '{ if ($1 == "[ncpus:") print (substr($2, 1, length($2) - 1) * 100) diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 58b1860c9c..26067235a9 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2011, 2012 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -83,7 +83,23 @@ * environment so that we know how much CPU is allocated for a tenant under * normal utilization. We can then track how much time a zone is spending * over the "normal" CPU utilization expected for that zone using the - * "above_base_sec" kstat. + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. * * Accounting * ========== @@ -215,6 +231,9 @@ static void caps_update(); struct cap_kstat { kstat_named_t cap_value; kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; @@ -225,11 +244,14 @@ struct cap_kstat { } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, - { "above_base_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -326,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -360,7 +382,7 @@ cap_disable(list_t *l, cpucap_t *cap) cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; if (cap->cap_kstat != NULL) { @@ -502,6 +524,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -509,10 +533,45 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_base != 0 && cap->cap_usage > cap->cap_base) - cap->cap_above_base++; + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, its + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. + */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; @@ -647,14 +706,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -764,7 +823,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -824,6 +883,59 @@ cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) } /* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. + */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -969,7 +1081,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -977,7 +1089,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -1025,6 +1137,16 @@ cpucaps_zone_get_base(zone_t *zone) } /* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1122,7 +1244,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1136,7 +1258,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1212,6 +1334,10 @@ cap_kstat_update(kstat_t *ksp, int rw) ROUND_SCALE(cap->cap_value, cap_tick_cost); capsp->cap_baseline.value.ui64 = ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1221,6 +1347,8 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); capsp->cap_above_base.value.ui64 = ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index a7100e28a0..3ea0d0fe95 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved. */ /* @@ -373,6 +373,8 @@ rctl_hndl_t rc_zone_max_swap; rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; @@ -1417,6 +1419,41 @@ static rctl_ops_t zone_cpu_base_ops = { rcop_no_test }; +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + /* * zone.zfs-io-pri resource control support (IO priority). */ @@ -2463,11 +2500,16 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); - rc_zone_cpu_cap = rctl_register("zone.cpu-baseline", + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | - RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER, + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, MAXCAP, MAXCAP, &zone_cpu_base_ops); + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 9ead38d480..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,7 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -86,6 +86,7 @@ extern void cpucaps_zone_remove(zone_t *); extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. @@ -93,6 +94,7 @@ extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 7011e5ca6e..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_IMPL_H @@ -67,9 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. |