summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2012-01-02 23:14:09 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2012-01-02 23:14:09 +0000
commite469c0531a4ed9f2ebea5a1bb397b3cf6b48a361 (patch)
tree825b0a1de6db63e1b9bb42a7a5b2fe16dc0f4de1 /usr/src
parent12ee55d75bc01e9832f994ea28daf6d428cdcff6 (diff)
downloadillumos-joyent-e469c0531a4ed9f2ebea5a1bb397b3cf6b48a361.tar.gz
OS-164 Need better process scheduling for true bursting
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/lib/brand/joyent/zone/statechange.ksh8
-rw-r--r--usr/src/uts/common/disp/cpucaps.c158
-rw-r--r--usr/src/uts/common/os/zone.c48
-rw-r--r--usr/src/uts/common/sys/cpucaps.h4
-rw-r--r--usr/src/uts/common/sys/cpucaps_impl.h5
5 files changed, 202 insertions, 21 deletions
diff --git a/usr/src/lib/brand/joyent/zone/statechange.ksh b/usr/src/lib/brand/joyent/zone/statechange.ksh
index 242a0a8181..70be7819b4 100644
--- a/usr/src/lib/brand/joyent/zone/statechange.ksh
+++ b/usr/src/lib/brand/joyent/zone/statechange.ksh
@@ -19,7 +19,7 @@
#
# CDDL HEADER END
#
-# Copyright 2010, 2011 Joyent, Inc. All rights reserved.
+# Copyright 2010, 2012 Joyent, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -406,6 +406,12 @@ setup_snapshots()
#
setup_cpu_baseline()
{
+ # If there is already a baseline, don't set one heuristically
+ curr_base=`prctl -P -n zone.cpu-baseline -i zone $ZONENAME | nawk '{
+ if ($2 == "privileged") print $3
+ }'`
+ [ -n "$curr_base" ] && return
+
# Get current cap and convert from zonecfg format into rctl format
cap=`zonecfg -z $ZONENAME info capped-cpu | nawk '{
if ($1 == "[ncpus:") print (substr($2, 1, length($2) - 1) * 100)
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 58b1860c9c..26067235a9 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2011, 2012 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -83,7 +83,23 @@
* environment so that we know how much CPU is allocated for a tenant under
* normal utilization. We can then track how much time a zone is spending
* over the "normal" CPU utilization expected for that zone using the
- * "above_base_sec" kstat.
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
*
* Accounting
* ==========
@@ -215,6 +231,9 @@ static void caps_update();
struct cap_kstat {
kstat_named_t cap_value;
kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
@@ -225,11 +244,14 @@ struct cap_kstat {
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
{ "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
- { "above_base_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -326,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -360,7 +382,7 @@ cap_disable(list_t *l, cpucap_t *cap)
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
if (cap->cap_kstat != NULL) {
@@ -502,6 +524,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -509,10 +533,45 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_base != 0 && cap->cap_usage > cap->cap_base)
- cap->cap_above_base++;
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, its
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were, decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
@@ -647,14 +706,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -764,7 +823,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -824,6 +883,59 @@ cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
}
/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -969,7 +1081,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -977,7 +1089,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -1025,6 +1137,16 @@ cpucaps_zone_get_base(zone_t *zone)
}
/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1122,7 +1244,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1136,7 +1258,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1212,6 +1334,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
ROUND_SCALE(cap->cap_value, cap_tick_cost);
capsp->cap_baseline.value.ui64 =
ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1221,6 +1347,8 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
capsp->cap_above_base.value.ui64 =
ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index a7100e28a0..3ea0d0fe95 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved.
*/
/*
@@ -373,6 +373,8 @@ rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
@@ -1417,6 +1419,41 @@ static rctl_ops_t zone_cpu_base_ops = {
rcop_no_test
};
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time CPU(s) can be
+ * bursting for the zone.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
/*
* zone.zfs-io-pri resource control support (IO priority).
*/
@@ -2463,11 +2500,16 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
- rc_zone_cpu_cap = rctl_register("zone.cpu-baseline",
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
- RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER,
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
MAXCAP, MAXCAP, &zone_cpu_base_ops);
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h
index 9ead38d480..6bc042108c 100644
--- a/usr/src/uts/common/sys/cpucaps.h
+++ b/usr/src/uts/common/sys/cpucaps.h
@@ -22,7 +22,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_H
@@ -86,6 +86,7 @@ extern void cpucaps_zone_remove(zone_t *);
extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t);
/*
* Get current CPU usage for a project/zone.
@@ -93,6 +94,7 @@ extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t);
extern rctl_qty_t cpucaps_project_get(kproject_t *);
extern rctl_qty_t cpucaps_zone_get(zone_t *);
extern rctl_qty_t cpucaps_zone_get_base(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);
/*
* Scheduling class hooks into CPU caps framework.
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
index 7011e5ca6e..2cd4ed644d 100644
--- a/usr/src/uts/common/sys/cpucaps_impl.h
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_IMPL_H
@@ -67,9 +67,12 @@ typedef struct cpucap {
waitq_t cap_waitq; /* waitq for capped threads */
kstat_t *cap_kstat; /* cpucaps specific kstat */
int64_t cap_gen; /* zone cap specific */
+ hrtime_t cap_chk_value; /* effective CPU usage cap */
hrtime_t cap_value; /* scaled CPU usage cap */
hrtime_t cap_usage; /* current CPU usage */
hrtime_t cap_base; /* base CPU for burst */
+ u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */
+ u_longlong_t cap_bursting; /* # of ticks currently bursting */
disp_lock_t cap_usagelock; /* protects cap_usage above */
/*
* Per cap statistics.