summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/disp/cpucaps.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/disp/cpucaps.c')
-rw-r--r--usr/src/uts/common/disp/cpucaps.c285
1 files changed, 268 insertions, 17 deletions
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..2a4365ff73 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -74,6 +75,32 @@
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization. We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
* Accounting
* ==========
*
@@ -203,18 +230,28 @@ static void caps_update();
*/
struct cap_kstat {
kstat_named_t cap_value;
+ kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
kstat_named_t cap_above;
+ kstat_named_t cap_above_base;
kstat_named_t cap_maxusage;
kstat_named_t cap_zonename;
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
+ { "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap)
ASSERT(CAP_ENABLED(cap));
waitq_block(&cap->cap_waitq);
+
+ /* do this first to avoid race with cap_kstat_update */
+ if (cap->cap_kstat != NULL) {
+ kstat_delete(cap->cap_kstat);
+ cap->cap_kstat = NULL;
+ }
+
list_remove(l, cap);
if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
- if (cap->cap_kstat != NULL) {
- kstat_delete(cap->cap_kstat);
- cap->cap_kstat = NULL;
- }
-
}
/*
@@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, its
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were, decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
+
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
cap->cap_below++;
- if (!waitq_isempty(wq))
- waitq_runone(wq);
+ if (!waitq_isempty(wq)) {
+ int i, ndequeue, p;
+
+ /*
+ * Since this function is only called once per tick,
+ * we can hit a situation where we have artificially
+ * limited the project/zone below its cap. This would
+ * happen if we have multiple threads queued up but
+ * only dequeued one thread/tick. To avoid this we
+ * dequeue multiple threads, calculated based on the
+ * usage percentage of the cap. It is possible that we
+ * could dequeue too many threads and some of them
+ * might be put back on the wait queue quickly, but
+ * since we know that threads are on the wait queue
+ * because we're capping, we know that there is unused
+ * CPU cycles anyway, so this extra work would not
+ * hurt. Also, the ndequeue number is only an upper
+ * bound and we might dequeue less, depending on how
+ * many threads are actually in the wait queue. The
+ * ndequeue values are empirically derived and could be
+ * adjusted or calculated in another way if necessary.
+ */
+ p = (int)((100 * cap->cap_usage) / cap->cap_chk_value);
+ if (p >= 98)
+ ndequeue = 10;
+ else if (p >= 95)
+ ndequeue = 20;
+ else if (p >= 90)
+ ndequeue = 40;
+ else if (p >= 85)
+ ndequeue = 80;
+ else
+ ndequeue = 160;
+
+ for (i = 0; i < ndequeue; i++) {
+ waitq_runone(wq);
+ if (waitq_isempty(wq))
+ break;
+ }
+ DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i);
+ }
}
}
@@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
}
/*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= MAXCAP);
+ if (base_val > MAXCAP)
+ base_val = MAXCAP;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = base_val * cap_tick_cost;
+ if (value < 0 || value > cap->cap_value)
+ value = 0;
+
+ cap->cap_base = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone)
}
/*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t)
/*
* Convert internal cap statistics into values exported by cap kstat.
+ * Note that the kstat is held throughout this function but caps_lock is not.
*/
static int
cap_kstat_update(kstat_t *ksp, int rw)
@@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_value.value.ui64 =
ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_baseline.value.ui64 =
+ ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ capsp->cap_above_base.value.ui64 =
+ ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);