Diffstat (limited to 'usr/src/uts/common/disp')
-rw-r--r--  usr/src/uts/common/disp/cmt.c            8
-rw-r--r--  usr/src/uts/common/disp/cpucaps.c      285
-rw-r--r--  usr/src/uts/common/disp/disp.c          22
-rw-r--r--  usr/src/uts/common/disp/fss.c          241
-rw-r--r--  usr/src/uts/common/disp/fx.c            12
-rw-r--r--  usr/src/uts/common/disp/priocntl.c       4
-rw-r--r--  usr/src/uts/common/disp/rt.c             9
-rw-r--r--  usr/src/uts/common/disp/rt_dptbl.c       4
-rw-r--r--  usr/src/uts/common/disp/sysdc.c         26
-rw-r--r--  usr/src/uts/common/disp/thread.c       312
-rw-r--r--  usr/src/uts/common/disp/thread_intr.c   37
-rw-r--r--  usr/src/uts/common/disp/ts.c           295
12 files changed, 732 insertions(+), 523 deletions(-)
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 0196b15dae..80b5340543 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp)
/*
* Return non-zero if thread can migrate between "from" and "to"
- * without a performance penalty
+ * without a performance penalty. On virtually any CPU this is true only if
+ * the two CPUs share a core; sharing only the last-level cache is
+ * insufficient to make migration possible without penalty.
*/
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
- if (from->cpu_physid->cpu_cacheid ==
- to->cpu_physid->cpu_cacheid)
+ if (from->cpu_physid->cpu_coreid ==
+ to->cpu_physid->cpu_coreid)
return (1);
return (0);
}
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..2a4365ff73 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -74,6 +75,32 @@
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization. We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
* Accounting
* ==========
*
@@ -203,18 +230,28 @@ static void caps_update();
*/
struct cap_kstat {
kstat_named_t cap_value;
+ kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
kstat_named_t cap_above;
+ kstat_named_t cap_above_base;
kstat_named_t cap_maxusage;
kstat_named_t cap_zonename;
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
+ { "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap)
ASSERT(CAP_ENABLED(cap));
waitq_block(&cap->cap_waitq);
+
+ /* do this first to avoid race with cap_kstat_update */
+ if (cap->cap_kstat != NULL) {
+ kstat_delete(cap->cap_kstat);
+ cap->cap_kstat = NULL;
+ }
+
list_remove(l, cap);
if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
- if (cap->cap_kstat != NULL) {
- kstat_delete(cap->cap_kstat);
- cap->cap_kstat = NULL;
- }
-
}
/*
@@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, it's
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were, decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
+
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
cap->cap_below++;
- if (!waitq_isempty(wq))
- waitq_runone(wq);
+ if (!waitq_isempty(wq)) {
+ int i, ndequeue, p;
+
+ /*
+ * Since this function is only called once per tick,
+ * we can hit a situation where we have artificially
+ * limited the project/zone below its cap. This would
+ * happen if we have multiple threads queued up but
+ * only dequeued one thread/tick. To avoid this we
+ * dequeue multiple threads, calculated based on the
+ * usage percentage of the cap. It is possible that we
+ * could dequeue too many threads and some of them
+ * might be put back on the wait queue quickly, but
+ * since we know that threads are on the wait queue
+ * because we're capping, we know that there are unused
+ * CPU cycles anyway, so this extra work would not
+ * hurt. Also, the ndequeue number is only an upper
+ * bound and we might dequeue fewer, depending on how
+ * many threads are actually in the wait queue. The
+ * ndequeue values are empirically derived and could be
+ * adjusted or calculated in another way if necessary.
+ */
+ p = (int)((100 * cap->cap_usage) / cap->cap_chk_value);
+ if (p >= 98)
+ ndequeue = 10;
+ else if (p >= 95)
+ ndequeue = 20;
+ else if (p >= 90)
+ ndequeue = 40;
+ else if (p >= 85)
+ ndequeue = 80;
+ else
+ ndequeue = 160;
+
+ for (i = 0; i < ndequeue; i++) {
+ waitq_runone(wq);
+ if (waitq_isempty(wq))
+ break;
+ }
+ DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i);
+ }
}
}
@@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
}
/*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= MAXCAP);
+ if (base_val > MAXCAP)
+ base_val = MAXCAP;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = base_val * cap_tick_cost;
+ if (value < 0 || value > cap->cap_value)
+ value = 0;
+
+ cap->cap_base = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone)
}
/*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t)
/*
* Convert internal cap statistics into values exported by cap kstat.
+ * Note that the kstat is held throughout this function but caps_lock is not.
*/
static int
cap_kstat_update(kstat_t *ksp, int rw)
@@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_value.value.ui64 =
ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_baseline.value.ui64 =
+ ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ capsp->cap_above_base.value.ui64 =
+ ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);
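
The block comment earlier in this file describes how the new effective/bursting kstats change over time. As a brief illustration (not part of the patch), the following user-space sketch polls them via libkstat; the "caps" module and "cpucaps_zone_<zoneid>" kstat names are assumptions based on how the existing cpucaps kstats are published, and the statistic names are taken from the cap_kstat table in the hunk above.

/*
 * Illustrative user-space sketch (not part of the patch): poll the new
 * bursting kstats once a second via libkstat.  The module name "caps" and
 * the per-zone kstat name "cpucaps_zone_<zoneid>" are assumptions based on
 * existing cpucaps kstat naming; the statistic names come from the
 * cap_kstat table above.  Build with: cc -o capwatch capwatch.c -lkstat
 */
#include <kstat.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	kstat_ctl_t *kc;
	char name[KSTAT_STRLEN];

	if (argc < 2) {
		(void) fprintf(stderr, "usage: %s <zoneid>\n", argv[0]);
		return (1);
	}
	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	(void) snprintf(name, sizeof (name), "cpucaps_zone_%s", argv[1]);

	for (;;) {
		kstat_t *ksp;
		kstat_named_t *eff, *burst, *above;

		if ((ksp = kstat_lookup(kc, "caps", -1, name)) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) fprintf(stderr, "kstat %s not found\n", name);
			break;
		}
		eff = kstat_data_lookup(ksp, "effective");
		burst = kstat_data_lookup(ksp, "bursting_sec");
		above = kstat_data_lookup(ksp, "above_base_sec");
		if (eff != NULL && burst != NULL && above != NULL) {
			(void) printf("effective=%llu bursting_sec=%llu "
			    "above_base_sec=%llu\n",
			    (u_longlong_t)eff->value.ui64,
			    (u_longlong_t)burst->value.ui64,
			    (u_longlong_t)above->value.ui64);
		}
		(void) sleep(1);
	}
	(void) kstat_close(kc);
	return (0);
}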
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index a4b49fa86d..7e933bccc4 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -110,7 +110,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri);
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
- * will either be the max sysclass pri + 1 or the min interrupt pri.
+ * will either be the max sysclass pri or the min interrupt pri.
*/
int only_intr_kpreempt;
@@ -257,7 +257,23 @@ dispinit(void)
maxglobpri = cl_maxglobpri;
}
}
- kpreemptpri = (pri_t)v.v_maxsyspri + 1;
+
+ /*
+ * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
+ * to say, maxclsyspri + 1. However, over time, the system has used
+ * more and more asynchronous kernel threads, with an increasing number
+ * of these doing work on direct behalf of higher-level software (e.g.,
+ * network processing). This has led to potential priority inversions:
+ * threads doing low-priority lengthy kernel work can effectively
+ * delay kernel-level processing of higher-priority data. To minimize
+ * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
+ * the kernel that runs at maxclsyspri will therefore induce kernel
+ * preemption, and this priority should be used if/when an asynchronous
+ * thread (or, as is often the case, task queue) is performing a task
+ * on behalf of higher-level software (or any task that is otherwise
+ * latency-sensitive).
+ */
+ kpreemptpri = (pri_t)v.v_maxsyspri;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
@@ -2259,7 +2275,7 @@ disp_getbest(disp_t *dp)
* placed earlier.
*/
if (tcp == NULL ||
- pri >= minclsyspri ||
+ (pri >= minclsyspri && tp->t_procp == &p0) ||
tp->t_cpu != tcp)
break;
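
The dispinit() comment above recommends that asynchronous kernel work done on behalf of higher-level software run at maxclsyspri so that, with kpreemptpri now equal to v_maxsyspri, it induces kernel preemption. Below is a minimal, hypothetical sketch (not part of the patch) of what that looks like for a consumer using a task queue; the taskq name and handler are invented for illustration.

/*
 * Hypothetical sketch (not part of the patch): a kernel consumer that wants
 * its asynchronous work to benefit from the new kpreemptpri setting creates
 * its worker taskq at maxclsyspri.  The taskq name and handler here are
 * invented for illustration.
 */
#include <sys/types.h>
#include <sys/taskq.h>
#include <sys/disp.h>

static taskq_t *example_tq;

static void
example_rx_work(void *arg)
{
	/* ... process a received frame on behalf of higher-level software ... */
}

static void
example_init(void)
{
	/*
	 * Threads in this taskq run at maxclsyspri, which (with kpreemptpri
	 * now equal to v_maxsyspri) means they preempt lower-priority kernel
	 * work rather than queueing behind it.
	 */
	example_tq = taskq_create("example_rx_tq", 1, maxclsyspri, 1, 1, 0);
}

static void
example_dispatch(void *frame)
{
	(void) taskq_dispatch(example_tq, example_rx_work, frame, TQ_NOSLEEP);
}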
diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c
index 15aeda6d00..05f358e6d4 100644
--- a/usr/src/uts/common/disp/fss.c
+++ b/usr/src/uts/common/disp/fss.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -1212,9 +1212,9 @@ fss_decay_usage()
* If there is only one zone active on the pset
* the above reduces to:
*
- * zone_int_shares^2
+ * zone_int_shares^2
* shusage = usage * ---------------------
- * kpj_shares^2
+ * kpj_shares^2
*
* If there's only one project active in the
* zone this formula reduces to:
@@ -1373,8 +1373,6 @@ fss_update_list(int i)
*/
if (t->t_cid != fss_cid)
goto next;
- if ((fssproc->fss_flags & FSSKPRI) != 0)
- goto next;
fssproj = FSSPROC2FSSPROJ(fssproc);
if (fssproj == NULL)
@@ -1889,7 +1887,7 @@ fss_fork(kthread_t *pt, kthread_t *ct, void *bufp)
cpucaps_sc_init(&cfssproc->fss_caps);
cfssproc->fss_flags =
- pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
+ pfssproc->fss_flags & ~(FSSBACKQ | FSSRESTORE);
ct->t_cldata = (void *)cfssproc;
ct->t_schedflag |= TS_RUNQMATCH;
thread_unlock(pt);
@@ -1940,7 +1938,6 @@ fss_forkret(kthread_t *t, kthread_t *ct)
fssproc->fss_timeleft = fss_quantum;
t->t_pri = fssproc->fss_umdpri;
ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- fssproc->fss_flags &= ~FSSKPRI;
THREAD_TRANSITION(t);
/*
@@ -2039,11 +2036,6 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
fssproc->fss_nice = nice;
fss_newpri(fssproc, B_FALSE);
- if ((fssproc->fss_flags & FSSKPRI) != 0) {
- thread_unlock(t);
- return (0);
- }
-
fss_change_priority(t, fssproc);
thread_unlock(t);
return (0);
@@ -2158,7 +2150,7 @@ fss_swapin(kthread_t *t, int flags)
time_t swapout_time;
swapout_time = (ddi_get_lbolt() - t->t_stime) / hz;
- if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI)) {
+ if (INHERITED(t)) {
epri = (long)DISP_PRIO(t) + swapout_time;
} else {
/*
@@ -2190,7 +2182,6 @@ fss_swapin(kthread_t *t, int flags)
static pri_t
fss_swapout(kthread_t *t, int flags)
{
- fssproc_t *fssproc = FSSPROC(t);
long epri = -1;
proc_t *pp = ttoproc(t);
time_t swapin_time;
@@ -2198,7 +2189,6 @@ fss_swapout(kthread_t *t, int flags)
ASSERT(THREAD_LOCK_HELD(t));
if (INHERITED(t) ||
- (fssproc->fss_flags & FSSKPRI) ||
(t->t_proc_flag & TP_LWPEXIT) ||
(t->t_state & (TS_ZOMB|TS_FREE|TS_STOPPED|TS_ONPROC|TS_WAIT)) ||
!(t->t_schedflag & TS_LOAD) ||
@@ -2241,16 +2231,11 @@ fss_swapout(kthread_t *t, int flags)
}
/*
- * If thread is currently at a kernel mode priority (has slept) and is
- * returning to the userland we assign it the appropriate user mode priority
- * and time quantum here. If we're lowering the thread's priority below that
- * of other runnable threads then we will set runrun via cpu_surrender() to
- * cause preemption.
+ * Run swap-out checks when returning to userspace.
*/
static void
fss_trapret(kthread_t *t)
{
- fssproc_t *fssproc = FSSPROC(t);
cpu_t *cp = CPU;
ASSERT(THREAD_LOCK_HELD(t));
@@ -2258,20 +2243,6 @@ fss_trapret(kthread_t *t)
ASSERT(cp->cpu_dispthread == t);
ASSERT(t->t_state == TS_ONPROC);
- t->t_kpri_req = 0;
- if (fssproc->fss_flags & FSSKPRI) {
- /*
- * If thread has blocked in the kernel
- */
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- cp->cpu_dispatch_pri = DISP_PRIO(t);
- ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- fssproc->fss_flags &= ~FSSKPRI;
-
- if (DISP_MUST_SURRENDER(t))
- cpu_surrender(t);
- }
-
/*
* Swapout lwp if the swapper is waiting for this thread to reach
* a safe point.
@@ -2299,19 +2270,6 @@ fss_preempt(kthread_t *t)
ASSERT(t->t_state == TS_ONPROC);
/*
- * If preempted in the kernel, make sure the thread has a kernel
- * priority if needed.
- */
- lwp = curthread->t_lwp;
- if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) {
- fssproc->fss_flags |= FSSKPRI;
- THREAD_CHANGE_PRI(t, minclsyspri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
- }
-
- /*
* This thread may be placed on wait queue by CPU Caps. In this case we
* do not need to do anything until it is removed from the wait queue.
* Do not enforce CPU caps on threads running at a kernel priority
@@ -2320,7 +2278,7 @@ fss_preempt(kthread_t *t)
(void) cpucaps_charge(t, &fssproc->fss_caps,
CPUCAPS_CHARGE_ENFORCE);
- if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
+ if (CPUCAPS_ENFORCE(t))
return;
}
@@ -2329,6 +2287,7 @@ fss_preempt(kthread_t *t)
* cannot be holding any kernel locks.
*/
ASSERT(t->t_schedflag & TS_DONT_SWAP);
+ lwp = ttolwp(t);
if (lwp != NULL && lwp->lwp_state == LWP_USER)
t->t_schedflag &= ~TS_DONT_SWAP;
@@ -2346,18 +2305,16 @@ fss_preempt(kthread_t *t)
if (t->t_schedctl && schedctl_get_nopreempt(t)) {
if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
- if (!(fssproc->fss_flags & FSSKPRI)) {
- /*
- * If not already remembered, remember current
- * priority for restoration in fss_yield().
- */
- if (!(fssproc->fss_flags & FSSRESTORE)) {
- fssproc->fss_scpri = t->t_pri;
- fssproc->fss_flags |= FSSRESTORE;
- }
- THREAD_CHANGE_PRI(t, fss_maxumdpri);
- t->t_schedflag |= TS_DONT_SWAP;
+ /*
+ * If not already remembered, remember current
+ * priority for restoration in fss_yield().
+ */
+ if (!(fssproc->fss_flags & FSSRESTORE)) {
+ fssproc->fss_scpri = t->t_pri;
+ fssproc->fss_flags |= FSSRESTORE;
}
+ THREAD_CHANGE_PRI(t, fss_maxumdpri);
+ t->t_schedflag |= TS_DONT_SWAP;
schedctl_set_yield(t, 1);
setfrontdq(t);
return;
@@ -2374,15 +2331,12 @@ fss_preempt(kthread_t *t)
}
}
- flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI);
+ flags = fssproc->fss_flags & FSSBACKQ;
if (flags == FSSBACKQ) {
fssproc->fss_timeleft = fss_quantum;
fssproc->fss_flags &= ~FSSBACKQ;
setbackdq(t);
- } else if (flags == (FSSBACKQ | FSSKPRI)) {
- fssproc->fss_flags &= ~FSSBACKQ;
- setbackdq(t);
} else {
setfrontdq(t);
}
@@ -2404,12 +2358,7 @@ fss_setrun(kthread_t *t)
fssproc->fss_timeleft = fss_quantum;
fssproc->fss_flags &= ~FSSBACKQ;
- /*
- * If previously were running at the kernel priority then keep that
- * priority and the fss_timeleft doesn't matter.
- */
- if ((fssproc->fss_flags & FSSKPRI) == 0)
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
+ THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
if (t->t_disp_time != ddi_get_lbolt())
setbackdq(t);
@@ -2418,8 +2367,7 @@ fss_setrun(kthread_t *t)
}
/*
- * Prepare thread for sleep. We reset the thread priority so it will run at the
- * kernel priority level when it wakes up.
+ * Prepare thread for sleep.
*/
static void
fss_sleep(kthread_t *t)
@@ -2437,31 +2385,6 @@ fss_sleep(kthread_t *t)
(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);
fss_inactive(t);
-
- /*
- * Assign a system priority to the thread and arrange for it to be
- * retained when the thread is next placed on the run queue (i.e.,
- * when it wakes up) instead of being given a new pri. Also arrange
- * for trapret processing as the thread leaves the system call so it
- * will drop back to normal priority range.
- */
- if (t->t_kpri_req) {
- THREAD_CHANGE_PRI(t, minclsyspri);
- fssproc->fss_flags |= FSSKPRI;
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
- } else if (fssproc->fss_flags & FSSKPRI) {
- /*
- * The thread has done a THREAD_KPRI_REQUEST(), slept, then
- * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again),
- * then slept again all without finishing the current system
- * call so trapret won't have cleared FSSKPRI
- */
- fssproc->fss_flags &= ~FSSKPRI;
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- if (DISP_MUST_SURRENDER(curthread))
- cpu_surrender(t);
- }
t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */
}
@@ -2503,67 +2426,56 @@ fss_tick(kthread_t *t)
* Do not surrender CPU if running in the SYS class.
*/
if (CPUCAPS_ON()) {
- cpucaps_enforce = cpucaps_charge(t,
- &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
- !(fssproc->fss_flags & FSSKPRI);
+ cpucaps_enforce = cpucaps_charge(t, &fssproc->fss_caps,
+ CPUCAPS_CHARGE_ENFORCE);
}
- /*
- * A thread's execution time for threads running in the SYS class
- * is not tracked.
- */
- if ((fssproc->fss_flags & FSSKPRI) == 0) {
+ if (--fssproc->fss_timeleft <= 0) {
+ pri_t new_pri;
+
/*
- * If thread is not in kernel mode, decrement its fss_timeleft
+ * If we're doing preemption control and trying to avoid
+ * preempting this thread, just note that the thread should
+ * yield soon and let it keep running (unless it's been a
+ * while).
*/
- if (--fssproc->fss_timeleft <= 0) {
- pri_t new_pri;
-
- /*
- * If we're doing preemption control and trying to
- * avoid preempting this thread, just note that the
- * thread should yield soon and let it keep running
- * (unless it's been a while).
- */
- if (t->t_schedctl && schedctl_get_nopreempt(t)) {
- if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
- DTRACE_SCHED1(schedctl__nopreempt,
- kthread_t *, t);
- schedctl_set_yield(t, 1);
- thread_unlock_nopreempt(t);
- return;
- }
+ if (t->t_schedctl && schedctl_get_nopreempt(t)) {
+ if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
+ DTRACE_SCHED1(schedctl__nopreempt,
+ kthread_t *, t);
+ schedctl_set_yield(t, 1);
+ thread_unlock_nopreempt(t);
+ return;
}
- fssproc->fss_flags &= ~FSSRESTORE;
+ }
+ fssproc->fss_flags &= ~FSSRESTORE;
- fss_newpri(fssproc, B_TRUE);
- new_pri = fssproc->fss_umdpri;
- ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
+ fss_newpri(fssproc, B_TRUE);
+ new_pri = fssproc->fss_umdpri;
+ ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
- /*
- * When the priority of a thread is changed, it may
- * be necessary to adjust its position on a sleep queue
- * or dispatch queue. The function thread_change_pri
- * accomplishes this.
- */
- if (thread_change_pri(t, new_pri, 0)) {
- if ((t->t_schedflag & TS_LOAD) &&
- (lwp = t->t_lwp) &&
- lwp->lwp_state == LWP_USER)
- t->t_schedflag &= ~TS_DONT_SWAP;
- fssproc->fss_timeleft = fss_quantum;
- } else {
- call_cpu_surrender = B_TRUE;
- }
- } else if (t->t_state == TS_ONPROC &&
- t->t_pri < t->t_disp_queue->disp_maxrunpri) {
- /*
- * If there is a higher-priority thread which is
- * waiting for a processor, then thread surrenders
- * the processor.
- */
+ /*
+ * When the priority of a thread is changed, it may be
+ * necessary to adjust its position on a sleep queue or
+ * dispatch queue. The function thread_change_pri accomplishes
+ * this.
+ */
+ if (thread_change_pri(t, new_pri, 0)) {
+ if ((t->t_schedflag & TS_LOAD) &&
+ (lwp = t->t_lwp) &&
+ lwp->lwp_state == LWP_USER)
+ t->t_schedflag &= ~TS_DONT_SWAP;
+ fssproc->fss_timeleft = fss_quantum;
+ } else {
call_cpu_surrender = B_TRUE;
}
+ } else if (t->t_state == TS_ONPROC &&
+ t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+ /*
+ * If there is a higher-priority thread which is waiting for a
+ * processor, then thread surrenders the processor.
+ */
+ call_cpu_surrender = B_TRUE;
}
if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
@@ -2618,32 +2530,13 @@ fss_wakeup(kthread_t *t)
fssproc = FSSPROC(t);
fssproc->fss_flags &= ~FSSBACKQ;
- if (fssproc->fss_flags & FSSKPRI) {
- /*
- * If we already have a kernel priority assigned, then we
- * just use it.
- */
- setbackdq(t);
- } else if (t->t_kpri_req) {
- /*
- * Give thread a priority boost if we were asked.
- */
- fssproc->fss_flags |= FSSKPRI;
- THREAD_CHANGE_PRI(t, minclsyspri);
- setbackdq(t);
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
+ /* Recalculate the priority. */
+ if (t->t_disp_time == ddi_get_lbolt()) {
+ setfrontdq(t);
} else {
- /*
- * Otherwise, we recalculate the priority.
- */
- if (t->t_disp_time == ddi_get_lbolt()) {
- setfrontdq(t);
- } else {
- fssproc->fss_timeleft = fss_quantum;
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- setbackdq(t);
- }
+ fssproc->fss_timeleft = fss_quantum;
+ THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
+ setbackdq(t);
}
}
diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c
index adb70871e2..5b190242e6 100644
--- a/usr/src/uts/common/disp/fx.c
+++ b/usr/src/uts/common/disp/fx.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -71,16 +71,6 @@ static struct modlinkage modlinkage = {
};
-/*
- * control flags (kparms->fx_cflags).
- */
-#define FX_DOUPRILIM 0x01 /* change user priority limit */
-#define FX_DOUPRI 0x02 /* change user priority */
-#define FX_DOTQ 0x04 /* change FX time quantum */
-
-
-#define FXMAXUPRI 60 /* maximum user priority setting */
-
#define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */
/*
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 5412df83f5..60e870ba28 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg)
#endif
-static int donice(procset_t *, pcnice_t *);
+int donice(procset_t *, pcnice_t *);
static int doprio(procset_t *, pcprio_t *);
static int proccmp(proc_t *, struct pcmpargs *);
static int setparms(proc_t *, struct stprmargs *);
@@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice)
/*
* Update the nice value of the specified LWP or set of processes.
*/
-static int
+int
donice(procset_t *procset, pcnice_t *pcnice)
{
int err_proc = 0;
diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c
index f87f8c56ce..115e42ccb8 100644
--- a/usr/src/uts/common/disp/rt.c
+++ b/usr/src/uts/common/disp/rt.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -103,13 +103,6 @@ _info(struct modinfo *modinfop)
pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */
rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */
-/*
- * control flags (kparms->rt_cflags).
- */
-#define RT_DOPRI 0x01 /* change priority */
-#define RT_DOTQ 0x02 /* change RT time quantum */
-#define RT_DOSIG 0x04 /* change RT time quantum signal */
-
static int rt_admin(caddr_t, cred_t *);
static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static int rt_fork(kthread_t *, kthread_t *, void *);
diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c
index 768b499ef2..cc88ed72fc 100644
--- a/usr/src/uts/common/disp/rt_dptbl.c
+++ b/usr/src/uts/common/disp/rt_dptbl.c
@@ -28,8 +28,6 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/proc.h>
#include <sys/priocntl.h>
#include <sys/class.h>
@@ -70,8 +68,6 @@ _info(struct modinfo *modinfop)
return (mod_info(&modlinkage, modinfop));
}
-#define RTGPPRIO0 100 /* Global priority for RT priority 0 */
-
rtdpent_t config_rt_dptbl[] = {
/* prilevel Time quantum */
diff --git a/usr/src/uts/common/disp/sysdc.c b/usr/src/uts/common/disp/sysdc.c
index 40cde57856..1f50788ceb 100644
--- a/usr/src/uts/common/disp/sysdc.c
+++ b/usr/src/uts/common/disp/sysdc.c
@@ -193,32 +193,6 @@
* flag. This flag currently has no effect, but marks threads which
* do bulk processing.
*
- * - t_kpri_req
- *
- * The TS and FSS scheduling classes pay attention to t_kpri_req,
- * which provides a simple form of priority inheritance for
- * synchronization primitives (such as rwlocks held as READER) which
- * cannot be traced to a unique thread. The SDC class does not honor
- * t_kpri_req, for a few reasons:
- *
- * 1. t_kpri_req is notoriously inaccurate. A measure of its
- * inaccuracy is that it needs to be cleared every time a thread
- * returns to user mode, because it is frequently non-zero at that
- * point. This can happen because "ownership" of synchronization
- * primitives that use t_kpri_req can be silently handed off,
- * leaving no opportunity to will the t_kpri_req inheritance.
- *
- * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
- * kernel priority. This means that even if an SDC thread
- * is holding a synchronization primitive and running at low
- * priority, its priority will eventually be raised above 60,
- * allowing it to drive on and release the resource.
- *
- * 3. The first consumer of SDC uses the taskq subsystem, which holds
- * a reader lock for the duration of the task's execution. This
- * would mean that SDC threads would never drop below kernel
- * priority in practice, which defeats one of the purposes of SDC.
- *
* - Why not FSS?
*
* It might seem that the existing FSS scheduling class could solve
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index cfcb28aa0a..bf1f121b67 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -77,6 +77,10 @@
#include <sys/ctype.h>
#include <sys/smt.h>
+#ifndef STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
+
struct kmem_cache *thread_cache; /* cache of free threads */
struct kmem_cache *lwp_cache; /* cache of free lwps */
struct kmem_cache *turnstile_cache; /* cache of free turnstiles */
@@ -374,7 +378,7 @@ thread_create(
if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
cmn_err(CE_PANIC, "thread_create: proposed stack size"
" too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
stksize &= -PTR24_ALIGN; /* make thread aligned */
t = (kthread_t *)(stk + stksize);
@@ -383,13 +387,6 @@ thread_create(
audit_thread_create(t);
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else /* stack grows to larger addresses */
- stksize -= SA(sizeof (kthread_t));
- t = (kthread_t *)(stk);
- bzero(t, sizeof (kthread_t));
- t->t_stk = stk + sizeof (kthread_t);
- t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif /* STACK_GROWTH_DOWN */
t->t_flag |= T_TALLOCSTK;
t->t_swap = stk;
} else {
@@ -402,13 +399,8 @@ thread_create(
* Initialize t_stk to the kernel stack pointer to use
* upon entry to the kernel
*/
-#ifdef STACK_GROWTH_DOWN
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else
- t->t_stk = stk; /* 3b2-like */
- t->t_stkbase = stk + stksize;
-#endif /* STACK_GROWTH_DOWN */
}
if (kmem_stackinfo != 0) {
@@ -588,6 +580,9 @@ thread_exit(void)
if ((t->t_proc_flag & TP_ZTHREAD) != 0)
cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
+ if ((t->t_flag & T_SPLITSTK) != 0)
+ cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
tsd_exit(); /* Clean up this thread's TSD */
kcpc_passivate(); /* clean up performance counter state */
@@ -1053,8 +1048,44 @@ installctx(
ctx->exit_op = exit;
ctx->free_op = free;
ctx->arg = arg;
- ctx->next = t->t_ctx;
+ ctx->save_ts = 0;
+ ctx->restore_ts = 0;
+
+ /*
+ * Keep ctxops in a doubly-linked list to allow traversal in both
+ * directions. Using only the newest-to-oldest ordering was adequate
+ * previously, but reversing the order for restore_op actions is
+ * necessary if later-added ctxops depend on earlier ones.
+ *
+ * One example of such a dependency: Hypervisor software handling the
+ * guest FPU expects that it save FPU state prior to host FPU handling
+ * and consequently handle the guest logic _after_ the host FPU has
+ * been restored.
+ *
+ * The t_ctx member points to the most recently added ctxop or is NULL
+ * if no ctxops are associated with the thread. The 'next' pointers
+ * form a loop of the ctxops in newest-to-oldest order. The 'prev'
+ * pointers form a loop in the reverse direction, where t_ctx->prev is
+ * the oldest entry associated with the thread.
+ *
+ * The protection of kpreempt_disable is required to safely perform the
+ * list insertion, since there are inconsistent states between some of
+ * the pointer assignments.
+ */
+ kpreempt_disable();
+ if (t->t_ctx == NULL) {
+ ctx->next = ctx;
+ ctx->prev = ctx;
+ } else {
+ struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev;
+
+ ctx->next = head;
+ ctx->prev = tail;
+ head->prev = ctx;
+ tail->next = ctx;
+ }
t->t_ctx = ctx;
+ kpreempt_enable();
}
/*
@@ -1071,7 +1102,7 @@ removectx(
void (*exit)(void *),
void (*free)(void *, int))
{
- struct ctxop *ctx, *prev_ctx;
+ struct ctxop *ctx, *head;
/*
* The incoming kthread_t (which is the thread for which the
@@ -1096,17 +1127,31 @@ removectx(
* and the target thread from racing with each other during lwp exit.
*/
mutex_enter(&t->t_ctx_lock);
- prev_ctx = NULL;
kpreempt_disable();
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
+
+ if (t->t_ctx == NULL) {
+ mutex_exit(&t->t_ctx_lock);
+ kpreempt_enable();
+ return (0);
+ }
+
+ ctx = head = t->t_ctx;
+ do {
if (ctx->save_op == save && ctx->restore_op == restore &&
ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
ctx->exit_op == exit && ctx->free_op == free &&
ctx->arg == arg) {
- if (prev_ctx)
- prev_ctx->next = ctx->next;
- else
+ ctx->prev->next = ctx->next;
+ ctx->next->prev = ctx->prev;
+ if (ctx->next == ctx) {
+ /* last remaining item */
+ t->t_ctx = NULL;
+ } else if (ctx == t->t_ctx) {
+ /* fix up head of list */
t->t_ctx = ctx->next;
+ }
+ ctx->next = ctx->prev = NULL;
+
mutex_exit(&t->t_ctx_lock);
if (ctx->free_op != NULL)
(ctx->free_op)(ctx->arg, 0);
@@ -1114,44 +1159,70 @@ removectx(
kpreempt_enable();
return (1);
}
- prev_ctx = ctx;
- }
+
+ ctx = ctx->next;
+ } while (ctx != head);
+
mutex_exit(&t->t_ctx_lock);
kpreempt_enable();
-
return (0);
}
void
savectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->save_op != NULL)
- (ctx->save_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->save_op != NULL) {
+ ctx->save_ts = gethrtime_unscaled();
+ (ctx->save_op)(ctx->arg);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
void
restorectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->restore_op != NULL)
- (ctx->restore_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *tail;
+
+ /* Backward traversal (starting at the tail) */
+ ctx = tail = t->t_ctx->prev;
+ do {
+ if (ctx->restore_op != NULL) {
+ ctx->restore_ts = gethrtime_unscaled();
+ (ctx->restore_op)(ctx->arg);
+ }
+ ctx = ctx->prev;
+ } while (ctx != tail);
+ }
}
void
forkctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->fork_op != NULL)
- (ctx->fork_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->fork_op != NULL) {
+ (ctx->fork_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1162,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct)
void
lwp_createctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->lwp_create_op != NULL)
- (ctx->lwp_create_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->lwp_create_op != NULL) {
+ (ctx->lwp_create_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1179,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct)
void
exitctx(kthread_t *t)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->exit_op != NULL)
- (ctx->exit_op)(t);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->exit_op != NULL) {
+ (ctx->exit_op)(t);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1193,14 +1278,21 @@ exitctx(kthread_t *t)
void
freectx(kthread_t *t, int isexec)
{
- struct ctxop *ctx;
-
kpreempt_disable();
- while ((ctx = t->t_ctx) != NULL) {
- t->t_ctx = ctx->next;
- if (ctx->free_op != NULL)
- (ctx->free_op)(ctx->arg, isexec);
- kmem_free(ctx, sizeof (struct ctxop));
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ ctx = head = t->t_ctx;
+ t->t_ctx = NULL;
+ do {
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
+ (ctx->free_op)(ctx->arg, isexec);
+ }
+ kmem_free(ctx, sizeof (struct ctxop));
+ ctx = next;
+ } while (ctx != head);
}
kpreempt_enable();
}
@@ -1215,17 +1307,22 @@ freectx(kthread_t *t, int isexec)
void
freectx_ctx(struct ctxop *ctx)
{
- struct ctxop *nctx;
+ struct ctxop *head = ctx;
ASSERT(ctx != NULL);
kpreempt_disable();
+
+ head = ctx;
do {
- nctx = ctx->next;
- if (ctx->free_op != NULL)
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
(ctx->free_op)(ctx->arg, 0);
+ }
kmem_free(ctx, sizeof (struct ctxop));
- } while ((ctx = nctx) != NULL);
+ ctx = next;
+ } while (ctx != head);
kpreempt_enable();
}
@@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
return (on_rq);
}
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread. To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function. After the specified function returns,
+ * the stack is deallocated and control is returned to the caller. This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context
+ * - The called function must not call thread_exit() while on the split stack
+ *
+ * The code will explicitly panic if these constraints are violated. Notably,
+ * however, thread_splitstack() _can_ be called on a split stack -- there
+ * is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+ kthread_t *t = curthread;
+ caddr_t ostk, ostkbase, stk;
+ ushort_t otflag;
+
+ if (t->t_onfault != NULL)
+ panic("thread_splitstack: called with non-NULL t_onfault");
+
+ ostk = t->t_stk;
+ ostkbase = t->t_stkbase;
+ otflag = t->t_flag;
+
+ stksize = roundup(stksize, PAGESIZE);
+
+ if (stksize < default_stksize)
+ stksize = default_stksize;
+
+ if (stksize == default_stksize) {
+ stk = (caddr_t)segkp_cache_get(segkp_thread);
+ } else {
+ stksize = roundup(stksize, PAGESIZE);
+ stk = (caddr_t)segkp_get(segkp, stksize,
+ (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+ }
+
+ /*
+ * We're going to lock ourselves before we set T_SPLITSTK to assure
+ * that we're not swapped out in the meantime. (Note that we don't
+ * bother to set t_swap, as we're not going to be swapped out.)
+ */
+ thread_lock(t);
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag |= T_SPLITSTK;
+
+ t->t_stk = stk + stksize;
+ t->t_stkbase = stk;
+
+ thread_unlock(t);
+
+ /*
+ * Now actually run on the new (split) stack...
+ */
+ thread_splitstack_run(t->t_stk, func, arg);
+
+ /*
+ * We're back onto our own stack; lock ourselves and restore our
+ * pre-split state.
+ */
+ thread_lock(t);
+
+ t->t_stk = ostk;
+ t->t_stkbase = ostkbase;
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag &= ~T_SPLITSTK;
+
+ thread_unlock(t);
+
+ /*
+ * Now that we are entirely back on our own stack, call back into
+ * the platform layer to perform any platform-specific cleanup.
+ */
+ thread_splitstack_cleanup();
+
+ segkp_release(segkp, stk);
+}
+
/*
* Tunable kmem_stackinfo is set, fill the kernel thread stack with a
* specific pattern.
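
Since thread_splitstack() is a new interface, a short hypothetical usage sketch may help; it is not part of the patch. The function and structure names below are invented, the signature and constraints come from the block comment in the hunk above, and the prototype is assumed to be exposed through <sys/thread.h>.

/*
 * Hypothetical sketch (not part of the patch): run one deep operation on a
 * temporary 64K stack via thread_splitstack().  The names here are invented;
 * the signature and constraints (safe-to-block context, no t_onfault, no
 * thread_exit() on the split stack) come from the comment above, and the
 * prototype is assumed to be available through <sys/thread.h>.
 */
#include <sys/types.h>
#include <sys/thread.h>

typedef struct deep_work {
	void	*dw_input;
	int	dw_result;
} deep_work_t;

static void
deep_recursive_walk(void *arg)
{
	deep_work_t *dw = arg;

	/* ... recursion over dw_input that would overflow the default stack ... */
	dw->dw_result = 0;
}

static int
do_deep_work(void *input)
{
	deep_work_t dw;

	dw.dw_input = input;
	dw.dw_result = -1;

	/* Returns only after deep_recursive_walk() has completed. */
	thread_splitstack(deep_recursive_walk, &dw, 64 * 1024);

	return (dw.dw_result);
}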
diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c
index 67ccc6922f..c840bdf31a 100644
--- a/usr/src/uts/common/disp/thread_intr.c
+++ b/usr/src/uts/common/disp/thread_intr.c
@@ -23,19 +23,10 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
/*
- * FILE NOTICE BEGIN
- *
- * This file should not be modified. If you wish to modify it or have it
- * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com->
- * (without anti-spam dashes)
- *
- * FILE NOTICE END
+ * Copyright 2015, Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cpuvar.h>
#include <sys/stack.h>
#include <vm/seg_kp.h>
@@ -44,6 +35,17 @@
#include <sys/sysmacros.h>
/*
+ * Use a slightly larger stack size for interrupt threads than the default.
+ * This is useful for cases where the networking stack may do an rx and a tx
+ * in the context of a single interrupt; when that is combined with various
+ * promisc hooks that need memory, it can bring us dangerously close to the
+ * edge of the traditional stack sizes. This is only a few pages more than a
+ * traditional stack and given that we don't have that many interrupt threads,
+ * the memory costs end up being more than worthwhile.
+ */
+#define LL_INTR_STKSZ (32 * 1024)
+
+/*
* Create and initialize an interrupt thread.
*/
static void
@@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp)
{
kthread_t *tp;
- tp = thread_create(NULL, 0,
+ tp = thread_create(NULL, LL_INTR_STKSZ,
(void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
/*
@@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp)
}
/*
- * Allocate a given number of interrupt threads for a given CPU.
- * These threads will get freed by cpu_destroy_bound_threads()
- * when CPU gets unconfigured.
+ * Allocate a given number of interrupt threads for a given CPU. These threads
+ * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured.
+ *
+ * Note, high level interrupts are always serviced using cpu_intr_stack and are
+ * not allowed to block. Low level interrupts or soft-interrupts use the
+ * kthread_t's that we create through the calls to thread_create_intr().
*/
void
cpu_intr_alloc(cpu_t *cp, int n)
@@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n)
thread_create_intr(cp);
cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
- KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
- INTR_STACK_SIZE - SA(MINFRAME);
+ KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
+ INTR_STACK_SIZE - SA(MINFRAME);
}
diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c
index bf65c3c42d..5d35b283d7 100644
--- a/usr/src/uts/common/disp/ts.c
+++ b/usr/src/uts/common/disp/ts.c
@@ -21,11 +21,11 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
#include <sys/types.h>
#include <sys/param.h>
@@ -229,7 +229,6 @@ static void ia_set_process_group(pid_t, pid_t, pid_t);
static void ts_change_priority(kthread_t *, tsproc_t *);
-extern pri_t ts_maxkmdpri; /* maximum kernel mode ts priority */
static pri_t ts_maxglobpri; /* maximum global priority used by ts class */
static kmutex_t ts_dptblock; /* protects time sharing dispatch table */
static kmutex_t ts_list_lock[TS_LISTS]; /* protects tsproc lists */
@@ -541,8 +540,8 @@ ts_admin(caddr_t uaddr, cred_t *reqpcredp)
* to specified time-sharing priority.
*/
static int
-ts_enterclass(kthread_t *t, id_t cid, void *parmsp,
- cred_t *reqpcredp, void *bufp)
+ts_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
+ void *bufp)
{
tsparms_t *tsparmsp = (tsparms_t *)parmsp;
tsproc_t *tspp;
@@ -703,7 +702,7 @@ ts_fork(kthread_t *t, kthread_t *ct, void *bufp)
TS_NEWUMDPRI(ctspp);
ctspp->ts_nice = ptspp->ts_nice;
ctspp->ts_dispwait = 0;
- ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE);
+ ctspp->ts_flags = ptspp->ts_flags & ~(TSBACKQ | TSRESTORE);
ctspp->ts_tp = ct;
cpucaps_sc_init(&ctspp->ts_caps);
thread_unlock(t);
@@ -754,7 +753,6 @@ ts_forkret(kthread_t *t, kthread_t *ct)
tspp->ts_dispwait = 0;
t->t_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
THREAD_TRANSITION(t);
ts_setrun(t);
thread_unlock(t);
@@ -1217,11 +1215,6 @@ ts_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
TS_NEWUMDPRI(tspp);
tspp->ts_nice = nice;
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- return (0);
- }
-
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);
thread_unlock(tx);
@@ -1237,7 +1230,7 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
proc_t *p;
pid_t pid, pgid, sid;
pid_t on, off;
- struct stdata *stp;
+ struct stdata *stp;
int sess_held;
/*
@@ -1373,33 +1366,20 @@ static void
ts_preempt(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
- klwp_t *lwp = curthread->t_lwp;
+ klwp_t *lwp = ttolwp(t);
pri_t oldpri = t->t_pri;
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(curthread));
/*
- * If preempted in the kernel, make sure the thread has
- * a kernel priority if needed.
- */
- if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) {
- tspp->ts_flags |= TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- t->t_trapret = 1; /* so ts_trapret will run */
- aston(t);
- }
-
- /*
* This thread may be placed on wait queue by CPU Caps. In this case we
* do not need to do anything until it is removed from the wait queue.
- * Do not enforce CPU caps on threads running at a kernel priority
*/
if (CPUCAPS_ON()) {
(void) cpucaps_charge(t, &tspp->ts_caps,
CPUCAPS_CHARGE_ENFORCE);
- if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t))
+ if (CPUCAPS_ENFORCE(t))
return;
}
@@ -1425,18 +1405,16 @@ ts_preempt(kthread_t *t)
if (t->t_schedctl && schedctl_get_nopreempt(t)) {
if (tspp->ts_timeleft > -SC_MAX_TICKS) {
DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
- if (!(tspp->ts_flags & TSKPRI)) {
- /*
- * If not already remembered, remember current
- * priority for restoration in ts_yield().
- */
- if (!(tspp->ts_flags & TSRESTORE)) {
- tspp->ts_scpri = t->t_pri;
- tspp->ts_flags |= TSRESTORE;
- }
- THREAD_CHANGE_PRI(t, ts_maxumdpri);
- t->t_schedflag |= TS_DONT_SWAP;
+ /*
+ * If not already remembered, remember current
+ * priority for restoration in ts_yield().
+ */
+ if (!(tspp->ts_flags & TSRESTORE)) {
+ tspp->ts_scpri = t->t_pri;
+ tspp->ts_flags |= TSRESTORE;
}
+ THREAD_CHANGE_PRI(t, ts_maxumdpri);
+ t->t_schedflag |= TS_DONT_SWAP;
schedctl_set_yield(t, 1);
setfrontdq(t);
goto done;
@@ -1456,14 +1434,11 @@ ts_preempt(kthread_t *t)
}
}
- if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == TSBACKQ) {
+ if ((tspp->ts_flags & TSBACKQ) != 0) {
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
tspp->ts_dispwait = 0;
tspp->ts_flags &= ~TSBACKQ;
setbackdq(t);
- } else if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == (TSBACKQ|TSKPRI)) {
- tspp->ts_flags &= ~TSBACKQ;
- setbackdq(t);
} else {
setfrontdq(t);
}
@@ -1485,11 +1460,8 @@ ts_setrun(kthread_t *t)
TS_NEWUMDPRI(tspp);
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
tspp->ts_dispwait = 0;
- if ((tspp->ts_flags & TSKPRI) == 0) {
- THREAD_CHANGE_PRI(t,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- }
+ THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
+ ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
}
tspp->ts_flags &= ~TSBACKQ;
@@ -1509,14 +1481,12 @@ ts_setrun(kthread_t *t)
/*
- * Prepare thread for sleep. We reset the thread priority so it will
- * run at the kernel priority level when it wakes up.
+ * Prepare thread for sleep.
*/
static void
ts_sleep(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
- int flags;
pri_t old_pri = t->t_pri;
ASSERT(t == curthread);
@@ -1527,18 +1497,7 @@ ts_sleep(kthread_t *t)
*/
(void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE);
- flags = tspp->ts_flags;
- if (t->t_kpri_req) {
- tspp->ts_flags = flags | TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- t->t_trapret = 1; /* so ts_trapret will run */
- aston(t);
- } else if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
- /*
- * If thread has blocked in the kernel (as opposed to
- * being merely preempted), recompute the user mode priority.
- */
+ if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
TS_NEWUMDPRI(tspp);
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
@@ -1548,16 +1507,6 @@ ts_sleep(kthread_t *t)
ts_dptbl[tspp->ts_umdpri].ts_globpri);
ASSERT(curthread->t_pri >= 0 &&
curthread->t_pri <= ts_maxglobpri);
- tspp->ts_flags = flags & ~TSKPRI;
-
- if (DISP_MUST_SURRENDER(curthread))
- cpu_surrender(curthread);
- } else if (flags & TSKPRI) {
- THREAD_CHANGE_PRI(curthread,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(curthread->t_pri >= 0 &&
- curthread->t_pri <= ts_maxglobpri);
- tspp->ts_flags = flags & ~TSKPRI;
if (DISP_MUST_SURRENDER(curthread))
cpu_surrender(curthread);
@@ -1594,9 +1543,9 @@ ts_swapin(kthread_t *t, int flags)
time_t swapout_time;
swapout_time = (ddi_get_lbolt() - t->t_stime) / hz;
- if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)))
+ if (INHERITED(t) || (tspp->ts_flags & TSIASET)) {
epri = (long)DISP_PRIO(t) + swapout_time;
- else {
+ } else {
/*
* Threads which have been out for a long time,
* have high user mode priority and are associated
@@ -1648,7 +1597,7 @@ ts_swapout(kthread_t *t, int flags)
ASSERT(THREAD_LOCK_HELD(t));
- if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) ||
+ if (INHERITED(t) || (tspp->ts_flags & TSIASET) ||
(t->t_proc_flag & TP_LWPEXIT) ||
(t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
TS_ONPROC | TS_WAIT)) ||
@@ -1717,62 +1666,59 @@ ts_tick(kthread_t *t)
*/
if (CPUCAPS_ON()) {
call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps,
- CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI);
+ CPUCAPS_CHARGE_ENFORCE);
}
- if ((tspp->ts_flags & TSKPRI) == 0) {
- if (--tspp->ts_timeleft <= 0) {
- pri_t new_pri;
+ if (--tspp->ts_timeleft <= 0) {
+ pri_t new_pri;
- /*
- * If we're doing preemption control and trying to
- * avoid preempting this thread, just note that
- * the thread should yield soon and let it keep
- * running (unless it's been a while).
- */
- if (t->t_schedctl && schedctl_get_nopreempt(t)) {
- if (tspp->ts_timeleft > -SC_MAX_TICKS) {
- DTRACE_SCHED1(schedctl__nopreempt,
- kthread_t *, t);
- schedctl_set_yield(t, 1);
- thread_unlock_nopreempt(t);
- return;
- }
-
- TNF_PROBE_2(schedctl_failsafe,
- "schedctl TS ts_tick", /* CSTYLED */,
- tnf_pid, pid, ttoproc(t)->p_pid,
- tnf_lwpid, lwpid, t->t_tid);
- }
- tspp->ts_flags &= ~TSRESTORE;
- tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp;
- TS_NEWUMDPRI(tspp);
- tspp->ts_dispwait = 0;
- new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
- ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri);
- /*
- * When the priority of a thread is changed,
- * it may be necessary to adjust its position
- * on a sleep queue or dispatch queue.
- * The function thread_change_pri accomplishes
- * this.
- */
- if (thread_change_pri(t, new_pri, 0)) {
- if ((t->t_schedflag & TS_LOAD) &&
- (lwp = t->t_lwp) &&
- lwp->lwp_state == LWP_USER)
- t->t_schedflag &= ~TS_DONT_SWAP;
- tspp->ts_timeleft =
- ts_dptbl[tspp->ts_cpupri].ts_quantum;
- } else {
- call_cpu_surrender = B_TRUE;
+ /*
+ * If we're doing preemption control and trying to avoid
+ * preempting this thread, just note that the thread should
+ * yield soon and let it keep running (unless it's been a
+ * while).
+ */
+ if (t->t_schedctl && schedctl_get_nopreempt(t)) {
+ if (tspp->ts_timeleft > -SC_MAX_TICKS) {
+ DTRACE_SCHED1(schedctl__nopreempt,
+ kthread_t *, t);
+ schedctl_set_yield(t, 1);
+ thread_unlock_nopreempt(t);
+ return;
}
- TRACE_2(TR_FAC_DISP, TR_TICK,
- "tick:tid %p old pri %d", t, oldpri);
- } else if (t->t_state == TS_ONPROC &&
- t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+
+ TNF_PROBE_2(schedctl_failsafe,
+ "schedctl TS ts_tick", /* CSTYLED */,
+ tnf_pid, pid, ttoproc(t)->p_pid,
+ tnf_lwpid, lwpid, t->t_tid);
+ }
+ tspp->ts_flags &= ~TSRESTORE;
+ tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp;
+ TS_NEWUMDPRI(tspp);
+ tspp->ts_dispwait = 0;
+ new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
+ ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri);
+ /*
+ * When the priority of a thread is changed, it may be
+ * necessary to adjust its position on a sleep queue or
+ * dispatch queue. The function thread_change_pri accomplishes
+ * this.
+ */
+ if (thread_change_pri(t, new_pri, 0)) {
+ if ((t->t_schedflag & TS_LOAD) &&
+ (lwp = t->t_lwp) &&
+ lwp->lwp_state == LWP_USER)
+ t->t_schedflag &= ~TS_DONT_SWAP;
+ tspp->ts_timeleft =
+ ts_dptbl[tspp->ts_cpupri].ts_quantum;
+ } else {
call_cpu_surrender = B_TRUE;
}
+ TRACE_2(TR_FAC_DISP, TR_TICK,
+ "tick:tid %p old pri %d", t, oldpri);
+ } else if (t->t_state == TS_ONPROC &&
+ t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+ call_cpu_surrender = B_TRUE;
}
if (call_cpu_surrender) {
@@ -1785,11 +1731,8 @@ ts_tick(kthread_t *t)
/*
- * If thread is currently at a kernel mode priority (has slept)
- * we assign it the appropriate user mode priority and time quantum
- * here. If we are lowering the thread's priority below that of
- * other runnable threads we will normally set runrun via cpu_surrender() to
- * cause preemption.
+ * If we are lowering the thread's priority below that of other runnable
+ * threads we will normally set runrun via cpu_surrender() to cause preemption.
*/
static void
ts_trapret(kthread_t *t)
@@ -1803,7 +1746,6 @@ ts_trapret(kthread_t *t)
ASSERT(cp->cpu_dispthread == t);
ASSERT(t->t_state == TS_ONPROC);
- t->t_kpri_req = 0;
if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
TS_NEWUMDPRI(tspp);
@@ -1817,27 +1759,14 @@ ts_trapret(kthread_t *t)
THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
cp->cpu_dispatch_pri = DISP_PRIO(t);
ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
-
- if (DISP_MUST_SURRENDER(t))
- cpu_surrender(t);
- } else if (tspp->ts_flags & TSKPRI) {
- /*
- * If thread has blocked in the kernel (as opposed to
- * being merely preempted), recompute the user mode priority.
- */
- THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
- cp->cpu_dispatch_pri = DISP_PRIO(t);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
if (DISP_MUST_SURRENDER(t))
cpu_surrender(t);
}
/*
- * Swapout lwp if the swapper is waiting for this thread to
- * reach a safe point.
+ * Swapout lwp if the swapper is waiting for this thread to reach a
+ * safe point.
*/
if ((t->t_schedflag & TS_SWAPENQ) && !(tspp->ts_flags & TSIASET)) {
thread_unlock(t);
@@ -1931,8 +1860,6 @@ ts_update_list(int i)
tx->t_clfuncs != &ia_classfuncs.thread)
goto next;
tspp->ts_dispwait++;
- if ((tspp->ts_flags & TSKPRI) != 0)
- goto next;
if (tspp->ts_dispwait <= ts_dptbl[tspp->ts_umdpri].ts_maxwait)
goto next;
if (tx->t_schedctl && schedctl_get_nopreempt(tx))
@@ -1968,12 +1895,7 @@ next:
}
/*
- * Processes waking up go to the back of their queue. We don't
- * need to assign a time quantum here because thread is still
- * at a kernel mode priority and the time slicing is not done
- * for threads running in the kernel after sleeping. The proper
- * time quantum will be assigned by ts_trapret before the thread
- * returns to user mode.
+ * Processes waking up go to the back of their queue.
*/
static void
ts_wakeup(kthread_t *t)
@@ -1984,46 +1906,27 @@ ts_wakeup(kthread_t *t)
t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */
- if (tspp->ts_flags & TSKPRI) {
- tspp->ts_flags &= ~TSBACKQ;
+ if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
+ tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
+ TS_NEWUMDPRI(tspp);
+ tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
+ tspp->ts_dispwait = 0;
+ THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
+ ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
+ }
+
+ tspp->ts_flags &= ~TSBACKQ;
+
+ if (tspp->ts_flags & TSIA) {
if (tspp->ts_flags & TSIASET)
setfrontdq(t);
else
setbackdq(t);
- } else if (t->t_kpri_req) {
- /*
- * Give thread a priority boost if we were asked.
- */
- tspp->ts_flags |= TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- setbackdq(t);
- t->t_trapret = 1; /* so that ts_trapret will run */
- aston(t);
} else {
- if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
- tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
- TS_NEWUMDPRI(tspp);
- tspp->ts_timeleft =
- ts_dptbl[tspp->ts_cpupri].ts_quantum;
- tspp->ts_dispwait = 0;
- THREAD_CHANGE_PRI(t,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- }
-
- tspp->ts_flags &= ~TSBACKQ;
-
- if (tspp->ts_flags & TSIA) {
- if (tspp->ts_flags & TSIASET)
- setfrontdq(t);
- else
- setbackdq(t);
- } else {
- if (t->t_disp_time != ddi_get_lbolt())
- setbackdq(t);
- else
- setfrontdq(t);
- }
+ if (t->t_disp_time != ddi_get_lbolt())
+ setbackdq(t);
+ else
+ setfrontdq(t);
}
}
@@ -2179,7 +2082,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
* and background processes as non-interactive iff the session
* leader is interactive. This routine is called from two places:
* strioctl:SPGRP when a new process group gets
- * control of the tty.
+ * control of the tty.
* ia_parmsset-when the process in question is a session leader.
* ia_set_process_group assumes that pidlock is held by the caller,
* either strioctl or priocntlsys. If the caller is priocntlsys
@@ -2189,7 +2092,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
static void
ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid)
{
- proc_t *leader, *fg, *bg;
+ proc_t *leader, *fg, *bg;
tsproc_t *tspp;
kthread_t *tx;
int plocked = 0;
@@ -2291,10 +2194,6 @@ ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid)
tspp->ts_flags |= TSIASET;
tspp->ts_boost = ia_boost;
TS_NEWUMDPRI(tspp);
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- continue;
- }
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);
thread_unlock(tx);
@@ -2344,10 +2243,6 @@ skip:
tspp->ts_flags &= ~TSIASET;
tspp->ts_boost = -ia_boost;
TS_NEWUMDPRI(tspp);
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- continue;
- }
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);