Diffstat (limited to 'usr/src/uts/common/disp')
| -rw-r--r-- | usr/src/uts/common/disp/cmt.c         |   8 |
| -rw-r--r-- | usr/src/uts/common/disp/cpucaps.c     | 285 |
| -rw-r--r-- | usr/src/uts/common/disp/disp.c        |  22 |
| -rw-r--r-- | usr/src/uts/common/disp/fss.c         | 241 |
| -rw-r--r-- | usr/src/uts/common/disp/fx.c          |  12 |
| -rw-r--r-- | usr/src/uts/common/disp/priocntl.c    |   4 |
| -rw-r--r-- | usr/src/uts/common/disp/rt.c          |   9 |
| -rw-r--r-- | usr/src/uts/common/disp/rt_dptbl.c    |   4 |
| -rw-r--r-- | usr/src/uts/common/disp/sysdc.c       |  26 |
| -rw-r--r-- | usr/src/uts/common/disp/thread.c      | 312 |
| -rw-r--r-- | usr/src/uts/common/disp/thread_intr.c |  37 |
| -rw-r--r-- | usr/src/uts/common/disp/ts.c          | 295 |

12 files changed, 732 insertions(+), 523 deletions(-)
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index 0196b15dae..80b5340543 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp) /* * Return non-zero if thread can migrate between "from" and "to" - * without a performance penalty + * without a performance penalty. This is true only if we share a core on + * virtually any CPU; sharing the last-level cache is insufficient to make + * migration possible without penalty. */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { - if (from->cpu_physid->cpu_cacheid == - to->cpu_physid->cpu_cacheid) + if (from->cpu_physid->cpu_coreid == + to->cpu_physid->cpu_coreid) return (1); return (0); } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..2a4365ff73 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. 
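The bursting behavior described in the comment above is observable entirely through kstats. Below is a minimal userland sketch of reading the fields this patch adds; it assumes the caps kstats keep the module name "caps" and an instance name of the form "cpucaps_zone_<zoneid>" (consistent with the existing cpucaps kstat naming), and it must be linked against libkstat.

```c
#include <kstat.h>
#include <stdio.h>

/*
 * Sketch: print the per-zone cap kstats touched by this patch.
 * Module/name strings are assumptions based on existing cpucaps naming.
 */
int
main(int argc, char **argv)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	char name[KSTAT_STRLEN];
	const char *fields[] = { "value", "baseline", "effective",
	    "burst_limit_sec", "bursting_sec", "above_base_sec", "usage" };
	uint_t i;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s <zoneid>\n", argv[0]);
		return (1);
	}

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}

	(void) snprintf(name, sizeof (name), "cpucaps_zone_%s", argv[1]);
	if ((ksp = kstat_lookup(kc, "caps", -1, name)) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) fprintf(stderr, "no caps kstat for zone %s\n", argv[1]);
		(void) kstat_close(kc);
		return (1);
	}

	for (i = 0; i < sizeof (fields) / sizeof (fields[0]); i++) {
		kstat_named_t *kn = kstat_data_lookup(ksp, (char *)fields[i]);

		if (kn != NULL)
			(void) printf("%-16s %llu\n", fields[i],
			    (u_longlong_t)kn->value.ui64);
	}

	(void) kstat_close(kc);
	return (0);
}
```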
+ * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap) ASSERT(CAP_ENABLED(cap)); waitq_block(&cap->cap_waitq); + + /* do this first to avoid race with cap_kstat_update */ + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + list_remove(l, cap); if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; - if (cap->cap_kstat != NULL) { - kstat_delete(cap->cap_kstat); - cap->cap_kstat = NULL; - } - } /* @@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, its + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. 
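The per-tick bookkeeping that cap_poke_waitq() performs in the hunk above reads as a small state machine. The standalone sketch below mirrors that logic with field names modeled on the cpucap_t members used in the patch; it is an illustration, not kernel code.

```c
/*
 * Model of the per-tick burst accounting in cap_poke_waitq().
 * Field names mirror the cpucap_t members touched by the patch.
 */
typedef struct burst_state {
	unsigned long long usage;	/* decayed CPU usage this tick */
	unsigned long long value;	/* full cap (zone.cpu-cap) */
	unsigned long long base;	/* baseline (zone.cpu-baseline) */
	unsigned long long chk_value;	/* effective cap enforced right now */
	unsigned long long burst_limit;	/* allowed burst time, in ticks */
	unsigned long long bursting;	/* ticks spent bursting so far */
	unsigned long long above_base;	/* cumulative ticks above baseline */
} burst_state_t;

static void
burst_tick(burst_state_t *c)
{
	if (c->base == 0)
		return;			/* no baseline: nothing to track */

	if (c->usage > c->base && c->chk_value == c->value) {
		/* Over the baseline while still allowed the full cap. */
		c->above_base++;
		if (c->burst_limit != 0) {
			c->bursting++;
			if (c->bursting >= c->burst_limit)
				c->chk_value = c->base;	/* clamp to baseline */
		}
	} else if (c->bursting > 0) {
		/* Not bursting now; decay the burst timer. */
		c->bursting--;
		if (c->bursting == 0 && c->chk_value != c->value)
			c->chk_value = c->value;	/* allow bursting again */
	}
}
```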
+ */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there is unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. 
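The wait-queue dequeue heuristic added above maps how close usage is to the effective cap onto an upper bound of threads released per tick. A sketch of just that mapping, using the empirically derived thresholds from the patch (the caller still stops early once the wait queue is empty):

```c
/*
 * Upper bound on threads to dequeue in one tick, as a function of how
 * close usage is to the effective cap. Thresholds match the patch.
 */
static int
cap_ndequeue(unsigned long long usage, unsigned long long chk_value)
{
	int p = (int)((100 * usage) / chk_value);

	if (p >= 98)
		return (10);
	if (p >= 95)
		return (20);
	if (p >= 90)
		return (40);
	if (p >= 85)
		return (80);
	return (160);
}
```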
+ */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t) /* * Convert internal cap statistics into values exported by cap kstat. + * Note that the kstat is held throughout this function but caps_lock is not. 
*/ static int cap_kstat_update(kstat_t *ksp, int rw) @@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index a4b49fa86d..7e933bccc4 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -110,7 +110,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. */ int only_intr_kpreempt; @@ -257,7 +257,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitve). + */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; @@ -2259,7 +2275,7 @@ disp_getbest(disp_t *dp) * placed earlier. */ if (tcp == NULL || - pri >= minclsyspri || + (pri >= minclsyspri && tp->t_procp == &p0) || tp->t_cpu != tcp) break; diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c index 15aeda6d00..05f358e6d4 100644 --- a/usr/src/uts/common/disp/fss.c +++ b/usr/src/uts/common/disp/fss.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
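The disp.c comment above recommends maxclsyspri for asynchronous kernel work (typically a task queue) done on behalf of latency-sensitive, higher-level software, since with kpreemptpri set to v_maxsyspri such threads now induce kernel preemption. A hedged sketch of a driver following that guidance; the driver names are hypothetical, while taskq_create()/taskq_dispatch() are the standard DDI interfaces.

```c
#include <sys/taskq.h>
#include <sys/disp.h>

/* Hypothetical driver soft state; only the taskq matters here. */
typedef struct mydrv_state {
	taskq_t		*md_rx_tq;
} mydrv_state_t;

static void
mydrv_rx_work(void *arg)
{
	/* ... process a received frame on behalf of the stack ... */
}

static int
mydrv_tq_init(mydrv_state_t *sp)
{
	/*
	 * Latency-sensitive deferred work runs at maxclsyspri so that,
	 * with kpreemptpri == v_maxsyspri, it preempts lower-priority
	 * kernel threads rather than queueing behind them.
	 */
	sp->md_rx_tq = taskq_create("mydrv_rx", 1, maxclsyspri, 1, 8,
	    TASKQ_PREPOPULATE);
	return (sp->md_rx_tq == NULL ? -1 : 0);
}

static void
mydrv_rx_defer(mydrv_state_t *sp, void *frame)
{
	(void) taskq_dispatch(sp->md_rx_tq, mydrv_rx_work, frame, TQ_NOSLEEP);
}
```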
*/ #include <sys/types.h> @@ -1212,9 +1212,9 @@ fss_decay_usage() * If there is only one zone active on the pset * the above reduces to: * - * zone_int_shares^2 + * zone_int_shares^2 * shusage = usage * --------------------- - * kpj_shares^2 + * kpj_shares^2 * * If there's only one project active in the * zone this formula reduces to: @@ -1373,8 +1373,6 @@ fss_update_list(int i) */ if (t->t_cid != fss_cid) goto next; - if ((fssproc->fss_flags & FSSKPRI) != 0) - goto next; fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) @@ -1889,7 +1887,7 @@ fss_fork(kthread_t *pt, kthread_t *ct, void *bufp) cpucaps_sc_init(&cfssproc->fss_caps); cfssproc->fss_flags = - pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE); + pfssproc->fss_flags & ~(FSSBACKQ | FSSRESTORE); ct->t_cldata = (void *)cfssproc; ct->t_schedflag |= TS_RUNQMATCH; thread_unlock(pt); @@ -1940,7 +1938,6 @@ fss_forkret(kthread_t *t, kthread_t *ct) fssproc->fss_timeleft = fss_quantum; t->t_pri = fssproc->fss_umdpri; ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); - fssproc->fss_flags &= ~FSSKPRI; THREAD_TRANSITION(t); /* @@ -2039,11 +2036,6 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp) fssproc->fss_nice = nice; fss_newpri(fssproc, B_FALSE); - if ((fssproc->fss_flags & FSSKPRI) != 0) { - thread_unlock(t); - return (0); - } - fss_change_priority(t, fssproc); thread_unlock(t); return (0); @@ -2158,7 +2150,7 @@ fss_swapin(kthread_t *t, int flags) time_t swapout_time; swapout_time = (ddi_get_lbolt() - t->t_stime) / hz; - if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI)) { + if (INHERITED(t)) { epri = (long)DISP_PRIO(t) + swapout_time; } else { /* @@ -2190,7 +2182,6 @@ fss_swapin(kthread_t *t, int flags) static pri_t fss_swapout(kthread_t *t, int flags) { - fssproc_t *fssproc = FSSPROC(t); long epri = -1; proc_t *pp = ttoproc(t); time_t swapin_time; @@ -2198,7 +2189,6 @@ fss_swapout(kthread_t *t, int flags) ASSERT(THREAD_LOCK_HELD(t)); if (INHERITED(t) || - (fssproc->fss_flags & FSSKPRI) || (t->t_proc_flag & TP_LWPEXIT) || (t->t_state & (TS_ZOMB|TS_FREE|TS_STOPPED|TS_ONPROC|TS_WAIT)) || !(t->t_schedflag & TS_LOAD) || @@ -2241,16 +2231,11 @@ fss_swapout(kthread_t *t, int flags) } /* - * If thread is currently at a kernel mode priority (has slept) and is - * returning to the userland we assign it the appropriate user mode priority - * and time quantum here. If we're lowering the thread's priority below that - * of other runnable threads then we will set runrun via cpu_surrender() to - * cause preemption. + * Run swap-out checks when returning to userspace. */ static void fss_trapret(kthread_t *t) { - fssproc_t *fssproc = FSSPROC(t); cpu_t *cp = CPU; ASSERT(THREAD_LOCK_HELD(t)); @@ -2258,20 +2243,6 @@ fss_trapret(kthread_t *t) ASSERT(cp->cpu_dispthread == t); ASSERT(t->t_state == TS_ONPROC); - t->t_kpri_req = 0; - if (fssproc->fss_flags & FSSKPRI) { - /* - * If thread has blocked in the kernel - */ - THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); - cp->cpu_dispatch_pri = DISP_PRIO(t); - ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); - fssproc->fss_flags &= ~FSSKPRI; - - if (DISP_MUST_SURRENDER(t)) - cpu_surrender(t); - } - /* * Swapout lwp if the swapper is waiting for this thread to reach * a safe point. @@ -2299,19 +2270,6 @@ fss_preempt(kthread_t *t) ASSERT(t->t_state == TS_ONPROC); /* - * If preempted in the kernel, make sure the thread has a kernel - * priority if needed. 
- */ - lwp = curthread->t_lwp; - if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) { - fssproc->fss_flags |= FSSKPRI; - THREAD_CHANGE_PRI(t, minclsyspri); - ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); - t->t_trapret = 1; /* so that fss_trapret will run */ - aston(t); - } - - /* * This thread may be placed on wait queue by CPU Caps. In this case we * do not need to do anything until it is removed from the wait queue. * Do not enforce CPU caps on threads running at a kernel priority @@ -2320,7 +2278,7 @@ fss_preempt(kthread_t *t) (void) cpucaps_charge(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE); - if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t)) + if (CPUCAPS_ENFORCE(t)) return; } @@ -2329,6 +2287,7 @@ fss_preempt(kthread_t *t) * cannot be holding any kernel locks. */ ASSERT(t->t_schedflag & TS_DONT_SWAP); + lwp = ttolwp(t); if (lwp != NULL && lwp->lwp_state == LWP_USER) t->t_schedflag &= ~TS_DONT_SWAP; @@ -2346,18 +2305,16 @@ fss_preempt(kthread_t *t) if (t->t_schedctl && schedctl_get_nopreempt(t)) { if (fssproc->fss_timeleft > -SC_MAX_TICKS) { DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t); - if (!(fssproc->fss_flags & FSSKPRI)) { - /* - * If not already remembered, remember current - * priority for restoration in fss_yield(). - */ - if (!(fssproc->fss_flags & FSSRESTORE)) { - fssproc->fss_scpri = t->t_pri; - fssproc->fss_flags |= FSSRESTORE; - } - THREAD_CHANGE_PRI(t, fss_maxumdpri); - t->t_schedflag |= TS_DONT_SWAP; + /* + * If not already remembered, remember current + * priority for restoration in fss_yield(). + */ + if (!(fssproc->fss_flags & FSSRESTORE)) { + fssproc->fss_scpri = t->t_pri; + fssproc->fss_flags |= FSSRESTORE; } + THREAD_CHANGE_PRI(t, fss_maxumdpri); + t->t_schedflag |= TS_DONT_SWAP; schedctl_set_yield(t, 1); setfrontdq(t); return; @@ -2374,15 +2331,12 @@ fss_preempt(kthread_t *t) } } - flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI); + flags = fssproc->fss_flags & FSSBACKQ; if (flags == FSSBACKQ) { fssproc->fss_timeleft = fss_quantum; fssproc->fss_flags &= ~FSSBACKQ; setbackdq(t); - } else if (flags == (FSSBACKQ | FSSKPRI)) { - fssproc->fss_flags &= ~FSSBACKQ; - setbackdq(t); } else { setfrontdq(t); } @@ -2404,12 +2358,7 @@ fss_setrun(kthread_t *t) fssproc->fss_timeleft = fss_quantum; fssproc->fss_flags &= ~FSSBACKQ; - /* - * If previously were running at the kernel priority then keep that - * priority and the fss_timeleft doesn't matter. - */ - if ((fssproc->fss_flags & FSSKPRI) == 0) - THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); + THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); if (t->t_disp_time != ddi_get_lbolt()) setbackdq(t); @@ -2418,8 +2367,7 @@ fss_setrun(kthread_t *t) } /* - * Prepare thread for sleep. We reset the thread priority so it will run at the - * kernel priority level when it wakes up. + * Prepare thread for sleep. */ static void fss_sleep(kthread_t *t) @@ -2437,31 +2385,6 @@ fss_sleep(kthread_t *t) (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE); fss_inactive(t); - - /* - * Assign a system priority to the thread and arrange for it to be - * retained when the thread is next placed on the run queue (i.e., - * when it wakes up) instead of being given a new pri. Also arrange - * for trapret processing as the thread leaves the system call so it - * will drop back to normal priority range. 
- */ - if (t->t_kpri_req) { - THREAD_CHANGE_PRI(t, minclsyspri); - fssproc->fss_flags |= FSSKPRI; - t->t_trapret = 1; /* so that fss_trapret will run */ - aston(t); - } else if (fssproc->fss_flags & FSSKPRI) { - /* - * The thread has done a THREAD_KPRI_REQUEST(), slept, then - * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again), - * then slept again all without finishing the current system - * call so trapret won't have cleared FSSKPRI - */ - fssproc->fss_flags &= ~FSSKPRI; - THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); - if (DISP_MUST_SURRENDER(curthread)) - cpu_surrender(t); - } t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */ } @@ -2503,67 +2426,56 @@ fss_tick(kthread_t *t) * Do not surrender CPU if running in the SYS class. */ if (CPUCAPS_ON()) { - cpucaps_enforce = cpucaps_charge(t, - &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) && - !(fssproc->fss_flags & FSSKPRI); + cpucaps_enforce = cpucaps_charge(t, &fssproc->fss_caps, + CPUCAPS_CHARGE_ENFORCE); } - /* - * A thread's execution time for threads running in the SYS class - * is not tracked. - */ - if ((fssproc->fss_flags & FSSKPRI) == 0) { + if (--fssproc->fss_timeleft <= 0) { + pri_t new_pri; + /* - * If thread is not in kernel mode, decrement its fss_timeleft + * If we're doing preemption control and trying to avoid + * preempting this thread, just note that the thread should + * yield soon and let it keep running (unless it's been a + * while). */ - if (--fssproc->fss_timeleft <= 0) { - pri_t new_pri; - - /* - * If we're doing preemption control and trying to - * avoid preempting this thread, just note that the - * thread should yield soon and let it keep running - * (unless it's been a while). - */ - if (t->t_schedctl && schedctl_get_nopreempt(t)) { - if (fssproc->fss_timeleft > -SC_MAX_TICKS) { - DTRACE_SCHED1(schedctl__nopreempt, - kthread_t *, t); - schedctl_set_yield(t, 1); - thread_unlock_nopreempt(t); - return; - } + if (t->t_schedctl && schedctl_get_nopreempt(t)) { + if (fssproc->fss_timeleft > -SC_MAX_TICKS) { + DTRACE_SCHED1(schedctl__nopreempt, + kthread_t *, t); + schedctl_set_yield(t, 1); + thread_unlock_nopreempt(t); + return; } - fssproc->fss_flags &= ~FSSRESTORE; + } + fssproc->fss_flags &= ~FSSRESTORE; - fss_newpri(fssproc, B_TRUE); - new_pri = fssproc->fss_umdpri; - ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); + fss_newpri(fssproc, B_TRUE); + new_pri = fssproc->fss_umdpri; + ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); - /* - * When the priority of a thread is changed, it may - * be necessary to adjust its position on a sleep queue - * or dispatch queue. The function thread_change_pri - * accomplishes this. - */ - if (thread_change_pri(t, new_pri, 0)) { - if ((t->t_schedflag & TS_LOAD) && - (lwp = t->t_lwp) && - lwp->lwp_state == LWP_USER) - t->t_schedflag &= ~TS_DONT_SWAP; - fssproc->fss_timeleft = fss_quantum; - } else { - call_cpu_surrender = B_TRUE; - } - } else if (t->t_state == TS_ONPROC && - t->t_pri < t->t_disp_queue->disp_maxrunpri) { - /* - * If there is a higher-priority thread which is - * waiting for a processor, then thread surrenders - * the processor. - */ + /* + * When the priority of a thread is changed, it may be + * necessary to adjust its position on a sleep queue or + * dispatch queue. The function thread_change_pri accomplishes + * this. 
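The schedctl checks retained in fss_preempt()/fss_tick() above are the kernel half of the schedctl(3C) preemption-control facility: a thread holding a hot userland lock may ask not to be preempted briefly, and the kernel defers preemption and sets a yield hint instead. A minimal sketch of the userland half, under the assumption that the standard schedctl_init()/schedctl_start()/schedctl_stop() interfaces are used:

```c
#include <schedctl.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Discourage preemption while a hot lock is held, and yield promptly
 * afterward if the kernel asked us to.
 */
static void
critical_update(schedctl_t *sc)
{
	schedctl_start(sc);		/* request no preemption */
	(void) pthread_mutex_lock(&lock);

	/* ... short critical section ... */

	(void) pthread_mutex_unlock(&lock);
	schedctl_stop(sc);		/* re-allow preemption; yields if asked */
}

void *
worker(void *arg)
{
	schedctl_t *sc = schedctl_init();	/* one block per LWP */

	for (;;)
		critical_update(sc);
	/* NOTREACHED */
	return (NULL);
}
```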
+ */ + if (thread_change_pri(t, new_pri, 0)) { + if ((t->t_schedflag & TS_LOAD) && + (lwp = t->t_lwp) && + lwp->lwp_state == LWP_USER) + t->t_schedflag &= ~TS_DONT_SWAP; + fssproc->fss_timeleft = fss_quantum; + } else { call_cpu_surrender = B_TRUE; } + } else if (t->t_state == TS_ONPROC && + t->t_pri < t->t_disp_queue->disp_maxrunpri) { + /* + * If there is a higher-priority thread which is waiting for a + * processor, then thread surrenders the processor. + */ + call_cpu_surrender = B_TRUE; } if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) { @@ -2618,32 +2530,13 @@ fss_wakeup(kthread_t *t) fssproc = FSSPROC(t); fssproc->fss_flags &= ~FSSBACKQ; - if (fssproc->fss_flags & FSSKPRI) { - /* - * If we already have a kernel priority assigned, then we - * just use it. - */ - setbackdq(t); - } else if (t->t_kpri_req) { - /* - * Give thread a priority boost if we were asked. - */ - fssproc->fss_flags |= FSSKPRI; - THREAD_CHANGE_PRI(t, minclsyspri); - setbackdq(t); - t->t_trapret = 1; /* so that fss_trapret will run */ - aston(t); + /* Recalculate the priority. */ + if (t->t_disp_time == ddi_get_lbolt()) { + setfrontdq(t); } else { - /* - * Otherwise, we recalculate the priority. - */ - if (t->t_disp_time == ddi_get_lbolt()) { - setfrontdq(t); - } else { - fssproc->fss_timeleft = fss_quantum; - THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); - setbackdq(t); - } + fssproc->fss_timeleft = fss_quantum; + THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); + setbackdq(t); } } diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c index adb70871e2..5b190242e6 100644 --- a/usr/src/uts/common/disp/fx.c +++ b/usr/src/uts/common/disp/fx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -71,16 +71,6 @@ static struct modlinkage modlinkage = { }; -/* - * control flags (kparms->fx_cflags). - */ -#define FX_DOUPRILIM 0x01 /* change user priority limit */ -#define FX_DOUPRI 0x02 /* change user priority */ -#define FX_DOTQ 0x04 /* change FX time quantum */ - - -#define FXMAXUPRI 60 /* maximum user priority setting */ - #define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */ /* diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 5412df83f5..60e870ba28 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg) #endif -static int donice(procset_t *, pcnice_t *); +int donice(procset_t *, pcnice_t *); static int doprio(procset_t *, pcprio_t *); static int proccmp(proc_t *, struct pcmpargs *); static int setparms(proc_t *, struct stprmargs *); @@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice) /* * Update the nice value of the specified LWP or set of processes. */ -static int +int donice(procset_t *procset, pcnice_t *pcnice) { int err_proc = 0; diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c index f87f8c56ce..115e42ccb8 100644 --- a/usr/src/uts/common/disp/rt.c +++ b/usr/src/uts/common/disp/rt.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -103,13 +103,6 @@ _info(struct modinfo *modinfop) pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */ rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */ -/* - * control flags (kparms->rt_cflags). - */ -#define RT_DOPRI 0x01 /* change priority */ -#define RT_DOTQ 0x02 /* change RT time quantum */ -#define RT_DOSIG 0x04 /* change RT time quantum signal */ - static int rt_admin(caddr_t, cred_t *); static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *); static int rt_fork(kthread_t *, kthread_t *, void *); diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c index 768b499ef2..cc88ed72fc 100644 --- a/usr/src/uts/common/disp/rt_dptbl.c +++ b/usr/src/uts/common/disp/rt_dptbl.c @@ -28,8 +28,6 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #include <sys/priocntl.h> #include <sys/class.h> @@ -70,8 +68,6 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ - rtdpent_t config_rt_dptbl[] = { /* prilevel Time quantum */ diff --git a/usr/src/uts/common/disp/sysdc.c b/usr/src/uts/common/disp/sysdc.c index 40cde57856..1f50788ceb 100644 --- a/usr/src/uts/common/disp/sysdc.c +++ b/usr/src/uts/common/disp/sysdc.c @@ -193,32 +193,6 @@ * flag. This flag currently has no effect, but marks threads which * do bulk processing. * - * - t_kpri_req - * - * The TS and FSS scheduling classes pay attention to t_kpri_req, - * which provides a simple form of priority inheritance for - * synchronization primitives (such as rwlocks held as READER) which - * cannot be traced to a unique thread. The SDC class does not honor - * t_kpri_req, for a few reasons: - * - * 1. t_kpri_req is notoriously inaccurate. A measure of its - * inaccuracy is that it needs to be cleared every time a thread - * returns to user mode, because it is frequently non-zero at that - * point. This can happen because "ownership" of synchronization - * primitives that use t_kpri_req can be silently handed off, - * leaving no opportunity to will the t_kpri_req inheritance. - * - * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at - * kernel priority. This means that even if an SDC thread - * is holding a synchronization primitive and running at low - * priority, its priority will eventually be raised above 60, - * allowing it to drive on and release the resource. - * - * 3. The first consumer of SDC uses the taskq subsystem, which holds - * a reader lock for the duration of the task's execution. This - * would mean that SDC threads would never drop below kernel - * priority in practice, which defeats one of the purposes of SDC. - * * - Why not FSS? * * It might seem that the existing FSS scheduling class could solve diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index cfcb28aa0a..bf1f121b67 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -77,6 +77,10 @@ #include <sys/ctype.h> #include <sys/smt.h> +#ifndef STACK_GROWTH_DOWN +#error Stacks do not grow downward; 3b2 zombie attack detected! 
+#endif + struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ struct kmem_cache *turnstile_cache; /* cache of free turnstiles */ @@ -374,7 +378,7 @@ thread_create( if (stksize <= sizeof (kthread_t) + PTR24_ALIGN) cmn_err(CE_PANIC, "thread_create: proposed stack size" " too small to hold thread."); -#ifdef STACK_GROWTH_DOWN + stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1); stksize &= -PTR24_ALIGN; /* make thread aligned */ t = (kthread_t *)(stk + stksize); @@ -383,13 +387,6 @@ thread_create( audit_thread_create(t); t->t_stk = stk + stksize; t->t_stkbase = stk; -#else /* stack grows to larger addresses */ - stksize -= SA(sizeof (kthread_t)); - t = (kthread_t *)(stk); - bzero(t, sizeof (kthread_t)); - t->t_stk = stk + sizeof (kthread_t); - t->t_stkbase = stk + stksize + sizeof (kthread_t); -#endif /* STACK_GROWTH_DOWN */ t->t_flag |= T_TALLOCSTK; t->t_swap = stk; } else { @@ -402,13 +399,8 @@ thread_create( * Initialize t_stk to the kernel stack pointer to use * upon entry to the kernel */ -#ifdef STACK_GROWTH_DOWN t->t_stk = stk + stksize; t->t_stkbase = stk; -#else - t->t_stk = stk; /* 3b2-like */ - t->t_stkbase = stk + stksize; -#endif /* STACK_GROWTH_DOWN */ } if (kmem_stackinfo != 0) { @@ -588,6 +580,9 @@ thread_exit(void) if ((t->t_proc_flag & TP_ZTHREAD) != 0) cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called"); + if ((t->t_flag & T_SPLITSTK) != 0) + cmn_err(CE_PANIC, "thread_exit: called when stack is split"); + tsd_exit(); /* Clean up this thread's TSD */ kcpc_passivate(); /* clean up performance counter state */ @@ -1053,8 +1048,44 @@ installctx( ctx->exit_op = exit; ctx->free_op = free; ctx->arg = arg; - ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; + + /* + * Keep ctxops in a doubly-linked list to allow traversal in both + * directions. Using only the newest-to-oldest ordering was adequate + * previously, but reversing the order for restore_op actions is + * necessary if later-added ctxops depends on earlier ones. + * + * One example of such a dependency: Hypervisor software handling the + * guest FPU expects that it save FPU state prior to host FPU handling + * and consequently handle the guest logic _after_ the host FPU has + * been restored. + * + * The t_ctx member points to the most recently added ctxop or is NULL + * if no ctxops are associated with the thread. The 'next' pointers + * form a loop of the ctxops in newest-to-oldest order. The 'prev' + * pointers form a loop in the reverse direction, where t_ctx->prev is + * the oldest entry associated with the thread. + * + * The protection of kpreempt_disable is required to safely perform the + * list insertion, since there are inconsistent states between some of + * the pointer assignments. + */ + kpreempt_disable(); + if (t->t_ctx == NULL) { + ctx->next = ctx; + ctx->prev = ctx; + } else { + struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev; + + ctx->next = head; + ctx->prev = tail; + head->prev = ctx; + tail->next = ctx; + } t->t_ctx = ctx; + kpreempt_enable(); } /* @@ -1071,7 +1102,7 @@ removectx( void (*exit)(void *), void (*free)(void *, int)) { - struct ctxop *ctx, *prev_ctx; + struct ctxop *ctx, *head; /* * The incoming kthread_t (which is the thread for which the @@ -1096,17 +1127,31 @@ removectx( * and the target thread from racing with each other during lwp exit. 
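The new t_ctx layout described above is a circular, doubly-linked list: t_ctx points at the newest entry, forward ("next") traversal serves save/fork/exit, and reverse ("prev") traversal serves restore. A standalone sketch of that structure and the two traversal orders (illustration only; the real ctxop also carries the operation pointers and timestamps shown in the patch):

```c
#include <stddef.h>

/* Minimal stand-in for struct ctxop: just the links and a tag. */
typedef struct node {
	struct node	*next;	/* newest -> oldest */
	struct node	*prev;	/* oldest -> newest */
	int		tag;
} node_t;

/* Push a new node at the head, as installctx() now does. */
static void
push_head(node_t **headp, node_t *n)
{
	if (*headp == NULL) {
		n->next = n->prev = n;
	} else {
		node_t *head = *headp, *tail = head->prev;

		n->next = head;
		n->prev = tail;
		head->prev = n;
		tail->next = n;
	}
	*headp = n;
}

/* Forward walk (newest to oldest), as savectx()/forkctx() do. */
static void
walk_forward(node_t *head, void (*cb)(node_t *))
{
	node_t *n = head;

	if (head == NULL)
		return;
	do {
		cb(n);
		n = n->next;
	} while (n != head);
}

/* Reverse walk (oldest to newest), as restorectx() does. */
static void
walk_reverse(node_t *head, void (*cb)(node_t *))
{
	node_t *n, *tail;

	if (head == NULL)
		return;
	n = tail = head->prev;
	do {
		cb(n);
		n = n->prev;
	} while (n != tail);
}
```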
*/ mutex_enter(&t->t_ctx_lock); - prev_ctx = NULL; kpreempt_disable(); - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) { + + if (t->t_ctx == NULL) { + mutex_exit(&t->t_ctx_lock); + kpreempt_enable(); + return (0); + } + + ctx = head = t->t_ctx; + do { if (ctx->save_op == save && ctx->restore_op == restore && ctx->fork_op == fork && ctx->lwp_create_op == lwp_create && ctx->exit_op == exit && ctx->free_op == free && ctx->arg == arg) { - if (prev_ctx) - prev_ctx->next = ctx->next; - else + ctx->prev->next = ctx->next; + ctx->next->prev = ctx->prev; + if (ctx->next == ctx) { + /* last remaining item */ + t->t_ctx = NULL; + } else if (ctx == t->t_ctx) { + /* fix up head of list */ t->t_ctx = ctx->next; + } + ctx->next = ctx->prev = NULL; + mutex_exit(&t->t_ctx_lock); if (ctx->free_op != NULL) (ctx->free_op)(ctx->arg, 0); @@ -1114,44 +1159,70 @@ removectx( kpreempt_enable(); return (1); } - prev_ctx = ctx; - } + + ctx = ctx->next; + } while (ctx != head); + mutex_exit(&t->t_ctx_lock); kpreempt_enable(); - return (0); } void savectx(kthread_t *t) { - struct ctxop *ctx; - ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) - (ctx->save_op)(ctx->arg); + + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); + (ctx->save_op)(ctx->arg); + } + ctx = ctx->next; + } while (ctx != head); + } } void restorectx(kthread_t *t) { - struct ctxop *ctx; - ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) - (ctx->restore_op)(ctx->arg); + + if (t->t_ctx != NULL) { + struct ctxop *ctx, *tail; + + /* Backward traversal (starting at the tail) */ + ctx = tail = t->t_ctx->prev; + do { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); + (ctx->restore_op)(ctx->arg); + } + ctx = ctx->prev; + } while (ctx != tail); + } } void forkctx(kthread_t *t, kthread_t *ct) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->fork_op != NULL) - (ctx->fork_op)(t, ct); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->fork_op != NULL) { + (ctx->fork_op)(t, ct); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1162,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct) void lwp_createctx(kthread_t *t, kthread_t *ct) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->lwp_create_op != NULL) - (ctx->lwp_create_op)(t, ct); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->lwp_create_op != NULL) { + (ctx->lwp_create_op)(t, ct); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1179,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct) void exitctx(kthread_t *t) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->exit_op != NULL) - (ctx->exit_op)(t); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->exit_op != NULL) { + (ctx->exit_op)(t); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1193,14 +1278,21 @@ exitctx(kthread_t *t) void freectx(kthread_t *t, int isexec) { - struct ctxop *ctx; - kpreempt_disable(); - while ((ctx = t->t_ctx) != NULL) { - t->t_ctx = ctx->next; - if (ctx->free_op != NULL) - 
(ctx->free_op)(ctx->arg, isexec); - kmem_free(ctx, sizeof (struct ctxop)); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + ctx = head = t->t_ctx; + t->t_ctx = NULL; + do { + struct ctxop *next = ctx->next; + + if (ctx->free_op != NULL) { + (ctx->free_op)(ctx->arg, isexec); + } + kmem_free(ctx, sizeof (struct ctxop)); + ctx = next; + } while (ctx != head); } kpreempt_enable(); } @@ -1215,17 +1307,22 @@ freectx(kthread_t *t, int isexec) void freectx_ctx(struct ctxop *ctx) { - struct ctxop *nctx; + struct ctxop *head = ctx; ASSERT(ctx != NULL); kpreempt_disable(); + + head = ctx; do { - nctx = ctx->next; - if (ctx->free_op != NULL) + struct ctxop *next = ctx->next; + + if (ctx->free_op != NULL) { (ctx->free_op)(ctx->arg, 0); + } kmem_free(ctx, sizeof (struct ctxop)); - } while ((ctx = nctx) != NULL); + ctx = next; + } while (ctx != head); kpreempt_enable(); } @@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) return (on_rq); } + +/* + * There are occasions in the kernel when we need much more stack than we + * allocate by default, but we do not wish to have that work done + * asynchronously by another thread. To accommodate these scenarios, we allow + * for a split stack (also known as a "segmented stack") whereby a new stack + * is dynamically allocated and the current thread jumps onto it for purposes + * of executing the specified function. After the specified function returns, + * the stack is deallocated and control is returned to the caller. This + * functionality is implemented by thread_splitstack(), below; there are a few + * constraints on its use: + * + * - The caller must be in a context where it is safe to block for memory. + * - The caller cannot be in a t_onfault context + * - The called function must not call thread_exit() while on the split stack + * + * The code will explicitly panic if these constraints are violated. Notably, + * however, thread_splitstack() _can_ be called on a split stack -- there + * is no limit to the level that split stacks can nest. + * + * When the stack is split, it is constructed such that stack backtraces + * from kernel debuggers continue to function -- though note that DTrace's + * stack() action and stackdepth function will only show the stack up to and + * including thread_splitstack_run(); DTrace explicitly bounds itself to + * pointers that exist within the current declared stack as a safety + * mechanism. + */ +void +thread_splitstack(void (*func)(void *), void *arg, size_t stksize) +{ + kthread_t *t = curthread; + caddr_t ostk, ostkbase, stk; + ushort_t otflag; + + if (t->t_onfault != NULL) + panic("thread_splitstack: called with non-NULL t_onfault"); + + ostk = t->t_stk; + ostkbase = t->t_stkbase; + otflag = t->t_flag; + + stksize = roundup(stksize, PAGESIZE); + + if (stksize < default_stksize) + stksize = default_stksize; + + if (stksize == default_stksize) { + stk = (caddr_t)segkp_cache_get(segkp_thread); + } else { + stksize = roundup(stksize, PAGESIZE); + stk = (caddr_t)segkp_get(segkp, stksize, + (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED)); + } + + /* + * We're going to lock ourselves before we set T_SPLITSTK to assure + * that we're not swapped out in the meantime. (Note that we don't + * bother to set t_swap, as we're not going to be swapped out.) + */ + thread_lock(t); + + if (!(otflag & T_SPLITSTK)) + t->t_flag |= T_SPLITSTK; + + t->t_stk = stk + stksize; + t->t_stkbase = stk; + + thread_unlock(t); + + /* + * Now actually run on the new (split) stack... 
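A hedged sketch of how a kernel caller might use the thread_splitstack() interface added above: the deep work is wrapped in a function taking a single void * argument, and the caller supplies the desired stack size. The callback and sizes here are hypothetical; the constraints from the comment (safe to block for memory, no t_onfault, no thread_exit() on the split stack) still apply.

```c
/*
 * Hypothetical consumer of thread_splitstack(): run a stack-hungry
 * operation on a temporary 64K stack rather than the default one.
 */
typedef struct deep_args {
	void	*da_input;
	int	da_result;
} deep_args_t;

static void
deep_work(void *arg)
{
	deep_args_t *da = arg;

	/* ... recursion or large on-stack buffers go here ... */
	da->da_result = 0;
}

static int
do_deep_work(void *input)
{
	deep_args_t da = { .da_input = input, .da_result = -1 };

	thread_splitstack(deep_work, &da, 64 * 1024);
	return (da.da_result);
}
```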
+ */ + thread_splitstack_run(t->t_stk, func, arg); + + /* + * We're back onto our own stack; lock ourselves and restore our + * pre-split state. + */ + thread_lock(t); + + t->t_stk = ostk; + t->t_stkbase = ostkbase; + + if (!(otflag & T_SPLITSTK)) + t->t_flag &= ~T_SPLITSTK; + + thread_unlock(t); + + /* + * Now that we are entirely back on our own stack, call back into + * the platform layer to perform any platform-specific cleanup. + */ + thread_splitstack_cleanup(); + + segkp_release(segkp, stk); +} + /* * Tunable kmem_stackinfo is set, fill the kernel thread stack with a * specific pattern. diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c index 67ccc6922f..c840bdf31a 100644 --- a/usr/src/uts/common/disp/thread_intr.c +++ b/usr/src/uts/common/disp/thread_intr.c @@ -23,19 +23,10 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - /* - * FILE NOTICE BEGIN - * - * This file should not be modified. If you wish to modify it or have it - * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com-> - * (without anti-spam dashes) - * - * FILE NOTICE END + * Copyright 2015, Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/cpuvar.h> #include <sys/stack.h> #include <vm/seg_kp.h> @@ -44,6 +35,17 @@ #include <sys/sysmacros.h> /* + * Use a slightly larger thread stack size for interrupt threads rather than the + * default. This is useful for cases where the networking stack may do an rx and + * a tx in the context of a single interrupt and when combined with various + * promisc hooks that need memory, can cause us to get dangerously close to the + * edge of the traditional stack sizes. This is only a few pages more than a + * traditional stack and given that we don't have that many interrupt threads, + * the memory costs end up being more than worthwhile. + */ +#define LL_INTR_STKSZ (32 * 1024) + +/* * Create and initialize an interrupt thread. */ static void @@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp) { kthread_t *tp; - tp = thread_create(NULL, 0, + tp = thread_create(NULL, LL_INTR_STKSZ, (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0); /* @@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp) } /* - * Allocate a given number of interrupt threads for a given CPU. - * These threads will get freed by cpu_destroy_bound_threads() - * when CPU gets unconfigured. + * Allocate a given number of interrupt threads for a given CPU. These threads + * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured. + * + * Note, high level interrupts are always serviced using cpu_intr_stack and are + * not allowed to block. Low level interrupts or soft-interrupts use the + * kthread_t's that we create through the calls to thread_create_intr(). */ void cpu_intr_alloc(cpu_t *cp, int n) @@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n) thread_create_intr(cp); cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE, - KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + - INTR_STACK_SIZE - SA(MINFRAME); + KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + + INTR_STACK_SIZE - SA(MINFRAME); } diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c index bf65c3c42d..5d35b283d7 100644 --- a/usr/src/uts/common/disp/ts.c +++ b/usr/src/uts/common/disp/ts.c @@ -21,11 +21,11 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ #include <sys/types.h> #include <sys/param.h> @@ -229,7 +229,6 @@ static void ia_set_process_group(pid_t, pid_t, pid_t); static void ts_change_priority(kthread_t *, tsproc_t *); -extern pri_t ts_maxkmdpri; /* maximum kernel mode ts priority */ static pri_t ts_maxglobpri; /* maximum global priority used by ts class */ static kmutex_t ts_dptblock; /* protects time sharing dispatch table */ static kmutex_t ts_list_lock[TS_LISTS]; /* protects tsproc lists */ @@ -541,8 +540,8 @@ ts_admin(caddr_t uaddr, cred_t *reqpcredp) * to specified time-sharing priority. */ static int -ts_enterclass(kthread_t *t, id_t cid, void *parmsp, - cred_t *reqpcredp, void *bufp) +ts_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, + void *bufp) { tsparms_t *tsparmsp = (tsparms_t *)parmsp; tsproc_t *tspp; @@ -703,7 +702,7 @@ ts_fork(kthread_t *t, kthread_t *ct, void *bufp) TS_NEWUMDPRI(ctspp); ctspp->ts_nice = ptspp->ts_nice; ctspp->ts_dispwait = 0; - ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE); + ctspp->ts_flags = ptspp->ts_flags & ~(TSBACKQ | TSRESTORE); ctspp->ts_tp = ct; cpucaps_sc_init(&ctspp->ts_caps); thread_unlock(t); @@ -754,7 +753,6 @@ ts_forkret(kthread_t *t, kthread_t *ct) tspp->ts_dispwait = 0; t->t_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri; ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - tspp->ts_flags &= ~TSKPRI; THREAD_TRANSITION(t); ts_setrun(t); thread_unlock(t); @@ -1217,11 +1215,6 @@ ts_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp) TS_NEWUMDPRI(tspp); tspp->ts_nice = nice; - if ((tspp->ts_flags & TSKPRI) != 0) { - thread_unlock(tx); - return (0); - } - tspp->ts_dispwait = 0; ts_change_priority(tx, tspp); thread_unlock(tx); @@ -1237,7 +1230,7 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp) proc_t *p; pid_t pid, pgid, sid; pid_t on, off; - struct stdata *stp; + struct stdata *stp; int sess_held; /* @@ -1373,33 +1366,20 @@ static void ts_preempt(kthread_t *t) { tsproc_t *tspp = (tsproc_t *)(t->t_cldata); - klwp_t *lwp = curthread->t_lwp; + klwp_t *lwp = ttolwp(t); pri_t oldpri = t->t_pri; ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(curthread)); /* - * If preempted in the kernel, make sure the thread has - * a kernel priority if needed. - */ - if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) { - tspp->ts_flags |= TSKPRI; - THREAD_CHANGE_PRI(t, ts_kmdpris[0]); - ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - t->t_trapret = 1; /* so ts_trapret will run */ - aston(t); - } - - /* * This thread may be placed on wait queue by CPU Caps. In this case we * do not need to do anything until it is removed from the wait queue. - * Do not enforce CPU caps on threads running at a kernel priority */ if (CPUCAPS_ON()) { (void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE); - if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t)) + if (CPUCAPS_ENFORCE(t)) return; } @@ -1425,18 +1405,16 @@ ts_preempt(kthread_t *t) if (t->t_schedctl && schedctl_get_nopreempt(t)) { if (tspp->ts_timeleft > -SC_MAX_TICKS) { DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t); - if (!(tspp->ts_flags & TSKPRI)) { - /* - * If not already remembered, remember current - * priority for restoration in ts_yield(). 
- */ - if (!(tspp->ts_flags & TSRESTORE)) { - tspp->ts_scpri = t->t_pri; - tspp->ts_flags |= TSRESTORE; - } - THREAD_CHANGE_PRI(t, ts_maxumdpri); - t->t_schedflag |= TS_DONT_SWAP; + /* + * If not already remembered, remember current + * priority for restoration in ts_yield(). + */ + if (!(tspp->ts_flags & TSRESTORE)) { + tspp->ts_scpri = t->t_pri; + tspp->ts_flags |= TSRESTORE; } + THREAD_CHANGE_PRI(t, ts_maxumdpri); + t->t_schedflag |= TS_DONT_SWAP; schedctl_set_yield(t, 1); setfrontdq(t); goto done; @@ -1456,14 +1434,11 @@ ts_preempt(kthread_t *t) } } - if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == TSBACKQ) { + if ((tspp->ts_flags & TSBACKQ) != 0) { tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; tspp->ts_dispwait = 0; tspp->ts_flags &= ~TSBACKQ; setbackdq(t); - } else if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == (TSBACKQ|TSKPRI)) { - tspp->ts_flags &= ~TSBACKQ; - setbackdq(t); } else { setfrontdq(t); } @@ -1485,11 +1460,8 @@ ts_setrun(kthread_t *t) TS_NEWUMDPRI(tspp); tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; tspp->ts_dispwait = 0; - if ((tspp->ts_flags & TSKPRI) == 0) { - THREAD_CHANGE_PRI(t, - ts_dptbl[tspp->ts_umdpri].ts_globpri); - ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - } + THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri); + ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); } tspp->ts_flags &= ~TSBACKQ; @@ -1509,14 +1481,12 @@ ts_setrun(kthread_t *t) /* - * Prepare thread for sleep. We reset the thread priority so it will - * run at the kernel priority level when it wakes up. + * Prepare thread for sleep. */ static void ts_sleep(kthread_t *t) { tsproc_t *tspp = (tsproc_t *)(t->t_cldata); - int flags; pri_t old_pri = t->t_pri; ASSERT(t == curthread); @@ -1527,18 +1497,7 @@ ts_sleep(kthread_t *t) */ (void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE); - flags = tspp->ts_flags; - if (t->t_kpri_req) { - tspp->ts_flags = flags | TSKPRI; - THREAD_CHANGE_PRI(t, ts_kmdpris[0]); - ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - t->t_trapret = 1; /* so ts_trapret will run */ - aston(t); - } else if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) { - /* - * If thread has blocked in the kernel (as opposed to - * being merely preempted), recompute the user mode priority. 
- */ + if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) { tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret; TS_NEWUMDPRI(tspp); tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; @@ -1548,16 +1507,6 @@ ts_sleep(kthread_t *t) ts_dptbl[tspp->ts_umdpri].ts_globpri); ASSERT(curthread->t_pri >= 0 && curthread->t_pri <= ts_maxglobpri); - tspp->ts_flags = flags & ~TSKPRI; - - if (DISP_MUST_SURRENDER(curthread)) - cpu_surrender(curthread); - } else if (flags & TSKPRI) { - THREAD_CHANGE_PRI(curthread, - ts_dptbl[tspp->ts_umdpri].ts_globpri); - ASSERT(curthread->t_pri >= 0 && - curthread->t_pri <= ts_maxglobpri); - tspp->ts_flags = flags & ~TSKPRI; if (DISP_MUST_SURRENDER(curthread)) cpu_surrender(curthread); @@ -1594,9 +1543,9 @@ ts_swapin(kthread_t *t, int flags) time_t swapout_time; swapout_time = (ddi_get_lbolt() - t->t_stime) / hz; - if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET))) + if (INHERITED(t) || (tspp->ts_flags & TSIASET)) { epri = (long)DISP_PRIO(t) + swapout_time; - else { + } else { /* * Threads which have been out for a long time, * have high user mode priority and are associated @@ -1648,7 +1597,7 @@ ts_swapout(kthread_t *t, int flags) ASSERT(THREAD_LOCK_HELD(t)); - if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) || + if (INHERITED(t) || (tspp->ts_flags & TSIASET) || (t->t_proc_flag & TP_LWPEXIT) || (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC | TS_WAIT)) || @@ -1717,62 +1666,59 @@ ts_tick(kthread_t *t) */ if (CPUCAPS_ON()) { call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps, - CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI); + CPUCAPS_CHARGE_ENFORCE); } - if ((tspp->ts_flags & TSKPRI) == 0) { - if (--tspp->ts_timeleft <= 0) { - pri_t new_pri; + if (--tspp->ts_timeleft <= 0) { + pri_t new_pri; - /* - * If we're doing preemption control and trying to - * avoid preempting this thread, just note that - * the thread should yield soon and let it keep - * running (unless it's been a while). - */ - if (t->t_schedctl && schedctl_get_nopreempt(t)) { - if (tspp->ts_timeleft > -SC_MAX_TICKS) { - DTRACE_SCHED1(schedctl__nopreempt, - kthread_t *, t); - schedctl_set_yield(t, 1); - thread_unlock_nopreempt(t); - return; - } - - TNF_PROBE_2(schedctl_failsafe, - "schedctl TS ts_tick", /* CSTYLED */, - tnf_pid, pid, ttoproc(t)->p_pid, - tnf_lwpid, lwpid, t->t_tid); - } - tspp->ts_flags &= ~TSRESTORE; - tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp; - TS_NEWUMDPRI(tspp); - tspp->ts_dispwait = 0; - new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri; - ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri); - /* - * When the priority of a thread is changed, - * it may be necessary to adjust its position - * on a sleep queue or dispatch queue. - * The function thread_change_pri accomplishes - * this. - */ - if (thread_change_pri(t, new_pri, 0)) { - if ((t->t_schedflag & TS_LOAD) && - (lwp = t->t_lwp) && - lwp->lwp_state == LWP_USER) - t->t_schedflag &= ~TS_DONT_SWAP; - tspp->ts_timeleft = - ts_dptbl[tspp->ts_cpupri].ts_quantum; - } else { - call_cpu_surrender = B_TRUE; + /* + * If we're doing preemption control and trying to avoid + * preempting this thread, just note that the thread should + * yield soon and let it keep running (unless it's been a + * while). 
+ */ + if (t->t_schedctl && schedctl_get_nopreempt(t)) { + if (tspp->ts_timeleft > -SC_MAX_TICKS) { + DTRACE_SCHED1(schedctl__nopreempt, + kthread_t *, t); + schedctl_set_yield(t, 1); + thread_unlock_nopreempt(t); + return; } - TRACE_2(TR_FAC_DISP, TR_TICK, - "tick:tid %p old pri %d", t, oldpri); - } else if (t->t_state == TS_ONPROC && - t->t_pri < t->t_disp_queue->disp_maxrunpri) { + + TNF_PROBE_2(schedctl_failsafe, + "schedctl TS ts_tick", /* CSTYLED */, + tnf_pid, pid, ttoproc(t)->p_pid, + tnf_lwpid, lwpid, t->t_tid); + } + tspp->ts_flags &= ~TSRESTORE; + tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp; + TS_NEWUMDPRI(tspp); + tspp->ts_dispwait = 0; + new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri; + ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri); + /* + * When the priority of a thread is changed, it may be + * necessary to adjust its position on a sleep queue or + * dispatch queue. The function thread_change_pri accomplishes + * this. + */ + if (thread_change_pri(t, new_pri, 0)) { + if ((t->t_schedflag & TS_LOAD) && + (lwp = t->t_lwp) && + lwp->lwp_state == LWP_USER) + t->t_schedflag &= ~TS_DONT_SWAP; + tspp->ts_timeleft = + ts_dptbl[tspp->ts_cpupri].ts_quantum; + } else { call_cpu_surrender = B_TRUE; } + TRACE_2(TR_FAC_DISP, TR_TICK, + "tick:tid %p old pri %d", t, oldpri); + } else if (t->t_state == TS_ONPROC && + t->t_pri < t->t_disp_queue->disp_maxrunpri) { + call_cpu_surrender = B_TRUE; } if (call_cpu_surrender) { @@ -1785,11 +1731,8 @@ ts_tick(kthread_t *t) /* - * If thread is currently at a kernel mode priority (has slept) - * we assign it the appropriate user mode priority and time quantum - * here. If we are lowering the thread's priority below that of - * other runnable threads we will normally set runrun via cpu_surrender() to - * cause preemption. + * If we are lowering the thread's priority below that of other runnable + * threads we will normally set runrun via cpu_surrender() to cause preemption. */ static void ts_trapret(kthread_t *t) @@ -1803,7 +1746,6 @@ ts_trapret(kthread_t *t) ASSERT(cp->cpu_dispthread == t); ASSERT(t->t_state == TS_ONPROC); - t->t_kpri_req = 0; if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) { tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret; TS_NEWUMDPRI(tspp); @@ -1817,27 +1759,14 @@ ts_trapret(kthread_t *t) THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri); cp->cpu_dispatch_pri = DISP_PRIO(t); ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - tspp->ts_flags &= ~TSKPRI; - - if (DISP_MUST_SURRENDER(t)) - cpu_surrender(t); - } else if (tspp->ts_flags & TSKPRI) { - /* - * If thread has blocked in the kernel (as opposed to - * being merely preempted), recompute the user mode priority. - */ - THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri); - cp->cpu_dispatch_pri = DISP_PRIO(t); - ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - tspp->ts_flags &= ~TSKPRI; if (DISP_MUST_SURRENDER(t)) cpu_surrender(t); } /* - * Swapout lwp if the swapper is waiting for this thread to - * reach a safe point. + * Swapout lwp if the swapper is waiting for this thread to reach a + * safe point. 
*/ if ((t->t_schedflag & TS_SWAPENQ) && !(tspp->ts_flags & TSIASET)) { thread_unlock(t); @@ -1931,8 +1860,6 @@ ts_update_list(int i) tx->t_clfuncs != &ia_classfuncs.thread) goto next; tspp->ts_dispwait++; - if ((tspp->ts_flags & TSKPRI) != 0) - goto next; if (tspp->ts_dispwait <= ts_dptbl[tspp->ts_umdpri].ts_maxwait) goto next; if (tx->t_schedctl && schedctl_get_nopreempt(tx)) @@ -1968,12 +1895,7 @@ next: } /* - * Processes waking up go to the back of their queue. We don't - * need to assign a time quantum here because thread is still - * at a kernel mode priority and the time slicing is not done - * for threads running in the kernel after sleeping. The proper - * time quantum will be assigned by ts_trapret before the thread - * returns to user mode. + * Processes waking up go to the back of their queue. */ static void ts_wakeup(kthread_t *t) @@ -1984,46 +1906,27 @@ ts_wakeup(kthread_t *t) t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */ - if (tspp->ts_flags & TSKPRI) { - tspp->ts_flags &= ~TSBACKQ; + if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) { + tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret; + TS_NEWUMDPRI(tspp); + tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum; + tspp->ts_dispwait = 0; + THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri); + ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); + } + + tspp->ts_flags &= ~TSBACKQ; + + if (tspp->ts_flags & TSIA) { if (tspp->ts_flags & TSIASET) setfrontdq(t); else setbackdq(t); - } else if (t->t_kpri_req) { - /* - * Give thread a priority boost if we were asked. - */ - tspp->ts_flags |= TSKPRI; - THREAD_CHANGE_PRI(t, ts_kmdpris[0]); - setbackdq(t); - t->t_trapret = 1; /* so that ts_trapret will run */ - aston(t); } else { - if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) { - tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret; - TS_NEWUMDPRI(tspp); - tspp->ts_timeleft = - ts_dptbl[tspp->ts_cpupri].ts_quantum; - tspp->ts_dispwait = 0; - THREAD_CHANGE_PRI(t, - ts_dptbl[tspp->ts_umdpri].ts_globpri); - ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri); - } - - tspp->ts_flags &= ~TSBACKQ; - - if (tspp->ts_flags & TSIA) { - if (tspp->ts_flags & TSIASET) - setfrontdq(t); - else - setbackdq(t); - } else { - if (t->t_disp_time != ddi_get_lbolt()) - setbackdq(t); - else - setfrontdq(t); - } + if (t->t_disp_time != ddi_get_lbolt()) + setbackdq(t); + else + setfrontdq(t); } } @@ -2179,7 +2082,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp) * and background processes as non-interactive iff the session * leader is interactive. This routine is called from two places: * strioctl:SPGRP when a new process group gets - * control of the tty. + * control of the tty. * ia_parmsset-when the process in question is a session leader. * ia_set_process_group assumes that pidlock is held by the caller, * either strioctl or priocntlsys. 
If the caller is priocntlsys @@ -2189,7 +2092,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp) static void ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid) { - proc_t *leader, *fg, *bg; + proc_t *leader, *fg, *bg; tsproc_t *tspp; kthread_t *tx; int plocked = 0; @@ -2291,10 +2194,6 @@ ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid) tspp->ts_flags |= TSIASET; tspp->ts_boost = ia_boost; TS_NEWUMDPRI(tspp); - if ((tspp->ts_flags & TSKPRI) != 0) { - thread_unlock(tx); - continue; - } tspp->ts_dispwait = 0; ts_change_priority(tx, tspp); thread_unlock(tx); @@ -2344,10 +2243,6 @@ skip: tspp->ts_flags &= ~TSIASET; tspp->ts_boost = -ia_boost; TS_NEWUMDPRI(tspp); - if ((tspp->ts_flags & TSKPRI) != 0) { - thread_unlock(tx); - continue; - } tspp->ts_dispwait = 0; ts_change_priority(tx, tspp); |