Diffstat (limited to 'usr/src/uts/common/disp')
-rw-r--r--  usr/src/uts/common/disp/cmt.c            8
-rw-r--r--  usr/src/uts/common/disp/cpucaps.c      285
-rw-r--r--  usr/src/uts/common/disp/disp.c          22
-rw-r--r--  usr/src/uts/common/disp/fss.c          241
-rw-r--r--  usr/src/uts/common/disp/fx.c            12
-rw-r--r--  usr/src/uts/common/disp/priocntl.c       4
-rw-r--r--  usr/src/uts/common/disp/rt.c             9
-rw-r--r--  usr/src/uts/common/disp/rt_dptbl.c       4
-rw-r--r--  usr/src/uts/common/disp/sysdc.c         26
-rw-r--r--  usr/src/uts/common/disp/thread.c       312
-rw-r--r--  usr/src/uts/common/disp/thread_intr.c   37
-rw-r--r--  usr/src/uts/common/disp/ts.c           295
12 files changed, 732 insertions(+), 523 deletions(-)
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 0196b15dae..80b5340543 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp)
/*
* Return non-zero if thread can migrate between "from" and "to"
- * without a performance penalty
+ * without a performance penalty. On virtually any CPU this is true only if
+ * the two CPUs share a core; sharing only the last-level cache is
+ * insufficient to make migration possible without penalty.
*/
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
- if (from->cpu_physid->cpu_cacheid ==
- to->cpu_physid->cpu_cacheid)
+ if (from->cpu_physid->cpu_coreid ==
+ to->cpu_physid->cpu_coreid)
return (1);
return (0);
}
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..2a4365ff73 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -74,6 +75,32 @@
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization. We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
* Accounting
* ==========
*
@@ -203,18 +230,28 @@ static void caps_update();
*/
struct cap_kstat {
kstat_named_t cap_value;
+ kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
kstat_named_t cap_above;
+ kstat_named_t cap_above_base;
kstat_named_t cap_maxusage;
kstat_named_t cap_zonename;
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
+ { "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap)
ASSERT(CAP_ENABLED(cap));
waitq_block(&cap->cap_waitq);
+
+ /* do this first to avoid race with cap_kstat_update */
+ if (cap->cap_kstat != NULL) {
+ kstat_delete(cap->cap_kstat);
+ cap->cap_kstat = NULL;
+ }
+
list_remove(l, cap);
if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
- if (cap->cap_kstat != NULL) {
- kstat_delete(cap->cap_kstat);
- cap->cap_kstat = NULL;
- }
-
}
/*
@@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, it's
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were, decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
+
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
cap->cap_below++;
- if (!waitq_isempty(wq))
- waitq_runone(wq);
+ if (!waitq_isempty(wq)) {
+ int i, ndequeue, p;
+
+ /*
+ * Since this function is only called once per tick,
+ * we can hit a situation where we have artificially
+ * limited the project/zone below its cap. This would
+ * happen if we have multiple threads queued up but
+ * only dequeued one thread/tick. To avoid this we
+ * dequeue multiple threads, calculated based on the
+ * usage percentage of the cap. It is possible that we
+ * could dequeue too many threads and some of them
+ * might be put back on the wait queue quickly, but
+ * since we know that threads are on the wait queue
+ * because we're capping, we know that there are unused
+ * CPU cycles anyway, so this extra work would not
+ * hurt. Also, the ndequeue number is only an upper
+ * bound and we might dequeue fewer, depending on how
+ * many threads are actually in the wait queue. The
+ * ndequeue values are empirically derived and could be
+ * adjusted or calculated in another way if necessary.
+ */
+ p = (int)((100 * cap->cap_usage) / cap->cap_chk_value);
+ if (p >= 98)
+ ndequeue = 10;
+ else if (p >= 95)
+ ndequeue = 20;
+ else if (p >= 90)
+ ndequeue = 40;
+ else if (p >= 85)
+ ndequeue = 80;
+ else
+ ndequeue = 160;
+
+ for (i = 0; i < ndequeue; i++) {
+ waitq_runone(wq);
+ if (waitq_isempty(wq))
+ break;
+ }
+ DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i);
+ }
}
}
@@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
}
/*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= MAXCAP);
+ if (base_val > MAXCAP)
+ base_val = MAXCAP;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = base_val * cap_tick_cost;
+ if (value < 0 || value > cap->cap_value)
+ value = 0;
+
+ cap->cap_base = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone)
}
/*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t)
/*
* Convert internal cap statistics into values exported by cap kstat.
+ * Note that the kstat is held throughout this function but caps_lock is not.
*/
static int
cap_kstat_update(kstat_t *ksp, int rw)
@@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_value.value.ui64 =
ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_baseline.value.ui64 =
+ ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ capsp->cap_above_base.value.ui64 =
+ ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);
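
The block comment earlier in this file describes how the new effective/bursting kstats change over time. As a brief illustration (not part of the patch), the following user-space sketch polls them via libkstat; the "caps" module and "cpucaps_zone_<zoneid>" kstat names are assumptions based on how the existing cpucaps kstats are published, and the statistic names are taken from the cap_kstat table in the hunk above.

/*
 * Illustrative user-space sketch (not part of the patch): poll the new
 * bursting kstats once a second via libkstat.  The module name "caps" and
 * the per-zone kstat name "cpucaps_zone_<zoneid>" are assumptions based on
 * existing cpucaps kstat naming; the statistic names come from the
 * cap_kstat table above.  Build with: cc -o capwatch capwatch.c -lkstat
 */
#include <kstat.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	kstat_ctl_t *kc;
	char name[KSTAT_STRLEN];

	if (argc < 2) {
		(void) fprintf(stderr, "usage: %s <zoneid>\n", argv[0]);
		return (1);
	}
	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	(void) snprintf(name, sizeof (name), "cpucaps_zone_%s", argv[1]);

	for (;;) {
		kstat_t *ksp;
		kstat_named_t *eff, *burst, *above;

		if ((ksp = kstat_lookup(kc, "caps", -1, name)) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) fprintf(stderr, "kstat %s not found\n", name);
			break;
		}
		eff = kstat_data_lookup(ksp, "effective");
		burst = kstat_data_lookup(ksp, "bursting_sec");
		above = kstat_data_lookup(ksp, "above_base_sec");
		if (eff != NULL && burst != NULL && above != NULL) {
			(void) printf("effective=%llu bursting_sec=%llu "
			    "above_base_sec=%llu\n",
			    (u_longlong_t)eff->value.ui64,
			    (u_longlong_t)burst->value.ui64,
			    (u_longlong_t)above->value.ui64);
		}
		(void) sleep(1);
	}
	(void) kstat_close(kc);
	return (0);
}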
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index a4b49fa86d..7e933bccc4 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -110,7 +110,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri);
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
- * will either be the max sysclass pri + 1 or the min interrupt pri.
+ * will either be the max sysclass pri or the min interrupt pri.
*/
int only_intr_kpreempt;
@@ -257,7 +257,23 @@ dispinit(void)
maxglobpri = cl_maxglobpri;
}
}
- kpreemptpri = (pri_t)v.v_maxsyspri + 1;
+
+ /*
+ * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
+ * to say, maxclsyspri + 1. However, over time, the system has used
+ * more and more asynchronous kernel threads, with an increasing number
+ * of these doing work on direct behalf of higher-level software (e.g.,
+ * network processing). This has led to potential priority inversions:
+ * threads doing low-priority lengthy kernel work can effectively
+ * delay kernel-level processing of higher-priority data. To minimize
+ * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
+ * the kernel that runs at maxclsyspri will therefore induce kernel
+ * preemption, and this priority should be used if/when an asynchronous
+ * thread (or, as is often the case, task queue) is performing a task
+ * on behalf of higher-level software (or any task that is otherwise
+ * latency-sensitive).
+ */
+ kpreemptpri = (pri_t)v.v_maxsyspri;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
@@ -2259,7 +2275,7 @@ disp_getbest(disp_t *dp)
* placed earlier.
*/
if (tcp == NULL ||
- pri >= minclsyspri ||
+ (pri >= minclsyspri && tp->t_procp == &p0) ||
tp->t_cpu != tcp)
break;
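
The dispinit() comment above recommends that asynchronous kernel work done on behalf of higher-level software run at maxclsyspri so that, with kpreemptpri now equal to v_maxsyspri, it induces kernel preemption. Below is a minimal, hypothetical sketch (not part of the patch) of what that looks like for a consumer using a task queue; the taskq name and handler are invented for illustration.

/*
 * Hypothetical sketch (not part of the patch): a kernel consumer that wants
 * its asynchronous work to benefit from the new kpreemptpri setting creates
 * its worker taskq at maxclsyspri.  The taskq name and handler here are
 * invented for illustration.
 */
#include <sys/types.h>
#include <sys/taskq.h>
#include <sys/disp.h>

static taskq_t *example_tq;

static void
example_rx_work(void *arg)
{
	/* ... process a received frame on behalf of higher-level software ... */
}

static void
example_init(void)
{
	/*
	 * Threads in this taskq run at maxclsyspri, which (with kpreemptpri
	 * now equal to v_maxsyspri) means they preempt lower-priority kernel
	 * work rather than queueing behind it.
	 */
	example_tq = taskq_create("example_rx_tq", 1, maxclsyspri, 1, 1, 0);
}

static void
example_dispatch(void *frame)
{
	(void) taskq_dispatch(example_tq, example_rx_work, frame, TQ_NOSLEEP);
}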
diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c
index 15aeda6d00..05f358e6d4 100644
--- a/usr/src/uts/common/disp/fss.c
+++ b/usr/src/uts/common/disp/fss.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -1212,9 +1212,9 @@ fss_decay_usage()
* If there is only one zone active on the pset
* the above reduces to:
*
- * zone_int_shares^2
+ * zone_int_shares^2
* shusage = usage * ---------------------
- * kpj_shares^2
+ * kpj_shares^2
*
* If there's only one project active in the
* zone this formula reduces to:
@@ -1373,8 +1373,6 @@ fss_update_list(int i)
*/
if (t->t_cid != fss_cid)
goto next;
- if ((fssproc->fss_flags & FSSKPRI) != 0)
- goto next;
fssproj = FSSPROC2FSSPROJ(fssproc);
if (fssproj == NULL)
@@ -1889,7 +1887,7 @@ fss_fork(kthread_t *pt, kthread_t *ct, void *bufp)
cpucaps_sc_init(&cfssproc->fss_caps);
cfssproc->fss_flags =
- pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
+ pfssproc->fss_flags & ~(FSSBACKQ | FSSRESTORE);
ct->t_cldata = (void *)cfssproc;
ct->t_schedflag |= TS_RUNQMATCH;
thread_unlock(pt);
@@ -1940,7 +1938,6 @@ fss_forkret(kthread_t *t, kthread_t *ct)
fssproc->fss_timeleft = fss_quantum;
t->t_pri = fssproc->fss_umdpri;
ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- fssproc->fss_flags &= ~FSSKPRI;
THREAD_TRANSITION(t);
/*
@@ -2039,11 +2036,6 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
fssproc->fss_nice = nice;
fss_newpri(fssproc, B_FALSE);
- if ((fssproc->fss_flags & FSSKPRI) != 0) {
- thread_unlock(t);
- return (0);
- }
-
fss_change_priority(t, fssproc);
thread_unlock(t);
return (0);
@@ -2158,7 +2150,7 @@ fss_swapin(kthread_t *t, int flags)
time_t swapout_time;
swapout_time = (ddi_get_lbolt() - t->t_stime) / hz;
- if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI)) {
+ if (INHERITED(t)) {
epri = (long)DISP_PRIO(t) + swapout_time;
} else {
/*
@@ -2190,7 +2182,6 @@ fss_swapin(kthread_t *t, int flags)
static pri_t
fss_swapout(kthread_t *t, int flags)
{
- fssproc_t *fssproc = FSSPROC(t);
long epri = -1;
proc_t *pp = ttoproc(t);
time_t swapin_time;
@@ -2198,7 +2189,6 @@ fss_swapout(kthread_t *t, int flags)
ASSERT(THREAD_LOCK_HELD(t));
if (INHERITED(t) ||
- (fssproc->fss_flags & FSSKPRI) ||
(t->t_proc_flag & TP_LWPEXIT) ||
(t->t_state & (TS_ZOMB|TS_FREE|TS_STOPPED|TS_ONPROC|TS_WAIT)) ||
!(t->t_schedflag & TS_LOAD) ||
@@ -2241,16 +2231,11 @@ fss_swapout(kthread_t *t, int flags)
}
/*
- * If thread is currently at a kernel mode priority (has slept) and is
- * returning to the userland we assign it the appropriate user mode priority
- * and time quantum here. If we're lowering the thread's priority below that
- * of other runnable threads then we will set runrun via cpu_surrender() to
- * cause preemption.
+ * Run swap-out checks when returning to userspace.
*/
static void
fss_trapret(kthread_t *t)
{
- fssproc_t *fssproc = FSSPROC(t);
cpu_t *cp = CPU;
ASSERT(THREAD_LOCK_HELD(t));
@@ -2258,20 +2243,6 @@ fss_trapret(kthread_t *t)
ASSERT(cp->cpu_dispthread == t);
ASSERT(t->t_state == TS_ONPROC);
- t->t_kpri_req = 0;
- if (fssproc->fss_flags & FSSKPRI) {
- /*
- * If thread has blocked in the kernel
- */
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- cp->cpu_dispatch_pri = DISP_PRIO(t);
- ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- fssproc->fss_flags &= ~FSSKPRI;
-
- if (DISP_MUST_SURRENDER(t))
- cpu_surrender(t);
- }
-
/*
* Swapout lwp if the swapper is waiting for this thread to reach
* a safe point.
@@ -2299,19 +2270,6 @@ fss_preempt(kthread_t *t)
ASSERT(t->t_state == TS_ONPROC);
/*
- * If preempted in the kernel, make sure the thread has a kernel
- * priority if needed.
- */
- lwp = curthread->t_lwp;
- if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) {
- fssproc->fss_flags |= FSSKPRI;
- THREAD_CHANGE_PRI(t, minclsyspri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
- }
-
- /*
* This thread may be placed on wait queue by CPU Caps. In this case we
* do not need to do anything until it is removed from the wait queue.
* Do not enforce CPU caps on threads running at a kernel priority
@@ -2320,7 +2278,7 @@ fss_preempt(kthread_t *t)
(void) cpucaps_charge(t, &fssproc->fss_caps,
CPUCAPS_CHARGE_ENFORCE);
- if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
+ if (CPUCAPS_ENFORCE(t))
return;
}
@@ -2329,6 +2287,7 @@ fss_preempt(kthread_t *t)
* cannot be holding any kernel locks.
*/
ASSERT(t->t_schedflag & TS_DONT_SWAP);
+ lwp = ttolwp(t);
if (lwp != NULL && lwp->lwp_state == LWP_USER)
t->t_schedflag &= ~TS_DONT_SWAP;
@@ -2346,18 +2305,16 @@ fss_preempt(kthread_t *t)
if (t->t_schedctl && schedctl_get_nopreempt(t)) {
if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
- if (!(fssproc->fss_flags & FSSKPRI)) {
- /*
- * If not already remembered, remember current
- * priority for restoration in fss_yield().
- */
- if (!(fssproc->fss_flags & FSSRESTORE)) {
- fssproc->fss_scpri = t->t_pri;
- fssproc->fss_flags |= FSSRESTORE;
- }
- THREAD_CHANGE_PRI(t, fss_maxumdpri);
- t->t_schedflag |= TS_DONT_SWAP;
+ /*
+ * If not already remembered, remember current
+ * priority for restoration in fss_yield().
+ */
+ if (!(fssproc->fss_flags & FSSRESTORE)) {
+ fssproc->fss_scpri = t->t_pri;
+ fssproc->fss_flags |= FSSRESTORE;
}
+ THREAD_CHANGE_PRI(t, fss_maxumdpri);
+ t->t_schedflag |= TS_DONT_SWAP;
schedctl_set_yield(t, 1);
setfrontdq(t);
return;
@@ -2374,15 +2331,12 @@ fss_preempt(kthread_t *t)
}
}
- flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI);
+ flags = fssproc->fss_flags & FSSBACKQ;
if (flags == FSSBACKQ) {
fssproc->fss_timeleft = fss_quantum;
fssproc->fss_flags &= ~FSSBACKQ;
setbackdq(t);
- } else if (flags == (FSSBACKQ | FSSKPRI)) {
- fssproc->fss_flags &= ~FSSBACKQ;
- setbackdq(t);
} else {
setfrontdq(t);
}
@@ -2404,12 +2358,7 @@ fss_setrun(kthread_t *t)
fssproc->fss_timeleft = fss_quantum;
fssproc->fss_flags &= ~FSSBACKQ;
- /*
- * If previously were running at the kernel priority then keep that
- * priority and the fss_timeleft doesn't matter.
- */
- if ((fssproc->fss_flags & FSSKPRI) == 0)
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
+ THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
if (t->t_disp_time != ddi_get_lbolt())
setbackdq(t);
@@ -2418,8 +2367,7 @@ fss_setrun(kthread_t *t)
}
/*
- * Prepare thread for sleep. We reset the thread priority so it will run at the
- * kernel priority level when it wakes up.
+ * Prepare thread for sleep.
*/
static void
fss_sleep(kthread_t *t)
@@ -2437,31 +2385,6 @@ fss_sleep(kthread_t *t)
(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);
fss_inactive(t);
-
- /*
- * Assign a system priority to the thread and arrange for it to be
- * retained when the thread is next placed on the run queue (i.e.,
- * when it wakes up) instead of being given a new pri. Also arrange
- * for trapret processing as the thread leaves the system call so it
- * will drop back to normal priority range.
- */
- if (t->t_kpri_req) {
- THREAD_CHANGE_PRI(t, minclsyspri);
- fssproc->fss_flags |= FSSKPRI;
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
- } else if (fssproc->fss_flags & FSSKPRI) {
- /*
- * The thread has done a THREAD_KPRI_REQUEST(), slept, then
- * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again),
- * then slept again all without finishing the current system
- * call so trapret won't have cleared FSSKPRI
- */
- fssproc->fss_flags &= ~FSSKPRI;
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- if (DISP_MUST_SURRENDER(curthread))
- cpu_surrender(t);
- }
t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */
}
@@ -2503,67 +2426,56 @@ fss_tick(kthread_t *t)
* Do not surrender CPU if running in the SYS class.
*/
if (CPUCAPS_ON()) {
- cpucaps_enforce = cpucaps_charge(t,
- &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
- !(fssproc->fss_flags & FSSKPRI);
+ cpucaps_enforce = cpucaps_charge(t, &fssproc->fss_caps,
+ CPUCAPS_CHARGE_ENFORCE);
}
- /*
- * A thread's execution time for threads running in the SYS class
- * is not tracked.
- */
- if ((fssproc->fss_flags & FSSKPRI) == 0) {
+ if (--fssproc->fss_timeleft <= 0) {
+ pri_t new_pri;
+
/*
- * If thread is not in kernel mode, decrement its fss_timeleft
+ * If we're doing preemption control and trying to avoid
+ * preempting this thread, just note that the thread should
+ * yield soon and let it keep running (unless it's been a
+ * while).
*/
- if (--fssproc->fss_timeleft <= 0) {
- pri_t new_pri;
-
- /*
- * If we're doing preemption control and trying to
- * avoid preempting this thread, just note that the
- * thread should yield soon and let it keep running
- * (unless it's been a while).
- */
- if (t->t_schedctl && schedctl_get_nopreempt(t)) {
- if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
- DTRACE_SCHED1(schedctl__nopreempt,
- kthread_t *, t);
- schedctl_set_yield(t, 1);
- thread_unlock_nopreempt(t);
- return;
- }
+ if (t->t_schedctl && schedctl_get_nopreempt(t)) {
+ if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
+ DTRACE_SCHED1(schedctl__nopreempt,
+ kthread_t *, t);
+ schedctl_set_yield(t, 1);
+ thread_unlock_nopreempt(t);
+ return;
}
- fssproc->fss_flags &= ~FSSRESTORE;
+ }
+ fssproc->fss_flags &= ~FSSRESTORE;
- fss_newpri(fssproc, B_TRUE);
- new_pri = fssproc->fss_umdpri;
- ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
+ fss_newpri(fssproc, B_TRUE);
+ new_pri = fssproc->fss_umdpri;
+ ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
- /*
- * When the priority of a thread is changed, it may
- * be necessary to adjust its position on a sleep queue
- * or dispatch queue. The function thread_change_pri
- * accomplishes this.
- */
- if (thread_change_pri(t, new_pri, 0)) {
- if ((t->t_schedflag & TS_LOAD) &&
- (lwp = t->t_lwp) &&
- lwp->lwp_state == LWP_USER)
- t->t_schedflag &= ~TS_DONT_SWAP;
- fssproc->fss_timeleft = fss_quantum;
- } else {
- call_cpu_surrender = B_TRUE;
- }
- } else if (t->t_state == TS_ONPROC &&
- t->t_pri < t->t_disp_queue->disp_maxrunpri) {
- /*
- * If there is a higher-priority thread which is
- * waiting for a processor, then thread surrenders
- * the processor.
- */
+ /*
+ * When the priority of a thread is changed, it may be
+ * necessary to adjust its position on a sleep queue or
+ * dispatch queue. The function thread_change_pri accomplishes
+ * this.
+ */
+ if (thread_change_pri(t, new_pri, 0)) {
+ if ((t->t_schedflag & TS_LOAD) &&
+ (lwp = t->t_lwp) &&
+ lwp->lwp_state == LWP_USER)
+ t->t_schedflag &= ~TS_DONT_SWAP;
+ fssproc->fss_timeleft = fss_quantum;
+ } else {
call_cpu_surrender = B_TRUE;
}
+ } else if (t->t_state == TS_ONPROC &&
+ t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+ /*
+ * If there is a higher-priority thread which is waiting for a
+ * processor, then thread surrenders the processor.
+ */
+ call_cpu_surrender = B_TRUE;
}
if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
@@ -2618,32 +2530,13 @@ fss_wakeup(kthread_t *t)
fssproc = FSSPROC(t);
fssproc->fss_flags &= ~FSSBACKQ;
- if (fssproc->fss_flags & FSSKPRI) {
- /*
- * If we already have a kernel priority assigned, then we
- * just use it.
- */
- setbackdq(t);
- } else if (t->t_kpri_req) {
- /*
- * Give thread a priority boost if we were asked.
- */
- fssproc->fss_flags |= FSSKPRI;
- THREAD_CHANGE_PRI(t, minclsyspri);
- setbackdq(t);
- t->t_trapret = 1; /* so that fss_trapret will run */
- aston(t);
+ /* Recalculate the priority. */
+ if (t->t_disp_time == ddi_get_lbolt()) {
+ setfrontdq(t);
} else {
- /*
- * Otherwise, we recalculate the priority.
- */
- if (t->t_disp_time == ddi_get_lbolt()) {
- setfrontdq(t);
- } else {
- fssproc->fss_timeleft = fss_quantum;
- THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
- setbackdq(t);
- }
+ fssproc->fss_timeleft = fss_quantum;
+ THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
+ setbackdq(t);
}
}
diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c
index adb70871e2..5b190242e6 100644
--- a/usr/src/uts/common/disp/fx.c
+++ b/usr/src/uts/common/disp/fx.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -71,16 +71,6 @@ static struct modlinkage modlinkage = {
};
-/*
- * control flags (kparms->fx_cflags).
- */
-#define FX_DOUPRILIM 0x01 /* change user priority limit */
-#define FX_DOUPRI 0x02 /* change user priority */
-#define FX_DOTQ 0x04 /* change FX time quantum */
-
-
-#define FXMAXUPRI 60 /* maximum user priority setting */
-
#define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */
/*
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 5412df83f5..60e870ba28 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg)
#endif
-static int donice(procset_t *, pcnice_t *);
+int donice(procset_t *, pcnice_t *);
static int doprio(procset_t *, pcprio_t *);
static int proccmp(proc_t *, struct pcmpargs *);
static int setparms(proc_t *, struct stprmargs *);
@@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice)
/*
* Update the nice value of the specified LWP or set of processes.
*/
-static int
+int
donice(procset_t *procset, pcnice_t *pcnice)
{
int err_proc = 0;
diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c
index f87f8c56ce..115e42ccb8 100644
--- a/usr/src/uts/common/disp/rt.c
+++ b/usr/src/uts/common/disp/rt.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -103,13 +103,6 @@ _info(struct modinfo *modinfop)
pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */
rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */
-/*
- * control flags (kparms->rt_cflags).
- */
-#define RT_DOPRI 0x01 /* change priority */
-#define RT_DOTQ 0x02 /* change RT time quantum */
-#define RT_DOSIG 0x04 /* change RT time quantum signal */
-
static int rt_admin(caddr_t, cred_t *);
static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static int rt_fork(kthread_t *, kthread_t *, void *);
diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c
index 768b499ef2..cc88ed72fc 100644
--- a/usr/src/uts/common/disp/rt_dptbl.c
+++ b/usr/src/uts/common/disp/rt_dptbl.c
@@ -28,8 +28,6 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
/* All Rights Reserved */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/proc.h>
#include <sys/priocntl.h>
#include <sys/class.h>
@@ -70,8 +68,6 @@ _info(struct modinfo *modinfop)
return (mod_info(&modlinkage, modinfop));
}
-#define RTGPPRIO0 100 /* Global priority for RT priority 0 */
-
rtdpent_t config_rt_dptbl[] = {
/* prilevel Time quantum */
diff --git a/usr/src/uts/common/disp/sysdc.c b/usr/src/uts/common/disp/sysdc.c
index 40cde57856..1f50788ceb 100644
--- a/usr/src/uts/common/disp/sysdc.c
+++ b/usr/src/uts/common/disp/sysdc.c
@@ -193,32 +193,6 @@
* flag. This flag currently has no effect, but marks threads which
* do bulk processing.
*
- * - t_kpri_req
- *
- * The TS and FSS scheduling classes pay attention to t_kpri_req,
- * which provides a simple form of priority inheritance for
- * synchronization primitives (such as rwlocks held as READER) which
- * cannot be traced to a unique thread. The SDC class does not honor
- * t_kpri_req, for a few reasons:
- *
- * 1. t_kpri_req is notoriously inaccurate. A measure of its
- * inaccuracy is that it needs to be cleared every time a thread
- * returns to user mode, because it is frequently non-zero at that
- * point. This can happen because "ownership" of synchronization
- * primitives that use t_kpri_req can be silently handed off,
- * leaving no opportunity to will the t_kpri_req inheritance.
- *
- * 2. Unlike in TS and FSS, threads in SDC *will* eventually run at
- * kernel priority. This means that even if an SDC thread
- * is holding a synchronization primitive and running at low
- * priority, its priority will eventually be raised above 60,
- * allowing it to drive on and release the resource.
- *
- * 3. The first consumer of SDC uses the taskq subsystem, which holds
- * a reader lock for the duration of the task's execution. This
- * would mean that SDC threads would never drop below kernel
- * priority in practice, which defeats one of the purposes of SDC.
- *
* - Why not FSS?
*
* It might seem that the existing FSS scheduling class could solve
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index cfcb28aa0a..bf1f121b67 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -77,6 +77,10 @@
#include <sys/ctype.h>
#include <sys/smt.h>
+#ifndef STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
+
struct kmem_cache *thread_cache; /* cache of free threads */
struct kmem_cache *lwp_cache; /* cache of free lwps */
struct kmem_cache *turnstile_cache; /* cache of free turnstiles */
@@ -374,7 +378,7 @@ thread_create(
if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
cmn_err(CE_PANIC, "thread_create: proposed stack size"
" too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
stksize &= -PTR24_ALIGN; /* make thread aligned */
t = (kthread_t *)(stk + stksize);
@@ -383,13 +387,6 @@ thread_create(
audit_thread_create(t);
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else /* stack grows to larger addresses */
- stksize -= SA(sizeof (kthread_t));
- t = (kthread_t *)(stk);
- bzero(t, sizeof (kthread_t));
- t->t_stk = stk + sizeof (kthread_t);
- t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif /* STACK_GROWTH_DOWN */
t->t_flag |= T_TALLOCSTK;
t->t_swap = stk;
} else {
@@ -402,13 +399,8 @@ thread_create(
* Initialize t_stk to the kernel stack pointer to use
* upon entry to the kernel
*/
-#ifdef STACK_GROWTH_DOWN
t->t_stk = stk + stksize;
t->t_stkbase = stk;
-#else
- t->t_stk = stk; /* 3b2-like */
- t->t_stkbase = stk + stksize;
-#endif /* STACK_GROWTH_DOWN */
}
if (kmem_stackinfo != 0) {
@@ -588,6 +580,9 @@ thread_exit(void)
if ((t->t_proc_flag & TP_ZTHREAD) != 0)
cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
+ if ((t->t_flag & T_SPLITSTK) != 0)
+ cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
tsd_exit(); /* Clean up this thread's TSD */
kcpc_passivate(); /* clean up performance counter state */
@@ -1053,8 +1048,44 @@ installctx(
ctx->exit_op = exit;
ctx->free_op = free;
ctx->arg = arg;
- ctx->next = t->t_ctx;
+ ctx->save_ts = 0;
+ ctx->restore_ts = 0;
+
+ /*
+ * Keep ctxops in a doubly-linked list to allow traversal in both
+ * directions. Using only the newest-to-oldest ordering was adequate
+ * previously, but reversing the order for restore_op actions is
+ * necessary if later-added ctxops depend on earlier ones.
+ *
+ * One example of such a dependency: Hypervisor software handling the
+ * guest FPU expects that it save FPU state prior to host FPU handling
+ * and consequently handle the guest logic _after_ the host FPU has
+ * been restored.
+ *
+ * The t_ctx member points to the most recently added ctxop or is NULL
+ * if no ctxops are associated with the thread. The 'next' pointers
+ * form a loop of the ctxops in newest-to-oldest order. The 'prev'
+ * pointers form a loop in the reverse direction, where t_ctx->prev is
+ * the oldest entry associated with the thread.
+ *
+ * The protection of kpreempt_disable is required to safely perform the
+ * list insertion, since there are inconsistent states between some of
+ * the pointer assignments.
+ */
+ kpreempt_disable();
+ if (t->t_ctx == NULL) {
+ ctx->next = ctx;
+ ctx->prev = ctx;
+ } else {
+ struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev;
+
+ ctx->next = head;
+ ctx->prev = tail;
+ head->prev = ctx;
+ tail->next = ctx;
+ }
t->t_ctx = ctx;
+ kpreempt_enable();
}
/*
@@ -1071,7 +1102,7 @@ removectx(
void (*exit)(void *),
void (*free)(void *, int))
{
- struct ctxop *ctx, *prev_ctx;
+ struct ctxop *ctx, *head;
/*
* The incoming kthread_t (which is the thread for which the
@@ -1096,17 +1127,31 @@ removectx(
* and the target thread from racing with each other during lwp exit.
*/
mutex_enter(&t->t_ctx_lock);
- prev_ctx = NULL;
kpreempt_disable();
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
+
+ if (t->t_ctx == NULL) {
+ mutex_exit(&t->t_ctx_lock);
+ kpreempt_enable();
+ return (0);
+ }
+
+ ctx = head = t->t_ctx;
+ do {
if (ctx->save_op == save && ctx->restore_op == restore &&
ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
ctx->exit_op == exit && ctx->free_op == free &&
ctx->arg == arg) {
- if (prev_ctx)
- prev_ctx->next = ctx->next;
- else
+ ctx->prev->next = ctx->next;
+ ctx->next->prev = ctx->prev;
+ if (ctx->next == ctx) {
+ /* last remaining item */
+ t->t_ctx = NULL;
+ } else if (ctx == t->t_ctx) {
+ /* fix up head of list */
t->t_ctx = ctx->next;
+ }
+ ctx->next = ctx->prev = NULL;
+
mutex_exit(&t->t_ctx_lock);
if (ctx->free_op != NULL)
(ctx->free_op)(ctx->arg, 0);
@@ -1114,44 +1159,70 @@ removectx(
kpreempt_enable();
return (1);
}
- prev_ctx = ctx;
- }
+
+ ctx = ctx->next;
+ } while (ctx != head);
+
mutex_exit(&t->t_ctx_lock);
kpreempt_enable();
-
return (0);
}
void
savectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->save_op != NULL)
- (ctx->save_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->save_op != NULL) {
+ ctx->save_ts = gethrtime_unscaled();
+ (ctx->save_op)(ctx->arg);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
void
restorectx(kthread_t *t)
{
- struct ctxop *ctx;
-
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->restore_op != NULL)
- (ctx->restore_op)(ctx->arg);
+
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *tail;
+
+ /* Backward traversal (starting at the tail) */
+ ctx = tail = t->t_ctx->prev;
+ do {
+ if (ctx->restore_op != NULL) {
+ ctx->restore_ts = gethrtime_unscaled();
+ (ctx->restore_op)(ctx->arg);
+ }
+ ctx = ctx->prev;
+ } while (ctx != tail);
+ }
}
void
forkctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->fork_op != NULL)
- (ctx->fork_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->fork_op != NULL) {
+ (ctx->fork_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1162,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct)
void
lwp_createctx(kthread_t *t, kthread_t *ct)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->lwp_create_op != NULL)
- (ctx->lwp_create_op)(t, ct);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->lwp_create_op != NULL) {
+ (ctx->lwp_create_op)(t, ct);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1179,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct)
void
exitctx(kthread_t *t)
{
- struct ctxop *ctx;
-
- for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
- if (ctx->exit_op != NULL)
- (ctx->exit_op)(t);
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ /* Forward traversal */
+ ctx = head = t->t_ctx;
+ do {
+ if (ctx->exit_op != NULL) {
+ (ctx->exit_op)(t);
+ }
+ ctx = ctx->next;
+ } while (ctx != head);
+ }
}
/*
@@ -1193,14 +1278,21 @@ exitctx(kthread_t *t)
void
freectx(kthread_t *t, int isexec)
{
- struct ctxop *ctx;
-
kpreempt_disable();
- while ((ctx = t->t_ctx) != NULL) {
- t->t_ctx = ctx->next;
- if (ctx->free_op != NULL)
- (ctx->free_op)(ctx->arg, isexec);
- kmem_free(ctx, sizeof (struct ctxop));
+ if (t->t_ctx != NULL) {
+ struct ctxop *ctx, *head;
+
+ ctx = head = t->t_ctx;
+ t->t_ctx = NULL;
+ do {
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
+ (ctx->free_op)(ctx->arg, isexec);
+ }
+ kmem_free(ctx, sizeof (struct ctxop));
+ ctx = next;
+ } while (ctx != head);
}
kpreempt_enable();
}
@@ -1215,17 +1307,22 @@ freectx(kthread_t *t, int isexec)
void
freectx_ctx(struct ctxop *ctx)
{
- struct ctxop *nctx;
+ struct ctxop *head = ctx;
ASSERT(ctx != NULL);
kpreempt_disable();
+
+ head = ctx;
do {
- nctx = ctx->next;
- if (ctx->free_op != NULL)
+ struct ctxop *next = ctx->next;
+
+ if (ctx->free_op != NULL) {
(ctx->free_op)(ctx->arg, 0);
+ }
kmem_free(ctx, sizeof (struct ctxop));
- } while ((ctx = nctx) != NULL);
+ ctx = next;
+ } while (ctx != head);
kpreempt_enable();
}
@@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
return (on_rq);
}
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread. To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function. After the specified function returns,
+ * the stack is deallocated and control is returned to the caller. This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context
+ * - The called function must not call thread_exit() while on the split stack
+ *
+ * The code will explicitly panic if these constraints are violated. Notably,
+ * however, thread_splitstack() _can_ be called on a split stack -- there
+ * is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+ kthread_t *t = curthread;
+ caddr_t ostk, ostkbase, stk;
+ ushort_t otflag;
+
+ if (t->t_onfault != NULL)
+ panic("thread_splitstack: called with non-NULL t_onfault");
+
+ ostk = t->t_stk;
+ ostkbase = t->t_stkbase;
+ otflag = t->t_flag;
+
+ stksize = roundup(stksize, PAGESIZE);
+
+ if (stksize < default_stksize)
+ stksize = default_stksize;
+
+ if (stksize == default_stksize) {
+ stk = (caddr_t)segkp_cache_get(segkp_thread);
+ } else {
+ stksize = roundup(stksize, PAGESIZE);
+ stk = (caddr_t)segkp_get(segkp, stksize,
+ (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+ }
+
+ /*
+ * We're going to lock ourselves before we set T_SPLITSTK to assure
+ * that we're not swapped out in the meantime. (Note that we don't
+ * bother to set t_swap, as we're not going to be swapped out.)
+ */
+ thread_lock(t);
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag |= T_SPLITSTK;
+
+ t->t_stk = stk + stksize;
+ t->t_stkbase = stk;
+
+ thread_unlock(t);
+
+ /*
+ * Now actually run on the new (split) stack...
+ */
+ thread_splitstack_run(t->t_stk, func, arg);
+
+ /*
+ * We're back onto our own stack; lock ourselves and restore our
+ * pre-split state.
+ */
+ thread_lock(t);
+
+ t->t_stk = ostk;
+ t->t_stkbase = ostkbase;
+
+ if (!(otflag & T_SPLITSTK))
+ t->t_flag &= ~T_SPLITSTK;
+
+ thread_unlock(t);
+
+ /*
+ * Now that we are entirely back on our own stack, call back into
+ * the platform layer to perform any platform-specific cleanup.
+ */
+ thread_splitstack_cleanup();
+
+ segkp_release(segkp, stk);
+}
+
/*
* Tunable kmem_stackinfo is set, fill the kernel thread stack with a
* specific pattern.
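
Since thread_splitstack() is a new interface, a short hypothetical usage sketch may help; it is not part of the patch. The function and structure names below are invented, the signature and constraints come from the block comment in the hunk above, and the prototype is assumed to be exposed through <sys/thread.h>.

/*
 * Hypothetical sketch (not part of the patch): run one deep operation on a
 * temporary 64K stack via thread_splitstack().  The names here are invented;
 * the signature and constraints (safe-to-block context, no t_onfault, no
 * thread_exit() on the split stack) come from the comment above, and the
 * prototype is assumed to be available through <sys/thread.h>.
 */
#include <sys/types.h>
#include <sys/thread.h>

typedef struct deep_work {
	void	*dw_input;
	int	dw_result;
} deep_work_t;

static void
deep_recursive_walk(void *arg)
{
	deep_work_t *dw = arg;

	/* ... recursion over dw_input that would overflow the default stack ... */
	dw->dw_result = 0;
}

static int
do_deep_work(void *input)
{
	deep_work_t dw;

	dw.dw_input = input;
	dw.dw_result = -1;

	/* Returns only after deep_recursive_walk() has completed. */
	thread_splitstack(deep_recursive_walk, &dw, 64 * 1024);

	return (dw.dw_result);
}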
diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c
index 67ccc6922f..c840bdf31a 100644
--- a/usr/src/uts/common/disp/thread_intr.c
+++ b/usr/src/uts/common/disp/thread_intr.c
@@ -23,19 +23,10 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
/*
- * FILE NOTICE BEGIN
- *
- * This file should not be modified. If you wish to modify it or have it
- * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com->
- * (without anti-spam dashes)
- *
- * FILE NOTICE END
+ * Copyright 2015, Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cpuvar.h>
#include <sys/stack.h>
#include <vm/seg_kp.h>
@@ -44,6 +35,17 @@
#include <sys/sysmacros.h>
/*
+ * Use a slightly larger stack size for interrupt threads than the default.
+ * This is useful for cases where the networking stack may do an rx and a tx
+ * in the context of a single interrupt; when that is combined with various
+ * promisc hooks that need memory, it can bring us dangerously close to the
+ * edge of the traditional stack sizes. This is only a few pages more than a
+ * traditional stack and given that we don't have that many interrupt threads,
+ * the memory costs end up being more than worthwhile.
+ */
+#define LL_INTR_STKSZ (32 * 1024)
+
+/*
* Create and initialize an interrupt thread.
*/
static void
@@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp)
{
kthread_t *tp;
- tp = thread_create(NULL, 0,
+ tp = thread_create(NULL, LL_INTR_STKSZ,
(void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
/*
@@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp)
}
/*
- * Allocate a given number of interrupt threads for a given CPU.
- * These threads will get freed by cpu_destroy_bound_threads()
- * when CPU gets unconfigured.
+ * Allocate a given number of interrupt threads for a given CPU. These threads
+ * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured.
+ *
+ * Note, high level interrupts are always serviced using cpu_intr_stack and are
+ * not allowed to block. Low level interrupts or soft-interrupts use the
+ * kthread_t's that we create through the calls to thread_create_intr().
*/
void
cpu_intr_alloc(cpu_t *cp, int n)
@@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n)
thread_create_intr(cp);
cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
- KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
- INTR_STACK_SIZE - SA(MINFRAME);
+ KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
+ INTR_STACK_SIZE - SA(MINFRAME);
}
diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c
index bf65c3c42d..5d35b283d7 100644
--- a/usr/src/uts/common/disp/ts.c
+++ b/usr/src/uts/common/disp/ts.c
@@ -21,11 +21,11 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
#include <sys/types.h>
#include <sys/param.h>
@@ -229,7 +229,6 @@ static void ia_set_process_group(pid_t, pid_t, pid_t);
static void ts_change_priority(kthread_t *, tsproc_t *);
-extern pri_t ts_maxkmdpri; /* maximum kernel mode ts priority */
static pri_t ts_maxglobpri; /* maximum global priority used by ts class */
static kmutex_t ts_dptblock; /* protects time sharing dispatch table */
static kmutex_t ts_list_lock[TS_LISTS]; /* protects tsproc lists */
@@ -541,8 +540,8 @@ ts_admin(caddr_t uaddr, cred_t *reqpcredp)
* to specified time-sharing priority.
*/
static int
-ts_enterclass(kthread_t *t, id_t cid, void *parmsp,
- cred_t *reqpcredp, void *bufp)
+ts_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
+ void *bufp)
{
tsparms_t *tsparmsp = (tsparms_t *)parmsp;
tsproc_t *tspp;
@@ -703,7 +702,7 @@ ts_fork(kthread_t *t, kthread_t *ct, void *bufp)
TS_NEWUMDPRI(ctspp);
ctspp->ts_nice = ptspp->ts_nice;
ctspp->ts_dispwait = 0;
- ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE);
+ ctspp->ts_flags = ptspp->ts_flags & ~(TSBACKQ | TSRESTORE);
ctspp->ts_tp = ct;
cpucaps_sc_init(&ctspp->ts_caps);
thread_unlock(t);
@@ -754,7 +753,6 @@ ts_forkret(kthread_t *t, kthread_t *ct)
tspp->ts_dispwait = 0;
t->t_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
THREAD_TRANSITION(t);
ts_setrun(t);
thread_unlock(t);
@@ -1217,11 +1215,6 @@ ts_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
TS_NEWUMDPRI(tspp);
tspp->ts_nice = nice;
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- return (0);
- }
-
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);
thread_unlock(tx);
@@ -1237,7 +1230,7 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
proc_t *p;
pid_t pid, pgid, sid;
pid_t on, off;
- struct stdata *stp;
+ struct stdata *stp;
int sess_held;
/*
@@ -1373,33 +1366,20 @@ static void
ts_preempt(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
- klwp_t *lwp = curthread->t_lwp;
+ klwp_t *lwp = ttolwp(t);
pri_t oldpri = t->t_pri;
ASSERT(t == curthread);
ASSERT(THREAD_LOCK_HELD(curthread));
/*
- * If preempted in the kernel, make sure the thread has
- * a kernel priority if needed.
- */
- if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) {
- tspp->ts_flags |= TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- t->t_trapret = 1; /* so ts_trapret will run */
- aston(t);
- }
-
- /*
* This thread may be placed on wait queue by CPU Caps. In this case we
* do not need to do anything until it is removed from the wait queue.
- * Do not enforce CPU caps on threads running at a kernel priority
*/
if (CPUCAPS_ON()) {
(void) cpucaps_charge(t, &tspp->ts_caps,
CPUCAPS_CHARGE_ENFORCE);
- if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t))
+ if (CPUCAPS_ENFORCE(t))
return;
}
@@ -1425,18 +1405,16 @@ ts_preempt(kthread_t *t)
if (t->t_schedctl && schedctl_get_nopreempt(t)) {
if (tspp->ts_timeleft > -SC_MAX_TICKS) {
DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
- if (!(tspp->ts_flags & TSKPRI)) {
- /*
- * If not already remembered, remember current
- * priority for restoration in ts_yield().
- */
- if (!(tspp->ts_flags & TSRESTORE)) {
- tspp->ts_scpri = t->t_pri;
- tspp->ts_flags |= TSRESTORE;
- }
- THREAD_CHANGE_PRI(t, ts_maxumdpri);
- t->t_schedflag |= TS_DONT_SWAP;
+ /*
+ * If not already remembered, remember current
+ * priority for restoration in ts_yield().
+ */
+ if (!(tspp->ts_flags & TSRESTORE)) {
+ tspp->ts_scpri = t->t_pri;
+ tspp->ts_flags |= TSRESTORE;
}
+ THREAD_CHANGE_PRI(t, ts_maxumdpri);
+ t->t_schedflag |= TS_DONT_SWAP;
schedctl_set_yield(t, 1);
setfrontdq(t);
goto done;
@@ -1456,14 +1434,11 @@ ts_preempt(kthread_t *t)
}
}
- if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == TSBACKQ) {
+ if ((tspp->ts_flags & TSBACKQ) != 0) {
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
tspp->ts_dispwait = 0;
tspp->ts_flags &= ~TSBACKQ;
setbackdq(t);
- } else if ((tspp->ts_flags & (TSBACKQ|TSKPRI)) == (TSBACKQ|TSKPRI)) {
- tspp->ts_flags &= ~TSBACKQ;
- setbackdq(t);
} else {
setfrontdq(t);
}
@@ -1485,11 +1460,8 @@ ts_setrun(kthread_t *t)
TS_NEWUMDPRI(tspp);
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
tspp->ts_dispwait = 0;
- if ((tspp->ts_flags & TSKPRI) == 0) {
- THREAD_CHANGE_PRI(t,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- }
+ THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
+ ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
}
tspp->ts_flags &= ~TSBACKQ;
@@ -1509,14 +1481,12 @@ ts_setrun(kthread_t *t)
/*
- * Prepare thread for sleep. We reset the thread priority so it will
- * run at the kernel priority level when it wakes up.
+ * Prepare thread for sleep.
*/
static void
ts_sleep(kthread_t *t)
{
tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
- int flags;
pri_t old_pri = t->t_pri;
ASSERT(t == curthread);
@@ -1527,18 +1497,7 @@ ts_sleep(kthread_t *t)
*/
(void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE);
- flags = tspp->ts_flags;
- if (t->t_kpri_req) {
- tspp->ts_flags = flags | TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- t->t_trapret = 1; /* so ts_trapret will run */
- aston(t);
- } else if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
- /*
- * If thread has blocked in the kernel (as opposed to
- * being merely preempted), recompute the user mode priority.
- */
+ if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
TS_NEWUMDPRI(tspp);
tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
@@ -1548,16 +1507,6 @@ ts_sleep(kthread_t *t)
ts_dptbl[tspp->ts_umdpri].ts_globpri);
ASSERT(curthread->t_pri >= 0 &&
curthread->t_pri <= ts_maxglobpri);
- tspp->ts_flags = flags & ~TSKPRI;
-
- if (DISP_MUST_SURRENDER(curthread))
- cpu_surrender(curthread);
- } else if (flags & TSKPRI) {
- THREAD_CHANGE_PRI(curthread,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(curthread->t_pri >= 0 &&
- curthread->t_pri <= ts_maxglobpri);
- tspp->ts_flags = flags & ~TSKPRI;
if (DISP_MUST_SURRENDER(curthread))
cpu_surrender(curthread);
@@ -1594,9 +1543,9 @@ ts_swapin(kthread_t *t, int flags)
time_t swapout_time;
swapout_time = (ddi_get_lbolt() - t->t_stime) / hz;
- if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)))
+ if (INHERITED(t) || (tspp->ts_flags & TSIASET)) {
epri = (long)DISP_PRIO(t) + swapout_time;
- else {
+ } else {
/*
* Threads which have been out for a long time,
* have high user mode priority and are associated
@@ -1648,7 +1597,7 @@ ts_swapout(kthread_t *t, int flags)
ASSERT(THREAD_LOCK_HELD(t));
- if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) ||
+ if (INHERITED(t) || (tspp->ts_flags & TSIASET) ||
(t->t_proc_flag & TP_LWPEXIT) ||
(t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
TS_ONPROC | TS_WAIT)) ||
@@ -1717,62 +1666,59 @@ ts_tick(kthread_t *t)
*/
if (CPUCAPS_ON()) {
call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps,
- CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI);
+ CPUCAPS_CHARGE_ENFORCE);
}
- if ((tspp->ts_flags & TSKPRI) == 0) {
- if (--tspp->ts_timeleft <= 0) {
- pri_t new_pri;
+ if (--tspp->ts_timeleft <= 0) {
+ pri_t new_pri;
- /*
- * If we're doing preemption control and trying to
- * avoid preempting this thread, just note that
- * the thread should yield soon and let it keep
- * running (unless it's been a while).
- */
- if (t->t_schedctl && schedctl_get_nopreempt(t)) {
- if (tspp->ts_timeleft > -SC_MAX_TICKS) {
- DTRACE_SCHED1(schedctl__nopreempt,
- kthread_t *, t);
- schedctl_set_yield(t, 1);
- thread_unlock_nopreempt(t);
- return;
- }
-
- TNF_PROBE_2(schedctl_failsafe,
- "schedctl TS ts_tick", /* CSTYLED */,
- tnf_pid, pid, ttoproc(t)->p_pid,
- tnf_lwpid, lwpid, t->t_tid);
- }
- tspp->ts_flags &= ~TSRESTORE;
- tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp;
- TS_NEWUMDPRI(tspp);
- tspp->ts_dispwait = 0;
- new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
- ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri);
- /*
- * When the priority of a thread is changed,
- * it may be necessary to adjust its position
- * on a sleep queue or dispatch queue.
- * The function thread_change_pri accomplishes
- * this.
- */
- if (thread_change_pri(t, new_pri, 0)) {
- if ((t->t_schedflag & TS_LOAD) &&
- (lwp = t->t_lwp) &&
- lwp->lwp_state == LWP_USER)
- t->t_schedflag &= ~TS_DONT_SWAP;
- tspp->ts_timeleft =
- ts_dptbl[tspp->ts_cpupri].ts_quantum;
- } else {
- call_cpu_surrender = B_TRUE;
+ /*
+ * If we're doing preemption control and trying to avoid
+ * preempting this thread, just note that the thread should
+ * yield soon and let it keep running (unless it's been a
+ * while).
+ */
+ if (t->t_schedctl && schedctl_get_nopreempt(t)) {
+ if (tspp->ts_timeleft > -SC_MAX_TICKS) {
+ DTRACE_SCHED1(schedctl__nopreempt,
+ kthread_t *, t);
+ schedctl_set_yield(t, 1);
+ thread_unlock_nopreempt(t);
+ return;
}
- TRACE_2(TR_FAC_DISP, TR_TICK,
- "tick:tid %p old pri %d", t, oldpri);
- } else if (t->t_state == TS_ONPROC &&
- t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+
+ TNF_PROBE_2(schedctl_failsafe,
+ "schedctl TS ts_tick", /* CSTYLED */,
+ tnf_pid, pid, ttoproc(t)->p_pid,
+ tnf_lwpid, lwpid, t->t_tid);
+ }
+ tspp->ts_flags &= ~TSRESTORE;
+ tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_tqexp;
+ TS_NEWUMDPRI(tspp);
+ tspp->ts_dispwait = 0;
+ new_pri = ts_dptbl[tspp->ts_umdpri].ts_globpri;
+ ASSERT(new_pri >= 0 && new_pri <= ts_maxglobpri);
+ /*
+ * When the priority of a thread is changed, it may be
+ * necessary to adjust its position on a sleep queue or
+ * dispatch queue. The function thread_change_pri accomplishes
+ * this.
+ */
+ if (thread_change_pri(t, new_pri, 0)) {
+ if ((t->t_schedflag & TS_LOAD) &&
+ (lwp = t->t_lwp) &&
+ lwp->lwp_state == LWP_USER)
+ t->t_schedflag &= ~TS_DONT_SWAP;
+ tspp->ts_timeleft =
+ ts_dptbl[tspp->ts_cpupri].ts_quantum;
+ } else {
call_cpu_surrender = B_TRUE;
}
+ TRACE_2(TR_FAC_DISP, TR_TICK,
+ "tick:tid %p old pri %d", t, oldpri);
+ } else if (t->t_state == TS_ONPROC &&
+ t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+ call_cpu_surrender = B_TRUE;
}
if (call_cpu_surrender) {
@@ -1785,11 +1731,8 @@ ts_tick(kthread_t *t)
/*
- * If thread is currently at a kernel mode priority (has slept)
- * we assign it the appropriate user mode priority and time quantum
- * here. If we are lowering the thread's priority below that of
- * other runnable threads we will normally set runrun via cpu_surrender() to
- * cause preemption.
+ * If we are lowering the thread's priority below that of other runnable
+ * threads we will normally set runrun via cpu_surrender() to cause preemption.
*/
static void
ts_trapret(kthread_t *t)
@@ -1803,7 +1746,6 @@ ts_trapret(kthread_t *t)
ASSERT(cp->cpu_dispthread == t);
ASSERT(t->t_state == TS_ONPROC);
- t->t_kpri_req = 0;
if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
TS_NEWUMDPRI(tspp);
@@ -1817,27 +1759,14 @@ ts_trapret(kthread_t *t)
THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
cp->cpu_dispatch_pri = DISP_PRIO(t);
ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
-
- if (DISP_MUST_SURRENDER(t))
- cpu_surrender(t);
- } else if (tspp->ts_flags & TSKPRI) {
- /*
- * If thread has blocked in the kernel (as opposed to
- * being merely preempted), recompute the user mode priority.
- */
- THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
- cp->cpu_dispatch_pri = DISP_PRIO(t);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- tspp->ts_flags &= ~TSKPRI;
if (DISP_MUST_SURRENDER(t))
cpu_surrender(t);
}
/*
- * Swapout lwp if the swapper is waiting for this thread to
- * reach a safe point.
+ * Swapout lwp if the swapper is waiting for this thread to reach a
+ * safe point.
*/
if ((t->t_schedflag & TS_SWAPENQ) && !(tspp->ts_flags & TSIASET)) {
thread_unlock(t);
@@ -1931,8 +1860,6 @@ ts_update_list(int i)
tx->t_clfuncs != &ia_classfuncs.thread)
goto next;
tspp->ts_dispwait++;
- if ((tspp->ts_flags & TSKPRI) != 0)
- goto next;
if (tspp->ts_dispwait <= ts_dptbl[tspp->ts_umdpri].ts_maxwait)
goto next;
if (tx->t_schedctl && schedctl_get_nopreempt(tx))
@@ -1968,12 +1895,7 @@ next:
}
/*
- * Processes waking up go to the back of their queue. We don't
- * need to assign a time quantum here because thread is still
- * at a kernel mode priority and the time slicing is not done
- * for threads running in the kernel after sleeping. The proper
- * time quantum will be assigned by ts_trapret before the thread
- * returns to user mode.
+ * Processes waking up go to the back of their queue.
*/
static void
ts_wakeup(kthread_t *t)
@@ -1984,46 +1906,27 @@ ts_wakeup(kthread_t *t)
t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */
- if (tspp->ts_flags & TSKPRI) {
- tspp->ts_flags &= ~TSBACKQ;
+ if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
+ tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
+ TS_NEWUMDPRI(tspp);
+ tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
+ tspp->ts_dispwait = 0;
+ THREAD_CHANGE_PRI(t, ts_dptbl[tspp->ts_umdpri].ts_globpri);
+ ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
+ }
+
+ tspp->ts_flags &= ~TSBACKQ;
+
+ if (tspp->ts_flags & TSIA) {
if (tspp->ts_flags & TSIASET)
setfrontdq(t);
else
setbackdq(t);
- } else if (t->t_kpri_req) {
- /*
- * Give thread a priority boost if we were asked.
- */
- tspp->ts_flags |= TSKPRI;
- THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
- setbackdq(t);
- t->t_trapret = 1; /* so that ts_trapret will run */
- aston(t);
} else {
- if (tspp->ts_dispwait > ts_dptbl[tspp->ts_umdpri].ts_maxwait) {
- tspp->ts_cpupri = ts_dptbl[tspp->ts_cpupri].ts_slpret;
- TS_NEWUMDPRI(tspp);
- tspp->ts_timeleft =
- ts_dptbl[tspp->ts_cpupri].ts_quantum;
- tspp->ts_dispwait = 0;
- THREAD_CHANGE_PRI(t,
- ts_dptbl[tspp->ts_umdpri].ts_globpri);
- ASSERT(t->t_pri >= 0 && t->t_pri <= ts_maxglobpri);
- }
-
- tspp->ts_flags &= ~TSBACKQ;
-
- if (tspp->ts_flags & TSIA) {
- if (tspp->ts_flags & TSIASET)
- setfrontdq(t);
- else
- setbackdq(t);
- } else {
- if (t->t_disp_time != ddi_get_lbolt())
- setbackdq(t);
- else
- setfrontdq(t);
- }
+ if (t->t_disp_time != ddi_get_lbolt())
+ setbackdq(t);
+ else
+ setfrontdq(t);
}
}
@@ -2179,7 +2082,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
* and background processes as non-interactive iff the session
* leader is interactive. This routine is called from two places:
* strioctl:SPGRP when a new process group gets
- * control of the tty.
+ * control of the tty.
* ia_parmsset-when the process in question is a session leader.
* ia_set_process_group assumes that pidlock is held by the caller,
* either strioctl or priocntlsys. If the caller is priocntlsys
@@ -2189,7 +2092,7 @@ ts_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
static void
ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid)
{
- proc_t *leader, *fg, *bg;
+ proc_t *leader, *fg, *bg;
tsproc_t *tspp;
kthread_t *tx;
int plocked = 0;
@@ -2291,10 +2194,6 @@ ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid)
tspp->ts_flags |= TSIASET;
tspp->ts_boost = ia_boost;
TS_NEWUMDPRI(tspp);
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- continue;
- }
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);
thread_unlock(tx);
@@ -2344,10 +2243,6 @@ skip:
tspp->ts_flags &= ~TSIASET;
tspp->ts_boost = -ia_boost;
TS_NEWUMDPRI(tspp);
- if ((tspp->ts_flags & TSKPRI) != 0) {
- thread_unlock(tx);
- continue;
- }
tspp->ts_dispwait = 0;
ts_change_priority(tx, tspp);