Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--  usr/src/uts/common/disp/cpupart.c    14
-rw-r--r--  usr/src/uts/common/disp/disp.c      204
-rw-r--r--  usr/src/uts/common/disp/thread.c      7
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c     27
-rw-r--r--  usr/src/uts/common/io/vnd/vnd.c       9
-rw-r--r--  usr/src/uts/common/os/cpu.c          21
-rw-r--r--  usr/src/uts/common/os/lgrp.c          4
-rw-r--r--  usr/src/uts/common/sys/cpuvar.h      14
-rw-r--r--  usr/src/uts/common/sys/disp.h        13
-rw-r--r--  usr/src/uts/common/sys/thread.h       3
10 files changed, 195 insertions, 121 deletions
diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c
index c260329c61..4ddc568187 100644
--- a/usr/src/uts/common/disp/cpupart.c
+++ b/usr/src/uts/common/disp/cpupart.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -324,7 +326,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
kthread_t *t;
int move_threads = 1;
lgrp_id_t lgrpid;
- proc_t *p;
+ proc_t *p;
int lgrp_diff_lpl;
lpl_t *cpu_lpl;
int ret;
@@ -569,8 +571,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_forw;
} while (t != p->p_tlist);
@@ -622,8 +624,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_next;
@@ -883,7 +885,7 @@ cpupart_create(psetid_t *psid)
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
- void *projbuf, *zonebuf;
+ void *projbuf, *zonebuf;
kthread_t *t;
proc_t *p;
int err = 0;
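
[Editor's note] Every cpupart.c hunk above is the same mechanical change: disp_lowpri_cpu() now takes the thread itself instead of its lpl plus an optional "current CPU" hint. A minimal before/after sketch of the call shape (the surrounding thread-walking loop is elided):

	/*
	 * Before: the caller extracted the thread's home lgroup (t_lpl) and
	 * passed NULL to say "don't ignore any CPU's dispatch priority".
	 */
	t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl, t->t_pri, NULL);

	/*
	 * After: the thread pointer is passed directly; disp_lowpri_cpu()
	 * reads t_lpl itself, detects the curthread case internally, and
	 * the new HT-awareness (ht_should_run()/ht_adjust_cpu_score()) can
	 * see the whole thread rather than just its lgroup and priority.
	 */
	t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
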
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 5f9c2c68a2..4898a18bf2 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
+#include <sys/ht.h>
#include <vm/as.h>
@@ -1135,15 +1136,13 @@ swtch_to(kthread_t *next)
*/
}
-#define CPU_IDLING(pri) ((pri) == -1)
-
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
int call_poke_cpu = 0;
pri_t cpupri = cp->cpu_dispatch_pri;
- if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
+ if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
"CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
@@ -1239,17 +1238,17 @@ setbackdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run where
* it last ran...but will consider migration if:
- * - We thread probably doesn't have much cache warmth.
+ * - The thread probably doesn't have much cache warmth.
+ * - HT exclusion would prefer us to run elsewhere
* - The CPU where it last ran is the target of an offline
* request.
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
*/
if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
- (tp->t_cpu == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
- } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- self ? tp->t_cpu : NULL);
+ !ht_should_run(tp, tp->t_cpu) ||
+ (tp->t_cpu == cpu_inmotion) ||
+ !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
} else {
cp = tp->t_cpu;
}
@@ -1278,7 +1277,8 @@ setbackdq(kthread_t *tp)
newcp = cp->cpu_next_part;
}
- if (RUNQ_LEN(newcp, tpri) < qlen) {
+ if (ht_should_run(tp, newcp) &&
+ RUNQ_LEN(newcp, tpri) < qlen) {
DTRACE_PROBE3(runq__balance,
kthread_t *, tp,
cpu_t *, cp, cpu_t *, newcp);
@@ -1289,8 +1289,8 @@ setbackdq(kthread_t *tp)
/*
* Migrate to a cpu in the new partition.
*/
- cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
+ tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1427,7 +1427,7 @@ setfrontdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run
* where it last ran, but will consider migration if:
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
* - The CPU where it last ran is the target of an
* offline request (a thread_nomigrate() on the in
* motion CPU relies on this when forcing a preempt).
@@ -1435,21 +1435,18 @@ setfrontdq(kthread_t *tp)
* it last ran, and it is considered not likely to
* have significant cache warmth.
*/
- if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
- (cp == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- (tp == curthread) ? cp : NULL);
- } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
- (!THREAD_HAS_CACHE_WARMTH(tp))) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- NULL);
+ if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
+ cp == cpu_inmotion ||
+ (tpri < cp->cpu_disp->disp_maxrunpri &&
+ !THREAD_HAS_CACHE_WARMTH(tp))) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ tp, tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1600,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf)
/* migrate to a cpu in the new partition */
cp = tp->t_cpupart->cp_cpulist;
}
- cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
disp_lock_enter_high(&cp->cpu_disp->disp_lock);
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
@@ -2573,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp)
}
/*
- * disp_lowpri_cpu - find CPU running the lowest priority thread.
- * The hint passed in is used as a starting point so we don't favor
- * CPU 0 or any other CPU. The caller should pass in the most recently
- * used CPU for the thread.
+ * Return a score rating this CPU for running this thread: lower is better.
*
- * The lgroup and priority are used to determine the best CPU to run on
- * in a NUMA machine. The lgroup specifies which CPUs are closest while
- * the thread priority will indicate whether the thread will actually run
- * there. To pick the best CPU, the CPUs inside and outside of the given
- * lgroup which are running the lowest priority threads are found. The
- * remote CPU is chosen only if the thread will not run locally on a CPU
- * within the lgroup, but will run on the remote CPU. If the thread
- * cannot immediately run on any CPU, the best local CPU will be chosen.
+ * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
+ * curcpu (as that's our own priority).
*
- * The lpl specified also identifies the cpu partition from which
- * disp_lowpri_cpu should select a CPU.
+ * If a cpu is the target of an offline request, then try to avoid it.
*
- * curcpu is used to indicate that disp_lowpri_cpu is being called on
- * behalf of the current thread. (curthread is looking for a new cpu)
- * In this case, cpu_dispatch_pri for this thread's cpu should be
- * ignored.
+ * Otherwise we'll use double the effective dispatcher priority for the CPU.
*
- * If a cpu is the target of an offline request then try to avoid it.
+ * We do this so ht_adjust_cpu_score() can increment the score if needed,
+ * without ending up over-riding a dispatcher priority.
+ */
+static pri_t
+cpu_score(cpu_t *cp, kthread_t *tp)
+{
+ pri_t score;
+
+ if (tp == curthread && cp == curthread->t_cpu)
+ score = 2 * CPU_IDLE_PRI;
+ else if (cp == cpu_inmotion)
+ score = SHRT_MAX;
+ else
+ score = 2 * cp->cpu_dispatch_pri;
+
+ if (2 * cp->cpu_disp->disp_maxrunpri > score)
+ score = 2 * cp->cpu_disp->disp_maxrunpri;
+ if (2 * cp->cpu_chosen_level > score)
+ score = 2 * cp->cpu_chosen_level;
+
+ return (ht_adjust_cpu_score(tp, cp, score));
+}
+
+/*
+ * disp_lowpri_cpu - find a suitable CPU to run the given thread.
+ *
+ * We are looking for a CPU with an effective dispatch priority lower than the
+ * thread's, so that the thread will run immediately rather than be enqueued.
+ * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
+ * If we don't find an available CPU there, we will expand our search to include
+ * wider locality levels. (Note these groups are already divided by CPU
+ * partition.)
+ *
+ * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
+ * the best home CPU we found.
*
- * This function must be called at either high SPL, or with preemption
- * disabled, so that the "hint" CPU cannot be removed from the online
- * CPU list while we are traversing it.
+ * The hint passed in is used as a starting point so we don't favor CPU 0 or any
+ * other CPU. The caller should pass in the most recently used CPU for the
+ * thread; it's of course possible that this CPU isn't in the home lgroup.
+ *
+ * This function must be called at either high SPL, or with preemption disabled,
+ * so that the "hint" CPU cannot be removed from the online CPU list while we
+ * are traversing it.
*/
cpu_t *
-disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
+disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
{
cpu_t *bestcpu;
cpu_t *besthomecpu;
cpu_t *cp, *cpstart;
- pri_t bestpri;
- pri_t cpupri;
-
klgrpset_t done;
- klgrpset_t cur_set;
lpl_t *lpl_iter, *lpl_leaf;
- int i;
- /*
- * Scan for a CPU currently running the lowest priority thread.
- * Cannot get cpu_lock here because it is adaptive.
- * We do not require lock on CPU list.
- */
ASSERT(hint != NULL);
- ASSERT(lpl != NULL);
- ASSERT(lpl->lpl_ncpu > 0);
+ ASSERT(tp->t_lpl->lpl_ncpu > 0);
- /*
- * First examine local CPUs. Note that it's possible the hint CPU
- * passed in in remote to the specified home lgroup. If our priority
- * isn't sufficient enough such that we can run immediately at home,
- * then examine CPUs remote to our home lgroup.
- * We would like to give preference to CPUs closest to "home".
- * If we can't find a CPU where we'll run at a given level
- * of locality, we expand our search to include the next level.
- */
bestcpu = besthomecpu = NULL;
klgrpset_clear(done);
- /* start with lpl we were passed */
- lpl_iter = lpl;
+ lpl_iter = tp->t_lpl;
do {
+ pri_t best = SHRT_MAX;
+ klgrpset_t cur_set;
- bestpri = SHRT_MAX;
klgrpset_clear(cur_set);
- for (i = 0; i < lpl_iter->lpl_nrset; i++) {
+ for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
lpl_leaf = lpl_iter->lpl_rset[i];
if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
continue;
@@ -2659,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
cp = cpstart = lpl_leaf->lpl_cpus;
do {
- if (cp == curcpu)
- cpupri = -1;
- else if (cp == cpu_inmotion)
- cpupri = SHRT_MAX;
- else
- cpupri = cp->cpu_dispatch_pri;
- if (cp->cpu_disp->disp_maxrunpri > cpupri)
- cpupri = cp->cpu_disp->disp_maxrunpri;
- if (cp->cpu_chosen_level > cpupri)
- cpupri = cp->cpu_chosen_level;
- if (cpupri < bestpri) {
- if (CPU_IDLING(cpupri)) {
- ASSERT((cp->cpu_flags &
- CPU_QUIESCED) == 0);
- return (cp);
- }
+ pri_t score = cpu_score(cp, tp);
+
+ if (score < best) {
+ best = score;
bestcpu = cp;
- bestpri = cpupri;
+
+ /* An idle CPU: we're done. */
+ if (score / 2 == CPU_IDLE_PRI)
+ goto out;
}
} while ((cp = cp->cpu_next_lpl) != cpstart);
}
- if (bestcpu && (tpri > bestpri)) {
- ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
- return (bestcpu);
- }
+ if (bestcpu != NULL && tpri > (best / 2))
+ goto out;
+
if (besthomecpu == NULL)
besthomecpu = bestcpu;
+
/*
* Add the lgrps we just considered to the "done" set
*/
@@ -2698,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
- ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
- return (besthomecpu);
+ bestcpu = besthomecpu;
+
+out:
+ ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
+ return (bestcpu);
}
/*
@@ -2719,3 +2715,19 @@ static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
+
+cpu_t *
+disp_choose_best_cpu(void)
+{
+ kthread_t *t = curthread;
+ cpu_t *curcpu = CPU;
+
+ ASSERT(t->t_preempt > 0);
+ ASSERT(t->t_state == TS_ONPROC);
+ ASSERT(t->t_schedflag & TS_VCPU);
+
+ if (ht_should_run(t, curcpu))
+ return (curcpu);
+
+ return (disp_lowpri_cpu(curcpu, t, t->t_pri));
+}
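
[Editor's note] The doubling in cpu_score() is easiest to see with numbers. CPU_IDLE_PRI is -1 (added to sys/disp.h later in this diff), so an idle CPU scores -2 and short-circuits the search; a CPU running a pri-59 thread scores 118. Because real dispatcher priorities only ever occupy the even values, ht_adjust_cpu_score() (from the new sys/ht.h, whose body is not part of this diff) can add a small penalty to a sibling-contended CPU without making it look like a genuinely higher-priority one. The standalone sketch below is illustrative only, and the "+1" penalty is an assumption about ht_adjust_cpu_score():

	#include <stdio.h>

	#define	CPU_IDLE_PRI	(-1)	/* mirrors the new define in sys/disp.h */

	int
	main(void)
	{
		int idle = 2 * CPU_IDLE_PRI;	/* -2: always best, ends the scan */
		int busy = 2 * 59;		/* CPU running a pri-59 thread */
		int contended = busy + 1;	/* assumed +1 HT penalty */

		/* A pri-60 thread still preempts either pri-59 CPU ("tpri > best / 2")... */
		printf("runs on busy: %d\n", 60 > busy / 2);		/* 1 */
		printf("runs on contended: %d\n", 60 > contended / 2);	/* 1 */

		/* ...but between the two, the uncontended CPU scores lower and wins. */
		printf("prefer uncontended: %d\n", busy < contended);	/* 1 */

		/* The idle check in disp_lowpri_cpu() recovers the priority from the score. */
		printf("idle detected: %d\n", idle / 2 == CPU_IDLE_PRI);	/* 1 */
		return (0);
	}
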
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index af000bf4f1..c923ba5d1a 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -74,6 +74,7 @@
#include <sys/waitq.h>
#include <sys/cpucaps.h>
#include <sys/kiconv.h>
+#include <sys/ht.h>
#ifndef STACK_GROWTH_DOWN
#error Stacks do not grow downward; 3b2 zombie attack detected!
@@ -507,8 +508,8 @@ thread_create(
if (CPU->cpu_part == &cp_default)
t->t_cpu = CPU;
else
- t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t,
+ t->t_pri);
t->t_disp_queue = t->t_cpu->cpu_disp;
kpreempt_enable();
@@ -1422,6 +1423,8 @@ thread_unpin()
itp = t->t_intr; /* interrupted thread */
t->t_intr = NULL; /* clear interrupt ptr */
+ ht_end_intr();
+
/*
* Get state from interrupt thread for the one
* it interrupted.
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 03d711838c..2127de2bf0 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,8 +25,8 @@
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +90,7 @@
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
+#include <sys/ht.h>
#include "zfs_namecheck.h"
@@ -1281,6 +1282,8 @@ zvol_strategy(buf_t *bp)
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
!doread && !is_dumpified;
+ ht_begin_unsafe();
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1328,6 +1331,8 @@ zvol_strategy(buf_t *bp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
biodone(bp);
+ ht_end_unsafe();
+
return (0);
}
@@ -1409,6 +1414,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
mutex_enter(&zonep->zone_vfs_lock);
@@ -1469,6 +1476,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
error);
+ ht_end_unsafe();
+
return (error);
}
@@ -1501,6 +1510,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
/*
@@ -1549,6 +1560,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
error);
+ ht_end_unsafe();
+
mutex_enter(&zonep->zone_vfs_lock);
zonep->zone_vfs_rwstats.writes++;
zonep->zone_vfs_rwstats.nwritten += tot_bytes;
@@ -1818,11 +1831,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
mutex_exit(&zfsdev_state_lock);
+
+ ht_begin_unsafe();
+
zil_commit(zv->zv_zilog, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
error = 0;
}
+
+ ht_end_unsafe();
+
return (error);
case DKIOCGETWCE:
@@ -1847,7 +1866,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
} else {
zv->zv_flags &= ~ZVOL_WCE;
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ ht_end_unsafe();
}
return (0);
}
@@ -1900,6 +1921,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
+
rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
@@ -1932,6 +1955,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
+ ht_end_unsafe();
+
return (error);
}
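
[Editor's note] The zvol changes all follow one bracketing pattern: mark the thread "unsafe" across any path that does DMU/ZIL work on behalf of I/O, so HT exclusion keeps guest (VCPU) threads off the sibling for the duration. The sketch below shows the pattern with hypothetical helpers (example_validate()/example_do_io() are not real functions); it assumes ht_begin_unsafe()/ht_end_unsafe() from the new sys/ht.h simply mark curthread unsafe, so every return path between them must call ht_end_unsafe():

	static int
	example_rw(dev_t dev, uio_t *uio, cred_t *cr)
	{
		int error;

		if ((error = example_validate(dev)) != 0)
			return (error);	/* before the bracket: no end call needed */

		ht_begin_unsafe();

		error = example_do_io(dev, uio, cr);	/* may block in the DMU/ZIL */

		ht_end_unsafe();
		return (error);
	}
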
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
index 5a25ed22d5..d03c7ce4ec 100644
--- a/usr/src/uts/common/io/vnd/vnd.c
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -830,6 +830,7 @@
#include <sys/disp.h>
#include <sys/random.h>
#include <sys/gsqueue.h>
+#include <sys/ht.h>
#include <inet/ip.h>
#include <inet/ip6.h>
@@ -3716,6 +3717,12 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
bsize = vsp->vns_bsize;
mutex_exit(&vsp->vns_lock);
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
+
nmps = 0;
mptot = 0;
blocked = B_FALSE;
@@ -3736,6 +3743,8 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
}
}
+ ht_end_unsafe();
+
empty = vnd_dq_is_empty(&vsp->vns_dq_write);
/*
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 2efb68889c..4648dae9dd 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -400,6 +400,9 @@ force_thread_migrate(kthread_id_t tp)
* CPUs prior to a successful return, it should take extra precautions (such as
* their own call to kpreempt_disable) to ensure that safety.
*
+ * CPU_BEST can be used to pick a "best" CPU to migrate to, including
+ * potentially the current CPU.
+ *
* A CPU affinity reference count is maintained by thread_affinity_set and
* thread_affinity_clear (incrementing and decrementing it, respectively),
* maintaining CPU affinity while the count is non-zero, and allowing regions
@@ -416,6 +419,10 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
VERIFY3P(t, ==, curthread);
kpreempt_disable();
cp = CPU;
+ } else if (cpu_id == CPU_BEST) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = disp_choose_best_cpu();
} else {
/*
* We should be asserting that cpu_lock is held here, but
@@ -453,9 +460,8 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
thread_unlock(t);
}
- if (cpu_id == CPU_CURRENT) {
+ if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST)
kpreempt_enable();
- }
}
/*
@@ -1490,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
if (t->t_cpu == cp && t->t_bound_cpu != cp)
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
@@ -1533,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
- if (t->t_cpu == cp && t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
- }
+ if (t->t_cpu == cp && t->t_bound_cpu != cp)
+ t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
+
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
t = t->t_next;
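
[Editor's note] CPU_BEST behaves like CPU_CURRENT in thread_affinity_set(): the caller must be curthread, preemption is disabled internally, and cpu_lock is not required. The difference is that the target comes from disp_choose_best_cpu(), so a VCPU thread can be steered away from a core whose sibling it should not share. A short illustrative caller (the function name and the work inside are hypothetical; the CPU_BEST/thread_affinity_clear() pairing is the existing API):

	void
	example_enter_guest_context(void)
	{
		/* Pin curthread to an HT-friendly CPU; may stay on the current one. */
		thread_affinity_set(curthread, CPU_BEST);

		/* ... work that should remain on this core ... */

		thread_affinity_clear(curthread);
	}
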
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 6288f47bed..6f6aced619 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +91,7 @@
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
+#include <sys/ht.h>
lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
@@ -520,6 +522,8 @@ lgrp_main_mp_init(void)
{
klgrpset_t changed;
+ ht_init();
+
/*
* Update lgroup topology (if necessary)
*/
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 3ee4e70eec..2cfe5116d9 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -23,7 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -540,13 +540,19 @@ extern struct cpu *curcpup(void);
#endif
/*
- * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
- * as the target and to grab cpu_lock instead of requiring the caller
- * to grab it.
+ * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's
+ * current CPU is; holding cpu_lock is not required.
*/
#define CPU_CURRENT -3
/*
+ * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a
+ * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock
+ * is not required.
+ */
+#define CPU_BEST -4
+
+/*
* Per-CPU statistics
*
* cpu_stats_t contains numerous system and VM-related statistics, in the form
diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h
index b324f4d323..cb3711edcd 100644
--- a/usr/src/uts/common/sys/disp.h
+++ b/usr/src/uts/common/sys/disp.h
@@ -23,6 +23,8 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -63,11 +65,11 @@ typedef struct _disp {
/*
* Priorities:
* disp_maxrunpri is the maximum run priority of runnable threads
- * on this queue. It is -1 if nothing is runnable.
+ * on this queue. It is -1 if nothing is runnable.
*
* disp_max_unbound_pri is the maximum run priority of threads on
* this dispatch queue but runnable by any CPU. This may be left
- * artificially high, then corrected when some CPU tries to take
+ * artificially high, then corrected when some CPU tries to take
* an unbound thread. It is -1 if nothing is runnable.
*/
pri_t disp_maxrunpri; /* maximum run priority */
@@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *);
extern void cpu_rechoose(kthread_t *);
extern void cpu_surrender(kthread_t *);
extern void kpreempt(int);
-extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t,
- struct cpu *);
+extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t);
extern int disp_bound_threads(struct cpu *, int);
extern int disp_bound_anythreads(struct cpu *, int);
extern int disp_bound_partition(struct cpu *, int);
@@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *)
extern void disp_swapped_enq(kthread_t *);
extern int disp_anywork(void);
+extern struct cpu *disp_choose_best_cpu(void);
+
#define KPREEMPT_SYNC (-1)
#define kpreempt_disable() \
{ \
@@ -183,6 +186,8 @@ extern int disp_anywork(void);
#endif /* _KERNEL */
+#define CPU_IDLE_PRI (-1)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index af9fcb75cf..678d356564 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -354,6 +354,8 @@ typedef struct _kthread {
kmutex_t t_wait_mutex; /* used in CV wait functions */
char *t_name; /* thread name */
+
+ uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */
} kthread_t;
/*
@@ -417,6 +419,7 @@ typedef struct _kthread {
#define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */
#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */
#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */
+#define TS_VCPU 0x0080 /* thread will enter guest context */
#define TS_CSTART 0x0100 /* setrun() by continuelwps() */
#define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */
#define TS_XSTART 0x0400 /* setrun() by SIGCONT */
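
[Editor's note] The two thread.h additions tie the rest of the patch together: TS_VCPU marks threads that will enter guest context (disp_choose_best_cpu() asserts it), and t_unsafe counts the ht_begin_unsafe()/ht_end_unsafe() sections seen in zvol.c and vnd.c above. The sketch below is only a plausible shape, assuming those functions do nothing more than maintain this per-thread counter; their real bodies live in the new HT code, which is not part of this diff:

	void
	ht_begin_unsafe_sketch(void)
	{
		/* Assumed: HT exclusion must now keep VCPU threads off our sibling. */
		curthread->t_unsafe++;
	}

	void
	ht_end_unsafe_sketch(void)
	{
		ASSERT(curthread->t_unsafe > 0);
		curthread->t_unsafe--;
	}
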