summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Levon <john.levon@joyent.com>2018-08-14 21:14:28 +0000
committerRobert Mustacchi <rm@joyent.com>2018-08-14 23:15:49 +0000
commit89d0fffcadbabb8694d3ce87b5be826e2b789c99 (patch)
treea038f703ae6cfae6d41fe1ed8c17be5687e864da
parent10ad6220c95adc2a5592ea98b1c7ced27d6942ed (diff)
downloadillumos-joyent-release-20180802.tar.gz
OS-7125 Need mitigation of L1TF (CVE-2018-3646)release-20180802
Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r--exception_lists/copyright2
-rw-r--r--exception_lists/keywords3
-rw-r--r--exception_lists/wscheck96
-rw-r--r--usr/src/uts/common/disp/cpupart.c14
-rw-r--r--usr/src/uts/common/disp/disp.c204
-rw-r--r--usr/src/uts/common/disp/thread.c7
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c27
-rw-r--r--usr/src/uts/common/io/vnd/vnd.c9
-rw-r--r--usr/src/uts/common/os/cpu.c21
-rw-r--r--usr/src/uts/common/os/lgrp.c4
-rw-r--r--usr/src/uts/common/sys/cpuvar.h14
-rw-r--r--usr/src/uts/common/sys/disp.h13
-rw-r--r--usr/src/uts/common/sys/thread.h3
-rw-r--r--usr/src/uts/i86pc/Makefile.files3
-rw-r--r--usr/src/uts/i86pc/io/apix/apix_intr.c12
-rw-r--r--usr/src/uts/i86pc/io/viona/viona.c7
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx.c20
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm.c12
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c5
-rw-r--r--usr/src/uts/i86pc/os/cpuid.c67
-rw-r--r--usr/src/uts/i86pc/os/ht.c599
-rw-r--r--usr/src/uts/i86pc/os/intr.c31
-rw-r--r--usr/src/uts/i86pc/sys/Makefile5
-rw-r--r--usr/src/uts/i86pc/sys/ht.h46
-rw-r--r--usr/src/uts/i86pc/sys/machcpuvar.h15
-rw-r--r--usr/src/uts/i86pc/sys/vmm.h3
-rw-r--r--usr/src/uts/i86xpv/Makefile.files7
-rw-r--r--usr/src/uts/intel/ia32/ml/copy.s7
-rw-r--r--usr/src/uts/intel/ia32/ml/swtch.s507
-rw-r--r--usr/src/uts/intel/sys/x86_archext.h21
-rw-r--r--usr/src/uts/intel/vnd/Makefile5
-rw-r--r--usr/src/uts/intel/zfs/Makefile2
-rw-r--r--usr/src/uts/sparc/zfs/Makefile3
-rw-r--r--usr/src/uts/sun4/sys/ht.h37
-rw-r--r--usr/src/uts/sun4u/sys/Makefile7
-rw-r--r--usr/src/uts/sun4v/sys/Makefile8
36 files changed, 1182 insertions, 664 deletions
diff --git a/exception_lists/copyright b/exception_lists/copyright
index 12819c29a1..1fe2c1dd4e 100644
--- a/exception_lists/copyright
+++ b/exception_lists/copyright
@@ -126,6 +126,8 @@ usr/src/common/bzip2/huffman.c
usr/src/common/ficl/*
usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.descrip
usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.tmpl
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
usr/src/grub/grub-0.97/stage2/Makefile.am
usr/src/grub/grub-0.97/stage2/builtins.c
usr/src/grub/grub-0.97/stage2/disk_io.c
diff --git a/exception_lists/keywords b/exception_lists/keywords
index a8859d25d1..5ff9cc8ab0 100644
--- a/exception_lists/keywords
+++ b/exception_lists/keywords
@@ -22,6 +22,7 @@
# Copyright 2017 Nexenta Systems, Inc.
# Copyright (c) 2013 by Delphix. All rights reserved.
# Copyright 2016 Toomas Soome <tsoome@me.com>
+# Copyright 2018 Joyent, Inc.
#
syntax: glob
@@ -36,6 +37,8 @@ usr/src/data/locale/data/zh_SG.UTF-8.src
usr/src/data/locale/data/zh_TW.UTF-8.src
usr/src/data/terminfo/termcap.src
usr/src/data/terminfo/terminfo.src
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
usr/src/test/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2
usr/src/test/zfs-tests/tests/functional/delegate/delegate_common.kshlib
usr/src/test/test-runner/cmd/run
diff --git a/exception_lists/wscheck b/exception_lists/wscheck
new file mode 100644
index 0000000000..0d06b13802
--- /dev/null
+++ b/exception_lists/wscheck
@@ -0,0 +1,96 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2018 Joyent, Inc.
+#
+syntax: glob
+
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
+
+# bhyve sources
+usr/src/cmd/bhyve/acpi.[ch]
+usr/src/cmd/bhyve/ahci.h
+usr/src/cmd/bhyve/atkbdc.[ch]
+usr/src/cmd/bhyve/bhyvegc.[ch]
+usr/src/cmd/bhyve/bhyverun.[ch]
+usr/src/cmd/bhyve/block_if.[ch]
+usr/src/cmd/bhyve/bootrom.[ch]
+usr/src/cmd/bhyve/console.[ch]
+usr/src/cmd/bhyve/consport.c
+usr/src/cmd/bhyve/dbgport.[ch]
+usr/src/cmd/bhyve/fwctl.[ch]
+usr/src/cmd/bhyve/gdb.[ch]
+usr/src/cmd/bhyve/inout.[ch]
+usr/src/cmd/bhyve/ioapic.[ch]
+usr/src/cmd/bhyve/mem.[ch]
+usr/src/cmd/bhyve/mevent.[ch]
+usr/src/cmd/bhyve/mevent_test.c
+usr/src/cmd/bhyve/mptbl.[ch]
+usr/src/cmd/bhyve/pci_ahci.c
+usr/src/cmd/bhyve/pci_e82545.c
+usr/src/cmd/bhyve/pci_emul.[ch]
+usr/src/cmd/bhyve/pci_fbuf.c
+usr/src/cmd/bhyve/pci_hostbridge.c
+usr/src/cmd/bhyve/pci_irq.[ch]
+usr/src/cmd/bhyve/pci_lpc.[ch]
+usr/src/cmd/bhyve/pci_passthru.c
+usr/src/cmd/bhyve/pci_uart.c
+usr/src/cmd/bhyve/pci_virtio_block.c
+usr/src/cmd/bhyve/pci_virtio_console.c
+usr/src/cmd/bhyve/pci_virtio_net.c
+usr/src/cmd/bhyve/pci_virtio_rnd.c
+usr/src/cmd/bhyve/pci_xhci.[ch]
+usr/src/cmd/bhyve/pm.c
+usr/src/cmd/bhyve/pmtmr.c
+usr/src/cmd/bhyve/post.c
+usr/src/cmd/bhyve/ps2kbd.[ch]
+usr/src/cmd/bhyve/ps2mouse.[ch]
+usr/src/cmd/bhyve/rfb.[ch]
+usr/src/cmd/bhyve/rtc.[ch]
+usr/src/cmd/bhyve/smbiostbl.[ch]
+usr/src/cmd/bhyve/sockstream.[ch]
+usr/src/cmd/bhyve/spinup_ap.[ch]
+usr/src/cmd/bhyve/task_switch.c
+usr/src/cmd/bhyve/uart_emul.[ch]
+usr/src/cmd/bhyve/usb_emul.[ch]
+usr/src/cmd/bhyve/usb_mouse.c
+usr/src/cmd/bhyve/vga.[ch]
+usr/src/cmd/bhyve/virtio.[ch]
+usr/src/cmd/bhyve/xmsr.[ch]
+usr/src/cmd/bhyveconsole/bhyveconsole.c
+usr/src/cmd/bhyvectl/bhyvectl.c
+usr/src/compat/freebsd/*.h
+usr/src/compat/freebsd/*/*.h
+usr/src/compat/freebsd/amd64/machine/*.h
+usr/contrib/freebsd/*/*.h
+usr/contrib/freebsd/*/*/*.h
+usr/contrib/freebsd/lib/libutil/*.c
+usr/src/head/bhyve.h
+usr/src/lib/libvmmapi/common/vmmapi.[ch]
+usr/src/uts/i86pc/io/vmm/amd/*.[ch]
+usr/src/uts/i86pc/io/vmm/intel/*.[chs]
+usr/src/uts/i86pc/io/vmm/io/*.[ch]
+usr/src/uts/i86pc/io/vmm/vmm.c
+usr/src/uts/i86pc/io/vmm/vmm_host.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
+usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_ipi.h
+usr/src/uts/i86pc/io/vmm/vmm_ktr.h
+usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_mem.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
+usr/src/uts/i86pc/io/vmm/vmm_stat.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_util.[ch]
+usr/src/uts/i86pc/io/vmm/vmx_assym.s
+usr/src/uts/i86pc/io/vmm/x86.[ch]
+usr/src/uts/i86pc/sys/vmm.h
+usr/src/uts/i86pc/sys/vmm_dev.h
+usr/src/uts/i86pc/sys/vmm_instruction_emul.h
diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c
index c260329c61..4ddc568187 100644
--- a/usr/src/uts/common/disp/cpupart.c
+++ b/usr/src/uts/common/disp/cpupart.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -324,7 +326,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
kthread_t *t;
int move_threads = 1;
lgrp_id_t lgrpid;
- proc_t *p;
+ proc_t *p;
int lgrp_diff_lpl;
lpl_t *cpu_lpl;
int ret;
@@ -569,8 +571,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_forw;
} while (t != p->p_tlist);
@@ -622,8 +624,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_next;
@@ -883,7 +885,7 @@ cpupart_create(psetid_t *psid)
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
- void *projbuf, *zonebuf;
+ void *projbuf, *zonebuf;
kthread_t *t;
proc_t *p;
int err = 0;
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 5f9c2c68a2..4898a18bf2 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
+#include <sys/ht.h>
#include <vm/as.h>
@@ -1135,15 +1136,13 @@ swtch_to(kthread_t *next)
*/
}
-#define CPU_IDLING(pri) ((pri) == -1)
-
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
int call_poke_cpu = 0;
pri_t cpupri = cp->cpu_dispatch_pri;
- if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
+ if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
"CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
@@ -1239,17 +1238,17 @@ setbackdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run where
* it last ran...but will consider migration if:
- * - We thread probably doesn't have much cache warmth.
+ * - The thread probably doesn't have much cache warmth.
+ * - HT exclusion would prefer us to run elsewhere
* - The CPU where it last ran is the target of an offline
* request.
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
*/
if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
- (tp->t_cpu == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
- } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- self ? tp->t_cpu : NULL);
+ !ht_should_run(tp, tp->t_cpu) ||
+ (tp->t_cpu == cpu_inmotion) ||
+ !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
} else {
cp = tp->t_cpu;
}
@@ -1278,7 +1277,8 @@ setbackdq(kthread_t *tp)
newcp = cp->cpu_next_part;
}
- if (RUNQ_LEN(newcp, tpri) < qlen) {
+ if (ht_should_run(tp, newcp) &&
+ RUNQ_LEN(newcp, tpri) < qlen) {
DTRACE_PROBE3(runq__balance,
kthread_t *, tp,
cpu_t *, cp, cpu_t *, newcp);
@@ -1289,8 +1289,8 @@ setbackdq(kthread_t *tp)
/*
* Migrate to a cpu in the new partition.
*/
- cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
+ tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1427,7 +1427,7 @@ setfrontdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run
* where it last ran, but will consider migration if:
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
* - The CPU where it last ran is the target of an
* offline request (a thread_nomigrate() on the in
* motion CPU relies on this when forcing a preempt).
@@ -1435,21 +1435,18 @@ setfrontdq(kthread_t *tp)
* it last ran, and it is considered not likely to
* have significant cache warmth.
*/
- if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
- (cp == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- (tp == curthread) ? cp : NULL);
- } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
- (!THREAD_HAS_CACHE_WARMTH(tp))) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- NULL);
+ if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
+ cp == cpu_inmotion ||
+ (tpri < cp->cpu_disp->disp_maxrunpri &&
+ !THREAD_HAS_CACHE_WARMTH(tp))) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ tp, tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1600,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf)
/* migrate to a cpu in the new partition */
cp = tp->t_cpupart->cp_cpulist;
}
- cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
disp_lock_enter_high(&cp->cpu_disp->disp_lock);
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
@@ -2573,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp)
}
/*
- * disp_lowpri_cpu - find CPU running the lowest priority thread.
- * The hint passed in is used as a starting point so we don't favor
- * CPU 0 or any other CPU. The caller should pass in the most recently
- * used CPU for the thread.
+ * Return a score rating this CPU for running this thread: lower is better.
*
- * The lgroup and priority are used to determine the best CPU to run on
- * in a NUMA machine. The lgroup specifies which CPUs are closest while
- * the thread priority will indicate whether the thread will actually run
- * there. To pick the best CPU, the CPUs inside and outside of the given
- * lgroup which are running the lowest priority threads are found. The
- * remote CPU is chosen only if the thread will not run locally on a CPU
- * within the lgroup, but will run on the remote CPU. If the thread
- * cannot immediately run on any CPU, the best local CPU will be chosen.
+ * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
+ * curcpu (as that's our own priority).
*
- * The lpl specified also identifies the cpu partition from which
- * disp_lowpri_cpu should select a CPU.
+ * If a cpu is the target of an offline request, then try to avoid it.
*
- * curcpu is used to indicate that disp_lowpri_cpu is being called on
- * behalf of the current thread. (curthread is looking for a new cpu)
- * In this case, cpu_dispatch_pri for this thread's cpu should be
- * ignored.
+ * Otherwise we'll use double the effective dispatcher priority for the CPU.
*
- * If a cpu is the target of an offline request then try to avoid it.
+ * We do this so ht_adjust_cpu_score() can increment the score if needed,
+ * without ending up over-riding a dispatcher priority.
+ */
+static pri_t
+cpu_score(cpu_t *cp, kthread_t *tp)
+{
+ pri_t score;
+
+ if (tp == curthread && cp == curthread->t_cpu)
+ score = 2 * CPU_IDLE_PRI;
+ else if (cp == cpu_inmotion)
+ score = SHRT_MAX;
+ else
+ score = 2 * cp->cpu_dispatch_pri;
+
+ if (2 * cp->cpu_disp->disp_maxrunpri > score)
+ score = 2 * cp->cpu_disp->disp_maxrunpri;
+ if (2 * cp->cpu_chosen_level > score)
+ score = 2 * cp->cpu_chosen_level;
+
+ return (ht_adjust_cpu_score(tp, cp, score));
+}
+
+/*
+ * disp_lowpri_cpu - find a suitable CPU to run the given thread.
+ *
+ * We are looking for a CPU with an effective dispatch priority lower than the
+ * thread's, so that the thread will run immediately rather than be enqueued.
+ * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
+ * If we don't find an available CPU there, we will expand our search to include
+ * wider locality levels. (Note these groups are already divided by CPU
+ * partition.)
+ *
+ * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
+ * the best home CPU we found.
*
- * This function must be called at either high SPL, or with preemption
- * disabled, so that the "hint" CPU cannot be removed from the online
- * CPU list while we are traversing it.
+ * The hint passed in is used as a starting point so we don't favor CPU 0 or any
+ * other CPU. The caller should pass in the most recently used CPU for the
+ * thread; it's of course possible that this CPU isn't in the home lgroup.
+ *
+ * This function must be called at either high SPL, or with preemption disabled,
+ * so that the "hint" CPU cannot be removed from the online CPU list while we
+ * are traversing it.
*/
cpu_t *
-disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
+disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
{
cpu_t *bestcpu;
cpu_t *besthomecpu;
cpu_t *cp, *cpstart;
- pri_t bestpri;
- pri_t cpupri;
-
klgrpset_t done;
- klgrpset_t cur_set;
lpl_t *lpl_iter, *lpl_leaf;
- int i;
- /*
- * Scan for a CPU currently running the lowest priority thread.
- * Cannot get cpu_lock here because it is adaptive.
- * We do not require lock on CPU list.
- */
ASSERT(hint != NULL);
- ASSERT(lpl != NULL);
- ASSERT(lpl->lpl_ncpu > 0);
+ ASSERT(tp->t_lpl->lpl_ncpu > 0);
- /*
- * First examine local CPUs. Note that it's possible the hint CPU
- * passed in in remote to the specified home lgroup. If our priority
- * isn't sufficient enough such that we can run immediately at home,
- * then examine CPUs remote to our home lgroup.
- * We would like to give preference to CPUs closest to "home".
- * If we can't find a CPU where we'll run at a given level
- * of locality, we expand our search to include the next level.
- */
bestcpu = besthomecpu = NULL;
klgrpset_clear(done);
- /* start with lpl we were passed */
- lpl_iter = lpl;
+ lpl_iter = tp->t_lpl;
do {
+ pri_t best = SHRT_MAX;
+ klgrpset_t cur_set;
- bestpri = SHRT_MAX;
klgrpset_clear(cur_set);
- for (i = 0; i < lpl_iter->lpl_nrset; i++) {
+ for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
lpl_leaf = lpl_iter->lpl_rset[i];
if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
continue;
@@ -2659,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
cp = cpstart = lpl_leaf->lpl_cpus;
do {
- if (cp == curcpu)
- cpupri = -1;
- else if (cp == cpu_inmotion)
- cpupri = SHRT_MAX;
- else
- cpupri = cp->cpu_dispatch_pri;
- if (cp->cpu_disp->disp_maxrunpri > cpupri)
- cpupri = cp->cpu_disp->disp_maxrunpri;
- if (cp->cpu_chosen_level > cpupri)
- cpupri = cp->cpu_chosen_level;
- if (cpupri < bestpri) {
- if (CPU_IDLING(cpupri)) {
- ASSERT((cp->cpu_flags &
- CPU_QUIESCED) == 0);
- return (cp);
- }
+ pri_t score = cpu_score(cp, tp);
+
+ if (score < best) {
+ best = score;
bestcpu = cp;
- bestpri = cpupri;
+
+ /* An idle CPU: we're done. */
+ if (score / 2 == CPU_IDLE_PRI)
+ goto out;
}
} while ((cp = cp->cpu_next_lpl) != cpstart);
}
- if (bestcpu && (tpri > bestpri)) {
- ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
- return (bestcpu);
- }
+ if (bestcpu != NULL && tpri > (best / 2))
+ goto out;
+
if (besthomecpu == NULL)
besthomecpu = bestcpu;
+
/*
* Add the lgrps we just considered to the "done" set
*/
@@ -2698,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
- ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
- return (besthomecpu);
+ bestcpu = besthomecpu;
+
+out:
+ ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
+ return (bestcpu);
}
/*
@@ -2719,3 +2715,19 @@ static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
+
+cpu_t *
+disp_choose_best_cpu(void)
+{
+ kthread_t *t = curthread;
+ cpu_t *curcpu = CPU;
+
+ ASSERT(t->t_preempt > 0);
+ ASSERT(t->t_state == TS_ONPROC);
+ ASSERT(t->t_schedflag & TS_VCPU);
+
+ if (ht_should_run(t, curcpu))
+ return (curcpu);
+
+ return (disp_lowpri_cpu(curcpu, t, t->t_pri));
+}
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index af000bf4f1..c923ba5d1a 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -74,6 +74,7 @@
#include <sys/waitq.h>
#include <sys/cpucaps.h>
#include <sys/kiconv.h>
+#include <sys/ht.h>
#ifndef STACK_GROWTH_DOWN
#error Stacks do not grow downward; 3b2 zombie attack detected!
@@ -507,8 +508,8 @@ thread_create(
if (CPU->cpu_part == &cp_default)
t->t_cpu = CPU;
else
- t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t,
+ t->t_pri);
t->t_disp_queue = t->t_cpu->cpu_disp;
kpreempt_enable();
@@ -1422,6 +1423,8 @@ thread_unpin()
itp = t->t_intr; /* interrupted thread */
t->t_intr = NULL; /* clear interrupt ptr */
+ ht_end_intr();
+
/*
* Get state from interrupt thread for the one
* it interrupted.
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 03d711838c..2127de2bf0 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,8 +25,8 @@
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +90,7 @@
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
+#include <sys/ht.h>
#include "zfs_namecheck.h"
@@ -1281,6 +1282,8 @@ zvol_strategy(buf_t *bp)
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
!doread && !is_dumpified;
+ ht_begin_unsafe();
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1328,6 +1331,8 @@ zvol_strategy(buf_t *bp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
biodone(bp);
+ ht_end_unsafe();
+
return (0);
}
@@ -1409,6 +1414,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
mutex_enter(&zonep->zone_vfs_lock);
@@ -1469,6 +1476,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
error);
+ ht_end_unsafe();
+
return (error);
}
@@ -1501,6 +1510,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
/*
@@ -1549,6 +1560,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
error);
+ ht_end_unsafe();
+
mutex_enter(&zonep->zone_vfs_lock);
zonep->zone_vfs_rwstats.writes++;
zonep->zone_vfs_rwstats.nwritten += tot_bytes;
@@ -1818,11 +1831,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
mutex_exit(&zfsdev_state_lock);
+
+ ht_begin_unsafe();
+
zil_commit(zv->zv_zilog, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
error = 0;
}
+
+ ht_end_unsafe();
+
return (error);
case DKIOCGETWCE:
@@ -1847,7 +1866,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
} else {
zv->zv_flags &= ~ZVOL_WCE;
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ ht_end_unsafe();
}
return (0);
}
@@ -1900,6 +1921,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
+
rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
@@ -1932,6 +1955,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
+ ht_end_unsafe();
+
return (error);
}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
index 5a25ed22d5..d03c7ce4ec 100644
--- a/usr/src/uts/common/io/vnd/vnd.c
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -830,6 +830,7 @@
#include <sys/disp.h>
#include <sys/random.h>
#include <sys/gsqueue.h>
+#include <sys/ht.h>
#include <inet/ip.h>
#include <inet/ip6.h>
@@ -3716,6 +3717,12 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
bsize = vsp->vns_bsize;
mutex_exit(&vsp->vns_lock);
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
+
nmps = 0;
mptot = 0;
blocked = B_FALSE;
@@ -3736,6 +3743,8 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
}
}
+ ht_end_unsafe();
+
empty = vnd_dq_is_empty(&vsp->vns_dq_write);
/*
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 2efb68889c..4648dae9dd 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -400,6 +400,9 @@ force_thread_migrate(kthread_id_t tp)
* CPUs prior to a successful return, it should take extra precautions (such as
* their own call to kpreempt_disable) to ensure that safety.
*
+ * CPU_BEST can be used to pick a "best" CPU to migrate to, including
+ * potentially the current CPU.
+ *
* A CPU affinity reference count is maintained by thread_affinity_set and
* thread_affinity_clear (incrementing and decrementing it, respectively),
* maintaining CPU affinity while the count is non-zero, and allowing regions
@@ -416,6 +419,10 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
VERIFY3P(t, ==, curthread);
kpreempt_disable();
cp = CPU;
+ } else if (cpu_id == CPU_BEST) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = disp_choose_best_cpu();
} else {
/*
* We should be asserting that cpu_lock is held here, but
@@ -453,9 +460,8 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
thread_unlock(t);
}
- if (cpu_id == CPU_CURRENT) {
+ if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST)
kpreempt_enable();
- }
}
/*
@@ -1490,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
if (t->t_cpu == cp && t->t_bound_cpu != cp)
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
@@ -1533,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
- if (t->t_cpu == cp && t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
- }
+ if (t->t_cpu == cp && t->t_bound_cpu != cp)
+ t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
+
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
t = t->t_next;
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 6288f47bed..6f6aced619 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +91,7 @@
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
+#include <sys/ht.h>
lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
@@ -520,6 +522,8 @@ lgrp_main_mp_init(void)
{
klgrpset_t changed;
+ ht_init();
+
/*
* Update lgroup topology (if necessary)
*/
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 3ee4e70eec..2cfe5116d9 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -23,7 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -540,13 +540,19 @@ extern struct cpu *curcpup(void);
#endif
/*
- * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
- * as the target and to grab cpu_lock instead of requiring the caller
- * to grab it.
+ * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's
+ * current CPU is; holding cpu_lock is not required.
*/
#define CPU_CURRENT -3
/*
+ * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a
+ * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock
+ * is not required.
+ */
+#define CPU_BEST -4
+
+/*
* Per-CPU statistics
*
* cpu_stats_t contains numerous system and VM-related statistics, in the form
diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h
index b324f4d323..cb3711edcd 100644
--- a/usr/src/uts/common/sys/disp.h
+++ b/usr/src/uts/common/sys/disp.h
@@ -23,6 +23,8 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -63,11 +65,11 @@ typedef struct _disp {
/*
* Priorities:
* disp_maxrunpri is the maximum run priority of runnable threads
- * on this queue. It is -1 if nothing is runnable.
+ * on this queue. It is -1 if nothing is runnable.
*
* disp_max_unbound_pri is the maximum run priority of threads on
* this dispatch queue but runnable by any CPU. This may be left
- * artificially high, then corrected when some CPU tries to take
+ * artificially high, then corrected when some CPU tries to take
* an unbound thread. It is -1 if nothing is runnable.
*/
pri_t disp_maxrunpri; /* maximum run priority */
@@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *);
extern void cpu_rechoose(kthread_t *);
extern void cpu_surrender(kthread_t *);
extern void kpreempt(int);
-extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t,
- struct cpu *);
+extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t);
extern int disp_bound_threads(struct cpu *, int);
extern int disp_bound_anythreads(struct cpu *, int);
extern int disp_bound_partition(struct cpu *, int);
@@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *)
extern void disp_swapped_enq(kthread_t *);
extern int disp_anywork(void);
+extern struct cpu *disp_choose_best_cpu(void);
+
#define KPREEMPT_SYNC (-1)
#define kpreempt_disable() \
{ \
@@ -183,6 +186,8 @@ extern int disp_anywork(void);
#endif /* _KERNEL */
+#define CPU_IDLE_PRI (-1)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index af9fcb75cf..678d356564 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -354,6 +354,8 @@ typedef struct _kthread {
kmutex_t t_wait_mutex; /* used in CV wait functions */
char *t_name; /* thread name */
+
+ uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */
} kthread_t;
/*
@@ -417,6 +419,7 @@ typedef struct _kthread {
#define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */
#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */
#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */
+#define TS_VCPU 0x0080 /* thread will enter guest context */
#define TS_CSTART 0x0100 /* setrun() by continuelwps() */
#define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */
#define TS_XSTART 0x0400 /* setrun() by SIGCONT */
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index fcf9820fd8..2a94505acb 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -68,6 +68,7 @@ CORE_OBJS += \
hment.o \
hold_page.o \
hrtimers.o \
+ ht.o \
htable.o \
hypercall.o \
hypersubr.o \
@@ -293,7 +294,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/common/xen
# since only C headers are included when #defined(__lint) is true.
#
-ASSYM_DEPS += \
+ASSYM_DEPS += \
copy.o \
desctbls_asm.o \
ddi_i86_asm.o \
diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c
index 227ce0c991..59d8787839 100644
--- a/usr/src/uts/i86pc/io/apix/apix_intr.c
+++ b/usr/src/uts/i86pc/io/apix/apix_intr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Western Digital Corporation. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/cpuvar.h>
@@ -68,6 +69,7 @@
#include <vm/hat_i86.h>
#include <sys/stack.h>
#include <sys/apix.h>
+#include <sys/ht.h>
static void apix_post_hardint(int);
@@ -280,6 +282,7 @@ apix_do_softint_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil,
it->t_intr = t;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Set bit for this pil in CPU's interrupt active bitmask.
@@ -350,7 +353,9 @@ apix_do_softint_epilog(struct cpu *cpu, uint_t oldpil)
it->t_link = cpu->cpu_intr_thread;
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
@@ -466,6 +471,8 @@ apix_hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil,
}
}
+ ht_begin_intr(pil);
+
/* store starting timestamp in CPu structure for this IPL */
mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;
@@ -556,6 +563,8 @@ apix_hilevel_intr_epilog(struct cpu *cpu, uint_t oldpil)
t->t_intr_start = now;
}
+ ht_end_intr();
+
mcpu->mcpu_pri = oldpil;
if (pil < CBE_HIGH_PIL)
(void) (*setlvlx)(oldpil, 0);
@@ -668,6 +677,7 @@ apix_intr_thread_prolog(struct cpu *cpu, uint_t pil, caddr_t stackptr)
it->t_state = TS_ONPROC;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Initialize thread priority level from intr_pri
@@ -756,7 +766,9 @@ apix_intr_thread_epilog(struct cpu *cpu, uint_t oldpil)
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
index d3a3bdd44f..3c52457a0b 100644
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ b/usr/src/uts/i86pc/io/viona/viona.c
@@ -220,6 +220,7 @@
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <vm/seg_kmem.h>
+#include <sys/ht.h>
#include <sys/pattr.h>
#include <sys/dls.h>
@@ -2414,7 +2415,13 @@ viona_tx(viona_link_t *link, viona_vring_t *ring)
viona_tx_done(ring, len, cookie);
}
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
+ ht_end_unsafe();
return;
drop_fail:
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
index 9ad232a612..e07ee0ea52 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#ifndef __FreeBSD__
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
+#include <sys/ht.h>
#endif
#include <vm/vm.h>
@@ -3052,11 +3053,30 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
break;
}
+#ifndef __FreeBSD__
+ if ((rc = ht_acquire()) != 1) {
+ enable_intr();
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ if (rc == -1) {
+ vmexit->exitcode = VM_EXITCODE_HT;
+ } else {
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ handled = HANDLED;
+ }
+ break;
+ }
+#endif
+
vmx_run_trace(vmx, vcpu);
vmx_dr_enter_guest(vmxctx);
rc = vmx_enter_guest(vmxctx, vmx, launched);
vmx_dr_leave_guest(vmxctx);
+#ifndef __FreeBSD__
+ ht_release();
+#endif
+
/* Collect some information for VM exit processing */
vmexit->rip = rip = vmcs_guest_rip();
vmexit->inst_length = vmexit_instruction_length();
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index bcb6b77cea..164227cc5e 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -1997,7 +1997,6 @@ vmm_freectx(void *arg, int isexec)
#endif /* __FreeBSD */
-
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
@@ -2013,6 +2012,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap_t pmap;
#ifndef __FreeBSD__
vm_thread_ctx_t vtc;
+ int affinity_type = CPU_CURRENT;
#endif
vcpuid = vmrun->cpuid;
@@ -2044,7 +2044,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
restart:
#ifndef __FreeBSD__
- thread_affinity_set(curthread, CPU_CURRENT);
+ thread_affinity_set(curthread, affinity_type);
/*
* Resource localization should happen after the CPU affinity for the
* thread has been set to ensure that access from restricted contexts,
@@ -2054,6 +2054,8 @@ restart:
* This must be done prior to disabling kpreempt via critical_enter().
*/
vm_localize_resources(vm, vcpu);
+
+ affinity_type = CPU_CURRENT;
#endif
critical_enter();
@@ -2145,6 +2147,12 @@ restart:
retu = true;
}
break;
+
+ case VM_EXITCODE_HT: {
+ affinity_type = CPU_BEST;
+ break;
+ }
+
#endif
default:
retu = true; /* handled in userland */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index fff951f82b..3c0d9beec2 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -28,6 +28,7 @@
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
+#include <sys/ht.h>
#include <sys/kernel.h>
@@ -374,6 +375,10 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
break;
}
vmrun.cpuid = vcpu;
+
+ if (!(curthread->t_schedflag & TS_VCPU))
+ ht_mark_as_vcpu();
+
error = vm_run(sc->vmm_vm, &vmrun);
/*
* XXXJOY: I think it's necessary to do copyout, even in the
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index bc3d80189b..3f9132ba4e 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -217,7 +217,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = {
"ibrs_all",
"rsba",
"ssb_no",
- "stibp_all"
+ "stibp_all",
+ "flush_cmd",
+ "l1d_vmentry_no"
};
boolean_t
@@ -986,6 +988,19 @@ cpuid_amd_getids(cpu_t *cpu)
}
static void
+spec_l1d_flush_noop(void)
+{
+}
+
+static void
+spec_l1d_flush_msr(void)
+{
+ wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
+}
+
+void (*spec_l1d_flush)(void) = spec_l1d_flush_noop;
+
+static void
cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
{
struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
@@ -1051,6 +1066,10 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
add_x86_feature(featureset,
X86FSET_RSBA);
}
+ if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
+ add_x86_feature(featureset,
+ X86FSET_L1D_VM_NO);
+ }
if (reg & IA32_ARCH_CAP_SSB_NO) {
add_x86_feature(featureset,
X86FSET_SSB_NO);
@@ -1062,7 +1081,47 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
add_x86_feature(featureset, X86FSET_SSBD);
+
+ if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
+ add_x86_feature(featureset, X86FSET_FLUSH_CMD);
+ }
+
+ if (cpu->cpu_id != 0)
+ return;
+
+ /*
+ * We're the boot CPU, so let's figure out our L1TF status.
+ *
+ * First, if this is a RDCL_NO CPU, then we are not vulnerable: we don't
+ * need to exclude with ht_acquire(), and we don't need to flush.
+ */
+ if (is_x86_feature(featureset, X86FSET_RDCL_NO)) {
+ extern int ht_exclusion;
+ ht_exclusion = 0;
+ spec_l1d_flush = spec_l1d_flush_noop;
+ membar_producer();
+ return;
+ }
+
+ /*
+ * If HT is enabled, we will need HT exclusion, as well as the flush on
+ * VM entry. If HT isn't enabled, we still need at least the flush for
+ * the L1TF sequential case.
+ *
+ * However, if X86FSET_L1D_VM_NO is set, we're most likely running
+ * inside a VM ourselves, and we don't need the flush.
+ *
+ * If we don't have the FLUSH_CMD available at all, we'd better just
+ * hope HT is disabled.
+ */
+ if (is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
+ !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
+ spec_l1d_flush = spec_l1d_flush_msr;
+ } else {
+ spec_l1d_flush = spec_l1d_flush_noop;
}
+
+ membar_producer();
}
/*
@@ -3827,7 +3886,7 @@ cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
eax = cpi->cpi_std[1].cp_eax;
#define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
-#define SH_B3(eax) (eax == 0xf51)
+#define SH_B3(eax) (eax == 0xf51)
#define B(eax) (SH_B0(eax) || SH_B3(eax))
#define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
@@ -4131,9 +4190,9 @@ static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
static const struct cachetab {
- uint8_t ct_code;
+ uint8_t ct_code;
uint8_t ct_assoc;
- uint16_t ct_line_size;
+ uint16_t ct_line_size;
size_t ct_size;
const char *ct_label;
} intel_ctab[] = {
diff --git a/usr/src/uts/i86pc/os/ht.c b/usr/src/uts/i86pc/os/ht.c
new file mode 100644
index 0000000000..f82c51ac08
--- /dev/null
+++ b/usr/src/uts/i86pc/os/ht.c
@@ -0,0 +1,599 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
+ * non-root guest mode, when certain threads are running on the other sibling.
+ * This avoids speculation-based information leaks such as L1TF being available
+ * to the untrusted guest. The stance we take is that threads from the same
+ * zone as the guest VCPU thread are considered safe to run alongside, but all
+ * other threads (except the idle thread), and all interrupts, are unsafe. Note
+ * that due to the implementation here, there are significant sections of e.g.
+ * the dispatcher code that can run concurrently with a guest, until the thread
+ * reaches ht_mark(). This code assumes there are only two HT threads per core.
+ *
+ * The entry points are as follows:
+ *
+ * ht_mark_as_vcpu()
+ *
+ * All threads that enter guest mode (i.e. VCPU threads) need to call this at
+ * least once, which sets TS_VCPU in ->t_schedflag.
+ *
+ * ht_mark()
+ *
+ * A new ->cpu_thread is now curthread (although interrupt threads have their
+ * own separate handling). After preventing any interrupts, we will take our
+ * own CPU's spinlock and update our own state in mcpu_ht.
+ *
+ * If our sibling is poisoned (i.e. in guest mode or the little bit of code
+ * around it), and we're not compatible (that is, same zone ID, or the idle
+ * thread), then we need to ht_kick() that sibling. ht_kick() itself waits for
+ * the sibling to call ht_release(), and it will not re-enter guest mode until
+ * allowed.
+ *
+ * Note that we ignore the fact a process can change its zone ID: poisoning
+ * threads never do so, and we can ignore the other cases.
+ *
+ * ht_acquire()
+ *
+ * We are a VCPU thread about to start guest execution. Interrupts are
+ * disabled. We must have already run ht_mark() to be in this code, so there's
+ * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
+ * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
+ * sibling cpu_ht_t. This is so ht_mark() will only ever need to look at its
+ * local mcpu_ht.
+ *
+ * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly
+ * to wait out any sibling interrupt: many of them will complete quicker than
+ * this.
+ *
+ * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
+ * mitigation against L1TF: no incompatible thread will now be able to populate
+ * the L1 cache until *we* ht_release().
+ *
+ * ht_release()
+ *
+ * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for
+ * this to happen if needed.
+ *
+ * ht_begin_intr()
+ *
+ * In an interrupt prolog. We're either a hilevel interrupt, or a pinning
+ * interrupt. In both cases, we mark our interrupt depth, and potentially
+ * ht_kick(). This enforces exclusion, but doesn't otherwise modify ->ch_state:
+ * we want the dispatcher code to essentially ignore interrupts.
+ *
+ * ht_end_intr()
+ *
+ * In an interrupt epilogue *or* thread_unpin(). In the first case, we never
+ * slept, and we can simply decrement our counter. In the second case, we're an
+ * interrupt thread about to sleep: we'll still just decrement our counter, and
+ * henceforth treat the thread as a normal thread when it next gets scheduled,
+ * until it finally gets to its epilogue.
+ *
+ * ht_mark_unsafe() / ht_mark_safe()
+ *
+ * Mark the current thread as temporarily unsafe (guests should not be executing
+ * while a sibling is marked unsafe). This can be used for a thread that's
+ * otherwise considered safe, if it needs to handle potentially sensitive data.
+ * Right now, this means certain I/O handling operations that reach down into
+ * the networking and ZFS sub-systems.
+ *
+ * ht_should_run(thread, cpu)
+ *
+ * This is used by the dispatcher when making scheduling decisions: if the
+ * sibling is compatible with the given thread, we return B_TRUE. This is
+ * essentially trying to guess if any subsequent ht_acquire() will fail, by
+ * peeking at the sibling CPU's state. The peek is racy, but if we get things
+ * wrong, the "only" consequence is that ht_acquire() may lose.
+ *
+ * ht_adjust_cpu_score()
+ *
+ * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here,
+ * we'll add a small penalty to the score. This also makes sure a VCPU thread
+ * migration behaves properly.
+ */
+
+#include <sys/archsystm.h>
+#include <sys/disp.h>
+#include <sys/cmt.h>
+#include <sys/systm.h>
+#include <sys/cpu.h>
+#include <sys/var.h>
+#include <sys/xc_levels.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/x86_archext.h>
+
+#define CS_SHIFT (8)
+#define CS_MASK ((1 << CS_SHIFT) - 1)
+#define CS_MARK(s) ((s) & CS_MASK)
+#define CS_ZONE(s) ((s) >> CS_SHIFT)
+#define CS_MK(s, z) ((s) | (z << CS_SHIFT))
+
+typedef enum ch_mark {
+ CM_IDLE = 0, /* running CPU idle thread */
+ CM_THREAD, /* running general non-VCPU thread */
+ CM_UNSAFE, /* running ->t_unsafe thread */
+ CM_VCPU, /* running VCPU thread */
+ CM_POISONED /* running in guest */
+} ch_mark_t;
+
+/* Double-check our false-sharing padding. */
+CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64);
+CTASSERT(CM_IDLE == 0);
+CTASSERT(CM_POISONED < (1 << CS_SHIFT));
+CTASSERT(CM_POISONED > CM_VCPU);
+CTASSERT(CM_VCPU > CM_UNSAFE);
+
+/*
+ * If disabled, no HT exclusion is performed, and system is potentially
+ * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not
+ * vulnerable" CPUID bit.
+ */
+int ht_exclusion = 1;
+
+/*
+ * How long ht_acquire() will spin trying to acquire the core, in microseconds.
+ * This is enough time to wait out a significant proportion of interrupts.
+ */
+clock_t ht_acquire_wait_time = 64;
+
+static cpu_t *
+ht_find_sibling(cpu_t *cp)
+{
+ for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
+ pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
+ group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;
+
+ if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
+ continue;
+
+ if (GROUP_SIZE(cg) == 1)
+ break;
+
+ VERIFY3U(GROUP_SIZE(cg), ==, 2);
+
+ if (GROUP_ACCESS(cg, 0) != cp)
+ return (GROUP_ACCESS(cg, 0));
+
+ VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);
+
+ return (GROUP_ACCESS(cg, 1));
+ }
+
+ return (NULL);
+}
+
+/*
+ * Initialize HT links. We have to be careful here not to race with
+ * ht_begin/end_intr(), which also complicates trying to do this initialization
+ * from a cross-call; hence the slightly odd approach below.
+ */
+void
+ht_init(void)
+{
+ cpu_t *scp = CPU;
+ cpu_t *cp = scp;
+ ulong_t flags;
+
+ if (!ht_exclusion)
+ return;
+
+ mutex_enter(&cpu_lock);
+
+ do {
+ thread_affinity_set(curthread, cp->cpu_id);
+ flags = intr_clear();
+
+ cp->cpu_m.mcpu_ht.ch_intr_depth = 0;
+ cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
+ cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID);
+ ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL);
+ cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp);
+
+ intr_restore(flags);
+ thread_affinity_clear(curthread);
+ } while ((cp = cp->cpu_next_onln) != scp);
+
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * If our sibling is also a VCPU thread from a different zone, we need one of
+ * them to give up, otherwise they will just battle each other for exclusion
+ * until they exhaust their quantum.
+ *
+ * We arbitrate between them by dispatch priority: clearly, a higher-priority
+ * thread deserves to win the acquisition. However, under CPU load, it'll be
+ * very common to see both threads with ->t_pri == 1. If so, we'll break the
+ * tie by cpu_id (which is hopefully arbitrary enough).
+ *
+ * If we lose, the VMM code will take this as a hint to call
+ * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
+ * somewhere else.
+ *
+ * Note that all of this state examination is racy, as we don't own any locks
+ * here.
+ */
+static boolean_t
+yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
+{
+ cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht;
+ uint64_t sibstate = sibht->ch_state;
+
+ /*
+ * If we're likely just waiting for an interrupt, don't yield.
+ */
+ if (sibht->ch_intr_depth != 0)
+ return (B_FALSE);
+
+ /*
+ * We're only interested in VCPUs from a different zone.
+ */
+ if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
+ return (B_FALSE);
+
+ if (curthread->t_pri < sib->cpu_dispatch_pri)
+ return (B_TRUE);
+
+ if (curthread->t_pri == sib->cpu_dispatch_pri &&
+ CPU->cpu_id < sib->cpu_id)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static inline boolean_t
+sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid)
+{
+ uint64_t sibstate = sibht->ch_state;
+
+ if (sibht->ch_intr_depth != 0)
+ return (B_FALSE);
+
+ if (CS_MARK(sibstate) == CM_UNSAFE)
+ return (B_FALSE);
+
+ if (CS_MARK(sibstate) == CM_IDLE)
+ return (B_TRUE);
+
+ return (CS_ZONE(sibstate) == zoneid);
+}
+
+int
+ht_acquire(void)
+{
+ clock_t wait = ht_acquire_wait_time;
+ cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
+ zoneid_t zoneid = getzoneid();
+ cpu_ht_t *sibht;
+ int ret = 0;
+
+ ASSERT(!interrupts_enabled());
+
+ if (ht->ch_sib == NULL) {
+ /* For the "sequential" L1TF case. */
+ spec_l1d_flush();
+ return (1);
+ }
+
+ sibht = &ht->ch_sib->cpu_m.mcpu_ht;
+
+ /* A VCPU thread should never change zone. */
+ ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
+ ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU);
+ ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
+ ASSERT3U(curthread->t_preempt, >=, 1);
+ ASSERT(curthread->t_schedflag & TS_VCPU);
+
+ while (ret == 0 && wait > 0) {
+
+ if (yield_to_vcpu(ht->ch_sib, zoneid)) {
+ ret = -1;
+ break;
+ }
+
+ if (sibling_compatible(sibht, zoneid)) {
+ lock_set(&sibht->ch_lock);
+
+ if (sibling_compatible(sibht, zoneid)) {
+ ht->ch_state = CS_MK(CM_POISONED, zoneid);
+ sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid);
+ membar_enter();
+ ret = 1;
+ }
+
+ lock_clear(&sibht->ch_lock);
+ } else {
+ drv_usecwait(10);
+ wait -= 10;
+ }
+ }
+
+ DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state,
+ uint64_t, sibht->ch_intr_depth, clock_t, wait);
+
+ if (ret == 1)
+ spec_l1d_flush();
+
+ return (ret);
+}
+
+void
+ht_release(void)
+{
+ cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
+ zoneid_t zoneid = getzoneid();
+ cpu_ht_t *sibht;
+
+ ASSERT(!interrupts_enabled());
+
+ if (ht->ch_sib == NULL)
+ return;
+
+ ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
+ ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
+ ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED);
+ ASSERT3U(curthread->t_preempt, >=, 1);
+
+ sibht = &ht->ch_sib->cpu_m.mcpu_ht;
+
+ lock_set(&sibht->ch_lock);
+
+ ht->ch_state = CS_MK(CM_VCPU, zoneid);
+ sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid);
+ membar_producer();
+
+ lock_clear(&sibht->ch_lock);
+}
+
+static void
+ht_kick(cpu_ht_t *ht, zoneid_t zoneid)
+{
+ uint64_t sibstate;
+
+ ASSERT(LOCK_HELD(&ht->ch_lock));
+ ASSERT(!interrupts_enabled());
+
+ poke_cpu(ht->ch_sib->cpu_id);
+
+ for (;;) {
+ membar_consumer();
+ sibstate = ht->ch_sibstate;
+
+ if (CS_MARK(sibstate) != CM_POISONED ||
+ CS_ZONE(sibstate) == zoneid)
+ return;
+
+ lock_clear(&ht->ch_lock);
+
+ for (;;) {
+ membar_consumer();
+ sibstate = ht->ch_sibstate;
+
+ if (CS_MARK(sibstate) != CM_POISONED ||
+ CS_ZONE(sibstate) == zoneid) {
+ lock_set(&ht->ch_lock);
+ return;
+ }
+
+ SMT_PAUSE();
+ }
+
+ lock_set(&ht->ch_lock);
+ }
+}
+
+/*
+ * FIXME: do we need a callback in case somebody installs a handler at this PIL
+ * ever?
+ */
+static boolean_t
+pil_needs_kick(uint_t pil)
+{
+ return (pil != XC_CPUPOKE_PIL);
+}
+
+void
+ht_begin_intr(uint_t pil)
+{
+ ulong_t flags;
+ cpu_ht_t *ht;
+
+ flags = intr_clear();
+ ht = &CPU->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) {
+ lock_set(&ht->ch_lock);
+
+ membar_consumer();
+
+ if (CS_MARK(ht->ch_sibstate) == CM_POISONED)
+ ht_kick(ht, GLOBAL_ZONEID);
+
+ lock_clear(&ht->ch_lock);
+ }
+
+ intr_restore(flags);
+}
+
+void
+ht_end_intr(void)
+{
+ ulong_t flags;
+ cpu_ht_t *ht;
+
+ flags = intr_clear();
+ ht = &CPU->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ ASSERT3U(ht->ch_intr_depth, >, 0);
+ atomic_dec_64(&ht->ch_intr_depth);
+
+ intr_restore(flags);
+}
+
+static inline boolean_t
+ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid)
+{
+ membar_consumer();
+
+ if (CS_MARK(ht->ch_sibstate) != CM_POISONED)
+ return (B_FALSE);
+
+ if (CS_MARK(ht->ch_state) == CM_UNSAFE)
+ return (B_TRUE);
+
+ return (CS_ZONE(ht->ch_sibstate) != zoneid);
+}
+
+void
+ht_mark(void)
+{
+ zoneid_t zoneid = getzoneid();
+ kthread_t *t = curthread;
+ ulong_t flags;
+ cpu_ht_t *ht;
+ cpu_t *cp;
+
+ flags = intr_clear();
+
+ cp = CPU;
+ ht = &cp->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ lock_set(&ht->ch_lock);
+
+ /*
+ * If we were a nested interrupt and went through the resume_from_intr()
+ * path, we can now be resuming to a pinning interrupt thread; in which
+ * case, skip marking, until we later resume to a "real" thread.
+ */
+ if (ht->ch_intr_depth > 0) {
+ ASSERT3P(t->t_intr, !=, NULL);
+
+ if (ht_need_kick(ht, zoneid))
+ ht_kick(ht, zoneid);
+ goto out;
+ }
+
+ if (t == t->t_cpu->cpu_idle_thread) {
+ ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
+ ht->ch_state = CS_MK(CM_IDLE, zoneid);
+ } else {
+ uint64_t state = CM_THREAD;
+
+ if (t->t_unsafe)
+ state = CM_UNSAFE;
+ else if (t->t_schedflag & TS_VCPU)
+ state = CM_VCPU;
+
+ ht->ch_state = CS_MK(state, zoneid);
+
+ if (ht_need_kick(ht, zoneid))
+ ht_kick(ht, zoneid);
+ }
+
+out:
+ membar_producer();
+ lock_clear(&ht->ch_lock);
+ intr_restore(flags);
+}
+
+void
+ht_begin_unsafe(void)
+{
+ curthread->t_unsafe++;
+ ht_mark();
+}
+
+void
+ht_end_unsafe(void)
+{
+ ASSERT3U(curthread->t_unsafe, >, 0);
+ curthread->t_unsafe--;
+ ht_mark();
+}
+
+void
+ht_mark_as_vcpu(void)
+{
+ thread_lock(curthread);
+ curthread->t_schedflag |= TS_VCPU;
+ ht_mark();
+ thread_unlock(curthread);
+}
+
+boolean_t
+ht_should_run(kthread_t *t, cpu_t *cp)
+{
+ uint64_t sibstate;
+ cpu_t *sib;
+
+ if (t == t->t_cpu->cpu_idle_thread)
+ return (B_TRUE);
+
+ if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL)
+ return (B_TRUE);
+
+ sibstate = sib->cpu_m.mcpu_ht.ch_state;
+
+ if ((t->t_schedflag & TS_VCPU)) {
+ if (CS_MARK(sibstate) == CM_IDLE)
+ return (B_TRUE);
+ if (CS_MARK(sibstate) == CM_UNSAFE)
+ return (B_FALSE);
+ return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
+ }
+
+ if (CS_MARK(sibstate) < CM_VCPU)
+ return (B_TRUE);
+
+ return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
+}
+
+pri_t
+ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
+{
+ cpu_t *sib;
+
+ if (ht_should_run(t, cp))
+ return (score);
+
+ /*
+ * If we're a VCPU thread scoring our current CPU, we are most likely
+ * asking to be rescheduled elsewhere after losing ht_acquire(). In
+ * this case, the current CPU is not a good choice, most likely, and we
+ * should go elsewhere.
+ */
+ if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
+ return ((v.v_maxsyspri + 1) * 2);
+
+ return (score + 1);
+}
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 29fa78109c..0634df1a94 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -466,25 +466,22 @@
#include <sys/ontrap.h>
#include <sys/x86_archext.h>
#include <sys/promif.h>
+#include <sys/ht.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#endif
-#if defined(__amd64) && !defined(__xpv)
-/* If this fails, then the padding numbers in machcpuvar.h are wrong. */
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad)) <
- MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti)) >=
- MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg)) <
- 2 * MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad2)) <
- 2 * MMU_PAGESIZE);
+/* If these fail, then the padding numbers in machcpuvar.h are wrong. */
+#if !defined(__xpv)
+#define MCOFF(member) \
+ (offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, member))
+CTASSERT(MCOFF(mcpu_pad) == MACHCPU_SIZE);
+CTASSERT(MCOFF(mcpu_pad2) == MMU_PAGESIZE);
+CTASSERT((MCOFF(mcpu_kpti) & 0xF) == 0);
CTASSERT(((sizeof (struct kpti_frame)) & 0xF) == 0);
-CTASSERT(((offsetof(cpu_t, cpu_m) +
- offsetof(struct machcpu, mcpu_kpti_dbg)) & 0xF) == 0);
CTASSERT((offsetof(struct kpti_frame, kf_tr_rsp) & 0xF) == 0);
+CTASSERT(MCOFF(mcpu_pad3) < 2 * MMU_PAGESIZE);
#endif
#if defined(__xpv) && defined(DEBUG)
@@ -600,6 +597,8 @@ hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
}
}
+ ht_begin_intr(pil);
+
/*
* Store starting timestamp in CPU structure for this PIL.
*/
@@ -704,6 +703,8 @@ hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
t->t_intr_start = now;
}
+ ht_end_intr();
+
mcpu->mcpu_pri = oldpil;
(void) (*setlvlx)(oldpil, vecnum);
@@ -766,6 +767,8 @@ intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
it->t_state = TS_ONPROC;
cpu->cpu_thread = it; /* new curthread on this cpu */
+ ht_begin_intr(pil);
+
it->t_pil = (uchar_t)pil;
it->t_pri = intr_pri + (pri_t)pil;
it->t_intr_start = now;
@@ -856,6 +859,7 @@ intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
mcpu->mcpu_pri = pil;
(*setlvlx)(pil, vec);
t->t_intr_start = now;
+ ht_end_intr();
cpu->cpu_thread = t;
}
@@ -1043,6 +1047,7 @@ top:
it->t_intr = t;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Set bit for this pil in CPU's interrupt active bitmask.
@@ -1103,7 +1108,9 @@ dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
it->t_link = cpu->cpu_intr_thread;
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile
index 0e3cbbe243..e6ea573d0b 100644
--- a/usr/src/uts/i86pc/sys/Makefile
+++ b/usr/src/uts/i86pc/sys/Makefile
@@ -21,7 +21,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-# Copyright 2017 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
# uts/i86pc/sys/Makefile
#
@@ -44,8 +44,9 @@ CHKHDRS= \
clock.h \
cram.h \
ddi_subrdefs.h \
- debug_info.h \
+ debug_info.h \
fastboot.h \
+ ht.h \
mach_mmu.h \
machclock.h \
machcpuvar.h \
diff --git a/usr/src/uts/i86pc/sys/ht.h b/usr/src/uts/i86pc/sys/ht.h
new file mode 100644
index 0000000000..6b1bfcdd2b
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/ht.h
@@ -0,0 +1,46 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _SYS_HT_H
+#define _SYS_HT_H
+
+#include <sys/types.h>
+#include <sys/thread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cpu;
+
+extern void ht_init(void);
+
+extern int ht_acquire(void);
+extern void ht_release(void);
+extern void ht_mark(void);
+extern void ht_begin_unsafe(void);
+extern void ht_end_unsafe(void);
+extern void ht_begin_intr(uint_t);
+extern void ht_end_intr(void);
+extern void ht_mark_as_vcpu(void);
+
+extern boolean_t ht_should_run(kthread_t *, struct cpu *);
+extern pri_t ht_adjust_cpu_score(kthread_t *, struct cpu *, pri_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_HT_H */
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index 98873cd26c..3d652316a4 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -140,6 +140,15 @@ struct kpti_frame {
uint64_t kf_upper_redzone;
};
+typedef struct cpu_ht {
+ lock_t ch_lock;
+ char ch_pad[56];
+ struct cpu *ch_sib;
+ volatile uint64_t ch_intr_depth;
+ volatile uint64_t ch_state;
+ volatile uint64_t ch_sibstate;
+} cpu_ht_t;
+
/*
* This first value, MACHCPU_SIZE is the size of all the members in the cpu_t
* AND struct machcpu, before we get to the mcpu_pad and the kpti area.
@@ -147,9 +156,9 @@ struct kpti_frame {
* page-tables, and hence must be page-aligned and page-sized. See
* hat_pcp_setup().
*
- * There is a CTASSERT in os/intr.c that checks these numbers.
+ * There are CTASSERTs in os/intr.c that verify this all works out.
*/
-#define MACHCPU_SIZE (572 + 1584)
+#define MACHCPU_SIZE (1568 + 688)
#define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE)
#define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame))
@@ -227,6 +236,8 @@ struct machcpu {
*/
volatile uint32_t mcpu_istamp;
+ cpu_ht_t mcpu_ht;
+
char mcpu_pad[MACHCPU_PAD];
/* This is the start of the page */
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index bd8126cc0d..c200a5eb33 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -587,6 +587,9 @@ enum vm_exitcode {
VM_EXITCODE_SVM,
VM_EXITCODE_REQIDLE,
VM_EXITCODE_DEBUG,
+#ifndef __FreeBSD__
+ VM_EXITCODE_HT,
+#endif
VM_EXITCODE_MAX
};
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index a576b2f0a8..b4a78cc841 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -61,6 +61,7 @@ CORE_OBJS += \
hment.o \
hold_page.o \
hrtimers.o \
+ ht.o \
htable.o \
i86_mmu.o \
ibft.o \
@@ -110,7 +111,7 @@ CORE_OBJS += $(SMBIOS_OBJS)
#
# These get compiled twice:
-# - once in the dboot (direct boot) identity mapped code
+# - once in the dboot (direct boot) identity mapped code
# - once for use during early startup in unix
#
BOOT_DRIVER_OBJS = \
@@ -161,7 +162,7 @@ SPECIAL_OBJS_64 += \
locore.o \
fast_trap_asm.o \
interrupt.o \
- syscall_asm_amd64.o \
+ syscall_asm_amd64.o \
kpti_trampolines.o
SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS))
@@ -234,7 +235,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/i86pc -I$(SRC)/common \
# since only C headers are included when #defined(__lint) is true.
#
-ASSYM_DEPS += \
+ASSYM_DEPS += \
copy.o \
desctbls_asm.o \
ddi_i86_asm.o \
diff --git a/usr/src/uts/intel/ia32/ml/copy.s b/usr/src/uts/intel/ia32/ml/copy.s
index 95b7cb3028..f76a8a43cb 100644
--- a/usr/src/uts/intel/ia32/ml/copy.s
+++ b/usr/src/uts/intel/ia32/ml/copy.s
@@ -36,7 +36,7 @@
/* All Rights Reserved */
/*
- * Copyright (c) 2017 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#include <sys/errno.h>
@@ -866,8 +866,8 @@ bcopy_patch_start:
bcopy_patch_end:
.p2align 4
- .globl bcopy_ck_size
-bcopy_ck_size:
+ ALTENTRY(bcopy_ck_size)
+
cmpq $BCOPY_DFLT_REP, %rdx
jae L(use_rep)
@@ -956,6 +956,7 @@ L(use_rep):
jnz L(do_remainder)
ret
#undef L
+ SET_SIZE(bcopy_ck_size)
#ifdef DEBUG
/*
diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s
index 6fc38cfbe8..c2c9fd9bd2 100644
--- a/usr/src/uts/intel/ia32/ml/swtch.s
+++ b/usr/src/uts/intel/ia32/ml/swtch.s
@@ -31,14 +31,6 @@
* Process switching routines.
*/
-#if defined(__lint)
-#include <sys/thread.h>
-#include <sys/systm.h>
-#include <sys/time.h>
-#else /* __lint */
-#include "assym.h"
-#endif /* __lint */
-
#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
@@ -47,6 +39,9 @@
#include <sys/segments.h>
#include <sys/psw.h>
+#if !defined(__lint)
+#include "assym.h"
+
/*
* resume(thread_id_t t);
*
@@ -74,16 +69,10 @@
* off the stack.
*/
-#if !defined(__lint)
-
#if LWP_PCB_FPU != 0
#error LWP_PCB_FPU MUST be defined as 0 for code in swtch.s to work
#endif /* LWP_PCB_FPU != 0 */
-#endif /* !__lint */
-
-#if defined(__amd64)
-
/*
* Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
*
@@ -153,88 +142,6 @@
jnz 0b; \
1:
-#elif defined (__i386)
-
-/*
- * Save non-volatile registers (%ebp, %esi, %edi and %ebx)
- *
- * The stack frame must be created before the save of %esp so that tracebacks
- * of swtch()ed-out processes show the process as having last called swtch().
- */
-#define SAVE_REGS(thread_t, retaddr) \
- movl %ebp, T_EBP(thread_t); \
- movl %ebx, T_EBX(thread_t); \
- movl %esi, T_ESI(thread_t); \
- movl %edi, T_EDI(thread_t); \
- pushl %ebp; \
- movl %esp, %ebp; \
- movl %esp, T_SP(thread_t); \
- movl retaddr, T_PC(thread_t); \
- movl 8(%ebp), %edi; \
- pushl %edi; \
- call __dtrace_probe___sched_off__cpu; \
- addl $CLONGSIZE, %esp
-
-/*
- * Restore non-volatile registers (%ebp, %esi, %edi and %ebx)
- *
- * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t
- * already has the effect of putting the stack back the way it was when
- * we came in.
- */
-#define RESTORE_REGS(scratch_reg) \
- movl %gs:CPU_THREAD, scratch_reg; \
- movl T_EBP(scratch_reg), %ebp; \
- movl T_EBX(scratch_reg), %ebx; \
- movl T_ESI(scratch_reg), %esi; \
- movl T_EDI(scratch_reg), %edi
-
-/*
- * Get pointer to a thread's hat structure
- */
-#define GET_THREAD_HATP(hatp, thread_t, scratch_reg) \
- movl T_PROCP(thread_t), hatp; \
- movl P_AS(hatp), scratch_reg; \
- movl A_HAT(scratch_reg), hatp
-
-/*
- * If we are resuming an interrupt thread, store a timestamp in the thread
- * structure. If an interrupt occurs between tsc_read() and its subsequent
- * store, the timestamp will be stale by the time it is stored. We can detect
- * this by doing a compare-and-swap on the thread's timestamp, since any
- * interrupt occurring in this window will put a new timestamp in the thread's
- * t_intr_start field.
- */
-#define STORE_INTR_START(thread_t) \
- testw $T_INTR_THREAD, T_FLAGS(thread_t); \
- jz 1f; \
- pushl %ecx; \
-0: \
- pushl T_INTR_START(thread_t); \
- pushl T_INTR_START+4(thread_t); \
- call tsc_read; \
- movl %eax, %ebx; \
- movl %edx, %ecx; \
- popl %edx; \
- popl %eax; \
- cmpxchg8b T_INTR_START(thread_t); \
- jnz 0b; \
- popl %ecx; \
-1:
-
-#endif /* __amd64 */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
.global kpti_enable
ENTRY(resume)
@@ -436,6 +343,8 @@ resume(kthread_t *t)
call smap_disable
.nosmap:
+ call ht_mark
+
/*
* Restore non-volatile registers, then have spl0 return to the
* resuming thread's PC after first setting the priority as low as
@@ -456,203 +365,6 @@ resume_return:
SET_SIZE(_resume_from_idle)
SET_SIZE(resume)
-#elif defined (__i386)
-
- ENTRY(resume)
- movl %gs:CPU_THREAD, %eax
- movl $resume_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
- LOADCPU(%ebx) /* %ebx = CPU */
- movl CPU_THREAD(%ebx), %esi /* %esi = curthread */
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- /*
- * Call savectx if thread has installed context ops.
- *
- * Note that if we have floating point context, the save op
- * (either fpsave_begin or fpxsave_begin) will issue the
- * async save instruction (fnsave or fxsave respectively)
- * that we fwait for below.
- */
- movl T_CTX(%esi), %eax /* should current thread savectx? */
- testl %eax, %eax
- jz .nosavectx /* skip call when zero */
- pushl %esi /* arg = thread pointer */
- call savectx /* call ctx ops */
- addl $4, %esp /* restore stack pointer */
-.nosavectx:
-
- /*
- * Call savepctx if process has installed context ops.
- */
- movl T_PROCP(%esi), %eax /* %eax = proc */
- cmpl $0, P_PCTX(%eax) /* should current thread savectx? */
- je .nosavepctx /* skip call when zero */
- pushl %eax /* arg = proc pointer */
- call savepctx /* call ctx ops */
- addl $4, %esp
-.nosavepctx:
-
- /*
- * Temporarily switch to the idle thread's stack
- */
- movl CPU_IDLE_THREAD(%ebx), %eax /* idle thread pointer */
-
- /*
- * Set the idle thread as the current thread
- */
- movl T_SP(%eax), %esp /* It is safe to set esp */
- movl %eax, CPU_THREAD(%ebx)
-
- /* switch in the hat context for the new thread */
- GET_THREAD_HATP(%ecx, %edi, %ecx)
- pushl %ecx
- call hat_switch
- addl $4, %esp
-
- /*
- * Clear and unlock previous thread's t_lock
- * to allow it to be dispatched by another processor.
- */
- movb $0, T_LOCK(%esi)
-
- /*
- * IMPORTANT: Registers at this point must be:
- * %edi = new thread
- *
- * Here we are in the idle thread, have dropped the old thread.
- */
- ALTENTRY(_resume_from_idle)
- /*
- * spin until dispatched thread's mutex has
- * been unlocked. this mutex is unlocked when
- * it becomes safe for the thread to run.
- */
-.L4:
- lock
- btsl $0, T_LOCK(%edi) /* lock new thread's mutex */
- jc .L4_2 /* lock did not succeed */
-
- /*
- * Fix CPU structure to indicate new running thread.
- * Set pointer in new thread to the CPU structure.
- */
- LOADCPU(%esi) /* load current CPU pointer */
- movl T_STACK(%edi), %eax /* here to use v pipeline of */
- /* Pentium. Used few lines below */
- cmpl %esi, T_CPU(%edi)
- jne .L5_2
-.L5_1:
- /*
- * Setup esp0 (kernel stack) in TSS to curthread's stack. If this
- * thread doesn't have a regs structure above the stack -- that is, if
- * lwp_stk_init() was never called for the thread -- this will set
- * esp0 to the wrong value, but it's harmless as it's a kernel thread,
- * and it won't actually attempt to implicitly use the esp0 via a
- * privilege change.
- */
- movl CPU_TSS(%esi), %ecx
- addl $REGSIZE+MINFRAME, %eax /* to the bottom of thread stack */
-#if !defined(__xpv)
- movl %eax, TSS_ESP0(%ecx)
-#else
- pushl %eax
- pushl $KDS_SEL
- call HYPERVISOR_stack_switch
- addl $8, %esp
-#endif /* __xpv */
-
- movl %edi, CPU_THREAD(%esi) /* set CPU's thread pointer */
- mfence /* synchronize with mutex_exit() */
- xorl %ebp, %ebp /* make $<threadlist behave better */
- movl T_LWP(%edi), %eax /* set associated lwp to */
- movl %eax, CPU_LWP(%esi) /* CPU's lwp ptr */
-
- movl T_SP(%edi), %esp /* switch to outgoing thread's stack */
- movl T_PC(%edi), %esi /* saved return addr */
-
- /*
- * Call restorectx if context ops have been installed.
- */
- movl T_CTX(%edi), %eax /* should resumed thread restorectx? */
- testl %eax, %eax
- jz .norestorectx /* skip call when zero */
- pushl %edi /* arg = thread pointer */
- call restorectx /* call ctx ops */
- addl $4, %esp /* restore stack pointer */
-.norestorectx:
-
- /*
- * Call restorepctx if context ops have been installed for the proc.
- */
- movl T_PROCP(%edi), %eax
- cmpl $0, P_PCTX(%eax)
- je .norestorepctx
- pushl %eax /* arg = proc pointer */
- call restorepctx
- addl $4, %esp /* restore stack pointer */
-.norestorepctx:
-
- STORE_INTR_START(%edi)
-
- /*
- * Restore non-volatile registers, then have spl0 return to the
- * resuming thread's PC after first setting the priority as low as
- * possible and blocking all interrupt threads that may be active.
- */
- movl %esi, %eax /* save return address */
- RESTORE_REGS(%ecx)
- pushl %eax /* push return address for spl0() */
- call __dtrace_probe___sched_on__cpu
- jmp spl0
-
-resume_return:
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
-
-.L4_2:
- pause
- cmpb $0, T_LOCK(%edi)
- je .L4
- jmp .L4_2
-
-.L5_2:
- /* cp->cpu_stats.sys.cpumigrate++ */
- addl $1, CPU_STATS_SYS_CPUMIGRATE(%esi)
- adcl $0, CPU_STATS_SYS_CPUMIGRATE+4(%esi)
- movl %esi, T_CPU(%edi) /* set new thread's CPU pointer */
- jmp .L5_1
-
- SET_SIZE(_resume_from_idle)
- SET_SIZE(resume)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume_from_zombie(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(resume_from_zombie)
movq %gs:CPU_THREAD, %rax
leaq resume_from_zombie_return(%rip), %r11
@@ -727,88 +439,6 @@ resume_from_zombie_return:
ret
SET_SIZE(resume_from_zombie)
-#elif defined (__i386)
-
- ENTRY(resume_from_zombie)
- movl %gs:CPU_THREAD, %eax
- movl $resume_from_zombie_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_from_zombie_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- movl %gs:CPU_THREAD, %esi /* %esi = curthread */
-
- /* clean up the fp unit. It might be left enabled */
-
- movl %cr0, %eax
- testl $CR0_TS, %eax
- jnz .zfpu_disabled /* if TS already set, nothing to do */
- fninit /* init fpu & discard pending error */
- orl $CR0_TS, %eax
- movl %eax, %cr0
-.zfpu_disabled:
-
- /*
- * Temporarily switch to the idle thread's stack so that the zombie
- * thread's stack can be reclaimed by the reaper.
- */
- movl %gs:CPU_IDLE_THREAD, %eax /* idle thread pointer */
- movl T_SP(%eax), %esp /* get onto idle thread stack */
-
- /*
- * Set the idle thread as the current thread.
- */
- movl %eax, %gs:CPU_THREAD
-
- /*
- * switch in the hat context for the new thread
- */
- GET_THREAD_HATP(%ecx, %edi, %ecx)
- pushl %ecx
- call hat_switch
- addl $4, %esp
-
- /*
- * Put the zombie on death-row.
- */
- pushl %esi
- call reapq_add
- addl $4, %esp
- jmp _resume_from_idle /* finish job of resume */
-
-resume_from_zombie_return:
- RESTORE_REGS(%ecx) /* restore non-volatile registers */
- call __dtrace_probe___sched_on__cpu
-
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
- SET_SIZE(resume_from_zombie)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume_from_intr(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(resume_from_intr)
movq %gs:CPU_THREAD, %rax
leaq resume_from_intr_return(%rip), %r11
@@ -835,6 +465,8 @@ resume_from_intr(kthread_t *t)
STORE_INTR_START(%r12)
+ call ht_mark
+
/*
* Restore non-volatile registers, then have spl0 return to the
* resuming thread's PC after first setting the priority as low as
@@ -854,69 +486,6 @@ resume_from_intr_return:
ret
SET_SIZE(resume_from_intr)
-#elif defined (__i386)
-
- ENTRY(resume_from_intr)
- movl %gs:CPU_THREAD, %eax
- movl $resume_from_intr_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- movl %gs:CPU_THREAD, %esi /* %esi = curthread */
- movl %edi, %gs:CPU_THREAD /* set CPU's thread pointer */
- mfence /* synchronize with mutex_exit() */
- movl T_SP(%edi), %esp /* restore resuming thread's sp */
- xorl %ebp, %ebp /* make $<threadlist behave better */
-
- /*
- * Unlock outgoing thread's mutex dispatched by another processor.
- */
- xorl %eax,%eax
- xchgb %al, T_LOCK(%esi)
-
- STORE_INTR_START(%edi)
-
- /*
- * Restore non-volatile registers, then have spl0 return to the
- * resuming thread's PC after first setting the priority as low as
- * possible and blocking all interrupt threads that may be active.
- */
- movl T_PC(%edi), %eax /* saved return addr */
- RESTORE_REGS(%ecx)
- pushl %eax /* push return address for spl0() */
- call __dtrace_probe___sched_on__cpu
- jmp spl0
-
-resume_from_intr_return:
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
- SET_SIZE(resume_from_intr)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-void
-thread_start(void)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(thread_start)
popq %rax /* start() */
popq %rdi /* arg */
@@ -927,36 +496,6 @@ thread_start(void)
/*NOTREACHED*/
SET_SIZE(thread_start)
-#elif defined(__i386)
-
- ENTRY(thread_start)
- popl %eax
- movl %esp, %ebp
- addl $8, %ebp
- call *%eax
- addl $8, %esp
- call thread_exit /* destroy thread if it returns. */
- /*NOTREACHED*/
- SET_SIZE(thread_start)
-
-#endif /* __i386 */
-
-#endif /* __lint */
-
-#if defined(__lint)
-
-void
-thread_splitstack_run(caddr_t stack, void (*func)(void *), void *arg)
-{}
-
-void
-thread_splitstack_cleanup(void)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(thread_splitstack_run)
pushq %rbp /* push base pointer */
movq %rsp, %rbp /* construct frame */
@@ -995,34 +534,4 @@ thread_splitstack_cleanup(void)
ret
SET_SIZE(thread_splitstack_cleanup)
-#elif defined(__i386)
-
- ENTRY(thread_splitstack_run)
- pushl %ebp /* push base pointer */
- movl %esp, %ebp /* construct frame */
- movl 8(%ebp), %esp /* set stack pointer */
- movl 12(%ebp), %eax /* load func */
- movl 16(%ebp), %edx /* load arg */
- pushl %edx /* push arg */
- call *%eax /* call specifed function */
- addl $4, %esp /* restore stack pointer */
- leave /* pop base pointer */
- ret
- SET_SIZE(thread_splitstack_run)
-
- /*
- * See comment in the amd64 code, above.
- */
- ENTRY(thread_splitstack_cleanup)
- LOADCPU(%eax)
- movl CPU_TSS(%eax), %ecx
- movl CPU_THREAD(%eax), %edx
- movl T_STACK(%edx), %edx
- addl $REGSIZE+MINFRAME, %edx
- movl %edx, TSS_ESP0(%ecx)
- ret
- SET_SIZE(thread_splitstack_cleanup)
-
-#endif /* __i386 */
-
-#endif /* __lint */
+#endif /* !__lint */
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index c6d696dc6e..fb6b6f0fdb 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -254,6 +254,7 @@ extern "C" {
#define CPUID_INTC_EDX_7_0_AVX5124FMAPS 0x00000008 /* AVX512 4FMAPS */
#define CPUID_INTC_EDX_7_0_SPEC_CTRL 0x04000000 /* Spec, IBPB, IBRS */
#define CPUID_INTC_EDX_7_0_STIBP 0x08000000 /* STIBP */
+#define CPUID_INTC_EDX_7_0_FLUSH_CMD 0x10000000 /* IA32_FLUSH_CMD */
#define CPUID_INTC_EDX_7_0_ARCH_CAPS 0x20000000 /* IA32_ARCH_CAPS */
#define CPUID_INTC_EDX_7_0_SSBD 0x80000000 /* SSBD */
@@ -362,11 +363,12 @@ extern "C" {
/*
* Intel IA32_ARCH_CAPABILITIES MSR.
*/
-#define MSR_IA32_ARCH_CAPABILITIES 0x10a
-#define IA32_ARCH_CAP_RDCL_NO 0x0001
-#define IA32_ARCH_CAP_IBRS_ALL 0x0002
-#define IA32_ARCH_CAP_RSBA 0x0004
-#define IA32_ARCH_CAP_SSB_NO 0x0010
+#define MSR_IA32_ARCH_CAPABILITIES 0x10a
+#define IA32_ARCH_CAP_RDCL_NO 0x0001
+#define IA32_ARCH_CAP_IBRS_ALL 0x0002
+#define IA32_ARCH_CAP_RSBA 0x0004
+#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x0008
+#define IA32_ARCH_CAP_SSB_NO 0x0010
/*
* Intel Speculation related MSRs
@@ -379,6 +381,9 @@ extern "C" {
#define MSR_IA32_PRED_CMD 0x49
#define IA32_PRED_CMD_IBPB 0x01
+#define MSR_IA32_FLUSH_CMD 0x10b
+#define IA32_FLUSH_CMD_L1D 0x01
+
#define MCI_CTL_VALUE 0xffffffff
#define MTRR_TYPE_UC 0
@@ -491,6 +496,8 @@ extern "C" {
#define X86FSET_RSBA 78
#define X86FSET_SSB_NO 79
#define X86FSET_STIBP_ALL 80
+#define X86FSET_FLUSH_CMD 81
+#define X86FSET_L1D_VM_NO 82
/*
* Intel Deep C-State invariant TSC in leaf 0x80000007.
@@ -773,7 +780,7 @@ extern "C" {
#if defined(_KERNEL) || defined(_KMEMUSER)
-#define NUM_X86_FEATURES 81
+#define NUM_X86_FEATURES 83
extern uchar_t x86_featureset[];
extern void free_x86_featureset(void *featureset);
@@ -792,6 +799,8 @@ extern uint_t pentiumpro_bug4046376;
extern const char CyrixInstead[];
+extern void (*spec_l1d_flush)(void);
+
#endif
#if defined(_KERNEL)
diff --git a/usr/src/uts/intel/vnd/Makefile b/usr/src/uts/intel/vnd/Makefile
index fc94398b99..b94d014eb7 100644
--- a/usr/src/uts/intel/vnd/Makefile
+++ b/usr/src/uts/intel/vnd/Makefile
@@ -10,7 +10,7 @@
#
#
-# Copyright 2017 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
UTSBASE = ../..
@@ -29,7 +29,8 @@ LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
CONF_SRCDIR = $(UTSBASE)/common/io/vnd
-LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue
+CPPFLAGS += -I$(UTSBASE)/i86pc
+LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue
#
# We use <sys/ctype.h> which causes gcc to think that all of its inline
diff --git a/usr/src/uts/intel/zfs/Makefile b/usr/src/uts/intel/zfs/Makefile
index a4a2f4a561..07d4395c22 100644
--- a/usr/src/uts/intel/zfs/Makefile
+++ b/usr/src/uts/intel/zfs/Makefile
@@ -29,6 +29,7 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
#
+# Copyright 2018 Joyent, Inc.
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
@@ -72,6 +73,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua
INC_PATH += -I$(SRC)/common
INC_PATH += -I$(COMMONBASE)/zfs
+CPPFLAGS += -I$(UTSBASE)/i86pc
C99LMODE= -Xc99=%all
#
diff --git a/usr/src/uts/sparc/zfs/Makefile b/usr/src/uts/sparc/zfs/Makefile
index f32b408306..617d495325 100644
--- a/usr/src/uts/sparc/zfs/Makefile
+++ b/usr/src/uts/sparc/zfs/Makefile
@@ -29,6 +29,8 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
#
+# Copyright 2018 Joyent, Inc.
+#
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
@@ -74,6 +76,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs
INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua
INC_PATH += -I$(SRC)/common
INC_PATH += -I$(COMMONBASE)/zfs
+INC_PATH += -I$(UTSBASE)/sun4
C99LMODE= -Xc99=%all
diff --git a/usr/src/uts/sun4/sys/ht.h b/usr/src/uts/sun4/sys/ht.h
new file mode 100644
index 0000000000..831891979f
--- /dev/null
+++ b/usr/src/uts/sun4/sys/ht.h
@@ -0,0 +1,37 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _SYS_HT_H
+#define _SYS_HT_H
+
+#include <sys/types.h>
+#include <sys/thread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ht_init() {}
+
+#define ht_should_run(t, c) (B_TRUE)
+#define ht_adjust_cpu_score(t, c, p) (p)
+#define ht_mark_safe(void) {}
+#define ht_mark_unsafe(void) {}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_HT_H */
diff --git a/usr/src/uts/sun4u/sys/Makefile b/usr/src/uts/sun4u/sys/Makefile
index 8e73425995..a69a2b14f1 100644
--- a/usr/src/uts/sun4u/sys/Makefile
+++ b/usr/src/uts/sun4u/sys/Makefile
@@ -21,7 +21,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# uts/sun4u/sys/Makefile
+# Copyright 2018 Joyent, Inc.
#
UTSBASE = ../..
@@ -40,18 +40,19 @@ SUN4_HDRS= \
clock.h \
cmp.h \
cpc_ultra.h \
- cpu_sgnblk_defs.h \
+ cpu_sgnblk_defs.h \
ddi_subrdefs.h \
dvma.h \
eeprom.h \
errclassify.h \
fcode.h \
fc_plat.h \
+ ht.h \
idprom.h \
intr.h \
intreg.h \
ivintr.h \
- memlist_plat.h \
+ memlist_plat.h \
memnode.h \
nexusdebug.h \
prom_debug.h \
diff --git a/usr/src/uts/sun4v/sys/Makefile b/usr/src/uts/sun4v/sys/Makefile
index 2af0d8841b..6c0fbd666c 100644
--- a/usr/src/uts/sun4v/sys/Makefile
+++ b/usr/src/uts/sun4v/sys/Makefile
@@ -22,8 +22,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#
-# uts/sun4v/sys/Makefile
+# Copyright 2018 Joyent, Inc.
#
# include global definitions
UTSBASE = ../..
@@ -42,16 +41,17 @@ SUN4_HDRS= \
clock.h \
cmp.h \
cpc_ultra.h \
- cpu_sgnblk_defs.h \
+ cpu_sgnblk_defs.h \
ddi_subrdefs.h \
dvma.h \
eeprom.h \
fcode.h \
+ ht.h \
idprom.h \
intr.h \
intreg.h \
ivintr.h \
- memlist_plat.h \
+ memlist_plat.h \
memnode.h \
nexusdebug.h \
prom_debug.h \