summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Levon <john.levon@joyent.com>2018-08-14 21:14:28 +0000
committerRobert Mustacchi <rm@joyent.com>2018-08-14 23:15:49 +0000
commit89d0fffcadbabb8694d3ce87b5be826e2b789c99 (patch)
treea038f703ae6cfae6d41fe1ed8c17be5687e864da
parent10ad6220c95adc2a5592ea98b1c7ced27d6942ed (diff)
downloadillumos-joyent-release-20180802.tar.gz
OS-7125 Need mitigation of L1TF (CVE-2018-3646)release-20180802
Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r--exception_lists/copyright2
-rw-r--r--exception_lists/keywords3
-rw-r--r--exception_lists/wscheck96
-rw-r--r--usr/src/uts/common/disp/cpupart.c14
-rw-r--r--usr/src/uts/common/disp/disp.c204
-rw-r--r--usr/src/uts/common/disp/thread.c7
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c27
-rw-r--r--usr/src/uts/common/io/vnd/vnd.c9
-rw-r--r--usr/src/uts/common/os/cpu.c21
-rw-r--r--usr/src/uts/common/os/lgrp.c4
-rw-r--r--usr/src/uts/common/sys/cpuvar.h14
-rw-r--r--usr/src/uts/common/sys/disp.h13
-rw-r--r--usr/src/uts/common/sys/thread.h3
-rw-r--r--usr/src/uts/i86pc/Makefile.files3
-rw-r--r--usr/src/uts/i86pc/io/apix/apix_intr.c12
-rw-r--r--usr/src/uts/i86pc/io/viona/viona.c7
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx.c20
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm.c12
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c5
-rw-r--r--usr/src/uts/i86pc/os/cpuid.c67
-rw-r--r--usr/src/uts/i86pc/os/ht.c599
-rw-r--r--usr/src/uts/i86pc/os/intr.c31
-rw-r--r--usr/src/uts/i86pc/sys/Makefile5
-rw-r--r--usr/src/uts/i86pc/sys/ht.h46
-rw-r--r--usr/src/uts/i86pc/sys/machcpuvar.h15
-rw-r--r--usr/src/uts/i86pc/sys/vmm.h3
-rw-r--r--usr/src/uts/i86xpv/Makefile.files7
-rw-r--r--usr/src/uts/intel/ia32/ml/copy.s7
-rw-r--r--usr/src/uts/intel/ia32/ml/swtch.s507
-rw-r--r--usr/src/uts/intel/sys/x86_archext.h21
-rw-r--r--usr/src/uts/intel/vnd/Makefile5
-rw-r--r--usr/src/uts/intel/zfs/Makefile2
-rw-r--r--usr/src/uts/sparc/zfs/Makefile3
-rw-r--r--usr/src/uts/sun4/sys/ht.h37
-rw-r--r--usr/src/uts/sun4u/sys/Makefile7
-rw-r--r--usr/src/uts/sun4v/sys/Makefile8
36 files changed, 1182 insertions, 664 deletions
diff --git a/exception_lists/copyright b/exception_lists/copyright
index 12819c29a1..1fe2c1dd4e 100644
--- a/exception_lists/copyright
+++ b/exception_lists/copyright
@@ -126,6 +126,8 @@ usr/src/common/bzip2/huffman.c
usr/src/common/ficl/*
usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.descrip
usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.tmpl
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
usr/src/grub/grub-0.97/stage2/Makefile.am
usr/src/grub/grub-0.97/stage2/builtins.c
usr/src/grub/grub-0.97/stage2/disk_io.c
diff --git a/exception_lists/keywords b/exception_lists/keywords
index a8859d25d1..5ff9cc8ab0 100644
--- a/exception_lists/keywords
+++ b/exception_lists/keywords
@@ -22,6 +22,7 @@
# Copyright 2017 Nexenta Systems, Inc.
# Copyright (c) 2013 by Delphix. All rights reserved.
# Copyright 2016 Toomas Soome <tsoome@me.com>
+# Copyright 2018 Joyent, Inc.
#
syntax: glob
@@ -36,6 +37,8 @@ usr/src/data/locale/data/zh_SG.UTF-8.src
usr/src/data/locale/data/zh_TW.UTF-8.src
usr/src/data/terminfo/termcap.src
usr/src/data/terminfo/terminfo.src
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
usr/src/test/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2
usr/src/test/zfs-tests/tests/functional/delegate/delegate_common.kshlib
usr/src/test/test-runner/cmd/run
diff --git a/exception_lists/wscheck b/exception_lists/wscheck
new file mode 100644
index 0000000000..0d06b13802
--- /dev/null
+++ b/exception_lists/wscheck
@@ -0,0 +1,96 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2018 Joyent, Inc.
+#
+syntax: glob
+
+usr/src/data/ucode/amd/*
+usr/src/data/ucode/intel/*
+
+# bhyve sources
+usr/src/cmd/bhyve/acpi.[ch]
+usr/src/cmd/bhyve/ahci.h
+usr/src/cmd/bhyve/atkbdc.[ch]
+usr/src/cmd/bhyve/bhyvegc.[ch]
+usr/src/cmd/bhyve/bhyverun.[ch]
+usr/src/cmd/bhyve/block_if.[ch]
+usr/src/cmd/bhyve/bootrom.[ch]
+usr/src/cmd/bhyve/console.[ch]
+usr/src/cmd/bhyve/consport.c
+usr/src/cmd/bhyve/dbgport.[ch]
+usr/src/cmd/bhyve/fwctl.[ch]
+usr/src/cmd/bhyve/gdb.[ch]
+usr/src/cmd/bhyve/inout.[ch]
+usr/src/cmd/bhyve/ioapic.[ch]
+usr/src/cmd/bhyve/mem.[ch]
+usr/src/cmd/bhyve/mevent.[ch]
+usr/src/cmd/bhyve/mevent_test.c
+usr/src/cmd/bhyve/mptbl.[ch]
+usr/src/cmd/bhyve/pci_ahci.c
+usr/src/cmd/bhyve/pci_e82545.c
+usr/src/cmd/bhyve/pci_emul.[ch]
+usr/src/cmd/bhyve/pci_fbuf.c
+usr/src/cmd/bhyve/pci_hostbridge.c
+usr/src/cmd/bhyve/pci_irq.[ch]
+usr/src/cmd/bhyve/pci_lpc.[ch]
+usr/src/cmd/bhyve/pci_passthru.c
+usr/src/cmd/bhyve/pci_uart.c
+usr/src/cmd/bhyve/pci_virtio_block.c
+usr/src/cmd/bhyve/pci_virtio_console.c
+usr/src/cmd/bhyve/pci_virtio_net.c
+usr/src/cmd/bhyve/pci_virtio_rnd.c
+usr/src/cmd/bhyve/pci_xhci.[ch]
+usr/src/cmd/bhyve/pm.c
+usr/src/cmd/bhyve/pmtmr.c
+usr/src/cmd/bhyve/post.c
+usr/src/cmd/bhyve/ps2kbd.[ch]
+usr/src/cmd/bhyve/ps2mouse.[ch]
+usr/src/cmd/bhyve/rfb.[ch]
+usr/src/cmd/bhyve/rtc.[ch]
+usr/src/cmd/bhyve/smbiostbl.[ch]
+usr/src/cmd/bhyve/sockstream.[ch]
+usr/src/cmd/bhyve/spinup_ap.[ch]
+usr/src/cmd/bhyve/task_switch.c
+usr/src/cmd/bhyve/uart_emul.[ch]
+usr/src/cmd/bhyve/usb_emul.[ch]
+usr/src/cmd/bhyve/usb_mouse.c
+usr/src/cmd/bhyve/vga.[ch]
+usr/src/cmd/bhyve/virtio.[ch]
+usr/src/cmd/bhyve/xmsr.[ch]
+usr/src/cmd/bhyveconsole/bhyveconsole.c
+usr/src/cmd/bhyvectl/bhyvectl.c
+usr/src/compat/freebsd/*.h
+usr/src/compat/freebsd/*/*.h
+usr/src/compat/freebsd/amd64/machine/*.h
+usr/contrib/freebsd/*/*.h
+usr/contrib/freebsd/*/*/*.h
+usr/contrib/freebsd/lib/libutil/*.c
+usr/src/head/bhyve.h
+usr/src/lib/libvmmapi/common/vmmapi.[ch]
+usr/src/uts/i86pc/io/vmm/amd/*.[ch]
+usr/src/uts/i86pc/io/vmm/intel/*.[chs]
+usr/src/uts/i86pc/io/vmm/io/*.[ch]
+usr/src/uts/i86pc/io/vmm/vmm.c
+usr/src/uts/i86pc/io/vmm/vmm_host.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
+usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_ipi.h
+usr/src/uts/i86pc/io/vmm/vmm_ktr.h
+usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_mem.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
+usr/src/uts/i86pc/io/vmm/vmm_stat.[ch]
+usr/src/uts/i86pc/io/vmm/vmm_util.[ch]
+usr/src/uts/i86pc/io/vmm/vmx_assym.s
+usr/src/uts/i86pc/io/vmm/x86.[ch]
+usr/src/uts/i86pc/sys/vmm.h
+usr/src/uts/i86pc/sys/vmm_dev.h
+usr/src/uts/i86pc/sys/vmm_instruction_emul.h
diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c
index c260329c61..4ddc568187 100644
--- a/usr/src/uts/common/disp/cpupart.c
+++ b/usr/src/uts/common/disp/cpupart.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -324,7 +326,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
kthread_t *t;
int move_threads = 1;
lgrp_id_t lgrpid;
- proc_t *p;
+ proc_t *p;
int lgrp_diff_lpl;
lpl_t *cpu_lpl;
int ret;
@@ -569,8 +571,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_forw;
} while (t != p->p_tlist);
@@ -622,8 +624,8 @@ again:
/* Update CPU last ran on if it was this CPU */
if (t->t_cpu == cp && t->t_cpupart == oldpp &&
t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
}
t = t->t_next;
@@ -883,7 +885,7 @@ cpupart_create(psetid_t *psid)
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
- void *projbuf, *zonebuf;
+ void *projbuf, *zonebuf;
kthread_t *t;
proc_t *p;
int err = 0;
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 5f9c2c68a2..4898a18bf2 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
+#include <sys/ht.h>
#include <vm/as.h>
@@ -1135,15 +1136,13 @@ swtch_to(kthread_t *next)
*/
}
-#define CPU_IDLING(pri) ((pri) == -1)
-
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
int call_poke_cpu = 0;
pri_t cpupri = cp->cpu_dispatch_pri;
- if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
+ if (cpupri != CPU_IDLE_PRI && cpupri < tpri) {
TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
"CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
@@ -1239,17 +1238,17 @@ setbackdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run where
* it last ran...but will consider migration if:
- * - We thread probably doesn't have much cache warmth.
+ * - The thread probably doesn't have much cache warmth.
+ * - HT exclusion would prefer us to run elsewhere
* - The CPU where it last ran is the target of an offline
* request.
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
*/
if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
- (tp->t_cpu == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
- } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- self ? tp->t_cpu : NULL);
+ !ht_should_run(tp, tp->t_cpu) ||
+ (tp->t_cpu == cpu_inmotion) ||
+ !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
} else {
cp = tp->t_cpu;
}
@@ -1278,7 +1277,8 @@ setbackdq(kthread_t *tp)
newcp = cp->cpu_next_part;
}
- if (RUNQ_LEN(newcp, tpri) < qlen) {
+ if (ht_should_run(tp, newcp) &&
+ RUNQ_LEN(newcp, tpri) < qlen) {
DTRACE_PROBE3(runq__balance,
kthread_t *, tp,
cpu_t *, cp, cpu_t *, newcp);
@@ -1289,8 +1289,8 @@ setbackdq(kthread_t *tp)
/*
* Migrate to a cpu in the new partition.
*/
- cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp,
+ tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1427,7 +1427,7 @@ setfrontdq(kthread_t *tp)
/*
* We'll generally let this thread continue to run
* where it last ran, but will consider migration if:
- * - The thread last ran outside it's home lgroup.
+ * - The thread last ran outside its home lgroup.
* - The CPU where it last ran is the target of an
* offline request (a thread_nomigrate() on the in
* motion CPU relies on this when forcing a preempt).
@@ -1435,21 +1435,18 @@ setfrontdq(kthread_t *tp)
* it last ran, and it is considered not likely to
* have significant cache warmth.
*/
- if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
- (cp == cpu_inmotion)) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- (tp == curthread) ? cp : NULL);
- } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
- (!THREAD_HAS_CACHE_WARMTH(tp))) {
- cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
- NULL);
+ if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) ||
+ cp == cpu_inmotion ||
+ (tpri < cp->cpu_disp->disp_maxrunpri &&
+ !THREAD_HAS_CACHE_WARMTH(tp))) {
+ cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri);
}
} else {
/*
* Migrate to a cpu in the new partition.
*/
cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
- tp->t_lpl, tp->t_pri, NULL);
+ tp, tp->t_pri);
}
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
} else {
@@ -1600,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf)
/* migrate to a cpu in the new partition */
cp = tp->t_cpupart->cp_cpulist;
}
- cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
+ cp = disp_lowpri_cpu(cp, tp, tp->t_pri);
disp_lock_enter_high(&cp->cpu_disp->disp_lock);
ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
@@ -2573,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp)
}
/*
- * disp_lowpri_cpu - find CPU running the lowest priority thread.
- * The hint passed in is used as a starting point so we don't favor
- * CPU 0 or any other CPU. The caller should pass in the most recently
- * used CPU for the thread.
+ * Return a score rating this CPU for running this thread: lower is better.
*
- * The lgroup and priority are used to determine the best CPU to run on
- * in a NUMA machine. The lgroup specifies which CPUs are closest while
- * the thread priority will indicate whether the thread will actually run
- * there. To pick the best CPU, the CPUs inside and outside of the given
- * lgroup which are running the lowest priority threads are found. The
- * remote CPU is chosen only if the thread will not run locally on a CPU
- * within the lgroup, but will run on the remote CPU. If the thread
- * cannot immediately run on any CPU, the best local CPU will be chosen.
+ * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for
+ * curcpu (as that's our own priority).
*
- * The lpl specified also identifies the cpu partition from which
- * disp_lowpri_cpu should select a CPU.
+ * If a cpu is the target of an offline request, then try to avoid it.
*
- * curcpu is used to indicate that disp_lowpri_cpu is being called on
- * behalf of the current thread. (curthread is looking for a new cpu)
- * In this case, cpu_dispatch_pri for this thread's cpu should be
- * ignored.
+ * Otherwise we'll use double the effective dispatcher priority for the CPU.
*
- * If a cpu is the target of an offline request then try to avoid it.
+ * We do this so ht_adjust_cpu_score() can increment the score if needed,
+ * without ending up over-riding a dispatcher priority.
+ */
+static pri_t
+cpu_score(cpu_t *cp, kthread_t *tp)
+{
+ pri_t score;
+
+ if (tp == curthread && cp == curthread->t_cpu)
+ score = 2 * CPU_IDLE_PRI;
+ else if (cp == cpu_inmotion)
+ score = SHRT_MAX;
+ else
+ score = 2 * cp->cpu_dispatch_pri;
+
+ if (2 * cp->cpu_disp->disp_maxrunpri > score)
+ score = 2 * cp->cpu_disp->disp_maxrunpri;
+ if (2 * cp->cpu_chosen_level > score)
+ score = 2 * cp->cpu_chosen_level;
+
+ return (ht_adjust_cpu_score(tp, cp, score));
+}
+
+/*
+ * disp_lowpri_cpu - find a suitable CPU to run the given thread.
+ *
+ * We are looking for a CPU with an effective dispatch priority lower than the
+ * thread's, so that the thread will run immediately rather than be enqueued.
+ * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group.
+ * If we don't find an available CPU there, we will expand our search to include
+ * wider locality levels. (Note these groups are already divided by CPU
+ * partition.)
+ *
+ * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on
+ * the best home CPU we found.
*
- * This function must be called at either high SPL, or with preemption
- * disabled, so that the "hint" CPU cannot be removed from the online
- * CPU list while we are traversing it.
+ * The hint passed in is used as a starting point so we don't favor CPU 0 or any
+ * other CPU. The caller should pass in the most recently used CPU for the
+ * thread; it's of course possible that this CPU isn't in the home lgroup.
+ *
+ * This function must be called at either high SPL, or with preemption disabled,
+ * so that the "hint" CPU cannot be removed from the online CPU list while we
+ * are traversing it.
*/
cpu_t *
-disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
+disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri)
{
cpu_t *bestcpu;
cpu_t *besthomecpu;
cpu_t *cp, *cpstart;
- pri_t bestpri;
- pri_t cpupri;
-
klgrpset_t done;
- klgrpset_t cur_set;
lpl_t *lpl_iter, *lpl_leaf;
- int i;
- /*
- * Scan for a CPU currently running the lowest priority thread.
- * Cannot get cpu_lock here because it is adaptive.
- * We do not require lock on CPU list.
- */
ASSERT(hint != NULL);
- ASSERT(lpl != NULL);
- ASSERT(lpl->lpl_ncpu > 0);
+ ASSERT(tp->t_lpl->lpl_ncpu > 0);
- /*
- * First examine local CPUs. Note that it's possible the hint CPU
- * passed in in remote to the specified home lgroup. If our priority
- * isn't sufficient enough such that we can run immediately at home,
- * then examine CPUs remote to our home lgroup.
- * We would like to give preference to CPUs closest to "home".
- * If we can't find a CPU where we'll run at a given level
- * of locality, we expand our search to include the next level.
- */
bestcpu = besthomecpu = NULL;
klgrpset_clear(done);
- /* start with lpl we were passed */
- lpl_iter = lpl;
+ lpl_iter = tp->t_lpl;
do {
+ pri_t best = SHRT_MAX;
+ klgrpset_t cur_set;
- bestpri = SHRT_MAX;
klgrpset_clear(cur_set);
- for (i = 0; i < lpl_iter->lpl_nrset; i++) {
+ for (int i = 0; i < lpl_iter->lpl_nrset; i++) {
lpl_leaf = lpl_iter->lpl_rset[i];
if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
continue;
@@ -2659,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
cp = cpstart = lpl_leaf->lpl_cpus;
do {
- if (cp == curcpu)
- cpupri = -1;
- else if (cp == cpu_inmotion)
- cpupri = SHRT_MAX;
- else
- cpupri = cp->cpu_dispatch_pri;
- if (cp->cpu_disp->disp_maxrunpri > cpupri)
- cpupri = cp->cpu_disp->disp_maxrunpri;
- if (cp->cpu_chosen_level > cpupri)
- cpupri = cp->cpu_chosen_level;
- if (cpupri < bestpri) {
- if (CPU_IDLING(cpupri)) {
- ASSERT((cp->cpu_flags &
- CPU_QUIESCED) == 0);
- return (cp);
- }
+ pri_t score = cpu_score(cp, tp);
+
+ if (score < best) {
+ best = score;
bestcpu = cp;
- bestpri = cpupri;
+
+ /* An idle CPU: we're done. */
+ if (score / 2 == CPU_IDLE_PRI)
+ goto out;
}
} while ((cp = cp->cpu_next_lpl) != cpstart);
}
- if (bestcpu && (tpri > bestpri)) {
- ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
- return (bestcpu);
- }
+ if (bestcpu != NULL && tpri > (best / 2))
+ goto out;
+
if (besthomecpu == NULL)
besthomecpu = bestcpu;
+
/*
* Add the lgrps we just considered to the "done" set
*/
@@ -2698,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
* The specified priority isn't high enough to run immediately
* anywhere, so just return the best CPU from the home lgroup.
*/
- ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
- return (besthomecpu);
+ bestcpu = besthomecpu;
+
+out:
+ ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
+ return (bestcpu);
}
/*
@@ -2719,3 +2715,19 @@ static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
+
+cpu_t *
+disp_choose_best_cpu(void)
+{
+ kthread_t *t = curthread;
+ cpu_t *curcpu = CPU;
+
+ ASSERT(t->t_preempt > 0);
+ ASSERT(t->t_state == TS_ONPROC);
+ ASSERT(t->t_schedflag & TS_VCPU);
+
+ if (ht_should_run(t, curcpu))
+ return (curcpu);
+
+ return (disp_lowpri_cpu(curcpu, t, t->t_pri));
+}
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index af000bf4f1..c923ba5d1a 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -74,6 +74,7 @@
#include <sys/waitq.h>
#include <sys/cpucaps.h>
#include <sys/kiconv.h>
+#include <sys/ht.h>
#ifndef STACK_GROWTH_DOWN
#error Stacks do not grow downward; 3b2 zombie attack detected!
@@ -507,8 +508,8 @@ thread_create(
if (CPU->cpu_part == &cp_default)
t->t_cpu = CPU;
else
- t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
- t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t,
+ t->t_pri);
t->t_disp_queue = t->t_cpu->cpu_disp;
kpreempt_enable();
@@ -1422,6 +1423,8 @@ thread_unpin()
itp = t->t_intr; /* interrupted thread */
t->t_intr = NULL; /* clear interrupt ptr */
+ ht_end_intr();
+
/*
* Get state from interrupt thread for the one
* it interrupted.
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 03d711838c..2127de2bf0 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,8 +25,8 @@
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +90,7 @@
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
+#include <sys/ht.h>
#include "zfs_namecheck.h"
@@ -1281,6 +1282,8 @@ zvol_strategy(buf_t *bp)
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
!doread && !is_dumpified;
+ ht_begin_unsafe();
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1328,6 +1331,8 @@ zvol_strategy(buf_t *bp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
biodone(bp);
+ ht_end_unsafe();
+
return (0);
}
@@ -1409,6 +1414,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
mutex_enter(&zonep->zone_vfs_lock);
@@ -1469,6 +1476,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
error);
+ ht_end_unsafe();
+
return (error);
}
@@ -1501,6 +1510,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ ht_begin_unsafe();
+
DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
/*
@@ -1549,6 +1560,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
error);
+ ht_end_unsafe();
+
mutex_enter(&zonep->zone_vfs_lock);
zonep->zone_vfs_rwstats.writes++;
zonep->zone_vfs_rwstats.nwritten += tot_bytes;
@@ -1818,11 +1831,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
mutex_exit(&zfsdev_state_lock);
+
+ ht_begin_unsafe();
+
zil_commit(zv->zv_zilog, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
error = 0;
}
+
+ ht_end_unsafe();
+
return (error);
case DKIOCGETWCE:
@@ -1847,7 +1866,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
} else {
zv->zv_flags &= ~ZVOL_WCE;
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ ht_end_unsafe();
}
return (0);
}
@@ -1900,6 +1921,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
mutex_exit(&zfsdev_state_lock);
+ ht_begin_unsafe();
+
rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
@@ -1932,6 +1955,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
+ ht_end_unsafe();
+
return (error);
}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
index 5a25ed22d5..d03c7ce4ec 100644
--- a/usr/src/uts/common/io/vnd/vnd.c
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -830,6 +830,7 @@
#include <sys/disp.h>
#include <sys/random.h>
#include <sys/gsqueue.h>
+#include <sys/ht.h>
#include <inet/ip.h>
#include <inet/ip6.h>
@@ -3716,6 +3717,12 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
bsize = vsp->vns_bsize;
mutex_exit(&vsp->vns_lock);
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
+
nmps = 0;
mptot = 0;
blocked = B_FALSE;
@@ -3736,6 +3743,8 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
}
}
+ ht_end_unsafe();
+
empty = vnd_dq_is_empty(&vsp->vns_dq_write);
/*
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 2efb68889c..4648dae9dd 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -400,6 +400,9 @@ force_thread_migrate(kthread_id_t tp)
* CPUs prior to a successful return, it should take extra precautions (such as
* their own call to kpreempt_disable) to ensure that safety.
*
+ * CPU_BEST can be used to pick a "best" CPU to migrate to, including
+ * potentially the current CPU.
+ *
* A CPU affinity reference count is maintained by thread_affinity_set and
* thread_affinity_clear (incrementing and decrementing it, respectively),
* maintaining CPU affinity while the count is non-zero, and allowing regions
@@ -416,6 +419,10 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
VERIFY3P(t, ==, curthread);
kpreempt_disable();
cp = CPU;
+ } else if (cpu_id == CPU_BEST) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = disp_choose_best_cpu();
} else {
/*
* We should be asserting that cpu_lock is held here, but
@@ -453,9 +460,8 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
thread_unlock(t);
}
- if (cpu_id == CPU_CURRENT) {
+ if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST)
kpreempt_enable();
- }
}
/*
@@ -1490,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
if (t->t_cpu == cp && t->t_bound_cpu != cp)
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
@@ -1533,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
- if (t->t_cpu == cp && t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
- }
+ if (t->t_cpu == cp && t->t_bound_cpu != cp)
+ t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
+
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
t = t->t_next;
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 6288f47bed..6f6aced619 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +91,7 @@
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
+#include <sys/ht.h>
lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
@@ -520,6 +522,8 @@ lgrp_main_mp_init(void)
{
klgrpset_t changed;
+ ht_init();
+
/*
* Update lgroup topology (if necessary)
*/
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 3ee4e70eec..2cfe5116d9 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -23,7 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2017 RackTop Systems.
*/
@@ -540,13 +540,19 @@ extern struct cpu *curcpup(void);
#endif
/*
- * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id
- * as the target and to grab cpu_lock instead of requiring the caller
- * to grab it.
+ * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's
+ * current CPU is; holding cpu_lock is not required.
*/
#define CPU_CURRENT -3
/*
+ * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a
+ * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock
+ * is not required.
+ */
+#define CPU_BEST -4
+
+/*
* Per-CPU statistics
*
* cpu_stats_t contains numerous system and VM-related statistics, in the form
diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h
index b324f4d323..cb3711edcd 100644
--- a/usr/src/uts/common/sys/disp.h
+++ b/usr/src/uts/common/sys/disp.h
@@ -23,6 +23,8 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -63,11 +65,11 @@ typedef struct _disp {
/*
* Priorities:
* disp_maxrunpri is the maximum run priority of runnable threads
- * on this queue. It is -1 if nothing is runnable.
+ * on this queue. It is -1 if nothing is runnable.
*
* disp_max_unbound_pri is the maximum run priority of threads on
* this dispatch queue but runnable by any CPU. This may be left
- * artificially high, then corrected when some CPU tries to take
+ * artificially high, then corrected when some CPU tries to take
* an unbound thread. It is -1 if nothing is runnable.
*/
pri_t disp_maxrunpri; /* maximum run priority */
@@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *);
extern void cpu_rechoose(kthread_t *);
extern void cpu_surrender(kthread_t *);
extern void kpreempt(int);
-extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t,
- struct cpu *);
+extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t);
extern int disp_bound_threads(struct cpu *, int);
extern int disp_bound_anythreads(struct cpu *, int);
extern int disp_bound_partition(struct cpu *, int);
@@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *)
extern void disp_swapped_enq(kthread_t *);
extern int disp_anywork(void);
+extern struct cpu *disp_choose_best_cpu(void);
+
#define KPREEMPT_SYNC (-1)
#define kpreempt_disable() \
{ \
@@ -183,6 +186,8 @@ extern int disp_anywork(void);
#endif /* _KERNEL */
+#define CPU_IDLE_PRI (-1)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index af9fcb75cf..678d356564 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -354,6 +354,8 @@ typedef struct _kthread {
kmutex_t t_wait_mutex; /* used in CV wait functions */
char *t_name; /* thread name */
+
+ uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */
} kthread_t;
/*
@@ -417,6 +419,7 @@ typedef struct _kthread {
#define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */
#define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */
#define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */
+#define TS_VCPU 0x0080 /* thread will enter guest context */
#define TS_CSTART 0x0100 /* setrun() by continuelwps() */
#define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */
#define TS_XSTART 0x0400 /* setrun() by SIGCONT */
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index fcf9820fd8..2a94505acb 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -68,6 +68,7 @@ CORE_OBJS += \
hment.o \
hold_page.o \
hrtimers.o \
+ ht.o \
htable.o \
hypercall.o \
hypersubr.o \
@@ -293,7 +294,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/common/xen
# since only C headers are included when #defined(__lint) is true.
#
-ASSYM_DEPS += \
+ASSYM_DEPS += \
copy.o \
desctbls_asm.o \
ddi_i86_asm.o \
diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c
index 227ce0c991..59d8787839 100644
--- a/usr/src/uts/i86pc/io/apix/apix_intr.c
+++ b/usr/src/uts/i86pc/io/apix/apix_intr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Western Digital Corporation. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/cpuvar.h>
@@ -68,6 +69,7 @@
#include <vm/hat_i86.h>
#include <sys/stack.h>
#include <sys/apix.h>
+#include <sys/ht.h>
static void apix_post_hardint(int);
@@ -280,6 +282,7 @@ apix_do_softint_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil,
it->t_intr = t;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Set bit for this pil in CPU's interrupt active bitmask.
@@ -350,7 +353,9 @@ apix_do_softint_epilog(struct cpu *cpu, uint_t oldpil)
it->t_link = cpu->cpu_intr_thread;
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
@@ -466,6 +471,8 @@ apix_hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil,
}
}
+ ht_begin_intr(pil);
+
/* store starting timestamp in CPu structure for this IPL */
mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;
@@ -556,6 +563,8 @@ apix_hilevel_intr_epilog(struct cpu *cpu, uint_t oldpil)
t->t_intr_start = now;
}
+ ht_end_intr();
+
mcpu->mcpu_pri = oldpil;
if (pil < CBE_HIGH_PIL)
(void) (*setlvlx)(oldpil, 0);
@@ -668,6 +677,7 @@ apix_intr_thread_prolog(struct cpu *cpu, uint_t pil, caddr_t stackptr)
it->t_state = TS_ONPROC;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Initialize thread priority level from intr_pri
@@ -756,7 +766,9 @@ apix_intr_thread_epilog(struct cpu *cpu, uint_t oldpil)
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
index d3a3bdd44f..3c52457a0b 100644
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ b/usr/src/uts/i86pc/io/viona/viona.c
@@ -220,6 +220,7 @@
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <vm/seg_kmem.h>
+#include <sys/ht.h>
#include <sys/pattr.h>
#include <sys/dls.h>
@@ -2414,7 +2415,13 @@ viona_tx(viona_link_t *link, viona_vring_t *ring)
viona_tx_done(ring, len, cookie);
}
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ ht_begin_unsafe();
mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
+ ht_end_unsafe();
return;
drop_fail:
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
index 9ad232a612..e07ee0ea52 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#ifndef __FreeBSD__
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
+#include <sys/ht.h>
#endif
#include <vm/vm.h>
@@ -3052,11 +3053,30 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
break;
}
+#ifndef __FreeBSD__
+ if ((rc = ht_acquire()) != 1) {
+ enable_intr();
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ if (rc == -1) {
+ vmexit->exitcode = VM_EXITCODE_HT;
+ } else {
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ handled = HANDLED;
+ }
+ break;
+ }
+#endif
+
vmx_run_trace(vmx, vcpu);
vmx_dr_enter_guest(vmxctx);
rc = vmx_enter_guest(vmxctx, vmx, launched);
vmx_dr_leave_guest(vmxctx);
+#ifndef __FreeBSD__
+ ht_release();
+#endif
+
/* Collect some information for VM exit processing */
vmexit->rip = rip = vmcs_guest_rip();
vmexit->inst_length = vmexit_instruction_length();
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index bcb6b77cea..164227cc5e 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -1997,7 +1997,6 @@ vmm_freectx(void *arg, int isexec)
#endif /* __FreeBSD */
-
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
@@ -2013,6 +2012,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap_t pmap;
#ifndef __FreeBSD__
vm_thread_ctx_t vtc;
+ int affinity_type = CPU_CURRENT;
#endif
vcpuid = vmrun->cpuid;
@@ -2044,7 +2044,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
restart:
#ifndef __FreeBSD__
- thread_affinity_set(curthread, CPU_CURRENT);
+ thread_affinity_set(curthread, affinity_type);
/*
* Resource localization should happen after the CPU affinity for the
* thread has been set to ensure that access from restricted contexts,
@@ -2054,6 +2054,8 @@ restart:
* This must be done prior to disabling kpreempt via critical_enter().
*/
vm_localize_resources(vm, vcpu);
+
+ affinity_type = CPU_CURRENT;
#endif
critical_enter();
@@ -2145,6 +2147,12 @@ restart:
retu = true;
}
break;
+
+ case VM_EXITCODE_HT: {
+ affinity_type = CPU_BEST;
+ break;
+ }
+
#endif
default:
retu = true; /* handled in userland */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index fff951f82b..3c0d9beec2 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -28,6 +28,7 @@
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
+#include <sys/ht.h>
#include <sys/kernel.h>
@@ -374,6 +375,10 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
break;
}
vmrun.cpuid = vcpu;
+
+ if (!(curthread->t_schedflag & TS_VCPU))
+ ht_mark_as_vcpu();
+
error = vm_run(sc->vmm_vm, &vmrun);
/*
* XXXJOY: I think it's necessary to do copyout, even in the
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index bc3d80189b..3f9132ba4e 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -217,7 +217,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = {
"ibrs_all",
"rsba",
"ssb_no",
- "stibp_all"
+ "stibp_all",
+ "flush_cmd",
+ "l1d_vmentry_no"
};
boolean_t
@@ -986,6 +988,19 @@ cpuid_amd_getids(cpu_t *cpu)
}
static void
+spec_l1d_flush_noop(void)
+{
+}
+
+static void
+spec_l1d_flush_msr(void)
+{
+ wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
+}
+
+void (*spec_l1d_flush)(void) = spec_l1d_flush_noop;
+
+static void
cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
{
struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
@@ -1051,6 +1066,10 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
add_x86_feature(featureset,
X86FSET_RSBA);
}
+ if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
+ add_x86_feature(featureset,
+ X86FSET_L1D_VM_NO);
+ }
if (reg & IA32_ARCH_CAP_SSB_NO) {
add_x86_feature(featureset,
X86FSET_SSB_NO);
@@ -1062,7 +1081,47 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
add_x86_feature(featureset, X86FSET_SSBD);
+
+ if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
+ add_x86_feature(featureset, X86FSET_FLUSH_CMD);
+ }
+
+ if (cpu->cpu_id != 0)
+ return;
+
+ /*
+ * We're the boot CPU, so let's figure out our L1TF status.
+ *
+ * First, if this is a RDCL_NO CPU, then we are not vulnerable: we don't
+ * need to exclude with ht_acquire(), and we don't need to flush.
+ */
+ if (is_x86_feature(featureset, X86FSET_RDCL_NO)) {
+ extern int ht_exclusion;
+ ht_exclusion = 0;
+ spec_l1d_flush = spec_l1d_flush_noop;
+ membar_producer();
+ return;
+ }
+
+ /*
+ * If HT is enabled, we will need HT exclusion, as well as the flush on
+ * VM entry. If HT isn't enabled, we still need at least the flush for
+ * the L1TF sequential case.
+ *
+ * However, if X86FSET_L1D_VM_NO is set, we're most likely running
+ * inside a VM ourselves, and we don't need the flush.
+ *
+ * If we don't have the FLUSH_CMD available at all, we'd better just
+ * hope HT is disabled.
+ */
+ if (is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
+ !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
+ spec_l1d_flush = spec_l1d_flush_msr;
+ } else {
+ spec_l1d_flush = spec_l1d_flush_noop;
}
+
+ membar_producer();
}
/*
@@ -3827,7 +3886,7 @@ cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
eax = cpi->cpi_std[1].cp_eax;
#define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
-#define SH_B3(eax) (eax == 0xf51)
+#define SH_B3(eax) (eax == 0xf51)
#define B(eax) (SH_B0(eax) || SH_B3(eax))
#define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
@@ -4131,9 +4190,9 @@ static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
static const struct cachetab {
- uint8_t ct_code;
+ uint8_t ct_code;
uint8_t ct_assoc;
- uint16_t ct_line_size;
+ uint16_t ct_line_size;
size_t ct_size;
const char *ct_label;
} intel_ctab[] = {
diff --git a/usr/src/uts/i86pc/os/ht.c b/usr/src/uts/i86pc/os/ht.c
new file mode 100644
index 0000000000..f82c51ac08
--- /dev/null
+++ b/usr/src/uts/i86pc/os/ht.c
@@ -0,0 +1,599 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX
+ * non-root guest mode, when certain threads are running on the other sibling.
+ * This avoids speculation-based information leaks such as L1TF being available
+ * to the untrusted guest. The stance we take is that threads from the same
+ * zone as the guest VCPU thread are considered safe to run alongside, but all
+ * other threads (except the idle thread), and all interrupts, are unsafe. Note
+ * that due to the implementation here, there are significant sections of e.g.
+ * the dispatcher code that can run concurrently with a guest, until the thread
+ * reaches ht_mark(). This code assumes there are only two HT threads per core.
+ *
+ * The entry points are as follows:
+ *
+ * ht_mark_as_vcpu()
+ *
+ * All threads that enter guest mode (i.e. VCPU threads) need to call this at
+ * least once, which sets TS_VCPU in ->t_schedflag.
+ *
+ * ht_mark()
+ *
+ * A new ->cpu_thread is now curthread (although interrupt threads have their
+ * own separate handling). After preventing any interrupts, we will take our
+ * own CPU's spinlock and update our own state in mcpu_ht.
+ *
+ * If our sibling is poisoned (i.e. in guest mode or the little bit of code
+ * around it), and we're not compatible (that is, same zone ID, or the idle
+ * thread), then we need to ht_kick() that sibling. ht_kick() itself waits for
+ * the sibling to call ht_release(), and it will not re-enter guest mode until
+ * allowed.
+ *
+ * Note that we ignore the fact a process can change its zone ID: poisoning
+ * threads never do so, and we can ignore the other cases.
+ *
+ * ht_acquire()
+ *
+ * We are a VCPU thread about to start guest execution. Interrupts are
+ * disabled. We must have already run ht_mark() to be in this code, so there's
+ * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED.
+ * Instead, we take our sibling's lock to also mark ourselves as poisoned in the
+ * sibling cpu_ht_t. This is so ht_mark() will only ever need to look at its
+ * local mcpu_ht.
+ *
+ * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly
+ * to wait out any sibling interrupt: many of them will complete quicker than
+ * this.
+ *
+ * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as
+ * mitigation against L1TF: no incompatible thread will now be able to populate
+ * the L1 cache until *we* ht_release().
+ *
+ * ht_release()
+ *
+ * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for
+ * this to happen if needed.
+ *
+ * ht_begin_intr()
+ *
+ * In an interrupt prolog. We're either a hilevel interrupt, or a pinning
+ * interrupt. In both cases, we mark our interrupt depth, and potentially
+ * ht_kick(). This enforces exclusion, but doesn't otherwise modify ->ch_state:
+ * we want the dispatcher code to essentially ignore interrupts.
+ *
+ * ht_end_intr()
+ *
+ * In an interrupt epilogue *or* thread_unpin(). In the first case, we never
+ * slept, and we can simply decrement our counter. In the second case, we're an
+ * interrupt thread about to sleep: we'll still just decrement our counter, and
+ * henceforth treat the thread as a normal thread when it next gets scheduled,
+ * until it finally gets to its epilogue.
+ *
+ * ht_mark_unsafe() / ht_mark_safe()
+ *
+ * Mark the current thread as temporarily unsafe (guests should not be executing
+ * while a sibling is marked unsafe). This can be used for a thread that's
+ * otherwise considered safe, if it needs to handle potentially sensitive data.
+ * Right now, this means certain I/O handling operations that reach down into
+ * the networking and ZFS sub-systems.
+ *
+ * ht_should_run(thread, cpu)
+ *
+ * This is used by the dispatcher when making scheduling decisions: if the
+ * sibling is compatible with the given thread, we return B_TRUE. This is
+ * essentially trying to guess if any subsequent ht_acquire() will fail, by
+ * peeking at the sibling CPU's state. The peek is racy, but if we get things
+ * wrong, the "only" consequence is that ht_acquire() may lose.
+ *
+ * ht_adjust_cpu_score()
+ *
+ * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here,
+ * we'll add a small penalty to the score. This also makes sure a VCPU thread
+ * migration behaves properly.
+ */
+
+#include <sys/archsystm.h>
+#include <sys/disp.h>
+#include <sys/cmt.h>
+#include <sys/systm.h>
+#include <sys/cpu.h>
+#include <sys/var.h>
+#include <sys/xc_levels.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/x86_archext.h>
+
+#define CS_SHIFT (8)
+#define CS_MASK ((1 << CS_SHIFT) - 1)
+#define CS_MARK(s) ((s) & CS_MASK)
+#define CS_ZONE(s) ((s) >> CS_SHIFT)
+#define CS_MK(s, z) ((s) | (z << CS_SHIFT))
+
+typedef enum ch_mark {
+ CM_IDLE = 0, /* running CPU idle thread */
+ CM_THREAD, /* running general non-VCPU thread */
+ CM_UNSAFE, /* running ->t_unsafe thread */
+ CM_VCPU, /* running VCPU thread */
+ CM_POISONED /* running in guest */
+} ch_mark_t;
+
+/* Double-check our false-sharing padding. */
+CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64);
+CTASSERT(CM_IDLE == 0);
+CTASSERT(CM_POISONED < (1 << CS_SHIFT));
+CTASSERT(CM_POISONED > CM_VCPU);
+CTASSERT(CM_VCPU > CM_UNSAFE);
+
+/*
+ * If disabled, no HT exclusion is performed, and system is potentially
+ * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not
+ * vulnerable" CPUID bit.
+ */
+int ht_exclusion = 1;
+
+/*
+ * How long ht_acquire() will spin trying to acquire the core, in microseconds.
+ * This is enough time to wait out a significant proportion of interrupts.
+ */
+clock_t ht_acquire_wait_time = 64;
+
+static cpu_t *
+ht_find_sibling(cpu_t *cp)
+{
+ for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) {
+ pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i);
+ group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus;
+
+ if (pg->cmt_pg.pghw_hw != PGHW_IPIPE)
+ continue;
+
+ if (GROUP_SIZE(cg) == 1)
+ break;
+
+ VERIFY3U(GROUP_SIZE(cg), ==, 2);
+
+ if (GROUP_ACCESS(cg, 0) != cp)
+ return (GROUP_ACCESS(cg, 0));
+
+ VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp);
+
+ return (GROUP_ACCESS(cg, 1));
+ }
+
+ return (NULL);
+}
+
+/*
+ * Initialize HT links. We have to be careful here not to race with
+ * ht_begin/end_intr(), which also complicates trying to do this initialization
+ * from a cross-call; hence the slightly odd approach below.
+ */
+void
+ht_init(void)
+{
+ cpu_t *scp = CPU;
+ cpu_t *cp = scp;
+ ulong_t flags;
+
+ if (!ht_exclusion)
+ return;
+
+ mutex_enter(&cpu_lock);
+
+ do {
+ thread_affinity_set(curthread, cp->cpu_id);
+ flags = intr_clear();
+
+ cp->cpu_m.mcpu_ht.ch_intr_depth = 0;
+ cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID);
+ cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID);
+ ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL);
+ cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp);
+
+ intr_restore(flags);
+ thread_affinity_clear(curthread);
+ } while ((cp = cp->cpu_next_onln) != scp);
+
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * If our sibling is also a VCPU thread from a different zone, we need one of
+ * them to give up, otherwise they will just battle each other for exclusion
+ * until they exhaust their quantum.
+ *
+ * We arbitrate between them by dispatch priority: clearly, a higher-priority
+ * thread deserves to win the acquisition. However, under CPU load, it'll be
+ * very common to see both threads with ->t_pri == 1. If so, we'll break the
+ * tie by cpu_id (which is hopefully arbitrary enough).
+ *
+ * If we lose, the VMM code will take this as a hint to call
+ * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread
+ * somewhere else.
+ *
+ * Note that all of this state examination is racy, as we don't own any locks
+ * here.
+ */
+static boolean_t
+yield_to_vcpu(cpu_t *sib, zoneid_t zoneid)
+{
+ cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht;
+ uint64_t sibstate = sibht->ch_state;
+
+ /*
+ * If we're likely just waiting for an interrupt, don't yield.
+ */
+ if (sibht->ch_intr_depth != 0)
+ return (B_FALSE);
+
+ /*
+ * We're only interested in VCPUs from a different zone.
+ */
+ if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid)
+ return (B_FALSE);
+
+ if (curthread->t_pri < sib->cpu_dispatch_pri)
+ return (B_TRUE);
+
+ if (curthread->t_pri == sib->cpu_dispatch_pri &&
+ CPU->cpu_id < sib->cpu_id)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static inline boolean_t
+sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid)
+{
+ uint64_t sibstate = sibht->ch_state;
+
+ if (sibht->ch_intr_depth != 0)
+ return (B_FALSE);
+
+ if (CS_MARK(sibstate) == CM_UNSAFE)
+ return (B_FALSE);
+
+ if (CS_MARK(sibstate) == CM_IDLE)
+ return (B_TRUE);
+
+ return (CS_ZONE(sibstate) == zoneid);
+}
+
+int
+ht_acquire(void)
+{
+ clock_t wait = ht_acquire_wait_time;
+ cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
+ zoneid_t zoneid = getzoneid();
+ cpu_ht_t *sibht;
+ int ret = 0;
+
+ ASSERT(!interrupts_enabled());
+
+ if (ht->ch_sib == NULL) {
+ /* For the "sequential" L1TF case. */
+ spec_l1d_flush();
+ return (1);
+ }
+
+ sibht = &ht->ch_sib->cpu_m.mcpu_ht;
+
+ /* A VCPU thread should never change zone. */
+ ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
+ ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU);
+ ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
+ ASSERT3U(curthread->t_preempt, >=, 1);
+ ASSERT(curthread->t_schedflag & TS_VCPU);
+
+ while (ret == 0 && wait > 0) {
+
+ if (yield_to_vcpu(ht->ch_sib, zoneid)) {
+ ret = -1;
+ break;
+ }
+
+ if (sibling_compatible(sibht, zoneid)) {
+ lock_set(&sibht->ch_lock);
+
+ if (sibling_compatible(sibht, zoneid)) {
+ ht->ch_state = CS_MK(CM_POISONED, zoneid);
+ sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid);
+ membar_enter();
+ ret = 1;
+ }
+
+ lock_clear(&sibht->ch_lock);
+ } else {
+ drv_usecwait(10);
+ wait -= 10;
+ }
+ }
+
+ DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state,
+ uint64_t, sibht->ch_intr_depth, clock_t, wait);
+
+ if (ret == 1)
+ spec_l1d_flush();
+
+ return (ret);
+}
+
+void
+ht_release(void)
+{
+ cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht;
+ zoneid_t zoneid = getzoneid();
+ cpu_ht_t *sibht;
+
+ ASSERT(!interrupts_enabled());
+
+ if (ht->ch_sib == NULL)
+ return;
+
+ ASSERT3U(zoneid, !=, GLOBAL_ZONEID);
+ ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid);
+ ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED);
+ ASSERT3U(curthread->t_preempt, >=, 1);
+
+ sibht = &ht->ch_sib->cpu_m.mcpu_ht;
+
+ lock_set(&sibht->ch_lock);
+
+ ht->ch_state = CS_MK(CM_VCPU, zoneid);
+ sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid);
+ membar_producer();
+
+ lock_clear(&sibht->ch_lock);
+}
+
+static void
+ht_kick(cpu_ht_t *ht, zoneid_t zoneid)
+{
+ uint64_t sibstate;
+
+ ASSERT(LOCK_HELD(&ht->ch_lock));
+ ASSERT(!interrupts_enabled());
+
+ poke_cpu(ht->ch_sib->cpu_id);
+
+ for (;;) {
+ membar_consumer();
+ sibstate = ht->ch_sibstate;
+
+ if (CS_MARK(sibstate) != CM_POISONED ||
+ CS_ZONE(sibstate) == zoneid)
+ return;
+
+ lock_clear(&ht->ch_lock);
+
+ for (;;) {
+ membar_consumer();
+ sibstate = ht->ch_sibstate;
+
+ if (CS_MARK(sibstate) != CM_POISONED ||
+ CS_ZONE(sibstate) == zoneid) {
+ lock_set(&ht->ch_lock);
+ return;
+ }
+
+ SMT_PAUSE();
+ }
+
+ lock_set(&ht->ch_lock);
+ }
+}
+
+/*
+ * FIXME: do we need a callback in case somebody installs a handler at this PIL
+ * ever?
+ */
+static boolean_t
+pil_needs_kick(uint_t pil)
+{
+ return (pil != XC_CPUPOKE_PIL);
+}
+
+void
+ht_begin_intr(uint_t pil)
+{
+ ulong_t flags;
+ cpu_ht_t *ht;
+
+ flags = intr_clear();
+ ht = &CPU->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) {
+ lock_set(&ht->ch_lock);
+
+ membar_consumer();
+
+ if (CS_MARK(ht->ch_sibstate) == CM_POISONED)
+ ht_kick(ht, GLOBAL_ZONEID);
+
+ lock_clear(&ht->ch_lock);
+ }
+
+ intr_restore(flags);
+}
+
+void
+ht_end_intr(void)
+{
+ ulong_t flags;
+ cpu_ht_t *ht;
+
+ flags = intr_clear();
+ ht = &CPU->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ ASSERT3U(ht->ch_intr_depth, >, 0);
+ atomic_dec_64(&ht->ch_intr_depth);
+
+ intr_restore(flags);
+}
+
+static inline boolean_t
+ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid)
+{
+ membar_consumer();
+
+ if (CS_MARK(ht->ch_sibstate) != CM_POISONED)
+ return (B_FALSE);
+
+ if (CS_MARK(ht->ch_state) == CM_UNSAFE)
+ return (B_TRUE);
+
+ return (CS_ZONE(ht->ch_sibstate) != zoneid);
+}
+
+void
+ht_mark(void)
+{
+ zoneid_t zoneid = getzoneid();
+ kthread_t *t = curthread;
+ ulong_t flags;
+ cpu_ht_t *ht;
+ cpu_t *cp;
+
+ flags = intr_clear();
+
+ cp = CPU;
+ ht = &cp->cpu_m.mcpu_ht;
+
+ if (ht->ch_sib == NULL) {
+ intr_restore(flags);
+ return;
+ }
+
+ lock_set(&ht->ch_lock);
+
+ /*
+ * If we were a nested interrupt and went through the resume_from_intr()
+ * path, we can now be resuming to a pinning interrupt thread; in which
+ * case, skip marking, until we later resume to a "real" thread.
+ */
+ if (ht->ch_intr_depth > 0) {
+ ASSERT3P(t->t_intr, !=, NULL);
+
+ if (ht_need_kick(ht, zoneid))
+ ht_kick(ht, zoneid);
+ goto out;
+ }
+
+ if (t == t->t_cpu->cpu_idle_thread) {
+ ASSERT3U(zoneid, ==, GLOBAL_ZONEID);
+ ht->ch_state = CS_MK(CM_IDLE, zoneid);
+ } else {
+ uint64_t state = CM_THREAD;
+
+ if (t->t_unsafe)
+ state = CM_UNSAFE;
+ else if (t->t_schedflag & TS_VCPU)
+ state = CM_VCPU;
+
+ ht->ch_state = CS_MK(state, zoneid);
+
+ if (ht_need_kick(ht, zoneid))
+ ht_kick(ht, zoneid);
+ }
+
+out:
+ membar_producer();
+ lock_clear(&ht->ch_lock);
+ intr_restore(flags);
+}
+
+void
+ht_begin_unsafe(void)
+{
+ curthread->t_unsafe++;
+ ht_mark();
+}
+
+void
+ht_end_unsafe(void)
+{
+ ASSERT3U(curthread->t_unsafe, >, 0);
+ curthread->t_unsafe--;
+ ht_mark();
+}
+
+void
+ht_mark_as_vcpu(void)
+{
+ thread_lock(curthread);
+ curthread->t_schedflag |= TS_VCPU;
+ ht_mark();
+ thread_unlock(curthread);
+}
+
+boolean_t
+ht_should_run(kthread_t *t, cpu_t *cp)
+{
+ uint64_t sibstate;
+ cpu_t *sib;
+
+ if (t == t->t_cpu->cpu_idle_thread)
+ return (B_TRUE);
+
+ if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL)
+ return (B_TRUE);
+
+ sibstate = sib->cpu_m.mcpu_ht.ch_state;
+
+ if ((t->t_schedflag & TS_VCPU)) {
+ if (CS_MARK(sibstate) == CM_IDLE)
+ return (B_TRUE);
+ if (CS_MARK(sibstate) == CM_UNSAFE)
+ return (B_FALSE);
+ return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
+ }
+
+ if (CS_MARK(sibstate) < CM_VCPU)
+ return (B_TRUE);
+
+ return (CS_ZONE(sibstate) == ttozone(t)->zone_id);
+}
+
+pri_t
+ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score)
+{
+ cpu_t *sib;
+
+ if (ht_should_run(t, cp))
+ return (score);
+
+ /*
+ * If we're a VCPU thread scoring our current CPU, we are most likely
+ * asking to be rescheduled elsewhere after losing ht_acquire(). In
+ * this case, the current CPU is not a good choice, most likely, and we
+ * should go elsewhere.
+ */
+ if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0)
+ return ((v.v_maxsyspri + 1) * 2);
+
+ return (score + 1);
+}
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 29fa78109c..0634df1a94 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -466,25 +466,22 @@
#include <sys/ontrap.h>
#include <sys/x86_archext.h>
#include <sys/promif.h>
+#include <sys/ht.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#endif
-#if defined(__amd64) && !defined(__xpv)
-/* If this fails, then the padding numbers in machcpuvar.h are wrong. */
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad)) <
- MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti)) >=
- MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg)) <
- 2 * MMU_PAGESIZE);
-CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad2)) <
- 2 * MMU_PAGESIZE);
+/* If these fail, then the padding numbers in machcpuvar.h are wrong. */
+#if !defined(__xpv)
+#define MCOFF(member) \
+ (offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, member))
+CTASSERT(MCOFF(mcpu_pad) == MACHCPU_SIZE);
+CTASSERT(MCOFF(mcpu_pad2) == MMU_PAGESIZE);
+CTASSERT((MCOFF(mcpu_kpti) & 0xF) == 0);
CTASSERT(((sizeof (struct kpti_frame)) & 0xF) == 0);
-CTASSERT(((offsetof(cpu_t, cpu_m) +
- offsetof(struct machcpu, mcpu_kpti_dbg)) & 0xF) == 0);
CTASSERT((offsetof(struct kpti_frame, kf_tr_rsp) & 0xF) == 0);
+CTASSERT(MCOFF(mcpu_pad3) < 2 * MMU_PAGESIZE);
#endif
#if defined(__xpv) && defined(DEBUG)
@@ -600,6 +597,8 @@ hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
}
}
+ ht_begin_intr(pil);
+
/*
* Store starting timestamp in CPU structure for this PIL.
*/
@@ -704,6 +703,8 @@ hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
t->t_intr_start = now;
}
+ ht_end_intr();
+
mcpu->mcpu_pri = oldpil;
(void) (*setlvlx)(oldpil, vecnum);
@@ -766,6 +767,8 @@ intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
it->t_state = TS_ONPROC;
cpu->cpu_thread = it; /* new curthread on this cpu */
+ ht_begin_intr(pil);
+
it->t_pil = (uchar_t)pil;
it->t_pri = intr_pri + (pri_t)pil;
it->t_intr_start = now;
@@ -856,6 +859,7 @@ intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
mcpu->mcpu_pri = pil;
(*setlvlx)(pil, vec);
t->t_intr_start = now;
+ ht_end_intr();
cpu->cpu_thread = t;
}
@@ -1043,6 +1047,7 @@ top:
it->t_intr = t;
cpu->cpu_thread = it;
+ ht_begin_intr(pil);
/*
* Set bit for this pil in CPU's interrupt active bitmask.
@@ -1103,7 +1108,9 @@ dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
it->t_link = cpu->cpu_intr_thread;
cpu->cpu_intr_thread = it;
it->t_state = TS_FREE;
+ ht_end_intr();
cpu->cpu_thread = t;
+
if (t->t_flag & T_INTR_THREAD)
t->t_intr_start = now;
basespl = cpu->cpu_base_spl;
diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile
index 0e3cbbe243..e6ea573d0b 100644
--- a/usr/src/uts/i86pc/sys/Makefile
+++ b/usr/src/uts/i86pc/sys/Makefile
@@ -21,7 +21,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-# Copyright 2017 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
# uts/i86pc/sys/Makefile
#
@@ -44,8 +44,9 @@ CHKHDRS= \
clock.h \
cram.h \
ddi_subrdefs.h \
- debug_info.h \
+ debug_info.h \
fastboot.h \
+ ht.h \
mach_mmu.h \
machclock.h \
machcpuvar.h \
diff --git a/usr/src/uts/i86pc/sys/ht.h b/usr/src/uts/i86pc/sys/ht.h
new file mode 100644
index 0000000000..6b1bfcdd2b
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/ht.h
@@ -0,0 +1,46 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _SYS_HT_H
+#define _SYS_HT_H
+
+#include <sys/types.h>
+#include <sys/thread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cpu;
+
+extern void ht_init(void);
+
+extern int ht_acquire(void);
+extern void ht_release(void);
+extern void ht_mark(void);
+extern void ht_begin_unsafe(void);
+extern void ht_end_unsafe(void);
+extern void ht_begin_intr(uint_t);
+extern void ht_end_intr(void);
+extern void ht_mark_as_vcpu(void);
+
+extern boolean_t ht_should_run(kthread_t *, struct cpu *);
+extern pri_t ht_adjust_cpu_score(kthread_t *, struct cpu *, pri_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_HT_H */
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index 98873cd26c..3d652316a4 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -140,6 +140,15 @@ struct kpti_frame {
uint64_t kf_upper_redzone;
};
+typedef struct cpu_ht {
+ lock_t ch_lock;
+ char ch_pad[56];
+ struct cpu *ch_sib;
+ volatile uint64_t ch_intr_depth;
+ volatile uint64_t ch_state;
+ volatile uint64_t ch_sibstate;
+} cpu_ht_t;
+
/*
* This first value, MACHCPU_SIZE is the size of all the members in the cpu_t
* AND struct machcpu, before we get to the mcpu_pad and the kpti area.
@@ -147,9 +156,9 @@ struct kpti_frame {
* page-tables, and hence must be page-aligned and page-sized. See
* hat_pcp_setup().
*
- * There is a CTASSERT in os/intr.c that checks these numbers.
+ * There are CTASSERTs in os/intr.c that verify this all works out.
*/
-#define MACHCPU_SIZE (572 + 1584)
+#define MACHCPU_SIZE (1568 + 688)
#define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE)
#define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame))
@@ -227,6 +236,8 @@ struct machcpu {
*/
volatile uint32_t mcpu_istamp;
+ cpu_ht_t mcpu_ht;
+
char mcpu_pad[MACHCPU_PAD];
/* This is the start of the page */
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index bd8126cc0d..c200a5eb33 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -587,6 +587,9 @@ enum vm_exitcode {
VM_EXITCODE_SVM,
VM_EXITCODE_REQIDLE,
VM_EXITCODE_DEBUG,
+#ifndef __FreeBSD__
+ VM_EXITCODE_HT,
+#endif
VM_EXITCODE_MAX
};
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index a576b2f0a8..b4a78cc841 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -61,6 +61,7 @@ CORE_OBJS += \
hment.o \
hold_page.o \
hrtimers.o \
+ ht.o \
htable.o \
i86_mmu.o \
ibft.o \
@@ -110,7 +111,7 @@ CORE_OBJS += $(SMBIOS_OBJS)
#
# These get compiled twice:
-# - once in the dboot (direct boot) identity mapped code
+# - once in the dboot (direct boot) identity mapped code
# - once for use during early startup in unix
#
BOOT_DRIVER_OBJS = \
@@ -161,7 +162,7 @@ SPECIAL_OBJS_64 += \
locore.o \
fast_trap_asm.o \
interrupt.o \
- syscall_asm_amd64.o \
+ syscall_asm_amd64.o \
kpti_trampolines.o
SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS))
@@ -234,7 +235,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/i86pc -I$(SRC)/common \
# since only C headers are included when #defined(__lint) is true.
#
-ASSYM_DEPS += \
+ASSYM_DEPS += \
copy.o \
desctbls_asm.o \
ddi_i86_asm.o \
diff --git a/usr/src/uts/intel/ia32/ml/copy.s b/usr/src/uts/intel/ia32/ml/copy.s
index 95b7cb3028..f76a8a43cb 100644
--- a/usr/src/uts/intel/ia32/ml/copy.s
+++ b/usr/src/uts/intel/ia32/ml/copy.s
@@ -36,7 +36,7 @@
/* All Rights Reserved */
/*
- * Copyright (c) 2017 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#include <sys/errno.h>
@@ -866,8 +866,8 @@ bcopy_patch_start:
bcopy_patch_end:
.p2align 4
- .globl bcopy_ck_size
-bcopy_ck_size:
+ ALTENTRY(bcopy_ck_size)
+
cmpq $BCOPY_DFLT_REP, %rdx
jae L(use_rep)
@@ -956,6 +956,7 @@ L(use_rep):
jnz L(do_remainder)
ret
#undef L
+ SET_SIZE(bcopy_ck_size)
#ifdef DEBUG
/*
diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s
index 6fc38cfbe8..c2c9fd9bd2 100644
--- a/usr/src/uts/intel/ia32/ml/swtch.s
+++ b/usr/src/uts/intel/ia32/ml/swtch.s
@@ -31,14 +31,6 @@
* Process switching routines.
*/
-#if defined(__lint)
-#include <sys/thread.h>
-#include <sys/systm.h>
-#include <sys/time.h>
-#else /* __lint */
-#include "assym.h"
-#endif /* __lint */
-
#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
@@ -47,6 +39,9 @@
#include <sys/segments.h>
#include <sys/psw.h>
+#if !defined(__lint)
+#include "assym.h"
+
/*
* resume(thread_id_t t);
*
@@ -74,16 +69,10 @@
* off the stack.
*/
-#if !defined(__lint)
-
#if LWP_PCB_FPU != 0
#error LWP_PCB_FPU MUST be defined as 0 for code in swtch.s to work
#endif /* LWP_PCB_FPU != 0 */
-#endif /* !__lint */
-
-#if defined(__amd64)
-
/*
* Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
*
@@ -153,88 +142,6 @@
jnz 0b; \
1:
-#elif defined (__i386)
-
-/*
- * Save non-volatile registers (%ebp, %esi, %edi and %ebx)
- *
- * The stack frame must be created before the save of %esp so that tracebacks
- * of swtch()ed-out processes show the process as having last called swtch().
- */
-#define SAVE_REGS(thread_t, retaddr) \
- movl %ebp, T_EBP(thread_t); \
- movl %ebx, T_EBX(thread_t); \
- movl %esi, T_ESI(thread_t); \
- movl %edi, T_EDI(thread_t); \
- pushl %ebp; \
- movl %esp, %ebp; \
- movl %esp, T_SP(thread_t); \
- movl retaddr, T_PC(thread_t); \
- movl 8(%ebp), %edi; \
- pushl %edi; \
- call __dtrace_probe___sched_off__cpu; \
- addl $CLONGSIZE, %esp
-
-/*
- * Restore non-volatile registers (%ebp, %esi, %edi and %ebx)
- *
- * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t
- * already has the effect of putting the stack back the way it was when
- * we came in.
- */
-#define RESTORE_REGS(scratch_reg) \
- movl %gs:CPU_THREAD, scratch_reg; \
- movl T_EBP(scratch_reg), %ebp; \
- movl T_EBX(scratch_reg), %ebx; \
- movl T_ESI(scratch_reg), %esi; \
- movl T_EDI(scratch_reg), %edi
-
-/*
- * Get pointer to a thread's hat structure
- */
-#define GET_THREAD_HATP(hatp, thread_t, scratch_reg) \
- movl T_PROCP(thread_t), hatp; \
- movl P_AS(hatp), scratch_reg; \
- movl A_HAT(scratch_reg), hatp
-
-/*
- * If we are resuming an interrupt thread, store a timestamp in the thread
- * structure. If an interrupt occurs between tsc_read() and its subsequent
- * store, the timestamp will be stale by the time it is stored. We can detect
- * this by doing a compare-and-swap on the thread's timestamp, since any
- * interrupt occurring in this window will put a new timestamp in the thread's
- * t_intr_start field.
- */
-#define STORE_INTR_START(thread_t) \
- testw $T_INTR_THREAD, T_FLAGS(thread_t); \
- jz 1f; \
- pushl %ecx; \
-0: \
- pushl T_INTR_START(thread_t); \
- pushl T_INTR_START+4(thread_t); \
- call tsc_read; \
- movl %eax, %ebx; \
- movl %edx, %ecx; \
- popl %edx; \
- popl %eax; \
- cmpxchg8b T_INTR_START(thread_t); \
- jnz 0b; \
- popl %ecx; \
-1:
-
-#endif /* __amd64 */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
.global kpti_enable
ENTRY(resume)
@@ -436,6 +343,8 @@ resume(kthread_t *t)
call smap_disable
.nosmap:
+ call ht_mark
+
/*
* Restore non-volatile registers, then have spl0 return to the
* resuming thread's PC after first setting the priority as low as
@@ -456,203 +365,6 @@ resume_return:
SET_SIZE(_resume_from_idle)
SET_SIZE(resume)
-#elif defined (__i386)
-
- ENTRY(resume)
- movl %gs:CPU_THREAD, %eax
- movl $resume_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
- LOADCPU(%ebx) /* %ebx = CPU */
- movl CPU_THREAD(%ebx), %esi /* %esi = curthread */
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- /*
- * Call savectx if thread has installed context ops.
- *
- * Note that if we have floating point context, the save op
- * (either fpsave_begin or fpxsave_begin) will issue the
- * async save instruction (fnsave or fxsave respectively)
- * that we fwait for below.
- */
- movl T_CTX(%esi), %eax /* should current thread savectx? */
- testl %eax, %eax
- jz .nosavectx /* skip call when zero */
- pushl %esi /* arg = thread pointer */
- call savectx /* call ctx ops */
- addl $4, %esp /* restore stack pointer */
-.nosavectx:
-
- /*
- * Call savepctx if process has installed context ops.
- */
- movl T_PROCP(%esi), %eax /* %eax = proc */
- cmpl $0, P_PCTX(%eax) /* should current thread savectx? */
- je .nosavepctx /* skip call when zero */
- pushl %eax /* arg = proc pointer */
- call savepctx /* call ctx ops */
- addl $4, %esp
-.nosavepctx:
-
- /*
- * Temporarily switch to the idle thread's stack
- */
- movl CPU_IDLE_THREAD(%ebx), %eax /* idle thread pointer */
-
- /*
- * Set the idle thread as the current thread
- */
- movl T_SP(%eax), %esp /* It is safe to set esp */
- movl %eax, CPU_THREAD(%ebx)
-
- /* switch in the hat context for the new thread */
- GET_THREAD_HATP(%ecx, %edi, %ecx)
- pushl %ecx
- call hat_switch
- addl $4, %esp
-
- /*
- * Clear and unlock previous thread's t_lock
- * to allow it to be dispatched by another processor.
- */
- movb $0, T_LOCK(%esi)
-
- /*
- * IMPORTANT: Registers at this point must be:
- * %edi = new thread
- *
- * Here we are in the idle thread, have dropped the old thread.
- */
- ALTENTRY(_resume_from_idle)
- /*
- * spin until dispatched thread's mutex has
- * been unlocked. this mutex is unlocked when
- * it becomes safe for the thread to run.
- */
-.L4:
- lock
- btsl $0, T_LOCK(%edi) /* lock new thread's mutex */
- jc .L4_2 /* lock did not succeed */
-
- /*
- * Fix CPU structure to indicate new running thread.
- * Set pointer in new thread to the CPU structure.
- */
- LOADCPU(%esi) /* load current CPU pointer */
- movl T_STACK(%edi), %eax /* here to use v pipeline of */
- /* Pentium. Used few lines below */
- cmpl %esi, T_CPU(%edi)
- jne .L5_2
-.L5_1:
- /*
- * Setup esp0 (kernel stack) in TSS to curthread's stack. If this
- * thread doesn't have a regs structure above the stack -- that is, if
- * lwp_stk_init() was never called for the thread -- this will set
- * esp0 to the wrong value, but it's harmless as it's a kernel thread,
- * and it won't actually attempt to implicitly use the esp0 via a
- * privilege change.
- */
- movl CPU_TSS(%esi), %ecx
- addl $REGSIZE+MINFRAME, %eax /* to the bottom of thread stack */
-#if !defined(__xpv)
- movl %eax, TSS_ESP0(%ecx)
-#else
- pushl %eax
- pushl $KDS_SEL
- call HYPERVISOR_stack_switch
- addl $8, %esp
-#endif /* __xpv */
-
- movl %edi, CPU_THREAD(%esi) /* set CPU's thread pointer */
- mfence /* synchronize with mutex_exit() */
- xorl %ebp, %ebp /* make $<threadlist behave better */
- movl T_LWP(%edi), %eax /* set associated lwp to */
- movl %eax, CPU_LWP(%esi) /* CPU's lwp ptr */
-
- movl T_SP(%edi), %esp /* switch to outgoing thread's stack */
- movl T_PC(%edi), %esi /* saved return addr */
-
- /*
- * Call restorectx if context ops have been installed.
- */
- movl T_CTX(%edi), %eax /* should resumed thread restorectx? */
- testl %eax, %eax
- jz .norestorectx /* skip call when zero */
- pushl %edi /* arg = thread pointer */
- call restorectx /* call ctx ops */
- addl $4, %esp /* restore stack pointer */
-.norestorectx:
-
- /*
- * Call restorepctx if context ops have been installed for the proc.
- */
- movl T_PROCP(%edi), %eax
- cmpl $0, P_PCTX(%eax)
- je .norestorepctx
- pushl %eax /* arg = proc pointer */
- call restorepctx
- addl $4, %esp /* restore stack pointer */
-.norestorepctx:
-
- STORE_INTR_START(%edi)
-
- /*
- * Restore non-volatile registers, then have spl0 return to the
- * resuming thread's PC after first setting the priority as low as
- * possible and blocking all interrupt threads that may be active.
- */
- movl %esi, %eax /* save return address */
- RESTORE_REGS(%ecx)
- pushl %eax /* push return address for spl0() */
- call __dtrace_probe___sched_on__cpu
- jmp spl0
-
-resume_return:
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
-
-.L4_2:
- pause
- cmpb $0, T_LOCK(%edi)
- je .L4
- jmp .L4_2
-
-.L5_2:
- /* cp->cpu_stats.sys.cpumigrate++ */
- addl $1, CPU_STATS_SYS_CPUMIGRATE(%esi)
- adcl $0, CPU_STATS_SYS_CPUMIGRATE+4(%esi)
- movl %esi, T_CPU(%edi) /* set new thread's CPU pointer */
- jmp .L5_1
-
- SET_SIZE(_resume_from_idle)
- SET_SIZE(resume)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume_from_zombie(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(resume_from_zombie)
movq %gs:CPU_THREAD, %rax
leaq resume_from_zombie_return(%rip), %r11
@@ -727,88 +439,6 @@ resume_from_zombie_return:
ret
SET_SIZE(resume_from_zombie)
-#elif defined (__i386)
-
- ENTRY(resume_from_zombie)
- movl %gs:CPU_THREAD, %eax
- movl $resume_from_zombie_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_from_zombie_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- movl %gs:CPU_THREAD, %esi /* %esi = curthread */
-
- /* clean up the fp unit. It might be left enabled */
-
- movl %cr0, %eax
- testl $CR0_TS, %eax
- jnz .zfpu_disabled /* if TS already set, nothing to do */
- fninit /* init fpu & discard pending error */
- orl $CR0_TS, %eax
- movl %eax, %cr0
-.zfpu_disabled:
-
- /*
- * Temporarily switch to the idle thread's stack so that the zombie
- * thread's stack can be reclaimed by the reaper.
- */
- movl %gs:CPU_IDLE_THREAD, %eax /* idle thread pointer */
- movl T_SP(%eax), %esp /* get onto idle thread stack */
-
- /*
- * Set the idle thread as the current thread.
- */
- movl %eax, %gs:CPU_THREAD
-
- /*
- * switch in the hat context for the new thread
- */
- GET_THREAD_HATP(%ecx, %edi, %ecx)
- pushl %ecx
- call hat_switch
- addl $4, %esp
-
- /*
- * Put the zombie on death-row.
- */
- pushl %esi
- call reapq_add
- addl $4, %esp
- jmp _resume_from_idle /* finish job of resume */
-
-resume_from_zombie_return:
- RESTORE_REGS(%ecx) /* restore non-volatile registers */
- call __dtrace_probe___sched_on__cpu
-
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
- SET_SIZE(resume_from_zombie)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-/* ARGSUSED */
-void
-resume_from_intr(kthread_t *t)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(resume_from_intr)
movq %gs:CPU_THREAD, %rax
leaq resume_from_intr_return(%rip), %r11
@@ -835,6 +465,8 @@ resume_from_intr(kthread_t *t)
STORE_INTR_START(%r12)
+ call ht_mark
+
/*
* Restore non-volatile registers, then have spl0 return to the
* resuming thread's PC after first setting the priority as low as
@@ -854,69 +486,6 @@ resume_from_intr_return:
ret
SET_SIZE(resume_from_intr)
-#elif defined (__i386)
-
- ENTRY(resume_from_intr)
- movl %gs:CPU_THREAD, %eax
- movl $resume_from_intr_return, %ecx
-
- /*
- * Save non-volatile registers, and set return address for current
- * thread to resume_return.
- *
- * %edi = t (new thread) when done.
- */
- SAVE_REGS(%eax, %ecx)
-
-#ifdef DEBUG
- call assert_ints_enabled /* panics if we are cli'd */
-#endif
- movl %gs:CPU_THREAD, %esi /* %esi = curthread */
- movl %edi, %gs:CPU_THREAD /* set CPU's thread pointer */
- mfence /* synchronize with mutex_exit() */
- movl T_SP(%edi), %esp /* restore resuming thread's sp */
- xorl %ebp, %ebp /* make $<threadlist behave better */
-
- /*
- * Unlock outgoing thread's mutex dispatched by another processor.
- */
- xorl %eax,%eax
- xchgb %al, T_LOCK(%esi)
-
- STORE_INTR_START(%edi)
-
- /*
- * Restore non-volatile registers, then have spl0 return to the
- * resuming thread's PC after first setting the priority as low as
- * possible and blocking all interrupt threads that may be active.
- */
- movl T_PC(%edi), %eax /* saved return addr */
- RESTORE_REGS(%ecx)
- pushl %eax /* push return address for spl0() */
- call __dtrace_probe___sched_on__cpu
- jmp spl0
-
-resume_from_intr_return:
- /*
- * Remove stack frame created in SAVE_REGS()
- */
- addl $CLONGSIZE, %esp
- ret
- SET_SIZE(resume_from_intr)
-
-#endif /* __amd64 */
-#endif /* __lint */
-
-#if defined(__lint)
-
-void
-thread_start(void)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(thread_start)
popq %rax /* start() */
popq %rdi /* arg */
@@ -927,36 +496,6 @@ thread_start(void)
/*NOTREACHED*/
SET_SIZE(thread_start)
-#elif defined(__i386)
-
- ENTRY(thread_start)
- popl %eax
- movl %esp, %ebp
- addl $8, %ebp
- call *%eax
- addl $8, %esp
- call thread_exit /* destroy thread if it returns. */
- /*NOTREACHED*/
- SET_SIZE(thread_start)
-
-#endif /* __i386 */
-
-#endif /* __lint */
-
-#if defined(__lint)
-
-void
-thread_splitstack_run(caddr_t stack, void (*func)(void *), void *arg)
-{}
-
-void
-thread_splitstack_cleanup(void)
-{}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
ENTRY(thread_splitstack_run)
pushq %rbp /* push base pointer */
movq %rsp, %rbp /* construct frame */
@@ -995,34 +534,4 @@ thread_splitstack_cleanup(void)
ret
SET_SIZE(thread_splitstack_cleanup)
-#elif defined(__i386)
-
- ENTRY(thread_splitstack_run)
- pushl %ebp /* push base pointer */
- movl %esp, %ebp /* construct frame */
- movl 8(%ebp), %esp /* set stack pointer */
- movl 12(%ebp), %eax /* load func */
- movl 16(%ebp), %edx /* load arg */
- pushl %edx /* push arg */
- call *%eax /* call specifed function */
- addl $4, %esp /* restore stack pointer */
- leave /* pop base pointer */
- ret
- SET_SIZE(thread_splitstack_run)
-
- /*
- * See comment in the amd64 code, above.
- */
- ENTRY(thread_splitstack_cleanup)
- LOADCPU(%eax)
- movl CPU_TSS(%eax), %ecx
- movl CPU_THREAD(%eax), %edx
- movl T_STACK(%edx), %edx
- addl $REGSIZE+MINFRAME, %edx
- movl %edx, TSS_ESP0(%ecx)
- ret
- SET_SIZE(thread_splitstack_cleanup)
-
-#endif /* __i386 */
-
-#endif /* __lint */
+#endif /* !__lint */
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index c6d696dc6e..fb6b6f0fdb 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -254,6 +254,7 @@ extern "C" {
#define CPUID_INTC_EDX_7_0_AVX5124FMAPS 0x00000008 /* AVX512 4FMAPS */
#define CPUID_INTC_EDX_7_0_SPEC_CTRL 0x04000000 /* Spec, IBPB, IBRS */
#define CPUID_INTC_EDX_7_0_STIBP 0x08000000 /* STIBP */
+#define CPUID_INTC_EDX_7_0_FLUSH_CMD 0x10000000 /* IA32_FLUSH_CMD */
#define CPUID_INTC_EDX_7_0_ARCH_CAPS 0x20000000 /* IA32_ARCH_CAPS */
#define CPUID_INTC_EDX_7_0_SSBD 0x80000000 /* SSBD */
@@ -362,11 +363,12 @@ extern "C" {
/*
* Intel IA32_ARCH_CAPABILITIES MSR.
*/
-#define MSR_IA32_ARCH_CAPABILITIES 0x10a
-#define IA32_ARCH_CAP_RDCL_NO 0x0001
-#define IA32_ARCH_CAP_IBRS_ALL 0x0002
-#define IA32_ARCH_CAP_RSBA 0x0004
-#define IA32_ARCH_CAP_SSB_NO 0x0010
+#define MSR_IA32_ARCH_CAPABILITIES 0x10a
+#define IA32_ARCH_CAP_RDCL_NO 0x0001
+#define IA32_ARCH_CAP_IBRS_ALL 0x0002
+#define IA32_ARCH_CAP_RSBA 0x0004
+#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x0008
+#define IA32_ARCH_CAP_SSB_NO 0x0010
/*
* Intel Speculation related MSRs
@@ -379,6 +381,9 @@ extern "C" {
#define MSR_IA32_PRED_CMD 0x49
#define IA32_PRED_CMD_IBPB 0x01
+#define MSR_IA32_FLUSH_CMD 0x10b
+#define IA32_FLUSH_CMD_L1D 0x01
+
#define MCI_CTL_VALUE 0xffffffff
#define MTRR_TYPE_UC 0
@@ -491,6 +496,8 @@ extern "C" {
#define X86FSET_RSBA 78
#define X86FSET_SSB_NO 79
#define X86FSET_STIBP_ALL 80
+#define X86FSET_FLUSH_CMD 81
+#define X86FSET_L1D_VM_NO 82
/*
* Intel Deep C-State invariant TSC in leaf 0x80000007.
@@ -773,7 +780,7 @@ extern "C" {
#if defined(_KERNEL) || defined(_KMEMUSER)
-#define NUM_X86_FEATURES 81
+#define NUM_X86_FEATURES 83
extern uchar_t x86_featureset[];
extern void free_x86_featureset(void *featureset);
@@ -792,6 +799,8 @@ extern uint_t pentiumpro_bug4046376;
extern const char CyrixInstead[];
+extern void (*spec_l1d_flush)(void);
+
#endif
#if defined(_KERNEL)
diff --git a/usr/src/uts/intel/vnd/Makefile b/usr/src/uts/intel/vnd/Makefile
index fc94398b99..b94d014eb7 100644
--- a/usr/src/uts/intel/vnd/Makefile
+++ b/usr/src/uts/intel/vnd/Makefile
@@ -10,7 +10,7 @@
#
#
-# Copyright 2017 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
UTSBASE = ../..
@@ -29,7 +29,8 @@ LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
CONF_SRCDIR = $(UTSBASE)/common/io/vnd
-LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue
+CPPFLAGS += -I$(UTSBASE)/i86pc
+LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue
#
# We use <sys/ctype.h> which causes gcc to think that all of its inline
diff --git a/usr/src/uts/intel/zfs/Makefile b/usr/src/uts/intel/zfs/Makefile
index a4a2f4a561..07d4395c22 100644
--- a/usr/src/uts/intel/zfs/Makefile
+++ b/usr/src/uts/intel/zfs/Makefile
@@ -29,6 +29,7 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
#
+# Copyright 2018 Joyent, Inc.
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
@@ -72,6 +73,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua
INC_PATH += -I$(SRC)/common
INC_PATH += -I$(COMMONBASE)/zfs
+CPPFLAGS += -I$(UTSBASE)/i86pc
C99LMODE= -Xc99=%all
#
diff --git a/usr/src/uts/sparc/zfs/Makefile b/usr/src/uts/sparc/zfs/Makefile
index f32b408306..617d495325 100644
--- a/usr/src/uts/sparc/zfs/Makefile
+++ b/usr/src/uts/sparc/zfs/Makefile
@@ -29,6 +29,8 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
#
+# Copyright 2018 Joyent, Inc.
+#
#
# Path to the base of the uts directory tree (usually /usr/src/uts).
@@ -74,6 +76,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs
INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua
INC_PATH += -I$(SRC)/common
INC_PATH += -I$(COMMONBASE)/zfs
+INC_PATH += -I$(UTSBASE)/sun4
C99LMODE= -Xc99=%all
diff --git a/usr/src/uts/sun4/sys/ht.h b/usr/src/uts/sun4/sys/ht.h
new file mode 100644
index 0000000000..831891979f
--- /dev/null
+++ b/usr/src/uts/sun4/sys/ht.h
@@ -0,0 +1,37 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _SYS_HT_H
+#define _SYS_HT_H
+
+#include <sys/types.h>
+#include <sys/thread.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ht_init() {}
+
+#define ht_should_run(t, c) (B_TRUE)
+#define ht_adjust_cpu_score(t, c, p) (p)
+#define ht_mark_safe(void) {}
+#define ht_mark_unsafe(void) {}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_HT_H */
diff --git a/usr/src/uts/sun4u/sys/Makefile b/usr/src/uts/sun4u/sys/Makefile
index 8e73425995..a69a2b14f1 100644
--- a/usr/src/uts/sun4u/sys/Makefile
+++ b/usr/src/uts/sun4u/sys/Makefile
@@ -21,7 +21,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# uts/sun4u/sys/Makefile
+# Copyright 2018 Joyent, Inc.
#
UTSBASE = ../..
@@ -40,18 +40,19 @@ SUN4_HDRS= \
clock.h \
cmp.h \
cpc_ultra.h \
- cpu_sgnblk_defs.h \
+ cpu_sgnblk_defs.h \
ddi_subrdefs.h \
dvma.h \
eeprom.h \
errclassify.h \
fcode.h \
fc_plat.h \
+ ht.h \
idprom.h \
intr.h \
intreg.h \
ivintr.h \
- memlist_plat.h \
+ memlist_plat.h \
memnode.h \
nexusdebug.h \
prom_debug.h \
diff --git a/usr/src/uts/sun4v/sys/Makefile b/usr/src/uts/sun4v/sys/Makefile
index 2af0d8841b..6c0fbd666c 100644
--- a/usr/src/uts/sun4v/sys/Makefile
+++ b/usr/src/uts/sun4v/sys/Makefile
@@ -22,8 +22,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#
-# uts/sun4v/sys/Makefile
+# Copyright 2018 Joyent, Inc.
#
# include global definitions
UTSBASE = ../..
@@ -42,16 +41,17 @@ SUN4_HDRS= \
clock.h \
cmp.h \
cpc_ultra.h \
- cpu_sgnblk_defs.h \
+ cpu_sgnblk_defs.h \
ddi_subrdefs.h \
dvma.h \
eeprom.h \
fcode.h \
+ ht.h \
idprom.h \
intr.h \
intreg.h \
ivintr.h \
- memlist_plat.h \
+ memlist_plat.h \
memnode.h \
nexusdebug.h \
prom_debug.h \