author    John Levon <john.levon@joyent.com>    2018-08-14 21:14:28 +0000
committer Robert Mustacchi <rm@joyent.com>      2018-08-14 23:15:49 +0000
commit    89d0fffcadbabb8694d3ce87b5be826e2b789c99
tree      a038f703ae6cfae6d41fe1ed8c17be5687e864da
parent    10ad6220c95adc2a5592ea98b1c7ced27d6942ed
OS-7125 Need mitigation of L1TF (CVE-2018-3646) (tag: release-20180802)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
36 files changed, 1182 insertions(+), 664 deletions(-)
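The heart of the change is a core-acquisition handshake around VMX guest entry. Before the diff itself, here is a condensed sketch of the protocol, assuming the ht_acquire()/ht_release() semantics described in the ht.c hunk further below; the stubs are hypothetical stand-ins, and error and interrupt handling are elided:

	/* Hypothetical stubs standing in for the patched kernel interfaces. */
	extern int ht_acquire(void);	/* 1: core owned, L1D flushed; 0: timeout; -1: yield */
	extern void ht_release(void);
	extern int enter_guest(void);	/* stands in for vmx_enter_guest() */

	static int
	run_vcpu_once(void)
	{
		int rc = ht_acquire();

		if (rc == -1)
			return (-1);	/* lost to a sibling VCPU; reschedule via CPU_BEST */
		if (rc == 0)
			return (0);	/* timed out waiting on the sibling; just retry */

		(void) enter_guest();	/* sibling cannot enter guest mode meanwhile */
		ht_release();
		return (1);
	}

The dispatcher-side changes (ht_should_run(), ht_adjust_cpu_score()) only try to make this acquisition likely to succeed; the acquisition itself is what enforces exclusion.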
diff --git a/exception_lists/copyright b/exception_lists/copyright index 12819c29a1..1fe2c1dd4e 100644 --- a/exception_lists/copyright +++ b/exception_lists/copyright @@ -126,6 +126,8 @@ usr/src/common/bzip2/huffman.c usr/src/common/ficl/* usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.descrip usr/src/data/hwdata/THIRDPARTYLICENSE.efifixes.tmpl +usr/src/data/ucode/amd/* +usr/src/data/ucode/intel/* usr/src/grub/grub-0.97/stage2/Makefile.am usr/src/grub/grub-0.97/stage2/builtins.c usr/src/grub/grub-0.97/stage2/disk_io.c diff --git a/exception_lists/keywords b/exception_lists/keywords index a8859d25d1..5ff9cc8ab0 100644 --- a/exception_lists/keywords +++ b/exception_lists/keywords @@ -22,6 +22,7 @@ # Copyright 2017 Nexenta Systems, Inc. # Copyright (c) 2013 by Delphix. All rights reserved. # Copyright 2016 Toomas Soome <tsoome@me.com> +# Copyright 2018 Joyent, Inc. # syntax: glob @@ -36,6 +37,8 @@ usr/src/data/locale/data/zh_SG.UTF-8.src usr/src/data/locale/data/zh_TW.UTF-8.src usr/src/data/terminfo/termcap.src usr/src/data/terminfo/terminfo.src +usr/src/data/ucode/amd/* +usr/src/data/ucode/intel/* usr/src/test/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 usr/src/test/zfs-tests/tests/functional/delegate/delegate_common.kshlib usr/src/test/test-runner/cmd/run diff --git a/exception_lists/wscheck b/exception_lists/wscheck new file mode 100644 index 0000000000..0d06b13802 --- /dev/null +++ b/exception_lists/wscheck @@ -0,0 +1,96 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2018 Joyent, Inc. 
+# +syntax: glob + +usr/src/data/ucode/amd/* +usr/src/data/ucode/intel/* + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/head/bhyve.h +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c index c260329c61..4ddc568187 100644 --- a/usr/src/uts/common/disp/cpupart.c +++ b/usr/src/uts/common/disp/cpupart.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/types.h> @@ -324,7 +326,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced) kthread_t *t; int move_threads = 1; lgrp_id_t lgrpid; - proc_t *p; + proc_t *p; int lgrp_diff_lpl; lpl_t *cpu_lpl; int ret; @@ -569,8 +571,8 @@ again: /* Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_cpupart == oldpp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); } t = t->t_forw; } while (t != p->p_tlist); @@ -622,8 +624,8 @@ again: /* Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_cpupart == oldpp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl, - t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); } t = t->t_next; @@ -883,7 +885,7 @@ cpupart_create(psetid_t *psid) static int cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all) { - void *projbuf, *zonebuf; + void *projbuf, *zonebuf; kthread_t *t; proc_t *p; int err = 0; diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index 5f9c2c68a2..4898a18bf2 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/dtrace.h> #include <sys/sdt.h> #include <sys/archsystm.h> +#include <sys/ht.h> #include <vm/as.h> @@ -1135,15 +1136,13 @@ swtch_to(kthread_t *next) */ } -#define CPU_IDLING(pri) ((pri) == -1) - static void cpu_resched(cpu_t *cp, pri_t tpri) { int call_poke_cpu = 0; pri_t cpupri = cp->cpu_dispatch_pri; - if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { + if (cpupri != CPU_IDLE_PRI && cpupri < tpri) { TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); if (tpri >= upreemptpri && cp->cpu_runrun == 0) { @@ -1239,17 +1238,17 @@ setbackdq(kthread_t *tp) /* * We'll generally let this thread continue to run where * it last ran...but will consider migration if: - * - We thread probably doesn't have much cache warmth. + * - The thread probably doesn't have much cache warmth. + * - HT exclusion would prefer us to run elsewhere * - The CPU where it last ran is the target of an offline * request. - * - The thread last ran outside it's home lgroup. + * - The thread last ran outside its home lgroup. */ if ((!THREAD_HAS_CACHE_WARMTH(tp)) || - (tp->t_cpu == cpu_inmotion)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL); - } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - self ? tp->t_cpu : NULL); + !ht_should_run(tp, tp->t_cpu) || + (tp->t_cpu == cpu_inmotion) || + !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) { + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); } else { cp = tp->t_cpu; } @@ -1278,7 +1277,8 @@ setbackdq(kthread_t *tp) newcp = cp->cpu_next_part; } - if (RUNQ_LEN(newcp, tpri) < qlen) { + if (ht_should_run(tp, newcp) && + RUNQ_LEN(newcp, tpri) < qlen) { DTRACE_PROBE3(runq__balance, kthread_t *, tp, cpu_t *, cp, cpu_t *, newcp); @@ -1289,8 +1289,8 @@ setbackdq(kthread_t *tp) /* * Migrate to a cpu in the new partition. 
*/ - cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, - tp->t_lpl, tp->t_pri, NULL); + cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp, + tp->t_pri); } ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); } else { @@ -1427,7 +1427,7 @@ setfrontdq(kthread_t *tp) /* * We'll generally let this thread continue to run * where it last ran, but will consider migration if: - * - The thread last ran outside it's home lgroup. + * - The thread last ran outside its home lgroup. * - The CPU where it last ran is the target of an * offline request (a thread_nomigrate() on the in * motion CPU relies on this when forcing a preempt). @@ -1435,21 +1435,18 @@ setfrontdq(kthread_t *tp) * it last ran, and it is considered not likely to * have significant cache warmth. */ - if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) || - (cp == cpu_inmotion)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - (tp == curthread) ? cp : NULL); - } else if ((tpri < cp->cpu_disp->disp_maxrunpri) && - (!THREAD_HAS_CACHE_WARMTH(tp))) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - NULL); + if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) || + cp == cpu_inmotion || + (tpri < cp->cpu_disp->disp_maxrunpri && + !THREAD_HAS_CACHE_WARMTH(tp))) { + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); } } else { /* * Migrate to a cpu in the new partition. */ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, - tp->t_lpl, tp->t_pri, NULL); + tp, tp->t_pri); } ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); } else { @@ -1600,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf) /* migrate to a cpu in the new partition */ cp = tp->t_cpupart->cp_cpulist; } - cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL); + cp = disp_lowpri_cpu(cp, tp, tp->t_pri); disp_lock_enter_high(&cp->cpu_disp->disp_lock); ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); @@ -2573,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp) } /* - * disp_lowpri_cpu - find CPU running the lowest priority thread. - * The hint passed in is used as a starting point so we don't favor - * CPU 0 or any other CPU. The caller should pass in the most recently - * used CPU for the thread. + * Return a score rating this CPU for running this thread: lower is better. * - * The lgroup and priority are used to determine the best CPU to run on - * in a NUMA machine. The lgroup specifies which CPUs are closest while - * the thread priority will indicate whether the thread will actually run - * there. To pick the best CPU, the CPUs inside and outside of the given - * lgroup which are running the lowest priority threads are found. The - * remote CPU is chosen only if the thread will not run locally on a CPU - * within the lgroup, but will run on the remote CPU. If the thread - * cannot immediately run on any CPU, the best local CPU will be chosen. + * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for + * curcpu (as that's our own priority). * - * The lpl specified also identifies the cpu partition from which - * disp_lowpri_cpu should select a CPU. + * If a cpu is the target of an offline request, then try to avoid it. * - * curcpu is used to indicate that disp_lowpri_cpu is being called on - * behalf of the current thread. (curthread is looking for a new cpu) - * In this case, cpu_dispatch_pri for this thread's cpu should be - * ignored. + * Otherwise we'll use double the effective dispatcher priority for the CPU. * - * If a cpu is the target of an offline request then try to avoid it. 
+ * We do this so ht_adjust_cpu_score() can increment the score if needed, + * without ending up over-riding a dispatcher priority. + */ +static pri_t +cpu_score(cpu_t *cp, kthread_t *tp) +{ + pri_t score; + + if (tp == curthread && cp == curthread->t_cpu) + score = 2 * CPU_IDLE_PRI; + else if (cp == cpu_inmotion) + score = SHRT_MAX; + else + score = 2 * cp->cpu_dispatch_pri; + + if (2 * cp->cpu_disp->disp_maxrunpri > score) + score = 2 * cp->cpu_disp->disp_maxrunpri; + if (2 * cp->cpu_chosen_level > score) + score = 2 * cp->cpu_chosen_level; + + return (ht_adjust_cpu_score(tp, cp, score)); +} + +/* + * disp_lowpri_cpu - find a suitable CPU to run the given thread. + * + * We are looking for a CPU with an effective dispatch priority lower than the + * thread's, so that the thread will run immediately rather than be enqueued. + * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group. + * If we don't find an available CPU there, we will expand our search to include + * wider locality levels. (Note these groups are already divided by CPU + * partition.) + * + * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on + * the best home CPU we found. * - * This function must be called at either high SPL, or with preemption - * disabled, so that the "hint" CPU cannot be removed from the online - * CPU list while we are traversing it. + * The hint passed in is used as a starting point so we don't favor CPU 0 or any + * other CPU. The caller should pass in the most recently used CPU for the + * thread; it's of course possible that this CPU isn't in the home lgroup. + * + * This function must be called at either high SPL, or with preemption disabled, + * so that the "hint" CPU cannot be removed from the online CPU list while we + * are traversing it. */ cpu_t * -disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) +disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri) { cpu_t *bestcpu; cpu_t *besthomecpu; cpu_t *cp, *cpstart; - pri_t bestpri; - pri_t cpupri; - klgrpset_t done; - klgrpset_t cur_set; lpl_t *lpl_iter, *lpl_leaf; - int i; - /* - * Scan for a CPU currently running the lowest priority thread. - * Cannot get cpu_lock here because it is adaptive. - * We do not require lock on CPU list. - */ ASSERT(hint != NULL); - ASSERT(lpl != NULL); - ASSERT(lpl->lpl_ncpu > 0); + ASSERT(tp->t_lpl->lpl_ncpu > 0); - /* - * First examine local CPUs. Note that it's possible the hint CPU - * passed in in remote to the specified home lgroup. If our priority - * isn't sufficient enough such that we can run immediately at home, - * then examine CPUs remote to our home lgroup. - * We would like to give preference to CPUs closest to "home". - * If we can't find a CPU where we'll run at a given level - * of locality, we expand our search to include the next level. 
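The doubling deserves a second look: scores live on an even lattice, so the +1 penalty that ht_adjust_cpu_score() may add can break ties between CPUs at the same dispatch priority, but can never outrank a genuine priority difference. A minimal standalone check of that property (hypothetical test, not part of the patch):

	#include <assert.h>

	int
	main(void)
	{
		for (int pri = 0; pri < 170; pri++) {
			int penalized = 2 * pri + 1;	/* HT penalty applied */
			int higher = 2 * (pri + 1);	/* next real priority, no penalty */
			assert(penalized < higher);	/* penalty never beats priority */
		}
		return (0);
	}

The same trick explains the score / 2 == CPU_IDLE_PRI test in the search loop below: only a CPU scoring exactly 2 * CPU_IDLE_PRI (idle, or curthread's own CPU, with no HT penalty applied) takes the early exit.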
- */ bestcpu = besthomecpu = NULL; klgrpset_clear(done); - /* start with lpl we were passed */ - lpl_iter = lpl; + lpl_iter = tp->t_lpl; do { + pri_t best = SHRT_MAX; + klgrpset_t cur_set; - bestpri = SHRT_MAX; klgrpset_clear(cur_set); - for (i = 0; i < lpl_iter->lpl_nrset; i++) { + for (int i = 0; i < lpl_iter->lpl_nrset; i++) { lpl_leaf = lpl_iter->lpl_rset[i]; if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid)) continue; @@ -2659,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) cp = cpstart = lpl_leaf->lpl_cpus; do { - if (cp == curcpu) - cpupri = -1; - else if (cp == cpu_inmotion) - cpupri = SHRT_MAX; - else - cpupri = cp->cpu_dispatch_pri; - if (cp->cpu_disp->disp_maxrunpri > cpupri) - cpupri = cp->cpu_disp->disp_maxrunpri; - if (cp->cpu_chosen_level > cpupri) - cpupri = cp->cpu_chosen_level; - if (cpupri < bestpri) { - if (CPU_IDLING(cpupri)) { - ASSERT((cp->cpu_flags & - CPU_QUIESCED) == 0); - return (cp); - } + pri_t score = cpu_score(cp, tp); + + if (score < best) { + best = score; bestcpu = cp; - bestpri = cpupri; + + /* An idle CPU: we're done. */ + if (score / 2 == CPU_IDLE_PRI) + goto out; } } while ((cp = cp->cpu_next_lpl) != cpstart); } - if (bestcpu && (tpri > bestpri)) { - ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0); - return (bestcpu); - } + if (bestcpu != NULL && tpri > (best / 2)) + goto out; + if (besthomecpu == NULL) besthomecpu = bestcpu; + /* * Add the lgrps we just considered to the "done" set */ @@ -2698,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) * The specified priority isn't high enough to run immediately * anywhere, so just return the best CPU from the home lgroup. */ - ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0); - return (besthomecpu); + bestcpu = besthomecpu; + +out: + ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0); + return (bestcpu); } /* @@ -2719,3 +2715,19 @@ static void generic_enq_thread(cpu_t *cpu, int bound) { } + +cpu_t * +disp_choose_best_cpu(void) +{ + kthread_t *t = curthread; + cpu_t *curcpu = CPU; + + ASSERT(t->t_preempt > 0); + ASSERT(t->t_state == TS_ONPROC); + ASSERT(t->t_schedflag & TS_VCPU); + + if (ht_should_run(t, curcpu)) + return (curcpu); + + return (disp_lowpri_cpu(curcpu, t, t->t_pri)); +} diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index af000bf4f1..c923ba5d1a 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -74,6 +74,7 @@ #include <sys/waitq.h> #include <sys/cpucaps.h> #include <sys/kiconv.h> +#include <sys/ht.h> #ifndef STACK_GROWTH_DOWN #error Stacks do not grow downward; 3b2 zombie attack detected! @@ -507,8 +508,8 @@ thread_create( if (CPU->cpu_part == &cp_default) t->t_cpu = CPU; else - t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl, - t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t, + t->t_pri); t->t_disp_queue = t->t_cpu->cpu_disp; kpreempt_enable(); @@ -1422,6 +1423,8 @@ thread_unpin() itp = t->t_intr; /* interrupted thread */ t->t_intr = NULL; /* clear interrupt ptr */ + ht_end_intr(); + /* * Get state from interrupt thread for the one * it interrupted. diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 03d711838c..2127de2bf0 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,8 +25,8 @@ * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2018 Joyent, Inc. */ /* @@ -90,6 +90,7 @@ #include <sys/zfeature.h> #include <sys/zio_checksum.h> #include <sys/zil_impl.h> +#include <sys/ht.h> #include "zfs_namecheck.h" @@ -1281,6 +1282,8 @@ zvol_strategy(buf_t *bp) (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && !doread && !is_dumpified; + ht_begin_unsafe(); + /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. @@ -1328,6 +1331,8 @@ zvol_strategy(buf_t *bp) zil_commit(zv->zv_zilog, ZVOL_OBJ); biodone(bp); + ht_end_unsafe(); + return (0); } @@ -1409,6 +1414,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + ht_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); mutex_enter(&zonep->zone_vfs_lock); @@ -1469,6 +1476,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, error); + ht_end_unsafe(); + return (error); } @@ -1501,6 +1510,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + ht_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); /* @@ -1549,6 +1560,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, error); + ht_end_unsafe(); + mutex_enter(&zonep->zone_vfs_lock); zonep->zone_vfs_rwstats.writes++; zonep->zone_vfs_rwstats.nwritten += tot_bytes; @@ -1818,11 +1831,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; mutex_exit(&zfsdev_state_lock); + + ht_begin_unsafe(); + zil_commit(zv->zv_zilog, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } + + ht_end_unsafe(); + return (error); case DKIOCGETWCE: @@ -1847,7 +1866,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) } else { zv->zv_flags &= ~ZVOL_WCE; mutex_exit(&zfsdev_state_lock); + ht_begin_unsafe(); zil_commit(zv->zv_zilog, ZVOL_OBJ); + ht_end_unsafe(); } return (0); } @@ -1900,6 +1921,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) mutex_exit(&zfsdev_state_lock); + ht_begin_unsafe(); + rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); @@ -1932,6 +1955,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zil_commit(zv->zv_zilog, ZVOL_OBJ); } + ht_end_unsafe(); + return (error); } diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c index 5a25ed22d5..d03c7ce4ec 100644 --- a/usr/src/uts/common/io/vnd/vnd.c +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -830,6 +830,7 @@ #include <sys/disp.h> #include <sys/random.h> #include <sys/gsqueue.h> +#include <sys/ht.h> #include <inet/ip.h> #include <inet/ip6.h> @@ -3716,6 +3717,12 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) bsize = vsp->vns_bsize; mutex_exit(&vsp->vns_lock); + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. 
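The zvol changes above and the vnd/viona changes below all use the same bracketing idiom around work that may touch another zone's data. A sketch of the pattern, with hypothetical stubs for the new interfaces:

	extern void ht_begin_unsafe(void);
	extern void ht_end_unsafe(void);

	static void
	do_sensitive_io(void)
	{
		ht_begin_unsafe();	/* t_unsafe++, then ht_mark(): a poisoned sibling is kicked */

		/* ... ZIL commit, range-locked DMU work, mac_tx(), etc. ... */

		ht_end_unsafe();	/* t_unsafe--, then ht_mark() again */
	}

The pairing must be exact: t_unsafe is a counter, so nesting is fine, but a missing ht_end_unsafe() would leave the thread permanently marked unsafe.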
+ */ + ht_begin_unsafe(); + nmps = 0; mptot = 0; blocked = B_FALSE; @@ -3736,6 +3743,8 @@ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) } } + ht_end_unsafe(); + empty = vnd_dq_is_empty(&vsp->vns_dq_write); /* diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 2efb68889c..4648dae9dd 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -400,6 +400,9 @@ force_thread_migrate(kthread_id_t tp) * CPUs prior to a successful return, it should take extra precautions (such as * their own call to kpreempt_disable) to ensure that safety. * + * CPU_BEST can be used to pick a "best" CPU to migrate to, including + * potentially the current CPU. + * * A CPU affinity reference count is maintained by thread_affinity_set and * thread_affinity_clear (incrementing and decrementing it, respectively), * maintaining CPU affinity while the count is non-zero, and allowing regions @@ -416,6 +419,10 @@ thread_affinity_set(kthread_id_t t, int cpu_id) VERIFY3P(t, ==, curthread); kpreempt_disable(); cp = CPU; + } else if (cpu_id == CPU_BEST) { + VERIFY3P(t, ==, curthread); + kpreempt_disable(); + cp = disp_choose_best_cpu(); } else { /* * We should be asserting that cpu_lock is held here, but @@ -453,9 +460,8 @@ thread_affinity_set(kthread_id_t t, int cpu_id) thread_unlock(t); } - if (cpu_id == CPU_CURRENT) { + if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST) kpreempt_enable(); - } } /* @@ -1490,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_bound_cpu != cp) - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); @@ -1533,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ - if (t->t_cpu == cp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); - } + if (t->t_cpu == cp && t->t_bound_cpu != cp) + t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri); + ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); t = t->t_next; diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index 6288f47bed..6f6aced619 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -90,6 +91,7 @@ #include <sys/pg.h> #include <sys/promif.h> #include <sys/sdt.h> +#include <sys/ht.h> lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ @@ -520,6 +522,8 @@ lgrp_main_mp_init(void) { klgrpset_t changed; + ht_init(); + /* * Update lgroup topology (if necessary) */ diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 3ee4e70eec..2cfe5116d9 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -23,7 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 RackTop Systems. 
*/ @@ -540,13 +540,19 @@ extern struct cpu *curcpup(void); #endif /* - * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id - * as the target and to grab cpu_lock instead of requiring the caller - * to grab it. + * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's + * current CPU is; holding cpu_lock is not required. */ #define CPU_CURRENT -3 /* + * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a + * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock + * is not required. + */ +#define CPU_BEST -4 + +/* * Per-CPU statistics * * cpu_stats_t contains numerous system and VM-related statistics, in the form diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h index b324f4d323..cb3711edcd 100644 --- a/usr/src/uts/common/sys/disp.h +++ b/usr/src/uts/common/sys/disp.h @@ -23,6 +23,8 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -63,11 +65,11 @@ typedef struct _disp { /* * Priorities: * disp_maxrunpri is the maximum run priority of runnable threads - * on this queue. It is -1 if nothing is runnable. + * on this queue. It is -1 if nothing is runnable. * * disp_max_unbound_pri is the maximum run priority of threads on * this dispatch queue but runnable by any CPU. This may be left - * artificially high, then corrected when some CPU tries to take + * artificially high, then corrected when some CPU tries to take * an unbound thread. It is -1 if nothing is runnable. */ pri_t disp_maxrunpri; /* maximum run priority */ @@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *); extern void cpu_rechoose(kthread_t *); extern void cpu_surrender(kthread_t *); extern void kpreempt(int); -extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t, - struct cpu *); +extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t); extern int disp_bound_threads(struct cpu *, int); extern int disp_bound_anythreads(struct cpu *, int); extern int disp_bound_partition(struct cpu *, int); @@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *) extern void disp_swapped_enq(kthread_t *); extern int disp_anywork(void); +extern struct cpu *disp_choose_best_cpu(void); + #define KPREEMPT_SYNC (-1) #define kpreempt_disable() \ { \ @@ -183,6 +186,8 @@ extern int disp_anywork(void); #endif /* _KERNEL */ +#define CPU_IDLE_PRI (-1) + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index af9fcb75cf..678d356564 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -354,6 +354,8 @@ typedef struct _kthread { kmutex_t t_wait_mutex; /* used in CV wait functions */ char *t_name; /* thread name */ + + uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */ } kthread_t; /* @@ -417,6 +419,7 @@ typedef struct _kthread { #define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */ #define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */ #define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */ +#define TS_VCPU 0x0080 /* thread will enter guest context */ #define TS_CSTART 0x0100 /* setrun() by continuelwps() */ #define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */ #define TS_XSTART 0x0400 /* setrun() by SIGCONT */ diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index fcf9820fd8..2a94505acb 100644 --- 
a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -68,6 +68,7 @@ CORE_OBJS += \ hment.o \ hold_page.o \ hrtimers.o \ + ht.o \ htable.o \ hypercall.o \ hypersubr.o \ @@ -293,7 +294,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/common/xen # since only C headers are included when #defined(__lint) is true. # -ASSYM_DEPS += \ +ASSYM_DEPS += \ copy.o \ desctbls_asm.o \ ddi_i86_asm.o \ diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c index 227ce0c991..59d8787839 100644 --- a/usr/src/uts/i86pc/io/apix/apix_intr.c +++ b/usr/src/uts/i86pc/io/apix/apix_intr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Western Digital Corporation. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #include <sys/cpuvar.h> @@ -68,6 +69,7 @@ #include <vm/hat_i86.h> #include <sys/stack.h> #include <sys/apix.h> +#include <sys/ht.h> static void apix_post_hardint(int); @@ -280,6 +282,7 @@ apix_do_softint_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, it->t_intr = t; cpu->cpu_thread = it; + ht_begin_intr(pil); /* * Set bit for this pil in CPU's interrupt active bitmask. @@ -350,7 +353,9 @@ apix_do_softint_epilog(struct cpu *cpu, uint_t oldpil) it->t_link = cpu->cpu_intr_thread; cpu->cpu_intr_thread = it; it->t_state = TS_FREE; + ht_end_intr(); cpu->cpu_thread = t; + if (t->t_flag & T_INTR_THREAD) t->t_intr_start = now; basespl = cpu->cpu_base_spl; @@ -466,6 +471,8 @@ apix_hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, } } + ht_begin_intr(pil); + /* store starting timestamp in CPu structure for this IPL */ mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now; @@ -556,6 +563,8 @@ apix_hilevel_intr_epilog(struct cpu *cpu, uint_t oldpil) t->t_intr_start = now; } + ht_end_intr(); + mcpu->mcpu_pri = oldpil; if (pil < CBE_HIGH_PIL) (void) (*setlvlx)(oldpil, 0); @@ -668,6 +677,7 @@ apix_intr_thread_prolog(struct cpu *cpu, uint_t pil, caddr_t stackptr) it->t_state = TS_ONPROC; cpu->cpu_thread = it; + ht_begin_intr(pil); /* * Initialize thread priority level from intr_pri @@ -756,7 +766,9 @@ apix_intr_thread_epilog(struct cpu *cpu, uint_t oldpil) cpu->cpu_intr_thread = it; it->t_state = TS_FREE; + ht_end_intr(); cpu->cpu_thread = t; + if (t->t_flag & T_INTR_THREAD) t->t_intr_start = now; basespl = cpu->cpu_base_spl; diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index d3a3bdd44f..3c52457a0b 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -220,6 +220,7 @@ #include <sys/strsubr.h> #include <sys/strsun.h> #include <vm/seg_kmem.h> +#include <sys/ht.h> #include <sys/pattr.h> #include <sys/dls.h> @@ -2414,7 +2415,13 @@ viona_tx(viona_link_t *link, viona_vring_t *ring) viona_tx_done(ring, len, cookie); } + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. 
+ */ + ht_begin_unsafe(); mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + ht_end_unsafe(); return; drop_fail: diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 9ad232a612..e07ee0ea52 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #ifndef __FreeBSD__ #include <sys/x86_archext.h> #include <sys/smp_impldefs.h> +#include <sys/ht.h> #endif #include <vm/vm.h> @@ -3052,11 +3053,30 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, break; } +#ifndef __FreeBSD__ + if ((rc = ht_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; + } + break; + } +#endif + vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); +#ifndef __FreeBSD__ + ht_release(); +#endif + /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index bcb6b77cea..164227cc5e 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -1997,7 +1997,6 @@ vmm_freectx(void *arg, int isexec) #endif /* __FreeBSD */ - int vm_run(struct vm *vm, struct vm_run *vmrun) { @@ -2013,6 +2012,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) pmap_t pmap; #ifndef __FreeBSD__ vm_thread_ctx_t vtc; + int affinity_type = CPU_CURRENT; #endif vcpuid = vmrun->cpuid; @@ -2044,7 +2044,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) restart: #ifndef __FreeBSD__ - thread_affinity_set(curthread, CPU_CURRENT); + thread_affinity_set(curthread, affinity_type); /* * Resource localization should happen after the CPU affinity for the * thread has been set to ensure that access from restricted contexts, @@ -2054,6 +2054,8 @@ restart: * This must be done prior to disabling kpreempt via critical_enter(). 
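Putting the vmx.c and vmm.c pieces together: a failed ht_acquire() surfaces as VM_EXITCODE_HT, and the next trip around vm_run()'s restart loop asks for CPU_BEST (the affinity_type assignment appears in the hunk that follows) so the dispatcher can move the VCPU thread to a friendlier core. A condensed, hypothetical rendering of that loop, with simplified stand-in signatures:

	#include <stdbool.h>

	#define CPU_CURRENT	-3
	#define CPU_BEST	-4

	/* Hypothetical stand-ins for the kernel pieces. */
	extern void thread_affinity_set(int cpu_id);
	extern void thread_affinity_clear(void);
	extern bool enter_guest_lost_core(void);	/* true on VM_EXITCODE_HT */

	void
	run_vcpu_loop(void)
	{
		int affinity_type = CPU_CURRENT;

		for (;;) {
			thread_affinity_set(affinity_type);
			affinity_type = CPU_CURRENT;	/* one-shot; reset each pass */

			bool lost = enter_guest_lost_core();

			thread_affinity_clear();

			if (lost) {
				affinity_type = CPU_BEST;	/* ht_acquire() lost; migrate */
				continue;
			}
			break;	/* everything else returns to userland or handlers */
		}
	}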
*/ vm_localize_resources(vm, vcpu); + + affinity_type = CPU_CURRENT; #endif critical_enter(); @@ -2145,6 +2147,12 @@ restart: retu = true; } break; + + case VM_EXITCODE_HT: { + affinity_type = CPU_BEST; + break; + } + #endif default: retu = true; /* handled in userland */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index fff951f82b..3c0d9beec2 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -28,6 +28,7 @@ #include <sys/cpuset.h> #include <sys/id_space.h> #include <sys/fs/sdev_plugin.h> +#include <sys/ht.h> #include <sys/kernel.h> @@ -374,6 +375,10 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, break; } vmrun.cpuid = vcpu; + + if (!(curthread->t_schedflag & TS_VCPU)) + ht_mark_as_vcpu(); + error = vm_run(sc->vmm_vm, &vmrun); /* * XXXJOY: I think it's necessary to do copyout, even in the diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index bc3d80189b..3f9132ba4e 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -217,7 +217,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = { "ibrs_all", "rsba", "ssb_no", - "stibp_all" + "stibp_all", + "flush_cmd", + "l1d_vmentry_no" }; boolean_t @@ -986,6 +988,19 @@ cpuid_amd_getids(cpu_t *cpu) } static void +spec_l1d_flush_noop(void) +{ +} + +static void +spec_l1d_flush_msr(void) +{ + wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); +} + +void (*spec_l1d_flush)(void) = spec_l1d_flush_noop; + +static void cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) { struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; @@ -1051,6 +1066,10 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) add_x86_feature(featureset, X86FSET_RSBA); } + if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { + add_x86_feature(featureset, + X86FSET_L1D_VM_NO); + } if (reg & IA32_ARCH_CAP_SSB_NO) { add_x86_feature(featureset, X86FSET_SSB_NO); @@ -1062,7 +1081,47 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) add_x86_feature(featureset, X86FSET_SSBD); + + if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) + add_x86_feature(featureset, X86FSET_FLUSH_CMD); + } + + if (cpu->cpu_id != 0) + return; + + /* + * We're the boot CPU, so let's figure out our L1TF status. + * + * First, if this is a RDCL_NO CPU, then we are not vulnerable: we don't + * need to exclude with ht_acquire(), and we don't need to flush. + */ + if (is_x86_feature(featureset, X86FSET_RDCL_NO)) { + extern int ht_exclusion; + ht_exclusion = 0; + spec_l1d_flush = spec_l1d_flush_noop; + membar_producer(); + return; + } + + /* + * If HT is enabled, we will need HT exclusion, as well as the flush on + * VM entry. If HT isn't enabled, we still need at least the flush for + * the L1TF sequential case. + * + * However, if X86FSET_L1D_VM_NO is set, we're most likely running + * inside a VM ourselves, and we don't need the flush. + * + * If we don't have the FLUSH_CMD available at all, we'd better just + * hope HT is disabled. 
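The boot-CPU logic here reduces to a small decision table (my reading of the code; "flush" is what spec_l1d_flush() does at VM entry, "exclusion" is whether ht_acquire() enforces anything):

	CPUID evidence                      HT exclusion   spec_l1d_flush()
	RDCL_NO (not vulnerable)            disabled       noop
	FLUSH_CMD and not L1D_VM_NO         enabled        wrmsr IA32_FLUSH_CMD
	L1D_VM_NO (likely nested in a VM)   enabled        noop
	no FLUSH_CMD                        enabled        noop ("hope HT is disabled")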
+ */ + if (is_x86_feature(featureset, X86FSET_FLUSH_CMD) && + !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { + spec_l1d_flush = spec_l1d_flush_msr; + } else { + spec_l1d_flush = spec_l1d_flush_noop; } + + membar_producer(); } /* @@ -3827,7 +3886,7 @@ cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) eax = cpi->cpi_std[1].cp_eax; #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) -#define SH_B3(eax) (eax == 0xf51) +#define SH_B3(eax) (eax == 0xf51) #define B(eax) (SH_B0(eax) || SH_B3(eax)) #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) @@ -4131,9 +4190,9 @@ static const char sl3_cache_str[] = "sectored-l3-cache"; static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; static const struct cachetab { - uint8_t ct_code; + uint8_t ct_code; uint8_t ct_assoc; - uint16_t ct_line_size; + uint16_t ct_line_size; size_t ct_size; const char *ct_label; } intel_ctab[] = { diff --git a/usr/src/uts/i86pc/os/ht.c b/usr/src/uts/i86pc/os/ht.c new file mode 100644 index 0000000000..f82c51ac08 --- /dev/null +++ b/usr/src/uts/i86pc/os/ht.c @@ -0,0 +1,599 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * HT exclusion: prevent a sibling in a hyper-threaded core from running in VMX + * non-root guest mode, when certain threads are running on the other sibling. + * This avoids speculation-based information leaks such as L1TF being available + * to the untrusted guest. The stance we take is that threads from the same + * zone as the guest VPCU thread are considered safe to run alongside, but all + * other threads (except the idle thread), and all interrupts, are unsafe. Note + * that due to the implementation here, there are significant sections of e.g. + * the dispatcher code that can run concurrently with a guest, until the thread + * reaches ht_mark(). This code assumes there are only two HT threads per core. + * + * The entry points are as follows: + * + * ht_mark_as_vcpu() + * + * All threads that enter guest mode (i.e. VCPU threads) need to call this at + * least once, which sets TS_VCPU in ->t_schedflag. + * + * ht_mark() + * + * A new ->cpu_thread is now curthread (although interrupt threads have their + * own separate handling). After preventing any interrupts, we will take our + * own CPU's spinlock and update our own state in mcpu_ht. + * + * If our sibling is poisoned (i.e. in guest mode or the little bit of code + * around it), and we're not compatible (that is, same zone ID, or the idle + * thread), then we need to ht_kick() that sibling. ht_kick() itself waits for + * the sibling to call ht_release(), and it will not re-enter guest mode until + * allowed. + * + * Note that we ignore the fact a process can change its zone ID: poisoning + * threads never do so, and we can ignore the other cases. + * + * ht_acquire() + * + * We are a VCPU thread about to start guest execution. Interrupts are + * disabled. We must have already run ht_mark() to be in this code, so there's + * no need to take our *own* spinlock in order to mark ourselves as CM_POISONED. + * Instead, we take our sibling's lock to also mark ourselves as poisoned in the + * sibling cpu_ht_t. 
This is so ht_mark() will only ever need to look at its + * local mcpu_ht. + * + * We'll loop here for up to ht_acquire_wait_time microseconds; this is mainly + * to wait out any sibling interrupt: many of them will complete quicker than + * this. + * + * Finally, if we succeeded in acquiring the core, we'll flush the L1 cache as + * mitigation against L1TF: no incompatible thread will now be able to populate + * the L1 cache until *we* ht_release(). + * + * ht_release() + * + * Simply unpoison ourselves similarly to ht_acquire(); ht_kick() will wait for + * this to happen if needed. + * + * ht_begin_intr() + * + * In an interrupt prolog. We're either a hilevel interrupt, or a pinning + * interrupt. In both cases, we mark our interrupt depth, and potentially + * ht_kick(). This enforces exclusion, but doesn't otherwise modify ->ch_state: + * we want the dispatcher code to essentially ignore interrupts. + * + * ht_end_intr() + * + * In an interrupt epilogue *or* thread_unpin(). In the first case, we never + * slept, and we can simply decrement our counter. In the second case, we're an + * interrupt thread about to sleep: we'll still just decrement our counter, and + * henceforth treat the thread as a normal thread when it next gets scheduled, + * until it finally gets to its epilogue. + * + * ht_mark_unsafe() / ht_mark_safe() + * + * Mark the current thread as temporarily unsafe (guests should not be executing + * while a sibling is marked unsafe). This can be used for a thread that's + * otherwise considered safe, if it needs to handle potentially sensitive data. + * Right now, this means certain I/O handling operations that reach down into + * the networking and ZFS sub-systems. + * + * ht_should_run(thread, cpu) + * + * This is used by the dispatcher when making scheduling decisions: if the + * sibling is compatible with the given thread, we return B_TRUE. This is + * essentially trying to guess if any subsequent ht_acquire() will fail, by + * peeking at the sibling CPU's state. The peek is racy, but if we get things + * wrong, the "only" consequence is that ht_acquire() may lose. + * + * ht_adjust_cpu_score() + * + * Used when scoring other CPUs in disp_lowpri_cpu(). If we shouldn't run here, + * we'll add a small penalty to the score. This also makes sure a VCPU thread + * migration behaves properly. + */ + +#include <sys/archsystm.h> +#include <sys/disp.h> +#include <sys/cmt.h> +#include <sys/systm.h> +#include <sys/cpu.h> +#include <sys/var.h> +#include <sys/xc_levels.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> +#include <sys/x86_archext.h> + +#define CS_SHIFT (8) +#define CS_MASK ((1 << CS_SHIFT) - 1) +#define CS_MARK(s) ((s) & CS_MASK) +#define CS_ZONE(s) ((s) >> CS_SHIFT) +#define CS_MK(s, z) ((s) | (z << CS_SHIFT)) + +typedef enum ch_mark { + CM_IDLE = 0, /* running CPU idle thread */ + CM_THREAD, /* running general non-VCPU thread */ + CM_UNSAFE, /* running ->t_unsafe thread */ + CM_VCPU, /* running VCPU thread */ + CM_POISONED /* running in guest */ +} ch_mark_t; + +/* Double-check our false-sharing padding. */ +CTASSERT(offsetof(cpu_ht_t, ch_sib) == 64); +CTASSERT(CM_IDLE == 0); +CTASSERT(CM_POISONED < (1 << CS_SHIFT)); +CTASSERT(CM_POISONED > CM_VCPU); +CTASSERT(CM_VCPU > CM_UNSAFE); + +/* + * If disabled, no HT exclusion is performed, and system is potentially + * vulnerable to L1TF if hyper-threading is enabled, and we don't have the "not + * vulnerable" CPUID bit. 
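Backing up to the CS_* macros above: each state word packs a ch_mark_t into the low CS_SHIFT bits and the zone ID above them. A standalone illustration of the encoding (hypothetical test, not part of the patch; the macro bodies are copied from the file):

	#include <assert.h>
	#include <stdint.h>

	#define CS_SHIFT	(8)
	#define CS_MASK		((1 << CS_SHIFT) - 1)
	#define CS_MARK(s)	((s) & CS_MASK)
	#define CS_ZONE(s)	((s) >> CS_SHIFT)
	#define CS_MK(s, z)	((s) | (z << CS_SHIFT))

	int
	main(void)
	{
		uint64_t state = CS_MK(4 /* CM_POISONED */, 17 /* zone ID */);

		assert(CS_MARK(state) == 4);	/* mark survives the round trip */
		assert(CS_ZONE(state) == 17);	/* so does the zone ID */
		return (0);
	}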
+ */ +int ht_exclusion = 1; + +/* + * How long ht_acquire() will spin trying to acquire the core, in micro-seconds. + * This is enough time to wait out a significant proportion of interrupts. + */ +clock_t ht_acquire_wait_time = 64; + +static cpu_t * +ht_find_sibling(cpu_t *cp) +{ + for (uint_t i = 0; i < GROUP_SIZE(&cp->cpu_pg->cmt_pgs); i++) { + pg_cmt_t *pg = GROUP_ACCESS(&cp->cpu_pg->cmt_pgs, i); + group_t *cg = &pg->cmt_pg.pghw_pg.pg_cpus; + + if (pg->cmt_pg.pghw_hw != PGHW_IPIPE) + continue; + + if (GROUP_SIZE(cg) == 1) + break; + + VERIFY3U(GROUP_SIZE(cg), ==, 2); + + if (GROUP_ACCESS(cg, 0) != cp) + return (GROUP_ACCESS(cg, 0)); + + VERIFY3P(GROUP_ACCESS(cg, 1), !=, cp); + + return (GROUP_ACCESS(cg, 1)); + } + + return (NULL); +} + +/* + * Initialize HT links. We have to be careful here not to race with + * ht_begin/end_intr(), which also complicates trying to do this initialization + * from a cross-call; hence the slightly odd approach below. + */ +void +ht_init(void) +{ + cpu_t *scp = CPU; + cpu_t *cp = scp; + ulong_t flags; + + if (!ht_exclusion) + return; + + mutex_enter(&cpu_lock); + + do { + thread_affinity_set(curthread, cp->cpu_id); + flags = intr_clear(); + + cp->cpu_m.mcpu_ht.ch_intr_depth = 0; + cp->cpu_m.mcpu_ht.ch_state = CS_MK(CM_THREAD, GLOBAL_ZONEID); + cp->cpu_m.mcpu_ht.ch_sibstate = CS_MK(CM_THREAD, GLOBAL_ZONEID); + ASSERT3P(cp->cpu_m.mcpu_ht.ch_sib, ==, NULL); + cp->cpu_m.mcpu_ht.ch_sib = ht_find_sibling(cp); + + intr_restore(flags); + thread_affinity_clear(curthread); + } while ((cp = cp->cpu_next_onln) != scp); + + mutex_exit(&cpu_lock); +} + +/* + * If our sibling is also a VCPU thread from a different zone, we need one of + * them to give up, otherwise they will just battle each other for exclusion + * until they exhaust their quantum. + * + * We arbitrate between them by dispatch priority: clearly, a higher-priority + * thread deserves to win the acquisition. However, under CPU load, it'll be + * very common to see both threads with ->t_pri == 1. If so, we'll break the + * tie by cpu_id (which is hopefully arbitrary enough). + * + * If we lose, the VMM code will take this as a hint to call + * thread_affinity_set(CPU_BEST), which will likely migrate the VCPU thread + * somewhere else. + * + * Note that all of this state examination is racy, as we don't own any locks + * here. + */ +static boolean_t +yield_to_vcpu(cpu_t *sib, zoneid_t zoneid) +{ + cpu_ht_t *sibht = &sib->cpu_m.mcpu_ht; + uint64_t sibstate = sibht->ch_state; + + /* + * If we're likely just waiting for an interrupt, don't yield. + */ + if (sibht->ch_intr_depth != 0) + return (B_FALSE); + + /* + * We're only interested in VCPUs from a different zone. 
+ */ + if (CS_MARK(sibstate) < CM_VCPU || CS_ZONE(sibstate) == zoneid) + return (B_FALSE); + + if (curthread->t_pri < sib->cpu_dispatch_pri) + return (B_TRUE); + + if (curthread->t_pri == sib->cpu_dispatch_pri && + CPU->cpu_id < sib->cpu_id) + return (B_TRUE); + + return (B_FALSE); +} + +static inline boolean_t +sibling_compatible(cpu_ht_t *sibht, zoneid_t zoneid) +{ + uint64_t sibstate = sibht->ch_state; + + if (sibht->ch_intr_depth != 0) + return (B_FALSE); + + if (CS_MARK(sibstate) == CM_UNSAFE) + return (B_FALSE); + + if (CS_MARK(sibstate) == CM_IDLE) + return (B_TRUE); + + return (CS_ZONE(sibstate) == zoneid); +} + +int +ht_acquire(void) +{ + clock_t wait = ht_acquire_wait_time; + cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht; + zoneid_t zoneid = getzoneid(); + cpu_ht_t *sibht; + int ret = 0; + + ASSERT(!interrupts_enabled()); + + if (ht->ch_sib == NULL) { + /* For the "sequential" L1TF case. */ + spec_l1d_flush(); + return (1); + } + + sibht = &ht->ch_sib->cpu_m.mcpu_ht; + + /* A VCPU thread should never change zone. */ + ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid); + ASSERT3U(CS_MARK(ht->ch_state), ==, CM_VCPU); + ASSERT3U(zoneid, !=, GLOBAL_ZONEID); + ASSERT3U(curthread->t_preempt, >=, 1); + ASSERT(curthread->t_schedflag & TS_VCPU); + + while (ret == 0 && wait > 0) { + + if (yield_to_vcpu(ht->ch_sib, zoneid)) { + ret = -1; + break; + } + + if (sibling_compatible(sibht, zoneid)) { + lock_set(&sibht->ch_lock); + + if (sibling_compatible(sibht, zoneid)) { + ht->ch_state = CS_MK(CM_POISONED, zoneid); + sibht->ch_sibstate = CS_MK(CM_POISONED, zoneid); + membar_enter(); + ret = 1; + } + + lock_clear(&sibht->ch_lock); + } else { + drv_usecwait(10); + wait -= 10; + } + } + + DTRACE_PROBE4(ht__acquire, int, ret, uint64_t, sibht->ch_state, + uint64_t, sibht->ch_intr_depth, clock_t, wait); + + if (ret == 1) + spec_l1d_flush(); + + return (ret); +} + +void +ht_release(void) +{ + cpu_ht_t *ht = &CPU->cpu_m.mcpu_ht; + zoneid_t zoneid = getzoneid(); + cpu_ht_t *sibht; + + ASSERT(!interrupts_enabled()); + + if (ht->ch_sib == NULL) + return; + + ASSERT3U(zoneid, !=, GLOBAL_ZONEID); + ASSERT3U(CS_ZONE(ht->ch_state), ==, zoneid); + ASSERT3U(CS_MARK(ht->ch_state), ==, CM_POISONED); + ASSERT3U(curthread->t_preempt, >=, 1); + + sibht = &ht->ch_sib->cpu_m.mcpu_ht; + + lock_set(&sibht->ch_lock); + + ht->ch_state = CS_MK(CM_VCPU, zoneid); + sibht->ch_sibstate = CS_MK(CM_VCPU, zoneid); + membar_producer(); + + lock_clear(&sibht->ch_lock); +} + +static void +ht_kick(cpu_ht_t *ht, zoneid_t zoneid) +{ + uint64_t sibstate; + + ASSERT(LOCK_HELD(&ht->ch_lock)); + ASSERT(!interrupts_enabled()); + + poke_cpu(ht->ch_sib->cpu_id); + + for (;;) { + membar_consumer(); + sibstate = ht->ch_sibstate; + + if (CS_MARK(sibstate) != CM_POISONED || + CS_ZONE(sibstate) == zoneid) + return; + + lock_clear(&ht->ch_lock); + + for (;;) { + membar_consumer(); + sibstate = ht->ch_sibstate; + + if (CS_MARK(sibstate) != CM_POISONED || + CS_ZONE(sibstate) == zoneid) { + lock_set(&ht->ch_lock); + return; + } + + SMT_PAUSE(); + } + + lock_set(&ht->ch_lock); + } +} + +/* + * FIXME: do we need a callback in case somebody installs a handler at this PIL + * ever? 
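A note on the pil_needs_kick() test that follows: ht_kick() itself nudges the sibling with poke_cpu(), which arrives at XC_CPUPOKE_PIL and carries no handler payload, so treating that PIL as unsafe would (as I read it) make every kick provoke a kick back. Exempting it is only safe as long as nothing else registers a handler at that PIL, which is exactly what the FIXME above worries about.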
+ */ +static boolean_t +pil_needs_kick(uint_t pil) +{ + return (pil != XC_CPUPOKE_PIL); +} + +void +ht_begin_intr(uint_t pil) +{ + ulong_t flags; + cpu_ht_t *ht; + + flags = intr_clear(); + ht = &CPU->cpu_m.mcpu_ht; + + if (ht->ch_sib == NULL) { + intr_restore(flags); + return; + } + + if (atomic_inc_64_nv(&ht->ch_intr_depth) == 1 && pil_needs_kick(pil)) { + lock_set(&ht->ch_lock); + + membar_consumer(); + + if (CS_MARK(ht->ch_sibstate) == CM_POISONED) + ht_kick(ht, GLOBAL_ZONEID); + + lock_clear(&ht->ch_lock); + } + + intr_restore(flags); +} + +void +ht_end_intr(void) +{ + ulong_t flags; + cpu_ht_t *ht; + + flags = intr_clear(); + ht = &CPU->cpu_m.mcpu_ht; + + if (ht->ch_sib == NULL) { + intr_restore(flags); + return; + } + + ASSERT3U(ht->ch_intr_depth, >, 0); + atomic_dec_64(&ht->ch_intr_depth); + + intr_restore(flags); +} + +static inline boolean_t +ht_need_kick(cpu_ht_t *ht, zoneid_t zoneid) +{ + membar_consumer(); + + if (CS_MARK(ht->ch_sibstate) != CM_POISONED) + return (B_FALSE); + + if (CS_MARK(ht->ch_state) == CM_UNSAFE) + return (B_TRUE); + + return (CS_ZONE(ht->ch_sibstate) != zoneid); +} + +void +ht_mark(void) +{ + zoneid_t zoneid = getzoneid(); + kthread_t *t = curthread; + ulong_t flags; + cpu_ht_t *ht; + cpu_t *cp; + + flags = intr_clear(); + + cp = CPU; + ht = &cp->cpu_m.mcpu_ht; + + if (ht->ch_sib == NULL) { + intr_restore(flags); + return; + } + + lock_set(&ht->ch_lock); + + /* + * If we were a nested interrupt and went through the resume_from_intr() + * path, we can now be resuming to a pinning interrupt thread; in which + * case, skip marking, until we later resume to a "real" thread. + */ + if (ht->ch_intr_depth > 0) { + ASSERT3P(t->t_intr, !=, NULL); + + if (ht_need_kick(ht, zoneid)) + ht_kick(ht, zoneid); + goto out; + } + + if (t == t->t_cpu->cpu_idle_thread) { + ASSERT3U(zoneid, ==, GLOBAL_ZONEID); + ht->ch_state = CS_MK(CM_IDLE, zoneid); + } else { + uint64_t state = CM_THREAD; + + if (t->t_unsafe) + state = CM_UNSAFE; + else if (t->t_schedflag & TS_VCPU) + state = CM_VCPU; + + ht->ch_state = CS_MK(state, zoneid); + + if (ht_need_kick(ht, zoneid)) + ht_kick(ht, zoneid); + } + +out: + membar_producer(); + lock_clear(&ht->ch_lock); + intr_restore(flags); +} + +void +ht_begin_unsafe(void) +{ + curthread->t_unsafe++; + ht_mark(); +} + +void +ht_end_unsafe(void) +{ + ASSERT3U(curthread->t_unsafe, >, 0); + curthread->t_unsafe--; + ht_mark(); +} + +void +ht_mark_as_vcpu(void) +{ + thread_lock(curthread); + curthread->t_schedflag |= TS_VCPU; + ht_mark(); + thread_unlock(curthread); +} + +boolean_t +ht_should_run(kthread_t *t, cpu_t *cp) +{ + uint64_t sibstate; + cpu_t *sib; + + if (t == t->t_cpu->cpu_idle_thread) + return (B_TRUE); + + if ((sib = cp->cpu_m.mcpu_ht.ch_sib) == NULL) + return (B_TRUE); + + sibstate = sib->cpu_m.mcpu_ht.ch_state; + + if ((t->t_schedflag & TS_VCPU)) { + if (CS_MARK(sibstate) == CM_IDLE) + return (B_TRUE); + if (CS_MARK(sibstate) == CM_UNSAFE) + return (B_FALSE); + return (CS_ZONE(sibstate) == ttozone(t)->zone_id); + } + + if (CS_MARK(sibstate) < CM_VCPU) + return (B_TRUE); + + return (CS_ZONE(sibstate) == ttozone(t)->zone_id); +} + +pri_t +ht_adjust_cpu_score(kthread_t *t, struct cpu *cp, pri_t score) +{ + cpu_t *sib; + + if (ht_should_run(t, cp)) + return (score); + + /* + * If we're a VCPU thread scoring our current CPU, we are most likely + * asking to be rescheduled elsewhere after losing ht_acquire(). In + * this case, the current CPU is not a good choice, most likely, and we + * should go elsewhere. 
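The score < 0 test in the code that follows reads against cpu_score() from earlier in the patch: when a VCPU thread scores its own CPU, the base score is 2 * CPU_IDLE_PRI, since its own dispatch priority is deliberately ignored, so a negative score means nothing else (disp_maxrunpri, cpu_chosen_level) is making this CPU look busy. In exactly that case, where the thread would otherwise happily stay put after losing ht_acquire(), the (v.v_maxsyspri + 1) * 2 return makes the current CPU look worse than any real priority, forcing the migration that CPU_BEST asked for.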
+ */ + if ((t->t_schedflag & TS_VCPU) && cp == t->t_cpu && score < 0) + return ((v.v_maxsyspri + 1) * 2); + + return (score + 1); +} diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index 29fa78109c..0634df1a94 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -466,25 +466,22 @@ #include <sys/ontrap.h> #include <sys/x86_archext.h> #include <sys/promif.h> +#include <sys/ht.h> #include <vm/hat_i86.h> #if defined(__xpv) #include <sys/hypervisor.h> #endif -#if defined(__amd64) && !defined(__xpv) -/* If this fails, then the padding numbers in machcpuvar.h are wrong. */ -CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad)) < - MMU_PAGESIZE); -CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti)) >= - MMU_PAGESIZE); -CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg)) < - 2 * MMU_PAGESIZE); -CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad2)) < - 2 * MMU_PAGESIZE); +/* If these fail, then the padding numbers in machcpuvar.h are wrong. */ +#if !defined(__xpv) +#define MCOFF(member) \ + (offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, member)) +CTASSERT(MCOFF(mcpu_pad) == MACHCPU_SIZE); +CTASSERT(MCOFF(mcpu_pad2) == MMU_PAGESIZE); +CTASSERT((MCOFF(mcpu_kpti) & 0xF) == 0); CTASSERT(((sizeof (struct kpti_frame)) & 0xF) == 0); -CTASSERT(((offsetof(cpu_t, cpu_m) + - offsetof(struct machcpu, mcpu_kpti_dbg)) & 0xF) == 0); CTASSERT((offsetof(struct kpti_frame, kf_tr_rsp) & 0xF) == 0); +CTASSERT(MCOFF(mcpu_pad3) < 2 * MMU_PAGESIZE); #endif #if defined(__xpv) && defined(DEBUG) @@ -600,6 +597,8 @@ hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp) } } + ht_begin_intr(pil); + /* * Store starting timestamp in CPU structure for this PIL. */ @@ -704,6 +703,8 @@ hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum) t->t_intr_start = now; } + ht_end_intr(); + mcpu->mcpu_pri = oldpil; (void) (*setlvlx)(oldpil, vecnum); @@ -766,6 +767,8 @@ intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil) it->t_state = TS_ONPROC; cpu->cpu_thread = it; /* new curthread on this cpu */ + ht_begin_intr(pil); + it->t_pil = (uchar_t)pil; it->t_pri = intr_pri + (pri_t)pil; it->t_intr_start = now; @@ -856,6 +859,7 @@ intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil) mcpu->mcpu_pri = pil; (*setlvlx)(pil, vec); t->t_intr_start = now; + ht_end_intr(); cpu->cpu_thread = t; } @@ -1043,6 +1047,7 @@ top: it->t_intr = t; cpu->cpu_thread = it; + ht_begin_intr(pil); /* * Set bit for this pil in CPU's interrupt active bitmask. @@ -1103,7 +1108,9 @@ dosoftint_epilog(struct cpu *cpu, uint_t oldpil) it->t_link = cpu->cpu_intr_thread; cpu->cpu_intr_thread = it; it->t_state = TS_FREE; + ht_end_intr(); cpu->cpu_thread = t; + if (t->t_flag & T_INTR_THREAD) t->t_intr_start = now; basespl = cpu->cpu_base_spl; diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 0e3cbbe243..e6ea573d0b 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ -21,7 +21,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright 2017 Joyent, Inc. +# Copyright 2018 Joyent, Inc. 
# # uts/i86pc/sys/Makefile # @@ -44,8 +44,9 @@ CHKHDRS= \ clock.h \ cram.h \ ddi_subrdefs.h \ - debug_info.h \ + debug_info.h \ fastboot.h \ + ht.h \ mach_mmu.h \ machclock.h \ machcpuvar.h \ diff --git a/usr/src/uts/i86pc/sys/ht.h b/usr/src/uts/i86pc/sys/ht.h new file mode 100644 index 0000000000..6b1bfcdd2b --- /dev/null +++ b/usr/src/uts/i86pc/sys/ht.h @@ -0,0 +1,46 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _SYS_HT_H +#define _SYS_HT_H + +#include <sys/types.h> +#include <sys/thread.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct cpu; + +extern void ht_init(void); + +extern int ht_acquire(void); +extern void ht_release(void); +extern void ht_mark(void); +extern void ht_begin_unsafe(void); +extern void ht_end_unsafe(void); +extern void ht_begin_intr(uint_t); +extern void ht_end_intr(void); +extern void ht_mark_as_vcpu(void); + +extern boolean_t ht_should_run(kthread_t *, struct cpu *); +extern pri_t ht_adjust_cpu_score(kthread_t *, struct cpu *, pri_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_HT_H */ diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index 98873cd26c..3d652316a4 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -140,6 +140,15 @@ struct kpti_frame { uint64_t kf_upper_redzone; }; +typedef struct cpu_ht { + lock_t ch_lock; + char ch_pad[56]; + struct cpu *ch_sib; + volatile uint64_t ch_intr_depth; + volatile uint64_t ch_state; + volatile uint64_t ch_sibstate; +} cpu_ht_t; + /* * This first value, MACHCPU_SIZE is the size of all the members in the cpu_t * AND struct machcpu, before we get to the mcpu_pad and the kpti area. @@ -147,9 +156,9 @@ struct kpti_frame { * page-tables, and hence must be page-aligned and page-sized. See * hat_pcp_setup(). * - * There is a CTASSERT in os/intr.c that checks these numbers. + * There are CTASSERTs in os/intr.c that verify this all works out. 
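On the layout being asserted here: cpu_ht_t pads ch_lock out to 64 bytes (the CTASSERT on offsetof(cpu_ht_t, ch_sib) back in ht.c pins this), so the lock, which the sibling takes in ht_acquire()/ht_release(), sits on a different cache line from the state words that ht_should_run() polls locklessly. MACHCPU_SIZE then grows to absorb the new mcpu_ht member, while mcpu_pad keeps the KPTI area page-aligned as the comment describes.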
*/ -#define MACHCPU_SIZE (572 + 1584) +#define MACHCPU_SIZE (1568 + 688) #define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE) #define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame)) @@ -227,6 +236,8 @@ struct machcpu { */ volatile uint32_t mcpu_istamp; + cpu_ht_t mcpu_ht; + char mcpu_pad[MACHCPU_PAD]; /* This is the start of the page */ diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index bd8126cc0d..c200a5eb33 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -587,6 +587,9 @@ enum vm_exitcode { VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif VM_EXITCODE_MAX }; diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index a576b2f0a8..b4a78cc841 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -61,6 +61,7 @@ CORE_OBJS += \ hment.o \ hold_page.o \ hrtimers.o \ + ht.o \ htable.o \ i86_mmu.o \ ibft.o \ @@ -110,7 +111,7 @@ CORE_OBJS += $(SMBIOS_OBJS) # # These get compiled twice: -# - once in the dboot (direct boot) identity mapped code +# - once in the dboot (direct boot) identity mapped code # - once for use during early startup in unix # BOOT_DRIVER_OBJS = \ @@ -161,7 +162,7 @@ SPECIAL_OBJS_64 += \ locore.o \ fast_trap_asm.o \ interrupt.o \ - syscall_asm_amd64.o \ + syscall_asm_amd64.o \ kpti_trampolines.o SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS)) @@ -234,7 +235,7 @@ INC_PATH += -I$(UTSBASE)/i86xpv -I$(UTSBASE)/i86pc -I$(SRC)/common \ # since only C headers are included when #defined(__lint) is true. # -ASSYM_DEPS += \ +ASSYM_DEPS += \ copy.o \ desctbls_asm.o \ ddi_i86_asm.o \ diff --git a/usr/src/uts/intel/ia32/ml/copy.s b/usr/src/uts/intel/ia32/ml/copy.s index 95b7cb3028..f76a8a43cb 100644 --- a/usr/src/uts/intel/ia32/ml/copy.s +++ b/usr/src/uts/intel/ia32/ml/copy.s @@ -36,7 +36,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2017 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ #include <sys/errno.h> @@ -866,8 +866,8 @@ bcopy_patch_start: bcopy_patch_end: .p2align 4 - .globl bcopy_ck_size -bcopy_ck_size: + ALTENTRY(bcopy_ck_size) + cmpq $BCOPY_DFLT_REP, %rdx jae L(use_rep) @@ -956,6 +956,7 @@ L(use_rep): jnz L(do_remainder) ret #undef L + SET_SIZE(bcopy_ck_size) #ifdef DEBUG /* diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s index 6fc38cfbe8..c2c9fd9bd2 100644 --- a/usr/src/uts/intel/ia32/ml/swtch.s +++ b/usr/src/uts/intel/ia32/ml/swtch.s @@ -31,14 +31,6 @@ * Process switching routines. */ -#if defined(__lint) -#include <sys/thread.h> -#include <sys/systm.h> -#include <sys/time.h> -#else /* __lint */ -#include "assym.h" -#endif /* __lint */ - #include <sys/asm_linkage.h> #include <sys/asm_misc.h> #include <sys/regset.h> @@ -47,6 +39,9 @@ #include <sys/segments.h> #include <sys/psw.h> +#if !defined(__lint) +#include "assym.h" + /* * resume(thread_id_t t); * @@ -74,16 +69,10 @@ * off the stack. */ -#if !defined(__lint) - #if LWP_PCB_FPU != 0 #error LWP_PCB_FPU MUST be defined as 0 for code in swtch.s to work #endif /* LWP_PCB_FPU != 0 */ -#endif /* !__lint */ - -#if defined(__amd64) - /* * Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15) * @@ -153,88 +142,6 @@ jnz 0b; \ 1: -#elif defined (__i386) - -/* - * Save non-volatile registers (%ebp, %esi, %edi and %ebx) - * - * The stack frame must be created before the save of %esp so that tracebacks - * of swtch()ed-out processes show the process as having last called swtch(). 
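The MACHCPU_SIZE change above is the bookkeeping for the new mcpu_ht member: the constant counts every cpu_t and struct machcpu member ahead of mcpu_pad, and the pad then fills the rest of the first page so the KPTI area that follows stays page-aligned. Assuming the x86 base page size of 4096 bytes for MMU_PAGESIZE, the new member total is 1568 + 688 = 2256 bytes, leaving a 1840-byte pad. A compile-time restatement of the invariant (example names, not the kernel's):

	#define EX_MMU_PAGESIZE	4096			/* assumed x86 base page */
	#define EX_MACHCPU_SIZE	(1568 + 688)		/* 2256: members before the pad */
	#define EX_MACHCPU_PAD	(EX_MMU_PAGESIZE - EX_MACHCPU_SIZE)	/* 1840 */

	/* The members must fit in one page, or the pad size goes negative. */
	typedef char ex_machcpu_fits[EX_MACHCPU_SIZE < EX_MMU_PAGESIZE ? 1 : -1];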
- */ -#define SAVE_REGS(thread_t, retaddr) \ - movl %ebp, T_EBP(thread_t); \ - movl %ebx, T_EBX(thread_t); \ - movl %esi, T_ESI(thread_t); \ - movl %edi, T_EDI(thread_t); \ - pushl %ebp; \ - movl %esp, %ebp; \ - movl %esp, T_SP(thread_t); \ - movl retaddr, T_PC(thread_t); \ - movl 8(%ebp), %edi; \ - pushl %edi; \ - call __dtrace_probe___sched_off__cpu; \ - addl $CLONGSIZE, %esp - -/* - * Restore non-volatile registers (%ebp, %esi, %edi and %ebx) - * - * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t - * already has the effect of putting the stack back the way it was when - * we came in. - */ -#define RESTORE_REGS(scratch_reg) \ - movl %gs:CPU_THREAD, scratch_reg; \ - movl T_EBP(scratch_reg), %ebp; \ - movl T_EBX(scratch_reg), %ebx; \ - movl T_ESI(scratch_reg), %esi; \ - movl T_EDI(scratch_reg), %edi - -/* - * Get pointer to a thread's hat structure - */ -#define GET_THREAD_HATP(hatp, thread_t, scratch_reg) \ - movl T_PROCP(thread_t), hatp; \ - movl P_AS(hatp), scratch_reg; \ - movl A_HAT(scratch_reg), hatp - -/* - * If we are resuming an interrupt thread, store a timestamp in the thread - * structure. If an interrupt occurs between tsc_read() and its subsequent - * store, the timestamp will be stale by the time it is stored. We can detect - * this by doing a compare-and-swap on the thread's timestamp, since any - * interrupt occurring in this window will put a new timestamp in the thread's - * t_intr_start field. - */ -#define STORE_INTR_START(thread_t) \ - testw $T_INTR_THREAD, T_FLAGS(thread_t); \ - jz 1f; \ - pushl %ecx; \ -0: \ - pushl T_INTR_START(thread_t); \ - pushl T_INTR_START+4(thread_t); \ - call tsc_read; \ - movl %eax, %ebx; \ - movl %edx, %ecx; \ - popl %edx; \ - popl %eax; \ - cmpxchg8b T_INTR_START(thread_t); \ - jnz 0b; \ - popl %ecx; \ -1: - -#endif /* __amd64 */ - -#if defined(__lint) - -/* ARGSUSED */ -void -resume(kthread_t *t) -{} - -#else /* __lint */ - -#if defined(__amd64) - .global kpti_enable ENTRY(resume) @@ -436,6 +343,8 @@ resume(kthread_t *t) call smap_disable .nosmap: + call ht_mark + /* * Restore non-volatile registers, then have spl0 return to the * resuming thread's PC after first setting the priority as low as @@ -456,203 +365,6 @@ resume_return: SET_SIZE(_resume_from_idle) SET_SIZE(resume) -#elif defined (__i386) - - ENTRY(resume) - movl %gs:CPU_THREAD, %eax - movl $resume_return, %ecx - - /* - * Save non-volatile registers, and set return address for current - * thread to resume_return. - * - * %edi = t (new thread) when done. - */ - SAVE_REGS(%eax, %ecx) - - LOADCPU(%ebx) /* %ebx = CPU */ - movl CPU_THREAD(%ebx), %esi /* %esi = curthread */ - -#ifdef DEBUG - call assert_ints_enabled /* panics if we are cli'd */ -#endif - /* - * Call savectx if thread has installed context ops. - * - * Note that if we have floating point context, the save op - * (either fpsave_begin or fpxsave_begin) will issue the - * async save instruction (fnsave or fxsave respectively) - * that we fwait for below. - */ - movl T_CTX(%esi), %eax /* should current thread savectx? */ - testl %eax, %eax - jz .nosavectx /* skip call when zero */ - pushl %esi /* arg = thread pointer */ - call savectx /* call ctx ops */ - addl $4, %esp /* restore stack pointer */ -.nosavectx: - - /* - * Call savepctx if process has installed context ops. - */ - movl T_PROCP(%esi), %eax /* %eax = proc */ - cmpl $0, P_PCTX(%eax) /* should current thread savectx? 
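The STORE_INTR_START macro deleted here for i386 (and kept for amd64) implements the timestamp race fix its comment describes: read the TSC, then compare-and-swap it into t_intr_start, so a stale store is detected and retried if an interrupt wrote a newer value in the window. A C analogue of that loop, as a sketch only (tsc_read() is assumed to return the cycle counter; the kernel does this in assembly with cmpxchg):

	#include <stdatomic.h>
	#include <stdint.h>

	extern uint64_t tsc_read(void);		/* assumed: read the TSC */

	static void
	example_store_intr_start(_Atomic uint64_t *t_intr_start)
	{
		uint64_t old, now;

		do {
			old = atomic_load(t_intr_start);
			now = tsc_read();
			/* Fails, and retries, if an interrupt wrote meanwhile. */
		} while (!atomic_compare_exchange_weak(t_intr_start, &old, now));
	}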
*/ - je .nosavepctx /* skip call when zero */ - pushl %eax /* arg = proc pointer */ - call savepctx /* call ctx ops */ - addl $4, %esp -.nosavepctx: - - /* - * Temporarily switch to the idle thread's stack - */ - movl CPU_IDLE_THREAD(%ebx), %eax /* idle thread pointer */ - - /* - * Set the idle thread as the current thread - */ - movl T_SP(%eax), %esp /* It is safe to set esp */ - movl %eax, CPU_THREAD(%ebx) - - /* switch in the hat context for the new thread */ - GET_THREAD_HATP(%ecx, %edi, %ecx) - pushl %ecx - call hat_switch - addl $4, %esp - - /* - * Clear and unlock previous thread's t_lock - * to allow it to be dispatched by another processor. - */ - movb $0, T_LOCK(%esi) - - /* - * IMPORTANT: Registers at this point must be: - * %edi = new thread - * - * Here we are in the idle thread, have dropped the old thread. - */ - ALTENTRY(_resume_from_idle) - /* - * spin until dispatched thread's mutex has - * been unlocked. this mutex is unlocked when - * it becomes safe for the thread to run. - */ -.L4: - lock - btsl $0, T_LOCK(%edi) /* lock new thread's mutex */ - jc .L4_2 /* lock did not succeed */ - - /* - * Fix CPU structure to indicate new running thread. - * Set pointer in new thread to the CPU structure. - */ - LOADCPU(%esi) /* load current CPU pointer */ - movl T_STACK(%edi), %eax /* here to use v pipeline of */ - /* Pentium. Used few lines below */ - cmpl %esi, T_CPU(%edi) - jne .L5_2 -.L5_1: - /* - * Setup esp0 (kernel stack) in TSS to curthread's stack. If this - * thread doesn't have a regs structure above the stack -- that is, if - * lwp_stk_init() was never called for the thread -- this will set - * esp0 to the wrong value, but it's harmless as it's a kernel thread, - * and it won't actually attempt to implicitly use the esp0 via a - * privilege change. - */ - movl CPU_TSS(%esi), %ecx - addl $REGSIZE+MINFRAME, %eax /* to the bottom of thread stack */ -#if !defined(__xpv) - movl %eax, TSS_ESP0(%ecx) -#else - pushl %eax - pushl $KDS_SEL - call HYPERVISOR_stack_switch - addl $8, %esp -#endif /* __xpv */ - - movl %edi, CPU_THREAD(%esi) /* set CPU's thread pointer */ - mfence /* synchronize with mutex_exit() */ - xorl %ebp, %ebp /* make $<threadlist behave better */ - movl T_LWP(%edi), %eax /* set associated lwp to */ - movl %eax, CPU_LWP(%esi) /* CPU's lwp ptr */ - - movl T_SP(%edi), %esp /* switch to outgoing thread's stack */ - movl T_PC(%edi), %esi /* saved return addr */ - - /* - * Call restorectx if context ops have been installed. - */ - movl T_CTX(%edi), %eax /* should resumed thread restorectx? */ - testl %eax, %eax - jz .norestorectx /* skip call when zero */ - pushl %edi /* arg = thread pointer */ - call restorectx /* call ctx ops */ - addl $4, %esp /* restore stack pointer */ -.norestorectx: - - /* - * Call restorepctx if context ops have been installed for the proc. - */ - movl T_PROCP(%edi), %eax - cmpl $0, P_PCTX(%eax) - je .norestorepctx - pushl %eax /* arg = proc pointer */ - call restorepctx - addl $4, %esp /* restore stack pointer */ -.norestorepctx: - - STORE_INTR_START(%edi) - - /* - * Restore non-volatile registers, then have spl0 return to the - * resuming thread's PC after first setting the priority as low as - * possible and blocking all interrupt threads that may be active. 
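The .L4/.L4_2 spin in the deleted i386 resume code (the amd64 version retains the same structure) is a classic test-and-set lock acquire with pause-based backoff: attempt the atomic bit set, and on failure spin reading the byte until it clears before trying again. In C the shape is roughly the following sketch, using the GCC/clang pause builtin; this is not the kernel's thread_lock code:

	#include <stdatomic.h>

	static void
	example_lock_spin(_Atomic unsigned char *t_lock)
	{
		while (atomic_exchange(t_lock, 1) != 0) {
			do {
				__builtin_ia32_pause();	/* the "pause" insn */
			} while (atomic_load(t_lock) != 0);
		}
	}

Spinning on the plain load keeps the cache line shared between attempts, so the locked exchange is only retried once the owner has actually dropped the lock.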
- */ - movl %esi, %eax /* save return address */ - RESTORE_REGS(%ecx) - pushl %eax /* push return address for spl0() */ - call __dtrace_probe___sched_on__cpu - jmp spl0 - -resume_return: - /* - * Remove stack frame created in SAVE_REGS() - */ - addl $CLONGSIZE, %esp - ret - -.L4_2: - pause - cmpb $0, T_LOCK(%edi) - je .L4 - jmp .L4_2 - -.L5_2: - /* cp->cpu_stats.sys.cpumigrate++ */ - addl $1, CPU_STATS_SYS_CPUMIGRATE(%esi) - adcl $0, CPU_STATS_SYS_CPUMIGRATE+4(%esi) - movl %esi, T_CPU(%edi) /* set new thread's CPU pointer */ - jmp .L5_1 - - SET_SIZE(_resume_from_idle) - SET_SIZE(resume) - -#endif /* __amd64 */ -#endif /* __lint */ - -#if defined(__lint) - -/* ARGSUSED */ -void -resume_from_zombie(kthread_t *t) -{} - -#else /* __lint */ - -#if defined(__amd64) - ENTRY(resume_from_zombie) movq %gs:CPU_THREAD, %rax leaq resume_from_zombie_return(%rip), %r11 @@ -727,88 +439,6 @@ resume_from_zombie_return: ret SET_SIZE(resume_from_zombie) -#elif defined (__i386) - - ENTRY(resume_from_zombie) - movl %gs:CPU_THREAD, %eax - movl $resume_from_zombie_return, %ecx - - /* - * Save non-volatile registers, and set return address for current - * thread to resume_from_zombie_return. - * - * %edi = t (new thread) when done. - */ - SAVE_REGS(%eax, %ecx) - -#ifdef DEBUG - call assert_ints_enabled /* panics if we are cli'd */ -#endif - movl %gs:CPU_THREAD, %esi /* %esi = curthread */ - - /* clean up the fp unit. It might be left enabled */ - - movl %cr0, %eax - testl $CR0_TS, %eax - jnz .zfpu_disabled /* if TS already set, nothing to do */ - fninit /* init fpu & discard pending error */ - orl $CR0_TS, %eax - movl %eax, %cr0 -.zfpu_disabled: - - /* - * Temporarily switch to the idle thread's stack so that the zombie - * thread's stack can be reclaimed by the reaper. - */ - movl %gs:CPU_IDLE_THREAD, %eax /* idle thread pointer */ - movl T_SP(%eax), %esp /* get onto idle thread stack */ - - /* - * Set the idle thread as the current thread. - */ - movl %eax, %gs:CPU_THREAD - - /* - * switch in the hat context for the new thread - */ - GET_THREAD_HATP(%ecx, %edi, %ecx) - pushl %ecx - call hat_switch - addl $4, %esp - - /* - * Put the zombie on death-row. - */ - pushl %esi - call reapq_add - addl $4, %esp - jmp _resume_from_idle /* finish job of resume */ - -resume_from_zombie_return: - RESTORE_REGS(%ecx) /* restore non-volatile registers */ - call __dtrace_probe___sched_on__cpu - - /* - * Remove stack frame created in SAVE_REGS() - */ - addl $CLONGSIZE, %esp - ret - SET_SIZE(resume_from_zombie) - -#endif /* __amd64 */ -#endif /* __lint */ - -#if defined(__lint) - -/* ARGSUSED */ -void -resume_from_intr(kthread_t *t) -{} - -#else /* __lint */ - -#if defined(__amd64) - ENTRY(resume_from_intr) movq %gs:CPU_THREAD, %rax leaq resume_from_intr_return(%rip), %r11 @@ -835,6 +465,8 @@ resume_from_intr(kthread_t *t) STORE_INTR_START(%r12) + call ht_mark + /* * Restore non-volatile registers, then have spl0 return to the * resuming thread's PC after first setting the priority as low as @@ -854,69 +486,6 @@ resume_from_intr_return: ret SET_SIZE(resume_from_intr) -#elif defined (__i386) - - ENTRY(resume_from_intr) - movl %gs:CPU_THREAD, %eax - movl $resume_from_intr_return, %ecx - - /* - * Save non-volatile registers, and set return address for current - * thread to resume_return. - * - * %edi = t (new thread) when done. 
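Note the new "call ht_mark" in resume_from_intr() above, matching the one added to resume() earlier: every path that puts a thread back on-CPU refreshes the HT state before restoring registers (resume_from_zombie() reaches the same point via _resume_from_idle). As pseudocode in C, the tail of these paths looks like the sketch below; every helper is a stand-in for the assembly in this file, not a real kernel function:

	extern void store_intr_start(void *);	/* STORE_INTR_START */
	extern void ht_mark(void);
	extern void restore_regs(void *);	/* RESTORE_REGS */
	extern void spl0(void);

	static void
	example_switch_tail(void *incoming)
	{
		store_intr_start(incoming);	/* timestamp, if intr thread */
		ht_mark();			/* record what this CPU now runs */
		restore_regs(incoming);
		spl0();				/* lower priority; thread runs */
	}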
- */ - SAVE_REGS(%eax, %ecx) - -#ifdef DEBUG - call assert_ints_enabled /* panics if we are cli'd */ -#endif - movl %gs:CPU_THREAD, %esi /* %esi = curthread */ - movl %edi, %gs:CPU_THREAD /* set CPU's thread pointer */ - mfence /* synchronize with mutex_exit() */ - movl T_SP(%edi), %esp /* restore resuming thread's sp */ - xorl %ebp, %ebp /* make $<threadlist behave better */ - - /* - * Unlock outgoing thread's mutex dispatched by another processor. - */ - xorl %eax,%eax - xchgb %al, T_LOCK(%esi) - - STORE_INTR_START(%edi) - - /* - * Restore non-volatile registers, then have spl0 return to the - * resuming thread's PC after first setting the priority as low as - * possible and blocking all interrupt threads that may be active. - */ - movl T_PC(%edi), %eax /* saved return addr */ - RESTORE_REGS(%ecx) - pushl %eax /* push return address for spl0() */ - call __dtrace_probe___sched_on__cpu - jmp spl0 - -resume_from_intr_return: - /* - * Remove stack frame created in SAVE_REGS() - */ - addl $CLONGSIZE, %esp - ret - SET_SIZE(resume_from_intr) - -#endif /* __amd64 */ -#endif /* __lint */ - -#if defined(__lint) - -void -thread_start(void) -{} - -#else /* __lint */ - -#if defined(__amd64) - ENTRY(thread_start) popq %rax /* start() */ popq %rdi /* arg */ @@ -927,36 +496,6 @@ thread_start(void) /*NOTREACHED*/ SET_SIZE(thread_start) -#elif defined(__i386) - - ENTRY(thread_start) - popl %eax - movl %esp, %ebp - addl $8, %ebp - call *%eax - addl $8, %esp - call thread_exit /* destroy thread if it returns. */ - /*NOTREACHED*/ - SET_SIZE(thread_start) - -#endif /* __i386 */ - -#endif /* __lint */ - -#if defined(__lint) - -void -thread_splitstack_run(caddr_t stack, void (*func)(void *), void *arg) -{} - -void -thread_splitstack_cleanup(void) -{} - -#else /* __lint */ - -#if defined(__amd64) - ENTRY(thread_splitstack_run) pushq %rbp /* push base pointer */ movq %rsp, %rbp /* construct frame */ @@ -995,34 +534,4 @@ thread_splitstack_cleanup(void) ret SET_SIZE(thread_splitstack_cleanup) -#elif defined(__i386) - - ENTRY(thread_splitstack_run) - pushl %ebp /* push base pointer */ - movl %esp, %ebp /* construct frame */ - movl 8(%ebp), %esp /* set stack pointer */ - movl 12(%ebp), %eax /* load func */ - movl 16(%ebp), %edx /* load arg */ - pushl %edx /* push arg */ - call *%eax /* call specifed function */ - addl $4, %esp /* restore stack pointer */ - leave /* pop base pointer */ - ret - SET_SIZE(thread_splitstack_run) - - /* - * See comment in the amd64 code, above. - */ - ENTRY(thread_splitstack_cleanup) - LOADCPU(%eax) - movl CPU_TSS(%eax), %ecx - movl CPU_THREAD(%eax), %edx - movl T_STACK(%edx), %edx - addl $REGSIZE+MINFRAME, %edx - movl %edx, TSS_ESP0(%ecx) - ret - SET_SIZE(thread_splitstack_cleanup) - -#endif /* __i386 */ - -#endif /* __lint */ +#endif /* !__lint */ diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index c6d696dc6e..fb6b6f0fdb 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -254,6 +254,7 @@ extern "C" { #define CPUID_INTC_EDX_7_0_AVX5124FMAPS 0x00000008 /* AVX512 4FMAPS */ #define CPUID_INTC_EDX_7_0_SPEC_CTRL 0x04000000 /* Spec, IBPB, IBRS */ #define CPUID_INTC_EDX_7_0_STIBP 0x08000000 /* STIBP */ +#define CPUID_INTC_EDX_7_0_FLUSH_CMD 0x10000000 /* IA32_FLUSH_CMD */ #define CPUID_INTC_EDX_7_0_ARCH_CAPS 0x20000000 /* IA32_ARCH_CAPS */ #define CPUID_INTC_EDX_7_0_SSBD 0x80000000 /* SSBD */ @@ -362,11 +363,12 @@ extern "C" { /* * Intel IA32_ARCH_CAPABILITIES MSR. 
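The new CPUID_INTC_EDX_7_0_FLUSH_CMD bit above (leaf 7, subleaf 0, EDX bit 28) is how a CPU advertises support for the IA32_FLUSH_CMD MSR defined in the next hunk. A user-level detection sketch, using GCC's <cpuid.h> helper purely for illustration (the kernel has its own cpuid plumbing):

	#include <cpuid.h>

	#define CPUID_INTC_EDX_7_0_FLUSH_CMD	0x10000000	/* EDX bit 28 */

	/* Nonzero when the CPU advertises the IA32_FLUSH_CMD MSR. */
	static int
	has_flush_cmd(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
			return (0);
		return ((edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) != 0);
	}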
*/ -#define MSR_IA32_ARCH_CAPABILITIES 0x10a -#define IA32_ARCH_CAP_RDCL_NO 0x0001 -#define IA32_ARCH_CAP_IBRS_ALL 0x0002 -#define IA32_ARCH_CAP_RSBA 0x0004 -#define IA32_ARCH_CAP_SSB_NO 0x0010 +#define MSR_IA32_ARCH_CAPABILITIES 0x10a +#define IA32_ARCH_CAP_RDCL_NO 0x0001 +#define IA32_ARCH_CAP_IBRS_ALL 0x0002 +#define IA32_ARCH_CAP_RSBA 0x0004 +#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x0008 +#define IA32_ARCH_CAP_SSB_NO 0x0010 /* * Intel Speculation related MSRs @@ -379,6 +381,9 @@ extern "C" { #define MSR_IA32_PRED_CMD 0x49 #define IA32_PRED_CMD_IBPB 0x01 +#define MSR_IA32_FLUSH_CMD 0x10b +#define IA32_FLUSH_CMD_L1D 0x01 + #define MCI_CTL_VALUE 0xffffffff #define MTRR_TYPE_UC 0 @@ -491,6 +496,8 @@ extern "C" { #define X86FSET_RSBA 78 #define X86FSET_SSB_NO 79 #define X86FSET_STIBP_ALL 80 +#define X86FSET_FLUSH_CMD 81 +#define X86FSET_L1D_VM_NO 82 /* * Intel Deep C-State invariant TSC in leaf 0x80000007. @@ -773,7 +780,7 @@ extern "C" { #if defined(_KERNEL) || defined(_KMEMUSER) -#define NUM_X86_FEATURES 81 +#define NUM_X86_FEATURES 83 extern uchar_t x86_featureset[]; extern void free_x86_featureset(void *featureset); @@ -792,6 +799,8 @@ extern uint_t pentiumpro_bug4046376; extern const char CyrixInstead[]; +extern void (*spec_l1d_flush)(void); + #endif #if defined(_KERNEL) diff --git a/usr/src/uts/intel/vnd/Makefile b/usr/src/uts/intel/vnd/Makefile index fc94398b99..b94d014eb7 100644 --- a/usr/src/uts/intel/vnd/Makefile +++ b/usr/src/uts/intel/vnd/Makefile @@ -10,7 +10,7 @@ # # -# Copyright 2017 Joyent, Inc. +# Copyright 2018 Joyent, Inc. # UTSBASE = ../.. @@ -29,7 +29,8 @@ LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) CONF_SRCDIR = $(UTSBASE)/common/io/vnd -LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue +CPPFLAGS += -I$(UTSBASE)/i86pc +LDFLAGS += -dy -Nmisc/neti -Nmisch/hook -Nfs/dev -Nmisc/gsqueue # # We use <sys/ctype.h> which causes gcc to think that all of its inline diff --git a/usr/src/uts/intel/zfs/Makefile b/usr/src/uts/intel/zfs/Makefile index a4a2f4a561..07d4395c22 100644 --- a/usr/src/uts/intel/zfs/Makefile +++ b/usr/src/uts/intel/zfs/Makefile @@ -29,6 +29,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. # +# Copyright 2018 Joyent, Inc. # # Path to the base of the uts directory tree (usually /usr/src/uts). @@ -72,6 +73,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua INC_PATH += -I$(SRC)/common INC_PATH += -I$(COMMONBASE)/zfs +CPPFLAGS += -I$(UTSBASE)/i86pc C99LMODE= -Xc99=%all # diff --git a/usr/src/uts/sparc/zfs/Makefile b/usr/src/uts/sparc/zfs/Makefile index f32b408306..617d495325 100644 --- a/usr/src/uts/sparc/zfs/Makefile +++ b/usr/src/uts/sparc/zfs/Makefile @@ -29,6 +29,8 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. # +# Copyright 2018 Joyent, Inc. +# # # Path to the base of the uts directory tree (usually /usr/src/uts). @@ -74,6 +76,7 @@ INC_PATH += -I$(UTSBASE)/common/fs/zfs INC_PATH += -I$(UTSBASE)/common/fs/zfs/lua INC_PATH += -I$(SRC)/common INC_PATH += -I$(COMMONBASE)/zfs +INC_PATH += -I$(UTSBASE)/sun4 C99LMODE= -Xc99=%all diff --git a/usr/src/uts/sun4/sys/ht.h b/usr/src/uts/sun4/sys/ht.h new file mode 100644 index 0000000000..831891979f --- /dev/null +++ b/usr/src/uts/sun4/sys/ht.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
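The hunks above add the two halves of the L1D flush machinery: MSR_IA32_FLUSH_CMD (0x10b) with its IA32_FLUSH_CMD_L1D bit, and an exported spec_l1d_flush function pointer. The pointer suggests a flush routine is installed only on hardware that supports it, so callers can invoke it unconditionally; a hedged sketch of that arrangement, where wrmsr() stands in for the kernel's MSR-write primitive and none of the glue below is taken from the actual boot code:

	#include <stdint.h>

	#define EX_MSR_IA32_FLUSH_CMD	0x10b
	#define EX_IA32_FLUSH_CMD_L1D	0x01

	extern void wrmsr(uint32_t msr, uint64_t value);	/* assumed */

	void (*spec_l1d_flush)(void) = 0;

	static void
	l1d_flush_msr(void)
	{
		/* Ask the CPU to flush the L1 data cache, e.g. before VM entry. */
		wrmsr(EX_MSR_IA32_FLUSH_CMD, EX_IA32_FLUSH_CMD_L1D);
	}

	void
	example_install_flush(int cpu_has_flush_cmd)
	{
		if (cpu_has_flush_cmd)
			spec_l1d_flush = l1d_flush_msr;
	}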
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _SYS_HT_H +#define _SYS_HT_H + +#include <sys/types.h> +#include <sys/thread.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ht_init() {} + +#define ht_should_run(t, c) (B_TRUE) +#define ht_adjust_cpu_score(t, c, p) (p) +#define ht_mark_safe(void) {} +#define ht_mark_unsafe(void) {} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_HT_H */ diff --git a/usr/src/uts/sun4u/sys/Makefile b/usr/src/uts/sun4u/sys/Makefile index 8e73425995..a69a2b14f1 100644 --- a/usr/src/uts/sun4u/sys/Makefile +++ b/usr/src/uts/sun4u/sys/Makefile @@ -21,7 +21,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# uts/sun4u/sys/Makefile +# Copyright 2018 Joyent, Inc. # UTSBASE = ../.. @@ -40,18 +40,19 @@ SUN4_HDRS= \ clock.h \ cmp.h \ cpc_ultra.h \ - cpu_sgnblk_defs.h \ + cpu_sgnblk_defs.h \ ddi_subrdefs.h \ dvma.h \ eeprom.h \ errclassify.h \ fcode.h \ fc_plat.h \ + ht.h \ idprom.h \ intr.h \ intreg.h \ ivintr.h \ - memlist_plat.h \ + memlist_plat.h \ memnode.h \ nexusdebug.h \ prom_debug.h \ diff --git a/usr/src/uts/sun4v/sys/Makefile b/usr/src/uts/sun4v/sys/Makefile index 2af0d8841b..6c0fbd666c 100644 --- a/usr/src/uts/sun4v/sys/Makefile +++ b/usr/src/uts/sun4v/sys/Makefile @@ -22,8 +22,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# -# uts/sun4v/sys/Makefile +# Copyright 2018 Joyent, Inc. # # include global definitions UTSBASE = ../.. @@ -42,16 +41,17 @@ SUN4_HDRS= \ clock.h \ cmp.h \ cpc_ultra.h \ - cpu_sgnblk_defs.h \ + cpu_sgnblk_defs.h \ ddi_subrdefs.h \ dvma.h \ eeprom.h \ fcode.h \ + ht.h \ idprom.h \ intr.h \ intreg.h \ ivintr.h \ - memlist_plat.h \ + memlist_plat.h \ memnode.h \ nexusdebug.h \ prom_debug.h \ |