author     John Levon <john.levon@joyent.com>        2018-01-22 22:05:38 +0000
committer  Dan McDonald <danmcd@joyent.com>          2018-04-10 10:37:19 -0400
commit     74ecdb5171c9f3673b9393b1a3dc6f3a65e93895 (patch)
tree       3c16ae772368de5b6eec80945340deb2b38d91d0 /usr/src/uts/i86pc/ml
parent     1d9a8ab82e5abe86cb1e43c502f88c7c655658fd (diff)
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Gordon Ross <gwr@nexenta.com>
Diffstat (limited to 'usr/src/uts/i86pc/ml')
-rw-r--r--  usr/src/uts/i86pc/ml/fb_swtch_src.s       |   5
-rw-r--r--  usr/src/uts/i86pc/ml/genassym.c           |  23
-rw-r--r--  usr/src/uts/i86pc/ml/kdi_subr.s           | 160
-rw-r--r--  usr/src/uts/i86pc/ml/kpti_trampolines.s   | 743
-rw-r--r--  usr/src/uts/i86pc/ml/locore.s             |  10
-rw-r--r--  usr/src/uts/i86pc/ml/mpcore.s             |   6
-rw-r--r--  usr/src/uts/i86pc/ml/offsets.in           |  40
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm_amd64.s  | 103
8 files changed, 873 insertions, 217 deletions
diff --git a/usr/src/uts/i86pc/ml/fb_swtch_src.s b/usr/src/uts/i86pc/ml/fb_swtch_src.s
index e67837ee2b..4d1789fc9b 100644
--- a/usr/src/uts/i86pc/ml/fb_swtch_src.s
+++ b/usr/src/uts/i86pc/ml/fb_swtch_src.s
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
@@ -52,6 +53,9 @@ int fb_swtch_silence_lint = 0;
#define DISABLE_PAGING \
+ movl %cr4, %eax ;\
+ btrl $17, %eax /* clear PCIDE bit */ ;\
+ movl %eax, %cr4 ;\
movl %cr0, %eax ;\
btrl $31, %eax /* clear PG bit */ ;\
movl %eax, %cr0
@@ -222,6 +226,7 @@ _start:
* Disable long mode by:
* - shutting down paging (bit 31 of cr0). This will flush the
* TLBs.
+ * - turning off PCID in cr4
* - disabling LME (long mode enable) in EFER (extended feature reg)
*/
#endif
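
A note on the ordering in the DISABLE_PAGING change above: on x86, a mov to
%cr0 that clears CR0.PG while CR4.PCIDE is still set raises #GP, which is why
the new PCIDE clear has to come first. A minimal C sketch of the same
sequence (read_cr0()/write_cr4() and friends are hypothetical accessors, not
illumos functions):

    #include <stdint.h>

    /* Hypothetical privileged-register accessors, for illustration only. */
    extern uint64_t read_cr0(void), read_cr4(void);
    extern void write_cr0(uint64_t), write_cr4(uint64_t);

    #define CR4_PCIDE (1ULL << 17)  /* PCID enable */
    #define CR0_PG    (1ULL << 31)  /* paging enable */

    static void
    disable_paging(void)
    {
        /* PCIDE must go first: clearing PG with PCIDE=1 is itself a #GP. */
        write_cr4(read_cr4() & ~CR4_PCIDE);
        write_cr0(read_cr0() & ~CR0_PG);
    }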
diff --git a/usr/src/uts/i86pc/ml/genassym.c b/usr/src/uts/i86pc/ml/genassym.c
index 088dd661a3..6d840368d7 100644
--- a/usr/src/uts/i86pc/ml/genassym.c
+++ b/usr/src/uts/i86pc/ml/genassym.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _GENASSYM
@@ -68,8 +70,6 @@ extern void exit(int);
int
main(int argc, char *argv[])
{
- printf("#define\tT_AST 0x%x\n", T_AST);
-
printf("#define\tLOCK_LEVEL 0x%x\n", LOCK_LEVEL);
printf("#define\tCLOCK_LEVEL 0x%x\n", CLOCK_LEVEL);
printf("#define\tDISP_LEVEL 0x%x\n", DISP_LEVEL);
@@ -109,20 +109,6 @@ main(int argc, char *argv[])
printf("#define\tSSE_MXCSR_EFLAGS 0x%x\n", SSE_MXCSR_EFLAGS);
- printf("#define\tFP_487 0x%x\n", FP_487);
- printf("#define\tFP_486 0x%x\n", FP_486);
- printf("#define\tFPU_CW_INIT 0x%x\n", FPU_CW_INIT);
- printf("#define\tFPU_EN 0x%x\n", FPU_EN);
- printf("#define\tFPU_VALID 0x%x\n", FPU_VALID);
-
- printf("#define\tFP_NO 0x%x\n", FP_NO);
- printf("#define\tFP_SW 0x%x\n", FP_SW);
- printf("#define\tFP_HW 0x%x\n", FP_HW);
- printf("#define\tFP_287 0x%x\n", FP_287);
- printf("#define\tFP_387 0x%x\n", FP_387);
- printf("#define\t__FP_SSE 0x%x\n", __FP_SSE);
-
- printf("#define\tFP_FNSAVE 0x%x\n", FP_FNSAVE);
printf("#define\tFP_FXSAVE 0x%x\n", FP_FXSAVE);
printf("#define\tFP_XSAVE 0x%x\n", FP_XSAVE);
@@ -154,11 +140,6 @@ main(int argc, char *argv[])
printf("#define\tNSEC_PER_COUNTER_TICK 0x%llx\n", NANOSEC / PIT_HZ);
- printf("#define\tPITCTR0_PORT 0x%x\n", PITCTR0_PORT);
- printf("#define\tPITCTL_PORT 0x%x\n", PITCTL_PORT);
- printf("#define\tPIT_COUNTDOWN 0x%x\n",
- PIT_C0 | PIT_LOADMODE | PIT_NDIVMODE);
-
printf("#define\tNBPW 0x%x\n", NBPW);
printf("#define\tDDI_ACCATTR_IO_SPACE 0x%x\n", DDI_ACCATTR_IO_SPACE);
diff --git a/usr/src/uts/i86pc/ml/kdi_subr.s b/usr/src/uts/i86pc/ml/kdi_subr.s
deleted file mode 100644
index 8ed90ed410..0000000000
--- a/usr/src/uts/i86pc/ml/kdi_subr.s
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/asm_linkage.h>
-#include <sys/asm_misc.h>
-#include <sys/regset.h>
-#include <sys/privregs.h>
-#include <sys/psw.h>
-
-#if defined(__lint)
-#include <sys/types.h>
-#include <sys/segments.h>
-#endif
-
-#if defined(__lint)
-
-ulong_t
-kdi_getdr0(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr1(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr2(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr3(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr6(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr7(void)
-{
- return (0);
-}
-
-/*ARGSUSED*/
-void
-kdi_setdr0(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr1(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr2(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr3(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr4(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr6(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr7(ulong_t value)
-{}
-
-#else
-
-#if defined(__amd64)
-
-#define GETDREG(name, r) \
- ENTRY_NP(name); \
- movq r, %rax; \
- ret; \
- SET_SIZE(name)
-
-#define SETDREG(name, r) \
- ENTRY_NP(name); \
- movq %rdi, r; \
- ret; \
- SET_SIZE(name)
-
-#elif defined(__i386)
-
-#define GETDREG(name, r) \
- ENTRY_NP(name); \
- movl r, %eax; \
- ret; \
- SET_SIZE(name)
-
-#define SETDREG(name, r) \
- ENTRY_NP(name); \
- movl 4(%esp), %eax; \
- movl %eax, r; \
- ret; \
- SET_SIZE(name)
-
-#endif
-
- GETDREG(kdi_getdr0, %dr0)
- GETDREG(kdi_getdr1, %dr1)
- GETDREG(kdi_getdr2, %dr2)
- GETDREG(kdi_getdr3, %dr3)
- GETDREG(kdi_getdr6, %dr6)
- GETDREG(kdi_getdr7, %dr7)
-
- SETDREG(kdi_setdr0, %dr0)
- SETDREG(kdi_setdr1, %dr1)
- SETDREG(kdi_setdr2, %dr2)
- SETDREG(kdi_setdr3, %dr3)
- SETDREG(kdi_setdr6, %dr6)
- SETDREG(kdi_setdr7, %dr7)
-
-#endif /* __lint */
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
new file mode 100644
index 0000000000..d50e964e62
--- /dev/null
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -0,0 +1,743 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * This file contains the trampolines that are used by KPTI in order to be
+ * able to take interrupts/trap/etc while on the "user" page table.
+ *
+ * We don't map the full kernel text into the user page table: instead we
+ * map this one small section of trampolines (which compiles to ~13 pages).
+ * These trampolines are set in the IDT always (so they will run no matter
+ * whether we're on the kernel or user page table), and their primary job is to
+ * pivot us to the kernel %cr3 and %rsp without ruining everything.
+ *
+ * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
+ * meaning that they will execute with their %rsp set to a known location, even
+ * if we take them in the kernel.
+ *
+ * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
+ * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
+ * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
+ * page-aligned, and we map the page it's on into both page tables. Using a
+ * struct attached to the cpu_t also means that we can use %rsp-relative
+ * addressing to find anything on the cpu_t, so we don't have to touch %gs or
+ * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
+ *
+ * This little struct is where the CPU will push the actual interrupt frame.
+ * Then, in the trampoline, we change %cr3, then figure out our destination
+ * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
+ * frame). Then we jump to the regular ISR in the kernel text and carry on as
+ * normal.
+ *
+ * We leave the original frame and any spilled regs behind in the kpti_frame
+ * lazily until we want to return to userland. Then, we clear any spilled
+ * regs from it, and overwrite the rest with our iret frame. When switching
+ * this cpu to a different process (in hat_switch), we bzero the whole region to
+ * make sure nothing can leak between processes.
+ *
+ * When we later return to the place where we took the interrupt (especially
+ * if it was in userland), we have to jmp back to the "return trampolines"
+ * here, since when we set %cr3 back to the user value, we need to be
+ * executing from code in these shared pages and not the main kernel
+ * text again. Even though it should be fine to iret directly from kernel text
+ * when returning to kernel code, we make things jmp to a trampoline here just
+ * for consistency.
+ *
+ * Note that with IST, it's very important that we have always pivoted away
+ * from the IST stack before we could possibly take any other interrupt on
+ * the same IST (unless it's an end-of-the-world fault and we don't care
+ * about coming back from it ever).
+ *
+ * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
+ * regularly have to happen from within trampoline code (e.g. in the sysenter
+ * single-step case) and then return to the world normally. As a result, these
+ * two are IST'd to their own kpti_frame right above the normal one (in the same
+ * page), so they don't clobber their parent interrupt.
+ *
+ * To aid with debugging, we also IST the page fault (#PF/pftrap), general
+ * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
+ * their own separate kpti_frame. This ensures that if we take one of these
+ * due to a bug in trampoline code, we preserve the original trampoline
+ * state that caused the trap.
+ *
+ * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
+ * stacks, since they can interrupt another ISR at any time. These stacks are
+ * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
+ * their trampolines (and do it unconditionally), and don't bother pivoting
+ * away. We're either going into the panic() path, or we're going to return
+ * straight away without rescheduling, so it's fine to not be on our real
+ * kthread stack (and some of the state we want to go find it with might be
+ * corrupt!)
+ *
+ * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
+ * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
+ * point at the PML4 for kas early in boot and never touch it again. Hopefully
+ * it survives whatever corruption brings down the rest of the kernel!
+ *
+ * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
+ * cases) in that they do not push an interrupt frame (and also have some other
+ * effects). In the syscall trampolines, we assume that we can only be taking
+ * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
+ * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
+ * existing %rsp pivot untouched) -- instead we spill registers into
+ * %gs:CPU_KPTI_* as we need to.
+ *
+ * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
+ * hat_switch().
+ */
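
As a reader's aid, here is a rough C sketch of the kpti_frame this comment
keeps referring to. Field order follows the offsets.in hunk later in this
patch; the authoritative definition (including red zones and padding) is
struct kpti_frame in machcpuvar.h:

    #include <stdint.h>

    /* Sketch only -- see machcpuvar.h for the real definition. */
    struct kpti_frame_sketch {
        uint64_t kf_r14;         /* %r14 spill slot */
        uint64_t kf_r13;         /* %r13 spill slot */
        uint64_t kf_err;         /* bottom of the CPU-pushed frame */
        uint64_t kf_rip;
        uint64_t kf_cs;
        uint64_t kf_rflags;
        uint64_t kf_rsp;
        uint64_t kf_ss;
        uint64_t kf_tr_rsp;      /* the IST points here; the CPU pushes
                                  * the frame downward from this point */
        uint64_t kf_kernel_cr3;  /* this CPU's kernel %cr3 */
        uint64_t kf_user_cr3;    /* the current process's user %cr3 */
        uint64_t kf_tr_ret_rsp;  /* scratch stack for the return pivot */
        uint64_t kf_tr_cr3;      /* %cr3 observed at trampoline entry */
        uint64_t kf_tr_flag;     /* reentrancy flag (DBG trampolines) */
    };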
+
+/*
+ * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
+ * fix bugs here check to see if they should be fixed there as well.
+ */
+
+#include <sys/asm_linkage.h>
+#include <sys/asm_misc.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/psw.h>
+#include <sys/machbrand.h>
+#include <sys/param.h>
+
+#if defined(__lint)
+
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+
+#else /* __lint */
+
+#include <sys/segments.h>
+#include <sys/pcb.h>
+#include <sys/trap.h>
+#include <sys/ftrace.h>
+#include <sys/traptrace.h>
+#include <sys/clock.h>
+#include <sys/model.h>
+#include <sys/panic.h>
+
+#if defined(__xpv)
+#include <sys/hypervisor.h>
+#endif
+
+#include "assym.h"
+
+ .data
+ DGDEF3(kpti_enable, 8, 8)
+ .fill 1, 8, 1
+
+.section ".text";
+.align MMU_PAGESIZE
+
+.global kpti_tramp_start
+kpti_tramp_start:
+ nop
+
+/* This will be set by mlsetup, and then double-checked later */
+.global kpti_safe_cr3
+kpti_safe_cr3:
+ .quad 0
+ SET_SIZE(kpti_safe_cr3)
+
+/* startup_kmem() will overwrite this */
+.global kpti_kbase
+kpti_kbase:
+ .quad KERNELBASE
+ SET_SIZE(kpti_kbase)
+
+#define SET_KERNEL_CR3(spillreg) \
+ mov %cr3, spillreg; \
+ mov spillreg, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_KCR3, spillreg; \
+ cmp $0, spillreg; \
+ je 2f; \
+ mov spillreg, %cr3; \
+2:
+
+#if DEBUG
+#define SET_USER_CR3(spillreg) \
+ mov %cr3, spillreg; \
+ mov spillreg, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_UCR3, spillreg; \
+ mov spillreg, %cr3
+#else
+#define SET_USER_CR3(spillreg) \
+ mov %gs:CPU_KPTI_UCR3, spillreg; \
+ mov spillreg, %cr3
+#endif
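
In C terms, the two macros above behave roughly as follows (a sketch reusing
the kpti_frame_sketch struct from earlier; read_cr3()/write_cr3() are
hypothetical helpers). Note the kernel-side switch is conditional -- a zero
kf_kernel_cr3 means KPTI isn't active on this CPU -- while the user-side
switch is unconditional, since it only happens on the way back to userland:

    extern uint64_t read_cr3(void);
    extern void write_cr3(uint64_t);

    static void
    set_kernel_cr3(struct kpti_frame_sketch *kf)
    {
        kf->kf_tr_cr3 = read_cr3();      /* record the entry %cr3 */
        if (kf->kf_kernel_cr3 != 0)      /* 0: no user page table in use */
            write_cr3(kf->kf_kernel_cr3);
    }

    static void
    set_user_cr3(struct kpti_frame_sketch *kf)
    {
        kf->kf_tr_cr3 = read_cr3();      /* recorded on DEBUG builds only */
        write_cr3(kf->kf_user_cr3);
    }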
+
+#define PIVOT_KPTI_STK(spillreg) \
+ mov %rsp, spillreg; \
+ mov %gs:CPU_KPTI_RET_RSP, %rsp; \
+ pushq T_FRAMERET_SS(spillreg); \
+ pushq T_FRAMERET_RSP(spillreg); \
+ pushq T_FRAMERET_RFLAGS(spillreg); \
+ pushq T_FRAMERET_CS(spillreg); \
+ pushq T_FRAMERET_RIP(spillreg)
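
PIVOT_KPTI_STK copies the five-word iret frame from wherever it currently
lives onto the per-CPU return stack, so that the eventual iretq executes from
memory that stays mapped across the %cr3 switch. Roughly, in C (switch_rsp()
stands in for the actual %rsp move):

    extern void switch_rsp(uint64_t *sp);   /* hypothetical %rsp switch */

    /* oldframe[0..4] = rip, cs, rflags, rsp, ss, in ascending memory
     * order -- i.e. a hardware iret frame. */
    static void
    pivot_kpti_stk(struct kpti_frame_sketch *kf, const uint64_t *oldframe)
    {
        uint64_t *sp = (uint64_t *)(uintptr_t)kf->kf_tr_ret_rsp;

        *--sp = oldframe[4];    /* ss */
        *--sp = oldframe[3];    /* rsp */
        *--sp = oldframe[2];    /* rflags */
        *--sp = oldframe[1];    /* cs */
        *--sp = oldframe[0];    /* rip */
        switch_rsp(sp);         /* continue on the return stack */
    }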
+
+
+#define INTERRUPT_TRAMPOLINE_P(errpush) \
+ pushq %r13; \
+ pushq %r14; \
+ subq $KPTI_R14, %rsp; \
+ /* Save current %cr3. */ \
+ mov %cr3, %r14; \
+ mov %r14, KPTI_TR_CR3(%rsp); \
+ \
+ cmpw $KCS_SEL, KPTI_CS(%rsp); \
+ je 3f; \
+1: \
+ /* Change to the "kernel" %cr3 */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ cmp $0, %r14; \
+ je 2f; \
+ mov %r14, %cr3; \
+2: \
+ /* Get our cpu_t in %r13 */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ subq $CPU_KPTI_START, %r13; \
+ /* Use top of the kthread stk */ \
+ mov CPU_THREAD(%r13), %r14; \
+ mov T_STACK(%r14), %r14; \
+ addq $REGSIZE+MINFRAME, %r14; \
+ jmp 4f; \
+3: \
+ /* Check the %rsp in the frame. */ \
+ /* Is it above kernel base? */ \
+ mov kpti_kbase, %r14; \
+ cmp %r14, KPTI_RSP(%rsp); \
+ jb 1b; \
+ /* Use the %rsp from the trap frame */ \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~0xf), %r14; \
+4: \
+ mov %rsp, %r13; \
+ /* %r14 contains our destination stk */ \
+ mov %r14, %rsp; \
+ pushq KPTI_SS(%r13); \
+ pushq KPTI_RSP(%r13); \
+ pushq KPTI_RFLAGS(%r13); \
+ pushq KPTI_CS(%r13); \
+ pushq KPTI_RIP(%r13); \
+ errpush; \
+ mov KPTI_R14(%r13), %r14; \
+ mov KPTI_R13(%r13), %r13
+
+#define INTERRUPT_TRAMPOLINE_NOERR \
+ INTERRUPT_TRAMPOLINE_P(/**/)
+
+#define INTERRUPT_TRAMPOLINE \
+ INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
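
The heart of INTERRUPT_TRAMPOLINE is the choice of destination stack, reduced
here to C. This is a sketch: the constants carry illustrative values (not the
real segments.h/machparam.h ones), and the real code locates the cpu_t by
masking %rsp down to the kpti_frame's page base rather than taking it as an
argument:

    #define KCS_SEL_ILLUS  0x30          /* illustrative kernel %cs value */
    #define STACK_RESERVE  (17 * 8)      /* illustrative REGSIZE+MINFRAME */
    extern uint64_t kpti_kbase;          /* lowest kernel address */

    typedef struct { char *t_stack; } kthread_sketch_t;
    typedef struct { kthread_sketch_t *cpu_thread; } cpu_sketch_t;

    static uint64_t
    choose_dest_stack(struct kpti_frame_sketch *kf, cpu_sketch_t *cpu)
    {
        kf->kf_tr_cr3 = read_cr3();      /* saved for debugging */
        if (kf->kf_cs == KCS_SEL_ILLUS && kf->kf_rsp >= kpti_kbase) {
            /* Interrupted kernel code already on the kernel %cr3:
             * keep using its stack, re-aligned to 16 bytes. */
            return (kf->kf_rsp & ~(uint64_t)0xf);
        }
        /* From userland (or with a suspect %rsp): switch page tables
         * and start at the top of the current kthread's stack. */
        if (kf->kf_kernel_cr3 != 0)
            write_cr3(kf->kf_kernel_cr3);
        return ((uint64_t)(uintptr_t)cpu->cpu_thread->t_stack +
            STACK_RESERVE);
    }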
+
+/*
+ * This is used for all interrupts that can plausibly be taken inside another
+ * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
+ *
+ * We check for whether we took the interrupt while in another trampoline, in
+ * which case we need to use the kthread stack.
+ */
+#define DBG_INTERRUPT_TRAMPOLINE_P(errpush) \
+ pushq %r13; \
+ pushq %r14; \
+ subq $KPTI_R14, %rsp; \
+ /* Check for clobbering */ \
+ cmp $0, KPTI_FLAG(%rsp); \
+ je 1f; \
+ /* Don't worry, this totally works */ \
+ int $8; \
+1: \
+ movq $1, KPTI_FLAG(%rsp); \
+ /* Save current %cr3. */ \
+ mov %cr3, %r14; \
+ mov %r14, KPTI_TR_CR3(%rsp); \
+ \
+ cmpw $KCS_SEL, KPTI_CS(%rsp); \
+ je 4f; \
+2: \
+ /* Change to the "kernel" %cr3 */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ cmp $0, %r14; \
+ je 3f; \
+ mov %r14, %cr3; \
+3: \
+ /* Get our cpu_t in %r13 */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ subq $CPU_KPTI_START, %r13; \
+ /* Use top of the kthread stk */ \
+ mov CPU_THREAD(%r13), %r14; \
+ mov T_STACK(%r14), %r14; \
+ addq $REGSIZE+MINFRAME, %r14; \
+ jmp 6f; \
+4: \
+ /* Check the %rsp in the frame. */ \
+ /* Is it above kernel base? */ \
+ /* If not, treat as user. */ \
+ mov kpti_kbase, %r14; \
+ cmp %r14, KPTI_RSP(%rsp); \
+ jb 2b; \
+ /* Is it within the kpti_frame page? */ \
+ /* If it is, treat as user interrupt */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~(MMU_PAGESIZE - 1)), %r14; \
+ cmp %r13, %r14; \
+ je 2b; \
+ /* Were we in trampoline code? */ \
+ leaq kpti_tramp_start, %r14; \
+ cmp %r14, KPTI_RIP(%rsp); \
+ jb 5f; \
+ leaq kpti_tramp_end, %r14; \
+ cmp %r14, KPTI_RIP(%rsp); \
+ ja 5f; \
+ /* If we were, change %cr3: we might */ \
+ /* have interrupted before it did. */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ mov %r14, %cr3; \
+5: \
+ /* Use the %rsp from the trap frame */ \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~0xf), %r14; \
+6: \
+ mov %rsp, %r13; \
+ /* %r14 contains our destination stk */ \
+ mov %r14, %rsp; \
+ pushq KPTI_SS(%r13); \
+ pushq KPTI_RSP(%r13); \
+ pushq KPTI_RFLAGS(%r13); \
+ pushq KPTI_CS(%r13); \
+ pushq KPTI_RIP(%r13); \
+ errpush; \
+ mov KPTI_R14(%r13), %r14; \
+ movq $0, KPTI_FLAG(%r13); \
+ mov KPTI_R13(%r13), %r13
+
+#define DBG_INTERRUPT_TRAMPOLINE_NOERR \
+ DBG_INTERRUPT_TRAMPOLINE_P(/**/)
+
+#define DBG_INTERRUPT_TRAMPOLINE \
+ DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
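
Beyond that classification, the DBG variant adds two safeguards: a reentrancy
flag so that clobbering a live frame becomes a deliberate double fault, and a
check for whether we interrupted trampoline code that had not yet switched
%cr3. A simplified C sketch of just those two additions:

    extern char kpti_tramp_start[], kpti_tramp_end[];  /* section bounds */
    extern void emergency_double_fault(void);  /* stands in for "int $8" */

    static void
    dbg_tramp_checks(struct kpti_frame_sketch *kf)
    {
        if (kf->kf_tr_flag != 0)         /* frame already in use? */
            emergency_double_fault();
        kf->kf_tr_flag = 1;

        if (kf->kf_rip >= (uint64_t)(uintptr_t)kpti_tramp_start &&
            kf->kf_rip <= (uint64_t)(uintptr_t)kpti_tramp_end) {
            /* We interrupted another trampoline, possibly before it
             * switched %cr3 -- so switch it now, unconditionally. */
            write_cr3(kf->kf_kernel_cr3);
        }
        /* kf_tr_flag is cleared again as the frame is unwound. */
    }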
+
+ /*
+ * These labels (_start and _end) are used by trap.c to determine if
+ * we took an interrupt like an NMI during the return process.
+ */
+.global tr_sysc_ret_start
+tr_sysc_ret_start:
+
+ /*
+ * Syscall return trampolines.
+ *
+ * These are expected to be called on the kernel %gs. tr_sysret[ql] are
+ * called after %rsp is changed back to the user value, so we have no
+ * stack to work with. tr_sysexit has a kernel stack (but has to
+ * preserve rflags, soooo).
+ */
+ ENTRY_NP(tr_sysretq)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+1:
+ swapgs
+ sysretq
+ SET_SIZE(tr_sysretq)
+
+ ENTRY_NP(tr_sysretl)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+1:
+ SWAPGS
+ SYSRETL
+ SET_SIZE(tr_sysretl)
+
+ ENTRY_NP(tr_sysexit)
+ /*
+ * Note: we want to preserve RFLAGS across this branch, since sysexit
+ * (unlike sysret above) does not restore RFLAGS for us.
+ *
+ * We still have the real kernel stack (sysexit does restore that), so
+ * we can use pushfq/popfq.
+ */
+ pushfq
+
+ cmpq $1, kpti_enable
+ jne 1f
+
+ /* Have to pop it back off now before we change %cr3! */
+ popfq
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+ jmp 2f
+1:
+ popfq
+2:
+ swapgs
+ sti
+ sysexit
+ SET_SIZE(tr_sysexit)
+
+.global tr_sysc_ret_end
+tr_sysc_ret_end:
+
+ /*
+ * Syscall entry trampolines.
+ */
+
+#if DEBUG
+#define MK_SYSCALL_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ mov %cr3, %r13; \
+ mov %r13, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_KCR3, %r13; \
+ mov %r13, %cr3; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ swapgs; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+#else
+#define MK_SYSCALL_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ mov %gs:CPU_KPTI_KCR3, %r13; \
+ mov %r13, %cr3; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ swapgs; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+#endif
+
+ MK_SYSCALL_TRAMPOLINE(sys_syscall)
+ MK_SYSCALL_TRAMPOLINE(sys_syscall32)
+ MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
+ MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
+
+ /*
+ * SYSENTER is special. The CPU is really not very helpful when it
+ * comes to preserving and restoring state with it, and as a result
+ * we have to do all of it by hand. So, since we want to preserve
+ * RFLAGS, we have to be very careful in these trampolines to not
+ * clobber any bits in it. That means no cmpqs or branches!
+ */
+ ENTRY_NP(tr_sys_sysenter)
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+#if DEBUG
+ mov %cr3, %r13
+ mov %r13, %gs:CPU_KPTI_TR_CR3
+#endif
+ mov %gs:CPU_KPTI_KCR3, %r13
+ mov %r13, %cr3
+ mov %gs:CPU_KPTI_R13, %r13
+ jmp _sys_sysenter_post_swapgs
+ SET_SIZE(tr_sys_sysenter)
+
+ ENTRY_NP(tr_brand_sys_sysenter)
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+#if DEBUG
+ mov %cr3, %r13
+ mov %r13, %gs:CPU_KPTI_TR_CR3
+#endif
+ mov %gs:CPU_KPTI_KCR3, %r13
+ mov %r13, %cr3
+ mov %gs:CPU_KPTI_R13, %r13
+ jmp _brand_sys_sysenter_post_swapgs
+ SET_SIZE(tr_brand_sys_sysenter)
+
+#define MK_SYSCALL_INT_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ SET_KERNEL_CR3(%r13); \
+ mov %gs:CPU_THREAD, %r13; \
+ mov T_STACK(%r13), %r13; \
+ addq $REGSIZE+MINFRAME, %r13; \
+ mov %r13, %rsp; \
+ pushq %gs:CPU_KPTI_SS; \
+ pushq %gs:CPU_KPTI_RSP; \
+ pushq %gs:CPU_KPTI_RFLAGS; \
+ pushq %gs:CPU_KPTI_CS; \
+ pushq %gs:CPU_KPTI_RIP; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ SWAPGS; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
+ MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
+
+ /*
+ * Interrupt/trap return trampolines
+ */
+
+.global tr_intr_ret_start
+tr_intr_ret_start:
+
+ ENTRY_NP(tr_iret_auto)
+ cmpq $1, kpti_enable
+ jne tr_iret_kernel
+ cmpw $KCS_SEL, T_FRAMERET_CS(%rsp)
+ je tr_iret_kernel
+ jmp tr_iret_user
+ SET_SIZE(tr_iret_auto)
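
tr_iret_auto is only a dispatcher; its logic, in C (a sketch, reusing the
illustrative KCS_SEL_ILLUS from earlier; the real code reads %cs from the
iret frame at %rsp):

    extern long kpti_enable;
    extern void tr_iret_kernel(void), tr_iret_user(void);

    static void
    tr_iret_auto_sketch(uint16_t frame_cs)
    {
        if (kpti_enable != 1 || frame_cs == KCS_SEL_ILLUS)
            tr_iret_kernel();   /* plain iretq */
        else
            tr_iret_user();     /* pivot stack, user %cr3, then iretq */
    }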
+
+ ENTRY_NP(tr_iret_kernel)
+ /*
+	 * Yes, this does nothing extra. But this way, if we ever see a bare
+	 * iret elsewhere, we know we've failed to properly consider
+	 * trampolines there.
+ */
+ iretq
+ SET_SIZE(tr_iret_kernel)
+
+ ENTRY_NP(tr_iret_user)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+ PIVOT_KPTI_STK(%r13)
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+ swapgs
+1:
+ iretq
+ SET_SIZE(tr_iret_user)
+
+.global tr_intr_ret_end
+tr_intr_ret_end:
+
+ /*
+ * Interrupt/trap entry trampolines
+ */
+
+ /* CPU pushed an error code, and ISR wants one */
+#define MK_INTR_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ INTERRUPT_TRAMPOLINE; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU didn't push an error code, and ISR doesn't want one */
+#define MK_INTR_TRAMPOLINE_NOERR(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ push $0; \
+ INTERRUPT_TRAMPOLINE_NOERR; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU pushed an error code, and ISR wants one */
+#define MK_DBG_INTR_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ DBG_INTERRUPT_TRAMPOLINE; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU didn't push an error code, and ISR doesn't want one */
+#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ push $0; \
+ DBG_INTERRUPT_TRAMPOLINE_NOERR; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+
+ MK_INTR_TRAMPOLINE_NOERR(div0trap)
+ MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
+ MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
+ MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
+ MK_INTR_TRAMPOLINE_NOERR(boundstrap)
+ MK_INTR_TRAMPOLINE_NOERR(invoptrap)
+ MK_INTR_TRAMPOLINE_NOERR(ndptrap)
+ MK_INTR_TRAMPOLINE(invtsstrap)
+ MK_INTR_TRAMPOLINE(segnptrap)
+ MK_DBG_INTR_TRAMPOLINE(stktrap)
+ MK_DBG_INTR_TRAMPOLINE(gptrap)
+ MK_DBG_INTR_TRAMPOLINE(pftrap)
+ MK_INTR_TRAMPOLINE_NOERR(resvtrap)
+ MK_INTR_TRAMPOLINE_NOERR(ndperr)
+ MK_INTR_TRAMPOLINE(achktrap)
+ MK_INTR_TRAMPOLINE_NOERR(xmtrap)
+ MK_INTR_TRAMPOLINE_NOERR(invaltrap)
+ MK_INTR_TRAMPOLINE_NOERR(fasttrap)
+ MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
+
+ /*
+ * These are special because they can interrupt other traps, and
+ * each other. We don't need to pivot their stacks, because they have
+ * dedicated IST stack space, but we need to change %cr3.
+ */
+ ENTRY_NP(tr_nmiint)
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp nmiint
+ SET_SIZE(tr_nmiint)
+
+#if !defined(__xpv)
+ ENTRY_NP(tr_syserrtrap)
+ /*
+ * If we got here we should always have a zero error code pushed.
+ * The INT $0x8 instr doesn't seem to push one, though, which we use
+ * as an emergency panic in the other trampolines. So adjust things
+ * here.
+ */
+ cmpq $0, (%rsp)
+ je 1f
+ pushq $0
+1:
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp syserrtrap
+ SET_SIZE(tr_syserrtrap)
+#endif
+
+ ENTRY_NP(tr_mcetrap)
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp mcetrap
+ SET_SIZE(tr_mcetrap)
+
+ /*
+ * Interrupts start at 32
+ */
+#define MKIVCT(n) \
+ ENTRY_NP(tr_ivct/**/n) \
+ push $0; \
+ INTERRUPT_TRAMPOLINE; \
+ push $n - 0x20; \
+ jmp cmnint; \
+ SET_SIZE(tr_ivct/**/n)
+
+ MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35);
+ MKIVCT(36); MKIVCT(37); MKIVCT(38); MKIVCT(39);
+ MKIVCT(40); MKIVCT(41); MKIVCT(42); MKIVCT(43);
+ MKIVCT(44); MKIVCT(45); MKIVCT(46); MKIVCT(47);
+ MKIVCT(48); MKIVCT(49); MKIVCT(50); MKIVCT(51);
+ MKIVCT(52); MKIVCT(53); MKIVCT(54); MKIVCT(55);
+ MKIVCT(56); MKIVCT(57); MKIVCT(58); MKIVCT(59);
+ MKIVCT(60); MKIVCT(61); MKIVCT(62); MKIVCT(63);
+ MKIVCT(64); MKIVCT(65); MKIVCT(66); MKIVCT(67);
+ MKIVCT(68); MKIVCT(69); MKIVCT(70); MKIVCT(71);
+ MKIVCT(72); MKIVCT(73); MKIVCT(74); MKIVCT(75);
+ MKIVCT(76); MKIVCT(77); MKIVCT(78); MKIVCT(79);
+ MKIVCT(80); MKIVCT(81); MKIVCT(82); MKIVCT(83);
+ MKIVCT(84); MKIVCT(85); MKIVCT(86); MKIVCT(87);
+ MKIVCT(88); MKIVCT(89); MKIVCT(90); MKIVCT(91);
+ MKIVCT(92); MKIVCT(93); MKIVCT(94); MKIVCT(95);
+ MKIVCT(96); MKIVCT(97); MKIVCT(98); MKIVCT(99);
+ MKIVCT(100); MKIVCT(101); MKIVCT(102); MKIVCT(103);
+ MKIVCT(104); MKIVCT(105); MKIVCT(106); MKIVCT(107);
+ MKIVCT(108); MKIVCT(109); MKIVCT(110); MKIVCT(111);
+ MKIVCT(112); MKIVCT(113); MKIVCT(114); MKIVCT(115);
+ MKIVCT(116); MKIVCT(117); MKIVCT(118); MKIVCT(119);
+ MKIVCT(120); MKIVCT(121); MKIVCT(122); MKIVCT(123);
+ MKIVCT(124); MKIVCT(125); MKIVCT(126); MKIVCT(127);
+ MKIVCT(128); MKIVCT(129); MKIVCT(130); MKIVCT(131);
+ MKIVCT(132); MKIVCT(133); MKIVCT(134); MKIVCT(135);
+ MKIVCT(136); MKIVCT(137); MKIVCT(138); MKIVCT(139);
+ MKIVCT(140); MKIVCT(141); MKIVCT(142); MKIVCT(143);
+ MKIVCT(144); MKIVCT(145); MKIVCT(146); MKIVCT(147);
+ MKIVCT(148); MKIVCT(149); MKIVCT(150); MKIVCT(151);
+ MKIVCT(152); MKIVCT(153); MKIVCT(154); MKIVCT(155);
+ MKIVCT(156); MKIVCT(157); MKIVCT(158); MKIVCT(159);
+ MKIVCT(160); MKIVCT(161); MKIVCT(162); MKIVCT(163);
+ MKIVCT(164); MKIVCT(165); MKIVCT(166); MKIVCT(167);
+ MKIVCT(168); MKIVCT(169); MKIVCT(170); MKIVCT(171);
+ MKIVCT(172); MKIVCT(173); MKIVCT(174); MKIVCT(175);
+ MKIVCT(176); MKIVCT(177); MKIVCT(178); MKIVCT(179);
+ MKIVCT(180); MKIVCT(181); MKIVCT(182); MKIVCT(183);
+ MKIVCT(184); MKIVCT(185); MKIVCT(186); MKIVCT(187);
+ MKIVCT(188); MKIVCT(189); MKIVCT(190); MKIVCT(191);
+ MKIVCT(192); MKIVCT(193); MKIVCT(194); MKIVCT(195);
+ MKIVCT(196); MKIVCT(197); MKIVCT(198); MKIVCT(199);
+ MKIVCT(200); MKIVCT(201); MKIVCT(202); MKIVCT(203);
+ MKIVCT(204); MKIVCT(205); MKIVCT(206); MKIVCT(207);
+ MKIVCT(208); MKIVCT(209); MKIVCT(210); MKIVCT(211);
+ MKIVCT(212); MKIVCT(213); MKIVCT(214); MKIVCT(215);
+ MKIVCT(216); MKIVCT(217); MKIVCT(218); MKIVCT(219);
+ MKIVCT(220); MKIVCT(221); MKIVCT(222); MKIVCT(223);
+ MKIVCT(224); MKIVCT(225); MKIVCT(226); MKIVCT(227);
+ MKIVCT(228); MKIVCT(229); MKIVCT(230); MKIVCT(231);
+ MKIVCT(232); MKIVCT(233); MKIVCT(234); MKIVCT(235);
+ MKIVCT(236); MKIVCT(237); MKIVCT(238); MKIVCT(239);
+ MKIVCT(240); MKIVCT(241); MKIVCT(242); MKIVCT(243);
+ MKIVCT(244); MKIVCT(245); MKIVCT(246); MKIVCT(247);
+ MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251);
+ MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255);
+
+ /*
+ * We're PCIDE, but we don't have INVPCID. The only way to invalidate a
+ * PCID other than the current one, then, is to load its cr3 then
+ * invlpg. But loading kf_user_cr3 means we can no longer access our
+ * caller's text mapping (or indeed, its stack). So this little helper
+ * has to live within our trampoline text region.
+ *
+ * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
+ */
+ ENTRY_NP(tr_mmu_flush_user_range)
+ push %rbx
+ /* When we read cr3, it never has the NOINVL bit set. */
+ mov %cr3, %rax
+ movq $CR3_NOINVL_BIT, %rbx
+ orq %rbx, %rax
+
+ mov %rcx, %cr3
+ add %rdi, %rsi
+.align ASM_ENTRY_ALIGN
+1:
+ invlpg (%rdi)
+ add %rdx, %rdi
+ cmp %rsi, %rdi
+ jb 1b
+ mov %rax, %cr3
+ pop %rbx
+ retq
+ SET_SIZE(tr_mmu_flush_user_range)
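
For reference, the same routine in C (a sketch; read_cr3()/write_cr3()/
invlpg() are hypothetical wrappers, and bit 63 of a %cr3 write is the
architectural "don't invalidate this PCID" bit that CR3_NOINVL_BIT names):

    #include <stddef.h>
    #include <stdint.h>

    #define CR3_NOINVL_BIT (1ULL << 63)  /* suppress PCID invalidation */

    extern uint64_t read_cr3(void);
    extern void write_cr3(uint64_t), invlpg(uintptr_t);

    static void
    mmu_flush_user_range(uintptr_t addr, size_t len, size_t pgsz,
        uint64_t user_cr3)
    {
        /* NOINVL on the saved value: switching back must not flush
         * our own PCID's TLB entries. */
        uint64_t saved = read_cr3() | CR3_NOINVL_BIT;
        uintptr_t va;

        write_cr3(user_cr3);             /* now on the user page table */
        for (va = addr; va < addr + len; va += pgsz)
            invlpg(va);                  /* flush one user page */
        write_cr3(saved);
    }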
+
+.align MMU_PAGESIZE
+.global kpti_tramp_end
+kpti_tramp_end:
+ nop
+
+#endif /* __lint */
diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s
index 042818844d..4626dd1492 100644
--- a/usr/src/uts/i86pc/ml/locore.s
+++ b/usr/src/uts/i86pc/ml/locore.s
@@ -23,7 +23,7 @@
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
- * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018 Joyent, Inc.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
@@ -1186,7 +1186,7 @@ cmntrap()
addq %rax, %r12
movq %r12, REGOFF_RIP(%rbp)
INTR_POP
- IRET
+ jmp tr_iret_auto
/*NOTREACHED*/
3:
leaq dtrace_badflags(%rip), %rdi
@@ -1599,7 +1599,7 @@ _no_pending_updates:
*/
ALTENTRY(sys_rtt_syscall32)
USER32_POP
- IRET
+ jmp tr_iret_user
/*NOTREACHED*/
ALTENTRY(sys_rtt_syscall)
@@ -1608,7 +1608,7 @@ _no_pending_updates:
*/
USER_POP
ALTENTRY(nopop_sys_rtt_syscall)
- IRET
+ jmp tr_iret_user
/*NOTREACHED*/
SET_SIZE(nopop_sys_rtt_syscall)
@@ -1623,7 +1623,7 @@ _no_pending_updates:
* Restore regs before doing iretq to kernel mode
*/
INTR_POP
- IRET
+ jmp tr_iret_kernel
.globl _sys_rtt_end
_sys_rtt_end:
/*NOTREACHED*/
diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s
index eaf70b72df..2151a14b04 100644
--- a/usr/src/uts/i86pc/ml/mpcore.s
+++ b/usr/src/uts/i86pc/ml/mpcore.s
@@ -24,6 +24,8 @@
/*
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
@@ -326,7 +328,7 @@ kernel_cs_code:
* Complete the rest of the setup and call mp_startup().
*/
movq %gs:CPU_THREAD, %rax /* get thread ptr */
- call *T_PC(%rax) /* call mp_startup */
+ call *T_PC(%rax) /* call mp_startup_boot */
/* not reached */
int $20 /* whoops, returned somehow! */
@@ -502,7 +504,7 @@ kernel_cs_code:
/*
* Before going any farther, enable usage of page table NX bit if
- * that's how our page tables are set up.
+ * that's how our page tables are set up. (PCIDE is enabled later on).
*/
bt $X86FSET_NX, x86_featureset
jnc 1f
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index d0c75653a1..0946b369d9 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -1,7 +1,7 @@
\
\ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
\ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
-\ Copyright 2016 Joyent, Inc.
+\ Copyright 2018 Joyent, Inc.
\
\ CDDL HEADER START
\
@@ -232,6 +232,44 @@ cpu
cpu_m.mcpu_vcpu_info CPU_VCPU_INFO
#endif
+cpu
+ cpu_m.mcpu_kpti.kf_kernel_cr3 CPU_KPTI_KCR3
+ cpu_m.mcpu_kpti.kf_user_cr3 CPU_KPTI_UCR3
+ cpu_m.mcpu_kpti.kf_tr_rsp CPU_KPTI_TR_RSP
+ cpu_m.mcpu_kpti.kf_tr_cr3 CPU_KPTI_TR_CR3
+ cpu_m.mcpu_kpti.kf_r13 CPU_KPTI_R13
+ cpu_m.mcpu_kpti.kf_r14 CPU_KPTI_R14
+ cpu_m.mcpu_kpti.kf_tr_ret_rsp CPU_KPTI_RET_RSP
+
+ cpu_m.mcpu_kpti.kf_ss CPU_KPTI_SS
+ cpu_m.mcpu_kpti.kf_rsp CPU_KPTI_RSP
+ cpu_m.mcpu_kpti.kf_rflags CPU_KPTI_RFLAGS
+ cpu_m.mcpu_kpti.kf_cs CPU_KPTI_CS
+ cpu_m.mcpu_kpti.kf_rip CPU_KPTI_RIP
+ cpu_m.mcpu_kpti.kf_err CPU_KPTI_ERR
+
+ cpu_m.mcpu_pad2 CPU_KPTI_START
+ cpu_m.mcpu_pad3 CPU_KPTI_END
+
+kpti_frame
+ kf_r14 KPTI_R14
+ kf_r13 KPTI_R13
+ kf_err KPTI_ERR
+ kf_rip KPTI_RIP
+ kf_cs KPTI_CS
+ kf_rflags KPTI_RFLAGS
+ kf_rsp KPTI_RSP
+ kf_ss KPTI_SS
+
+ kf_tr_rsp KPTI_TOP
+
+ kf_kernel_cr3 KPTI_KCR3
+ kf_user_cr3 KPTI_UCR3
+ kf_tr_ret_rsp KPTI_RET_RSP
+ kf_tr_cr3 KPTI_TR_CR3
+
+ kf_tr_flag KPTI_FLAG
+
standard_pic
c_curmask
c_iplmask
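
Each of these offsets.in entries becomes an assembler-visible constant in
assym.h. Conceptually it is the same thing genassym.c (also touched in this
patch) does by hand -- illustrated here with the sketch struct from earlier,
not the actual build rule:

    #include <stddef.h>
    #include <stdio.h>

    /* An offsets.in line like "kf_r14 KPTI_R14" effectively emits a
     * "#define" that the trampoline assembly can use as an offset. */
    int
    main(void)
    {
        printf("#define\tKPTI_R14 0x%lx\n",
            (unsigned long)offsetof(struct kpti_frame_sketch, kf_r14));
        return (0);
    }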
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index 9501483e5f..be6a94c61b 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -491,6 +491,20 @@ noprod_sys_syscall:
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Machine state saved in the regs structure on the stack
* First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
* %eax is the syscall number
@@ -671,8 +685,7 @@ _syscall_invoke:
SYSRETQ
#else
ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
- SWAPGS /* user gsbase */
- SYSRETQ
+ jmp tr_sysretq
#endif
/*NOTREACHED*/
SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
@@ -773,6 +786,20 @@ _syscall32_save:
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Application state saved in the regs structure on the stack
* %eax is the syscall number
* %rsp is the thread's stack, %r15 is curthread
@@ -889,8 +916,7 @@ _syscall32_save:
ASSERT_UPCALL_MASK_IS_SET
ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
- SWAPGS /* user gsbase */
- SYSRETL
+ jmp tr_sysretl
SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
/*NOTREACHED*/
@@ -935,23 +961,22 @@ _full_syscall_postsys32:
* this call, as %edx is used by the sysexit instruction.
*
* One final complication in this routine is its interaction with
- * single-stepping in a debugger. For most of the system call mechanisms,
- * the CPU automatically clears the single-step flag before we enter the
- * kernel. The sysenter mechanism does not clear the flag, so a user
- * single-stepping through a libc routine may suddenly find themself
- * single-stepping through the kernel. To detect this, kmdb compares the
- * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
- * If it finds that we have single-stepped to a sysenter entry point, it
- * explicitly clears the flag and executes the sys_sysenter routine.
+ * single-stepping in a debugger. For most of the system call mechanisms, the
+ * CPU automatically clears the single-step flag before we enter the kernel.
+ * The sysenter mechanism does not clear the flag, so a user single-stepping
+ * through a libc routine may suddenly find themselves single-stepping through the
+ * kernel. To detect this, kmdb and trap() both compare the trap %pc to the
+ * [brand_]sys_enter addresses on each single-step trap. If it finds that we
+ * have single-stepped to a sysenter entry point, it explicitly clears the flag
+ * and executes the sys_sysenter routine.
*
- * One final complication in this final complication is the fact that we
- * have two different entry points for sysenter: brand_sys_sysenter and
- * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping
- * through the kernel with kmdb, we will eventually hit the instruction at
- * sys_sysenter. kmdb cannot distinguish between that valid single-step
- * and the undesirable one mentioned above. To avoid this situation, we
- * simply add a jump over the instruction at sys_sysenter to make it
- * impossible to single-step to it.
+ * One final complication in this final complication is the fact that we have
+ * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
+ * If we enter at brand_sys_sysenter and start single-stepping through the
+ * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
+ * kmdb cannot distinguish between that valid single-step and the undesirable
+ * one mentioned above. To avoid this situation, we simply add a jump over the
+ * instruction at sys_sysenter to make it impossible to single-step to it.
*/
#if defined(__lint)
@@ -964,6 +989,7 @@ sys_sysenter()
ENTRY_NP(brand_sys_sysenter)
SWAPGS /* kernel gsbase */
ALTENTRY(_brand_sys_sysenter_post_swapgs)
+
BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
/*
* Jump over sys_sysenter to allow single-stepping as described
@@ -973,13 +999,17 @@ sys_sysenter()
ALTENTRY(sys_sysenter)
SWAPGS /* kernel gsbase */
-
ALTENTRY(_sys_sysenter_post_swapgs)
+
movq %gs:CPU_THREAD, %r15
movl $U32CS_SEL, REGOFF_CS(%rsp)
movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */
movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */
+ /*
+ * NOTE: none of the instructions that run before we get here should
+ * clobber bits in (R)FLAGS! This includes the kpti trampoline.
+ */
pushfq
popq %r10
movl $UDS_SEL, REGOFF_SS(%rsp)
@@ -1021,6 +1051,20 @@ sys_sysenter()
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Application state saved in the regs structure on the stack
* %eax is the syscall number
* %rsp is the thread's stack, %r15 is curthread
@@ -1118,6 +1162,8 @@ sys_sysenter()
* If we were, and we ended up on another cpu, or another
* lwp got in ahead of us, it could change the segment
* registers without us noticing before we return to userland.
+ *
+ * This cli is undone in the tr_sysexit trampoline code.
*/
cli
CHECK_POSTSYS_NE(%r15, %r14, %ebx)
@@ -1151,9 +1197,7 @@ sys_sysenter()
popfq
movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */
ALTENTRY(sys_sysenter_swapgs_sysexit)
- swapgs
- sti
- sysexit
+ jmp tr_sysexit
SET_SIZE(sys_sysenter_swapgs_sysexit)
SET_SIZE(sys_sysenter)
SET_SIZE(_sys_sysenter_post_swapgs)
@@ -1204,10 +1248,13 @@ nopop_syscall_int:
* or we could end up breaking branded zone support. See the usage of
* this label in lx_brand_int80_callback and sn1_brand_int91_callback
* for examples.
+ *
+ * We want to swapgs to maintain the invariant that all entries into
+ * tr_iret_user are done on the user gsbase.
*/
- ALTENTRY(sys_sysint_swapgs_iret)
- SWAPGS /* user gsbase */
- IRET
+ ALTENTRY(sys_sysint_swapgs_iret)
+ SWAPGS
+ jmp tr_iret_user
/*NOTREACHED*/
SET_SIZE(sys_sysint_swapgs_iret)
SET_SIZE(sys_syscall_int)