| field | value | date |
|---|---|---|
| author | John Levon <john.levon@joyent.com> | 2018-01-22 22:05:38 +0000 |
| committer | Dan McDonald <danmcd@joyent.com> | 2018-04-10 10:37:19 -0400 |
| commit | 74ecdb5171c9f3673b9393b1a3dc6f3a65e93895 (patch) | |
| tree | 3c16ae772368de5b6eec80945340deb2b38d91d0 /usr/src/uts/i86pc/ml | |
| parent | 1d9a8ab82e5abe86cb1e43c502f88c7c655658fd (diff) | |
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Gordon Ross <gwr@nexenta.com>
Diffstat (limited to 'usr/src/uts/i86pc/ml')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/uts/i86pc/ml/fb_swtch_src.s | 5 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/genassym.c | 23 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/kdi_subr.s | 160 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/kpti_trampolines.s | 743 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/locore.s | 10 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/mpcore.s | 6 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/offsets.in | 40 |
| -rw-r--r-- | usr/src/uts/i86pc/ml/syscall_asm_amd64.s | 103 |
8 files changed, 873 insertions, 217 deletions
diff --git a/usr/src/uts/i86pc/ml/fb_swtch_src.s b/usr/src/uts/i86pc/ml/fb_swtch_src.s index e67837ee2b..4d1789fc9b 100644 --- a/usr/src/uts/i86pc/ml/fb_swtch_src.s +++ b/usr/src/uts/i86pc/ml/fb_swtch_src.s @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ @@ -52,6 +53,9 @@ int fb_swtch_silence_lint = 0; #define DISABLE_PAGING \ + movl %cr4, %eax ;\ + btrl $17, %eax /* clear PCIDE bit */ ;\ + movl %eax, %cr4 ;\ movl %cr0, %eax ;\ btrl $31, %eax /* clear PG bit */ ;\ movl %eax, %cr0 @@ -222,6 +226,7 @@ _start: * Disable long mode by: * - shutting down paging (bit 31 of cr0). This will flush the * TLBs. + * - turning off PCID in cr4 * - disabling LME (long mode enable) in EFER (extended feature reg) */ #endif diff --git a/usr/src/uts/i86pc/ml/genassym.c b/usr/src/uts/i86pc/ml/genassym.c index 088dd661a3..6d840368d7 100644 --- a/usr/src/uts/i86pc/ml/genassym.c +++ b/usr/src/uts/i86pc/ml/genassym.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _GENASSYM @@ -68,8 +70,6 @@ extern void exit(int); int main(int argc, char *argv[]) { - printf("#define\tT_AST 0x%x\n", T_AST); - printf("#define\tLOCK_LEVEL 0x%x\n", LOCK_LEVEL); printf("#define\tCLOCK_LEVEL 0x%x\n", CLOCK_LEVEL); printf("#define\tDISP_LEVEL 0x%x\n", DISP_LEVEL); @@ -109,20 +109,6 @@ main(int argc, char *argv[]) printf("#define\tSSE_MXCSR_EFLAGS 0x%x\n", SSE_MXCSR_EFLAGS); - printf("#define\tFP_487 0x%x\n", FP_487); - printf("#define\tFP_486 0x%x\n", FP_486); - printf("#define\tFPU_CW_INIT 0x%x\n", FPU_CW_INIT); - printf("#define\tFPU_EN 0x%x\n", FPU_EN); - printf("#define\tFPU_VALID 0x%x\n", FPU_VALID); - - printf("#define\tFP_NO 0x%x\n", FP_NO); - printf("#define\tFP_SW 0x%x\n", FP_SW); - printf("#define\tFP_HW 0x%x\n", FP_HW); - printf("#define\tFP_287 0x%x\n", FP_287); - printf("#define\tFP_387 0x%x\n", FP_387); - printf("#define\t__FP_SSE 0x%x\n", __FP_SSE); - - printf("#define\tFP_FNSAVE 0x%x\n", FP_FNSAVE); printf("#define\tFP_FXSAVE 0x%x\n", FP_FXSAVE); printf("#define\tFP_XSAVE 0x%x\n", FP_XSAVE); @@ -154,11 +140,6 @@ main(int argc, char *argv[]) printf("#define\tNSEC_PER_COUNTER_TICK 0x%llx\n", NANOSEC / PIT_HZ); - printf("#define\tPITCTR0_PORT 0x%x\n", PITCTR0_PORT); - printf("#define\tPITCTL_PORT 0x%x\n", PITCTL_PORT); - printf("#define\tPIT_COUNTDOWN 0x%x\n", - PIT_C0 | PIT_LOADMODE | PIT_NDIVMODE); - printf("#define\tNBPW 0x%x\n", NBPW); printf("#define\tDDI_ACCATTR_IO_SPACE 0x%x\n", DDI_ACCATTR_IO_SPACE); diff --git a/usr/src/uts/i86pc/ml/kdi_subr.s b/usr/src/uts/i86pc/ml/kdi_subr.s deleted file mode 100644 index 8ed90ed410..0000000000 --- a/usr/src/uts/i86pc/ml/kdi_subr.s +++ /dev/null @@ -1,160 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/asm_linkage.h> -#include <sys/asm_misc.h> -#include <sys/regset.h> -#include <sys/privregs.h> -#include <sys/psw.h> - -#if defined(__lint) -#include <sys/types.h> -#include <sys/segments.h> -#endif - -#if defined(__lint) - -ulong_t -kdi_getdr0(void) -{ - return (0); -} - -ulong_t -kdi_getdr1(void) -{ - return (0); -} - -ulong_t -kdi_getdr2(void) -{ - return (0); -} - -ulong_t -kdi_getdr3(void) -{ - return (0); -} - -ulong_t -kdi_getdr6(void) -{ - return (0); -} - -ulong_t -kdi_getdr7(void) -{ - return (0); -} - -/*ARGSUSED*/ -void -kdi_setdr0(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr1(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr2(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr3(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr4(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr6(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr7(ulong_t value) -{} - -#else - -#if defined(__amd64) - -#define GETDREG(name, r) \ - ENTRY_NP(name); \ - movq r, %rax; \ - ret; \ - SET_SIZE(name) - -#define SETDREG(name, r) \ - ENTRY_NP(name); \ - movq %rdi, r; \ - ret; \ - SET_SIZE(name) - -#elif defined(__i386) - -#define GETDREG(name, r) \ - ENTRY_NP(name); \ - movl r, %eax; \ - ret; \ - SET_SIZE(name) - -#define SETDREG(name, r) \ - ENTRY_NP(name); \ - movl 4(%esp), %eax; \ - movl %eax, r; \ - ret; \ - SET_SIZE(name) - -#endif - - GETDREG(kdi_getdr0, %dr0) - GETDREG(kdi_getdr1, %dr1) - GETDREG(kdi_getdr2, %dr2) - GETDREG(kdi_getdr3, %dr3) - GETDREG(kdi_getdr6, %dr6) - GETDREG(kdi_getdr7, %dr7) - - SETDREG(kdi_setdr0, %dr0) - SETDREG(kdi_setdr1, %dr1) - SETDREG(kdi_setdr2, %dr2) - SETDREG(kdi_setdr3, %dr3) - SETDREG(kdi_setdr6, %dr6) - SETDREG(kdi_setdr7, %dr7) - -#endif /* __lint */ diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s new file mode 100644 index 0000000000..d50e964e62 --- /dev/null +++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s @@ -0,0 +1,743 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * This file contains the trampolines that are used by KPTI in order to be + * able to take interrupts/trap/etc while on the "user" page table. + * + * We don't map the full kernel text into the user page table: instead we + * map this one small section of trampolines (which compiles to ~13 pages). + * These trampolines are set in the IDT always (so they will run no matter + * whether we're on the kernel or user page table), and their primary job is to + * pivot us to the kernel %cr3 and %rsp without ruining everything. + * + * All of these interrupts use the amd64 IST feature when we have KPTI enabled, + * meaning that they will execute with their %rsp set to a known location, even + * if we take them in the kernel. 
+ * + * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST + * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti + * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be + * page-aligned, and we map the page it's on into both page tables. Using a + * struct attached to the cpu_t also means that we can use %rsp-relative + * addressing to find anything on the cpu_t, so we don't have to touch %gs or + * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy). + * + * This little struct is where the CPU will push the actual interrupt frame. + * Then, in the trampoline, we change %cr3, then figure out our destination + * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt + * frame). Then we jump to the regular ISR in the kernel text and carry on as + * normal. + * + * We leave the original frame and any spilled regs behind in the kpti_frame + * lazily until we want to return to userland. Then, we clear any spilled + * regs from it, and overwrite the rest with our iret frame. When switching + * this cpu to a different process (in hat_switch), we bzero the whole region to + * make sure nothing can leak between processes. + * + * When we're returning back to the original place we took the interrupt later + * (especially if it was in userland), we have to jmp back to the "return + * trampolines" here, since when we set %cr3 back to the user value, we need to + * be executing from code here in these shared pages and not the main kernel + * text again. Even though it should be fine to iret directly from kernel text + * when returning to kernel code, we make things jmp to a trampoline here just + * for consistency. + * + * Note that with IST, it's very important that we always must have pivoted + * away from the IST stack before we could possibly take any other interrupt + * on the same IST (unless it's an end-of-the-world fault and we don't care + * about coming back from it ever). + * + * This is particularly relevant to the dbgtrap/brktrap trampolines, as they + * regularly have to happen from within trampoline code (e.g. in the sysenter + * single-step case) and then return to the world normally. As a result, these + * two are IST'd to their own kpti_frame right above the normal one (in the same + * page), so they don't clobber their parent interrupt. + * + * To aid with debugging, we also IST the page fault (#PF/pftrap), general + * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to + * their own separate kpti_frame. This ensures that if we take one of these + * due to a bug in trampoline code, we preserve the original trampoline + * state that caused the trap. + * + * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST + * stacks, since they can interrupt another ISR at any time. These stacks are + * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in + * their trampolines (and do it unconditionally), and don't bother pivoting + * away. We're either going into the panic() path, or we're going to return + * straight away without rescheduling, so it's fine to not be on our real + * kthread stack (and some of the state we want to go find it with might be + * corrupt!) + * + * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a + * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to + * point at the PML4 for kas early in boot and never touch it again. 
Hopefully + * it survives whatever corruption brings down the rest of the kernel! + * + * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64 + * cases) in that they do not push an interrupt frame (and also have some other + * effects). In the syscall trampolines, we assume that we can only be taking + * the call from userland and use SWAPGS and an unconditional overwrite of %cr3. + * We do not do any stack pivoting for syscalls (and we leave SYSENTER's + * existing %rsp pivot untouched) -- instead we spill registers into + * %gs:CPU_KPTI_* as we need to. + * + * Note that the normal %cr3 values do not cause invalidations with PCIDE - see + * hat_switch(). + */ + +/* + * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you + * fix bugs here check to see if they should be fixed there as well. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/psw.h> +#include <sys/machbrand.h> +#include <sys/param.h> + +#if defined(__lint) + +#include <sys/types.h> +#include <sys/thread.h> +#include <sys/systm.h> + +#else /* __lint */ + +#include <sys/segments.h> +#include <sys/pcb.h> +#include <sys/trap.h> +#include <sys/ftrace.h> +#include <sys/traptrace.h> +#include <sys/clock.h> +#include <sys/model.h> +#include <sys/panic.h> + +#if defined(__xpv) +#include <sys/hypervisor.h> +#endif + +#include "assym.h" + + .data + DGDEF3(kpti_enable, 8, 8) + .fill 1, 8, 1 + +.section ".text"; +.align MMU_PAGESIZE + +.global kpti_tramp_start +kpti_tramp_start: + nop + +/* This will be set by mlsetup, and then double-checked later */ +.global kpti_safe_cr3 +kpti_safe_cr3: + .quad 0 + SET_SIZE(kpti_safe_cr3) + +/* startup_kmem() will overwrite this */ +.global kpti_kbase +kpti_kbase: + .quad KERNELBASE + SET_SIZE(kpti_kbase) + +#define SET_KERNEL_CR3(spillreg) \ + mov %cr3, spillreg; \ + mov spillreg, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_KCR3, spillreg; \ + cmp $0, spillreg; \ + je 2f; \ + mov spillreg, %cr3; \ +2: + +#if DEBUG +#define SET_USER_CR3(spillreg) \ + mov %cr3, spillreg; \ + mov spillreg, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_UCR3, spillreg; \ + mov spillreg, %cr3 +#else +#define SET_USER_CR3(spillreg) \ + mov %gs:CPU_KPTI_UCR3, spillreg; \ + mov spillreg, %cr3 +#endif + +#define PIVOT_KPTI_STK(spillreg) \ + mov %rsp, spillreg; \ + mov %gs:CPU_KPTI_RET_RSP, %rsp; \ + pushq T_FRAMERET_SS(spillreg); \ + pushq T_FRAMERET_RSP(spillreg); \ + pushq T_FRAMERET_RFLAGS(spillreg); \ + pushq T_FRAMERET_CS(spillreg); \ + pushq T_FRAMERET_RIP(spillreg) + + +#define INTERRUPT_TRAMPOLINE_P(errpush) \ + pushq %r13; \ + pushq %r14; \ + subq $KPTI_R14, %rsp; \ + /* Save current %cr3. */ \ + mov %cr3, %r14; \ + mov %r14, KPTI_TR_CR3(%rsp); \ + \ + cmpw $KCS_SEL, KPTI_CS(%rsp); \ + je 3f; \ +1: \ + /* Change to the "kernel" %cr3 */ \ + mov KPTI_KCR3(%rsp), %r14; \ + cmp $0, %r14; \ + je 2f; \ + mov %r14, %cr3; \ +2: \ + /* Get our cpu_t in %r13 */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + subq $CPU_KPTI_START, %r13; \ + /* Use top of the kthread stk */ \ + mov CPU_THREAD(%r13), %r14; \ + mov T_STACK(%r14), %r14; \ + addq $REGSIZE+MINFRAME, %r14; \ + jmp 4f; \ +3: \ + /* Check the %rsp in the frame. */ \ + /* Is it above kernel base? 
*/ \ + mov kpti_kbase, %r14; \ + cmp %r14, KPTI_RSP(%rsp); \ + jb 1b; \ + /* Use the %rsp from the trap frame */ \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~0xf), %r14; \ +4: \ + mov %rsp, %r13; \ + /* %r14 contains our destination stk */ \ + mov %r14, %rsp; \ + pushq KPTI_SS(%r13); \ + pushq KPTI_RSP(%r13); \ + pushq KPTI_RFLAGS(%r13); \ + pushq KPTI_CS(%r13); \ + pushq KPTI_RIP(%r13); \ + errpush; \ + mov KPTI_R14(%r13), %r14; \ + mov KPTI_R13(%r13), %r13 + +#define INTERRUPT_TRAMPOLINE_NOERR \ + INTERRUPT_TRAMPOLINE_P(/**/) + +#define INTERRUPT_TRAMPOLINE \ + INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13)) + +/* + * This is used for all interrupts that can plausibly be taken inside another + * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS). + * + * We check for whether we took the interrupt while in another trampoline, in + * which case we need to use the kthread stack. + */ +#define DBG_INTERRUPT_TRAMPOLINE_P(errpush) \ + pushq %r13; \ + pushq %r14; \ + subq $KPTI_R14, %rsp; \ + /* Check for clobbering */ \ + cmp $0, KPTI_FLAG(%rsp); \ + je 1f; \ + /* Don't worry, this totally works */ \ + int $8; \ +1: \ + movq $1, KPTI_FLAG(%rsp); \ + /* Save current %cr3. */ \ + mov %cr3, %r14; \ + mov %r14, KPTI_TR_CR3(%rsp); \ + \ + cmpw $KCS_SEL, KPTI_CS(%rsp); \ + je 4f; \ +2: \ + /* Change to the "kernel" %cr3 */ \ + mov KPTI_KCR3(%rsp), %r14; \ + cmp $0, %r14; \ + je 3f; \ + mov %r14, %cr3; \ +3: \ + /* Get our cpu_t in %r13 */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + subq $CPU_KPTI_START, %r13; \ + /* Use top of the kthread stk */ \ + mov CPU_THREAD(%r13), %r14; \ + mov T_STACK(%r14), %r14; \ + addq $REGSIZE+MINFRAME, %r14; \ + jmp 6f; \ +4: \ + /* Check the %rsp in the frame. */ \ + /* Is it above kernel base? */ \ + /* If not, treat as user. */ \ + mov kpti_kbase, %r14; \ + cmp %r14, KPTI_RSP(%rsp); \ + jb 2b; \ + /* Is it within the kpti_frame page? */ \ + /* If it is, treat as user interrupt */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~(MMU_PAGESIZE - 1)), %r14; \ + cmp %r13, %r14; \ + je 2b; \ + /* Were we in trampoline code? */ \ + leaq kpti_tramp_start, %r14; \ + cmp %r14, KPTI_RIP(%rsp); \ + jb 5f; \ + leaq kpti_tramp_end, %r14; \ + cmp %r14, KPTI_RIP(%rsp); \ + ja 5f; \ + /* If we were, change %cr3: we might */ \ + /* have interrupted before it did. */ \ + mov KPTI_KCR3(%rsp), %r14; \ + mov %r14, %cr3; \ +5: \ + /* Use the %rsp from the trap frame */ \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~0xf), %r14; \ +6: \ + mov %rsp, %r13; \ + /* %r14 contains our destination stk */ \ + mov %r14, %rsp; \ + pushq KPTI_SS(%r13); \ + pushq KPTI_RSP(%r13); \ + pushq KPTI_RFLAGS(%r13); \ + pushq KPTI_CS(%r13); \ + pushq KPTI_RIP(%r13); \ + errpush; \ + mov KPTI_R14(%r13), %r14; \ + movq $0, KPTI_FLAG(%r13); \ + mov KPTI_R13(%r13), %r13 + +#define DBG_INTERRUPT_TRAMPOLINE_NOERR \ + DBG_INTERRUPT_TRAMPOLINE_P(/**/) + +#define DBG_INTERRUPT_TRAMPOLINE \ + DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13)) + + /* + * These labels (_start and _end) are used by trap.c to determine if + * we took an interrupt like an NMI during the return process. + */ +.global tr_sysc_ret_start +tr_sysc_ret_start: + + /* + * Syscall return trampolines. + * + * These are expected to be called on the kernel %gs. tr_sysret[ql] are + * called after %rsp is changed back to the user value, so we have no + * stack to work with. tr_sysexit has a kernel stack (but has to + * preserve rflags, soooo). 
+ */ + ENTRY_NP(tr_sysretq) + cmpq $1, kpti_enable + jne 1f + + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 +1: + swapgs + sysretq + SET_SIZE(tr_sysretq) + + ENTRY_NP(tr_sysretl) + cmpq $1, kpti_enable + jne 1f + + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 +1: + SWAPGS + SYSRETL + SET_SIZE(tr_sysretl) + + ENTRY_NP(tr_sysexit) + /* + * Note: we want to preserve RFLAGS across this branch, since sysexit + * (unlike sysret above) does not restore RFLAGS for us. + * + * We still have the real kernel stack (sysexit does restore that), so + * we can use pushfq/popfq. + */ + pushfq + + cmpq $1, kpti_enable + jne 1f + + /* Have to pop it back off now before we change %cr3! */ + popfq + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 + jmp 2f +1: + popfq +2: + swapgs + sti + sysexit + SET_SIZE(tr_sysexit) + +.global tr_sysc_ret_end +tr_sysc_ret_end: + + /* + * Syscall entry trampolines. + */ + +#if DEBUG +#define MK_SYSCALL_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + mov %cr3, %r13; \ + mov %r13, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_KCR3, %r13; \ + mov %r13, %cr3; \ + mov %gs:CPU_KPTI_R13, %r13; \ + swapgs; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) +#else +#define MK_SYSCALL_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + mov %gs:CPU_KPTI_KCR3, %r13; \ + mov %r13, %cr3; \ + mov %gs:CPU_KPTI_R13, %r13; \ + swapgs; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) +#endif + + MK_SYSCALL_TRAMPOLINE(sys_syscall) + MK_SYSCALL_TRAMPOLINE(sys_syscall32) + MK_SYSCALL_TRAMPOLINE(brand_sys_syscall) + MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32) + + /* + * SYSENTER is special. The CPU is really not very helpful when it + * comes to preserving and restoring state with it, and as a result + * we have to do all of it by hand. So, since we want to preserve + * RFLAGS, we have to be very careful in these trampolines to not + * clobber any bits in it. That means no cmpqs or branches! 
+ */ + ENTRY_NP(tr_sys_sysenter) + swapgs + mov %r13, %gs:CPU_KPTI_R13 +#if DEBUG + mov %cr3, %r13 + mov %r13, %gs:CPU_KPTI_TR_CR3 +#endif + mov %gs:CPU_KPTI_KCR3, %r13 + mov %r13, %cr3 + mov %gs:CPU_KPTI_R13, %r13 + jmp _sys_sysenter_post_swapgs + SET_SIZE(tr_sys_sysenter) + + ENTRY_NP(tr_brand_sys_sysenter) + swapgs + mov %r13, %gs:CPU_KPTI_R13 +#if DEBUG + mov %cr3, %r13 + mov %r13, %gs:CPU_KPTI_TR_CR3 +#endif + mov %gs:CPU_KPTI_KCR3, %r13 + mov %r13, %cr3 + mov %gs:CPU_KPTI_R13, %r13 + jmp _brand_sys_sysenter_post_swapgs + SET_SIZE(tr_brand_sys_sysenter) + +#define MK_SYSCALL_INT_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + SET_KERNEL_CR3(%r13); \ + mov %gs:CPU_THREAD, %r13; \ + mov T_STACK(%r13), %r13; \ + addq $REGSIZE+MINFRAME, %r13; \ + mov %r13, %rsp; \ + pushq %gs:CPU_KPTI_SS; \ + pushq %gs:CPU_KPTI_RSP; \ + pushq %gs:CPU_KPTI_RFLAGS; \ + pushq %gs:CPU_KPTI_CS; \ + pushq %gs:CPU_KPTI_RIP; \ + mov %gs:CPU_KPTI_R13, %r13; \ + SWAPGS; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int) + MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int) + + /* + * Interrupt/trap return trampolines + */ + +.global tr_intr_ret_start +tr_intr_ret_start: + + ENTRY_NP(tr_iret_auto) + cmpq $1, kpti_enable + jne tr_iret_kernel + cmpw $KCS_SEL, T_FRAMERET_CS(%rsp) + je tr_iret_kernel + jmp tr_iret_user + SET_SIZE(tr_iret_auto) + + ENTRY_NP(tr_iret_kernel) + /* + * Yes, this does nothing extra. But this way we know if we see iret + * elsewhere, then we've failed to properly consider trampolines there. + */ + iretq + SET_SIZE(tr_iret_kernel) + + ENTRY_NP(tr_iret_user) + cmpq $1, kpti_enable + jne 1f + + swapgs + mov %r13, %gs:CPU_KPTI_R13 + PIVOT_KPTI_STK(%r13) + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 + swapgs +1: + iretq + SET_SIZE(tr_iret_user) + +.global tr_intr_ret_end +tr_intr_ret_end: + + /* + * Interrupt/trap entry trampolines + */ + + /* CPU pushed an error code, and ISR wants one */ +#define MK_INTR_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + INTERRUPT_TRAMPOLINE; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU didn't push an error code, and ISR doesn't want one */ +#define MK_INTR_TRAMPOLINE_NOERR(isr) \ + ENTRY_NP(tr_/**/isr); \ + push $0; \ + INTERRUPT_TRAMPOLINE_NOERR; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU pushed an error code, and ISR wants one */ +#define MK_DBG_INTR_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + DBG_INTERRUPT_TRAMPOLINE; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU didn't push an error code, and ISR doesn't want one */ +#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr) \ + ENTRY_NP(tr_/**/isr); \ + push $0; \ + DBG_INTERRUPT_TRAMPOLINE_NOERR; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + + MK_INTR_TRAMPOLINE_NOERR(div0trap) + MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap) + MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap) + MK_INTR_TRAMPOLINE_NOERR(ovflotrap) + MK_INTR_TRAMPOLINE_NOERR(boundstrap) + MK_INTR_TRAMPOLINE_NOERR(invoptrap) + MK_INTR_TRAMPOLINE_NOERR(ndptrap) + MK_INTR_TRAMPOLINE(invtsstrap) + MK_INTR_TRAMPOLINE(segnptrap) + MK_DBG_INTR_TRAMPOLINE(stktrap) + MK_DBG_INTR_TRAMPOLINE(gptrap) + MK_DBG_INTR_TRAMPOLINE(pftrap) + MK_INTR_TRAMPOLINE_NOERR(resvtrap) + MK_INTR_TRAMPOLINE_NOERR(ndperr) + MK_INTR_TRAMPOLINE(achktrap) + MK_INTR_TRAMPOLINE_NOERR(xmtrap) + MK_INTR_TRAMPOLINE_NOERR(invaltrap) + MK_INTR_TRAMPOLINE_NOERR(fasttrap) + MK_INTR_TRAMPOLINE_NOERR(dtrace_ret) + + /* 
+ * These are special because they can interrupt other traps, and + * each other. We don't need to pivot their stacks, because they have + * dedicated IST stack space, but we need to change %cr3. + */ + ENTRY_NP(tr_nmiint) + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp nmiint + SET_SIZE(tr_nmiint) + +#if !defined(__xpv) + ENTRY_NP(tr_syserrtrap) + /* + * If we got here we should always have a zero error code pushed. + * The INT $0x8 instr doesn't seem to push one, though, which we use + * as an emergency panic in the other trampolines. So adjust things + * here. + */ + cmpq $0, (%rsp) + je 1f + pushq $0 +1: + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp syserrtrap + SET_SIZE(tr_syserrtrap) +#endif + + ENTRY_NP(tr_mcetrap) + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp mcetrap + SET_SIZE(tr_mcetrap) + + /* + * Interrupts start at 32 + */ +#define MKIVCT(n) \ + ENTRY_NP(tr_ivct/**/n) \ + push $0; \ + INTERRUPT_TRAMPOLINE; \ + push $n - 0x20; \ + jmp cmnint; \ + SET_SIZE(tr_ivct/**/n) + + MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35); + MKIVCT(36); MKIVCT(37); MKIVCT(38); MKIVCT(39); + MKIVCT(40); MKIVCT(41); MKIVCT(42); MKIVCT(43); + MKIVCT(44); MKIVCT(45); MKIVCT(46); MKIVCT(47); + MKIVCT(48); MKIVCT(49); MKIVCT(50); MKIVCT(51); + MKIVCT(52); MKIVCT(53); MKIVCT(54); MKIVCT(55); + MKIVCT(56); MKIVCT(57); MKIVCT(58); MKIVCT(59); + MKIVCT(60); MKIVCT(61); MKIVCT(62); MKIVCT(63); + MKIVCT(64); MKIVCT(65); MKIVCT(66); MKIVCT(67); + MKIVCT(68); MKIVCT(69); MKIVCT(70); MKIVCT(71); + MKIVCT(72); MKIVCT(73); MKIVCT(74); MKIVCT(75); + MKIVCT(76); MKIVCT(77); MKIVCT(78); MKIVCT(79); + MKIVCT(80); MKIVCT(81); MKIVCT(82); MKIVCT(83); + MKIVCT(84); MKIVCT(85); MKIVCT(86); MKIVCT(87); + MKIVCT(88); MKIVCT(89); MKIVCT(90); MKIVCT(91); + MKIVCT(92); MKIVCT(93); MKIVCT(94); MKIVCT(95); + MKIVCT(96); MKIVCT(97); MKIVCT(98); MKIVCT(99); + MKIVCT(100); MKIVCT(101); MKIVCT(102); MKIVCT(103); + MKIVCT(104); MKIVCT(105); MKIVCT(106); MKIVCT(107); + MKIVCT(108); MKIVCT(109); MKIVCT(110); MKIVCT(111); + MKIVCT(112); MKIVCT(113); MKIVCT(114); MKIVCT(115); + MKIVCT(116); MKIVCT(117); MKIVCT(118); MKIVCT(119); + MKIVCT(120); MKIVCT(121); MKIVCT(122); MKIVCT(123); + MKIVCT(124); MKIVCT(125); MKIVCT(126); MKIVCT(127); + MKIVCT(128); MKIVCT(129); MKIVCT(130); MKIVCT(131); + MKIVCT(132); MKIVCT(133); MKIVCT(134); MKIVCT(135); + MKIVCT(136); MKIVCT(137); MKIVCT(138); MKIVCT(139); + MKIVCT(140); MKIVCT(141); MKIVCT(142); MKIVCT(143); + MKIVCT(144); MKIVCT(145); MKIVCT(146); MKIVCT(147); + MKIVCT(148); MKIVCT(149); MKIVCT(150); MKIVCT(151); + MKIVCT(152); MKIVCT(153); MKIVCT(154); MKIVCT(155); + MKIVCT(156); MKIVCT(157); MKIVCT(158); MKIVCT(159); + MKIVCT(160); MKIVCT(161); MKIVCT(162); MKIVCT(163); + MKIVCT(164); MKIVCT(165); MKIVCT(166); MKIVCT(167); + MKIVCT(168); MKIVCT(169); MKIVCT(170); MKIVCT(171); + MKIVCT(172); MKIVCT(173); MKIVCT(174); MKIVCT(175); + MKIVCT(176); MKIVCT(177); MKIVCT(178); MKIVCT(179); + MKIVCT(180); MKIVCT(181); MKIVCT(182); MKIVCT(183); + MKIVCT(184); MKIVCT(185); MKIVCT(186); MKIVCT(187); + MKIVCT(188); MKIVCT(189); MKIVCT(190); MKIVCT(191); + MKIVCT(192); MKIVCT(193); MKIVCT(194); MKIVCT(195); + MKIVCT(196); MKIVCT(197); MKIVCT(198); MKIVCT(199); + MKIVCT(200); MKIVCT(201); MKIVCT(202); MKIVCT(203); + MKIVCT(204); MKIVCT(205); MKIVCT(206); MKIVCT(207); + MKIVCT(208); MKIVCT(209); MKIVCT(210); MKIVCT(211); + MKIVCT(212); MKIVCT(213); MKIVCT(214); MKIVCT(215); + MKIVCT(216); MKIVCT(217); MKIVCT(218); MKIVCT(219); + 
MKIVCT(220); MKIVCT(221); MKIVCT(222); MKIVCT(223); + MKIVCT(224); MKIVCT(225); MKIVCT(226); MKIVCT(227); + MKIVCT(228); MKIVCT(229); MKIVCT(230); MKIVCT(231); + MKIVCT(232); MKIVCT(233); MKIVCT(234); MKIVCT(235); + MKIVCT(236); MKIVCT(237); MKIVCT(238); MKIVCT(239); + MKIVCT(240); MKIVCT(241); MKIVCT(242); MKIVCT(243); + MKIVCT(244); MKIVCT(245); MKIVCT(246); MKIVCT(247); + MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251); + MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255); + + /* + * We're PCIDE, but we don't have INVPCID. The only way to invalidate a + * PCID other than the current one, then, is to load its cr3 then + * invlpg. But loading kf_user_cr3 means we can longer access our + * caller's text mapping (or indeed, its stack). So this little helper + * has to live within our trampoline text region. + * + * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3) + */ + ENTRY_NP(tr_mmu_flush_user_range) + push %rbx + /* When we read cr3, it never has the NOINVL bit set. */ + mov %cr3, %rax + movq $CR3_NOINVL_BIT, %rbx + orq %rbx, %rax + + mov %rcx, %cr3 + add %rdi, %rsi +.align ASM_ENTRY_ALIGN +1: + invlpg (%rdi) + add %rdx, %rdi + cmp %rsi, %rdi + jb 1b + mov %rax, %cr3 + pop %rbx + retq + SET_SIZE(tr_mmu_flush_user_range) + +.align MMU_PAGESIZE +.global kpti_tramp_end +kpti_tramp_end: + nop + +#endif /* __lint */ diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s index 042818844d..4626dd1492 100644 --- a/usr/src/uts/i86pc/ml/locore.s +++ b/usr/src/uts/i86pc/ml/locore.s @@ -23,7 +23,7 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright (c) 2018 Joyent, Inc. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -1186,7 +1186,7 @@ cmntrap() addq %rax, %r12 movq %r12, REGOFF_RIP(%rbp) INTR_POP - IRET + jmp tr_iret_auto /*NOTREACHED*/ 3: leaq dtrace_badflags(%rip), %rdi @@ -1599,7 +1599,7 @@ _no_pending_updates: */ ALTENTRY(sys_rtt_syscall32) USER32_POP - IRET + jmp tr_iret_user /*NOTREACHED*/ ALTENTRY(sys_rtt_syscall) @@ -1608,7 +1608,7 @@ _no_pending_updates: */ USER_POP ALTENTRY(nopop_sys_rtt_syscall) - IRET + jmp tr_iret_user /*NOTREACHED*/ SET_SIZE(nopop_sys_rtt_syscall) @@ -1623,7 +1623,7 @@ _no_pending_updates: * Restore regs before doing iretq to kernel mode */ INTR_POP - IRET + jmp tr_iret_kernel .globl _sys_rtt_end _sys_rtt_end: /*NOTREACHED*/ diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s index eaf70b72df..2151a14b04 100644 --- a/usr/src/uts/i86pc/ml/mpcore.s +++ b/usr/src/uts/i86pc/ml/mpcore.s @@ -24,6 +24,8 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -326,7 +328,7 @@ kernel_cs_code: * Complete the rest of the setup and call mp_startup(). */ movq %gs:CPU_THREAD, %rax /* get thread ptr */ - call *T_PC(%rax) /* call mp_startup */ + call *T_PC(%rax) /* call mp_startup_boot */ /* not reached */ int $20 /* whoops, returned somehow! */ @@ -502,7 +504,7 @@ kernel_cs_code: /* * Before going any farther, enable usage of page table NX bit if - * that's how our page tables are set up. + * that's how our page tables are set up. (PCIDE is enabled later on). 
*/ bt $X86FSET_NX, x86_featureset jnc 1f diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index d0c75653a1..0946b369d9 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,7 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. -\ Copyright 2016 Joyent, Inc. +\ Copyright 2018 Joyent, Inc. \ \ CDDL HEADER START \ @@ -232,6 +232,44 @@ cpu cpu_m.mcpu_vcpu_info CPU_VCPU_INFO #endif +cpu + cpu_m.mcpu_kpti.kf_kernel_cr3 CPU_KPTI_KCR3 + cpu_m.mcpu_kpti.kf_user_cr3 CPU_KPTI_UCR3 + cpu_m.mcpu_kpti.kf_tr_rsp CPU_KPTI_TR_RSP + cpu_m.mcpu_kpti.kf_tr_cr3 CPU_KPTI_TR_CR3 + cpu_m.mcpu_kpti.kf_r13 CPU_KPTI_R13 + cpu_m.mcpu_kpti.kf_r14 CPU_KPTI_R14 + cpu_m.mcpu_kpti.kf_tr_ret_rsp CPU_KPTI_RET_RSP + + cpu_m.mcpu_kpti.kf_ss CPU_KPTI_SS + cpu_m.mcpu_kpti.kf_rsp CPU_KPTI_RSP + cpu_m.mcpu_kpti.kf_rflags CPU_KPTI_RFLAGS + cpu_m.mcpu_kpti.kf_cs CPU_KPTI_CS + cpu_m.mcpu_kpti.kf_rip CPU_KPTI_RIP + cpu_m.mcpu_kpti.kf_err CPU_KPTI_ERR + + cpu_m.mcpu_pad2 CPU_KPTI_START + cpu_m.mcpu_pad3 CPU_KPTI_END + +kpti_frame + kf_r14 KPTI_R14 + kf_r13 KPTI_R13 + kf_err KPTI_ERR + kf_rip KPTI_RIP + kf_cs KPTI_CS + kf_rflags KPTI_RFLAGS + kf_rsp KPTI_RSP + kf_ss KPTI_SS + + kf_tr_rsp KPTI_TOP + + kf_kernel_cr3 KPTI_KCR3 + kf_user_cr3 KPTI_UCR3 + kf_tr_ret_rsp KPTI_RET_RSP + kf_tr_cr3 KPTI_TR_CR3 + + kf_tr_flag KPTI_FLAG + standard_pic c_curmask c_iplmask diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index 9501483e5f..be6a94c61b 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -491,6 +491,20 @@ noprod_sys_syscall: movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. + */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Machine state saved in the regs structure on the stack * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9 * %eax is the syscall number @@ -671,8 +685,7 @@ _syscall_invoke: SYSRETQ #else ALTENTRY(nopop_sys_syscall_swapgs_sysretq) - SWAPGS /* user gsbase */ - SYSRETQ + jmp tr_sysretq #endif /*NOTREACHED*/ SET_SIZE(nopop_sys_syscall_swapgs_sysretq) @@ -773,6 +786,20 @@ _syscall32_save: movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. 
+ */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Application state saved in the regs structure on the stack * %eax is the syscall number * %rsp is the thread's stack, %r15 is curthread @@ -889,8 +916,7 @@ _syscall32_save: ASSERT_UPCALL_MASK_IS_SET ALTENTRY(nopop_sys_syscall32_swapgs_sysretl) - SWAPGS /* user gsbase */ - SYSRETL + jmp tr_sysretl SET_SIZE(nopop_sys_syscall32_swapgs_sysretl) /*NOTREACHED*/ @@ -935,23 +961,22 @@ _full_syscall_postsys32: * this call, as %edx is used by the sysexit instruction. * * One final complication in this routine is its interaction with - * single-stepping in a debugger. For most of the system call mechanisms, - * the CPU automatically clears the single-step flag before we enter the - * kernel. The sysenter mechanism does not clear the flag, so a user - * single-stepping through a libc routine may suddenly find themself - * single-stepping through the kernel. To detect this, kmdb compares the - * trap %pc to the [brand_]sys_enter addresses on each single-step trap. - * If it finds that we have single-stepped to a sysenter entry point, it - * explicitly clears the flag and executes the sys_sysenter routine. + * single-stepping in a debugger. For most of the system call mechanisms, the + * CPU automatically clears the single-step flag before we enter the kernel. + * The sysenter mechanism does not clear the flag, so a user single-stepping + * through a libc routine may suddenly find themself single-stepping through the + * kernel. To detect this, kmdb and trap() both compare the trap %pc to the + * [brand_]sys_enter addresses on each single-step trap. If it finds that we + * have single-stepped to a sysenter entry point, it explicitly clears the flag + * and executes the sys_sysenter routine. * - * One final complication in this final complication is the fact that we - * have two different entry points for sysenter: brand_sys_sysenter and - * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping - * through the kernel with kmdb, we will eventually hit the instruction at - * sys_sysenter. kmdb cannot distinguish between that valid single-step - * and the undesirable one mentioned above. To avoid this situation, we - * simply add a jump over the instruction at sys_sysenter to make it - * impossible to single-step to it. + * One final complication in this final complication is the fact that we have + * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter. + * If we enter at brand_sys_sysenter and start single-stepping through the + * kernel with kmdb, we will eventually hit the instruction at sys_sysenter. + * kmdb cannot distinguish between that valid single-step and the undesirable + * one mentioned above. To avoid this situation, we simply add a jump over the + * instruction at sys_sysenter to make it impossible to single-step to it. 
*/ #if defined(__lint) @@ -964,6 +989,7 @@ sys_sysenter() ENTRY_NP(brand_sys_sysenter) SWAPGS /* kernel gsbase */ ALTENTRY(_brand_sys_sysenter_post_swapgs) + BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx)) /* * Jump over sys_sysenter to allow single-stepping as described @@ -973,13 +999,17 @@ sys_sysenter() ALTENTRY(sys_sysenter) SWAPGS /* kernel gsbase */ - ALTENTRY(_sys_sysenter_post_swapgs) + movq %gs:CPU_THREAD, %r15 movl $U32CS_SEL, REGOFF_CS(%rsp) movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */ movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */ + /* + * NOTE: none of the instructions that run before we get here should + * clobber bits in (R)FLAGS! This includes the kpti trampoline. + */ pushfq popq %r10 movl $UDS_SEL, REGOFF_SS(%rsp) @@ -1021,6 +1051,20 @@ sys_sysenter() movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. + */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Application state saved in the regs structure on the stack * %eax is the syscall number * %rsp is the thread's stack, %r15 is curthread @@ -1118,6 +1162,8 @@ sys_sysenter() * If we were, and we ended up on another cpu, or another * lwp got int ahead of us, it could change the segment * registers without us noticing before we return to userland. + * + * This cli is undone in the tr_sysexit trampoline code. */ cli CHECK_POSTSYS_NE(%r15, %r14, %ebx) @@ -1151,9 +1197,7 @@ sys_sysenter() popfq movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */ ALTENTRY(sys_sysenter_swapgs_sysexit) - swapgs - sti - sysexit + jmp tr_sysexit SET_SIZE(sys_sysenter_swapgs_sysexit) SET_SIZE(sys_sysenter) SET_SIZE(_sys_sysenter_post_swapgs) @@ -1204,10 +1248,13 @@ nopop_syscall_int: * or we could end up breaking branded zone support. See the usage of * this label in lx_brand_int80_callback and sn1_brand_int91_callback * for examples. + * + * We want to swapgs to maintain the invariant that all entries into + * tr_iret_user are done on the user gsbase. */ - ALTENTRY(sys_sysint_swapgs_iret) - SWAPGS /* user gsbase */ - IRET + ALTENTRY(sys_sysint_swapgs_iret) + SWAPGS + jmp tr_iret_user /*NOTREACHED*/ SET_SIZE(sys_sysint_swapgs_iret) SET_SIZE(sys_syscall_int) |
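
For readers following the long block comment at the top of kpti_trampolines.s, the interrupt-entry pivot it describes reduces to a short decision. The C-style sketch below is an illustration only, not code from the commit: the field names mirror the struct kpti_frame entries listed in the offsets.in hunk, while the struct layout, constants, and helper functions here are stand-ins invented for the sketch.

```c
/*
 * Illustrative sketch of the entry pivot done by INTERRUPT_TRAMPOLINE.
 * The real logic is the assembly in kpti_trampolines.s above; everything
 * below that is not a kpti_frame field name is a stand-in.
 */
#include <stdint.h>

#define	KCS_SEL		0x30UL		/* stand-in for the kernel %cs selector */
extern uint64_t kpti_kbase;		/* lowest kernel VA (KERNELBASE) */

typedef struct kpti_frame_sketch {
	uint64_t kf_kernel_cr3;		/* kernel page table; 0 = KPTI disabled */
	uint64_t kf_tr_cr3;		/* %cr3 observed on trampoline entry */
	uint64_t kf_err, kf_rip, kf_cs, kf_rflags, kf_rsp, kf_ss;
} kpti_frame_sketch_t;

extern uint64_t read_cr3(void);
extern void write_cr3(uint64_t);
extern uint64_t kthread_stack_top(void);	/* T_STACK + REGSIZE + MINFRAME */
extern void repush_and_dispatch(uint64_t rsp, kpti_frame_sketch_t *kf);

static void
kpti_interrupt_pivot(kpti_frame_sketch_t *kf)
{
	uint64_t dest;

	kf->kf_tr_cr3 = read_cr3();	/* record incoming %cr3 for debugging */

	if (kf->kf_cs != KCS_SEL || kf->kf_rsp < kpti_kbase) {
		/* Came from userland: move to the kernel page table... */
		if (kf->kf_kernel_cr3 != 0)
			write_cr3(kf->kf_kernel_cr3);
		/* ...and pivot onto the top of the kthread stack. */
		dest = kthread_stack_top();
	} else {
		/* Interrupted kernel code: reuse its %rsp, 16-byte aligned. */
		dest = kf->kf_rsp & ~0xfUL;
	}

	/* Re-push the hardware frame on the new stack and jump to the ISR. */
	repush_and_dispatch(dest, kf);
}
```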
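The return side is symmetric: locore.s and syscall_asm_amd64.s now jump to the tr_iret_* trampolines instead of issuing IRET/SYSRET directly, and tr_iret_auto picks the path. A minimal sketch of that choice, reusing the stand-in names from the sketch above, might look like this:

```c
extern long kpti_enable;

extern void tr_iret_kernel_sketch(void);	/* plain iretq, no %cr3 switch */
extern void tr_iret_user_sketch(void);		/* pivot + kf_user_cr3 + iretq */

static void
tr_iret_auto_sketch(uint64_t frame_cs)
{
	if (kpti_enable != 1 || frame_cs == KCS_SEL) {
		/* Returning to kernel code: stay on the kernel page table. */
		tr_iret_kernel_sketch();
	} else {
		/*
		 * Returning to userland: copy the iret frame onto the
		 * per-CPU kpti_frame, load kf_user_cr3, and iretq from
		 * the shared trampoline page.
		 */
		tr_iret_user_sketch();
	}
}
```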
