commit     3ce2fcdcae00f6a5ca9abd0567a142752e44373b (patch)
tree       95b74131fd0e7b0a23f6728eb6a1e985c376ada7
parent     c3a3f6c4a232444769e56b84076450e094d50532 (diff)
author     Robert Mustacchi <rm@joyent.com>  2016-03-24 09:12:47 -0700
committer  Robert Mustacchi <rm@joyent.com>  2016-03-30 16:57:24 -0700
download   illumos-gate-3ce2fcdcae00f6a5ca9abd0567a142752e44373b.tar.gz
6789 Want SMAP support
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--  usr/src/cmd/mdb/i86pc/modules/unix/unix.c |   1
-rw-r--r--  usr/src/uts/common/sys/thread.h           |   1
-rw-r--r--  usr/src/uts/common/syscall/lwp_sobj.c     |  35
-rw-r--r--  usr/src/uts/i86pc/ml/offsets.in           |   3
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm_amd64.s  |   4
-rw-r--r--  usr/src/uts/i86pc/os/cpuid.c              |  17
-rw-r--r--  usr/src/uts/i86pc/os/mlsetup.c            |   5
-rw-r--r--  usr/src/uts/i86pc/os/mp_startup.c         |   8
-rw-r--r--  usr/src/uts/i86pc/os/startup.c            |  63
-rw-r--r--  usr/src/uts/i86pc/os/trap.c               |  15
-rw-r--r--  usr/src/uts/intel/dtrace/dtrace_asm.s     |  19
-rw-r--r--  usr/src/uts/intel/ia32/ml/copy.s          | 257
-rw-r--r--  usr/src/uts/intel/ia32/ml/i86_subr.s      |   8
-rw-r--r--  usr/src/uts/intel/ia32/ml/swtch.s         |  38
-rw-r--r--  usr/src/uts/intel/ia32/os/sendsig.c       |   6
-rw-r--r--  usr/src/uts/intel/ia32/sys/psw.h          |   6
-rw-r--r--  usr/src/uts/intel/sys/archsystm.h         |  12
-rw-r--r--  usr/src/uts/intel/sys/controlregs.h       |   6
-rw-r--r--  usr/src/uts/intel/sys/x86_archext.h       |   4
19 files changed, 462 insertions(+), 46 deletions(-)
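
Before reading the per-file hunks below, a condensed sketch of the detection
and enable path may help. The following code is illustrative only and is not
part of the commit: it collapses what cpuid_pass1() in cpuid.c and
startup_smap() in startup.c do, using the CPUID_INTC_EBX_7_0_SMAP (CPUID leaf
7, EBX bit 20) and CR4_SMAP (bit 21) values the diff adds. cpuid_leaf7_ebx(),
read_cr4(), and write_cr4() are hypothetical stand-ins for the kernel's own
accessors.

	#include <stdint.h>

	#define	CPUID_EBX_7_0_SMAP	0x00100000u	/* leaf 7, EBX bit 20 */
	#define	CR4_SMAP		0x200000u	/* %cr4 bit 21 */

	extern uint32_t cpuid_leaf7_ebx(void);	/* hypothetical CPUID accessor */
	extern uint64_t read_cr4(void);		/* hypothetical getcr4() analogue */
	extern void write_cr4(uint64_t);	/* hypothetical setcr4() analogue */

	int disable_smap = 0;			/* kmdb-settable escape hatch, as in the diff */

	static void
	smap_init_sketch(void)
	{
		/* Honor the boot-time override and the CPUID feature bit. */
		if (disable_smap != 0)
			return;
		if ((cpuid_leaf7_ebx() & CPUID_EBX_7_0_SMAP) == 0)
			return;

		/*
		 * The real startup_smap() first hot-patches the nop sleds in
		 * copy.s into clac/stac and only then sets CR4_SMAP, so that
		 * no unpatched user access can fault once protection is live.
		 */
		write_cr4(read_cr4() | CR4_SMAP);
	}
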
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c index 3a90ce431b..4bce7100ef 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c @@ -918,6 +918,7 @@ crregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { "SMXE", CR4_SMXE, CR4_SMXE }, { "OSXSAVE", CR4_OSXSAVE, CR4_OSXSAVE }, { "SMEP", CR4_SMEP, CR4_SMEP }, + { "SMAP", CR4_SMAP, CR4_SMAP }, { NULL, 0, 0 } }; diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index fd6a60c65e..d917944edf 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -339,6 +339,7 @@ typedef struct _kthread { uintptr_t t_dtrace_astpc; /* DTrace return sequence location */ #ifdef __amd64 uint64_t t_dtrace_regv; /* DTrace saved reg from fasttrap */ + uint64_t t_useracc; /* SMAP state saved across swtch() */ #endif hrtime_t t_hrtime; /* high-res last time on cpu */ kmutex_t t_ctx_lock; /* protects t_ctx in removectx() */ diff --git a/usr/src/uts/common/syscall/lwp_sobj.c b/usr/src/uts/common/syscall/lwp_sobj.c index 3ac8504e6a..a87aa05cdd 100644 --- a/usr/src/uts/common/syscall/lwp_sobj.c +++ b/usr/src/uts/common/syscall/lwp_sobj.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1022,7 +1023,10 @@ out: } /* - * Set the owner and ownerpid fields of a user-level mutex. + * Set the owner and ownerpid fields of a user-level mutex. Note, this function + * uses the suword*_noerr routines which must be called between + * on_fault/no_fault. However, this routine itself does not do the + * on_fault/no_fault and it is assumed all the callers will do so instead! */ static void set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid) @@ -1201,9 +1205,21 @@ lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner) if (UPIMUTEX(type)) { no_fault(); error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt); - if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED) + if (error == 0 || error == EOWNERDEAD || + error == ELOCKUNMAPPED) { + volatile int locked = error != 0; + if (on_fault(&ljb)) { + if (locked != 0) + error = lwp_upimutex_unlock(lp, type); + else + error = EFAULT; + goto upierr; + } set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0); + no_fault(); + } +upierr: if (tsp && !time_error) /* copyout the residual time left */ error = lwp_timer_copyout(&lwpt, error); if (error) @@ -3045,9 +3061,22 @@ lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner) if (UPIMUTEX(type)) { no_fault(); error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL); - if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED) + if (error == 0 || error == EOWNERDEAD || + error == ELOCKUNMAPPED) { + volatile int locked = error != 0; + if (on_fault(&ljb)) { + if (locked != 0) + error = lwp_upimutex_unlock(lp, type); + else + error = EFAULT; + goto upierr; + } set_owner_pid(lp, owner, (type & USYNC_PROCESS)? 
p->p_pid : 0); + no_fault(); + } + +upierr: if (error) return (set_errno(error)); return (0); diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 50a27b3d30..721d32fa3a 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -124,6 +124,9 @@ _kthread THREAD_SIZE _tu._ts._t_post_sys T_POST_SYS _tu._t_post_sys_ast T_POST_SYS_AST t_copyops +#ifdef __amd64 + t_useracc +#endif ctxop save_op CTXOP_SAVE diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index 443689cec3..f26468c8cc 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -1177,12 +1177,14 @@ sys_syscall_int() ENTRY_NP(brand_sys_syscall_int) SWAPGS /* kernel gsbase */ XPV_TRAP_POP + call smap_enable BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK()) jmp nopop_syscall_int ALTENTRY(sys_syscall_int) SWAPGS /* kernel gsbase */ XPV_TRAP_POP + call smap_enable nopop_syscall_int: movq %gs:CPU_THREAD, %r15 diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 586ec0656b..44e475f328 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -170,7 +170,8 @@ static char *x86_feature_names[NUM_X86_FEATURES] = { "bmi1", "bmi2", "fma", - "smep" + "smep", + "smap" }; boolean_t @@ -223,6 +224,7 @@ print_x86_featureset(void *featureset) static size_t xsave_state_size = 0; uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); boolean_t xsave_force_disable = B_FALSE; +extern int disable_smap; /* * This is set to platform type we are running on. @@ -1249,6 +1251,19 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) add_x86_feature(featureset, X86FSET_SMEP); + + /* + * We check disable_smap here in addition to in startup_smap() + * to ensure CPUs that aren't the boot CPU don't accidentally + * include it in the feature set and thus generate a mismatched + * x86 feature set across CPUs. Note that at this time we only + * enable SMAP for the 64-bit kernel. + */ +#if defined(__amd64) + if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && + disable_smap == 0) + add_x86_feature(featureset, X86FSET_SMAP); +#endif } /* diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 8cb56d9682..045adbcb7b 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -269,7 +269,10 @@ mlsetup(struct regs *rp) (void) wrmsr(MSR_AMD_TSCAUX, 0); /* - * Let's get the other %cr4 stuff while we're here. + * Let's get the other %cr4 stuff while we're here. Note, we defer + * enabling CR4_SMAP until startup_end(); however, that's importantly + * before we start other CPUs. That ensures that it will be synced out + * to other CPUs. */ if (is_x86_feature(x86_featureset, X86FSET_DE)) setcr4(getcr4() | CR4_DE); diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index a068914259..3c7b453949 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -27,7 +27,7 @@ * All rights reserved. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -171,6 +171,7 @@ init_cpu_syscall(struct cpu *cp) #if defined(__amd64) if (is_x86_feature(x86_featureset, X86FSET_MSR) && is_x86_feature(x86_featureset, X86FSET_ASYSC)) { + uint64_t flags; #if !defined(__lint) /* @@ -199,7 +200,10 @@ init_cpu_syscall(struct cpu *cp) * This list of flags is masked off the incoming * %rfl when we enter the kernel. */ - wrmsr(MSR_AMD_SFMASK, (uint64_t)(uintptr_t)(PS_IE | PS_T)); + flags = PS_IE | PS_T; + if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_TRUE) + flags |= PS_ACHK; + wrmsr(MSR_AMD_SFMASK, flags); } #endif diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index f52707036c..0f16f3cc63 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -22,7 +22,7 @@ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* * Copyright (c) 2010, Intel Corporation. @@ -284,6 +284,12 @@ int segzio_fromheap = 1; #endif /* + * Give folks an escape hatch for disabling SMAP via kmdb. Doesn't work + * post-boot. + */ +int disable_smap = 0; + +/* * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this * depends on number of BOP_ALLOC calls made and requested size, memory size * combination and whether boot.bin memory needs to be freed. @@ -673,6 +679,60 @@ perform_allocations(void) } /* + * Set up and enable SMAP now before we start other CPUs, but after the kernel's + * VM has been set up so we can use hot_patch_kernel_text(). + * + * We can only patch 1, 2, or 4 bytes, but not three bytes. So instead, we + * replace the four byte word at the patch point. See uts/intel/ia32/ml/copy.s + * for more information on what's going on here. + */ +static void +startup_smap(void) +{ + int i; + uint32_t inst; + uint8_t *instp; + char sym[128]; + + extern int _smap_enable_patch_count; + extern int _smap_disable_patch_count; + + if (disable_smap != 0) + remove_x86_feature(x86_featureset, X86FSET_SMAP); + + if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_FALSE) + return; + + for (i = 0; i < _smap_enable_patch_count; i++) { + int sizep; + + VERIFY3U(i, <, _smap_enable_patch_count); + VERIFY(snprintf(sym, sizeof (sym), "_smap_enable_patch_%d", i) < + sizeof (sym)); + instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep); + VERIFY(instp != 0); + inst = (instp[3] << 24) | (SMAP_CLAC_INSTR & 0x00ffffff); + hot_patch_kernel_text((caddr_t)instp, inst, 4); + } + + for (i = 0; i < _smap_disable_patch_count; i++) { + int sizep; + + VERIFY(snprintf(sym, sizeof (sym), "_smap_disable_patch_%d", + i) < sizeof (sym)); + instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep); + VERIFY(instp != 0); + inst = (instp[3] << 24) | (SMAP_STAC_INSTR & 0x00ffffff); + hot_patch_kernel_text((caddr_t)instp, inst, 4); + } + + hot_patch_kernel_text((caddr_t)smap_enable, SMAP_CLAC_INSTR, 4); + hot_patch_kernel_text((caddr_t)smap_disable, SMAP_STAC_INSTR, 4); + setcr4(getcr4() | CR4_SMAP); + smap_enable(); +} + +/* * Our world looks like this at startup time. * * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data @@ -727,6 +787,7 @@ startup(void) * the irq routing table (used for pci labels). 
*/ startup_pci_bios(); + startup_smap(); #endif #if defined(__xpv) startup_xen_mca(); diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index 4184b116f5..9390690e95 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -638,6 +638,21 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) } /* + * We need to check if SMAP is in play. If SMAP is in play, then + * any access to a user page will show up as a protection + * violation. To see if SMAP is enabled we first check if it's a + * user address and whether we have the feature flag set. If we + * do and the interrupted registers do not allow for user + * accesses (PS_ACHK is not enabled), then we need to die + * immediately. + */ + if (addr < (caddr_t)kernelbase && + is_x86_feature(x86_featureset, X86FSET_SMAP) == B_TRUE && + (rp->r_ps & PS_ACHK) == 0) { + (void) die(type, rp, addr, cpuid); + } + + /* * See if we can handle as pagefault. Save lofault and onfault * across this. Here we assume that an address less than * KERNELBASE is a user fault. We can do this as copy.s diff --git a/usr/src/uts/intel/dtrace/dtrace_asm.s b/usr/src/uts/intel/dtrace/dtrace_asm.s index 3aad499599..cd2dc5c5bf 100644 --- a/usr/src/uts/intel/dtrace/dtrace_asm.s +++ b/usr/src/uts/intel/dtrace/dtrace_asm.s @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -195,12 +195,14 @@ dtrace_copy(uintptr_t src, uintptr_t dest, size_t size) ENTRY(dtrace_copy) pushq %rbp + call smap_disable movq %rsp, %rbp xchgq %rdi, %rsi /* make %rsi source, %rdi dest */ movq %rdx, %rcx /* load count */ repz /* repeat for count ... */ smovb /* move from %ds:rsi to %ed:rdi */ + call smap_enable leave ret SET_SIZE(dtrace_copy) @@ -244,7 +246,7 @@ dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size, ENTRY(dtrace_copystr) pushq %rbp movq %rsp, %rbp - + call smap_disable 0: movb (%rdi), %al /* load from source */ movb %al, (%rsi) /* store to destination */ @@ -261,6 +263,7 @@ dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size, cmpq $0, %rdx jne 0b 2: + call smap_enable leave ret @@ -273,7 +276,7 @@ dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size, pushl %ebp / Setup stack frame movl %esp, %ebp pushl %ebx / Save registers - + movl 8(%ebp), %ebx / Load source address movl 12(%ebp), %edx / Load destination address movl 16(%ebp), %ecx / Load count @@ -317,7 +320,9 @@ dtrace_fulword(void *addr) #if defined(__amd64) ENTRY(dtrace_fulword) + call smap_disable movq (%rdi), %rax + call smap_enable ret SET_SIZE(dtrace_fulword) @@ -344,8 +349,10 @@ dtrace_fuword8_nocheck(void *addr) #if defined(__amd64) ENTRY(dtrace_fuword8_nocheck) + call smap_disable xorq %rax, %rax movb (%rdi), %al + call smap_enable ret SET_SIZE(dtrace_fuword8_nocheck) @@ -372,8 +379,10 @@ dtrace_fuword16_nocheck(void *addr) #if defined(__amd64) ENTRY(dtrace_fuword16_nocheck) + call smap_disable xorq %rax, %rax movw (%rdi), %ax + call smap_enable ret SET_SIZE(dtrace_fuword16_nocheck) @@ -400,8 +409,10 @@ dtrace_fuword32_nocheck(void *addr) #if defined(__amd64) ENTRY(dtrace_fuword32_nocheck) + call smap_disable xorq %rax, %rax movl (%rdi), %eax + call smap_enable ret SET_SIZE(dtrace_fuword32_nocheck) @@ -428,7 +439,9 @@ dtrace_fuword64_nocheck(void *addr) #if defined(__amd64) ENTRY(dtrace_fuword64_nocheck) + call smap_disable movq (%rdi), %rax + call smap_enable ret SET_SIZE(dtrace_fuword64_nocheck) diff --git 
a/usr/src/uts/intel/ia32/ml/copy.s b/usr/src/uts/intel/ia32/ml/copy.s index 3b4eef261d..35414743a9 100644 --- a/usr/src/uts/intel/ia32/ml/copy.s +++ b/usr/src/uts/intel/ia32/ml/copy.s @@ -35,6 +35,10 @@ /* Copyright (c) 1987, 1988 Microsoft Corporation */ /* All Rights Reserved */ +/* + * Copyright 2015 Joyent, Inc. + */ + #include <sys/errno.h> #include <sys/asm_linkage.h> @@ -56,6 +60,69 @@ #define COUNT_ALIGN_MASK _CONST(COUNT_ALIGN_SIZE-1) /* + * With the introduction of Broadwell, Intel has introduced supervisor mode + * access protection -- SMAP. SMAP forces the kernel to set certain bits to + * enable access of user pages (AC in rflags, defines as PS_ACHK in + * <sys/psw.h>). One of the challenges is that the implementation of many of the + * userland copy routines directly use the kernel ones. For example, copyin and + * copyout simply go and jump to the do_copy_fault label and traditionally let + * those deal with the return for them. In fact, changing that is a can of frame + * pointers. + * + * Rules and Constraints: + * + * 1. For anything that's not in copy.s, we have it do explicit calls to the + * smap related code. It usually is in a position where it is able to. This is + * restricted to the following three places: DTrace, resume() in swtch.s and + * on_fault/no_fault. If you want to add it somewhere else, we should be + * thinking twice. + * + * 2. We try to toggle this at the smallest window possible. This means that if + * we take a fault, need to try to use a copyop in copyin() or copyout(), or any + * other function, we will always leave with SMAP enabled (the kernel cannot + * access user pages). + * + * 3. None of the *_noerr() or ucopy/uzero routines should toggle SMAP. They are + * explicitly only allowed to be called while in an on_fault()/no_fault() handler, + * which already takes care of ensuring that SMAP is enabled and disabled. Note + * this means that when under an on_fault()/no_fault() handler, one must not + * call the non-*_noeer() routines. + * + * 4. The first thing we should do after coming out of an lofault handler is to + * make sure that we call smap_enable again to ensure that we are safely + * protected, as more often than not, we will have disabled smap to get there. + * + * 5. The SMAP functions, smap_enable and smap_disable may not touch any + * registers beyond those done by the call and ret. These routines may be called + * from arbitrary contexts in copy.s where we have slightly more special ABIs in + * place. + * + * 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and + * SMAP_DISABLE_INSTR macro should be used (except for smap_enable() and + * smap_disable()). If the number of these is changed, you must update the + * constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below. + * + * 7. Note, at this time SMAP is not implemented for the 32-bit kernel. There is + * no known technical reason preventing it from being enabled. + * + * 8. Generally this .s file is processed by a K&R style cpp. This means that it + * really has a lot of feelings about whitespace. In particular, if you have a + * macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'. + * + * 9. The smap_enable and smap_disable functions should not generally be called. + * They exist such that DTrace and on_trap() may use them, that's it. + * + * 10. In general, the kernel has its own value for rflags that gets used. 
This + * is maintained in a few different places which vary based on how the thread + * comes into existence and whether it's a user thread. In general, when the + * kernel takes a trap, it always will set ourselves to a known set of flags, + * mainly as part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that + * PS_ACHK is cleared for us. In addition, when using the sysenter instruction, + * we mask off PS_ACHK off via the AMD_SFMASK MSR. See init_cpu_syscall() for + * where that gets masked off. + */ + +/* * The optimal 64-bit bcopy and kcopy for modern x86 processors uses * "rep smovq" for large sizes. Performance data shows that many calls to * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance for @@ -72,6 +139,28 @@ * Returns errno value on pagefault error, 0 if all ok */ +/* + * I'm sorry about these macros, but copy.s is unsurprisingly sensitive to + * additional call instructions. + */ +#if defined(__amd64) +#define SMAP_DISABLE_COUNT 16 +#define SMAP_ENABLE_COUNT 26 +#elif defined(__i386) +#define SMAP_DISABLE_COUNT 0 +#define SMAP_ENABLE_COUNT 0 +#endif + +#define SMAP_DISABLE_INSTR(ITER) \ + .globl _smap_disable_patch_/**/ITER; \ + _smap_disable_patch_/**/ITER/**/:; \ + nop; nop; nop; + +#define SMAP_ENABLE_INSTR(ITER) \ + .globl _smap_enable_patch_/**/ITER; \ + _smap_enable_patch_/**/ITER/**/:; \ + nop; nop; nop; + #if defined(__lint) /* ARGSUSED */ @@ -110,6 +199,7 @@ do_copy_fault: movq %rcx, T_LOFAULT(%r9) /* new lofault */ call bcopy_altentry xorl %eax, %eax /* return 0 (success) */ + SMAP_ENABLE_INSTR(0) /* * A fault during do_copy_fault is indicated through an errno value @@ -268,6 +358,7 @@ kcopy_nta(const void *from, void *to, size_t count, int copy_cached) mfence xorl %eax, %eax /* return 0 (success) */ + SMAP_ENABLE_INSTR(1) _kcopy_nta_copyerr: movq %r11, T_LOFAULT(%r9) /* restore original lofault */ @@ -1466,10 +1557,12 @@ copyin(const void *uaddr, void *kaddr, size_t count) movq %gs:CPU_THREAD, %r9 cmpq %rax, %rdi /* test uaddr < kernelbase */ - jb do_copy_fault - jmp 3f + jae 3f /* take copyop if uaddr > kernelbase */ + SMAP_DISABLE_INSTR(0) + jmp do_copy_fault /* Takes care of leave for us */ _copyin_err: + SMAP_ENABLE_INSTR(2) movq %r11, T_LOFAULT(%r9) /* restore original lofault */ addq $8, %rsp /* pop bcopy_altentry call ret addr */ 3: @@ -1577,24 +1670,29 @@ xcopyin_nta(const void *uaddr, void *kaddr, size_t count, int copy_cached) * pass lofault value as 4th argument to do_copy_fault */ leaq _xcopyin_err(%rip), %rcx /* doesn't set rflags */ - jnz do_copy_fault /* use regular access */ + jnz 6f /* use regular access */ /* * Make sure cnt is >= XCOPY_MIN_SIZE bytes */ cmpq $XCOPY_MIN_SIZE, %rdx - jb do_copy_fault + jae 5f +6: + SMAP_DISABLE_INSTR(1) + jmp do_copy_fault /* * Make sure src and dst are NTA_ALIGN_SIZE aligned, * count is COUNT_ALIGN_SIZE aligned. 
*/ +5: movq %rdi, %r10 orq %rsi, %r10 andq $NTA_ALIGN_MASK, %r10 orq %rdx, %r10 andq $COUNT_ALIGN_MASK, %r10 - jnz do_copy_fault + jnz 6b leaq _xcopyin_nta_err(%rip), %rcx /* doesn't set rflags */ + SMAP_DISABLE_INSTR(2) jmp do_copy_fault_nta /* use non-temporal access */ 4: @@ -1609,6 +1707,7 @@ xcopyin_nta(const void *uaddr, void *kaddr, size_t count, int copy_cached) _xcopyin_err: addq $8, %rsp /* pop bcopy_altentry call ret addr */ _xcopyin_nta_err: + SMAP_ENABLE_INSTR(3) movq %r11, T_LOFAULT(%r9) /* restore original lofault */ 3: movq T_COPYOPS(%r9), %r8 @@ -1745,10 +1844,12 @@ copyout(const void *kaddr, void *uaddr, size_t count) movq %gs:CPU_THREAD, %r9 cmpq %rax, %rsi /* test uaddr < kernelbase */ - jb do_copy_fault - jmp 3f + jae 3f /* take copyop if uaddr > kernelbase */ + SMAP_DISABLE_INSTR(3) + jmp do_copy_fault /* Calls leave for us */ _copyout_err: + SMAP_ENABLE_INSTR(4) movq %r11, T_LOFAULT(%r9) /* restore original lofault */ addq $8, %rsp /* pop bcopy_altentry call ret addr */ 3: @@ -1855,25 +1956,32 @@ xcopyout_nta(const void *kaddr, void *uaddr, size_t count, int copy_cached) * pass lofault value as 4th argument to do_copy_fault */ leaq _xcopyout_err(%rip), %rcx - jnz do_copy_fault + jnz 6f /* * Make sure cnt is >= XCOPY_MIN_SIZE bytes */ cmpq $XCOPY_MIN_SIZE, %rdx - jb do_copy_fault + jae 5f +6: + SMAP_DISABLE_INSTR(4) + jmp do_copy_fault /* * Make sure src and dst are NTA_ALIGN_SIZE aligned, * count is COUNT_ALIGN_SIZE aligned. */ +5: movq %rdi, %r10 orq %rsi, %r10 andq $NTA_ALIGN_MASK, %r10 orq %rdx, %r10 andq $COUNT_ALIGN_MASK, %r10 - jnz do_copy_fault + jnz 6b leaq _xcopyout_nta_err(%rip), %rcx - jmp do_copy_fault_nta + SMAP_DISABLE_INSTR(5) + call do_copy_fault_nta + SMAP_ENABLE_INSTR(5) + ret 4: movl $EFAULT, %eax @@ -1887,6 +1995,7 @@ xcopyout_nta(const void *kaddr, void *uaddr, size_t count, int copy_cached) _xcopyout_err: addq $8, %rsp /* pop bcopy_altentry call ret addr */ _xcopyout_nta_err: + SMAP_ENABLE_INSTR(6) movq %r11, T_LOFAULT(%r9) /* restore original lofault */ 3: movq T_COPYOPS(%r9), %r8 @@ -2011,6 +2120,8 @@ copystr(const char *from, char *to, size_t maxlength, size_t *lencopied) movq %gs:CPU_THREAD, %r9 movq T_LOFAULT(%r9), %r8 /* pass current lofault value as */ /* 5th argument to do_copystr */ + xorl %r10d,%r10d /* pass smap restore need in %r10d */ + /* as a non-ABI 6th arg */ do_copystr: movq %gs:CPU_THREAD, %r9 /* %r9 = thread addr */ movq T_LOFAULT(%r9), %r11 /* save the current lofault */ @@ -2041,10 +2152,15 @@ copystr_null: copystr_out: cmpq $0, %rcx /* want length? 
*/ - je copystr_done /* no */ + je copystr_smap /* no */ subq %r8, %rdx /* compute length and store it */ movq %rdx, (%rcx) +copystr_smap: + cmpl $0, %r10d + jz copystr_done + SMAP_ENABLE_INSTR(7) + copystr_done: movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ leave @@ -2178,15 +2294,21 @@ copyinstr(const char *uaddr, char *kaddr, size_t maxlength, #endif /* * pass lofault value as 5th argument to do_copystr + * do_copystr expects whether or not we need smap in %r10d */ leaq _copyinstr_error(%rip), %r8 + movl $1, %r10d cmpq %rax, %rdi /* test uaddr < kernelbase */ - jb do_copystr + jae 4f + SMAP_DISABLE_INSTR(6) + jmp do_copystr +4: movq %gs:CPU_THREAD, %r9 jmp 3f _copyinstr_error: + SMAP_ENABLE_INSTR(8) movq %r11, T_LOFAULT(%r9) /* restore original lofault */ 3: movq T_COPYOPS(%r9), %rax @@ -2294,15 +2416,21 @@ copyoutstr(const char *kaddr, char *uaddr, size_t maxlength, #endif /* * pass lofault value as 5th argument to do_copystr + * pass one as 6th argument to do_copystr in %r10d */ leaq _copyoutstr_error(%rip), %r8 + movl $1, %r10d cmpq %rax, %rsi /* test uaddr < kernelbase */ - jb do_copystr + jae 4f + SMAP_DISABLE_INSTR(7) + jmp do_copystr +4: movq %gs:CPU_THREAD, %r9 jmp 3f _copyoutstr_error: + SMAP_ENABLE_INSTR(9) movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ 3: movq T_COPYOPS(%r9), %rax @@ -2406,23 +2534,28 @@ fuword8(const void *addr, uint8_t *dst) #if defined(__amd64) /* - * (Note that we don't save and reload the arguments here - * because their values are not altered in the copy path) + * Note that we don't save and reload the arguments here + * because their values are not altered in the copy path. + * Additionally, when successful, the smap_enable jmp will + * actually return us to our original caller. */ -#define FUWORD(NAME, INSTR, REG, COPYOP) \ +#define FUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2) \ ENTRY(NAME) \ movq %gs:CPU_THREAD, %r9; \ cmpq kernelbase(%rip), %rdi; \ jae 1f; \ leaq _flt_/**/NAME, %rdx; \ movq %rdx, T_LOFAULT(%r9); \ + SMAP_DISABLE_INSTR(DISNUM) \ INSTR (%rdi), REG; \ movq $0, T_LOFAULT(%r9); \ INSTR REG, (%rsi); \ xorl %eax, %eax; \ + SMAP_ENABLE_INSTR(EN1) \ ret; \ _flt_/**/NAME: \ + SMAP_ENABLE_INSTR(EN2) \ movq $0, T_LOFAULT(%r9); \ 1: \ movq T_COPYOPS(%r9), %rax; \ @@ -2434,10 +2567,10 @@ _flt_/**/NAME: \ ret; \ SET_SIZE(NAME) - FUWORD(fuword64, movq, %rax, CP_FUWORD64) - FUWORD(fuword32, movl, %eax, CP_FUWORD32) - FUWORD(fuword16, movw, %ax, CP_FUWORD16) - FUWORD(fuword8, movb, %al, CP_FUWORD8) + FUWORD(fuword64, movq, %rax, CP_FUWORD64,8,10,11) + FUWORD(fuword32, movl, %eax, CP_FUWORD32,9,12,13) + FUWORD(fuword16, movw, %ax, CP_FUWORD16,10,14,15) + FUWORD(fuword8, movb, %al, CP_FUWORD8,11,16,17) #elif defined(__i386) @@ -2513,22 +2646,25 @@ suword8(void *addr, uint8_t value) #if defined(__amd64) /* - * (Note that we don't save and reload the arguments here - * because their values are not altered in the copy path) + * Note that we don't save and reload the arguments here + * because their values are not altered in the copy path. 
*/ -#define SUWORD(NAME, INSTR, REG, COPYOP) \ +#define SUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2) \ ENTRY(NAME) \ movq %gs:CPU_THREAD, %r9; \ cmpq kernelbase(%rip), %rdi; \ jae 1f; \ leaq _flt_/**/NAME, %rdx; \ + SMAP_DISABLE_INSTR(DISNUM) \ movq %rdx, T_LOFAULT(%r9); \ INSTR REG, (%rdi); \ movq $0, T_LOFAULT(%r9); \ xorl %eax, %eax; \ + SMAP_ENABLE_INSTR(EN1) \ ret; \ _flt_/**/NAME: \ + SMAP_ENABLE_INSTR(EN2) \ movq $0, T_LOFAULT(%r9); \ 1: \ movq T_COPYOPS(%r9), %rax; \ @@ -2540,10 +2676,10 @@ _flt_/**/NAME: \ ret; \ SET_SIZE(NAME) - SUWORD(suword64, movq, %rsi, CP_SUWORD64) - SUWORD(suword32, movl, %esi, CP_SUWORD32) - SUWORD(suword16, movw, %si, CP_SUWORD16) - SUWORD(suword8, movb, %sil, CP_SUWORD8) + SUWORD(suword64, movq, %rsi, CP_SUWORD64,12,18,19) + SUWORD(suword32, movl, %esi, CP_SUWORD32,13,20,21) + SUWORD(suword16, movw, %si, CP_SUWORD16,14,22,23) + SUWORD(suword8, movb, %sil, CP_SUWORD8,15,24,25) #elif defined(__i386) @@ -2880,6 +3016,10 @@ ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied) jmp do_copy SET_SIZE(ucopy) + /* + * Note, the frame pointer is required here becuase do_copystr expects + * to be able to pop it off! + */ ENTRY(ucopystr) pushq %rbp movq %rsp, %rbp @@ -2889,6 +3029,8 @@ ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied) cmpq %rax, %rsi cmovaeq %rax, %rsi /* force fault at kernelbase */ /* do_copystr expects lofault address in %r8 */ + /* do_copystr expects whether or not we need smap in %r10 */ + xorl %r10d, %r10d movq %gs:CPU_THREAD, %r8 movq T_LOFAULT(%r8), %r8 jmp do_copystr @@ -2995,3 +3137,62 @@ ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied) #endif #endif /* __lint */ + +/* + * These functions are used for SMAP, supervisor mode access protection. They + * are hotpatched to become real instructions when the system starts up which is + * done in mlsetup() as a part of enabling the other CR4 related features. + * + * Generally speaking, smap_disable() is a stac instruction and smap_enable is a + * clac instruction. It's safe to call these any number of times, and in fact, + * out of paranoia, the kernel will likely call it at several points. + */ + +#if defined(__lint) + +void +smap_enable(void) +{} + +void +smap_disable(void) +{} + +#else + +#if defined (__amd64) || defined(__i386) + ENTRY(smap_disable) + nop + nop + nop + ret + SET_SIZE(smap_disable) + + ENTRY(smap_enable) + nop + nop + nop + ret + SET_SIZE(smap_enable) + +#endif /* __amd64 || __i386 */ + +#endif /* __lint */ + +#ifndef __lint + +.data +.align 4 +.globl _smap_enable_patch_count +.type _smap_enable_patch_count,@object +.size _smap_enable_patch_count, 4 +_smap_enable_patch_count: + .long SMAP_ENABLE_COUNT + +.globl _smap_disable_patch_count +.type _smap_disable_patch_count,@object +.size _smap_disable_patch_count, 4 +_smap_disable_patch_count: + .long SMAP_DISABLE_COUNT + +#endif /* __lint */ diff --git a/usr/src/uts/intel/ia32/ml/i86_subr.s b/usr/src/uts/intel/ia32/ml/i86_subr.s index 23b20ebbde..e9d0b8128f 100644 --- a/usr/src/uts/intel/ia32/ml/i86_subr.s +++ b/usr/src/uts/intel/ia32/ml/i86_subr.s @@ -71,9 +71,12 @@ /* * on_fault() + * * Catch lofault faults. Like setjmp except it returns one * if code following causes uncorrectable fault. Turned off - * by calling no_fault(). + * by calling no_fault(). Note that while under on_fault(), + * SMAP is disabled. For more information see + * uts/intel/ia32/ml/copy.s. 
*/ #if defined(__lint) @@ -96,6 +99,7 @@ no_fault(void) leaq catch_fault(%rip), %rdx movq %rdi, T_ONFAULT(%rsi) /* jumpbuf in t_onfault */ movq %rdx, T_LOFAULT(%rsi) /* catch_fault in t_lofault */ + call smap_disable /* allow user accesses */ jmp setjmp /* let setjmp do the rest */ catch_fault: @@ -104,6 +108,7 @@ catch_fault: xorl %eax, %eax movq %rax, T_ONFAULT(%rsi) /* turn off onfault */ movq %rax, T_LOFAULT(%rsi) /* turn off lofault */ + call smap_enable /* disallow user accesses */ jmp longjmp /* let longjmp do the rest */ SET_SIZE(on_fault) @@ -112,6 +117,7 @@ catch_fault: xorl %eax, %eax movq %rax, T_ONFAULT(%rsi) /* turn off onfault */ movq %rax, T_LOFAULT(%rsi) /* turn off lofault */ + call smap_enable /* disallow user accesses */ ret SET_SIZE(no_fault) diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s index 331c38d00e..67ba255cbc 100644 --- a/usr/src/uts/intel/ia32/ml/swtch.s +++ b/usr/src/uts/intel/ia32/ml/swtch.s @@ -45,6 +45,7 @@ #include <sys/privregs.h> #include <sys/stack.h> #include <sys/segments.h> +#include <sys/psw.h> /* * resume(thread_id_t t); @@ -239,6 +240,29 @@ resume(kthread_t *t) leaq resume_return(%rip), %r11 /* + * Deal with SMAP here. A thread may be switched out at any point while + * it is executing. The thread could be under on_fault() or it could be + * pre-empted while performing a copy interruption. If this happens and + * we're not in the context of an interrupt which happens to handle + * saving and restoring rflags correctly, we may lose our SMAP related + * state. + * + * To handle this, as part of being switched out, we first save whether + * or not userland access is allowed ($PS_ACHK in rflags) and store that + * in t_useracc on the kthread_t and unconditionally enable SMAP to + * protect the system. + * + * Later, when the thread finishes resuming, we potentially disable smap + * if PS_ACHK was present in rflags. See uts/intel/ia32/ml/copy.s for + * more information on rflags and SMAP. + */ + pushfq + popq %rsi + andq $PS_ACHK, %rsi + movq %rsi, T_USERACC(%rax) + call smap_enable + + /* * Save non-volatile registers, and set return address for current * thread to resume_return. * @@ -246,6 +270,7 @@ resume(kthread_t *t) */ SAVE_REGS(%rax, %r11) + LOADCPU(%r15) /* %r15 = CPU */ movq CPU_THREAD(%r15), %r13 /* %r13 = curthread */ @@ -387,6 +412,19 @@ resume(kthread_t *t) STORE_INTR_START(%r12) /* + * If we came into swtch with the ability to access userland pages, go + * ahead and restore that fact by disabling SMAP. Clear the indicator + * flag out of paranoia. + */ + movq T_USERACC(%r12), %rax /* should we disable smap? */ + cmpq $0, %rax /* skip call when zero */ + jz .nosmap + xorq %rax, %rax + movq %rax, T_USERACC(%r12) + call smap_disable +.nosmap: + + /* * Restore non-volatile registers, then have spl0 return to the * resuming thread's PC after first setting the priority as low as * possible and blocking all interrupt threads that may be active. 
diff --git a/usr/src/uts/intel/ia32/os/sendsig.c b/usr/src/uts/intel/ia32/os/sendsig.c index 979c9e3294..b7b79f38ca 100644 --- a/usr/src/uts/intel/ia32/os/sendsig.c +++ b/usr/src/uts/intel/ia32/os/sendsig.c @@ -288,7 +288,10 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) */ uc = (ucontext_t *)(sp + sizeof (struct sigframe)); tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); + no_fault(); savecontext(tuc, &lwp->lwp_sigoldmask); + if (on_fault(&ljb)) + goto badstack; copyout_noerr(tuc, uc, sizeof (*tuc)); kmem_free(tuc, sizeof (*tuc)); tuc = NULL; @@ -506,7 +509,10 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) fp -= SA32(sizeof (*tuc)); uc = (ucontext32_t *)fp; tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); + no_fault(); savecontext32(tuc, &lwp->lwp_sigoldmask); + if (on_fault(&ljb)) + goto badstack; copyout_noerr(tuc, uc, sizeof (*tuc)); kmem_free(tuc, sizeof (*tuc)); tuc = NULL; diff --git a/usr/src/uts/intel/ia32/sys/psw.h b/usr/src/uts/intel/ia32/sys/psw.h index 5697323d40..7c63813929 100644 --- a/usr/src/uts/intel/ia32/sys/psw.h +++ b/usr/src/uts/intel/ia32/sys/psw.h @@ -30,8 +30,6 @@ #ifndef _IA32_SYS_PSW_H #define _IA32_SYS_PSW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -98,6 +96,10 @@ typedef struct flags { /* * kernel flags settings + * + * Note that the kernel's SMAP protection relies on PS_ACHK not being present in + * the following two definitions. See uts/intel/ia32/ml/copy.s for more + * information on SMAP. */ #define F_OFF 0x2 /* interrupts off */ #define F_ON 0x202 /* interrupts on */ diff --git a/usr/src/uts/intel/sys/archsystm.h b/usr/src/uts/intel/sys/archsystm.h index a39221b353..e06e79de97 100644 --- a/usr/src/uts/intel/sys/archsystm.h +++ b/usr/src/uts/intel/sys/archsystm.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_ARCHSYSTM_H @@ -181,6 +182,17 @@ extern void fakesoftint(void); extern void *plat_traceback(void *); +/* + * The following two macros are the four byte instruction sequence of stac, ret + * and clac, ret. These are used in startup_smap() as a part of properly setting + * up the valid instructions. For more information on SMAP, see + * uts/intel/ia32/ml/copy.s. 
+ */ +#define SMAP_CLAC_INSTR 0xc3ca010f +#define SMAP_STAC_INSTR 0xc3cb010f +extern void smap_disable(void); +extern void smap_enable(void); + #if defined(__xpv) extern void xen_init_callbacks(void); extern void xen_set_callback(void (*)(void), uint_t, uint_t); diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h index 98615937b2..41a4a72191 100644 --- a/usr/src/uts/intel/sys/controlregs.h +++ b/usr/src/uts/intel/sys/controlregs.h @@ -110,9 +110,11 @@ extern "C" { #define CR4_SMXE 0x4000 #define CR4_OSXSAVE 0x40000 /* OS xsave/xrestore support */ #define CR4_SMEP 0x100000 /* NX for user pages in kernel */ +#define CR4_SMAP 0x200000 /* kernel can't access user pages */ -#define FMT_CR4 \ - "\20\25smep\23osxsav\17smxe\16vmxe\13xmme\12fxsr\11pce\10pge" \ +#define FMT_CR4 \ + "\20\26smap\25smep\23osxsav" \ + "\17smxe\16vmxe\13xmme\12fxsr\11pce\10pge" \ "\7mce\6pae\5pse\4de\3tsd\2pvi\1vme" /* diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 0e1d45fc77..4aedd08a53 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -195,6 +195,7 @@ extern "C" { #define CPUID_INTC_EBX_7_0_AVX2 0x00000020 /* AVX2 supported */ #define CPUID_INTC_EBX_7_0_SMEP 0x00000080 /* SMEP in CR4 */ #define CPUID_INTC_EBX_7_0_BMI2 0x00000100 /* BMI2 Instrs */ +#define CPUID_INTC_EBX_7_0_SMAP 0x00100000 /* SMAP in CR 4 */ #define P5_MCHADDR 0x0 #define P5_CESR 0x11 @@ -370,6 +371,7 @@ extern "C" { #define X86FSET_BMI2 43 #define X86FSET_FMA 44 #define X86FSET_SMEP 45 +#define X86FSET_SMAP 46 /* * flags to patch tsc_read routine. @@ -630,7 +632,7 @@ extern "C" { #if defined(_KERNEL) || defined(_KMEMUSER) -#define NUM_X86_FEATURES 46 +#define NUM_X86_FEATURES 47 extern uchar_t x86_featureset[]; extern void free_x86_featureset(void *featureset); |
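
The SMAP_CLAC_INSTR (0xc3ca010f) and SMAP_STAC_INSTR (0xc3cb010f) values in
archsystm.h are easiest to read as little-endian byte sequences: stored in
memory they become 0f 01 ca c3 ("clac; ret") and 0f 01 cb c3 ("stac; ret").
startup_smap() uses them two ways: whole, to overwrite the four-byte bodies of
smap_enable() and smap_disable(), and masked, so that a three-byte nop sled
inside copy.s keeps whatever instruction byte already follows it. The helper
below is an illustrative restatement of that masking, not code from the
commit.

	#include <stdint.h>

	#define	SMAP_CLAC_INSTR	0xc3ca010fu	/* bytes 0f 01 ca c3: clac; ret */
	#define	SMAP_STAC_INSTR	0xc3cb010fu	/* bytes 0f 01 cb c3: stac; ret */

	/*
	 * Build the 32-bit word that hot_patch_kernel_text() stores over a
	 * "nop; nop; nop" SMAP patch point: the low three bytes carry the
	 * clac or stac opcode from the template, while the high byte keeps
	 * instp[3], the byte that already follows the sled.
	 */
	static uint32_t
	smap_patch_word(const uint8_t *instp, uint32_t tmpl)
	{
		return (((uint32_t)instp[3] << 24) | (tmpl & 0x00ffffffu));
	}

The four-byte dance exists because, as the startup_smap() comment notes,
hot_patch_kernel_text() can only patch 1, 2, or 4 bytes at a time, while clac
and stac are three-byte instructions.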
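
Finally, the swtch.s changes are easier to follow as pseudo-C. The sketch
below approximates the new resume() logic and is not code from the commit:
getflags() stands in for the pushfq/popq sequence the assembly actually uses,
and the includes assume the post-commit <sys/thread.h> (t_useracc) and
<sys/psw.h> (PS_ACHK) definitions.

	#include <sys/thread.h>		/* kthread_t, t_useracc (post-commit) */
	#include <sys/psw.h>		/* PS_ACHK */

	extern uint64_t getflags(void);	/* stand-in for reading rflags */
	extern void smap_enable(void);	/* hot-patched to clac */
	extern void smap_disable(void);	/* hot-patched to stac */

	/* Switching out: remember whether user access was open, then close it. */
	static void
	resume_save_smap(kthread_t *t)
	{
		t->t_useracc = getflags() & PS_ACHK;
		smap_enable();
	}

	/* Switching back in: reopen the window only if it was open before. */
	static void
	resume_restore_smap(kthread_t *t)
	{
		if (t->t_useracc != 0) {
			t->t_useracc = 0;
			smap_disable();
		}
	}

Per the comment the commit adds to resume(), this is what lets a thread that
is preempted inside an on_fault() region or mid-copy keep its SMAP state: the
AC bit it had established is parked in t_useracc instead of being lost with
the interrupted rflags.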