From d02748937355fe239b4e1f4a7b927795d98d889d Mon Sep 17 00:00:00 2001 From: Jerry Jelinek Date: Tue, 14 Oct 2014 12:24:58 +0000 Subject: OS-3415 lxbrand mishandles amd64 red zone, original %r15 value clobbered during uucopy --- usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s | 49 ++++++++++++++--------- usr/src/uts/intel/brand/lx/lx_brand_asm.s | 51 ++++++++++++++---------- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s index b808c398a2..f449f47be3 100644 --- a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s +++ b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s @@ -37,12 +37,12 @@ #define JMP \ pushq $_CONST(. - lx_handler_table); \ jmp lx_handler; \ - .align 16; + .align 16; #define JMP4 JMP; JMP; JMP; JMP -#define JMP16 JMP4; JMP4; JMP4; JMP4 -#define JMP64 JMP16; JMP16; JMP16; JMP16 -#define JMP256 JMP64; JMP64; JMP64; JMP64 +#define JMP16 JMP4; JMP4; JMP4; JMP4 +#define JMP64 JMP16; JMP16; JMP16; JMP16 +#define JMP256 JMP64; JMP64; JMP64; JMP64 /* * Alternate jump table that turns on lx_traceflag before proceeding with @@ -51,14 +51,14 @@ #define TJMP \ pushq $_CONST(. - lx_handler_trace_table); \ jmp lx_handler_trace; \ - .align 16; + .align 16; #define TJMP4 TJMP; TJMP; TJMP; TJMP -#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4 -#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16 -#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64 +#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4 +#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16 +#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64 + - #if defined(lint) #include @@ -126,10 +126,12 @@ lx_sigreturn_tolibc(uintptr_t sp) SET_SIZE(lx_handler_table) ENTRY_NP(lx_handler_trace) + subq $128, %rsp /* skip red zone */ pushq %rsi - movq lx_traceflag@GOTPCREL(%rip), %rsi + movq lx_traceflag@GOTPCREL(%rip), %rsi movq $1, (%rsi) popq %rsi + addq $128, %rsp /* * While we could just fall through to lx_handler(), we "tail-call" it * instead to make ourselves a little more comprehensible to trace @@ -139,6 +141,13 @@ lx_sigreturn_tolibc(uintptr_t sp) SET_SIZE(lx_handler_trace) ALTENTRY(lx_handler) + /* + * We are running on the Linux process's stack here so we have to + * account for the AMD64 ABI red zone of 128 bytes past the %rsp which + * the process can use as scratch space. + */ + subq $128, %rsp + /* * %rbp isn't always going to be a frame pointer on Linux, but when * it is, saving it here lets us have a coherent stack backtrace. @@ -154,13 +163,13 @@ lx_sigreturn_tolibc(uintptr_t sp) * Save %rbp and then fill it with what would be its usual value as * the frame pointer. The value we save for %rsp needs to be the * stack pointer at the time of the syscall so we need to skip the - * saved %rbp and (what will be) the return address. + * red zone, saved %rbp and (what will be) the return address. */ movq %rbp, LXR_RBP(%rsp) movq %rsp, %rbp - addq $_CONST(SIZEOF_LX_REGS_T), %rbp + addq $SIZEOF_LX_REGS_T, %rbp movq %rbp, LXR_RSP(%rsp) - addq $_CONST(_MUL(CPTRSIZE, 2)), LXR_RSP(%rsp) + addq $144, LXR_RSP(%rsp) /* 128 byte red zone + 2 pointers */ movq $0, LXR_GS(%rsp) movw %gs, LXR_GS(%rsp) @@ -187,7 +196,7 @@ lx_sigreturn_tolibc(uintptr_t sp) * value on the stack with the return address, and use the value to * compute the system call number by dividing by the table entry size. */ - xchgq CPTRSIZE(%rbp), %rax + xchgq 136(%rbp), %rax /* 128 byte red zone + rbp we pushed */ shrq $4, %rax movq %rax, LXR_RAX(%rsp) @@ -225,10 +234,12 @@ lx_sigreturn_tolibc(uintptr_t sp) movq LXR_R15(%rsp), %r15 movw LXR_GS(%rsp), %gs - addq $SIZEOF_LX_REGS_T, %rsp + /* addq $SIZEOF_LX_REGS_T, %rsp not needed due to next instr. */ movq %rbp, %rsp popq %rbp + + addq $128, %rsp /* red zone */ ret SET_SIZE(lx_handler) @@ -236,7 +247,7 @@ lx_sigreturn_tolibc(uintptr_t sp) * lx_setup_clone(uintptr_t %gs, void *retaddr, void *stack) * ignore arg0 (%rdi) on 64-bit * Return to Linux app using arg1 (%rsi) with the Linux stack we got - * in arg2 (%rdx). + * in arg2 (%rdx). */ ENTRY_NP(lx_setup_clone) xorq %rbp, %rbp /* terminating stack */ @@ -281,7 +292,7 @@ lx_sigreturn_tolibc(uintptr_t sp) * Unlike the 32-bit case, we don't reset %rbp before jumping into the * Linux handler, since that would mean the handler would clobber our * data in the stack frame it builds. - * + * */ ENTRY_NP(lx_sigdeliver) pushq %rbp @@ -320,7 +331,7 @@ lx_sigreturn_tolibc(uintptr_t sp) * arg1 %rsi is ptr to converted siginfo on stack or NULL */ movq -16(%rbp), %rsi - cmp $0, %rsi + cmp $0, %rsi je 1f movq 8(%rsp), %rsi 1: @@ -352,7 +363,7 @@ lx_sigreturn_tolibc(uintptr_t sp) * lx_sigacthandler(int sig, siginfo_t *s, void *p) */ ENTRY_NP(lx_sigacthandler) - movq libc_sigacthandler@GOTPCREL(%rip), %rax + movq libc_sigacthandler@GOTPCREL(%rip), %rax jmp *(%rax) /* jmp to libc's interposer */ SET_SIZE(lx_sigacthandler) diff --git a/usr/src/uts/intel/brand/lx/lx_brand_asm.s b/usr/src/uts/intel/brand/lx/lx_brand_asm.s index fdd80e1ad2..897f3891fc 100644 --- a/usr/src/uts/intel/brand/lx/lx_brand_asm.s +++ b/usr/src/uts/intel/brand/lx/lx_brand_asm.s @@ -116,10 +116,15 @@ SET_SIZE(lx_brand_int80_disable) * * %rax - syscall number * - * When called, all general registers, except for %r15, are as they were when - * the user process made the system call. %r15 is available to the callback as - * a scratch register. If the callback returns to the kernel path, %r15 does - * not have to be restored to the user value. + * See uts/i86pc/ml/syscall_asm_amd64.s for what happens before we get into + * the following lx brand-specific codepath. + * + * As the comment on the BRAND_CALLBACK macro describes, when we're called, all + * general registers, except for %r15, are as they were when the user process + * made the system call. %r15 is available to the callback as a scratch + * register. If the callback returns to the kernel path, %r15 does not have to + * be restored to the user value since BRAND_CALLBACK does that. If we jump + * out to the emulation we need to restore %r15 here. * * To 'return' to our user-space handler, we just need to place its address * into %rcx. The original return address is passed back in %rax. @@ -163,18 +168,18 @@ ENTRY(lx_brand_syscall_callback) * This block is basically similar to a large assert. * * In debug code we do some extra validation of the %fsbase register to - * validate that we always have the expected Linux thread pointer and not - * the native value. At this point we know that the lwp brand data should - * contain the Linux %fsbase (from a Linux arch_prctl syscall) since the - * native %fsbase check above is non-null. We also know that we are - * making a Linux syscall from the other check above. We read the %fsbase - * and compare to the saved Linux %fsbase in the lwp_brand data. If we - * don't have the expected value, we save the incorrect %fsbase value - * into the br_lx_fsbase member for later inspection and change the - * syscall we are making into the Linux pivot_root syscall (an obscure - * syscall which we don't support and which an app in the zone cannot - * use). This allows us to see this error downstream via DTrace and - * see the incorrect %fsbase value we had. + * validate that we always have the expected Linux thread pointer and + * not the native value. At this point we know that the lwp brand data + * should contain the Linux %fsbase (from a Linux arch_prctl syscall) + * since the native %fsbase check above is non-null. We also know that + * we are making a Linux syscall from the other check above. We read + * the %fsbase and compare to the saved Linux %fsbase in the lwp_brand + * data. If we don't have the expected value, we save the incorrect + * %fsbase value into the br_lx_fsbase member for later inspection and + * change the syscall we are making into the Linux pivot_root syscall + * (an obscure syscall which we don't support and which an app in the + * zone cannot use). This allows us to see this error downstream via + * DTrace and see the incorrect %fsbase value we had. */ GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ @@ -188,7 +193,8 @@ ENTRY(lx_brand_syscall_callback) movl $MSR_AMD_FSBASE, %ecx /* fsbase msr */ rdmsr /* get fsbase to edx:eax */ - shlq $32, %rdx /* fix %edx; %eax lo already ok */ + /* fix %edx; %eax lo already ok */ + shlq $32, %rdx or %rdx, %rax /* full value in %rax */ cmp %rax, %r15 /* check if is lx fsbase */ je 4f /* match, ok */ @@ -198,7 +204,7 @@ ENTRY(lx_brand_syscall_callback) GET_V(%rdx, 0, V_LWP, %r15); /* get lwp pointer */ movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ movq %rax, BR_LX_FSBASE(%r15) /* save bad Linux fsbase */ - movq $155, %rax /* fail! use pivot_root syscall */ + movq $155, %rax /* fail! use pivot_root */ jmp 5f 4: @@ -236,9 +242,12 @@ ENTRY(lx_brand_syscall_callback) movq 0x8(%rsp), %rcx movq 0x10(%rsp), %rdx addq $24, %rsp -3: - /* Linux syscall - validate syscall number */ +3: + /* + * Linux syscall - validate syscall number. + * If necessary, the Linux %fsbase has already been loaded above. + */ GET_PROCP(SP_REG, 0, %r15) movq P_ZONE(%r15), %r15 /* grab the zone pointer */ /* grab the 'max syscall num' for this process from 'zone brand data' */ @@ -304,7 +313,7 @@ ENTRY(lx_brand_int80_callback) cmpl %ebx, %eax /* is 0 <= syscall <= MAX? */ jbe 0f /* yes, syscall is OK */ - xorl %eax, %eax /* no, zero syscall number */ + xorl %eax, %eax /* no, zero syscall number */ 0: .lx_brand_int80_patch_point: -- cgit v1.2.3