diff options
author | Joshua M. Clulow <jmc@joyent.com> | 2015-02-23 22:43:12 -0800 |
---|---|---|
committer | Joshua M. Clulow <jmc@joyent.com> | 2015-02-24 06:43:12 +0000 |
commit | cbb62638d5ccc777c90e15b41b1cf6943d284bd4 (patch) | |
tree | 9c660c98372081889d3f7e2e63853a37693468df /usr/src | |
parent | a5e945f618fb3657405a0971ee2886cbee1595d7 (diff) | |
download | illumos-joyent-cbb62638d5ccc777c90e15b41b1cf6943d284bd4.tar.gz |
OS-3561 lxbrand emulation library should execute on alternate stack
OS-3558 lxbrand add support for full in-kernel syscall handling
OS-3545 lx_syscall_regs should not walk stack
OS-3868 many LTP testcases now hang
OS-3901 lxbrand lx_recvmsg fails to translate control messages when 64-bit
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Diffstat (limited to 'usr/src')
69 files changed, 6826 insertions, 5444 deletions
diff --git a/usr/src/cmd/mdb/common/modules/libc/libc.c b/usr/src/cmd/mdb/common/modules/libc/libc.c index 7ad7f86996..c4b713f096 100644 --- a/usr/src/cmd/mdb/common/modules/libc/libc.c +++ b/usr/src/cmd/mdb/common/modules/libc/libc.c @@ -137,6 +137,8 @@ d_ucontext(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) uc.uc_stack.ss_sp, uc.uc_stack.ss_size, stack_flags(&uc.uc_stack)); mdb_printf(" mcontext = 0x%p\n", addr + OFFSETOF(ucontext_t, uc_mcontext)); + mdb_printf(" brand = 0x%p 0x%p 0x%p\n", + uc.uc_brand_data[0], uc.uc_brand_data[1], uc.uc_brand_data[2]); return (DCMD_OK); } diff --git a/usr/src/common/brand/lx/lx_signum.c b/usr/src/common/brand/lx/lx_signum.c index 08ab453885..22afb99ac7 100644 --- a/usr/src/common/brand/lx/lx_signum.c +++ b/usr/src/common/brand/lx/lx_signum.c @@ -22,11 +22,17 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/signal.h> +#include <sys/lx_siginfo.h> #include <lx_signum.h> +#ifdef _KERNEL +#include <sys/debug.h> +#else +#include <assert.h> +#endif /* * Delivering signals to a Linux process is complicated by differences in @@ -242,3 +248,75 @@ stol_signo[NSIG] = { LX_SIGRTMIN + 30, LX_SIGRTMAX, /* 73: Solaris _SIGRTMAX */ }; + +/* + * Convert an illumos native signal number to a Linux signal number and return + * it. If no valid conversion is possible, the function fails back to the + * value of "defsig". In userland, passing a default signal number of "-1" + * will abort the program if the signal number could not be converted. 
+ */ +int +lx_stol_signo(int signo, int defsig) +{ + int rval; + +#ifdef _KERNEL + VERIFY(defsig != -1); +#endif + + if (signo < 0 || signo >= NSIG || (rval = stol_signo[signo]) < 1) { +#ifndef _KERNEL + if (defsig == -1) { + assert(0); + } +#endif + return (defsig); + } + + return (rval); +} + +/* + * Convert the "status" field of a SIGCLD siginfo_t. We need to extract the + * illumos signal number and convert it to a Linux signal number while leaving + * the ptrace(2) event bits intact. In userland, passing a default signal + * number of "-1" will abort the program if the signal number could not be + * converted, as for lx_stol_signo(). + */ +int +lx_stol_status(int s, int defsig) +{ + /* + * We mask out the top bit here in case PTRACE_O_TRACESYSGOOD + * is in use and 0x80 has been ORed with the signal number. + */ + int stat = lx_stol_signo(s & 0x7f, defsig); + + /* + * We must mix in the ptrace(2) event which may be stored in + * the second byte of the status code. We also re-include the + * PTRACE_O_TRACESYSGOOD bit. + */ + return ((s & 0xff80) | stat); +} + +int +lx_stol_sigcode(int code) +{ + switch (code) { + case SI_USER: + return (LX_SI_USER); + case SI_LWP: + return (LX_SI_TKILL); + case SI_QUEUE: + return (LX_SI_QUEUE); + case SI_TIMER: + return (LX_SI_TIMER); + case SI_ASYNCIO: + return (LX_SI_ASYNCIO); + case SI_MESGQ: + return (LX_SI_MESGQ); + default: + return (code); + } +} diff --git a/usr/src/common/brand/lx/lx_signum.h b/usr/src/common/brand/lx/lx_signum.h index f410500925..a7807c2b07 100644 --- a/usr/src/common/brand/lx/lx_signum.h +++ b/usr/src/common/brand/lx/lx_signum.h @@ -21,7 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _LX_SIGNUM_H @@ -74,6 +74,10 @@ extern "C" { extern const int ltos_signo[]; extern const int stol_signo[]; +extern int lx_stol_signo(int, int); +extern int lx_stol_status(int, int); +extern int lx_stol_sigcode(int); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/brand/lx/lx_brand/Makefile.com b/usr/src/lib/brand/lx/lx_brand/Makefile.com index 80f30d48be..804dccfce7 100644 --- a/usr/src/lib/brand/lx/lx_brand/Makefile.com +++ b/usr/src/lib/brand/lx/lx_brand/Makefile.com @@ -21,7 +21,7 @@ # # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright 2014 Joyent, Inc. All rights reserved. +# Copyright 2015 Joyent, Inc. # LX_CMN = $(SRC)/common/brand/lx @@ -55,17 +55,16 @@ COBJS = aio.o \ sendfile.o \ signal.o \ socket.o \ + stack.o \ stat.o \ statfs.o \ sysctl.o \ sysv_ipc.o \ time.o \ - truncate.o \ - wait.o \ - xattr.o + truncate.o CMNOBJS = lx_signum.o -ASOBJS = lx_handler.o lx_runexe.o lx_crt.o +ASOBJS = lx_handler.o lx_crt.o OBJECTS = $(CMNOBJS) $(COBJS) $(ASOBJS) USDT_PROVIDERS = lx_provider.d diff --git a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s index af8fae621f..b33845d8a0 100644 --- a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s +++ b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s @@ -11,7 +11,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -28,40 +28,8 @@ #include "assym.h" /* 64-bit signal syscall numbers */ -#define LX_SYS_sigreturn 513 #define LX_SYS_rt_sigreturn 15 -/* - * Each JMP must occupy 16 bytes. - * The syscall offset is stored immediately above the red zone to avoid - * clobbering data there. Once lx_handler is reached, the stack will be - * advanced to account for both the red zone and the stored syscall offset. 
- */ -#define JMP \ - movl $_CONST(. - lx_handler_table), -136(%rsp); \ - jmp lx_handler; \ - .align 16; - -#define JMP4 JMP; JMP; JMP; JMP -#define JMP16 JMP4; JMP4; JMP4; JMP4 -#define JMP64 JMP16; JMP16; JMP16; JMP16 -#define JMP256 JMP64; JMP64; JMP64; JMP64 - -/* - * Alternate jump table that turns on lx_traceflag before proceeding with - * the normal emulation routine. - */ -#define TJMP \ - movl $_CONST(. - lx_handler_trace_table), -136(%rsp); \ - jmp lx_handler_trace; \ - .align 16; - -#define TJMP4 TJMP; TJMP; TJMP; TJMP -#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4 -#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16 -#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64 - - #if defined(lint) #include <sys/types.h> @@ -69,343 +37,16 @@ #include <sys/signal.h> void -lx_handler_table(void) -{} - -void -lx_handler(void) -{} - -/* ARGSUSED */ -void -lx_setup_clone(uintptr_t gs, void *retaddr, void *stk) -{} - -/* ARGSUSED */ -void -lx_sigdeliver(int sig, siginfo_t *sip, void *p, size_t stacksz, - void (*stack_frame_builder)(void), void (*lx_sighandler)(void), - uintptr_t gs) -{} - -/* ARGSUSED */ -void -lx_sigacthandler(int sig, siginfo_t *s, void *p) -{} - -void -lx_sigreturn_tramp(void) -{} - -void lx_rt_sigreturn_tramp(void) {} -/* ARGSUSED */ void -lx_sigreturn_tolibc(uintptr_t sp) +lx_vsyscall_tramp(void) {} #else /* lint */ /* - * On entry to this table, %rax will hold the return address. The - * location where we enter the table is a function of the system - * call number. The table needs the same alignment as the individual - * entries. 
- */ - .align 16 - ENTRY_NP(lx_handler_trace_table) - TJMP256 - TJMP64 - TJMP64 - SET_SIZE(lx_handler_trace_table) - - .align 16 - ENTRY_NP(lx_handler_table) - JMP256 - JMP64 - JMP64 - SET_SIZE(lx_handler_table) - - ENTRY_NP(lx_handler_trace) - subq $136, %rsp /* skip red zone + syscall offset */ - pushq %rsi - movq lx_traceflag@GOTPCREL(%rip), %rsi - movq $1, (%rsi) - popq %rsi - addq $136, %rsp - /* - * While we could just fall through to lx_handler(), we "tail-call" it - * instead to make ourselves a little more comprehensible to trace - * tools. - */ - jmp lx_handler - SET_SIZE(lx_handler_trace) - - ALTENTRY(lx_handler) - /* - * We are running on the Linux process's stack here so we have to - * account for the AMD64 ABI red zone of 128 bytes past the %rsp which - * the process can use as scratch space. In addition to the red zone, - * the syscall offset stored by the handler tables above must be - * accounted for. To that end, rsp is advanced by a further 8 bytes to - * include the syscall offset. - */ - subq $136, %rsp /* red zone + syscall offset */ - - /* - * In order to keep the hander_table entries within 16 bytes, only 4 - * bytes of the syscall offset are stored during dispatch. - * The upper 4 bytes are zeroed here to account for that. - */ - movl $0, 4(%rsp) - - /* - * %rbp isn't always going to be a frame pointer on Linux, but when - * it is, saving it here lets us have a coherent stack backtrace. - */ - pushq %rbp - - /* - * Fill in a lx_regs_t structure on the stack. - */ - subq $SIZEOF_LX_REGS_T, %rsp - - /* - * Save %rbp and then fill it with what would be its usual value as - * the frame pointer. The value we save for %rsp needs to be the - * stack pointer at the time of the syscall so we need to skip the - * red zone, saved %rbp and (what will be) the return address. 
- */ - movq %rbp, LXR_RBP(%rsp) - movq %rsp, %rbp - addq $SIZEOF_LX_REGS_T, %rbp - movq %rbp, LXR_RSP(%rsp) - addq $144, LXR_RSP(%rsp) /* 128 byte red zone + 2 pointers */ - - movq $0, LXR_FS(%rsp) - movw %fs, LXR_FS(%rsp) - movq %rdi, LXR_RDI(%rsp) - movq %rsi, LXR_RSI(%rsp) - movq %rbx, LXR_RBX(%rsp) - movq %rdx, LXR_RDX(%rsp) - movq %rcx, LXR_RCX(%rsp) - movq %rax, LXR_RIP(%rsp) /* %rax holds the return addr. */ - movq %r8, LXR_R8(%rsp) - movq %r9, LXR_R9(%rsp) - movq %r10, LXR_R10(%rsp) - movq %r11, LXR_R11(%rsp) - movq %r12, LXR_R12(%rsp) - movq %r13, LXR_R13(%rsp) - movq %r14, LXR_R14(%rsp) - movq %r15, LXR_R15(%rsp) - - /* - * The kernel drops us into the middle of one of the tables above - * that then stores the table offset immediately above the 128 byte - * red zone and calls into lx_handler. That offset indicates the - * syscall number while %rax holds the return address for the syscall. - * We replace the value on the stack with the return address, and use - * the value to compute the system call number by dividing by the table - * entry size. - */ - xchgq 8(%rbp), %rax /* just after the rbp we pushed */ - shrq $4, %rax - movq %rax, LXR_RAX(%rsp) - - /* - * Call lx_emulate() whose only argument is a pointer to the - * lx_regs_t structure we've placed on the stack. - */ - movq %rsp, %rdi - call lx_emulate - - /* - * We use this global symbol to identify this return site when - * walking the stack backtrace. It needs to remain immediately - * after the call to lx_emulate(). - */ - ALTENTRY(lx_emulate_done) - - /* - * Restore the saved register state; we get %rbp and %rsp from - * the ordinary locations rather than the saved state. 
- */ - movq LXR_RDI(%rsp), %rdi - movq LXR_RSI(%rsp), %rsi - movq LXR_RBX(%rsp), %rbx - movq LXR_RDX(%rsp), %rdx - movq LXR_RCX(%rsp), %rcx - movq LXR_RAX(%rsp), %rax - movq LXR_R8(%rsp), %r8 - movq LXR_R9(%rsp), %r9 - movq LXR_R10(%rsp), %r10 - movq LXR_R11(%rsp), %r11 - movq LXR_R12(%rsp), %r12 - movq LXR_R13(%rsp), %r13 - movq LXR_R14(%rsp), %r14 - movq LXR_R15(%rsp), %r15 - /* XXX movw LXR_FS(%rsp), %fs */ - - movq %rbp, %rsp - popq %rbp - - /* - * Returning from lx_handler is complicated by our preservation of the - * red zone on the stack. The return address resides just above the - * red zone making it impossible to use 'retq' and return rsp to the - * correct value. Instead, rsp is manually moved to its original - * position and we jmp using the return address at the known stack - * offset above the red zone. - */ - addq $136, %rsp /* red zone + return address */ - jmpq *-136(%rsp) - SET_SIZE(lx_handler) - - /* - * lx_setup_clone(lx_regs_t *regp, void *retaddr, void *stack) - * Restore the register state using arg0 (%rdi). - * Return to Linux app using arg1 (%rsi) with the Linux stack we got - * in arg2 (%rdx). - */ - ENTRY_NP(lx_setup_clone) - /* - * arg0 is a ptr to an lx_regs_t struct. The AMD64 ABI says that the - * kernel clobbers %rcx and %r11 so we use those for working registers. - */ - movq %rdi, %rcx /* arg0, use rcx as ptr */ - movq %rsi, %r11 /* arg1, the return addr */ - movq LXR_RDI(%rcx), %rdi - movq LXR_RSI(%rcx), %rsi - movq LXR_RBX(%rcx), %rbx - movq LXR_R8(%rcx), %r8 - movq LXR_R9(%rcx), %r9 - movq LXR_R10(%rcx), %r10 - movq LXR_R12(%rcx), %r12 - movq LXR_R13(%rcx), %r13 - movq LXR_R14(%rcx), %r14 - movq LXR_R15(%rcx), %r15 - - xorq %rbp, %rbp /* terminating stack */ - popq %rax /* pop the clone_start() return address */ - movq %rdx, %rsp /* arg2 is new stack pointer */ - movq LXR_RDX(%rcx), %rdx - xorq %rax, %rax /* child returns 0 to SYS_clone() */ - jmp *%r11 /* return to Linux app. using arg1 addr. 
*/ - SET_SIZE(lx_setup_clone) - - /* - * lx_sigdeliver(int sig, siginfo_t *, ucontext_t *, int stack_size, - * void *stack_build_routine, void *signal_handler, void *glibc_gs) - * - * The final parameter (%gs) is ignored in the 64-bit code. - * - * we're called by: - * lx_call_user_handler(int sig, siginfo_t *sip, void *p) - * - * This routine allocates stack space for the lx_sigstack local - * variable structure, calls a routine to populate that structure, and - * then calls the Linux signal handler. This is written in assembly - * because of the way we directly jmp to the Linux signal handler - * with everything setup as if this function wasn't really here. We - * rely on the code in lx_rt_sigreturn() to cleanup the things we've - * pushed on the stack here. - * - * See lx_build_signal_frame() for the code which populates lx_sigstack. - * - * When we jump to the Linux signal handler, the stack will look - * like this: - * - * ================================================= - * | %rbp | - * | ================================================= - * | | stuff we saved in our prologue | - * | ================================================= - * | | LX_SIGRT_MAGIC | - * | ================================================= - * | | {unused word to maintain ABI stack alignment} | - * V ================================================= - * | Linux local data built by lx stk_builder() | - * ================================================= - * - * Unlike the 32-bit case, we don't reset %rbp before jumping into the - * Linux handler, since that would mean the handler would clobber our - * data in the stack frame it builds. 
- * - */ - ENTRY_NP(lx_sigdeliver) - pushq %rbp - movq %rsp, %rbp - subq $0x40, %rsp /* an extra word to maintain alignmnt */ - movq %rdi, -8(%rbp) /* sig */ - movq %rsi, -16(%rbp) /* siginfo* */ - movq %rdx, -24(%rbp) /* ucontext* */ - movq %rcx, -32(%rbp) /* stack size */ - movq %r8, -40(%rbp) /* stack builder */ - movq %r9, -48(%rbp) /* Linux signal handler */ - - subq %rcx, %rsp /* create stack_size stack buffer */ - - movq $LX_SIGRT_MAGIC, %rcx /* load and place marker value onto */ - movq %rcx, -56(%rbp) /* stack for lx_rt_sigreturn */ - - movq %rsp, %rcx /* arg3 - %rcx is stack pointer */ - /* arg2 - %rdx is ucontext ptr */ - /* arg1 - %rsi is siginfo ptr */ - /* arg0 - %rdi is sig num */ - call *%r8 /* stk_builder(sig, sip, ucp, sp) */ - - /* setup for jump to Linux signal hander */ - movq -8(%rbp), %rdi /* arg0 %rdi is sig num */ - - /* - * If we had a NULL siginfo pointer as input then we never converted - * anything in the stack builder function and we need to pass along - * a null siginfo pointer to the Linux handler. - * - * arg1 %rsi is ptr to converted siginfo on stack or NULL - */ - movq -16(%rbp), %rsi - cmp $0, %rsi - je 1f - movq %rsp, %rsi - addq $SI, %rsi -1: - /* - * arg2 %rdx is ptr to converted ucontext on stk (uc member of - * lx_sigstack). - */ - movq %rsp, %rdx - addq $UC, %rdx - - movq -48(%rbp), %r9 /* fetch signal handler ptr */ - jmp *%r9 /* jmp to the Linux signal handler */ - SET_SIZE(lx_sigdeliver) - - /* - * The libc routine that calls user signal handlers ends with a - * setcontext, so we would never return here even if we used a call - * rather than a jmp. However, we'll let the emulation unwind the stack - * with a brand call that combines the setcontext with the management - * of the syscall mode flag. - * - * Note that because libc_sigacthandler is an extern, it needs to be - * dereferenced via the GOT. 
- * - * IMPORTANT: Because libc apparently gets upset if extra data is - * left on its stack, this routine needs to be crafted - * in assembly so that the jmp to the libc interposer - * doesn't leave any cruft lying around. - * - * lx_sigacthandler(int sig, siginfo_t *s, void *p) - */ - ENTRY_NP(lx_sigacthandler) - movq libc_sigacthandler@GOTPCREL(%rip), %rax - jmp *(%rax) /* jmp to libc's interposer */ - SET_SIZE(lx_sigacthandler) - - /* * Trampoline code is called by the return at the end of a Linux * signal handler to return control to the interrupted application * via the lx_rt_sigreturn() syscall. @@ -416,13 +57,16 @@ lx_sigreturn_tolibc(uintptr_t sp) SET_SIZE(lx_rt_sigreturn_tramp) /* - * Manipulate the stack in the way necessary for it to appear to libc - * that the signal handler it invoked via call_user_handler() is - * returning. + * Before calling to a vsyscall address, the system call arguments + * are loaded into the usual registers by the emulated program. The + * brand SIGSEGV handler detects a jump to these addresses and modifies + * the interrupted context to restart at this trampoline with %rax set + * to the intended system call number. When the system call returns, + * we return to the address on the stack from the original call. */ - ENTRY_NP(lx_sigreturn_tolibc) - movq %rdi, %rsp /* set %rsp to passed value */ - popq %rbp /* restore proper %rbp */ - ret /* return to lx_call_user_handler */ - SET_SIZE(lx_sigreturn_tolibc) + ENTRY_NP(lx_vsyscall_tramp) + syscall + ret + SET_SIZE(lx_vsyscall_tramp) + #endif /* lint */ diff --git a/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s b/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s deleted file mode 100644 index 70cd75cf41..0000000000 --- a/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s +++ /dev/null @@ -1,46 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. - */ - -#include <sys/asm_linkage.h> - -#if defined(lint) - -/*ARGSUSED*/ -void -lx_runexe(void *argv, void *entry) -{ -} - -#else /* lint */ - - /* - * Set our stack pointer, clear the general registers, - * and jump to the brand linker's entry point. - */ - ENTRY_NP(lx_runexe) - movq %rdi, %rax / %rax = &argv[0] - movq %rsi, %rbx / Brand linker's entry point in %rbx - subq $8, %rax / Top of stack - must point at argc - movq %rax, %rsp / Set %rsp to what linkers expect - - movq $0, %rdx - - jmp *%rbx / And away we go... - - /* target will never return. */ - SET_SIZE(lx_runexe) -#endif /* lint */ diff --git a/usr/src/lib/brand/lx/lx_brand/common/clone.c b/usr/src/lib/brand/lx/lx_brand/common/clone.c index 87f966cc89..ee442ef280 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/clone.c +++ b/usr/src/lib/brand/lx/lx_brand/common/clone.c @@ -49,8 +49,10 @@ #include <sys/lx_debug.h> #include <sys/lx_thread.h> #include <sys/fork.h> +#include <sys/mman.h> #include <lx_syscall.h> + #define SHARED_AS \ (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND \ | LX_CLONE_THREAD) @@ -60,9 +62,6 @@ #define IS_FORK(f) (((f) & SHARED_AS) == 0) #define IS_VFORK(f) (((f) & CLONE_VFORK) == CLONE_VFORK) -#define LX_EXIT 1 -#define LX_EXIT_GROUP 2 - /* * This is dicey. This seems to be an internal glibc structure, and not * part of any external interface. 
Thus, it is subject to change without @@ -92,19 +91,16 @@ struct clone_state { void *c_ptidp; struct lx_desc *c_ldtinfo; /* thread-specific segment */ void *c_ctidp; -#if defined(_LP64) - lx_regs_t c_regs; /* original register state */ -#else - uintptr_t c_gs; /* Linux's %gs */ -#endif + ucontext_t c_uc; /* original register state */ sigset_t c_sigmask; /* signal mask */ lx_affmask_t c_affmask; /* CPU affinity mask */ volatile int *c_clone_res; /* pid/error returned to cloner */ int c_ptrace_event; /* ptrace(2) event for child stop */ + void *c_ntv_stk; /* native stack for this thread */ + size_t c_ntv_stk_sz; /* native stack size */ + lx_tsd_t *c_lx_tsd; /* tsd area for thread */ }; -extern void lx_setup_clone(uintptr_t, void *, void *); - /* * Counter incremented when we vfork(2) ourselves, and decremented when the * vfork(2)ed child exit(2)s or exec(2)s. @@ -114,7 +110,7 @@ static int is_vforked = 0; long lx_exit(uintptr_t p1) { - int ret, status = (int)p1; + int status = (int)p1; lx_tsd_t *lx_tsd; /* @@ -126,33 +122,18 @@ lx_exit(uintptr_t p1) _exit(status); } - if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) - lx_err_fatal("exit: unable to read thread-specific data: %s", - strerror(ret)); - - assert(lx_tsd != 0); + lx_tsd = lx_get_tsd(); lx_tsd->lxtsd_exit = LX_ET_EXIT; lx_tsd->lxtsd_exit_status = status; lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE, - (ulong_t)status); - - /* - * Block all signals in the exit context to avoid taking any signals - * (to the degree possible) while exiting. - */ - (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask); + (ulong_t)status, NULL); /* * This thread is exiting. Restore the state of the thread to * what it was before we started running linux code. - * For 64-bit code, since we know we are unwinding the stack back to - * lx_init, we need to unwind the syscall mode flag "stack" as well. 
*/ -#if defined(_LP64) - (void) syscall(SYS_brand, B_UNWIND_NTV_SYSC_FLAG); -#endif (void) setcontext(&lx_tsd->lxtsd_exit_context); /* @@ -167,7 +148,7 @@ lx_exit(uintptr_t p1) long lx_group_exit(uintptr_t p1) { - int ret, status = (int)p1; + int status = (int)p1; lx_tsd_t *lx_tsd; /* @@ -179,36 +160,21 @@ lx_group_exit(uintptr_t p1) _exit(status); } - if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) - lx_err_fatal("group_exit: unable to read thread-specific " - "data: %s", strerror(ret)); - - assert(lx_tsd != 0); + lx_tsd = lx_get_tsd(); lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP; lx_tsd->lxtsd_exit_status = status; /* - * Block all signals in the exit context to avoid taking any signals - * (to the degree possible) while exiting. - */ - (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask); - - /* * This thread is exiting. Restore the state of the thread to * what it was before we started running linux code. - * For 64-bit code, since we know we are unwinding the stack back to - * lx_init, we need to unwind the syscall mode flag "stack" as well. */ -#if defined(_LP64) - (void) syscall(SYS_brand, B_UNWIND_NTV_SYSC_FLAG); -#endif (void) setcontext(&lx_tsd->lxtsd_exit_context); /* * If we returned from the setcontext(2), something is very wrong. */ - lx_err_fatal("group_exits: unable to set exit context: %s", + lx_err_fatal("group_exit: unable to set exit context: %s", strerror(errno)); /*NOTREACHED*/ @@ -220,7 +186,7 @@ clone_start(void *arg) { int rval; struct clone_state *cs = (struct clone_state *)arg; - lx_tsd_t lx_tsd; + lx_tsd_t *lxtsd; /* * Let the kernel finish setting up all the needed state for this @@ -228,18 +194,14 @@ clone_start(void *arg) * * We already created the thread using the thr_create(3C) library * call, so most of the work required to emulate lx_clone(2) has - * been done by the time we get to this point. 
Instead of creating - * a new brandsys(2) subcommand to perform the last few bits of - * bookkeeping, we just use the lx_clone() slot in the syscall - * table. + * been done by the time we get to this point. */ lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()"); - lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)", - cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp); + lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)", + cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp); - rval = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_clone, - cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp, - NULL); + rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp, + cs->c_ldtinfo, cs->c_ctidp); /* * At this point the parent is waiting for cs->c_clone_res to go @@ -250,6 +212,8 @@ clone_start(void *arg) if (rval < 0) { *(cs->c_clone_res) = -errno; lx_debug("\tkernel clone failed, errno %d\n", errno); + free(cs->c_lx_tsd); + free(cs); return (NULL); } @@ -261,84 +225,72 @@ clone_start(void *arg) strerror(errno)); } - /* Initialize the thread specific data for this thread. */ - bzero(&lx_tsd, sizeof (lx_tsd)); -#if defined(_ILP32) - lx_tsd.lxtsd_gs = cs->c_gs; -#else - lx_tsd.lxtsd_fsbase = (uintptr_t)cs->c_ldtinfo; -#endif - /* - * Use the address of the stack-allocated lx_tsd as the - * per-thread storage area to cache various values for later - * use. - * - * This address is only used by this thread, so there is no - * danger of other threads using this storage area, nor of it - * being accessed once this stack frame has been freed. + * Initialize the thread specific data for this thread. */ - if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) { - *(cs->c_clone_res) = -errno; - lx_err_fatal("Unable to set thread-specific ptr for clone: %s", - strerror(rval)); - } + lxtsd = cs->c_lx_tsd; + lx_init_tsd(lxtsd); + lxtsd->lxtsd_clone_state = cs; /* - * Save the current context of this thread. 
- * - * We'll restore this context when this thread attempts to exit. + * Install the emulation stack for this thread. Register the + * thread-specific data structure with the stack list so that it may be + * freed at thread exit or fork(2). */ - if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) { + lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd); + + if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) { *(cs->c_clone_res) = -errno; - lx_err_fatal("Unable to initialize thread-specific exit " - "context: %s", strerror(errno)); + lx_err_fatal("Unable to release held signals for child " + "thread: %s", strerror(errno)); } /* - * Do the final stack twiddling, reset %gs, and return to the - * clone(2) path. + * Let the parent know that the clone has (effectively) been + * completed. */ - if (lx_tsd.lxtsd_exit == LX_ET_NONE) { - if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) { - *(cs->c_clone_res) = -errno; + *(cs->c_clone_res) = rval; - lx_err_fatal("Unable to release held signals for child " - "thread: %s", strerror(errno)); - } + /* + * We want to load the general registers from this context, and + * switch to the BRAND stack. + */ + cs->c_uc.uc_flags = UC_CPU; + cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND; - /* - * Let the parent know that the clone has (effectively) been - * completed. - */ - *(cs->c_clone_res) = rval; + /* + * New threads will not link into the existing context chain. 
+ */ + cs->c_uc.uc_link = NULL; - /* - * Fire the ptrace(2) event stop in the new thread: - */ - lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0); + /* + * Set stack pointer and entry point for new thread: + */ + LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk; + LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr; -#if defined(_LP64) - (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG); - lx_setup_clone((uintptr_t)&cs->c_regs, cs->c_retaddr, - cs->c_stk); -#else - lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk); -#endif + /* + * Return 0 to the child: + */ + LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0; - /* lx_setup_clone() should never return. */ - assert(0); - } + /* + * Fire the ptrace(2) event stop in the new thread: + */ + lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc); /* - * We are here because the Linux application called the exit() or - * exit_group() system call. In turn the brand library did a - * setcontext() to jump to the thread context state saved in - * getcontext(), above. + * Jump to the Linux process. The system call must not return. */ - lx_exit_common(lx_tsd.lxtsd_exit, lx_tsd.lxtsd_exit_status); + if (syscall(SYS_brand, B_JUMP_TO_LINUX, &cs->c_uc) == -1) { + lx_err_fatal("B_JUMP_TO_LINUX failed: %s", + strerror(errno)); + } + abort(); + /*NOTREACHED*/ + return (NULL); } /* @@ -386,10 +338,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, int sig; int rval; int pid; - lx_regs_t *rp; - sigset_t sigmask; + ucontext_t *ucp; + sigset_t sigmask, osigmask; int fork_flags = 0; int ptrace_event; + int error = 0; if (flags & LX_CLONE_SETTLS) { lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p " @@ -400,7 +353,8 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, } /* - * Only supported for pid 0 on Linux + * Only supported for pid 0 on Linux after version 2.3.21, and + * apparently not at all since 2.5.16. 
*/ if (flags & LX_CLONE_PID) return (-EINVAL); @@ -422,7 +376,7 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, return (-EINVAL); } - rp = lx_syscall_regs(); + ucp = lx_syscall_regs(); /* test if pointer passed by user are writable */ if (flags & LX_CLONE_PARENT_SETTID) { @@ -446,7 +400,10 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, */ lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE)); - /* See if this is a fork() operation or a thr_create(). */ + /* + * Handle a fork(2) operation here. If this is not a fork, a new + * thread will be created after this block. + */ if (IS_FORK(flags) || IS_VFORK(flags)) { if (flags & LX_CLONE_PARENT) { lx_unsupported("clone(2) only supports CLONE_PARENT " @@ -457,6 +414,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, if ((flags & LX_CSIGNAL) == 0) fork_flags |= FORK_NOSIGCHLD; + /* + * Suspend signal delivery and perform the actual fork(2) + * operation. + */ + _sigoff(); if (flags & LX_CLONE_VFORK) { is_vforked++; rval = vforkx(fork_flags); @@ -469,12 +431,45 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, } /* - * Since we've already forked, we can't do much if uucopy - * fails, so we just ignore failure. Failure is unlikely since - * we've tested the memory before we did the fork. + * The parent process returns through the regular system call + * path here. + */ + if (rval != 0) { + /* + * Since we've already forked, we can't do much if + * uucopy fails, so we just ignore failure. Failure is + * unlikely since we've tested the memory before we did + * the fork. + */ + if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) { + (void) uucopy(&rval, ptidp, sizeof (int)); + } + + if (rval > 0) { + lx_ptrace_stop_if_option(ptrace_event, B_FALSE, + (ulong_t)rval, NULL); + } + + /* + * Re-enable signal delivery in the parent process. + */ + _sigon(); + + return ((rval < 0) ? 
-errno : rval); + } + + /* + * The rest of this block runs only within the new child + * process. */ - if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) { - (void) uucopy(&rval, ptidp, sizeof (int)); + + if (!IS_VFORK(flags)) { + /* + * We must free the stacks and thread-specific data + * objects for every thread except the one duplicated + * from the parent by forkx(). + */ + lx_free_other_stacks(); } if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) { @@ -484,50 +479,95 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, * forked, so on failure, we just don't copy the * memory. */ - pid = lx_getpid(); + pid = syscall(SYS_brand, B_GETPID); if (pid >= 0) (void) uucopy(&pid, ctidp, sizeof (int)); } - /* Parent just returns */ - if (rval != 0) { - if (rval > 0) - lx_ptrace_stop_if_option(ptrace_event, B_FALSE, - (ulong_t)rval); - return ((rval < 0) ? -errno : rval); - } - /* * Set up additional data in the lx_proc_data structure as * necessary. */ - rval = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_clone, - flags, cldstk, ptidp, ldtinfo, ctidp, NULL); - if (rval < 0) { + if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp, + ldtinfo, ctidp)) < 0) { return (rval); } - /* - * lx_setup_clone() doesn't return below, so stop now, if - * necessary. - */ - lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0); + if (IS_VFORK(flags)) { + ucontext_t vforkuc; + + /* + * The vfork(2) interface is somewhat less than ideal. + * The unfortunate notion of borrowing the address + * space of the parent process requires us to jump + * through several hoops to prevent corrupting parent + * emulation state. + * + * When returning in the child, we make a copy of the + * system call return context and discard three pages + * of the native stack. Returning normally would + * clobber the native stack frame in which the brand + * library in the parent process is presently waiting. 
+ * + * The calling program is expected to correctly use + * this dusty, underspecified relic. Neglecting to + * immediately call execve(2) or exit(2) is not + * cricket; this stack space will be permanently lost, + * not to mention myriad other undefined behaviour. + */ + bcopy(ucp, &vforkuc, sizeof (vforkuc)); + vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP; + vforkuc.uc_link = NULL; + + lx_debug("\tvfork native stack sp %p", + vforkuc.uc_brand_data[1]); + + /* + * If provided, the child needs its new stack set up. + */ + if (cldstk != 0) { + lx_debug("\tvfork cldstk %p", cldstk); + LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk; + } + + /* + * Stop for ptrace if required. + */ + lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL); + + /* + * Return to the child via the specially constructed + * vfork(2) context. + */ + LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0); + (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc, + LX_SYS_clone, 0, 0); + + assert(0); + } /* * If provided, the child needs its new stack set up. */ - if (cldstk) { -#if defined(_LP64) - (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG); - lx_setup_clone((uintptr_t)rp, (void *)rp->lxr_rip, - cldstk); -#else - lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk); -#endif - /* lx_setup_clone() should never return. */ - assert(0); + if (cldstk != 0) { + lx_debug("\tcldstk %p", cldstk); + LX_REG(ucp, REG_SP) = (uintptr_t)cldstk; } + /* + * Stop for ptrace if required. + */ + lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL); + + /* + * Re-enable signal delivery in the child process. + */ + _sigon(); + + /* + * The child process returns via the regular emulated system + * call path: + */ return (0); } @@ -557,13 +597,13 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, } /* - * To avoid malloc() here, we steal a part of the new thread's - * stack to store all the info that thread might need for - * initialization. 
We also make it 64-bit aligned for good - * measure. + * Initialise the state structure we pass as an argument to the new + * thread: */ - cs = (struct clone_state *) - ((p2 - sizeof (struct clone_state)) & -((uintptr_t)8)); + if ((cs = malloc(sizeof (*cs))) == NULL) { + lx_debug("could not allocate clone_state: %s", strerror(errno)); + return (-ENOMEM); + } cs->c_flags = flags; cs->c_sig = sig; cs->c_stk = cldstk; @@ -572,43 +612,27 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, cs->c_ctidp = ctidp; cs->c_clone_res = &clone_res; cs->c_ptrace_event = ptrace_event; -#if defined(_LP64) /* - * The AMD64 ABI says that the kernel clobbers %rcx and %r11. We - * return a value in %rax. The new %rsp and %rip will be setup in - * lx_setup_clone. Thus, we don't worry about passing/restoring those - * registers. + * We want the new thread to return directly to the call site for + * the system call. */ - cs->c_regs.lxr_rdi = rp->lxr_rdi; - cs->c_regs.lxr_rsi = rp->lxr_rsi; - cs->c_regs.lxr_rbx = rp->lxr_rbx; - cs->c_regs.lxr_rdx = rp->lxr_rdx; - cs->c_regs.lxr_rdi = rp->lxr_rdi; - cs->c_regs.lxr_r8 = rp->lxr_r8; - cs->c_regs.lxr_r9 = rp->lxr_r9; - cs->c_regs.lxr_r10 = rp->lxr_r10; - cs->c_regs.lxr_r12 = rp->lxr_r12; - cs->c_regs.lxr_r13 = rp->lxr_r13; - cs->c_regs.lxr_r14 = rp->lxr_r14; - cs->c_regs.lxr_r15 = rp->lxr_r15; -#else - cs->c_gs = rp->lxr_gs; -#endif + cs->c_retaddr = (void *)LX_REG(ucp, REG_PC); + /* + * Copy the saved context for the clone(2) system call so that the + * new thread may use it to initialise registers. 
+ */ + bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc)); + if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) { + free(cs); + return (-ENOMEM); + } if (lx_sched_getaffinity(0, sizeof (cs->c_affmask), - (uintptr_t)&cs->c_affmask) == -1) + (uintptr_t)&cs->c_affmask) == -1) { lx_err_fatal("Unable to get affinity mask for parent " "thread: %s", strerror(errno)); + } - /* - * We want the new thread to return directly to the return site for - * the system call. - */ -#if defined(_LP64) - cs->c_retaddr = (void *)rp->lxr_rip; -#else - cs->c_retaddr = (void *)rp->lxr_eip; -#endif clone_res = 0; (void) sigfillset(&sigmask); @@ -617,17 +641,40 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, * Block all signals because the thread we create won't be able to * properly handle them until it's fully set up. */ - if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) { + if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) { lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno)); + free(cs->c_lx_tsd); + free(cs); return (-errno); } + cs->c_sigmask = osigmask; + + /* + * Allocate the native stack for this new thread now, so that we + * can return failure gracefully as ENOMEM. + */ + if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) { + free(cs->c_lx_tsd); + free(cs); + return (-ENOMEM); + } rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid); /* + * If the thread did not start, free the resources we allocated: + */ + if (rval == -1) { + error = errno; + (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz); + free(cs->c_lx_tsd); + free(cs); + } + + /* * Release any pending signals */ - (void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL); + (void) sigprocmask(SIG_SETMASK, &osigmask, NULL); /* * Wait for the child to be created and have its tid assigned. 
@@ -637,8 +684,14 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, ; rval = clone_res; - lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval); - } + lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval, + NULL); - return (rval); + return (rval); + } else { + /* + * Return the error from thr_create(3C). + */ + return (-error); + } } diff --git a/usr/src/lib/brand/lx/lx_brand/common/file.c b/usr/src/lib/brand/lx/lx_brand/common/file.c index 56201035ff..1f2c4032f5 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/file.c +++ b/usr/src/lib/brand/lx/lx_brand/common/file.c @@ -120,34 +120,6 @@ ltos_at_flag(int lflag, int allow, boolean_t enforce) */ /* - * Linux creates half-duplex pipes and Illumos creates full-duplex pipes. - * Thus, to get the correct semantics, we need to setup pipes in the kernel's - * lx brand module. - */ - -long -lx_pipe2(uintptr_t p1, uintptr_t p2) -{ - int flags = 0; - int r; - - if (p2 & LX_O_NONBLOCK) { - flags |= O_NONBLOCK; - p2 &= ~LX_O_NONBLOCK; - } - if (p2 & LX_O_CLOEXEC) { - flags |= O_CLOEXEC; - p2 &= ~LX_O_CLOEXEC; - } - if (p2 != 0) - return (-EINVAL); - - r = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_pipe2, p1, flags); - - return ((r == -1) ? -errno : r); -} - -/* * On Linux, even root cannot create a link to a directory, so we have to * add an explicit check. */ diff --git a/usr/src/lib/brand/lx/lx_brand/common/fork.c b/usr/src/lib/brand/lx/lx_brand/common/fork.c index b0edee1adb..b382dd9410 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/fork.c +++ b/usr/src/lib/brand/lx/lx_brand/common/fork.c @@ -37,77 +37,75 @@ * initialization or else bad things will happen (i.e. ending up with a bad * schedctl page). On Linux, there is no such thing as forkall(), so we use * fork1() here. + * + * For vfork(), we have a serious problem because the child is not allowed to + * return from the current frame because it will corrupt the parent's stack. 
+ * Since the semantics of vfork() are rather ill-defined (other than "it's + * faster than fork"), we should theoretically be safe by falling back to + * fork1(). */ -long -lx_fork(void) +static long +lx_fork_common(boolean_t is_vfork) { int ret; + int ptopt = is_vfork ? LX_PTRACE_O_TRACEVFORK : LX_PTRACE_O_TRACEFORK; /* * Inform the in-kernel ptrace(2) subsystem that we are about to * emulate fork(2). */ - lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE); + lx_ptrace_clone_begin(ptopt, B_FALSE); + /* + * Suspend signal delivery and perform the fork operation. + */ + _sigoff(); switch (ret = fork1()) { case -1: + _sigon(); return (-errno); case 0: /* - * Returning in the new child. + * Returning in the new child. We must free the stacks and + * thread-specific data objects for the threads we did not + * duplicate; i.e. every other thread. */ - if (lx_is_rpm) { + lx_free_other_stacks(); + + if (!is_vfork && lx_is_rpm) { (void) sleep(lx_rpm_delay); } - lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEFORK, B_TRUE, 0); + + lx_ptrace_stop_if_option(ptopt, B_TRUE, 0, NULL); + + /* + * Re-enable signal delivery in the child and return to the + * new process. + */ + _sigon(); return (0); default: + lx_ptrace_stop_if_option(ptopt, B_FALSE, (ulong_t)ret, NULL); + /* - * Returning in the new parent. + * Re-enable signal delivery in the parent and return from + * the emulated system call. */ - lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEFORK, B_FALSE, - (ulong_t)ret); + _sigon(); return (ret); } } -/* - * For vfork(), we have a serious problem because the child is not allowed to - * return from the current frame because it will corrupt the parent's stack. - * Since the semantics of vfork() are rather ill-defined (other than "it's - * faster than fork"), we should theoretically be safe by falling back to - * fork1(). - */ long -lx_vfork(void) +lx_fork(void) { - int ret; - - /* - * Inform the in-kernel ptrace(2) subsystem that we are about to - * emulate vfork(2). 
- */ - lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE); - - switch (ret = fork1()) { - case -1: - return (-errno); - - case 0: - /* - * Returning in the new child. - */ - lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEVFORK, B_TRUE, 0); - return (0); + return (lx_fork_common(B_FALSE)); +} - default: - /* - * Returning in the new parent. - */ - lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEVFORK, B_FALSE, - (ulong_t)ret); - return (ret); - } +long +lx_vfork(void) +{ + return (lx_fork_common(B_TRUE)); } diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c index 655374b6f6..661fae3402 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c +++ b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c @@ -42,6 +42,7 @@ #include <zone.h> #include <sys/brand.h> #include <sys/epoll.h> +#include <sys/stack.h> #include <assert.h> #include <stdio.h> @@ -76,26 +77,8 @@ #include <sys/lx_aio.h> /* - * General emulation guidelines. - * - * Once the emulation handler has been installed onto the process, we need to - * be concerned about system calls made by the emulation, as well as any - * library calls which in turn make system calls. This is actually only an - * issue for the 64-bit case, since the kernel sycall entry point is common for - * both Illumos and Linux. The trampoline code in the kernel needs some way to - * distinguish when it should bounce out for emulation (Linux system call) vs. - * stay in the kernel (emulation system call). For the 32-bit case Linux uses - * int80 for system calls which is orthogonal to all of the Illumos system call - * entry points and thus there is no issue. - * - * To cope with this for the 64-bit case, we maintain a mode flag on each - * LWP so we can tell when a system call comes from Linux. We then set the mode - * flag to Illumos so that all future system calls from the emulation are - * handled correctly. 
The emulation must reset the mode when it is ready to - * return control to Linux. This is done via the B_CLR_NTV_SYSC_FLAG brand - * call. There is additional complexity with this mode switching in the - * case of a user-defined signal handler. This is described in the signal - * emulation code comments. + * There is a block comment in "uts/common/brand/lx/os/lx_brand.c" that + * describes the functioning of the LX brand in some detail. * * *** Setting errno * @@ -103,61 +86,12 @@ * application whose address space we're running in. The Linux libc errno is * independent of our native libc errno. To pass back an error the emulation * function should return -errno back to the Linux caller. - * - * *** General considerations - * - * The lx brand interposes on _all_ system calls. Linux system calls that need - * special handling in the kernel are redirected back to the kernel via the - * in-kernel emulation (IKE) mechanism which uses a range of the brand system - * call command number to determine which in-kernel lx function to invoke. - * - * *** DTrace - * - * The lx-syscall DTrace provider (see lx_systrace_attach in - * uts/common/brand/lx/dtrace/lx_systrace.c) works as follows: - * - * When probes are enabled: - * lx_systrace_enable -> lx_brand_systrace_enable - * - * This enables the trace jump table in the kernel (see - * uts/intel/brand/lx/lx_brand_asm.s which has the functions - * lx_brand_int80_enable and lx_brand_syscall_enable, and the corresponding - * patch points lx_brand_int80_patch_point and lx_brand_syscall_patch_point). - * - * The library code defines lx_handler_table and lx_handler_trace_table - * in the i386 and amd64 lx_handler.s code. - * - * The trace jump table enables lx_traceflag which is used in the lx_emulate - * function to make the B_SYSENTRY/B_SYSRETURN brandsys syscalls. These in turn - * will call lx_systrace_entry_ptr/lx_systrace_return_ptr so that we can DTrace - * the Linux syscalls via the provider. 
- * - * When probes are disbaled, we undo the patch points via: - * lx_systrace_disable -> lx_brand_systrace_disable */ - /* * Map Illumos errno to the Linux equivalent. */ -static int stol_errno[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 42, 43, 44, 45, 46, - 47, 48, 49, 50, 51, 35, 47, 22, 38, 22, /* 49 */ - 52, 53, 54, 55, 56, 57, 58, 59, 22, 22, - 61, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 22, 22, 72, 22, 22, 74, 36, 75, - 76, 77, 78, 79, 80, 81, 82, 83, 84, 38, - 40, 85, 86, 39, 87, 88, 89, 90, 91, 92, /* 99 */ - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, - 103, 104, 105, 106, 107, 22, 22, 22, 22, 22, - 22, 22, 22, 108, 109, 110, 111, 112, 113, 114, /* 149 */ - 115, 116 -}; +static int stol_errno[] = LX_STOL_ERRNO_INIT; char lx_release[LX_VERS_MAX]; char lx_cmd_name[MAXNAMLEN]; @@ -172,9 +106,6 @@ struct lx_locale_ending { int se_size; /* solaris ending string length */ }; -__thread int lx_do_syscall_restart; -__thread int lx_had_sigchild; - #define l2s_locale(lname, sname) \ {(lname), (sname), sizeof ((lname)) - 1, sizeof ((sname)) - 1} @@ -184,45 +115,6 @@ __thread int lx_had_sigchild; #endif /* - * This flag is part of the registration with the in-kernel brand module. It's - * used in lx_handler() to determine if we should go back into the kernel after - * a system call in case the kernel needs to perform some post-syscall work - * like tracing for example. - */ -int lx_traceflag; - -#define LX_SYS_NOSYS_REASON 0x07 -#define LX_SYS_EBPARG6 0x08 -#define LX_SYS_IKE 0x10 - -#define LX_IKE(sysnum) ((long(*)(void))LX_EMUL_##sysnum) - -/* - * Flags that denote the specific reason that we don't have a particular - * system call. These reasons are only valid if the function is NULL. 
- */ -#define NOSYS_NULL 0 -#define NOSYS_NONE 1 -#define NOSYS_NO_EQUIV 2 -#define NOSYS_KERNEL 3 -#define NOSYS_UNDOC 4 -#define NOSYS_OBSOLETE 5 -#define NOSYS_MAX 5 - -#if NOSYS_MAX > LX_SYS_NOSYS_REASON -#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON -#endif - -static char *nosys_reasons[] = { - "Not done yet", - "No such Linux system call", - "No equivalent Solaris functionality", - "Reads/modifies Linux kernel state", - "Undocumented and/or rarely used system call", - "Unsupported, obsolete system call" -}; - -/* * Most syscalls return an int but some return something else, typically a * ssize_t. This can be either an int or a long, depending on if we're compiled * for 32-bit or 64-bit. To correctly propagate the -errno return code in the @@ -231,14 +123,9 @@ static char *nosys_reasons[] = { * Linux, we will have the right size value in both the 32 and 64 bit cases. */ -struct lx_sysent { - char *sy_name; - long (*sy_callc)(); - char sy_flags; - char sy_narg; -}; +typedef long (*lx_syscall_handler_t)(); -static struct lx_sysent sysents[LX_NSYSCALLS + 1]; +static lx_syscall_handler_t lx_handlers[LX_NSYSCALLS + 1]; static uintptr_t stack_bottom; @@ -254,7 +141,6 @@ int lx_verbose = 0; /* verbose mode enabled if non-zero */ int lx_debug_enabled = 0; /* debugging output enabled if non-zero */ pid_t zoneinit_pid; /* zone init PID */ -long max_pid; /* native maximum PID */ thread_key_t lx_tsd_key; @@ -458,271 +344,82 @@ lx_unsupported(char *msg, ...) (void) kill(getpid(), SIGSYS); } -extern void lx_runexe(void *argv, void *entry); int lx_init(int argc, char *argv[], char *envp[]); -static int -lx_emulate_args(lx_regs_t *rp, struct lx_sysent *s, uintptr_t *args) +lx_tsd_t * +lx_get_tsd(void) { -#if defined(_LP64) - /* - * Note: Syscall argument passing is different from function call - * argument passing on amd64. For function calls, the fourth arg is - * passed via %rcx, but for system calls the 4th arg is passed via %r10. 
- * This is because in amd64, the syscall instruction puts the lower - * 32 bits of %rflags in %r11 and puts the %rip value to %rcx. - * - * Appendix A of the amd64 ABI (Linux conventions) states that syscalls - * are limited to 6 args and no arg is passed on the stack. - */ - args[0] = rp->lxr_rdi; - args[1] = rp->lxr_rsi; - args[2] = rp->lxr_rdx; - args[3] = rp->lxr_r10; - args[4] = rp->lxr_r8; - args[5] = rp->lxr_r9; -#else - /* - * If the system call takes 6 args, then libc has stashed them in - * memory at the address contained in %ebx. Except for some syscalls - * which store the 6th argument in %ebp. - */ - if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { - if (uucopy((void *)rp->lxr_ebx, args, - sizeof (args[0]) * 6) != 0) - return (-stol_errno[errno]); - } else { - args[0] = rp->lxr_ebx; - args[1] = rp->lxr_ecx; - args[2] = rp->lxr_edx; - args[3] = rp->lxr_esi; - args[4] = rp->lxr_edi; - args[5] = rp->lxr_ebp; + int ret; + lx_tsd_t *lx_tsd; + + if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) { + lx_err_fatal("lx_get_tsd: unable to read " + "thread-specific data: %s", strerror(ret)); } -#endif - return (0); + assert(lx_tsd != 0); + + return (lx_tsd); } +/* + * This function is called from the kernel like a signal handler. Each + * function call is a request to provide emulation for a system call that, on + * illumos, is implemented in userland. The system call number selection and + * argument parsing have already been done by the kernel. 
+ */ void -lx_emulate(lx_regs_t *rp) +lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args) { - struct lx_sysent *s; - uintptr_t args[6]; -#if defined(_ILP32) - uintptr_t gs = rp->lxr_gs & 0xffff; /* %gs is only 16 bits */ -#endif - int syscall_num; - long ret; + long emu_ret; + int emu_errno = 0; -#if defined(_LP64) - syscall_num = rp->lxr_rax; -#else - syscall_num = rp->lxr_eax; -#endif + LX_EMULATE_ENTER(ucp, syscall_num, args); + lx_debug("lx_emulate(%p, %d, [%p, %p, %p, %p, %p, %p])\n", + ucp, syscall_num, args[0], args[1], args[2], args[3], args[4], + args[5]); /* - * lx_brand_int80_callback() or lx_brand_syscall_callback() ensures - * that the syscall_num is sane; Use it as is. + * The kernel should have saved us a context that will not restore the + * previous signal mask. Some emulated system calls alter the signal + * mask; restoring it after the emulation would cancel that out. */ - assert(syscall_num >= 0); - assert(syscall_num < (sizeof (sysents) / sizeof (sysents[0]))); - s = &sysents[syscall_num]; - - if ((ret = lx_emulate_args(rp, s, args)) != 0) - goto out; + assert(!(ucp->uc_flags & UC_SIGMASK)); /* - * If the tracing flag is enabled we call into the brand-specific - * kernel module to handle the tracing activity (DTrace or ptrace). - * It would be tempting to perform DTrace activity in the brand - * module's syscall trap callback, rather than having to return - * to the kernel here, but -- since argument encoding can vary - * according to the specific system call -- that would require - * replicating the knowledge of argument decoding in the kernel - * module as well as here in the brand library. + * The kernel ensures that the syscall_num is sane; Use it as is. */ - if (lx_traceflag != 0) { - /* - * Part of the ptrace "interface" is that on syscall entry - * %rax / %eax should be reported as -ENOSYS while the - * orig_rax / orig_eax field of the user structure needs to - * contain the actual system call number. 
If we end up stopping - * here, the controlling process will dig the lx_regs_t - * structure out of our stack. - */ -#if defined(_LP64) - rp->lxr_orig_rax = syscall_num; - rp->lxr_rax = -stol_errno[ENOSYS]; -#else - rp->lxr_orig_eax = syscall_num; - rp->lxr_eax = -stol_errno[ENOSYS]; -#endif - - (void) syscall(SYS_brand, B_SYSENTRY, syscall_num, args); - - /* - * The external tracer may have modified the arguments to this - * system call. Refresh the argument cache to account for this. - */ - if ((ret = lx_emulate_args(rp, s, args)) != 0) - goto out; - } - - if (s->sy_callc == NULL) { - int reason = s->sy_flags & LX_SYS_NOSYS_REASON; - lx_unsupported("unimplemented syscall #%d (%s): %s\n", - syscall_num, s->sy_name, nosys_reasons[reason]); - ret = -stol_errno[ENOTSUP]; - goto out; - } - - if (LX_DEBUG_ISENABLED) { - const char *fmt = NULL; - - switch (s->sy_narg) { - case 0: - fmt = "calling %s()"; - break; - case 1: - fmt = "calling %s(0x%p)"; - break; - case 2: - fmt = "calling %s(0x%p, 0x%p)"; - break; - case 3: - fmt = "calling %s(0x%p, 0x%p, 0x%p)"; - break; - case 4: - fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p)"; - break; - case 5: - fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p, 0x%p)"; - break; - case 6: - fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p, 0x%p, 0x%p)"; - break; - } - - lx_debug(fmt, s->sy_name, args[0], args[1], args[2], args[3], - args[4], args[5]); + assert(syscall_num >= 0); + assert(syscall_num < (sizeof (lx_handlers) / sizeof (lx_handlers[0]))); + if (lx_handlers[syscall_num] == NULL) { + lx_err_fatal("lx_emulate: kernel sent us a call we cannot " + "emulate (%d)", syscall_num); } /* - * On 64-bit code, the %gs will be 0 in both native and Linux code. 
+ * Call our handler function: */ -#if defined(_ILP32) - if (gs != LWPGS_SEL) { - lx_tsd_t *lx_tsd; - - /* - * While a %gs of 0 is technically legal (as long as the - * application never dereferences memory using %gs), Solaris - * has its own ideas as to how a zero %gs should be handled in - * _update_sregs(), such that any 32-bit user process with a - * %gs of zero running on a system with a 64-bit kernel will - * have its %gs hidden base register stomped on on return from - * a system call, leaving an incorrect base address in place - * until the next time %gs is actually reloaded (forcing a - * reload of the base address from the appropriate descriptor - * table.) - * - * Of course the kernel will once again stomp on THAT base - * address when returning from a system call, resulting in an - * an application segmentation fault. - * - * To avoid this situation, disallow a save of a zero %gs - * here in order to try and capture any Linux process that - * attempts to make a syscall with a zero %gs installed. 
- */ - assert(gs != 0); - - if ((ret = thr_getspecific(lx_tsd_key, - (void **)&lx_tsd)) != 0) - lx_err_fatal("lx_emulate: unable to read " - "thread-specific data: %s", strerror(ret)); - - assert(lx_tsd != 0); - - lx_tsd->lxtsd_gs = gs; - - lx_debug("lx_emulate(): gsp 0x%p, saved gs: 0x%x", lx_tsd, gs); - } -#endif /* _ILP32 */ - -restart_syscall: - if (s->sy_flags & LX_SYS_IKE) { - lx_debug("\tsyscall %d re-vectoring to lx kernel module " - "for %s()", syscall_num, s->sy_name); - - if ((ret = syscall(SYS_brand, B_IKE_SYSCALL, - (uintptr_t)s->sy_callc, args)) == -1) - ret = -errno; - } else { - ret = s->sy_callc(args[0], args[1], args[2], - args[3], args[4], args[5]); - } - - if (ret > -65536 && ret < 65536) - lx_debug("\t= %d", ret); - else - lx_debug("\t= 0x%x", ret); + emu_ret = lx_handlers[syscall_num](args[0], args[1], args[2], args[3], + args[4], args[5]); /* - * If the return value is between -1 and -4095 then it's an errno, so - * we translate the Illumos error number into the Linux equivalent. + * If the return value is between -1 and -4095 then it's an errno. + * The kernel will translate it to the Linux equivalent for us. */ - if (ret < 0 && ret > -4096) { - if (-ret >= sizeof (stol_errno) / sizeof (stol_errno[0])) { - lx_debug("Invalid return value from emulated " - "syscall %d (%s): %d\n", - syscall_num, s->sy_name, ret); - assert(0); - } - - ret = -stol_errno[-ret]; + if (emu_ret < 0 && emu_ret > -4096) { + emu_errno = (int)-emu_ret; } - if (lx_do_syscall_restart && ret == -stol_errno[EINTR]) { - lx_debug("restarting system call due to signal interruption"); - lx_do_syscall_restart = 0; - goto restart_syscall; - } - -out: /* - * For 32-bit, %eax holds the return code from the system call. For - * 64-bit, %rax holds the return code. 
+ * Return to the context we were passed */ -#if defined(_LP64) - rp->lxr_rax = ret; -#else - rp->lxr_eax = ret; -#endif + LX_EMULATE_RETURN(ucp, syscall_num, emu_ret, emu_errno); + lx_debug("\tlx_emulate(%d) done (ret %ld / 0x%p ; errno %d)", + syscall_num, emu_ret, emu_ret, emu_errno); + (void) syscall(SYS_brand, B_EMULATION_DONE, ucp, syscall_num, emu_ret, + emu_errno); - /* - * If the trace flag is set, bounce into the kernel to let it do - * any necessary tracing (DTrace or ptrace). - */ - if (lx_traceflag != 0) { -#if defined(_LP64) - rp->lxr_orig_rax = syscall_num; -#else - rp->lxr_orig_eax = syscall_num; -#endif - (void) syscall(SYS_brand, B_SYSRETURN, syscall_num, ret); - } - -#if defined(_LP64) - /* - * For 64-bit code this must be the last thing we do in the emulation - * code path before we return back to the Linux program. This will - * disable native syscalls so the next time a syscall happens on this - * thread, it will come back into the emulation. We can omit the extra - * syscall overhead in the 32-bit case. - */ - (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG); -#endif + assert(!"cannot be returned here"); } static void @@ -771,6 +468,106 @@ map_vdso() } #endif +/* + * Initialize the thread specific data for this thread. + */ +void +lx_init_tsd(lx_tsd_t *lxtsd) +{ + int err; + + bzero(lxtsd, sizeof (*lxtsd)); + lxtsd->lxtsd_exit = LX_ET_NONE; + + /* + * The Linux alternate signal stack is initially disabled: + */ + lxtsd->lxtsd_sigaltstack.ss_flags = LX_SS_DISABLE; + + /* + * Create a per-thread exit context from the current register and + * native/brand stack state. Replace the saved program counter value + * with the address of lx_exit_common(); we wish to revector there when + * the thread or process is exiting. 
+ */ + if (getcontext(&lxtsd->lxtsd_exit_context) != 0) { + lx_err_fatal("Unable to initialize thread-specific exit " + "context: %s", strerror(errno)); + } + LX_REG(&lxtsd->lxtsd_exit_context, REG_PC) = (uintptr_t)lx_exit_common; + + /* + * Align the stack pointer and clear the frame pointer. + */ + LX_REG(&lxtsd->lxtsd_exit_context, REG_FP) = 0; + LX_REG(&lxtsd->lxtsd_exit_context, REG_SP) &= ~(STACK_ALIGN - 1UL); +#if defined(_LP64) +#if (STACK_ENTRY_ALIGN != 8) && (STACK_ALIGN != 16) +#error "lx_init_tsd: unexpected STACK_[ENTRY_]ALIGN values" +#endif + /* + * The AMD64 ABI requires that, on entry to a function, the stack + * pointer must be 8-byte aligned, but _not_ 16-byte aligned. When + * the frame pointer is pushed, the alignment will then be correct. + */ + LX_REG(&lxtsd->lxtsd_exit_context, REG_SP) -= STACK_ENTRY_ALIGN; +#endif + + /* + * Block all signals in the exit context to avoid taking any signals + * (to the degree possible) while exiting. + */ + (void) sigfillset(&lxtsd->lxtsd_exit_context.uc_sigmask); + + if ((err = thr_setspecific(lx_tsd_key, lxtsd)) != 0) { + lx_err_fatal("Unable to initialize thread-specific data: %s", + strerror(err)); + } +} + +static void +lx_start(uintptr_t sp, uintptr_t entry) +{ + ucontext_t jump_uc; + + if (getcontext(&jump_uc) != 0) { + lx_err_fatal("Unable to getcontext for program start: %s", + strerror(errno)); + } + + /* + * We want to load the general registers from this + * context, and switch to the BRAND stack. + */ + jump_uc.uc_flags = UC_CPU; + jump_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND; + + LX_REG(&jump_uc, REG_FP) = NULL; + LX_REG(&jump_uc, REG_SP) = sp; + LX_REG(&jump_uc, REG_PC) = entry; + +#if defined(_LP64) + /* + * The AMD64 ABI states that at process entry, %rdx contains "a + * function pointer that the application should register with + * atexit()". We make sure to pass NULL explicitly so that + * no function is registered. 
+ */ + LX_REG(&jump_uc, REG_RDX) = NULL; +#endif + + lx_debug("starting Linux program sp %p ldentry %p", sp, entry); + + /* + * This system call should not return. + */ + if (syscall(SYS_brand, B_JUMP_TO_LINUX, &jump_uc) == -1) { + lx_err_fatal("B_JUMP_TO_LINUX failed: %s", + strerror(errno)); + } + abort(); +} + /*ARGSUSED*/ int lx_init(int argc, char *argv[], char *envp[]) @@ -781,7 +578,7 @@ lx_init(int argc, char *argv[], char *envp[]) int err; lx_elf_data_t edp; lx_brand_registration_t reg; - static lx_tsd_t lx_tsd; + lx_tsd_t *lxtsd; #if defined(_LP64) void *vdso_hdr; #endif @@ -836,13 +633,10 @@ lx_init(int argc, char *argv[], char *envp[]) lx_debug("VERBOSE mode enabled.\n"); } - /* needed in wait4(), get it once since it never changes */ - max_pid = sysconf(_SC_MAXPID); - (void) strlcpy(lx_cmd_name, basename(argv[0]), sizeof (lx_cmd_name)); lx_debug("executing linux process: %s", argv[0]); lx_debug("branding myself and setting handler to 0x%p", - (void *)lx_handler_table); + (void *)lx_emulate); /* * The version of rpm that ships with CentOS/RHEL 3.x has a race @@ -863,9 +657,7 @@ lx_init(int argc, char *argv[], char *envp[]) lx_is_rpm = B_TRUE; reg.lxbr_version = LX_VERSION; - reg.lxbr_handler = (void *)&lx_handler_table; - reg.lxbr_tracehandler = (void *)&lx_handler_trace_table; - reg.lxbr_traceflag = (void *)&lx_traceflag; + reg.lxbr_handler = (void *)&lx_emulate; /* * Register the address of the user-space handler with the lx brand @@ -942,64 +734,74 @@ lx_init(int argc, char *argv[], char *envp[]) lxt_server_init(argc, argv); /* Setup signal handler information. */ - if (lx_siginit()) + if (lx_siginit()) { lx_err_fatal("failed to initialize lx signals for the " "branded process"); + } /* Setup thread-specific data area for managing linux threads. 
*/ - if ((err = thr_keycreate(&lx_tsd_key, NULL)) != 0) + if ((err = thr_keycreate(&lx_tsd_key, NULL)) != 0) { lx_err_fatal("thr_keycreate(lx_tsd_key) failed: %s", strerror(err)); + } lx_debug("thr_keycreate created lx_tsd_key (%d)", lx_tsd_key); - /* Initialize the thread specific data for this thread. */ - bzero(&lx_tsd, sizeof (lx_tsd)); -#if defined(_ILP32) - /* start with %gs having the native libc value */ - lx_tsd.lxtsd_gs = LWPGS_SEL; -#endif - - if ((err = thr_setspecific(lx_tsd_key, &lx_tsd)) != 0) - lx_err_fatal("Unable to initialize thread-specific data: %s", - strerror(err)); - /* - * Save the current context of this thread. - * We'll restore this context when this thread attempts to exit. + * Initialize the thread specific data for this thread. */ - if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) - lx_err_fatal("Unable to initialize thread-specific exit " - "context: %s", strerror(errno)); - - if (lx_tsd.lxtsd_exit == LX_ET_NONE) { -#if defined(_LP64) - /* Switch to Linux syscall mode */ - (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG); -#endif - - lx_runexe(argv, (void *)edp.ed_ldentry); - /* lx_runexe() never returns. */ - assert(0); + if ((lxtsd = malloc(sizeof (*lxtsd))) == NULL) { + lx_err_fatal("failed to allocate tsd for main thread: %s", + strerror(errno)); } + lx_debug("lx tsd allocated @ %p", lxtsd); + lx_init_tsd(lxtsd); /* - * We are here because the Linux application called the exit() or - * exit_group() system call. In turn the brand library did a - * setcontext() to jump to the thread context state we saved above. + * Allocate the brand emulation stack for the main process thread. + * Register the thread-specific data structure with the stack list so + * that it may be freed at thread exit or fork(2). */ - lx_exit_common(lx_tsd.lxtsd_exit, lx_tsd.lxtsd_exit_status); + lx_install_stack(NULL, 0, lxtsd); + + /* + * The brand linker expects the stack pointer to point to + * "argc", which is just before &argv[0]. 
+ */ + lx_start((uintptr_t)argv - sizeof (void *), edp.ed_ldentry); + /*NOTREACHED*/ + abort(); return (0); } +/* + * We "return" to this function via a context hand-crafted by + * "lx_init_tsd()"; see that function for more detail. + * + * NOTE: Our call frame is on the main thread stack, not the alternate native + * stack -- it is safe to release the latter here. The frame does not have a + * valid return address, so this function MUST NOT return. + */ void -lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value) +lx_exit_common(void) { - int ev = 0xff & exit_value; + lx_tsd_t *lxtsd = lx_get_tsd(); + int ev = (0xff & lxtsd->lxtsd_exit_status); - switch (exit_type) { + switch (lxtsd->lxtsd_exit) { case LX_ET_EXIT: + lx_debug("lx_exit_common(LX_ET_EXIT, %d)\n", ev); + + /* + * If the thread is exiting, but not the entire process, we + * must free the stack we allocated for usermode emulation. + * This is safe to do here because the setcontext() put us + * back on the BRAND stack for this process. This function + * also frees the thread-specific data object for this thread. + */ + lx_free_stack(); + /* * The native thread return value is never seen so we pass * NULL. @@ -1008,6 +810,7 @@ lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value) break; case LX_ET_EXIT_GROUP: + lx_debug("lx_exit_common(LX_ET_EXIT_GROUP, %d)\n", ev); exit(ev); break; @@ -1018,30 +821,74 @@ lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value) abort(); } -/* - * Walk back through the stack until we find the lx_emulate() frame. - */ -lx_regs_t * -lx_syscall_regs(void) +const ucontext_t * +lx_find_brand_uc(void) { - /* LINTED - alignment */ - struct frame *fr = (struct frame *)_getfp(); + ucontext_t *ucp = NULL; + + /* + * Ask for the current emulation (or signal handling) ucontext_t... 
+ */ + assert(syscall(SYS_brand, B_GET_CURRENT_CONTEXT, &ucp) == 0); + + for (;;) { + uintptr_t flags; - while (fr->fr_savpc != (uintptr_t)&lx_emulate_done) { - fr = (struct frame *)fr->fr_savfp; - assert(fr->fr_savpc != NULL); + lx_debug("lx_find_brand_uc: inspect ucp %p...\n", ucp); + assert(ucp != NULL); + + flags = (uintptr_t)ucp->uc_brand_data[0]; + + if (flags & LX_UC_STACK_BRAND) { + lx_debug("lx_find_brand_uc: ucp %p\n", ucp); + + return (ucp); + } + + lx_debug("lx_find_brand_uc: skip non-BRAND ucp %p\n", ucp); + + /* + * Walk up the context chain to find the most recently stored + * brand register state. + */ + ucp = ucp->uc_link; } +} + +uintptr_t +lx_find_brand_sp(void) +{ + const ucontext_t *ucp = lx_find_brand_uc(); + uintptr_t sp = LX_REG(ucp, REG_SP); + + lx_debug("lx_find_brand_sp: ucp %p sp %p\n", ucp, sp); + + return (sp); +} + +ucontext_t * +lx_syscall_regs(void) +{ + ucontext_t *ucp = NULL; + uintptr_t flags; -#if defined(_LP64) /* - * This is %rbp, update to be at the end of the frame for correct - * struct offsets. lx_emulate only takes one parameter, a pointer to - * lx_regs_t. + * Ask for the current emulation (or signal handling) ucontext_t... */ - return ((lx_regs_t *)(fr->fr_savfp - sizeof (lx_regs_t))); -#else - return ((lx_regs_t *)((uintptr_t *)fr)[2]); -#endif + assert(syscall(SYS_brand, B_GET_CURRENT_CONTEXT, &ucp) == 0); + assert(ucp != NULL); + + /* + * Use of the lx_syscall_regs() function implies that the topmost (i.e. + * current) context is for a system call emulation request from the + * kernel, rather than a signal handling frame. 
+ */ + flags = (uintptr_t)ucp->uc_brand_data[0]; + assert(flags & LX_UC_FRAME_IS_SYSCALL); + + lx_debug("lx_syscall_regs: ucp %p\n", ucp); + + return (ucp); } int @@ -1111,324 +958,330 @@ lx_fd_to_path(int fd, char *buf, int buf_size) #if defined(_LP64) /* The following is the 64-bit syscall table */ -static struct lx_sysent sysents[] = { - {"read", LX_IKE(read), LX_SYS_IKE, 3}, /* 0 */ - {"write", lx_write, 0, 3}, /* 1 */ - {"open", lx_open, 0, 3}, /* 2 */ - {"close", lx_close, 0, 1}, /* 3 */ - {"stat", lx_stat64, 0, 2}, /* 4 */ - {"fstat", lx_fstat64, 0, 2}, /* 5 */ - {"lstat", lx_lstat64, 0, 2}, /* 6 */ - {"poll", lx_poll, 0, 3}, /* 7 */ - {"lseek", lx_lseek, 0, 3}, /* 8 */ - {"mmap", lx_mmap, 0, 6}, /* 9 */ - {"mprotect", lx_mprotect, 0, 3}, /* 10 */ - {"munmap", lx_munmap, 0, 2}, /* 11 */ - {"brk", LX_IKE(brk), LX_SYS_IKE, 1}, /* 12 */ - {"rt_sigaction", lx_rt_sigaction, 0, 4}, /* 13 */ - {"rt_sigprocmask", lx_rt_sigprocmask, 0, 4}, /* 14 */ - {"rt_sigreturn", lx_rt_sigreturn, 0, 0}, /* 15 */ - {"ioctl", LX_IKE(ioctl), LX_SYS_IKE, 3}, /* 16 */ - {"pread64", lx_pread, 0, 4}, /* 17 */ - {"pwrite64", lx_pwrite, 0, 4}, /* 18 */ - {"readv", lx_readv, 0, 3}, /* 19 */ - {"writev", lx_writev, 0, 3}, /* 20 */ - {"access", lx_access, 0, 2}, /* 21 */ - {"pipe", LX_IKE(pipe), LX_SYS_IKE, 1}, /* 22 */ - {"select", lx_select, 0, 5}, /* 23 */ - {"sched_yield", lx_yield, 0, 0}, /* 24 */ - {"mremap", lx_remap, 0, 5}, /* 25 */ - {"msync", lx_msync, 0, 3}, /* 26 */ - {"mincore", lx_mincore, 0, 3}, /* 27 */ - {"madvise", lx_madvise, 0, 3}, /* 28 */ - {"shmget", lx_shmget, 0, 3}, /* 29 */ - {"shmat", lx_shmat, 0, 4}, /* 30 */ - {"shmctl", lx_shmctl, 0, 3}, /* 31 */ - {"dup", lx_dup, 0, 1}, /* 32 */ - {"dup2", lx_dup2, 0, 2}, /* 33 */ - {"pause", lx_pause, 0, 0}, /* 34 */ - {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */ - {"getitimer", lx_getitimer, 0, 2}, /* 36 */ - {"alarm", lx_alarm, 0, 1}, /* 37 */ - {"setitimer", lx_setitimer, 0, 3}, /* 38 */ - {"getpid", lx_getpid, 0, 0}, /* 39 
*/ - {"sendfile", lx_sendfile64, 0, 4}, /* 40 */ - {"socket", lx_socket, 0, 3}, /* 41 */ - {"connect", lx_connect, 0, 3}, /* 42 */ - {"accept", lx_accept, 0, 3}, /* 43 */ - {"sendto", lx_sendto, 0, 6}, /* 44 */ - {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */ - {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */ - {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */ - {"shutdown", lx_shutdown, 0, 2}, /* 48 */ - {"bind", lx_bind, 0, 3}, /* 49 */ - {"listen", lx_listen, 0, 2}, /* 50 */ - {"getsockname", lx_getsockname, 0, 3}, /* 51 */ - {"getpeername", lx_getpeername, 0, 3}, /* 52 */ - {"socketpair", lx_socketpair, 0, 4}, /* 53 */ - {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */ - {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */ - {"clone", lx_clone, 0, 5}, /* 56 */ - {"fork", lx_fork, 0, 0}, /* 57 */ - {"vfork", lx_vfork, 0, 0}, /* 58 */ - {"execve", lx_execve, 0, 3}, /* 59 */ - {"exit", lx_exit, 0, 1}, /* 60 */ - {"wait4", lx_wait4, 0, 4}, /* 61 */ - {"kill", LX_IKE(kill), LX_SYS_IKE, 2}, /* 62 */ - {"uname", lx_uname, 0, 1}, /* 63 */ - {"semget", lx_semget, 0, 3}, /* 64 */ - {"semop", lx_semop, 0, 3}, /* 65 */ - {"semctl", lx_semctl, 0, 4}, /* 66 */ - {"shmdt", lx_shmdt, 0, 1}, /* 67 */ - {"msgget", lx_msgget, 0, 2}, /* 68 */ - {"msgsnd", lx_msgsnd, 0, 4}, /* 69 */ - {"msgrcv", lx_msgrcv, 0, 5}, /* 70 */ - {"msgctl", lx_msgctl, 0, 3}, /* 71 */ - {"fcntl", lx_fcntl64, 0, 3}, /* 72 */ - {"flock", lx_flock, 0, 2}, /* 73 */ - {"fsync", lx_fsync, 0, 1}, /* 74 */ - {"fdatasync", lx_fdatasync, 0, 1}, /* 75 */ - {"truncate", lx_truncate, 0, 2}, /* 76 */ - {"ftruncate", lx_ftruncate, 0, 2}, /* 77 */ - {"getdents", lx_getdents, 0, 3}, /* 78 */ - {"getcwd", lx_getcwd, 0, 2}, /* 79 */ - {"chdir", lx_chdir, 0, 1}, /* 80 */ - {"fchdir", lx_fchdir, 0, 1}, /* 81 */ - {"rename", lx_rename, 0, 2}, /* 82 */ - {"mkdir", lx_mkdir, 0, 2}, /* 83 */ - {"rmdir", lx_rmdir, 0, 1}, /* 84 */ - {"creat", lx_creat, 0, 2}, /* 85 */ - {"link", lx_link, 0, 2}, /* 86 */ - {"unlink", lx_unlink, 0, 1}, /* 87 */ - {"symlink", 
lx_symlink, 0, 2}, /* 88 */ - {"readlink", lx_readlink, 0, 3}, /* 89 */ - {"chmod", lx_chmod, 0, 2}, /* 90 */ - {"fchmod", lx_fchmod, 0, 2}, /* 91 */ - {"chown", lx_chown, 0, 3}, /* 92 */ - {"fchown", lx_fchown, 0, 3}, /* 93 */ - {"lchown", lx_lchown, 0, 3}, /* 94 */ - {"umask", lx_umask, 0, 1}, /* 95 */ - {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */ - {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */ - {"getrusage", lx_getrusage, 0, 2}, /* 98 */ - {"sysinfo", LX_IKE(sysinfo), LX_SYS_IKE, 1}, /* 99 */ - {"times", lx_times, 0, 1}, /* 100 */ - {"ptrace", lx_ptrace, 0, 4}, /* 101 */ - {"getuid", lx_getuid, 0, 0}, /* 102 */ - {"syslog", lx_syslog, 0, 3}, /* 103 */ - {"getgid", lx_getgid, 0, 0}, /* 104 */ - {"setuid", lx_setuid, 0, 1}, /* 105 */ - {"setgid", lx_setgid, 0, 1}, /* 106 */ - {"geteuid", lx_geteuid, 0, 0}, /* 107 */ - {"getegid", lx_getegid, 0, 0}, /* 108 */ - {"setpgid", lx_setpgid, 0, 2}, /* 109 */ - {"getppid", LX_IKE(getppid), LX_SYS_IKE, 0}, /* 110 */ - {"getpgrp", lx_getpgrp, 0, 0}, /* 111 */ - {"setsid", lx_setsid, 0, 0}, /* 112 */ - {"setreuid", lx_setreuid, 0, 0}, /* 113 */ - {"setregid", lx_setregid, 0, 0}, /* 114 */ - {"getgroups", lx_getgroups, 0, 2}, /* 115 */ - {"setgroups", lx_setgroups, 0, 2}, /* 116 */ - {"setresuid", LX_IKE(setresuid), LX_SYS_IKE, 3}, /* 117 */ - {"getresuid", lx_getresuid, 0, 3}, /* 118 */ - {"setresgid", LX_IKE(setresgid), LX_SYS_IKE, 3}, /* 119 */ - {"getresgid", lx_getresgid, 0, 3}, /* 120 */ - {"getpgid", lx_getpgid, 0, 1}, /* 121 */ - {"setfsuid", lx_setfsuid, 0, 1}, /* 122 */ - {"setfsgid", lx_setfsgid, 0, 1}, /* 123 */ - {"getsid", lx_getsid, 0, 1}, /* 124 */ - {"capget", lx_capget, 0, 2}, /* 125 */ - {"capset", lx_capset, 0, 2}, /* 126 */ - {"rt_sigpending", lx_rt_sigpending, 0, 2}, /* 127 */ - {"rt_sigtimedwait", lx_rt_sigtimedwait, 0, 4}, /* 128 */ - {"rt_sigqueueinfo", lx_rt_sigqueueinfo, 0, 3}, /* 129 */ - {"rt_sigsuspend", lx_rt_sigsuspend, 0, 2}, /* 130 */ - {"sigaltstack", lx_sigaltstack, 0, 2}, /* 131 */ - 
{"utime", lx_utime, 0, 2}, /* 132 */ - {"mknod", lx_mknod, 0, 3}, /* 133 */ - {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ - {"personality", lx_personality, 0, 1}, /* 135 */ - {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ - {"statfs", lx_statfs, 0, 2}, /* 137 */ - {"fstatfs", lx_fstatfs, 0, 2}, /* 138 */ - {"sysfs", lx_sysfs, 0, 3}, /* 139 */ - {"getpriority", lx_getpriority, 0, 2}, /* 140 */ - {"setpriority", lx_setpriority, 0, 3}, /* 141 */ - {"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */ - {"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */ - {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */ - {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 145 */ - {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 146 */ - {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 147 */ - {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 148 */ - {"mlock", lx_mlock, 0, 2}, /* 149 */ - {"munlock", lx_munlock, 0, 2}, /* 150 */ - {"mlockall", lx_mlockall, 0, 1}, /* 151 */ - {"munlockall", lx_munlockall, 0, 0}, /* 152 */ - {"vhangup", lx_vhangup, 0, 0}, /* 153 */ - {"modify_ldt", LX_IKE(modify_ldt), LX_SYS_IKE, 3}, /* 154 */ - {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ - {"sysctl", lx_sysctl, 0, 1}, /* 156 */ - {"prctl", lx_prctl, 0, 5}, /* 157 */ - {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ - {"adjtimex", lx_adjtimex, 0, 1}, /* 159 */ - {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */ - {"chroot", lx_chroot, 0, 1}, /* 161 */ - {"sync", lx_sync, 0, 0}, /* 162 */ - {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */ - {"settimeofday", lx_settimeofday, 0, 2}, /* 164 */ - {"mount", lx_mount, 0, 5}, /* 165 */ - {"umount2", lx_umount2, 0, 2}, /* 166 */ - {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */ - {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */ - {"reboot", lx_reboot, 0, 4}, /* 169 */ - {"sethostname", lx_sethostname, 0, 2}, /* 170 */ - {"setdomainname", lx_setdomainname, 0, 2}, /* 171 */ - {"iopl", NULL, NOSYS_NO_EQUIV, 
0}, /* 172 */ - {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ - {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ - {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ - {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ - {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ - {"query_module", lx_query_module, NOSYS_KERNEL, 5}, /* 178 */ - {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ - {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ - {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ - {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ - {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ - {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ - {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ - {"gettid", LX_IKE(gettid), LX_SYS_IKE, 0}, /* 186 */ - {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ - {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 188 */ - {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 189 */ - {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 190 */ - {"getxattr", lx_xattr4, 0, 4}, /* 191 */ - {"lgetxattr", lx_xattr4, 0, 4}, /* 192 */ - {"fgetxattr", lx_xattr4, 0, 4}, /* 193 */ - {"listxattr", lx_xattr3, 0, 3}, /* 194 */ - {"llistxattr", lx_xattr3, 0, 3}, /* 195 */ - {"flistxattr", lx_xattr3, 0, 3}, /* 196 */ - {"removexattr", lx_xattr2, 0, 2}, /* 197 */ - {"lremovexattr", lx_xattr2, 0, 2}, /* 198 */ - {"fremovexattr", lx_xattr2, 0, 2}, /* 199 */ - {"tkill", LX_IKE(tkill), LX_SYS_IKE, 2}, /* 200 */ - {"time", lx_time, 0, 1}, /* 201 */ - {"futex", LX_IKE(futex), LX_SYS_IKE, 6}, /* 202 */ - {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 203 */ - {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 204 */ - {"set_thread_area", LX_IKE(set_thread_area), LX_SYS_IKE, 1}, /* 205 */ - {"io_setup", lx_io_setup, 0, 2}, /* 206 */ - {"io_destroy", lx_io_destroy, 0, 1}, /* 207 */ - {"io_getevents", lx_io_getevents, 0, 5}, /* 208 */ - {"io_submit", lx_io_submit, 0, 3}, /* 209 */ - {"io_cancel", lx_io_cancel, 0, 3}, /* 210 */ - {"get_thread_area", LX_IKE(get_thread_area), 
LX_SYS_IKE, 1}, /* 211 */ - {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ - {"epoll_create", lx_epoll_create, 0, 1}, /* 213 */ - {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ - {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ - {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ - {"getdents64", lx_getdents64, 0, 3}, /* 217 */ - {"set_tid_address", LX_IKE(set_tid_address), LX_SYS_IKE, 1}, /* 218 */ - {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ - {"semtimedop", lx_semtimedop, 0, 4}, /* 220 */ - {"fadvise64", lx_fadvise64_64, 0, 4}, /* 221 */ - {"timer_create", lx_timer_create, 0, 3}, /* 222 */ - {"timer_settime", lx_timer_settime, 0, 4}, /* 223 */ - {"timer_gettime", lx_timer_gettime, 0, 2}, /* 224 */ - {"timer_getoverrun", lx_timer_getoverrun, 0, 1}, /* 225 */ - {"timer_delete", lx_timer_delete, 0, 1}, /* 226 */ - {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */ - {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */ - {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */ - {"clock_nanosleep", lx_clock_nanosleep, 0, 4}, /* 230 */ - {"exit_group", lx_group_exit, 0, 1}, /* 231 */ - {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */ - {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */ - {"tgkill", LX_IKE(tgkill), LX_SYS_IKE, 3}, /* 234 */ - {"utimes", lx_utimes, 0, 2}, /* 235 */ - {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ - {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ - {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ - {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ - {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ - {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ - {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ - {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ - {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ - {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ - {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ - {"waitid", lx_waitid, 0, 4}, /* 247 */ - {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ - {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ - 
{"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ - {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 251 */ - {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 252 */ - {"inotify_init", lx_inotify_init, 0, 0}, /* 253 */ - {"inotify_add_watch", lx_inotify_add_watch, 0, 3}, /* 254 */ - {"inotify_rm_watch", lx_inotify_rm_watch, 0, 2}, /* 255 */ - {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ - {"openat", lx_openat, 0, 4}, /* 257 */ - {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */ - {"mknodat", lx_mknodat, 0, 4}, /* 259 */ - {"fchownat", lx_fchownat, 0, 5}, /* 260 */ - {"futimesat", lx_futimesat, 0, 3}, /* 261 */ - {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */ - {"unlinkat", lx_unlinkat, 0, 3}, /* 263 */ - {"renameat", lx_renameat, 0, 4}, /* 264 */ - {"linkat", lx_linkat, 0, 5}, /* 265 */ - {"symlinkat", lx_symlinkat, 0, 3}, /* 266 */ - {"readlinkat", lx_readlinkat, 0, 4}, /* 267 */ - {"fchmodat", lx_fchmodat, 0, 4}, /* 268 */ - {"faccessat", lx_faccessat, 0, 4}, /* 269 */ - {"pselect6", lx_pselect6, 0, 6}, /* 270 */ - {"ppoll", lx_ppoll, 0, 5}, /* 271 */ - {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */ - {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 273 */ - {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 274 */ - {"splice", NULL, NOSYS_NULL, 0}, /* 275 */ - {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ - {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 277 */ - {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */ - {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */ - {"utimensat", lx_utimensat, 0, 4}, /* 280 */ - {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */ - {"signalfd", NULL, NOSYS_NULL, 0}, /* 282 */ - {"timerfd_create", lx_timerfd_create, 0, 2}, /* 283 */ - {"eventfd", lx_eventfd, 0, 1}, /* 284 */ - {"fallocate", NULL, NOSYS_NULL, 0}, /* 285 */ - {"timerfd_settime", lx_timerfd_settime, 0, 4}, /* 286 */ - {"timerfd_gettime", lx_timerfd_gettime, 0, 2}, /* 287 */ - {"accept4", lx_accept4, 0, 4}, /* 288 */ - {"signalfd4", NULL, NOSYS_NULL, 0}, /* 289 */ - {"eventfd2", lx_eventfd2, 0, 2}, /* 290 */ - {"epoll_create1", 
lx_epoll_create1, 0, 1}, /* 291 */ - {"dup3", lx_dup3, 0, 3}, /* 292 */ - {"pipe2", lx_pipe2, 0, 2}, /* 293 */ - {"inotify_init1", lx_inotify_init1, 0, 1}, /* 294 */ - {"preadv", lx_preadv, 0, 4}, /* 295 */ - {"pwritev", lx_pwritev, 0, 4}, /* 296 */ - {"rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 0, 4}, /* 297 */ - {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */ - {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */ - {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */ - {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */ - {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */ - {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */ - {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */ - {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */ - {"syncfs", NULL, NOSYS_NULL, 0}, /* 306 */ - {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */ - {"setns", NULL, NOSYS_NULL, 0}, /* 309 */ - {"getcpu", lx_getcpu, 0, 3}, /* 309 */ - {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */ - {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */ - {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */ - {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */ - {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */ - {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */ - {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */ +static lx_syscall_handler_t lx_handlers[] = { + NULL, /* 0: read */ + NULL, /* 1: write */ + lx_open, + lx_close, + lx_stat64, + lx_fstat64, + lx_lstat64, + lx_poll, + lx_lseek, + lx_mmap, + lx_mprotect, + lx_munmap, + NULL, /* 12: brk */ + lx_rt_sigaction, + lx_rt_sigprocmask, + lx_rt_sigreturn, + NULL, /* 16: ioctl */ + lx_pread, + lx_pwrite, + lx_readv, + lx_writev, + lx_access, + NULL, /* 22: pipe */ + lx_select, + NULL, /* 24: sched_yield */ + lx_remap, + lx_msync, + lx_mincore, + lx_madvise, + lx_shmget, + lx_shmat, + lx_shmctl, + lx_dup, + lx_dup2, + lx_pause, + lx_nanosleep, + lx_getitimer, + lx_alarm, + lx_setitimer, + NULL, /* 39: getpid */ + lx_sendfile64, + lx_socket, + lx_connect, + lx_accept, + lx_sendto, + 
lx_recvfrom, + lx_sendmsg, + lx_recvmsg, + lx_shutdown, + lx_bind, + lx_listen, + lx_getsockname, + lx_getpeername, + lx_socketpair, + lx_setsockopt, + lx_getsockopt, + lx_clone, + lx_fork, + lx_vfork, + lx_execve, + lx_exit, + NULL, /* 61: wait4 */ + NULL, /* 62: kill */ + lx_uname, + lx_semget, + lx_semop, + lx_semctl, + lx_shmdt, + lx_msgget, + lx_msgsnd, + lx_msgrcv, + lx_msgctl, + lx_fcntl64, + lx_flock, + lx_fsync, + lx_fdatasync, + lx_truncate, + lx_ftruncate, + lx_getdents, + lx_getcwd, + lx_chdir, + lx_fchdir, + lx_rename, + lx_mkdir, + lx_rmdir, + lx_creat, + lx_link, + lx_unlink, + lx_symlink, + lx_readlink, + lx_chmod, + lx_fchmod, + lx_chown, + lx_fchown, + lx_lchown, + lx_umask, + lx_gettimeofday, + lx_getrlimit, + lx_getrusage, + NULL, /* 99: sysinfo */ + lx_times, + lx_ptrace, + lx_getuid, + lx_syslog, + lx_getgid, + lx_setuid, + lx_setgid, + lx_geteuid, + lx_getegid, + lx_setpgid, + NULL, /* 110: getppid */ + lx_getpgrp, + lx_setsid, + lx_setreuid, + lx_setregid, + lx_getgroups, + lx_setgroups, + NULL, /* 117: setresuid */ + lx_getresuid, + NULL, /* 119: setresgid */ + lx_getresgid, + lx_getpgid, + lx_setfsuid, + lx_setfsgid, + lx_getsid, + lx_capget, + lx_capset, + lx_rt_sigpending, + lx_rt_sigtimedwait, + lx_rt_sigqueueinfo, + lx_rt_sigsuspend, + lx_sigaltstack, + lx_utime, + lx_mknod, + NULL, /* 134: uselib */ + lx_personality, + NULL, /* 136: ustat */ + lx_statfs, + lx_fstatfs, + lx_sysfs, + lx_getpriority, + lx_setpriority, + lx_sched_setparam, + lx_sched_getparam, + lx_sched_setscheduler, + lx_sched_getscheduler, + lx_sched_get_priority_max, + lx_sched_get_priority_min, + lx_sched_rr_get_interval, + lx_mlock, + lx_munlock, + lx_mlockall, + lx_munlockall, + lx_vhangup, + NULL, /* 154: modify_ldt */ + NULL, /* 155: pivot_root */ + lx_sysctl, + lx_prctl, + NULL, /* 158: arch_prctl */ + lx_adjtimex, + lx_setrlimit, + lx_chroot, + lx_sync, + NULL, /* 163: acct */ + lx_settimeofday, + lx_mount, + lx_umount2, + NULL, /* 167: swapon */ + NULL, /* 
168: swapoff */ + lx_reboot, + lx_sethostname, + lx_setdomainname, + NULL, /* 172: iopl */ + NULL, /* 173: ioperm */ + NULL, /* 174: create_module */ + NULL, /* 175: init_module */ + NULL, /* 176: delete_module */ + NULL, /* 177: get_kernel_syms */ + lx_query_module, + NULL, /* 179: quotactl */ + NULL, /* 180: nfsservctl */ + NULL, /* 181: getpmsg */ + NULL, /* 182: putpmsg */ + NULL, /* 183: afs_syscall */ + NULL, /* 184: tux */ + NULL, /* 185: security */ + NULL, /* 186: gettid */ + NULL, /* 187: readahead */ + NULL, /* 188: setxattr */ + NULL, /* 189: lsetxattr */ + NULL, /* 190: fsetxattr */ + NULL, /* 191: getxattr */ + NULL, /* 192: lgetxattr */ + NULL, /* 193: fgetxattr */ + NULL, /* 194: listxattr */ + NULL, /* 195: llistxattr */ + NULL, /* 196: flistxattr */ + NULL, /* 197: removexattr */ + NULL, /* 198: lremovexattr */ + NULL, /* 199: fremovexattr */ + NULL, /* 200: tkill */ + lx_time, + NULL, /* 202: futex */ + lx_sched_setaffinity, + lx_sched_getaffinity, + NULL, /* 205: set_thread_area */ + NULL, /* 206: io_setup */ + NULL, /* 207: io_destroy */ + NULL, /* 208: io_getevents */ + NULL, /* 209: io_submit */ + NULL, /* 210: io_cancel */ + NULL, /* 211: get_thread_area */ + NULL, /* 212: lookup_dcookie */ + lx_epoll_create, + NULL, /* 214: epoll_ctl_old */ + NULL, /* 215: epoll_wait_old */ + NULL, /* 216: remap_file_pages */ + lx_getdents64, + NULL, /* 218: set_tid_address */ + NULL, /* 219: restart_syscall */ + lx_semtimedop, + lx_fadvise64_64, + lx_timer_create, + lx_timer_settime, + lx_timer_gettime, + lx_timer_getoverrun, + lx_timer_delete, + lx_clock_settime, + lx_clock_gettime, + lx_clock_getres, + lx_clock_nanosleep, + lx_group_exit, + lx_epoll_wait, + lx_epoll_ctl, + NULL, /* 234: tgkill */ + lx_utimes, + NULL, /* 236: vserver */ + NULL, /* 237: mbind */ + NULL, /* 238: set_mempolicy */ + NULL, /* 239: get_mempolicy */ + NULL, /* 240: mq_open */ + NULL, /* 241: mq_unlink */ + NULL, /* 242: mq_timedsend */ + NULL, /* 243: mq_timedreceive */ + NULL, 
/* 244: mq_notify */ + NULL, /* 245: mq_getsetattr */ + NULL, /* 246: kexec_load */ + NULL, /* 247: waitid */ + NULL, /* 248: add_key */ + NULL, /* 249: request_key */ + NULL, /* 250: keyctl */ + NULL, /* 251: ioprio_set */ + NULL, /* 252: ioprio_get */ + lx_inotify_init, + lx_inotify_add_watch, + lx_inotify_rm_watch, + NULL, /* 256: migrate_pages */ + lx_openat, + lx_mkdirat, + lx_mknodat, + lx_fchownat, + lx_futimesat, + lx_fstatat64, + lx_unlinkat, + lx_renameat, + lx_linkat, + lx_symlinkat, + lx_readlinkat, + lx_fchmodat, + lx_faccessat, + lx_pselect6, + lx_ppoll, + NULL, /* 272: unshare */ + NULL, /* 273: set_robust_list */ + NULL, /* 274: get_robust_list */ + NULL, /* 275: splice */ + NULL, /* 276: tee */ + NULL, /* 277: sync_file_range */ + NULL, /* 278: vmsplice */ + NULL, /* 279: move_pages */ + lx_utimensat, + lx_epoll_pwait, + NULL, /* 282: signalfd */ + lx_timerfd_create, + lx_eventfd, + NULL, /* 285: fallocate */ + lx_timerfd_settime, + lx_timerfd_gettime, + lx_accept4, + NULL, /* 289: signalfd4 */ + lx_eventfd2, + lx_epoll_create1, + lx_dup3, + NULL, /* 293: pipe2 */ + lx_inotify_init1, + NULL, /* 295: preadv */ + NULL, /* 296: pwritev */ + lx_rt_tgsigqueueinfo, + NULL, /* 298: perf_event_open */ + NULL, /* 299: recvmmsg */ + NULL, /* 300: fanotify_init */ + NULL, /* 301: fanotify_mark */ + lx_prlimit64, + NULL, /* 303: name_to_handle_at */ + NULL, /* 304: open_by_handle_at */ + NULL, /* 305: clock_adjtime */ + NULL, /* 306: syncfs */ + NULL, /* 307: sendmmsg */ + NULL, /* 308: setns */ + lx_getcpu, + NULL, /* 310: process_vm_readv */ + NULL, /* 311: process_vm_writev */ + NULL, /* 312: kcmp */ + NULL, /* 313: finit_module */ + NULL, /* 314: sched_setattr */ + NULL, /* 315: sched_getattr */ + NULL, /* 316: renameat2 */ + NULL, /* 317: seccomp */ + NULL, /* 318: getrandom */ + NULL, /* 319: memfd_create */ + NULL, /* 320: kexec_file_load */ + NULL, /* 321: bpf */ + NULL, /* 322: execveat */ /* XXX TBD gap then x32 syscalls from 512 - 544 */ }; @@
-1436,361 +1289,365 @@ static struct lx_sysent sysents[] = { #else /* The following is the 32-bit syscall table */ -static struct lx_sysent sysents[] = { - {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ - {"exit", lx_exit, 0, 1}, /* 1 */ - {"fork", lx_fork, 0, 0}, /* 2 */ - {"read", LX_IKE(read), LX_SYS_IKE, 3}, /* 3 */ - {"write", lx_write, 0, 3}, /* 4 */ - {"open", lx_open, 0, 3}, /* 5 */ - {"close", lx_close, 0, 1}, /* 6 */ - {"waitpid", lx_waitpid, 0, 3}, /* 7 */ - {"creat", lx_creat, 0, 2}, /* 8 */ - {"link", lx_link, 0, 2}, /* 9 */ - {"unlink", lx_unlink, 0, 1}, /* 10 */ - {"execve", lx_execve, 0, 3}, /* 11 */ - {"chdir", lx_chdir, 0, 1}, /* 12 */ - {"time", lx_time, 0, 1}, /* 13 */ - {"mknod", lx_mknod, 0, 3}, /* 14 */ - {"chmod", lx_chmod, 0, 2}, /* 15 */ - {"lchown16", lx_lchown16, 0, 3}, /* 16 */ - {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ - {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ - {"lseek", lx_lseek, 0, 3}, /* 19 */ - {"getpid", lx_getpid, 0, 0}, /* 20 */ - {"mount", lx_mount, 0, 5}, /* 21 */ - {"umount", lx_umount, 0, 1}, /* 22 */ - {"setuid16", lx_setuid16, 0, 1}, /* 23 */ - {"getuid16", lx_getuid16, 0, 0}, /* 24 */ - {"stime", lx_stime, 0, 1}, /* 25 */ - {"ptrace", lx_ptrace, 0, 4}, /* 26 */ - {"alarm", lx_alarm, 0, 1}, /* 27 */ - {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ - {"pause", lx_pause, 0, 0}, /* 29 */ - {"utime", lx_utime, 0, 2}, /* 30 */ - {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ - {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ - {"access", lx_access, 0, 2}, /* 33 */ - {"nice", lx_nice, 0, 1}, /* 34 */ - {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */ - {"sync", lx_sync, 0, 0}, /* 36 */ - {"kill", LX_IKE(kill), LX_SYS_IKE, 2}, /* 37 */ - {"rename", lx_rename, 0, 2}, /* 38 */ - {"mkdir", lx_mkdir, 0, 2}, /* 39 */ - {"rmdir", lx_rmdir, 0, 1}, /* 40 */ - {"dup", lx_dup, 0, 1}, /* 41 */ - {"pipe", LX_IKE(pipe), LX_SYS_IKE, 1}, /* 42 */ - {"times", lx_times, 0, 1}, /* 43 */ - {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ - {"brk", LX_IKE(brk), 
LX_SYS_IKE, 1}, /* 45 */ - {"setgid16", lx_setgid16, 0, 1}, /* 46 */ - {"getgid16", lx_getgid16, 0, 0}, /* 47 */ - {"signal", lx_signal, 0, 2}, /* 48 */ - {"geteuid16", lx_geteuid16, 0, 0}, /* 49 */ - {"getegid16", lx_getegid16, 0, 0}, /* 50 */ - {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */ - {"umount2", lx_umount2, 0, 2}, /* 52 */ - {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ - {"ioctl", LX_IKE(ioctl), LX_SYS_IKE, 3}, /* 54 */ - {"fcntl", lx_fcntl, 0, 3}, /* 55 */ - {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ - {"setpgid", lx_setpgid, 0, 2}, /* 57 */ - {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ - {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ - {"umask", lx_umask, 0, 1}, /* 60 */ - {"chroot", lx_chroot, 0, 1}, /* 61 */ - {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ - {"dup2", lx_dup2, 0, 2}, /* 63 */ - {"getppid", LX_IKE(getppid), LX_SYS_IKE, 0}, /* 64 */ - {"getpgrp", lx_getpgrp, 0, 0}, /* 65 */ - {"setsid", lx_setsid, 0, 0}, /* 66 */ - {"sigaction", lx_sigaction, 0, 3}, /* 67 */ - {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ - {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ - {"setreuid16", lx_setreuid16, 0, 2}, /* 70 */ - {"setregid16", lx_setregid16, 0, 2}, /* 71 */ - {"sigsuspend", lx_sigsuspend, 0, 1}, /* 72 */ - {"sigpending", lx_sigpending, 0, 1}, /* 73 */ - {"sethostname", lx_sethostname, 0, 2}, /* 74 */ - {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */ - {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */ - {"getrusage", lx_getrusage, 0, 2}, /* 77 */ - {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */ - {"settimeofday", lx_settimeofday, 0, 2}, /* 79 */ - {"getgroups16", lx_getgroups16, 0, 2}, /* 80 */ - {"setgroups16", lx_setgroups16, 0, 2}, /* 81 */ - {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */ - {"symlink", lx_symlink, 0, 2}, /* 83 */ - {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ - {"readlink", lx_readlink, 0, 3}, /* 85 */ - {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ - {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */ - {"reboot", lx_reboot, 0, 4}, /* 88 
*/ - {"readdir", lx_readdir, 0, 3}, /* 89 */ - {"mmap", lx_mmap, 0, 6}, /* 90 */ - {"munmap", lx_munmap, 0, 2}, /* 91 */ - {"truncate", lx_truncate, 0, 2}, /* 92 */ - {"ftruncate", lx_ftruncate, 0, 2}, /* 93 */ - {"fchmod", lx_fchmod, 0, 2}, /* 94 */ - {"fchown16", lx_fchown16, 0, 3}, /* 95 */ - {"getpriority", lx_getpriority, 0, 2}, /* 96 */ - {"setpriority", lx_setpriority, 0, 3}, /* 97 */ - {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ - {"statfs", lx_statfs, 0, 2}, /* 99 */ - {"fstatfs", lx_fstatfs, 0, 2}, /* 100 */ - {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ - {"socketcall", lx_socketcall, 0, 2}, /* 102 */ - {"syslog", lx_syslog, 0, 3}, /* 103 */ - {"setitimer", lx_setitimer, 0, 3}, /* 104 */ - {"getitimer", lx_getitimer, 0, 2}, /* 105 */ - {"stat", lx_stat, 0, 2}, /* 106 */ - {"lstat", lx_lstat, 0, 2}, /* 107 */ - {"fstat", lx_fstat, 0, 2}, /* 108 */ - {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ - {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ - {"vhangup", lx_vhangup, 0, 0}, /* 111 */ - {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ - {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ - {"wait4", lx_wait4, 0, 4}, /* 114 */ - {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */ - {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ - {"ipc", lx_ipc, 0, 5}, /* 117 */ - {"fsync", lx_fsync, 0, 1}, /* 118 */ - {"sigreturn", lx_sigreturn, 0, 1}, /* 119 */ - {"clone", lx_clone, 0, 5}, /* 120 */ - {"setdomainname", lx_setdomainname, 0, 2}, /* 121 */ - {"uname", lx_uname, 0, 1}, /* 122 */ - {"modify_ldt", LX_IKE(modify_ldt), LX_SYS_IKE, 3}, /* 123 */ - {"adjtimex", lx_adjtimex, 0, 1}, /* 124 */ - {"mprotect", lx_mprotect, 0, 3}, /* 125 */ - {"sigprocmask", lx_sigprocmask, 0, 3}, /* 126 */ - {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ - {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */ - {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */ - {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ - {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ - {"getpgid", lx_getpgid, 0, 1}, 
/* 132 */ - {"fchdir", lx_fchdir, 0, 1}, /* 133 */ - {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */ - {"sysfs", lx_sysfs, 0, 3}, /* 135 */ - {"personality", lx_personality, 0, 1}, /* 136 */ - {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ - {"setfsuid16", lx_setfsuid16, 0, 1}, /* 138 */ - {"setfsgid16", lx_setfsgid16, 0, 1}, /* 139 */ - {"llseek", lx_llseek, 0, 5}, /* 140 */ - {"getdents", lx_getdents, 0, 3}, /* 141 */ - {"select", lx_select, 0, 5}, /* 142 */ - {"flock", lx_flock, 0, 2}, /* 143 */ - {"msync", lx_msync, 0, 3}, /* 144 */ - {"readv", lx_readv, 0, 3}, /* 145 */ - {"writev", lx_writev, 0, 3}, /* 146 */ - {"getsid", lx_getsid, 0, 1}, /* 147 */ - {"fdatasync", lx_fdatasync, 0, 1}, /* 148 */ - {"sysctl", lx_sysctl, 0, 1}, /* 149 */ - {"mlock", lx_mlock, 0, 2}, /* 150 */ - {"munlock", lx_munlock, 0, 2}, /* 151 */ - {"mlockall", lx_mlockall, 0, 1}, /* 152 */ - {"munlockall", lx_munlockall, 0, 0}, /* 153 */ - {"sched_setparam", lx_sched_setparam, 0, 2}, /* 154 */ - {"sched_getparam", lx_sched_getparam, 0, 2}, /* 155 */ - {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 156 */ - {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 157 */ - {"sched_yield", lx_yield, 0, 0}, /* 158 */ - {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 159 */ - {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 160 */ - {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 161 */ - {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */ - {"mremap", lx_remap, 0, 5}, /* 163 */ - {"setresuid16", LX_IKE(setresuid16), LX_SYS_IKE, 3}, /* 164 */ - {"getresuid16", lx_getresuid16, 0, 3}, /* 165 */ - {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ - {"query_module", lx_query_module, NOSYS_KERNEL, 5}, /* 167 */ - {"poll", lx_poll, 0, 3}, /* 168 */ - {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ - {"setresgid16", LX_IKE(setresgid16), LX_SYS_IKE, 3}, /* 170 */ - {"getresgid16", lx_getresgid16, 0, 3}, /* 171 */ - {"prctl", lx_prctl, 0, 5}, /* 172 */ - 
{"rt_sigreturn", lx_rt_sigreturn, 0, 0}, /* 173 */ - {"rt_sigaction", lx_rt_sigaction, 0, 4}, /* 174 */ - {"rt_sigprocmask", lx_rt_sigprocmask, 0, 4}, /* 175 */ - {"rt_sigpending", lx_rt_sigpending, 0, 2}, /* 176 */ - {"rt_sigtimedwait", lx_rt_sigtimedwait, 0, 4}, /* 177 */ - {"rt_sigqueueinfo", lx_rt_sigqueueinfo, 0, 3}, /* 178 */ - {"rt_sigsuspend", lx_rt_sigsuspend, 0, 2}, /* 179 */ - {"pread64", lx_pread64, 0, 5}, /* 180 */ - {"pwrite64", lx_pwrite64, 0, 5}, /* 181 */ - {"chown16", lx_chown16, 0, 3}, /* 182 */ - {"getcwd", lx_getcwd, 0, 2}, /* 183 */ - {"capget", lx_capget, 0, 2}, /* 184 */ - {"capset", lx_capset, 0, 2}, /* 185 */ - {"sigaltstack", lx_sigaltstack, 0, 2}, /* 186 */ - {"sendfile", lx_sendfile, 0, 4}, /* 187 */ - {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ - {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ - {"vfork", lx_vfork, 0, 0}, /* 190 */ - {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */ - {"mmap2", lx_mmap2, LX_SYS_EBPARG6, 6}, /* 192 */ - {"truncate64", lx_truncate64, 0, 3}, /* 193 */ - {"ftruncate64", lx_ftruncate64, 0, 3}, /* 194 */ - {"stat64", lx_stat64, 0, 2}, /* 195 */ - {"lstat64", lx_lstat64, 0, 2}, /* 196 */ - {"fstat64", lx_fstat64, 0, 2}, /* 197 */ - {"lchown", lx_lchown, 0, 3}, /* 198 */ - {"getuid", lx_getuid, 0, 0}, /* 199 */ - {"getgid", lx_getgid, 0, 0}, /* 200 */ - {"geteuid", lx_geteuid, 0, 0}, /* 201 */ - {"getegid", lx_getegid, 0, 0}, /* 202 */ - {"setreuid", lx_setreuid, 0, 0}, /* 203 */ - {"setregid", lx_setregid, 0, 0}, /* 204 */ - {"getgroups", lx_getgroups, 0, 2}, /* 205 */ - {"setgroups", lx_setgroups, 0, 2}, /* 206 */ - {"fchown", lx_fchown, 0, 3}, /* 207 */ - {"setresuid", LX_IKE(setresuid), LX_SYS_IKE, 3}, /* 208 */ - {"getresuid", lx_getresuid, 0, 3}, /* 209 */ - {"setresgid", LX_IKE(setresgid), LX_SYS_IKE, 3}, /* 210 */ - {"getresgid", lx_getresgid, 0, 3}, /* 211 */ - {"chown", lx_chown, 0, 3}, /* 212 */ - {"setuid", lx_setuid, 0, 1}, /* 213 */ - {"setgid", lx_setgid, 0, 1}, /* 214 */ - {"setfsuid", 
lx_setfsuid, 0, 1}, /* 215 */ - {"setfsgid", lx_setfsgid, 0, 1}, /* 216 */ - {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ - {"mincore", lx_mincore, 0, 3}, /* 218 */ - {"madvise", lx_madvise, 0, 3}, /* 219 */ - {"getdents64", lx_getdents64, 0, 3}, /* 220 */ - {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */ - {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ - {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ - {"gettid", LX_IKE(gettid), LX_SYS_IKE, 0}, /* 224 */ - {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ - {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 226 */ - {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 227 */ - {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 228 */ - {"getxattr", lx_xattr4, 0, 4}, /* 229 */ - {"lgetxattr", lx_xattr4, 0, 4}, /* 230 */ - {"fgetxattr", lx_xattr4, 0, 4}, /* 231 */ - {"listxattr", lx_xattr3, 0, 3}, /* 232 */ - {"llistxattr", lx_xattr3, 0, 3}, /* 233 */ - {"flistxattr", lx_xattr3, 0, 3}, /* 234 */ - {"removexattr", lx_xattr2, 0, 2}, /* 235 */ - {"lremovexattr", lx_xattr2, 0, 2}, /* 236 */ - {"fremovexattr", lx_xattr2, 0, 2}, /* 237 */ - {"tkill", LX_IKE(tkill), LX_SYS_IKE, 2}, /* 238 */ - {"sendfile64", lx_sendfile64, 0, 4}, /* 239 */ - {"futex", LX_IKE(futex), LX_SYS_IKE | LX_SYS_EBPARG6, 6}, /* 240 */ - {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 241 */ - {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 242 */ - {"set_thread_area", LX_IKE(set_thread_area), LX_SYS_IKE, 1}, /* 243 */ - {"get_thread_area", LX_IKE(get_thread_area), LX_SYS_IKE, 1}, /* 244 */ - {"io_setup", lx_io_setup, 0, 2}, /* 245 */ - {"io_destroy", lx_io_destroy, 0, 1}, /* 246 */ - {"io_getevents", lx_io_getevents, 0, 5}, /* 247 */ - {"io_submit", lx_io_submit, 0, 3}, /* 248 */ - {"io_cancel", lx_io_cancel, 0, 3}, /* 249 */ - {"fadvise64", lx_fadvise64, 0, 4}, /* 250 */ - {"nosys", NULL, 0, 0}, /* 251 */ - {"group_exit", lx_group_exit, 0, 1}, /* 252 */ - {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ - {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */ - 
{"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */ - {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */ - {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ - {"set_tid_address", LX_IKE(set_tid_address), LX_SYS_IKE, 1}, /* 258 */ - {"timer_create", lx_timer_create, 0, 3}, /* 259 */ - {"timer_settime", lx_timer_settime, 0, 4}, /* 260 */ - {"timer_gettime", lx_timer_gettime, 0, 2}, /* 261 */ - {"timer_getoverrun", lx_timer_getoverrun, 0, 1}, /* 262 */ - {"timer_delete", lx_timer_delete, 0, 1}, /* 263 */ - {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */ - {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */ - {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */ - {"clock_nanosleep", lx_clock_nanosleep, 0, 4}, /* 267 */ - {"statfs64", lx_statfs64, 0, 2}, /* 268 */ - {"fstatfs64", lx_fstatfs64, 0, 2}, /* 269 */ - {"tgkill", LX_IKE(tgkill), LX_SYS_IKE, 3}, /* 270 */ - - /* The following system calls only exist in kernel 2.6 and greater */ - {"utimes", lx_utimes, 0, 2}, /* 271 */ - {"fadvise64_64", lx_fadvise64_64, 0, 4}, /* 272 */ - {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */ - {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ - {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ - {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ - {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ - {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ - {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ - {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ - {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ - {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ - {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ - {"waitid", lx_waitid, 0, 4}, /* 284 */ - {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ - {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ - {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ - {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ - {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 289 */ - {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 290 */ - {"inotify_init", lx_inotify_init, 0, 0}, /* 291 */ - {"inotify_add_watch", 
lx_inotify_add_watch, 0, 3}, /* 292 */ - {"inotify_rm_watch", lx_inotify_rm_watch, 0, 2}, /* 293 */ - {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ - {"openat", lx_openat, 0, 4}, /* 295 */ - {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */ - {"mknodat", lx_mknodat, 0, 4}, /* 297 */ - {"fchownat", lx_fchownat, 0, 5}, /* 298 */ - {"futimesat", lx_futimesat, 0, 3}, /* 299 */ - {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */ - {"unlinkat", lx_unlinkat, 0, 3}, /* 301 */ - {"renameat", lx_renameat, 0, 4}, /* 302 */ - {"linkat", lx_linkat, 0, 5}, /* 303 */ - {"symlinkat", lx_symlinkat, 0, 3}, /* 304 */ - {"readlinkat", lx_readlinkat, 0, 4}, /* 305 */ - {"fchmodat", lx_fchmodat, 0, 4}, /* 306 */ - {"faccessat", lx_faccessat, 0, 4}, /* 307 */ - {"pselect6", lx_pselect6, LX_SYS_EBPARG6, 6}, /* 308 */ - {"ppoll", lx_ppoll, 0, 5}, /* 309 */ - {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */ - {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 311 */ - {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 312 */ - {"splice", NULL, NOSYS_NULL, 0}, /* 313 */ - {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 314 */ - {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ - {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ - {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */ - {"getcpu", lx_getcpu, 0, 3}, /* 318 */ - {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */ - {"utimensat", lx_utimensat, 0, 4}, /* 320 */ - {"signalfd", NULL, NOSYS_NULL, 0}, /* 321 */ - {"timerfd_create", lx_timerfd_create, 0, 2}, /* 322 */ - {"eventfd", lx_eventfd, 0, 1}, /* 323 */ - {"fallocate", NULL, NOSYS_NULL, 0}, /* 324 */ - {"timerfd_settime", lx_timerfd_settime, 0, 4}, /* 325 */ - {"timerfd_gettime", lx_timerfd_gettime, 0, 2}, /* 326 */ - {"signalfd4", NULL, NOSYS_NULL, 0}, /* 327 */ - {"eventfd2", lx_eventfd2, 0, 2}, /* 328 */ - {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */ - {"dup3", lx_dup3, 0, 3}, /* 330 */ - {"pipe2", lx_pipe2, 0, 2}, /* 331 */ - {"inotify_init1", lx_inotify_init1, 0, 1}, /* 332 */ - {"preadv", lx_preadv, 0, 4}, /* 333 */ - 
{"pwritev", lx_pwritev, 0, 4}, /* 334 */ - {"rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 0, 4}, /* 335 */ - {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ - {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 337 */ - {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ - {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ - {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */ - {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ - {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ - {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ - {"syncfs", NULL, NOSYS_NULL, 0}, /* 344 */ - {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */ - {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ - {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ - {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ - {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ - {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ - {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */ - {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */ +static lx_syscall_handler_t lx_handlers[] = { + NULL, /* 0: nosys */ + lx_exit, + lx_fork, + NULL, /* 3: read */ + NULL, /* 4: write */ + lx_open, + lx_close, + NULL, /* 7: waitpid */ + lx_creat, + lx_link, + lx_unlink, + lx_execve, + lx_chdir, + lx_time, + lx_mknod, + lx_chmod, + lx_lchown16, + NULL, /* 17: break */ + NULL, /* 18: stat */ + lx_lseek, + NULL, /* 20: getpid */ + lx_mount, + lx_umount, + lx_setuid16, + lx_getuid16, + lx_stime, + lx_ptrace, + lx_alarm, + NULL, /* 28: fstat */ + lx_pause, + lx_utime, + NULL, /* 31: stty */ + NULL, /* 32: gtty */ + lx_access, + lx_nice, + NULL, /* 35: ftime */ + lx_sync, + NULL, /* 37: kill */ + lx_rename, + lx_mkdir, + lx_rmdir, + lx_dup, + NULL, /* 42: pipe */ + lx_times, + NULL, /* 44: prof */ + NULL, /* 45: brk */ + lx_setgid16, + lx_getgid16, + lx_signal, + lx_geteuid16, + lx_getegid16, + NULL, /* 51: acct */ + lx_umount2, + NULL, /* 53: lock */ + NULL, /* 54: ioctl */ + lx_fcntl, + NULL, /* 56: mpx */ + lx_setpgid, + NULL, /* 58: ulimit */ + NULL, /* 59: olduname */ 
+ lx_umask, + lx_chroot, + NULL, /* 62: ustat */ + lx_dup2, + NULL, /* 64: getppid */ + lx_getpgrp, + lx_setsid, + lx_sigaction, + NULL, /* 68: sgetmask */ + NULL, /* 69: ssetmask */ + lx_setreuid16, + lx_setregid16, + lx_sigsuspend, + lx_sigpending, + lx_sethostname, + lx_setrlimit, + lx_oldgetrlimit, + lx_getrusage, + lx_gettimeofday, + lx_settimeofday, + lx_getgroups16, + lx_setgroups16, + NULL, /* 82: select */ + lx_symlink, + NULL, /* 84: oldlstat */ + lx_readlink, + NULL, /* 86: uselib */ + NULL, /* 87: swapon */ + lx_reboot, + lx_readdir, + lx_mmap, + lx_munmap, + lx_truncate, + lx_ftruncate, + lx_fchmod, + lx_fchown16, + lx_getpriority, + lx_setpriority, + NULL, /* 98: profil */ + lx_statfs, + lx_fstatfs, + NULL, /* 101: ioperm */ + lx_socketcall, + lx_syslog, + lx_setitimer, + lx_getitimer, + lx_stat, + lx_lstat, + lx_fstat, + NULL, /* 109: uname */ + NULL, /* 110: oldiopl */ + lx_vhangup, + NULL, /* 112: idle */ + NULL, /* 113: vm86old */ + NULL, /* 114: wait4 */ + NULL, /* 115: swapoff */ + NULL, /* 116: sysinfo */ + lx_ipc, + lx_fsync, + lx_sigreturn, + lx_clone, + lx_setdomainname, + lx_uname, + NULL, /* 123: modify_ldt */ + lx_adjtimex, + lx_mprotect, + lx_sigprocmask, + NULL, /* 127: create_module */ + NULL, /* 128: init_module */ + NULL, /* 129: delete_module */ + NULL, /* 130: get_kernel_syms */ + NULL, /* 131: quotactl */ + lx_getpgid, + lx_fchdir, + NULL, /* 134: bdflush */ + lx_sysfs, + lx_personality, + NULL, /* 137: afs_syscall */ + lx_setfsuid16, + lx_setfsgid16, + lx_llseek, + lx_getdents, + lx_select, + lx_flock, + lx_msync, + lx_readv, + lx_writev, + lx_getsid, + lx_fdatasync, + lx_sysctl, + lx_mlock, + lx_munlock, + lx_mlockall, + lx_munlockall, + lx_sched_setparam, + lx_sched_getparam, + lx_sched_setscheduler, + lx_sched_getscheduler, + NULL, /* 158: sched_yield */ + lx_sched_get_priority_max, + lx_sched_get_priority_min, + lx_sched_rr_get_interval, + lx_nanosleep, + lx_remap, + NULL, /* 164: setresuid16 */ + lx_getresuid16, + NULL, /* 
166: vm86 */ + lx_query_module, + lx_poll, + NULL, /* 169: nfsservctl */ + NULL, /* 170: setresgid16 */ + lx_getresgid16, + lx_prctl, + lx_rt_sigreturn, + lx_rt_sigaction, + lx_rt_sigprocmask, + lx_rt_sigpending, + lx_rt_sigtimedwait, + lx_rt_sigqueueinfo, + lx_rt_sigsuspend, + lx_pread64, + lx_pwrite64, + lx_chown16, + lx_getcwd, + lx_capget, + lx_capset, + lx_sigaltstack, + lx_sendfile, + NULL, /* 188: getpmsg */ + NULL, /* 189: putpmsg */ + lx_vfork, + lx_getrlimit, + lx_mmap2, + lx_truncate64, + lx_ftruncate64, + lx_stat64, + lx_lstat64, + lx_fstat64, + lx_lchown, + lx_getuid, + lx_getgid, + lx_geteuid, + lx_getegid, + lx_setreuid, + lx_setregid, + lx_getgroups, + lx_setgroups, + lx_fchown, + NULL, /* 208: setresuid */ + lx_getresuid, + NULL, /* 210: setresgid */ + lx_getresgid, + lx_chown, + lx_setuid, + lx_setgid, + lx_setfsuid, + lx_setfsgid, + NULL, /* 217: pivot_root */ + lx_mincore, + lx_madvise, + lx_getdents64, + lx_fcntl64, + NULL, /* 222: tux */ + NULL, /* 223: security */ + NULL, /* 224: gettid */ + NULL, /* 225: readahead */ + NULL, /* 226: setxattr */ + NULL, /* 227: lsetxattr */ + NULL, /* 228: fsetxattr */ + NULL, /* 229: getxattr */ + NULL, /* 230: lgetxattr */ + NULL, /* 231: fgetxattr */ + NULL, /* 232: listxattr */ + NULL, /* 233: llistxattr */ + NULL, /* 234: flistxattr */ + NULL, /* 235: removexattr */ + NULL, /* 236: lremovexattr */ + NULL, /* 237: fremovexattr */ + NULL, /* 238: tkill */ + lx_sendfile64, + NULL, /* 240: futex */ + lx_sched_setaffinity, + lx_sched_getaffinity, + NULL, /* 243: set_thread_area */ + NULL, /* 244: get_thread_area */ + NULL, /* 245: io_setup */ + NULL, /* 246: io_destroy */ + NULL, /* 247: io_getevents */ + NULL, /* 248: io_submit */ + NULL, /* 249: io_cancel */ + lx_fadvise64, + NULL, /* 251: nosys */ + lx_group_exit, + NULL, /* 253: lookup_dcookie */ + lx_epoll_create, + lx_epoll_ctl, + lx_epoll_wait, + NULL, /* 257: remap_file_pages */ + NULL, /* 258: set_tid_address */ + lx_timer_create, + lx_timer_settime, 
+ lx_timer_gettime, + lx_timer_getoverrun, + lx_timer_delete, + lx_clock_settime, + lx_clock_gettime, + lx_clock_getres, + lx_clock_nanosleep, + lx_statfs64, + lx_fstatfs64, + NULL, /* 270: tgkill */ + lx_utimes, + lx_fadvise64_64, + NULL, /* 273: vserver */ + NULL, /* 274: mbind */ + NULL, /* 275: get_mempolicy */ + NULL, /* 276: set_mempolicy */ + NULL, /* 277: mq_open */ + NULL, /* 278: mq_unlink */ + NULL, /* 279: mq_timedsend */ + NULL, /* 280: mq_timedreceive */ + NULL, /* 281: mq_notify */ + NULL, /* 282: mq_getsetattr */ + NULL, /* 283: kexec_load */ + NULL, /* 284: waitid */ + NULL, /* 285: sys_setaltroot */ + NULL, /* 286: add_key */ + NULL, /* 287: request_key */ + NULL, /* 288: keyctl */ + NULL, /* 289: ioprio_set */ + NULL, /* 290: ioprio_get */ + lx_inotify_init, + lx_inotify_add_watch, + lx_inotify_rm_watch, + NULL, /* 294: migrate_pages */ + lx_openat, + lx_mkdirat, + lx_mknodat, + lx_fchownat, + lx_futimesat, + lx_fstatat64, + lx_unlinkat, + lx_renameat, + lx_linkat, + lx_symlinkat, + lx_readlinkat, + lx_fchmodat, + lx_faccessat, + lx_pselect6, + lx_ppoll, + NULL, /* 310: unshare */ + NULL, /* 311: set_robust_list */ + NULL, /* 312: get_robust_list */ + NULL, /* 313: splice */ + NULL, /* 314: sync_file_range */ + NULL, /* 315: tee */ + NULL, /* 316: vmsplice */ + NULL, /* 317: move_pages */ + lx_getcpu, + lx_epoll_pwait, + lx_utimensat, + NULL, /* 321: signalfd */ + lx_timerfd_create, + lx_eventfd, + NULL, /* 324: fallocate */ + lx_timerfd_settime, + lx_timerfd_gettime, + NULL, /* 327: signalfd4 */ + lx_eventfd2, + lx_epoll_create1, + lx_dup3, + NULL, /* 331: pipe2 */ + lx_inotify_init1, + NULL, /* 333: preadv */ + NULL, /* 334: pwritev */ + lx_rt_tgsigqueueinfo, + NULL, /* 336: perf_event_open */ + NULL, /* 337: recvmmsg */ + NULL, /* 338: fanotify_init */ + NULL, /* 339: fanotify_mark */ + lx_prlimit64, + NULL, /* 341: name_to_handle_at */ + NULL, /* 342: open_by_handle_at */ + NULL, /* 343: clock_adjtime */ + NULL, /* 344: syncfs */ + NULL, /* 
345: sendmmsg */ + NULL, /* 346: setns */ + NULL, /* 347: process_vm_readv */ + NULL, /* 348: process_vm_writev */ + NULL, /* 349: kcmp */ + NULL, /* 350: finit_module */ + NULL, /* 351: sched_setattr */ + NULL, /* 352: sched_getattr */ + NULL, /* 353: renameat2 */ + NULL, /* 354: seccomp */ + NULL, /* 355: getrandom */ + NULL, /* 356: memfd_create */ + NULL, /* 357: bpf */ + NULL, /* 358: execveat */ }; #endif diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d b/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d index 2a07c00c7a..14326e8f56 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d +++ b/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d @@ -10,14 +10,26 @@ */ /* - * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ provider lx { probe debug(char *buf); - probe sigdeliver(int sig, void *lx_sigaction, void *lx_sigstack, - void *lx_ucontext); + probe sigdeliver(int sig, void *lx_sigaction, void *lx_sigstack); probe sigreturn(void *lx_ucontext, void *ucontext, uintptr_t sp); + + probe signal__delivery__frame__create(void *lx_sigdeliver_frame); + probe signal__delivery__frame__found(void *lx_sigdeliver_frame); + probe signal__delivery__frame__corrupt(void *lx_sigdeliver_frame); + + probe signal__post__handler(uintptr_t old_sp, uintptr_t new_sp); + + probe signal__altstack__enable(uintptr_t alt_sp); + probe signal__altstack__disable(); + + probe emulate__enter(void *ucp, int syscall_num, uintptr_t *args); + probe emulate__return(void *ucp, int syscall_num, uintptr_t ret, + uintptr_t errn); }; #pragma D attributes Evolving/Evolving/ISA provider lx provider diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c b/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c index 02bfe48e01..08e77572ab 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c +++ b/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c @@ -22,7 +22,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -262,7 +262,6 @@ static cond_t lxt_req_cv = DEFAULTCV; static lxt_req_t *lxt_req_ptr = NULL; static mutex_t lxt_pid_lock = DEFAULTMUTEX; -static pid_t lxt_pid = NULL; /* * Interfaces used to call from lx_brand.so into Linux code. @@ -370,26 +369,26 @@ lx_call(lx_handle_sym_t lx_ch, uintptr_t p1, uintptr_t p2, { typedef uintptr_t (*fp8_t)(uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); - lx_regs_t *rp; + ucontext_t *ucp; uintptr_t ret; fp8_t lx_funcp = (fp8_t)lx_ch; #if defined(_ILP32) long cur_gs; #endif - rp = lx_syscall_regs(); + ucp = lx_syscall_regs(); #if defined(_ILP32) - lx_debug("lx_call: loading Linux gs, rp = 0x%p, gs = 0x%p", - rp, rp->lxr_gs); - lx_swap_gs(rp->lxr_gs, &cur_gs); + lx_debug("lx_call: loading Linux gs, ucp = 0x%p, gs = 0x%p", + ucp, LX_REG(ucp, GS)); + lx_swap_gs(LX_REG(ucp, GS), &cur_gs); #endif lx_debug("lx_call: calling to Linux code at 0x%p", lx_ch); ret = lx_funcp(p1, p2, p3, p4, p5, p6, p7, p8); #if defined(_ILP32) - lx_swap_gs(cur_gs, &rp->lxr_gs); + lx_swap_gs(cur_gs, (long *)&LX_REG(ucp, GS)); #endif lx_debug("lx_call: returned from Linux code at 0x%p (%p)", lx_ch, ret); @@ -725,7 +724,7 @@ lxt_server_syslog(lxt_server_arg_t *request, size_t request_size, * We do this by telling our getpid() system call to return a * different value. */ - lxt_pid = data->lxt_sl_pid; + (void) syscall(SYS_brand, B_SET_THUNK_PID, data->lxt_sl_pid); /* * Ensure the message has the correct program name. @@ -750,7 +749,7 @@ lxt_server_syslog(lxt_server_arg_t *request, size_t request_size, /* Restore pid and program name. 
*/ (void) uucopy(&progname_ptr_old, lxt_handles[LXTH_PROGNAME].lxth_handle, sizeof (char *)); - lxt_pid = NULL; + (void) syscall(SYS_brand, B_SET_THUNK_PID, 0); (void) mutex_unlock(&lxt_pid_lock); @@ -1022,12 +1021,3 @@ lxt_server_init(int argc, char *argv[]) lxt_server_processes = 1; lx_debug("lx_thunk server detected, delaying initalization"); } - -int -lxt_server_pid(int *pid) -{ - if (lxt_server_processes == 0) - return (0); - *pid = lxt_pid; - return (1); -} diff --git a/usr/src/lib/brand/lx/lx_brand/common/mem.c b/usr/src/lib/brand/lx/lx_brand/common/mem.c index 416596ae88..d5a8b14bef 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/mem.c +++ b/usr/src/lib/brand/lx/lx_brand/common/mem.c @@ -21,7 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <errno.h> @@ -112,6 +112,21 @@ mmap_common(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, fd = -1; /* + * We refuse, as a matter of principle, to overcommit memory. + * Unfortunately, several bits of important and popular software expect + * to be able to pre-allocate large amounts of virtual memory but then + * probably never use it. One particularly bad example of this + * practice is golang. + * + * In the interest of running software, unsafe or not, we fudge + * something vaguely similar to overcommit by permanently enabling + * MAP_NORESERVE unless MAP_LOCKED was requested: + */ + if (!(flags & LX_MAP_LOCKED)) { + flags |= LX_MAP_NORESERVE; + } + + /* * This is totally insane. The NOTES section in the linux mmap(2) man * page claims that on some architectures, read protection may * automatically include exec protection. 
It has been observed on a diff --git a/usr/src/lib/brand/lx/lx_brand/common/misc.c b/usr/src/lib/brand/lx/lx_brand/common/misc.c index 5b71b43bf1..7e16fb717e 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/misc.c +++ b/usr/src/lib/brand/lx/lx_brand/common/misc.c @@ -24,6 +24,7 @@ * Copyright 2015 Joyent, Inc. All rights reserved. */ +#include <stdlib.h> #include <assert.h> #include <alloca.h> #include <errno.h> @@ -60,40 +61,6 @@ extern int sethostname(char *, int); -struct lx_sysinfo { - int64_t si_uptime; /* Seconds since boot */ - uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ - uint64_t si_totalram; /* Total memory size */ - uint64_t si_freeram; /* Available memory */ - uint64_t si_sharedram; /* Shared memory */ - uint64_t si_bufferram; /* Buffer memory */ - uint64_t si_totalswap; /* Total swap space */ - uint64_t si_freeswap; /* Avail swap space */ - uint16_t si_procs; /* Process count */ - uint16_t si_pad; /* Padding */ - uint64_t si_totalhigh; /* High memory size */ - uint64_t si_freehigh; /* Avail high memory */ - uint32_t si_mem_unit; /* Unit size of memory fields */ -}; - -struct lx_sysinfo32 { - int32_t si_uptime; /* Seconds since boot */ - uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ - uint32_t si_totalram; /* Total memory size */ - uint32_t si_freeram; /* Available memory */ - uint32_t si_sharedram; /* Shared memory */ - uint32_t si_bufferram; /* Buffer memory */ - uint32_t si_totalswap; /* Total swap space */ - uint32_t si_freeswap; /* Avail swap space */ - uint16_t si_procs; /* Process count */ - uint16_t si_pad; /* Padding */ - uint32_t si_totalhigh; /* High memory size */ - uint32_t si_freehigh; /* Avail high memory */ - uint32_t si_mem_unit; /* Unit size of memory fields */ -}; - -extern long lx_sysinfo(struct lx_sysinfo *sip); - /* ARGUSED */ long lx_rename(uintptr_t p1, uintptr_t p2) @@ -284,7 +251,7 @@ lx_uname(uintptr_t p1) /* * {get,set}groups16() - Handle the conversion between 16-bit Linux gids and - * 
32-bit Solaris gids. + * 32-bit illumos gids. */ long lx_getgroups16(uintptr_t p1, uintptr_t p2) @@ -298,11 +265,15 @@ lx_getgroups16(uintptr_t p1, uintptr_t p2) if (count < 0) return (-EINVAL); - grouplist32 = SAFE_ALLOCA(count * sizeof (gid_t)); - if (grouplist32 == NULL && count > 0) + grouplist32 = malloc(count * sizeof (gid_t)); + if (grouplist32 == NULL && count > 0) { + free(grouplist32); return (-ENOMEM); - if ((ret = getgroups(count, grouplist32)) < 0) + } + if ((ret = getgroups(count, grouplist32)) < 0) { + free(grouplist32); return (-errno); + } /* we must not modify the list if the incoming count was 0 */ if (count > 0) { @@ -310,28 +281,48 @@ lx_getgroups16(uintptr_t p1, uintptr_t p2) grouplist[i] = LX_GID32_TO_GID16(grouplist32[i]); } + free(grouplist32); return (ret); } long lx_setgroups16(uintptr_t p1, uintptr_t p2) { + long rv; int count = (int)p1; - lx_gid16_t *grouplist = (lx_gid16_t *)p2; - gid_t *grouplist32; + lx_gid16_t *grouplist = NULL; + gid_t *grouplist32 = NULL; int i; - grouplist32 = SAFE_ALLOCA(count * sizeof (gid_t)); - if (grouplist32 == NULL) + if ((grouplist = malloc(count * sizeof (lx_gid16_t))) == NULL) { + return (-ENOMEM); + } + if (uucopy((void *)p2, grouplist, count * sizeof (lx_gid16_t)) != 0) { + free(grouplist); + return (-EFAULT); + } + + grouplist32 = malloc(count * sizeof (gid_t)); + if (grouplist32 == NULL) { + free(grouplist); return (-ENOMEM); + } for (i = 0; i < count; i++) grouplist32[i] = LX_GID16_TO_GID32(grouplist[i]); /* order matters here to get the correct errno back */ - if (count > NGROUPS_MAX_DEFAULT) + if (count > NGROUPS_MAX_DEFAULT) { + free(grouplist); + free(grouplist32); return (-EINVAL); + } - return (setgroups(count, grouplist32) ? -errno : 0); + rv = setgroups(count, grouplist32); + + free(grouplist); + free(grouplist32); + + return (rv != 0 ? -errno : 0); } /* @@ -440,10 +431,10 @@ lx_mknod(uintptr_t p1, uintptr_t p2, uintptr_t p3) * * Most programmers aren't even aware you can do this. 
* - * Note you can also do this via Solaris' mknod(2), but + * Note you can also do this via illumos' mknod(2), but * Linux allows anyone who can create a UNIX domain * socket via bind(2) to create one via mknod(2); - * Solaris requires the caller to be privileged. + * illumos requires the caller to be privileged. */ if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) return (-errno); @@ -524,19 +515,6 @@ lx_setdomainname(uintptr_t p1, uintptr_t p2) } long -lx_getpid(void) -{ - int pid; - - /* First call the thunk server hook. */ - if (lxt_server_pid(&pid) != 0) - return (pid); - - pid = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_getpid); - return ((pid == -1) ? -errno : pid); -} - -long lx_execve(uintptr_t p1, uintptr_t p2, uintptr_t p3) { char *filename = (char *)p1; @@ -595,15 +573,17 @@ lx_setgroups(uintptr_t p1, uintptr_t p2) lx_debug("\tlx_setgroups(%d, 0x%p", ng, p2); if (ng > 0) { - if ((glist = (gid_t *)SAFE_ALLOCA(ng * sizeof (gid_t))) == NULL) + if ((glist = (gid_t *)malloc(ng * sizeof (gid_t))) == NULL) return (-ENOMEM); - if (uucopy((void *)p2, glist, ng * sizeof (gid_t)) != 0) + if (uucopy((void *)p2, glist, ng * sizeof (gid_t)) != 0) { + free(glist); return (-errno); + } /* * Linux doesn't check the validity of the group IDs, but - * Solaris does. Change any invalid group IDs to a known, valid + * illumos does. Change any invalid group IDs to a known, valid * value (yuck). */ for (i = 0; i < ng; i++) { @@ -613,12 +593,14 @@ lx_setgroups(uintptr_t p1, uintptr_t p2) } /* order matters here to get the correct errno back */ - if (ng > NGROUPS_MAX_DEFAULT) + if (ng > NGROUPS_MAX_DEFAULT) { + free(glist); return (-EINVAL); + } - r = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_setgroups, - ng, glist); + r = syscall(SYS_brand, B_HELPER_SETGROUPS, ng, glist); + free(glist); return ((r == -1) ? 
-errno : r); } @@ -712,29 +694,6 @@ lx_prctl(int option, uintptr_t arg2, uintptr_t arg3, return (0); } -#if defined(_LP64) -long -lx_arch_prctl(int code, uintptr_t addr) -{ - long rv; - int ret; - lx_tsd_t *lx_tsd; - - rv = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_arch_prctl, code, addr); - - if (code == LX_ARCH_SET_FS && rv == 0) { - /* Track lx fsbase for debugging purposes */ - if ((ret = thr_getspecific(lx_tsd_key, - (void **)&lx_tsd)) != 0) { - lx_err_fatal("arch_prctl: unable to read TSD: %s", - strerror(ret)); - } - lx_tsd->lxtsd_fsbase = addr; - } - return ((rv == 0) ? 0 : -errno); -} -#endif - /* * For syslog(), as there is no kernel and nothing to log, we simply emulate a * kernel cyclic buffer (LOG_BUF_LEN) of 0 bytes, only handling errors for bad @@ -759,45 +718,6 @@ lx_syslog(int type, char *bufp, int len) return (0); } -long -lx_sysinfo32(uintptr_t arg) -{ - struct lx_sysinfo32 *sip = (struct lx_sysinfo32 *)arg; - struct lx_sysinfo32 si; - struct lx_sysinfo sil; - int i; - - if (syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_sysinfo, &sil) != 0) - return (-errno); - - si.si_uptime = sil.si_uptime; - - for (i = 0; i < 3; i++) { - if ((sil.si_loads[i]) > 0x7fffffff) - si.si_loads[i] = 0x7fffffff; - else - si.si_loads[i] = sil.si_loads[i]; - } - - si.si_procs = sil.si_procs; - si.si_totalram = sil.si_totalram; - si.si_freeram = sil.si_freeram; - si.si_totalswap = sil.si_totalswap; - si.si_freeswap = sil.si_freeswap; - si.si_mem_unit = sil.si_mem_unit; - - si.si_bufferram = sil.si_bufferram; - si.si_sharedram = sil.si_sharedram; - - si.si_totalhigh = sil.si_totalhigh; - si.si_freehigh = sil.si_freehigh; - - if (uucopy(&si, sip, sizeof (si)) != 0) - return (-errno); - - return (0); -} - /* * The following are pass-through functions but we need to return the correct * long so that the errno propagates back to the Linux code correctly. 
@@ -1160,23 +1080,6 @@ lx_utimes(const char *path, const struct timeval times[2]) } long -lx_write(int fildes, const void *buf, size_t nbyte) -{ - int r; - - r = write(fildes, buf, nbyte); - return ((r == -1) ? -errno : r); -} - -long -lx_yield(void) -{ - - yield(); - return (0); -} - -long lx_vhangup(void) { if (geteuid() != 0) diff --git a/usr/src/lib/brand/lx/lx_brand/common/poll_select.c b/usr/src/lib/brand/lx/lx_brand/common/poll_select.c index 4fa63e677c..1dce9b278d 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/poll_select.c +++ b/usr/src/lib/brand/lx/lx_brand/common/poll_select.c @@ -70,21 +70,21 @@ lx_select(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, if (nfds > 0) { if (p2 != NULL) { - rfdsp = SAFE_ALLOCA(fd_set_len); + rfdsp = alloca(fd_set_len); if (rfdsp == NULL) return (-ENOMEM); if (uucopy((void *)p2, rfdsp, fd_set_len) != 0) return (-errno); } if (p3 != NULL) { - wfdsp = SAFE_ALLOCA(fd_set_len); + wfdsp = alloca(fd_set_len); if (wfdsp == NULL) return (-ENOMEM); if (uucopy((void *)p3, wfdsp, fd_set_len) != 0) return (-errno); } if (p4 != NULL) { - efdsp = SAFE_ALLOCA(fd_set_len); + efdsp = alloca(fd_set_len); if (efdsp == NULL) return (-ENOMEM); if (uucopy((void *)p4, efdsp, fd_set_len) != 0) @@ -165,7 +165,7 @@ lx_poll(uintptr_t p1, uintptr_t p2, uintptr_t p3) * structures are identical. Copy in the linux poll structure. */ fds_size = sizeof (struct pollfd) * nfds; - lfds = (struct pollfd *)SAFE_ALLOCA(fds_size); + lfds = (struct pollfd *)alloca(fds_size); if (lfds == NULL) return (-ENOMEM); if (uucopy((void *)p1, lfds, fds_size) != 0) @@ -175,7 +175,7 @@ lx_poll(uintptr_t p1, uintptr_t p2, uintptr_t p3) * The poll system call modifies the poll structures passed in * so we'll need to make an extra copy of them. 
*/ - sfds = (struct pollfd *)SAFE_ALLOCA(fds_size); + sfds = (struct pollfd *)alloca(fds_size); if (sfds == NULL) return (-ENOMEM); diff --git a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c index 174dbe8c19..65fe303835 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c +++ b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c @@ -59,13 +59,6 @@ * detail. */ -/* execve syscall numbers for 64-bit vs. 32-bit */ -#if defined(_LP64) -#define LX_SYS_execve 59 -#else -#define LX_SYS_execve 11 -#endif - /* * This corresponds to the user_i387_struct Linux structure. */ @@ -99,61 +92,6 @@ typedef struct lx_user_fpxregs { long lxux_padding[56]; } lx_user_fpxregs_t; -/* - * This corresponds to the user_regs_struct Linux structure. - */ -#if defined(_LP64) -typedef struct lx_user_regs { - long lxur_r15; - long lxur_r14; - long lxur_r13; - long lxur_r12; - long lxur_rbp; - long lxur_rbx; - long lxur_r11; - long lxur_r10; - long lxur_r9; - long lxur_r8; - long lxur_rax; - long lxur_rcx; - long lxur_rdx; - long lxur_rsi; - long lxur_rdi; - long lxur_orig_rax; - long lxur_rip; - long lxur_xcs; - long lxur_rflags; - long lxur_rsp; - long lxur_xss; - long lxur_xfs_base; - long lxur_xgs_base; - long lxur_xds; - long lxur_xes; - long lxur_xfs; - long lxur_xgs; -} lx_user_regs_t; -#else -typedef struct lx_user_regs { - long lxur_ebx; - long lxur_ecx; - long lxur_edx; - long lxur_esi; - long lxur_edi; - long lxur_ebp; - long lxur_eax; - long lxur_xds; - long lxur_xes; - long lxur_xfs; - long lxur_xgs; - long lxur_orig_eax; - long lxur_eip; - long lxur_xcs; - long lxur_eflags; - long lxur_esp; - long lxur_xss; -} lx_user_regs_t; -#endif - typedef struct lx_user { lx_user_regs_t lxu_regs; int lxu_fpvalid; @@ -242,336 +180,6 @@ get_lwpstatus(pid_t pid, lwpid_t lwpid, lwpstatus_t *lsp) return (0); } -static uintptr_t -syscall_regs(int fd, uintptr_t fp, pid_t pid) -{ - uintptr_t addr, done; - struct frame fr; - auxv_t auxv; - int afd; -#if 
defined(_LP64) - Elf64_Phdr phdr; -#elif defined(_ILP32) - Elf32_Phdr phdr; -#endif - - /* - * Try to walk the stack looking for a return address that corresponds - * to the traced process's lx_emulate_done symbol. This relies on the - * fact that the brand library in the traced process is the same as the - * brand library in this process (indeed, this is true of all processes - * in a given branded zone). - */ - - /* - * Find the base address for the brand library in the traced process - * by grabbing the AT_PHDR auxv entry, reading in the program header - * at that location and subtracting off the p_vaddr member. We use - * this to compute the location of lx_emulate done in the traced - * process. - */ - if ((afd = open_procfile(pid, O_RDONLY, "auxv")) < 0) - return (0); - - do { - if (read(afd, &auxv, sizeof (auxv)) != sizeof (auxv)) { - (void) close(afd); - return (0); - } - } while (auxv.a_type != AT_PHDR); - - (void) close(afd); - - if (pread(fd, &phdr, sizeof (phdr), auxv.a_un.a_val) != sizeof (phdr)) { - lx_debug("failed to read brand library's phdr"); - return (0); - } - - addr = auxv.a_un.a_val - phdr.p_vaddr; - done = (uintptr_t)&lx_emulate_done - (uintptr_t)&_START_ + addr; - - fr.fr_savfp = fp; - - do { - addr = fr.fr_savfp; - if (pread(fd, &fr, sizeof (fr), addr) != sizeof (fr)) { - lx_debug("ptrace read failed for stack walk"); - return (0); - } - - if (addr >= fr.fr_savfp) { - lx_debug("ptrace stack not monotonically increasing " - "%p %p (%p)", addr, fr.fr_savfp, done); - return (0); - } - } while (fr.fr_savpc != done); - - /* - * The first argument to lx_emulate is known to be an lx_regs_t - * structure and the ABI specifies that it will be placed on the stack - * immediately preceeding the return address. - */ - addr += sizeof (fr); - - /* - * On i386 we need to perform an additional read as we used the stack - * to pass the argument to lx_emulate. On amd64 we passed the argument - * in %rdi so addr already contains the correct address. 
- */ -#if defined(_ILP32) - if (pread(fd, &addr, sizeof (addr), addr) != sizeof (addr)) { - lx_debug("ptrace stack failed to read register set address"); - return (0); - } -#endif - - return (addr); -} - -static int -getregs(pid_t pid, lwpid_t lwpid, lx_user_regs_t *rp) -{ - lwpstatus_t status; - uintptr_t addr; - int fd, ret; - - if ((ret = get_lwpstatus(pid, lwpid, &status)) != 0) - return (ret); - - if ((fd = open_procfile(pid, O_RDONLY, "as")) < 0) - return (-ESRCH); - - /* - * If we find the syscall regs (and are therefore in an emulated - * syscall, use the register set at given address. Otherwise, use the - * registers as reported by /proc. - */ - if ((addr = syscall_regs(fd, status.pr_reg[REG_FP], pid)) != 0) { - lx_regs_t regs; - - if (pread(fd, &regs, sizeof (regs), addr) != sizeof (regs)) { - (void) close(fd); - lx_debug("ptrace failed to read register set"); - return (-EIO); - } - - (void) close(fd); - -#if defined(_LP64) - rp->lxur_r15 = regs.lxr_r15; - rp->lxur_r14 = regs.lxr_r14; - rp->lxur_r13 = regs.lxr_r13; - rp->lxur_r12 = regs.lxr_r12; - rp->lxur_rbp = regs.lxr_rbp; - rp->lxur_rbx = regs.lxr_rbx; - rp->lxur_r11 = regs.lxr_r11; - rp->lxur_r10 = regs.lxr_r10; - rp->lxur_r9 = regs.lxr_r9; - rp->lxur_r8 = regs.lxr_r8; - rp->lxur_rax = regs.lxr_rax; - rp->lxur_rcx = regs.lxr_rcx; - rp->lxur_rdx = regs.lxr_rdx; - rp->lxur_rsi = regs.lxr_rsi; - rp->lxur_rdi = regs.lxr_rdi; - rp->lxur_orig_rax = regs.lxr_orig_rax; - rp->lxur_rip = regs.lxr_rip; - rp->lxur_xcs = status.pr_reg[REG_CS]; - rp->lxur_rflags = status.pr_reg[REG_RFL]; - rp->lxur_rsp = regs.lxr_rsp; - rp->lxur_xss = status.pr_reg[REG_SS]; - rp->lxur_xfs_base = status.pr_reg[REG_FSBASE]; - rp->lxur_xgs_base = status.pr_reg[REG_GSBASE]; - rp->lxur_xds = status.pr_reg[REG_DS]; - rp->lxur_xes = status.pr_reg[REG_ES]; - rp->lxur_xfs = regs.lxr_fs; - rp->lxur_xgs = status.pr_reg[REG_GS]; -#elif defined(_ILP32) - rp->lxur_ebx = regs.lxr_ebx; - rp->lxur_ecx = regs.lxr_ecx; - rp->lxur_edx = regs.lxr_edx; - 
rp->lxur_esi = regs.lxr_esi; - rp->lxur_edi = regs.lxr_edi; - rp->lxur_ebp = regs.lxr_ebp; - rp->lxur_eax = regs.lxr_eax; - rp->lxur_xds = status.pr_reg[DS]; - rp->lxur_xes = status.pr_reg[ES]; - rp->lxur_xfs = status.pr_reg[FS]; - rp->lxur_xgs = regs.lxr_gs; - rp->lxur_orig_eax = regs.lxr_orig_eax; - rp->lxur_eip = regs.lxr_eip; - rp->lxur_xcs = status.pr_reg[CS]; - rp->lxur_eflags = status.pr_reg[EFL]; - rp->lxur_esp = regs.lxr_esp; - rp->lxur_xss = status.pr_reg[SS]; -#endif - - } else { - (void) close(fd); - -#if defined(_LP64) - rp->lxur_r15 = status.pr_reg[REG_R15]; - rp->lxur_r14 = status.pr_reg[REG_R14]; - rp->lxur_r13 = status.pr_reg[REG_R13]; - rp->lxur_r12 = status.pr_reg[REG_R12]; - rp->lxur_rbp = status.pr_reg[REG_RBP]; - rp->lxur_rbx = status.pr_reg[REG_RBX]; - rp->lxur_r11 = status.pr_reg[REG_R11]; - rp->lxur_r10 = status.pr_reg[REG_R10]; - rp->lxur_r9 = status.pr_reg[REG_R9]; - rp->lxur_r8 = status.pr_reg[REG_R8]; - rp->lxur_rax = status.pr_reg[REG_RAX]; - rp->lxur_rcx = status.pr_reg[REG_RCX]; - rp->lxur_rdx = status.pr_reg[REG_RDX]; - rp->lxur_rsi = status.pr_reg[REG_RSI]; - rp->lxur_rdi = status.pr_reg[REG_RDI]; - rp->lxur_orig_rax = 0; - rp->lxur_rip = status.pr_reg[REG_RIP]; - rp->lxur_xcs = status.pr_reg[REG_CS]; - rp->lxur_rflags = status.pr_reg[REG_RFL]; - rp->lxur_rsp = status.pr_reg[REG_RSP]; - rp->lxur_xss = status.pr_reg[REG_SS]; - rp->lxur_xfs = status.pr_reg[REG_FSBASE]; - rp->lxur_xgs = status.pr_reg[REG_GSBASE]; - rp->lxur_xds = status.pr_reg[REG_DS]; - rp->lxur_xes = status.pr_reg[REG_ES]; - rp->lxur_xfs = status.pr_reg[REG_FSBASE]; - rp->lxur_xgs = status.pr_reg[REG_GSBASE]; -#elif defined(_ILP32) - rp->lxur_ebx = status.pr_reg[EBX]; - rp->lxur_ecx = status.pr_reg[ECX]; - rp->lxur_edx = status.pr_reg[EDX]; - rp->lxur_esi = status.pr_reg[ESI]; - rp->lxur_edi = status.pr_reg[EDI]; - rp->lxur_ebp = status.pr_reg[EBP]; - rp->lxur_eax = status.pr_reg[EAX]; - rp->lxur_xds = status.pr_reg[DS]; - rp->lxur_xes = status.pr_reg[ES]; - 
rp->lxur_xfs = status.pr_reg[FS]; - rp->lxur_xgs = status.pr_reg[GS]; - rp->lxur_orig_eax = 0; - rp->lxur_eip = status.pr_reg[EIP]; - rp->lxur_xcs = status.pr_reg[CS]; - rp->lxur_eflags = status.pr_reg[EFL]; - rp->lxur_esp = status.pr_reg[UESP]; - rp->lxur_xss = status.pr_reg[SS]; -#endif - - /* - * If the target process has just returned from exec, it's not - * going to be sitting in the emulation function. In that case - * we need to manually fake up the values for %eax and orig_eax - * to indicate a successful return and that the traced process - * had called execve (respectively). - */ - if (status.pr_why == PR_SYSEXIT && - status.pr_what == SYS_execve) { -#if defined(_LP64) - rp->lxur_rax = 0; - rp->lxur_orig_rax = LX_SYS_execve; -#elif defined(_ILP32) - rp->lxur_eax = 0; - rp->lxur_orig_eax = LX_SYS_execve; -#endif - } - } - - return (0); -} - -static int -setregs(pid_t pid, lwpid_t lwpid, const lx_user_regs_t *rp) -{ - long ctl[1 + sizeof (prgregset_t) / sizeof (long)]; - lwpstatus_t status; - uintptr_t addr; - int fd, ret; - - if ((ret = get_lwpstatus(pid, lwpid, &status)) != 0) - return (ret); - - if ((fd = open_procfile(pid, O_RDWR, "as")) < 0) - return (-ESRCH); - - /* - * If we find the syscall regs (and are therefore in an emulated - * syscall, modify the register set at given address and set the - * remaining registers through the /proc interface. 
Otherwise just use - * the /proc interface to set register values; - */ - if ((addr = syscall_regs(fd, status.pr_reg[REG_FP], pid)) != 0) { -#if defined(_ILP32) - lx_regs_t regs; - - regs.lxr_ebx = rp->lxur_ebx; - regs.lxr_ecx = rp->lxur_ecx; - regs.lxr_edx = rp->lxur_edx; - regs.lxr_esi = rp->lxur_esi; - regs.lxr_edi = rp->lxur_edi; - regs.lxr_ebp = rp->lxur_ebp; - regs.lxr_eax = rp->lxur_eax; - regs.lxr_gs = rp->lxur_xgs; - regs.lxr_orig_eax = rp->lxur_orig_eax; - regs.lxr_eip = rp->lxur_eip; - regs.lxr_esp = rp->lxur_esp; - - if (pwrite(fd, &regs, sizeof (regs), addr) != sizeof (regs)) { - (void) close(fd); - lx_debug("ptrace failed to write register set"); - return (-EIO); - } -#endif - - (void) close(fd); - -#if defined(_ILP32) - status.pr_reg[DS] = rp->lxur_xds; - status.pr_reg[ES] = rp->lxur_xes; - status.pr_reg[FS] = rp->lxur_xfs; - status.pr_reg[CS] = rp->lxur_xcs; - status.pr_reg[EFL] = rp->lxur_eflags; - status.pr_reg[SS] = rp->lxur_xss; -#endif - - } else { - (void) close(fd); - -#if defined(_ILP32) - status.pr_reg[EBX] = rp->lxur_ebx; - status.pr_reg[ECX] = rp->lxur_ecx; - status.pr_reg[EDX] = rp->lxur_edx; - status.pr_reg[ESI] = rp->lxur_esi; - status.pr_reg[EDI] = rp->lxur_edi; - status.pr_reg[EBP] = rp->lxur_ebp; - status.pr_reg[EAX] = rp->lxur_eax; - status.pr_reg[DS] = rp->lxur_xds; - status.pr_reg[ES] = rp->lxur_xes; - status.pr_reg[FS] = rp->lxur_xfs; - status.pr_reg[GS] = rp->lxur_xgs; - status.pr_reg[EIP] = rp->lxur_eip; - status.pr_reg[CS] = rp->lxur_xcs; - status.pr_reg[EFL] = rp->lxur_eflags; - status.pr_reg[UESP] = rp->lxur_esp; - status.pr_reg[SS] = rp->lxur_xss; - status.pr_reg[SS] = rp->lxur_xss; -#endif - } - - if ((fd = open_lwpfile(pid, lwpid, O_WRONLY, "lwpctl")) < 0) - return (-ESRCH); - - ctl[0] = PCSREG; - bcopy(status.pr_reg, &ctl[1], sizeof (prgregset_t)); - - if (write(fd, &ctl, sizeof (ctl)) != sizeof (ctl)) { - (void) close(fd); - return (-EIO); - } - - (void) close(fd); - - return (0); -} - static int getfpregs(pid_t pid, 
lwpid_t lwpid, lx_user_fpregs_t *rp) { @@ -904,7 +512,7 @@ ptrace_peek(pid_t pid, uintptr_t addr, long *ret) (offsetof(lx_user_t, m) + sizeof (((lx_user_t *)NULL)->m)) static int -ptrace_peek_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret) +ptrace_peek_user(pid_t lxpid, pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret) { int err, data; uintptr_t *debugreg; @@ -919,8 +527,10 @@ ptrace_peek_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret) if (off < LX_USER_BOUND(lxu_regs)) { lx_user_regs_t regs; - if ((err = getregs(pid, lwpid, &regs)) != 0) + if ((err = lx_ptrace_kernel(LX_PTRACE_GETREGS, lxpid, NULL, + (uintptr_t)&regs)) != 0) { return (err); + } data = *(int *)((uintptr_t)&regs + off - offsetof(lx_user_t, lxu_regs)); @@ -1019,7 +629,7 @@ ptrace_poke(pid_t pid, uintptr_t addr, int data) } static int -ptrace_poke_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int data) +ptrace_poke_user(pid_t lxpid, pid_t pid, lwpid_t lwpid, uintptr_t off, int data) { lx_user_regs_t regs; int err = 0; @@ -1030,11 +640,16 @@ ptrace_poke_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int data) return (-EINVAL); if (off < offsetof(lx_user_t, lxu_regs) + sizeof (lx_user_regs_t)) { - if ((err = getregs(pid, lwpid, &regs)) != 0) + if ((err = lx_ptrace_kernel(LX_PTRACE_GETREGS, lxpid, NULL, + (uintptr_t)&regs)) != 0) { return (err); + } + *(int *)((uintptr_t)&regs + off - offsetof(lx_user_t, lxu_regs)) = data; - return (setregs(pid, lwpid, &regs)); + + return (lx_ptrace_kernel(LX_PTRACE_SETREGS, lxpid, NULL, + (uintptr_t)&regs)); } if (off >= offsetof(lx_user_t, lxu_debugreg) && @@ -1068,32 +683,6 @@ ptrace_kill(pid_t pid) } static int -ptrace_getregs(pid_t pid, lwpid_t lwpid, uintptr_t addr) -{ - lx_user_regs_t regs; - int ret; - - if ((ret = getregs(pid, lwpid, &regs)) != 0) - return (ret); - - if (uucopy(&regs, (void *)addr, sizeof (regs)) != 0) - return (-errno); - - return (0); -} - -static int -ptrace_setregs(pid_t pid, lwpid_t lwpid, uintptr_t addr) -{ - lx_user_regs_t regs; - - if (uucopy((void 
*)addr, &regs, sizeof (regs)) != 0) - return (-errno); - - return (setregs(pid, lwpid, &regs)); -} - -static int ptrace_getfpregs(pid_t pid, lwpid_t lwpid, uintptr_t addr) { lx_user_fpregs_t regs; @@ -1146,16 +735,21 @@ ptrace_setfpxregs(pid_t pid, lwpid_t lwpid, uintptr_t addr) } void -lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg) +lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg, + ucontext_t *ucp) { /* * We call into the kernel to see if we need to stop for specific * ptrace(2) events. */ - lx_debug("lx_ptrace_stop_if_option(%d, %s, %lu)", option, - child ? "TRUE [child]" : "FALSE [parent]", msg); - if (syscall(SYS_brand, B_PTRACE_STOP_FOR_OPT, option, child, - msg) != 0) { + lx_debug("lx_ptrace_stop_if_option(%d, %s, %lu, %p)", option, + child ? "TRUE [child]" : "FALSE [parent]", msg, ucp); + if (ucp == NULL) { + ucp = (ucontext_t *)lx_find_brand_uc(); + lx_debug("\tucp = %p", ucp); + } + if (syscall(SYS_brand, B_PTRACE_STOP_FOR_OPT, option, child, msg, + ucp) != 0) { if (errno != ESRCH) { /* * This should _only_ fail if we are not traced, or do @@ -1243,6 +837,8 @@ lx_ptrace(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) */ case LX_PTRACE_SETOPTIONS: case LX_PTRACE_GETEVENTMSG: + case LX_PTRACE_GETREGS: + case LX_PTRACE_SETREGS: return (lx_ptrace_kernel(ptrace_op, lxpid, p3, p4)); } @@ -1262,24 +858,18 @@ lx_ptrace(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) return (ptrace_peek(pid, p3, (long *)p4)); case LX_PTRACE_PEEKUSER: - return (ptrace_peek_user(pid, lwpid, p3, (int *)p4)); + return (ptrace_peek_user(lxpid, pid, lwpid, p3, (int *)p4)); case LX_PTRACE_POKETEXT: case LX_PTRACE_POKEDATA: return (ptrace_poke(pid, p3, (int)p4)); case LX_PTRACE_POKEUSER: - return (ptrace_poke_user(pid, lwpid, p3, (int)p4)); + return (ptrace_poke_user(lxpid, pid, lwpid, p3, (int)p4)); case LX_PTRACE_KILL: return (ptrace_kill(pid)); - case LX_PTRACE_GETREGS: - return (ptrace_getregs(pid, lwpid, p4)); - - case 
LX_PTRACE_SETREGS: - return (ptrace_setregs(pid, lwpid, p4)); - case LX_PTRACE_GETFPREGS: return (ptrace_getfpregs(pid, lwpid, p4)); diff --git a/usr/src/lib/brand/lx/lx_brand/common/signal.c b/usr/src/lib/brand/lx/lx_brand/common/signal.c index 7d3865c2de..4c143720c3 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/signal.c +++ b/usr/src/lib/brand/lx/lx_brand/common/signal.c @@ -42,6 +42,7 @@ #include <sys/lx_thread.h> #include <sys/syscall.h> #include <lx_provider_impl.h> +#include <sys/stack.h> #include <assert.h> #include <errno.h> #include <poll.h> @@ -60,10 +61,10 @@ #if defined(_ILP32) extern int pselect_large_fdset(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0, const timespec_t *tsp, const sigset_t *sp); -#else -static int lx_setcontext(const ucontext_t *ucp); #endif +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + /* * Delivering signals to a Linux process is complicated by differences in * signal numbering, stack structure and contents, and the action taken when a @@ -91,31 +92,24 @@ static int lx_setcontext(const ucontext_t *ucp); * * Adding a Linux branded thread to the mix complicates things somewhat. * - * First (for 32-bit code), when a thread receives a signal, it may be running - * with a Linux value in the x86 %gs segment register as opposed to the value - * Illumos threads expect; if control were passed directly to Illumos code, - * such as libc's sigacthandler(), that code would experience a segmentation - * fault the first time it tried to dereference a memory location using %gs. - * - * For 64-bit code the %gs is usually 0 for both native and Linux code and the - * thread pointer for both Illumos and Linux libc is referenced off the %fsbase - * register, as per the AMD64 ABI. When a thread receives a signal, it may be - * running with the Linux value in the x86 %fsbase register as opposed to the - * value Illumos libc expects. Switching the %fsbase value is handled in the - * kernel module at the same time as we switch the syscall mode flag. 
We track - * the syscall mode flag in the kernel using the per-lwp br_scms integer so we - * can save/restore the correct mode at the end of the signal handling. The - * flag value is saved/restored in the per-thread br_scms variable which is - * used like a stack to push/pop the flag bit as we take signals and return. + * First, when a thread receives a signal, it may either be running in an + * emulated Linux context or a native illumos context. In either case, the + * in-kernel brand module is responsible for preserving the register state + * from the interrupted context, regardless of whether emulated or native + * software was running at the time. The kernel is also responsible for + * ensuring that the illumos native sigacthandler() is called with register + * values appropriate for native code. Of particular note is the %gs segment + * selector for 32-bit code, and the %fsbase segment base register for 64-bit + * code; these are used by libc to locate per-thread data structures. * * Second, the signal number translation referenced above must take place. - * Further, for 32-bit code, as was the case with Illumos libc, before the - * Linux signal handler is called, the value of the %gs segment register MUST - * be restored to the value Linux code expects. + * Finally, when we hand control to the Linux signal handler we must do so + * on the brand stack, and with registers configured appropriately for the + * Linux application. 
* - * This need to translate signal numbers (and manipulate the %gs register) - * means that with standard Illumos libc, following a signal from generation to - * delivery looks something like: + * This need to translate signal numbers (and manipulate the signal handling + * context) means that with standard Illumos libc, following a signal from + * generation to delivery looks something like: * * kernel -> * sigacthandler() -> @@ -125,21 +119,15 @@ static int lx_setcontext(const ucontext_t *ucp); * but for the brand's Linux threads, this would look like: * * kernel -> - * lx_sigacthandler() -> - * sigacthandler() -> - * call_user_handler() -> - * lx_call_user_handler() -> - * lx_sigdeliver() JMP to - * Linux user signal handler + * sigacthandler() -> + * call_user_handler() -> + * lx_call_user_handler() -> + * lx_sigdeliver() -> + * syscall(B_JUMP_TO_LINUX, ...) -> + * Linux user signal handler * * The new addtions are: * - * lx_sigacthandler - * ================ - * This routine is responsible for setting the %gs segment register to the - * value 32-bit Illumos code expects (it does nothing in 64-bit code) and - * jumping to Illumos' libc signal interposition handler, sigacthandler(). - * * lx_call_user_handler * ==================== * This routine is responsible for translating Illumos signal numbers to @@ -148,25 +136,8 @@ static int lx_setcontext(const ucontext_t *ucp); * registered Linux signal handler. It is, in effect, the Linux thread * equivalent to libc's call_user_handler(). * - * Installing lx_sigacthandler() is a bit tricky, as normally libc's - * sigacthandler() routine is hidden from user programs. 
To facilitate this, a - * libc private function is used; setsigacthandler(): - * - * void setsigacthandler(void (*new_handler)(int, siginfo_t *, void *), - * void (**old_handler)(int, siginfo_t *, void *) - * int (*brsetctxt)(const ucontext_t *)) - * - * The routine works by modifying the per-thread data structure (uberdata) in - * libc that keeps track of the address of its own interposition handler with - * the address passed in; the old handler's address is returned in the pointer - * pointed to by the second argument, if it is non-NULL, mimicking the behavior - * of sigaction() itself. In a similar way, this function can also set a - * replacement handler for the libc __setcontext call which is made by libc's - * setcontext() when returning from a signal handler. Using this we can hook - * in to managing the syscall mode flag for 64-bit code when returning to the - * interrupted code. Once setsigacthandler() has been executed, all future - * branded threads this thread may create will automatically have the proper - * interposition handler(s) invoked as the result of a normal sigaction() call. + * lx_sigdeliver + * ============= * * Note that none of this interposition is necessary unless a Linux thread * registers a user signal handler, as the default action for all signals is the @@ -184,18 +155,18 @@ static int lx_setcontext(const ucontext_t *ucp); * translating the value WTERMSIG() would return from a Illumos signal number * to the appropriate Linux value. * - * lx_call_user_handler() calls lx_sigdeliver with a helper function (typically - * lx_build_signal_frame) which builds a stack frame for the 32-bit Linux - * signal handler, or populates a local (on the stack) structure for the 64-bit - * Linux signal handler, then jmp's into the handler. 
The stack at that time - * looks like this: + * lx_call_user_handler() calls lx_sigdeliver() with a helper function + * (typically lx_build_signal_frame) which builds a stack frame for the 32-bit + * Linux signal handler, or populates a local (on the stack) structure for the + * 64-bit Linux signal handler. The stack at that time looks like this: * - * ================================================= - * | | LX_SIGRT_MAGIC | - * | ================================================= - * | | Linux signal frame (32-bit) or local data | - * V | (64-bit) built by stack_builder() | - * ================================================= + * ========================================================= + * | | lx_sigdeliver_frame_t -- includes LX_SIGRT_MAGIC and | + * | | a return context for the eventual sigreturn(2) call | + * | ========================================================= + * | | Linux signal frame (32-bit) or local data | + * V | (64-bit) built by stack_builder() | + * ========================================================= * * The process of returning to an interrupted thread of execution from a user * signal handler is entirely different between Illumos and Linux. While @@ -207,24 +178,7 @@ static int lx_setcontext(const ucontext_t *ucp); * call to setcontext(2), the rt_sigreturn(2) Linux system call is responsible * for accomplishing much the same thing. It's for this reason that the stack * frame we build has the lx_(rt_)sigreturn_tramp code on the top of the - * stack. - * - * The lx_rt_sigreturn() function will handle the syscall, do its cleanup, - * then return to the libc signal handling code (call_user_handler) so that - * libc can use setcontext() to get back to the point where things were - * interrupted. 
However, for the 64-bit case, due to the syscall mode switching, - * we cannot simply let the libc setcontext() take us back because we may also - * have to switch the syscall mode back to Linux (it depends on where we were - * when we took the signal). For the 64-bit case we used setsigacthandler() - * to setup a libc replacement function on __setcontext(). This is the - * lx_setcontext() function. This function uses a brand call (B_SIGNAL_RETURN) - * which combines the syscall mode switching and setcontext handling in the lx - * kernel module. - * - * An additional oddity in the signal return code is that in the stack builder - * function we push some x86 code onto the bottom of the stack that looks like - * it invokes the Linux (rt)_sigreturn syscall. This is needed by gdb to - * tell that it's in a signal handler. The code looks like this: + * stack. The code looks like this: * * 32-bit 64-bit * -------------------------------- ----------------------------- @@ -239,11 +193,13 @@ static int lx_setcontext(const ucontext_t *ucp); * trampoline code on the stack to determine whether it is in a signal stack * frame or not. Really.) 
* - * When the 32-bit Linux user signal handler is eventually called, the stack - * frame looks like this (in the case of a "modern" signal stack; see the - * lx_sigstack structure definition): + * When the 32-bit Linux user signal handler is eventually called, the brand + * stack frame looks like this (in the case of a "modern" signal stack; see + * the lx_sigstack structure definition): * * ========================================================= + * | | lx_sigdeliver_frame_t | + * | ========================================================= * | | Trampoline code (marker for gdb, not really executed) | * | ========================================================= * | | Linux struct _fpstate | @@ -264,6 +220,8 @@ static int lx_setcontext(const ucontext_t *ucp); * The 64-bit stack-local data looks like this: * * ========================================================= + * | | lx_sigdeliver_frame_t | + * | ========================================================= * | | Trampoline code (marker for gdb, not really executed) | * | ========================================================= * | | Linux struct _fpstate | @@ -277,14 +235,22 @@ static int lx_setcontext(const ucontext_t *ucp); * * As usual in 64-bit code, %rdi is arg0 which is the signal number. * - * As mentioned above, the brand intercepts the Linux (rt_)sigreturn(2) system - * call. This turns into some stack cleanup and a call to lx_sigreturn_tolibc() - * which returns through the libc call stack that Illumos expects, with the - * caveat that 64-bit code combines the __setcontext and syscall mode switch - * via a brand call. This returns the thread executing the code back to the - * location originally interrupted by receipt of the signal. + * The *sigreturn(2) family of emulated system call handlers locates the + * "lx_sigdeliver_frame_t" struct on the Linux stack as part of processing + * the system call. 
This object contains a guard value (LX_SIGRT_MAGIC) to + * detect stack smashing or an incorrect stack pointer. It also contains a + * "return" context, which we use to get back to the "lx_sigdeliver()" frame + * on the native stack that originally dispatched to the Linux signal + * handler. The lx_sigdeliver() function is then able to return to the + * native libc signal handler in the usual way. This results in a further + * setcontext() back to whatever was running when we took the signal. */ +typedef struct lx_sigdeliver_frame { + uintptr_t lxsdf_magic; + ucontext_t *lxsdf_retucp; + ucontext_t *lxsdf_sigucp; +} lx_sigdeliver_frame_t; struct lx_oldsigstack { void (*retaddr)(); /* address of real lx_sigreturn code */ @@ -296,12 +262,6 @@ struct lx_oldsigstack { }; /* - * libc_sigacthandler is set to the address of the libc signal interposition - * routine, sigacthandler(). - */ -void (*libc_sigacthandler)(int, siginfo_t *, void*); - -/* * The lx_sighandlers structure needs to be a global due to the semantics of * clone(). * @@ -324,13 +284,13 @@ static lx_sighandlers_t lx_sighandlers; struct lx_vsyscall { uintptr_t lv_addr; - long (*lv_func)(); + uintptr_t lv_scnum; char *lv_msg; } lx_vsyscalls[] = { - {LX_VSYS_gettimeofday, lx_gettimeofday, + {LX_VSYS_gettimeofday, LX_SYS_gettimeofday, "vsyscall gettimeofday(%p, %p)" }, - {LX_VSYS_time, lx_time, "vsyscall time(%p)" }, - {LX_VSYS_getcpu, lx_getcpu, "vsyscall getcpu(%p, %lx, %lx)" }, + {LX_VSYS_time, LX_SYS_time, "vsyscall time(%p)" }, + {LX_VSYS_getcpu, LX_SYS_getcpu, "vsyscall getcpu(%p, %lx, %lx)" }, {NULL, NULL, NULL} }; @@ -352,6 +312,9 @@ static int lx_sigsegv_depth = 0; */ static int lx_no_abort_handler = 0; +static void lx_sigdeliver(int, siginfo_t *, ucontext_t *, size_t, void (*)(), + void (*)(), struct lx_sigaction *); + /* * Cache result of process.max-file-descriptor to avoid calling getrctl() * for each lx_ppoll(). 
@@ -464,27 +427,6 @@ stol_osigset(sigset_t *s_sigsetp, lx_osigset_t *lx_osigsetp) #endif static int -stol_sigcode(int si_code) -{ - switch (si_code) { - case SI_USER: - return (LX_SI_USER); - case SI_LWP: - return (LX_SI_TKILL); - case SI_QUEUE: - return (LX_SI_QUEUE); - case SI_TIMER: - return (LX_SI_TIMER); - case SI_ASYNCIO: - return (LX_SI_ASYNCIO); - case SI_MESGQ: - return (LX_SI_MESGQ); - default: - return (si_code); - } -} - -static int ltos_sigcode(int si_code) { switch (si_code) { @@ -505,29 +447,6 @@ ltos_sigcode(int si_code) } } -/* - * Convert the "status" field of a SIGCLD siginfo_t. We need to extract the - * illumos signal number and convert it to a Linux signal number while leaving - * the ptrace(2) event bits intact. - */ -int -stol_status(int s) -{ - /* - * We mask out the top bit here in case PTRACE_O_TRACESYSGOOD - * is in use and 0x80 has been ORed with the signal number. - */ - int stat = stol_signo[s & 0x7f]; - assert(stat != -1); - - /* - * We must mix in the ptrace(2) event which may be stored in - * the second byte of the status code. We also re-include the - * PTRACE_O_TRACESYSGOOD bit. 
- */ - return ((s & 0xff80) | stat); -} - int stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop) { @@ -546,7 +465,7 @@ stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop) ret = -1; } - lx_siginfo.lsi_code = stol_sigcode(siginfop->si_code); + lx_siginfo.lsi_code = lx_stol_sigcode(siginfop->si_code); lx_siginfo.lsi_errno = siginfop->si_errno; switch (lx_siginfo.lsi_signo) { @@ -561,11 +480,12 @@ stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop) case LX_SIGCHLD: lx_siginfo.lsi_pid = siginfop->si_pid; - if (siginfop->si_code == CLD_EXITED) { + if (siginfop->si_code <= 0 || siginfop->si_code == + CLD_EXITED) { lx_siginfo.lsi_status = siginfop->si_status; } else { - lx_siginfo.lsi_status = stol_status( - siginfop->si_status); + lx_siginfo.lsi_status = lx_stol_status( + siginfop->si_status, -1); } lx_siginfo.lsi_utime = siginfop->si_utime; lx_siginfo.lsi_stime = siginfop->si_stime; @@ -695,41 +615,59 @@ ltos_fpstate(lx_fpstate_t *lfpr, fpregset_t *fpr) } /* - * The brand needs a lx version of this because the format of the lx stack_t - * differs from the Illumos stack_t not really in content but in ORDER, - * so we can't simply pass pointers and expect things to work (sigh...) + * We do not use the system sigaltstack() infrastructure as that would conflict + * with our handling of both system call emulation and native signals on the + * native stack. Instead, we track the Linux stack structure in our + * thread-specific data. This function is modeled on the behaviour of the + * native sigaltstack system call handler. */ long -lx_sigaltstack(uintptr_t nsp, uintptr_t osp) +lx_sigaltstack(uintptr_t ssp, uintptr_t oss) { - lx_stack_t ls; - stack_t newsstack, oldsstack; - stack_t *nssp = (nsp ? &newsstack : NULL); - stack_t *ossp = (osp ? 
&oldsstack : NULL); + lx_tsd_t *lxtsd = lx_get_tsd(); + lx_stack_t ss; - if (nsp) { - if (uucopy((void *)nsp, &ls, sizeof (lx_stack_t)) != 0) - return (-errno); + if (ssp != NULL) { + if (lxtsd->lxtsd_sigaltstack.ss_flags & LX_SS_ONSTACK) { + /* + * If we are currently using the installed alternate + * stack for signal handling, the user may not modify + * the stack for this thread. + */ + return (-EPERM); + } - if ((ls.ss_flags & LX_SS_DISABLE) == 0 && - ls.ss_size < LX_MINSIGSTKSZ) - return (-ENOMEM); + if (uucopy((void *)ssp, &ss, sizeof (ss)) != 0) { + return (-EFAULT); + } - newsstack.ss_sp = (int *)ls.ss_sp; - newsstack.ss_size = (long)ls.ss_size; - newsstack.ss_flags = ls.ss_flags; - } + if (ss.ss_flags & ~LX_SS_DISABLE) { + /* + * The user may not specify a value for flags other + * than 0 or SS_DISABLE. + */ + return (-EINVAL); + } - if (sigaltstack(nssp, ossp) != 0) - return (-errno); + if (!(ss.ss_flags & LX_SS_DISABLE) && ss.ss_size < + LX_MINSIGSTKSZ) { + return (-ENOMEM); + } + } - if (osp) { - ls.ss_sp = (void *)oldsstack.ss_sp; - ls.ss_size = (size_t)oldsstack.ss_size; - ls.ss_flags = oldsstack.ss_flags; + if (oss != NULL) { + /* + * User provided old and new stack_t pointers may point to + * the same location. Copy out before we modify. 
+ */ + if (uucopy(&lxtsd->lxtsd_sigaltstack, (void *)oss, + sizeof (lxtsd->lxtsd_sigaltstack)) != 0) { + return (-EFAULT); + } + } - if (uucopy(&ls, (void *)osp, sizeof (lx_stack_t)) != 0) - return (-errno); + if (ssp != NULL) { + lxtsd->lxtsd_sigaltstack = ss; } return (0); @@ -993,20 +931,21 @@ lx_rt_sigtimedwait(uintptr_t set, uintptr_t sinfo, uintptr_t toutp, long lx_sigreturn(void) { + lx_sigdeliver_frame_t *lxsdf; struct lx_oldsigstack *lx_ossp; lx_sigset_t lx_sigset; - lx_regs_t *rp; ucontext_t *ucp; + ucontext_t *sigucp; uintptr_t sp; - rp = lx_syscall_regs(); + ucp = lx_syscall_regs(); /* * NOTE: The sp saved in the context is eight bytes off of where we * need it to be (either due to trampoline or the copying of * sp = uesp, not clear which). */ - sp = (uintptr_t)rp->lxr_esp - 8; + sp = LX_REG(ucp, REG_SP) - 8; /* * At this point, the stack pointer should point to the struct @@ -1015,32 +954,34 @@ lx_sigreturn(void) * save a pointer to it before incrementing our copy of the sp. */ lx_ossp = (struct lx_oldsigstack *)sp; - sp += sizeof (struct lx_oldsigstack); + sp += SA(sizeof (struct lx_oldsigstack)); + /* - * lx_sigdeliver() pushes LX_SIGRT_MAGIC on the stack before it - * creates the struct lx_oldsigstack. + * lx_sigdeliver() pushes a lx_sigdeliver_frame_t onto the stack + * before it creates the struct lx_oldsigstack. * - * If we don't find it here, the stack's been corrupted and we need to - * kill ourselves. - */ - if (*(uint32_t *)sp != LX_SIGRT_MAGIC) + * If we do not find it here, the stack has been corrupted and we + * need to kill ourselves. 
+ */ + lxsdf = (lx_sigdeliver_frame_t *)sp; + lx_debug("lx_sigreturn: reading lx_sigdeliver_frame_t @ %p\n", + lxsdf); + lx_debug("lx_sigreturn: lxsdf: magic %p retucp %p sigucp %p\n", + lxsdf->lxsdf_magic, lxsdf->lxsdf_retucp, lxsdf->lxsdf_sigucp); + if (lxsdf->lxsdf_magic != LX_SIGRT_MAGIC) { + LX_SIGNAL_DELIVERY_FRAME_CORRUPT(lxsdf); lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!", - sp, LX_SIGRT_MAGIC, *(uint32_t *)sp); + sp, LX_SIGRT_MAGIC, lxsdf->lxsdf_magic); + } - sp += sizeof (uint32_t); + LX_SIGNAL_DELIVERY_FRAME_FOUND(lxsdf); /* - * For signal mask handling to be done properly, this call needs to - * return to the libc routine that originally called the signal handler - * rather than directly set the context back to the place the signal - * interrupted execution as the original Linux code would do. - * - * Here *sp points to the Illumos ucontext_t, so we need to copy - * machine registers the Linux signal handler may have modified - * back to the Illumos version. + * We need to copy machine registers the Linux signal handler may have + * modified back to the Illumos ucontext_t. */ - ucp = (ucontext_t *)(*(ssize_t *)sp); + sigucp = lxsdf->lxsdf_sigucp; /* * General registers copy across as-is, except Linux expects that @@ -1051,30 +992,31 @@ lx_sigreturn(void) * value to ESP. 
*/ lx_ossp->sigc.sc_esp_at_signal = lx_ossp->sigc.sc_esp; - bcopy(&lx_ossp->sigc, &ucp->uc_mcontext, sizeof (gregset_t)); + bcopy(&lx_ossp->sigc, &sigucp->uc_mcontext, sizeof (gregset_t)); + + LX_SIGRETURN(NULL, sigucp, sp); /* copy back FP regs if present */ if (lx_ossp->sigc.sc_fpstate != NULL) - ltos_fpstate(&lx_ossp->fpstate, &ucp->uc_mcontext.fpregs); + ltos_fpstate(&lx_ossp->fpstate, &sigucp->uc_mcontext.fpregs); /* convert Linux signal mask back to its Illumos equivalent */ bzero(&lx_sigset, sizeof (lx_sigset_t)); lx_sigset.__bits[0] = lx_ossp->sigc.sc_mask; lx_sigset.__bits[1] = lx_ossp->sig_extra; - (void) ltos_sigset(&lx_sigset, &ucp->uc_sigmask); + (void) ltos_sigset(&lx_sigset, &sigucp->uc_sigmask); /* - * At this point sp contains the value of the stack pointer when - * lx_call_user_handler() was called. - * - * Pop one more value off the stack and pass the new sp to - * lx_sigreturn_tolibc(), which will in turn manipulate the x86 - * registers to make it appear to libc's call_user_handler() as if the - * handler it had called returned. + * For signal mask handling to be done properly, this call needs to + * return to the libc routine that originally called the signal handler + * rather than directly set the context back to the place the signal + * interrupted execution as the original Linux code would do. 
*/ - sp += sizeof (uint32_t); - lx_debug("calling lx_sigreturn_tolibc(0x%p)", sp); - lx_sigreturn_tolibc(sp); + lx_debug("lx_sigreturn: calling setcontext; retucp %p flags %lx " + "link %p\n", lxsdf->lxsdf_retucp, lxsdf->lxsdf_retucp->uc_flags, + lxsdf->lxsdf_retucp->uc_link); + setcontext(lxsdf->lxsdf_retucp); + assert(0); /*NOTREACHED*/ return (0); @@ -1087,16 +1029,19 @@ lx_sigreturn(void) long lx_rt_sigreturn(void) { + lx_sigdeliver_frame_t *lxsdf; struct lx_sigstack *lx_ssp; - lx_regs_t *rp; lx_ucontext_t *lx_ucp; ucontext_t *ucp; + ucontext_t *sigucp; uintptr_t sp; /* Get the registers at the emulated Linux rt_sigreturn syscall */ - rp = lx_syscall_regs(); + ucp = lx_syscall_regs(); #if defined(_ILP32) + lx_debug("lx_rt_sigreturn: ESP %p UESP %p\n", LX_REG(ucp, ESP), + LX_REG(ucp, UESP)); /* * For 32-bit * @@ -1121,14 +1066,14 @@ lx_rt_sigreturn(void) * lx_sigdeliver() created the stack frame for the Linux signal * handler. */ - sp = (uintptr_t)rp->lxr_esp - 4; + sp = (uintptr_t)LX_REG(ucp, REG_SP) - 4; #else /* * We need to make an adjustment for 64-bit code as well. Since 64-bit * does not use the trampoline, it's probably for the same reason as * alluded to above. */ - sp = (uintptr_t)rp->lxr_rsp - 8; + sp = (uintptr_t)LX_REG(ucp, REG_SP) - 8; #endif /* @@ -1138,39 +1083,41 @@ lx_rt_sigreturn(void) * save a pointer to it before incrementing our copy of the sp. */ lx_ssp = (struct lx_sigstack *)sp; - sp += sizeof (struct lx_sigstack); + sp += SA(sizeof (struct lx_sigstack)); +#if defined(_LP64) /* - * We handle 32 vs. 64 bit differently here, but first, lx_sigdeliver() - * pushed LX_SIGRT_MAGIC on the stack before it created the - * struct lx_sigstack (and possibly struct lx_fpstate_t). - * - * If we don't find LX_SIGRT_MAGIC here, the stack's been corrupted and - * we need to kill ourselves. - * - * Check for and remove LX_SIGRT_MAGIC from the stack. 
+ * The 64-bit lx_sigdeliver() inserts 8 bytes of padding between + * the lx_sigstack_t and the delivery frame to maintain ABI stack + * alignment. */ -#if defined(_LP64) - /* account for extra word used in lx_sigdeliver for stack alignment */ sp += 8; +#endif - if (*(uint64_t *)sp != LX_SIGRT_MAGIC) + /* + * lx_sigdeliver() pushes a lx_sigdeliver_frame_t onto the stack + * before it creates the struct lx_oldsigstack. + * + * If we do not find it here, the stack has been corrupted and we + * need to kill ourselves. + */ + lxsdf = (lx_sigdeliver_frame_t *)sp; + if (lxsdf->lxsdf_magic != LX_SIGRT_MAGIC) { + LX_SIGNAL_DELIVERY_FRAME_CORRUPT(lxsdf); lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!", - sp, LX_SIGRT_MAGIC, *(uint32_t *)sp); - sp += sizeof (uint64_t); + sp, LX_SIGRT_MAGIC, lxsdf->lxsdf_magic); + } + + LX_SIGNAL_DELIVERY_FRAME_FOUND(lxsdf); + + sigucp = lxsdf->lxsdf_sigucp; /* - * Now *(sp + 24) points to the Illumos ucontext_t (working backwards - * through the Linux signal hander, the stack builder, and the stack - * size) which we saved on the stack in the lx_sigdeliver assembly - * prologue before we pushed LX_SIGRT_MAGIC, so we need to copy machine - * registers the Linux signal handler may have modified back to the - * Illumos version. + * We need to copy machine registers the Linux signal handler may have + * modified back to the Illumos version. */ - ucp = (ucontext_t *)(*(ssize_t *)(sp + 24)); - +#if defined(_LP64) lx_ucp = &lx_ssp->uc; - LX_SIGRETURN(lx_ucp, ucp, sp); /* Track SIGSEGV recursion depth for vsyscall */ if (lx_ssp->si.lsi_signo == LX_SIGSEGV) { @@ -1181,47 +1128,33 @@ lx_rt_sigreturn(void) /* * General register layout is completely different. 
*/ - ucp->uc_mcontext.gregs[REG_R15] = lx_ucp->uc_sigcontext.sc_r15; - ucp->uc_mcontext.gregs[REG_R14] = lx_ucp->uc_sigcontext.sc_r14; - ucp->uc_mcontext.gregs[REG_R13] = lx_ucp->uc_sigcontext.sc_r13; - ucp->uc_mcontext.gregs[REG_R12] = lx_ucp->uc_sigcontext.sc_r12; - ucp->uc_mcontext.gregs[REG_R11] = lx_ucp->uc_sigcontext.sc_r11; - ucp->uc_mcontext.gregs[REG_R10] = lx_ucp->uc_sigcontext.sc_r10; - ucp->uc_mcontext.gregs[REG_R9] = lx_ucp->uc_sigcontext.sc_r9; - ucp->uc_mcontext.gregs[REG_R8] = lx_ucp->uc_sigcontext.sc_r8; - ucp->uc_mcontext.gregs[REG_RDI] = lx_ucp->uc_sigcontext.sc_rdi; - ucp->uc_mcontext.gregs[REG_RSI] = lx_ucp->uc_sigcontext.sc_rsi; - ucp->uc_mcontext.gregs[REG_RBP] = lx_ucp->uc_sigcontext.sc_rbp; - ucp->uc_mcontext.gregs[REG_RBX] = lx_ucp->uc_sigcontext.sc_rbx; - ucp->uc_mcontext.gregs[REG_RDX] = lx_ucp->uc_sigcontext.sc_rdx; - ucp->uc_mcontext.gregs[REG_RCX] = lx_ucp->uc_sigcontext.sc_rcx; - ucp->uc_mcontext.gregs[REG_RAX] = lx_ucp->uc_sigcontext.sc_rax; - ucp->uc_mcontext.gregs[REG_TRAPNO] = lx_ucp->uc_sigcontext.sc_trapno; - ucp->uc_mcontext.gregs[REG_ERR] = lx_ucp->uc_sigcontext.sc_err; - ucp->uc_mcontext.gregs[REG_RIP] = lx_ucp->uc_sigcontext.sc_rip; - ucp->uc_mcontext.gregs[REG_CS] = lx_ucp->uc_sigcontext.sc_cs; - ucp->uc_mcontext.gregs[REG_RFL] = lx_ucp->uc_sigcontext.sc_eflags; - ucp->uc_mcontext.gregs[REG_RSP] = lx_ucp->uc_sigcontext.sc_rsp; - ucp->uc_mcontext.gregs[REG_SS] = lx_ucp->uc_sigcontext.sc_pad0; - ucp->uc_mcontext.gregs[REG_FS] = lx_ucp->uc_sigcontext.sc_fs; - ucp->uc_mcontext.gregs[REG_GS] = lx_ucp->uc_sigcontext.sc_gs; + LX_REG(sigucp, REG_R15) = lx_ucp->uc_sigcontext.sc_r15; + LX_REG(sigucp, REG_R14) = lx_ucp->uc_sigcontext.sc_r14; + LX_REG(sigucp, REG_R13) = lx_ucp->uc_sigcontext.sc_r13; + LX_REG(sigucp, REG_R12) = lx_ucp->uc_sigcontext.sc_r12; + LX_REG(sigucp, REG_R11) = lx_ucp->uc_sigcontext.sc_r11; + LX_REG(sigucp, REG_R10) = lx_ucp->uc_sigcontext.sc_r10; + LX_REG(sigucp, REG_R9) = lx_ucp->uc_sigcontext.sc_r9; + 
LX_REG(sigucp, REG_R8) = lx_ucp->uc_sigcontext.sc_r8; + LX_REG(sigucp, REG_RDI) = lx_ucp->uc_sigcontext.sc_rdi; + LX_REG(sigucp, REG_RSI) = lx_ucp->uc_sigcontext.sc_rsi; + LX_REG(sigucp, REG_RBP) = lx_ucp->uc_sigcontext.sc_rbp; + LX_REG(sigucp, REG_RBX) = lx_ucp->uc_sigcontext.sc_rbx; + LX_REG(sigucp, REG_RDX) = lx_ucp->uc_sigcontext.sc_rdx; + LX_REG(sigucp, REG_RCX) = lx_ucp->uc_sigcontext.sc_rcx; + LX_REG(sigucp, REG_RAX) = lx_ucp->uc_sigcontext.sc_rax; + LX_REG(sigucp, REG_TRAPNO) = lx_ucp->uc_sigcontext.sc_trapno; + LX_REG(sigucp, REG_ERR) = lx_ucp->uc_sigcontext.sc_err; + LX_REG(sigucp, REG_RIP) = lx_ucp->uc_sigcontext.sc_rip; + LX_REG(sigucp, REG_CS) = lx_ucp->uc_sigcontext.sc_cs; + LX_REG(sigucp, REG_RFL) = lx_ucp->uc_sigcontext.sc_eflags; + LX_REG(sigucp, REG_RSP) = lx_ucp->uc_sigcontext.sc_rsp; + LX_REG(sigucp, REG_SS) = lx_ucp->uc_sigcontext.sc_pad0; + LX_REG(sigucp, REG_FS) = lx_ucp->uc_sigcontext.sc_fs; + LX_REG(sigucp, REG_GS) = lx_ucp->uc_sigcontext.sc_gs; #else /* is _ILP32 */ - if (*(uint32_t *)sp != LX_SIGRT_MAGIC) - lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!", - sp, LX_SIGRT_MAGIC, *(uint32_t *)sp); - sp += sizeof (uint32_t); - - /* - * Here *sp points to the Illumos ucontext_t which was saved on stack - * right before we pushed LX_SIGRT_MAGIC in the 32-bit lx_sigdeliver - * assembly code. We need to copy machine registers the Linux signal - * handler may have modified back to the Illumos version. 
- */ - ucp = (ucontext_t *)(*(ssize_t *)sp); - lx_ucp = &lx_ssp->uc; - LX_SIGRETURN(lx_ucp, ucp, sp); /* * Illumos and Linux both follow the SysV i386 ABI layout for the @@ -1236,84 +1169,39 @@ lx_rt_sigreturn(void) */ lx_ucp->uc_sigcontext.sc_esp_at_signal = lx_ucp->uc_sigcontext.sc_esp; - bcopy(&lx_ucp->uc_sigcontext, &ucp->uc_mcontext.gregs, + bcopy(&lx_ucp->uc_sigcontext, &sigucp->uc_mcontext.gregs, sizeof (gregset_t)); #endif - if (lx_ucp->uc_sigcontext.sc_fpstate != NULL) + LX_SIGRETURN(lx_ucp, sigucp, sp); + + if (lx_ucp->uc_sigcontext.sc_fpstate != NULL) { ltos_fpstate(lx_ucp->uc_sigcontext.sc_fpstate, - &ucp->uc_mcontext.fpregs); + &sigucp->uc_mcontext.fpregs); + } /* * Convert the Linux signal mask and stack back to their * Illumos equivalents. */ - (void) ltos_sigset(&lx_ucp->uc_sigmask, &ucp->uc_sigmask); - ltos_stack(&lx_ucp->uc_stack, &ucp->uc_stack); - - /* - * For signal mask handling to be done properly, this function must - * return to the libc call_user_handler() routine that originally - * called the signal handler, rather than directly set the context back - * to the place the signal interrupted execution, as the original Linux - * code would do. - * - * For the 64-bit case we can't simply let call_user_handler() invoke - * __setcontext() since we need to also manage the syscall mode. Thus - * we use the lx_setcontext callback hook into libc to manage this via - * a brand call which combines the setcontext with setting the mode - * switch. - */ -#if defined(_LP64) - /* - * At this point sp points to the end of the stack frame we constructed - * on entry to lx_sigdeliver. Pop this frame off the stack. - */ - sp += 0x30; - -#else - /* - * At this point sp points to the ucontext_t pointer we pushed on the - * stack right before we pushed LX_SIGRT_MAGIC in lx_sigdeliver. Pop - * this value off the stack. 
- */ - sp += sizeof (uint32_t); -#endif + (void) ltos_sigset(&lx_ucp->uc_sigmask, &sigucp->uc_sigmask); + ltos_stack(&lx_ucp->uc_stack, &sigucp->uc_stack); /* - * At this point sp points to the base frame we had on entry to - * lx_sigdeliver (%ebp/%rbp at TOS, return address next). - * - * Pass the new sp to lx_sigreturn_tolibc(), which will in turn - * manipulate the x86 registers to make it appear that - * lx_call_user_handler() has returned. This will then take us directly - * back to libc's call_user_handler(). + * For signal mask handling to be done properly, this call needs to + * return to the libc routine that originally called the signal handler + * rather than directly set the context back to the place the signal + * interrupted execution as the original Linux code would do. */ - lx_debug("calling lx_sigreturn_tolibc(0x%p)", sp); - lx_sigreturn_tolibc(sp); + lx_debug("lx_rt_sigreturn: calling setcontext; retucp %p\n", + lxsdf->lxsdf_retucp); + setcontext(lxsdf->lxsdf_retucp); + assert(0); /*NOTREACHED*/ return (0); } -#if defined(_LP64) -static int -lx_setcontext(const ucontext_t *ucp) -{ - extern int lx_traceflag; - - /* - * Since we don't return via lx_emulate, issue a trace msg here if - * necessary. We know this is only called in the 64-bit rt_sigreturn - * code path to the syscall number is 15. - */ - if (lx_traceflag != 0) { - (void) syscall(SYS_brand, B_SYSRETURN, 15, 0); - } - return (syscall(SYS_brand, B_SIGNAL_RETURN, ucp)); -} -#endif - #if defined(_ILP32) /* @@ -1321,7 +1209,8 @@ lx_setcontext(const ucontext_t *ucp) * This stack-builder function is only used by 32-bit code. 
*/ static void -lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) +lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp, + uintptr_t *hargs) { extern void lx_sigreturn_tramp(); @@ -1394,7 +1283,8 @@ lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) * code (32-bit code also calls this when using "modern" signals). */ static void -lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) +lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp, + uintptr_t *hargs) { extern void lx_rt_sigreturn_tramp(); @@ -1407,8 +1297,20 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) lx_ucp = &lx_ssp->uc; #if defined(_ILP32) + /* + * Arguments are passed to the 32-bit signal handler on the stack. + */ lx_ssp->ucp = lx_ucp; + lx_ssp->sip = sip != NULL ? &lx_ssp->si : NULL; lx_ssp->sig = lx_sig; +#else + /* + * Arguments to the 64-bit signal handler are passed in registers: + * hdlr(int sig, siginfo_t *sip, void *ucp); + */ + hargs[0] = lx_sig; + hargs[1] = sip != NULL ? (uintptr_t)&lx_ssp->si : NULL; + hargs[2] = (uintptr_t)lx_ucp; #endif lxsap = &lx_sighandlers.lx_sa[lx_sig]; @@ -1442,30 +1344,30 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) /* * General register layout is completely different. 
*/ - lx_ucp->uc_sigcontext.sc_r8 = ucp->uc_mcontext.gregs[REG_R8]; - lx_ucp->uc_sigcontext.sc_r9 = ucp->uc_mcontext.gregs[REG_R9]; - lx_ucp->uc_sigcontext.sc_r10 = ucp->uc_mcontext.gregs[REG_R10]; - lx_ucp->uc_sigcontext.sc_r11 = ucp->uc_mcontext.gregs[REG_R11]; - lx_ucp->uc_sigcontext.sc_r12 = ucp->uc_mcontext.gregs[REG_R12]; - lx_ucp->uc_sigcontext.sc_r13 = ucp->uc_mcontext.gregs[REG_R13]; - lx_ucp->uc_sigcontext.sc_r14 = ucp->uc_mcontext.gregs[REG_R14]; - lx_ucp->uc_sigcontext.sc_r15 = ucp->uc_mcontext.gregs[REG_R15]; - lx_ucp->uc_sigcontext.sc_rdi = ucp->uc_mcontext.gregs[REG_RDI]; - lx_ucp->uc_sigcontext.sc_rsi = ucp->uc_mcontext.gregs[REG_RSI]; - lx_ucp->uc_sigcontext.sc_rbp = ucp->uc_mcontext.gregs[REG_RBP]; - lx_ucp->uc_sigcontext.sc_rbx = ucp->uc_mcontext.gregs[REG_RBX]; - lx_ucp->uc_sigcontext.sc_rdx = ucp->uc_mcontext.gregs[REG_RDX]; - lx_ucp->uc_sigcontext.sc_rax = ucp->uc_mcontext.gregs[REG_RAX]; - lx_ucp->uc_sigcontext.sc_rcx = ucp->uc_mcontext.gregs[REG_RCX]; - lx_ucp->uc_sigcontext.sc_rsp = ucp->uc_mcontext.gregs[REG_RSP]; - lx_ucp->uc_sigcontext.sc_rip = ucp->uc_mcontext.gregs[REG_RIP]; - lx_ucp->uc_sigcontext.sc_eflags = ucp->uc_mcontext.gregs[REG_RFL]; - lx_ucp->uc_sigcontext.sc_cs = ucp->uc_mcontext.gregs[REG_CS]; - lx_ucp->uc_sigcontext.sc_gs = ucp->uc_mcontext.gregs[REG_GS]; - lx_ucp->uc_sigcontext.sc_fs = ucp->uc_mcontext.gregs[REG_FS]; - lx_ucp->uc_sigcontext.sc_pad0 = ucp->uc_mcontext.gregs[REG_SS]; - lx_ucp->uc_sigcontext.sc_err = ucp->uc_mcontext.gregs[REG_ERR]; - lx_ucp->uc_sigcontext.sc_trapno = ucp->uc_mcontext.gregs[REG_TRAPNO]; + lx_ucp->uc_sigcontext.sc_r8 = LX_REG(ucp, REG_R8); + lx_ucp->uc_sigcontext.sc_r9 = LX_REG(ucp, REG_R9); + lx_ucp->uc_sigcontext.sc_r10 = LX_REG(ucp, REG_R10); + lx_ucp->uc_sigcontext.sc_r11 = LX_REG(ucp, REG_R11); + lx_ucp->uc_sigcontext.sc_r12 = LX_REG(ucp, REG_R12); + lx_ucp->uc_sigcontext.sc_r13 = LX_REG(ucp, REG_R13); + lx_ucp->uc_sigcontext.sc_r14 = LX_REG(ucp, REG_R14); + lx_ucp->uc_sigcontext.sc_r15 = 
LX_REG(ucp, REG_R15); + lx_ucp->uc_sigcontext.sc_rdi = LX_REG(ucp, REG_RDI); + lx_ucp->uc_sigcontext.sc_rsi = LX_REG(ucp, REG_RSI); + lx_ucp->uc_sigcontext.sc_rbp = LX_REG(ucp, REG_RBP); + lx_ucp->uc_sigcontext.sc_rbx = LX_REG(ucp, REG_RBX); + lx_ucp->uc_sigcontext.sc_rdx = LX_REG(ucp, REG_RDX); + lx_ucp->uc_sigcontext.sc_rax = LX_REG(ucp, REG_RAX); + lx_ucp->uc_sigcontext.sc_rcx = LX_REG(ucp, REG_RCX); + lx_ucp->uc_sigcontext.sc_rsp = LX_REG(ucp, REG_RSP); + lx_ucp->uc_sigcontext.sc_rip = LX_REG(ucp, REG_RIP); + lx_ucp->uc_sigcontext.sc_eflags = LX_REG(ucp, REG_RFL); + lx_ucp->uc_sigcontext.sc_cs = LX_REG(ucp, REG_CS); + lx_ucp->uc_sigcontext.sc_gs = LX_REG(ucp, REG_GS); + lx_ucp->uc_sigcontext.sc_fs = LX_REG(ucp, REG_FS); + lx_ucp->uc_sigcontext.sc_pad0 = LX_REG(ucp, REG_SS); + lx_ucp->uc_sigcontext.sc_err = LX_REG(ucp, REG_ERR); + lx_ucp->uc_sigcontext.sc_trapno = LX_REG(ucp, REG_TRAPNO); #else /* is _ILP32 */ /* @@ -1485,19 +1387,6 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) (uintptr_t)sip->si_addr : 0; /* - * Point the lx_siginfo_t pointer to the signal stack's lx_siginfo_t - * if there was a Illumos siginfo_t to convert, otherwise set it to - * NULL. For 64-bit code a NULL sip is handled in the lx_deliver - * assembly code. - */ -#if defined(_ILP32) - if (sip != NULL) - lx_ssp->sip = &lx_ssp->si; - else - lx_ssp->sip = NULL; -#endif - - /* * This should only return an error if the signum is invalid but that * also gets converted into a LX_SIGKILL by this function. */ @@ -1529,76 +1418,21 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp) bcopy((void *)lx_rt_sigreturn_tramp, lx_ssp->trampoline, sizeof (lx_ssp->trampoline)); #endif - - LX_SIGDELIVER(lx_sig, lxsap, lx_ssp, lx_ucp); - -#if defined(_LP64) - /* - * For the 64-bit code this must be the last syscall we do in the - * emulation code path before we return back to the Linux signal - * handler. 
This will disable native syscalls so the next time a - * syscall happens on this thread, it will come back into the emulation. - */ - (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG); -#endif - - /* We return to lx_sigdeliver to jump into the Linux signal handler */ -} - -#if defined(_LP64) -static void -lx_vsyscall_return(long ret, ucontext_t *ucp) -{ - lx_debug("\tvsyscall return val = %lX", ret); - ucp->uc_mcontext.gregs[REG_RAX] = ret; - /* - * Simulate a 'ret' by grabbing the return address off the caller's - * stack and incrementing rsp manually before sigreturning back. - */ - (void) uucopy((void*)ucp->uc_mcontext.gregs[REG_RSP], - &ucp->uc_mcontext.gregs[REG_RIP], sizeof (void*)); - lx_debug("\tvsyscall return to %p", ucp->uc_mcontext.gregs[REG_RIP]); - ucp->uc_mcontext.gregs[REG_RSP] += sizeof (void*); - - /* - * Make sure that libc's ul_sigmask reflects what the sigmask is about - * to become. - */ - thr_sigsetmask(SIG_SETMASK, &ucp->uc_sigmask, NULL); - - (void) syscall(SYS_brand, B_SIGNAL_RETURN, ucp); } -#endif /* - * This is the second level interposition handler for Linux signals. + * This is the interposition handler for Linux signals. */ static void lx_call_user_handler(int sig, siginfo_t *sip, void *p) { void (*user_handler)(); void (*stk_builder)(); -#if defined(_ILP32) - lx_tsd_t *lx_tsd; - int err; -#endif struct lx_sigaction *lxsap; ucontext_t *ucp = (ucontext_t *)p; - uintptr_t gs; size_t stksize; int lx_sig; - switch (sig) { - case SIGCLD: - /* - * Signal to an interrupted waitpid() that it was interrupted - * by a SIGCLD, and should restart to grab the wait status - * this signal represented. - */ - lx_had_sigchild = 1; - break; - } - /* * If Illumos signal has no Linux equivalent, effectively ignore it. 
*/ @@ -1615,18 +1449,6 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p) lx_debug("lxsap @ 0x%p", lxsap); /* - * If the delivery of this signal interrupted a system call, we must - * only restart it if sigaction(2) was used to set the SA_RESTART flag - * for this signal. The lx_emulate() function checks this per-thread - * variable to discover the restart disposition of the most recently - * handled signal. - * - * NOTE: this mechanism may not stand up to close scrutiny in the face - * of nested asynchronous signal delivery. - */ - lx_do_syscall_restart = !!(lxsap->lxsa_flags & LX_SA_RESTART); - - /* * Emulate vsyscall support. * * Linux magically maps a single page into the address space of each @@ -1656,27 +1478,35 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p) if (sig == SIGSEGV) { int i; for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) { - if (lx_vsyscalls[i].lv_addr != (uintptr_t)sip->si_addr) + extern void lx_vsyscall_tramp(void); + uintptr_t addr = (uintptr_t)sip->si_addr; + + if (lx_vsyscalls[i].lv_addr != addr) continue; + /* * Users of vsyscall must commit fully by using * jmp/call access the vsyscall. Cowardly reading data * from the page beforehand isn't allowed or possible. */ - if (sip->si_addr != - (void*)ucp->uc_mcontext.gregs[REG_RIP]) + if (addr != LX_REG(ucp, REG_PC)) continue; - lx_debug(lx_vsyscalls[i].lv_msg, - ucp->uc_mcontext.gregs[REG_RDI], - ucp->uc_mcontext.gregs[REG_RSI], - ucp->uc_mcontext.gregs[REG_RDX]); - long ret = lx_vsyscalls[i].lv_func( - ucp->uc_mcontext.gregs[REG_RDI], - ucp->uc_mcontext.gregs[REG_RSI], - ucp->uc_mcontext.gregs[REG_RDX]); - lx_vsyscall_return(ret, ucp); - assert(0); + lx_debug(lx_vsyscalls[i].lv_msg, LX_REG(ucp, REG_RDI), + LX_REG(ucp, REG_RSI), LX_REG(ucp, REG_RDX)); + + /* + * Modify the interrupted context so that, on return + * from the signal handler, the kernel revectors this + * LWP to the vsyscall trampoline. 
That trampoline + * will immediately invoke the "syscall" instruction + * and returns to the address on the stack when + * complete. + */ + LX_REG(ucp, REG_R0) = lx_vsyscalls[i].lv_scnum; + LX_REG(ucp, REG_PC) = (uintptr_t)&lx_vsyscall_tramp; + lx_debug("\treturning from signal handler\n"); + return; } /* @@ -1715,28 +1545,9 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p) (lxsap->lxsa_handler == SIG_DFL) ? "SIG_DFL" : "SIG_IGN"); #if defined(_LP64) - /* %gs is ignored in the 64-bit lx_sigdeliver */ - gs = 0; - stksize = sizeof (struct lx_sigstack); stk_builder = lx_build_signal_frame; - #else - if ((err = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) - lx_err_fatal("lx_call_user_handler: unable to read " - "thread-specific data: %s", strerror(err)); - - assert(lx_tsd != 0); - - gs = lx_tsd->lxtsd_gs & 0xffff; /* gs is only 16 bits */ - - /* - * Any zero %gs value should be caught when a save is attempted in - * lx_emulate(), but this extra check will catch any zero values due to - * bugs in the library. This is only applicable to 32-bit code. - */ - assert(gs != 0); - if (lxsap->lxsa_flags & LX_SA_SIGINFO) { stksize = sizeof (struct lx_sigstack); stk_builder = lx_build_signal_frame; @@ -1748,22 +1559,333 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p) user_handler = lxsap->lxsa_handler; - lx_debug("delivering %d (lx %d) to handler at 0x%p with gs 0x%x", sig, - lx_sig, lxsap->lxsa_handler, gs); + lx_debug("delivering %d (lx %d) to handler at 0x%p", sig, lx_sig, + lxsap->lxsa_handler); if (lxsap->lxsa_flags & LX_SA_RESETHAND) lxsap->lxsa_handler = SIG_DFL; + lx_sigdeliver(lx_sig, sip, ucp, stksize, stk_builder, user_handler, + lxsap); + /* - * lx_sigdeliver() doesn't return, so it relies on the Linux signal - * handler to clean up the stack, reset the current signal mask and - * make a system call (sigreturn or rt_sigreturn) which is intended to - * return to the code interrupted by the signal. 
The emulation will - * catch that syscall, finish it's own cleanup, then actually return - * back through here via lx_sigreturn_tolibc(), which leads us back - * into libc and then back to the point where we were interrupted. + * We need to handle restarting system calls if requested by the + * program for this signal type: + */ + if (lxsap->lxsa_flags & LX_SA_RESTART) { + uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; + long ret = (long)LX_REG(ucp, REG_R0); + boolean_t interrupted = (ret == -lx_errno(EINTR)); + + /* + * If the system call returned EINTR, and the system + * call handler set "br_syscall_restart" when returning, + * we modify the context to try the system call again + * when we return from this signal handler. + */ + if ((flags & LX_UC_RESTART_SYSCALL) && interrupted) { + int syscall_num = (int)(uintptr_t)ucp->uc_brand_data[2]; + + lx_debug("restarting interrupted system call %d", + syscall_num); + + /* + * Both the "int 0x80" and the "syscall" instruction + * are two bytes long. Wind the program counter back + * to the start of this instruction. + * + * The system call we interrupted is preserved in the + * brand-specific data in the ucontext_t when the + * LX_UC_RESTART_SYSCALL flag is set. This is + * analogous to the "orig_[er]ax" field in the Linux + * "user_regs_struct". + */ + LX_REG(ucp, REG_PC) -= 2; + LX_REG(ucp, REG_R0) = syscall_num; + } + } +} + +/* + * The "lx_sigdeliver()" function is responsible for constructing the emulated + * signal delivery frame on the brand stack for this LWP. A context is saved + * on the stack which will be used by the "sigreturn(2)" family of emulated + * system calls to get us back here after the Linux signal handler returns. + * This function is modelled on the in-kernel "sendsig()" signal delivery + * mechanism. 
+ */ +void +lx_sigdeliver(int lx_sig, siginfo_t *sip, ucontext_t *ucp, size_t stacksz, + void (*stack_builder)(), void (*user_handler)(), + struct lx_sigaction *lxsap) +{ + ucontext_t uc; + lx_tsd_t *lxtsd = lx_get_tsd(); + int totsz = 0; + uintptr_t flags; + uintptr_t hargs[3]; + /* + * These variables must be "volatile", as they are modified after the + * getcontext() stores the register state: + */ + volatile boolean_t signal_delivered = B_FALSE; + volatile uintptr_t lxfp; + volatile uintptr_t old_tsd_sp; + volatile int newstack; + + /* + * This function involves modifying the Linux process stack for this + * thread. To do so without corruption requires us to exclude other + * signal handlers (or emulated system calls called from within those + * handlers) from running while we reserve space on that stack. We + * defer the execution of further instances of lx_call_user_handler() + * until we have completed this operation. + */ + _sigoff(); + + /* + * Clear register arguments vector. + */ + bzero(hargs, sizeof (hargs)); + + /* + * We save a context here so that we can be returned later to complete + * handling the signal. + */ + lx_debug("lx_sigdeliver: STORING RETURN CONTEXT @ %p\n", &uc); + assert(getcontext(&uc) == 0); + lx_debug("lx_sigdeliver: RETURN CONTEXT %p LINK %p FLAGS %lx\n", + &uc, uc.uc_link, uc.uc_flags); + if (signal_delivered) { + /* + * If the "signal_delivered" flag is set, we are returned here + * via setcontext() as called by the emulated Linux signal + * return system call. + */ + lx_debug("lx_sigdeliver: WE ARE BACK, VIA UC @ %p!\n", &uc); + goto after_signal_handler; + } + signal_delivered = B_TRUE; + + /* + * Preserve the current tsd value of the Linux process stack pointer, + * even if it is zero. We will restore it when we are returned here + * via setcontext() after the Linux process has completed execution of + * its signal handler. 
+ */ + old_tsd_sp = lxtsd->lxtsd_lx_sp; + + /* + * Figure out whether we will be handling this signal on an alternate + * stack specified by the user. + */ + newstack = (lxsap->lxsa_flags & LX_SA_ONSTACK) && + !(lxtsd->lxtsd_sigaltstack.ss_flags & (LX_SS_ONSTACK | + LX_SS_DISABLE)); + + /* + * Find the first unused region of the Linux process stack, where + * we will assemble our signal delivery frame. + */ + flags = (uintptr_t)ucp->uc_brand_data[0]; + if (newstack) { + /* + * We are moving to the user-provided alternate signal + * stack. + */ + lxfp = SA((uintptr_t)lxtsd->lxtsd_sigaltstack.ss_sp) + + SA(lxtsd->lxtsd_sigaltstack.ss_size) - STACK_ALIGN; + lx_debug("lx_sigdeliver: moving to ALTSTACK sp %p\n", lxfp); + LX_SIGNAL_ALTSTACK_ENABLE(lxfp); + } else if (flags & LX_UC_STACK_BRAND) { + /* + * We interrupted the Linux process to take this signal. The + * stack pointer is the one saved in this context. + */ + lxfp = LX_REG(ucp, REG_SP); + } else { + /* + * We interrupted a native (emulation) routine, so we must get + * the current stack pointer from either the tsd (if one is + * stored there) or via the context chain. + * + */ + lxfp = lx_find_brand_sp(); + if (lxtsd->lxtsd_lx_sp != 0) { + /* + * We must also make room for the possibility of nested + * signal delivery -- we may be pre-empting the + * in-progress handling of another signal. 
+ * + * Note that if we were already on the alternate stack, + * any emulated Linux system calls would be betwixt + * that original signal frame and this new one on the + * one contiguous stack, so this logic holds either + * way: + */ + lxfp = MIN(lxtsd->lxtsd_lx_sp, lxfp); + } + } + + /* + * Account for a reserved stack region (for amd64, this is 128 bytes), + * and align the stack: + */ + lxfp -= STACK_RESERVE; + lxfp &= ~(STACK_ALIGN - 1); + + /* + * Allocate space on the Linux process stack for our delivery frame, + * including: + * + * ----------------------------------------------------- old %sp + * - lx_sigdeliver_frame_t + * - (ucontext_t pointers and stack magic) + * ----------------------------------------------------- + * - (amd64-only 8-byte alignment gap) + * ----------------------------------------------------- + * - frame of size "stacksz" from the stack builder + * ----------------------------------------------------- new %sp + */ +#if defined(_LP64) + /* + * The AMD64 ABI requires us to align the stack such that when the + * called function pushes the base pointer, the stack is 16 byte + * aligned. The stack must, therefore, be 8- but _not_ 16-byte + * aligned. 
+ */ +#if (STACK_ALIGN != 16) || (STACK_ENTRY_ALIGN != 8) +#error "lx_sigdeliver() did not find expected stack alignment" +#endif + totsz = SA(sizeof (lx_sigdeliver_frame_t)) + SA(stacksz) + 8; + assert((totsz & (STACK_ENTRY_ALIGN - 1)) == 0); + assert((totsz & (STACK_ALIGN - 1)) == 8); +#else + totsz = SA(sizeof (lx_sigdeliver_frame_t)) + SA(stacksz); + assert((totsz & (STACK_ALIGN - 1)) == 0); +#endif + + /* + * Copy our return frame into place: + */ + lxfp -= SA(sizeof (lx_sigdeliver_frame_t)); + lx_debug("lx_sigdeliver: lx_sigdeliver_frame_t @ %p\n", lxfp); + { + lx_sigdeliver_frame_t frm; + + frm.lxsdf_magic = LX_SIGRT_MAGIC; + frm.lxsdf_retucp = &uc; + frm.lxsdf_sigucp = ucp; + + lx_debug("lx_sigdeliver: retucp %p sigucp %p\n", + frm.lxsdf_retucp, frm.lxsdf_sigucp); + + if (uucopy(&frm, (void *)lxfp, sizeof (frm)) != 0) { + /* + * We could not modify the stack of the emulated Linux + * program. Act like the kernel and terminate the + * program with a segmentation violation. + */ + (void) syscall(SYS_brand, B_EXIT_AS_SIG, SIGSEGV); + } + + LX_SIGNAL_DELIVERY_FRAME_CREATE((void *)lxfp); + } + + /* + * Build the Linux signal handling frame: + */ +#if defined(_LP64) + lxfp -= SA(stacksz) + 8; +#else + lxfp -= SA(stacksz); +#endif + lx_debug("lx_sigdeliver: Linux sig frame @ %p\n", lxfp); + stack_builder(lx_sig, sip, ucp, lxfp, hargs); + + /* + * Record our reservation so that any nested signal handlers + * can see it. + */ + lx_debug("lx_sigdeliver: Linux tsd sp %p -> %p\n", lxtsd->lxtsd_lx_sp, + lxfp); + lxtsd->lxtsd_lx_sp = lxfp; + + if (newstack) { + lxtsd->lxtsd_sigaltstack.ss_flags |= LX_SS_ONSTACK; + } + + LX_SIGDELIVER(lx_sig, lxsap, (void *)lxfp); + + /* + * Re-enable signal delivery. If a signal was queued while we were + * in the critical section, it will be delivered immediately. 
+ */ + _sigon(); + + /* + * Pass control to the Linux signal handler: + */ + lx_debug("lx_sigdeliver: JUMPING TO LINUX (sig %d sp %p eip %p)\n", + lx_sig, lxfp, user_handler); + { + ucontext_t jump_uc; + + bcopy(lx_find_brand_uc(), &jump_uc, sizeof (jump_uc)); + + /* + * We want to load the general registers from this context, and + * switch to the BRAND stack. We do _not_ want to restore the + * uc_link value from this synthetic context, as that would + * break the signal handling context chain. + */ + jump_uc.uc_flags = UC_CPU; + jump_uc.uc_brand_data[0] = (void *)(LX_UC_STACK_BRAND | + LX_UC_IGNORE_LINK); + + LX_REG(&jump_uc, REG_FP) = 0; + LX_REG(&jump_uc, REG_SP) = lxfp; + LX_REG(&jump_uc, REG_PC) = (uintptr_t)user_handler; + +#if defined(_LP64) + /* + * Pass signal handler arguments by registers on AMD64. + */ + LX_REG(&jump_uc, REG_RDI) = hargs[0]; + LX_REG(&jump_uc, REG_RSI) = hargs[1]; + LX_REG(&jump_uc, REG_RDX) = hargs[2]; +#endif + + if (syscall(SYS_brand, B_JUMP_TO_LINUX, &jump_uc) == -1) { + lx_err_fatal("B_JUMP_TO_LINUX failed: %s", + strerror(errno)); + } + } + + assert(0); + +after_signal_handler: + /* + * Ensure all nested signal handlers have completed correctly + * and then remove our stack reservation. + */ + _sigoff(); + LX_SIGNAL_POST_HANDLER(lxfp, old_tsd_sp); + assert(lxtsd->lxtsd_lx_sp == lxfp); + lx_debug("lx_sigdeliver: after; Linux tsd sp %p -> %p\n", lxfp, + old_tsd_sp); + lxtsd->lxtsd_lx_sp = old_tsd_sp; + if (newstack) { + LX_SIGNAL_ALTSTACK_DISABLE(); + lx_debug("lx_sigdeliver: disabling ALTSTACK sp %p\n", lxfp); + lxtsd->lxtsd_sigaltstack.ss_flags &= ~LX_SS_ONSTACK; + } + _sigon(); + + /* + * Here we return to libc so that it may clean up and restore the + * context originally interrupted by this signal. 
*/ - lx_sigdeliver(lx_sig, sip, ucp, stksize, stk_builder, user_handler, gs); } /* @@ -1849,12 +1971,17 @@ lx_sigaction_common(int lx_sig, struct lx_sigaction *lxsp, */ sa.sa_flags = SA_SIGINFO; + /* + * When translating from Linux to illumos + * sigaction(2) flags, we explicitly do not + * pass SA_ONSTACK to the kernel. The + * alternate stack for Linux signal handling is + * handled entirely by the emulation code. + */ if (lxsa.lxsa_flags & LX_SA_NOCLDSTOP) sa.sa_flags |= SA_NOCLDSTOP; if (lxsa.lxsa_flags & LX_SA_NOCLDWAIT) sa.sa_flags |= SA_NOCLDWAIT; - if (lxsa.lxsa_flags & LX_SA_ONSTACK) - sa.sa_flags |= SA_ONSTACK; if (lxsa.lxsa_flags & LX_SA_RESTART) sa.sa_flags |= SA_RESTART; if (lxsa.lxsa_flags & LX_SA_NODEFER) @@ -2079,61 +2206,11 @@ lx_signal(uintptr_t lx_sig, uintptr_t handler) } #endif -#if defined(_ILP32) -/* - * This is only used in 32-bit code and is called by the assembly routine - * lx_sigacthandler. - * - * This C routine saves the passed %gs value into the thread-specific save area. - */ -void -lx_sigsavegs(uintptr_t signalled_gs) -{ - lx_tsd_t *lx_tsd; - int err; - - signalled_gs &= 0xffff; /* gs is only 16 bits */ - - /* - * While a %gs of 0 is technically legal (as long as the application - * never dereferences memory using %gs), Illumos has its own ideas as - * to how a zero %gs should be handled in _update_sregs(), such that - * any 32-bit user process with a %gs of zero running on a system with - * a 64-bit kernel will have its %gs hidden base register stomped on on - * return from a system call, leaving an incorrect base address in - * place until the next time %gs is actually reloaded (forcing a reload - * of the base address from the appropriate descriptor table.) - * - * Of course the kernel will once again stomp on THAT base address when - * returning from a system call, resulting in an application - * segmentation fault. 
- * - * To avoid this situation, disallow a save of a zero %gs here in order - * to try and capture any Linux process that takes a signal with a zero - * %gs installed. - */ - assert(signalled_gs != 0); - - if (signalled_gs != LWPGS_SEL) { - if ((err = thr_getspecific(lx_tsd_key, - (void **)&lx_tsd)) != 0) - lx_err_fatal("sigsavegs: unable to read " - "thread-specific data: %s", strerror(err)); - - assert(lx_tsd != 0); - - lx_tsd->lxtsd_gs = signalled_gs; - lx_debug("lx_sigsavegs(): gsp 0x%p, saved gs: 0x%x\n", - lx_tsd, signalled_gs); - } -} -#endif - int lx_siginit(void) { extern void set_setcontext_enforcement(int); - extern void lx_sigacthandler(int, siginfo_t *, void *); + extern void set_escaped_context_cleanup(int); struct sigaction sa; sigset_t new_set, oset; @@ -2162,38 +2239,6 @@ lx_siginit(void) (void) sigignore(sig); /* - * As mentioned previously, when a user signal handler is installed - * via sigaction(), libc interposes on the mechanism by actually - * installing an internal routine sigacthandler() as the signal - * handler. On receipt of the signal, libc does some thread-related - * processing via sigacthandler(), then calls the registered user - * signal handler on behalf of the user. - * - * For 32-bit code we need to interpose on that mechanism to make sure - * the correct %gs segment register value is installed before the libc - * routine is called, otherwise the libc code will die with a - * segmentation fault. - * - * For 64-bit code we overload the %gs register as a mechanism to pass - * the syscall mode flag out of the kernel. - * - * The private libc routine setsigacthandler() will set our - * interposition routine, lx_sigacthandler(), as the default - * "sigacthandler" routine for all new signal handlers for this - * thread. We also use this in 64-bit code to set the libc interposition - * routine for setting the context when returning from a signal handler. 
- * This is needed so we can combine changing the syscall mode flag and - * doing __setcontext() in one call. - */ -#if defined(_LP64) - setsigacthandler(lx_sigacthandler, &libc_sigacthandler, lx_setcontext); -#else - setsigacthandler(lx_sigacthandler, &libc_sigacthandler, NULL); -#endif - lx_debug("lx_sigacthandler installed, libc_sigacthandler = 0x%p", - libc_sigacthandler); - - /* * Mark any signals that are ignored as ignored in our interposition * handler array */ @@ -2239,7 +2284,17 @@ lx_siginit(void) set_setcontext_enforcement(0); /* - * Reset the signal mask to what we came in with + * The illumos libc attempts to clean up dangling uc_link pointers in + * signal handling contexts when libc believes us to have escaped a + * signal handler incorrectly in the past. We want to disable this + * behaviour, so that the system call emulation context saved by the + * kernel brand module for lx_emulate() may be part of the context + * chain without itself being used for signal handling. + */ + set_escaped_context_cleanup(0); + + /* + * Reset the signal mask to what we came in with. */ (void) sigprocmask(SIG_SETMASK, &oset, NULL); @@ -2248,7 +2303,7 @@ lx_siginit(void) } /* - * This code stongly resemebles lx_poll(), but is here to be able to take + * This code strongly resembles lx_poll(), but is here to be able to take * advantage of the Linux signal helper routines. */ long @@ -2545,11 +2600,9 @@ lx_rt_sigqueueinfo(uintptr_t p1, uintptr_t p2, uintptr_t p3) siginfo.si_pid = lx_siginfo.lsi_pid; siginfo.si_value = lx_siginfo.lsi_value; siginfo.si_uid = lx_siginfo.lsi_uid; - return ((syscall(SYS_brand, B_IKE_SYSCALL + - LX_EMUL_rt_sigqueueinfo, tgid, sig, &siginfo)) ? - (-errno) : 0); + return ((syscall(SYS_brand, B_HELPER_SIGQUEUE, + tgid, sig, &siginfo)) ? 
(-errno) : 0); } - } /* @@ -2587,7 +2640,6 @@ lx_rt_tgsigqueueinfo(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) siginfo.si_value = lx_siginfo.lsi_value; siginfo.si_uid = lx_siginfo.lsi_uid; - return ((syscall(SYS_brand, B_IKE_SYSCALL + - LX_EMUL_rt_tgsigqueueinfo, tgid, tid, sig, &siginfo)) ? - (-errno) : 0); + return ((syscall(SYS_brand, B_HELPER_TGSIGQUEUE, tgid, tid, sig, + &siginfo)) ? (-errno) : 0); } diff --git a/usr/src/lib/brand/lx/lx_brand/common/socket.c b/usr/src/lib/brand/lx/lx_brand/common/socket.c index fa925628e7..b8c2c31582 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/socket.c +++ b/usr/src/lib/brand/lx/lx_brand/common/socket.c @@ -648,14 +648,28 @@ ltos_xform_cmsgs(struct lx_msghdr *msg, struct cmsghdr *ntv_cmsg) static int stol_xform_cmsgs(struct lx_msghdr *msg, lx_cmsghdr64_t *lx_cmsg) { + struct lx_msghdr tmsg; lx_cmsghdr64_t *lcmsg, *last; struct cmsghdr *cmsg, *lp; int nlen = 0; int err = 0; - lcmsg = lx_cmsg; + /* + * Create a temporary "struct lx_msghdr" so that we can use the + * LX_CMSG_*HDR() iteration macros. 
+ */ + tmsg = *msg; + tmsg.msg_control = lx_cmsg; + tmsg.msg_controllen = msg->msg_controllen + LX_CMSG_EXTRA; + + lcmsg = LX_CMSG_FIRSTHDR(&tmsg); cmsg = CMSG_FIRSTHDR(msg); while (cmsg != NULL && err == 0) { + if (lcmsg == NULL) { + err = ENOTSUP; + break; + } + lcmsg->cmsg_len = LX_CMSG_LEN(cmsg->cmsg_len - sizeof (struct cmsghdr)); lcmsg->cmsg_level = cmsg->cmsg_level; @@ -668,12 +682,13 @@ stol_xform_cmsgs(struct lx_msghdr *msg, lx_cmsghdr64_t *lx_cmsg) cmsg = CMSG_NXTHDR(msg, lp); last = lcmsg; - lcmsg = LX_CMSG_NXTHDR(msg, last); + lcmsg = LX_CMSG_NXTHDR(&tmsg, last); nlen += (int)((uint64_t)lcmsg - (uint64_t)last); - if (nlen > (msg->msg_controllen + LX_CMSG_EXTRA)) + if (nlen > (msg->msg_controllen + LX_CMSG_EXTRA)) { err = ENOTSUP; + } } if (err) { @@ -876,7 +891,7 @@ ltos_sockaddr(struct sockaddr *addr, socklen_t *len, case AF_INET6: /* - * The Solaris sockaddr_in6 has one more 32-bit field + * The illumos sockaddr_in6 has one more 32-bit field * than the Linux version. We assume the caller has * zeroed the sockaddr we're copying into. */ @@ -1063,7 +1078,7 @@ convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom, /* * Linux does not allow the app to specify IP Protocol for raw - * sockets. Solaris does, so bail out here. + * sockets. Illumos does, so bail out here. 
*/ if (domain == AF_INET && type == SOCK_RAW && in_protocol == IPPROTO_IP) return (-ESOCKTNOSUPPORT); @@ -1092,25 +1107,25 @@ convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom, static int convert_sockflags(int lx_flags, char *call) { - int solaris_flags = 0; + int native_flags = 0; if (lx_flags & LX_MSG_OOB) { - solaris_flags |= MSG_OOB; + native_flags |= MSG_OOB; lx_flags &= ~LX_MSG_OOB; } if (lx_flags & LX_MSG_PEEK) { - solaris_flags |= MSG_PEEK; + native_flags |= MSG_PEEK; lx_flags &= ~LX_MSG_PEEK; } if (lx_flags & LX_MSG_DONTROUTE) { - solaris_flags |= MSG_DONTROUTE; + native_flags |= MSG_DONTROUTE; lx_flags &= ~LX_MSG_DONTROUTE; } if (lx_flags & LX_MSG_CTRUNC) { - solaris_flags |= MSG_CTRUNC; + native_flags |= MSG_CTRUNC; lx_flags &= ~LX_MSG_CTRUNC; } @@ -1120,22 +1135,22 @@ convert_sockflags(int lx_flags, char *call) } if (lx_flags & LX_MSG_TRUNC) { - solaris_flags |= MSG_TRUNC; + native_flags |= MSG_TRUNC; lx_flags &= ~LX_MSG_TRUNC; } if (lx_flags & LX_MSG_DONTWAIT) { - solaris_flags |= MSG_DONTWAIT; + native_flags |= MSG_DONTWAIT; lx_flags &= ~LX_MSG_DONTWAIT; } if (lx_flags & LX_MSG_EOR) { - solaris_flags |= MSG_EOR; + native_flags |= MSG_EOR; lx_flags &= ~LX_MSG_EOR; } if (lx_flags & LX_MSG_WAITALL) { - solaris_flags |= MSG_WAITALL; + native_flags |= MSG_WAITALL; lx_flags &= ~LX_MSG_WAITALL; } @@ -1200,7 +1215,7 @@ convert_sockflags(int lx_flags, char *call) lx_unsupported("%s: unknown socket flag(s) 0x%x", call, lx_flags); - return (solaris_flags); + return (native_flags); } long @@ -1374,7 +1389,7 @@ lx_accept(int sockfd, void *name, int *nlp) * If it is NULL, we don't care about the namelen pointer's value * or about dereferencing it. * - * Happily, Solaris' accept(3SOCKET) treats NULL name pointers and + * Happily, illumos' accept(3SOCKET) treats NULL name pointers and * zero namelens the same way. 
*/ if ((name != NULL) && @@ -1948,7 +1963,7 @@ lx_getsockopt(int sockfd, int level, int optname, void *optval, int *optlenp) /* * According to the Linux man page, a NULL optval should indicate - * (as in Solaris) that no return value is expected. Instead, it + * (as in illumos) that no return value is expected. Instead, it * actually triggers an EFAULT error. */ if (optval == NULL) @@ -2132,7 +2147,7 @@ lx_sendmsg(int sockfd, void *lmp, int flags) /* * If there are control messages bundled in this message, we need - * to convert them from Linux to Solaris. + * to convert them from Linux to illumos. */ if (msg.msg_control != NULL) { if (msg.msg_controllen == 0) { @@ -2213,6 +2228,7 @@ lx_recvmsg(int sockfd, void *lmp, int flags) void *new_cmsg = NULL; int r, err; socklen_t len, orig_len = 0; + void *msg_control = NULL; int nosigpipe = flags & LX_MSG_NOSIGNAL; struct sigaction newact, oact; @@ -2238,8 +2254,7 @@ lx_recvmsg(int sockfd, void *lmp, int flags) len = sizeof (struct sockaddr); if (getsockname(sockfd, &sname, &len) < 0) len = sizeof (struct sockaddr); - if ((name = SAFE_ALLOCA(len)) == NULL) - return (-ENOMEM); + name = alloca(len); orig_name = msg.msg_name; orig_len = msg.msg_namelen; msg.msg_name = name; @@ -2256,14 +2271,25 @@ lx_recvmsg(int sockfd, void *lmp, int flags) if (msg.msg_controllen == 0) { msg.msg_control = NULL; } else { - msg.msg_control = SAFE_ALLOCA(msg.msg_controllen); - if (msg.msg_control == NULL) - return (-EINVAL); + /* + * Note that control message buffers can be quite + * long, e.g. 128KB or more. The native stack is + * not big enough for these two allocations so we + * use malloc(3C). 
+ */ + lx_debug("\tmsg.msg_controllen = %d", + msg.msg_controllen); + if ((msg_control = malloc(msg.msg_controllen)) == + NULL) { + return (-ENOMEM); + } + msg.msg_control = msg_control; #if defined(_LP64) - new_cmsg = SAFE_ALLOCA(msg.msg_controllen + - LX_CMSG_EXTRA); - if (new_cmsg == NULL) + if ((new_cmsg = malloc(msg.msg_controllen + + LX_CMSG_EXTRA)) == NULL) { + free(msg_control); return (-EINVAL); + } #endif } } @@ -2283,29 +2309,37 @@ lx_recvmsg(int sockfd, void *lmp, int flags) newact.sa_flags = 0; (void) sigemptyset(&newact.sa_mask); - if (sigaction(SIGPIPE, &newact, &oact) < 0) + if (sigaction(SIGPIPE, &newact, &oact) < 0) { lx_err_fatal("recvmsg(): could not ignore SIGPIPE to " "emulate LX_MSG_NOSIGNAL"); + } } r = _so_recvmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2); - if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0)) + if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0)) { lx_err_fatal("recvmsg(): could not reset SIGPIPE handler to " "emulate LX_MSG_NOSIGNAL"); + } if (r >= 0 && msg.msg_controllen >= sizeof (struct cmsghdr)) { /* - * If there are control messages bundled in this message, - * we need to convert them from Linux to Solaris. + * If there are control messages bundled in this message, we + * need to convert them from native illumos to Linux format. 
*/ if ((err = convert_cmsgs(SOL_TO_LX, &msg, new_cmsg, - "recvmsg()")) != 0) + "recvmsg()")) != 0) { + free(msg_control); + free(new_cmsg); return (-err); + } if ((uucopy(msg.msg_control, cmsg, - msg.msg_controllen)) != 0) + msg.msg_controllen)) != 0) { + free(msg_control); + free(new_cmsg); return (-errno); + } } msg.msg_control = cmsg; @@ -2314,8 +2348,11 @@ lx_recvmsg(int sockfd, void *lmp, int flags) if (msg.msg_name != NULL) { err = stol_sockaddr(orig_name, &msg.msg_namelen, msg.msg_name, msg.msg_namelen, orig_len); - if (err != 0) + if (err != 0) { + free(msg_control); + free(new_cmsg); return (-err); + } msg.msg_name = orig_name; } @@ -2324,9 +2361,14 @@ lx_recvmsg(int sockfd, void *lmp, int flags) * call, so copy their values back to the caller. Rather than iterate, * just copy the whole structure back. */ - if (uucopy(&msg, lmp, sizeof (msg)) != 0) + if (uucopy(&msg, lmp, sizeof (msg)) != 0) { + free(msg_control); + free(new_cmsg); return (-errno); + } + free(msg_control); + free(new_cmsg); return ((r < 0) ? -errno : r); } diff --git a/usr/src/lib/brand/lx/lx_brand/common/stack.c b/usr/src/lib/brand/lx/lx_brand/common/stack.c new file mode 100644 index 0000000000..6ddb2c1527 --- /dev/null +++ b/usr/src/lib/brand/lx/lx_brand/common/stack.c @@ -0,0 +1,280 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Manage the native/emulation stack for LX-branded LWPs. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <strings.h> +#include <errno.h> + +#include <thread.h> +#include <sys/mman.h> +#include <sys/brand.h> +#include <sys/syscall.h> + +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_debug.h> +#include <sys/lx_thread.h> + + +typedef struct lx_stack_list_ent { + thread_t sle_tid; + void *sle_stack; + size_t sle_stack_size; + lx_tsd_t *sle_tsd; +} lx_stack_list_ent_t; + +static mutex_t lx_stack_list_lock = DEFAULTMUTEX; +lx_stack_list_ent_t *lx_stack_list = NULL; +unsigned int lx_stack_list_elems = 0; + +/* + * Usermode emulation alternate stack size, expressed as a page count: + */ +int lx_native_stack_page_count = LX_NATIVE_STACK_PAGE_COUNT; + +/* + * We use these private functions from libc to suspend signal delivery in + * critical sections: + */ +extern void _sigon(void); +extern void _sigoff(void); + +/* + * Free the alternate stack for this thread. + */ +void +lx_free_stack(void) +{ + thread_t me = thr_self(); + int i; + + _sigoff(); + mutex_lock(&lx_stack_list_lock); + + /* + * Find this thread's stack in the list of stacks. + */ + for (i = 0; i < lx_stack_list_elems; i++) { + if (lx_stack_list[i].sle_tid != me) { + continue; + } + + (void) munmap(lx_stack_list[i].sle_stack, + lx_stack_list[i].sle_stack_size); + + /* + * Free the thread-specific data structure for this thread. + */ + if (lx_stack_list[i].sle_tsd != NULL) { + free(lx_stack_list[i].sle_tsd->lxtsd_clone_state); + free(lx_stack_list[i].sle_tsd); + } + + /* + * Free up this stack list entry: + */ + bzero(&lx_stack_list[i], sizeof (lx_stack_list[i])); + + mutex_unlock(&lx_stack_list_lock); + _sigon(); + return; + } + + /* + * Did not find the stack in the list. + */ + assert(0); +} + +/* + * After fork1(), we must unmap the stack of every thread other than the + * one copied into the child process. 
+ */ +void +lx_free_other_stacks(void) +{ + int i, this_stack = -1; + thread_t me = thr_self(); + + _sigoff(); + mutex_lock(&lx_stack_list_lock); + + for (i = 0; i < lx_stack_list_elems; i++) { + if (lx_stack_list[i].sle_tid == me) { + /* + * Do not unmap the stack for this LWP. + */ + this_stack = i; + continue; + } else if (lx_stack_list[i].sle_tid == 0) { + /* + * Skip any holes in the list. + */ + continue; + } + + /* + * Free the thread-specific data structure for this thread. + */ + if (lx_stack_list[i].sle_tsd != NULL) { + free(lx_stack_list[i].sle_tsd->lxtsd_clone_state); + free(lx_stack_list[i].sle_tsd); + } + + /* + * Unmap the stack of every other LWP. + */ + (void) munmap(lx_stack_list[i].sle_stack, + lx_stack_list[i].sle_stack_size); + } + /* + * Did not find the stack for this LWP in the list. + */ + assert(this_stack != -1); + + /* + * Ensure the stack data for this LWP is in the first slot and shrink + * the list. + */ + if (this_stack != 0) { + lx_stack_list[0] = lx_stack_list[this_stack]; + } + lx_stack_list_elems = 1; + lx_stack_list = realloc(lx_stack_list, lx_stack_list_elems * + sizeof (lx_stack_list[0])); + if (lx_stack_list == NULL) { + lx_err_fatal("failed to shrink stack list: %s", + strerror(errno)); + } + + mutex_unlock(&lx_stack_list_lock); + _sigon(); +} + +/* + * Allocate an alternate stack for the execution of native emulation routines. + * This routine is based, in part, on find_stack() from libc. + */ +int +lx_alloc_stack(void **nstack, size_t *nstack_size) +{ + static int pagesize = 0; + static int stackprot = 0; + int stacksize = 0; + void *stack; + + /* + * Fetch configuration once: + */ + if (pagesize == 0) { + pagesize = _sysconf(_SC_PAGESIZE); + assert(pagesize > 0); + } + if (stackprot == 0) { + long lprot = _sysconf(_SC_STACK_PROT); + + stackprot = lprot > 0 ? 
lprot : (PROT_READ | PROT_WRITE); + } + + stacksize = lx_native_stack_page_count * pagesize; + + if ((stack = mmap(NULL, stacksize, stackprot, MAP_PRIVATE | + MAP_NORESERVE | MAP_ANON, -1, (off_t)0)) == MAP_FAILED) { + int en = errno; + lx_debug("lx_alloc_stack: failed to allocate stack: %s", + strerror(errno)); + errno = en; + return (-1); + } + +#if DEBUG + /* + * Write a recognisable pattern into the allocated stack pages. + */ + for (pos = 0; pos < ((stacksize - 1) / 4); pos++) { + ((uint32_t *)stack)[pos] = 0x0facade0; + } +#endif + + *nstack = stack; + *nstack_size = stacksize; + + return (0); +} + +/* + * Configure the in-kernel brand-specific LWP data with the native stack + * pointer for this thread. If a stack is not passed, allocate one first. + */ +void +lx_install_stack(void *stack, size_t stacksize, lx_tsd_t *tsd) +{ + thread_t me = thr_self(); + int i; + uintptr_t stack_top; + + if (stack == NULL) { + /* + * If we were not passed a stack, then allocate one: + */ + if (lx_alloc_stack(&stack, &stacksize) == -1) { + lx_err_fatal("failed to allocate stack for thread " + "%d: %s", me, strerror(errno)); + } + } + + /* + * Install the stack in the global list of thread stacks. 
+ */ + _sigoff(); + mutex_lock(&lx_stack_list_lock); + + for (i = 0; i < lx_stack_list_elems; i++) { + assert(lx_stack_list[i].sle_tid != me); + if (lx_stack_list[i].sle_tid == 0) + break; + } + if (i >= lx_stack_list_elems) { + lx_stack_list_elems++; + lx_stack_list = realloc(lx_stack_list, lx_stack_list_elems * + sizeof (lx_stack_list[0])); + if (lx_stack_list == NULL) { + lx_err_fatal("failed to extend stack list: %s", + strerror(errno)); + } + } + lx_stack_list[i].sle_tid = me; + lx_stack_list[i].sle_stack = stack; + lx_stack_list[i].sle_stack_size = stacksize; + lx_stack_list[i].sle_tsd = tsd; + + mutex_unlock(&lx_stack_list_lock); + _sigon(); + + /* + * Inform the kernel of the location of the brand emulation + * stack for this LWP: + */ + stack_top = (uintptr_t)stack + stacksize; + lx_debug("stack %p stack_top %p\n", stack, stack_top); + if (syscall(SYS_brand, B_SET_NATIVE_STACK, stack_top) != 0) { + lx_err_fatal("unable to set native stack: %s", strerror(errno)); + } +} diff --git a/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s b/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s index 2b382c9f76..bce7f0005c 100644 --- a/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s +++ b/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s @@ -21,7 +21,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -41,39 +41,6 @@ #define LX_SYS_sigreturn 119 #define LX_SYS_rt_sigreturn 173 -#define PIC_SETUP(r) \ - call 9f; \ -9: popl r; \ - addl $_GLOBAL_OFFSET_TABLE_ + [. - 9b], r - -/* - * Each JMP must occupy 16 bytes - */ -#define JMP \ - pushl $_CONST(. 
- lx_handler_table); \ - jmp lx_handler; \ - .align 16; - -#define JMP4 JMP; JMP; JMP; JMP -#define JMP16 JMP4; JMP4; JMP4; JMP4 -#define JMP64 JMP16; JMP16; JMP16; JMP16 -#define JMP256 JMP64; JMP64; JMP64; JMP64 - -/* - * Alternate jump table that turns on lx_traceflag before proceeding with - * the normal emulation routine. - */ -#define TJMP \ - pushl $_CONST(. - lx_handler_trace_table); \ - jmp lx_handler_trace; \ - .align 16; - -#define TJMP4 TJMP; TJMP; TJMP; TJMP -#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4 -#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16 -#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64 - - #if defined(lint) #include <sys/types.h> @@ -81,31 +48,6 @@ #include <sys/signal.h> void -lx_handler_table(void) -{} - -void -lx_handler(void) -{} - -/* ARGSUSED */ -void -lx_setup_clone(uintptr_t gs, void *retaddr, void *stk) -{} - -/* ARGSUSED */ -void -lx_sigdeliver(int sig, siginfo_t *sip, void *p, size_t stacksz, - void (*stack_frame_builder)(void), void (*lx_sighandler)(void), - uintptr_t gs) -{} - -/* ARGSUSED */ -void -lx_sigacthandler(int sig, siginfo_t *s, void *p) -{} - -void lx_sigreturn_tramp(void) {} @@ -113,134 +55,8 @@ void lx_rt_sigreturn_tramp(void) {} -/* ARGSUSED */ -void -lx_sigreturn_tolibc(uintptr_t sp) -{} - #else /* lint */ - /* - * On entry to this table, %eax will hold the return address. The - * location where we enter the table is a function of the system - * call number. The table needs the same alignment as the individual - * entries. - */ - .align 16 - ENTRY_NP(lx_handler_trace_table) - TJMP256 - TJMP64 - TJMP64 - SET_SIZE(lx_handler_trace_table) - - .align 16 - ENTRY_NP(lx_handler_table) - JMP256 - JMP64 - JMP64 - SET_SIZE(lx_handler_table) - - ENTRY_NP(lx_handler_trace) - pushl %esi - PIC_SETUP(%esi) - movl lx_traceflag@GOT(%esi), %esi - movl $1, (%esi) - popl %esi - /* - * While we could just fall through to lx_handler(), we "tail-call" it - * instead to make ourselves a little more comprehensible to trace - * tools. 
- */ - jmp lx_handler - SET_SIZE(lx_handler_trace) - - ALTENTRY(lx_handler) - /* - * %ebp isn't always going to be a frame pointer on Linux, but when - * it is, saving it here lets us have a coherent stack backtrace. - */ - pushl %ebp - - /* - * Fill in a lx_regs_t structure on the stack. - */ - subl $SIZEOF_LX_REGS_T, %esp - - /* - * Save %ebp and then fill it with what would be its usual value as - * the frame pointer. The value we save for %esp needs to be the - * stack pointer at the time of the interrupt so we need to skip the - * saved %ebp and (what will be) the return address. - */ - movl %ebp, LXR_EBP(%esp) - movl %esp, %ebp - addl $_CONST(SIZEOF_LX_REGS_T), %ebp - movl %ebp, LXR_ESP(%esp) - addl $_CONST(_MUL(CPTRSIZE, 2)), LXR_ESP(%esp) - - movl $0, LXR_GS(%esp) - movw %gs, LXR_GS(%esp) - movl %edi, LXR_EDI(%esp) - movl %esi, LXR_ESI(%esp) - movl %ebx, LXR_EBX(%esp) - movl %edx, LXR_EDX(%esp) - movl %ecx, LXR_ECX(%esp) - movl %eax, LXR_EIP(%esp) - - /* - * The kernel drops us into the middle of one of the tables above - * that then pushes that table offset onto the stack, and calls into - * lx_handler. That offset indicates the system call number while - * %eax holds the return address for the system call. We replace the - * value on the stack with the return address, and use the value to - * compute the system call number by dividing by the table entry size. - */ - xchgl CPTRSIZE(%ebp), %eax - shrl $4, %eax - movl %eax, LXR_EAX(%esp) - - /* - * Switch to the Solaris libc's %gs. - */ - movl $LWPGS_SEL, %ebx - movw %bx, %gs - - /* - * Call lx_emulate() whose only argument is a pointer to the - * lx_regs_t structure we've placed on the stack. - */ - pushl %esp - call lx_emulate - - /* - * We use this global symbol to identify this return site when - * walking the stack backtrace. It needs to remain immediately - * after the call to lx_emulate(). - */ - ALTENTRY(lx_emulate_done) - - /* - * Clean up the argument to lx_emulate(). 
- */ - addl $4, %esp - - /* - * Restore the saved register state; we get %ebp, %esp and %esp from - * the ordinary locations rather than the saved state. - */ - movl LXR_EDI(%esp), %edi - movl LXR_ESI(%esp), %esi - movl LXR_EBX(%esp), %ebx - movl LXR_EDX(%esp), %edx - movl LXR_ECX(%esp), %ecx - movl LXR_EAX(%esp), %eax - movw LXR_GS(%esp), %gs - - movl %ebp, %esp - popl %ebp - ret - SET_SIZE(lx_handler) - ENTRY_NP(lx_swap_gs) push %eax /* save the current eax value */ movl 0xc(%esp),%eax /* 2nd param is a pointer */ @@ -251,102 +67,6 @@ lx_sigreturn_tolibc(uintptr_t sp) ret SET_SIZE(lx_swap_gs) - ENTRY_NP(lx_setup_clone) - xorl %ebp, %ebp /* terminating stack */ - popl %edx /* eat the clone_start() return address */ - popl %gs /* Switch back to the Linux libc's %gs */ - popl %edx /* Linux clone() return address */ - popl %esp /* New stack pointer */ - xorl %eax, %eax /* child returns 0 to SYS_clone() */ - jmp *%edx /* return to Linux app. */ - SET_SIZE(lx_setup_clone) - - /* - * lx_sigdeliver(sig, siginfo_t *, ucontext_t *, stack_size, - * stack_build_routine, signal_handler, glibc_gs) - * - * This routine allocates stack space for the Linux signal stack, - * calls a routine to build the signal stack and then calls the Linux - * signal handler. This is written in assembly because of the way - * we need to directly manipulate the stack and pass the resulting - * stack to the signal handler with the Linux signal stack on top. - * - * When the Linux signal handler is called, the stack will look - * like this: - * - * ================================================= - * | | %ebp | - * | ================================================= - * | | LX_SIGRT_MAGIC | - * | ================================================= - * V | Linux signal frame built by lx_stackbuilder() | - * ================================================= - * - * The stack frame (%ebp) will be reset to its original value (i.e. the - * previous frame) on entry to the Linux signal handler. 
- */ - ENTRY_NP(lx_sigdeliver) - pushl %ebp - movl %esp, %ebp - movl 16(%ebp), %edx /* pointer to Solaris ucontext_t */ - pushl %edx /* save ucontext_t ptr for later */ - pushl $LX_SIGRT_MAGIC /* marker value for lx_(rt)_sigreturn */ - - subl 20(%ebp), %esp /* create stack_size stack buffer */ - pushl %esp /* push stack pointer */ - pushl %edx /* push pointer to ucontext_t */ - pushl 12(%ebp) /* push pointer to siginfo_t */ - pushl 8(%ebp) /* push signal number */ - call *24(%ebp) /* lx_stackbuilder(sig, sip, ucp, sp) */ - add $16, %esp /* remove args from stack */ - movw 32(%ebp), %gs /* only low 16 bits are used */ - - mov 4(%ebp),%eax /* fetch old %ebp from stack */ - mov 28(%ebp), %edx /* get address of Linux handler */ - mov %eax, %ebp /* restore old %ebp */ - jmp *%edx /* jmp to the Linux signal handler */ - SET_SIZE(lx_sigdeliver) - - /* - * Due to the nature of signals, we need to be able to force the %gs - * value to that used by Solaris by running any Solaris code. - * - * This routine does that, then calls a C routine that will save the - * %gs value at the time of the signal off into a thread-specific data - * structure. Finally, we trampoline to the libc code that would - * normally interpose itself before calling a signal handler. - * - * The libc routine that calls user signal handlers ends with a - * setcontext, so we would never return here even if we used a call - * rather than a jmp. - * - * %esi is used for the PIC as it is guaranteed by the 386 ABI to - * survive the call to lx_sigsavegs. The downside is we must also - * preserve its value for our caller. - * - * Note that because lx_sigsavegs and libc_sigacthandler are externs, - * they need to be dereferenced via the GOT. - * - * IMPORTANT: Because libc apparently gets upset if extra data is - * left on its stack, this routine needs to be crafted - * in assembly so that the jmp to the libc interposer - * doesn't leave any cruft lying around. 
- */ - ENTRY_NP(lx_sigacthandler) - pushl %esi /* save %esi */ - pushl %gs /* push the Linux %gs */ - pushl $LWPGS_SEL - popl %gs /* install the Solaris %gs */ - - PIC_SETUP(%esi) - movl lx_sigsavegs@GOT(%esi), %eax - call *%eax /* save the Linux %gs */ - movl libc_sigacthandler@GOT(%esi), %eax - add $4, %esp /* clear Linux %gs from stack */ - popl %esi /* restore %esi */ - jmp *(%eax) /* jmp to libc's interposer */ - SET_SIZE(lx_sigacthandler) - /* * Trampoline code is called by the return at the end of a Linux * signal handler to return control to the interrupted application @@ -370,15 +90,4 @@ lx_sigreturn_tolibc(uintptr_t sp) movl $LX_SYS_rt_sigreturn, %eax int $0x80 SET_SIZE(lx_rt_sigreturn_tramp) - - /* - * Manipulate the stack in the way necessary for it to appear to libc - * that the signal handler it invoked via call_user_handler() is - * returning. - */ - ENTRY_NP(lx_sigreturn_tolibc) - movl 4(%esp), %esp /* set %esp to passed value */ - popl %ebp /* restore proper %ebp */ - ret /* return to lx_call_user_handler */ - SET_SIZE(lx_sigreturn_tolibc) #endif /* lint */ diff --git a/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s b/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s deleted file mode 100644 index a90bc5621b..0000000000 --- a/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s +++ /dev/null @@ -1,60 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. - */ - -#include <sys/asm_linkage.h> - -#if defined(lint) - -/*ARGSUSED*/ -void -lx_runexe(void *argv, void *entry) -{ -} - -#else /* lint */ - - /* - * Set our stack pointer, clear the general registers, - * and jump to the brand linker's entry point. - */ - ENTRY_NP(lx_runexe) - movl 4(%esp), %eax / %eax = &argv[0] - movl 8(%esp), %ebx / Brand linker's entry point in %ebx - subl $4, %eax / Top of stack - must point at argc - movl %eax, %esp / Set %esp to what linkers expect - - movl $0, %eax - movl $0, %ecx - movl $0, %edx - movl $0, %esi - movl $0, %edi - movl $0, %ebp - - jmp *%ebx / And away we go... - SET_SIZE(lx_runexe) - -#endif /* lint */ diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h index f50535d0c4..bed6a8da4b 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h @@ -39,6 +39,7 @@ #include <sys/lwp.h> #include <sys/lx_brand.h> +#include <sys/lx_thread.h> #ifdef __cplusplus extern "C" { @@ -55,13 +56,6 @@ extern int lx_rpm_delay; extern boolean_t lx_is_rpm; /* - * These thread-specific variables allow the signal interposition code - * to communicate restart disposition for any interrupting signals. 
- */ -extern __thread int lx_had_sigchild; -extern __thread int lx_do_syscall_restart; - -/* * Values Linux expects for init */ #define LX_INIT_PGID 0 @@ -151,6 +145,11 @@ extern __thread int lx_do_syscall_restart; B_TRACE_POINT_5(0, 0, 0, 0, 0) /* + * Macros to access register state within a ucontext_t: + */ +#define LX_REG(ucp, r) ((ucp)->uc_mcontext.gregs[(r)]) + +/* * normally we never want to write to stderr or stdout because it's unsafe * to make assumptions about the underlying file descriptors. to protect * against writes to these file descriptors we go ahead and close them @@ -166,10 +165,9 @@ extern void lx_unsupported(char *, ...); struct ucontext; -extern void lx_handler_table(void); -extern void lx_handler_trace_table(void); -extern void lx_emulate_done(void); -extern lx_regs_t *lx_syscall_regs(void); +extern ucontext_t *lx_syscall_regs(void); +extern uintptr_t lx_find_brand_sp(void); +extern const ucontext_t *lx_find_brand_uc(void); extern int lx_errno(int); extern char *lx_fd_to_path(int fd, char *buf, int buf_size); @@ -179,7 +177,7 @@ extern int lx_lpid_to_spid(pid_t, pid_t *); extern void lx_ptrace_init(); extern int lx_ptrace_wait(siginfo_t *); extern void lx_ptrace_fork(void); -extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg); +extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg, ucontext_t *); extern void lx_ptrace_clone_begin(int, boolean_t); extern int lx_check_alloca(size_t); @@ -187,6 +185,12 @@ extern int lx_check_alloca(size_t); extern int ltos_at_flag(int lflag, int allow, boolean_t enforce); +extern void lx_init_tsd(lx_tsd_t *); +extern int lx_alloc_stack(void **, size_t *); +extern void lx_install_stack(void *, size_t, lx_tsd_t *); +extern void lx_free_stack(void); +extern void lx_free_other_stacks(void); + /* * NO_UUCOPY disables calls to the uucopy* system calls to help with * debugging brand library accesses to linux application memory. 
@@ -201,6 +205,13 @@ int uucopystr_unsafe(const void *src, void *dst, size_t n); #endif /* NO_UUCOPY */ +/* + * We use these Private libc interfaces to defer signals during critical + * sections. + */ +extern void _sigon(void); +extern void _sigoff(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h index f3d39fca64..3c612d9ab8 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h @@ -30,6 +30,7 @@ #if !defined(_ASM) #include <sys/lx_types.h> #include <sys/ucontext.h> +#include <sys/lx_siginfo.h> #include <lx_signum.h> #endif /* !defined(_ASM) */ @@ -118,93 +119,6 @@ typedef struct lx_osigaction { #define USE_OSIGSET 0 #define USE_SIGSET 1 -#define LX_SI_MAX_SIZE 128 -#if defined(_LP64) -/* - * Because of the odd number (3) of ints before the union, we need to account - * for the smaller padding needed on x64 due to the union being offset to an 8 - * byte boundary. - */ -#define LX_SI_PAD_SIZE ((LX_SI_MAX_SIZE/sizeof (int)) - 4) - -#else -#define LX_SI_PAD_SIZE ((LX_SI_MAX_SIZE/sizeof (int)) - 3) -#endif - -typedef struct lx_siginfo { - int lsi_signo; - int lsi_errno; - int lsi_code; - union { - int _pad[LX_SI_PAD_SIZE]; - - struct { - pid_t _pid; - lx_uid16_t _uid; - } _kill; - - struct { - uint_t _timer1; - uint_t _timer2; - } _timer; - - struct { - pid_t _pid; /* sender's pid */ - lx_uid16_t _uid; /* sender's uid */ - union sigval _sigval; - } _rt; - - struct { - pid_t _pid; /* which child */ - lx_uid16_t _uid; /* sender's uid */ - int _status; /* exit code */ - clock_t _utime; - clock_t _stime; - } _sigchld; - - struct { - void *_addr; /* faulting insn/memory ref. 
*/ - } _sigfault; - - struct { - int _band; /* POLL_IN,POLL_OUT,POLL_MSG */ - int _fd; - } _sigpoll; - } _sifields; -} lx_siginfo_t; - -/* - * lx_siginfo_t lsi_code values - * - * LX_SI_ASYNCNL: Sent by asynch name lookup completion - * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads - * LX_SI_SIGIO: Sent by queued SIGIO - * LX_SI_ASYNCIO: Sent by asynchronous I/O completion - * LX_SI_MESGQ: Sent by real time message queue state change - * LX_SI_TIMER: Sent by timer expiration - * LX_SI_QUEUE: Sent by sigqueue - * LX_SI_USER: Sent by kill, sigsend, raise, etc. - * LX_SI_KERNEL: Sent by kernel - * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to - * illumos errors, if there is no translation available, this value - * should be used. This value should have no meaning as an si_code in - * illumos or Linux. - * - * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by - * BrandZ. - */ -#define LX_SI_CODE_NOT_EXIST (-61) -#define LX_SI_ASYNCNL (-60) -#define LX_SI_DETHREAD (-7) -#define LX_SI_TKILL (-6) -#define LX_SI_SIGIO (-5) -#define LX_SI_ASYNCIO (-4) -#define LX_SI_MESGQ (-3) -#define LX_SI_TIMER (-2) -#define LX_SI_QUEUE (-1) -#define LX_SI_USER (0) -#define LX_SI_KERNEL (0x80) - typedef struct lx_sighandlers { struct lx_sigaction lx_sa[LX_NSIG + 1]; } lx_sighandlers_t; @@ -370,18 +284,6 @@ typedef struct lx_ucontext { lx_sigset_t uc_sigmask; } lx_ucontext_t; -#define lsi_pid _sifields._kill._pid -#define lsi_uid _sifields._kill._uid -#define lsi_status _sifields._sigchld._status -#define lsi_utime _sifields._sigchld._utime -#define lsi_stime _sifields._sigchld._stime -#define lsi_value _sifields._rt._sigval -#define lsi_int _sifields._rt._sigval.sivalx_int -#define lsi_ptr _sifields._rt._sigval.sivalx_ptr -#define lsi_addr _sifields._sigfault._addr -#define lsi_band _sifields._sigpoll._band -#define lsi_fd _sifields._sigpoll._fd - extern const int ltos_signo[]; extern const int stol_signo[]; @@ -391,10 +293,6 @@ 
extern void setsigacthandler(void (*)(int, siginfo_t *, void *), extern int lx_siginit(void); -extern void lx_sigreturn_tolibc(uintptr_t); -extern void lx_sigdeliver(int, siginfo_t *, void *, size_t, void (*)(), - void (*)(), uintptr_t); - extern int stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop); extern int stol_status(int); diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h index a070bb69b6..4cc72ba0c6 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h @@ -74,7 +74,6 @@ extern long lx_readlink(uintptr_t, uintptr_t, uintptr_t); extern long lx_readdir(uintptr_t, uintptr_t, uintptr_t); extern long lx_getdents(uintptr_t, uintptr_t, uintptr_t); extern long lx_getdents64(uintptr_t, uintptr_t, uintptr_t); -extern long lx_getpid(void); extern long lx_execve(uintptr_t, uintptr_t, uintptr_t); extern long lx_dup2(uintptr_t, uintptr_t); extern long lx_dup3(uintptr_t, uintptr_t, uintptr_t); @@ -132,12 +131,6 @@ extern long lx_getpgid(uintptr_t); extern long lx_setpgid(uintptr_t, uintptr_t); extern long lx_getsid(uintptr_t); extern long lx_setsid(void); -extern long lx_setgroups(uintptr_t, uintptr_t); - - -extern long lx_waitpid(uintptr_t, uintptr_t, uintptr_t); -extern long lx_waitid(uintptr_t, uintptr_t, uintptr_t, uintptr_t); -extern long lx_wait4(uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern long lx_getuid16(void); extern long lx_getgid16(void); @@ -183,7 +176,6 @@ extern long lx_ftruncate64(uintptr_t, uintptr_t, uintptr_t); extern long lx_sysctl(uintptr_t); extern long lx_fsync(uintptr_t); extern long lx_fdatasync(uintptr_t); -extern long lx_pipe2(uintptr_t, uintptr_t); extern long lx_link(uintptr_t, uintptr_t); extern long lx_unlink(uintptr_t); extern long lx_rmdir(uintptr_t); @@ -204,6 +196,7 @@ extern long lx_getcwd(uintptr_t, uintptr_t); extern long lx_uname(uintptr_t); extern long lx_reboot(uintptr_t, uintptr_t, uintptr_t, 
uintptr_t); extern long lx_getgroups16(uintptr_t, uintptr_t); +extern long lx_setgroups(uintptr_t, uintptr_t); extern long lx_setgroups16(uintptr_t, uintptr_t); extern long lx_personality(uintptr_t); @@ -312,7 +305,6 @@ extern long lx_shmat(int, void *, int); extern long lx_shmctl(int, int, void *); extern long lx_prctl(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); -extern long lx_arch_prctl(int, uintptr_t); extern long lx_alarm(unsigned int); extern long lx_close(int); @@ -351,59 +343,36 @@ extern long lx_shmdt(char *); extern long lx_stime(const time_t *); extern long lx_symlink(const char *, const char *); extern long lx_syslog(int, char *, int); -extern long lx_sysinfo32(uintptr_t); extern long lx_timerfd_create(int, int); extern long lx_timerfd_settime(int, int, const struct itimerspec *, struct itimerspec *); extern long lx_timerfd_gettime(int, struct itimerspec *); extern long lx_umask(mode_t); extern long lx_utimes(const char *, const struct timeval *); -extern long lx_write(int, const void *, size_t); -extern long lx_yield(void); #endif /* !defined(_ASM) */ -/* - * Constants for the In-Kernel Emulation table. 
- */ -#define LX_EMUL_getpid 1 -#define LX_EMUL_kill 2 -#define LX_EMUL_pipe 3 -#define LX_EMUL_brk 4 -#define LX_EMUL_getppid 5 -#define LX_EMUL_sysinfo 6 -#define LX_EMUL_clone 7 -#define LX_EMUL_modify_ldt 8 -#define LX_EMUL_sched_setparam 9 -#define LX_EMUL_sched_getparam 10 -#define LX_EMUL_sched_rr_get_interval 11 -#define LX_EMUL_setresuid16 12 -#define LX_EMUL_setresgid16 13 -#define LX_EMUL_rt_sigqueueinfo 14 -#define LX_EMUL_setgroups 15 -#define LX_EMUL_setresuid 16 -#define LX_EMUL_setresgid 17 -#define LX_EMUL_gettid 18 -#define LX_EMUL_tkill 19 -#define LX_EMUL_futex 20 -#define LX_EMUL_set_thread_area 21 -#define LX_EMUL_get_thread_area 22 -#define LX_EMUL_set_tid_address 23 -#define LX_EMUL_pipe2 24 -#define LX_EMUL_rt_tgsigqueueinfo 25 -#define LX_EMUL_arch_prctl 26 -#define LX_EMUL_tgkill 27 -#define LX_EMUL_read 28 -#define LX_EMUL_ioctl LX_N_IKE_FUNCS - -/* Note: adjust LX_N_IKE_FUNCS when adding new in-kernel functions */ - -/* Linux vsyscall addresses */ #if defined(_LP64) +/* + * Linux vsyscall addresses: + */ #define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000 #define LX_VSYS_time (uintptr_t)0xffffffffff600400 #define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800 + +/* + * System call numbers for vsyscall revectoring: + */ +#define LX_SYS_gettimeofday 96 +#define LX_SYS_time 201 +#define LX_SYS_getcpu 309 +#endif + +#if defined(_LP64) +#define LX_SYS_clone 56 +#else +#define LX_SYS_clone 120 #endif #ifdef __cplusplus diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h index 3d7b9018e1..fae81c9fc9 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h @@ -32,6 +32,7 @@ extern "C" { #endif +#include <sys/lx_signal.h> #include <thread.h> typedef enum lx_exit_type { @@ -41,23 +42,32 @@ typedef enum lx_exit_type { } lx_exit_type_t; typedef struct lx_tsd { -#if defined(_ILP32) - /* 32-bit thread-specific Linux %gs value */ - 
uintptr_t lxtsd_gs; -#else - /* 64-bit thread-specific Linux %fsbase value */ - uintptr_t lxtsd_fsbase; -#endif lx_exit_type_t lxtsd_exit; int lxtsd_exit_status; ucontext_t lxtsd_exit_context; + + /* + * If this value is non-zero, we use it in lx_sigdeliver() to represent + * the in-use extent of the Linux (i.e. BRAND) stack for this thread. + * Access to this value must be protected by _sigoff()/_sigon(). + */ + uintptr_t lxtsd_lx_sp; + + /* + * Alternate stack for Linux sigaltstack emulation: + */ + lx_stack_t lxtsd_sigaltstack; + + void *lxtsd_clone_state; } lx_tsd_t; extern thread_key_t lx_tsd_key; extern void lx_swap_gs(long, long *); -extern void lx_exit_common(lx_exit_type_t, uintptr_t) __NORETURN; +extern void lx_exit_common(void) __NORETURN; + +extern lx_tsd_t *lx_get_tsd(void); #ifdef __cplusplus } diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h index a56fe8eeb3..33704bffb6 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h @@ -27,8 +27,6 @@ #ifndef _LX_THUNK_SERVER_H #define _LX_THUNK_SERVER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -133,7 +131,6 @@ typedef struct lxt_syslog_arg { * thunk server process. 
*/ void lxt_server_init(int, char *[]); -int lxt_server_pid(int *pid); void lxt_server_exec_check(void); #ifdef __cplusplus diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 3f61d448e5..f4eea53408 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -2954,6 +2954,7 @@ $endif scrwidth; semctl64; _semctl64; + set_escaped_context_cleanup; set_setcontext_enforcement; _setbufend; __set_errno; diff --git a/usr/src/lib/libc/port/threads/sigaction.c b/usr/src/lib/libc/port/threads/sigaction.c index dd7e6159fb..09be90e54f 100644 --- a/usr/src/lib/libc/port/threads/sigaction.c +++ b/usr/src/lib/libc/port/threads/sigaction.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include "lint.h" @@ -285,6 +285,24 @@ take_deferred_signal(int sig) thr_panic("take_deferred_signal(): __sigresend() failed"); } +/* + * sigacthandler() attempts to clean up dangling uc_link pointers in + * signal handling contexts when libc believes us to have escaped + * a signal handler incorrectly in the past. + * + * Branded processes have a legitimate use for a chain including contexts + * other than those used for signal handling when tracking emulation + * requests from the kernel. We allow them to disable this cleanup + * behaviour. + */ +static int escaped_context_cleanup = 1; + +void +set_escaped_context_cleanup(int on) +{ + escaped_context_cleanup = on; +} + void sigacthandler(int sig, siginfo_t *sip, void *uvp) { @@ -307,7 +325,7 @@ sigacthandler(int sig, siginfo_t *sip, void *uvp) * we are actually executing at main level (self->ul_siglink == NULL). * See the code for setjmp()/longjmp() for more details. 
*/ - if (self->ul_siglink == NULL) + if (escaped_context_cleanup && self->ul_siglink == NULL) ucp->uc_link = NULL; /* diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c index b4e38f062a..510626d220 100644 --- a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ @@ -35,6 +35,7 @@ #include <sys/dtrace.h> #include <sys/dtrace_impl.h> +#include <sys/lx_brand.h> #include <sys/lx_impl.h> /* @@ -59,696 +60,6 @@ #define LX_SYSTRACE_ENTRY_AFRAMES 2 #define LX_SYSTRACE_RETURN_AFRAMES 4 -typedef struct lx_sys_names { - char *sy_name; -} lx_sys_names_t; - -static lx_sys_names_t lx_sysnames32[] = -{ - {"lx_nosys"}, /* 0 */ - {"exit"}, /* 1 */ - {"lx_fork"}, - {"read"}, - {"write"}, - {"open"}, - {"close"}, - {"waitpid"}, - {"creat"}, - {"link"}, - {"unlink"}, /* 10 */ - {"exec"}, - {"chdir"}, - {"gtime"}, - {"mknod"}, - {"chmod"}, - {"lchown16"}, - {"break"}, - {"stat"}, - {"lseek"}, - {"getpid"}, /* 20 */ - {"mount"}, - {"umount"}, - {"setuid16"}, - {"getuid16"}, - {"stime"}, - {"ptrace"}, - {"alarm"}, - {"fstat"}, - {"pause"}, - {"utime"}, /* 30 */ - {"stty"}, - {"gtty"}, - {"access"}, - {"nice"}, - {"ftime"}, - {"sync"}, - {"kill"}, - {"rename"}, - {"mkdir"}, - {"rmdir"}, /* 40 */ - {"dup"}, - {"pipe"}, - {"times"}, - {"prof"}, - {"brk"}, - {"setgid16"}, - {"getgid16"}, - {"signal"}, - {"geteuid16"}, - {"getegid16"}, /* 50 */ - {"sysacct"}, - {"umount2"}, - {"lock"}, - {"ioctl"}, - {"fcntl"}, - {"mpx"}, - {"setpgid"}, - {"ulimit"}, - {"olduname"}, - {"umask"}, /* 60 */ - {"chroot"}, - {"ustat"}, - {"dup2"}, - {"getppid"}, - {"pgrp"}, - {"setsid"}, - {"sigaction"}, - {"sgetmask"}, - {"ssetmask"}, - {"setreuid16"}, /* 70 */ - {"setregid16"}, - {"sigsuspend"}, - 
{"sigpending"}, - {"sethostname"}, - {"setrlimit"}, - {"old_getrlimit"}, - {"getrusage"}, - {"gettimeofday"}, - {"settimeofday"}, - {"getgroups16"}, /* 80 */ - {"setgroups16"}, - {"old_select"}, - {"symlink"}, - {"oldlstat"}, - {"readlink"}, - {"uselib"}, - {"swapon"}, - {"reboot"}, - {"old_readdir"}, - {"old_mmap"}, /* 90 */ - {"munmap"}, - {"truncate"}, - {"ftruncate"}, - {"fchmod"}, - {"fchown16"}, - {"getpriority"}, - {"setpriority"}, - {"profil"}, - {"statfs"}, - {"fstatfs"}, /* 100 */ - {"ioperm"}, - {"socketcall"}, - {"syslog"}, - {"setitimer"}, - {"getitimer"}, - {"newstat"}, - {"newsltat"}, - {"newsftat"}, - {"uname"}, - {"oldiopl"}, /* 110 */ - {"oldvhangup"}, - {"idle"}, - {"vm86old"}, - {"wait4"}, - {"swapoff"}, - {"sysinfo"}, - {"ipc"}, - {"fsync"}, - {"sigreturn"}, - {"clone"}, /* 120 */ - {"setdomainname"}, - {"newuname"}, - {"modify_ldt"}, - {"adjtimex"}, - {"mprotect"}, - {"sigprocmask"}, - {"create_module"}, - {"init_module"}, - {"delete_module"}, - {"get_kernel_syms"}, /* 130 */ - {"quotactl"}, - {"getpgid"}, - {"fchdir"}, - {"bdflush"}, - {"sysfs"}, - {"personality"}, - {"afs_syscall"}, - {"setfsuid16"}, - {"setfsgid16"}, - {"llseek"}, /* 140 */ - {"getdents"}, - {"select"}, - {"flock"}, - {"msync"}, - {"readv"}, - {"writev"}, - {"getsid"}, - {"fdatasync"}, - {"sysctl"}, - {"mlock"}, /* 150 */ - {"munlock"}, - {"mlockall"}, - {"munlockall"}, - {"sched_setparam"}, - {"sched_getparam"}, - {"sched_setscheduler"}, - {"sched_getscheduler"}, - {"yield"}, - {"sched_get_priority_max"}, - {"sched_get_priority_min"}, /* 160 */ - {"sched_rr_get_interval"}, - {"nanosleep"}, - {"mremap"}, - {"setresuid16"}, - {"getresuid16"}, - {"vm86"}, - {"query_module"}, - {"poll"}, - {"nfsserctl"}, - {"setresgid16"}, /* 170 */ - {"getresgid16"}, - {"prctl"}, - {"rt_sigreturn"}, - {"rt_sigaction"}, - {"rt_sigprocmask"}, - {"rt_sigpending"}, - {"rt_sigtimedwait"}, - {"rt_sigqueueinfo"}, - {"rt_sigsuspend"}, - {"pread64"}, /* 180 */ - {"pwrite64"}, - {"chown16"}, - 
{"getcwd"}, - {"capget"}, - {"capset"}, - {"sigaltstack"}, - {"sendfile"}, - {"getpmsg"}, - {"putpmsg"}, - {"vfork"}, /* 190 */ - {"getrlimit"}, - {"mmap2"}, - {"truncate64"}, - {"ftruncate64"}, - {"stat64"}, - {"lstat64"}, - {"fstat64"}, - {"lchown"}, - {"getuid"}, - {"getgid"}, /* 200 */ - {"geteuid"}, - {"getegid"}, - {"setreuid"}, - {"setregid"}, - {"getgroups"}, - {"setgroups"}, - {"fchown"}, - {"setresuid"}, - {"getresuid"}, - {"setresgid"}, /* 210 */ - {"getresgid"}, - {"chown"}, - {"setuid"}, - {"setgid"}, - {"setfsuid"}, - {"setfsgid"}, - {"pivot_root"}, - {"mincore"}, - {"madvise"}, - {"getdents64"}, /* 220 */ - {"fcntl64"}, - {"lx_nosys"}, - {"security"}, - {"gettid"}, - {"readahead"}, - {"setxattr"}, - {"lsetxattr"}, - {"fsetxattr"}, - {"getxattr"}, - {"lgetxattr"}, /* 230 */ - {"fgetxattr"}, - {"listxattr"}, - {"llistxattr"}, - {"flistxattr"}, - {"removexattr"}, - {"lremovexattr"}, - {"fremovexattr"}, - {"tkill"}, - {"sendfile64"}, - {"futex"}, /* 240 */ - {"sched_setaffinity"}, - {"sched_getaffinity"}, - {"set_thread_area"}, - {"get_thread_area"}, - {"io_setup"}, - {"io_destroy"}, - {"io_getevents"}, - {"io_submit"}, - {"io_cancel"}, - {"fadvise64"}, /* 250 */ - {"lx_nosys"}, - {"exit_group"}, - {"lookup_dcookie"}, - {"epoll_create"}, - {"epoll_ctl"}, - {"epoll_wait"}, - {"remap_file_pages"}, - {"set_tid_address"}, - {"timer_create"}, - {"timer_settime"}, /* 260 */ - {"timer_gettime"}, - {"timer_getoverrun"}, - {"timer_delete"}, - {"clock_settime"}, - {"clock_gettime"}, - {"clock_getres"}, - {"clock_nanosleep"}, - {"statfs64"}, - {"fstatfs64"}, - {"tgkill"}, /* 270 */ - /* The following are Linux 2.6 system calls */ - {"utimes"}, - {"fadvise64_64"}, - {"vserver"}, - {"mbind"}, - {"get_mempolicy"}, - {"set_mempolicy"}, - {"mq_open"}, - {"mq_unlink"}, - {"mq_timedsend"}, - {"mq_timedreceive"}, /* 280 */ - {"mq_notify"}, - {"mq_getsetattr"}, - {"kexec_load"}, - {"waitid"}, - {"sys_setaltroot"}, - {"add_key"}, - {"request_key"}, - {"keyctl"}, - 
{"ioprio_set"}, - {"ioprio_get"}, /* 290 */ - {"inotify_init"}, - {"inotify_add_watch"}, - {"inotify_rm_watch"}, - {"migrate_pages"}, - {"openat"}, - {"mkdirat"}, - {"mknodat"}, - {"fchownat"}, - {"futimesat"}, - {"fstatat64"}, /* 300 */ - {"unlinkat"}, - {"renameat"}, - {"linkat"}, - {"syslinkat"}, - {"readlinkat"}, - {"fchmodat"}, - {"faccessat"}, - {"pselect6"}, - {"ppoll"}, - {"unshare"}, /* 310 */ - {"set_robust_list"}, - {"get_robust_list"}, - {"splice"}, - {"sync_file_range"}, - {"tee"}, - {"vmsplice"}, - {"move_pages"}, - {"getcpu"}, - {"epoll_pwait"}, - {"utimensat"}, /* 320 */ - {"signalfd"}, - {"timerfd_create"}, - {"eventfd"}, - {"fallocate"}, - {"timerfd_settime"}, - {"timerfd_gettime"}, - {"signalfd4"}, - {"eventfd2"}, - {"epoll_create1"}, - {"dup3"}, /* 330 */ - {"pipe2"}, - {"inotify_init1"}, - {"preadv"}, - {"pwritev"}, - {"rt_tgsigqueueinfo"}, - {"perf_event_open"}, - {"recvmmsg"}, - {"fanotify_init"}, - {"fanotify_mark"}, - {"prlimit64"}, /* 340 */ - {"name_to_handle_at"}, - {"open_by_handle_at"}, - {"clock_adjtime"}, - {"syncfs"}, - {"sendmmsg"}, - {"setns"}, - {"process_vm_readv"}, - {"process_vm_writev"}, - {"kcmp"}, - {"finit_module"}, /* 350 */ - {"sched_setattr"}, - {"sched_getattr"}, - NULL /* NULL-termination is required for lx_systrace */ -}; - -#if defined(_LP64) -static lx_sys_names_t lx_sysnames64[] = -{ - {"read"}, /* 0 */ - {"write"}, - {"open"}, - {"close"}, - {"stat"}, - {"fstat"}, - {"lstat"}, - {"poll"}, - {"lseek"}, - {"mmap"}, - {"mprotect"}, /* 10 */ - {"munmap"}, - {"brk"}, - {"rt_sigaction"}, - {"rt_sigprocmask"}, - {"rt_sigreturn"}, - {"ioctl"}, - {"pread64"}, - {"pwrite64"}, - {"readv"}, - {"writev"}, /* 20 */ - {"access"}, - {"pipe"}, - {"select"}, - {"sched_yield"}, - {"mremap"}, - {"msync"}, - {"mincore"}, - {"madvise"}, - {"shmget"}, - {"shmat"}, /* 30 */ - {"shmctl"}, - {"dup"}, - {"dup2"}, - {"pause"}, - {"nanosleep"}, - {"getitimer"}, - {"alarm"}, - {"setitimer"}, - {"getpid"}, - {"sendfile"}, /* 40 */ - 
{"socket"}, - {"connect"}, - {"accept"}, - {"sendto"}, - {"recvfrom"}, - {"sendmsg"}, - {"recvmsg"}, - {"shutdown"}, - {"bind"}, - {"listen"}, /* 50 */ - {"getsockname"}, - {"getpeername"}, - {"socketpair"}, - {"setsockopt"}, - {"getsockopt"}, - {"clone"}, - {"fork"}, - {"vfork"}, - {"execve"}, - {"exit"}, /* 60 */ - {"wait4"}, - {"kill"}, - {"uname"}, - {"semget"}, - {"semop"}, - {"semctl"}, - {"shmdt"}, - {"msgget"}, - {"msgsnd"}, - {"msgrcv"}, /* 70 */ - {"msgctl"}, - {"fcntl"}, - {"flock"}, - {"fsync"}, - {"fdatasync"}, - {"truncate"}, - {"ftruncate"}, - {"getdents"}, - {"getcwd"}, - {"chdir"}, /* 80 */ - {"fchdir"}, - {"rename"}, - {"mkdir"}, - {"rmdir"}, - {"creat"}, - {"link"}, - {"unlink"}, - {"symlink"}, - {"readlink"}, - {"chmod"}, /* 90 */ - {"fchmod"}, - {"chown"}, - {"fchown"}, - {"lchown"}, - {"umask"}, - {"gettimeofday"}, - {"getrlimit"}, - {"getrusage"}, - {"sysinfo"}, - {"times"}, /* 100 */ - {"ptrace"}, - {"getuid"}, - {"syslog"}, - {"getgid"}, - {"setuid"}, - {"setgid"}, - {"geteuid"}, - {"getegid"}, - {"setpgid"}, - {"getppid"}, /* 110 */ - {"getpgrp"}, - {"setsid"}, - {"setreuid"}, - {"setregid"}, - {"getgroups"}, - {"setgroups"}, - {"setresuid"}, - {"getresuid"}, - {"setresgid"}, - {"getresgid"}, /* 120 */ - {"getpgid"}, - {"setfsuid"}, - {"setfsgid"}, - {"getsid"}, - {"capget"}, - {"capset"}, - {"rt_sigpending"}, - {"rt_sigtimedwait"}, - {"rt_sigqueueinfo"}, - {"rt_sigsuspend"}, /* 130 */ - {"sigaltstack"}, - {"utime"}, - {"mknod"}, - {"uselib"}, - {"personality"}, - {"ustat"}, - {"statfs"}, - {"fstatfs"}, - {"sysfs"}, - {"getpriority"}, /* 140 */ - {"setpriority"}, - {"sched_setparam"}, - {"sched_getparam"}, - {"sched_setscheduler"}, - {"sched_getscheduler"}, - {"sched_get_priority_max"}, - {"sched_get_priority_min"}, - {"sched_rr_get_interval"}, - {"mlock"}, - {"munlock"}, /* 150 */ - {"mlockall"}, - {"munlockall"}, - {"vhangup"}, - {"modify_ldt"}, - {"pivot_root"}, - {"sysctl"}, - {"prctl"}, - {"arch_prctl"}, - {"adjtimex"}, - 
{"setrlimit"}, /* 150 */ - {"chroot"}, - {"sync"}, - {"acct"}, - {"settimeofday"}, - {"mount"}, - {"umount2"}, - {"swapon"}, - {"swapoff"}, - {"reboot"}, - {"sethostname"}, /* 170 */ - {"setdomainname"}, - {"iopl"}, - {"ioperm"}, - {"create_module"}, - {"init_module"}, - {"delete_module"}, - {"get_kernel_syms"}, - {"query_module"}, - {"quotactl"}, - {"nfsservctl"}, /* 180 */ - {"getpmsg"}, - {"putpmsg"}, - {"afs_syscall"}, - {"tux"}, - {"security"}, - {"gettid"}, - {"readahead"}, - {"setxattr"}, - {"lsetxattr"}, - {"fsetxattr"}, /* 190 */ - {"getxattr"}, - {"lgetxattr"}, - {"fgetxattr"}, - {"listxattr"}, - {"llistxattr"}, - {"flistxattr"}, - {"removexattr"}, - {"lremovexattr"}, - {"fremovexattr"}, - {"tkill"}, /* 200 */ - {"time"}, - {"futex"}, - {"sched_setaffinity"}, - {"sched_getaffinity"}, - {"set_thread_area"}, - {"io_setup"}, - {"io_destroy"}, - {"io_getevents"}, - {"io_submit"}, - {"io_cancel"}, /* 210 */ - {"get_thread_area"}, - {"lookup_dcookie"}, - {"epoll_create"}, - {"epoll_ctl_old"}, - {"epoll_wait_old"}, - {"remap_file_pages"}, - {"getdents64"}, - {"set_tid_address"}, - {"restart_syscall"}, - {"semtimedop"}, /* 220 */ - {"fadvise64"}, - {"timer_create"}, - {"timer_settime"}, - {"timer_gettime"}, - {"timer_getoverrun"}, - {"timer_delete"}, - {"clock_settime"}, - {"clock_gettime"}, - {"clock_getres"}, - {"clock_nanosleep"}, /* 230 */ - {"exit_group"}, - {"epoll_wait"}, - {"epoll_ctl"}, - {"tgkill"}, - {"utimes"}, - {"vserver"}, - {"mbind"}, - {"set_mempolicy"}, - {"get_mempolicy"}, - {"mq_open"}, /* 240 */ - {"mq_unlink"}, - {"mq_timedsend"}, - {"mq_timedreceive"}, - {"mq_notify"}, - {"mq_getsetattr"}, - {"kexec_load"}, - {"waitid"}, - {"add_key"}, - {"request_key"}, - {"keyctl"}, /* 250 */ - {"ioprio_set"}, - {"ioprio_get"}, - {"inotify_init"}, - {"inotify_add_watch"}, - {"inotify_rm_watch"}, - {"migrate_pages"}, - {"openat"}, - {"mkdirat"}, - {"mknodat"}, - {"fchownat"}, /* 260 */ - {"futimesat"}, - {"fstatat64"}, - {"unlinkat"}, - {"renameat"}, - 
{"linkat"}, - {"symlinkat"}, - {"readlinkat"}, - {"fchmodat"}, - {"faccessat"}, - {"pselect6"}, /* 270 */ - {"ppoll"}, - {"unshare"}, - {"set_robust_list"}, - {"get_robust_list"}, - {"splice"}, - {"tee"}, - {"sync_file_range"}, - {"vmsplice"}, - {"move_pages"}, - {"utimensat"}, /* 280 */ - {"epoll_pwait"}, - {"signalfd"}, - {"timerfd_create"}, - {"eventfd"}, - {"fallocate"}, - {"timerfd_settime"}, - {"timerfd_gettime"}, - {"accept4"}, - {"signalfd4"}, - {"eventfd2"}, /* 290 */ - {"epoll_create1"}, - {"dup3"}, - {"pipe2"}, - {"inotify_init1"}, - {"preadv"}, - {"pwritev"}, - {"rt_tgsigqueueinfo"}, - {"perf_event_open"}, - {"recvmmsg"}, - {"fanotify_init"}, /* 300 */ - {"fanotify_mark"}, - {"prlimit64"}, - {"name_to_handle_at"}, - {"open_by_handle_at"}, - {"clock_adjtime"}, - {"syncfs"}, - {"sendmmsg"}, - {"setns"}, - {"getcpu"}, - {"process_vm_readv"}, /* 310 */ - {"process_vm_writev"}, - {"kcmp"}, - {"finit_module"}, - {"sched_setattr"}, - {"sched_getattr"}, - {"renameat2"}, /* 316 */ - - /* XXX gap then x32 syscalls from 512 - 544 */ - - NULL /* NULL-termination is required for lx_systrace */ -}; -#endif - typedef struct lx_systrace_sysent { const char *lss_name; dtrace_id_t lss_entry; @@ -1034,40 +345,30 @@ lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) lx_systrace_devi = devi; /* - * Count up the 32-bit Linux system calls. - */ - for (i = 0; lx_sysnames32[i].sy_name != NULL; i++) - continue; - - /* * Initialize the 32-bit table. 
*/ - lx_systrace_sysent32 = kmem_zalloc(i * sizeof (lx_systrace_sysent_t), - KM_SLEEP); - lx_systrace_nsysent32 = i; + VERIFY(lx_nsysent32 > 0); + lx_systrace_nsysent32 = lx_nsysent32; + lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); for (i = 0; i < lx_systrace_nsysent32; i++) { - lx_systrace_sysent32[i].lss_name = lx_sysnames32[i].sy_name; + lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name; lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; } #if defined(_LP64) /* - * Count up the 64-bit Linux system calls. - */ - for (i = 0; lx_sysnames64[i].sy_name != NULL; i++) - continue; - - /* * Initialize the 64-bit table. */ - lx_systrace_sysent64 = kmem_zalloc(i * sizeof (lx_systrace_sysent_t), - KM_SLEEP); - lx_systrace_nsysent64 = i; + VERIFY(lx_nsysent64 > 0); + lx_systrace_nsysent64 = lx_nsysent64; + lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); for (i = 0; i < lx_systrace_nsysent64; i++) { - lx_systrace_sysent64[i].lss_name = lx_sysnames64[i].sy_name; + lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name; lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; } diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c index 4507c0303c..fc9aaa6055 100644 --- a/usr/src/uts/common/brand/lx/os/lx_brand.c +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -28,6 +28,110 @@ * Copyright 2015, Joyent, Inc. All rights reserved. */ +/* + * The LX Brand: emulation of a Linux operating environment within a zone. + * + * OVERVIEW + * + * The LX brand enables a full Linux userland -- including a C library, + * init(1) framework, and some set of applications -- to run unmodified + * within an illumos zone. 
Unlike illumos, where applications are expected + * to link against and consume functions exported from libraries, the + * supported Linux binary compatibility boundary is the system call + * interface. By accurately emulating the behaviour of Linux system calls, + * Linux software can be executed in this environment as if it were running + * on a native Linux system. + * + * EMULATING LINUX SYSTEM CALLS + * + * Linux system calls are made in 32-bit processes via the "int 0x80" + * instruction; in 64-bit processes the "syscall" instruction is used, as it + * is with native illumos processes. In both cases, arguments to system + * calls are generally passed in registers and the usermode stack is not + * interpreted or modified by the Linux kernel. + * + * When the emulated Linux process makes a system call, it traps into the + * illumos kernel. The in-kernel brand module contains various emulation + * routines, and can fully service some emulated system calls; e.g. read(2) + * and write(2). Other system calls require assistance from the illumos + * libc, bouncing back out to the brand library ("lx_brand.so.1") for + * emulation. + * + * The brand mechanism allows for the provision of an alternative trap + * handler for the various system call mechanisms. Traditionally this was + * used to immediately revector execution to the usermode emulation library, + * which was responsible for handling all system calls. In the interests of + * more accurate emulation and increased performance, much of the regular + * illumos system call path is now invoked. Only the argument processing and + * handler dispatch are replaced by the brand, via the per-LWP + * "lwp_brand_syscall" interposition function pointer. + * + * THE NATIVE AND BRAND STACKS + * + * Some runtime environments (e.g. the Go language) allocate very small + * thread stacks, preferring to grow or split the stack as necessary. 
The + * Linux kernel generally does not use the usermode stack when servicing + * system calls, so this is not a problem. In order for our emulation to + * have the same zero stack impact, we must execute usermode emulation + * routines on an _alternate_ stack. This is similar, in principle, to the + * use of sigaltstack(3C) to run signal handlers off the main thread stack. + * + * To this end, the brand library allocates and installs an alternate stack + * (called the "native" stack) for each LWP. The in-kernel brand code uses + * this stack for usermode emulation calls and interposed signal delivery, + * while the emulated Linux process sees only the data on the main thread + * stack, known as the "brand" stack. The stack mode is tracked in the + * per-LWP brand-private data, using the LX_STACK_MODE_* enum. + * + * The stack mode doubles as a system call "mode bit". When in the + * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux + * system calls. In other modes, system calls are assumed to be native + * illumos system calls as made during brand library initialisation and + * usermode emulation. + * + * USERMODE EMULATION + * + * When a Linux system call cannot be emulated within the kernel, we preserve + * the register state of the Linux process and revector the LWP to the brand + * library usermode emulation handler: the "lx_emulate()" function in + * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, + * and is performed in "lx_emulate_user()". + * + * First, the emulated process state is written out to the usermode stack of + * the process as a "ucontext_t" object. Arguments to the emulation routine + * are passed on the stack or in registers, depending on the ABI. When the + * usermode emulation is complete, the result is passed back to the kernel + * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context + * for restoration. 
+ * + * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT + * + * When servicing emulated system calls in the usermode brand library, or + * during signal delivery, various state is preserved by the kernel so that + * the running LWP may be revectored to a handling routine. The context + * allows the kernel to restart the program at the point of interruption, + * either at the return of the signal handler, via setcontext(3C); or after + * the usermode emulation request has been serviced, via B_EMULATION_DONE. + * + * In illumos native processes, the saved context (a "ucontext_t" object) + * includes the state of registers and the current signal mask at the point + * of interruption. The context also includes a link to the most recently + * saved context, forming a chain to be unwound as requests complete. The LX + * brand requires additional book-keeping to describe the machine state: in + * particular, the current stack mode and the occupied extent of the native + * stack. + * + * The brand code is able to interpose on the context save and restore + * operations in the kernel -- see "lx_savecontext()" and + * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to + * function correctly in the face of a dual stack LWP. The brand also + * interposes on the signal delivery mechanism -- see "lx_sendsig()" and + * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand + * library interposer on the native stack, regardless of the interrupted + * execution mode. Linux sigaltstack(2) emulation is performed entirely by + * the usermode brand library during signal handler interposition. 
+ */ + #include <sys/types.h> #include <sys/kmem.h> #include <sys/errno.h> @@ -63,6 +167,7 @@ #include <sys/x86_archext.h> #include <sys/controlregs.h> #include <sys/core.h> +#include <sys/stack.h> #include <lx_signum.h> int lx_debug = 0; @@ -80,18 +185,16 @@ void lx_copy_procdata(proc_t *, proc_t *); extern int getsetcontext(int, void *); extern int waitsys(idtype_t, id_t, siginfo_t *, int); #if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); extern int waitsys32(idtype_t, id_t, siginfo_t *, int); #endif extern void lx_proc_exit(proc_t *, klwp_t *); -static void lx_psig_to_proc(proc_t *, kthread_t *, int); extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); extern void lx_ioctl_init(); extern void lx_ioctl_fini(); -int lx_systrace_brand_enabled; - lx_systrace_f *lx_systrace_entry_ptr; lx_systrace_f *lx_systrace_return_ptr; @@ -113,6 +216,15 @@ static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, static boolean_t lx_native_exec(uint8_t, const char **); static uint32_t lx_map32limit(proc_t *); +static void lx_savecontext(ucontext_t *); +static void lx_restorecontext(ucontext_t *); +static caddr_t lx_sendsig_stack(int); +static void lx_sendsig(int); +#if defined(_SYSCALL32_IMPL) +static void lx_savecontext32(ucontext32_t *); +#endif + + /* lx brand */ struct brand_ops lx_brops = { lx_init_brand_data, /* b_init_brand_data */ @@ -132,7 +244,7 @@ struct brand_ops lx_brops = { lx_elfexec, /* b_elfexec */ NULL, /* b_sigset_native_to_brand */ NULL, /* b_sigset_brand_to_native */ - lx_psig_to_proc, /* b_psig_to_proc */ + NULL, /* b_psig_to_proc */ NSIG, /* b_nsig */ lx_exit_with_sig, /* b_exit_with_sig */ lx_wait_filter, /* b_wait_filter */ @@ -142,14 +254,21 @@ struct brand_ops lx_brops = { lx_stop_notify, /* b_stop_notify */ lx_waitid_helper, /* b_waitid_helper */ lx_sigcld_repost, /* b_sigcld_repost */ - lx_issig_stop /* b_issig_stop */ + lx_issig_stop, /* b_issig_stop */ + lx_savecontext, /* b_savecontext 
*/ +#if defined(_SYSCALL32_IMPL) + lx_savecontext32, /* b_savecontext32 */ +#endif + lx_restorecontext, /* b_restorecontext */ + lx_sendsig_stack, /* b_sendsig_stack */ + lx_sendsig /* b_sendsig */ }; struct brand_mach_ops lx_mops = { NULL, - lx_brand_int80_callback, /* 32-bit Linux entry point */ NULL, - lx_brand_syscall_callback, /* 64-bit common entry point */ + NULL, + NULL, NULL, lx_fixsegreg, lx_fsbase @@ -294,18 +413,7 @@ lx_map32limit(proc_t *p) void lx_brand_systrace_enable(void) { - extern void lx_brand_int80_enable(void); - - ASSERT(!lx_systrace_enabled); - -#if defined(__amd64) - /* enable the trace points for both 32-bit and 64-bit lx calls */ - extern void lx_brand_syscall_enable(void); - lx_brand_syscall_enable(); - lx_brand_int80_enable(); -#else - lx_brand_int80_enable(); -#endif + VERIFY(!lx_systrace_enabled); lx_systrace_enabled = 1; } @@ -313,106 +421,260 @@ lx_brand_systrace_enable(void) void lx_brand_systrace_disable(void) { - extern void lx_brand_int80_disable(void); + VERIFY(lx_systrace_enabled); - ASSERT(lx_systrace_enabled); + lx_systrace_enabled = 0; +} -#if defined(__amd64) - /* disable the trace points for both 32-bit and 64-bit lx calls */ - extern void lx_brand_syscall_disable(void); - lx_brand_syscall_disable(); - lx_brand_int80_disable(); -#else - lx_brand_int80_disable(); -#endif +void +lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) +{ + VERIFY(lwpd->br_ntv_stack != 0); - lx_systrace_enabled = 0; + /* + * The "brand-lx-set-ntv-stack-current" probe has arguments: + * arg0: stack pointer before change + * arg1: stack pointer after change + * arg2: current stack base + */ + DTRACE_PROBE3(brand__lx__set__ntv__stack__current, + uintptr_t, lwpd->br_ntv_stack_current, + uintptr_t, new_sp, + uintptr_t, lwpd->br_ntv_stack); + + lwpd->br_ntv_stack_current = new_sp; +} + +/* + * This hook runs prior to sendsig() processing and allows us to nominate + * an alternative stack pointer for delivery of the signal handling 
frame. + * Critically, this routine should _not_ modify any LWP state as the + * savecontext() does not run until after this hook. + */ +static caddr_t +lx_sendsig_stack(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We want to take signal delivery on the native stack, but only if + * one has been allocated and installed for this LWP. + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The program is not running on the native stack. Return + * the native stack pointer from our brand-private data so + * that we may switch to it for signal handling. + */ + return ((caddr_t)lwpd->br_ntv_stack_current); + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * Either the program is already running on the native stack, + * or one has not yet been allocated for this LWP. Use the + * current stack pointer value. + */ + return ((caddr_t)rp->r_sp); + } } /* - * Posting a signal to a proc/thread, switch to native syscall mode. - * See the comment on lwp_segregs_save() for how we handle the user-land - * registers when we come into the kernel and see update_sregs() for how we - * restore. + * This hook runs after sendsig() processing and allows us to update the + * per-LWP mode flags for system calls and stacks. The pre-signal + * context has already been saved and delivered to the user at this point. */ -/*ARGSUSED*/ static void -lx_psig_to_proc(proc_t *p, kthread_t *t, int sig) +lx_sendsig(int sig) { -#if defined(__amd64) - lx_lwp_data_t *lwpd = ttolxlwp(t); - klwp_t *lwp = ttolwp(t); - pcb_t *pcb; - model_t datamodel; + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * In lx_sendsig_stack(), we nominated a stack pointer from the + * native stack. 
Update the stack mode, and the current in-use + * extent of the native stack, accordingly: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, rp->r_sp); + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + break; - datamodel = lwp_getdatamodel(lwp); - if (datamodel != DATAMODEL_NATIVE) + default: + /* + * Otherwise, the brand library has not yet installed the + * alternate stack for this LWP. Signals will be handled on + * the regular stack thread. + */ return; + } +} - pcb = &lwp->lwp_pcb; +/* + * This hook runs prior to the context restoration, allowing us to take action + * or modify the context before it is loaded. + */ +static void +lx_restorecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; + caddr_t sp = ucp->uc_brand_data[1]; -#ifdef DEBUG /* - * Debug check to see if we have the correct fsbase. - * - * Note that it is not guaranteed that our %fsbase is loaded (i.e. - * rdmsr(MSR_AMD_FSBASE) won't necessarily return our expected fsbase) - * when this function runs. While it is usually loaded, it's possible - * to be in this function via the following sequence: - * we go off-cpu in the kernel - * another process runs in user-land and its fsbase gets loaded - * we go on-cpu to run and post a signal, but since we haven't run - * in user-land yet, our fsbase has not yet been loaded by - * update_sregs. + * We have a saved native stack pointer value that we must restore + * into the per-LWP data. 
*/ - if (lwpd->br_ntv_syscall == 0 && lwpd->br_lx_fsbase != 0) { - /* should have Linux fsbase */ - if (lwpd->br_lx_fsbase != pcb->pcb_fsbase) { - DTRACE_PROBE2(brand__lx__psig__lx__pcb, - uintptr_t, lwpd->br_lx_fsbase, - uintptr_t, pcb->pcb_fsbase); - } + if (flags & LX_UC_RESTORE_NATIVE_SP) { + lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); + } + + /* + * We do not wish to restore the value of uc_link in this context, + * so replace it with the value currently in the LWP. + */ + if (flags & LX_UC_IGNORE_LINK) { + ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; + } + /* + * Restore the stack mode: + */ + if (flags & LX_UC_STACK_NATIVE) { + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + } else if (flags & LX_UC_STACK_BRAND) { + lwpd->br_stack_mode = LX_STACK_MODE_BRAND; } - if (lwpd->br_ntv_syscall == 1 && lwpd->br_ntv_fsbase != 0) { - /* should have Illumos fsbase */ - if (lwpd->br_ntv_fsbase != pcb->pcb_fsbase) { - DTRACE_PROBE2(brand__lx__psig__ntv__pcb, - uintptr_t, lwpd->br_ntv_fsbase, - uintptr_t, pcb->pcb_fsbase); +#if defined(__amd64) + /* + * Override the fsbase in the context with the value provided through + * the Linux arch_prctl(2) system call. + */ + if (flags & LX_UC_STACK_BRAND) { + if (lwpd->br_lx_fsbase != 0) { + ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; } } #endif +} + +static void +lx_savecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = 0; - /* We "push" the current syscall mode flag on the "stack". */ - ASSERT(lwpd->br_ntv_syscall == 0 || lwpd->br_ntv_syscall == 1); - lwpd->br_scms = (lwpd->br_scms << 1) | lwpd->br_ntv_syscall; + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. 
The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". + */ - if (lwpd->br_ntv_syscall == 0 && lwpd->br_ntv_fsbase != 0) { + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { /* - * We were executing in Linux code but now that we're handling - * a signal we have to make sure we have the native fsbase - * loaded. Also update pcb so that if we service an interrupt - * we will restore the correct fsbase in update_sregs(). - * Because of the amd64 guard and datamodel check, this - * obviously will only happen for the 64-bit user-land. - * - * There is a non-obvious side-effect here. Since the fsbase - * will now be the native value, when we bounce out to - * user-land the ucontext will capture the native value, even - * though we need to restore the Linux value when we return - * from the signal. This is handled by the B_SIGNAL_RETURN - * code in lx_brandsys(). 
+ * Record the value of the native stack pointer to restore + * when returning to this branded context: */ - pcb->pcb_fsbase = lwpd->br_ntv_fsbase; + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; + } - /* Ensure that we go out via update_sregs */ - pcb->pcb_rupdate = 1; + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; } - lwpd->br_ntv_syscall = 1; -#endif + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = + (void *)(uintptr_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = (void *)flags; +} + +#if defined(_SYSCALL32_IMPL) +static void +lx_savecontext32(ucontext32_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + unsigned int flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". 
+ */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = flags; } +#endif void lx_init_brand_data(zone_t *zone) @@ -426,7 +688,6 @@ lx_init_brand_data(zone_t *zone) * This can be changed by a call to setattr() during zone boot. */ (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX); - data->lxzd_max_syscall = LX_NSYSCALLS; zone->zone_brand_data = data; /* @@ -448,6 +709,27 @@ lx_unsupported(char *dmsg) DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); } +void +lx_trace_sysenter(int syscall_num, uintptr_t *args) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_entry_ptr != NULL); + + (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], + args[2], args[3], args[4], args[5]); + } +} + +void +lx_trace_sysreturn(int syscall_num, long ret) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); + } +} + /* * Get the addresses of the user-space system call handler and attach it to * the proc structure. 
Returning 0 indicates success; the value returned @@ -462,16 +744,16 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) { kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); lx_proc_data_t *pd; - int ike_call; struct termios *termios; uint_t termios_len; int error; int code; int sig; lx_brand_registration_t reg; - lx_lwp_data_t *lwpd; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); /* * There is one operation that is suppored for non-branded @@ -480,8 +762,8 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, * a branded process. */ if (cmd == B_EXEC_BRAND) { - ASSERT(p->p_zone != NULL); - ASSERT(p->p_zone->zone_brand == &lx_brand); + VERIFY(p->p_zone != NULL); + VERIFY(p->p_zone->zone_brand == &lx_brand); return (exec_common( (char *)arg1, (const char **)arg2, (const char **)arg3, EBA_BRAND)); @@ -489,13 +771,19 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, /* For all other operations this must be a branded process. 
*/ if (p->p_brand == NULL) - return (set_errno(ENOSYS)); + return (ENOSYS); - ASSERT(p->p_brand == &lx_brand); - ASSERT(p->p_brand_data != NULL); + VERIFY(p->p_brand == &lx_brand); + VERIFY(p->p_brand_data != NULL); switch (cmd) { case B_REGISTER: + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("stack mode was not PREINIT during " + "REGISTER\n"); + return (EINVAL); + } + if (p->p_model == DATAMODEL_NATIVE) { if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) { lx_print("Failed to copyin brand registration " @@ -517,10 +805,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, reg.lxbr_version = (uint_t)reg32.lxbr_version; reg.lxbr_handler = (void *)(uintptr_t)reg32.lxbr_handler; - reg.lxbr_tracehandler = - (void *)(uintptr_t)reg32.lxbr_tracehandler; - reg.lxbr_traceflag = - (void *)(uintptr_t)reg32.lxbr_traceflag; } #endif @@ -534,34 +818,9 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); pd = p->p_brand_data; pd->l_handler = (uintptr_t)reg.lxbr_handler; - pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler; - pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag; - -#if defined(__amd64) - /* - * When we register, start with native syscalls enabled so that - * lx_init can finish initialization before switch to Linux - * syscall mode. Also initialize the syscall mode "stack" to - * native. We push/pop bits into this "stack" during signal - * handling. - */ - lwpd = ttolxlwp(t); - lwpd->br_ntv_syscall = 1; - lwpd->br_scms = 1; -#endif - if (pd->l_traceflag != NULL && pd->l_ptrace != 0) { - /* - * If ptrace(2) is active on this process, it is likely - * that we just finished an emulated execve(2) in a - * traced child. The usermode traceflag will have been - * clobbered by the exec, so we set it again here: - */ - (void) suword32((void *)pd->l_traceflag, 1); - } - - *rval = 0; return (0); + case B_TTYMODES: /* This is necessary for emulating TCGETS ioctls.
*/ if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), @@ -577,7 +836,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, } ddi_prop_free(termios); - *rval = 0; return (0); case B_ELFDATA: @@ -585,8 +843,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, if (get_udatamodel() == DATAMODEL_NATIVE) { if (copyout(&pd->l_elf_data, (void *)arg1, sizeof (lx_elf_data_t)) != 0) { - (void) set_errno(EFAULT); - return (*rval = -1); + return (EFAULT); } } #if defined(_LP64) @@ -603,23 +860,15 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, if (copyout(&led32, (void *)arg1, sizeof (led32)) != 0) { - (void) set_errno(EFAULT); - return (*rval = -1); + return (EFAULT); } } #endif - *rval = 0; return (0); case B_EXEC_NATIVE: - error = exec_common( - (char *)arg1, (const char **)arg2, (const char **)arg3, - EBA_NATIVE); - if (error) { - (void) set_errno(error); - return (*rval = -1); - } - return (*rval = 0); + return (exec_common((char *)arg1, (const char **)arg2, + (const char **)arg3, EBA_NATIVE)); /* * The B_TRUSS_POINT subcommand is used so that we can make a no-op @@ -627,99 +876,34 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, * emulation. */ case B_TRUSS_POINT: - *rval = 0; return (0); - case B_LPID_TO_SPAIR: + case B_LPID_TO_SPAIR: { /* * Given a Linux pid as arg1, return the Solaris pid in arg2 and * the Solaris LWP in arg3. We also translate pid 1 (which is * hardcoded in many applications) to the zone's init process. 
*/ - { - pid_t s_pid; - id_t s_tid; - - if ((pid_t)arg1 == 1) { - s_pid = p->p_zone->zone_proc_initpid; - /* handle the dead/missing init(1M) case */ - if (s_pid == -1) - s_pid = 1; - s_tid = 1; - } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, - &s_tid) < 0) - return (ESRCH); - - if (copyout(&s_pid, (void *)arg2, - sizeof (s_pid)) != 0 || - copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) - return (EFAULT); - - *rval = 0; - return (0); - } - - case B_SYSENTRY: - if (lx_systrace_enabled) { - ASSERT(lx_systrace_entry_ptr != NULL); - - if (get_udatamodel() == DATAMODEL_NATIVE) { - uintptr_t a[6]; - - if (copyin((void *)arg2, a, sizeof (a)) != 0) - return (EFAULT); - - (*lx_systrace_entry_ptr)(arg1, a[0], a[1], - a[2], a[3], a[4], a[5]); - } -#if defined(_LP64) - else { - /* 32-bit userland on 64-bit kernel */ - uint32_t a[6]; - - if (copyin((void *)arg2, a, sizeof (a)) != 0) - return (EFAULT); - - (*lx_systrace_entry_ptr)(arg1, a[0], a[1], - a[2], a[3], a[4], a[5]); - } -#endif + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { + return (ESRCH); } - (void) lx_ptrace_stop(LX_PR_SYSENTRY); - - pd = p->p_brand_data; - - /* - * If neither DTrace not ptrace are interested in tracing - * this process any more, turn off the trace flag. 
- */ - if (!lx_systrace_enabled && !pd->l_ptrace) - (void) suword32((void *)pd->l_traceflag, 0); - - *rval = 0; - return (0); - - case B_SYSRETURN: - if (lx_systrace_enabled) { - ASSERT(lx_systrace_return_ptr != NULL); - - (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0); + if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { + return (EFAULT); } - (void) lx_ptrace_stop(LX_PR_SYSEXIT); - - pd = p->p_brand_data; - - /* - * If neither DTrace not ptrace are interested in tracing - * this process any more, turn off the trace flag. - */ - if (!lx_systrace_enabled && !pd->l_ptrace) - (void) suword32((void *)pd->l_traceflag, 0); - - *rval = 0; return (0); + } case B_SET_AFFINITY_MASK: case B_GET_AFFINITY_MASK: @@ -735,7 +919,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, case B_PTRACE_STOP_FOR_OPT: return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? - B_FALSE : B_TRUE, (ulong_t)arg3)); + B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); case B_PTRACE_CLONE_BEGIN: return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? @@ -783,8 +967,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, return (0); } - case B_UNSUPPORTED: - { + case B_UNSUPPORTED: { char dmsg[256]; if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { @@ -794,11 +977,11 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, } dmsg[255] = '\0'; lx_unsupported(dmsg); - } return (0); + } - case B_STORE_ARGS: + case B_STORE_ARGS: { /* * B_STORE_ARGS subcommand * arg1 = address of struct to be copied in @@ -806,141 +989,208 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, * arg3-arg6 ignored * rval = the amount of data copied. 
*/ - { - int err; - void *buf; - - lwpd = ttolxlwp(curthread); - /* only have upper limit because arg2 is unsigned */ - if (arg2 > LX_BR_ARGS_SIZE_MAX) { - return (EINVAL); - } + void *buf; - buf = kmem_alloc(arg2, KM_SLEEP); - if ((err = copyin((void *)arg1, buf, arg2)) != 0) { - lx_print("Failed to copyin scall arg at 0x%p\n", - (void *) arg1); - kmem_free(buf, arg2); - /* - * Purposely not setting br_scall_args to NULL - * to preserve data for debugging. - */ - return (EFAULT); - } + /* only have upper limit because arg2 is unsigned */ + if (arg2 > LX_BR_ARGS_SIZE_MAX) { + return (EINVAL); + } - if (lwpd->br_scall_args != NULL) { - ASSERT(lwpd->br_args_size > 0); - kmem_free(lwpd->br_scall_args, - lwpd->br_args_size); - } + buf = kmem_alloc(arg2, KM_SLEEP); + if (copyin((void *)arg1, buf, arg2) != 0) { + lx_print("Failed to copyin scall arg at 0x%p\n", + (void *) arg1); + kmem_free(buf, arg2); + /* + * Purposely not setting br_scall_args to NULL + * to preserve data for debugging. + */ + return (EFAULT); + } - lwpd->br_scall_args = buf; - lwpd->br_args_size = arg2; - *rval = arg2; - return (0); + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, + lwpd->br_args_size); } - case B_CLR_NTV_SYSC_FLAG: -#if defined(__amd64) - lwpd = ttolxlwp(curthread); - lwpd->br_ntv_syscall = 0; + lwpd->br_scall_args = buf; + lwpd->br_args_size = arg2; + *rval = arg2; + return (0); + } + + case B_HELPER_CLONE: + return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, + (void *)arg4)); + case B_HELPER_SETGROUPS: + return (lx_helper_setgroups(arg1, (gid_t *)arg2)); + + case B_HELPER_SIGQUEUE: + return (lx_helper_rt_sigqueueinfo(arg1, arg2, + (siginfo_t *)arg3)); + + case B_HELPER_TGSIGQUEUE: + return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, + (siginfo_t *)arg4)); + + case B_SET_THUNK_PID: + lwpd->br_lx_thunk_pid = arg1; + return (0); + + case B_GETPID: /* - * If Linux fsbase has been set, restore it. 
The user-level - * code only ever calls this in the 64-bit library. - * - * Note that it is not guaranteed that our %fsbase is loaded - * (i.e. rdmsr(MSR_AMD_FSBASE) won't necessarily return our - * expected fsbase) when this block runs. While it is usually - * loaded, it's possible to be in this function via the - * following sequence: - * we make the brandsys syscall and go off-cpu on entering - * the kernel - * another process runs in user-land and its fsbase gets - * loaded - * we go on-cpu to finish the syscall but since we haven't - * run again in user-land yet, our fsbase has not yet been - * reloaded by update_sregs + * The usermode clone(2) code needs to be able to call + * lx_getpid() from native code: */ - if (lwpd->br_lx_fsbase != 0) { - klwp_t *lwp = ttolwp(t); - pcb_t *pcb = &lwp->lwp_pcb; - - pcb->pcb_fsbase = lwpd->br_lx_fsbase; + *rval = lx_getpid(); + return (0); - /* Ensure that we go out via update_sregs */ - pcb->pcb_rupdate = 1; + case B_SET_NATIVE_STACK: + /* + * B_SET_NATIVE_STACK subcommand + * arg1 = the base of the stack to use for emulation + */ + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("B_SET_NATIVE_STACK when stack was already " + "set to %p\n", (void *)arg1); + return (EEXIST); } -#endif - return (0); - case B_SIGNAL_RETURN: -#if defined(__amd64) /* - * Set the syscall mode and do the setcontext syscall. The - * user-level code only ever calls this in the 64-bit library. + * We move from the PREINIT state, where we have no brand + * emulation stack, to the INIT state. Here, we are still + * running on what will become the BRAND stack, but are running + * emulation (i.e. native) code. Once the initialisation + * process for this thread has finished, we will jump to + * brand-specific code, while moving to the BRAND mode. * - * We get the previous syscall mode off of the br_scms "stack". - * That is a sequence of syscall mode flag bits we've pushed - * into that int as we took signals. 
- * arg1 = ucontext_t pointer + * When a new LWP is created, lx_initlwp() will clear the + * stack data. If that LWP is actually being duplicated + * into a child process by fork(2), lx_forklwp() will copy + * it so that the cloned thread will keep using the same + * alternate stack. + */ + lwpd->br_ntv_stack = arg1; + lwpd->br_stack_mode = LX_STACK_MODE_INIT; + lx_lwp_set_native_stack_current(lwpd, arg1); + + return (0); + + case B_GET_CURRENT_CONTEXT: + /* + * B_GET_CURRENT_CONTEXT subcommand: + * arg1 = address for pointer to current ucontext_t */ - lwpd = ttolxlwp(curthread); - lwpd->br_ntv_syscall = lwpd->br_scms & 0x1; - /* "pop" this value from the "stack" */ - lwpd->br_scms >>= 1; +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; + + error = copyout(&addr, (void *)arg1, sizeof (addr)); + } else +#endif + { + error = copyout(&lwp->lwp_oldcontext, (void *)arg1, + sizeof (lwp->lwp_oldcontext)); + } + + return (error != 0 ? EFAULT : 0); + case B_JUMP_TO_LINUX: /* - * If setting the mode to lx, make sure we fix up the context - * so that we load the lx fsbase when we return to the Linux - * code. For the native case, the context already has the - * correct native fsbase so we don't need to do anything here. - * Note that setgregs updates the pcb and in update_sregs we - * wrmsr the correct fsbase when we return to user-level. - * getsetcontext -> restorecontext -> setgregs + * B_JUMP_TO_LINUX subcommand: + * arg1 = ucontext_t pointer for jump state */ - if (lwpd->br_ntv_syscall == 0 && lwpd->br_lx_fsbase != 0 && - arg1 != NULL) { + + if (arg1 == NULL) + return (EINVAL); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_NATIVE: { + struct regs *rp = lwptoregs(lwp); + /* - * Linux fsbase has been initialized, restore it. - * We have to copyin to modify since the user-level - * emulation doesn't have a copy of the lx fsbase or - * know that we are returning to Linux code. 
+ * We are on the NATIVE stack, so we must preserve + * the extent of that stack. The pointer will be + * reset by a future setcontext(). */ - ucontext_t uc; - klwp_t *lwp = ttolwp(t); - pcb_t *pcb = &lwp->lwp_pcb; - - if (copyin((void *)arg1, &uc, sizeof (ucontext_t) - - sizeof (uc.uc_filler) - - sizeof (uc.uc_mcontext.fpregs))) - return (set_errno(EFAULT)); + lx_lwp_set_native_stack_current(lwpd, + (uintptr_t)rp->r_sp); + break; + } - uc.uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; + case LX_STACK_MODE_INIT: + /* + * The LWP is transitioning to Linux code for the first + * time. + */ + break; - if (copyout(&uc, (void *)arg1, sizeof (ucontext_t) - - sizeof (uc.uc_filler) - - sizeof (uc.uc_mcontext.fpregs))) - return (set_errno(EFAULT)); + case LX_STACK_MODE_PREINIT: + /* + * This LWP has not installed an alternate stack for + * usermode emulation handling. + */ + return (ENOENT); - /* Ensure that we go out via update_sregs */ - pcb->pcb_rupdate = 1; + case LX_STACK_MODE_BRAND: + /* + * The LWP should not be on the BRAND stack. + */ + exit(CLD_KILLED, SIGSYS); + return (0); } -#endif /* amd64 */ - return (getsetcontext(SETCONTEXT, (void *)arg1)); - case B_UNWIND_NTV_SYSC_FLAG: -#if defined(__amd64) /* - * Used when exiting to support the setcontext back to the - * getcontext we performed in lx_init. We need to unwin - * whatever signal state is in br_scms since we are exiting. - * This sets us up for the B_SIGNAL_RETURN from lx_setcontext. + * Transfer control to Linux: */ - lwpd = ttolxlwp(curthread); - lwpd->br_scms = 1; + return (lx_runexe(lwp, (void *)arg1)); + + case B_EMULATION_DONE: + /* + * B_EMULATION_DONE subcommand: + * arg1 = ucontext_t * to restore + * arg2 = system call number + * arg3 = return code + * arg4 = if operation failed, the errno value + */ + + /* + * The first part of this operation is a setcontext() to + * restore the register state to the copy we preserved + * before vectoring to the usermode emulation routine. 
+ * If that fails, we return (hopefully) to the emulation + * routine and it will handle the error. + */ +#if (_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + error = getsetcontext32(SETCONTEXT, (void *)arg1); + } else #endif + { + error = getsetcontext(SETCONTEXT, (void *)arg1); + } + + if (error != 0) { + return (error); + } + + /* + * The saved Linux context has been restored. We handle the + * return value or errno with code common to the in-kernel + * system call emulation. + */ + if ((error = (int)arg4) != 0) { + /* + * lx_syscall_return() looks at the errno in the LWP, + * so set it here: + */ + set_errno(error); + } + lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); + return (0); case B_EXIT_AS_SIG: @@ -959,41 +1209,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, exit(code, sig); /* NOTREACHED */ break; - - case B_IKE_SYSCALL: - if (arg1 > LX_N_IKE_FUNCS) - return (EINVAL); - - if (get_udatamodel() == DATAMODEL_NATIVE) { - uintptr_t a[6]; - - if (copyin((void *)arg2, a, sizeof (a)) != 0) - return (EFAULT); - - *rval = lx_emulate_syscall(arg1, a[0], a[1], - a[2], a[3], a[4], a[5]); -#if defined(_LP64) - } else { - /* 32-bit userland on 64-bit kernel */ - uint32_t a[6]; - - if (copyin((void *)arg2, a, sizeof (a)) != 0) - return (EFAULT); - - *rval = lx_emulate_syscall(arg1, a[0], a[1], - a[2], a[3], a[4], a[5]); -#endif - } - - return (0); - - default: - ike_call = cmd - B_IKE_SYSCALL; - if (ike_call > 0 && ike_call <= LX_N_IKE_FUNCS) { - *rval = lx_emulate_syscall(ike_call, arg1, arg2, - arg3, arg4, arg5, 0xbadbeef); - return (0); - } } return (EINVAL); @@ -1443,11 +1658,37 @@ lx_native_exec(uint8_t osabi, const char **interp) return (B_TRUE); } +static void +lx_syscall_init(void) +{ + int i; + + /* + * Count up the 32-bit Linux system calls. Note that lx_sysent32 + * has (LX_NSYSCALLS + 1) entries. 
+ */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) + continue; + lx_nsysent32 = i; + +#if defined(_LP64) + /* + * Count up the 64-bit Linux system calls. Note that lx_sysent64 + * has (LX_NSYSCALLS + 1) entries. + */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) + continue; + lx_nsysent64 = i; +#endif +} + int _init(void) { int err = 0; + lx_syscall_init(); + /* pid/tid conversion hash tables */ lx_pid_init(); diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c index abb0ab6e63..ebe37a01c0 100644 --- a/usr/src/uts/common/brand/lx/os/lx_misc.c +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -88,7 +88,6 @@ lx_exec() * invalid; clear them. */ pd->l_handler = NULL; - pd->l_tracehandler = NULL; /* * There are two mutually exclusive special cases we need to @@ -118,12 +117,20 @@ lx_exec() * we are traced we can post either the PTRACE_EVENT_EXEC event or the * legacy SIGTRAP. */ - (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0); + (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0); /* clear the fsbase values until the app. can reinitialize them */ lwpd->br_lx_fsbase = NULL; lwpd->br_ntv_fsbase = NULL; + /* + * Clear the native stack flags. This will be reinitialised by + * lx_init() in the new process image. + */ + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + lwpd->br_ntv_stack = 0; + lwpd->br_ntv_stack_current = 0; + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, NULL); @@ -236,6 +243,11 @@ lx_freelwp(klwp_t *lwp) { struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + /* + * Remove our system call interposer. 
+ */ + lwp->lwp_brand_syscall = NULL; + if (lwpd != NULL) { (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, NULL); @@ -269,8 +281,7 @@ lx_initlwp(klwp_t *lwp) lwpd->br_clear_ctidp = NULL; lwpd->br_set_ctidp = NULL; lwpd->br_signal = 0; - lwpd->br_ntv_syscall = 1; - lwpd->br_scms = 1; + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; /* * lwpd->br_affinitymask was zeroed by kmem_zalloc() @@ -320,6 +331,11 @@ lx_initlwp(klwp_t *lwp) lx_ptrace_inherit_tracer(plwpd, lwpd); } + /* + * Install branded system call hook for this LWP: + */ + lwp->lwp_brand_syscall = lx_syscall_enter; + return (0); } @@ -339,6 +355,27 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) dst->br_ptid = lwptot(srclwp)->t_tid; bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls)); + switch (src->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * The parent LWP has an alternate stack installed. + * The child LWP should have the same stack base and extent. + */ + dst->br_stack_mode = src->br_stack_mode; + dst->br_ntv_stack = src->br_ntv_stack; + dst->br_ntv_stack_current = src->br_ntv_stack_current; + break; + + default: + /* + * Otherwise, clear the stack data for this LWP. + */ + dst->br_stack_mode = LX_STACK_MODE_PREINIT; + dst->br_ntv_stack = 0; + dst->br_ntv_stack_current = 0; + } + /* * copy only these flags */ @@ -436,7 +473,7 @@ lx_fixsegreg(greg_t sr, model_t datamodel) } /* - * Brand-specific function to convert the fsbase as pulled from the regsiter + * Brand-specific function to convert the fsbase as pulled from the register * into a native fsbase suitable for locating the ulwp_t from the kernel. 
*/ uintptr_t @@ -444,8 +481,10 @@ lx_fsbase(klwp_t *lwp, uintptr_t fsbase) { lx_lwp_data_t *lwpd = lwp->lwp_brand; - if (lwpd->br_ntv_syscall || lwpd->br_ntv_fsbase == NULL) + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND || + lwpd->br_ntv_fsbase == NULL) { return (fsbase); + } return (lwpd->br_ntv_fsbase); } diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c index 6e4b74531d..a97a1b6d43 100644 --- a/usr/src/uts/common/brand/lx/os/lx_ptrace.c +++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c @@ -485,7 +485,8 @@ lx_ptrace_restart_lwp(klwp_t *lwp) */ rlwpd->br_ptrace_whystop = 0; rlwpd->br_ptrace_whatstop = 0; - rlwpd->br_ptrace_flags &= ~LX_PTRACE_CLDPEND; + rlwpd->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND | + LX_PTRACE_WAITPEND); } thread_unlock(rt); } @@ -551,9 +552,8 @@ lx_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag, * so that it may be re-fetched on another call to waitid(). */ if (waitflag) { - remote->br_ptrace_whystop = 0; - remote->br_ptrace_whatstop = 0; - remote->br_ptrace_flags &= ~LX_PTRACE_CLDPEND; + remote->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND | + LX_PTRACE_WAITPEND); } } @@ -637,6 +637,7 @@ lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what) */ lwpd->br_ptrace_whystop = why; lwpd->br_ptrace_whatstop = what; + lwpd->br_ptrace_flags |= LX_PTRACE_WAITPEND; /* * If this event does not depend on an event from the parent LWP, @@ -805,6 +806,60 @@ lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp) return (error); } +static int +lx_ptrace_getregs(lx_lwp_data_t *remote, void *uregsp) +{ + if (remote->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The LWP was stopped with the brand stack and register + * state loaded, e.g. during a system call emulated within + * the kernel. Return the LWP register state. 
+ */ + return (lx_regs_to_userregs(remote, uregsp)); + } else if (remote->br_ptrace_stopucp != NULL) { + /* + * The LWP was stopped in the usermode emulation library + * but a ucontext_t for the preserved brand stack and + * register state was provided. Return the register state + * from that ucontext_t. + */ + return (lx_uc_to_userregs(remote, + (void *)remote->br_ptrace_stopucp, uregsp)); + } else { + /* + * The register state is not currently available. + */ + return (EIO); + } +} + +static int +lx_ptrace_setregs(lx_lwp_data_t *remote, void *uregsp) +{ + if (remote->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The LWP was stopped with the brand stack and register + * state loaded, e.g. during a system call emulated within + * the kernel. Write to the LWP register state. + */ + return (lx_userregs_to_regs(remote, uregsp)); + } else if (remote->br_ptrace_stopucp != NULL) { + /* + * The LWP was stopped in the usermode emulation library + * but a ucontext_t for the preserved brand stack and + * register state was provided. Write to the register state + * in that ucontext_t. + */ + return (lx_userregs_to_uc(remote, + (void *)remote->br_ptrace_stopucp, uregsp)); + } else { + /* + * The register state is not currently available. + */ + return (EIO); + } +} + /* * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface. */ @@ -907,7 +962,6 @@ static int lx_ptrace_attach(pid_t lx_pid) { int error = ESRCH; - int32_t one = 1; /* * Our (Tracer) LWP: */ @@ -1016,15 +1070,9 @@ lx_ptrace_attach(pid_t lx_pid) /* * Set the in-kernel process-wide ptrace(2) enable flag. - * Attempt also to write the usermode trace flag so that the - * process knows to enter the kernel for potential ptrace(2) - * syscall-stops. 
*/ rprocd = ttolxproc(rthr); rprocd->l_ptrace = 1; - mutex_exit(&rproc->p_lock); - (void) uwrite(rproc, &one, sizeof (one), rprocd->l_traceflag); - mutex_enter(&rproc->p_lock); error = 0; } @@ -1294,12 +1342,9 @@ lx_ptrace_traceme(void) /* * Set the in-kernel process-wide ptrace(2) enable - * flag. Attempt also to write the usermode trace flag - * so that the process knows to enter the kernel for - * potential ptrace(2) syscall-stops. + * flag. */ procd->l_ptrace = 1; - (void) suword32((void *)procd->l_traceflag, 1); return (0); } @@ -1360,6 +1405,7 @@ lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what) */ lwpd->br_ptrace_flags &= ~(LX_PTRACE_STOPPING | LX_PTRACE_STOPPED | LX_PTRACE_CLDPEND); + lwpd->br_ptrace_stopucp = NULL; cv_broadcast(&lx_ptrace_busy_cv); mutex_exit(&p->p_lock); @@ -1367,7 +1413,8 @@ lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what) } int -lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg) +lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg, + uintptr_t ucp) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); @@ -1453,6 +1500,12 @@ lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg) } /* + * Userland may have passed in a ucontext_t pointer for + * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped. + */ + lwpd->br_ptrace_stopucp = ucp; + + /* * p_lock for the process containing the tracee will be dropped by * lx_ptrace_stop_common(). */ @@ -1874,7 +1927,8 @@ lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp) continue; } - if (remote->br_ptrace_whystop == 0 || + if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) || + remote->br_ptrace_whystop == 0 || remote->br_ptrace_whatstop == 0) { /* * No (new) stop reason to post for this LWP. 
@@ -2041,7 +2095,8 @@ lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options, continue; } - if (remote->br_ptrace_whystop == 0 || + if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) || + remote->br_ptrace_whystop == 0 || remote->br_ptrace_whatstop == 0) { /* * No (new) stop reason to post for this LWP. @@ -2230,6 +2285,14 @@ lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) error = lx_ptrace_geteventmsg(remote, (void *)data); break; + case LX_PTRACE_GETREGS: + error = lx_ptrace_getregs(remote, (void *)data); + break; + + case LX_PTRACE_SETREGS: + error = lx_ptrace_setregs(remote, (void *)data); + break; + default: error = EINVAL; } diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c index d26f10d851..5c6e5b29f3 100644 --- a/usr/src/uts/common/brand/lx/os/lx_syscall.c +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/kmem.h> @@ -34,145 +34,1093 @@ #include <sys/modctl.h> #include <sys/cmn_err.h> #include <sys/model.h> +#include <sys/privregs.h> #include <sys/brand.h> #include <sys/machbrand.h> +#include <sys/sdt.h> #include <sys/lx_syscalls.h> #include <sys/lx_brand.h> #include <sys/lx_impl.h> #include <sys/lx_misc.h> + /* - * Some system calls return either a 32-bit or a 64-bit value, depending - * on the datamodel. + * Flags for sysent entries: */ -#ifdef _LP64 -#define V_RVAL SE_64RVAL -#else -#define V_RVAL SE_32RVAL1 -#endif +#define LX_SYS_NOSYS_REASON 0x07 +#define LX_SYS_EBPARG6 0x08 /* - * Define system calls that return a native 'long' quantity i.e. a 32-bit - * or 64-bit integer - depending on how the kernel is itself compiled - * e.g. read(2) returns 'ssize_t' in the kernel and in userland. 
+ * Flags that denote the specific reason we do not have a particular system + * call. These reasons are only valid if the function is NULL. */ -#define LX_CL(name, call, narg) \ - { V_RVAL, (name), (llfcn_t)(call), (narg) } +#define NOSYS_USERMODE 0 +#define NOSYS_NULL 1 +#define NOSYS_NONE 2 +#define NOSYS_NO_EQUIV 3 +#define NOSYS_KERNEL 4 +#define NOSYS_UNDOC 5 +#define NOSYS_OBSOLETE 6 +#define NOSYS_MAX NOSYS_OBSOLETE + +#if NOSYS_MAX > LX_SYS_NOSYS_REASON +#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON +#endif /* - * Returns a 32 bit quantity regardless of datamodel + * Strings describing the reason we do not emulate a particular system call + * in the kernel. */ -#define LX_CI(name, call, narg) \ - { SE_32RVAL1, (name), (llfcn_t)(call), (narg) } +static char *nosys_reasons[] = { + NULL, /* NOSYS_USERMODE means this call is emulated in usermode */ + "Not done yet", + "No such Linux system call", + "No equivalent illumos functionality", + "Reads/modifies Linux kernel state", + "Undocumented and/or rarely used system call", + "Unsupported, obsolete system call" +}; -#define LX_NOSYS(name) \ - {SE_64RVAL, (name), (llfcn_t)lx_nosys, 0} -typedef int64_t (*llfcn_t)(); +#if defined(_LP64) +/* + * System call handler table and entry count for Linux x86_64 (amd64): + */ +lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +int lx_nsysent64; +#endif +/* + * System call handler table and entry count for Linux x86 (i386): + */ +lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +int lx_nsysent32; /* - * In-Kernel Emulation table - * The entries in this table are NOT indexed by either of the Linux syscall - * numbers (32-bit or 64-bit). Instead, the entries are laid out linearly - * with the LX_EMUL_* defines uses to lookup the correct entry. + * Map Illumos errno to the Linux equivalent. 
*/ -typedef struct lx_ike { - int sy_flags; - char *sy_name; - llfcn_t sy_callc; - char sy_narg; -} lx_ike_t; - -static lx_ike_t lx_ike_ent[] = +int lx_stol_errno[] = LX_STOL_ERRNO_INIT; + +#if defined(__amd64) +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) { - LX_NOSYS("lx_nosys"), /* 0 */ - LX_CL("getpid", lx_getpid, 0), /* 1 */ - LX_CL("kill", lx_kill, 2), - LX_CL("pipe", lx_pipe, 1), - LX_CL("brk", lx_brk, 1), - LX_CL("getppid", lx_getppid, 0), - LX_CL("sysinfo", lx_sysinfo, 1), - LX_CL("clone", lx_clone, 5), - LX_CL("modify_ldt", lx_modify_ldt, 3), - LX_CL("sched_setparam", lx_sched_setparam, 2), - LX_CL("sched_getparam", lx_sched_getparam, 2), /* 10 */ - LX_CL("sched_rr_get_interval", lx_sched_rr_get_interval, 2), - LX_CL("setresuid16", lx_setresuid16, 3), - LX_CL("setresgid16", lx_setresgid16, 3), - LX_CL("rt_sigqueueinfo", lx_rt_sigqueueinfo, 3), - LX_CL("setgroups", lx_setgroups, 2), - LX_CL("setresuid", lx_setresuid, 3), - LX_CL("setresgid", lx_setresgid, 3), - LX_CL("gettid", lx_gettid, 0), - LX_CL("tkill", lx_tkill, 2), - LX_CL("futex", lx_futex, 6), /* 20 */ - LX_CL("set_thread_area", lx_set_thread_area, 1), - LX_CL("get_thread_area", lx_get_thread_area, 1), - LX_CL("set_tid_address", lx_set_tid_address, 1), - LX_CL("pipe2", lx_pipe2, 2), - LX_CL("rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 4), - LX_CL("arch_prctl", lx_arch_prctl, 2), - LX_CL("tgkill", lx_tgkill, 3), - LX_CL("read", lx_read, 3), - LX_CL("ioctl", lx_ioctl, 3), -}; + struct regs *rp = lwptoregs(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Note: Syscall argument passing is different from function + * call argument passing on amd64. For function calls, the + * fourth arg is passed via %rcx, but for system calls the 4th + * arg is passed via %r10. This is because in amd64, the + * syscall instruction puts the lower 32 bits of %rflags in + * %r11 and puts the %rip value to %rcx. 
+ * + * Appendix A of the amd64 ABI (Linux conventions) states that + * syscalls are limited to 6 args and no arg is passed on the + * stack. + */ + args[0] = rp->r_rdi; + args[1] = rp->r_rsi; + args[2] = rp->r_rdx; + args[3] = rp->r_r10; + args[4] = rp->r_r8; + args[5] = rp->r_r9; + } else { + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + uint32_t args32[6]; + + if (copyin((void *)rp->r_rbx, &args32, + sizeof (args32)) != 0) { + /* + * Clear the argument vector so that the + * trace probe does not expose kernel + * memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + + args[0] = args32[0]; + args[1] = args32[1]; + args[2] = args32[2]; + args[3] = args32[3]; + args[4] = args32[4]; + args[5] = args32[5]; + } else { + args[0] = rp->r_rbx; + args[1] = rp->r_rcx; + args[2] = rp->r_rdx; + args[3] = rp->r_rsi; + args[4] = rp->r_rdi; + args[5] = rp->r_rbp; + } + } + + return (0); +} -int64_t -lx_emulate_syscall(int num, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) +#else /* !__amd64 */ + +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) { - lx_ike_t *jsp; - int64_t rval; + struct regs *rp = lwptoregs(lwp); - rval = (int64_t)0; + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) != + 0) { + /* + * Clear the argument vector so that the trace probe + * does not expose kernel memory. 
+ */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + } else { + args[0] = rp->r_ebx; + args[1] = rp->r_ecx; + args[2] = rp->r_edx; + args[3] = rp->r_esi; + args[4] = rp->r_edi; + args[5] = rp->r_ebp; + } - jsp = &(lx_ike_ent[num]); + return (0); +} +#endif - switch (jsp->sy_narg) { - case 0: { - lx_print("--> %s()\n", jsp->sy_name); - rval = (int64_t)jsp->sy_callc(); - break; +int +lx_syscall_return(klwp_t *lwp, int syscall_num, long ret) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int error = lwp->lwp_errno; + + if (error != EINTR) { + /* + * If this system call was not interrupted, clear the system + * call restart flag before lx_setcontext() can pass it to + * usermode. + */ + lwpd->br_syscall_restart = B_FALSE; + } + + if (error != 0) { + /* + * Convert from illumos to Linux errno: + */ + if (error < 1 || error >= (sizeof (lx_stol_errno) / + sizeof (lx_stol_errno[0]))) { + /* + * The provided error number is not valid. + */ + error = EINVAL; + } + ret = -lx_stol_errno[error]; } - case 1: { - lx_print("--> %s(0x%lx)\n", jsp->sy_name, arg1); - rval = (int64_t)jsp->sy_callc(arg1); - break; + + /* + * 32-bit Linux system calls return via %eax; 64-bit calls return via + * %rax. + */ + rp->r_r0 = ret; + + /* + * Hold for the ptrace(2) "syscall-exit-stop" condition if required by + * PTRACE_SYSCALL. Note that the register state may be modified by + * tracer. + */ + lx_ptrace_stop(LX_PR_SYSEXIT); + + /* + * Fire the DTrace "lx-syscall:::return" probe: + */ + lx_trace_sysreturn(syscall_num, ret); + + /* + * Clear errno for next time. We do not clear "br_syscall_restart" or + * "br_syscall_num" as they are potentially used by "lx_savecontext()" + * in the signal delivery path. 
+ */ + lwp->lwp_errno = 0; + + /* + * We want complete control of the registers on return from this + * emulated Linux system call: + */ + lwp->lwp_eosys = JUSTRETURN; + curthread->t_post_sys = 1; + aston(curthread); + + return (0); +} + +static void +lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason) +{ + char buf[100]; + + if (s == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds", + syscall_num); + } else { + VERIFY(unsup_reason < (sizeof (nosys_reasons) / + sizeof (*nosys_reasons))); + + if (s->sy_name == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s", + syscall_num, nosys_reasons[unsup_reason]); + } else { + (void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s", + s->sy_name, nosys_reasons[unsup_reason]); + } } - case 2: { - lx_print("--> %s(0x%lx, 0x%lx)\n", jsp->sy_name, arg1, arg2); - rval = (int64_t)jsp->sy_callc(arg1, arg2); - break; + + lx_unsupported(buf); +} + +/* + * This function is used to override the processing of arguments and + * invocation of a handler for emulated system calls, installed on each + * branded LWP as "lwp_brand_syscall". If this system call should use the + * native path, we return 1. If we handled this system call (and have made + * arrangements with respect to post-return usermode register state) we + * return 0. + */ +int +lx_syscall_enter(void) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int syscall_num; + int error; + long ret = 0; + lx_sysent_t *s; + uintptr_t args[6]; + unsigned int unsup_reason; + + /* + * If we got here, we should have an LWP-specific brand data + * structure. + */ + VERIFY(lwpd != NULL); + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * The lwp is not in in BRAND execution mode, so we return + * to the regular native system call path. 
+ */ + DTRACE_PROBE(brand__lx__syscall__hook__skip); + return (1); } - case 3: { - lx_print("--> %s(0x%lx, 0x%lx, 0x%lx)\n", - jsp->sy_name, arg1, arg2, arg3); - rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3); - break; + + /* + * Clear the restartable system call flag. This flag will be set + * on in the system call handler if the call is a candidate for + * a restart. It will be saved by lx_setcontext() in the event + * that we take a signal, and used in the signal handling path + * to restart the system call iff SA_RESTART was set for this + * signal. Save the system call number so that we can store it + * in the saved context if required. + */ + lwpd->br_syscall_restart = B_FALSE; + lwpd->br_syscall_num = (int)rp->r_r0; + + /* + * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by + * PTRACE_SYSCALL. The system call number and arguments may be + * modified by the tracer. + */ + lx_ptrace_stop(LX_PR_SYSENTRY); + + /* + * Check that the system call number is within the bounds we expect. 
+ */ + syscall_num = lwpd->br_syscall_num; + if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) { + lx_syscall_unsup_msg(NULL, syscall_num, 0); + + set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); } - case 4: { - lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx)\n", - jsp->sy_name, arg1, arg2, arg3, arg4); - rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4); - break; + +#if defined(_LP64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + s = &lx_sysent64[syscall_num]; + } else +#endif + { + s = &lx_sysent32[syscall_num]; } - case 5: { - lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx)\n", - jsp->sy_name, arg1, arg2, arg3, arg4, arg5); - rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5); - break; + + /* + * Process the arguments for this system call and fire the DTrace + * "lx-syscall:::entry" probe: + */ + error = lx_emulate_args(lwp, s, args); + lx_trace_sysenter(syscall_num, args); + if (error != 0) { + /* + * Could not read and process the arguments. Return the error + * to the process. + */ + set_errno(error); + lx_syscall_return(lwp, syscall_num, -1); + return (0); } - case 6: { - lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx," - " 0x%lx, 0x%lx)\n", - jsp->sy_name, arg1, arg2, arg3, arg4, arg5, arg6); - rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5, - arg6); - break; + + if (s->sy_callc != NULL) { + /* + * Call the in-kernel handler for this Linux system call: + */ + ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4], + args[5]); + lx_syscall_return(lwp, syscall_num, ret); + return (0); } + + /* + * There is no in-kernel handler. + */ + switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) { + case NOSYS_USERMODE: + /* + * Pass to the usermode emulation routine. 
+ */ +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(lwp, syscall_num, args); + } else +#endif + { + lx_emulate_user(lwp, syscall_num, args); + } + return (0); + default: - panic("Invalid IKE entry: #%d at 0x%p\n", num, (void *)jsp); + /* + * We are not emulating this system call at all. + */ + lx_syscall_unsup_msg(s, syscall_num, unsup_reason); + + set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); } - lx_print("----------> return (0x%llx)\n", (long long)rval); - return (rval); } + +/* + * Linux defines system call numbers for 32-bit x86 in the file: + * arch/x86/syscalls/syscall_32.tbl + */ +lx_sysent_t lx_sysent32[] = { + {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ + {"exit", NULL, 0, 1}, /* 1 */ + {"fork", NULL, 0, 0}, /* 2 */ + {"read", lx_read, 0, 3}, /* 3 */ + {"write", lx_write, 0, 3}, /* 4 */ + {"open", NULL, 0, 3}, /* 5 */ + {"close", NULL, 0, 1}, /* 6 */ + {"waitpid", lx_waitpid, 0, 3}, /* 7 */ + {"creat", NULL, 0, 2}, /* 8 */ + {"link", NULL, 0, 2}, /* 9 */ + {"unlink", NULL, 0, 1}, /* 10 */ + {"execve", NULL, 0, 3}, /* 11 */ + {"chdir", NULL, 0, 1}, /* 12 */ + {"time", NULL, 0, 1}, /* 13 */ + {"mknod", NULL, 0, 3}, /* 14 */ + {"chmod", NULL, 0, 2}, /* 15 */ + {"lchown16", NULL, 0, 3}, /* 16 */ + {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ + {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ + {"lseek", NULL, 0, 3}, /* 19 */ + {"getpid", lx_getpid, 0, 0}, /* 20 */ + {"mount", NULL, 0, 5}, /* 21 */ + {"umount", NULL, 0, 1}, /* 22 */ + {"setuid16", NULL, 0, 1}, /* 23 */ + {"getuid16", NULL, 0, 0}, /* 24 */ + {"stime", NULL, 0, 1}, /* 25 */ + {"ptrace", NULL, 0, 4}, /* 26 */ + {"alarm", NULL, 0, 1}, /* 27 */ + {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ + {"pause", NULL, 0, 0}, /* 29 */ + {"utime", NULL, 0, 2}, /* 30 */ + {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ + {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ + {"access", NULL, 0, 2}, /* 33 */ + {"nice", NULL, 0, 1}, /* 34 */ + {"ftime", NULL, 
NOSYS_OBSOLETE, 0}, /* 35 */ + {"sync", NULL, 0, 0}, /* 36 */ + {"kill", lx_kill, 0, 2}, /* 37 */ + {"rename", NULL, 0, 2}, /* 38 */ + {"mkdir", NULL, 0, 2}, /* 39 */ + {"rmdir", NULL, 0, 1}, /* 40 */ + {"dup", NULL, 0, 1}, /* 41 */ + {"pipe", lx_pipe, 0, 1}, /* 42 */ + {"times", NULL, 0, 1}, /* 43 */ + {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ + {"brk", lx_brk, 0, 1}, /* 45 */ + {"setgid16", NULL, 0, 1}, /* 46 */ + {"getgid16", NULL, 0, 0}, /* 47 */ + {"signal", NULL, 0, 2}, /* 48 */ + {"geteuid16", NULL, 0, 0}, /* 49 */ + {"getegid16", NULL, 0, 0}, /* 50 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */ + {"umount2", NULL, 0, 2}, /* 52 */ + {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ + {"ioctl", lx_ioctl, 0, 3}, /* 54 */ + {"fcntl", NULL, 0, 3}, /* 55 */ + {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ + {"setpgid", NULL, 0, 2}, /* 57 */ + {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ + {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ + {"umask", NULL, 0, 1}, /* 60 */ + {"chroot", NULL, 0, 1}, /* 61 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ + {"dup2", NULL, 0, 2}, /* 63 */ + {"getppid", lx_getppid, 0, 0}, /* 64 */ + {"getpgrp", NULL, 0, 0}, /* 65 */ + {"setsid", NULL, 0, 0}, /* 66 */ + {"sigaction", NULL, 0, 3}, /* 67 */ + {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ + {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ + {"setreuid16", NULL, 0, 2}, /* 70 */ + {"setregid16", NULL, 0, 2}, /* 71 */ + {"sigsuspend", NULL, 0, 1}, /* 72 */ + {"sigpending", NULL, 0, 1}, /* 73 */ + {"sethostname", NULL, 0, 2}, /* 74 */ + {"setrlimit", NULL, 0, 2}, /* 75 */ + {"getrlimit", NULL, 0, 2}, /* 76 */ + {"getrusage", NULL, 0, 2}, /* 77 */ + {"gettimeofday", NULL, 0, 2}, /* 78 */ + {"settimeofday", NULL, 0, 2}, /* 79 */ + {"getgroups16", NULL, 0, 2}, /* 80 */ + {"setgroups16", NULL, 0, 2}, /* 81 */ + {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */ + {"symlink", NULL, 0, 2}, /* 83 */ + {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ + {"readlink", NULL, 0, 3}, /* 85 */ + 
{"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */ + {"reboot", NULL, 0, 4}, /* 88 */ + {"readdir", NULL, 0, 3}, /* 89 */ + {"mmap", NULL, 0, 6}, /* 90 */ + {"munmap", NULL, 0, 2}, /* 91 */ + {"truncate", NULL, 0, 2}, /* 92 */ + {"ftruncate", NULL, 0, 2}, /* 93 */ + {"fchmod", NULL, 0, 2}, /* 94 */ + {"fchown16", NULL, 0, 3}, /* 95 */ + {"getpriority", NULL, 0, 2}, /* 96 */ + {"setpriority", NULL, 0, 3}, /* 97 */ + {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ + {"statfs", NULL, 0, 2}, /* 99 */ + {"fstatfs", NULL, 0, 2}, /* 100 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ + {"socketcall", NULL, 0, 2}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"setitimer", NULL, 0, 3}, /* 104 */ + {"getitimer", NULL, 0, 2}, /* 105 */ + {"stat", NULL, 0, 2}, /* 106 */ + {"lstat", NULL, 0, 2}, /* 107 */ + {"fstat", NULL, 0, 2}, /* 108 */ + {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ + {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ + {"vhangup", NULL, 0, 0}, /* 111 */ + {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ + {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ + {"wait4", lx_wait4, 0, 4}, /* 114 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */ + {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ + {"ipc", NULL, 0, 5}, /* 117 */ + {"fsync", NULL, 0, 1}, /* 118 */ + {"sigreturn", NULL, 0, 1}, /* 119 */ + {"clone", NULL, 0, 5}, /* 120 */ + {"setdomainname", NULL, 0, 2}, /* 121 */ + {"uname", NULL, 0, 1}, /* 122 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */ + {"adjtimex", NULL, 0, 1}, /* 124 */ + {"mprotect", NULL, 0, 3}, /* 125 */ + {"sigprocmask", NULL, 0, 3}, /* 126 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ + {"getpgid", NULL, 0, 1}, /* 132 */ + {"fchdir", NULL, 0, 1}, /* 133 */ + {"bdflush", NULL, 
NOSYS_KERNEL, 0}, /* 134 */ + {"sysfs", NULL, 0, 3}, /* 135 */ + {"personality", NULL, 0, 1}, /* 136 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ + {"setfsuid16", NULL, 0, 1}, /* 138 */ + {"setfsgid16", NULL, 0, 1}, /* 139 */ + {"llseek", NULL, 0, 5}, /* 140 */ + {"getdents", NULL, 0, 3}, /* 141 */ + {"select", NULL, 0, 5}, /* 142 */ + {"flock", NULL, 0, 2}, /* 143 */ + {"msync", NULL, 0, 3}, /* 144 */ + {"readv", NULL, 0, 3}, /* 145 */ + {"writev", NULL, 0, 3}, /* 146 */ + {"getsid", NULL, 0, 1}, /* 147 */ + {"fdatasync", NULL, 0, 1}, /* 148 */ + {"sysctl", NULL, 0, 1}, /* 149 */ + {"mlock", NULL, 0, 2}, /* 150 */ + {"munlock", NULL, 0, 2}, /* 151 */ + {"mlockall", NULL, 0, 1}, /* 152 */ + {"munlockall", NULL, 0, 0}, /* 153 */ + {"sched_setparam", NULL, 0, 2}, /* 154 */ + {"sched_getparam", NULL, 0, 2}, /* 155 */ + {"sched_setscheduler", NULL, 0, 3}, /* 156 */ + {"sched_getscheduler", NULL, 0, 1}, /* 157 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 159 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 160 */ + {"sched_rr_get_interval", NULL, 0, 2}, /* 161 */ + {"nanosleep", NULL, 0, 2}, /* 162 */ + {"mremap", NULL, 0, 5}, /* 163 */ + {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */ + {"getresuid16", NULL, 0, 3}, /* 165 */ + {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ + {"query_module", NULL, 0, 5}, /* 167 */ + {"poll", NULL, 0, 3}, /* 168 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ + {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */ + {"getresgid16", NULL, 0, 3}, /* 171 */ + {"prctl", NULL, 0, 5}, /* 172 */ + {"rt_sigreturn", NULL, 0, 0}, /* 173 */ + {"rt_sigaction", NULL, 0, 4}, /* 174 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 175 */ + {"rt_sigpending", NULL, 0, 2}, /* 176 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 179 */ + {"pread64", NULL, 0, 5}, /* 180 */ + {"pwrite64", NULL, 0, 5}, /* 181 */ + {"chown16", 
NULL, 0, 3}, /* 182 */ + {"getcwd", NULL, 0, 2}, /* 183 */ + {"capget", NULL, 0, 2}, /* 184 */ + {"capset", NULL, 0, 2}, /* 185 */ + {"sigaltstack", NULL, 0, 2}, /* 186 */ + {"sendfile", NULL, 0, 4}, /* 187 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ + {"vfork", NULL, 0, 0}, /* 190 */ + {"getrlimit", NULL, 0, 2}, /* 191 */ + {"mmap2", NULL, LX_SYS_EBPARG6, 6}, /* 192 */ + {"truncate64", NULL, 0, 3}, /* 193 */ + {"ftruncate64", NULL, 0, 3}, /* 194 */ + {"stat64", NULL, 0, 2}, /* 195 */ + {"lstat64", NULL, 0, 2}, /* 196 */ + {"fstat64", NULL, 0, 2}, /* 197 */ + {"lchown", NULL, 0, 3}, /* 198 */ + {"getuid", NULL, 0, 0}, /* 199 */ + {"getgid", NULL, 0, 0}, /* 200 */ + {"geteuid", NULL, 0, 0}, /* 201 */ + {"getegid", NULL, 0, 0}, /* 202 */ + {"setreuid", NULL, 0, 0}, /* 203 */ + {"setregid", NULL, 0, 0}, /* 204 */ + {"getgroups", NULL, 0, 2}, /* 205 */ + {"setgroups", NULL, 0, 2}, /* 206 */ + {"fchown", NULL, 0, 3}, /* 207 */ + {"setresuid", lx_setresuid, 0, 3}, /* 208 */ + {"getresuid", NULL, 0, 3}, /* 209 */ + {"setresgid", lx_setresgid, 0, 3}, /* 210 */ + {"getresgid", NULL, 0, 3}, /* 211 */ + {"chown", NULL, 0, 3}, /* 212 */ + {"setuid", NULL, 0, 1}, /* 213 */ + {"setgid", NULL, 0, 1}, /* 214 */ + {"setfsuid", NULL, 0, 1}, /* 215 */ + {"setfsgid", NULL, 0, 1}, /* 216 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ + {"mincore", NULL, 0, 3}, /* 218 */ + {"madvise", NULL, 0, 3}, /* 219 */ + {"getdents64", NULL, 0, 3}, /* 220 */ + {"fcntl64", NULL, 0, 3}, /* 221 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ + {"gettid", lx_gettid, 0, 0}, /* 224 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ + {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 226 */ + {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 227 */ + {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 228 */ + {"getxattr", lx_xattr, 0, 4}, /* 229 */ + {"lgetxattr", lx_xattr, 0, 4}, /* 230 */ + {"fgetxattr", 
lx_xattr, 0, 4}, /* 231 */ + {"listxattr", lx_xattr, 0, 3}, /* 232 */ + {"llistxattr", lx_xattr, 0, 3}, /* 233 */ + {"flistxattr", lx_xattr, 0, 3}, /* 234 */ + {"removexattr", lx_xattr, 0, 2}, /* 235 */ + {"lremovexattr", lx_xattr, 0, 2}, /* 236 */ + {"fremovexattr", lx_xattr, 0, 2}, /* 237 */ + {"tkill", lx_tkill, 0, 2}, /* 238 */ + {"sendfile64", NULL, 0, 4}, /* 239 */ + {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */ + {"sched_setaffinity", NULL, 0, 3}, /* 241 */ + {"sched_getaffinity", NULL, 0, 3}, /* 242 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */ + {"io_setup", NULL, NOSYS_NO_EQUIV, 0}, /* 245 */ + {"io_destroy", NULL, NOSYS_NO_EQUIV, 0}, /* 246 */ + {"io_getevents", NULL, NOSYS_NO_EQUIV, 0}, /* 247 */ + {"io_submit", NULL, NOSYS_NO_EQUIV, 0}, /* 248 */ + {"io_cancel", NULL, NOSYS_NO_EQUIV, 0}, /* 249 */ + {"fadvise64", NULL, 0, 4}, /* 250 */ + {"nosys", NULL, 0, 0}, /* 251 */ + {"group_exit", NULL, 0, 1}, /* 252 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ + {"epoll_create", NULL, 0, 1}, /* 254 */ + {"epoll_ctl", NULL, 0, 4}, /* 255 */ + {"epoll_wait", NULL, 0, 4}, /* 256 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */ + {"timer_create", NULL, 0, 3}, /* 259 */ + {"timer_settime", NULL, 0, 4}, /* 260 */ + {"timer_gettime", NULL, 0, 2}, /* 261 */ + {"timer_getoverrun", NULL, 0, 1}, /* 262 */ + {"timer_delete", NULL, 0, 1}, /* 263 */ + {"clock_settime", NULL, 0, 2}, /* 264 */ + {"clock_gettime", NULL, 0, 2}, /* 265 */ + {"clock_getres", NULL, 0, 2}, /* 266 */ + {"clock_nanosleep", NULL, 0, 4}, /* 267 */ + {"statfs64", NULL, 0, 2}, /* 268 */ + {"fstatfs64", NULL, 0, 2}, /* 269 */ + {"tgkill", lx_tgkill, 0, 3}, /* 270 */ + +/* + * The following system calls only exist in kernel 2.6 and greater: + */ + {"utimes", NULL, 0, 2}, /* 271 */ + {"fadvise64_64", NULL, 0, 4}, /* 272 */ + {"vserver", NULL, 
NOSYS_NULL, 0}, /* 273 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ + {"waitid", lx_waitid, 0, 4}, /* 284 */ + {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ + {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 289 */ + {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 290 */ + {"inotify_init", NULL, 0, 0}, /* 291 */ + {"inotify_add_watch", NULL, 0, 3}, /* 292 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 293 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ + {"openat", NULL, 0, 4}, /* 295 */ + {"mkdirat", NULL, 0, 3}, /* 296 */ + {"mknodat", NULL, 0, 4}, /* 297 */ + {"fchownat", NULL, 0, 5}, /* 298 */ + {"futimesat", NULL, 0, 3}, /* 299 */ + {"fstatat64", NULL, 0, 4}, /* 300 */ + {"unlinkat", NULL, 0, 3}, /* 301 */ + {"renameat", NULL, 0, 4}, /* 302 */ + {"linkat", NULL, 0, 5}, /* 303 */ + {"symlinkat", NULL, 0, 3}, /* 304 */ + {"readlinkat", NULL, 0, 4}, /* 305 */ + {"fchmodat", NULL, 0, 4}, /* 306 */ + {"faccessat", NULL, 0, 4}, /* 307 */ + {"pselect6", NULL, LX_SYS_EBPARG6, 6}, /* 308 */ + {"ppoll", NULL, 0, 5}, /* 309 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */ + {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 311 */ + {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 312 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 314 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ + {"move_pages", NULL, NOSYS_NULL, 
0}, /* 317 */ + {"getcpu", NULL, 0, 3}, /* 318 */ + {"epoll_pwait", NULL, 0, 5}, /* 319 */ + {"utimensat", NULL, 0, 4}, /* 320 */ + {"signalfd", NULL, NOSYS_NULL, 0}, /* 321 */ + {"timerfd_create", NULL, 0, 2}, /* 322 */ + {"eventfd", NULL, 0, 1}, /* 323 */ + {"fallocate", NULL, NOSYS_NULL, 0}, /* 324 */ + {"timerfd_settime", NULL, 0, 4}, /* 325 */ + {"timerfd_gettime", NULL, 0, 2}, /* 326 */ + {"signalfd4", NULL, NOSYS_NULL, 0}, /* 327 */ + {"eventfd2", NULL, 0, 2}, /* 328 */ + {"epoll_create1", NULL, 0, 1}, /* 329 */ + {"dup3", NULL, 0, 3}, /* 330 */ + {"pipe2", lx_pipe2, 0, 2}, /* 331 */ + {"inotify_init1", NULL, 0, 1}, /* 332 */ + {"preadv", NULL, NOSYS_NULL, 0}, /* 333 */ + {"pwritev", NULL, NOSYS_NULL, 0}, /* 334 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ + {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 337 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ + {"prlimit64", NULL, 0, 4}, /* 340 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ + {"syncfs", NULL, NOSYS_NULL, 0}, /* 344 */ + {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ + {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */ + {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */ + {"getrandom", NULL, NOSYS_NULL, 0}, /* 355 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */ +}; + +#if defined(_LP64) +/* + * Linux defines system call numbers for 64-bit x86 in 
the file: + * arch/x86/syscalls/syscall_64.tbl + */ +lx_sysent_t lx_sysent64[] = { + {"read", lx_read, 0, 3}, /* 0 */ + {"write", lx_write, 0, 3}, /* 1 */ + {"open", NULL, 0, 3}, /* 2 */ + {"close", NULL, 0, 1}, /* 3 */ + {"stat", NULL, 0, 2}, /* 4 */ + {"fstat", NULL, 0, 2}, /* 5 */ + {"lstat", NULL, 0, 2}, /* 6 */ + {"poll", NULL, 0, 3}, /* 7 */ + {"lseek", NULL, 0, 3}, /* 8 */ + {"mmap", NULL, 0, 6}, /* 9 */ + {"mprotect", NULL, 0, 3}, /* 10 */ + {"munmap", NULL, 0, 2}, /* 11 */ + {"brk", lx_brk, 0, 1}, /* 12 */ + {"rt_sigaction", NULL, 0, 4}, /* 13 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 14 */ + {"rt_sigreturn", NULL, 0, 0}, /* 15 */ + {"ioctl", lx_ioctl, 0, 3}, /* 16 */ + {"pread64", NULL, 0, 4}, /* 17 */ + {"pwrite64", NULL, 0, 4}, /* 18 */ + {"readv", NULL, 0, 3}, /* 19 */ + {"writev", NULL, 0, 3}, /* 20 */ + {"access", NULL, 0, 2}, /* 21 */ + {"pipe", lx_pipe, 0, 1}, /* 22 */ + {"select", NULL, 0, 5}, /* 23 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */ + {"mremap", NULL, 0, 5}, /* 25 */ + {"msync", NULL, 0, 3}, /* 26 */ + {"mincore", NULL, 0, 3}, /* 27 */ + {"madvise", NULL, 0, 3}, /* 28 */ + {"shmget", NULL, 0, 3}, /* 29 */ + {"shmat", NULL, 0, 4}, /* 30 */ + {"shmctl", NULL, 0, 3}, /* 31 */ + {"dup", NULL, 0, 1}, /* 32 */ + {"dup2", NULL, 0, 2}, /* 33 */ + {"pause", NULL, 0, 0}, /* 34 */ + {"nanosleep", NULL, 0, 2}, /* 35 */ + {"getitimer", NULL, 0, 2}, /* 36 */ + {"alarm", NULL, 0, 1}, /* 37 */ + {"setitimer", NULL, 0, 3}, /* 38 */ + {"getpid", lx_getpid, 0, 0}, /* 39 */ + {"sendfile", NULL, 0, 4}, /* 40 */ + {"socket", NULL, 0, 3}, /* 41 */ + {"connect", NULL, 0, 3}, /* 42 */ + {"accept", NULL, 0, 3}, /* 43 */ + {"sendto", NULL, 0, 6}, /* 44 */ + {"recvfrom", NULL, 0, 6}, /* 45 */ + {"sendmsg", NULL, 0, 3}, /* 46 */ + {"recvmsg", NULL, 0, 3}, /* 47 */ + {"shutdown", NULL, 0, 2}, /* 48 */ + {"bind", NULL, 0, 3}, /* 49 */ + {"listen", NULL, 0, 2}, /* 50 */ + {"getsockname", NULL, 0, 3}, /* 51 */ + {"getpeername", NULL, 0, 3}, /* 52 */ + 
{"socketpair", NULL, 0, 4}, /* 53 */ + {"setsockopt", NULL, 0, 5}, /* 54 */ + {"getsockopt", NULL, 0, 5}, /* 55 */ + {"clone", NULL, 0, 5}, /* 56 */ + {"fork", NULL, 0, 0}, /* 57 */ + {"vfork", NULL, 0, 0}, /* 58 */ + {"execve", NULL, 0, 3}, /* 59 */ + {"exit", NULL, 0, 1}, /* 60 */ + {"wait4", lx_wait4, 0, 4}, /* 61 */ + {"kill", lx_kill, 0, 2}, /* 62 */ + {"uname", NULL, 0, 1}, /* 63 */ + {"semget", NULL, 0, 3}, /* 64 */ + {"semop", NULL, 0, 3}, /* 65 */ + {"semctl", NULL, 0, 4}, /* 66 */ + {"shmdt", NULL, 0, 1}, /* 67 */ + {"msgget", NULL, 0, 2}, /* 68 */ + {"msgsnd", NULL, 0, 4}, /* 69 */ + {"msgrcv", NULL, 0, 5}, /* 70 */ + {"msgctl", NULL, 0, 3}, /* 71 */ + {"fcntl", NULL, 0, 3}, /* 72 */ + {"flock", NULL, 0, 2}, /* 73 */ + {"fsync", NULL, 0, 1}, /* 74 */ + {"fdatasync", NULL, 0, 1}, /* 75 */ + {"truncate", NULL, 0, 2}, /* 76 */ + {"ftruncate", NULL, 0, 2}, /* 77 */ + {"getdents", NULL, 0, 3}, /* 78 */ + {"getcwd", NULL, 0, 2}, /* 79 */ + {"chdir", NULL, 0, 1}, /* 80 */ + {"fchdir", NULL, 0, 1}, /* 81 */ + {"rename", NULL, 0, 2}, /* 82 */ + {"mkdir", NULL, 0, 2}, /* 83 */ + {"rmdir", NULL, 0, 1}, /* 84 */ + {"creat", NULL, 0, 2}, /* 85 */ + {"link", NULL, 0, 2}, /* 86 */ + {"unlink", NULL, 0, 1}, /* 87 */ + {"symlink", NULL, 0, 2}, /* 88 */ + {"readlink", NULL, 0, 3}, /* 89 */ + {"chmod", NULL, 0, 2}, /* 90 */ + {"fchmod", NULL, 0, 2}, /* 91 */ + {"chown", NULL, 0, 3}, /* 92 */ + {"fchown", NULL, 0, 3}, /* 93 */ + {"lchown", NULL, 0, 3}, /* 94 */ + {"umask", NULL, 0, 1}, /* 95 */ + {"gettimeofday", NULL, 0, 2}, /* 96 */ + {"getrlimit", NULL, 0, 2}, /* 97 */ + {"getrusage", NULL, 0, 2}, /* 98 */ + {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */ + {"times", NULL, 0, 1}, /* 100 */ + {"ptrace", NULL, 0, 4}, /* 101 */ + {"getuid", NULL, 0, 0}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"getgid", NULL, 0, 0}, /* 104 */ + {"setuid", NULL, 0, 1}, /* 105 */ + {"setgid", NULL, 0, 1}, /* 106 */ + {"geteuid", NULL, 0, 0}, /* 107 */ + {"getegid", NULL, 0, 0}, /* 108 */ + 
{"setpgid", NULL, 0, 2}, /* 109 */ + {"getppid", lx_getppid, 0, 0}, /* 110 */ + {"getpgrp", NULL, 0, 0}, /* 111 */ + {"setsid", NULL, 0, 0}, /* 112 */ + {"setreuid", NULL, 0, 0}, /* 113 */ + {"setregid", NULL, 0, 0}, /* 114 */ + {"getgroups", NULL, 0, 2}, /* 115 */ + {"setgroups", NULL, 0, 2}, /* 116 */ + {"setresuid", lx_setresuid, 0, 3}, /* 117 */ + {"getresuid", NULL, 0, 3}, /* 118 */ + {"setresgid", lx_setresgid, 0, 3}, /* 119 */ + {"getresgid", NULL, 0, 3}, /* 120 */ + {"getpgid", NULL, 0, 1}, /* 121 */ + {"setfsuid", NULL, 0, 1}, /* 122 */ + {"setfsgid", NULL, 0, 1}, /* 123 */ + {"getsid", NULL, 0, 1}, /* 124 */ + {"capget", NULL, 0, 2}, /* 125 */ + {"capset", NULL, 0, 2}, /* 126 */ + {"rt_sigpending", NULL, 0, 2}, /* 127 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 130 */ + {"sigaltstack", NULL, 0, 2}, /* 131 */ + {"utime", NULL, 0, 2}, /* 132 */ + {"mknod", NULL, 0, 3}, /* 133 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"personality", NULL, 0, 1}, /* 135 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ + {"statfs", NULL, 0, 2}, /* 137 */ + {"fstatfs", NULL, 0, 2}, /* 138 */ + {"sysfs", NULL, 0, 3}, /* 139 */ + {"getpriority", NULL, 0, 2}, /* 140 */ + {"setpriority", NULL, 0, 3}, /* 141 */ + {"sched_setparam", NULL, 0, 2}, /* 142 */ + {"sched_getparam", NULL, 0, 2}, /* 143 */ + {"sched_setscheduler", NULL, 0, 3}, /* 144 */ + {"sched_getscheduler", NULL, 0, 1}, /* 145 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 146 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 147 */ + {"sched_rr_get_interval", NULL, 0, 2}, /* 148 */ + {"mlock", NULL, 0, 2}, /* 149 */ + {"munlock", NULL, 0, 2}, /* 150 */ + {"mlockall", NULL, 0, 1}, /* 151 */ + {"munlockall", NULL, 0, 0}, /* 152 */ + {"vhangup", NULL, 0, 0}, /* 153 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ + {"sysctl", NULL, 0, 1}, /* 156 */ + {"prctl", NULL, 0, 5}, 
/* 157 */ + {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ + {"adjtimex", NULL, 0, 1}, /* 159 */ + {"setrlimit", NULL, 0, 2}, /* 160 */ + {"chroot", NULL, 0, 1}, /* 161 */ + {"sync", NULL, 0, 0}, /* 162 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */ + {"settimeofday", NULL, 0, 2}, /* 164 */ + {"mount", NULL, 0, 5}, /* 165 */ + {"umount2", NULL, 0, 2}, /* 166 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */ + {"reboot", NULL, 0, 4}, /* 169 */ + {"sethostname", NULL, 0, 2}, /* 170 */ + {"setdomainname", NULL, 0, 2}, /* 171 */ + {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ + {"query_module", NULL, 0, 5}, /* 178 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ + {"gettid", lx_gettid, 0, 0}, /* 186 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ + {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 188 */ + {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 189 */ + {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 190 */ + {"getxattr", lx_xattr, 0, 4}, /* 191 */ + {"lgetxattr", lx_xattr, 0, 4}, /* 192 */ + {"fgetxattr", lx_xattr, 0, 4}, /* 193 */ + {"listxattr", lx_xattr, 0, 3}, /* 194 */ + {"llistxattr", lx_xattr, 0, 3}, /* 195 */ + {"flistxattr", lx_xattr, 0, 3}, /* 196 */ + {"removexattr", lx_xattr, 0, 2}, /* 197 */ + {"lremovexattr", lx_xattr, 0, 2}, /* 198 */ + {"fremovexattr", lx_xattr, 0, 2}, /* 199 */ + {"tkill", lx_tkill, 0, 2}, /* 200 */ + {"time", NULL, 0, 1}, /* 201 
*/ + {"futex", lx_futex, 0, 6}, /* 202 */ + {"sched_setaffinity", NULL, 0, 3}, /* 203 */ + {"sched_getaffinity", NULL, 0, 3}, /* 204 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */ + {"io_setup", NULL, NOSYS_NO_EQUIV, 0}, /* 206 */ + {"io_destroy", NULL, NOSYS_NO_EQUIV, 0}, /* 207 */ + {"io_getevents", NULL, NOSYS_NO_EQUIV, 0}, /* 208 */ + {"io_submit", NULL, NOSYS_NO_EQUIV, 0}, /* 209 */ + {"io_cancel", NULL, NOSYS_NO_EQUIV, 0}, /* 210 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ + {"epoll_create", NULL, 0, 1}, /* 213 */ + {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ + {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ + {"getdents64", NULL, 0, 3}, /* 217 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */ + {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ + {"semtimedop", NULL, 0, 4}, /* 220 */ + {"fadvise64", NULL, 0, 4}, /* 221 */ + {"timer_create", NULL, 0, 3}, /* 222 */ + {"timer_settime", NULL, 0, 4}, /* 223 */ + {"timer_gettime", NULL, 0, 2}, /* 224 */ + {"timer_getoverrun", NULL, 0, 1}, /* 225 */ + {"timer_delete", NULL, 0, 1}, /* 226 */ + {"clock_settime", NULL, 0, 2}, /* 227 */ + {"clock_gettime", NULL, 0, 2}, /* 228 */ + {"clock_getres", NULL, 0, 2}, /* 229 */ + {"clock_nanosleep", NULL, 0, 4}, /* 230 */ + {"exit_group", NULL, 0, 1}, /* 231 */ + {"epoll_wait", NULL, 0, 4}, /* 232 */ + {"epoll_ctl", NULL, 0, 4}, /* 233 */ + {"tgkill", lx_tgkill, 0, 3}, /* 234 */ + {"utimes", NULL, 0, 2}, /* 235 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ + 
{"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ + {"waitid", lx_waitid, 0, 4}, /* 247 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ + {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 251 */ + {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 252 */ + {"inotify_init", NULL, 0, 0}, /* 253 */ + {"inotify_add_watch", NULL, 0, 3}, /* 254 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 255 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ + {"openat", NULL, 0, 4}, /* 257 */ + {"mkdirat", NULL, 0, 3}, /* 258 */ + {"mknodat", NULL, 0, 4}, /* 259 */ + {"fchownat", NULL, 0, 5}, /* 260 */ + {"futimesat", NULL, 0, 3}, /* 261 */ + {"fstatat64", NULL, 0, 4}, /* 262 */ + {"unlinkat", NULL, 0, 3}, /* 263 */ + {"renameat", NULL, 0, 4}, /* 264 */ + {"linkat", NULL, 0, 5}, /* 265 */ + {"symlinkat", NULL, 0, 3}, /* 266 */ + {"readlinkat", NULL, 0, 4}, /* 267 */ + {"fchmodat", NULL, 0, 4}, /* 268 */ + {"faccessat", NULL, 0, 4}, /* 269 */ + {"pselect6", NULL, 0, 6}, /* 270 */ + {"ppoll", NULL, 0, 5}, /* 271 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */ + {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 273 */ + {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 274 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 275 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ + {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 277 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */ + {"utimensat", NULL, 0, 4}, /* 280 */ + {"epoll_pwait", NULL, 0, 5}, /* 281 */ + {"signalfd", NULL, NOSYS_NULL, 0}, /* 282 */ + {"timerfd_create", NULL, 0, 2}, /* 283 */ + {"eventfd", NULL, 0, 1}, /* 284 */ + {"fallocate", NULL, NOSYS_NULL, 0}, /* 285 */ + {"timerfd_settime", NULL, 0, 4}, /* 286 */ + {"timerfd_gettime", NULL, 0, 2}, /* 287 */ + {"accept4", NULL, 0, 4}, /* 288 */ + {"signalfd4", NULL, NOSYS_NULL, 0}, /* 289 
*/ + {"eventfd2", NULL, 0, 2}, /* 290 */ + {"epoll_create1", NULL, 0, 1}, /* 291 */ + {"dup3", NULL, 0, 3}, /* 292 */ + {"pipe2", lx_pipe2, 0, 2}, /* 293 */ + {"inotify_init1", NULL, 0, 1}, /* 294 */ + {"preadv", NULL, NOSYS_NULL, 0}, /* 295 */ + {"pwritev", NULL, NOSYS_NULL, 0}, /* 296 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */ + {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */ + {"prlimit64", NULL, 0, 4}, /* 302 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */ + {"syncfs", NULL, NOSYS_NULL, 0}, /* 306 */ + {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 308 */ + {"getcpu", NULL, 0, 3}, /* 309 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */ + {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getrandom", NULL, NOSYS_NULL, 0}, /* 318 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */ + {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */ + + /* XXX TBD gap then x32 syscalls from 512 - 544 */ +}; +#endif diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h index e7f5ee9867..543373b5fa 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_brand.h +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -66,10 +66,7 @@ extern "C" { /* * This must be large enough for both the 32-bit table and 64-bit table. 
*/ -#define LX_NSYSCALLS 352 - -/* The number of In-Kernel Emulation functions */ -#define LX_N_IKE_FUNCS 29 +#define LX_NSYSCALLS 358 /* * brand(2) subcommands @@ -78,8 +75,8 @@ extern "C" { * > 192 is reserved for in-kernel emulated system calls. */ #define B_LPID_TO_SPAIR 128 -#define B_SYSENTRY 129 -#define B_SYSRETURN 130 +#define B_GET_CURRENT_CONTEXT 129 +#define B_EMULATION_DONE 130 #define B_PTRACE_KERNEL 131 #define B_SET_AFFINITY_MASK 132 #define B_GET_AFFINITY_MASK 133 @@ -87,13 +84,16 @@ extern "C" { #define B_PTRACE_STOP_FOR_OPT 135 #define B_UNSUPPORTED 136 #define B_STORE_ARGS 137 -#define B_CLR_NTV_SYSC_FLAG 138 -#define B_SIGNAL_RETURN 139 -#define B_UNWIND_NTV_SYSC_FLAG 140 +#define B_GETPID 138 +#define B_JUMP_TO_LINUX 139 +#define B_SET_THUNK_PID 140 #define B_EXIT_AS_SIG 141 #define B_HELPER_WAITID 142 - -#define B_IKE_SYSCALL 192 +#define B_HELPER_CLONE 143 +#define B_HELPER_SETGROUPS 144 +#define B_HELPER_SIGQUEUE 145 +#define B_HELPER_TGSIGQUEUE 146 +#define B_SET_NATIVE_STACK 147 #ifndef _ASM /* @@ -157,6 +157,45 @@ typedef enum lx_ptrace_options { /* Aux vector containing vDSO addr */ #define AT_SYSINFO_EHDR 33 +/* + * This table initialiser maps errno values from illumos to Linux numbers. + * It is presently used in both the usermode and kernel emulation code, + * so it is defined here. 
+ */ +/* BEGIN CSTYLED */ +#define LX_STOL_ERRNO_INIT { \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, \ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, \ + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, \ + 30, 31, 32, 33, 34, 42, 43, 44, 45, 46, \ + 47, 48, 49, 50, 51, 35, 47, 22, 38, 22, /* 49 */ \ + 52, 53, 54, 55, 56, 57, 58, 59, 22, 22, \ + 61, 61, 62, 63, 64, 65, 66, 67, 68, 69, \ + 70, 71, 22, 22, 72, 22, 22, 74, 36, 75, \ + 76, 77, 78, 79, 80, 81, 82, 83, 84, 38, \ + 40, 85, 86, 39, 87, 88, 89, 90, 91, 92, /* 99 */ \ + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, \ + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, \ + 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, \ + 103, 104, 105, 106, 107, 22, 22, 22, 22, 22, \ + 22, 22, 22, 108, 109, 110, 111, 112, 113, 114, /* 149 */ \ + 115, 116 } +/* END CSTYLED */ + +/* + * Usermode emulation routines are run on an alternate stack allocated by + * the brand library. Every LWP in a process will incur this overhead beyond + * the regular thread stack: + */ +#define LX_NATIVE_STACK_PAGE_COUNT 64 + +/* + * When returning in a new child process created with vfork(2) (or CLONE_VFORK) + * we discard some of the native stack to prevent corruption of the parent + * emulation state. 
+ */ +#define LX_NATIVE_STACK_VFORK_GAP 0x3000 + #ifndef _ASM extern struct brand lx_brand; @@ -164,18 +203,15 @@ extern struct brand lx_brand; typedef struct lx_brand_registration { uint_t lxbr_version; /* version number */ void *lxbr_handler; /* base address of handler */ - void *lxbr_tracehandler; /* base address of trace handler */ - void *lxbr_traceflag; /* address of trace flag */ } lx_brand_registration_t; typedef struct lx_brand_registration32 { uint_t lxbr_version; /* version number */ uint32_t lxbr_handler; /* base address of handler */ - uint32_t lxbr_tracehandler; /* base address of trace handler */ - uint32_t lxbr_traceflag; /* address of trace flag */ } lx_brand_registration32_t; #ifdef __amd64 + typedef struct lx_regs { long lxr_fs; long lxr_rdi; @@ -198,7 +234,24 @@ typedef struct lx_regs { long lxr_orig_rax; } lx_regs_t; + +typedef struct lx_regs32 { + uint32_t lxr_gs; + uint32_t lxr_edi; + uint32_t lxr_esi; + uint32_t lxr_ebp; + uint32_t lxr_esp; + uint32_t lxr_ebx; + uint32_t lxr_edx; + uint32_t lxr_ecx; + uint32_t lxr_eax; + uint32_t lxr_eip; + + uint32_t lxr_orig_eax; +} lx_regs32_t; + #else /* ! __amd64 */ + typedef struct lx_regs { long lxr_gs; long lxr_edi; @@ -213,6 +266,91 @@ typedef struct lx_regs { long lxr_orig_eax; } lx_regs_t; + +#endif /* __amd64 */ + +#ifdef __amd64 +/* + * The 64-bit native "user_regs_struct" Linux structure. 
+ */ +typedef struct lx_user_regs { + long lxur_r15; + long lxur_r14; + long lxur_r13; + long lxur_r12; + long lxur_rbp; + long lxur_rbx; + long lxur_r11; + long lxur_r10; + long lxur_r9; + long lxur_r8; + long lxur_rax; + long lxur_rcx; + long lxur_rdx; + long lxur_rsi; + long lxur_rdi; + long lxur_orig_rax; + long lxur_rip; + long lxur_xcs; + long lxur_rflags; + long lxur_rsp; + long lxur_xss; + long lxur_xfs_base; + long lxur_xgs_base; + long lxur_xds; + long lxur_xes; + long lxur_xfs; + long lxur_xgs; +} lx_user_regs_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "user_regs_struct" Linux structure. + */ +typedef struct lx_user_regs32 { + int32_t lxur_ebx; + int32_t lxur_ecx; + int32_t lxur_edx; + int32_t lxur_esi; + int32_t lxur_edi; + int32_t lxur_ebp; + int32_t lxur_eax; + int32_t lxur_xds; + int32_t lxur_xes; + int32_t lxur_xfs; + int32_t lxur_xgs; + int32_t lxur_orig_eax; + int32_t lxur_eip; + int32_t lxur_xcs; + int32_t lxur_eflags; + int32_t lxur_esp; + int32_t lxur_xss; +} lx_user_regs32_t; +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#else /* !__amd64 */ +/* + * The 32-bit native "user_regs_struct" Linux structure. 
+ */ +typedef struct lx_user_regs { + long lxur_ebx; + long lxur_ecx; + long lxur_edx; + long lxur_esi; + long lxur_edi; + long lxur_ebp; + long lxur_eax; + long lxur_xds; + long lxur_xes; + long lxur_xfs; + long lxur_xgs; + long lxur_orig_eax; + long lxur_eip; + long lxur_xcs; + long lxur_eflags; + long lxur_esp; + long lxur_xss; +} lx_user_regs_t; #endif /* __amd64 */ #endif /* _ASM */ @@ -240,12 +378,12 @@ typedef struct lx_elf_data64 { } lx_elf_data64_t; typedef struct lx_elf_data32 { - int ed_phdr; - int ed_phent; - int ed_phnum; - int ed_entry; - int ed_base; - int ed_ldentry; + uint32_t ed_phdr; + uint32_t ed_phent; + uint32_t ed_phnum; + uint32_t ed_entry; + uint32_t ed_base; + uint32_t ed_ldentry; } lx_elf_data32_t; #if defined(_LP64) @@ -258,8 +396,6 @@ typedef lx_elf_data32_t lx_elf_data_t; typedef struct lx_proc_data { uintptr_t l_handler; /* address of user-space handler */ - uintptr_t l_tracehandler; /* address of user-space traced handler */ - uintptr_t l_traceflag; /* address of 32-bit tracing flag */ pid_t l_ppid; /* pid of originating parent proc */ uint64_t l_ptrace; /* process being observed with ptrace */ lx_elf_data_t l_elf_data; /* ELF data for linux executable */ @@ -281,6 +417,16 @@ typedef ulong_t lx_affmask_t[LX_AFF_ULONGS]; /* Max. 
length of kernel version string */ #define LX_VERS_MAX 16 +/* + * Flag values for uc_brand_data[0] in the ucontext_t: + */ +#define LX_UC_STACK_NATIVE 0x00001 +#define LX_UC_STACK_BRAND 0x00002 +#define LX_UC_RESTORE_NATIVE_SP 0x00010 +#define LX_UC_FRAME_IS_SYSCALL 0x00100 +#define LX_UC_RESTART_SYSCALL 0x01000 +#define LX_UC_IGNORE_LINK 0x10000 + #ifdef _KERNEL typedef struct lx_lwp_data lx_lwp_data_t; @@ -303,7 +449,8 @@ typedef enum lx_ptrace_state { LX_PTRACE_STOPPED = 0x10, LX_PTRACE_PARENT_WAIT = 0x20, LX_PTRACE_CLDPEND = 0x40, - LX_PTRACE_CLONING = 0x80 + LX_PTRACE_CLONING = 0x80, + LX_PTRACE_WAITPEND = 0x100 } lx_ptrace_state_t; /* @@ -343,11 +490,17 @@ typedef enum lx_ptrace_attach { LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */ } lx_ptrace_attach_t; +typedef enum lx_stack_mode { + LX_STACK_MODE_PREINIT = 0, + LX_STACK_MODE_INIT, + LX_STACK_MODE_NATIVE, + LX_STACK_MODE_BRAND +} lx_stack_mode_t; + /* * lx-specific data in the klwp_t */ struct lx_lwp_data { - uint_t br_ntv_syscall; /* 1 = syscall from native libc */ uint_t br_lwp_flags; /* misc. flags */ klwp_t *br_lwp; /* back pointer to container lwp */ int br_signal; /* signal to send to parent when */ @@ -359,12 +512,6 @@ struct lx_lwp_data { /* descriptors used by libc for TLS */ ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */ ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */ - /* - * 64-bit thread-specific syscall mode state "stack". Bits tracking the - * syscall mode are shifted on/off this int like a stack as we take - * signals and return. 
- */ - uint_t br_scms; pid_t br_pid; /* converted pid for this thread */ pid_t br_tgid; /* thread group ID for this thread */ pid_t br_ppid; /* parent pid for this thread */ @@ -396,9 +543,30 @@ struct lx_lwp_data { ushort_t br_ptrace_whatstop; /* stop sub-reason */ int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */ + uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */ uint_t br_ptrace_event; ulong_t br_ptrace_eventmsg; + + int br_syscall_num; /* current system call number */ + boolean_t br_syscall_restart; /* should restart on EINTR */ + + /* + * Store the LX_STACK_MODE for this LWP, and the current extent of the + * native (emulation) stack. This is similar, in principle, to the + * sigaltstack mechanism for signal handling. We also use this mode + * flag to determine how to process system calls from this LWP. + */ + lx_stack_mode_t br_stack_mode; + uintptr_t br_ntv_stack; + uintptr_t br_ntv_stack_current; + + /* + * If this pid is set, we return it with getpid(). This allows the + * thunking server to interpose on the pid returned to the Linux + * syslog software. + */ + pid_t br_lx_thunk_pid; }; /* @@ -410,7 +578,6 @@ struct lx_lwp_data { /* brand specific data */ typedef struct lx_zone_data { char lxzd_kernel_version[LX_VERS_MAX]; - int lxzd_max_syscall; } lx_zone_data_t; #define BR_CPU_BOUND 0x0001 @@ -428,16 +595,61 @@ typedef struct lx_zone_data { #define LX_ARGS(scall) ((struct lx_##scall##_args *)\ (ttolxlwp(curthread)->br_scall_args)) -void lx_brand_int80_callback(void); -void lx_brand_syscall_callback(void); -int64_t lx_emulate_syscall(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t); +/* + * Determine the upper bound on the system call number: + */ +#if defined(_LP64) +#define LX_MAX_SYSCALL(lwp) \ + ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? 
\ + lx_nsysent64 : lx_nsysent32) +#else +#define LX_MAX_SYSCALL(lwp) lx_nsysent32 +#endif extern char *lx_get_zone_kern_version(zone_t *); +extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t); +extern void lx_divert(klwp_t *, uintptr_t); +extern int lx_runexe(klwp_t *, void *); +extern void lx_switch_to_native(klwp_t *); +extern int lx_regs_to_userregs(lx_lwp_data_t *, void *); +extern int lx_uc_to_userregs(lx_lwp_data_t *, void *, void *); +extern int lx_userregs_to_regs(lx_lwp_data_t *lwpd, void *); +extern int lx_userregs_to_uc(lx_lwp_data_t *lwpd, void *, void *); + +extern int lx_syscall_enter(void); +extern int lx_syscall_return(klwp_t *, int, long); + +extern void lx_trace_sysenter(int, uintptr_t *); +extern void lx_trace_sysreturn(int, long); + +extern void lx_emulate_user(klwp_t *, int, uintptr_t *); +#if defined(_SYSCALL32_IMPL) +extern void lx_emulate_user32(klwp_t *, int, uintptr_t *); +#endif + extern int lx_debug; #define lx_print if (lx_debug) printf +extern int lx_stol_errno[]; + +/* + * In-Kernel Linux System Call Description. 
+ */ +typedef struct lx_sysent { + char *sy_name; + long (*sy_callc)(); + char sy_flags; + char sy_narg; +} lx_sysent_t; + +#if defined(_LP64) +extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +extern int lx_nsysent64; +#endif +extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +extern int lx_nsysent32; + #endif /* _KERNEL */ #endif /* _ASM */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h index 7b77789c56..cc8c6d44f6 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_misc.h +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -55,11 +55,16 @@ extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *, int *); extern void lx_ptrace_exit(proc_t *, klwp_t *); extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *); -extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t); +extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t); extern int lx_ptrace_set_clone_inherit(int, boolean_t); extern int lx_sigcld_repost(proc_t *, sigqueue_t *); extern int lx_issig_stop(proc_t *, klwp_t *); +extern int lx_helper_clone(int64_t *, int, void *, void *, void *); +extern int lx_helper_setgroups(int, gid_t *); +extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *); +extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *); + #endif #ifdef __cplusplus diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h new file mode 100644 index 0000000000..9f606b614f --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGINFO_H +#define _LX_SIGINFO_H + +#include <sys/lx_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_siginfo_t lsi_code values + * + * LX_SI_ASYNCNL: Sent by asynch name lookup completion + * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads + * LX_SI_SIGIO: Sent by queued SIGIO + * LX_SI_ASYNCIO: Sent by asynchronous I/O completion + * LX_SI_MESGQ: Sent by real time message queue state change + * LX_SI_TIMER: Sent by timer expiration + * LX_SI_QUEUE: Sent by sigqueue + * LX_SI_USER: Sent by kill, sigsend, raise, etc. + * LX_SI_KERNEL: Sent by kernel + * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to + * illumos errors, if there is no translation available, this value + * should be used. This value should have no meaning as an si_code in + * illumos or Linux. + * + * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by + * BrandZ. 
+ */ +#define LX_SI_CODE_NOT_EXIST (-61) +#define LX_SI_ASYNCNL (-60) +#define LX_SI_DETHREAD (-7) +#define LX_SI_TKILL (-6) +#define LX_SI_SIGIO (-5) +#define LX_SI_ASYNCIO (-4) +#define LX_SI_MESGQ (-3) +#define LX_SI_TIMER (-2) +#define LX_SI_QUEUE (-1) +#define LX_SI_USER (0) +#define LX_SI_KERNEL (0x80) + +#define LX_SI_MAX_SIZE 128 +#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3) +#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4) + +#if defined(_LP64) +/* + * Because of the odd number (3) of ints before the union, we need to account + * for the smaller padding needed on x64 due to the union being offset to an 8 + * byte boundary. + */ +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64 +#else +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32 +#endif + +typedef struct lx_siginfo { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE]; + + struct { + pid_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid_t _pid; + lx_uid16_t _uid; + union sigval _sigval; + } _rt; + + struct { + pid_t _pid; + lx_uid16_t _uid; + int _status; + clock_t _utime; + clock_t _stime; + } _sigchld; + + struct { + void *_addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "lx_siginfo_t" object. 
+ */ +#pragma pack(4) +typedef struct lx_siginfo32 { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE_32]; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + union sigval32 _sigval; + } _rt; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + int _status; + clock32_t _utime; + clock32_t _stime; + } _sigchld; + + struct { + caddr32_t _addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo32_t; +#pragma pack() +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#define lsi_pid _sifields._kill._pid +#define lsi_uid _sifields._kill._uid +#define lsi_status _sifields._sigchld._status +#define lsi_utime _sifields._sigchld._utime +#define lsi_stime _sifields._sigchld._stime +#define lsi_value _sifields._rt._sigval +#define lsi_int _sifields._rt._sigval.sivalx_int +#define lsi_ptr _sifields._rt._sigval.sivalx_ptr +#define lsi_addr _sifields._sigfault._addr +#define lsi_band _sifields._sigpoll._band +#define lsi_fd _sifields._sigpoll._fd + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGINFO_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h index 766aa91ef5..2d9abf2fe6 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h +++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h @@ -22,7 +22,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_LINUX_SYSCALLS_H @@ -38,7 +38,6 @@ extern long lx_arch_prctl(); extern long lx_brk(); extern long lx_getpid(); extern long lx_getppid(); -extern long lx_clone(); extern long lx_kill(); extern long lx_tkill(); extern long lx_tgkill(); @@ -55,16 +54,20 @@ extern long lx_sched_getscheduler(); extern long lx_sched_rr_get_interval(); extern long lx_sched_setparam(); extern long lx_sched_setscheduler(); +extern long lx_sched_yield(); extern long lx_set_thread_area(); extern long lx_set_tid_address(); extern long lx_setresgid(); extern long lx_setresgid16(); extern long lx_setresuid(); extern long lx_setresuid16(); -extern long lx_sysinfo(); -extern long lx_setgroups(); -extern long lx_rt_sigqueueinfo(); -extern long lx_rt_tgsigqueueinfo(); +extern long lx_sysinfo32(); +extern long lx_sysinfo64(); +extern long lx_wait4(); +extern long lx_waitid(); +extern long lx_waitpid(); +extern long lx_write(); +extern long lx_xattr(); #endif /* _KERNEL */ diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h index d98c8bc586..922c412020 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_types.h +++ b/usr/src/uts/common/brand/lx/sys/lx_types.h @@ -31,6 +31,8 @@ extern "C" { #endif +#ifndef _KERNEL + #define SHRT_MIN (-32768) /* min value of a "short int" */ #define SHRT_MAX 32767 /* max value of a "short int" */ #define USHRT_MAX 65535 /* max of "unsigned short int" */ @@ -46,6 +48,8 @@ extern "C" { #define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */ #endif +#endif /* !_KERNEL */ + #define LX_SYS_UTS_LN 65 struct lx_utsname { diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c index d73c5f100b..50cdeaeab9 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_clone.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -42,8 +42,8 @@ * linux cloned thread. 
*/ /* ARGSUSED */ -long -lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp) +int +lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp) { struct lx_lwp_data *lwpd = ttolxlwp(curthread); struct lx_proc_data *lproc = ttolxproc(curthread); @@ -85,19 +85,10 @@ lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp) lx_set_gdt(entry, &lwpd->br_tls[tls_index]); } else { /* - * For 64-bit, we need to set %fsbase -- which - * requires us to save the native %fsbase and - * set our LX %fsbase. Don't use rdmsr since - * the value might get changed before we get to - * this code. We use the value from the pcb - * which the native libc should have already - * setup via syslwp_private. + * Set the Linux %fsbase for this LWP. We will + * restore it the next time we return to Linux + * via setcontext()/lx_restorecontext(). */ -#if defined(__amd64) - pcb_t *pcb; - pcb = (pcb_t *)&curthread->t_lwp->lwp_pcb; - lwpd->br_ntv_fsbase = pcb->pcb_fsbase; -#endif lwpd->br_lx_fsbase = (uintptr_t)tls; } } @@ -129,7 +120,9 @@ lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp) return (set_errno(EFAULT)); } } - return (lwpd->br_pid); + + *rval = lwpd->br_pid; + return (0); } long diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c index 7f8abcd8d9..3dd3971e62 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_futex.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/types.h> @@ -40,6 +40,8 @@ #include <sys/condvar.h> #include <sys/inttypes.h> #include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> #include <sys/lx_futex.h> #include <sys/lx_impl.h> @@ -277,10 +279,16 @@ futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout) while ((fw.fw_woken == 0) && (err == 0)) { ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index], timeout, timechanged); - if (ret < 0) + if (ret < 0) { err = set_errno(ETIMEDOUT); - else if (ret == 0) + } else if (ret == 0) { + /* + * According to signal(7), a futex(2) call with the + * FUTEX_WAIT operation is restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; err = set_errno(EINTR); + } } /* diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c index aa8b2b40e1..88b1792d3c 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -22,8 +22,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2015 Joyent, Inc. + */ #include <sys/zone.h> #include <sys/types.h> @@ -38,7 +39,7 @@ * return the pid */ long -lx_getpid() +lx_getpid(void) { lx_lwp_data_t *lwpd = ttolxlwp(curthread); long rv; @@ -46,8 +47,13 @@ lx_getpid() if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { rv = 1; } else { - ASSERT(lwpd != NULL); - rv = lwpd->br_tgid; + VERIFY(lwpd != NULL); + + if (lwpd->br_lx_thunk_pid != 0) { + rv = lwpd->br_lx_thunk_pid; + } else { + rv = lwpd->br_tgid; + } } return (rv); diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c index 5ca18b7556..baa41f52fa 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_id.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -22,10 +22,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ - - -#pragma ident "%Z%%M% %I% %E% SMI" - +/* + * Copyright 2015 Joyent, Inc. + */ #include <sys/types.h> #include <sys/systm.h> @@ -50,7 +49,7 @@ extern int setgroups(int, gid_t *); /* * This function is based on setreuid in common/syscall/uid.c and exists - * because Solaris does not have a way to explicitly set the saved uid (suid) + * because illumos does not have a way to explicitly set the saved uid (suid) * from any other system call. */ long @@ -179,9 +178,9 @@ lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16) long rval; rval = lx_setresuid( - LINUX_UID16_TO_UID32(ruid16), - LINUX_UID16_TO_UID32(euid16), - LINUX_UID16_TO_UID32(suid16)); + LINUX_UID16_TO_UID32(ruid16), + LINUX_UID16_TO_UID32(euid16), + LINUX_UID16_TO_UID32(suid16)); return (rval); } @@ -274,19 +273,19 @@ lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16) long rval; rval = lx_setresgid( - LINUX_GID16_TO_GID32(rgid16), - LINUX_GID16_TO_GID32(egid16), - LINUX_GID16_TO_GID32(sgid16)); + LINUX_GID16_TO_GID32(rgid16), + LINUX_GID16_TO_GID32(egid16), + LINUX_GID16_TO_GID32(sgid16)); return (rval); } /* - * Linux defines NGROUPS_MAX to be 32, but on Solaris it is only 16. We employ + * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ * the terrible hack below so that tests may proceed, if only on DEBUG kernels. 
*/ long -lx_setgroups(int ngroups, gid_t *grouplist) +lx_helper_setgroups(int ngroups, gid_t *grouplist) { #ifdef DEBUG if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c index 2637f8f33d..8c6ac61ca7 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c @@ -1145,7 +1145,7 @@ lx_ioctl_fini() vsd_destroy(&lx_ioctl_vsd); } -int +long lx_ioctl(int fdes, int cmd, intptr_t arg) { file_t *fp; diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c index e20e906d33..a5da7fe2df 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_kill.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -21,7 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ @@ -275,8 +275,8 @@ lx_kill(pid_t lx_pid, int lx_sig) * queuable are sent through the sigqueue syscall via the user level function * lx_rt_sigqueueinfo(). */ -long -lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +int +lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) { proc_t *target_proc; pid_t s_pid; @@ -310,7 +310,7 @@ lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) } /* * We shouldn't have queuable signals here, those are sent elsewhere by - * the useland handler for this emulated call. + * the usermode handler for this emulated call. */ if (!SI_CANQUEUE(kinfo.si_code)) { return (set_errno(EINVAL)); @@ -341,8 +341,8 @@ lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) * Unlike the above function, this handles all system calls to rt_tgsigqueue * regardless of si_code. 
*/ -long -lx_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) +int +lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) { id_t s_tid; pid_t s_pid; diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c index cef549141e..fe354a8d54 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c @@ -24,7 +24,7 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/zone.h> @@ -45,6 +45,9 @@ #include <sys/brand.h> #include <sys/lx_brand.h> +#define LX_O_NONBLOCK 04000 +#define LX_O_CLOEXEC 02000000 + /* * Based on native pipe(2) system call, except that the pipe is half-duplex. */ @@ -174,7 +177,26 @@ lx_pipe(intptr_t arg) * pipe2(2) system call. */ long -lx_pipe2(intptr_t arg, int flags) +lx_pipe2(intptr_t arg, int lxflags) { + int flags = 0; + + /* + * Validate allowed flags. + */ + if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) { + return (set_errno(EINVAL)); + } + + /* + * Convert from Linux flags to illumos flags. + */ + if (lxflags & LX_O_NONBLOCK) { + flags |= FNONBLOCK; + } + if (lxflags & LX_O_CLOEXEC) { + flags |= FCLOEXEC; + } + return (lx_hd_pipe(arg, flags)); } diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c index 57cc3e54d0..b21a81da48 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_rw.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c @@ -10,18 +10,21 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/errno.h> #include <sys/systm.h> #include <sys/file.h> #include <sys/vnode.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> /* uts/common/syscall/rw.c */ extern ssize_t read(int fdes, void *cbuf, size_t count); +extern ssize_t write(int fdes, void *cbuf, size_t count); -ssize_t +long lx_read(int fd, void *buf, size_t nbyte) { file_t *fp; @@ -35,5 +38,23 @@ lx_read(int fd, void *buf, size_t nbyte) if (t == VDIR) return (set_errno(EISDIR)); + /* + * If read(2) returns EINTR, we want to signal that restarting the + * system call is acceptable: + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + return (read(fd, buf, nbyte)); } + +long +lx_write(int fd, void *buf, size_t nbyte) +{ + /* + * If write(2) returns EINTR, we want to signal that restarting the + * system call is acceptable: + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (write(fd, buf, nbyte)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c index bb91a752d2..4ebb7ff387 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_sched.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c @@ -22,8 +22,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2015 Joyent, Inc. 
+ */ #include <sys/types.h> #include <sys/systm.h> @@ -38,8 +39,17 @@ #include <sys/lx_sched.h> #include <sys/lx_brand.h> +extern int yield(); extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); +long +lx_sched_yield(void) +{ + yield(); + + return (0); +} + int lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp, int64_t *rval) @@ -169,13 +179,14 @@ lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) if (lwp->lwp_errno) return (lwp->lwp_errno); - if (strcmp(pcinfo.pc_clname, "TS") == 0) + if (strcmp(pcinfo.pc_clname, "TS") == 0) { policy = LX_SCHED_OTHER; - else if (strcmp(pcinfo.pc_clname, "RT") == 0) + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == - RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; - else + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + } else { return (set_errno(EINVAL)); + } } bzero(&pcinfo, sizeof (pcinfo)); @@ -195,7 +206,7 @@ lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) pcparm.pc_cid = pcinfo.pc_cid; ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = - policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; break; case LX_SCHED_OTHER: @@ -263,7 +274,7 @@ lx_sched_getscheduler(l_pid_t pid) policy = LX_SCHED_OTHER; else if (strcmp(pcinfo.pc_clname, "RT") == 0) policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == - RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; else policy = set_errno(EINVAL); @@ -316,7 +327,7 @@ lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) policy = LX_SCHED_OTHER; else if (strcmp(pcinfo.pc_clname, "RT") == 0) policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == - RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; else return (set_errno(EINVAL)); @@ -337,7 +348,7 @@ lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) pcparm.pc_cid = pcinfo.pc_cid; ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = - policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; break; case LX_SCHED_OTHER: @@ -416,11 +427,12 @@ lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) local_param.lx_sched_prio = 0; else local_param.lx_sched_prio = -(prio * 20) / scale; - } else if (strcmp(pcinfo.pc_clname, "RT") == 0) + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { local_param.lx_sched_prio = - ((rtparms_t *)pcparm.pc_clparms)->rt_pri; - else + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else { rv = set_errno(EINVAL); + } if (rv == 0) if (copyout(&local_param, param, sizeof (local_param))) diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c index 6151656cf0..449d5882d4 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -21,7 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <vm/anon.h> @@ -30,7 +30,7 @@ #include <sys/zone.h> #include <sys/time.h> -struct lx_sysinfo { +typedef struct lx_sysinfo { int64_t si_uptime; /* Seconds since boot */ uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ uint64_t si_totalram; /* Total memory size */ @@ -44,28 +44,51 @@ struct lx_sysinfo { uint64_t si_totalhigh; /* High memory size */ uint64_t si_freehigh; /* Avail high memory */ uint32_t si_mem_unit; /* Unit size of memory fields */ -}; +} lx_sysinfo_t; + +#if defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit usermode struct. 
+ */ +#pragma pack(4) +typedef struct lx_sysinfo32 { + int32_t si_uptime; /* Seconds since boot */ + uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint32_t si_totalram; /* Total memory size */ + uint32_t si_freeram; /* Available memory */ + uint32_t si_sharedram; /* Shared memory */ + uint32_t si_bufferram; /* Buffer memory */ + uint32_t si_totalswap; /* Total swap space */ + uint32_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint32_t si_totalhigh; /* High memory size */ + uint32_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ + char __si_pad[8]; +} lx_sysinfo32_t; +#pragma pack() +#endif extern pgcnt_t swapfs_minfree; -long -lx_sysinfo(struct lx_sysinfo *sip) +static void +lx_sysinfo_common(lx_sysinfo_t *si) { - struct lx_sysinfo si; zone_t *zone = curthread->t_procp->p_zone; uint64_t zphysmem, zfreemem, ztotswap, zfreeswap; - si.si_uptime = gethrestime_sec() - zone->zone_boot_time; + si->si_uptime = gethrestime_sec() - zone->zone_boot_time; - si.si_loads[0] = zone->zone_hp_avenrun[0]; - si.si_loads[1] = zone->zone_hp_avenrun[1]; - si.si_loads[2] = zone->zone_hp_avenrun[2]; + si->si_loads[0] = zone->zone_hp_avenrun[0]; + si->si_loads[1] = zone->zone_hp_avenrun[1]; + si->si_loads[2] = zone->zone_hp_avenrun[2]; /* * In linux each thread looks like a process, so we conflate the * two in this stat as well. */ - si.si_procs = (int32_t)zone->zone_nlwps; + si->si_procs = (int32_t)zone->zone_nlwps; /* * If memory or swap limits are set on the zone, use those, otherwise @@ -111,30 +134,85 @@ lx_sysinfo(struct lx_sysinfo *sip) * option. 
*/ if (MAX(zphysmem, ztotswap) < 1024 * 1024) { - si.si_totalram = ptob(zphysmem); - si.si_freeram = ptob(zfreemem); - si.si_totalswap = ptob(ztotswap); - si.si_freeswap = ptob(zfreeswap); - si.si_mem_unit = 1; + si->si_totalram = ptob(zphysmem); + si->si_freeram = ptob(zfreemem); + si->si_totalswap = ptob(ztotswap); + si->si_freeswap = ptob(zfreeswap); + si->si_mem_unit = 1; } else { - si.si_totalram = zphysmem; - si.si_freeram = zfreemem; - si.si_totalswap = ztotswap; - si.si_freeswap = zfreeswap; - si.si_mem_unit = PAGESIZE; + si->si_totalram = zphysmem; + si->si_freeram = zfreemem; + si->si_totalswap = ztotswap; + si->si_freeswap = zfreeswap; + si->si_mem_unit = PAGESIZE; } - si.si_bufferram = 0; - si.si_sharedram = 0; + si->si_bufferram = 0; + si->si_sharedram = 0; /* * These two stats refer to high physical memory. If an * application running in a Linux zone cares about this, then * either it or we are broken. */ - si.si_totalhigh = 0; - si.si_freehigh = 0; + si->si_totalhigh = 0; + si->si_freehigh = 0; +} + +long +lx_sysinfo64(caddr_t sip) +{ + lx_sysinfo_t si; + + bzero(&si, sizeof (si)); + lx_sysinfo_common(&si); + + if (copyout(&si, sip, sizeof (si)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +long +lx_sysinfo32(caddr_t sip) +{ + lx_sysinfo_t si; + lx_sysinfo32_t si32; + int i; + + lx_sysinfo_common(&si); + + /* + * Convert the lx_sysinfo_t into the legacy 32-bit view: + */ + bzero(&si32, sizeof (si32)); + si32.si_uptime = si.si_uptime; + + for (i = 0; i < 3; i++) { + if ((si.si_loads[i]) > 0x7fffffff) + si32.si_loads[i] = 0x7fffffff; + else + si32.si_loads[i] = si.si_loads[i]; + } + + si32.si_procs = si.si_procs; + si32.si_totalram = si.si_totalram; + si32.si_freeram = si.si_freeram; + si32.si_totalswap = si.si_totalswap; + si32.si_freeswap = si.si_freeswap; + si32.si_mem_unit = si.si_mem_unit; - if (copyout(&si, sip, sizeof (si)) != 0) + si32.si_bufferram = si.si_bufferram; + si32.si_sharedram = 
si.si_sharedram; + + si32.si_totalhigh = si.si_totalhigh; + si32.si_freehigh = si.si_freehigh; + + if (copyout(&si32, sip, sizeof (si32)) != 0) { return (set_errno(EFAULT)); + } + return (0); } +#endif diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c index b1528a37c5..c7c611412b 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -42,28 +42,34 @@ long lx_arch_prctl(int code, ulong_t addr) { #if defined(__amd64) - struct lx_lwp_data *llwp = ttolxlwp(curthread); - pcb_t *pcb; - + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *llwp = lwptolxlwp(lwp); + pcb_t *pcb = &lwp->lwp_pcb; /* We currently only support [g|s]et_fs */ switch (code) { case LX_ARCH_GET_FS: if (copyout(&llwp->br_lx_fsbase, (void *)addr, - sizeof (llwp->br_lx_fsbase))) + sizeof (llwp->br_lx_fsbase)) != 0) { return (set_errno(EFAULT)); + } break; + case LX_ARCH_SET_FS: llwp->br_lx_fsbase = addr; - /* - * Save current native libc fsbase. Don't use rdmsr since the - * value might get changed before we get to this code. We - * use the value from the pcb which the native libc should - * have already setup via syslwp_private. - */ - pcb = (pcb_t *)&curthread->t_lwp->lwp_pcb; - llwp->br_ntv_fsbase = pcb->pcb_fsbase; + + kpreempt_disable(); + if (pcb->pcb_fsbase != llwp->br_lx_fsbase) { + pcb->pcb_fsbase = llwp->br_lx_fsbase; + + /* + * Ensure we go out via update_sregs. + */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); break; + default: return (set_errno(EINVAL)); } diff --git a/usr/src/lib/brand/lx/lx_brand/common/wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c index c3421858eb..7b10d2f90b 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/wait.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c @@ -66,21 +66,21 @@ * covers at least fork() and pthread_create(). 
*/ -#include <errno.h> #include <sys/wait.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> #include <sys/lx_types.h> -#include <sys/lx_signal.h> -#include <sys/lx_debug.h> -#include <sys/lx_misc.h> -#include <sys/lx_syscall.h> -#include <sys/syscall.h> -#include <sys/times.h> -#include <strings.h> -#include <unistd.h> -#include <assert.h> +#include <sys/lx_siginfo.h> +#include <lx_signum.h> #include <lx_syscall.h> /* + * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c": + */ +extern int waitid(idtype_t, id_t, k_siginfo_t *, int); +extern int rusagesys(int, void *, void *, void *, void *); + +/* * Convert between Linux options and Solaris options, returning -1 if any * invalid flags are found. */ @@ -99,8 +99,6 @@ #define LX_P_PID 0x1 #define LX_P_GID 0x2 -extern long max_pid; - /* * Split the passed waitpid/waitid options into two separate variables: * those for the native illumos waitid(2), and the extra Linux-specific @@ -149,17 +147,14 @@ lx_wstat(int code, int status) stat = status << 8; break; case CLD_DUMPED: - stat = stol_signo[status]; - assert(stat != -1); - stat |= WCOREFLG; + stat = lx_stol_signo(status, SIGKILL) | WCOREFLG; break; case CLD_KILLED: - stat = stol_signo[status]; - assert(stat != -1); + stat = lx_stol_signo(status, SIGKILL); break; case CLD_TRAPPED: case CLD_STOPPED: - stat = (stol_status(status) << 8) | WSTOPFLG; + stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG; break; case CLD_CONTINUED: stat = WCONTFLG; @@ -170,50 +165,62 @@ lx_wstat(int code, int status) } static int -lx_waitid_helper(idtype_t idtype, id_t id, siginfo_t *sip, int native_options, +lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options, int extra_options) { + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int error; + /* - * Call into our in-kernel waitid() wrapper: + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. 
Ensure we keep to that subset here: */ -restart: - lx_had_sigchild = 0; - if (syscall(SYS_brand, B_HELPER_WAITID, idtype, id, sip, - native_options, extra_options) != 0) { - if (errno == EINTR && (lx_had_sigchild || - lx_do_syscall_restart)) { - /* - * If we handled a SIGCLD while blocked in waitid(), - * or the SA_RESTART flag was set, we should wait - * again. - */ - lx_debug("lx_waitid_helper() restarting due to" - " interrupted system call"); - goto restart; - } - return (-1); + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); } - return (0); + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. + */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = extra_options; + + if ((error = waitid(idtype, id, sip, native_options)) == EINTR) { + /* + * According to signal(7), the wait4(2), waitid(2), and + * waitpid(2) system calls are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (error); } long lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) { - siginfo_t info = { 0 }; - struct rusage ru = { 0 }; + k_siginfo_t info = { 0 }; idtype_t idtype; id_t id; int status = 0; pid_t pid = (pid_t)p1; - int rval; + int error; int native_options, extra_options; + int *statusp = (int *)p2; + void *rup = (void *)p4; - if (ltos_options(p3, &native_options, &extra_options) == -1) - return (-EINVAL); + if (ltos_options(p3, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } - if (pid > max_pid) - return (-ECHILD); + if (pid > maxpid) { + return (set_errno(ECHILD)); + } /* * While not listed as a valid return code, Linux's wait4(2) does, @@ -226,15 +233,34 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) * * This will fail if the buffers in question are write-only. 
*/ - if ((void *)p2 != NULL && - ((uucopy((void *)p2, &status, sizeof (status)) != 0) || - (uucopy(&status, (void *)p2, sizeof (status)) != 0))) - return (-EFAULT); - - if ((void *)p4 != NULL) { - if ((uucopy((void *)p4, &ru, sizeof (ru)) != 0) || - (uucopy(&ru, (void *)p4, sizeof (ru)) != 0)) - return (-EFAULT); + if (statusp != NULL) { + if (copyin(statusp, &status, sizeof (status)) != 0 || + copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + /* + * Do the same check for the "struct rusage" pointer, which differs + * in size for 32- and 64-bit processes. + */ + if (rup != NULL) { + struct rusage ru; + void *krup = &ru; + size_t rusz = sizeof (ru); +#if defined(_SYSCALL32_IMPL) + struct rusage32 ru32; + + if (get_udatamodel() != DATAMODEL_NATIVE) { + krup = &ru32; + rusz = sizeof (ru32); + } +#endif + + if (copyin(rup, krup, rusz) != 0 || + copyout(krup, rup, rusz) != 0) { + return (set_errno(EFAULT)); + } } if (pid < -1) { @@ -245,24 +271,27 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) id = 0; } else if (pid == 0) { idtype = P_PGID; - id = getpgrp(); + mutex_enter(&pidlock); + id = curproc->p_pgrp; + mutex_exit(&pidlock); } else { idtype = P_PID; id = pid; } - native_options |= WEXITED | WTRAPPED; + native_options |= (WEXITED | WTRAPPED); - if (lx_waitid_helper(idtype, id, &info, native_options, - extra_options) == -1) { - return (-errno); + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); } /* * If the WNOHANG flag was specified and no child was found return 0. */ - if ((native_options & WNOHANG) && info.si_pid == 0) + if ((native_options & WNOHANG) && info.si_pid == 0) { return (0); + } status = lx_wstat(info.si_code, info.si_status); @@ -273,11 +302,18 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) * should succeed on a Linux system. 
This, however, is rather * unlikely since we tested the validity of both above. */ - if (p2 != NULL && uucopy(&status, (void *)p2, sizeof (status)) != 0) - return (-EFAULT); + if (statusp != NULL) { + if (copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } - if (p4 != NULL && (rval = lx_getrusage(LX_RUSAGE_CHILDREN, p4)) != 0) - return (rval); + if (rup != NULL) { + if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL, + NULL, NULL)) != 0) { + return (set_errno(error)); + } + } return (info.si_pid); } @@ -288,17 +324,116 @@ lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3) return (lx_wait4(p1, p2, p3, NULL)); } +static int +stol_ksiginfo(k_siginfo_t *sip, uintptr_t lxsip) +{ + lx_siginfo_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_stol_errno[sip->si_errno]; + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +static int +stol_ksiginfo32(k_siginfo_t *sip, uintptr_t lxsip) +{ + lx_siginfo32_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_stol_errno[sip->si_errno]; + + switch (lsi.lsi_signo) { + 
case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif + long lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) { + int error; int native_options, extra_options; - siginfo_t s_info = {0}; + k_siginfo_t info = { 0 }; - if (ltos_options(opt, &native_options, &extra_options) == -1) - return (-EINVAL); + if (ltos_options(opt, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } - if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) - return (-EINVAL); + if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) { + return (set_errno(EINVAL)); + } switch (idtype) { case LX_P_ALL: @@ -311,17 +446,27 @@ lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) idtype = P_PGID; break; default: - return (-EINVAL); + return (set_errno(EINVAL)); } - if (lx_waitid_helper(idtype, id, &s_info, native_options, - extra_options) == -1) { - return (-errno); + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); } - /* If the WNOHANG flag was specified and no child was found return 0. */ - if ((native_options & WNOHANG) && s_info.si_pid == 0) + /* + * If the WNOHANG flag was specified and no child was found return 0. 
+ */ + if ((native_options & WNOHANG) && info.si_pid == 0) { return (0); + } - return (stol_siginfo(&s_info, (lx_siginfo_t *)infop)); +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (stol_ksiginfo32(&info, infop)); + } else +#endif + { + return (stol_ksiginfo(&info, infop)); + } } diff --git a/usr/src/lib/brand/lx/lx_brand/common/xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c index 39d6d0361b..ea23c3e4b8 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/xattr.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c @@ -10,9 +10,13 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> + /* * *xattr() family of functions. * @@ -20,28 +24,8 @@ * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1). */ -#include <errno.h> -#include <sys/types.h> -#include <sys/lx_types.h> -#include <sys/lx_syscall.h> - -long -lx_xattr2(uintptr_t p1, uintptr_t p2) -{ - - return (-EOPNOTSUPP); -} - long -lx_xattr3(uintptr_t p1, uintptr_t p2, uintptr_t p3) +lx_xattr(void) { - - return (-EOPNOTSUPP); -} - -long -lx_xattr4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) -{ - - return (-EOPNOTSUPP); + return (set_errno(EOPNOTSUPP)); } diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index 8b1338d578..0390434cfb 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -62,29 +62,41 @@ int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, /* sn1 brand */ struct brand_ops sn1_brops = { - sn1_init_brand_data, - sn1_free_brand_data, - sn1_brandsys, - sn1_setbrand, - sn1_getattr, - sn1_setattr, - sn1_copy_procdata, - sn1_proc_exit, - sn1_exec, - lwp_setrval, - sn1_initlwp, - sn1_forklwp, - sn1_freelwp, - sn1_lwpexit, - sn1_elfexec, - NULL, - NULL, - NULL, - NSIG, - NULL, - NULL, - NULL, - NULL + 
sn1_init_brand_data, /* b_init_brand_data */ + sn1_free_brand_data, /* b_free_brand_data */ + sn1_brandsys, /* b_brandsys */ + sn1_setbrand, /* b_setbrand */ + sn1_getattr, /* b_getattr */ + sn1_setattr, /* b_setattr */ + sn1_copy_procdata, /* b_copy_procdata */ + sn1_proc_exit, /* b_proc_exit */ + sn1_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + sn1_initlwp, /* b_initlwp */ + sn1_forklwp, /* b_forklwp */ + sn1_freelwp, /* b_freelwp */ + sn1_lwpexit, /* b_lwpexit */ + sn1_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL /* b_sendsig */ }; #ifdef sparc diff --git a/usr/src/uts/common/brand/sngl/sngl_brand.c b/usr/src/uts/common/brand/sngl/sngl_brand.c index b04635c0f6..97d172d80e 100644 --- a/usr/src/uts/common/brand/sngl/sngl_brand.c +++ b/usr/src/uts/common/brand/sngl/sngl_brand.c @@ -63,29 +63,41 @@ int sngl_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, /* SNGL brand */ struct brand_ops sngl_brops = { - sngl_init_brand_data, - sngl_free_brand_data, - sngl_brandsys, - sngl_setbrand, - sngl_getattr, - sngl_setattr, - sngl_copy_procdata, - sngl_proc_exit, - sngl_exec, - lwp_setrval, - sngl_initlwp, - sngl_forklwp, - sngl_freelwp, - sngl_lwpexit, - sngl_elfexec, - NULL, - NULL, - NULL, - NSIG, - NULL, - NULL, - NULL, - NULL + sngl_init_brand_data, /* b_init_brand_data */ + sngl_free_brand_data, /* b_free_brand_data */ + sngl_brandsys, /* b_brandsys */ + sngl_setbrand, /* b_setbrand */ + sngl_getattr, /* b_getattr */ + 
sngl_setattr, /* b_setattr */ + sngl_copy_procdata, /* b_copy_procdata */ + sngl_proc_exit, /* b_proc_exit */ + sngl_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + sngl_initlwp, /* b_initlwp */ + sngl_forklwp, /* b_forklwp */ + sngl_freelwp, /* b_freelwp */ + sngl_lwpexit, /* b_lwpexit */ + sngl_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL /* b_sendsig */ }; #ifdef __amd64 diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index bedbaa53d3..b6a0eeadff 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -67,29 +67,41 @@ void s10_sigset_s10_to_native(sigset_t *); /* s10 brand */ struct brand_ops s10_brops = { - s10_init_brand_data, - s10_free_brand_data, - s10_brandsys, - s10_setbrand, - s10_getattr, - s10_setattr, - s10_copy_procdata, - s10_proc_exit, - s10_exec, - lwp_setrval, - s10_initlwp, - s10_forklwp, - s10_freelwp, - s10_lwpexit, - s10_elfexec, - s10_sigset_native_to_s10, - s10_sigset_s10_to_native, - NULL, - S10_NSIG, - NULL, - NULL, - NULL, - NULL + s10_init_brand_data, /* b_init_brand_data */ + s10_free_brand_data, /* b_free_brand_data */ + s10_brandsys, /* b_brandsys */ + s10_setbrand, /* b_setbrand */ + s10_getattr, /* b_getattr */ + s10_setattr, /* b_setattr */ + s10_copy_procdata, /* b_copy_procdata */ + s10_proc_exit, /* b_proc_exit */ + s10_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + 
s10_initlwp, /* b_initlwp */ + s10_forklwp, /* b_forklwp */ + s10_freelwp, /* b_freelwp */ + s10_lwpexit, /* b_lwpexit */ + s10_elfexec, /* b_elfexec */ + s10_sigset_native_to_s10, /* b_sigset_native_to_brand */ + s10_sigset_s10_to_native, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + S10_NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL /* b_sendsig */ }; #ifdef sparc diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index e702fb211c..00258d1ced 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -362,6 +362,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. 
*/ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index b3abada863..394c38d3cf 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -137,6 +137,13 @@ struct brand_ops { boolean_t *, int *); int (*b_sigcld_repost)(proc_t *, sigqueue_t *); int (*b_issig_stop)(proc_t *, klwp_t *); + void (*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void (*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); }; /* diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..2aceb3a0f6 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_KLWP_H @@ -192,6 +192,8 @@ typedef struct _klwp { struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ void *lwp_brand; /* per-lwp brand data */ + int (*lwp_brand_syscall)(void); /* brand syscall interposer */ + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 9922891e56..955d934311 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,6 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. +\ Copyright 2015 Joyent, Inc. 
\ \ CDDL HEADER START \ @@ -147,6 +148,7 @@ _klwp lwp_thread lwp_procp lwp_brand + lwp_brand_syscall lwp_eosys lwp_regs lwp_arg diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index 823404b485..7b9b844768 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -503,6 +503,7 @@ noprod_sys_syscall: movq T_LWP(%r15), %r14 ASSERT_NO_RUPDATE_PENDING(%r14) + ENABLE_INTR_FLAGS MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) @@ -516,6 +517,26 @@ noprod_sys_syscall: incq %gs:CPU_STATS_SYS_SYSCALL + /* + * If our LWP has an alternate system call handler, run that instead of + * the regular system call path. + */ + movq LWP_BRAND_SYSCALL(%r14), %rdi + testq %rdi, %rdi + jz _syscall_no_brand + + pushq %rax + call *%rdi + + /* + * If the alternate handler returns 0, we skip straight to the return to + * usermode. Otherwise, we resume regular system call processing. + */ + testl %eax, %eax + popq %rax + jz _syscall_after_brand + +_syscall_no_brand: movw %ax, T_SYSNUM(%r15) movzbl T_PRE_SYS(%r15), %ebx ORL_SYSCALLTRACE(%ebx) @@ -550,6 +571,8 @@ _syscall_invoke: shrq $32, %r13 /* upper 32-bits into %edx */ movl %r12d, %r12d /* lower 32-bits into %eax */ 5: + +_syscall_after_brand: /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -795,6 +818,25 @@ _syscall32_save: incq %gs:CPU_STATS_SYS_SYSCALL /* + * If our lwp has an alternate system call handler, run that instead + * of the regular system call path. 
+ */ + movq LWP_BRAND_SYSCALL(%r14), %rax + testq %rax, %rax + jz _syscall32_no_brand + + movb $LWP_SYS, LWP_STATE(%r14) + call *%rax + + /* + * If the alternate handler returns 0, we skip straight to the return + * to usermode. Otherwise, we resume regular system call processing. + */ + testl %eax, %eax + jz _syscall32_after_brand + +_syscall32_no_brand: + /* * Make some space for MAXSYSARGS (currently 8) 32-bit args placed * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or * more succinctly: @@ -861,6 +903,8 @@ _syscall32_save: shrq $32, %r13 /* upper 32-bits into %edx */ movl %eax, %r12d /* lower 32-bits into %eax */ +_syscall32_after_brand: + /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -1191,7 +1235,31 @@ sys_int80() ENTRY_NP(brand_sys_int80) SWAPGS /* kernel gsbase */ XPV_TRAP_POP + + /* + * We first attempt to call the "b_int80" handler from the "struct + * brand_mach_ops" for this brand. If no handler function is installed + * for this brand, the BRAND_CALLBACK() macro returns here and we + * check the lwp for a "lwp_brand_syscall" handler. + */ BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK()) + + /* + * Check to see if this lwp provides "lwp_brand_syscall". If so, we + * will route this int80 through the regular system call handling path. + */ + movq %r15, %gs:CPU_RTMP_R15 + movq %gs:CPU_THREAD, %r15 + movq T_LWP(%r15), %r15 + movq LWP_BRAND_SYSCALL(%r15), %r15 + testq %r15, %r15 + movq %gs:CPU_RTMP_R15, %r15 + jnz nopop_syscall_int + + /* + * The brand provided neither a "b_int80", nor a "lwp_brand_syscall" + * function, and has thus opted out of handling this trap. 
+ */ SWAPGS /* user gsbase */ jmp nopop_int80 diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index 0f058f262d..7eefb1c062 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -276,8 +276,8 @@ SN1_BRAND_OBJS = sn1_brand.o sn1_brand_asm.o SNGL_BRAND_OBJS = sngl_brand.o sngl_brand_asm.o S10_BRAND_OBJS = s10_brand.o s10_brand_asm.o LX_BRAND_OBJS = \ + lx_archdep.o \ lx_brand.o \ - lx_brand_asm.o \ lx_brk.o \ lx_clone.o \ lx_futex.o \ @@ -295,7 +295,9 @@ LX_BRAND_OBJS = \ lx_signum.o \ lx_syscall.o \ lx_sysinfo.o \ - lx_thread_area.o + lx_thread_area.o \ + lx_wait.o \ + lx_xattr.o # # special files diff --git a/usr/src/uts/intel/brand/lx/lx_archdep.c b/usr/src/uts/intel/brand/lx/lx_archdep.c new file mode 100644 index 0000000000..49f2f12172 --- /dev/null +++ b/usr/src/uts/intel/brand/lx/lx_archdep.c @@ -0,0 +1,1171 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * LX brand Intel-specific routines. 
+ */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/privregs.h> +#include <sys/pcb.h> +#include <sys/archsystm.h> +#include <sys/stack.h> +#include <sys/sdt.h> +#include <sys/sysmacros.h> + +#define LX_REG(ucp, r) ((ucp)->uc_mcontext.gregs[(r)]) + +extern int getsetcontext(int, void *); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +#endif + +#if defined(__amd64) +static int +lx_rw_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz, boolean_t writing) +{ + int error = 0; + size_t rem = ucsz; + off_t pos = 0; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Grab P_PR_LOCK so that we can drop p_lock while doing I/O. + */ + sprlock_proc(p); + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + while (rem != 0) { + uintptr_t addr = (uintptr_t)ucp + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if (writing) { + error = uwrite(p, kucp + pos, len, addr); + } else { + error = uread(p, kucp + pos, len, addr); + } + + if (error != 0) { + break; + } + + rem -= len; + pos += len; + } + mutex_enter(&p->p_lock); + + sprunlock(p); + mutex_enter(&p->p_lock); + + return (error); +} + +/* + * Read a ucontext_t from the target process, which may or may not be + * the current process. + */ +static int +lx_read_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz) +{ + return (lx_rw_uc(p, ucp, kucp, ucsz, B_FALSE)); +} + +/* + * Write a ucontext_t to the target process, which may or may not be + * the current process. + */ +static int +lx_write_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz) +{ + return (lx_rw_uc(p, ucp, kucp, ucsz, B_TRUE)); +} +#endif /* __amd64 */ + +/* + * Load register state from a usermode "lx_user_regs_t" in the tracer + * and store it in the tracee ucontext_t. 
+ */ +int +lx_userregs_to_uc(lx_lwp_data_t *lwpd, void *ucp, void *uregsp) +{ +#if defined(__amd64) + klwp_t *lwp = lwpd->br_lwp; + proc_t *p = lwptoproc(lwp); + + switch (get_udatamodel()) { + case DATAMODEL_LP64: { + lx_user_regs_t lxur; + + if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) { + return (EFAULT); + } + + switch (lwp_getdatamodel(lwp)) { + case DATAMODEL_LP64: { + ucontext_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + /* + * Note: we currently ignore "lxur_orig_rax" here, as + * this path should not be used for system call stops. + */ + LX_REG(&uc, REG_R15) = lxur.lxur_r15; + LX_REG(&uc, REG_R14) = lxur.lxur_r14; + LX_REG(&uc, REG_R13) = lxur.lxur_r13; + LX_REG(&uc, REG_R12) = lxur.lxur_r12; + LX_REG(&uc, REG_RBP) = lxur.lxur_rbp; + LX_REG(&uc, REG_RBX) = lxur.lxur_rbx; + LX_REG(&uc, REG_R11) = lxur.lxur_r11; + LX_REG(&uc, REG_R10) = lxur.lxur_r10; + LX_REG(&uc, REG_R9) = lxur.lxur_r9; + LX_REG(&uc, REG_R8) = lxur.lxur_r8; + LX_REG(&uc, REG_RAX) = lxur.lxur_rax; + LX_REG(&uc, REG_RCX) = lxur.lxur_rcx; + LX_REG(&uc, REG_RDX) = lxur.lxur_rdx; + LX_REG(&uc, REG_RSI) = lxur.lxur_rsi; + LX_REG(&uc, REG_RDI) = lxur.lxur_rdi; + LX_REG(&uc, REG_RIP) = lxur.lxur_rip; + LX_REG(&uc, REG_CS) = lxur.lxur_xcs; + LX_REG(&uc, REG_RFL) = lxur.lxur_rflags; + LX_REG(&uc, REG_RSP) = lxur.lxur_rsp; + LX_REG(&uc, REG_SS) = lxur.lxur_xss; + LX_REG(&uc, REG_FSBASE) = lxur.lxur_xfs_base; + LX_REG(&uc, REG_GSBASE) = lxur.lxur_xgs_base; + + LX_REG(&uc, REG_DS) = lxur.lxur_xds; + LX_REG(&uc, REG_ES) = lxur.lxur_xes; + LX_REG(&uc, REG_FS) = lxur.lxur_xfs; + LX_REG(&uc, REG_GS) = lxur.lxur_xgs; + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + return (0); + } + + case DATAMODEL_ILP32: { + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + /* + * Note: we currently ignore "lxur_orig_eax" here, as + * this path should not be used for system call stops. 
+ */ + LX_REG(&uc, EBP) = (int32_t)lxur.lxur_rbp; + LX_REG(&uc, EBX) = (int32_t)lxur.lxur_rbx; + LX_REG(&uc, EAX) = (int32_t)lxur.lxur_rax; + LX_REG(&uc, ECX) = (int32_t)lxur.lxur_rcx; + LX_REG(&uc, EDX) = (int32_t)lxur.lxur_rdx; + LX_REG(&uc, ESI) = (int32_t)lxur.lxur_rsi; + LX_REG(&uc, EDI) = (int32_t)lxur.lxur_rdi; + LX_REG(&uc, EIP) = (int32_t)lxur.lxur_rip; + LX_REG(&uc, CS) = (int32_t)lxur.lxur_xcs; + LX_REG(&uc, EFL) = (int32_t)lxur.lxur_rflags; + LX_REG(&uc, UESP) = (int32_t)lxur.lxur_rsp; + LX_REG(&uc, SS) = (int32_t)lxur.lxur_xss; + + LX_REG(&uc, DS) = (int32_t)lxur.lxur_xds; + LX_REG(&uc, ES) = (int32_t)lxur.lxur_xes; + LX_REG(&uc, FS) = (int32_t)lxur.lxur_xfs; + LX_REG(&uc, GS) = (int32_t)lxur.lxur_xgs; + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + return (0); + } + + default: + return (EIO); + } + + return (EIO); + } + + case DATAMODEL_ILP32: { + lx_user_regs32_t lxur; + ucontext32_t uc; + + if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) { + /* + * The target is not a 32-bit LWP. We refuse to + * present truncated 64-bit registers to a 32-bit + * tracer. + */ + return (EIO); + } + + if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) { + return (EFAULT); + } + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + /* + * Note: we currently ignore "lxur_orig_eax" here, as + * this path should not be used for system call stops. 
+ */ + LX_REG(&uc, EBX) = lxur.lxur_ebx; + LX_REG(&uc, ECX) = lxur.lxur_ecx; + LX_REG(&uc, EDX) = lxur.lxur_edx; + LX_REG(&uc, ESI) = lxur.lxur_esi; + LX_REG(&uc, EDI) = lxur.lxur_edi; + LX_REG(&uc, EBP) = lxur.lxur_ebp; + LX_REG(&uc, EAX) = lxur.lxur_eax; + LX_REG(&uc, EIP) = lxur.lxur_eip; + LX_REG(&uc, CS) = lxur.lxur_xcs; + LX_REG(&uc, EFL) = lxur.lxur_eflags; + LX_REG(&uc, UESP) = lxur.lxur_esp; + LX_REG(&uc, SS) = lxur.lxur_xss; + + LX_REG(&uc, DS) = lxur.lxur_xds; + LX_REG(&uc, ES) = lxur.lxur_xes; + LX_REG(&uc, FS) = lxur.lxur_xfs; + LX_REG(&uc, GS) = lxur.lxur_xgs; + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + /* + * The updated ucontext32_t was written back to the tracee; + * report success, as the LP64 tracer paths above do. + */ + return (0); + } + + default: + return (EIO); + } +#else + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); + return (EIO); +#endif /* __amd64 */ +} + +/* + * Copy register state from a ucontext_t in the tracee to a usermode + * "lx_user_regs_t" in the tracer. + */ +int +lx_uc_to_userregs(lx_lwp_data_t *lwpd, void *ucp, void *uregsp) +{ +#if defined(__amd64) + klwp_t *lwp = lwpd->br_lwp; + proc_t *p = lwptoproc(lwp); + + switch (get_udatamodel()) { + case DATAMODEL_LP64: { + lx_user_regs_t lxur; + + switch (lwp_getdatamodel(lwp)) { + case DATAMODEL_LP64: { + ucontext_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + lxur.lxur_r15 = LX_REG(&uc, REG_R15); + lxur.lxur_r14 = LX_REG(&uc, REG_R14); + lxur.lxur_r13 = LX_REG(&uc, REG_R13); + lxur.lxur_r12 = LX_REG(&uc, REG_R12); + lxur.lxur_rbp = LX_REG(&uc, REG_RBP); + lxur.lxur_rbx = LX_REG(&uc, REG_RBX); + lxur.lxur_r11 = LX_REG(&uc, REG_R11); + lxur.lxur_r10 = LX_REG(&uc, REG_R10); + lxur.lxur_r9 = LX_REG(&uc, REG_R9); + lxur.lxur_r8 = LX_REG(&uc, REG_R8); + lxur.lxur_rax = LX_REG(&uc, REG_RAX); + lxur.lxur_rcx = LX_REG(&uc, REG_RCX); + lxur.lxur_rdx = LX_REG(&uc, REG_RDX); + lxur.lxur_rsi = LX_REG(&uc, REG_RSI); + lxur.lxur_rdi = LX_REG(&uc, REG_RDI); + lxur.lxur_orig_rax = 0; + lxur.lxur_rip = 
LX_REG(&uc, REG_RIP); + lxur.lxur_xcs = LX_REG(&uc, REG_CS); + lxur.lxur_rflags = LX_REG(&uc, REG_RFL); + lxur.lxur_rsp = LX_REG(&uc, REG_RSP); + lxur.lxur_xss = LX_REG(&uc, REG_SS); + lxur.lxur_xfs_base = LX_REG(&uc, REG_FSBASE); + lxur.lxur_xgs_base = LX_REG(&uc, REG_GSBASE); + + lxur.lxur_xds = LX_REG(&uc, REG_DS); + lxur.lxur_xes = LX_REG(&uc, REG_ES); + lxur.lxur_xfs = LX_REG(&uc, REG_FS); + lxur.lxur_xgs = LX_REG(&uc, REG_GS); + + if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) { + return (EFAULT); + } + + return (0); + } + + case DATAMODEL_ILP32: { + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + lxur.lxur_r15 = 0; + lxur.lxur_r14 = 0; + lxur.lxur_r13 = 0; + lxur.lxur_r12 = 0; + lxur.lxur_rbp = LX_REG(&uc, EBP); + lxur.lxur_rbx = LX_REG(&uc, EBX); + lxur.lxur_r11 = 0; + lxur.lxur_r10 = 0; + lxur.lxur_r9 = 0; + lxur.lxur_r8 = 0; + lxur.lxur_rax = LX_REG(&uc, EAX); + lxur.lxur_rcx = LX_REG(&uc, ECX); + lxur.lxur_rdx = LX_REG(&uc, EDX); + lxur.lxur_rsi = LX_REG(&uc, ESI); + lxur.lxur_rdi = LX_REG(&uc, EDI); + lxur.lxur_orig_rax = 0; + lxur.lxur_rip = LX_REG(&uc, EIP); + lxur.lxur_xcs = LX_REG(&uc, CS); + lxur.lxur_rflags = LX_REG(&uc, EFL); + lxur.lxur_rsp = LX_REG(&uc, UESP); + lxur.lxur_xss = LX_REG(&uc, SS); + lxur.lxur_xfs_base = 0; + lxur.lxur_xgs_base = 0; + + lxur.lxur_xds = LX_REG(&uc, DS); + lxur.lxur_xes = LX_REG(&uc, ES); + lxur.lxur_xfs = LX_REG(&uc, FS); + lxur.lxur_xgs = LX_REG(&uc, GS); + + if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) { + return (EFAULT); + } + + return (0); + } + + default: + return (EIO); + } + } + + case DATAMODEL_ILP32: { + lx_user_regs32_t lxur; + ucontext32_t uc; + + if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) { + /* + * The target is not a 32-bit LWP. We refuse to + * present truncated 64-bit registers to a 32-bit + * tracer. 
+ */ + return (EIO); + } + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (EIO); + } + + lxur.lxur_ebx = LX_REG(&uc, EBX); + lxur.lxur_ecx = LX_REG(&uc, ECX); + lxur.lxur_edx = LX_REG(&uc, EDX); + lxur.lxur_esi = LX_REG(&uc, ESI); + lxur.lxur_edi = LX_REG(&uc, EDI); + lxur.lxur_ebp = LX_REG(&uc, EBP); + lxur.lxur_eax = LX_REG(&uc, EAX); + lxur.lxur_orig_eax = 0; + lxur.lxur_eip = LX_REG(&uc, EIP); + lxur.lxur_xcs = LX_REG(&uc, CS); + lxur.lxur_eflags = LX_REG(&uc, EFL); + lxur.lxur_esp = LX_REG(&uc, UESP); + lxur.lxur_xss = LX_REG(&uc, SS); + + lxur.lxur_xds = LX_REG(&uc, DS); + lxur.lxur_xes = LX_REG(&uc, ES); + lxur.lxur_xfs = LX_REG(&uc, FS); + lxur.lxur_xgs = LX_REG(&uc, GS); + + if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) { + return (EFAULT); + } + + return (0); + } + + default: + return (EIO); + } +#else + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); + return (EIO); +#endif +} + +/* + * Load a usermode "lx_user_regs_t" into the register state of the target LWP. 
+ */ +int +lx_userregs_to_regs(lx_lwp_data_t *lwpd, void *uregsp) +{ + klwp_t *lwp = lwpd->br_lwp; + proc_t *p = lwptoproc(lwp); + + VERIFY(MUTEX_HELD(&p->p_lock)); + +#if defined(__amd64) + struct regs *rp = lwptoregs(lwp); + struct pcb *pcb = &lwp->lwp_pcb; + + switch (get_udatamodel()) { + case DATAMODEL_LP64: { + lx_user_regs_t lxur; + + if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) { + return (EFAULT); + } + + rp->r_r15 = lxur.lxur_r15; + rp->r_r14 = lxur.lxur_r14; + rp->r_r13 = lxur.lxur_r13; + rp->r_r12 = lxur.lxur_r12; + rp->r_rbp = lxur.lxur_rbp; + rp->r_rbx = lxur.lxur_rbx; + rp->r_r11 = lxur.lxur_r11; + rp->r_r10 = lxur.lxur_r10; + rp->r_r9 = lxur.lxur_r9; + rp->r_r8 = lxur.lxur_r8; + rp->r_rax = lxur.lxur_rax; + rp->r_rcx = lxur.lxur_rcx; + rp->r_rdx = lxur.lxur_rdx; + rp->r_rsi = lxur.lxur_rsi; + rp->r_rdi = lxur.lxur_rdi; + lwpd->br_syscall_num = (int)lxur.lxur_orig_rax; + rp->r_rip = lxur.lxur_rip; + rp->r_cs = lxur.lxur_xcs; + rp->r_rfl = lxur.lxur_rflags; + rp->r_rsp = lxur.lxur_rsp; + rp->r_ss = lxur.lxur_xss; + pcb->pcb_fsbase = lxur.lxur_xfs_base; + pcb->pcb_gsbase = lxur.lxur_xgs_base; + + kpreempt_disable(); + pcb->pcb_rupdate = 1; + pcb->pcb_ds = lxur.lxur_xds; + pcb->pcb_es = lxur.lxur_xes; + pcb->pcb_fs = lxur.lxur_xfs; + pcb->pcb_gs = lxur.lxur_xgs; + kpreempt_enable(); + + return (0); + } + + case DATAMODEL_ILP32: { + lx_user_regs32_t lxur; + + if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) { + /* + * The target is not a 32-bit LWP. We refuse to + * load the registers of a 64-bit target from the + * 32-bit register set supplied by a 32-bit tracer. 
+ */ + return (EIO); + } + + if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) { + return (EFAULT); + } + + rp->r_rbx = lxur.lxur_ebx; + rp->r_rcx = lxur.lxur_ecx; + rp->r_rdx = lxur.lxur_edx; + rp->r_rsi = lxur.lxur_esi; + rp->r_rdi = lxur.lxur_edi; + rp->r_rbp = lxur.lxur_ebp; + rp->r_rax = lxur.lxur_eax; + lwpd->br_syscall_num = (int)lxur.lxur_orig_eax; + rp->r_rip = lxur.lxur_eip; + rp->r_cs = lxur.lxur_xcs; + rp->r_rfl = lxur.lxur_eflags; + rp->r_rsp = lxur.lxur_esp; + rp->r_ss = lxur.lxur_xss; + + kpreempt_disable(); + pcb->pcb_rupdate = 1; + pcb->pcb_ds = lxur.lxur_xds; + pcb->pcb_es = lxur.lxur_xes; + pcb->pcb_fs = lxur.lxur_xfs; + pcb->pcb_gs = lxur.lxur_xgs; + kpreempt_enable(); + + return (0); + } + + default: + return (EIO); + } +#else + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); + return (EIO); +#endif /* __amd64 */ +} + +/* + * Copy the current LWP register state of the target LWP to a usermode + * "lx_user_regs_t". + * + * The structure is written to "uregsp" with copyout(), using the layout that + * matches the data model of the caller; a 32-bit caller may only inspect a + * 32-bit target. The segment registers are taken from the pcb when + * pcb_rupdate is set, and from the saved registers otherwise. The caller + * must hold p_lock for the target process. + * + * Returns 0 on success, EFAULT if the copyout fails, and EIO for an + * unsupported data model. + */ +int +lx_regs_to_userregs(lx_lwp_data_t *lwpd, void *uregsp) +{ +#if defined(__amd64) + klwp_t *lwp = lwpd->br_lwp; + struct regs *rp = lwptoregs(lwp); + proc_t *p = lwptoproc(lwp); + + VERIFY(MUTEX_HELD(&p->p_lock)); + + struct pcb *pcb = &lwp->lwp_pcb; + long r0, orig_r0; + + /* + * We must precisely emulate the "syscall-entry-stop" and + * "syscall-exit-stop" register appearance from the Linux kernel. + * At a syscall-entry-stop the return register is reported as the + * negated lx_stol_errno[] translation of ENOTSUP; at a + * syscall-exit-stop it is the current %rax. In both cases the + * "orig" register carries the system call number; for any other + * stop it is reported as zero. 
+ */ + switch (lwpd->br_ptrace_whatstop) { + case LX_PR_SYSENTRY: + orig_r0 = lwpd->br_syscall_num; + r0 = -lx_stol_errno[ENOTSUP]; + break; + case LX_PR_SYSEXIT: + orig_r0 = lwpd->br_syscall_num; + r0 = rp->r_rax; + break; + default: + orig_r0 = 0; + r0 = rp->r_rax; + } + + switch (get_udatamodel()) { + case DATAMODEL_LP64: { + lx_user_regs_t lxur; + + lxur.lxur_r15 = rp->r_r15; + lxur.lxur_r14 = rp->r_r14; + lxur.lxur_r13 = rp->r_r13; + lxur.lxur_r12 = rp->r_r12; + lxur.lxur_rbp = rp->r_rbp; + lxur.lxur_rbx = rp->r_rbx; + lxur.lxur_r11 = rp->r_r11; + lxur.lxur_r10 = rp->r_r10; + lxur.lxur_r9 = rp->r_r9; + lxur.lxur_r8 = rp->r_r8; + lxur.lxur_rax = r0; + lxur.lxur_rcx = rp->r_rcx; + lxur.lxur_rdx = rp->r_rdx; + lxur.lxur_rsi = rp->r_rsi; + lxur.lxur_rdi = rp->r_rdi; + lxur.lxur_orig_rax = orig_r0; + lxur.lxur_rip = rp->r_rip; + lxur.lxur_xcs = rp->r_cs; + lxur.lxur_rflags = rp->r_rfl; + lxur.lxur_rsp = rp->r_rsp; + lxur.lxur_xss = rp->r_ss; + lxur.lxur_xfs_base = pcb->pcb_fsbase; + lxur.lxur_xgs_base = pcb->pcb_gsbase; + + kpreempt_disable(); + if (pcb->pcb_rupdate == 1) { + lxur.lxur_xds = pcb->pcb_ds; + lxur.lxur_xes = pcb->pcb_es; + lxur.lxur_xfs = pcb->pcb_fs; + lxur.lxur_xgs = pcb->pcb_gs; + } else { + lxur.lxur_xds = rp->r_ds; + lxur.lxur_xes = rp->r_es; + lxur.lxur_xfs = rp->r_fs; + lxur.lxur_xgs = rp->r_gs; + } + kpreempt_enable(); + + if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) { + return (EFAULT); + } + + return (0); + } + + case DATAMODEL_ILP32: { + lx_user_regs32_t lxur; + + if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) { + /* + * The target is not a 32-bit LWP. We refuse to + * present truncated 64-bit registers to a 32-bit + * tracer. 
+ */ + return (EIO); + } + + lxur.lxur_ebx = (int32_t)rp->r_rbx; + lxur.lxur_ecx = (int32_t)rp->r_rcx; + lxur.lxur_edx = (int32_t)rp->r_rdx; + lxur.lxur_esi = (int32_t)rp->r_rsi; + lxur.lxur_edi = (int32_t)rp->r_rdi; + lxur.lxur_ebp = (int32_t)rp->r_rbp; + lxur.lxur_eax = (int32_t)r0; + lxur.lxur_orig_eax = (int32_t)orig_r0; + lxur.lxur_eip = (int32_t)rp->r_rip; + lxur.lxur_xcs = (int32_t)rp->r_cs; + lxur.lxur_eflags = (int32_t)rp->r_rfl; + lxur.lxur_esp = (int32_t)rp->r_rsp; + lxur.lxur_xss = (int32_t)rp->r_ss; + + kpreempt_disable(); + if (pcb->pcb_rupdate == 1) { + lxur.lxur_xds = pcb->pcb_ds; + lxur.lxur_xes = pcb->pcb_es; + lxur.lxur_xfs = pcb->pcb_fs; + lxur.lxur_xgs = pcb->pcb_gs; + } else { + lxur.lxur_xds = rp->r_ds; + lxur.lxur_xes = rp->r_es; + lxur.lxur_xfs = rp->r_fs; + lxur.lxur_xgs = rp->r_gs; + } + kpreempt_enable(); + + if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) { + return (EFAULT); + } + + return (0); + } + + default: + return (EIO); + } +#else + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); + return (EIO); +#endif /* __amd64 */ +} + +/* + * Load registers and repoint the stack and program counter. This function is + * used by the B_JUMP_TO_LINUX brand system call to revector to a Linux + * entrypoint. + * + * "ucp" is handed to getsetcontext() as the argument of a SETCONTEXT + * request (getsetcontext32() is used for a non-native data model LWP on an + * amd64 kernel), and the result of that call is returned to the caller. + * For a native data model LWP on amd64 the current pcb_fsbase is recorded + * in br_ntv_fsbase before the context is loaded. + */ +int +lx_runexe(klwp_t *lwp, void *ucp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We should only make it here when transitioning to Linux from + * the NATIVE or INIT mode. + */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_NATIVE || + lwpd->br_stack_mode == LX_STACK_MODE_INIT); + +#if defined(__amd64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + struct pcb *pcb = &lwp->lwp_pcb; + + /* + * Preserve the %fsbase value for this LWP, as set and used by + * native illumos code. 
+ */ + lwpd->br_ntv_fsbase = pcb->pcb_fsbase; + + return (getsetcontext(SETCONTEXT, ucp)); + } else { + return (getsetcontext32(SETCONTEXT, ucp)); + } +#else + return (getsetcontext(SETCONTEXT, ucp)); +#endif +} + +/* + * The usermode emulation code is illumos library code. This routine ensures + * the segment registers are set up correctly for native illumos code. It + * should be called _after_ we have stored the outgoing Linux machine state + * but _before_ we return from the kernel to any illumos native code; e.g. the + * usermode emulation library, or any interposed signal handlers. + * + * See the comment on lwp_segregs_save() for how we handle the usermode + * registers when we come into the kernel and see update_sregs() for how we + * restore. + * + * On amd64 a 32-bit LWP has LWPGS_SEL arranged as its %gs value, and a + * 64-bit LWP has its %fsbase set back to br_ntv_fsbase if a native value + * was recorded (i.e. it is non-zero). An unrecognised data model panics. + */ +void +lx_switch_to_native(klwp_t *lwp) +{ +#if defined(__amd64) + model_t datamodel = lwp_getdatamodel(lwp); + + switch (datamodel) { + case DATAMODEL_ILP32: { + struct pcb *pcb = &lwp->lwp_pcb; + + /* + * For 32-bit processes, we ensure that the correct %gs value + * is loaded: + */ + kpreempt_disable(); + if (pcb->pcb_rupdate == 1) { + /* + * If we are already flushing the segment registers, + * then ensure we are flushing the native %gs. + */ + pcb->pcb_gs = LWPGS_SEL; + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * If we are not flushing the segment registers yet, + * only do so if %gs is not correct already: + */ + if (rp->r_gs != LWPGS_SEL) { + pcb->pcb_gs = LWPGS_SEL; + + /* + * Ensure we go out via update_sregs. + */ + pcb->pcb_rupdate = 1; + } + } + kpreempt_enable(); + break; + } + + case DATAMODEL_LP64: { + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * For 64-bit processes we ensure that the correct %fsbase + * value is loaded: + */ + if (lwpd->br_ntv_fsbase != 0) { + struct pcb *pcb = &lwp->lwp_pcb; + + kpreempt_disable(); + if (pcb->pcb_fsbase != lwpd->br_ntv_fsbase) { + pcb->pcb_fsbase = lwpd->br_ntv_fsbase; + + /* + * Ensure we go out via update_sregs. 
+ */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); + } + break; + } + + default: + cmn_err(CE_PANIC, "unknown data model: %d", datamodel); + } +#elif defined(__i386) + struct regs *rp = lwptoregs(lwp); + + rp->r_gs = LWPGS_SEL; +#else +#error "unknown x86" +#endif +} + +#if defined(__amd64) +/* + * Call frame for the 64-bit usermode emulation handler: + * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args) + * + * old sp: -------------------------------------------------------------- + * | - ucontext_t (register state for emulation) + * | - uintptr_t[6] (system call arguments array) + * V -------------------------------------------------------------- + * new sp: - bogus return address + * + * Arguments are passed in registers, per the AMD64 ABI: %rdi, %rsi and %rdx. + * + * This routine has no return value. If the frame cannot be written to the + * native stack, the process is terminated via exit(CLD_KILLED, SIGSEGV). + */ +void +lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + label_t lab; + uintptr_t uc_addr; + uintptr_t args_addr; + uintptr_t top; + /* + * Variables used after on_fault() returns for a fault + * must be volatile. + */ + volatile size_t frsz; + volatile uintptr_t sp; + volatile proc_t *p = lwptoproc(lwp); + volatile int watched; + + /* + * We should not be able to get here unless we are running Linux + * code for a system call we cannot emulate in the kernel. + */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND); + + /* + * The AMD64 ABI requires us to align the return address on the stack + * so that when the called function pushes %rbp, the stack is 16-byte + * aligned. + * + * This routine, like the amd64 version of sendsig(), depends on + * STACK_ALIGN being 16 and STACK_ENTRY_ALIGN being 8. 
+ */ +#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8 +#error "lx_emulate_user() amd64 did not find the expected stack alignments" +#endif + + /* + * We begin at the current native stack pointer, and reserve space for + * the ucontext_t we are copying onto the stack, as well as the call + * arguments for the usermode emulation handler. + * + * We 16-byte align the entire frame, and then unalign it again by + * adding space for the return address. + */ + frsz = SA(sizeof (ucontext_t)) + SA(6 * sizeof (uintptr_t)) + + sizeof (uintptr_t); + VERIFY((frsz & (STACK_ALIGN - 1UL)) == 8); + VERIFY((frsz & (STACK_ENTRY_ALIGN - 1UL)) == 0); + + if (lwpd->br_ntv_stack == lwpd->br_ntv_stack_current) { + /* + * Nobody else is using the stack right now, so start at the + * top. + */ + top = lwpd->br_ntv_stack_current; + } else { + /* + * Drop below the 128-byte reserved region of the stack frame + * we are interrupting. + */ + top = lwpd->br_ntv_stack_current - STACK_RESERVE; + } + top = top & ~(STACK_ALIGN - 1); + sp = top - frsz; + + uc_addr = top - SA(sizeof (ucontext_t)); + args_addr = uc_addr - SA(6 * sizeof (uintptr_t)); + + watched = watch_disable_addr((caddr_t)sp, frsz, S_WRITE); + if (on_fault(&lab)) { + goto badstack; + } + + /* + * Save the register state we preserved on the way into this brand + * system call and drop it on the native stack. + */ + { + /* + * Note: the amd64 ucontext_t is 864 bytes. + */ + ucontext_t uc; + + /* + * We do not want to save the signal mask for an emulation + * context. Some emulated system calls alter the signal mask; + * restoring it when the emulation is complete would clobber + * those intentional side effects. 
+ */ + savecontext(&uc, NULL); + + /* + * Mark this as a system call emulation context: + */ + uc.uc_brand_data[0] = (void *)((uintptr_t) + uc.uc_brand_data[0] | LX_UC_FRAME_IS_SYSCALL); + + copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc)); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr); + lwp->lwp_oldcontext = (uintptr_t)uc_addr; + + /* + * Copy the system call arguments out to userland: + */ + copyout_noerr(args, (void *)(uintptr_t)args_addr, + 6 * sizeof (uintptr_t)); + + /* + * Drop the bogus return address on the stack. + */ + suword64_noerr((void *)sp, 0); + + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)sp, frsz, S_WRITE); + } + + /* + * Pass the arguments to lx_emulate() in the appropriate registers. + */ + rp->r_rdi = uc_addr; + rp->r_rsi = syscall_num; + rp->r_rdx = args_addr; + + /* + * In order to be able to restore %rdx (loaded with the address of + * the arguments array above), we need to JUSTRETURN. + */ + lwp->lwp_eosys = JUSTRETURN; + curthread->t_post_sys = 1; + aston(curthread); + + /* + * Set stack pointer and return address to the usermode emulation + * handler: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, sp); + + /* + * Divert execution, on our return, to the usermode emulation stack + * and handler: + */ + rp->r_fp = 0; + rp->r_sp = sp; + rp->r_pc = ptolxproc(p)->l_handler; + + /* + * Fix up segment registers, etc. 
+ */ + lx_switch_to_native(lwp); + + return; + +badstack: + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)sp, frsz, S_WRITE); + } + +#ifdef DEBUG + printf("lx_emulate_user: bad native stack cmd=%s, pid=%d, sp=0x%lx\n", + PTOU(p)->u_comm, p->p_pid, sp); +#endif + + exit(CLD_KILLED, SIGSEGV); +} + +#if defined(_SYSCALL32_IMPL) +/* + * Call frame for the 32-bit usermode emulation handler: + * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args) + * + * old sp: -------------------------------------------------------------- + * | - ucontext_t (register state for emulation) + * | - uintptr_t[6] (system call arguments array) + * | -------------------------------------------------------------- + * | - arg2: uintptr_t * (pointer to arguments array above) + * | - arg1: int (system call number) + * V - arg0: ucontext_t * (pointer to context above) + * new sp: - bogus return address + */ +struct lx_emu_frame32 { + caddr32_t retaddr; /* 0 */ + caddr32_t ucontextp; /* 4 */ + int32_t syscall_num; /* 8 */ + caddr32_t argsp; /* c */ +}; + +/* + * This function arranges for the lwp to execute the usermode emulation handler + * for this system call. The mechanism is similar to signal handling, and this + * function is modelled on sendsig32(). + * + * The six system call arguments in "args" are narrowed to uint32_t before + * being copied to the native stack. This routine has no return value; if + * the frame cannot be written to the native stack, the process is terminated + * via exit(CLD_KILLED, SIGSEGV). + */ +void +lx_emulate_user32(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + label_t lab; + caddr32_t uc_addr; + caddr32_t args_addr; + caddr32_t top; + /* + * Variables used after on_fault() returns for a fault + * must be volatile. + */ + volatile size_t frsz; + volatile caddr32_t sp; + volatile proc_t *p = lwptoproc(lwp); + volatile int watched; + + /* + * We should not be able to get here unless we are running Linux + * code for a system call we cannot emulate in the kernel. 
+ */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND); + + /* + * We begin at the current native stack pointer, and reserve space for + * the ucontext_t we are copying onto the stack, as well as the call + * arguments for the usermode emulation handler. + */ + frsz = SA32(sizeof (ucontext32_t)) + SA32(6 * sizeof (uint32_t)) + + SA32(sizeof (struct lx_emu_frame32)); + VERIFY((frsz & (STACK_ALIGN32 - 1)) == 0); + + top = (caddr32_t)(lwpd->br_ntv_stack_current & ~(STACK_ALIGN32 - 1)); + sp = top - frsz; + + uc_addr = top - SA32(sizeof (ucontext32_t)); + args_addr = uc_addr - SA32(6 * sizeof (uint32_t)); + + watched = watch_disable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + if (on_fault(&lab)) { + goto badstack; + } + + /* + * Save the register state we preserved on the way into this brand + * system call and drop it on the native stack. + */ + { + /* + * Note: ucontext32_t is 512 bytes. + */ + ucontext32_t uc; + + /* + * We do not want to save the signal mask for an emulation + * context. Some emulated system calls alter the signal mask; + * restoring it when the emulation is complete would clobber + * those intentional side effects. + */ + savecontext32(&uc, NULL); + + /* + * Mark this as a system call emulation context: + */ + uc.uc_brand_data[0] |= LX_UC_FRAME_IS_SYSCALL; + copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc)); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr); + lwp->lwp_oldcontext = (uintptr_t)uc_addr; + + /* + * Copy the system call arguments out to userland: + */ + { + uint32_t args32[6]; + + args32[0] = args[0]; + args32[1] = args[1]; + args32[2] = args[2]; + args32[3] = args[3]; + args32[4] = args[4]; + args32[5] = args[5]; + + copyout_noerr(&args32, (void *)(uintptr_t)args_addr, + sizeof (args32)); + } + + /* + * Assemble the call frame on the stack. The frame is written at + * the new stack pointer: a zero (bogus) return address, followed + * by the context pointer, system call number and arguments pointer + * that the handler receives as its three arguments. 
+ */ + { + struct lx_emu_frame32 frm; + + frm.retaddr = 0; + frm.ucontextp = uc_addr; + frm.argsp = args_addr; + frm.syscall_num = syscall_num; + + copyout_noerr(&frm, (void *)(uintptr_t)sp, sizeof (frm)); + } + + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + } + + /* + * Set stack pointer and return address to the usermode emulation + * handler: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, sp); + + /* + * Divert execution, on our return, to the usermode emulation stack + * and handler: + */ + rp->r_fp = 0; + rp->r_sp = sp; + rp->r_pc = ptolxproc(p)->l_handler; + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + + return; + +badstack: + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + } + +#ifdef DEBUG + printf("lx_emulate_user32: bad native stack cmd=%s, pid=%d, sp=0x%x\n", + PTOU(p)->u_comm, p->p_pid, sp); +#endif + + exit(CLD_KILLED, SIGSEGV); +} +#endif /* _SYSCALL32_IMPL */ + +#else /* !__amd64 (__i386) */ + +void +lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); +} + +#endif /* __amd64 */ diff --git a/usr/src/uts/intel/brand/lx/lx_brand_asm.s b/usr/src/uts/intel/brand/lx/lx_brand_asm.s deleted file mode 100644 index 568d462c2c..0000000000 --- a/usr/src/uts/intel/brand/lx/lx_brand_asm.s +++ /dev/null @@ -1,359 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. - */ - -#if defined(__lint) - -#include <sys/systm.h> - -#else /* __lint */ - -#include <sys/controlregs.h> -#include "genassym.h" -#include "../common/brand_asm.h" - -#endif /* __lint */ - -#ifdef __lint - -void -lx_brand_int80_callback(void) -{ -} - -void -lx_brand_syscall_callback(void) -{ -} - -#else /* __lint */ - -#if defined(__amd64) - -/* - * syscall handler for 32-bit Linux user processes: - * See "64-BIT INTERPOSITION STACK" in brand_asm.h. - */ -ENTRY(lx_brand_int80_callback) - GET_PROCP(SP_REG, 0, %r15) - movq P_ZONE(%r15), %r15 /* grab the zone pointer */ - /* grab the 'max syscall num' for this process from 'zone brand data' */ - movq ZONE_BRAND_DATA(%r15), %r15 /* grab the zone brand ptr */ - movl LXZD_MAX_SYSCALL(%r15), %r15d /* get the 'max sysnum' word */ - cmpq %r15, %rax /* is 0 <= syscall <= MAX? */ - jbe 0f /* yes, syscall is OK */ - xorl %eax, %eax /* no, zero syscall number */ -0: - -.lx_brand_int80_patch_point: - jmp .lx_brand_int80_notrace - -.lx_brand_int80_notrace: - CALC_TABLE_ADDR(%r15, L_HANDLER) -1: - movq %r15, %rax - GET_V(%rsp, 0, V_SSP, %rsp) /* restore intr. stack pointer */ - xchgq (%rsp), %rax /* swap %rax and return addr */ - jmp sys_sysint_swapgs_iret - -.lx_brand_int80_trace: - /* - * If tracing is active, we vector to an alternate trace-enabling - * handler table instead. 
- */ - CALC_TABLE_ADDR(%r15, L_TRACEHANDLER) - jmp 1b -SET_SIZE(lx_brand_int80_callback) - -#define PATCH_POINT80 _CONST(.lx_brand_int80_patch_point + 1) -#define PATCH_VAL80 _CONST(.lx_brand_int80_trace - .lx_brand_int80_notrace) - -ENTRY(lx_brand_int80_enable) - movl $1, lx_systrace_brand_enabled(%rip) - movq $PATCH_POINT80, %r8 - movb $PATCH_VAL80, (%r8) - ret -SET_SIZE(lx_brand_int80_enable) - -ENTRY(lx_brand_int80_disable) - movq $PATCH_POINT80, %r8 - movb $0, (%r8) - movl $0, lx_systrace_brand_enabled(%rip) - ret -SET_SIZE(lx_brand_int80_disable) - - -/* - * syscall handler for 64-bit user processes: - * - * We're running on the kernel's %gs. - * - * We return directly to userland, bypassing the update_sregs() logic, so - * this routine must NOT do anything that could cause a context switch. - * - * %rax - syscall number - * - * See uts/i86pc/ml/syscall_asm_amd64.s for what happens before we get into - * the following lx brand-specific codepath. - * - * As the comment on the BRAND_CALLBACK macro describes, when we're called, all - * general registers, except for %r15, are as they were when the user process - * made the system call. %r15 is available to the callback as a scratch - * register. If the callback returns to the kernel path, %r15 does not have to - * be restored to the user value since BRAND_CALLBACK does that. If we jump - * out to the emulation we need to restore %r15 here. - * - * To 'return' to our user-space handler, we just need to place its address - * into %rcx. The original return address is passed back in %rax. - * - * Since this is the common syscall path for all 64-bit code (both Linux and - * native libc) in the branded zone (unlike the int80 path), we have to do a - * bit more checking to see if interpositioning is in effect (i.e. syscalls - * from the native ld.so.1 are not interposed since the emulation has not yet - * been installed, or the emulation is in native syscall mode). 
- */ -ENTRY(lx_brand_syscall_callback) - /* callback prologue */ - GET_PROCP(SP_REG, 0, %r15) - mov __P_BRAND_DATA(%r15), %r15 /* get p_brand_data */ - cmp $0, %r15 /* null ptr? */ - je 2f /* yes, take normal ret path */ - cmp $0, L_HANDLER(%r15) /* handler installed? */ - je 2f /* no, take normal ret path */ - - /* check for native vs. Linux syscall */ - GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ - movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ - movl BR_NTV_SYSCALL(%r15), %r15d /* grab syscall src flag */ - cmp $1, %r15 /* check for native syscall */ - je 2f /* is native, stay in kernel */ - - /* Linux syscall - subsequent emul. syscalls will use native mode */ - GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ - movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ - movl $1, BR_NTV_SYSCALL(%r15) /* set native syscall flag */ - - /* check if we have to restore native fsbase */ - GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ - movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ - movq BR_NTV_FSBASE(%r15), %r15 /* grab native fsbase */ - cmp $0, %r15 /* native fsbase not saved? */ - je 3f /* yes, skip loading */ - -#ifdef DEBUG - /* - * This block is basically similar to a large assert. - * - * In debug code we do some extra validation of the %fsbase register to - * validate that we always have the expected Linux thread pointer and - * not the native value. At this point we know that the lwp brand data - * should contain the Linux %fsbase (from a Linux arch_prctl syscall) - * since the native %fsbase check above is non-null. We also know that - * we are making a Linux syscall from the other check above. We read - * the %fsbase and compare to the saved Linux %fsbase in the lwp_brand - * data. 
If we don't have the expected value, we save the incorrect - * %fsbase value into the br_lx_fsbase member for later inspection and - * change the syscall we are making into the Linux pivot_root syscall - * (an obscure syscall which we don't support and which an app in the - * zone cannot use). This allows us to see this error downstream via - * DTrace and see the incorrect %fsbase value we had. - */ - GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ - movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ - movq BR_LX_FSBASE(%r15), %r15 /* grab Linux fsbase */ - - subq $24, %rsp /* make room for 3 regs */ - movq %rax, 0x0(%rsp) /* save regs used by rdmsr */ - movq %rcx, 0x8(%rsp) - movq %rdx, 0x10(%rsp) - - movl $MSR_AMD_FSBASE, %ecx /* fsbase msr */ - rdmsr /* get fsbase to edx:eax */ - - /* fix %edx; %eax lo already ok */ - shlq $32, %rdx - or %rdx, %rax /* full value in %rax */ - cmp %rax, %r15 /* check if is lx fsbase */ - je 4f /* match, ok */ - - movq %rax, %rdi /* pass bad fsbase as arg0 */ - movq $155, %rax /* fail! use pivot_root */ - jmp 5f - -4: - movq 0x0(%rsp), %rax /* restore %rax */ -5: - movq 0x8(%rsp), %rcx /* restore other regs */ - movq 0x10(%rsp), %rdx - addq $24, %rsp - - /* reload r15 with the native value */ - GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */ - movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */ - movq BR_NTV_FSBASE(%r15), %r15 /* grab native fsbase */ -#endif - - /* - * Switch fsbase from Linux value back to native value. Also update pcb - * so that if we service an interrupt we will restore the correct fsbase - * in update_sregs(). 
- */ - subq $24, %rsp /* make room for 3 regs */ - movq %rax, 0x0(%rsp) /* save regs used by wrmsr */ - movq %rcx, 0x8(%rsp) - movq %rdx, 0x10(%rsp) - movq %r15, %rax /* native fsbase to %rax */ - movq %rax, %rdx /* setup regs for wrmsr */ - shrq $32, %rdx /* fix %edx; %eax already ok */ - movl $MSR_AMD_FSBASE, %ecx /* fsbase msr */ - wrmsr /* set fsbase from edx:eax */ - movq %rsp, %rdx /* use rdx as temp sp */ - addq $24, %rdx /* fix it back up */ - GET_V(%rdx, 0, V_LWP, %r15); /* get lwp pointer */ - movq %rax, LWP_PCB_FSBASE(%r15) /* save native fsbase in pcb */ - movq 0x0(%rsp), %rax /* restore regs */ - movq 0x8(%rsp), %rcx - movq 0x10(%rsp), %rdx - addq $24, %rsp - -3: - /* - * Linux syscall - validate syscall number. - * If necessary, the Linux %fsbase has already been loaded above. - */ - GET_PROCP(SP_REG, 0, %r15) - movq P_ZONE(%r15), %r15 /* grab the zone pointer */ - /* grab the 'max syscall num' for this process from 'zone brand data' */ - movq ZONE_BRAND_DATA(%r15), %r15 /* grab the zone brand ptr */ - movl LXZD_MAX_SYSCALL(%r15), %r15d /* get the 'max sysnum' word */ - cmp %r15, %rax /* is 0 <= syscall <= MAX? */ - ja 2f /* no, take normal ret path */ - -.lx_brand_syscall_patch_point: - jmp .lx_brand_syscall_notrace -.lx_brand_syscall_notrace: - - CALC_TABLE_ADDR(%r15, L_HANDLER) -1: - mov %rcx, %rax; /* save orig return addr in syscall_reg */ - mov %r15, %rcx; /* place new return addr in %rcx */ - mov %gs:CPU_RTMP_R15, %r15; /* restore scratch register */ - mov V_SSP(SP_REG), SP_REG /* restore user stack pointer */ - jmp nopop_sys_syscall_swapgs_sysretq - -2: /* no emulation, continue normal system call flow */ - retq - -.lx_brand_syscall_trace: - /* - * If tracing is active, we vector to an alternate trace-enabling - * handler table instead. 
- */ - CALC_TABLE_ADDR(%r15, L_TRACEHANDLER) - jmp 1b -SET_SIZE(lx_brand_syscall_callback) - -#define PATCH_POINT_SC _CONST(.lx_brand_syscall_patch_point + 1) -#define PATCH_VAL_SC \ - _CONST(.lx_brand_syscall_trace - .lx_brand_syscall_notrace) - -ENTRY(lx_brand_syscall_enable) - movl $1, lx_systrace_brand_enabled(%rip) - movq $PATCH_POINT_SC, %r8 - movb $PATCH_VAL_SC, (%r8) - ret -SET_SIZE(lx_brand_syscall_enable) - -ENTRY(lx_brand_syscall_disable) - movq $PATCH_POINT_SC, %r8 - movb $0, (%r8) - movl $0, lx_systrace_brand_enabled(%rip) - ret -SET_SIZE(lx_brand_syscall_disable) - - -#elif defined(__i386) - -/* - * See "32-BIT INTERPOSITION STACK" in brand_asm.h. - */ -ENTRY(lx_brand_int80_callback) - GET_PROCP(SP_REG, 0, %ebx) - movl P_ZONE(%ebx), %ebx /* grab the zone pointer */ - /* grab the 'max syscall num' for this process from 'zone brand data' */ - movl ZONE_BRAND_DATA(%ebx), %ebx /* grab the zone brand data */ - movl LXZD_MAX_SYSCALL(%ebx), %ebx /* get the max sysnum */ - - cmpl %ebx, %eax /* is 0 <= syscall <= MAX? */ - jbe 0f /* yes, syscall is OK */ - xorl %eax, %eax /* no, zero syscall number */ -0: - -.lx_brand_int80_patch_point: - jmp .lx_brand_int80_notrace - -.lx_brand_int80_notrace: - CALC_TABLE_ADDR(%ebx, L_HANDLER) - -1: - movl %ebx, %eax - GET_V(%esp, 0, V_U_EBX, %ebx) /* restore scratch register */ - addl $V_END, %esp /* restore intr. stack ptr */ - xchgl (%esp), %eax /* swap new and orig. 
return addrs */ - jmp nopop_sys_rtt_syscall - -.lx_brand_int80_trace: - CALC_TABLE_ADDR(%ebx, L_TRACEHANDLER) - jmp 1b -SET_SIZE(lx_brand_int80_callback) - - -#define PATCH_POINT _CONST(.lx_brand_int80_patch_point + 1) -#define PATCH_VAL _CONST(.lx_brand_int80_trace - .lx_brand_int80_notrace) - -ENTRY(lx_brand_int80_enable) - pushl %ebx - pushl %eax - movl $1, lx_systrace_brand_enabled - movl $PATCH_POINT, %ebx - movl $PATCH_VAL, %eax - movb %al, (%ebx) - popl %eax - popl %ebx - ret -SET_SIZE(lx_brand_int80_enable) - -ENTRY(lx_brand_int80_disable) - pushl %ebx - movl $PATCH_POINT, %ebx - movb $0, (%ebx) - movl $0, lx_systrace_brand_enabled - popl %ebx - ret -SET_SIZE(lx_brand_int80_disable) - -#endif /* __i386 */ -#endif /* __lint */ diff --git a/usr/src/uts/intel/genassym/offsets.in b/usr/src/uts/intel/genassym/offsets.in index 59763c1b4b..70221c02f9 100644 --- a/usr/src/uts/intel/genassym/offsets.in +++ b/usr/src/uts/intel/genassym/offsets.in @@ -21,7 +21,7 @@ \ \ Copyright 2010 Sun Microsystems, Inc. All rights reserved. \ Use is subject to license terms. -\ Copyright 2014 Joyent, Inc. All rights reserved. +\ Copyright 2015 Joyent, Inc. \ \ @@ -37,13 +37,7 @@ lx_proc_data l_handler - l_tracehandler - l_traceflag - -lx_zone_data - lxzd_max_syscall lx_lwp_data - br_ntv_syscall br_lx_fsbase br_ntv_fsbase diff --git a/usr/src/uts/intel/ia32/os/archdep.c b/usr/src/uts/intel/ia32/os/archdep.c index 42cc0d4d10..db4ccac06b 100644 --- a/usr/src/uts/intel/ia32/os/archdep.c +++ b/usr/src/uts/intel/ia32/os/archdep.c @@ -25,7 +25,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -575,6 +575,13 @@ ucontext_32ton(const ucontext32_t *src, ucontext_t *dst) if (src->uc_flags & UC_FPU) fpregset_32ton(&src->uc_mcontext.fpregs, &dst->uc_mcontext.fpregs); + + /* + * Copy the brand-private data: + */ + dst->uc_brand_data[0] = (void *)(uintptr_t)src->uc_brand_data[0]; + dst->uc_brand_data[1] = (void *)(uintptr_t)src->uc_brand_data[1]; + dst->uc_brand_data[2] = (void *)(uintptr_t)src->uc_brand_data[2]; } #endif /* _SYSCALL32_IMPL */ @@ -633,7 +640,7 @@ static greg_t fix_segreg(greg_t sr, int iscs, model_t datamodel) { kthread_t *t = curthread; - + switch (sr &= 0xffff) { case 0: @@ -669,7 +676,7 @@ fix_segreg(greg_t sr, int iscs, model_t datamodel) break; } - /* + /* * Allow this process's brand to do any necessary segment register * manipulation. */ diff --git a/usr/src/uts/intel/ia32/os/sendsig.c b/usr/src/uts/intel/ia32/os/sendsig.c index 979c9e3294..f6c14324bc 100644 --- a/usr/src/uts/intel/ia32/os/sendsig.c +++ b/usr/src/uts/intel/ia32/os/sendsig.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,6 +90,8 @@ #include <sys/kdi.h> #include <sys/contract_impl.h> #include <sys/x86_archext.h> +#include <sys/brand.h> +#include <sys/sdt.h> /* * Construct the execution environment for the user's signal @@ -186,7 +191,18 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); - if (newstack) { + /* + * If this is a branded process, the brand may provide an alternate + * stack pointer for signal delivery: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) { + /* + * Use the stack pointer value provided by the brand, + * accounting for the 128-byte reserved region. 
+ */ + newstack = 0; + fp = BROP(p)->b_sendsig_stack(sig) - STACK_RESERVE; + } else if (newstack) { fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); } else { @@ -293,6 +309,8 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) kmem_free(tuc, sizeof (*tuc)); tuc = NULL; + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc); lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { @@ -342,6 +360,14 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) } /* + * Allow the brand to perform additional book-keeping once the signal + * handling frame has been fully assembled: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) { + BROP(p)->b_sendsig(sig); + } + + /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. @@ -417,7 +443,17 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); - if (newstack) { + /* + * If this is a branded process, the brand may provide an alternate + * stack pointer for signal delivery: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) { + /* + * Use the stack pointer value provided by the brand: + */ + newstack = 0; + fp = BROP(p)->b_sendsig_stack(sig); + } else if (newstack) { fp = (caddr_t)(SA32((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA32(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN32); } else if ((rp->r_ss & 0xffff) != UDS_SEL) { @@ -432,8 +468,9 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]); else fp = (caddr_t)rp->r_sp; - } else + } else { fp = (caddr_t)rp->r_sp; + } /* * Force proper stack pointer alignment, even in the face of a @@ -511,6 +548,8 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) kmem_free(tuc, sizeof (*tuc)); tuc = 
NULL; + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc); lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { @@ -560,6 +599,14 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) } /* + * Allow the brand to perform additional book-keeping once the signal + * handling frame has been fully assembled: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) { + BROP(p)->b_sendsig(sig); + } + + /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. @@ -637,7 +684,17 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); - if (newstack) { + /* + * If this is a branded process, the brand may provide an alternate + * stack pointer for signal delivery: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) { + /* + * Use the stack pointer value provided by the brand: + */ + newstack = 0; + fp = BROP(p)->b_sendsig_stack(sig); + } else if (newstack) { fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); } else if ((rp->r_ss & 0xffff) != UDS_SEL) { @@ -652,8 +709,9 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]); else fp = (caddr_t)rp->r_sp; - } else + } else { fp = (caddr_t)rp->r_sp; + } /* * Force proper stack pointer alignment, even in the face of a @@ -731,6 +789,8 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) kmem_free(tuc, sizeof (*tuc)); tuc = NULL; + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc); lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { @@ -768,6 +828,14 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) } /* + * Allow the brand to perform additional book-keeping once the signal + * handling frame has 
been fully assembled: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) { + BROP(p)->b_sendsig(sig); + } + + /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. diff --git a/usr/src/uts/intel/ia32/syscall/getcontext.c b/usr/src/uts/intel/ia32/syscall/getcontext.c index cb5a5b52ba..8f72b5da72 100644 --- a/usr/src/uts/intel/ia32/syscall/getcontext.c +++ b/usr/src/uts/intel/ia32/syscall/getcontext.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,6 +49,7 @@ #include <sys/schedctl.h> #include <sys/debug.h> #include <sys/sysmacros.h> +#include <sys/sdt.h> /* * Save user context. @@ -125,7 +129,23 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask) else ucp->uc_flags &= ~UC_FPU; - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. + */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } + + if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext != NULL) { + /* + * Allow the brand the chance to modify the context we + * saved: + */ + BROP(p)->b_savecontext(ucp); + } } /* @@ -136,7 +156,19 @@ restorecontext(ucontext_t *ucp) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); + proc_t *p = lwptoproc(lwp); + if (PROC_IS_BRANDED(p) && BROP(p)->b_restorecontext != NULL) { + /* + * Allow the brand the chance to modify the context before + * we restore it: + */ + BROP(p)->b_restorecontext(ucp); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, + uintptr_t, (uintptr_t)ucp->uc_link); lwp->lwp_oldcontext = (uintptr_t)ucp->uc_link; if (ucp->uc_flags & UC_STACK) { @@ -184,6 +216,7 @@ getsetcontext(int flag, void *arg) ucontext_t *ucp; klwp_t *lwp = ttolwp(curthread); stack_t dummy_stk; + proc_t *p = 
lwptoproc(lwp); /* * In future releases, when the ucontext structure grows, @@ -228,6 +261,15 @@ getsetcontext(int flag, void *arg) return (set_errno(EFAULT)); } + /* + * If this is a branded process, copy in the brand-private + * data: + */ + if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data, + &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) { + return (set_errno(EFAULT)); + } + restorecontext(&uc); if ((uc.uc_flags & UC_STACK) && (lwp->lwp_ustack != 0)) @@ -311,7 +353,23 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask) else ucp->uc_flags &= ~UC_FPU; - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. + */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } + + if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext32 != NULL) { + /* + * Allow the brand the chance to modify the context we + * saved: + */ + BROP(p)->b_savecontext32(ucp); + } } int @@ -323,6 +381,7 @@ getsetcontext32(int flag, void *arg) klwp_t *lwp = ttolwp(curthread); caddr32_t ustack32; stack32_t dummy_stk32; + proc_t *p = lwptoproc(lwp); switch (flag) { default: @@ -354,6 +413,15 @@ getsetcontext32(int flag, void *arg) return (set_errno(EFAULT)); } + /* + * If this is a branded process, copy in the brand-private + * data: + */ + if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data, + &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) { + return (set_errno(EFAULT)); + } + ucontext_32ton(&uc, &ucnat); restorecontext(&ucnat); diff --git a/usr/src/uts/intel/lx_brand/Makefile.rules b/usr/src/uts/intel/lx_brand/Makefile.rules index e78bcb1827..0a83e15493 100644 --- a/usr/src/uts/intel/lx_brand/Makefile.rules +++ b/usr/src/uts/intel/lx_brand/Makefile.rules @@ -21,7 +21,7 @@ # # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright 2014 Joyent, Inc. All rights reserved. +# Copyright 2015 Joyent, Inc. 
# # @@ -44,15 +44,23 @@ $(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.s - $(COMPILE.s) -o $@ $< + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< $(OBJS_DIR_OBJ64)/%.o: $(LX_CMN)/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) $(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.s - $(COMPILE.s) -o $@ $< + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< $(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/os/%.c $(COMPILE.c) -o $@ $< @@ -62,12 +70,16 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(LX_CMN)/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) $(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.s - $(COMPILE.s) -o $@ $< + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< # # Section 1b: Lint `object' build rules. diff --git a/usr/src/uts/intel/sys/ucontext.h b/usr/src/uts/intel/sys/ucontext.h index acd6ddc99e..26f5923930 100644 --- a/usr/src/uts/intel/sys/ucontext.h +++ b/usr/src/uts/intel/sys/ucontext.h @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -82,9 +85,16 @@ struct __ucontext { sigset_t uc_sigmask; stack_t uc_stack; mcontext_t uc_mcontext; - long uc_filler[5]; /* see ABI spec for Intel386 */ + /* + * The Intel386 ABI specification includes a 5-element array of longs + * called "uc_filler", padding the size of the struct to 512 bytes. 
To + * allow zone brands to communicate extra data right the way through + * the signal handling process, from sigacthandler to setcontext, we + * steal the first three of these longs as a brand-private member. + */ + void *uc_brand_data[3]; + long uc_filler[2]; }; - #if defined(_SYSCALL32) /* Kernel view of user ILP32 ucontext structure */ @@ -95,7 +105,8 @@ typedef struct ucontext32 { sigset_t uc_sigmask; stack32_t uc_stack; mcontext32_t uc_mcontext; - int32_t uc_filler[5]; + caddr32_t uc_brand_data[3]; + int32_t uc_filler[2]; } ucontext32_t; #if defined(_KERNEL) diff --git a/usr/src/uts/sparc/syscall/getcontext.c b/usr/src/uts/sparc/syscall/getcontext.c index 437eef5e1a..fd0acaadf1 100644 --- a/usr/src/uts/sparc/syscall/getcontext.c +++ b/usr/src/uts/sparc/syscall/getcontext.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -110,10 +113,15 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask) ucp->uc_flags &= ~UC_FPU; ucp->uc_mcontext.gwins = (gwindows_t *)NULL; - /* - * Save signal mask. - */ - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. + */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } } @@ -412,11 +420,16 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask, struct fq32 *dfq) ucp->uc_flags &= ~UC_FPU; ucp->uc_mcontext.gwins = (caddr32_t)NULL; - /* - * Save signal mask (the 32- and 64-bit sigset_t structures are - * identical). - */ - sigktou(mask, (sigset_t *)&ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask (the 32- and 64-bit sigset_t structures are + * identical). + */ + sigktou(mask, (sigset_t *)&ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } } int |