summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
author Joshua M. Clulow <jmc@joyent.com> 2015-02-23 22:43:12 -0800
committer Joshua M. Clulow <jmc@joyent.com> 2015-02-24 06:43:12 +0000
commit cbb62638d5ccc777c90e15b41b1cf6943d284bd4 (patch)
tree 9c660c98372081889d3f7e2e63853a37693468df /usr/src
parent a5e945f618fb3657405a0971ee2886cbee1595d7 (diff)
download illumos-joyent-cbb62638d5ccc777c90e15b41b1cf6943d284bd4.tar.gz
OS-3561 lxbrand emulation library should execute on alternate stack
OS-3558 lxbrand add support for full in-kernel syscall handling
OS-3545 lx_syscall_regs should not walk stack
OS-3868 many LTP testcases now hang
OS-3901 lxbrand lx_recvmsg fails to translate control messages when 64-bit
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/mdb/common/modules/libc/libc.c2
-rw-r--r--usr/src/common/brand/lx/lx_signum.c80
-rw-r--r--usr/src/common/brand/lx/lx_signum.h6
-rw-r--r--usr/src/lib/brand/lx/lx_brand/Makefile.com9
-rw-r--r--usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s382
-rw-r--r--usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s46
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/clone.c437
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/file.c28
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/fork.c84
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/lx_brand.c2041
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/lx_provider.d18
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c28
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/mem.c17
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/misc.c191
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/poll_select.c10
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/ptrace.c462
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/signal.c1212
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/socket.c110
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/stack.c280
-rw-r--r--usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s293
-rw-r--r--usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s60
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h35
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h104
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h65
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h26
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h3
-rw-r--r--usr/src/lib/libc/port/mapfile-vers1
-rw-r--r--usr/src/lib/libc/port/threads/sigaction.c22
-rw-r--r--usr/src/uts/common/brand/lx/dtrace/lx_systrace.c723
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_brand.c983
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_misc.c51
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_ptrace.c99
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_syscall.c1162
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_brand.h284
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_misc.h7
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_siginfo.h190
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_syscalls.h15
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_types.h (renamed from usr/src/lib/brand/lx/lx_brand/sys/lx_types.h)4
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_clone.c23
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_futex.c14
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_getpid.c16
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_id.c25
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_ioctl.c2
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_kill.c12
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_pipe.c26
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_rw.c25
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_sched.c38
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c130
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_thread_area.c30
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_wait.c (renamed from usr/src/lib/brand/lx/lx_brand/common/wait.c)295
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_xattr.c (renamed from usr/src/lib/brand/lx/lx_brand/common/xattr.c)30
-rw-r--r--usr/src/uts/common/brand/sn1/sn1_brand.c58
-rw-r--r--usr/src/uts/common/brand/sngl/sngl_brand.c58
-rw-r--r--usr/src/uts/common/brand/solaris10/s10_brand.c58
-rw-r--r--usr/src/uts/common/os/exec.c2
-rw-r--r--usr/src/uts/common/sys/brand.h7
-rw-r--r--usr/src/uts/common/sys/klwp.h4
-rw-r--r--usr/src/uts/i86pc/ml/offsets.in2
-rw-r--r--usr/src/uts/i86pc/ml/syscall_asm_amd64.s70
-rw-r--r--usr/src/uts/intel/Makefile.files6
-rw-r--r--usr/src/uts/intel/brand/lx/lx_archdep.c1171
-rw-r--r--usr/src/uts/intel/brand/lx/lx_brand_asm.s359
-rw-r--r--usr/src/uts/intel/genassym/offsets.in8
-rw-r--r--usr/src/uts/intel/ia32/os/archdep.c13
-rw-r--r--usr/src/uts/intel/ia32/os/sendsig.c78
-rw-r--r--usr/src/uts/intel/ia32/syscall/getcontext.c72
-rw-r--r--usr/src/uts/intel/lx_brand/Makefile.rules20
-rw-r--r--usr/src/uts/intel/sys/ucontext.h17
-rw-r--r--usr/src/uts/sparc/syscall/getcontext.c31
69 files changed, 6826 insertions, 5444 deletions
diff --git a/usr/src/cmd/mdb/common/modules/libc/libc.c b/usr/src/cmd/mdb/common/modules/libc/libc.c
index 7ad7f86996..c4b713f096 100644
--- a/usr/src/cmd/mdb/common/modules/libc/libc.c
+++ b/usr/src/cmd/mdb/common/modules/libc/libc.c
@@ -137,6 +137,8 @@ d_ucontext(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
uc.uc_stack.ss_sp, uc.uc_stack.ss_size, stack_flags(&uc.uc_stack));
mdb_printf(" mcontext = 0x%p\n",
addr + OFFSETOF(ucontext_t, uc_mcontext));
+ mdb_printf(" brand = 0x%p 0x%p 0x%p\n",
+ uc.uc_brand_data[0], uc.uc_brand_data[1], uc.uc_brand_data[2]);
return (DCMD_OK);
}
diff --git a/usr/src/common/brand/lx/lx_signum.c b/usr/src/common/brand/lx/lx_signum.c
index 08ab453885..22afb99ac7 100644
--- a/usr/src/common/brand/lx/lx_signum.c
+++ b/usr/src/common/brand/lx/lx_signum.c
@@ -22,11 +22,17 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/signal.h>
+#include <sys/lx_siginfo.h>
#include <lx_signum.h>
+#ifdef _KERNEL
+#include <sys/debug.h>
+#else
+#include <assert.h>
+#endif
/*
* Delivering signals to a Linux process is complicated by differences in
@@ -242,3 +248,75 @@ stol_signo[NSIG] = {
LX_SIGRTMIN + 30,
LX_SIGRTMAX, /* 73: Solaris _SIGRTMAX */
};
+
+/*
+ * Convert an illumos native signal number to a Linux signal number and return
+ * it. If no valid conversion is possible, the function falls back to the
+ * value of "defsig". In userland, passing a default signal number of "-1"
+ * will abort the program if the signal number could not be converted.
+ */
+int
+lx_stol_signo(int signo, int defsig)
+{
+ int rval;
+
+#ifdef _KERNEL
+ VERIFY(defsig != -1);
+#endif
+
+ if (signo < 0 || signo >= NSIG || (rval = stol_signo[signo]) < 1) {
+#ifndef _KERNEL
+ if (defsig == -1) {
+ assert(0);
+ }
+#endif
+ return (defsig);
+ }
+
+ return (rval);
+}
+
+/*
+ * Convert the "status" field of a SIGCLD siginfo_t. We need to extract the
+ * illumos signal number and convert it to a Linux signal number while leaving
+ * the ptrace(2) event bits intact. In userland, passing a default signal
+ * number of "-1" will abort the program if the signal number could not be
+ * converted, as for lx_stol_signo().
+ */
+int
+lx_stol_status(int s, int defsig)
+{
+ /*
+ * We mask out the top bit here in case PTRACE_O_TRACESYSGOOD
+ * is in use and 0x80 has been ORed with the signal number.
+ */
+ int stat = lx_stol_signo(s & 0x7f, defsig);
+
+ /*
+ * We must mix in the ptrace(2) event which may be stored in
+ * the second byte of the status code. We also re-include the
+ * PTRACE_O_TRACESYSGOOD bit.
+ */
+ return ((s & 0xff80) | stat);
+}
+
+int
+lx_stol_sigcode(int code)
+{
+ switch (code) {
+ case SI_USER:
+ return (LX_SI_USER);
+ case SI_LWP:
+ return (LX_SI_TKILL);
+ case SI_QUEUE:
+ return (LX_SI_QUEUE);
+ case SI_TIMER:
+ return (LX_SI_TIMER);
+ case SI_ASYNCIO:
+ return (LX_SI_ASYNCIO);
+ case SI_MESGQ:
+ return (LX_SI_MESGQ);
+ default:
+ return (code);
+ }
+}
diff --git a/usr/src/common/brand/lx/lx_signum.h b/usr/src/common/brand/lx/lx_signum.h
index f410500925..a7807c2b07 100644
--- a/usr/src/common/brand/lx/lx_signum.h
+++ b/usr/src/common/brand/lx/lx_signum.h
@@ -21,7 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _LX_SIGNUM_H
@@ -74,6 +74,10 @@ extern "C" {
extern const int ltos_signo[];
extern const int stol_signo[];
+extern int lx_stol_signo(int, int);
+extern int lx_stol_status(int, int);
+extern int lx_stol_sigcode(int);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/brand/lx/lx_brand/Makefile.com b/usr/src/lib/brand/lx/lx_brand/Makefile.com
index 80f30d48be..804dccfce7 100644
--- a/usr/src/lib/brand/lx/lx_brand/Makefile.com
+++ b/usr/src/lib/brand/lx/lx_brand/Makefile.com
@@ -21,7 +21,7 @@
#
# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-# Copyright 2014 Joyent, Inc. All rights reserved.
+# Copyright 2015 Joyent, Inc.
#
LX_CMN = $(SRC)/common/brand/lx
@@ -55,17 +55,16 @@ COBJS = aio.o \
sendfile.o \
signal.o \
socket.o \
+ stack.o \
stat.o \
statfs.o \
sysctl.o \
sysv_ipc.o \
time.o \
- truncate.o \
- wait.o \
- xattr.o
+ truncate.o
CMNOBJS = lx_signum.o
-ASOBJS = lx_handler.o lx_runexe.o lx_crt.o
+ASOBJS = lx_handler.o lx_crt.o
OBJECTS = $(CMNOBJS) $(COBJS) $(ASOBJS)
USDT_PROVIDERS = lx_provider.d
diff --git a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s
index af8fae621f..b33845d8a0 100644
--- a/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s
+++ b/usr/src/lib/brand/lx/lx_brand/amd64/lx_handler.s
@@ -11,7 +11,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
@@ -28,40 +28,8 @@
#include "assym.h"
/* 64-bit signal syscall numbers */
-#define LX_SYS_sigreturn 513
#define LX_SYS_rt_sigreturn 15
-/*
- * Each JMP must occupy 16 bytes.
- * The syscall offset is stored immediately above the red zone to avoid
- * clobbering data there. Once lx_handler is reached, the stack will be
- * advanced to account for both the red zone and the stored syscall offset.
- */
-#define JMP \
- movl $_CONST(. - lx_handler_table), -136(%rsp); \
- jmp lx_handler; \
- .align 16;
-
-#define JMP4 JMP; JMP; JMP; JMP
-#define JMP16 JMP4; JMP4; JMP4; JMP4
-#define JMP64 JMP16; JMP16; JMP16; JMP16
-#define JMP256 JMP64; JMP64; JMP64; JMP64
-
-/*
- * Alternate jump table that turns on lx_traceflag before proceeding with
- * the normal emulation routine.
- */
-#define TJMP \
- movl $_CONST(. - lx_handler_trace_table), -136(%rsp); \
- jmp lx_handler_trace; \
- .align 16;
-
-#define TJMP4 TJMP; TJMP; TJMP; TJMP
-#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4
-#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16
-#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64
-
-
#if defined(lint)
#include <sys/types.h>
@@ -69,343 +37,16 @@
#include <sys/signal.h>
void
-lx_handler_table(void)
-{}
-
-void
-lx_handler(void)
-{}
-
-/* ARGSUSED */
-void
-lx_setup_clone(uintptr_t gs, void *retaddr, void *stk)
-{}
-
-/* ARGSUSED */
-void
-lx_sigdeliver(int sig, siginfo_t *sip, void *p, size_t stacksz,
- void (*stack_frame_builder)(void), void (*lx_sighandler)(void),
- uintptr_t gs)
-{}
-
-/* ARGSUSED */
-void
-lx_sigacthandler(int sig, siginfo_t *s, void *p)
-{}
-
-void
-lx_sigreturn_tramp(void)
-{}
-
-void
lx_rt_sigreturn_tramp(void)
{}
-/* ARGSUSED */
void
-lx_sigreturn_tolibc(uintptr_t sp)
+lx_vsyscall_tramp(void)
{}
#else /* lint */
/*
- * On entry to this table, %rax will hold the return address. The
- * location where we enter the table is a function of the system
- * call number. The table needs the same alignment as the individual
- * entries.
- */
- .align 16
- ENTRY_NP(lx_handler_trace_table)
- TJMP256
- TJMP64
- TJMP64
- SET_SIZE(lx_handler_trace_table)
-
- .align 16
- ENTRY_NP(lx_handler_table)
- JMP256
- JMP64
- JMP64
- SET_SIZE(lx_handler_table)
-
- ENTRY_NP(lx_handler_trace)
- subq $136, %rsp /* skip red zone + syscall offset */
- pushq %rsi
- movq lx_traceflag@GOTPCREL(%rip), %rsi
- movq $1, (%rsi)
- popq %rsi
- addq $136, %rsp
- /*
- * While we could just fall through to lx_handler(), we "tail-call" it
- * instead to make ourselves a little more comprehensible to trace
- * tools.
- */
- jmp lx_handler
- SET_SIZE(lx_handler_trace)
-
- ALTENTRY(lx_handler)
- /*
- * We are running on the Linux process's stack here so we have to
- * account for the AMD64 ABI red zone of 128 bytes past the %rsp which
- * the process can use as scratch space. In addition to the red zone,
- * the syscall offset stored by the handler tables above must be
- * accounted for. To that end, rsp is advanced by a further 8 bytes to
- * include the syscall offset.
- */
- subq $136, %rsp /* red zone + syscall offset */
-
- /*
- * In order to keep the hander_table entries within 16 bytes, only 4
- * bytes of the syscall offset are stored during dispatch.
- * The upper 4 bytes are zeroed here to account for that.
- */
- movl $0, 4(%rsp)
-
- /*
- * %rbp isn't always going to be a frame pointer on Linux, but when
- * it is, saving it here lets us have a coherent stack backtrace.
- */
- pushq %rbp
-
- /*
- * Fill in a lx_regs_t structure on the stack.
- */
- subq $SIZEOF_LX_REGS_T, %rsp
-
- /*
- * Save %rbp and then fill it with what would be its usual value as
- * the frame pointer. The value we save for %rsp needs to be the
- * stack pointer at the time of the syscall so we need to skip the
- * red zone, saved %rbp and (what will be) the return address.
- */
- movq %rbp, LXR_RBP(%rsp)
- movq %rsp, %rbp
- addq $SIZEOF_LX_REGS_T, %rbp
- movq %rbp, LXR_RSP(%rsp)
- addq $144, LXR_RSP(%rsp) /* 128 byte red zone + 2 pointers */
-
- movq $0, LXR_FS(%rsp)
- movw %fs, LXR_FS(%rsp)
- movq %rdi, LXR_RDI(%rsp)
- movq %rsi, LXR_RSI(%rsp)
- movq %rbx, LXR_RBX(%rsp)
- movq %rdx, LXR_RDX(%rsp)
- movq %rcx, LXR_RCX(%rsp)
- movq %rax, LXR_RIP(%rsp) /* %rax holds the return addr. */
- movq %r8, LXR_R8(%rsp)
- movq %r9, LXR_R9(%rsp)
- movq %r10, LXR_R10(%rsp)
- movq %r11, LXR_R11(%rsp)
- movq %r12, LXR_R12(%rsp)
- movq %r13, LXR_R13(%rsp)
- movq %r14, LXR_R14(%rsp)
- movq %r15, LXR_R15(%rsp)
-
- /*
- * The kernel drops us into the middle of one of the tables above
- * that then stores the table offset immediately above the 128 byte
- * red zone and calls into lx_handler. That offset indicates the
- * syscall number while %rax holds the return address for the syscall.
- * We replace the value on the stack with the return address, and use
- * the value to compute the system call number by dividing by the table
- * entry size.
- */
- xchgq 8(%rbp), %rax /* just after the rbp we pushed */
- shrq $4, %rax
- movq %rax, LXR_RAX(%rsp)
-
- /*
- * Call lx_emulate() whose only argument is a pointer to the
- * lx_regs_t structure we've placed on the stack.
- */
- movq %rsp, %rdi
- call lx_emulate
-
- /*
- * We use this global symbol to identify this return site when
- * walking the stack backtrace. It needs to remain immediately
- * after the call to lx_emulate().
- */
- ALTENTRY(lx_emulate_done)
-
- /*
- * Restore the saved register state; we get %rbp and %rsp from
- * the ordinary locations rather than the saved state.
- */
- movq LXR_RDI(%rsp), %rdi
- movq LXR_RSI(%rsp), %rsi
- movq LXR_RBX(%rsp), %rbx
- movq LXR_RDX(%rsp), %rdx
- movq LXR_RCX(%rsp), %rcx
- movq LXR_RAX(%rsp), %rax
- movq LXR_R8(%rsp), %r8
- movq LXR_R9(%rsp), %r9
- movq LXR_R10(%rsp), %r10
- movq LXR_R11(%rsp), %r11
- movq LXR_R12(%rsp), %r12
- movq LXR_R13(%rsp), %r13
- movq LXR_R14(%rsp), %r14
- movq LXR_R15(%rsp), %r15
- /* XXX movw LXR_FS(%rsp), %fs */
-
- movq %rbp, %rsp
- popq %rbp
-
- /*
- * Returning from lx_handler is complicated by our preservation of the
- * red zone on the stack. The return address resides just above the
- * red zone making it impossible to use 'retq' and return rsp to the
- * correct value. Instead, rsp is manually moved to its original
- * position and we jmp using the return address at the known stack
- * offset above the red zone.
- */
- addq $136, %rsp /* red zone + return address */
- jmpq *-136(%rsp)
- SET_SIZE(lx_handler)
-
- /*
- * lx_setup_clone(lx_regs_t *regp, void *retaddr, void *stack)
- * Restore the register state using arg0 (%rdi).
- * Return to Linux app using arg1 (%rsi) with the Linux stack we got
- * in arg2 (%rdx).
- */
- ENTRY_NP(lx_setup_clone)
- /*
- * arg0 is a ptr to an lx_regs_t struct. The AMD64 ABI says that the
- * kernel clobbers %rcx and %r11 so we use those for working registers.
- */
- movq %rdi, %rcx /* arg0, use rcx as ptr */
- movq %rsi, %r11 /* arg1, the return addr */
- movq LXR_RDI(%rcx), %rdi
- movq LXR_RSI(%rcx), %rsi
- movq LXR_RBX(%rcx), %rbx
- movq LXR_R8(%rcx), %r8
- movq LXR_R9(%rcx), %r9
- movq LXR_R10(%rcx), %r10
- movq LXR_R12(%rcx), %r12
- movq LXR_R13(%rcx), %r13
- movq LXR_R14(%rcx), %r14
- movq LXR_R15(%rcx), %r15
-
- xorq %rbp, %rbp /* terminating stack */
- popq %rax /* pop the clone_start() return address */
- movq %rdx, %rsp /* arg2 is new stack pointer */
- movq LXR_RDX(%rcx), %rdx
- xorq %rax, %rax /* child returns 0 to SYS_clone() */
- jmp *%r11 /* return to Linux app. using arg1 addr. */
- SET_SIZE(lx_setup_clone)
-
- /*
- * lx_sigdeliver(int sig, siginfo_t *, ucontext_t *, int stack_size,
- * void *stack_build_routine, void *signal_handler, void *glibc_gs)
- *
- * The final parameter (%gs) is ignored in the 64-bit code.
- *
- * we're called by:
- * lx_call_user_handler(int sig, siginfo_t *sip, void *p)
- *
- * This routine allocates stack space for the lx_sigstack local
- * variable structure, calls a routine to populate that structure, and
- * then calls the Linux signal handler. This is written in assembly
- * because of the way we directly jmp to the Linux signal handler
- * with everything setup as if this function wasn't really here. We
- * rely on the code in lx_rt_sigreturn() to cleanup the things we've
- * pushed on the stack here.
- *
- * See lx_build_signal_frame() for the code which populates lx_sigstack.
- *
- * When we jump to the Linux signal handler, the stack will look
- * like this:
- *
- * =================================================
- * | %rbp |
- * | =================================================
- * | | stuff we saved in our prologue |
- * | =================================================
- * | | LX_SIGRT_MAGIC |
- * | =================================================
- * | | {unused word to maintain ABI stack alignment} |
- * V =================================================
- * | Linux local data built by lx stk_builder() |
- * =================================================
- *
- * Unlike the 32-bit case, we don't reset %rbp before jumping into the
- * Linux handler, since that would mean the handler would clobber our
- * data in the stack frame it builds.
- *
- */
- ENTRY_NP(lx_sigdeliver)
- pushq %rbp
- movq %rsp, %rbp
- subq $0x40, %rsp /* an extra word to maintain alignmnt */
- movq %rdi, -8(%rbp) /* sig */
- movq %rsi, -16(%rbp) /* siginfo* */
- movq %rdx, -24(%rbp) /* ucontext* */
- movq %rcx, -32(%rbp) /* stack size */
- movq %r8, -40(%rbp) /* stack builder */
- movq %r9, -48(%rbp) /* Linux signal handler */
-
- subq %rcx, %rsp /* create stack_size stack buffer */
-
- movq $LX_SIGRT_MAGIC, %rcx /* load and place marker value onto */
- movq %rcx, -56(%rbp) /* stack for lx_rt_sigreturn */
-
- movq %rsp, %rcx /* arg3 - %rcx is stack pointer */
- /* arg2 - %rdx is ucontext ptr */
- /* arg1 - %rsi is siginfo ptr */
- /* arg0 - %rdi is sig num */
- call *%r8 /* stk_builder(sig, sip, ucp, sp) */
-
- /* setup for jump to Linux signal hander */
- movq -8(%rbp), %rdi /* arg0 %rdi is sig num */
-
- /*
- * If we had a NULL siginfo pointer as input then we never converted
- * anything in the stack builder function and we need to pass along
- * a null siginfo pointer to the Linux handler.
- *
- * arg1 %rsi is ptr to converted siginfo on stack or NULL
- */
- movq -16(%rbp), %rsi
- cmp $0, %rsi
- je 1f
- movq %rsp, %rsi
- addq $SI, %rsi
-1:
- /*
- * arg2 %rdx is ptr to converted ucontext on stk (uc member of
- * lx_sigstack).
- */
- movq %rsp, %rdx
- addq $UC, %rdx
-
- movq -48(%rbp), %r9 /* fetch signal handler ptr */
- jmp *%r9 /* jmp to the Linux signal handler */
- SET_SIZE(lx_sigdeliver)
-
- /*
- * The libc routine that calls user signal handlers ends with a
- * setcontext, so we would never return here even if we used a call
- * rather than a jmp. However, we'll let the emulation unwind the stack
- * with a brand call that combines the setcontext with the management
- * of the syscall mode flag.
- *
- * Note that because libc_sigacthandler is an extern, it needs to be
- * dereferenced via the GOT.
- *
- * IMPORTANT: Because libc apparently gets upset if extra data is
- * left on its stack, this routine needs to be crafted
- * in assembly so that the jmp to the libc interposer
- * doesn't leave any cruft lying around.
- *
- * lx_sigacthandler(int sig, siginfo_t *s, void *p)
- */
- ENTRY_NP(lx_sigacthandler)
- movq libc_sigacthandler@GOTPCREL(%rip), %rax
- jmp *(%rax) /* jmp to libc's interposer */
- SET_SIZE(lx_sigacthandler)
-
- /*
* Trampoline code is called by the return at the end of a Linux
* signal handler to return control to the interrupted application
* via the lx_rt_sigreturn() syscall.
@@ -416,13 +57,16 @@ lx_sigreturn_tolibc(uintptr_t sp)
SET_SIZE(lx_rt_sigreturn_tramp)
/*
- * Manipulate the stack in the way necessary for it to appear to libc
- * that the signal handler it invoked via call_user_handler() is
- * returning.
+ * Before calling to a vsyscall address, the system call arguments
+ * are loaded into the usual registers by the emulated program. The
+ * brand SIGSEGV handler detects a jump to these addresses and modifies
+ * the interrupted context to restart at this trampoline with %rax set
+ * to the intended system call number. When the system call returns,
+ * we return to the address on the stack from the original call.
*/
- ENTRY_NP(lx_sigreturn_tolibc)
- movq %rdi, %rsp /* set %rsp to passed value */
- popq %rbp /* restore proper %rbp */
- ret /* return to lx_call_user_handler */
- SET_SIZE(lx_sigreturn_tolibc)
+ ENTRY_NP(lx_vsyscall_tramp)
+ syscall
+ ret
+ SET_SIZE(lx_vsyscall_tramp)
+
#endif /* lint */
diff --git a/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s b/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s
deleted file mode 100644
index 70cd75cf41..0000000000
--- a/usr/src/lib/brand/lx/lx_brand/amd64/lx_runexe.s
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
- */
-
-#include <sys/asm_linkage.h>
-
-#if defined(lint)
-
-/*ARGSUSED*/
-void
-lx_runexe(void *argv, void *entry)
-{
-}
-
-#else /* lint */
-
- /*
- * Set our stack pointer, clear the general registers,
- * and jump to the brand linker's entry point.
- */
- ENTRY_NP(lx_runexe)
- movq %rdi, %rax / %rax = &argv[0]
- movq %rsi, %rbx / Brand linker's entry point in %rbx
- subq $8, %rax / Top of stack - must point at argc
- movq %rax, %rsp / Set %rsp to what linkers expect
-
- movq $0, %rdx
-
- jmp *%rbx / And away we go...
-
- /* target will never return. */
- SET_SIZE(lx_runexe)
-#endif /* lint */
diff --git a/usr/src/lib/brand/lx/lx_brand/common/clone.c b/usr/src/lib/brand/lx/lx_brand/common/clone.c
index 87f966cc89..ee442ef280 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/clone.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/clone.c
@@ -49,8 +49,10 @@
#include <sys/lx_debug.h>
#include <sys/lx_thread.h>
#include <sys/fork.h>
+#include <sys/mman.h>
#include <lx_syscall.h>
+
#define SHARED_AS \
(LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND \
| LX_CLONE_THREAD)
@@ -60,9 +62,6 @@
#define IS_FORK(f) (((f) & SHARED_AS) == 0)
#define IS_VFORK(f) (((f) & CLONE_VFORK) == CLONE_VFORK)
-#define LX_EXIT 1
-#define LX_EXIT_GROUP 2
-
/*
* This is dicey. This seems to be an internal glibc structure, and not
* part of any external interface. Thus, it is subject to change without
@@ -92,19 +91,16 @@ struct clone_state {
void *c_ptidp;
struct lx_desc *c_ldtinfo; /* thread-specific segment */
void *c_ctidp;
-#if defined(_LP64)
- lx_regs_t c_regs; /* original register state */
-#else
- uintptr_t c_gs; /* Linux's %gs */
-#endif
+ ucontext_t c_uc; /* original register state */
sigset_t c_sigmask; /* signal mask */
lx_affmask_t c_affmask; /* CPU affinity mask */
volatile int *c_clone_res; /* pid/error returned to cloner */
int c_ptrace_event; /* ptrace(2) event for child stop */
+ void *c_ntv_stk; /* native stack for this thread */
+ size_t c_ntv_stk_sz; /* native stack size */
+ lx_tsd_t *c_lx_tsd; /* tsd area for thread */
};
-extern void lx_setup_clone(uintptr_t, void *, void *);
-
/*
* Counter incremented when we vfork(2) ourselves, and decremented when the
* vfork(2)ed child exit(2)s or exec(2)s.
@@ -114,7 +110,7 @@ static int is_vforked = 0;
long
lx_exit(uintptr_t p1)
{
- int ret, status = (int)p1;
+ int status = (int)p1;
lx_tsd_t *lx_tsd;
/*
@@ -126,33 +122,18 @@ lx_exit(uintptr_t p1)
_exit(status);
}
- if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
- lx_err_fatal("exit: unable to read thread-specific data: %s",
- strerror(ret));
-
- assert(lx_tsd != 0);
+ lx_tsd = lx_get_tsd();
lx_tsd->lxtsd_exit = LX_ET_EXIT;
lx_tsd->lxtsd_exit_status = status;
lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEEXIT, B_FALSE,
- (ulong_t)status);
-
- /*
- * Block all signals in the exit context to avoid taking any signals
- * (to the degree possible) while exiting.
- */
- (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
+ (ulong_t)status, NULL);
/*
* This thread is exiting. Restore the state of the thread to
* what it was before we started running linux code.
- * For 64-bit code, since we know we are unwinding the stack back to
- * lx_init, we need to unwind the syscall mode flag "stack" as well.
*/
-#if defined(_LP64)
- (void) syscall(SYS_brand, B_UNWIND_NTV_SYSC_FLAG);
-#endif
(void) setcontext(&lx_tsd->lxtsd_exit_context);
/*
@@ -167,7 +148,7 @@ lx_exit(uintptr_t p1)
long
lx_group_exit(uintptr_t p1)
{
- int ret, status = (int)p1;
+ int status = (int)p1;
lx_tsd_t *lx_tsd;
/*
@@ -179,36 +160,21 @@ lx_group_exit(uintptr_t p1)
_exit(status);
}
- if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
- lx_err_fatal("group_exit: unable to read thread-specific "
- "data: %s", strerror(ret));
-
- assert(lx_tsd != 0);
+ lx_tsd = lx_get_tsd();
lx_tsd->lxtsd_exit = LX_ET_EXIT_GROUP;
lx_tsd->lxtsd_exit_status = status;
/*
- * Block all signals in the exit context to avoid taking any signals
- * (to the degree possible) while exiting.
- */
- (void) sigfillset(&lx_tsd->lxtsd_exit_context.uc_sigmask);
-
- /*
* This thread is exiting. Restore the state of the thread to
* what it was before we started running linux code.
- * For 64-bit code, since we know we are unwinding the stack back to
- * lx_init, we need to unwind the syscall mode flag "stack" as well.
*/
-#if defined(_LP64)
- (void) syscall(SYS_brand, B_UNWIND_NTV_SYSC_FLAG);
-#endif
(void) setcontext(&lx_tsd->lxtsd_exit_context);
/*
* If we returned from the setcontext(2), something is very wrong.
*/
- lx_err_fatal("group_exits: unable to set exit context: %s",
+ lx_err_fatal("group_exit: unable to set exit context: %s",
strerror(errno));
/*NOTREACHED*/
@@ -220,7 +186,7 @@ clone_start(void *arg)
{
int rval;
struct clone_state *cs = (struct clone_state *)arg;
- lx_tsd_t lx_tsd;
+ lx_tsd_t *lxtsd;
/*
* Let the kernel finish setting up all the needed state for this
@@ -228,18 +194,14 @@ clone_start(void *arg)
*
* We already created the thread using the thr_create(3C) library
* call, so most of the work required to emulate lx_clone(2) has
- * been done by the time we get to this point. Instead of creating
- * a new brandsys(2) subcommand to perform the last few bits of
- * bookkeeping, we just use the lx_clone() slot in the syscall
- * table.
+ * been done by the time we get to this point.
*/
lx_debug("\tre-vectoring to lx kernel module to complete lx_clone()");
- lx_debug("\tLX_SYS_clone(0x%x, 0x%p, 0x%p, 0x%p, 0x%p)",
- cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
+ lx_debug("\tB_HELPER_CLONE(0x%x, 0x%p, 0x%p, 0x%p)",
+ cs->c_flags, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp);
- rval = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_clone,
- cs->c_flags, cs->c_stk, cs->c_ptidp, cs->c_ldtinfo, cs->c_ctidp,
- NULL);
+ rval = syscall(SYS_brand, B_HELPER_CLONE, cs->c_flags, cs->c_ptidp,
+ cs->c_ldtinfo, cs->c_ctidp);
/*
* At this point the parent is waiting for cs->c_clone_res to go
@@ -250,6 +212,8 @@ clone_start(void *arg)
if (rval < 0) {
*(cs->c_clone_res) = -errno;
lx_debug("\tkernel clone failed, errno %d\n", errno);
+ free(cs->c_lx_tsd);
+ free(cs);
return (NULL);
}
@@ -261,84 +225,72 @@ clone_start(void *arg)
strerror(errno));
}
- /* Initialize the thread specific data for this thread. */
- bzero(&lx_tsd, sizeof (lx_tsd));
-#if defined(_ILP32)
- lx_tsd.lxtsd_gs = cs->c_gs;
-#else
- lx_tsd.lxtsd_fsbase = (uintptr_t)cs->c_ldtinfo;
-#endif
-
/*
- * Use the address of the stack-allocated lx_tsd as the
- * per-thread storage area to cache various values for later
- * use.
- *
- * This address is only used by this thread, so there is no
- * danger of other threads using this storage area, nor of it
- * being accessed once this stack frame has been freed.
+ * Initialize the thread specific data for this thread.
*/
- if (thr_setspecific(lx_tsd_key, &lx_tsd) != 0) {
- *(cs->c_clone_res) = -errno;
- lx_err_fatal("Unable to set thread-specific ptr for clone: %s",
- strerror(rval));
- }
+ lxtsd = cs->c_lx_tsd;
+ lx_init_tsd(lxtsd);
+ lxtsd->lxtsd_clone_state = cs;
/*
- * Save the current context of this thread.
- *
- * We'll restore this context when this thread attempts to exit.
+ * Install the emulation stack for this thread. Register the
+ * thread-specific data structure with the stack list so that it may be
+ * freed at thread exit or fork(2).
*/
- if (getcontext(&lx_tsd.lxtsd_exit_context) != 0) {
+ lx_install_stack(cs->c_ntv_stk, cs->c_ntv_stk_sz, lxtsd);
+
+ if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
*(cs->c_clone_res) = -errno;
- lx_err_fatal("Unable to initialize thread-specific exit "
- "context: %s", strerror(errno));
+ lx_err_fatal("Unable to release held signals for child "
+ "thread: %s", strerror(errno));
}
/*
- * Do the final stack twiddling, reset %gs, and return to the
- * clone(2) path.
+ * Let the parent know that the clone has (effectively) been
+ * completed.
*/
- if (lx_tsd.lxtsd_exit == LX_ET_NONE) {
- if (sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL) < 0) {
- *(cs->c_clone_res) = -errno;
+ *(cs->c_clone_res) = rval;
- lx_err_fatal("Unable to release held signals for child "
- "thread: %s", strerror(errno));
- }
+ /*
+ * We want to load the general registers from this context, and
+ * switch to the BRAND stack.
+ */
+ cs->c_uc.uc_flags = UC_CPU;
+ cs->c_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND;
- /*
- * Let the parent know that the clone has (effectively) been
- * completed.
- */
- *(cs->c_clone_res) = rval;
+ /*
+ * New threads will not link into the existing context chain.
+ */
+ cs->c_uc.uc_link = NULL;
- /*
- * Fire the ptrace(2) event stop in the new thread:
- */
- lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0);
+ /*
+ * Set stack pointer and entry point for new thread:
+ */
+ LX_REG(&cs->c_uc, REG_SP) = (uintptr_t)cs->c_stk;
+ LX_REG(&cs->c_uc, REG_PC) = (uintptr_t)cs->c_retaddr;
-#if defined(_LP64)
- (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG);
- lx_setup_clone((uintptr_t)&cs->c_regs, cs->c_retaddr,
- cs->c_stk);
-#else
- lx_setup_clone(cs->c_gs, cs->c_retaddr, cs->c_stk);
-#endif
+ /*
+ * Return 0 to the child:
+ */
+ LX_REG(&cs->c_uc, REG_R0) = (uintptr_t)0;
- /* lx_setup_clone() should never return. */
- assert(0);
- }
+ /*
+ * Fire the ptrace(2) event stop in the new thread:
+ */
+ lx_ptrace_stop_if_option(cs->c_ptrace_event, B_TRUE, 0, &cs->c_uc);
/*
- * We are here because the Linux application called the exit() or
- * exit_group() system call. In turn the brand library did a
- * setcontext() to jump to the thread context state saved in
- * getcontext(), above.
+ * Jump to the Linux process. The system call must not return.
*/
- lx_exit_common(lx_tsd.lxtsd_exit, lx_tsd.lxtsd_exit_status);
+ if (syscall(SYS_brand, B_JUMP_TO_LINUX, &cs->c_uc) == -1) {
+ lx_err_fatal("B_JUMP_TO_LINUX failed: %s",
+ strerror(errno));
+ }
+ abort();
+
/*NOTREACHED*/
+ return (NULL);
}
/*
@@ -386,10 +338,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
int sig;
int rval;
int pid;
- lx_regs_t *rp;
- sigset_t sigmask;
+ ucontext_t *ucp;
+ sigset_t sigmask, osigmask;
int fork_flags = 0;
int ptrace_event;
+ int error = 0;
if (flags & LX_CLONE_SETTLS) {
lx_debug("lx_clone(flags=0x%x stk=0x%p ptidp=0x%p ldt=0x%p "
@@ -400,7 +353,8 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
}
/*
- * Only supported for pid 0 on Linux
+ * Only supported for pid 0 on Linux after version 2.3.21, and
+ * apparently not at all since 2.5.16.
*/
if (flags & LX_CLONE_PID)
return (-EINVAL);
@@ -422,7 +376,7 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
return (-EINVAL);
}
- rp = lx_syscall_regs();
+ ucp = lx_syscall_regs();
/* test if pointer passed by user are writable */
if (flags & LX_CLONE_PARENT_SETTID) {
@@ -446,7 +400,10 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
*/
lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE));
- /* See if this is a fork() operation or a thr_create(). */
+ /*
+ * Handle a fork(2) operation here. If this is not a fork, a new
+ * thread will be created after this block.
+ */
if (IS_FORK(flags) || IS_VFORK(flags)) {
if (flags & LX_CLONE_PARENT) {
lx_unsupported("clone(2) only supports CLONE_PARENT "
@@ -457,6 +414,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
if ((flags & LX_CSIGNAL) == 0)
fork_flags |= FORK_NOSIGCHLD;
+ /*
+ * Suspend signal delivery and perform the actual fork(2)
+ * operation.
+ */
+ _sigoff();
if (flags & LX_CLONE_VFORK) {
is_vforked++;
rval = vforkx(fork_flags);
@@ -469,12 +431,45 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
}
/*
- * Since we've already forked, we can't do much if uucopy
- * fails, so we just ignore failure. Failure is unlikely since
- * we've tested the memory before we did the fork.
+ * The parent process returns through the regular system call
+ * path here.
+ */
+ if (rval != 0) {
+ /*
+ * Since we've already forked, we can't do much if
+ * uucopy fails, so we just ignore failure. Failure is
+ * unlikely since we've tested the memory before we did
+ * the fork.
+ */
+ if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
+ (void) uucopy(&rval, ptidp, sizeof (int));
+ }
+
+ if (rval > 0) {
+ lx_ptrace_stop_if_option(ptrace_event, B_FALSE,
+ (ulong_t)rval, NULL);
+ }
+
+ /*
+ * Re-enable signal delivery in the parent process.
+ */
+ _sigon();
+
+ return ((rval < 0) ? -errno : rval);
+ }
+
+ /*
+ * The rest of this block runs only within the new child
+ * process.
*/
- if (rval > 0 && (flags & LX_CLONE_PARENT_SETTID)) {
- (void) uucopy(&rval, ptidp, sizeof (int));
+
+ if (!IS_VFORK(flags)) {
+ /*
+ * We must free the stacks and thread-specific data
+ * objects for every thread except the one duplicated
+ * from the parent by forkx().
+ */
+ lx_free_other_stacks();
}
if (rval == 0 && (flags & LX_CLONE_CHILD_SETTID)) {
@@ -484,50 +479,95 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
* forked, so on failure, we just don't copy the
* memory.
*/
- pid = lx_getpid();
+ pid = syscall(SYS_brand, B_GETPID);
if (pid >= 0)
(void) uucopy(&pid, ctidp, sizeof (int));
}
- /* Parent just returns */
- if (rval != 0) {
- if (rval > 0)
- lx_ptrace_stop_if_option(ptrace_event, B_FALSE,
- (ulong_t)rval);
- return ((rval < 0) ? -errno : rval);
- }
-
/*
* Set up additional data in the lx_proc_data structure as
* necessary.
*/
- rval = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_clone,
- flags, cldstk, ptidp, ldtinfo, ctidp, NULL);
- if (rval < 0) {
+ if ((rval = syscall(SYS_brand, B_HELPER_CLONE, flags, ptidp,
+ ldtinfo, ctidp)) < 0) {
return (rval);
}
- /*
- * lx_setup_clone() doesn't return below, so stop now, if
- * necessary.
- */
- lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0);
+ if (IS_VFORK(flags)) {
+ ucontext_t vforkuc;
+
+ /*
+ * The vfork(2) interface is somewhat less than ideal.
+ * The unfortunate notion of borrowing the address
+ * space of the parent process requires us to jump
+ * through several hoops to prevent corrupting parent
+ * emulation state.
+ *
+ * When returning in the child, we make a copy of the
+ * system call return context and discard three pages
+ * of the native stack. Returning normally would
+ * clobber the native stack frame in which the brand
+ * library in the parent process is presently waiting.
+ *
+ * The calling program is expected to correctly use
+ * this dusty, underspecified relic. Neglecting to
+ * immediately call execve(2) or exit(2) is not
+ * cricket; this stack space will be permanently lost,
+ * not to mention myriad other undefined behaviour.
+ */
+ bcopy(ucp, &vforkuc, sizeof (vforkuc));
+ vforkuc.uc_brand_data[1] -= LX_NATIVE_STACK_VFORK_GAP;
+ vforkuc.uc_link = NULL;
+
+ lx_debug("\tvfork native stack sp %p",
+ vforkuc.uc_brand_data[1]);
+
+ /*
+ * If provided, the child needs its new stack set up.
+ */
+ if (cldstk != 0) {
+ lx_debug("\tvfork cldstk %p", cldstk);
+ LX_REG(&vforkuc, REG_SP) = (uintptr_t)cldstk;
+ }
+
+ /*
+ * Stop for ptrace if required.
+ */
+ lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
+
+ /*
+ * Return to the child via the specially constructed
+ * vfork(2) context.
+ */
+ LX_EMULATE_RETURN(&vforkuc, LX_SYS_clone, 0, 0);
+ (void) syscall(SYS_brand, B_EMULATION_DONE, &vforkuc,
+ LX_SYS_clone, 0, 0);
+
+ assert(0);
+ }
/*
* If provided, the child needs its new stack set up.
*/
- if (cldstk) {
-#if defined(_LP64)
- (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG);
- lx_setup_clone((uintptr_t)rp, (void *)rp->lxr_rip,
- cldstk);
-#else
- lx_setup_clone(rp->lxr_gs, (void *)rp->lxr_eip, cldstk);
-#endif
- /* lx_setup_clone() should never return. */
- assert(0);
+ if (cldstk != 0) {
+ lx_debug("\tcldstk %p", cldstk);
+ LX_REG(ucp, REG_SP) = (uintptr_t)cldstk;
}
+ /*
+ * Stop for ptrace if required.
+ */
+ lx_ptrace_stop_if_option(ptrace_event, B_TRUE, 0, NULL);
+
+ /*
+ * Re-enable signal delivery in the child process.
+ */
+ _sigon();
+
+ /*
+ * The child process returns via the regular emulated system
+ * call path:
+ */
return (0);
}
@@ -557,13 +597,13 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
}
/*
- * To avoid malloc() here, we steal a part of the new thread's
- * stack to store all the info that thread might need for
- * initialization. We also make it 64-bit aligned for good
- * measure.
+ * Initialise the state structure we pass as an argument to the new
+ * thread:
*/
- cs = (struct clone_state *)
- ((p2 - sizeof (struct clone_state)) & -((uintptr_t)8));
+ if ((cs = malloc(sizeof (*cs))) == NULL) {
+ lx_debug("could not allocate clone_state: %s", strerror(errno));
+ return (-ENOMEM);
+ }
cs->c_flags = flags;
cs->c_sig = sig;
cs->c_stk = cldstk;
@@ -572,43 +612,27 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
cs->c_ctidp = ctidp;
cs->c_clone_res = &clone_res;
cs->c_ptrace_event = ptrace_event;
-#if defined(_LP64)
/*
- * The AMD64 ABI says that the kernel clobbers %rcx and %r11. We
- * return a value in %rax. The new %rsp and %rip will be setup in
- * lx_setup_clone. Thus, we don't worry about passing/restoring those
- * registers.
+ * We want the new thread to return directly to the call site for
+ * the system call.
*/
- cs->c_regs.lxr_rdi = rp->lxr_rdi;
- cs->c_regs.lxr_rsi = rp->lxr_rsi;
- cs->c_regs.lxr_rbx = rp->lxr_rbx;
- cs->c_regs.lxr_rdx = rp->lxr_rdx;
- cs->c_regs.lxr_rdi = rp->lxr_rdi;
- cs->c_regs.lxr_r8 = rp->lxr_r8;
- cs->c_regs.lxr_r9 = rp->lxr_r9;
- cs->c_regs.lxr_r10 = rp->lxr_r10;
- cs->c_regs.lxr_r12 = rp->lxr_r12;
- cs->c_regs.lxr_r13 = rp->lxr_r13;
- cs->c_regs.lxr_r14 = rp->lxr_r14;
- cs->c_regs.lxr_r15 = rp->lxr_r15;
-#else
- cs->c_gs = rp->lxr_gs;
-#endif
+ cs->c_retaddr = (void *)LX_REG(ucp, REG_PC);
+ /*
+ * Copy the saved context for the clone(2) system call so that the
+ * new thread may use it to initialise registers.
+ */
+ bcopy(ucp, &cs->c_uc, sizeof (cs->c_uc));
+ if ((cs->c_lx_tsd = malloc(sizeof (*cs->c_lx_tsd))) == NULL) {
+ free(cs);
+ return (-ENOMEM);
+ }
if (lx_sched_getaffinity(0, sizeof (cs->c_affmask),
- (uintptr_t)&cs->c_affmask) == -1)
+ (uintptr_t)&cs->c_affmask) == -1) {
lx_err_fatal("Unable to get affinity mask for parent "
"thread: %s", strerror(errno));
+ }
- /*
- * We want the new thread to return directly to the return site for
- * the system call.
- */
-#if defined(_LP64)
- cs->c_retaddr = (void *)rp->lxr_rip;
-#else
- cs->c_retaddr = (void *)rp->lxr_eip;
-#endif
clone_res = 0;
(void) sigfillset(&sigmask);
@@ -617,17 +641,40 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
* Block all signals because the thread we create won't be able to
* properly handle them until it's fully set up.
*/
- if (sigprocmask(SIG_BLOCK, &sigmask, &cs->c_sigmask) < 0) {
+ if (sigprocmask(SIG_BLOCK, &sigmask, &osigmask) < 0) {
lx_debug("lx_clone sigprocmask() failed: %s", strerror(errno));
+ free(cs->c_lx_tsd);
+ free(cs);
return (-errno);
}
+ cs->c_sigmask = osigmask;
+
+ /*
+ * Allocate the native stack for this new thread now, so that we
+ * can return failure gracefully as ENOMEM. All signals are blocked
+ * at this point, so the caller's original mask must be restored
+ * before we bail out.
+ */
+ if (lx_alloc_stack(&cs->c_ntv_stk, &cs->c_ntv_stk_sz) != 0) {
+ (void) sigprocmask(SIG_SETMASK, &osigmask, NULL);
+ free(cs->c_lx_tsd);
+ free(cs);
+ return (-ENOMEM);
+ }
rval = thr_create(NULL, NULL, clone_start, cs, THR_DETACHED, &tid);
/*
+ * thr_create(3C) returns 0 on success and an error number on
+ * failure; it does not return -1 or set errno. If the thread did
+ * not start, record that error number and free the resources we
+ * allocated. "cs" must not be touched after this point on failure,
+ * which is why the saved mask is restored from "osigmask" below.
+ */
+ if (rval != 0) {
+ error = rval;
+ (void) munmap(cs->c_ntv_stk, cs->c_ntv_stk_sz);
+ free(cs->c_lx_tsd);
+ free(cs);
+ }
+
+ /*
* Release any pending signals
*/
- (void) sigprocmask(SIG_SETMASK, &cs->c_sigmask, NULL);
+ (void) sigprocmask(SIG_SETMASK, &osigmask, NULL);
/*
* Wait for the child to be created and have its tid assigned.
@@ -637,8 +684,14 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
;
rval = clone_res;
- lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval);
- }
+ lx_ptrace_stop_if_option(ptrace_event, B_FALSE, (ulong_t)rval,
+ NULL);
- return (rval);
+ return (rval);
+ } else {
+ /*
+ * Return the error from thr_create(3C).
+ */
+ return (-error);
+ }
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/file.c b/usr/src/lib/brand/lx/lx_brand/common/file.c
index 56201035ff..1f2c4032f5 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/file.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/file.c
@@ -120,34 +120,6 @@ ltos_at_flag(int lflag, int allow, boolean_t enforce)
*/
/*
- * Linux creates half-duplex pipes and Illumos creates full-duplex pipes.
- * Thus, to get the correct semantics, we need to setup pipes in the kernel's
- * lx brand module.
- */
-
-long
-lx_pipe2(uintptr_t p1, uintptr_t p2)
-{
- int flags = 0;
- int r;
-
- if (p2 & LX_O_NONBLOCK) {
- flags |= O_NONBLOCK;
- p2 &= ~LX_O_NONBLOCK;
- }
- if (p2 & LX_O_CLOEXEC) {
- flags |= O_CLOEXEC;
- p2 &= ~LX_O_CLOEXEC;
- }
- if (p2 != 0)
- return (-EINVAL);
-
- r = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_pipe2, p1, flags);
-
- return ((r == -1) ? -errno : r);
-}
-
-/*
* On Linux, even root cannot create a link to a directory, so we have to
* add an explicit check.
*/
diff --git a/usr/src/lib/brand/lx/lx_brand/common/fork.c b/usr/src/lib/brand/lx/lx_brand/common/fork.c
index b0edee1adb..b382dd9410 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/fork.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/fork.c
@@ -37,77 +37,75 @@
* initialization or else bad things will happen (i.e. ending up with a bad
* schedctl page). On Linux, there is no such thing as forkall(), so we use
* fork1() here.
+ *
+ * For vfork(), we have a serious problem because the child is not allowed to
+ * return from the current frame because it will corrupt the parent's stack.
+ * Since the semantics of vfork() are rather ill-defined (other than "it's
+ * faster than fork"), we should theoretically be safe by falling back to
+ * fork1().
*/
-long
-lx_fork(void)
+static long
+lx_fork_common(boolean_t is_vfork)
{
int ret;
+ int ptopt = is_vfork ? LX_PTRACE_O_TRACEVFORK : LX_PTRACE_O_TRACEFORK;
/*
* Inform the in-kernel ptrace(2) subsystem that we are about to
* emulate fork(2).
*/
- lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE);
+ lx_ptrace_clone_begin(ptopt, B_FALSE);
+ /*
+ * Suspend signal delivery and perform the fork operation.
+ */
+ _sigoff();
switch (ret = fork1()) {
case -1:
+ _sigon();
return (-errno);
case 0:
/*
- * Returning in the new child.
+ * Returning in the new child. We must free the stacks and
+ * thread-specific data objects for the threads we did not
+ * duplicate; i.e. every other thread.
*/
- if (lx_is_rpm) {
+ lx_free_other_stacks();
+
+ if (!is_vfork && lx_is_rpm) {
(void) sleep(lx_rpm_delay);
}
- lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEFORK, B_TRUE, 0);
+
+ lx_ptrace_stop_if_option(ptopt, B_TRUE, 0, NULL);
+
+ /*
+ * Re-enable signal delivery in the child and return to the
+ * new process.
+ */
+ _sigon();
return (0);
default:
+ lx_ptrace_stop_if_option(ptopt, B_FALSE, (ulong_t)ret, NULL);
+
/*
- * Returning in the new parent.
+ * Re-enable signal delivery in the parent and return from
+ * the emulated system call.
*/
- lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEFORK, B_FALSE,
- (ulong_t)ret);
+ _sigon();
return (ret);
}
}
-/*
- * For vfork(), we have a serious problem because the child is not allowed to
- * return from the current frame because it will corrupt the parent's stack.
- * Since the semantics of vfork() are rather ill-defined (other than "it's
- * faster than fork"), we should theoretically be safe by falling back to
- * fork1().
- */
long
-lx_vfork(void)
+lx_fork(void)
{
- int ret;
-
- /*
- * Inform the in-kernel ptrace(2) subsystem that we are about to
- * emulate vfork(2).
- */
- lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE);
-
- switch (ret = fork1()) {
- case -1:
- return (-errno);
-
- case 0:
- /*
- * Returning in the new child.
- */
- lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEVFORK, B_TRUE, 0);
- return (0);
+ return (lx_fork_common(B_FALSE));
+}
- default:
- /*
- * Returning in the new parent.
- */
- lx_ptrace_stop_if_option(LX_PTRACE_O_TRACEVFORK, B_FALSE,
- (ulong_t)ret);
- return (ret);
- }
+long
+lx_vfork(void)
+{
+ return (lx_fork_common(B_TRUE));
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
index 655374b6f6..661fae3402 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
@@ -42,6 +42,7 @@
#include <zone.h>
#include <sys/brand.h>
#include <sys/epoll.h>
+#include <sys/stack.h>
#include <assert.h>
#include <stdio.h>
@@ -76,26 +77,8 @@
#include <sys/lx_aio.h>
/*
- * General emulation guidelines.
- *
- * Once the emulation handler has been installed onto the process, we need to
- * be concerned about system calls made by the emulation, as well as any
- * library calls which in turn make system calls. This is actually only an
- * issue for the 64-bit case, since the kernel sycall entry point is common for
- * both Illumos and Linux. The trampoline code in the kernel needs some way to
- * distinguish when it should bounce out for emulation (Linux system call) vs.
- * stay in the kernel (emulation system call). For the 32-bit case Linux uses
- * int80 for system calls which is orthogonal to all of the Illumos system call
- * entry points and thus there is no issue.
- *
- * To cope with this for the 64-bit case, we maintain a mode flag on each
- * LWP so we can tell when a system call comes from Linux. We then set the mode
- * flag to Illumos so that all future system calls from the emulation are
- * handled correctly. The emulation must reset the mode when it is ready to
- * return control to Linux. This is done via the B_CLR_NTV_SYSC_FLAG brand
- * call. There is additional complexity with this mode switching in the
- * case of a user-defined signal handler. This is described in the signal
- * emulation code comments.
+ * There is a block comment in "uts/common/brand/lx/os/lx_brand.c" that
+ * describes the functioning of the LX brand in some detail.
*
* *** Setting errno
*
@@ -103,61 +86,12 @@
* application whose address space we're running in. The Linux libc errno is
* independent of our native libc errno. To pass back an error the emulation
* function should return -errno back to the Linux caller.
- *
- * *** General considerations
- *
- * The lx brand interposes on _all_ system calls. Linux system calls that need
- * special handling in the kernel are redirected back to the kernel via the
- * in-kernel emulation (IKE) mechanism which uses a range of the brand system
- * call command number to determine which in-kernel lx function to invoke.
- *
- * *** DTrace
- *
- * The lx-syscall DTrace provider (see lx_systrace_attach in
- * uts/common/brand/lx/dtrace/lx_systrace.c) works as follows:
- *
- * When probes are enabled:
- * lx_systrace_enable -> lx_brand_systrace_enable
- *
- * This enables the trace jump table in the kernel (see
- * uts/intel/brand/lx/lx_brand_asm.s which has the functions
- * lx_brand_int80_enable and lx_brand_syscall_enable, and the corresponding
- * patch points lx_brand_int80_patch_point and lx_brand_syscall_patch_point).
- *
- * The library code defines lx_handler_table and lx_handler_trace_table
- * in the i386 and amd64 lx_handler.s code.
- *
- * The trace jump table enables lx_traceflag which is used in the lx_emulate
- * function to make the B_SYSENTRY/B_SYSRETURN brandsys syscalls. These in turn
- * will call lx_systrace_entry_ptr/lx_systrace_return_ptr so that we can DTrace
- * the Linux syscalls via the provider.
- *
- * When probes are disbaled, we undo the patch points via:
- * lx_systrace_disable -> lx_brand_systrace_disable
*/
-
/*
* Map Illumos errno to the Linux equivalent.
*/
-static int stol_errno[] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
- 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 30, 31, 32, 33, 34, 42, 43, 44, 45, 46,
- 47, 48, 49, 50, 51, 35, 47, 22, 38, 22, /* 49 */
- 52, 53, 54, 55, 56, 57, 58, 59, 22, 22,
- 61, 61, 62, 63, 64, 65, 66, 67, 68, 69,
- 70, 71, 22, 22, 72, 22, 22, 74, 36, 75,
- 76, 77, 78, 79, 80, 81, 82, 83, 84, 38,
- 40, 85, 86, 39, 87, 88, 89, 90, 91, 92, /* 99 */
- 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
- 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
- 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
- 103, 104, 105, 106, 107, 22, 22, 22, 22, 22,
- 22, 22, 22, 108, 109, 110, 111, 112, 113, 114, /* 149 */
- 115, 116
-};
+static int stol_errno[] = LX_STOL_ERRNO_INIT;
char lx_release[LX_VERS_MAX];
char lx_cmd_name[MAXNAMLEN];
@@ -172,9 +106,6 @@ struct lx_locale_ending {
int se_size; /* solaris ending string length */
};
-__thread int lx_do_syscall_restart;
-__thread int lx_had_sigchild;
-
#define l2s_locale(lname, sname) \
{(lname), (sname), sizeof ((lname)) - 1, sizeof ((sname)) - 1}
@@ -184,45 +115,6 @@ __thread int lx_had_sigchild;
#endif
/*
- * This flag is part of the registration with the in-kernel brand module. It's
- * used in lx_handler() to determine if we should go back into the kernel after
- * a system call in case the kernel needs to perform some post-syscall work
- * like tracing for example.
- */
-int lx_traceflag;
-
-#define LX_SYS_NOSYS_REASON 0x07
-#define LX_SYS_EBPARG6 0x08
-#define LX_SYS_IKE 0x10
-
-#define LX_IKE(sysnum) ((long(*)(void))LX_EMUL_##sysnum)
-
-/*
- * Flags that denote the specific reason that we don't have a particular
- * system call. These reasons are only valid if the function is NULL.
- */
-#define NOSYS_NULL 0
-#define NOSYS_NONE 1
-#define NOSYS_NO_EQUIV 2
-#define NOSYS_KERNEL 3
-#define NOSYS_UNDOC 4
-#define NOSYS_OBSOLETE 5
-#define NOSYS_MAX 5
-
-#if NOSYS_MAX > LX_SYS_NOSYS_REASON
-#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON
-#endif
-
-static char *nosys_reasons[] = {
- "Not done yet",
- "No such Linux system call",
- "No equivalent Solaris functionality",
- "Reads/modifies Linux kernel state",
- "Undocumented and/or rarely used system call",
- "Unsupported, obsolete system call"
-};
-
-/*
* Most syscalls return an int but some return something else, typically a
* ssize_t. This can be either an int or a long, depending on if we're compiled
* for 32-bit or 64-bit. To correctly propagate the -errno return code in the
@@ -231,14 +123,9 @@ static char *nosys_reasons[] = {
* Linux, we will have the right size value in both the 32 and 64 bit cases.
*/
-struct lx_sysent {
- char *sy_name;
- long (*sy_callc)();
- char sy_flags;
- char sy_narg;
-};
+typedef long (*lx_syscall_handler_t)();
-static struct lx_sysent sysents[LX_NSYSCALLS + 1];
+static lx_syscall_handler_t lx_handlers[LX_NSYSCALLS + 1];
static uintptr_t stack_bottom;
@@ -254,7 +141,6 @@ int lx_verbose = 0; /* verbose mode enabled if non-zero */
int lx_debug_enabled = 0; /* debugging output enabled if non-zero */
pid_t zoneinit_pid; /* zone init PID */
-long max_pid; /* native maximum PID */
thread_key_t lx_tsd_key;
@@ -458,271 +344,82 @@ lx_unsupported(char *msg, ...)
(void) kill(getpid(), SIGSYS);
}
-extern void lx_runexe(void *argv, void *entry);
int lx_init(int argc, char *argv[], char *envp[]);
-static int
-lx_emulate_args(lx_regs_t *rp, struct lx_sysent *s, uintptr_t *args)
+lx_tsd_t *
+lx_get_tsd(void)
{
-#if defined(_LP64)
- /*
- * Note: Syscall argument passing is different from function call
- * argument passing on amd64. For function calls, the fourth arg is
- * passed via %rcx, but for system calls the 4th arg is passed via %r10.
- * This is because in amd64, the syscall instruction puts the lower
- * 32 bits of %rflags in %r11 and puts the %rip value to %rcx.
- *
- * Appendix A of the amd64 ABI (Linux conventions) states that syscalls
- * are limited to 6 args and no arg is passed on the stack.
- */
- args[0] = rp->lxr_rdi;
- args[1] = rp->lxr_rsi;
- args[2] = rp->lxr_rdx;
- args[3] = rp->lxr_r10;
- args[4] = rp->lxr_r8;
- args[5] = rp->lxr_r9;
-#else
- /*
- * If the system call takes 6 args, then libc has stashed them in
- * memory at the address contained in %ebx. Except for some syscalls
- * which store the 6th argument in %ebp.
- */
- if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) {
- if (uucopy((void *)rp->lxr_ebx, args,
- sizeof (args[0]) * 6) != 0)
- return (-stol_errno[errno]);
- } else {
- args[0] = rp->lxr_ebx;
- args[1] = rp->lxr_ecx;
- args[2] = rp->lxr_edx;
- args[3] = rp->lxr_esi;
- args[4] = rp->lxr_edi;
- args[5] = rp->lxr_ebp;
+ int ret;
+ lx_tsd_t *lx_tsd;
+
+ if ((ret = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0) {
+ lx_err_fatal("lx_get_tsd: unable to read "
+ "thread-specific data: %s", strerror(ret));
}
-#endif
- return (0);
+ assert(lx_tsd != 0);
+
+ return (lx_tsd);
}
+/*
+ * This function is called from the kernel like a signal handler. Each
+ * function call is a request to provide emulation for a system call that, on
+ * illumos, is implemented in userland. The system call number selection and
+ * argument parsing have already been done by the kernel.
+ */
void
-lx_emulate(lx_regs_t *rp)
+lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args)
{
- struct lx_sysent *s;
- uintptr_t args[6];
-#if defined(_ILP32)
- uintptr_t gs = rp->lxr_gs & 0xffff; /* %gs is only 16 bits */
-#endif
- int syscall_num;
- long ret;
+ long emu_ret;
+ int emu_errno = 0;
-#if defined(_LP64)
- syscall_num = rp->lxr_rax;
-#else
- syscall_num = rp->lxr_eax;
-#endif
+ LX_EMULATE_ENTER(ucp, syscall_num, args);
+ lx_debug("lx_emulate(%p, %d, [%p, %p, %p, %p, %p, %p])\n",
+ ucp, syscall_num, args[0], args[1], args[2], args[3], args[4],
+ args[5]);
/*
- * lx_brand_int80_callback() or lx_brand_syscall_callback() ensures
- * that the syscall_num is sane; Use it as is.
+ * The kernel should have saved us a context that will not restore the
+ * previous signal mask. Some emulated system calls alter the signal
+ * mask; restoring it after the emulation would cancel that out.
*/
- assert(syscall_num >= 0);
- assert(syscall_num < (sizeof (sysents) / sizeof (sysents[0])));
- s = &sysents[syscall_num];
-
- if ((ret = lx_emulate_args(rp, s, args)) != 0)
- goto out;
+ assert(!(ucp->uc_flags & UC_SIGMASK));
/*
- * If the tracing flag is enabled we call into the brand-specific
- * kernel module to handle the tracing activity (DTrace or ptrace).
- * It would be tempting to perform DTrace activity in the brand
- * module's syscall trap callback, rather than having to return
- * to the kernel here, but -- since argument encoding can vary
- * according to the specific system call -- that would require
- * replicating the knowledge of argument decoding in the kernel
- * module as well as here in the brand library.
+ * The kernel ensures that the syscall_num is sane; Use it as is.
*/
- if (lx_traceflag != 0) {
- /*
- * Part of the ptrace "interface" is that on syscall entry
- * %rax / %eax should be reported as -ENOSYS while the
- * orig_rax / orig_eax field of the user structure needs to
- * contain the actual system call number. If we end up stopping
- * here, the controlling process will dig the lx_regs_t
- * structure out of our stack.
- */
-#if defined(_LP64)
- rp->lxr_orig_rax = syscall_num;
- rp->lxr_rax = -stol_errno[ENOSYS];
-#else
- rp->lxr_orig_eax = syscall_num;
- rp->lxr_eax = -stol_errno[ENOSYS];
-#endif
-
- (void) syscall(SYS_brand, B_SYSENTRY, syscall_num, args);
-
- /*
- * The external tracer may have modified the arguments to this
- * system call. Refresh the argument cache to account for this.
- */
- if ((ret = lx_emulate_args(rp, s, args)) != 0)
- goto out;
- }
-
- if (s->sy_callc == NULL) {
- int reason = s->sy_flags & LX_SYS_NOSYS_REASON;
- lx_unsupported("unimplemented syscall #%d (%s): %s\n",
- syscall_num, s->sy_name, nosys_reasons[reason]);
- ret = -stol_errno[ENOTSUP];
- goto out;
- }
-
- if (LX_DEBUG_ISENABLED) {
- const char *fmt = NULL;
-
- switch (s->sy_narg) {
- case 0:
- fmt = "calling %s()";
- break;
- case 1:
- fmt = "calling %s(0x%p)";
- break;
- case 2:
- fmt = "calling %s(0x%p, 0x%p)";
- break;
- case 3:
- fmt = "calling %s(0x%p, 0x%p, 0x%p)";
- break;
- case 4:
- fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p)";
- break;
- case 5:
- fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p, 0x%p)";
- break;
- case 6:
- fmt = "calling %s(0x%p, 0x%p, 0x%p, 0x%p, 0x%p, 0x%p)";
- break;
- }
-
- lx_debug(fmt, s->sy_name, args[0], args[1], args[2], args[3],
- args[4], args[5]);
+ assert(syscall_num >= 0);
+ assert(syscall_num < (sizeof (lx_handlers) / sizeof (lx_handlers[0])));
+ if (lx_handlers[syscall_num] == NULL) {
+ lx_err_fatal("lx_emulate: kernel sent us a call we cannot "
+ "emulate (%d)", syscall_num);
}
/*
- * On 64-bit code, the %gs will be 0 in both native and Linux code.
+ * Call our handler function:
*/
-#if defined(_ILP32)
- if (gs != LWPGS_SEL) {
- lx_tsd_t *lx_tsd;
-
- /*
- * While a %gs of 0 is technically legal (as long as the
- * application never dereferences memory using %gs), Solaris
- * has its own ideas as to how a zero %gs should be handled in
- * _update_sregs(), such that any 32-bit user process with a
- * %gs of zero running on a system with a 64-bit kernel will
- * have its %gs hidden base register stomped on on return from
- * a system call, leaving an incorrect base address in place
- * until the next time %gs is actually reloaded (forcing a
- * reload of the base address from the appropriate descriptor
- * table.)
- *
- * Of course the kernel will once again stomp on THAT base
- * address when returning from a system call, resulting in an
- * an application segmentation fault.
- *
- * To avoid this situation, disallow a save of a zero %gs
- * here in order to try and capture any Linux process that
- * attempts to make a syscall with a zero %gs installed.
- */
- assert(gs != 0);
-
- if ((ret = thr_getspecific(lx_tsd_key,
- (void **)&lx_tsd)) != 0)
- lx_err_fatal("lx_emulate: unable to read "
- "thread-specific data: %s", strerror(ret));
-
- assert(lx_tsd != 0);
-
- lx_tsd->lxtsd_gs = gs;
-
- lx_debug("lx_emulate(): gsp 0x%p, saved gs: 0x%x", lx_tsd, gs);
- }
-#endif /* _ILP32 */
-
-restart_syscall:
- if (s->sy_flags & LX_SYS_IKE) {
- lx_debug("\tsyscall %d re-vectoring to lx kernel module "
- "for %s()", syscall_num, s->sy_name);
-
- if ((ret = syscall(SYS_brand, B_IKE_SYSCALL,
- (uintptr_t)s->sy_callc, args)) == -1)
- ret = -errno;
- } else {
- ret = s->sy_callc(args[0], args[1], args[2],
- args[3], args[4], args[5]);
- }
-
- if (ret > -65536 && ret < 65536)
- lx_debug("\t= %d", ret);
- else
- lx_debug("\t= 0x%x", ret);
+ emu_ret = lx_handlers[syscall_num](args[0], args[1], args[2], args[3],
+ args[4], args[5]);
/*
- * If the return value is between -1 and -4095 then it's an errno, so
- * we translate the Illumos error number into the Linux equivalent.
+ * If the return value is between -1 and -4095 then it's an errno.
+ * The kernel will translate it to the Linux equivalent for us.
*/
- if (ret < 0 && ret > -4096) {
- if (-ret >= sizeof (stol_errno) / sizeof (stol_errno[0])) {
- lx_debug("Invalid return value from emulated "
- "syscall %d (%s): %d\n",
- syscall_num, s->sy_name, ret);
- assert(0);
- }
-
- ret = -stol_errno[-ret];
+ if (emu_ret < 0 && emu_ret > -4096) {
+ emu_errno = (int)-emu_ret;
}
- if (lx_do_syscall_restart && ret == -stol_errno[EINTR]) {
- lx_debug("restarting system call due to signal interruption");
- lx_do_syscall_restart = 0;
- goto restart_syscall;
- }
-
-out:
/*
- * For 32-bit, %eax holds the return code from the system call. For
- * 64-bit, %rax holds the return code.
+ * Return to the context we were passed
*/
-#if defined(_LP64)
- rp->lxr_rax = ret;
-#else
- rp->lxr_eax = ret;
-#endif
+ LX_EMULATE_RETURN(ucp, syscall_num, emu_ret, emu_errno);
+ lx_debug("\tlx_emulate(%d) done (ret %ld / 0x%p ; errno %d)",
+ syscall_num, emu_ret, emu_ret, emu_errno);
+ (void) syscall(SYS_brand, B_EMULATION_DONE, ucp, syscall_num, emu_ret,
+ emu_errno);
- /*
- * If the trace flag is set, bounce into the kernel to let it do
- * any necessary tracing (DTrace or ptrace).
- */
- if (lx_traceflag != 0) {
-#if defined(_LP64)
- rp->lxr_orig_rax = syscall_num;
-#else
- rp->lxr_orig_eax = syscall_num;
-#endif
- (void) syscall(SYS_brand, B_SYSRETURN, syscall_num, ret);
- }
-
-#if defined(_LP64)
- /*
- * For 64-bit code this must be the last thing we do in the emulation
- * code path before we return back to the Linux program. This will
- * disable native syscalls so the next time a syscall happens on this
- * thread, it will come back into the emulation. We can omit the extra
- * syscall overhead in the 32-bit case.
- */
- (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG);
-#endif
+ assert(!"cannot be returned here");
}
static void
@@ -771,6 +468,106 @@ map_vdso()
}
#endif
+/*
+ * Initialize the thread specific data for this thread.
+ *
+ * Zeroes "lxtsd", marks the Linux alternate signal stack as disabled, builds
+ * the per-thread exit context (which resumes in lx_exit_common() with all
+ * signals blocked) and registers "lxtsd" under lx_tsd_key. Any failure is
+ * reported through lx_err_fatal().
+ */
+void
+lx_init_tsd(lx_tsd_t *lxtsd)
+{
+ int err;
+
+ bzero(lxtsd, sizeof (*lxtsd));
+ lxtsd->lxtsd_exit = LX_ET_NONE;
+
+ /*
+ * The Linux alternate signal stack is initially disabled:
+ */
+ lxtsd->lxtsd_sigaltstack.ss_flags = LX_SS_DISABLE;
+
+ /*
+ * Create a per-thread exit context from the current register and
+ * native/brand stack state. Replace the saved program counter value
+ * with the address of lx_exit_common(); we wish to revector there when
+ * the thread or process is exiting.
+ */
+ if (getcontext(&lxtsd->lxtsd_exit_context) != 0) {
+ lx_err_fatal("Unable to initialize thread-specific exit "
+ "context: %s", strerror(errno));
+ }
+ LX_REG(&lxtsd->lxtsd_exit_context, REG_PC) = (uintptr_t)lx_exit_common;
+
+ /*
+ * Align the stack pointer and clear the frame pointer.
+ */
+ LX_REG(&lxtsd->lxtsd_exit_context, REG_FP) = 0;
+ LX_REG(&lxtsd->lxtsd_exit_context, REG_SP) &= ~(STACK_ALIGN - 1UL);
+#if defined(_LP64)
+ /*
+ * The adjustment below is only correct when _both_ constants have the
+ * expected values, so fail the build if either one differs.
+ */
+#if (STACK_ENTRY_ALIGN != 8) || (STACK_ALIGN != 16)
+#error "lx_init_tsd: unexpected STACK_[ENTRY_]ALIGN values"
+#endif
+ /*
+ * The AMD64 ABI requires that, on entry to a function, the stack
+ * pointer must be 8-byte aligned, but _not_ 16-byte aligned. When
+ * the frame pointer is pushed, the alignment will then be correct.
+ */
+ LX_REG(&lxtsd->lxtsd_exit_context, REG_SP) -= STACK_ENTRY_ALIGN;
+#endif
+
+ /*
+ * Block all signals in the exit context to avoid taking any signals
+ * (to the degree possible) while exiting.
+ */
+ (void) sigfillset(&lxtsd->lxtsd_exit_context.uc_sigmask);
+
+ if ((err = thr_setspecific(lx_tsd_key, lxtsd)) != 0) {
+ lx_err_fatal("Unable to initialize thread-specific data: %s",
+ strerror(err));
+ }
+}
+
+/*
+ * Transfer control to the Linux program: build a context whose stack pointer
+ * is "sp" and whose program counter is "entry", then hand it to the kernel
+ * via B_JUMP_TO_LINUX. This function does not return; it aborts if the
+ * brand system call comes back.
+ */
+static void
+lx_start(uintptr_t sp, uintptr_t entry)
+{
+ ucontext_t jump_uc;
+
+ if (getcontext(&jump_uc) != 0) {
+ lx_err_fatal("Unable to getcontext for program start: %s",
+ strerror(errno));
+ }
+
+ /*
+ * We want to load the general registers from this
+ * context, and switch to the BRAND stack.
+ */
+ jump_uc.uc_flags = UC_CPU;
+ jump_uc.uc_brand_data[0] = (void *)LX_UC_STACK_BRAND;
+
+ /*
+ * Register slots hold integers, so clear the frame pointer with a
+ * plain zero rather than the pointer constant NULL.
+ */
+ LX_REG(&jump_uc, REG_FP) = 0;
+ LX_REG(&jump_uc, REG_SP) = sp;
+ LX_REG(&jump_uc, REG_PC) = entry;
+
+#if defined(_LP64)
+ /*
+ * The AMD64 ABI states that at process entry, %rdx contains "a
+ * function pointer that the application should register with
+ * atexit()". We make sure to pass NULL (zero) explicitly so that
+ * no function is registered.
+ */
+ LX_REG(&jump_uc, REG_RDX) = 0;
+#endif
+
+ lx_debug("starting Linux program sp %p ldentry %p", (void *)sp,
+ (void *)entry);
+
+ /*
+ * This system call should not return.
+ */
+ if (syscall(SYS_brand, B_JUMP_TO_LINUX, &jump_uc) == -1) {
+ lx_err_fatal("B_JUMP_TO_LINUX failed: %s",
+ strerror(errno));
+ }
+ abort();
+}
+
/*ARGSUSED*/
int
lx_init(int argc, char *argv[], char *envp[])
@@ -781,7 +578,7 @@ lx_init(int argc, char *argv[], char *envp[])
int err;
lx_elf_data_t edp;
lx_brand_registration_t reg;
- static lx_tsd_t lx_tsd;
+ lx_tsd_t *lxtsd;
#if defined(_LP64)
void *vdso_hdr;
#endif
@@ -836,13 +633,10 @@ lx_init(int argc, char *argv[], char *envp[])
lx_debug("VERBOSE mode enabled.\n");
}
- /* needed in wait4(), get it once since it never changes */
- max_pid = sysconf(_SC_MAXPID);
-
(void) strlcpy(lx_cmd_name, basename(argv[0]), sizeof (lx_cmd_name));
lx_debug("executing linux process: %s", argv[0]);
lx_debug("branding myself and setting handler to 0x%p",
- (void *)lx_handler_table);
+ (void *)lx_emulate);
/*
* The version of rpm that ships with CentOS/RHEL 3.x has a race
@@ -863,9 +657,7 @@ lx_init(int argc, char *argv[], char *envp[])
lx_is_rpm = B_TRUE;
reg.lxbr_version = LX_VERSION;
- reg.lxbr_handler = (void *)&lx_handler_table;
- reg.lxbr_tracehandler = (void *)&lx_handler_trace_table;
- reg.lxbr_traceflag = (void *)&lx_traceflag;
+ reg.lxbr_handler = (void *)&lx_emulate;
/*
* Register the address of the user-space handler with the lx brand
@@ -942,64 +734,74 @@ lx_init(int argc, char *argv[], char *envp[])
lxt_server_init(argc, argv);
/* Setup signal handler information. */
- if (lx_siginit())
+ if (lx_siginit()) {
lx_err_fatal("failed to initialize lx signals for the "
"branded process");
+ }
/* Setup thread-specific data area for managing linux threads. */
- if ((err = thr_keycreate(&lx_tsd_key, NULL)) != 0)
+ if ((err = thr_keycreate(&lx_tsd_key, NULL)) != 0) {
lx_err_fatal("thr_keycreate(lx_tsd_key) failed: %s",
strerror(err));
+ }
lx_debug("thr_keycreate created lx_tsd_key (%d)", lx_tsd_key);
- /* Initialize the thread specific data for this thread. */
- bzero(&lx_tsd, sizeof (lx_tsd));
-#if defined(_ILP32)
- /* start with %gs having the native libc value */
- lx_tsd.lxtsd_gs = LWPGS_SEL;
-#endif
-
- if ((err = thr_setspecific(lx_tsd_key, &lx_tsd)) != 0)
- lx_err_fatal("Unable to initialize thread-specific data: %s",
- strerror(err));
-
/*
- * Save the current context of this thread.
- * We'll restore this context when this thread attempts to exit.
+ * Initialize the thread specific data for this thread.
*/
- if (getcontext(&lx_tsd.lxtsd_exit_context) != 0)
- lx_err_fatal("Unable to initialize thread-specific exit "
- "context: %s", strerror(errno));
-
- if (lx_tsd.lxtsd_exit == LX_ET_NONE) {
-#if defined(_LP64)
- /* Switch to Linux syscall mode */
- (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG);
-#endif
-
- lx_runexe(argv, (void *)edp.ed_ldentry);
- /* lx_runexe() never returns. */
- assert(0);
+ if ((lxtsd = malloc(sizeof (*lxtsd))) == NULL) {
+ lx_err_fatal("failed to allocate tsd for main thread: %s",
+ strerror(errno));
}
+ lx_debug("lx tsd allocated @ %p", lxtsd);
+ lx_init_tsd(lxtsd);
/*
- * We are here because the Linux application called the exit() or
- * exit_group() system call. In turn the brand library did a
- * setcontext() to jump to the thread context state we saved above.
+ * Allocate the brand emulation stack for the main process thread.
+ * Register the thread-specific data structure with the stack list so
+ * that it may be freed at thread exit or fork(2).
*/
- lx_exit_common(lx_tsd.lxtsd_exit, lx_tsd.lxtsd_exit_status);
+ lx_install_stack(NULL, 0, lxtsd);
+
+ /*
+ * The brand linker expects the stack pointer to point to
+ * "argc", which is just before &argv[0].
+ */
+ lx_start((uintptr_t)argv - sizeof (void *), edp.ed_ldentry);
+
/*NOTREACHED*/
+ abort();
return (0);
}
+/*
+ * We "return" to this function via a context hand-crafted by
+ * "lx_init_tsd()"; see that function for more detail.
+ *
+ * NOTE: Our call frame is on the main thread stack, not the alternate native
+ * stack -- it is safe to release the latter here. The frame does not have a
+ * valid return address, so this function MUST NOT return.
+ */
void
-lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value)
+lx_exit_common(void)
{
- int ev = 0xff & exit_value;
+ lx_tsd_t *lxtsd = lx_get_tsd();
+ int ev = (0xff & lxtsd->lxtsd_exit_status);
- switch (exit_type) {
+ switch (lxtsd->lxtsd_exit) {
case LX_ET_EXIT:
+ lx_debug("lx_exit_common(LX_ET_EXIT, %d)\n", ev);
+
+ /*
+ * If the thread is exiting, but not the entire process, we
+ * must free the stack we allocated for usermode emulation.
+ * This is safe to do here because the setcontext() put us
+ * back on the BRAND stack for this process. This function
+ * also frees the thread-specific data object for this thread.
+ */
+ lx_free_stack();
+
/*
* The native thread return value is never seen so we pass
* NULL.
@@ -1008,6 +810,7 @@ lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value)
break;
case LX_ET_EXIT_GROUP:
+ lx_debug("lx_exit_common(LX_ET_EXIT_GROUP, %d)\n", ev);
exit(ev);
break;
@@ -1018,30 +821,74 @@ lx_exit_common(lx_exit_type_t exit_type, uintptr_t exit_value)
abort();
}
-/*
- * Walk back through the stack until we find the lx_emulate() frame.
- */
-lx_regs_t *
-lx_syscall_regs(void)
+/*
+ * Return the most recent saved context that was captured while running on
+ * the BRAND stack (LX_UC_STACK_BRAND set in uc_brand_data[0]), starting from
+ * the context reported by B_GET_CURRENT_CONTEXT and following uc_link.
+ */
+const ucontext_t *
+lx_find_brand_uc(void)
{
- /* LINTED - alignment */
- struct frame *fr = (struct frame *)_getfp();
+ ucontext_t *ucp = NULL;
+
+ /*
+ * Ask for the current emulation (or signal handling) ucontext_t...
+ * The call is made outside of assert() so that it is still issued
+ * when assertions are compiled out.
+ */
+ if (syscall(SYS_brand, B_GET_CURRENT_CONTEXT, &ucp) != 0) {
+ lx_err_fatal("B_GET_CURRENT_CONTEXT failed: %s",
+ strerror(errno));
+ }
+
+ for (;;) {
+ uintptr_t flags;
- while (fr->fr_savpc != (uintptr_t)&lx_emulate_done) {
- fr = (struct frame *)fr->fr_savfp;
- assert(fr->fr_savpc != NULL);
+ lx_debug("lx_find_brand_uc: inspect ucp %p...\n", ucp);
+ assert(ucp != NULL);
+
+ flags = (uintptr_t)ucp->uc_brand_data[0];
+
+ if (flags & LX_UC_STACK_BRAND) {
+ lx_debug("lx_find_brand_uc: ucp %p\n", ucp);
+
+ return (ucp);
+ }
+
+ lx_debug("lx_find_brand_uc: skip non-BRAND ucp %p\n", ucp);
+
+ /*
+ * Walk up the context chain to find the most recently stored
+ * brand register state.
+ */
+ ucp = ucp->uc_link;
}
+}
+
+/*
+ * Report the stack pointer saved in the most recent BRAND-stack context, as
+ * located by lx_find_brand_uc().
+ */
+uintptr_t
+lx_find_brand_sp(void)
+{
+ const ucontext_t *brand_uc;
+ uintptr_t brand_sp;
+
+ brand_uc = lx_find_brand_uc();
+ brand_sp = LX_REG(brand_uc, REG_SP);
+
+ lx_debug("lx_find_brand_sp: ucp %p sp %p\n", brand_uc, brand_sp);
+
+ return (brand_sp);
+}
+
+/*
+ * Return the current context as reported by B_GET_CURRENT_CONTEXT. The
+ * context must describe a system call emulation request
+ * (LX_UC_FRAME_IS_SYSCALL set in uc_brand_data[0]); this is asserted.
+ */
+ucontext_t *
+lx_syscall_regs(void)
+{
+ ucontext_t *ucp = NULL;
+ uintptr_t flags;
-#if defined(_LP64)
/*
- * This is %rbp, update to be at the end of the frame for correct
- * struct offsets. lx_emulate only takes one parameter, a pointer to
- * lx_regs_t.
+ * Ask for the current emulation (or signal handling) ucontext_t...
+ * The call is made outside of assert() so that it is still issued
+ * when assertions are compiled out.
*/
- return ((lx_regs_t *)(fr->fr_savfp - sizeof (lx_regs_t)));
-#else
- return ((lx_regs_t *)((uintptr_t *)fr)[2]);
-#endif
+ if (syscall(SYS_brand, B_GET_CURRENT_CONTEXT, &ucp) != 0) {
+ lx_err_fatal("B_GET_CURRENT_CONTEXT failed: %s",
+ strerror(errno));
+ }
+ assert(ucp != NULL);
+
+ /*
+ * Use of the lx_syscall_regs() function implies that the topmost (i.e.
+ * current) context is for a system call emulation request from the
+ * kernel, rather than a signal handling frame.
+ */
+ flags = (uintptr_t)ucp->uc_brand_data[0];
+ assert(flags & LX_UC_FRAME_IS_SYSCALL);
+
+ lx_debug("lx_syscall_regs: ucp %p\n", ucp);
+
+ return (ucp);
}
int
@@ -1111,324 +958,330 @@ lx_fd_to_path(int fd, char *buf, int buf_size)
#if defined(_LP64)
/* The following is the 64-bit syscall table */
-static struct lx_sysent sysents[] = {
- {"read", LX_IKE(read), LX_SYS_IKE, 3}, /* 0 */
- {"write", lx_write, 0, 3}, /* 1 */
- {"open", lx_open, 0, 3}, /* 2 */
- {"close", lx_close, 0, 1}, /* 3 */
- {"stat", lx_stat64, 0, 2}, /* 4 */
- {"fstat", lx_fstat64, 0, 2}, /* 5 */
- {"lstat", lx_lstat64, 0, 2}, /* 6 */
- {"poll", lx_poll, 0, 3}, /* 7 */
- {"lseek", lx_lseek, 0, 3}, /* 8 */
- {"mmap", lx_mmap, 0, 6}, /* 9 */
- {"mprotect", lx_mprotect, 0, 3}, /* 10 */
- {"munmap", lx_munmap, 0, 2}, /* 11 */
- {"brk", LX_IKE(brk), LX_SYS_IKE, 1}, /* 12 */
- {"rt_sigaction", lx_rt_sigaction, 0, 4}, /* 13 */
- {"rt_sigprocmask", lx_rt_sigprocmask, 0, 4}, /* 14 */
- {"rt_sigreturn", lx_rt_sigreturn, 0, 0}, /* 15 */
- {"ioctl", LX_IKE(ioctl), LX_SYS_IKE, 3}, /* 16 */
- {"pread64", lx_pread, 0, 4}, /* 17 */
- {"pwrite64", lx_pwrite, 0, 4}, /* 18 */
- {"readv", lx_readv, 0, 3}, /* 19 */
- {"writev", lx_writev, 0, 3}, /* 20 */
- {"access", lx_access, 0, 2}, /* 21 */
- {"pipe", LX_IKE(pipe), LX_SYS_IKE, 1}, /* 22 */
- {"select", lx_select, 0, 5}, /* 23 */
- {"sched_yield", lx_yield, 0, 0}, /* 24 */
- {"mremap", lx_remap, 0, 5}, /* 25 */
- {"msync", lx_msync, 0, 3}, /* 26 */
- {"mincore", lx_mincore, 0, 3}, /* 27 */
- {"madvise", lx_madvise, 0, 3}, /* 28 */
- {"shmget", lx_shmget, 0, 3}, /* 29 */
- {"shmat", lx_shmat, 0, 4}, /* 30 */
- {"shmctl", lx_shmctl, 0, 3}, /* 31 */
- {"dup", lx_dup, 0, 1}, /* 32 */
- {"dup2", lx_dup2, 0, 2}, /* 33 */
- {"pause", lx_pause, 0, 0}, /* 34 */
- {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */
- {"getitimer", lx_getitimer, 0, 2}, /* 36 */
- {"alarm", lx_alarm, 0, 1}, /* 37 */
- {"setitimer", lx_setitimer, 0, 3}, /* 38 */
- {"getpid", lx_getpid, 0, 0}, /* 39 */
- {"sendfile", lx_sendfile64, 0, 4}, /* 40 */
- {"socket", lx_socket, 0, 3}, /* 41 */
- {"connect", lx_connect, 0, 3}, /* 42 */
- {"accept", lx_accept, 0, 3}, /* 43 */
- {"sendto", lx_sendto, 0, 6}, /* 44 */
- {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */
- {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */
- {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */
- {"shutdown", lx_shutdown, 0, 2}, /* 48 */
- {"bind", lx_bind, 0, 3}, /* 49 */
- {"listen", lx_listen, 0, 2}, /* 50 */
- {"getsockname", lx_getsockname, 0, 3}, /* 51 */
- {"getpeername", lx_getpeername, 0, 3}, /* 52 */
- {"socketpair", lx_socketpair, 0, 4}, /* 53 */
- {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */
- {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */
- {"clone", lx_clone, 0, 5}, /* 56 */
- {"fork", lx_fork, 0, 0}, /* 57 */
- {"vfork", lx_vfork, 0, 0}, /* 58 */
- {"execve", lx_execve, 0, 3}, /* 59 */
- {"exit", lx_exit, 0, 1}, /* 60 */
- {"wait4", lx_wait4, 0, 4}, /* 61 */
- {"kill", LX_IKE(kill), LX_SYS_IKE, 2}, /* 62 */
- {"uname", lx_uname, 0, 1}, /* 63 */
- {"semget", lx_semget, 0, 3}, /* 64 */
- {"semop", lx_semop, 0, 3}, /* 65 */
- {"semctl", lx_semctl, 0, 4}, /* 66 */
- {"shmdt", lx_shmdt, 0, 1}, /* 67 */
- {"msgget", lx_msgget, 0, 2}, /* 68 */
- {"msgsnd", lx_msgsnd, 0, 4}, /* 69 */
- {"msgrcv", lx_msgrcv, 0, 5}, /* 70 */
- {"msgctl", lx_msgctl, 0, 3}, /* 71 */
- {"fcntl", lx_fcntl64, 0, 3}, /* 72 */
- {"flock", lx_flock, 0, 2}, /* 73 */
- {"fsync", lx_fsync, 0, 1}, /* 74 */
- {"fdatasync", lx_fdatasync, 0, 1}, /* 75 */
- {"truncate", lx_truncate, 0, 2}, /* 76 */
- {"ftruncate", lx_ftruncate, 0, 2}, /* 77 */
- {"getdents", lx_getdents, 0, 3}, /* 78 */
- {"getcwd", lx_getcwd, 0, 2}, /* 79 */
- {"chdir", lx_chdir, 0, 1}, /* 80 */
- {"fchdir", lx_fchdir, 0, 1}, /* 81 */
- {"rename", lx_rename, 0, 2}, /* 82 */
- {"mkdir", lx_mkdir, 0, 2}, /* 83 */
- {"rmdir", lx_rmdir, 0, 1}, /* 84 */
- {"creat", lx_creat, 0, 2}, /* 85 */
- {"link", lx_link, 0, 2}, /* 86 */
- {"unlink", lx_unlink, 0, 1}, /* 87 */
- {"symlink", lx_symlink, 0, 2}, /* 88 */
- {"readlink", lx_readlink, 0, 3}, /* 89 */
- {"chmod", lx_chmod, 0, 2}, /* 90 */
- {"fchmod", lx_fchmod, 0, 2}, /* 91 */
- {"chown", lx_chown, 0, 3}, /* 92 */
- {"fchown", lx_fchown, 0, 3}, /* 93 */
- {"lchown", lx_lchown, 0, 3}, /* 94 */
- {"umask", lx_umask, 0, 1}, /* 95 */
- {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */
- {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */
- {"getrusage", lx_getrusage, 0, 2}, /* 98 */
- {"sysinfo", LX_IKE(sysinfo), LX_SYS_IKE, 1}, /* 99 */
- {"times", lx_times, 0, 1}, /* 100 */
- {"ptrace", lx_ptrace, 0, 4}, /* 101 */
- {"getuid", lx_getuid, 0, 0}, /* 102 */
- {"syslog", lx_syslog, 0, 3}, /* 103 */
- {"getgid", lx_getgid, 0, 0}, /* 104 */
- {"setuid", lx_setuid, 0, 1}, /* 105 */
- {"setgid", lx_setgid, 0, 1}, /* 106 */
- {"geteuid", lx_geteuid, 0, 0}, /* 107 */
- {"getegid", lx_getegid, 0, 0}, /* 108 */
- {"setpgid", lx_setpgid, 0, 2}, /* 109 */
- {"getppid", LX_IKE(getppid), LX_SYS_IKE, 0}, /* 110 */
- {"getpgrp", lx_getpgrp, 0, 0}, /* 111 */
- {"setsid", lx_setsid, 0, 0}, /* 112 */
- {"setreuid", lx_setreuid, 0, 0}, /* 113 */
- {"setregid", lx_setregid, 0, 0}, /* 114 */
- {"getgroups", lx_getgroups, 0, 2}, /* 115 */
- {"setgroups", lx_setgroups, 0, 2}, /* 116 */
- {"setresuid", LX_IKE(setresuid), LX_SYS_IKE, 3}, /* 117 */
- {"getresuid", lx_getresuid, 0, 3}, /* 118 */
- {"setresgid", LX_IKE(setresgid), LX_SYS_IKE, 3}, /* 119 */
- {"getresgid", lx_getresgid, 0, 3}, /* 120 */
- {"getpgid", lx_getpgid, 0, 1}, /* 121 */
- {"setfsuid", lx_setfsuid, 0, 1}, /* 122 */
- {"setfsgid", lx_setfsgid, 0, 1}, /* 123 */
- {"getsid", lx_getsid, 0, 1}, /* 124 */
- {"capget", lx_capget, 0, 2}, /* 125 */
- {"capset", lx_capset, 0, 2}, /* 126 */
- {"rt_sigpending", lx_rt_sigpending, 0, 2}, /* 127 */
- {"rt_sigtimedwait", lx_rt_sigtimedwait, 0, 4}, /* 128 */
- {"rt_sigqueueinfo", lx_rt_sigqueueinfo, 0, 3}, /* 129 */
- {"rt_sigsuspend", lx_rt_sigsuspend, 0, 2}, /* 130 */
- {"sigaltstack", lx_sigaltstack, 0, 2}, /* 131 */
- {"utime", lx_utime, 0, 2}, /* 132 */
- {"mknod", lx_mknod, 0, 3}, /* 133 */
- {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */
- {"personality", lx_personality, 0, 1}, /* 135 */
- {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */
- {"statfs", lx_statfs, 0, 2}, /* 137 */
- {"fstatfs", lx_fstatfs, 0, 2}, /* 138 */
- {"sysfs", lx_sysfs, 0, 3}, /* 139 */
- {"getpriority", lx_getpriority, 0, 2}, /* 140 */
- {"setpriority", lx_setpriority, 0, 3}, /* 141 */
- {"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */
- {"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */
- {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */
- {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 145 */
- {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 146 */
- {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 147 */
- {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 148 */
- {"mlock", lx_mlock, 0, 2}, /* 149 */
- {"munlock", lx_munlock, 0, 2}, /* 150 */
- {"mlockall", lx_mlockall, 0, 1}, /* 151 */
- {"munlockall", lx_munlockall, 0, 0}, /* 152 */
- {"vhangup", lx_vhangup, 0, 0}, /* 153 */
- {"modify_ldt", LX_IKE(modify_ldt), LX_SYS_IKE, 3}, /* 154 */
- {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */
- {"sysctl", lx_sysctl, 0, 1}, /* 156 */
- {"prctl", lx_prctl, 0, 5}, /* 157 */
- {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */
- {"adjtimex", lx_adjtimex, 0, 1}, /* 159 */
- {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */
- {"chroot", lx_chroot, 0, 1}, /* 161 */
- {"sync", lx_sync, 0, 0}, /* 162 */
- {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */
- {"settimeofday", lx_settimeofday, 0, 2}, /* 164 */
- {"mount", lx_mount, 0, 5}, /* 165 */
- {"umount2", lx_umount2, 0, 2}, /* 166 */
- {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */
- {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */
- {"reboot", lx_reboot, 0, 4}, /* 169 */
- {"sethostname", lx_sethostname, 0, 2}, /* 170 */
- {"setdomainname", lx_setdomainname, 0, 2}, /* 171 */
- {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */
- {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */
- {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */
- {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */
- {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */
- {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */
- {"query_module", lx_query_module, NOSYS_KERNEL, 5}, /* 178 */
- {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */
- {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */
- {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */
- {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */
- {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */
- {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */
- {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */
- {"gettid", LX_IKE(gettid), LX_SYS_IKE, 0}, /* 186 */
- {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */
- {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 188 */
- {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 189 */
- {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 190 */
- {"getxattr", lx_xattr4, 0, 4}, /* 191 */
- {"lgetxattr", lx_xattr4, 0, 4}, /* 192 */
- {"fgetxattr", lx_xattr4, 0, 4}, /* 193 */
- {"listxattr", lx_xattr3, 0, 3}, /* 194 */
- {"llistxattr", lx_xattr3, 0, 3}, /* 195 */
- {"flistxattr", lx_xattr3, 0, 3}, /* 196 */
- {"removexattr", lx_xattr2, 0, 2}, /* 197 */
- {"lremovexattr", lx_xattr2, 0, 2}, /* 198 */
- {"fremovexattr", lx_xattr2, 0, 2}, /* 199 */
- {"tkill", LX_IKE(tkill), LX_SYS_IKE, 2}, /* 200 */
- {"time", lx_time, 0, 1}, /* 201 */
- {"futex", LX_IKE(futex), LX_SYS_IKE, 6}, /* 202 */
- {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 203 */
- {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 204 */
- {"set_thread_area", LX_IKE(set_thread_area), LX_SYS_IKE, 1}, /* 205 */
- {"io_setup", lx_io_setup, 0, 2}, /* 206 */
- {"io_destroy", lx_io_destroy, 0, 1}, /* 207 */
- {"io_getevents", lx_io_getevents, 0, 5}, /* 208 */
- {"io_submit", lx_io_submit, 0, 3}, /* 209 */
- {"io_cancel", lx_io_cancel, 0, 3}, /* 210 */
- {"get_thread_area", LX_IKE(get_thread_area), LX_SYS_IKE, 1}, /* 211 */
- {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */
- {"epoll_create", lx_epoll_create, 0, 1}, /* 213 */
- {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */
- {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */
- {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */
- {"getdents64", lx_getdents64, 0, 3}, /* 217 */
- {"set_tid_address", LX_IKE(set_tid_address), LX_SYS_IKE, 1}, /* 218 */
- {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */
- {"semtimedop", lx_semtimedop, 0, 4}, /* 220 */
- {"fadvise64", lx_fadvise64_64, 0, 4}, /* 221 */
- {"timer_create", lx_timer_create, 0, 3}, /* 222 */
- {"timer_settime", lx_timer_settime, 0, 4}, /* 223 */
- {"timer_gettime", lx_timer_gettime, 0, 2}, /* 224 */
- {"timer_getoverrun", lx_timer_getoverrun, 0, 1}, /* 225 */
- {"timer_delete", lx_timer_delete, 0, 1}, /* 226 */
- {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */
- {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */
- {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */
- {"clock_nanosleep", lx_clock_nanosleep, 0, 4}, /* 230 */
- {"exit_group", lx_group_exit, 0, 1}, /* 231 */
- {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */
- {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */
- {"tgkill", LX_IKE(tgkill), LX_SYS_IKE, 3}, /* 234 */
- {"utimes", lx_utimes, 0, 2}, /* 235 */
- {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */
- {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */
- {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */
- {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */
- {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */
- {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */
- {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */
- {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */
- {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */
- {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */
- {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */
- {"waitid", lx_waitid, 0, 4}, /* 247 */
- {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */
- {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */
- {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */
- {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 251 */
- {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 252 */
- {"inotify_init", lx_inotify_init, 0, 0}, /* 253 */
- {"inotify_add_watch", lx_inotify_add_watch, 0, 3}, /* 254 */
- {"inotify_rm_watch", lx_inotify_rm_watch, 0, 2}, /* 255 */
- {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */
- {"openat", lx_openat, 0, 4}, /* 257 */
- {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */
- {"mknodat", lx_mknodat, 0, 4}, /* 259 */
- {"fchownat", lx_fchownat, 0, 5}, /* 260 */
- {"futimesat", lx_futimesat, 0, 3}, /* 261 */
- {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */
- {"unlinkat", lx_unlinkat, 0, 3}, /* 263 */
- {"renameat", lx_renameat, 0, 4}, /* 264 */
- {"linkat", lx_linkat, 0, 5}, /* 265 */
- {"symlinkat", lx_symlinkat, 0, 3}, /* 266 */
- {"readlinkat", lx_readlinkat, 0, 4}, /* 267 */
- {"fchmodat", lx_fchmodat, 0, 4}, /* 268 */
- {"faccessat", lx_faccessat, 0, 4}, /* 269 */
- {"pselect6", lx_pselect6, 0, 6}, /* 270 */
- {"ppoll", lx_ppoll, 0, 5}, /* 271 */
- {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */
- {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 273 */
- {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 274 */
- {"splice", NULL, NOSYS_NULL, 0}, /* 275 */
- {"tee", NULL, NOSYS_NULL, 0}, /* 276 */
- {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 277 */
- {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */
- {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */
- {"utimensat", lx_utimensat, 0, 4}, /* 280 */
- {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */
- {"signalfd", NULL, NOSYS_NULL, 0}, /* 282 */
- {"timerfd_create", lx_timerfd_create, 0, 2}, /* 283 */
- {"eventfd", lx_eventfd, 0, 1}, /* 284 */
- {"fallocate", NULL, NOSYS_NULL, 0}, /* 285 */
- {"timerfd_settime", lx_timerfd_settime, 0, 4}, /* 286 */
- {"timerfd_gettime", lx_timerfd_gettime, 0, 2}, /* 287 */
- {"accept4", lx_accept4, 0, 4}, /* 288 */
- {"signalfd4", NULL, NOSYS_NULL, 0}, /* 289 */
- {"eventfd2", lx_eventfd2, 0, 2}, /* 290 */
- {"epoll_create1", lx_epoll_create1, 0, 1}, /* 291 */
- {"dup3", lx_dup3, 0, 3}, /* 292 */
- {"pipe2", lx_pipe2, 0, 2}, /* 293 */
- {"inotify_init1", lx_inotify_init1, 0, 1}, /* 294 */
- {"preadv", lx_preadv, 0, 4}, /* 295 */
- {"pwritev", lx_pwritev, 0, 4}, /* 296 */
- {"rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 0, 4}, /* 297 */
- {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */
- {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */
- {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */
- {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */
- {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */
- {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */
- {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */
- {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */
- {"syncfs", NULL, NOSYS_NULL, 0}, /* 306 */
- {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */
- {"setns", NULL, NOSYS_NULL, 0}, /* 309 */
- {"getcpu", lx_getcpu, 0, 3}, /* 309 */
- {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */
- {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */
- {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */
- {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */
- {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */
- {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */
- {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */
+static lx_syscall_handler_t lx_handlers[] = {
+ NULL, /* 0: read */
+ NULL, /* 1: write */
+ lx_open,
+ lx_close,
+ lx_stat64,
+ lx_fstat64,
+ lx_lstat64,
+ lx_poll,
+ lx_lseek,
+ lx_mmap,
+ lx_mprotect,
+ lx_munmap,
+ NULL, /* 12: brk */
+ lx_rt_sigaction,
+ lx_rt_sigprocmask,
+ lx_rt_sigreturn,
+ NULL, /* 16: ioctl */
+ lx_pread,
+ lx_pwrite,
+ lx_readv,
+ lx_writev,
+ lx_access,
+ NULL, /* 22: pipe */
+ lx_select,
+ NULL, /* 24: sched_yield */
+ lx_remap,
+ lx_msync,
+ lx_mincore,
+ lx_madvise,
+ lx_shmget,
+ lx_shmat,
+ lx_shmctl,
+ lx_dup,
+ lx_dup2,
+ lx_pause,
+ lx_nanosleep,
+ lx_getitimer,
+ lx_alarm,
+ lx_setitimer,
+ NULL, /* 39: getpid */
+ lx_sendfile64,
+ lx_socket,
+ lx_connect,
+ lx_accept,
+ lx_sendto,
+ lx_recvfrom,
+ lx_sendmsg,
+ lx_recvmsg,
+ lx_shutdown,
+ lx_bind,
+ lx_listen,
+ lx_getsockname,
+ lx_getpeername,
+ lx_socketpair,
+ lx_setsockopt,
+ lx_getsockopt,
+ lx_clone,
+ lx_fork,
+ lx_vfork,
+ lx_execve,
+ lx_exit,
+ NULL, /* 61: wait4 */
+ NULL, /* 62: kill */
+ lx_uname,
+ lx_semget,
+ lx_semop,
+ lx_semctl,
+ lx_shmdt,
+ lx_msgget,
+ lx_msgsnd,
+ lx_msgrcv,
+ lx_msgctl,
+ lx_fcntl64,
+ lx_flock,
+ lx_fsync,
+ lx_fdatasync,
+ lx_truncate,
+ lx_ftruncate,
+ lx_getdents,
+ lx_getcwd,
+ lx_chdir,
+ lx_fchdir,
+ lx_rename,
+ lx_mkdir,
+ lx_rmdir,
+ lx_creat,
+ lx_link,
+ lx_unlink,
+ lx_symlink,
+ lx_readlink,
+ lx_chmod,
+ lx_fchmod,
+ lx_chown,
+ lx_fchown,
+ lx_lchown,
+ lx_umask,
+ lx_gettimeofday,
+ lx_getrlimit,
+ lx_getrusage,
+ NULL, /* 99: sysinfo */
+ lx_times,
+ lx_ptrace,
+ lx_getuid,
+ lx_syslog,
+ lx_getgid,
+ lx_setuid,
+ lx_setgid,
+ lx_geteuid,
+ lx_getegid,
+ lx_setpgid,
+ NULL, /* 110: getppid */
+ lx_getpgrp,
+ lx_setsid,
+ lx_setreuid,
+ lx_setregid,
+ lx_getgroups,
+ lx_setgroups,
+ NULL, /* 117: setresuid */
+ lx_getresuid,
+ NULL, /* 119: setresgid */
+ lx_getresgid,
+ lx_getpgid,
+ lx_setfsuid,
+ lx_setfsgid,
+ lx_getsid,
+ lx_capget,
+ lx_capset,
+ lx_rt_sigpending,
+ lx_rt_sigtimedwait,
+ lx_rt_sigqueueinfo,
+ lx_rt_sigsuspend,
+ lx_sigaltstack,
+ lx_utime,
+ lx_mknod,
+ NULL, /* 134: uselib */
+ lx_personality,
+ NULL, /* 136: ustat */
+ lx_statfs,
+ lx_fstatfs,
+ lx_sysfs,
+ lx_getpriority,
+ lx_setpriority,
+ lx_sched_setparam,
+ lx_sched_getparam,
+ lx_sched_setscheduler,
+ lx_sched_getscheduler,
+ lx_sched_get_priority_max,
+ lx_sched_get_priority_min,
+ lx_sched_rr_get_interval,
+ lx_mlock,
+ lx_munlock,
+ lx_mlockall,
+ lx_munlockall,
+ lx_vhangup,
+ NULL, /* 154: modify_ldt */
+ NULL, /* 155: pivot_root */
+ lx_sysctl,
+ lx_prctl,
+ NULL, /* 158: arch_prctl */
+ lx_adjtimex,
+ lx_setrlimit,
+ lx_chroot,
+ lx_sync,
+ NULL, /* 163: acct */
+ lx_settimeofday,
+ lx_mount,
+ lx_umount2,
+ NULL, /* 167: swapon */
+ NULL, /* 168: swapoff */
+ lx_reboot,
+ lx_sethostname,
+ lx_setdomainname,
+ NULL, /* 172: iopl */
+ NULL, /* 173: ioperm */
+ NULL, /* 174: create_module */
+ NULL, /* 175: init_module */
+ NULL, /* 176: delete_module */
+ NULL, /* 177: get_kernel_syms */
+ lx_query_module,
+ NULL, /* 179: quotactl */
+ NULL, /* 180: nfsservctl */
+ NULL, /* 181: getpmsg */
+ NULL, /* 182: putpmsg */
+ NULL, /* 183: afs_syscall */
+ NULL, /* 184: tux */
+ NULL, /* 185: security */
+ NULL, /* 186: gettid */
+ NULL, /* 187: readahead */
+ NULL, /* 188: setxattr */
+ NULL, /* 189: lsetxattr */
+ NULL, /* 190: fsetxattr */
+ NULL, /* 191: getxattr */
+ NULL, /* 192: lgetxattr */
+ NULL, /* 193: fgetxattr */
+ NULL, /* 194: listxattr */
+ NULL, /* 195: llistxattr */
+ NULL, /* 196: flistxattr */
+ NULL, /* 197: removexattr */
+ NULL, /* 198: lremovexattr */
+ NULL, /* 199: fremovexattr */
+ NULL, /* 200: tkill */
+ lx_time,
+ NULL, /* 202: futex */
+ lx_sched_setaffinity,
+ lx_sched_getaffinity,
+ NULL, /* 205: set_thread_area */
+ NULL, /* 206: io_setup */
+ NULL, /* 207: io_destroy */
+ NULL, /* 208: io_getevents */
+ NULL, /* 209: io_submit */
+ NULL, /* 210: io_cancel */
+ NULL, /* 211: get_thread_area */
+ NULL, /* 212: lookup_dcookie */
+ lx_epoll_create,
+ NULL, /* 214: epoll_ctl_old */
+ NULL, /* 215: epoll_wait_old */
+ NULL, /* 216: remap_file_pages */
+ lx_getdents64,
+ NULL, /* 218: set_tid_address */
+ NULL, /* 219: restart_syscall */
+ lx_semtimedop,
+ lx_fadvise64_64,
+ lx_timer_create,
+ lx_timer_settime,
+ lx_timer_gettime,
+ lx_timer_getoverrun,
+ lx_timer_delete,
+ lx_clock_settime,
+ lx_clock_gettime,
+ lx_clock_getres,
+ lx_clock_nanosleep,
+ lx_group_exit,
+ lx_epoll_wait,
+ lx_epoll_ctl,
+ NULL, /* 234: tgkill */
+ lx_utimes,
+ NULL, /* 236: vserver */
+ NULL, /* 237: mbind */
+ NULL, /* 238: set_mempolicy */
+ NULL, /* 239: get_mempolicy */
+ NULL, /* 240: mq_open */
+ NULL, /* 241: mq_unlink */
+ NULL, /* 242: mq_timedsend */
+ NULL, /* 243: mq_timedreceive */
+ NULL, /* 244: mq_notify */
+ NULL, /* 245: mq_getsetattr */
+ NULL, /* 246: kexec_load */
+ NULL, /* 247: waitid */
+ NULL, /* 248: add_key */
+ NULL, /* 249: request_key */
+ NULL, /* 250: keyctl */
+ NULL, /* 251: ioprio_set */
+ NULL, /* 252: ioprio_get */
+ lx_inotify_init,
+ lx_inotify_add_watch,
+ lx_inotify_rm_watch,
+ NULL, /* 256: migrate_pages */
+ lx_openat,
+ lx_mkdirat,
+ lx_mknodat,
+ lx_fchownat,
+ lx_futimesat,
+ lx_fstatat64,
+ lx_unlinkat,
+ lx_renameat,
+ lx_linkat,
+ lx_symlinkat,
+ lx_readlinkat,
+ lx_fchmodat,
+ lx_faccessat,
+ lx_pselect6,
+ lx_ppoll,
+ NULL, /* 272: unshare */
+ NULL, /* 273: set_robust_list */
+ NULL, /* 274: get_robust_list */
+ NULL, /* 275: splice */
+ NULL, /* 276: tee */
+ NULL, /* 277: sync_file_range */
+ NULL, /* 278: vmsplice */
+ NULL, /* 279: move_pages */
+ lx_utimensat,
+ lx_epoll_pwait,
+ NULL, /* 282: signalfd */
+ lx_timerfd_create,
+ lx_eventfd,
+ NULL, /* 285: fallocate */
+ lx_timerfd_settime,
+ lx_timerfd_gettime,
+ lx_accept4,
+ NULL, /* 289: signalfd4 */
+ lx_eventfd2,
+ lx_epoll_create1,
+ lx_dup3,
+ NULL, /* 293: pipe2 */
+ lx_inotify_init1,
+ NULL, /* 295: preadv */
+ NULL, /* 296: pwritev */
+ lx_rt_tgsigqueueinfo,
+ NULL, /* 298: perf_event_open */
+ NULL, /* 299: recvmmsg */
+ NULL, /* 300: fanotify_init */
+ NULL, /* 301: fanotify_mark */
+ lx_prlimit64,
+ NULL, /* 303: name_to_handle_at */
+ NULL, /* 304: open_by_handle_at */
+ NULL, /* 305: clock_adjtime */
+ NULL, /* 306: syncfs */
+ NULL, /* 307: sendmmsg */
+ NULL, /* 308: setns */
+ lx_getcpu,
+ NULL, /* 310: process_vm_readv */
+ NULL, /* 311: process_vm_writev */
+ NULL, /* 312: kcmp */
+ NULL, /* 313: finit_module */
+ NULL, /* 314: sched_setattr */
+ NULL, /* 315: sched_getattr */
+ NULL, /* 316: renameat2 */
+ NULL, /* 317: seccomp */
+ NULL, /* 318: getrandom */
+ NULL, /* 319: memfd_create */
+ NULL, /* 320: kexec_file_load */
+ NULL, /* 321: bpf */
+ NULL, /* 322: execveat */
/* XXX TBD gap then x32 syscalls from 512 - 544 */
};
@@ -1436,361 +1289,365 @@ static struct lx_sysent sysents[] = {
#else
/* The following is the 32-bit syscall table */
-static struct lx_sysent sysents[] = {
- {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */
- {"exit", lx_exit, 0, 1}, /* 1 */
- {"fork", lx_fork, 0, 0}, /* 2 */
- {"read", LX_IKE(read), LX_SYS_IKE, 3}, /* 3 */
- {"write", lx_write, 0, 3}, /* 4 */
- {"open", lx_open, 0, 3}, /* 5 */
- {"close", lx_close, 0, 1}, /* 6 */
- {"waitpid", lx_waitpid, 0, 3}, /* 7 */
- {"creat", lx_creat, 0, 2}, /* 8 */
- {"link", lx_link, 0, 2}, /* 9 */
- {"unlink", lx_unlink, 0, 1}, /* 10 */
- {"execve", lx_execve, 0, 3}, /* 11 */
- {"chdir", lx_chdir, 0, 1}, /* 12 */
- {"time", lx_time, 0, 1}, /* 13 */
- {"mknod", lx_mknod, 0, 3}, /* 14 */
- {"chmod", lx_chmod, 0, 2}, /* 15 */
- {"lchown16", lx_lchown16, 0, 3}, /* 16 */
- {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */
- {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */
- {"lseek", lx_lseek, 0, 3}, /* 19 */
- {"getpid", lx_getpid, 0, 0}, /* 20 */
- {"mount", lx_mount, 0, 5}, /* 21 */
- {"umount", lx_umount, 0, 1}, /* 22 */
- {"setuid16", lx_setuid16, 0, 1}, /* 23 */
- {"getuid16", lx_getuid16, 0, 0}, /* 24 */
- {"stime", lx_stime, 0, 1}, /* 25 */
- {"ptrace", lx_ptrace, 0, 4}, /* 26 */
- {"alarm", lx_alarm, 0, 1}, /* 27 */
- {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */
- {"pause", lx_pause, 0, 0}, /* 29 */
- {"utime", lx_utime, 0, 2}, /* 30 */
- {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */
- {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */
- {"access", lx_access, 0, 2}, /* 33 */
- {"nice", lx_nice, 0, 1}, /* 34 */
- {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */
- {"sync", lx_sync, 0, 0}, /* 36 */
- {"kill", LX_IKE(kill), LX_SYS_IKE, 2}, /* 37 */
- {"rename", lx_rename, 0, 2}, /* 38 */
- {"mkdir", lx_mkdir, 0, 2}, /* 39 */
- {"rmdir", lx_rmdir, 0, 1}, /* 40 */
- {"dup", lx_dup, 0, 1}, /* 41 */
- {"pipe", LX_IKE(pipe), LX_SYS_IKE, 1}, /* 42 */
- {"times", lx_times, 0, 1}, /* 43 */
- {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */
- {"brk", LX_IKE(brk), LX_SYS_IKE, 1}, /* 45 */
- {"setgid16", lx_setgid16, 0, 1}, /* 46 */
- {"getgid16", lx_getgid16, 0, 0}, /* 47 */
- {"signal", lx_signal, 0, 2}, /* 48 */
- {"geteuid16", lx_geteuid16, 0, 0}, /* 49 */
- {"getegid16", lx_getegid16, 0, 0}, /* 50 */
- {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */
- {"umount2", lx_umount2, 0, 2}, /* 52 */
- {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */
- {"ioctl", LX_IKE(ioctl), LX_SYS_IKE, 3}, /* 54 */
- {"fcntl", lx_fcntl, 0, 3}, /* 55 */
- {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */
- {"setpgid", lx_setpgid, 0, 2}, /* 57 */
- {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */
- {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */
- {"umask", lx_umask, 0, 1}, /* 60 */
- {"chroot", lx_chroot, 0, 1}, /* 61 */
- {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */
- {"dup2", lx_dup2, 0, 2}, /* 63 */
- {"getppid", LX_IKE(getppid), LX_SYS_IKE, 0}, /* 64 */
- {"getpgrp", lx_getpgrp, 0, 0}, /* 65 */
- {"setsid", lx_setsid, 0, 0}, /* 66 */
- {"sigaction", lx_sigaction, 0, 3}, /* 67 */
- {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */
- {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */
- {"setreuid16", lx_setreuid16, 0, 2}, /* 70 */
- {"setregid16", lx_setregid16, 0, 2}, /* 71 */
- {"sigsuspend", lx_sigsuspend, 0, 1}, /* 72 */
- {"sigpending", lx_sigpending, 0, 1}, /* 73 */
- {"sethostname", lx_sethostname, 0, 2}, /* 74 */
- {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */
- {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */
- {"getrusage", lx_getrusage, 0, 2}, /* 77 */
- {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */
- {"settimeofday", lx_settimeofday, 0, 2}, /* 79 */
- {"getgroups16", lx_getgroups16, 0, 2}, /* 80 */
- {"setgroups16", lx_setgroups16, 0, 2}, /* 81 */
- {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */
- {"symlink", lx_symlink, 0, 2}, /* 83 */
- {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */
- {"readlink", lx_readlink, 0, 3}, /* 85 */
- {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */
- {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */
- {"reboot", lx_reboot, 0, 4}, /* 88 */
- {"readdir", lx_readdir, 0, 3}, /* 89 */
- {"mmap", lx_mmap, 0, 6}, /* 90 */
- {"munmap", lx_munmap, 0, 2}, /* 91 */
- {"truncate", lx_truncate, 0, 2}, /* 92 */
- {"ftruncate", lx_ftruncate, 0, 2}, /* 93 */
- {"fchmod", lx_fchmod, 0, 2}, /* 94 */
- {"fchown16", lx_fchown16, 0, 3}, /* 95 */
- {"getpriority", lx_getpriority, 0, 2}, /* 96 */
- {"setpriority", lx_setpriority, 0, 3}, /* 97 */
- {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */
- {"statfs", lx_statfs, 0, 2}, /* 99 */
- {"fstatfs", lx_fstatfs, 0, 2}, /* 100 */
- {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */
- {"socketcall", lx_socketcall, 0, 2}, /* 102 */
- {"syslog", lx_syslog, 0, 3}, /* 103 */
- {"setitimer", lx_setitimer, 0, 3}, /* 104 */
- {"getitimer", lx_getitimer, 0, 2}, /* 105 */
- {"stat", lx_stat, 0, 2}, /* 106 */
- {"lstat", lx_lstat, 0, 2}, /* 107 */
- {"fstat", lx_fstat, 0, 2}, /* 108 */
- {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */
- {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */
- {"vhangup", lx_vhangup, 0, 0}, /* 111 */
- {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */
- {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */
- {"wait4", lx_wait4, 0, 4}, /* 114 */
- {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */
- {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */
- {"ipc", lx_ipc, 0, 5}, /* 117 */
- {"fsync", lx_fsync, 0, 1}, /* 118 */
- {"sigreturn", lx_sigreturn, 0, 1}, /* 119 */
- {"clone", lx_clone, 0, 5}, /* 120 */
- {"setdomainname", lx_setdomainname, 0, 2}, /* 121 */
- {"uname", lx_uname, 0, 1}, /* 122 */
- {"modify_ldt", LX_IKE(modify_ldt), LX_SYS_IKE, 3}, /* 123 */
- {"adjtimex", lx_adjtimex, 0, 1}, /* 124 */
- {"mprotect", lx_mprotect, 0, 3}, /* 125 */
- {"sigprocmask", lx_sigprocmask, 0, 3}, /* 126 */
- {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */
- {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */
- {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */
- {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */
- {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */
- {"getpgid", lx_getpgid, 0, 1}, /* 132 */
- {"fchdir", lx_fchdir, 0, 1}, /* 133 */
- {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */
- {"sysfs", lx_sysfs, 0, 3}, /* 135 */
- {"personality", lx_personality, 0, 1}, /* 136 */
- {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */
- {"setfsuid16", lx_setfsuid16, 0, 1}, /* 138 */
- {"setfsgid16", lx_setfsgid16, 0, 1}, /* 139 */
- {"llseek", lx_llseek, 0, 5}, /* 140 */
- {"getdents", lx_getdents, 0, 3}, /* 141 */
- {"select", lx_select, 0, 5}, /* 142 */
- {"flock", lx_flock, 0, 2}, /* 143 */
- {"msync", lx_msync, 0, 3}, /* 144 */
- {"readv", lx_readv, 0, 3}, /* 145 */
- {"writev", lx_writev, 0, 3}, /* 146 */
- {"getsid", lx_getsid, 0, 1}, /* 147 */
- {"fdatasync", lx_fdatasync, 0, 1}, /* 148 */
- {"sysctl", lx_sysctl, 0, 1}, /* 149 */
- {"mlock", lx_mlock, 0, 2}, /* 150 */
- {"munlock", lx_munlock, 0, 2}, /* 151 */
- {"mlockall", lx_mlockall, 0, 1}, /* 152 */
- {"munlockall", lx_munlockall, 0, 0}, /* 153 */
- {"sched_setparam", lx_sched_setparam, 0, 2}, /* 154 */
- {"sched_getparam", lx_sched_getparam, 0, 2}, /* 155 */
- {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 156 */
- {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 157 */
- {"sched_yield", lx_yield, 0, 0}, /* 158 */
- {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 159 */
- {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 160 */
- {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 161 */
- {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */
- {"mremap", lx_remap, 0, 5}, /* 163 */
- {"setresuid16", LX_IKE(setresuid16), LX_SYS_IKE, 3}, /* 164 */
- {"getresuid16", lx_getresuid16, 0, 3}, /* 165 */
- {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */
- {"query_module", lx_query_module, NOSYS_KERNEL, 5}, /* 167 */
- {"poll", lx_poll, 0, 3}, /* 168 */
- {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */
- {"setresgid16", LX_IKE(setresgid16), LX_SYS_IKE, 3}, /* 170 */
- {"getresgid16", lx_getresgid16, 0, 3}, /* 171 */
- {"prctl", lx_prctl, 0, 5}, /* 172 */
- {"rt_sigreturn", lx_rt_sigreturn, 0, 0}, /* 173 */
- {"rt_sigaction", lx_rt_sigaction, 0, 4}, /* 174 */
- {"rt_sigprocmask", lx_rt_sigprocmask, 0, 4}, /* 175 */
- {"rt_sigpending", lx_rt_sigpending, 0, 2}, /* 176 */
- {"rt_sigtimedwait", lx_rt_sigtimedwait, 0, 4}, /* 177 */
- {"rt_sigqueueinfo", lx_rt_sigqueueinfo, 0, 3}, /* 178 */
- {"rt_sigsuspend", lx_rt_sigsuspend, 0, 2}, /* 179 */
- {"pread64", lx_pread64, 0, 5}, /* 180 */
- {"pwrite64", lx_pwrite64, 0, 5}, /* 181 */
- {"chown16", lx_chown16, 0, 3}, /* 182 */
- {"getcwd", lx_getcwd, 0, 2}, /* 183 */
- {"capget", lx_capget, 0, 2}, /* 184 */
- {"capset", lx_capset, 0, 2}, /* 185 */
- {"sigaltstack", lx_sigaltstack, 0, 2}, /* 186 */
- {"sendfile", lx_sendfile, 0, 4}, /* 187 */
- {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */
- {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */
- {"vfork", lx_vfork, 0, 0}, /* 190 */
- {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */
- {"mmap2", lx_mmap2, LX_SYS_EBPARG6, 6}, /* 192 */
- {"truncate64", lx_truncate64, 0, 3}, /* 193 */
- {"ftruncate64", lx_ftruncate64, 0, 3}, /* 194 */
- {"stat64", lx_stat64, 0, 2}, /* 195 */
- {"lstat64", lx_lstat64, 0, 2}, /* 196 */
- {"fstat64", lx_fstat64, 0, 2}, /* 197 */
- {"lchown", lx_lchown, 0, 3}, /* 198 */
- {"getuid", lx_getuid, 0, 0}, /* 199 */
- {"getgid", lx_getgid, 0, 0}, /* 200 */
- {"geteuid", lx_geteuid, 0, 0}, /* 201 */
- {"getegid", lx_getegid, 0, 0}, /* 202 */
- {"setreuid", lx_setreuid, 0, 0}, /* 203 */
- {"setregid", lx_setregid, 0, 0}, /* 204 */
- {"getgroups", lx_getgroups, 0, 2}, /* 205 */
- {"setgroups", lx_setgroups, 0, 2}, /* 206 */
- {"fchown", lx_fchown, 0, 3}, /* 207 */
- {"setresuid", LX_IKE(setresuid), LX_SYS_IKE, 3}, /* 208 */
- {"getresuid", lx_getresuid, 0, 3}, /* 209 */
- {"setresgid", LX_IKE(setresgid), LX_SYS_IKE, 3}, /* 210 */
- {"getresgid", lx_getresgid, 0, 3}, /* 211 */
- {"chown", lx_chown, 0, 3}, /* 212 */
- {"setuid", lx_setuid, 0, 1}, /* 213 */
- {"setgid", lx_setgid, 0, 1}, /* 214 */
- {"setfsuid", lx_setfsuid, 0, 1}, /* 215 */
- {"setfsgid", lx_setfsgid, 0, 1}, /* 216 */
- {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */
- {"mincore", lx_mincore, 0, 3}, /* 218 */
- {"madvise", lx_madvise, 0, 3}, /* 219 */
- {"getdents64", lx_getdents64, 0, 3}, /* 220 */
- {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */
- {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */
- {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */
- {"gettid", LX_IKE(gettid), LX_SYS_IKE, 0}, /* 224 */
- {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */
- {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 226 */
- {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 227 */
- {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 228 */
- {"getxattr", lx_xattr4, 0, 4}, /* 229 */
- {"lgetxattr", lx_xattr4, 0, 4}, /* 230 */
- {"fgetxattr", lx_xattr4, 0, 4}, /* 231 */
- {"listxattr", lx_xattr3, 0, 3}, /* 232 */
- {"llistxattr", lx_xattr3, 0, 3}, /* 233 */
- {"flistxattr", lx_xattr3, 0, 3}, /* 234 */
- {"removexattr", lx_xattr2, 0, 2}, /* 235 */
- {"lremovexattr", lx_xattr2, 0, 2}, /* 236 */
- {"fremovexattr", lx_xattr2, 0, 2}, /* 237 */
- {"tkill", LX_IKE(tkill), LX_SYS_IKE, 2}, /* 238 */
- {"sendfile64", lx_sendfile64, 0, 4}, /* 239 */
- {"futex", LX_IKE(futex), LX_SYS_IKE | LX_SYS_EBPARG6, 6}, /* 240 */
- {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 241 */
- {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 242 */
- {"set_thread_area", LX_IKE(set_thread_area), LX_SYS_IKE, 1}, /* 243 */
- {"get_thread_area", LX_IKE(get_thread_area), LX_SYS_IKE, 1}, /* 244 */
- {"io_setup", lx_io_setup, 0, 2}, /* 245 */
- {"io_destroy", lx_io_destroy, 0, 1}, /* 246 */
- {"io_getevents", lx_io_getevents, 0, 5}, /* 247 */
- {"io_submit", lx_io_submit, 0, 3}, /* 248 */
- {"io_cancel", lx_io_cancel, 0, 3}, /* 249 */
- {"fadvise64", lx_fadvise64, 0, 4}, /* 250 */
- {"nosys", NULL, 0, 0}, /* 251 */
- {"group_exit", lx_group_exit, 0, 1}, /* 252 */
- {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */
- {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */
- {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */
- {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */
- {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */
- {"set_tid_address", LX_IKE(set_tid_address), LX_SYS_IKE, 1}, /* 258 */
- {"timer_create", lx_timer_create, 0, 3}, /* 259 */
- {"timer_settime", lx_timer_settime, 0, 4}, /* 260 */
- {"timer_gettime", lx_timer_gettime, 0, 2}, /* 261 */
- {"timer_getoverrun", lx_timer_getoverrun, 0, 1}, /* 262 */
- {"timer_delete", lx_timer_delete, 0, 1}, /* 263 */
- {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */
- {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */
- {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */
- {"clock_nanosleep", lx_clock_nanosleep, 0, 4}, /* 267 */
- {"statfs64", lx_statfs64, 0, 2}, /* 268 */
- {"fstatfs64", lx_fstatfs64, 0, 2}, /* 269 */
- {"tgkill", LX_IKE(tgkill), LX_SYS_IKE, 3}, /* 270 */
-
- /* The following system calls only exist in kernel 2.6 and greater */
- {"utimes", lx_utimes, 0, 2}, /* 271 */
- {"fadvise64_64", lx_fadvise64_64, 0, 4}, /* 272 */
- {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */
- {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */
- {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */
- {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */
- {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */
- {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */
- {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */
- {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */
- {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */
- {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */
- {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */
- {"waitid", lx_waitid, 0, 4}, /* 284 */
- {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */
- {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */
- {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */
- {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */
- {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 289 */
- {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 290 */
- {"inotify_init", lx_inotify_init, 0, 0}, /* 291 */
- {"inotify_add_watch", lx_inotify_add_watch, 0, 3}, /* 292 */
- {"inotify_rm_watch", lx_inotify_rm_watch, 0, 2}, /* 293 */
- {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */
- {"openat", lx_openat, 0, 4}, /* 295 */
- {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */
- {"mknodat", lx_mknodat, 0, 4}, /* 297 */
- {"fchownat", lx_fchownat, 0, 5}, /* 298 */
- {"futimesat", lx_futimesat, 0, 3}, /* 299 */
- {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */
- {"unlinkat", lx_unlinkat, 0, 3}, /* 301 */
- {"renameat", lx_renameat, 0, 4}, /* 302 */
- {"linkat", lx_linkat, 0, 5}, /* 303 */
- {"symlinkat", lx_symlinkat, 0, 3}, /* 304 */
- {"readlinkat", lx_readlinkat, 0, 4}, /* 305 */
- {"fchmodat", lx_fchmodat, 0, 4}, /* 306 */
- {"faccessat", lx_faccessat, 0, 4}, /* 307 */
- {"pselect6", lx_pselect6, LX_SYS_EBPARG6, 6}, /* 308 */
- {"ppoll", lx_ppoll, 0, 5}, /* 309 */
- {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */
- {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 311 */
- {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 312 */
- {"splice", NULL, NOSYS_NULL, 0}, /* 313 */
- {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 314 */
- {"tee", NULL, NOSYS_NULL, 0}, /* 315 */
- {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */
- {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */
- {"getcpu", lx_getcpu, 0, 3}, /* 318 */
- {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */
- {"utimensat", lx_utimensat, 0, 4}, /* 320 */
- {"signalfd", NULL, NOSYS_NULL, 0}, /* 321 */
- {"timerfd_create", lx_timerfd_create, 0, 2}, /* 322 */
- {"eventfd", lx_eventfd, 0, 1}, /* 323 */
- {"fallocate", NULL, NOSYS_NULL, 0}, /* 324 */
- {"timerfd_settime", lx_timerfd_settime, 0, 4}, /* 325 */
- {"timerfd_gettime", lx_timerfd_gettime, 0, 2}, /* 326 */
- {"signalfd4", NULL, NOSYS_NULL, 0}, /* 327 */
- {"eventfd2", lx_eventfd2, 0, 2}, /* 328 */
- {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */
- {"dup3", lx_dup3, 0, 3}, /* 330 */
- {"pipe2", lx_pipe2, 0, 2}, /* 331 */
- {"inotify_init1", lx_inotify_init1, 0, 1}, /* 332 */
- {"preadv", lx_preadv, 0, 4}, /* 333 */
- {"pwritev", lx_pwritev, 0, 4}, /* 334 */
- {"rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 0, 4}, /* 335 */
- {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */
- {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 337 */
- {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */
- {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */
- {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */
- {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */
- {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */
- {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */
- {"syncfs", NULL, NOSYS_NULL, 0}, /* 344 */
- {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */
- {"setns", NULL, NOSYS_NULL, 0}, /* 346 */
- {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */
- {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */
- {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */
- {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */
- {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */
- {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */
+static lx_syscall_handler_t lx_handlers[] = {
+ NULL, /* 0: nosys */
+ lx_exit,
+ lx_fork,
+ NULL, /* 3: read */
+ NULL, /* 4: write */
+ lx_open,
+ lx_close,
+ NULL, /* 7: waitpid */
+ lx_creat,
+ lx_link,
+ lx_unlink,
+ lx_execve,
+ lx_chdir,
+ lx_time,
+ lx_mknod,
+ lx_chmod,
+ lx_lchown16,
+ NULL, /* 17: break */
+ NULL, /* 18: stat */
+ lx_lseek,
+ NULL, /* 20: getpid */
+ lx_mount,
+ lx_umount,
+ lx_setuid16,
+ lx_getuid16,
+ lx_stime,
+ lx_ptrace,
+ lx_alarm,
+ NULL, /* 28: fstat */
+ lx_pause,
+ lx_utime,
+ NULL, /* 31: stty */
+ NULL, /* 32: gtty */
+ lx_access,
+ lx_nice,
+ NULL, /* 35: ftime */
+ lx_sync,
+ NULL, /* 37: kill */
+ lx_rename,
+ lx_mkdir,
+ lx_rmdir,
+ lx_dup,
+ NULL, /* 42: pipe */
+ lx_times,
+ NULL, /* 44: prof */
+ NULL, /* 45: brk */
+ lx_setgid16,
+ lx_getgid16,
+ lx_signal,
+ lx_geteuid16,
+ lx_getegid16,
+ NULL, /* 51: acct */
+ lx_umount2,
+ NULL, /* 53: lock */
+ NULL, /* 54: ioctl */
+ lx_fcntl,
+ NULL, /* 56: mpx */
+ lx_setpgid,
+ NULL, /* 58: ulimit */
+ NULL, /* 59: olduname */
+ lx_umask,
+ lx_chroot,
+ NULL, /* 62: ustat */
+ lx_dup2,
+ NULL, /* 64: getppid */
+ lx_getpgrp,
+ lx_setsid,
+ lx_sigaction,
+ NULL, /* 68: sgetmask */
+ NULL, /* 69: ssetmask */
+ lx_setreuid16,
+ lx_setregid16,
+ lx_sigsuspend,
+ lx_sigpending,
+ lx_sethostname,
+ lx_setrlimit,
+ lx_oldgetrlimit,
+ lx_getrusage,
+ lx_gettimeofday,
+ lx_settimeofday,
+ lx_getgroups16,
+ lx_setgroups16,
+ NULL, /* 82: select */
+ lx_symlink,
+ NULL, /* 84: oldlstat */
+ lx_readlink,
+ NULL, /* 86: uselib */
+ NULL, /* 87: swapon */
+ lx_reboot,
+ lx_readdir,
+ lx_mmap,
+ lx_munmap,
+ lx_truncate,
+ lx_ftruncate,
+ lx_fchmod,
+ lx_fchown16,
+ lx_getpriority,
+ lx_setpriority,
+ NULL, /* 98: profil */
+ lx_statfs,
+ lx_fstatfs,
+ NULL, /* 101: ioperm */
+ lx_socketcall,
+ lx_syslog,
+ lx_setitimer,
+ lx_getitimer,
+ lx_stat,
+ lx_lstat,
+ lx_fstat,
+ NULL, /* 109: uname */
+ NULL, /* 110: oldiopl */
+ lx_vhangup,
+ NULL, /* 112: idle */
+ NULL, /* 113: vm86old */
+ NULL, /* 114: wait4 */
+ NULL, /* 115: swapoff */
+ NULL, /* 116: sysinfo */
+ lx_ipc,
+ lx_fsync,
+ lx_sigreturn,
+ lx_clone,
+ lx_setdomainname,
+ lx_uname,
+ NULL, /* 123: modify_ldt */
+ lx_adjtimex,
+ lx_mprotect,
+ lx_sigprocmask,
+ NULL, /* 127: create_module */
+ NULL, /* 128: init_module */
+ NULL, /* 129: delete_module */
+ NULL, /* 130: get_kernel_syms */
+ NULL, /* 131: quotactl */
+ lx_getpgid,
+ lx_fchdir,
+ NULL, /* 134: bdflush */
+ lx_sysfs,
+ lx_personality,
+ NULL, /* 137: afs_syscall */
+ lx_setfsuid16,
+ lx_setfsgid16,
+ lx_llseek,
+ lx_getdents,
+ lx_select,
+ lx_flock,
+ lx_msync,
+ lx_readv,
+ lx_writev,
+ lx_getsid,
+ lx_fdatasync,
+ lx_sysctl,
+ lx_mlock,
+ lx_munlock,
+ lx_mlockall,
+ lx_munlockall,
+ lx_sched_setparam,
+ lx_sched_getparam,
+ lx_sched_setscheduler,
+ lx_sched_getscheduler,
+ NULL, /* 158: sched_yield */
+ lx_sched_get_priority_max,
+ lx_sched_get_priority_min,
+ lx_sched_rr_get_interval,
+ lx_nanosleep,
+ lx_remap,
+ NULL, /* 164: setresuid16 */
+ lx_getresuid16,
+ NULL, /* 166: vm86 */
+ lx_query_module,
+ lx_poll,
+ NULL, /* 169: nfsservctl */
+ NULL, /* 170: setresgid16 */
+ lx_getresgid16,
+ lx_prctl,
+ lx_rt_sigreturn,
+ lx_rt_sigaction,
+ lx_rt_sigprocmask,
+ lx_rt_sigpending,
+ lx_rt_sigtimedwait,
+ lx_rt_sigqueueinfo,
+ lx_rt_sigsuspend,
+ lx_pread64,
+ lx_pwrite64,
+ lx_chown16,
+ lx_getcwd,
+ lx_capget,
+ lx_capset,
+ lx_sigaltstack,
+ lx_sendfile,
+ NULL, /* 188: getpmsg */
+ NULL, /* 189: putpmsg */
+ lx_vfork,
+ lx_getrlimit,
+ lx_mmap2,
+ lx_truncate64,
+ lx_ftruncate64,
+ lx_stat64,
+ lx_lstat64,
+ lx_fstat64,
+ lx_lchown,
+ lx_getuid,
+ lx_getgid,
+ lx_geteuid,
+ lx_getegid,
+ lx_setreuid,
+ lx_setregid,
+ lx_getgroups,
+ lx_setgroups,
+ lx_fchown,
+ NULL, /* 208: setresuid */
+ lx_getresuid,
+ NULL, /* 210: setresgid */
+ lx_getresgid,
+ lx_chown,
+ lx_setuid,
+ lx_setgid,
+ lx_setfsuid,
+ lx_setfsgid,
+ NULL, /* 217: pivot_root */
+ lx_mincore,
+ lx_madvise,
+ lx_getdents64,
+ lx_fcntl64,
+ NULL, /* 222: tux */
+ NULL, /* 223: security */
+ NULL, /* 224: gettid */
+ NULL, /* 225: readahead */
+ NULL, /* 226: setxattr */
+ NULL, /* 227: lsetxattr */
+ NULL, /* 228: fsetxattr */
+ NULL, /* 229: getxattr */
+ NULL, /* 230: lgetxattr */
+ NULL, /* 231: fgetxattr */
+ NULL, /* 232: listxattr */
+ NULL, /* 233: llistxattr */
+ NULL, /* 234: flistxattr */
+ NULL, /* 235: removexattr */
+ NULL, /* 236: lremovexattr */
+ NULL, /* 237: fremovexattr */
+ NULL, /* 238: tkill */
+ lx_sendfile64,
+ NULL, /* 240: futex */
+ lx_sched_setaffinity,
+ lx_sched_getaffinity,
+ NULL, /* 243: set_thread_area */
+ NULL, /* 244: get_thread_area */
+ NULL, /* 245: io_setup */
+ NULL, /* 246: io_destroy */
+ NULL, /* 247: io_getevents */
+ NULL, /* 248: io_submit */
+ NULL, /* 249: io_cancel */
+ lx_fadvise64,
+ NULL, /* 251: nosys */
+ lx_group_exit,
+ NULL, /* 253: lookup_dcookie */
+ lx_epoll_create,
+ lx_epoll_ctl,
+ lx_epoll_wait,
+ NULL, /* 257: remap_file_pages */
+ NULL, /* 258: set_tid_address */
+ lx_timer_create,
+ lx_timer_settime,
+ lx_timer_gettime,
+ lx_timer_getoverrun,
+ lx_timer_delete,
+ lx_clock_settime,
+ lx_clock_gettime,
+ lx_clock_getres,
+ lx_clock_nanosleep,
+ lx_statfs64,
+ lx_fstatfs64,
+ NULL, /* 270: tgkill */
+ lx_utimes,
+ lx_fadvise64_64,
+ NULL, /* 273: vserver */
+ NULL, /* 274: mbind */
+ NULL, /* 275: get_mempolicy */
+ NULL, /* 276: set_mempolicy */
+ NULL, /* 277: mq_open */
+ NULL, /* 278: mq_unlink */
+ NULL, /* 279: mq_timedsend */
+ NULL, /* 280: mq_timedreceive */
+ NULL, /* 281: mq_notify */
+ NULL, /* 282: mq_getsetattr */
+ NULL, /* 283: kexec_load */
+ NULL, /* 284: waitid */
+ NULL, /* 285: sys_setaltroot */
+ NULL, /* 286: add_key */
+ NULL, /* 287: request_key */
+ NULL, /* 288: keyctl */
+ NULL, /* 289: ioprio_set */
+ NULL, /* 290: ioprio_get */
+ lx_inotify_init,
+ lx_inotify_add_watch,
+ lx_inotify_rm_watch,
+ NULL, /* 294: migrate_pages */
+ lx_openat,
+ lx_mkdirat,
+ lx_mknodat,
+ lx_fchownat,
+ lx_futimesat,
+ lx_fstatat64,
+ lx_unlinkat,
+ lx_renameat,
+ lx_linkat,
+ lx_symlinkat,
+ lx_readlinkat,
+ lx_fchmodat,
+ lx_faccessat,
+ lx_pselect6,
+ lx_ppoll,
+ NULL, /* 310: unshare */
+ NULL, /* 311: set_robust_list */
+ NULL, /* 312: get_robust_list */
+ NULL, /* 313: splice */
+ NULL, /* 314: sync_file_range */
+ NULL, /* 315: tee */
+ NULL, /* 316: vmsplice */
+ NULL, /* 317: move_pages */
+ lx_getcpu,
+ lx_epoll_pwait,
+ lx_utimensat,
+ NULL, /* 321: signalfd */
+ lx_timerfd_create,
+ lx_eventfd,
+ NULL, /* 324: fallocate */
+ lx_timerfd_settime,
+ lx_timerfd_gettime,
+ NULL, /* 327: signalfd4 */
+ lx_eventfd2,
+ lx_epoll_create1,
+ lx_dup3,
+ NULL, /* 331: pipe2 */
+ lx_inotify_init1,
+ NULL, /* 333: preadv */
+ NULL, /* 334: pwritev */
+ lx_rt_tgsigqueueinfo,
+ NULL, /* 336: perf_event_open */
+ NULL, /* 337: recvmmsg */
+ NULL, /* 338: fanotify_init */
+ NULL, /* 339: fanotify_mark */
+ lx_prlimit64,
+ NULL, /* 341: name_to_handle_at */
+ NULL, /* 342: open_by_handle_at */
+ NULL, /* 343: clock_adjtime */
+ NULL, /* 344: syncfs */
+ NULL, /* 345: sendmmsg */
+ NULL, /* 346: setns */
+ NULL, /* 347: process_vm_readv */
+ NULL, /* 348: process_vm_writev */
+ NULL, /* 349: kcmp */
+ NULL, /* 350: finit_module */
+ NULL, /* 351: sched_setattr */
+ NULL, /* 352: sched_getattr */
+ NULL, /* 353: renameat2 */
+ NULL, /* 354: seccomp */
+ NULL, /* 355: getrandom */
+ NULL, /* 356: memfd_create */
+ NULL, /* 357: bpf */
+ NULL, /* 358: execveat */
};
#endif
diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d b/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d
index 2a07c00c7a..14326e8f56 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d
+++ b/usr/src/lib/brand/lx/lx_brand/common/lx_provider.d
@@ -10,14 +10,26 @@
*/
/*
- * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
provider lx {
probe debug(char *buf);
- probe sigdeliver(int sig, void *lx_sigaction, void *lx_sigstack,
- void *lx_ucontext);
+ probe sigdeliver(int sig, void *lx_sigaction, void *lx_sigstack);
probe sigreturn(void *lx_ucontext, void *ucontext, uintptr_t sp);
+
+ probe signal__delivery__frame__create(void *lx_sigdeliver_frame);
+ probe signal__delivery__frame__found(void *lx_sigdeliver_frame);
+ probe signal__delivery__frame__corrupt(void *lx_sigdeliver_frame);
+
+ probe signal__post__handler(uintptr_t old_sp, uintptr_t new_sp);
+
+ probe signal__altstack__enable(uintptr_t alt_sp);
+ probe signal__altstack__disable();
+
+ probe emulate__enter(void *ucp, int syscall_num, uintptr_t *args);
+ probe emulate__return(void *ucp, int syscall_num, uintptr_t ret,
+ uintptr_t errn);
};
#pragma D attributes Evolving/Evolving/ISA provider lx provider
diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c b/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c
index 02bfe48e01..08e77572ab 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/lx_thunk_server.c
@@ -22,7 +22,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/*
@@ -262,7 +262,6 @@ static cond_t lxt_req_cv = DEFAULTCV;
static lxt_req_t *lxt_req_ptr = NULL;
static mutex_t lxt_pid_lock = DEFAULTMUTEX;
-static pid_t lxt_pid = NULL;
/*
* Interfaces used to call from lx_brand.so into Linux code.
@@ -370,26 +369,26 @@ lx_call(lx_handle_sym_t lx_ch, uintptr_t p1, uintptr_t p2,
{
typedef uintptr_t (*fp8_t)(uintptr_t, uintptr_t, uintptr_t,
uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
- lx_regs_t *rp;
+ ucontext_t *ucp;
uintptr_t ret;
fp8_t lx_funcp = (fp8_t)lx_ch;
#if defined(_ILP32)
long cur_gs;
#endif
- rp = lx_syscall_regs();
+ ucp = lx_syscall_regs();
#if defined(_ILP32)
- lx_debug("lx_call: loading Linux gs, rp = 0x%p, gs = 0x%p",
- rp, rp->lxr_gs);
- lx_swap_gs(rp->lxr_gs, &cur_gs);
+ lx_debug("lx_call: loading Linux gs, ucp = 0x%p, gs = 0x%p",
+ ucp, LX_REG(ucp, GS));
+ lx_swap_gs(LX_REG(ucp, GS), &cur_gs);
#endif
lx_debug("lx_call: calling to Linux code at 0x%p", lx_ch);
ret = lx_funcp(p1, p2, p3, p4, p5, p6, p7, p8);
#if defined(_ILP32)
- lx_swap_gs(cur_gs, &rp->lxr_gs);
+ lx_swap_gs(cur_gs, (long *)&LX_REG(ucp, GS));
#endif
lx_debug("lx_call: returned from Linux code at 0x%p (%p)", lx_ch, ret);
@@ -725,7 +724,7 @@ lxt_server_syslog(lxt_server_arg_t *request, size_t request_size,
* We do this by telling our getpid() system call to return a
* different value.
*/
- lxt_pid = data->lxt_sl_pid;
+ (void) syscall(SYS_brand, B_SET_THUNK_PID, data->lxt_sl_pid);
/*
* Ensure the message has the correct program name.
@@ -750,7 +749,7 @@ lxt_server_syslog(lxt_server_arg_t *request, size_t request_size,
/* Restore pid and program name. */
(void) uucopy(&progname_ptr_old,
lxt_handles[LXTH_PROGNAME].lxth_handle, sizeof (char *));
- lxt_pid = NULL;
+ (void) syscall(SYS_brand, B_SET_THUNK_PID, 0);
(void) mutex_unlock(&lxt_pid_lock);
@@ -1022,12 +1021,3 @@ lxt_server_init(int argc, char *argv[])
lxt_server_processes = 1;
lx_debug("lx_thunk server detected, delaying initalization");
}
-
-int
-lxt_server_pid(int *pid)
-{
- if (lxt_server_processes == 0)
- return (0);
- *pid = lxt_pid;
- return (1);
-}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/mem.c b/usr/src/lib/brand/lx/lx_brand/common/mem.c
index 416596ae88..d5a8b14bef 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/mem.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/mem.c
@@ -21,7 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <errno.h>
@@ -112,6 +112,21 @@ mmap_common(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
fd = -1;
/*
+ * We refuse, as a matter of principle, to overcommit memory.
+ * Unfortunately, several bits of important and popular software expect
+ * to be able to pre-allocate large amounts of virtual memory but then
+ * probably never use it. One particularly bad example of this
+ * practice is golang.
+ *
+ * In the interest of running software, unsafe or not, we fudge
+ * something vaguely similar to overcommit by permanently enabling
+ * MAP_NORESERVE unless MAP_LOCKED was requested:
+ */
+ if (!(flags & LX_MAP_LOCKED)) {
+ flags |= LX_MAP_NORESERVE;
+ }
+
+ /*
* This is totally insane. The NOTES section in the linux mmap(2) man
* page claims that on some architectures, read protection may
* automatically include exec protection. It has been observed on a
diff --git a/usr/src/lib/brand/lx/lx_brand/common/misc.c b/usr/src/lib/brand/lx/lx_brand/common/misc.c
index 5b71b43bf1..7e16fb717e 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/misc.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/misc.c
@@ -24,6 +24,7 @@
* Copyright 2015 Joyent, Inc. All rights reserved.
*/
+#include <stdlib.h>
#include <assert.h>
#include <alloca.h>
#include <errno.h>
@@ -60,40 +61,6 @@
extern int sethostname(char *, int);
-struct lx_sysinfo {
- int64_t si_uptime; /* Seconds since boot */
- uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
- uint64_t si_totalram; /* Total memory size */
- uint64_t si_freeram; /* Available memory */
- uint64_t si_sharedram; /* Shared memory */
- uint64_t si_bufferram; /* Buffer memory */
- uint64_t si_totalswap; /* Total swap space */
- uint64_t si_freeswap; /* Avail swap space */
- uint16_t si_procs; /* Process count */
- uint16_t si_pad; /* Padding */
- uint64_t si_totalhigh; /* High memory size */
- uint64_t si_freehigh; /* Avail high memory */
- uint32_t si_mem_unit; /* Unit size of memory fields */
-};
-
-struct lx_sysinfo32 {
- int32_t si_uptime; /* Seconds since boot */
- uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
- uint32_t si_totalram; /* Total memory size */
- uint32_t si_freeram; /* Available memory */
- uint32_t si_sharedram; /* Shared memory */
- uint32_t si_bufferram; /* Buffer memory */
- uint32_t si_totalswap; /* Total swap space */
- uint32_t si_freeswap; /* Avail swap space */
- uint16_t si_procs; /* Process count */
- uint16_t si_pad; /* Padding */
- uint32_t si_totalhigh; /* High memory size */
- uint32_t si_freehigh; /* Avail high memory */
- uint32_t si_mem_unit; /* Unit size of memory fields */
-};
-
-extern long lx_sysinfo(struct lx_sysinfo *sip);
-
/* ARGUSED */
long
lx_rename(uintptr_t p1, uintptr_t p2)
@@ -284,7 +251,7 @@ lx_uname(uintptr_t p1)
/*
* {get,set}groups16() - Handle the conversion between 16-bit Linux gids and
- * 32-bit Solaris gids.
+ * 32-bit illumos gids.
*/
long
lx_getgroups16(uintptr_t p1, uintptr_t p2)
@@ -298,11 +265,15 @@ lx_getgroups16(uintptr_t p1, uintptr_t p2)
if (count < 0)
return (-EINVAL);
- grouplist32 = SAFE_ALLOCA(count * sizeof (gid_t));
- if (grouplist32 == NULL && count > 0)
+ grouplist32 = malloc(count * sizeof (gid_t));
+ if (grouplist32 == NULL && count > 0) {
+ free(grouplist32);
return (-ENOMEM);
- if ((ret = getgroups(count, grouplist32)) < 0)
+ }
+ if ((ret = getgroups(count, grouplist32)) < 0) {
+ free(grouplist32);
return (-errno);
+ }
/* we must not modify the list if the incoming count was 0 */
if (count > 0) {
@@ -310,28 +281,48 @@ lx_getgroups16(uintptr_t p1, uintptr_t p2)
grouplist[i] = LX_GID32_TO_GID16(grouplist32[i]);
}
+ free(grouplist32);
return (ret);
}
long
lx_setgroups16(uintptr_t p1, uintptr_t p2)
{
+ long rv;
int count = (int)p1;
- lx_gid16_t *grouplist = (lx_gid16_t *)p2;
- gid_t *grouplist32;
+ lx_gid16_t *grouplist = NULL;
+ gid_t *grouplist32 = NULL;
int i;
- grouplist32 = SAFE_ALLOCA(count * sizeof (gid_t));
- if (grouplist32 == NULL)
+ if ((grouplist = malloc(count * sizeof (lx_gid16_t))) == NULL) {
+ return (-ENOMEM);
+ }
+ if (uucopy((void *)p2, grouplist, count * sizeof (lx_gid16_t)) != 0) {
+ free(grouplist);
+ return (-EFAULT);
+ }
+
+ grouplist32 = malloc(count * sizeof (gid_t));
+ if (grouplist32 == NULL) {
+ free(grouplist);
return (-ENOMEM);
+ }
for (i = 0; i < count; i++)
grouplist32[i] = LX_GID16_TO_GID32(grouplist[i]);
/* order matters here to get the correct errno back */
- if (count > NGROUPS_MAX_DEFAULT)
+ if (count > NGROUPS_MAX_DEFAULT) {
+ free(grouplist);
+ free(grouplist32);
return (-EINVAL);
+ }
- return (setgroups(count, grouplist32) ? -errno : 0);
+ rv = setgroups(count, grouplist32);
+
+ free(grouplist);
+ free(grouplist32);
+
+ return (rv != 0 ? -errno : 0);
}
/*
@@ -440,10 +431,10 @@ lx_mknod(uintptr_t p1, uintptr_t p2, uintptr_t p3)
*
* Most programmers aren't even aware you can do this.
*
- * Note you can also do this via Solaris' mknod(2), but
+ * Note you can also do this via illumos' mknod(2), but
* Linux allows anyone who can create a UNIX domain
* socket via bind(2) to create one via mknod(2);
- * Solaris requires the caller to be privileged.
+ * illumos requires the caller to be privileged.
*/
if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
return (-errno);
@@ -524,19 +515,6 @@ lx_setdomainname(uintptr_t p1, uintptr_t p2)
}
long
-lx_getpid(void)
-{
- int pid;
-
- /* First call the thunk server hook. */
- if (lxt_server_pid(&pid) != 0)
- return (pid);
-
- pid = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_getpid);
- return ((pid == -1) ? -errno : pid);
-}
-
-long
lx_execve(uintptr_t p1, uintptr_t p2, uintptr_t p3)
{
char *filename = (char *)p1;
@@ -595,15 +573,17 @@ lx_setgroups(uintptr_t p1, uintptr_t p2)
lx_debug("\tlx_setgroups(%d, 0x%p", ng, p2);
if (ng > 0) {
- if ((glist = (gid_t *)SAFE_ALLOCA(ng * sizeof (gid_t))) == NULL)
+ if ((glist = (gid_t *)malloc(ng * sizeof (gid_t))) == NULL)
return (-ENOMEM);
- if (uucopy((void *)p2, glist, ng * sizeof (gid_t)) != 0)
+ if (uucopy((void *)p2, glist, ng * sizeof (gid_t)) != 0) {
+ free(glist);
return (-errno);
+ }
/*
* Linux doesn't check the validity of the group IDs, but
- * Solaris does. Change any invalid group IDs to a known, valid
+ * illumos does. Change any invalid group IDs to a known, valid
* value (yuck).
*/
for (i = 0; i < ng; i++) {
@@ -613,12 +593,14 @@ lx_setgroups(uintptr_t p1, uintptr_t p2)
}
/* order matters here to get the correct errno back */
- if (ng > NGROUPS_MAX_DEFAULT)
+ if (ng > NGROUPS_MAX_DEFAULT) {
+ free(glist);
return (-EINVAL);
+ }
- r = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_setgroups,
- ng, glist);
+ r = syscall(SYS_brand, B_HELPER_SETGROUPS, ng, glist);
+ free(glist);
return ((r == -1) ? -errno : r);
}
@@ -712,29 +694,6 @@ lx_prctl(int option, uintptr_t arg2, uintptr_t arg3,
return (0);
}
-#if defined(_LP64)
-long
-lx_arch_prctl(int code, uintptr_t addr)
-{
- long rv;
- int ret;
- lx_tsd_t *lx_tsd;
-
- rv = syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_arch_prctl, code, addr);
-
- if (code == LX_ARCH_SET_FS && rv == 0) {
- /* Track lx fsbase for debugging purposes */
- if ((ret = thr_getspecific(lx_tsd_key,
- (void **)&lx_tsd)) != 0) {
- lx_err_fatal("arch_prctl: unable to read TSD: %s",
- strerror(ret));
- }
- lx_tsd->lxtsd_fsbase = addr;
- }
- return ((rv == 0) ? 0 : -errno);
-}
-#endif
-
/*
* For syslog(), as there is no kernel and nothing to log, we simply emulate a
* kernel cyclic buffer (LOG_BUF_LEN) of 0 bytes, only handling errors for bad
@@ -759,45 +718,6 @@ lx_syslog(int type, char *bufp, int len)
return (0);
}
-long
-lx_sysinfo32(uintptr_t arg)
-{
- struct lx_sysinfo32 *sip = (struct lx_sysinfo32 *)arg;
- struct lx_sysinfo32 si;
- struct lx_sysinfo sil;
- int i;
-
- if (syscall(SYS_brand, B_IKE_SYSCALL + LX_EMUL_sysinfo, &sil) != 0)
- return (-errno);
-
- si.si_uptime = sil.si_uptime;
-
- for (i = 0; i < 3; i++) {
- if ((sil.si_loads[i]) > 0x7fffffff)
- si.si_loads[i] = 0x7fffffff;
- else
- si.si_loads[i] = sil.si_loads[i];
- }
-
- si.si_procs = sil.si_procs;
- si.si_totalram = sil.si_totalram;
- si.si_freeram = sil.si_freeram;
- si.si_totalswap = sil.si_totalswap;
- si.si_freeswap = sil.si_freeswap;
- si.si_mem_unit = sil.si_mem_unit;
-
- si.si_bufferram = sil.si_bufferram;
- si.si_sharedram = sil.si_sharedram;
-
- si.si_totalhigh = sil.si_totalhigh;
- si.si_freehigh = sil.si_freehigh;
-
- if (uucopy(&si, sip, sizeof (si)) != 0)
- return (-errno);
-
- return (0);
-}
-
/*
* The following are pass-through functions but we need to return the correct
* long so that the errno propagates back to the Linux code correctly.
@@ -1160,23 +1080,6 @@ lx_utimes(const char *path, const struct timeval times[2])
}
long
-lx_write(int fildes, const void *buf, size_t nbyte)
-{
- int r;
-
- r = write(fildes, buf, nbyte);
- return ((r == -1) ? -errno : r);
-}
-
-long
-lx_yield(void)
-{
-
- yield();
- return (0);
-}
-
-long
lx_vhangup(void)
{
if (geteuid() != 0)
diff --git a/usr/src/lib/brand/lx/lx_brand/common/poll_select.c b/usr/src/lib/brand/lx/lx_brand/common/poll_select.c
index 4fa63e677c..1dce9b278d 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/poll_select.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/poll_select.c
@@ -70,21 +70,21 @@ lx_select(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
if (nfds > 0) {
if (p2 != NULL) {
- rfdsp = SAFE_ALLOCA(fd_set_len);
+ rfdsp = alloca(fd_set_len);
if (rfdsp == NULL)
return (-ENOMEM);
if (uucopy((void *)p2, rfdsp, fd_set_len) != 0)
return (-errno);
}
if (p3 != NULL) {
- wfdsp = SAFE_ALLOCA(fd_set_len);
+ wfdsp = alloca(fd_set_len);
if (wfdsp == NULL)
return (-ENOMEM);
if (uucopy((void *)p3, wfdsp, fd_set_len) != 0)
return (-errno);
}
if (p4 != NULL) {
- efdsp = SAFE_ALLOCA(fd_set_len);
+ efdsp = alloca(fd_set_len);
if (efdsp == NULL)
return (-ENOMEM);
if (uucopy((void *)p4, efdsp, fd_set_len) != 0)
@@ -165,7 +165,7 @@ lx_poll(uintptr_t p1, uintptr_t p2, uintptr_t p3)
* structures are identical. Copy in the linux poll structure.
*/
fds_size = sizeof (struct pollfd) * nfds;
- lfds = (struct pollfd *)SAFE_ALLOCA(fds_size);
+ lfds = (struct pollfd *)alloca(fds_size);
if (lfds == NULL)
return (-ENOMEM);
if (uucopy((void *)p1, lfds, fds_size) != 0)
@@ -175,7 +175,7 @@ lx_poll(uintptr_t p1, uintptr_t p2, uintptr_t p3)
* The poll system call modifies the poll structures passed in
* so we'll need to make an extra copy of them.
*/
- sfds = (struct pollfd *)SAFE_ALLOCA(fds_size);
+ sfds = (struct pollfd *)alloca(fds_size);
if (sfds == NULL)
return (-ENOMEM);
diff --git a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
index 174dbe8c19..65fe303835 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
@@ -59,13 +59,6 @@
* detail.
*/
-/* execve syscall numbers for 64-bit vs. 32-bit */
-#if defined(_LP64)
-#define LX_SYS_execve 59
-#else
-#define LX_SYS_execve 11
-#endif
-
/*
* This corresponds to the user_i387_struct Linux structure.
*/
@@ -99,61 +92,6 @@ typedef struct lx_user_fpxregs {
long lxux_padding[56];
} lx_user_fpxregs_t;
-/*
- * This corresponds to the user_regs_struct Linux structure.
- */
-#if defined(_LP64)
-typedef struct lx_user_regs {
- long lxur_r15;
- long lxur_r14;
- long lxur_r13;
- long lxur_r12;
- long lxur_rbp;
- long lxur_rbx;
- long lxur_r11;
- long lxur_r10;
- long lxur_r9;
- long lxur_r8;
- long lxur_rax;
- long lxur_rcx;
- long lxur_rdx;
- long lxur_rsi;
- long lxur_rdi;
- long lxur_orig_rax;
- long lxur_rip;
- long lxur_xcs;
- long lxur_rflags;
- long lxur_rsp;
- long lxur_xss;
- long lxur_xfs_base;
- long lxur_xgs_base;
- long lxur_xds;
- long lxur_xes;
- long lxur_xfs;
- long lxur_xgs;
-} lx_user_regs_t;
-#else
-typedef struct lx_user_regs {
- long lxur_ebx;
- long lxur_ecx;
- long lxur_edx;
- long lxur_esi;
- long lxur_edi;
- long lxur_ebp;
- long lxur_eax;
- long lxur_xds;
- long lxur_xes;
- long lxur_xfs;
- long lxur_xgs;
- long lxur_orig_eax;
- long lxur_eip;
- long lxur_xcs;
- long lxur_eflags;
- long lxur_esp;
- long lxur_xss;
-} lx_user_regs_t;
-#endif
-
typedef struct lx_user {
lx_user_regs_t lxu_regs;
int lxu_fpvalid;
@@ -242,336 +180,6 @@ get_lwpstatus(pid_t pid, lwpid_t lwpid, lwpstatus_t *lsp)
return (0);
}
-static uintptr_t
-syscall_regs(int fd, uintptr_t fp, pid_t pid)
-{
- uintptr_t addr, done;
- struct frame fr;
- auxv_t auxv;
- int afd;
-#if defined(_LP64)
- Elf64_Phdr phdr;
-#elif defined(_ILP32)
- Elf32_Phdr phdr;
-#endif
-
- /*
- * Try to walk the stack looking for a return address that corresponds
- * to the traced process's lx_emulate_done symbol. This relies on the
- * fact that the brand library in the traced process is the same as the
- * brand library in this process (indeed, this is true of all processes
- * in a given branded zone).
- */
-
- /*
- * Find the base address for the brand library in the traced process
- * by grabbing the AT_PHDR auxv entry, reading in the program header
- * at that location and subtracting off the p_vaddr member. We use
- * this to compute the location of lx_emulate done in the traced
- * process.
- */
- if ((afd = open_procfile(pid, O_RDONLY, "auxv")) < 0)
- return (0);
-
- do {
- if (read(afd, &auxv, sizeof (auxv)) != sizeof (auxv)) {
- (void) close(afd);
- return (0);
- }
- } while (auxv.a_type != AT_PHDR);
-
- (void) close(afd);
-
- if (pread(fd, &phdr, sizeof (phdr), auxv.a_un.a_val) != sizeof (phdr)) {
- lx_debug("failed to read brand library's phdr");
- return (0);
- }
-
- addr = auxv.a_un.a_val - phdr.p_vaddr;
- done = (uintptr_t)&lx_emulate_done - (uintptr_t)&_START_ + addr;
-
- fr.fr_savfp = fp;
-
- do {
- addr = fr.fr_savfp;
- if (pread(fd, &fr, sizeof (fr), addr) != sizeof (fr)) {
- lx_debug("ptrace read failed for stack walk");
- return (0);
- }
-
- if (addr >= fr.fr_savfp) {
- lx_debug("ptrace stack not monotonically increasing "
- "%p %p (%p)", addr, fr.fr_savfp, done);
- return (0);
- }
- } while (fr.fr_savpc != done);
-
- /*
- * The first argument to lx_emulate is known to be an lx_regs_t
- * structure and the ABI specifies that it will be placed on the stack
- * immediately preceeding the return address.
- */
- addr += sizeof (fr);
-
- /*
- * On i386 we need to perform an additional read as we used the stack
- * to pass the argument to lx_emulate. On amd64 we passed the argument
- * in %rdi so addr already contains the correct address.
- */
-#if defined(_ILP32)
- if (pread(fd, &addr, sizeof (addr), addr) != sizeof (addr)) {
- lx_debug("ptrace stack failed to read register set address");
- return (0);
- }
-#endif
-
- return (addr);
-}
-
-static int
-getregs(pid_t pid, lwpid_t lwpid, lx_user_regs_t *rp)
-{
- lwpstatus_t status;
- uintptr_t addr;
- int fd, ret;
-
- if ((ret = get_lwpstatus(pid, lwpid, &status)) != 0)
- return (ret);
-
- if ((fd = open_procfile(pid, O_RDONLY, "as")) < 0)
- return (-ESRCH);
-
- /*
- * If we find the syscall regs (and are therefore in an emulated
- * syscall, use the register set at given address. Otherwise, use the
- * registers as reported by /proc.
- */
- if ((addr = syscall_regs(fd, status.pr_reg[REG_FP], pid)) != 0) {
- lx_regs_t regs;
-
- if (pread(fd, &regs, sizeof (regs), addr) != sizeof (regs)) {
- (void) close(fd);
- lx_debug("ptrace failed to read register set");
- return (-EIO);
- }
-
- (void) close(fd);
-
-#if defined(_LP64)
- rp->lxur_r15 = regs.lxr_r15;
- rp->lxur_r14 = regs.lxr_r14;
- rp->lxur_r13 = regs.lxr_r13;
- rp->lxur_r12 = regs.lxr_r12;
- rp->lxur_rbp = regs.lxr_rbp;
- rp->lxur_rbx = regs.lxr_rbx;
- rp->lxur_r11 = regs.lxr_r11;
- rp->lxur_r10 = regs.lxr_r10;
- rp->lxur_r9 = regs.lxr_r9;
- rp->lxur_r8 = regs.lxr_r8;
- rp->lxur_rax = regs.lxr_rax;
- rp->lxur_rcx = regs.lxr_rcx;
- rp->lxur_rdx = regs.lxr_rdx;
- rp->lxur_rsi = regs.lxr_rsi;
- rp->lxur_rdi = regs.lxr_rdi;
- rp->lxur_orig_rax = regs.lxr_orig_rax;
- rp->lxur_rip = regs.lxr_rip;
- rp->lxur_xcs = status.pr_reg[REG_CS];
- rp->lxur_rflags = status.pr_reg[REG_RFL];
- rp->lxur_rsp = regs.lxr_rsp;
- rp->lxur_xss = status.pr_reg[REG_SS];
- rp->lxur_xfs_base = status.pr_reg[REG_FSBASE];
- rp->lxur_xgs_base = status.pr_reg[REG_GSBASE];
- rp->lxur_xds = status.pr_reg[REG_DS];
- rp->lxur_xes = status.pr_reg[REG_ES];
- rp->lxur_xfs = regs.lxr_fs;
- rp->lxur_xgs = status.pr_reg[REG_GS];
-#elif defined(_ILP32)
- rp->lxur_ebx = regs.lxr_ebx;
- rp->lxur_ecx = regs.lxr_ecx;
- rp->lxur_edx = regs.lxr_edx;
- rp->lxur_esi = regs.lxr_esi;
- rp->lxur_edi = regs.lxr_edi;
- rp->lxur_ebp = regs.lxr_ebp;
- rp->lxur_eax = regs.lxr_eax;
- rp->lxur_xds = status.pr_reg[DS];
- rp->lxur_xes = status.pr_reg[ES];
- rp->lxur_xfs = status.pr_reg[FS];
- rp->lxur_xgs = regs.lxr_gs;
- rp->lxur_orig_eax = regs.lxr_orig_eax;
- rp->lxur_eip = regs.lxr_eip;
- rp->lxur_xcs = status.pr_reg[CS];
- rp->lxur_eflags = status.pr_reg[EFL];
- rp->lxur_esp = regs.lxr_esp;
- rp->lxur_xss = status.pr_reg[SS];
-#endif
-
- } else {
- (void) close(fd);
-
-#if defined(_LP64)
- rp->lxur_r15 = status.pr_reg[REG_R15];
- rp->lxur_r14 = status.pr_reg[REG_R14];
- rp->lxur_r13 = status.pr_reg[REG_R13];
- rp->lxur_r12 = status.pr_reg[REG_R12];
- rp->lxur_rbp = status.pr_reg[REG_RBP];
- rp->lxur_rbx = status.pr_reg[REG_RBX];
- rp->lxur_r11 = status.pr_reg[REG_R11];
- rp->lxur_r10 = status.pr_reg[REG_R10];
- rp->lxur_r9 = status.pr_reg[REG_R9];
- rp->lxur_r8 = status.pr_reg[REG_R8];
- rp->lxur_rax = status.pr_reg[REG_RAX];
- rp->lxur_rcx = status.pr_reg[REG_RCX];
- rp->lxur_rdx = status.pr_reg[REG_RDX];
- rp->lxur_rsi = status.pr_reg[REG_RSI];
- rp->lxur_rdi = status.pr_reg[REG_RDI];
- rp->lxur_orig_rax = 0;
- rp->lxur_rip = status.pr_reg[REG_RIP];
- rp->lxur_xcs = status.pr_reg[REG_CS];
- rp->lxur_rflags = status.pr_reg[REG_RFL];
- rp->lxur_rsp = status.pr_reg[REG_RSP];
- rp->lxur_xss = status.pr_reg[REG_SS];
- rp->lxur_xfs = status.pr_reg[REG_FSBASE];
- rp->lxur_xgs = status.pr_reg[REG_GSBASE];
- rp->lxur_xds = status.pr_reg[REG_DS];
- rp->lxur_xes = status.pr_reg[REG_ES];
- rp->lxur_xfs = status.pr_reg[REG_FSBASE];
- rp->lxur_xgs = status.pr_reg[REG_GSBASE];
-#elif defined(_ILP32)
- rp->lxur_ebx = status.pr_reg[EBX];
- rp->lxur_ecx = status.pr_reg[ECX];
- rp->lxur_edx = status.pr_reg[EDX];
- rp->lxur_esi = status.pr_reg[ESI];
- rp->lxur_edi = status.pr_reg[EDI];
- rp->lxur_ebp = status.pr_reg[EBP];
- rp->lxur_eax = status.pr_reg[EAX];
- rp->lxur_xds = status.pr_reg[DS];
- rp->lxur_xes = status.pr_reg[ES];
- rp->lxur_xfs = status.pr_reg[FS];
- rp->lxur_xgs = status.pr_reg[GS];
- rp->lxur_orig_eax = 0;
- rp->lxur_eip = status.pr_reg[EIP];
- rp->lxur_xcs = status.pr_reg[CS];
- rp->lxur_eflags = status.pr_reg[EFL];
- rp->lxur_esp = status.pr_reg[UESP];
- rp->lxur_xss = status.pr_reg[SS];
-#endif
-
- /*
- * If the target process has just returned from exec, it's not
- * going to be sitting in the emulation function. In that case
- * we need to manually fake up the values for %eax and orig_eax
- * to indicate a successful return and that the traced process
- * had called execve (respectively).
- */
- if (status.pr_why == PR_SYSEXIT &&
- status.pr_what == SYS_execve) {
-#if defined(_LP64)
- rp->lxur_rax = 0;
- rp->lxur_orig_rax = LX_SYS_execve;
-#elif defined(_ILP32)
- rp->lxur_eax = 0;
- rp->lxur_orig_eax = LX_SYS_execve;
-#endif
- }
- }
-
- return (0);
-}
-
-static int
-setregs(pid_t pid, lwpid_t lwpid, const lx_user_regs_t *rp)
-{
- long ctl[1 + sizeof (prgregset_t) / sizeof (long)];
- lwpstatus_t status;
- uintptr_t addr;
- int fd, ret;
-
- if ((ret = get_lwpstatus(pid, lwpid, &status)) != 0)
- return (ret);
-
- if ((fd = open_procfile(pid, O_RDWR, "as")) < 0)
- return (-ESRCH);
-
- /*
- * If we find the syscall regs (and are therefore in an emulated
- * syscall, modify the register set at given address and set the
- * remaining registers through the /proc interface. Otherwise just use
- * the /proc interface to set register values;
- */
- if ((addr = syscall_regs(fd, status.pr_reg[REG_FP], pid)) != 0) {
-#if defined(_ILP32)
- lx_regs_t regs;
-
- regs.lxr_ebx = rp->lxur_ebx;
- regs.lxr_ecx = rp->lxur_ecx;
- regs.lxr_edx = rp->lxur_edx;
- regs.lxr_esi = rp->lxur_esi;
- regs.lxr_edi = rp->lxur_edi;
- regs.lxr_ebp = rp->lxur_ebp;
- regs.lxr_eax = rp->lxur_eax;
- regs.lxr_gs = rp->lxur_xgs;
- regs.lxr_orig_eax = rp->lxur_orig_eax;
- regs.lxr_eip = rp->lxur_eip;
- regs.lxr_esp = rp->lxur_esp;
-
- if (pwrite(fd, &regs, sizeof (regs), addr) != sizeof (regs)) {
- (void) close(fd);
- lx_debug("ptrace failed to write register set");
- return (-EIO);
- }
-#endif
-
- (void) close(fd);
-
-#if defined(_ILP32)
- status.pr_reg[DS] = rp->lxur_xds;
- status.pr_reg[ES] = rp->lxur_xes;
- status.pr_reg[FS] = rp->lxur_xfs;
- status.pr_reg[CS] = rp->lxur_xcs;
- status.pr_reg[EFL] = rp->lxur_eflags;
- status.pr_reg[SS] = rp->lxur_xss;
-#endif
-
- } else {
- (void) close(fd);
-
-#if defined(_ILP32)
- status.pr_reg[EBX] = rp->lxur_ebx;
- status.pr_reg[ECX] = rp->lxur_ecx;
- status.pr_reg[EDX] = rp->lxur_edx;
- status.pr_reg[ESI] = rp->lxur_esi;
- status.pr_reg[EDI] = rp->lxur_edi;
- status.pr_reg[EBP] = rp->lxur_ebp;
- status.pr_reg[EAX] = rp->lxur_eax;
- status.pr_reg[DS] = rp->lxur_xds;
- status.pr_reg[ES] = rp->lxur_xes;
- status.pr_reg[FS] = rp->lxur_xfs;
- status.pr_reg[GS] = rp->lxur_xgs;
- status.pr_reg[EIP] = rp->lxur_eip;
- status.pr_reg[CS] = rp->lxur_xcs;
- status.pr_reg[EFL] = rp->lxur_eflags;
- status.pr_reg[UESP] = rp->lxur_esp;
- status.pr_reg[SS] = rp->lxur_xss;
- status.pr_reg[SS] = rp->lxur_xss;
-#endif
- }
-
- if ((fd = open_lwpfile(pid, lwpid, O_WRONLY, "lwpctl")) < 0)
- return (-ESRCH);
-
- ctl[0] = PCSREG;
- bcopy(status.pr_reg, &ctl[1], sizeof (prgregset_t));
-
- if (write(fd, &ctl, sizeof (ctl)) != sizeof (ctl)) {
- (void) close(fd);
- return (-EIO);
- }
-
- (void) close(fd);
-
- return (0);
-}
-
static int
getfpregs(pid_t pid, lwpid_t lwpid, lx_user_fpregs_t *rp)
{
@@ -904,7 +512,7 @@ ptrace_peek(pid_t pid, uintptr_t addr, long *ret)
(offsetof(lx_user_t, m) + sizeof (((lx_user_t *)NULL)->m))
static int
-ptrace_peek_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret)
+ptrace_peek_user(pid_t lxpid, pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret)
{
int err, data;
uintptr_t *debugreg;
@@ -919,8 +527,10 @@ ptrace_peek_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int *ret)
if (off < LX_USER_BOUND(lxu_regs)) {
lx_user_regs_t regs;
- if ((err = getregs(pid, lwpid, &regs)) != 0)
+ if ((err = lx_ptrace_kernel(LX_PTRACE_GETREGS, lxpid, NULL,
+ (uintptr_t)&regs)) != 0) {
return (err);
+ }
data = *(int *)((uintptr_t)&regs + off -
offsetof(lx_user_t, lxu_regs));
@@ -1019,7 +629,7 @@ ptrace_poke(pid_t pid, uintptr_t addr, int data)
}
static int
-ptrace_poke_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int data)
+ptrace_poke_user(pid_t lxpid, pid_t pid, lwpid_t lwpid, uintptr_t off, int data)
{
lx_user_regs_t regs;
int err = 0;
@@ -1030,11 +640,16 @@ ptrace_poke_user(pid_t pid, lwpid_t lwpid, uintptr_t off, int data)
return (-EINVAL);
if (off < offsetof(lx_user_t, lxu_regs) + sizeof (lx_user_regs_t)) {
- if ((err = getregs(pid, lwpid, &regs)) != 0)
+ if ((err = lx_ptrace_kernel(LX_PTRACE_GETREGS, lxpid, NULL,
+ (uintptr_t)&regs)) != 0) {
return (err);
+ }
+
*(int *)((uintptr_t)&regs + off -
offsetof(lx_user_t, lxu_regs)) = data;
- return (setregs(pid, lwpid, &regs));
+
+ return (lx_ptrace_kernel(LX_PTRACE_SETREGS, lxpid, NULL,
+ (uintptr_t)&regs));
}
if (off >= offsetof(lx_user_t, lxu_debugreg) &&
@@ -1068,32 +683,6 @@ ptrace_kill(pid_t pid)
}
static int
-ptrace_getregs(pid_t pid, lwpid_t lwpid, uintptr_t addr)
-{
- lx_user_regs_t regs;
- int ret;
-
- if ((ret = getregs(pid, lwpid, &regs)) != 0)
- return (ret);
-
- if (uucopy(&regs, (void *)addr, sizeof (regs)) != 0)
- return (-errno);
-
- return (0);
-}
-
-static int
-ptrace_setregs(pid_t pid, lwpid_t lwpid, uintptr_t addr)
-{
- lx_user_regs_t regs;
-
- if (uucopy((void *)addr, &regs, sizeof (regs)) != 0)
- return (-errno);
-
- return (setregs(pid, lwpid, &regs));
-}
-
-static int
ptrace_getfpregs(pid_t pid, lwpid_t lwpid, uintptr_t addr)
{
lx_user_fpregs_t regs;
@@ -1146,16 +735,21 @@ ptrace_setfpxregs(pid_t pid, lwpid_t lwpid, uintptr_t addr)
}
void
-lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg)
+lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg,
+ ucontext_t *ucp)
{
/*
* We call into the kernel to see if we need to stop for specific
* ptrace(2) events.
*/
- lx_debug("lx_ptrace_stop_if_option(%d, %s, %lu)", option,
- child ? "TRUE [child]" : "FALSE [parent]", msg);
- if (syscall(SYS_brand, B_PTRACE_STOP_FOR_OPT, option, child,
- msg) != 0) {
+ lx_debug("lx_ptrace_stop_if_option(%d, %s, %lu, %p)", option,
+ child ? "TRUE [child]" : "FALSE [parent]", msg, ucp);
+ if (ucp == NULL) {
+ ucp = (ucontext_t *)lx_find_brand_uc();
+ lx_debug("\tucp = %p", ucp);
+ }
+ if (syscall(SYS_brand, B_PTRACE_STOP_FOR_OPT, option, child, msg,
+ ucp) != 0) {
if (errno != ESRCH) {
/*
* This should _only_ fail if we are not traced, or do
@@ -1243,6 +837,8 @@ lx_ptrace(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
*/
case LX_PTRACE_SETOPTIONS:
case LX_PTRACE_GETEVENTMSG:
+ case LX_PTRACE_GETREGS:
+ case LX_PTRACE_SETREGS:
return (lx_ptrace_kernel(ptrace_op, lxpid, p3, p4));
}
@@ -1262,24 +858,18 @@ lx_ptrace(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
return (ptrace_peek(pid, p3, (long *)p4));
case LX_PTRACE_PEEKUSER:
- return (ptrace_peek_user(pid, lwpid, p3, (int *)p4));
+ return (ptrace_peek_user(lxpid, pid, lwpid, p3, (int *)p4));
case LX_PTRACE_POKETEXT:
case LX_PTRACE_POKEDATA:
return (ptrace_poke(pid, p3, (int)p4));
case LX_PTRACE_POKEUSER:
- return (ptrace_poke_user(pid, lwpid, p3, (int)p4));
+ return (ptrace_poke_user(lxpid, pid, lwpid, p3, (int)p4));
case LX_PTRACE_KILL:
return (ptrace_kill(pid));
- case LX_PTRACE_GETREGS:
- return (ptrace_getregs(pid, lwpid, p4));
-
- case LX_PTRACE_SETREGS:
- return (ptrace_setregs(pid, lwpid, p4));
-
case LX_PTRACE_GETFPREGS:
return (ptrace_getfpregs(pid, lwpid, p4));
diff --git a/usr/src/lib/brand/lx/lx_brand/common/signal.c b/usr/src/lib/brand/lx/lx_brand/common/signal.c
index 7d3865c2de..4c143720c3 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/signal.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/signal.c
@@ -42,6 +42,7 @@
#include <sys/lx_thread.h>
#include <sys/syscall.h>
#include <lx_provider_impl.h>
+#include <sys/stack.h>
#include <assert.h>
#include <errno.h>
#include <poll.h>
@@ -60,10 +61,10 @@
#if defined(_ILP32)
extern int pselect_large_fdset(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0,
const timespec_t *tsp, const sigset_t *sp);
-#else
-static int lx_setcontext(const ucontext_t *ucp);
#endif
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
/*
* Delivering signals to a Linux process is complicated by differences in
* signal numbering, stack structure and contents, and the action taken when a
@@ -91,31 +92,24 @@ static int lx_setcontext(const ucontext_t *ucp);
*
* Adding a Linux branded thread to the mix complicates things somewhat.
*
- * First (for 32-bit code), when a thread receives a signal, it may be running
- * with a Linux value in the x86 %gs segment register as opposed to the value
- * Illumos threads expect; if control were passed directly to Illumos code,
- * such as libc's sigacthandler(), that code would experience a segmentation
- * fault the first time it tried to dereference a memory location using %gs.
- *
- * For 64-bit code the %gs is usually 0 for both native and Linux code and the
- * thread pointer for both Illumos and Linux libc is referenced off the %fsbase
- * register, as per the AMD64 ABI. When a thread receives a signal, it may be
- * running with the Linux value in the x86 %fsbase register as opposed to the
- * value Illumos libc expects. Switching the %fsbase value is handled in the
- * kernel module at the same time as we switch the syscall mode flag. We track
- * the syscall mode flag in the kernel using the per-lwp br_scms integer so we
- * can save/restore the correct mode at the end of the signal handling. The
- * flag value is saved/restored in the per-thread br_scms variable which is
- * used like a stack to push/pop the flag bit as we take signals and return.
+ * First, when a thread receives a signal, it may either be running in an
+ * emulated Linux context or a native illumos context. In either case, the
+ * in-kernel brand module is responsible for preserving the register state
+ * from the interrupted context, regardless of whether emulated or native
+ * software was running at the time. The kernel is also responsible for
+ * ensuring that the illumos native sigacthandler() is called with register
+ * values appropriate for native code. Of particular note is the %gs segment
+ * selector for 32-bit code, and the %fsbase segment base register for 64-bit
+ * code; these are used by libc to locate per-thread data structures.
*
* Second, the signal number translation referenced above must take place.
- * Further, for 32-bit code, as was the case with Illumos libc, before the
- * Linux signal handler is called, the value of the %gs segment register MUST
- * be restored to the value Linux code expects.
+ * Finally, when we hand control to the Linux signal handler we must do so
+ * on the brand stack, and with registers configured appropriately for the
+ * Linux application.
*
- * This need to translate signal numbers (and manipulate the %gs register)
- * means that with standard Illumos libc, following a signal from generation to
- * delivery looks something like:
+ * This need to translate signal numbers (and manipulate the signal handling
+ * context) means that with standard Illumos libc, following a signal from
+ * generation to delivery looks something like:
*
* kernel ->
* sigacthandler() ->
@@ -125,21 +119,15 @@ static int lx_setcontext(const ucontext_t *ucp);
* but for the brand's Linux threads, this would look like:
*
* kernel ->
- * lx_sigacthandler() ->
- * sigacthandler() ->
- * call_user_handler() ->
- * lx_call_user_handler() ->
- * lx_sigdeliver() JMP to
- * Linux user signal handler
+ * sigacthandler() ->
+ * call_user_handler() ->
+ * lx_call_user_handler() ->
+ * lx_sigdeliver() ->
+ * syscall(B_JUMP_TO_LINUX, ...) ->
+ * Linux user signal handler
*
* The new additions are:
*
- * lx_sigacthandler
- * ================
- * This routine is responsible for setting the %gs segment register to the
- * value 32-bit Illumos code expects (it does nothing in 64-bit code) and
- * jumping to Illumos' libc signal interposition handler, sigacthandler().
- *
* lx_call_user_handler
* ====================
* This routine is responsible for translating Illumos signal numbers to
@@ -148,25 +136,8 @@ static int lx_setcontext(const ucontext_t *ucp);
* registered Linux signal handler. It is, in effect, the Linux thread
* equivalent to libc's call_user_handler().
*
- * Installing lx_sigacthandler() is a bit tricky, as normally libc's
- * sigacthandler() routine is hidden from user programs. To facilitate this, a
- * libc private function is used; setsigacthandler():
- *
- * void setsigacthandler(void (*new_handler)(int, siginfo_t *, void *),
- * void (**old_handler)(int, siginfo_t *, void *)
- * int (*brsetctxt)(const ucontext_t *))
- *
- * The routine works by modifying the per-thread data structure (uberdata) in
- * libc that keeps track of the address of its own interposition handler with
- * the address passed in; the old handler's address is returned in the pointer
- * pointed to by the second argument, if it is non-NULL, mimicking the behavior
- * of sigaction() itself. In a similar way, this function can also set a
- * replacement handler for the libc __setcontext call which is made by libc's
- * setcontext() when returning from a signal handler. Using this we can hook
- * in to managing the syscall mode flag for 64-bit code when returning to the
- * interrupted code. Once setsigacthandler() has been executed, all future
- * branded threads this thread may create will automatically have the proper
- * interposition handler(s) invoked as the result of a normal sigaction() call.
+ * lx_sigdeliver
+ * =============
*
* Note that none of this interposition is necessary unless a Linux thread
* registers a user signal handler, as the default action for all signals is the
@@ -184,18 +155,18 @@ static int lx_setcontext(const ucontext_t *ucp);
* translating the value WTERMSIG() would return from a Illumos signal number
* to the appropriate Linux value.
*
- * lx_call_user_handler() calls lx_sigdeliver with a helper function (typically
- * lx_build_signal_frame) which builds a stack frame for the 32-bit Linux
- * signal handler, or populates a local (on the stack) structure for the 64-bit
- * Linux signal handler, then jmp's into the handler. The stack at that time
- * looks like this:
+ * lx_call_user_handler() calls lx_sigdeliver() with a helper function
+ * (typically lx_build_signal_frame) which builds a stack frame for the 32-bit
+ * Linux signal handler, or populates a local (on the stack) structure for the
+ * 64-bit Linux signal handler. The stack at that time looks like this:
*
- * =================================================
- * | | LX_SIGRT_MAGIC |
- * | =================================================
- * | | Linux signal frame (32-bit) or local data |
- * V | (64-bit) built by stack_builder() |
- * =================================================
+ * =========================================================
+ * | | lx_sigdeliver_frame_t -- includes LX_SIGRT_MAGIC and |
+ * | | a return context for the eventual sigreturn(2) call |
+ * | =========================================================
+ * | | Linux signal frame (32-bit) or local data |
+ * V | (64-bit) built by stack_builder() |
+ * =========================================================
*
* The process of returning to an interrupted thread of execution from a user
* signal handler is entirely different between Illumos and Linux. While
@@ -207,24 +178,7 @@ static int lx_setcontext(const ucontext_t *ucp);
* call to setcontext(2), the rt_sigreturn(2) Linux system call is responsible
* for accomplishing much the same thing. It's for this reason that the stack
* frame we build has the lx_(rt_)sigreturn_tramp code on the top of the
- * stack.
- *
- * The lx_rt_sigreturn() function will handle the syscall, do its cleanup,
- * then return to the libc signal handling code (call_user_handler) so that
- * libc can use setcontext() to get back to the point where things were
- * interrupted. However, for the 64-bit case, due to the syscall mode switching,
- * we cannot simply let the libc setcontext() take us back because we may also
- * have to switch the syscall mode back to Linux (it depends on where we were
- * when we took the signal). For the 64-bit case we used setsigacthandler()
- * to setup a libc replacement function on __setcontext(). This is the
- * lx_setcontext() function. This function uses a brand call (B_SIGNAL_RETURN)
- * which combines the syscall mode switching and setcontext handling in the lx
- * kernel module.
- *
- * An additional oddity in the signal return code is that in the stack builder
- * function we push some x86 code onto the bottom of the stack that looks like
- * it invokes the Linux (rt)_sigreturn syscall. This is needed by gdb to
- * tell that it's in a signal handler. The code looks like this:
+ * stack. The code looks like this:
*
* 32-bit 64-bit
* -------------------------------- -----------------------------
@@ -239,11 +193,13 @@ static int lx_setcontext(const ucontext_t *ucp);
* trampoline code on the stack to determine whether it is in a signal stack
* frame or not. Really.)
*
- * When the 32-bit Linux user signal handler is eventually called, the stack
- * frame looks like this (in the case of a "modern" signal stack; see the
- * lx_sigstack structure definition):
+ * When the 32-bit Linux user signal handler is eventually called, the brand
+ * stack frame looks like this (in the case of a "modern" signal stack; see
+ * the lx_sigstack structure definition):
*
* =========================================================
+ * | | lx_sigdeliver_frame_t |
+ * | =========================================================
* | | Trampoline code (marker for gdb, not really executed) |
* | =========================================================
* | | Linux struct _fpstate |
@@ -264,6 +220,8 @@ static int lx_setcontext(const ucontext_t *ucp);
* The 64-bit stack-local data looks like this:
*
* =========================================================
+ * | | lx_sigdeliver_frame_t |
+ * | =========================================================
* | | Trampoline code (marker for gdb, not really executed) |
* | =========================================================
* | | Linux struct _fpstate |
@@ -277,14 +235,22 @@ static int lx_setcontext(const ucontext_t *ucp);
*
* As usual in 64-bit code, %rdi is arg0 which is the signal number.
*
- * As mentioned above, the brand intercepts the Linux (rt_)sigreturn(2) system
- * call. This turns into some stack cleanup and a call to lx_sigreturn_tolibc()
- * which returns through the libc call stack that Illumos expects, with the
- * caveat that 64-bit code combines the __setcontext and syscall mode switch
- * via a brand call. This returns the thread executing the code back to the
- * location originally interrupted by receipt of the signal.
+ * The *sigreturn(2) family of emulated system call handlers locates the
+ * "lx_sigdeliver_frame_t" struct on the Linux stack as part of processing
+ * the system call. This object contains a guard value (LX_SIGRT_MAGIC) to
+ * detect stack smashing or an incorrect stack pointer. It also contains a
+ * "return" context, which we use to get back to the "lx_sigdeliver()" frame
+ * on the native stack that originally dispatched to the Linux signal
+ * handler. The lx_sigdeliver() function is then able to return to the
+ * native libc signal handler in the usual way. This results in a further
+ * setcontext() back to whatever was running when we took the signal.
*/
+typedef struct lx_sigdeliver_frame {
+ uintptr_t lxsdf_magic;
+ ucontext_t *lxsdf_retucp;
+ ucontext_t *lxsdf_sigucp;
+} lx_sigdeliver_frame_t;
struct lx_oldsigstack {
void (*retaddr)(); /* address of real lx_sigreturn code */
@@ -296,12 +262,6 @@ struct lx_oldsigstack {
};
/*
- * libc_sigacthandler is set to the address of the libc signal interposition
- * routine, sigacthandler().
- */
-void (*libc_sigacthandler)(int, siginfo_t *, void*);
-
-/*
* The lx_sighandlers structure needs to be a global due to the semantics of
* clone().
*
@@ -324,13 +284,13 @@ static lx_sighandlers_t lx_sighandlers;
struct lx_vsyscall
{
uintptr_t lv_addr;
- long (*lv_func)();
+ uintptr_t lv_scnum;
char *lv_msg;
} lx_vsyscalls[] = {
- {LX_VSYS_gettimeofday, lx_gettimeofday,
+ {LX_VSYS_gettimeofday, LX_SYS_gettimeofday,
"vsyscall gettimeofday(%p, %p)" },
- {LX_VSYS_time, lx_time, "vsyscall time(%p)" },
- {LX_VSYS_getcpu, lx_getcpu, "vsyscall getcpu(%p, %lx, %lx)" },
+ {LX_VSYS_time, LX_SYS_time, "vsyscall time(%p)" },
+ {LX_VSYS_getcpu, LX_SYS_getcpu, "vsyscall getcpu(%p, %lx, %lx)" },
{NULL, NULL, NULL}
};
@@ -352,6 +312,9 @@ static int lx_sigsegv_depth = 0;
*/
static int lx_no_abort_handler = 0;
+static void lx_sigdeliver(int, siginfo_t *, ucontext_t *, size_t, void (*)(),
+ void (*)(), struct lx_sigaction *);
+
/*
* Cache result of process.max-file-descriptor to avoid calling getrctl()
* for each lx_ppoll().
@@ -464,27 +427,6 @@ stol_osigset(sigset_t *s_sigsetp, lx_osigset_t *lx_osigsetp)
#endif
static int
-stol_sigcode(int si_code)
-{
- switch (si_code) {
- case SI_USER:
- return (LX_SI_USER);
- case SI_LWP:
- return (LX_SI_TKILL);
- case SI_QUEUE:
- return (LX_SI_QUEUE);
- case SI_TIMER:
- return (LX_SI_TIMER);
- case SI_ASYNCIO:
- return (LX_SI_ASYNCIO);
- case SI_MESGQ:
- return (LX_SI_MESGQ);
- default:
- return (si_code);
- }
-}
-
-static int
ltos_sigcode(int si_code)
{
switch (si_code) {
@@ -505,29 +447,6 @@ ltos_sigcode(int si_code)
}
}
-/*
- * Convert the "status" field of a SIGCLD siginfo_t. We need to extract the
- * illumos signal number and convert it to a Linux signal number while leaving
- * the ptrace(2) event bits intact.
- */
-int
-stol_status(int s)
-{
- /*
- * We mask out the top bit here in case PTRACE_O_TRACESYSGOOD
- * is in use and 0x80 has been ORed with the signal number.
- */
- int stat = stol_signo[s & 0x7f];
- assert(stat != -1);
-
- /*
- * We must mix in the ptrace(2) event which may be stored in
- * the second byte of the status code. We also re-include the
- * PTRACE_O_TRACESYSGOOD bit.
- */
- return ((s & 0xff80) | stat);
-}
-
int
stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop)
{
@@ -546,7 +465,7 @@ stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop)
ret = -1;
}
- lx_siginfo.lsi_code = stol_sigcode(siginfop->si_code);
+ lx_siginfo.lsi_code = lx_stol_sigcode(siginfop->si_code);
lx_siginfo.lsi_errno = siginfop->si_errno;
switch (lx_siginfo.lsi_signo) {
@@ -561,11 +480,12 @@ stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop)
case LX_SIGCHLD:
lx_siginfo.lsi_pid = siginfop->si_pid;
- if (siginfop->si_code == CLD_EXITED) {
+ if (siginfop->si_code <= 0 || siginfop->si_code ==
+ CLD_EXITED) {
lx_siginfo.lsi_status = siginfop->si_status;
} else {
- lx_siginfo.lsi_status = stol_status(
- siginfop->si_status);
+ lx_siginfo.lsi_status = lx_stol_status(
+ siginfop->si_status, -1);
}
lx_siginfo.lsi_utime = siginfop->si_utime;
lx_siginfo.lsi_stime = siginfop->si_stime;
@@ -695,41 +615,59 @@ ltos_fpstate(lx_fpstate_t *lfpr, fpregset_t *fpr)
}
/*
- * The brand needs a lx version of this because the format of the lx stack_t
- * differs from the Illumos stack_t not really in content but in ORDER,
- * so we can't simply pass pointers and expect things to work (sigh...)
+ * We do not use the system sigaltstack() infrastructure as that would conflict
+ * with our handling of both system call emulation and native signals on the
+ * native stack. Instead, we track the Linux stack structure in our
+ * thread-specific data. This function is modeled on the behaviour of the
+ * native sigaltstack system call handler.
*/
long
-lx_sigaltstack(uintptr_t nsp, uintptr_t osp)
+lx_sigaltstack(uintptr_t ssp, uintptr_t oss)
{
- lx_stack_t ls;
- stack_t newsstack, oldsstack;
- stack_t *nssp = (nsp ? &newsstack : NULL);
- stack_t *ossp = (osp ? &oldsstack : NULL);
+ lx_tsd_t *lxtsd = lx_get_tsd();
+ lx_stack_t ss;
- if (nsp) {
- if (uucopy((void *)nsp, &ls, sizeof (lx_stack_t)) != 0)
- return (-errno);
+ if (ssp != NULL) {
+ if (lxtsd->lxtsd_sigaltstack.ss_flags & LX_SS_ONSTACK) {
+ /*
+ * If we are currently using the installed alternate
+ * stack for signal handling, the user may not modify
+ * the stack for this thread.
+ */
+ return (-EPERM);
+ }
- if ((ls.ss_flags & LX_SS_DISABLE) == 0 &&
- ls.ss_size < LX_MINSIGSTKSZ)
- return (-ENOMEM);
+ if (uucopy((void *)ssp, &ss, sizeof (ss)) != 0) {
+ return (-EFAULT);
+ }
- newsstack.ss_sp = (int *)ls.ss_sp;
- newsstack.ss_size = (long)ls.ss_size;
- newsstack.ss_flags = ls.ss_flags;
- }
+ if (ss.ss_flags & ~LX_SS_DISABLE) {
+ /*
+ * The user may not specify a value for flags other
+ * than 0 or SS_DISABLE.
+ */
+ return (-EINVAL);
+ }
- if (sigaltstack(nssp, ossp) != 0)
- return (-errno);
+ if (!(ss.ss_flags & LX_SS_DISABLE) && ss.ss_size <
+ LX_MINSIGSTKSZ) {
+ return (-ENOMEM);
+ }
+ }
- if (osp) {
- ls.ss_sp = (void *)oldsstack.ss_sp;
- ls.ss_size = (size_t)oldsstack.ss_size;
- ls.ss_flags = oldsstack.ss_flags;
+ if (oss != NULL) {
+ /*
+ * User provided old and new stack_t pointers may point to
+ * the same location. Copy out before we modify.
+ */
+ if (uucopy(&lxtsd->lxtsd_sigaltstack, (void *)oss,
+ sizeof (lxtsd->lxtsd_sigaltstack)) != 0) {
+ return (-EFAULT);
+ }
+ }
- if (uucopy(&ls, (void *)osp, sizeof (lx_stack_t)) != 0)
- return (-errno);
+ if (ssp != NULL) {
+ lxtsd->lxtsd_sigaltstack = ss;
}
return (0);
@@ -993,20 +931,21 @@ lx_rt_sigtimedwait(uintptr_t set, uintptr_t sinfo, uintptr_t toutp,
long
lx_sigreturn(void)
{
+ lx_sigdeliver_frame_t *lxsdf;
struct lx_oldsigstack *lx_ossp;
lx_sigset_t lx_sigset;
- lx_regs_t *rp;
ucontext_t *ucp;
+ ucontext_t *sigucp;
uintptr_t sp;
- rp = lx_syscall_regs();
+ ucp = lx_syscall_regs();
/*
* NOTE: The sp saved in the context is eight bytes off of where we
* need it to be (either due to trampoline or the copying of
* sp = uesp, not clear which).
*/
- sp = (uintptr_t)rp->lxr_esp - 8;
+ sp = LX_REG(ucp, REG_SP) - 8;
/*
* At this point, the stack pointer should point to the struct
@@ -1015,32 +954,34 @@ lx_sigreturn(void)
* save a pointer to it before incrementing our copy of the sp.
*/
lx_ossp = (struct lx_oldsigstack *)sp;
- sp += sizeof (struct lx_oldsigstack);
+ sp += SA(sizeof (struct lx_oldsigstack));
+
/*
- * lx_sigdeliver() pushes LX_SIGRT_MAGIC on the stack before it
- * creates the struct lx_oldsigstack.
+ * lx_sigdeliver() pushes a lx_sigdeliver_frame_t onto the stack
+ * before it creates the struct lx_oldsigstack.
*
- * If we don't find it here, the stack's been corrupted and we need to
- * kill ourselves.
- */
- if (*(uint32_t *)sp != LX_SIGRT_MAGIC)
+ * If we do not find it here, the stack has been corrupted and we
+ * need to kill ourselves.
+ */
+ lxsdf = (lx_sigdeliver_frame_t *)sp;
+ lx_debug("lx_sigreturn: reading lx_sigdeliver_frame_t @ %p\n",
+ lxsdf);
+ lx_debug("lx_sigreturn: lxsdf: magic %p retucp %p sigucp %p\n",
+ lxsdf->lxsdf_magic, lxsdf->lxsdf_retucp, lxsdf->lxsdf_sigucp);
+ if (lxsdf->lxsdf_magic != LX_SIGRT_MAGIC) {
+ LX_SIGNAL_DELIVERY_FRAME_CORRUPT(lxsdf);
lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!",
- sp, LX_SIGRT_MAGIC, *(uint32_t *)sp);
+ sp, LX_SIGRT_MAGIC, lxsdf->lxsdf_magic);
+ }
- sp += sizeof (uint32_t);
+ LX_SIGNAL_DELIVERY_FRAME_FOUND(lxsdf);
/*
- * For signal mask handling to be done properly, this call needs to
- * return to the libc routine that originally called the signal handler
- * rather than directly set the context back to the place the signal
- * interrupted execution as the original Linux code would do.
- *
- * Here *sp points to the Illumos ucontext_t, so we need to copy
- * machine registers the Linux signal handler may have modified
- * back to the Illumos version.
+ * We need to copy machine registers the Linux signal handler may have
+ * modified back to the Illumos ucontext_t.
*/
- ucp = (ucontext_t *)(*(ssize_t *)sp);
+ sigucp = lxsdf->lxsdf_sigucp;
/*
* General registers copy across as-is, except Linux expects that
@@ -1051,30 +992,31 @@ lx_sigreturn(void)
* value to ESP.
*/
lx_ossp->sigc.sc_esp_at_signal = lx_ossp->sigc.sc_esp;
- bcopy(&lx_ossp->sigc, &ucp->uc_mcontext, sizeof (gregset_t));
+ bcopy(&lx_ossp->sigc, &sigucp->uc_mcontext, sizeof (gregset_t));
+
+ LX_SIGRETURN(NULL, sigucp, sp);
/* copy back FP regs if present */
if (lx_ossp->sigc.sc_fpstate != NULL)
- ltos_fpstate(&lx_ossp->fpstate, &ucp->uc_mcontext.fpregs);
+ ltos_fpstate(&lx_ossp->fpstate, &sigucp->uc_mcontext.fpregs);
/* convert Linux signal mask back to its Illumos equivalent */
bzero(&lx_sigset, sizeof (lx_sigset_t));
lx_sigset.__bits[0] = lx_ossp->sigc.sc_mask;
lx_sigset.__bits[1] = lx_ossp->sig_extra;
- (void) ltos_sigset(&lx_sigset, &ucp->uc_sigmask);
+ (void) ltos_sigset(&lx_sigset, &sigucp->uc_sigmask);
/*
- * At this point sp contains the value of the stack pointer when
- * lx_call_user_handler() was called.
- *
- * Pop one more value off the stack and pass the new sp to
- * lx_sigreturn_tolibc(), which will in turn manipulate the x86
- * registers to make it appear to libc's call_user_handler() as if the
- * handler it had called returned.
+ * For signal mask handling to be done properly, this call needs to
+ * return to the libc routine that originally called the signal handler
+ * rather than directly set the context back to the place the signal
+ * interrupted execution as the original Linux code would do.
*/
- sp += sizeof (uint32_t);
- lx_debug("calling lx_sigreturn_tolibc(0x%p)", sp);
- lx_sigreturn_tolibc(sp);
+ lx_debug("lx_sigreturn: calling setcontext; retucp %p flags %lx "
+ "link %p\n", lxsdf->lxsdf_retucp, lxsdf->lxsdf_retucp->uc_flags,
+ lxsdf->lxsdf_retucp->uc_link);
+ setcontext(lxsdf->lxsdf_retucp);
+ assert(0);
/*NOTREACHED*/
return (0);
@@ -1087,16 +1029,19 @@ lx_sigreturn(void)
long
lx_rt_sigreturn(void)
{
+ lx_sigdeliver_frame_t *lxsdf;
struct lx_sigstack *lx_ssp;
- lx_regs_t *rp;
lx_ucontext_t *lx_ucp;
ucontext_t *ucp;
+ ucontext_t *sigucp;
uintptr_t sp;
/* Get the registers at the emulated Linux rt_sigreturn syscall */
- rp = lx_syscall_regs();
+ ucp = lx_syscall_regs();
#if defined(_ILP32)
+ lx_debug("lx_rt_sigreturn: ESP %p UESP %p\n", LX_REG(ucp, ESP),
+ LX_REG(ucp, UESP));
/*
* For 32-bit
*
@@ -1121,14 +1066,14 @@ lx_rt_sigreturn(void)
* lx_sigdeliver() created the stack frame for the Linux signal
* handler.
*/
- sp = (uintptr_t)rp->lxr_esp - 4;
+ sp = (uintptr_t)LX_REG(ucp, REG_SP) - 4;
#else
/*
* We need to make an adjustment for 64-bit code as well. Since 64-bit
* does not use the trampoline, it's probably for the same reason as
* alluded to above.
*/
- sp = (uintptr_t)rp->lxr_rsp - 8;
+ sp = (uintptr_t)LX_REG(ucp, REG_SP) - 8;
#endif
/*
@@ -1138,39 +1083,41 @@ lx_rt_sigreturn(void)
* save a pointer to it before incrementing our copy of the sp.
*/
lx_ssp = (struct lx_sigstack *)sp;
- sp += sizeof (struct lx_sigstack);
+ sp += SA(sizeof (struct lx_sigstack));
+#if defined(_LP64)
/*
- * We handle 32 vs. 64 bit differently here, but first, lx_sigdeliver()
- * pushed LX_SIGRT_MAGIC on the stack before it created the
- * struct lx_sigstack (and possibly struct lx_fpstate_t).
- *
- * If we don't find LX_SIGRT_MAGIC here, the stack's been corrupted and
- * we need to kill ourselves.
- *
- * Check for and remove LX_SIGRT_MAGIC from the stack.
+ * The 64-bit lx_sigdeliver() inserts 8 bytes of padding between
+ * the lx_sigstack_t and the delivery frame to maintain ABI stack
+ * alignment.
*/
-#if defined(_LP64)
- /* account for extra word used in lx_sigdeliver for stack alignment */
sp += 8;
+#endif
- if (*(uint64_t *)sp != LX_SIGRT_MAGIC)
+ /*
+ * lx_sigdeliver() pushes a lx_sigdeliver_frame_t onto the stack
+ * before it creates the struct lx_sigstack.
+ *
+ * If we do not find it here, the stack has been corrupted and we
+ * need to kill ourselves.
+ */
+ lxsdf = (lx_sigdeliver_frame_t *)sp;
+ if (lxsdf->lxsdf_magic != LX_SIGRT_MAGIC) {
+ LX_SIGNAL_DELIVERY_FRAME_CORRUPT(lxsdf);
lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!",
- sp, LX_SIGRT_MAGIC, *(uint32_t *)sp);
- sp += sizeof (uint64_t);
+ sp, LX_SIGRT_MAGIC, lxsdf->lxsdf_magic);
+ }
+
+ LX_SIGNAL_DELIVERY_FRAME_FOUND(lxsdf);
+
+ sigucp = lxsdf->lxsdf_sigucp;
/*
- * Now *(sp + 24) points to the Illumos ucontext_t (working backwards
- * through the Linux signal hander, the stack builder, and the stack
- * size) which we saved on the stack in the lx_sigdeliver assembly
- * prologue before we pushed LX_SIGRT_MAGIC, so we need to copy machine
- * registers the Linux signal handler may have modified back to the
- * Illumos version.
+ * We need to copy machine registers the Linux signal handler may have
+ * modified back to the Illumos version.
*/
- ucp = (ucontext_t *)(*(ssize_t *)(sp + 24));
-
+#if defined(_LP64)
lx_ucp = &lx_ssp->uc;
- LX_SIGRETURN(lx_ucp, ucp, sp);
/* Track SIGSEGV recursion depth for vsyscall */
if (lx_ssp->si.lsi_signo == LX_SIGSEGV) {
@@ -1181,47 +1128,33 @@ lx_rt_sigreturn(void)
/*
* General register layout is completely different.
*/
- ucp->uc_mcontext.gregs[REG_R15] = lx_ucp->uc_sigcontext.sc_r15;
- ucp->uc_mcontext.gregs[REG_R14] = lx_ucp->uc_sigcontext.sc_r14;
- ucp->uc_mcontext.gregs[REG_R13] = lx_ucp->uc_sigcontext.sc_r13;
- ucp->uc_mcontext.gregs[REG_R12] = lx_ucp->uc_sigcontext.sc_r12;
- ucp->uc_mcontext.gregs[REG_R11] = lx_ucp->uc_sigcontext.sc_r11;
- ucp->uc_mcontext.gregs[REG_R10] = lx_ucp->uc_sigcontext.sc_r10;
- ucp->uc_mcontext.gregs[REG_R9] = lx_ucp->uc_sigcontext.sc_r9;
- ucp->uc_mcontext.gregs[REG_R8] = lx_ucp->uc_sigcontext.sc_r8;
- ucp->uc_mcontext.gregs[REG_RDI] = lx_ucp->uc_sigcontext.sc_rdi;
- ucp->uc_mcontext.gregs[REG_RSI] = lx_ucp->uc_sigcontext.sc_rsi;
- ucp->uc_mcontext.gregs[REG_RBP] = lx_ucp->uc_sigcontext.sc_rbp;
- ucp->uc_mcontext.gregs[REG_RBX] = lx_ucp->uc_sigcontext.sc_rbx;
- ucp->uc_mcontext.gregs[REG_RDX] = lx_ucp->uc_sigcontext.sc_rdx;
- ucp->uc_mcontext.gregs[REG_RCX] = lx_ucp->uc_sigcontext.sc_rcx;
- ucp->uc_mcontext.gregs[REG_RAX] = lx_ucp->uc_sigcontext.sc_rax;
- ucp->uc_mcontext.gregs[REG_TRAPNO] = lx_ucp->uc_sigcontext.sc_trapno;
- ucp->uc_mcontext.gregs[REG_ERR] = lx_ucp->uc_sigcontext.sc_err;
- ucp->uc_mcontext.gregs[REG_RIP] = lx_ucp->uc_sigcontext.sc_rip;
- ucp->uc_mcontext.gregs[REG_CS] = lx_ucp->uc_sigcontext.sc_cs;
- ucp->uc_mcontext.gregs[REG_RFL] = lx_ucp->uc_sigcontext.sc_eflags;
- ucp->uc_mcontext.gregs[REG_RSP] = lx_ucp->uc_sigcontext.sc_rsp;
- ucp->uc_mcontext.gregs[REG_SS] = lx_ucp->uc_sigcontext.sc_pad0;
- ucp->uc_mcontext.gregs[REG_FS] = lx_ucp->uc_sigcontext.sc_fs;
- ucp->uc_mcontext.gregs[REG_GS] = lx_ucp->uc_sigcontext.sc_gs;
+ LX_REG(sigucp, REG_R15) = lx_ucp->uc_sigcontext.sc_r15;
+ LX_REG(sigucp, REG_R14) = lx_ucp->uc_sigcontext.sc_r14;
+ LX_REG(sigucp, REG_R13) = lx_ucp->uc_sigcontext.sc_r13;
+ LX_REG(sigucp, REG_R12) = lx_ucp->uc_sigcontext.sc_r12;
+ LX_REG(sigucp, REG_R11) = lx_ucp->uc_sigcontext.sc_r11;
+ LX_REG(sigucp, REG_R10) = lx_ucp->uc_sigcontext.sc_r10;
+ LX_REG(sigucp, REG_R9) = lx_ucp->uc_sigcontext.sc_r9;
+ LX_REG(sigucp, REG_R8) = lx_ucp->uc_sigcontext.sc_r8;
+ LX_REG(sigucp, REG_RDI) = lx_ucp->uc_sigcontext.sc_rdi;
+ LX_REG(sigucp, REG_RSI) = lx_ucp->uc_sigcontext.sc_rsi;
+ LX_REG(sigucp, REG_RBP) = lx_ucp->uc_sigcontext.sc_rbp;
+ LX_REG(sigucp, REG_RBX) = lx_ucp->uc_sigcontext.sc_rbx;
+ LX_REG(sigucp, REG_RDX) = lx_ucp->uc_sigcontext.sc_rdx;
+ LX_REG(sigucp, REG_RCX) = lx_ucp->uc_sigcontext.sc_rcx;
+ LX_REG(sigucp, REG_RAX) = lx_ucp->uc_sigcontext.sc_rax;
+ LX_REG(sigucp, REG_TRAPNO) = lx_ucp->uc_sigcontext.sc_trapno;
+ LX_REG(sigucp, REG_ERR) = lx_ucp->uc_sigcontext.sc_err;
+ LX_REG(sigucp, REG_RIP) = lx_ucp->uc_sigcontext.sc_rip;
+ LX_REG(sigucp, REG_CS) = lx_ucp->uc_sigcontext.sc_cs;
+ LX_REG(sigucp, REG_RFL) = lx_ucp->uc_sigcontext.sc_eflags;
+ LX_REG(sigucp, REG_RSP) = lx_ucp->uc_sigcontext.sc_rsp;
+ LX_REG(sigucp, REG_SS) = lx_ucp->uc_sigcontext.sc_pad0;
+ LX_REG(sigucp, REG_FS) = lx_ucp->uc_sigcontext.sc_fs;
+ LX_REG(sigucp, REG_GS) = lx_ucp->uc_sigcontext.sc_gs;
#else /* is _ILP32 */
- if (*(uint32_t *)sp != LX_SIGRT_MAGIC)
- lx_err_fatal("sp @ 0x%p, expected 0x%x, found 0x%x!",
- sp, LX_SIGRT_MAGIC, *(uint32_t *)sp);
- sp += sizeof (uint32_t);
-
- /*
- * Here *sp points to the Illumos ucontext_t which was saved on stack
- * right before we pushed LX_SIGRT_MAGIC in the 32-bit lx_sigdeliver
- * assembly code. We need to copy machine registers the Linux signal
- * handler may have modified back to the Illumos version.
- */
- ucp = (ucontext_t *)(*(ssize_t *)sp);
-
lx_ucp = &lx_ssp->uc;
- LX_SIGRETURN(lx_ucp, ucp, sp);
/*
* Illumos and Linux both follow the SysV i386 ABI layout for the
@@ -1236,84 +1169,39 @@ lx_rt_sigreturn(void)
*/
lx_ucp->uc_sigcontext.sc_esp_at_signal = lx_ucp->uc_sigcontext.sc_esp;
- bcopy(&lx_ucp->uc_sigcontext, &ucp->uc_mcontext.gregs,
+ bcopy(&lx_ucp->uc_sigcontext, &sigucp->uc_mcontext.gregs,
sizeof (gregset_t));
#endif
- if (lx_ucp->uc_sigcontext.sc_fpstate != NULL)
+ LX_SIGRETURN(lx_ucp, sigucp, sp);
+
+ if (lx_ucp->uc_sigcontext.sc_fpstate != NULL) {
ltos_fpstate(lx_ucp->uc_sigcontext.sc_fpstate,
- &ucp->uc_mcontext.fpregs);
+ &sigucp->uc_mcontext.fpregs);
+ }
/*
* Convert the Linux signal mask and stack back to their
* Illumos equivalents.
*/
- (void) ltos_sigset(&lx_ucp->uc_sigmask, &ucp->uc_sigmask);
- ltos_stack(&lx_ucp->uc_stack, &ucp->uc_stack);
-
- /*
- * For signal mask handling to be done properly, this function must
- * return to the libc call_user_handler() routine that originally
- * called the signal handler, rather than directly set the context back
- * to the place the signal interrupted execution, as the original Linux
- * code would do.
- *
- * For the 64-bit case we can't simply let call_user_handler() invoke
- * __setcontext() since we need to also manage the syscall mode. Thus
- * we use the lx_setcontext callback hook into libc to manage this via
- * a brand call which combines the setcontext with setting the mode
- * switch.
- */
-#if defined(_LP64)
- /*
- * At this point sp points to the end of the stack frame we constructed
- * on entry to lx_sigdeliver. Pop this frame off the stack.
- */
- sp += 0x30;
-
-#else
- /*
- * At this point sp points to the ucontext_t pointer we pushed on the
- * stack right before we pushed LX_SIGRT_MAGIC in lx_sigdeliver. Pop
- * this value off the stack.
- */
- sp += sizeof (uint32_t);
-#endif
+ (void) ltos_sigset(&lx_ucp->uc_sigmask, &sigucp->uc_sigmask);
+ ltos_stack(&lx_ucp->uc_stack, &sigucp->uc_stack);
/*
- * At this point sp points to the base frame we had on entry to
- * lx_sigdeliver (%ebp/%rbp at TOS, return address next).
- *
- * Pass the new sp to lx_sigreturn_tolibc(), which will in turn
- * manipulate the x86 registers to make it appear that
- * lx_call_user_handler() has returned. This will then take us directly
- * back to libc's call_user_handler().
+ * For signal mask handling to be done properly, this call needs to
+ * return to the libc routine that originally called the signal handler
+ * rather than directly set the context back to the place the signal
+ * interrupted execution as the original Linux code would do.
*/
- lx_debug("calling lx_sigreturn_tolibc(0x%p)", sp);
- lx_sigreturn_tolibc(sp);
+ lx_debug("lx_rt_sigreturn: calling setcontext; retucp %p\n",
+ lxsdf->lxsdf_retucp);
+ setcontext(lxsdf->lxsdf_retucp);
+ assert(0);
/*NOTREACHED*/
return (0);
}
-#if defined(_LP64)
-static int
-lx_setcontext(const ucontext_t *ucp)
-{
- extern int lx_traceflag;
-
- /*
- * Since we don't return via lx_emulate, issue a trace msg here if
- * necessary. We know this is only called in the 64-bit rt_sigreturn
- * code path to the syscall number is 15.
- */
- if (lx_traceflag != 0) {
- (void) syscall(SYS_brand, B_SYSRETURN, 15, 0);
- }
- return (syscall(SYS_brand, B_SIGNAL_RETURN, ucp));
-}
-#endif
-
#if defined(_ILP32)
/*
@@ -1321,7 +1209,8 @@ lx_setcontext(const ucontext_t *ucp)
* This stack-builder function is only used by 32-bit code.
*/
static void
-lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
+lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp,
+ uintptr_t *hargs)
{
extern void lx_sigreturn_tramp();
@@ -1394,7 +1283,8 @@ lx_build_old_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
* code (32-bit code also calls this when using "modern" signals).
*/
static void
-lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
+lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp,
+ uintptr_t *hargs)
{
extern void lx_rt_sigreturn_tramp();
@@ -1407,8 +1297,20 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
lx_ucp = &lx_ssp->uc;
#if defined(_ILP32)
+ /*
+ * Arguments are passed to the 32-bit signal handler on the stack.
+ */
lx_ssp->ucp = lx_ucp;
+ lx_ssp->sip = sip != NULL ? &lx_ssp->si : NULL;
lx_ssp->sig = lx_sig;
+#else
+ /*
+ * Arguments to the 64-bit signal handler are passed in registers:
+ * hdlr(int sig, siginfo_t *sip, void *ucp);
+ */
+ hargs[0] = lx_sig;
+ hargs[1] = sip != NULL ? (uintptr_t)&lx_ssp->si : NULL;
+ hargs[2] = (uintptr_t)lx_ucp;
#endif
lxsap = &lx_sighandlers.lx_sa[lx_sig];
@@ -1442,30 +1344,30 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
/*
* General register layout is completely different.
*/
- lx_ucp->uc_sigcontext.sc_r8 = ucp->uc_mcontext.gregs[REG_R8];
- lx_ucp->uc_sigcontext.sc_r9 = ucp->uc_mcontext.gregs[REG_R9];
- lx_ucp->uc_sigcontext.sc_r10 = ucp->uc_mcontext.gregs[REG_R10];
- lx_ucp->uc_sigcontext.sc_r11 = ucp->uc_mcontext.gregs[REG_R11];
- lx_ucp->uc_sigcontext.sc_r12 = ucp->uc_mcontext.gregs[REG_R12];
- lx_ucp->uc_sigcontext.sc_r13 = ucp->uc_mcontext.gregs[REG_R13];
- lx_ucp->uc_sigcontext.sc_r14 = ucp->uc_mcontext.gregs[REG_R14];
- lx_ucp->uc_sigcontext.sc_r15 = ucp->uc_mcontext.gregs[REG_R15];
- lx_ucp->uc_sigcontext.sc_rdi = ucp->uc_mcontext.gregs[REG_RDI];
- lx_ucp->uc_sigcontext.sc_rsi = ucp->uc_mcontext.gregs[REG_RSI];
- lx_ucp->uc_sigcontext.sc_rbp = ucp->uc_mcontext.gregs[REG_RBP];
- lx_ucp->uc_sigcontext.sc_rbx = ucp->uc_mcontext.gregs[REG_RBX];
- lx_ucp->uc_sigcontext.sc_rdx = ucp->uc_mcontext.gregs[REG_RDX];
- lx_ucp->uc_sigcontext.sc_rax = ucp->uc_mcontext.gregs[REG_RAX];
- lx_ucp->uc_sigcontext.sc_rcx = ucp->uc_mcontext.gregs[REG_RCX];
- lx_ucp->uc_sigcontext.sc_rsp = ucp->uc_mcontext.gregs[REG_RSP];
- lx_ucp->uc_sigcontext.sc_rip = ucp->uc_mcontext.gregs[REG_RIP];
- lx_ucp->uc_sigcontext.sc_eflags = ucp->uc_mcontext.gregs[REG_RFL];
- lx_ucp->uc_sigcontext.sc_cs = ucp->uc_mcontext.gregs[REG_CS];
- lx_ucp->uc_sigcontext.sc_gs = ucp->uc_mcontext.gregs[REG_GS];
- lx_ucp->uc_sigcontext.sc_fs = ucp->uc_mcontext.gregs[REG_FS];
- lx_ucp->uc_sigcontext.sc_pad0 = ucp->uc_mcontext.gregs[REG_SS];
- lx_ucp->uc_sigcontext.sc_err = ucp->uc_mcontext.gregs[REG_ERR];
- lx_ucp->uc_sigcontext.sc_trapno = ucp->uc_mcontext.gregs[REG_TRAPNO];
+ lx_ucp->uc_sigcontext.sc_r8 = LX_REG(ucp, REG_R8);
+ lx_ucp->uc_sigcontext.sc_r9 = LX_REG(ucp, REG_R9);
+ lx_ucp->uc_sigcontext.sc_r10 = LX_REG(ucp, REG_R10);
+ lx_ucp->uc_sigcontext.sc_r11 = LX_REG(ucp, REG_R11);
+ lx_ucp->uc_sigcontext.sc_r12 = LX_REG(ucp, REG_R12);
+ lx_ucp->uc_sigcontext.sc_r13 = LX_REG(ucp, REG_R13);
+ lx_ucp->uc_sigcontext.sc_r14 = LX_REG(ucp, REG_R14);
+ lx_ucp->uc_sigcontext.sc_r15 = LX_REG(ucp, REG_R15);
+ lx_ucp->uc_sigcontext.sc_rdi = LX_REG(ucp, REG_RDI);
+ lx_ucp->uc_sigcontext.sc_rsi = LX_REG(ucp, REG_RSI);
+ lx_ucp->uc_sigcontext.sc_rbp = LX_REG(ucp, REG_RBP);
+ lx_ucp->uc_sigcontext.sc_rbx = LX_REG(ucp, REG_RBX);
+ lx_ucp->uc_sigcontext.sc_rdx = LX_REG(ucp, REG_RDX);
+ lx_ucp->uc_sigcontext.sc_rax = LX_REG(ucp, REG_RAX);
+ lx_ucp->uc_sigcontext.sc_rcx = LX_REG(ucp, REG_RCX);
+ lx_ucp->uc_sigcontext.sc_rsp = LX_REG(ucp, REG_RSP);
+ lx_ucp->uc_sigcontext.sc_rip = LX_REG(ucp, REG_RIP);
+ lx_ucp->uc_sigcontext.sc_eflags = LX_REG(ucp, REG_RFL);
+ lx_ucp->uc_sigcontext.sc_cs = LX_REG(ucp, REG_CS);
+ lx_ucp->uc_sigcontext.sc_gs = LX_REG(ucp, REG_GS);
+ lx_ucp->uc_sigcontext.sc_fs = LX_REG(ucp, REG_FS);
+ lx_ucp->uc_sigcontext.sc_pad0 = LX_REG(ucp, REG_SS);
+ lx_ucp->uc_sigcontext.sc_err = LX_REG(ucp, REG_ERR);
+ lx_ucp->uc_sigcontext.sc_trapno = LX_REG(ucp, REG_TRAPNO);
#else /* is _ILP32 */
/*
@@ -1485,19 +1387,6 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
(uintptr_t)sip->si_addr : 0;
/*
- * Point the lx_siginfo_t pointer to the signal stack's lx_siginfo_t
- * if there was a Illumos siginfo_t to convert, otherwise set it to
- * NULL. For 64-bit code a NULL sip is handled in the lx_deliver
- * assembly code.
- */
-#if defined(_ILP32)
- if (sip != NULL)
- lx_ssp->sip = &lx_ssp->si;
- else
- lx_ssp->sip = NULL;
-#endif
-
- /*
* This should only return an error if the signum is invalid but that
* also gets converted into a LX_SIGKILL by this function.
*/
@@ -1529,76 +1418,21 @@ lx_build_signal_frame(int lx_sig, siginfo_t *sip, void *p, void *sp)
bcopy((void *)lx_rt_sigreturn_tramp, lx_ssp->trampoline,
sizeof (lx_ssp->trampoline));
#endif
-
- LX_SIGDELIVER(lx_sig, lxsap, lx_ssp, lx_ucp);
-
-#if defined(_LP64)
- /*
- * For the 64-bit code this must be the last syscall we do in the
- * emulation code path before we return back to the Linux signal
- * handler. This will disable native syscalls so the next time a
- * syscall happens on this thread, it will come back into the emulation.
- */
- (void) syscall(SYS_brand, B_CLR_NTV_SYSC_FLAG);
-#endif
-
- /* We return to lx_sigdeliver to jump into the Linux signal handler */
-}
-
-#if defined(_LP64)
-static void
-lx_vsyscall_return(long ret, ucontext_t *ucp)
-{
- lx_debug("\tvsyscall return val = %lX", ret);
- ucp->uc_mcontext.gregs[REG_RAX] = ret;
- /*
- * Simulate a 'ret' by grabbing the return address off the caller's
- * stack and incrementing rsp manually before sigreturning back.
- */
- (void) uucopy((void*)ucp->uc_mcontext.gregs[REG_RSP],
- &ucp->uc_mcontext.gregs[REG_RIP], sizeof (void*));
- lx_debug("\tvsyscall return to %p", ucp->uc_mcontext.gregs[REG_RIP]);
- ucp->uc_mcontext.gregs[REG_RSP] += sizeof (void*);
-
- /*
- * Make sure that libc's ul_sigmask reflects what the sigmask is about
- * to become.
- */
- thr_sigsetmask(SIG_SETMASK, &ucp->uc_sigmask, NULL);
-
- (void) syscall(SYS_brand, B_SIGNAL_RETURN, ucp);
}
-#endif
/*
- * This is the second level interposition handler for Linux signals.
+ * This is the interposition handler for Linux signals.
*/
static void
lx_call_user_handler(int sig, siginfo_t *sip, void *p)
{
void (*user_handler)();
void (*stk_builder)();
-#if defined(_ILP32)
- lx_tsd_t *lx_tsd;
- int err;
-#endif
struct lx_sigaction *lxsap;
ucontext_t *ucp = (ucontext_t *)p;
- uintptr_t gs;
size_t stksize;
int lx_sig;
- switch (sig) {
- case SIGCLD:
- /*
- * Signal to an interrupted waitpid() that it was interrupted
- * by a SIGCLD, and should restart to grab the wait status
- * this signal represented.
- */
- lx_had_sigchild = 1;
- break;
- }
-
/*
* If Illumos signal has no Linux equivalent, effectively ignore it.
*/
@@ -1615,18 +1449,6 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p)
lx_debug("lxsap @ 0x%p", lxsap);
/*
- * If the delivery of this signal interrupted a system call, we must
- * only restart it if sigaction(2) was used to set the SA_RESTART flag
- * for this signal. The lx_emulate() function checks this per-thread
- * variable to discover the restart disposition of the most recently
- * handled signal.
- *
- * NOTE: this mechanism may not stand up to close scrutiny in the face
- * of nested asynchronous signal delivery.
- */
- lx_do_syscall_restart = !!(lxsap->lxsa_flags & LX_SA_RESTART);
-
- /*
* Emulate vsyscall support.
*
* Linux magically maps a single page into the address space of each
@@ -1656,27 +1478,35 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p)
if (sig == SIGSEGV) {
int i;
for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) {
- if (lx_vsyscalls[i].lv_addr != (uintptr_t)sip->si_addr)
+ extern void lx_vsyscall_tramp(void);
+ uintptr_t addr = (uintptr_t)sip->si_addr;
+
+ if (lx_vsyscalls[i].lv_addr != addr)
continue;
+
/*
* Users of vsyscall must commit fully by using
* jmp/call to access the vsyscall. Cowardly reading data
* from the page beforehand isn't allowed or possible.
*/
- if (sip->si_addr !=
- (void*)ucp->uc_mcontext.gregs[REG_RIP])
+ if (addr != LX_REG(ucp, REG_PC))
continue;
- lx_debug(lx_vsyscalls[i].lv_msg,
- ucp->uc_mcontext.gregs[REG_RDI],
- ucp->uc_mcontext.gregs[REG_RSI],
- ucp->uc_mcontext.gregs[REG_RDX]);
- long ret = lx_vsyscalls[i].lv_func(
- ucp->uc_mcontext.gregs[REG_RDI],
- ucp->uc_mcontext.gregs[REG_RSI],
- ucp->uc_mcontext.gregs[REG_RDX]);
- lx_vsyscall_return(ret, ucp);
- assert(0);
+ lx_debug(lx_vsyscalls[i].lv_msg, LX_REG(ucp, REG_RDI),
+ LX_REG(ucp, REG_RSI), LX_REG(ucp, REG_RDX));
+
+ /*
+ * Modify the interrupted context so that, on return
+ * from the signal handler, the kernel revectors this
+ * LWP to the vsyscall trampoline. That trampoline
+ * will immediately invoke the "syscall" instruction
+ * and return to the address on the stack when
+ * complete.
+ */
+ LX_REG(ucp, REG_R0) = lx_vsyscalls[i].lv_scnum;
+ LX_REG(ucp, REG_PC) = (uintptr_t)&lx_vsyscall_tramp;
+ lx_debug("\treturning from signal handler\n");
+ return;
}
/*
@@ -1715,28 +1545,9 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p)
(lxsap->lxsa_handler == SIG_DFL) ? "SIG_DFL" : "SIG_IGN");
#if defined(_LP64)
- /* %gs is ignored in the 64-bit lx_sigdeliver */
- gs = 0;
-
stksize = sizeof (struct lx_sigstack);
stk_builder = lx_build_signal_frame;
-
#else
- if ((err = thr_getspecific(lx_tsd_key, (void **)&lx_tsd)) != 0)
- lx_err_fatal("lx_call_user_handler: unable to read "
- "thread-specific data: %s", strerror(err));
-
- assert(lx_tsd != 0);
-
- gs = lx_tsd->lxtsd_gs & 0xffff; /* gs is only 16 bits */
-
- /*
- * Any zero %gs value should be caught when a save is attempted in
- * lx_emulate(), but this extra check will catch any zero values due to
- * bugs in the library. This is only applicable to 32-bit code.
- */
- assert(gs != 0);
-
if (lxsap->lxsa_flags & LX_SA_SIGINFO) {
stksize = sizeof (struct lx_sigstack);
stk_builder = lx_build_signal_frame;
@@ -1748,22 +1559,333 @@ lx_call_user_handler(int sig, siginfo_t *sip, void *p)
user_handler = lxsap->lxsa_handler;
- lx_debug("delivering %d (lx %d) to handler at 0x%p with gs 0x%x", sig,
- lx_sig, lxsap->lxsa_handler, gs);
+ lx_debug("delivering %d (lx %d) to handler at 0x%p", sig, lx_sig,
+ lxsap->lxsa_handler);
if (lxsap->lxsa_flags & LX_SA_RESETHAND)
lxsap->lxsa_handler = SIG_DFL;
+ lx_sigdeliver(lx_sig, sip, ucp, stksize, stk_builder, user_handler,
+ lxsap);
+
/*
- * lx_sigdeliver() doesn't return, so it relies on the Linux signal
- * handler to clean up the stack, reset the current signal mask and
- * make a system call (sigreturn or rt_sigreturn) which is intended to
- * return to the code interrupted by the signal. The emulation will
- * catch that syscall, finish it's own cleanup, then actually return
- * back through here via lx_sigreturn_tolibc(), which leads us back
- * into libc and then back to the point where we were interrupted.
+ * We need to handle restarting system calls if requested by the
+ * program for this signal type:
+ */
+ if (lxsap->lxsa_flags & LX_SA_RESTART) {
+ uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
+ long ret = (long)LX_REG(ucp, REG_R0);
+ boolean_t interrupted = (ret == -lx_errno(EINTR));
+
+ /*
+ * If the system call returned EINTR, and the system
+ * call handler set "br_syscall_restart" when returning,
+ * we modify the context to try the system call again
+ * when we return from this signal handler.
+ */
+ if ((flags & LX_UC_RESTART_SYSCALL) && interrupted) {
+ int syscall_num = (int)(uintptr_t)ucp->uc_brand_data[2];
+
+ lx_debug("restarting interrupted system call %d",
+ syscall_num);
+
+ /*
+ * Both the "int 0x80" and the "syscall" instruction
+ * are two bytes long. Wind the program counter back
+ * to the start of this instruction.
+ *
+ * The system call we interrupted is preserved in the
+ * brand-specific data in the ucontext_t when the
+ * LX_UC_RESTART_SYSCALL flag is set. This is
+ * analogous to the "orig_[er]ax" field in the Linux
+ * "user_regs_struct".
+ */
+ LX_REG(ucp, REG_PC) -= 2;
+ LX_REG(ucp, REG_R0) = syscall_num;
+ }
+ }
+}
+
+/*
+ * The "lx_sigdeliver()" function is responsible for constructing the emulated
+ * signal delivery frame on the brand stack for this LWP. A context is saved
+ * on the stack which will be used by the "sigreturn(2)" family of emulated
+ * system calls to get us back here after the Linux signal handler returns.
+ * This function is modelled on the in-kernel "sendsig()" signal delivery
+ * mechanism.
+ *
+ * Arguments:
+ *	lx_sig		Linux signal number; handed to the stack builder and
+ *			to the LX_SIGDELIVER probe.
+ *	sip, ucp	siginfo and interrupted context; handed to the stack
+ *			builder. The brand flags and stack pointer stored in
+ *			"ucp" are used to locate the Linux stack.
+ *	stacksz		size of the frame written by the stack builder.
+ *	stack_builder	routine that writes the Linux signal frame; called as
+ *			stack_builder(lx_sig, sip, ucp, frame address, hargs).
+ *	user_handler	Linux handler; becomes the program counter of the
+ *			context we jump to.
+ *	lxsap		emulated sigaction for this signal; LX_SA_ONSTACK is
+ *			consulted to select the alternate signal stack.
+ *
+ * Control leaves this function through B_JUMP_TO_LINUX and only comes back
+ * (at the getcontext() call below) via the saved return context.
+ */
+void
+lx_sigdeliver(int lx_sig, siginfo_t *sip, ucontext_t *ucp, size_t stacksz,
+    void (*stack_builder)(), void (*user_handler)(),
+    struct lx_sigaction *lxsap)
+{
+	ucontext_t uc;
+	lx_tsd_t *lxtsd = lx_get_tsd();
+	int totsz = 0;
+	uintptr_t flags;
+	uintptr_t hargs[3];
+	/*
+	 * These variables must be "volatile", as they are modified after the
+	 * getcontext() stores the register state:
+	 */
+	volatile boolean_t signal_delivered = B_FALSE;
+	volatile uintptr_t lxfp;
+	volatile uintptr_t old_tsd_sp;
+	volatile int newstack;
+
+	/*
+	 * This function involves modifying the Linux process stack for this
+	 * thread. To do so without corruption requires us to exclude other
+	 * signal handlers (or emulated system calls called from within those
+	 * handlers) from running while we reserve space on that stack. We
+	 * defer the execution of further instances of lx_call_user_handler()
+	 * until we have completed this operation.
+	 */
+	_sigoff();
+
+	/*
+	 * Clear register arguments vector.
+	 */
+	bzero(hargs, sizeof (hargs));
+
+	/*
+	 * We save a context here so that we can be returned later to complete
+	 * handling the signal.
+	 *
+	 * The getcontext() call must not live inside assert(): it has a
+	 * required side effect, and the whole expression would be compiled
+	 * out if this file were ever built with NDEBUG, leaving "uc"
+	 * uninitialised while its address is still published in the
+	 * delivery frame below.
+	 */
+	lx_debug("lx_sigdeliver: STORING RETURN CONTEXT @ %p\n", &uc);
+	if (getcontext(&uc) != 0) {
+		lx_err_fatal("lx_sigdeliver: getcontext failed: %s",
+		    strerror(errno));
+	}
+	lx_debug("lx_sigdeliver: RETURN CONTEXT %p LINK %p FLAGS %lx\n",
+	    &uc, uc.uc_link, uc.uc_flags);
+	if (signal_delivered) {
+		/*
+		 * If the "signal_delivered" flag is set, we are returned here
+		 * via setcontext() as called by the emulated Linux signal
+		 * return system call.
+		 */
+		lx_debug("lx_sigdeliver: WE ARE BACK, VIA UC @ %p!\n", &uc);
+		goto after_signal_handler;
+	}
+	signal_delivered = B_TRUE;
+
+	/*
+	 * Preserve the current tsd value of the Linux process stack pointer,
+	 * even if it is zero. We will restore it when we are returned here
+	 * via setcontext() after the Linux process has completed execution of
+	 * its signal handler.
+	 */
+	old_tsd_sp = lxtsd->lxtsd_lx_sp;
+
+	/*
+	 * Figure out whether we will be handling this signal on an alternate
+	 * stack specified by the user.
+	 */
+	newstack = (lxsap->lxsa_flags & LX_SA_ONSTACK) &&
+	    !(lxtsd->lxtsd_sigaltstack.ss_flags & (LX_SS_ONSTACK |
+	    LX_SS_DISABLE));
+
+	/*
+	 * Find the first unused region of the Linux process stack, where
+	 * we will assemble our signal delivery frame.
+	 */
+	flags = (uintptr_t)ucp->uc_brand_data[0];
+	if (newstack) {
+		/*
+		 * We are moving to the user-provided alternate signal
+		 * stack.
+		 */
+		lxfp = SA((uintptr_t)lxtsd->lxtsd_sigaltstack.ss_sp) +
+		    SA(lxtsd->lxtsd_sigaltstack.ss_size) - STACK_ALIGN;
+		lx_debug("lx_sigdeliver: moving to ALTSTACK sp %p\n", lxfp);
+		LX_SIGNAL_ALTSTACK_ENABLE(lxfp);
+	} else if (flags & LX_UC_STACK_BRAND) {
+		/*
+		 * We interrupted the Linux process to take this signal. The
+		 * stack pointer is the one saved in this context.
+		 */
+		lxfp = LX_REG(ucp, REG_SP);
+	} else {
+		/*
+		 * We interrupted a native (emulation) routine, so we must get
+		 * the current stack pointer from either the tsd (if one is
+		 * stored there) or via the context chain.
+		 */
+		lxfp = lx_find_brand_sp();
+		if (lxtsd->lxtsd_lx_sp != 0) {
+			/*
+			 * We must also make room for the possibility of nested
+			 * signal delivery -- we may be pre-empting the
+			 * in-progress handling of another signal.
+			 *
+			 * Note that if we were already on the alternate stack,
+			 * any emulated Linux system calls would be betwixt
+			 * that original signal frame and this new one on the
+			 * one contiguous stack, so this logic holds either
+			 * way:
+			 */
+			lxfp = MIN(lxtsd->lxtsd_lx_sp, lxfp);
+		}
+	}
+
+	/*
+	 * Account for a reserved stack region (for amd64, this is 128 bytes),
+	 * and align the stack:
+	 */
+	lxfp -= STACK_RESERVE;
+	lxfp &= ~(STACK_ALIGN - 1);
+
+	/*
+	 * Allocate space on the Linux process stack for our delivery frame,
+	 * including:
+	 *
+	 *   ----------------------------------------------------- old %sp
+	 *   - lx_sigdeliver_frame_t
+	 *   - (ucontext_t pointers and stack magic)
+	 *   -----------------------------------------------------
+	 *   - (amd64-only 8-byte alignment gap)
+	 *   -----------------------------------------------------
+	 *   - frame of size "stacksz" from the stack builder
+	 *   ----------------------------------------------------- new %sp
+	 */
+#if defined(_LP64)
+	/*
+	 * The AMD64 ABI requires us to align the stack such that when the
+	 * called function pushes the base pointer, the stack is 16 byte
+	 * aligned. The stack must, therefore, be 8- but _not_ 16-byte
+	 * aligned.
+	 */
+#if (STACK_ALIGN != 16) || (STACK_ENTRY_ALIGN != 8)
+#error "lx_sigdeliver() did not find expected stack alignment"
+#endif
+	totsz = SA(sizeof (lx_sigdeliver_frame_t)) + SA(stacksz) + 8;
+	assert((totsz & (STACK_ENTRY_ALIGN - 1)) == 0);
+	assert((totsz & (STACK_ALIGN - 1)) == 8);
+#else
+	totsz = SA(sizeof (lx_sigdeliver_frame_t)) + SA(stacksz);
+	assert((totsz & (STACK_ALIGN - 1)) == 0);
+#endif
+
+	/*
+	 * Copy our return frame into place:
+	 */
+	lxfp -= SA(sizeof (lx_sigdeliver_frame_t));
+	lx_debug("lx_sigdeliver: lx_sigdeliver_frame_t @ %p\n", lxfp);
+	{
+		lx_sigdeliver_frame_t frm;
+
+		frm.lxsdf_magic = LX_SIGRT_MAGIC;
+		frm.lxsdf_retucp = &uc;
+		frm.lxsdf_sigucp = ucp;
+
+		lx_debug("lx_sigdeliver: retucp %p sigucp %p\n",
+		    frm.lxsdf_retucp, frm.lxsdf_sigucp);
+
+		if (uucopy(&frm, (void *)lxfp, sizeof (frm)) != 0) {
+			/*
+			 * We could not modify the stack of the emulated Linux
+			 * program. Act like the kernel and terminate the
+			 * program with a segmentation violation.
+			 */
+			(void) syscall(SYS_brand, B_EXIT_AS_SIG, SIGSEGV);
+		}
+
+		LX_SIGNAL_DELIVERY_FRAME_CREATE((void *)lxfp);
+	}
+
+	/*
+	 * Build the Linux signal handling frame:
+	 */
+#if defined(_LP64)
+	lxfp -= SA(stacksz) + 8;
+#else
+	lxfp -= SA(stacksz);
+#endif
+	lx_debug("lx_sigdeliver: Linux sig frame @ %p\n", lxfp);
+	stack_builder(lx_sig, sip, ucp, lxfp, hargs);
+
+	/*
+	 * Record our reservation so that any nested signal handlers
+	 * can see it.
+	 */
+	lx_debug("lx_sigdeliver: Linux tsd sp %p -> %p\n", lxtsd->lxtsd_lx_sp,
+	    lxfp);
+	lxtsd->lxtsd_lx_sp = lxfp;
+
+	if (newstack) {
+		lxtsd->lxtsd_sigaltstack.ss_flags |= LX_SS_ONSTACK;
+	}
+
+	LX_SIGDELIVER(lx_sig, lxsap, (void *)lxfp);
+
+	/*
+	 * Re-enable signal delivery. If a signal was queued while we were
+	 * in the critical section, it will be delivered immediately.
+	 */
+	_sigon();
+
+	/*
+	 * Pass control to the Linux signal handler:
+	 */
+	lx_debug("lx_sigdeliver: JUMPING TO LINUX (sig %d sp %p eip %p)\n",
+	    lx_sig, lxfp, user_handler);
+	{
+		ucontext_t jump_uc;
+
+		bcopy(lx_find_brand_uc(), &jump_uc, sizeof (jump_uc));
+
+		/*
+		 * We want to load the general registers from this context, and
+		 * switch to the BRAND stack. We do _not_ want to restore the
+		 * uc_link value from this synthetic context, as that would
+		 * break the signal handling context chain.
+		 */
+		jump_uc.uc_flags = UC_CPU;
+		jump_uc.uc_brand_data[0] = (void *)(LX_UC_STACK_BRAND |
+		    LX_UC_IGNORE_LINK);
+
+		LX_REG(&jump_uc, REG_FP) = 0;
+		LX_REG(&jump_uc, REG_SP) = lxfp;
+		LX_REG(&jump_uc, REG_PC) = (uintptr_t)user_handler;
+
+#if defined(_LP64)
+		/*
+		 * Pass signal handler arguments by registers on AMD64.
+		 */
+		LX_REG(&jump_uc, REG_RDI) = hargs[0];
+		LX_REG(&jump_uc, REG_RSI) = hargs[1];
+		LX_REG(&jump_uc, REG_RDX) = hargs[2];
+#endif
+
+		if (syscall(SYS_brand, B_JUMP_TO_LINUX, &jump_uc) == -1) {
+			lx_err_fatal("B_JUMP_TO_LINUX failed: %s",
+			    strerror(errno));
+		}
+	}
+
+	assert(0);
+
+after_signal_handler:
+	/*
+	 * Ensure all nested signal handlers have completed correctly
+	 * and then remove our stack reservation.
+	 */
+	_sigoff();
+	LX_SIGNAL_POST_HANDLER(lxfp, old_tsd_sp);
+	assert(lxtsd->lxtsd_lx_sp == lxfp);
+	lx_debug("lx_sigdeliver: after; Linux tsd sp %p -> %p\n", lxfp,
+	    old_tsd_sp);
+	lxtsd->lxtsd_lx_sp = old_tsd_sp;
+	if (newstack) {
+		LX_SIGNAL_ALTSTACK_DISABLE();
+		lx_debug("lx_sigdeliver: disabling ALTSTACK sp %p\n", lxfp);
+		lxtsd->lxtsd_sigaltstack.ss_flags &= ~LX_SS_ONSTACK;
+	}
+	_sigon();
+
+	/*
+	 * Here we return to libc so that it may clean up and restore the
+	 * context originally interrupted by this signal.
 	 */
-	lx_sigdeliver(lx_sig, sip, ucp, stksize, stk_builder, user_handler, gs);
 }
/*
@@ -1849,12 +1971,17 @@ lx_sigaction_common(int lx_sig, struct lx_sigaction *lxsp,
*/
sa.sa_flags = SA_SIGINFO;
+ /*
+ * When translating from Linux to illumos
+ * sigaction(2) flags, we explicitly do not
+ * pass SA_ONSTACK to the kernel. The
+ * alternate stack for Linux signal handling is
+ * handled entirely by the emulation code.
+ */
if (lxsa.lxsa_flags & LX_SA_NOCLDSTOP)
sa.sa_flags |= SA_NOCLDSTOP;
if (lxsa.lxsa_flags & LX_SA_NOCLDWAIT)
sa.sa_flags |= SA_NOCLDWAIT;
- if (lxsa.lxsa_flags & LX_SA_ONSTACK)
- sa.sa_flags |= SA_ONSTACK;
if (lxsa.lxsa_flags & LX_SA_RESTART)
sa.sa_flags |= SA_RESTART;
if (lxsa.lxsa_flags & LX_SA_NODEFER)
@@ -2079,61 +2206,11 @@ lx_signal(uintptr_t lx_sig, uintptr_t handler)
}
#endif
-#if defined(_ILP32)
-/*
- * This is only used in 32-bit code and is called by the assembly routine
- * lx_sigacthandler.
- *
- * This C routine saves the passed %gs value into the thread-specific save area.
- */
-void
-lx_sigsavegs(uintptr_t signalled_gs)
-{
- lx_tsd_t *lx_tsd;
- int err;
-
- signalled_gs &= 0xffff; /* gs is only 16 bits */
-
- /*
- * While a %gs of 0 is technically legal (as long as the application
- * never dereferences memory using %gs), Illumos has its own ideas as
- * to how a zero %gs should be handled in _update_sregs(), such that
- * any 32-bit user process with a %gs of zero running on a system with
- * a 64-bit kernel will have its %gs hidden base register stomped on on
- * return from a system call, leaving an incorrect base address in
- * place until the next time %gs is actually reloaded (forcing a reload
- * of the base address from the appropriate descriptor table.)
- *
- * Of course the kernel will once again stomp on THAT base address when
- * returning from a system call, resulting in an application
- * segmentation fault.
- *
- * To avoid this situation, disallow a save of a zero %gs here in order
- * to try and capture any Linux process that takes a signal with a zero
- * %gs installed.
- */
- assert(signalled_gs != 0);
-
- if (signalled_gs != LWPGS_SEL) {
- if ((err = thr_getspecific(lx_tsd_key,
- (void **)&lx_tsd)) != 0)
- lx_err_fatal("sigsavegs: unable to read "
- "thread-specific data: %s", strerror(err));
-
- assert(lx_tsd != 0);
-
- lx_tsd->lxtsd_gs = signalled_gs;
- lx_debug("lx_sigsavegs(): gsp 0x%p, saved gs: 0x%x\n",
- lx_tsd, signalled_gs);
- }
-}
-#endif
-
int
lx_siginit(void)
{
extern void set_setcontext_enforcement(int);
- extern void lx_sigacthandler(int, siginfo_t *, void *);
+ extern void set_escaped_context_cleanup(int);
struct sigaction sa;
sigset_t new_set, oset;
@@ -2162,38 +2239,6 @@ lx_siginit(void)
(void) sigignore(sig);
/*
- * As mentioned previously, when a user signal handler is installed
- * via sigaction(), libc interposes on the mechanism by actually
- * installing an internal routine sigacthandler() as the signal
- * handler. On receipt of the signal, libc does some thread-related
- * processing via sigacthandler(), then calls the registered user
- * signal handler on behalf of the user.
- *
- * For 32-bit code we need to interpose on that mechanism to make sure
- * the correct %gs segment register value is installed before the libc
- * routine is called, otherwise the libc code will die with a
- * segmentation fault.
- *
- * For 64-bit code we overload the %gs register as a mechanism to pass
- * the syscall mode flag out of the kernel.
- *
- * The private libc routine setsigacthandler() will set our
- * interposition routine, lx_sigacthandler(), as the default
- * "sigacthandler" routine for all new signal handlers for this
- * thread. We also use this in 64-bit code to set the libc interposition
- * routine for setting the context when returning from a signal handler.
- * This is needed so we can combine changing the syscall mode flag and
- * doing __setcontext() in one call.
- */
-#if defined(_LP64)
- setsigacthandler(lx_sigacthandler, &libc_sigacthandler, lx_setcontext);
-#else
- setsigacthandler(lx_sigacthandler, &libc_sigacthandler, NULL);
-#endif
- lx_debug("lx_sigacthandler installed, libc_sigacthandler = 0x%p",
- libc_sigacthandler);
-
- /*
* Mark any signals that are ignored as ignored in our interposition
* handler array
*/
@@ -2239,7 +2284,17 @@ lx_siginit(void)
set_setcontext_enforcement(0);
/*
- * Reset the signal mask to what we came in with
+ * The illumos libc attempts to clean up dangling uc_link pointers in
+ * signal handling contexts when libc believes us to have escaped a
+ * signal handler incorrectly in the past. We want to disable this
+ * behaviour, so that the system call emulation context saved by the
+ * kernel brand module for lx_emulate() may be part of the context
+ * chain without itself being used for signal handling.
+ */
+ set_escaped_context_cleanup(0);
+
+ /*
+ * Reset the signal mask to what we came in with.
*/
(void) sigprocmask(SIG_SETMASK, &oset, NULL);
@@ -2248,7 +2303,7 @@ lx_siginit(void)
}
/*
- * This code stongly resemebles lx_poll(), but is here to be able to take
+ * This code strongly resembles lx_poll(), but is here to be able to take
* advantage of the Linux signal helper routines.
*/
long
@@ -2545,11 +2600,9 @@ lx_rt_sigqueueinfo(uintptr_t p1, uintptr_t p2, uintptr_t p3)
siginfo.si_pid = lx_siginfo.lsi_pid;
siginfo.si_value = lx_siginfo.lsi_value;
siginfo.si_uid = lx_siginfo.lsi_uid;
- return ((syscall(SYS_brand, B_IKE_SYSCALL +
- LX_EMUL_rt_sigqueueinfo, tgid, sig, &siginfo)) ?
- (-errno) : 0);
+ return ((syscall(SYS_brand, B_HELPER_SIGQUEUE,
+ tgid, sig, &siginfo)) ? (-errno) : 0);
}
-
}
/*
@@ -2587,7 +2640,6 @@ lx_rt_tgsigqueueinfo(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
siginfo.si_value = lx_siginfo.lsi_value;
siginfo.si_uid = lx_siginfo.lsi_uid;
- return ((syscall(SYS_brand, B_IKE_SYSCALL +
- LX_EMUL_rt_tgsigqueueinfo, tgid, tid, sig, &siginfo)) ?
- (-errno) : 0);
+ return ((syscall(SYS_brand, B_HELPER_TGSIGQUEUE, tgid, tid, sig,
+ &siginfo)) ? (-errno) : 0);
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/socket.c b/usr/src/lib/brand/lx/lx_brand/common/socket.c
index fa925628e7..b8c2c31582 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/socket.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/socket.c
@@ -648,14 +648,28 @@ ltos_xform_cmsgs(struct lx_msghdr *msg, struct cmsghdr *ntv_cmsg)
static int
stol_xform_cmsgs(struct lx_msghdr *msg, lx_cmsghdr64_t *lx_cmsg)
{
+ struct lx_msghdr tmsg;
lx_cmsghdr64_t *lcmsg, *last;
struct cmsghdr *cmsg, *lp;
int nlen = 0;
int err = 0;
- lcmsg = lx_cmsg;
+ /*
+ * Create a temporary "struct lx_msghdr" so that we can use the
+ * LX_CMSG_*HDR() iteration macros.
+ */
+ tmsg = *msg;
+ tmsg.msg_control = lx_cmsg;
+ tmsg.msg_controllen = msg->msg_controllen + LX_CMSG_EXTRA;
+
+ lcmsg = LX_CMSG_FIRSTHDR(&tmsg);
cmsg = CMSG_FIRSTHDR(msg);
while (cmsg != NULL && err == 0) {
+ if (lcmsg == NULL) {
+ err = ENOTSUP;
+ break;
+ }
+
lcmsg->cmsg_len =
LX_CMSG_LEN(cmsg->cmsg_len - sizeof (struct cmsghdr));
lcmsg->cmsg_level = cmsg->cmsg_level;
@@ -668,12 +682,13 @@ stol_xform_cmsgs(struct lx_msghdr *msg, lx_cmsghdr64_t *lx_cmsg)
cmsg = CMSG_NXTHDR(msg, lp);
last = lcmsg;
- lcmsg = LX_CMSG_NXTHDR(msg, last);
+ lcmsg = LX_CMSG_NXTHDR(&tmsg, last);
nlen += (int)((uint64_t)lcmsg - (uint64_t)last);
- if (nlen > (msg->msg_controllen + LX_CMSG_EXTRA))
+ if (nlen > (msg->msg_controllen + LX_CMSG_EXTRA)) {
err = ENOTSUP;
+ }
}
if (err) {
@@ -876,7 +891,7 @@ ltos_sockaddr(struct sockaddr *addr, socklen_t *len,
case AF_INET6:
/*
- * The Solaris sockaddr_in6 has one more 32-bit field
+ * The illumos sockaddr_in6 has one more 32-bit field
* than the Linux version. We assume the caller has
* zeroed the sockaddr we're copying into.
*/
@@ -1063,7 +1078,7 @@ convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom,
/*
* Linux does not allow the app to specify IP Protocol for raw
- * sockets. Solaris does, so bail out here.
+ * sockets. Illumos does, so bail out here.
*/
if (domain == AF_INET && type == SOCK_RAW && in_protocol == IPPROTO_IP)
return (-ESOCKTNOSUPPORT);
@@ -1092,25 +1107,25 @@ convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom,
static int
convert_sockflags(int lx_flags, char *call)
{
- int solaris_flags = 0;
+ int native_flags = 0;
if (lx_flags & LX_MSG_OOB) {
- solaris_flags |= MSG_OOB;
+ native_flags |= MSG_OOB;
lx_flags &= ~LX_MSG_OOB;
}
if (lx_flags & LX_MSG_PEEK) {
- solaris_flags |= MSG_PEEK;
+ native_flags |= MSG_PEEK;
lx_flags &= ~LX_MSG_PEEK;
}
if (lx_flags & LX_MSG_DONTROUTE) {
- solaris_flags |= MSG_DONTROUTE;
+ native_flags |= MSG_DONTROUTE;
lx_flags &= ~LX_MSG_DONTROUTE;
}
if (lx_flags & LX_MSG_CTRUNC) {
- solaris_flags |= MSG_CTRUNC;
+ native_flags |= MSG_CTRUNC;
lx_flags &= ~LX_MSG_CTRUNC;
}
@@ -1120,22 +1135,22 @@ convert_sockflags(int lx_flags, char *call)
}
if (lx_flags & LX_MSG_TRUNC) {
- solaris_flags |= MSG_TRUNC;
+ native_flags |= MSG_TRUNC;
lx_flags &= ~LX_MSG_TRUNC;
}
if (lx_flags & LX_MSG_DONTWAIT) {
- solaris_flags |= MSG_DONTWAIT;
+ native_flags |= MSG_DONTWAIT;
lx_flags &= ~LX_MSG_DONTWAIT;
}
if (lx_flags & LX_MSG_EOR) {
- solaris_flags |= MSG_EOR;
+ native_flags |= MSG_EOR;
lx_flags &= ~LX_MSG_EOR;
}
if (lx_flags & LX_MSG_WAITALL) {
- solaris_flags |= MSG_WAITALL;
+ native_flags |= MSG_WAITALL;
lx_flags &= ~LX_MSG_WAITALL;
}
@@ -1200,7 +1215,7 @@ convert_sockflags(int lx_flags, char *call)
lx_unsupported("%s: unknown socket flag(s) 0x%x", call,
lx_flags);
- return (solaris_flags);
+ return (native_flags);
}
long
@@ -1374,7 +1389,7 @@ lx_accept(int sockfd, void *name, int *nlp)
* If it is NULL, we don't care about the namelen pointer's value
* or about dereferencing it.
*
- * Happily, Solaris' accept(3SOCKET) treats NULL name pointers and
+ * Happily, illumos' accept(3SOCKET) treats NULL name pointers and
* zero namelens the same way.
*/
if ((name != NULL) &&
@@ -1948,7 +1963,7 @@ lx_getsockopt(int sockfd, int level, int optname, void *optval, int *optlenp)
/*
* According to the Linux man page, a NULL optval should indicate
- * (as in Solaris) that no return value is expected. Instead, it
+ * (as in illumos) that no return value is expected. Instead, it
* actually triggers an EFAULT error.
*/
if (optval == NULL)
@@ -2132,7 +2147,7 @@ lx_sendmsg(int sockfd, void *lmp, int flags)
/*
* If there are control messages bundled in this message, we need
- * to convert them from Linux to Solaris.
+ * to convert them from Linux to illumos.
*/
if (msg.msg_control != NULL) {
if (msg.msg_controllen == 0) {
@@ -2213,6 +2228,7 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
void *new_cmsg = NULL;
int r, err;
socklen_t len, orig_len = 0;
+ void *msg_control = NULL;
int nosigpipe = flags & LX_MSG_NOSIGNAL;
struct sigaction newact, oact;
@@ -2238,8 +2254,7 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
len = sizeof (struct sockaddr);
if (getsockname(sockfd, &sname, &len) < 0)
len = sizeof (struct sockaddr);
- if ((name = SAFE_ALLOCA(len)) == NULL)
- return (-ENOMEM);
+ name = alloca(len);
orig_name = msg.msg_name;
orig_len = msg.msg_namelen;
msg.msg_name = name;
@@ -2256,14 +2271,25 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
if (msg.msg_controllen == 0) {
msg.msg_control = NULL;
} else {
- msg.msg_control = SAFE_ALLOCA(msg.msg_controllen);
- if (msg.msg_control == NULL)
- return (-EINVAL);
+ /*
+ * Note that control message buffers can be quite
+ * long, e.g. 128KB or more. The native stack is
+ * not big enough for these two allocations so we
+ * use malloc(3C).
+ */
+ lx_debug("\tmsg.msg_controllen = %d",
+ msg.msg_controllen);
+ if ((msg_control = malloc(msg.msg_controllen)) ==
+ NULL) {
+ return (-ENOMEM);
+ }
+ msg.msg_control = msg_control;
#if defined(_LP64)
- new_cmsg = SAFE_ALLOCA(msg.msg_controllen +
- LX_CMSG_EXTRA);
- if (new_cmsg == NULL)
+ if ((new_cmsg = malloc(msg.msg_controllen +
+ LX_CMSG_EXTRA)) == NULL) {
+ free(msg_control);
return (-EINVAL);
+ }
#endif
}
}
@@ -2283,29 +2309,37 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
newact.sa_flags = 0;
(void) sigemptyset(&newact.sa_mask);
- if (sigaction(SIGPIPE, &newact, &oact) < 0)
+ if (sigaction(SIGPIPE, &newact, &oact) < 0) {
lx_err_fatal("recvmsg(): could not ignore SIGPIPE to "
"emulate LX_MSG_NOSIGNAL");
+ }
}
r = _so_recvmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2);
- if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
+ if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0)) {
lx_err_fatal("recvmsg(): could not reset SIGPIPE handler to "
"emulate LX_MSG_NOSIGNAL");
+ }
if (r >= 0 && msg.msg_controllen >= sizeof (struct cmsghdr)) {
/*
- * If there are control messages bundled in this message,
- * we need to convert them from Linux to Solaris.
+ * If there are control messages bundled in this message, we
+ * need to convert them from native illumos to Linux format.
*/
if ((err = convert_cmsgs(SOL_TO_LX, &msg, new_cmsg,
- "recvmsg()")) != 0)
+ "recvmsg()")) != 0) {
+ free(msg_control);
+ free(new_cmsg);
return (-err);
+ }
if ((uucopy(msg.msg_control, cmsg,
- msg.msg_controllen)) != 0)
+ msg.msg_controllen)) != 0) {
+ free(msg_control);
+ free(new_cmsg);
return (-errno);
+ }
}
msg.msg_control = cmsg;
@@ -2314,8 +2348,11 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
if (msg.msg_name != NULL) {
err = stol_sockaddr(orig_name, &msg.msg_namelen, msg.msg_name,
msg.msg_namelen, orig_len);
- if (err != 0)
+ if (err != 0) {
+ free(msg_control);
+ free(new_cmsg);
return (-err);
+ }
msg.msg_name = orig_name;
}
@@ -2324,9 +2361,14 @@ lx_recvmsg(int sockfd, void *lmp, int flags)
* call, so copy their values back to the caller. Rather than iterate,
* just copy the whole structure back.
*/
- if (uucopy(&msg, lmp, sizeof (msg)) != 0)
+ if (uucopy(&msg, lmp, sizeof (msg)) != 0) {
+ free(msg_control);
+ free(new_cmsg);
return (-errno);
+ }
+ free(msg_control);
+ free(new_cmsg);
return ((r < 0) ? -errno : r);
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/stack.c b/usr/src/lib/brand/lx/lx_brand/common/stack.c
new file mode 100644
index 0000000000..6ddb2c1527
--- /dev/null
+++ b/usr/src/lib/brand/lx/lx_brand/common/stack.c
@@ -0,0 +1,280 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Manage the native/emulation stack for LX-branded LWPs.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <errno.h>
+
+#include <thread.h>
+#include <sys/mman.h>
+#include <sys/brand.h>
+#include <sys/syscall.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_debug.h>
+#include <sys/lx_thread.h>
+
+
+typedef struct lx_stack_list_ent {
+ thread_t sle_tid;
+ void *sle_stack;
+ size_t sle_stack_size;
+ lx_tsd_t *sle_tsd;
+} lx_stack_list_ent_t;
+
+static mutex_t lx_stack_list_lock = DEFAULTMUTEX;
+lx_stack_list_ent_t *lx_stack_list = NULL;
+unsigned int lx_stack_list_elems = 0;
+
+/*
+ * Usermode emulation alternate stack size, expressed as a page count:
+ */
+int lx_native_stack_page_count = LX_NATIVE_STACK_PAGE_COUNT;
+
+/*
+ * We use these private functions from libc to suspend signal delivery in
+ * critical sections:
+ */
+extern void _sigon(void);
+extern void _sigoff(void);
+
+/*
+ * Release the native emulation stack registered for the calling thread,
+ * free its thread-specific data, and clear its slot in the stack list.
+ */
+void
+lx_free_stack(void)
+{
+	thread_t self = thr_self();
+	lx_stack_list_ent_t *ent = NULL;
+	unsigned int idx;
+
+	_sigoff();
+	mutex_lock(&lx_stack_list_lock);
+
+	/*
+	 * Locate the list entry that belongs to this thread.
+	 */
+	for (idx = 0; idx < lx_stack_list_elems; idx++) {
+		if (lx_stack_list[idx].sle_tid == self) {
+			ent = &lx_stack_list[idx];
+			break;
+		}
+	}
+
+	if (ent == NULL) {
+		/*
+		 * No entry for this thread was found in the list.
+		 */
+		assert(0);
+		return;
+	}
+
+	(void) munmap(ent->sle_stack, ent->sle_stack_size);
+
+	/*
+	 * Release the thread-specific data recorded alongside the stack.
+	 */
+	if (ent->sle_tsd != NULL) {
+		free(ent->sle_tsd->lxtsd_clone_state);
+		free(ent->sle_tsd);
+	}
+
+	/*
+	 * Zero the entry so that the slot reads as a hole.
+	 */
+	bzero(ent, sizeof (*ent));
+
+	mutex_unlock(&lx_stack_list_lock);
+	_sigon();
+}
+
+/*
+ * Called after fork1(): discard the stack and thread-specific data of every
+ * list entry except the one for the calling thread, then collapse the list
+ * down to that single entry.
+ */
+void
+lx_free_other_stacks(void)
+{
+	thread_t self = thr_self();
+	int mine = -1;
+	int idx;
+
+	_sigoff();
+	mutex_lock(&lx_stack_list_lock);
+
+	for (idx = 0; idx < lx_stack_list_elems; idx++) {
+		lx_stack_list_ent_t *ent = &lx_stack_list[idx];
+
+		if (ent->sle_tid == self) {
+			/*
+			 * Remember our own slot; its stack stays mapped.
+			 */
+			mine = idx;
+		} else if (ent->sle_tid != 0) {
+			/*
+			 * A live entry for some other thread (holes have a
+			 * zero tid): drop its tsd and unmap its stack.
+			 */
+			if (ent->sle_tsd != NULL) {
+				free(ent->sle_tsd->lxtsd_clone_state);
+				free(ent->sle_tsd);
+			}
+
+			(void) munmap(ent->sle_stack, ent->sle_stack_size);
+		}
+	}
+
+	/*
+	 * The calling thread must have had an entry in the list.
+	 */
+	assert(mine != -1);
+
+	/*
+	 * Move our entry to the front and shrink the list to one element.
+	 */
+	if (mine != 0) {
+		lx_stack_list[0] = lx_stack_list[mine];
+	}
+	lx_stack_list_elems = 1;
+	lx_stack_list = realloc(lx_stack_list, lx_stack_list_elems *
+	    sizeof (lx_stack_list[0]));
+	if (lx_stack_list == NULL) {
+		lx_err_fatal("failed to shrink stack list: %s",
+		    strerror(errno));
+	}
+
+	mutex_unlock(&lx_stack_list_lock);
+	_sigon();
+}
+
+/*
+ * Allocate an alternate stack for the execution of native emulation routines.
+ * This routine is based, in part, on find_stack() from libc.
+ *
+ * The stack is an anonymous, private, MAP_NORESERVE mapping of
+ * "lx_native_stack_page_count" pages. On success, the base address and size
+ * of the mapping are stored through "nstack" and "nstack_size" and 0 is
+ * returned. On failure, -1 is returned with errno as set by mmap().
+ */
+int
+lx_alloc_stack(void **nstack, size_t *nstack_size)
+{
+	static int pagesize = 0;
+	static int stackprot = 0;
+	int stacksize = 0;
+	void *stack;
+#if DEBUG
+	size_t pos;
+#endif
+
+	/*
+	 * Fetch configuration once:
+	 */
+	if (pagesize == 0) {
+		pagesize = _sysconf(_SC_PAGESIZE);
+		assert(pagesize > 0);
+	}
+	if (stackprot == 0) {
+		long lprot = _sysconf(_SC_STACK_PROT);
+
+		/*
+		 * Fall back to read/write if no stack protection value is
+		 * reported.
+		 */
+		stackprot = lprot > 0 ? lprot : (PROT_READ | PROT_WRITE);
+	}
+
+	stacksize = lx_native_stack_page_count * pagesize;
+
+	if ((stack = mmap(NULL, stacksize, stackprot, MAP_PRIVATE |
+	    MAP_NORESERVE | MAP_ANON, -1, (off_t)0)) == MAP_FAILED) {
+		/*
+		 * Preserve the mmap() error across the debug call so that
+		 * the caller sees it.
+		 */
+		int en = errno;
+		lx_debug("lx_alloc_stack: failed to allocate stack: %s",
+		    strerror(en));
+		errno = en;
+		return (-1);
+	}
+
+#if DEBUG
+	/*
+	 * Write a recognisable pattern into the allocated stack pages. The
+	 * index "pos" was previously undeclared, which broke DEBUG builds;
+	 * the bound now also covers the final word of the mapping.
+	 */
+	for (pos = 0; pos < (stacksize / sizeof (uint32_t)); pos++) {
+		((uint32_t *)stack)[pos] = 0x0facade0;
+	}
+#endif
+
+	*nstack = stack;
+	*nstack_size = stacksize;
+
+	return (0);
+}
+
+/*
+ * Register a native emulation stack for the calling thread and tell the
+ * kernel where its top is. When "stack" is NULL a fresh stack is allocated
+ * with lx_alloc_stack(). The stack, its size and "tsd" are recorded in the
+ * global stack list under this thread's id.
+ */
+void
+lx_install_stack(void *stack, size_t stacksize, lx_tsd_t *tsd)
+{
+	thread_t me = thr_self();
+	unsigned int slot;
+	uintptr_t stack_top;
+
+	/*
+	 * No stack supplied by the caller: allocate one now.
+	 */
+	if (stack == NULL && lx_alloc_stack(&stack, &stacksize) == -1) {
+		lx_err_fatal("failed to allocate stack for thread "
+		    "%d: %s", me, strerror(errno));
+	}
+
+	/*
+	 * Record the stack in the global list, reusing the first hole
+	 * (zero tid) if there is one and growing the list otherwise.
+	 */
+	_sigoff();
+	mutex_lock(&lx_stack_list_lock);
+
+	slot = 0;
+	while (slot < lx_stack_list_elems) {
+		/*
+		 * This thread must not already have an entry.
+		 */
+		assert(lx_stack_list[slot].sle_tid != me);
+		if (lx_stack_list[slot].sle_tid == 0) {
+			break;
+		}
+		slot++;
+	}
+	if (slot == lx_stack_list_elems) {
+		lx_stack_list_elems++;
+		lx_stack_list = realloc(lx_stack_list, lx_stack_list_elems *
+		    sizeof (lx_stack_list[0]));
+		if (lx_stack_list == NULL) {
+			lx_err_fatal("failed to extend stack list: %s",
+			    strerror(errno));
+		}
+	}
+	lx_stack_list[slot].sle_tsd = tsd;
+	lx_stack_list[slot].sle_stack_size = stacksize;
+	lx_stack_list[slot].sle_stack = stack;
+	lx_stack_list[slot].sle_tid = me;
+
+	mutex_unlock(&lx_stack_list_lock);
+	_sigon();
+
+	/*
+	 * Hand the kernel the address just past the end of the mapping as
+	 * the native stack pointer for this LWP.
+	 */
+	stack_top = (uintptr_t)stack + stacksize;
+	lx_debug("stack %p stack_top %p\n", stack, stack_top);
+	if (syscall(SYS_brand, B_SET_NATIVE_STACK, stack_top) != 0) {
+		lx_err_fatal("unable to set native stack: %s", strerror(errno));
+	}
+}
diff --git a/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s b/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s
index 2b382c9f76..bce7f0005c 100644
--- a/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s
+++ b/usr/src/lib/brand/lx/lx_brand/i386/lx_handler.s
@@ -21,7 +21,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
@@ -41,39 +41,6 @@
#define LX_SYS_sigreturn 119
#define LX_SYS_rt_sigreturn 173
-#define PIC_SETUP(r) \
- call 9f; \
-9: popl r; \
- addl $_GLOBAL_OFFSET_TABLE_ + [. - 9b], r
-
-/*
- * Each JMP must occupy 16 bytes
- */
-#define JMP \
- pushl $_CONST(. - lx_handler_table); \
- jmp lx_handler; \
- .align 16;
-
-#define JMP4 JMP; JMP; JMP; JMP
-#define JMP16 JMP4; JMP4; JMP4; JMP4
-#define JMP64 JMP16; JMP16; JMP16; JMP16
-#define JMP256 JMP64; JMP64; JMP64; JMP64
-
-/*
- * Alternate jump table that turns on lx_traceflag before proceeding with
- * the normal emulation routine.
- */
-#define TJMP \
- pushl $_CONST(. - lx_handler_trace_table); \
- jmp lx_handler_trace; \
- .align 16;
-
-#define TJMP4 TJMP; TJMP; TJMP; TJMP
-#define TJMP16 TJMP4; TJMP4; TJMP4; TJMP4
-#define TJMP64 TJMP16; TJMP16; TJMP16; TJMP16
-#define TJMP256 TJMP64; TJMP64; TJMP64; TJMP64
-
-
#if defined(lint)
#include <sys/types.h>
@@ -81,31 +48,6 @@
#include <sys/signal.h>
void
-lx_handler_table(void)
-{}
-
-void
-lx_handler(void)
-{}
-
-/* ARGSUSED */
-void
-lx_setup_clone(uintptr_t gs, void *retaddr, void *stk)
-{}
-
-/* ARGSUSED */
-void
-lx_sigdeliver(int sig, siginfo_t *sip, void *p, size_t stacksz,
- void (*stack_frame_builder)(void), void (*lx_sighandler)(void),
- uintptr_t gs)
-{}
-
-/* ARGSUSED */
-void
-lx_sigacthandler(int sig, siginfo_t *s, void *p)
-{}
-
-void
lx_sigreturn_tramp(void)
{}
@@ -113,134 +55,8 @@ void
lx_rt_sigreturn_tramp(void)
{}
-/* ARGSUSED */
-void
-lx_sigreturn_tolibc(uintptr_t sp)
-{}
-
#else /* lint */
- /*
- * On entry to this table, %eax will hold the return address. The
- * location where we enter the table is a function of the system
- * call number. The table needs the same alignment as the individual
- * entries.
- */
- .align 16
- ENTRY_NP(lx_handler_trace_table)
- TJMP256
- TJMP64
- TJMP64
- SET_SIZE(lx_handler_trace_table)
-
- .align 16
- ENTRY_NP(lx_handler_table)
- JMP256
- JMP64
- JMP64
- SET_SIZE(lx_handler_table)
-
- ENTRY_NP(lx_handler_trace)
- pushl %esi
- PIC_SETUP(%esi)
- movl lx_traceflag@GOT(%esi), %esi
- movl $1, (%esi)
- popl %esi
- /*
- * While we could just fall through to lx_handler(), we "tail-call" it
- * instead to make ourselves a little more comprehensible to trace
- * tools.
- */
- jmp lx_handler
- SET_SIZE(lx_handler_trace)
-
- ALTENTRY(lx_handler)
- /*
- * %ebp isn't always going to be a frame pointer on Linux, but when
- * it is, saving it here lets us have a coherent stack backtrace.
- */
- pushl %ebp
-
- /*
- * Fill in a lx_regs_t structure on the stack.
- */
- subl $SIZEOF_LX_REGS_T, %esp
-
- /*
- * Save %ebp and then fill it with what would be its usual value as
- * the frame pointer. The value we save for %esp needs to be the
- * stack pointer at the time of the interrupt so we need to skip the
- * saved %ebp and (what will be) the return address.
- */
- movl %ebp, LXR_EBP(%esp)
- movl %esp, %ebp
- addl $_CONST(SIZEOF_LX_REGS_T), %ebp
- movl %ebp, LXR_ESP(%esp)
- addl $_CONST(_MUL(CPTRSIZE, 2)), LXR_ESP(%esp)
-
- movl $0, LXR_GS(%esp)
- movw %gs, LXR_GS(%esp)
- movl %edi, LXR_EDI(%esp)
- movl %esi, LXR_ESI(%esp)
- movl %ebx, LXR_EBX(%esp)
- movl %edx, LXR_EDX(%esp)
- movl %ecx, LXR_ECX(%esp)
- movl %eax, LXR_EIP(%esp)
-
- /*
- * The kernel drops us into the middle of one of the tables above
- * that then pushes that table offset onto the stack, and calls into
- * lx_handler. That offset indicates the system call number while
- * %eax holds the return address for the system call. We replace the
- * value on the stack with the return address, and use the value to
- * compute the system call number by dividing by the table entry size.
- */
- xchgl CPTRSIZE(%ebp), %eax
- shrl $4, %eax
- movl %eax, LXR_EAX(%esp)
-
- /*
- * Switch to the Solaris libc's %gs.
- */
- movl $LWPGS_SEL, %ebx
- movw %bx, %gs
-
- /*
- * Call lx_emulate() whose only argument is a pointer to the
- * lx_regs_t structure we've placed on the stack.
- */
- pushl %esp
- call lx_emulate
-
- /*
- * We use this global symbol to identify this return site when
- * walking the stack backtrace. It needs to remain immediately
- * after the call to lx_emulate().
- */
- ALTENTRY(lx_emulate_done)
-
- /*
- * Clean up the argument to lx_emulate().
- */
- addl $4, %esp
-
- /*
- * Restore the saved register state; we get %ebp, %esp and %esp from
- * the ordinary locations rather than the saved state.
- */
- movl LXR_EDI(%esp), %edi
- movl LXR_ESI(%esp), %esi
- movl LXR_EBX(%esp), %ebx
- movl LXR_EDX(%esp), %edx
- movl LXR_ECX(%esp), %ecx
- movl LXR_EAX(%esp), %eax
- movw LXR_GS(%esp), %gs
-
- movl %ebp, %esp
- popl %ebp
- ret
- SET_SIZE(lx_handler)
-
ENTRY_NP(lx_swap_gs)
push %eax /* save the current eax value */
movl 0xc(%esp),%eax /* 2nd param is a pointer */
@@ -251,102 +67,6 @@ lx_sigreturn_tolibc(uintptr_t sp)
ret
SET_SIZE(lx_swap_gs)
- ENTRY_NP(lx_setup_clone)
- xorl %ebp, %ebp /* terminating stack */
- popl %edx /* eat the clone_start() return address */
- popl %gs /* Switch back to the Linux libc's %gs */
- popl %edx /* Linux clone() return address */
- popl %esp /* New stack pointer */
- xorl %eax, %eax /* child returns 0 to SYS_clone() */
- jmp *%edx /* return to Linux app. */
- SET_SIZE(lx_setup_clone)
-
- /*
- * lx_sigdeliver(sig, siginfo_t *, ucontext_t *, stack_size,
- * stack_build_routine, signal_handler, glibc_gs)
- *
- * This routine allocates stack space for the Linux signal stack,
- * calls a routine to build the signal stack and then calls the Linux
- * signal handler. This is written in assembly because of the way
- * we need to directly manipulate the stack and pass the resulting
- * stack to the signal handler with the Linux signal stack on top.
- *
- * When the Linux signal handler is called, the stack will look
- * like this:
- *
- * =================================================
- * | | %ebp |
- * | =================================================
- * | | LX_SIGRT_MAGIC |
- * | =================================================
- * V | Linux signal frame built by lx_stackbuilder() |
- * =================================================
- *
- * The stack frame (%ebp) will be reset to its original value (i.e. the
- * previous frame) on entry to the Linux signal handler.
- */
- ENTRY_NP(lx_sigdeliver)
- pushl %ebp
- movl %esp, %ebp
- movl 16(%ebp), %edx /* pointer to Solaris ucontext_t */
- pushl %edx /* save ucontext_t ptr for later */
- pushl $LX_SIGRT_MAGIC /* marker value for lx_(rt)_sigreturn */
-
- subl 20(%ebp), %esp /* create stack_size stack buffer */
- pushl %esp /* push stack pointer */
- pushl %edx /* push pointer to ucontext_t */
- pushl 12(%ebp) /* push pointer to siginfo_t */
- pushl 8(%ebp) /* push signal number */
- call *24(%ebp) /* lx_stackbuilder(sig, sip, ucp, sp) */
- add $16, %esp /* remove args from stack */
- movw 32(%ebp), %gs /* only low 16 bits are used */
-
- mov 4(%ebp),%eax /* fetch old %ebp from stack */
- mov 28(%ebp), %edx /* get address of Linux handler */
- mov %eax, %ebp /* restore old %ebp */
- jmp *%edx /* jmp to the Linux signal handler */
- SET_SIZE(lx_sigdeliver)
-
- /*
- * Due to the nature of signals, we need to be able to force the %gs
- * value to that used by Solaris by running any Solaris code.
- *
- * This routine does that, then calls a C routine that will save the
- * %gs value at the time of the signal off into a thread-specific data
- * structure. Finally, we trampoline to the libc code that would
- * normally interpose itself before calling a signal handler.
- *
- * The libc routine that calls user signal handlers ends with a
- * setcontext, so we would never return here even if we used a call
- * rather than a jmp.
- *
- * %esi is used for the PIC as it is guaranteed by the 386 ABI to
- * survive the call to lx_sigsavegs. The downside is we must also
- * preserve its value for our caller.
- *
- * Note that because lx_sigsavegs and libc_sigacthandler are externs,
- * they need to be dereferenced via the GOT.
- *
- * IMPORTANT: Because libc apparently gets upset if extra data is
- * left on its stack, this routine needs to be crafted
- * in assembly so that the jmp to the libc interposer
- * doesn't leave any cruft lying around.
- */
- ENTRY_NP(lx_sigacthandler)
- pushl %esi /* save %esi */
- pushl %gs /* push the Linux %gs */
- pushl $LWPGS_SEL
- popl %gs /* install the Solaris %gs */
-
- PIC_SETUP(%esi)
- movl lx_sigsavegs@GOT(%esi), %eax
- call *%eax /* save the Linux %gs */
- movl libc_sigacthandler@GOT(%esi), %eax
- add $4, %esp /* clear Linux %gs from stack */
- popl %esi /* restore %esi */
- jmp *(%eax) /* jmp to libc's interposer */
- SET_SIZE(lx_sigacthandler)
-
/*
* Trampoline code is called by the return at the end of a Linux
* signal handler to return control to the interrupted application
@@ -370,15 +90,4 @@ lx_sigreturn_tolibc(uintptr_t sp)
movl $LX_SYS_rt_sigreturn, %eax
int $0x80
SET_SIZE(lx_rt_sigreturn_tramp)
-
- /*
- * Manipulate the stack in the way necessary for it to appear to libc
- * that the signal handler it invoked via call_user_handler() is
- * returning.
- */
- ENTRY_NP(lx_sigreturn_tolibc)
- movl 4(%esp), %esp /* set %esp to passed value */
- popl %ebp /* restore proper %ebp */
- ret /* return to lx_call_user_handler */
- SET_SIZE(lx_sigreturn_tolibc)
#endif /* lint */
diff --git a/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s b/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s
deleted file mode 100644
index a90bc5621b..0000000000
--- a/usr/src/lib/brand/lx/lx_brand/i386/lx_runexe.s
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
- */
-
-#include <sys/asm_linkage.h>
-
-#if defined(lint)
-
-/*ARGSUSED*/
-void
-lx_runexe(void *argv, void *entry)
-{
-}
-
-#else /* lint */
-
- /*
- * Set our stack pointer, clear the general registers,
- * and jump to the brand linker's entry point.
- */
- ENTRY_NP(lx_runexe)
- movl 4(%esp), %eax / %eax = &argv[0]
- movl 8(%esp), %ebx / Brand linker's entry point in %ebx
- subl $4, %eax / Top of stack - must point at argc
- movl %eax, %esp / Set %esp to what linkers expect
-
- movl $0, %eax
- movl $0, %ecx
- movl $0, %edx
- movl $0, %esi
- movl $0, %edi
- movl $0, %ebp
-
- jmp *%ebx / And away we go...
- SET_SIZE(lx_runexe)
-
-#endif /* lint */
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
index f50535d0c4..bed6a8da4b 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
@@ -39,6 +39,7 @@
#include <sys/lwp.h>
#include <sys/lx_brand.h>
+#include <sys/lx_thread.h>
#ifdef __cplusplus
extern "C" {
@@ -55,13 +56,6 @@ extern int lx_rpm_delay;
extern boolean_t lx_is_rpm;
/*
- * These thread-specific variables allow the signal interposition code
- * to communicate restart disposition for any interrupting signals.
- */
-extern __thread int lx_had_sigchild;
-extern __thread int lx_do_syscall_restart;
-
-/*
* Values Linux expects for init
*/
#define LX_INIT_PGID 0
@@ -151,6 +145,11 @@ extern __thread int lx_do_syscall_restart;
B_TRACE_POINT_5(0, 0, 0, 0, 0)
/*
+ * Macros to access register state within a ucontext_t:
+ */
+#define LX_REG(ucp, r) ((ucp)->uc_mcontext.gregs[(r)])
+
+/*
* normally we never want to write to stderr or stdout because it's unsafe
* to make assumptions about the underlying file descriptors. to protect
* against writes to these file descriptors we go ahead and close them
@@ -166,10 +165,9 @@ extern void lx_unsupported(char *, ...);
struct ucontext;
-extern void lx_handler_table(void);
-extern void lx_handler_trace_table(void);
-extern void lx_emulate_done(void);
-extern lx_regs_t *lx_syscall_regs(void);
+extern ucontext_t *lx_syscall_regs(void);
+extern uintptr_t lx_find_brand_sp(void);
+extern const ucontext_t *lx_find_brand_uc(void);
extern int lx_errno(int);
extern char *lx_fd_to_path(int fd, char *buf, int buf_size);
@@ -179,7 +177,7 @@ extern int lx_lpid_to_spid(pid_t, pid_t *);
extern void lx_ptrace_init();
extern int lx_ptrace_wait(siginfo_t *);
extern void lx_ptrace_fork(void);
-extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg);
+extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg, ucontext_t *);
extern void lx_ptrace_clone_begin(int, boolean_t);
extern int lx_check_alloca(size_t);
@@ -187,6 +185,12 @@ extern int lx_check_alloca(size_t);
extern int ltos_at_flag(int lflag, int allow, boolean_t enforce);
+extern void lx_init_tsd(lx_tsd_t *);
+extern int lx_alloc_stack(void **, size_t *);
+extern void lx_install_stack(void *, size_t, lx_tsd_t *);
+extern void lx_free_stack(void);
+extern void lx_free_other_stacks(void);
+
/*
* NO_UUCOPY disables calls to the uucopy* system calls to help with
* debugging brand library accesses to linux application memory.
@@ -201,6 +205,13 @@ int uucopystr_unsafe(const void *src, void *dst, size_t n);
#endif /* NO_UUCOPY */
+/*
+ * We use these Private libc interfaces to defer signals during critical
+ * sections.
+ */
+extern void _sigon(void);
+extern void _sigoff(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h
index f3d39fca64..3c612d9ab8 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_signal.h
@@ -30,6 +30,7 @@
#if !defined(_ASM)
#include <sys/lx_types.h>
#include <sys/ucontext.h>
+#include <sys/lx_siginfo.h>
#include <lx_signum.h>
#endif /* !defined(_ASM) */
@@ -118,93 +119,6 @@ typedef struct lx_osigaction {
#define USE_OSIGSET 0
#define USE_SIGSET 1
-#define LX_SI_MAX_SIZE 128
-#if defined(_LP64)
-/*
- * Because of the odd number (3) of ints before the union, we need to account
- * for the smaller padding needed on x64 due to the union being offset to an 8
- * byte boundary.
- */
-#define LX_SI_PAD_SIZE ((LX_SI_MAX_SIZE/sizeof (int)) - 4)
-
-#else
-#define LX_SI_PAD_SIZE ((LX_SI_MAX_SIZE/sizeof (int)) - 3)
-#endif
-
-typedef struct lx_siginfo {
- int lsi_signo;
- int lsi_errno;
- int lsi_code;
- union {
- int _pad[LX_SI_PAD_SIZE];
-
- struct {
- pid_t _pid;
- lx_uid16_t _uid;
- } _kill;
-
- struct {
- uint_t _timer1;
- uint_t _timer2;
- } _timer;
-
- struct {
- pid_t _pid; /* sender's pid */
- lx_uid16_t _uid; /* sender's uid */
- union sigval _sigval;
- } _rt;
-
- struct {
- pid_t _pid; /* which child */
- lx_uid16_t _uid; /* sender's uid */
- int _status; /* exit code */
- clock_t _utime;
- clock_t _stime;
- } _sigchld;
-
- struct {
- void *_addr; /* faulting insn/memory ref. */
- } _sigfault;
-
- struct {
- int _band; /* POLL_IN,POLL_OUT,POLL_MSG */
- int _fd;
- } _sigpoll;
- } _sifields;
-} lx_siginfo_t;
-
-/*
- * lx_siginfo_t lsi_code values
- *
- * LX_SI_ASYNCNL: Sent by asynch name lookup completion
- * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads
- * LX_SI_SIGIO: Sent by queued SIGIO
- * LX_SI_ASYNCIO: Sent by asynchronous I/O completion
- * LX_SI_MESGQ: Sent by real time message queue state change
- * LX_SI_TIMER: Sent by timer expiration
- * LX_SI_QUEUE: Sent by sigqueue
- * LX_SI_USER: Sent by kill, sigsend, raise, etc.
- * LX_SI_KERNEL: Sent by kernel
- * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to
- * illumos errors, if there is no translation available, this value
- * should be used. This value should have no meaning as an si_code in
- * illumos or Linux.
- *
- * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by
- * BrandZ.
- */
-#define LX_SI_CODE_NOT_EXIST (-61)
-#define LX_SI_ASYNCNL (-60)
-#define LX_SI_DETHREAD (-7)
-#define LX_SI_TKILL (-6)
-#define LX_SI_SIGIO (-5)
-#define LX_SI_ASYNCIO (-4)
-#define LX_SI_MESGQ (-3)
-#define LX_SI_TIMER (-2)
-#define LX_SI_QUEUE (-1)
-#define LX_SI_USER (0)
-#define LX_SI_KERNEL (0x80)
-
typedef struct lx_sighandlers {
struct lx_sigaction lx_sa[LX_NSIG + 1];
} lx_sighandlers_t;
@@ -370,18 +284,6 @@ typedef struct lx_ucontext {
lx_sigset_t uc_sigmask;
} lx_ucontext_t;
-#define lsi_pid _sifields._kill._pid
-#define lsi_uid _sifields._kill._uid
-#define lsi_status _sifields._sigchld._status
-#define lsi_utime _sifields._sigchld._utime
-#define lsi_stime _sifields._sigchld._stime
-#define lsi_value _sifields._rt._sigval
-#define lsi_int _sifields._rt._sigval.sivalx_int
-#define lsi_ptr _sifields._rt._sigval.sivalx_ptr
-#define lsi_addr _sifields._sigfault._addr
-#define lsi_band _sifields._sigpoll._band
-#define lsi_fd _sifields._sigpoll._fd
-
extern const int ltos_signo[];
extern const int stol_signo[];
@@ -391,10 +293,6 @@ extern void setsigacthandler(void (*)(int, siginfo_t *, void *),
extern int lx_siginit(void);
-extern void lx_sigreturn_tolibc(uintptr_t);
-extern void lx_sigdeliver(int, siginfo_t *, void *, size_t, void (*)(),
- void (*)(), uintptr_t);
-
extern int stol_siginfo(siginfo_t *siginfop, lx_siginfo_t *lx_siginfop);
extern int stol_status(int);
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
index a070bb69b6..4cc72ba0c6 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
@@ -74,7 +74,6 @@ extern long lx_readlink(uintptr_t, uintptr_t, uintptr_t);
extern long lx_readdir(uintptr_t, uintptr_t, uintptr_t);
extern long lx_getdents(uintptr_t, uintptr_t, uintptr_t);
extern long lx_getdents64(uintptr_t, uintptr_t, uintptr_t);
-extern long lx_getpid(void);
extern long lx_execve(uintptr_t, uintptr_t, uintptr_t);
extern long lx_dup2(uintptr_t, uintptr_t);
extern long lx_dup3(uintptr_t, uintptr_t, uintptr_t);
@@ -132,12 +131,6 @@ extern long lx_getpgid(uintptr_t);
extern long lx_setpgid(uintptr_t, uintptr_t);
extern long lx_getsid(uintptr_t);
extern long lx_setsid(void);
-extern long lx_setgroups(uintptr_t, uintptr_t);
-
-
-extern long lx_waitpid(uintptr_t, uintptr_t, uintptr_t);
-extern long lx_waitid(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-extern long lx_wait4(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
extern long lx_getuid16(void);
extern long lx_getgid16(void);
@@ -183,7 +176,6 @@ extern long lx_ftruncate64(uintptr_t, uintptr_t, uintptr_t);
extern long lx_sysctl(uintptr_t);
extern long lx_fsync(uintptr_t);
extern long lx_fdatasync(uintptr_t);
-extern long lx_pipe2(uintptr_t, uintptr_t);
extern long lx_link(uintptr_t, uintptr_t);
extern long lx_unlink(uintptr_t);
extern long lx_rmdir(uintptr_t);
@@ -204,6 +196,7 @@ extern long lx_getcwd(uintptr_t, uintptr_t);
extern long lx_uname(uintptr_t);
extern long lx_reboot(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
extern long lx_getgroups16(uintptr_t, uintptr_t);
+extern long lx_setgroups(uintptr_t, uintptr_t);
extern long lx_setgroups16(uintptr_t, uintptr_t);
extern long lx_personality(uintptr_t);
@@ -312,7 +305,6 @@ extern long lx_shmat(int, void *, int);
extern long lx_shmctl(int, int, void *);
extern long lx_prctl(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-extern long lx_arch_prctl(int, uintptr_t);
extern long lx_alarm(unsigned int);
extern long lx_close(int);
@@ -351,59 +343,36 @@ extern long lx_shmdt(char *);
extern long lx_stime(const time_t *);
extern long lx_symlink(const char *, const char *);
extern long lx_syslog(int, char *, int);
-extern long lx_sysinfo32(uintptr_t);
extern long lx_timerfd_create(int, int);
extern long lx_timerfd_settime(int, int,
const struct itimerspec *, struct itimerspec *);
extern long lx_timerfd_gettime(int, struct itimerspec *);
extern long lx_umask(mode_t);
extern long lx_utimes(const char *, const struct timeval *);
-extern long lx_write(int, const void *, size_t);
-extern long lx_yield(void);
#endif /* !defined(_ASM) */
-/*
- * Constants for the In-Kernel Emulation table.
- */
-#define LX_EMUL_getpid 1
-#define LX_EMUL_kill 2
-#define LX_EMUL_pipe 3
-#define LX_EMUL_brk 4
-#define LX_EMUL_getppid 5
-#define LX_EMUL_sysinfo 6
-#define LX_EMUL_clone 7
-#define LX_EMUL_modify_ldt 8
-#define LX_EMUL_sched_setparam 9
-#define LX_EMUL_sched_getparam 10
-#define LX_EMUL_sched_rr_get_interval 11
-#define LX_EMUL_setresuid16 12
-#define LX_EMUL_setresgid16 13
-#define LX_EMUL_rt_sigqueueinfo 14
-#define LX_EMUL_setgroups 15
-#define LX_EMUL_setresuid 16
-#define LX_EMUL_setresgid 17
-#define LX_EMUL_gettid 18
-#define LX_EMUL_tkill 19
-#define LX_EMUL_futex 20
-#define LX_EMUL_set_thread_area 21
-#define LX_EMUL_get_thread_area 22
-#define LX_EMUL_set_tid_address 23
-#define LX_EMUL_pipe2 24
-#define LX_EMUL_rt_tgsigqueueinfo 25
-#define LX_EMUL_arch_prctl 26
-#define LX_EMUL_tgkill 27
-#define LX_EMUL_read 28
-#define LX_EMUL_ioctl LX_N_IKE_FUNCS
-
-/* Note: adjust LX_N_IKE_FUNCS when adding new in-kernel functions */
-
-/* Linux vsyscall addresses */
#if defined(_LP64)
+/*
+ * Linux vsyscall addresses:
+ */
#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000
#define LX_VSYS_time (uintptr_t)0xffffffffff600400
#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800
+
+/*
+ * System call numbers for vsyscall revectoring:
+ */
+#define LX_SYS_gettimeofday 96
+#define LX_SYS_time 201
+#define LX_SYS_getcpu 309
+#endif
+
+#if defined(_LP64)
+#define LX_SYS_clone 56
+#else
+#define LX_SYS_clone 120
#endif
#ifdef __cplusplus
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h
index 3d7b9018e1..fae81c9fc9 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_thread.h
@@ -32,6 +32,7 @@
extern "C" {
#endif
+#include <sys/lx_signal.h>
#include <thread.h>
typedef enum lx_exit_type {
@@ -41,23 +42,32 @@ typedef enum lx_exit_type {
} lx_exit_type_t;
typedef struct lx_tsd {
-#if defined(_ILP32)
- /* 32-bit thread-specific Linux %gs value */
- uintptr_t lxtsd_gs;
-#else
- /* 64-bit thread-specific Linux %fsbase value */
- uintptr_t lxtsd_fsbase;
-#endif
lx_exit_type_t lxtsd_exit;
int lxtsd_exit_status;
ucontext_t lxtsd_exit_context;
+
+ /*
+ * If this value is non-zero, we use it in lx_sigdeliver() to represent
+ * the in-use extent of the Linux (i.e. BRAND) stack for this thread.
+ * Access to this value must be protected by _sigoff()/_sigon().
+ */
+ uintptr_t lxtsd_lx_sp;
+
+ /*
+ * Alternate stack for Linux sigaltstack emulation:
+ */
+ lx_stack_t lxtsd_sigaltstack;
+
+ void *lxtsd_clone_state;
} lx_tsd_t;
extern thread_key_t lx_tsd_key;
extern void lx_swap_gs(long, long *);
-extern void lx_exit_common(lx_exit_type_t, uintptr_t) __NORETURN;
+extern void lx_exit_common(void) __NORETURN;
+
+extern lx_tsd_t *lx_get_tsd(void);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h
index a56fe8eeb3..33704bffb6 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_thunk_server.h
@@ -27,8 +27,6 @@
#ifndef _LX_THUNK_SERVER_H
#define _LX_THUNK_SERVER_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -133,7 +131,6 @@ typedef struct lxt_syslog_arg {
* thunk server process.
*/
void lxt_server_init(int, char *[]);
-int lxt_server_pid(int *pid);
void lxt_server_exec_check(void);
#ifdef __cplusplus
diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers
index 3f61d448e5..f4eea53408 100644
--- a/usr/src/lib/libc/port/mapfile-vers
+++ b/usr/src/lib/libc/port/mapfile-vers
@@ -2954,6 +2954,7 @@ $endif
scrwidth;
semctl64;
_semctl64;
+ set_escaped_context_cleanup;
set_setcontext_enforcement;
_setbufend;
__set_errno;
diff --git a/usr/src/lib/libc/port/threads/sigaction.c b/usr/src/lib/libc/port/threads/sigaction.c
index dd7e6159fb..09be90e54f 100644
--- a/usr/src/lib/libc/port/threads/sigaction.c
+++ b/usr/src/lib/libc/port/threads/sigaction.c
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include "lint.h"
@@ -285,6 +285,24 @@ take_deferred_signal(int sig)
thr_panic("take_deferred_signal(): __sigresend() failed");
}
+/*
+ * sigacthandler() attempts to clean up dangling uc_link pointers in
+ * signal handling contexts when libc believes us to have escaped
+ * a signal handler incorrectly in the past.
+ *
+ * Branded processes have a legitimate use for a chain including contexts
+ * other than those used for signal handling when tracking emulation
+ * requests from the kernel. We allow them to disable this cleanup
+ * behaviour.
+ *
+ * The cleanup is enabled by default (the flag starts out as 1).
+ * set_escaped_context_cleanup() stores its argument in the flag unchanged:
+ * zero disables the cleanup and any non-zero value enables it.  The flag
+ * is a single file-scope variable written with a plain assignment, and
+ * sigacthandler() tests it before clearing uc_link.
+ */
+static int escaped_context_cleanup = 1;
+
+void
+set_escaped_context_cleanup(int on)
+{
+ escaped_context_cleanup = on;
+}
+
void
sigacthandler(int sig, siginfo_t *sip, void *uvp)
{
@@ -307,7 +325,7 @@ sigacthandler(int sig, siginfo_t *sip, void *uvp)
* we are actually executing at main level (self->ul_siglink == NULL).
* See the code for setjmp()/longjmp() for more details.
*/
- if (self->ul_siglink == NULL)
+ if (escaped_context_cleanup && self->ul_siglink == NULL)
ucp->uc_link = NULL;
/*
diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
index b4e38f062a..510626d220 100644
--- a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
+++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
@@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
@@ -35,6 +35,7 @@
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
+#include <sys/lx_brand.h>
#include <sys/lx_impl.h>
/*
@@ -59,696 +60,6 @@
#define LX_SYSTRACE_ENTRY_AFRAMES 2
#define LX_SYSTRACE_RETURN_AFRAMES 4
-typedef struct lx_sys_names {
- char *sy_name;
-} lx_sys_names_t;
-
-static lx_sys_names_t lx_sysnames32[] =
-{
- {"lx_nosys"}, /* 0 */
- {"exit"}, /* 1 */
- {"lx_fork"},
- {"read"},
- {"write"},
- {"open"},
- {"close"},
- {"waitpid"},
- {"creat"},
- {"link"},
- {"unlink"}, /* 10 */
- {"exec"},
- {"chdir"},
- {"gtime"},
- {"mknod"},
- {"chmod"},
- {"lchown16"},
- {"break"},
- {"stat"},
- {"lseek"},
- {"getpid"}, /* 20 */
- {"mount"},
- {"umount"},
- {"setuid16"},
- {"getuid16"},
- {"stime"},
- {"ptrace"},
- {"alarm"},
- {"fstat"},
- {"pause"},
- {"utime"}, /* 30 */
- {"stty"},
- {"gtty"},
- {"access"},
- {"nice"},
- {"ftime"},
- {"sync"},
- {"kill"},
- {"rename"},
- {"mkdir"},
- {"rmdir"}, /* 40 */
- {"dup"},
- {"pipe"},
- {"times"},
- {"prof"},
- {"brk"},
- {"setgid16"},
- {"getgid16"},
- {"signal"},
- {"geteuid16"},
- {"getegid16"}, /* 50 */
- {"sysacct"},
- {"umount2"},
- {"lock"},
- {"ioctl"},
- {"fcntl"},
- {"mpx"},
- {"setpgid"},
- {"ulimit"},
- {"olduname"},
- {"umask"}, /* 60 */
- {"chroot"},
- {"ustat"},
- {"dup2"},
- {"getppid"},
- {"pgrp"},
- {"setsid"},
- {"sigaction"},
- {"sgetmask"},
- {"ssetmask"},
- {"setreuid16"}, /* 70 */
- {"setregid16"},
- {"sigsuspend"},
- {"sigpending"},
- {"sethostname"},
- {"setrlimit"},
- {"old_getrlimit"},
- {"getrusage"},
- {"gettimeofday"},
- {"settimeofday"},
- {"getgroups16"}, /* 80 */
- {"setgroups16"},
- {"old_select"},
- {"symlink"},
- {"oldlstat"},
- {"readlink"},
- {"uselib"},
- {"swapon"},
- {"reboot"},
- {"old_readdir"},
- {"old_mmap"}, /* 90 */
- {"munmap"},
- {"truncate"},
- {"ftruncate"},
- {"fchmod"},
- {"fchown16"},
- {"getpriority"},
- {"setpriority"},
- {"profil"},
- {"statfs"},
- {"fstatfs"}, /* 100 */
- {"ioperm"},
- {"socketcall"},
- {"syslog"},
- {"setitimer"},
- {"getitimer"},
- {"newstat"},
- {"newsltat"},
- {"newsftat"},
- {"uname"},
- {"oldiopl"}, /* 110 */
- {"oldvhangup"},
- {"idle"},
- {"vm86old"},
- {"wait4"},
- {"swapoff"},
- {"sysinfo"},
- {"ipc"},
- {"fsync"},
- {"sigreturn"},
- {"clone"}, /* 120 */
- {"setdomainname"},
- {"newuname"},
- {"modify_ldt"},
- {"adjtimex"},
- {"mprotect"},
- {"sigprocmask"},
- {"create_module"},
- {"init_module"},
- {"delete_module"},
- {"get_kernel_syms"}, /* 130 */
- {"quotactl"},
- {"getpgid"},
- {"fchdir"},
- {"bdflush"},
- {"sysfs"},
- {"personality"},
- {"afs_syscall"},
- {"setfsuid16"},
- {"setfsgid16"},
- {"llseek"}, /* 140 */
- {"getdents"},
- {"select"},
- {"flock"},
- {"msync"},
- {"readv"},
- {"writev"},
- {"getsid"},
- {"fdatasync"},
- {"sysctl"},
- {"mlock"}, /* 150 */
- {"munlock"},
- {"mlockall"},
- {"munlockall"},
- {"sched_setparam"},
- {"sched_getparam"},
- {"sched_setscheduler"},
- {"sched_getscheduler"},
- {"yield"},
- {"sched_get_priority_max"},
- {"sched_get_priority_min"}, /* 160 */
- {"sched_rr_get_interval"},
- {"nanosleep"},
- {"mremap"},
- {"setresuid16"},
- {"getresuid16"},
- {"vm86"},
- {"query_module"},
- {"poll"},
- {"nfsserctl"},
- {"setresgid16"}, /* 170 */
- {"getresgid16"},
- {"prctl"},
- {"rt_sigreturn"},
- {"rt_sigaction"},
- {"rt_sigprocmask"},
- {"rt_sigpending"},
- {"rt_sigtimedwait"},
- {"rt_sigqueueinfo"},
- {"rt_sigsuspend"},
- {"pread64"}, /* 180 */
- {"pwrite64"},
- {"chown16"},
- {"getcwd"},
- {"capget"},
- {"capset"},
- {"sigaltstack"},
- {"sendfile"},
- {"getpmsg"},
- {"putpmsg"},
- {"vfork"}, /* 190 */
- {"getrlimit"},
- {"mmap2"},
- {"truncate64"},
- {"ftruncate64"},
- {"stat64"},
- {"lstat64"},
- {"fstat64"},
- {"lchown"},
- {"getuid"},
- {"getgid"}, /* 200 */
- {"geteuid"},
- {"getegid"},
- {"setreuid"},
- {"setregid"},
- {"getgroups"},
- {"setgroups"},
- {"fchown"},
- {"setresuid"},
- {"getresuid"},
- {"setresgid"}, /* 210 */
- {"getresgid"},
- {"chown"},
- {"setuid"},
- {"setgid"},
- {"setfsuid"},
- {"setfsgid"},
- {"pivot_root"},
- {"mincore"},
- {"madvise"},
- {"getdents64"}, /* 220 */
- {"fcntl64"},
- {"lx_nosys"},
- {"security"},
- {"gettid"},
- {"readahead"},
- {"setxattr"},
- {"lsetxattr"},
- {"fsetxattr"},
- {"getxattr"},
- {"lgetxattr"}, /* 230 */
- {"fgetxattr"},
- {"listxattr"},
- {"llistxattr"},
- {"flistxattr"},
- {"removexattr"},
- {"lremovexattr"},
- {"fremovexattr"},
- {"tkill"},
- {"sendfile64"},
- {"futex"}, /* 240 */
- {"sched_setaffinity"},
- {"sched_getaffinity"},
- {"set_thread_area"},
- {"get_thread_area"},
- {"io_setup"},
- {"io_destroy"},
- {"io_getevents"},
- {"io_submit"},
- {"io_cancel"},
- {"fadvise64"}, /* 250 */
- {"lx_nosys"},
- {"exit_group"},
- {"lookup_dcookie"},
- {"epoll_create"},
- {"epoll_ctl"},
- {"epoll_wait"},
- {"remap_file_pages"},
- {"set_tid_address"},
- {"timer_create"},
- {"timer_settime"}, /* 260 */
- {"timer_gettime"},
- {"timer_getoverrun"},
- {"timer_delete"},
- {"clock_settime"},
- {"clock_gettime"},
- {"clock_getres"},
- {"clock_nanosleep"},
- {"statfs64"},
- {"fstatfs64"},
- {"tgkill"}, /* 270 */
- /* The following are Linux 2.6 system calls */
- {"utimes"},
- {"fadvise64_64"},
- {"vserver"},
- {"mbind"},
- {"get_mempolicy"},
- {"set_mempolicy"},
- {"mq_open"},
- {"mq_unlink"},
- {"mq_timedsend"},
- {"mq_timedreceive"}, /* 280 */
- {"mq_notify"},
- {"mq_getsetattr"},
- {"kexec_load"},
- {"waitid"},
- {"sys_setaltroot"},
- {"add_key"},
- {"request_key"},
- {"keyctl"},
- {"ioprio_set"},
- {"ioprio_get"}, /* 290 */
- {"inotify_init"},
- {"inotify_add_watch"},
- {"inotify_rm_watch"},
- {"migrate_pages"},
- {"openat"},
- {"mkdirat"},
- {"mknodat"},
- {"fchownat"},
- {"futimesat"},
- {"fstatat64"}, /* 300 */
- {"unlinkat"},
- {"renameat"},
- {"linkat"},
- {"syslinkat"},
- {"readlinkat"},
- {"fchmodat"},
- {"faccessat"},
- {"pselect6"},
- {"ppoll"},
- {"unshare"}, /* 310 */
- {"set_robust_list"},
- {"get_robust_list"},
- {"splice"},
- {"sync_file_range"},
- {"tee"},
- {"vmsplice"},
- {"move_pages"},
- {"getcpu"},
- {"epoll_pwait"},
- {"utimensat"}, /* 320 */
- {"signalfd"},
- {"timerfd_create"},
- {"eventfd"},
- {"fallocate"},
- {"timerfd_settime"},
- {"timerfd_gettime"},
- {"signalfd4"},
- {"eventfd2"},
- {"epoll_create1"},
- {"dup3"}, /* 330 */
- {"pipe2"},
- {"inotify_init1"},
- {"preadv"},
- {"pwritev"},
- {"rt_tgsigqueueinfo"},
- {"perf_event_open"},
- {"recvmmsg"},
- {"fanotify_init"},
- {"fanotify_mark"},
- {"prlimit64"}, /* 340 */
- {"name_to_handle_at"},
- {"open_by_handle_at"},
- {"clock_adjtime"},
- {"syncfs"},
- {"sendmmsg"},
- {"setns"},
- {"process_vm_readv"},
- {"process_vm_writev"},
- {"kcmp"},
- {"finit_module"}, /* 350 */
- {"sched_setattr"},
- {"sched_getattr"},
- NULL /* NULL-termination is required for lx_systrace */
-};
-
-#if defined(_LP64)
-static lx_sys_names_t lx_sysnames64[] =
-{
- {"read"}, /* 0 */
- {"write"},
- {"open"},
- {"close"},
- {"stat"},
- {"fstat"},
- {"lstat"},
- {"poll"},
- {"lseek"},
- {"mmap"},
- {"mprotect"}, /* 10 */
- {"munmap"},
- {"brk"},
- {"rt_sigaction"},
- {"rt_sigprocmask"},
- {"rt_sigreturn"},
- {"ioctl"},
- {"pread64"},
- {"pwrite64"},
- {"readv"},
- {"writev"}, /* 20 */
- {"access"},
- {"pipe"},
- {"select"},
- {"sched_yield"},
- {"mremap"},
- {"msync"},
- {"mincore"},
- {"madvise"},
- {"shmget"},
- {"shmat"}, /* 30 */
- {"shmctl"},
- {"dup"},
- {"dup2"},
- {"pause"},
- {"nanosleep"},
- {"getitimer"},
- {"alarm"},
- {"setitimer"},
- {"getpid"},
- {"sendfile"}, /* 40 */
- {"socket"},
- {"connect"},
- {"accept"},
- {"sendto"},
- {"recvfrom"},
- {"sendmsg"},
- {"recvmsg"},
- {"shutdown"},
- {"bind"},
- {"listen"}, /* 50 */
- {"getsockname"},
- {"getpeername"},
- {"socketpair"},
- {"setsockopt"},
- {"getsockopt"},
- {"clone"},
- {"fork"},
- {"vfork"},
- {"execve"},
- {"exit"}, /* 60 */
- {"wait4"},
- {"kill"},
- {"uname"},
- {"semget"},
- {"semop"},
- {"semctl"},
- {"shmdt"},
- {"msgget"},
- {"msgsnd"},
- {"msgrcv"}, /* 70 */
- {"msgctl"},
- {"fcntl"},
- {"flock"},
- {"fsync"},
- {"fdatasync"},
- {"truncate"},
- {"ftruncate"},
- {"getdents"},
- {"getcwd"},
- {"chdir"}, /* 80 */
- {"fchdir"},
- {"rename"},
- {"mkdir"},
- {"rmdir"},
- {"creat"},
- {"link"},
- {"unlink"},
- {"symlink"},
- {"readlink"},
- {"chmod"}, /* 90 */
- {"fchmod"},
- {"chown"},
- {"fchown"},
- {"lchown"},
- {"umask"},
- {"gettimeofday"},
- {"getrlimit"},
- {"getrusage"},
- {"sysinfo"},
- {"times"}, /* 100 */
- {"ptrace"},
- {"getuid"},
- {"syslog"},
- {"getgid"},
- {"setuid"},
- {"setgid"},
- {"geteuid"},
- {"getegid"},
- {"setpgid"},
- {"getppid"}, /* 110 */
- {"getpgrp"},
- {"setsid"},
- {"setreuid"},
- {"setregid"},
- {"getgroups"},
- {"setgroups"},
- {"setresuid"},
- {"getresuid"},
- {"setresgid"},
- {"getresgid"}, /* 120 */
- {"getpgid"},
- {"setfsuid"},
- {"setfsgid"},
- {"getsid"},
- {"capget"},
- {"capset"},
- {"rt_sigpending"},
- {"rt_sigtimedwait"},
- {"rt_sigqueueinfo"},
- {"rt_sigsuspend"}, /* 130 */
- {"sigaltstack"},
- {"utime"},
- {"mknod"},
- {"uselib"},
- {"personality"},
- {"ustat"},
- {"statfs"},
- {"fstatfs"},
- {"sysfs"},
- {"getpriority"}, /* 140 */
- {"setpriority"},
- {"sched_setparam"},
- {"sched_getparam"},
- {"sched_setscheduler"},
- {"sched_getscheduler"},
- {"sched_get_priority_max"},
- {"sched_get_priority_min"},
- {"sched_rr_get_interval"},
- {"mlock"},
- {"munlock"}, /* 150 */
- {"mlockall"},
- {"munlockall"},
- {"vhangup"},
- {"modify_ldt"},
- {"pivot_root"},
- {"sysctl"},
- {"prctl"},
- {"arch_prctl"},
- {"adjtimex"},
- {"setrlimit"}, /* 150 */
- {"chroot"},
- {"sync"},
- {"acct"},
- {"settimeofday"},
- {"mount"},
- {"umount2"},
- {"swapon"},
- {"swapoff"},
- {"reboot"},
- {"sethostname"}, /* 170 */
- {"setdomainname"},
- {"iopl"},
- {"ioperm"},
- {"create_module"},
- {"init_module"},
- {"delete_module"},
- {"get_kernel_syms"},
- {"query_module"},
- {"quotactl"},
- {"nfsservctl"}, /* 180 */
- {"getpmsg"},
- {"putpmsg"},
- {"afs_syscall"},
- {"tux"},
- {"security"},
- {"gettid"},
- {"readahead"},
- {"setxattr"},
- {"lsetxattr"},
- {"fsetxattr"}, /* 190 */
- {"getxattr"},
- {"lgetxattr"},
- {"fgetxattr"},
- {"listxattr"},
- {"llistxattr"},
- {"flistxattr"},
- {"removexattr"},
- {"lremovexattr"},
- {"fremovexattr"},
- {"tkill"}, /* 200 */
- {"time"},
- {"futex"},
- {"sched_setaffinity"},
- {"sched_getaffinity"},
- {"set_thread_area"},
- {"io_setup"},
- {"io_destroy"},
- {"io_getevents"},
- {"io_submit"},
- {"io_cancel"}, /* 210 */
- {"get_thread_area"},
- {"lookup_dcookie"},
- {"epoll_create"},
- {"epoll_ctl_old"},
- {"epoll_wait_old"},
- {"remap_file_pages"},
- {"getdents64"},
- {"set_tid_address"},
- {"restart_syscall"},
- {"semtimedop"}, /* 220 */
- {"fadvise64"},
- {"timer_create"},
- {"timer_settime"},
- {"timer_gettime"},
- {"timer_getoverrun"},
- {"timer_delete"},
- {"clock_settime"},
- {"clock_gettime"},
- {"clock_getres"},
- {"clock_nanosleep"}, /* 230 */
- {"exit_group"},
- {"epoll_wait"},
- {"epoll_ctl"},
- {"tgkill"},
- {"utimes"},
- {"vserver"},
- {"mbind"},
- {"set_mempolicy"},
- {"get_mempolicy"},
- {"mq_open"}, /* 240 */
- {"mq_unlink"},
- {"mq_timedsend"},
- {"mq_timedreceive"},
- {"mq_notify"},
- {"mq_getsetattr"},
- {"kexec_load"},
- {"waitid"},
- {"add_key"},
- {"request_key"},
- {"keyctl"}, /* 250 */
- {"ioprio_set"},
- {"ioprio_get"},
- {"inotify_init"},
- {"inotify_add_watch"},
- {"inotify_rm_watch"},
- {"migrate_pages"},
- {"openat"},
- {"mkdirat"},
- {"mknodat"},
- {"fchownat"}, /* 260 */
- {"futimesat"},
- {"fstatat64"},
- {"unlinkat"},
- {"renameat"},
- {"linkat"},
- {"symlinkat"},
- {"readlinkat"},
- {"fchmodat"},
- {"faccessat"},
- {"pselect6"}, /* 270 */
- {"ppoll"},
- {"unshare"},
- {"set_robust_list"},
- {"get_robust_list"},
- {"splice"},
- {"tee"},
- {"sync_file_range"},
- {"vmsplice"},
- {"move_pages"},
- {"utimensat"}, /* 280 */
- {"epoll_pwait"},
- {"signalfd"},
- {"timerfd_create"},
- {"eventfd"},
- {"fallocate"},
- {"timerfd_settime"},
- {"timerfd_gettime"},
- {"accept4"},
- {"signalfd4"},
- {"eventfd2"}, /* 290 */
- {"epoll_create1"},
- {"dup3"},
- {"pipe2"},
- {"inotify_init1"},
- {"preadv"},
- {"pwritev"},
- {"rt_tgsigqueueinfo"},
- {"perf_event_open"},
- {"recvmmsg"},
- {"fanotify_init"}, /* 300 */
- {"fanotify_mark"},
- {"prlimit64"},
- {"name_to_handle_at"},
- {"open_by_handle_at"},
- {"clock_adjtime"},
- {"syncfs"},
- {"sendmmsg"},
- {"setns"},
- {"getcpu"},
- {"process_vm_readv"}, /* 310 */
- {"process_vm_writev"},
- {"kcmp"},
- {"finit_module"},
- {"sched_setattr"},
- {"sched_getattr"},
- {"renameat2"}, /* 316 */
-
- /* XXX gap then x32 syscalls from 512 - 544 */
-
- NULL /* NULL-termination is required for lx_systrace */
-};
-#endif
-
typedef struct lx_systrace_sysent {
const char *lss_name;
dtrace_id_t lss_entry;
@@ -1034,40 +345,30 @@ lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
lx_systrace_devi = devi;
/*
- * Count up the 32-bit Linux system calls.
- */
- for (i = 0; lx_sysnames32[i].sy_name != NULL; i++)
- continue;
-
- /*
* Initialize the 32-bit table.
*/
- lx_systrace_sysent32 = kmem_zalloc(i * sizeof (lx_systrace_sysent_t),
- KM_SLEEP);
- lx_systrace_nsysent32 = i;
+ VERIFY(lx_nsysent32 > 0);
+ lx_systrace_nsysent32 = lx_nsysent32;
+ lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 *
+ sizeof (lx_systrace_sysent_t), KM_SLEEP);
for (i = 0; i < lx_systrace_nsysent32; i++) {
- lx_systrace_sysent32[i].lss_name = lx_sysnames32[i].sy_name;
+ lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name;
lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE;
lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE;
}
#if defined(_LP64)
/*
- * Count up the 64-bit Linux system calls.
- */
- for (i = 0; lx_sysnames64[i].sy_name != NULL; i++)
- continue;
-
- /*
* Initialize the 64-bit table.
*/
- lx_systrace_sysent64 = kmem_zalloc(i * sizeof (lx_systrace_sysent_t),
- KM_SLEEP);
- lx_systrace_nsysent64 = i;
+ VERIFY(lx_nsysent64 > 0);
+ lx_systrace_nsysent64 = lx_nsysent64;
+ lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 *
+ sizeof (lx_systrace_sysent_t), KM_SLEEP);
for (i = 0; i < lx_systrace_nsysent64; i++) {
- lx_systrace_sysent64[i].lss_name = lx_sysnames64[i].sy_name;
+ lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name;
lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE;
lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE;
}
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
index 4507c0303c..fc9aaa6055 100644
--- a/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -28,6 +28,110 @@
* Copyright 2015, Joyent, Inc. All rights reserved.
*/
+/*
+ * The LX Brand: emulation of a Linux operating environment within a zone.
+ *
+ * OVERVIEW
+ *
+ * The LX brand enables a full Linux userland -- including a C library,
+ * init(1) framework, and some set of applications -- to run unmodified
+ * within an illumos zone. Unlike illumos, where applications are expected
+ * to link against and consume functions exported from libraries, the
+ * supported Linux binary compatibility boundary is the system call
+ * interface. By accurately emulating the behaviour of Linux system calls,
+ * Linux software can be executed in this environment as if it were running
+ * on a native Linux system.
+ *
+ * EMULATING LINUX SYSTEM CALLS
+ *
+ * Linux system calls are made in 32-bit processes via the "int 0x80"
+ * instruction; in 64-bit processes the "syscall" instruction is used, as it
+ * is with native illumos processes. In both cases, arguments to system
+ * calls are generally passed in registers and the usermode stack is not
+ * interpreted or modified by the Linux kernel.
+ *
+ * When the emulated Linux process makes a system call, it traps into the
+ * illumos kernel. The in-kernel brand module contains various emulation
+ * routines, and can fully service some emulated system calls; e.g. read(2)
+ * and write(2). Other system calls require assistance from the illumos
+ * libc, bouncing back out to the brand library ("lx_brand.so.1") for
+ * emulation.
+ *
+ * The brand mechanism allows for the provision of an alternative trap
+ * handler for the various system call mechanisms. Traditionally this was
+ * used to immediately revector execution to the usermode emulation library,
+ * which was responsible for handling all system calls. In the interests of
+ * more accurate emulation and increased performance, much of the regular
+ * illumos system call path is now invoked. Only the argument processing and
+ * handler dispatch are replaced by the brand, via the per-LWP
+ * "lwp_brand_syscall" interposition function pointer.
+ *
+ * THE NATIVE AND BRAND STACKS
+ *
+ * Some runtime environments (e.g. the Go language) allocate very small
+ * thread stacks, preferring to grow or split the stack as necessary. The
+ * Linux kernel generally does not use the usermode stack when servicing
+ * system calls, so this is not a problem. In order for our emulation to
+ * have the same zero stack impact, we must execute usermode emulation
+ * routines on an _alternate_ stack. This is similar, in principle, to the
+ * use of sigaltstack(3C) to run signal handlers off the main thread stack.
+ *
+ * To this end, the brand library allocates and installs an alternate stack
+ * (called the "native" stack) for each LWP. The in-kernel brand code uses
+ * this stack for usermode emulation calls and interposed signal delivery,
+ * while the emulated Linux process sees only the data on the main thread
+ * stack, known as the "brand" stack. The stack mode is tracked in the
+ * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
+ *
+ * The stack mode doubles as a system call "mode bit". When in the
+ * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
+ * system calls. In other modes, system calls are assumed to be native
+ * illumos system calls as made during brand library initialisation and
+ * usermode emulation.
+ *
+ * USERMODE EMULATION
+ *
+ * When a Linux system call cannot be emulated within the kernel, we preserve
+ * the register state of the Linux process and revector the LWP to the brand
+ * library usermode emulation handler: the "lx_emulate()" function in
+ * "lx_brand.so.1". This revectoring is modelled on the delivery of signals,
+ * and is performed in "lx_emulate_user()".
+ *
+ * First, the emulated process state is written out to the usermode stack of
+ * the process as a "ucontext_t" object. Arguments to the emulation routine
+ * are passed on the stack or in registers, depending on the ABI. When the
+ * usermode emulation is complete, the result is passed back to the kernel
+ * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
+ * for restoration.
+ *
+ * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
+ *
+ * When servicing emulated system calls in the usermode brand library, or
+ * during signal delivery, various state is preserved by the kernel so that
+ * the running LWP may be revectored to a handling routine. The context
+ * allows the kernel to restart the program at the point of interruption,
+ * either at the return of the signal handler, via setcontext(3C); or after
+ * the usermode emulation request has been serviced, via B_EMULATION_DONE.
+ *
+ * In illumos native processes, the saved context (a "ucontext_t" object)
+ * includes the state of registers and the current signal mask at the point
+ * of interruption. The context also includes a link to the most recently
+ * saved context, forming a chain to be unwound as requests complete. The LX
+ * brand requires additional book-keeping to describe the machine state: in
+ * particular, the current stack mode and the occupied extent of the native
+ * stack.
+ *
+ * The brand code is able to interpose on the context save and restore
+ * operations in the kernel -- see "lx_savecontext()" and
+ * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
+ * function correctly in the face of a dual stack LWP. The brand also
+ * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
+ * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
+ * library interposer on the native stack, regardless of the interrupted
+ * execution mode. Linux sigaltstack(2) emulation is performed entirely by
+ * the usermode brand library during signal handler interposition.
+ */
+
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/errno.h>
@@ -63,6 +167,7 @@
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/core.h>
+#include <sys/stack.h>
#include <lx_signum.h>
int lx_debug = 0;
@@ -80,18 +185,16 @@ void lx_copy_procdata(proc_t *, proc_t *);
extern int getsetcontext(int, void *);
extern int waitsys(idtype_t, id_t, siginfo_t *, int);
#if defined(_SYSCALL32_IMPL)
+extern int getsetcontext32(int, void *);
extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
#endif
extern void lx_proc_exit(proc_t *, klwp_t *);
-static void lx_psig_to_proc(proc_t *, kthread_t *, int);
extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
extern void lx_ioctl_init();
extern void lx_ioctl_fini();
-int lx_systrace_brand_enabled;
-
lx_systrace_f *lx_systrace_entry_ptr;
lx_systrace_f *lx_systrace_return_ptr;
@@ -113,6 +216,15 @@ static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
static boolean_t lx_native_exec(uint8_t, const char **);
static uint32_t lx_map32limit(proc_t *);
+static void lx_savecontext(ucontext_t *);
+static void lx_restorecontext(ucontext_t *);
+static caddr_t lx_sendsig_stack(int);
+static void lx_sendsig(int);
+#if defined(_SYSCALL32_IMPL)
+static void lx_savecontext32(ucontext32_t *);
+#endif
+
+
/* lx brand */
struct brand_ops lx_brops = {
lx_init_brand_data, /* b_init_brand_data */
@@ -132,7 +244,7 @@ struct brand_ops lx_brops = {
lx_elfexec, /* b_elfexec */
NULL, /* b_sigset_native_to_brand */
NULL, /* b_sigset_brand_to_native */
- lx_psig_to_proc, /* b_psig_to_proc */
+ NULL, /* b_psig_to_proc */
NSIG, /* b_nsig */
lx_exit_with_sig, /* b_exit_with_sig */
lx_wait_filter, /* b_wait_filter */
@@ -142,14 +254,21 @@ struct brand_ops lx_brops = {
lx_stop_notify, /* b_stop_notify */
lx_waitid_helper, /* b_waitid_helper */
lx_sigcld_repost, /* b_sigcld_repost */
- lx_issig_stop /* b_issig_stop */
+ lx_issig_stop, /* b_issig_stop */
+ lx_savecontext, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ lx_savecontext32, /* b_savecontext32 */
+#endif
+ lx_restorecontext, /* b_restorecontext */
+ lx_sendsig_stack, /* b_sendsig_stack */
+ lx_sendsig /* b_sendsig */
};
struct brand_mach_ops lx_mops = {
NULL,
- lx_brand_int80_callback, /* 32-bit Linux entry point */
NULL,
- lx_brand_syscall_callback, /* 64-bit common entry point */
+ NULL,
+ NULL,
NULL,
lx_fixsegreg,
lx_fsbase
@@ -294,18 +413,7 @@ lx_map32limit(proc_t *p)
void
lx_brand_systrace_enable(void)
{
- extern void lx_brand_int80_enable(void);
-
- ASSERT(!lx_systrace_enabled);
-
-#if defined(__amd64)
- /* enable the trace points for both 32-bit and 64-bit lx calls */
- extern void lx_brand_syscall_enable(void);
- lx_brand_syscall_enable();
- lx_brand_int80_enable();
-#else
- lx_brand_int80_enable();
-#endif
+ VERIFY(!lx_systrace_enabled);
lx_systrace_enabled = 1;
}
@@ -313,106 +421,260 @@ lx_brand_systrace_enable(void)
void
lx_brand_systrace_disable(void)
{
- extern void lx_brand_int80_disable(void);
+ VERIFY(lx_systrace_enabled);
- ASSERT(lx_systrace_enabled);
+ lx_systrace_enabled = 0;
+}
-#if defined(__amd64)
- /* disable the trace points for both 32-bit and 64-bit lx calls */
- extern void lx_brand_syscall_disable(void);
- lx_brand_syscall_disable();
- lx_brand_int80_disable();
-#else
- lx_brand_int80_disable();
-#endif
+void
+lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
+{
+ VERIFY(lwpd->br_ntv_stack != 0);
- lx_systrace_enabled = 0;
+ /*
+ * The "brand-lx-set-ntv-stack-current" probe has arguments:
+ * arg0: stack pointer before change
+ * arg1: stack pointer after change
+ * arg2: current stack base
+ */
+ DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
+ uintptr_t, lwpd->br_ntv_stack_current,
+ uintptr_t, new_sp,
+ uintptr_t, lwpd->br_ntv_stack);
+
+ lwpd->br_ntv_stack_current = new_sp;
+}
+
+/*
+ * This hook runs prior to sendsig() processing and allows us to nominate
+ * an alternative stack pointer for delivery of the signal handling frame.
+ * Critically, this routine should _not_ modify any LWP state as the
+ * savecontext() does not run until after this hook.
+ */
+static caddr_t
+lx_sendsig_stack(int sig)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ /*
+ * We want to take signal delivery on the native stack, but only if
+ * one has been allocated and installed for this LWP.
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ /*
+ * The program is not running on the native stack. Return
+ * the native stack pointer from our brand-private data so
+ * that we may switch to it for signal handling.
+ */
+ return ((caddr_t)lwpd->br_ntv_stack_current);
+ } else {
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * Either the program is already running on the native stack,
+ * or one has not yet been allocated for this LWP. Use the
+ * current stack pointer value.
+ */
+ return ((caddr_t)rp->r_sp);
+ }
}
/*
- * Posting a signal to a proc/thread, switch to native syscall mode.
- * See the comment on lwp_segregs_save() for how we handle the user-land
- * registers when we come into the kernel and see update_sregs() for how we
- * restore.
+ * This hook runs after sendsig() processing and allows us to update the
+ * per-LWP mode flags for system calls and stacks. The pre-signal
+ * context has already been saved and delivered to the user at this point.
*/
-/*ARGSUSED*/
static void
-lx_psig_to_proc(proc_t *p, kthread_t *t, int sig)
+lx_sendsig(int sig)
{
-#if defined(__amd64)
- lx_lwp_data_t *lwpd = ttolxlwp(t);
- klwp_t *lwp = ttolwp(t);
- pcb_t *pcb;
- model_t datamodel;
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+
+ switch (lwpd->br_stack_mode) {
+ case LX_STACK_MODE_BRAND:
+ case LX_STACK_MODE_NATIVE:
+ /*
+ * In lx_sendsig_stack(), we nominated a stack pointer from the
+ * native stack. Update the stack mode, and the current in-use
+ * extent of the native stack, accordingly:
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
+
+ /*
+ * Fix up segment registers, etc.
+ */
+ lx_switch_to_native(lwp);
+ break;
- datamodel = lwp_getdatamodel(lwp);
- if (datamodel != DATAMODEL_NATIVE)
+ default:
+ /*
+ * Otherwise, the brand library has not yet installed the
+ * alternate stack for this LWP. Signals will be handled on
+ * the regular thread stack.
+ */
return;
+ }
+}
- pcb = &lwp->lwp_pcb;
+/*
+ * This hook runs prior to the context restoration, allowing us to take action
+ * or modify the context before it is loaded.
+ */
+static void
+lx_restorecontext(ucontext_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
+ caddr_t sp = ucp->uc_brand_data[1];
-#ifdef DEBUG
/*
- * Debug check to see if we have the correct fsbase.
- *
- * Note that it is not guaranteed that our %fsbase is loaded (i.e.
- * rdmsr(MSR_AMD_FSBASE) won't necessarily return our expected fsbase)
- * when this function runs. While it is usually loaded, it's possible
- * to be in this function via the following sequence:
- * we go off-cpu in the kernel
- * another process runs in user-land and its fsbase gets loaded
- * we go on-cpu to run and post a signal, but since we haven't run
- * in user-land yet, our fsbase has not yet been loaded by
- * update_sregs.
+ * We have a saved native stack pointer value that we must restore
+ * into the per-LWP data.
*/
- if (lwpd->br_ntv_syscall == 0 && lwpd->br_lx_fsbase != 0) {
- /* should have Linux fsbase */
- if (lwpd->br_lx_fsbase != pcb->pcb_fsbase) {
- DTRACE_PROBE2(brand__lx__psig__lx__pcb,
- uintptr_t, lwpd->br_lx_fsbase,
- uintptr_t, pcb->pcb_fsbase);
- }
+ if (flags & LX_UC_RESTORE_NATIVE_SP) {
+ lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
+ }
+
+ /*
+ * We do not wish to restore the value of uc_link in this context,
+ * so replace it with the value currently in the LWP.
+ */
+ if (flags & LX_UC_IGNORE_LINK) {
+ ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
+ }
+ /*
+ * Restore the stack mode:
+ */
+ if (flags & LX_UC_STACK_NATIVE) {
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ } else if (flags & LX_UC_STACK_BRAND) {
+ lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
}
- if (lwpd->br_ntv_syscall == 1 && lwpd->br_ntv_fsbase != 0) {
- /* should have Illumos fsbase */
- if (lwpd->br_ntv_fsbase != pcb->pcb_fsbase) {
- DTRACE_PROBE2(brand__lx__psig__ntv__pcb,
- uintptr_t, lwpd->br_ntv_fsbase,
- uintptr_t, pcb->pcb_fsbase);
+#if defined(__amd64)
+ /*
+ * Override the fsbase in the context with the value provided through
+ * the Linux arch_prctl(2) system call.
+ */
+ if (flags & LX_UC_STACK_BRAND) {
+ if (lwpd->br_lx_fsbase != 0) {
+ ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
}
}
#endif
+}
+
+static void
+lx_savecontext(ucontext_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ uintptr_t flags = 0;
- /* We "push" the current syscall mode flag on the "stack". */
- ASSERT(lwpd->br_ntv_syscall == 0 || lwpd->br_ntv_syscall == 1);
- lwpd->br_scms = (lwpd->br_scms << 1) | lwpd->br_ntv_syscall;
+ /*
+ * The ucontext_t affords us three private pointer-sized members in
+ * "uc_brand_data". We pack a variety of flags into the first element,
+ * and an optional stack pointer in the second element. The flags
+ * determine which stack pointer (native or brand), if any, is stored
+ * in the second element. The third element may contain the system
+ * call number; this is analogous to the "orig_[er]ax" member of a
+ * Linux "user_regs_struct".
+ */
- if (lwpd->br_ntv_syscall == 0 && lwpd->br_ntv_fsbase != 0) {
+ if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+ lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
/*
- * We were executing in Linux code but now that we're handling
- * a signal we have to make sure we have the native fsbase
- * loaded. Also update pcb so that if we service an interrupt
- * we will restore the correct fsbase in update_sregs().
- * Because of the amd64 guard and datamodel check, this
- * obviously will only happen for the 64-bit user-land.
- *
- * There is a non-obvious side-effect here. Since the fsbase
- * will now be the native value, when we bounce out to
- * user-land the ucontext will capture the native value, even
- * though we need to restore the Linux value when we return
- * from the signal. This is handled by the B_SIGNAL_RETURN
- * code in lx_brandsys().
+ * Record the value of the native stack pointer to restore
+ * when returning to this branded context:
*/
- pcb->pcb_fsbase = lwpd->br_ntv_fsbase;
+ flags |= LX_UC_RESTORE_NATIVE_SP;
+ ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
+ }
- /* Ensure that we go out via update_sregs */
- pcb->pcb_rupdate = 1;
+ /*
+ * Save the stack mode:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+ flags |= LX_UC_STACK_NATIVE;
+ } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ flags |= LX_UC_STACK_BRAND;
}
- lwpd->br_ntv_syscall = 1;
-#endif
+
+ /*
+ * If we might need to restart this system call, save that information
+ * in the context:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ ucp->uc_brand_data[2] =
+ (void *)(uintptr_t)lwpd->br_syscall_num;
+ if (lwpd->br_syscall_restart) {
+ flags |= LX_UC_RESTART_SYSCALL;
+ }
+ } else {
+ ucp->uc_brand_data[2] = NULL;
+ }
+
+ ucp->uc_brand_data[0] = (void *)flags;
+}
+
+#if defined(_SYSCALL32_IMPL)
+static void
+lx_savecontext32(ucontext32_t *ucp)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ unsigned int flags = 0;
+
+ /*
+ * The ucontext32_t affords us three private pointer-sized members in
+ * "uc_brand_data". We pack a variety of flags into the first element,
+ * and an optional stack pointer in the second element. The flags
+ * determine which stack pointer (native or brand), if any, is stored
+ * in the second element. The third element may contain the system
+ * call number; this is analogous to the "orig_[er]ax" member of a
+ * Linux "user_regs_struct".
+ */
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+ lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ /*
+ * Record the value of the native stack pointer to restore
+ * when returning to this branded context:
+ */
+ flags |= LX_UC_RESTORE_NATIVE_SP;
+ ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
+ }
+
+ /*
+ * Save the stack mode:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+ flags |= LX_UC_STACK_NATIVE;
+ } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ flags |= LX_UC_STACK_BRAND;
+ }
+
+ /*
+ * If we might need to restart this system call, save that information
+ * in the context:
+ */
+ if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+ ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
+ if (lwpd->br_syscall_restart) {
+ flags |= LX_UC_RESTART_SYSCALL;
+ }
+ } else {
+ ucp->uc_brand_data[2] = NULL;
+ }
+
+ ucp->uc_brand_data[0] = flags;
}
+#endif
void
lx_init_brand_data(zone_t *zone)
@@ -426,7 +688,6 @@ lx_init_brand_data(zone_t *zone)
* This can be changed by a call to setattr() during zone boot.
*/
(void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
- data->lxzd_max_syscall = LX_NSYSCALLS;
zone->zone_brand_data = data;
/*
@@ -448,6 +709,27 @@ lx_unsupported(char *dmsg)
DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
}
+void
+lx_trace_sysenter(int syscall_num, uintptr_t *args)
+{
+ if (lx_systrace_enabled) {
+ VERIFY(lx_systrace_entry_ptr != NULL);
+
+ (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
+ args[2], args[3], args[4], args[5]);
+ }
+}
+
+void
+lx_trace_sysreturn(int syscall_num, long ret)
+{
+ if (lx_systrace_enabled) {
+ VERIFY(lx_systrace_return_ptr != NULL);
+
+ (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
+ }
+}
+
/*
* Get the addresses of the user-space system call handler and attach it to
* the proc structure. Returning 0 indicates success; the value returned
@@ -462,16 +744,16 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
{
kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
proc_t *p = ttoproc(t);
lx_proc_data_t *pd;
- int ike_call;
struct termios *termios;
uint_t termios_len;
int error;
int code;
int sig;
lx_brand_registration_t reg;
- lx_lwp_data_t *lwpd;
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
/*
* There is one operation that is suppored for non-branded
@@ -480,8 +762,8 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
* a branded process.
*/
if (cmd == B_EXEC_BRAND) {
- ASSERT(p->p_zone != NULL);
- ASSERT(p->p_zone->zone_brand == &lx_brand);
+ VERIFY(p->p_zone != NULL);
+ VERIFY(p->p_zone->zone_brand == &lx_brand);
return (exec_common(
(char *)arg1, (const char **)arg2, (const char **)arg3,
EBA_BRAND));
@@ -489,13 +771,19 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
/* For all other operations this must be a branded process. */
if (p->p_brand == NULL)
- return (set_errno(ENOSYS));
+ return (ENOSYS);
- ASSERT(p->p_brand == &lx_brand);
- ASSERT(p->p_brand_data != NULL);
+ VERIFY(p->p_brand == &lx_brand);
+ VERIFY(p->p_brand_data != NULL);
switch (cmd) {
case B_REGISTER:
+ if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ lx_print("stack mode was not PREINIT during "
+ "REGISTER\n");
+ return (EINVAL);
+ }
+
if (p->p_model == DATAMODEL_NATIVE) {
if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
lx_print("Failed to copyin brand registration "
@@ -517,10 +805,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
reg.lxbr_version = (uint_t)reg32.lxbr_version;
reg.lxbr_handler =
(void *)(uintptr_t)reg32.lxbr_handler;
- reg.lxbr_tracehandler =
- (void *)(uintptr_t)reg32.lxbr_tracehandler;
- reg.lxbr_traceflag =
- (void *)(uintptr_t)reg32.lxbr_traceflag;
}
#endif
@@ -534,34 +818,9 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
(void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
pd = p->p_brand_data;
pd->l_handler = (uintptr_t)reg.lxbr_handler;
- pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler;
- pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag;
-
-#if defined(__amd64)
- /*
- * When we register, start with native syscalls enabled so that
- * lx_init can finish initialization before switch to Linux
- * syscall mode. Also initialize the syscall mode "stack" to
- * native. We push/pop bits into this "stack" during signal
- * handling.
- */
- lwpd = ttolxlwp(t);
- lwpd->br_ntv_syscall = 1;
- lwpd->br_scms = 1;
-#endif
- if (pd->l_traceflag != NULL && pd->l_ptrace != 0) {
- /*
- * If ptrace(2) is active on this process, it is likely
- * that we just finished an emulated execve(2) in a
- * traced child. The usermode traceflag will have been
- * clobbered by the exec, so we set it again here:
- */
- (void) suword32((void *)pd->l_traceflag, 1);
- }
-
- *rval = 0;
return (0);
+
case B_TTYMODES:
/* This is necessary for emulating TCGETS ioctls. */
if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
@@ -577,7 +836,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
}
ddi_prop_free(termios);
- *rval = 0;
return (0);
case B_ELFDATA:
@@ -585,8 +843,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
if (get_udatamodel() == DATAMODEL_NATIVE) {
if (copyout(&pd->l_elf_data, (void *)arg1,
sizeof (lx_elf_data_t)) != 0) {
- (void) set_errno(EFAULT);
- return (*rval = -1);
+ return (EFAULT);
}
}
#if defined(_LP64)
@@ -603,23 +860,15 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
if (copyout(&led32, (void *)arg1,
sizeof (led32)) != 0) {
- (void) set_errno(EFAULT);
- return (*rval = -1);
+ return (EFAULT);
}
}
#endif
- *rval = 0;
return (0);
case B_EXEC_NATIVE:
- error = exec_common(
- (char *)arg1, (const char **)arg2, (const char **)arg3,
- EBA_NATIVE);
- if (error) {
- (void) set_errno(error);
- return (*rval = -1);
- }
- return (*rval = 0);
+ return (exec_common((char *)arg1, (const char **)arg2,
+ (const char **)arg3, EBA_NATIVE));
/*
* The B_TRUSS_POINT subcommand is used so that we can make a no-op
@@ -627,99 +876,34 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
* emulation.
*/
case B_TRUSS_POINT:
- *rval = 0;
return (0);
- case B_LPID_TO_SPAIR:
+ case B_LPID_TO_SPAIR: {
/*
* Given a Linux pid as arg1, return the Solaris pid in arg2 and
* the Solaris LWP in arg3. We also translate pid 1 (which is
* hardcoded in many applications) to the zone's init process.
*/
- {
- pid_t s_pid;
- id_t s_tid;
-
- if ((pid_t)arg1 == 1) {
- s_pid = p->p_zone->zone_proc_initpid;
- /* handle the dead/missing init(1M) case */
- if (s_pid == -1)
- s_pid = 1;
- s_tid = 1;
- } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid,
- &s_tid) < 0)
- return (ESRCH);
-
- if (copyout(&s_pid, (void *)arg2,
- sizeof (s_pid)) != 0 ||
- copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0)
- return (EFAULT);
-
- *rval = 0;
- return (0);
- }
-
- case B_SYSENTRY:
- if (lx_systrace_enabled) {
- ASSERT(lx_systrace_entry_ptr != NULL);
-
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- uintptr_t a[6];
-
- if (copyin((void *)arg2, a, sizeof (a)) != 0)
- return (EFAULT);
-
- (*lx_systrace_entry_ptr)(arg1, a[0], a[1],
- a[2], a[3], a[4], a[5]);
- }
-#if defined(_LP64)
- else {
- /* 32-bit userland on 64-bit kernel */
- uint32_t a[6];
-
- if (copyin((void *)arg2, a, sizeof (a)) != 0)
- return (EFAULT);
-
- (*lx_systrace_entry_ptr)(arg1, a[0], a[1],
- a[2], a[3], a[4], a[5]);
- }
-#endif
+ pid_t s_pid;
+ id_t s_tid;
+
+ if ((pid_t)arg1 == 1) {
+ s_pid = p->p_zone->zone_proc_initpid;
+ /* handle the dead/missing init(1M) case */
+ if (s_pid == -1)
+ s_pid = 1;
+ s_tid = 1;
+ } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
+ return (ESRCH);
}
- (void) lx_ptrace_stop(LX_PR_SYSENTRY);
-
- pd = p->p_brand_data;
-
- /*
- * If neither DTrace not ptrace are interested in tracing
- * this process any more, turn off the trace flag.
- */
- if (!lx_systrace_enabled && !pd->l_ptrace)
- (void) suword32((void *)pd->l_traceflag, 0);
-
- *rval = 0;
- return (0);
-
- case B_SYSRETURN:
- if (lx_systrace_enabled) {
- ASSERT(lx_systrace_return_ptr != NULL);
-
- (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0);
+ if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
+ copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
+ return (EFAULT);
}
- (void) lx_ptrace_stop(LX_PR_SYSEXIT);
-
- pd = p->p_brand_data;
-
- /*
- * If neither DTrace not ptrace are interested in tracing
- * this process any more, turn off the trace flag.
- */
- if (!lx_systrace_enabled && !pd->l_ptrace)
- (void) suword32((void *)pd->l_traceflag, 0);
-
- *rval = 0;
return (0);
+ }
case B_SET_AFFINITY_MASK:
case B_GET_AFFINITY_MASK:
@@ -735,7 +919,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
case B_PTRACE_STOP_FOR_OPT:
return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
- B_FALSE : B_TRUE, (ulong_t)arg3));
+ B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
case B_PTRACE_CLONE_BEGIN:
return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
@@ -783,8 +967,7 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
return (0);
}
- case B_UNSUPPORTED:
- {
+ case B_UNSUPPORTED: {
char dmsg[256];
if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
@@ -794,11 +977,11 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
}
dmsg[255] = '\0';
lx_unsupported(dmsg);
- }
return (0);
+ }
- case B_STORE_ARGS:
+ case B_STORE_ARGS: {
/*
* B_STORE_ARGS subcommand
* arg1 = address of struct to be copied in
@@ -806,141 +989,208 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
* arg3-arg6 ignored
* rval = the amount of data copied.
*/
- {
- int err;
- void *buf;
-
- lwpd = ttolxlwp(curthread);
- /* only have upper limit because arg2 is unsigned */
- if (arg2 > LX_BR_ARGS_SIZE_MAX) {
- return (EINVAL);
- }
+ void *buf;
- buf = kmem_alloc(arg2, KM_SLEEP);
- if ((err = copyin((void *)arg1, buf, arg2)) != 0) {
- lx_print("Failed to copyin scall arg at 0x%p\n",
- (void *) arg1);
- kmem_free(buf, arg2);
- /*
- * Purposely not setting br_scall_args to NULL
- * to preserve data for debugging.
- */
- return (EFAULT);
- }
+ /* only have upper limit because arg2 is unsigned */
+ if (arg2 > LX_BR_ARGS_SIZE_MAX) {
+ return (EINVAL);
+ }
- if (lwpd->br_scall_args != NULL) {
- ASSERT(lwpd->br_args_size > 0);
- kmem_free(lwpd->br_scall_args,
- lwpd->br_args_size);
- }
+ buf = kmem_alloc(arg2, KM_SLEEP);
+ if (copyin((void *)arg1, buf, arg2) != 0) {
+ lx_print("Failed to copyin scall arg at 0x%p\n",
+ (void *) arg1);
+ kmem_free(buf, arg2);
+ /*
+ * Purposely not setting br_scall_args to NULL
+ * to preserve data for debugging.
+ */
+ return (EFAULT);
+ }
- lwpd->br_scall_args = buf;
- lwpd->br_args_size = arg2;
- *rval = arg2;
- return (0);
+ if (lwpd->br_scall_args != NULL) {
+ ASSERT(lwpd->br_args_size > 0);
+ kmem_free(lwpd->br_scall_args,
+ lwpd->br_args_size);
}
- case B_CLR_NTV_SYSC_FLAG:
-#if defined(__amd64)
- lwpd = ttolxlwp(curthread);
- lwpd->br_ntv_syscall = 0;
+ lwpd->br_scall_args = buf;
+ lwpd->br_args_size = arg2;
+ *rval = arg2;
+ return (0);
+ }
+
+ case B_HELPER_CLONE:
+ return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
+ (void *)arg4));
+ case B_HELPER_SETGROUPS:
+ return (lx_helper_setgroups(arg1, (gid_t *)arg2));
+
+ case B_HELPER_SIGQUEUE:
+ return (lx_helper_rt_sigqueueinfo(arg1, arg2,
+ (siginfo_t *)arg3));
+
+ case B_HELPER_TGSIGQUEUE:
+ return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
+ (siginfo_t *)arg4));
+
+ case B_SET_THUNK_PID:
+ lwpd->br_lx_thunk_pid = arg1;
+ return (0);
+
+ case B_GETPID:
/*
- * If Linux fsbase has been set, restore it. The user-level
- * code only ever calls this in the 64-bit library.
- *
- * Note that it is not guaranteed that our %fsbase is loaded
- * (i.e. rdmsr(MSR_AMD_FSBASE) won't necessarily return our
- * expected fsbase) when this block runs. While it is usually
- * loaded, it's possible to be in this function via the
- * following sequence:
- * we make the brandsys syscall and go off-cpu on entering
- * the kernel
- * another process runs in user-land and its fsbase gets
- * loaded
- * we go on-cpu to finish the syscall but since we haven't
- * run again in user-land yet, our fsbase has not yet been
- * reloaded by update_sregs
+ * The usermode clone(2) code needs to be able to call
+ * lx_getpid() from native code:
*/
- if (lwpd->br_lx_fsbase != 0) {
- klwp_t *lwp = ttolwp(t);
- pcb_t *pcb = &lwp->lwp_pcb;
-
- pcb->pcb_fsbase = lwpd->br_lx_fsbase;
+ *rval = lx_getpid();
+ return (0);
- /* Ensure that we go out via update_sregs */
- pcb->pcb_rupdate = 1;
+ case B_SET_NATIVE_STACK:
+ /*
+ * B_SET_NATIVE_STACK subcommand
+ * arg1 = the base of the stack to use for emulation
+ */
+ if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+ lx_print("B_SET_NATIVE_STACK when stack was already "
+ "set to %p\n", (void *)arg1);
+ return (EEXIST);
}
-#endif
- return (0);
- case B_SIGNAL_RETURN:
-#if defined(__amd64)
/*
- * Set the syscall mode and do the setcontext syscall. The
- * user-level code only ever calls this in the 64-bit library.
+ * We move from the PREINIT state, where we have no brand
+ * emulation stack, to the INIT state. Here, we are still
+ * running on what will become the BRAND stack, but are running
+ * emulation (i.e. native) code. Once the initialisation
+ * process for this thread has finished, we will jump to
+ * brand-specific code, while moving to the BRAND mode.
*
- * We get the previous syscall mode off of the br_scms "stack".
- * That is a sequence of syscall mode flag bits we've pushed
- * into that int as we took signals.
- * arg1 = ucontext_t pointer
+ * When a new LWP is created, lx_initlwp() will clear the
+ * stack data. If that LWP is actually being duplicated
+ * into a child process by fork(2), lx_forklwp() will copy
+ * it so that the cloned thread will keep using the same
+ * alternate stack.
+ */
+ lwpd->br_ntv_stack = arg1;
+ lwpd->br_stack_mode = LX_STACK_MODE_INIT;
+ lx_lwp_set_native_stack_current(lwpd, arg1);
+
+ return (0);
+
+ case B_GET_CURRENT_CONTEXT:
+ /*
+ * B_GET_CURRENT_CONTEXT subcommand:
+ * arg1 = address for pointer to current ucontext_t
*/
- lwpd = ttolxlwp(curthread);
- lwpd->br_ntv_syscall = lwpd->br_scms & 0x1;
- /* "pop" this value from the "stack" */
- lwpd->br_scms >>= 1;
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
+
+ error = copyout(&addr, (void *)arg1, sizeof (addr));
+ } else
+#endif
+ {
+ error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
+ sizeof (lwp->lwp_oldcontext));
+ }
+
+ return (error != 0 ? EFAULT : 0);
+ case B_JUMP_TO_LINUX:
/*
- * If setting the mode to lx, make sure we fix up the context
- * so that we load the lx fsbase when we return to the Linux
- * code. For the native case, the context already has the
- * correct native fsbase so we don't need to do anything here.
- * Note that setgregs updates the pcb and in update_sregs we
- * wrmsr the correct fsbase when we return to user-level.
- * getsetcontext -> restorecontext -> setgregs
+ * B_JUMP_TO_LINUX subcommand:
+ * arg1 = ucontext_t pointer for jump state
*/
- if (lwpd->br_ntv_syscall == 0 && lwpd->br_lx_fsbase != 0 &&
- arg1 != NULL) {
+
+ if (arg1 == NULL)
+ return (EINVAL);
+
+ switch (lwpd->br_stack_mode) {
+ case LX_STACK_MODE_NATIVE: {
+ struct regs *rp = lwptoregs(lwp);
+
/*
- * Linux fsbase has been initialized, restore it.
- * We have to copyin to modify since the user-level
- * emulation doesn't have a copy of the lx fsbase or
- * know that we are returning to Linux code.
+ * We are on the NATIVE stack, so we must preserve
+ * the extent of that stack. The pointer will be
+ * reset by a future setcontext().
*/
- ucontext_t uc;
- klwp_t *lwp = ttolwp(t);
- pcb_t *pcb = &lwp->lwp_pcb;
-
- if (copyin((void *)arg1, &uc, sizeof (ucontext_t) -
- sizeof (uc.uc_filler) -
- sizeof (uc.uc_mcontext.fpregs)))
- return (set_errno(EFAULT));
+ lx_lwp_set_native_stack_current(lwpd,
+ (uintptr_t)rp->r_sp);
+ break;
+ }
- uc.uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
+ case LX_STACK_MODE_INIT:
+ /*
+ * The LWP is transitioning to Linux code for the first
+ * time.
+ */
+ break;
- if (copyout(&uc, (void *)arg1, sizeof (ucontext_t) -
- sizeof (uc.uc_filler) -
- sizeof (uc.uc_mcontext.fpregs)))
- return (set_errno(EFAULT));
+ case LX_STACK_MODE_PREINIT:
+ /*
+ * This LWP has not installed an alternate stack for
+ * usermode emulation handling.
+ */
+ return (ENOENT);
- /* Ensure that we go out via update_sregs */
- pcb->pcb_rupdate = 1;
+ case LX_STACK_MODE_BRAND:
+ /*
+ * The LWP should not be on the BRAND stack.
+ */
+ exit(CLD_KILLED, SIGSYS);
+ return (0);
}
-#endif /* amd64 */
- return (getsetcontext(SETCONTEXT, (void *)arg1));
- case B_UNWIND_NTV_SYSC_FLAG:
-#if defined(__amd64)
/*
- * Used when exiting to support the setcontext back to the
- * getcontext we performed in lx_init. We need to unwin
- * whatever signal state is in br_scms since we are exiting.
- * This sets us up for the B_SIGNAL_RETURN from lx_setcontext.
+ * Transfer control to Linux:
*/
- lwpd = ttolxlwp(curthread);
- lwpd->br_scms = 1;
+ return (lx_runexe(lwp, (void *)arg1));
+
+ case B_EMULATION_DONE:
+ /*
+ * B_EMULATION_DONE subcommand:
+ * arg1 = ucontext_t * to restore
+ * arg2 = system call number
+ * arg3 = return code
+ * arg4 = if operation failed, the errno value
+ */
+
+ /*
+ * The first part of this operation is a setcontext() to
+ * restore the register state to the copy we preserved
+ * before vectoring to the usermode emulation routine.
+ * If that fails, we return (hopefully) to the emulation
+ * routine and it will handle the error.
+ */
+#if (_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ error = getsetcontext32(SETCONTEXT, (void *)arg1);
+ } else
#endif
+ {
+ error = getsetcontext(SETCONTEXT, (void *)arg1);
+ }
+
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * The saved Linux context has been restored. We handle the
+ * return value or errno with code common to the in-kernel
+ * system call emulation.
+ */
+ if ((error = (int)arg4) != 0) {
+ /*
+ * lx_syscall_return() looks at the errno in the LWP,
+ * so set it here:
+ */
+ set_errno(error);
+ }
+ lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
+
return (0);
case B_EXIT_AS_SIG:
@@ -959,41 +1209,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
exit(code, sig);
/* NOTREACHED */
break;
-
- case B_IKE_SYSCALL:
- if (arg1 > LX_N_IKE_FUNCS)
- return (EINVAL);
-
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- uintptr_t a[6];
-
- if (copyin((void *)arg2, a, sizeof (a)) != 0)
- return (EFAULT);
-
- *rval = lx_emulate_syscall(arg1, a[0], a[1],
- a[2], a[3], a[4], a[5]);
-#if defined(_LP64)
- } else {
- /* 32-bit userland on 64-bit kernel */
- uint32_t a[6];
-
- if (copyin((void *)arg2, a, sizeof (a)) != 0)
- return (EFAULT);
-
- *rval = lx_emulate_syscall(arg1, a[0], a[1],
- a[2], a[3], a[4], a[5]);
-#endif
- }
-
- return (0);
-
- default:
- ike_call = cmd - B_IKE_SYSCALL;
- if (ike_call > 0 && ike_call <= LX_N_IKE_FUNCS) {
- *rval = lx_emulate_syscall(ike_call, arg1, arg2,
- arg3, arg4, arg5, 0xbadbeef);
- return (0);
- }
}
return (EINVAL);
@@ -1443,11 +1658,37 @@ lx_native_exec(uint8_t osabi, const char **interp)
return (B_TRUE);
}
+static void
+lx_syscall_init(void)
+{
+ int i;
+
+ /*
+ * Count up the 32-bit Linux system calls. Note that lx_sysent32
+ * has (LX_NSYSCALLS + 1) entries.
+ */
+ for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
+ continue;
+ lx_nsysent32 = i;
+
+#if defined(_LP64)
+ /*
+ * Count up the 64-bit Linux system calls. Note that lx_sysent64
+ * has (LX_NSYSCALLS + 1) entries.
+ */
+ for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
+ continue;
+ lx_nsysent64 = i;
+#endif
+}
+
int
_init(void)
{
int err = 0;
+ lx_syscall_init();
+
/* pid/tid conversion hash tables */
lx_pid_init();
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
index abb0ab6e63..ebe37a01c0 100644
--- a/usr/src/uts/common/brand/lx/os/lx_misc.c
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -88,7 +88,6 @@ lx_exec()
* invalid; clear them.
*/
pd->l_handler = NULL;
- pd->l_tracehandler = NULL;
/*
* There are two mutually exclusive special cases we need to
@@ -118,12 +117,20 @@ lx_exec()
* we are traced we can post either the PTRACE_EVENT_EXEC event or the
* legacy SIGTRAP.
*/
- (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0);
+ (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
/* clear the fsbase values until the app. can reinitialize them */
lwpd->br_lx_fsbase = NULL;
lwpd->br_ntv_fsbase = NULL;
+ /*
+ * Clear the native stack flags. This will be reinitialised by
+ * lx_init() in the new process image.
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+ lwpd->br_ntv_stack = 0;
+ lwpd->br_ntv_stack_current = 0;
+
installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save,
NULL);
@@ -236,6 +243,11 @@ lx_freelwp(klwp_t *lwp)
{
struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+ /*
+ * Remove our system call interposer.
+ */
+ lwp->lwp_brand_syscall = NULL;
+
if (lwpd != NULL) {
(void) removectx(lwptot(lwp), lwp, lx_save, lx_restore,
NULL, NULL, lx_save, NULL);
@@ -269,8 +281,7 @@ lx_initlwp(klwp_t *lwp)
lwpd->br_clear_ctidp = NULL;
lwpd->br_set_ctidp = NULL;
lwpd->br_signal = 0;
- lwpd->br_ntv_syscall = 1;
- lwpd->br_scms = 1;
+ lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
/*
* lwpd->br_affinitymask was zeroed by kmem_zalloc()
@@ -320,6 +331,11 @@ lx_initlwp(klwp_t *lwp)
lx_ptrace_inherit_tracer(plwpd, lwpd);
}
+ /*
+ * Install branded system call hook for this LWP:
+ */
+ lwp->lwp_brand_syscall = lx_syscall_enter;
+
return (0);
}
@@ -339,6 +355,27 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
dst->br_ptid = lwptot(srclwp)->t_tid;
bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
+ switch (src->br_stack_mode) {
+ case LX_STACK_MODE_BRAND:
+ case LX_STACK_MODE_NATIVE:
+ /*
+ * The parent LWP has an alternate stack installed.
+ * The child LWP should have the same stack base and extent.
+ */
+ dst->br_stack_mode = src->br_stack_mode;
+ dst->br_ntv_stack = src->br_ntv_stack;
+ dst->br_ntv_stack_current = src->br_ntv_stack_current;
+ break;
+
+ default:
+ /*
+ * Otherwise, clear the stack data for this LWP.
+ */
+ dst->br_stack_mode = LX_STACK_MODE_PREINIT;
+ dst->br_ntv_stack = 0;
+ dst->br_ntv_stack_current = 0;
+ }
+
/*
* copy only these flags
*/
@@ -436,7 +473,7 @@ lx_fixsegreg(greg_t sr, model_t datamodel)
}
/*
- * Brand-specific function to convert the fsbase as pulled from the regsiter
+ * Brand-specific function to convert the fsbase as pulled from the register
* into a native fsbase suitable for locating the ulwp_t from the kernel.
*/
uintptr_t
@@ -444,8 +481,10 @@ lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
{
lx_lwp_data_t *lwpd = lwp->lwp_brand;
- if (lwpd->br_ntv_syscall || lwpd->br_ntv_fsbase == NULL)
+ if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
+ lwpd->br_ntv_fsbase == NULL) {
return (fsbase);
+ }
return (lwpd->br_ntv_fsbase);
}
diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
index 6e4b74531d..a97a1b6d43 100644
--- a/usr/src/uts/common/brand/lx/os/lx_ptrace.c
+++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
@@ -485,7 +485,8 @@ lx_ptrace_restart_lwp(klwp_t *lwp)
*/
rlwpd->br_ptrace_whystop = 0;
rlwpd->br_ptrace_whatstop = 0;
- rlwpd->br_ptrace_flags &= ~LX_PTRACE_CLDPEND;
+ rlwpd->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND |
+ LX_PTRACE_WAITPEND);
}
thread_unlock(rt);
}
@@ -551,9 +552,8 @@ lx_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag,
* so that it may be re-fetched on another call to waitid().
*/
if (waitflag) {
- remote->br_ptrace_whystop = 0;
- remote->br_ptrace_whatstop = 0;
- remote->br_ptrace_flags &= ~LX_PTRACE_CLDPEND;
+ remote->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND |
+ LX_PTRACE_WAITPEND);
}
}
@@ -637,6 +637,7 @@ lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what)
*/
lwpd->br_ptrace_whystop = why;
lwpd->br_ptrace_whatstop = what;
+ lwpd->br_ptrace_flags |= LX_PTRACE_WAITPEND;
/*
* If this event does not depend on an event from the parent LWP,
@@ -805,6 +806,60 @@ lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp)
return (error);
}
+static int
+lx_ptrace_getregs(lx_lwp_data_t *remote, void *uregsp)
+{
+ if (remote->br_stack_mode == LX_STACK_MODE_BRAND) {
+ /*
+ * The LWP was stopped with the brand stack and register
+ * state loaded, e.g. during a system call emulated within
+ * the kernel. Return the LWP register state.
+ */
+ return (lx_regs_to_userregs(remote, uregsp));
+ } else if (remote->br_ptrace_stopucp != NULL) {
+ /*
+ * The LWP was stopped in the usermode emulation library
+ * but a ucontext_t for the preserved brand stack and
+ * register state was provided. Return the register state
+ * from that ucontext_t.
+ */
+ return (lx_uc_to_userregs(remote,
+ (void *)remote->br_ptrace_stopucp, uregsp));
+ } else {
+ /*
+ * The register state is not currently available.
+ */
+ return (EIO);
+ }
+}
+
+static int
+lx_ptrace_setregs(lx_lwp_data_t *remote, void *uregsp)
+{
+ if (remote->br_stack_mode == LX_STACK_MODE_BRAND) {
+ /*
+ * The LWP was stopped with the brand stack and register
+ * state loaded, e.g. during a system call emulated within
+ * the kernel. Write to the LWP register state.
+ */
+ return (lx_userregs_to_regs(remote, uregsp));
+ } else if (remote->br_ptrace_stopucp != NULL) {
+ /*
+ * The LWP was stopped in the usermode emulation library
+ * but a ucontext_t for the preserved brand stack and
+ * register state was provided. Write to the register state
+ * in that ucontext_t.
+ */
+ return (lx_userregs_to_uc(remote,
+ (void *)remote->br_ptrace_stopucp, uregsp));
+ } else {
+ /*
+ * The register state is not currently available.
+ */
+ return (EIO);
+ }
+}
+
/*
* Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface.
*/
@@ -907,7 +962,6 @@ static int
lx_ptrace_attach(pid_t lx_pid)
{
int error = ESRCH;
- int32_t one = 1;
/*
* Our (Tracer) LWP:
*/
@@ -1016,15 +1070,9 @@ lx_ptrace_attach(pid_t lx_pid)
/*
* Set the in-kernel process-wide ptrace(2) enable flag.
- * Attempt also to write the usermode trace flag so that the
- * process knows to enter the kernel for potential ptrace(2)
- * syscall-stops.
*/
rprocd = ttolxproc(rthr);
rprocd->l_ptrace = 1;
- mutex_exit(&rproc->p_lock);
- (void) uwrite(rproc, &one, sizeof (one), rprocd->l_traceflag);
- mutex_enter(&rproc->p_lock);
error = 0;
}
@@ -1294,12 +1342,9 @@ lx_ptrace_traceme(void)
/*
* Set the in-kernel process-wide ptrace(2) enable
- * flag. Attempt also to write the usermode trace flag
- * so that the process knows to enter the kernel for
- * potential ptrace(2) syscall-stops.
+ * flag.
*/
procd->l_ptrace = 1;
- (void) suword32((void *)procd->l_traceflag, 1);
return (0);
}
@@ -1360,6 +1405,7 @@ lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what)
*/
lwpd->br_ptrace_flags &= ~(LX_PTRACE_STOPPING | LX_PTRACE_STOPPED |
LX_PTRACE_CLDPEND);
+ lwpd->br_ptrace_stopucp = NULL;
cv_broadcast(&lx_ptrace_busy_cv);
mutex_exit(&p->p_lock);
@@ -1367,7 +1413,8 @@ lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what)
}
int
-lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg)
+lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg,
+ uintptr_t ucp)
{
kthread_t *t = curthread;
klwp_t *lwp = ttolwp(t);
@@ -1453,6 +1500,12 @@ lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg)
}
/*
+ * Userland may have passed in a ucontext_t pointer for
+ * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped.
+ */
+ lwpd->br_ptrace_stopucp = ucp;
+
+ /*
* p_lock for the process containing the tracee will be dropped by
* lx_ptrace_stop_common().
*/
@@ -1874,7 +1927,8 @@ lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp)
continue;
}
- if (remote->br_ptrace_whystop == 0 ||
+ if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) ||
+ remote->br_ptrace_whystop == 0 ||
remote->br_ptrace_whatstop == 0) {
/*
* No (new) stop reason to post for this LWP.
@@ -2041,7 +2095,8 @@ lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
continue;
}
- if (remote->br_ptrace_whystop == 0 ||
+ if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) ||
+ remote->br_ptrace_whystop == 0 ||
remote->br_ptrace_whatstop == 0) {
/*
* No (new) stop reason to post for this LWP.
@@ -2230,6 +2285,14 @@ lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data)
error = lx_ptrace_geteventmsg(remote, (void *)data);
break;
+ case LX_PTRACE_GETREGS:
+ error = lx_ptrace_getregs(remote, (void *)data);
+ break;
+
+ case LX_PTRACE_SETREGS:
+ error = lx_ptrace_setregs(remote, (void *)data);
+ break;
+
default:
error = EINVAL;
}
diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c
index d26f10d851..5c6e5b29f3 100644
--- a/usr/src/uts/common/brand/lx/os/lx_syscall.c
+++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c
@@ -22,7 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/kmem.h>
@@ -34,145 +34,1093 @@
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/model.h>
+#include <sys/privregs.h>
#include <sys/brand.h>
#include <sys/machbrand.h>
+#include <sys/sdt.h>
#include <sys/lx_syscalls.h>
#include <sys/lx_brand.h>
#include <sys/lx_impl.h>
#include <sys/lx_misc.h>
+
/*
- * Some system calls return either a 32-bit or a 64-bit value, depending
- * on the datamodel.
+ * Flags for sysent entries:
*/
-#ifdef _LP64
-#define V_RVAL SE_64RVAL
-#else
-#define V_RVAL SE_32RVAL1
-#endif
+#define LX_SYS_NOSYS_REASON 0x07
+#define LX_SYS_EBPARG6 0x08
/*
- * Define system calls that return a native 'long' quantity i.e. a 32-bit
- * or 64-bit integer - depending on how the kernel is itself compiled
- * e.g. read(2) returns 'ssize_t' in the kernel and in userland.
+ * Flags that denote the specific reason we do not have a particular system
+ * call. These reasons are only valid if the function is NULL.
*/
-#define LX_CL(name, call, narg) \
- { V_RVAL, (name), (llfcn_t)(call), (narg) }
+#define NOSYS_USERMODE 0
+#define NOSYS_NULL 1
+#define NOSYS_NONE 2
+#define NOSYS_NO_EQUIV 3
+#define NOSYS_KERNEL 4
+#define NOSYS_UNDOC 5
+#define NOSYS_OBSOLETE 6
+#define NOSYS_MAX NOSYS_OBSOLETE
+
+#if NOSYS_MAX > LX_SYS_NOSYS_REASON
+#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON
+#endif
/*
- * Returns a 32 bit quantity regardless of datamodel
+ * Strings describing the reason we do not emulate a particular system call
+ * in the kernel.
*/
-#define LX_CI(name, call, narg) \
- { SE_32RVAL1, (name), (llfcn_t)(call), (narg) }
+static char *nosys_reasons[] = {
+ NULL, /* NOSYS_USERMODE means this call is emulated in usermode */
+ "Not done yet",
+ "No such Linux system call",
+ "No equivalent illumos functionality",
+ "Reads/modifies Linux kernel state",
+ "Undocumented and/or rarely used system call",
+ "Unsupported, obsolete system call"
+};
-#define LX_NOSYS(name) \
- {SE_64RVAL, (name), (llfcn_t)lx_nosys, 0}
-typedef int64_t (*llfcn_t)();
+#if defined(_LP64)
+/*
+ * System call handler table and entry count for Linux x86_64 (amd64):
+ */
+lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+int lx_nsysent64;
+#endif
+/*
+ * System call handler table and entry count for Linux x86 (i386):
+ */
+lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+int lx_nsysent32;
/*
- * In-Kernel Emulation table
- * The entries in this table are NOT indexed by either of the Linux syscall
- * numbers (32-bit or 64-bit). Instead, the entries are laid out linearly
- * with the LX_EMUL_* defines uses to lookup the correct entry.
+ * Map illumos errno values to their Linux equivalents.
*/
-typedef struct lx_ike {
- int sy_flags;
- char *sy_name;
- llfcn_t sy_callc;
- char sy_narg;
-} lx_ike_t;
-
-static lx_ike_t lx_ike_ent[] =
+int lx_stol_errno[] = LX_STOL_ERRNO_INIT;
+
+#if defined(__amd64)
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
{
- LX_NOSYS("lx_nosys"), /* 0 */
- LX_CL("getpid", lx_getpid, 0), /* 1 */
- LX_CL("kill", lx_kill, 2),
- LX_CL("pipe", lx_pipe, 1),
- LX_CL("brk", lx_brk, 1),
- LX_CL("getppid", lx_getppid, 0),
- LX_CL("sysinfo", lx_sysinfo, 1),
- LX_CL("clone", lx_clone, 5),
- LX_CL("modify_ldt", lx_modify_ldt, 3),
- LX_CL("sched_setparam", lx_sched_setparam, 2),
- LX_CL("sched_getparam", lx_sched_getparam, 2), /* 10 */
- LX_CL("sched_rr_get_interval", lx_sched_rr_get_interval, 2),
- LX_CL("setresuid16", lx_setresuid16, 3),
- LX_CL("setresgid16", lx_setresgid16, 3),
- LX_CL("rt_sigqueueinfo", lx_rt_sigqueueinfo, 3),
- LX_CL("setgroups", lx_setgroups, 2),
- LX_CL("setresuid", lx_setresuid, 3),
- LX_CL("setresgid", lx_setresgid, 3),
- LX_CL("gettid", lx_gettid, 0),
- LX_CL("tkill", lx_tkill, 2),
- LX_CL("futex", lx_futex, 6), /* 20 */
- LX_CL("set_thread_area", lx_set_thread_area, 1),
- LX_CL("get_thread_area", lx_get_thread_area, 1),
- LX_CL("set_tid_address", lx_set_tid_address, 1),
- LX_CL("pipe2", lx_pipe2, 2),
- LX_CL("rt_tgsigqueueinfo", lx_rt_tgsigqueueinfo, 4),
- LX_CL("arch_prctl", lx_arch_prctl, 2),
- LX_CL("tgkill", lx_tgkill, 3),
- LX_CL("read", lx_read, 3),
- LX_CL("ioctl", lx_ioctl, 3),
-};
+ struct regs *rp = lwptoregs(lwp);
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ /*
+ * Note: Syscall argument passing is different from function
+ * call argument passing on amd64. For function calls, the
+ * fourth arg is passed via %rcx, but for system calls the 4th
+ * arg is passed via %r10. This is because in amd64, the
+ * syscall instruction puts the lower 32 bits of %rflags in
+ * %r11 and puts the %rip value to %rcx.
+ *
+ * Appendix A of the amd64 ABI (Linux conventions) states that
+ * syscalls are limited to 6 args and no arg is passed on the
+ * stack.
+ */
+ args[0] = rp->r_rdi;
+ args[1] = rp->r_rsi;
+ args[2] = rp->r_rdx;
+ args[3] = rp->r_r10;
+ args[4] = rp->r_r8;
+ args[5] = rp->r_r9;
+ } else {
+ /*
+ * If the system call takes 6 args, then libc has stashed them
+ * in memory at the address contained in %ebx, except for some
+ * syscalls which store the 6th argument in %ebp.
+ */
+ if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) {
+ uint32_t args32[6];
+
+ if (copyin((void *)rp->r_rbx, &args32,
+ sizeof (args32)) != 0) {
+ /*
+ * Clear the argument vector so that the
+ * trace probe does not expose kernel
+ * memory.
+ */
+ bzero(args, 6 * sizeof (uintptr_t));
+ return (set_errno(EFAULT));
+ }
+
+ args[0] = args32[0];
+ args[1] = args32[1];
+ args[2] = args32[2];
+ args[3] = args32[3];
+ args[4] = args32[4];
+ args[5] = args32[5];
+ } else {
+ args[0] = rp->r_rbx;
+ args[1] = rp->r_rcx;
+ args[2] = rp->r_rdx;
+ args[3] = rp->r_rsi;
+ args[4] = rp->r_rdi;
+ args[5] = rp->r_rbp;
+ }
+ }
+
+ return (0);
+}
-int64_t
-lx_emulate_syscall(int num, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+#else /* !__amd64 */
+
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
{
- lx_ike_t *jsp;
- int64_t rval;
+ struct regs *rp = lwptoregs(lwp);
- rval = (int64_t)0;
+ /*
+ * If the system call takes 6 args, then libc has stashed them
+ * in memory at the address contained in %ebx, except for some
+ * syscalls which store the 6th argument in %ebp.
+ */
+ if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) {
+ if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) !=
+ 0) {
+ /*
+ * Clear the argument vector so that the trace probe
+ * does not expose kernel memory.
+ */
+ bzero(args, 6 * sizeof (uintptr_t));
+ return (set_errno(EFAULT));
+ }
+ } else {
+ args[0] = rp->r_ebx;
+ args[1] = rp->r_ecx;
+ args[2] = rp->r_edx;
+ args[3] = rp->r_esi;
+ args[4] = rp->r_edi;
+ args[5] = rp->r_ebp;
+ }
- jsp = &(lx_ike_ent[num]);
+ return (0);
+}
+#endif
- switch (jsp->sy_narg) {
- case 0: {
- lx_print("--> %s()\n", jsp->sy_name);
- rval = (int64_t)jsp->sy_callc();
- break;
+int
+lx_syscall_return(klwp_t *lwp, int syscall_num, long ret)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ int error = lwp->lwp_errno;
+
+ if (error != EINTR) {
+ /*
+ * If this system call was not interrupted, clear the system
+ * call restart flag before lx_setcontext() can pass it to
+ * usermode.
+ */
+ lwpd->br_syscall_restart = B_FALSE;
+ }
+
+ if (error != 0) {
+ /*
+ * Convert from illumos to Linux errno:
+ */
+ if (error < 1 || error >= (sizeof (lx_stol_errno) /
+ sizeof (lx_stol_errno[0]))) {
+ /*
+ * The provided error number is not valid.
+ */
+ error = EINVAL;
+ }
+ ret = -lx_stol_errno[error];
}
- case 1: {
- lx_print("--> %s(0x%lx)\n", jsp->sy_name, arg1);
- rval = (int64_t)jsp->sy_callc(arg1);
- break;
+
+ /*
+ * 32-bit Linux system calls return via %eax; 64-bit calls return via
+ * %rax.
+ */
+ rp->r_r0 = ret;
+
+ /*
+ * Hold for the ptrace(2) "syscall-exit-stop" condition if required by
+ * PTRACE_SYSCALL. Note that the register state may be modified by
+ * the tracer.
+ */
+ lx_ptrace_stop(LX_PR_SYSEXIT);
+
+ /*
+ * Fire the DTrace "lx-syscall:::return" probe:
+ */
+ lx_trace_sysreturn(syscall_num, ret);
+
+ /*
+ * Clear errno for next time. We do not clear "br_syscall_restart" or
+ * "br_syscall_num" as they are potentially used by "lx_savecontext()"
+ * in the signal delivery path.
+ */
+ lwp->lwp_errno = 0;
+
+ /*
+ * We want complete control of the registers on return from this
+ * emulated Linux system call:
+ */
+ lwp->lwp_eosys = JUSTRETURN;
+ curthread->t_post_sys = 1;
+ aston(curthread);
+
+ return (0);
+}
+
+static void
+lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason)
+{
+ char buf[100];
+
+ if (s == NULL) {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds",
+ syscall_num);
+ } else {
+ VERIFY(unsup_reason < (sizeof (nosys_reasons) /
+ sizeof (*nosys_reasons)));
+
+ if (s->sy_name == NULL) {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s",
+ syscall_num, nosys_reasons[unsup_reason]);
+ } else {
+ (void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s",
+ s->sy_name, nosys_reasons[unsup_reason]);
+ }
}
- case 2: {
- lx_print("--> %s(0x%lx, 0x%lx)\n", jsp->sy_name, arg1, arg2);
- rval = (int64_t)jsp->sy_callc(arg1, arg2);
- break;
+
+ lx_unsupported(buf);
+}
+
+/*
+ * This function is used to override the processing of arguments and
+ * invocation of a handler for emulated system calls, installed on each
+ * branded LWP as "lwp_brand_syscall". If this system call should use the
+ * native path, we return 1. If we handled this system call (and have made
+ * arrangements with respect to post-return usermode register state) we
+ * return 0.
+ */
+int
+lx_syscall_enter(void)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ int syscall_num;
+ int error;
+ long ret = 0;
+ lx_sysent_t *s;
+ uintptr_t args[6];
+ unsigned int unsup_reason;
+
+ /*
+ * If we got here, we should have an LWP-specific brand data
+ * structure.
+ */
+ VERIFY(lwpd != NULL);
+
+ if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+ /*
+ * The lwp is not in BRAND execution mode, so we return
+ * to the regular native system call path.
+ */
+ DTRACE_PROBE(brand__lx__syscall__hook__skip);
+ return (1);
}
- case 3: {
- lx_print("--> %s(0x%lx, 0x%lx, 0x%lx)\n",
- jsp->sy_name, arg1, arg2, arg3);
- rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3);
- break;
+
+ /*
+ * Clear the restartable system call flag. This flag will be set
+ * in the system call handler if the call is a candidate for
+ * a restart. It will be saved by lx_setcontext() in the event
+ * that we take a signal, and used in the signal handling path
+ * to restart the system call iff SA_RESTART was set for this
+ * signal. Save the system call number so that we can store it
+ * in the saved context if required.
+ */
+ lwpd->br_syscall_restart = B_FALSE;
+ lwpd->br_syscall_num = (int)rp->r_r0;
+
+ /*
+ * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by
+ * PTRACE_SYSCALL. The system call number and arguments may be
+ * modified by the tracer.
+ */
+ lx_ptrace_stop(LX_PR_SYSENTRY);
+
+ /*
+ * Check that the system call number is within the bounds we expect.
+ */
+ syscall_num = lwpd->br_syscall_num;
+ if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) {
+ lx_syscall_unsup_msg(NULL, syscall_num, 0);
+
+ set_errno(ENOTSUP);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
}
- case 4: {
- lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx)\n",
- jsp->sy_name, arg1, arg2, arg3, arg4);
- rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4);
- break;
+
+#if defined(_LP64)
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+ s = &lx_sysent64[syscall_num];
+ } else
+#endif
+ {
+ s = &lx_sysent32[syscall_num];
}
- case 5: {
- lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx)\n",
- jsp->sy_name, arg1, arg2, arg3, arg4, arg5);
- rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5);
- break;
+
+ /*
+ * Process the arguments for this system call and fire the DTrace
+ * "lx-syscall:::entry" probe:
+ */
+ error = lx_emulate_args(lwp, s, args);
+ lx_trace_sysenter(syscall_num, args);
+ if (error != 0) {
+ /*
+ * Could not read and process the arguments. Return the error
+ * to the process.
+ */
+ set_errno(error);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
}
- case 6: {
- lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx,"
- " 0x%lx, 0x%lx)\n",
- jsp->sy_name, arg1, arg2, arg3, arg4, arg5, arg6);
- rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5,
- arg6);
- break;
+
+ if (s->sy_callc != NULL) {
+ /*
+ * Call the in-kernel handler for this Linux system call:
+ */
+ ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+ args[5]);
+ lx_syscall_return(lwp, syscall_num, ret);
+ return (0);
}
+
+ /*
+ * There is no in-kernel handler.
+ */
+ switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) {
+ case NOSYS_USERMODE:
+ /*
+ * Pass to the usermode emulation routine.
+ */
+#if defined(_LP64)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_emulate_user32(lwp, syscall_num, args);
+ } else
+#endif
+ {
+ lx_emulate_user(lwp, syscall_num, args);
+ }
+ return (0);
+
default:
- panic("Invalid IKE entry: #%d at 0x%p\n", num, (void *)jsp);
+ /*
+ * We are not emulating this system call at all.
+ */
+ lx_syscall_unsup_msg(s, syscall_num, unsup_reason);
+
+ set_errno(ENOTSUP);
+ lx_syscall_return(lwp, syscall_num, -1);
+ return (0);
}
- lx_print("----------> return (0x%llx)\n", (long long)rval);
- return (rval);
}
+
+/*
+ * Linux defines system call numbers for 32-bit x86 in the file:
+ * arch/x86/syscalls/syscall_32.tbl
+ */
+lx_sysent_t lx_sysent32[] = {
+ {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */
+ {"exit", NULL, 0, 1}, /* 1 */
+ {"fork", NULL, 0, 0}, /* 2 */
+ {"read", lx_read, 0, 3}, /* 3 */
+ {"write", lx_write, 0, 3}, /* 4 */
+ {"open", NULL, 0, 3}, /* 5 */
+ {"close", NULL, 0, 1}, /* 6 */
+ {"waitpid", lx_waitpid, 0, 3}, /* 7 */
+ {"creat", NULL, 0, 2}, /* 8 */
+ {"link", NULL, 0, 2}, /* 9 */
+ {"unlink", NULL, 0, 1}, /* 10 */
+ {"execve", NULL, 0, 3}, /* 11 */
+ {"chdir", NULL, 0, 1}, /* 12 */
+ {"time", NULL, 0, 1}, /* 13 */
+ {"mknod", NULL, 0, 3}, /* 14 */
+ {"chmod", NULL, 0, 2}, /* 15 */
+ {"lchown16", NULL, 0, 3}, /* 16 */
+ {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */
+ {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */
+ {"lseek", NULL, 0, 3}, /* 19 */
+ {"getpid", lx_getpid, 0, 0}, /* 20 */
+ {"mount", NULL, 0, 5}, /* 21 */
+ {"umount", NULL, 0, 1}, /* 22 */
+ {"setuid16", NULL, 0, 1}, /* 23 */
+ {"getuid16", NULL, 0, 0}, /* 24 */
+ {"stime", NULL, 0, 1}, /* 25 */
+ {"ptrace", NULL, 0, 4}, /* 26 */
+ {"alarm", NULL, 0, 1}, /* 27 */
+ {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */
+ {"pause", NULL, 0, 0}, /* 29 */
+ {"utime", NULL, 0, 2}, /* 30 */
+ {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */
+ {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */
+ {"access", NULL, 0, 2}, /* 33 */
+ {"nice", NULL, 0, 1}, /* 34 */
+ {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */
+ {"sync", NULL, 0, 0}, /* 36 */
+ {"kill", lx_kill, 0, 2}, /* 37 */
+ {"rename", NULL, 0, 2}, /* 38 */
+ {"mkdir", NULL, 0, 2}, /* 39 */
+ {"rmdir", NULL, 0, 1}, /* 40 */
+ {"dup", NULL, 0, 1}, /* 41 */
+ {"pipe", lx_pipe, 0, 1}, /* 42 */
+ {"times", NULL, 0, 1}, /* 43 */
+ {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */
+ {"brk", lx_brk, 0, 1}, /* 45 */
+ {"setgid16", NULL, 0, 1}, /* 46 */
+ {"getgid16", NULL, 0, 0}, /* 47 */
+ {"signal", NULL, 0, 2}, /* 48 */
+ {"geteuid16", NULL, 0, 0}, /* 49 */
+ {"getegid16", NULL, 0, 0}, /* 50 */
+ {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */
+ {"umount2", NULL, 0, 2}, /* 52 */
+ {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */
+ {"ioctl", lx_ioctl, 0, 3}, /* 54 */
+ {"fcntl", NULL, 0, 3}, /* 55 */
+ {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */
+ {"setpgid", NULL, 0, 2}, /* 57 */
+ {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */
+ {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */
+ {"umask", NULL, 0, 1}, /* 60 */
+ {"chroot", NULL, 0, 1}, /* 61 */
+ {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */
+ {"dup2", NULL, 0, 2}, /* 63 */
+ {"getppid", lx_getppid, 0, 0}, /* 64 */
+ {"getpgrp", NULL, 0, 0}, /* 65 */
+ {"setsid", NULL, 0, 0}, /* 66 */
+ {"sigaction", NULL, 0, 3}, /* 67 */
+ {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */
+ {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */
+ {"setreuid16", NULL, 0, 2}, /* 70 */
+ {"setregid16", NULL, 0, 2}, /* 71 */
+ {"sigsuspend", NULL, 0, 1}, /* 72 */
+ {"sigpending", NULL, 0, 1}, /* 73 */
+ {"sethostname", NULL, 0, 2}, /* 74 */
+ {"setrlimit", NULL, 0, 2}, /* 75 */
+ {"getrlimit", NULL, 0, 2}, /* 76 */
+ {"getrusage", NULL, 0, 2}, /* 77 */
+ {"gettimeofday", NULL, 0, 2}, /* 78 */
+ {"settimeofday", NULL, 0, 2}, /* 79 */
+ {"getgroups16", NULL, 0, 2}, /* 80 */
+ {"setgroups16", NULL, 0, 2}, /* 81 */
+ {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */
+ {"symlink", NULL, 0, 2}, /* 83 */
+ {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */
+ {"readlink", NULL, 0, 3}, /* 85 */
+ {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */
+ {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */
+ {"reboot", NULL, 0, 4}, /* 88 */
+ {"readdir", NULL, 0, 3}, /* 89 */
+ {"mmap", NULL, 0, 6}, /* 90 */
+ {"munmap", NULL, 0, 2}, /* 91 */
+ {"truncate", NULL, 0, 2}, /* 92 */
+ {"ftruncate", NULL, 0, 2}, /* 93 */
+ {"fchmod", NULL, 0, 2}, /* 94 */
+ {"fchown16", NULL, 0, 3}, /* 95 */
+ {"getpriority", NULL, 0, 2}, /* 96 */
+ {"setpriority", NULL, 0, 3}, /* 97 */
+ {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */
+ {"statfs", NULL, 0, 2}, /* 99 */
+ {"fstatfs", NULL, 0, 2}, /* 100 */
+ {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */
+ {"socketcall", NULL, 0, 2}, /* 102 */
+ {"syslog", NULL, 0, 3}, /* 103 */
+ {"setitimer", NULL, 0, 3}, /* 104 */
+ {"getitimer", NULL, 0, 2}, /* 105 */
+ {"stat", NULL, 0, 2}, /* 106 */
+ {"lstat", NULL, 0, 2}, /* 107 */
+ {"fstat", NULL, 0, 2}, /* 108 */
+ {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */
+ {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */
+ {"vhangup", NULL, 0, 0}, /* 111 */
+ {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */
+ {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */
+ {"wait4", lx_wait4, 0, 4}, /* 114 */
+ {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */
+ {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */
+ {"ipc", NULL, 0, 5}, /* 117 */
+ {"fsync", NULL, 0, 1}, /* 118 */
+ {"sigreturn", NULL, 0, 1}, /* 119 */
+ {"clone", NULL, 0, 5}, /* 120 */
+ {"setdomainname", NULL, 0, 2}, /* 121 */
+ {"uname", NULL, 0, 1}, /* 122 */
+ {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */
+ {"adjtimex", NULL, 0, 1}, /* 124 */
+ {"mprotect", NULL, 0, 3}, /* 125 */
+ {"sigprocmask", NULL, 0, 3}, /* 126 */
+ {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */
+ {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */
+ {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */
+ {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */
+ {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */
+ {"getpgid", NULL, 0, 1}, /* 132 */
+ {"fchdir", NULL, 0, 1}, /* 133 */
+ {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */
+ {"sysfs", NULL, 0, 3}, /* 135 */
+ {"personality", NULL, 0, 1}, /* 136 */
+ {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */
+ {"setfsuid16", NULL, 0, 1}, /* 138 */
+ {"setfsgid16", NULL, 0, 1}, /* 139 */
+ {"llseek", NULL, 0, 5}, /* 140 */
+ {"getdents", NULL, 0, 3}, /* 141 */
+ {"select", NULL, 0, 5}, /* 142 */
+ {"flock", NULL, 0, 2}, /* 143 */
+ {"msync", NULL, 0, 3}, /* 144 */
+ {"readv", NULL, 0, 3}, /* 145 */
+ {"writev", NULL, 0, 3}, /* 146 */
+ {"getsid", NULL, 0, 1}, /* 147 */
+ {"fdatasync", NULL, 0, 1}, /* 148 */
+ {"sysctl", NULL, 0, 1}, /* 149 */
+ {"mlock", NULL, 0, 2}, /* 150 */
+ {"munlock", NULL, 0, 2}, /* 151 */
+ {"mlockall", NULL, 0, 1}, /* 152 */
+ {"munlockall", NULL, 0, 0}, /* 153 */
+ {"sched_setparam", NULL, 0, 2}, /* 154 */
+ {"sched_getparam", NULL, 0, 2}, /* 155 */
+ {"sched_setscheduler", NULL, 0, 3}, /* 156 */
+ {"sched_getscheduler", NULL, 0, 1}, /* 157 */
+ {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */
+ {"sched_get_priority_max", NULL, 0, 1}, /* 159 */
+ {"sched_get_priority_min", NULL, 0, 1}, /* 160 */
+ {"sched_rr_get_interval", NULL, 0, 2}, /* 161 */
+ {"nanosleep", NULL, 0, 2}, /* 162 */
+ {"mremap", NULL, 0, 5}, /* 163 */
+ {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */
+ {"getresuid16", NULL, 0, 3}, /* 165 */
+ {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */
+ {"query_module", NULL, 0, 5}, /* 167 */
+ {"poll", NULL, 0, 3}, /* 168 */
+ {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */
+ {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */
+ {"getresgid16", NULL, 0, 3}, /* 171 */
+ {"prctl", NULL, 0, 5}, /* 172 */
+ {"rt_sigreturn", NULL, 0, 0}, /* 173 */
+ {"rt_sigaction", NULL, 0, 4}, /* 174 */
+ {"rt_sigprocmask", NULL, 0, 4}, /* 175 */
+ {"rt_sigpending", NULL, 0, 2}, /* 176 */
+ {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */
+ {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */
+ {"rt_sigsuspend", NULL, 0, 2}, /* 179 */
+ {"pread64", NULL, 0, 5}, /* 180 */
+ {"pwrite64", NULL, 0, 5}, /* 181 */
+ {"chown16", NULL, 0, 3}, /* 182 */
+ {"getcwd", NULL, 0, 2}, /* 183 */
+ {"capget", NULL, 0, 2}, /* 184 */
+ {"capset", NULL, 0, 2}, /* 185 */
+ {"sigaltstack", NULL, 0, 2}, /* 186 */
+ {"sendfile", NULL, 0, 4}, /* 187 */
+ {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */
+ {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */
+ {"vfork", NULL, 0, 0}, /* 190 */
+ {"getrlimit", NULL, 0, 2}, /* 191 */
+ {"mmap2", NULL, LX_SYS_EBPARG6, 6}, /* 192 */
+ {"truncate64", NULL, 0, 3}, /* 193 */
+ {"ftruncate64", NULL, 0, 3}, /* 194 */
+ {"stat64", NULL, 0, 2}, /* 195 */
+ {"lstat64", NULL, 0, 2}, /* 196 */
+ {"fstat64", NULL, 0, 2}, /* 197 */
+ {"lchown", NULL, 0, 3}, /* 198 */
+ {"getuid", NULL, 0, 0}, /* 199 */
+ {"getgid", NULL, 0, 0}, /* 200 */
+ {"geteuid", NULL, 0, 0}, /* 201 */
+ {"getegid", NULL, 0, 0}, /* 202 */
+ {"setreuid", NULL, 0, 0}, /* 203 */
+ {"setregid", NULL, 0, 0}, /* 204 */
+ {"getgroups", NULL, 0, 2}, /* 205 */
+ {"setgroups", NULL, 0, 2}, /* 206 */
+ {"fchown", NULL, 0, 3}, /* 207 */
+ {"setresuid", lx_setresuid, 0, 3}, /* 208 */
+ {"getresuid", NULL, 0, 3}, /* 209 */
+ {"setresgid", lx_setresgid, 0, 3}, /* 210 */
+ {"getresgid", NULL, 0, 3}, /* 211 */
+ {"chown", NULL, 0, 3}, /* 212 */
+ {"setuid", NULL, 0, 1}, /* 213 */
+ {"setgid", NULL, 0, 1}, /* 214 */
+ {"setfsuid", NULL, 0, 1}, /* 215 */
+ {"setfsgid", NULL, 0, 1}, /* 216 */
+ {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */
+ {"mincore", NULL, 0, 3}, /* 218 */
+ {"madvise", NULL, 0, 3}, /* 219 */
+ {"getdents64", NULL, 0, 3}, /* 220 */
+ {"fcntl64", NULL, 0, 3}, /* 221 */
+ {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */
+ {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */
+ {"gettid", lx_gettid, 0, 0}, /* 224 */
+ {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */
+ {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 226 */
+ {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 227 */
+ {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 228 */
+ {"getxattr", lx_xattr, 0, 4}, /* 229 */
+ {"lgetxattr", lx_xattr, 0, 4}, /* 230 */
+ {"fgetxattr", lx_xattr, 0, 4}, /* 231 */
+ {"listxattr", lx_xattr, 0, 3}, /* 232 */
+ {"llistxattr", lx_xattr, 0, 3}, /* 233 */
+ {"flistxattr", lx_xattr, 0, 3}, /* 234 */
+ {"removexattr", lx_xattr, 0, 2}, /* 235 */
+ {"lremovexattr", lx_xattr, 0, 2}, /* 236 */
+ {"fremovexattr", lx_xattr, 0, 2}, /* 237 */
+ {"tkill", lx_tkill, 0, 2}, /* 238 */
+ {"sendfile64", NULL, 0, 4}, /* 239 */
+ {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */
+ {"sched_setaffinity", NULL, 0, 3}, /* 241 */
+ {"sched_getaffinity", NULL, 0, 3}, /* 242 */
+ {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */
+ {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */
+ {"io_setup", NULL, NOSYS_NO_EQUIV, 0}, /* 245 */
+ {"io_destroy", NULL, NOSYS_NO_EQUIV, 0}, /* 246 */
+ {"io_getevents", NULL, NOSYS_NO_EQUIV, 0}, /* 247 */
+ {"io_submit", NULL, NOSYS_NO_EQUIV, 0}, /* 248 */
+ {"io_cancel", NULL, NOSYS_NO_EQUIV, 0}, /* 249 */
+ {"fadvise64", NULL, 0, 4}, /* 250 */
+ {"nosys", NULL, 0, 0}, /* 251 */
+ {"group_exit", NULL, 0, 1}, /* 252 */
+ {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */
+ {"epoll_create", NULL, 0, 1}, /* 254 */
+ {"epoll_ctl", NULL, 0, 4}, /* 255 */
+ {"epoll_wait", NULL, 0, 4}, /* 256 */
+ {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */
+ {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */
+ {"timer_create", NULL, 0, 3}, /* 259 */
+ {"timer_settime", NULL, 0, 4}, /* 260 */
+ {"timer_gettime", NULL, 0, 2}, /* 261 */
+ {"timer_getoverrun", NULL, 0, 1}, /* 262 */
+ {"timer_delete", NULL, 0, 1}, /* 263 */
+ {"clock_settime", NULL, 0, 2}, /* 264 */
+ {"clock_gettime", NULL, 0, 2}, /* 265 */
+ {"clock_getres", NULL, 0, 2}, /* 266 */
+ {"clock_nanosleep", NULL, 0, 4}, /* 267 */
+ {"statfs64", NULL, 0, 2}, /* 268 */
+ {"fstatfs64", NULL, 0, 2}, /* 269 */
+ {"tgkill", lx_tgkill, 0, 3}, /* 270 */
+
+/*
+ * The following system calls only exist in kernel 2.6 and greater:
+ */
+ {"utimes", NULL, 0, 2}, /* 271 */
+ {"fadvise64_64", NULL, 0, 4}, /* 272 */
+ {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */
+ {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */
+ {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */
+ {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */
+ {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */
+ {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */
+ {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */
+ {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */
+ {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */
+ {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */
+ {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */
+ {"waitid", lx_waitid, 0, 4}, /* 284 */
+ {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */
+ {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */
+ {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */
+ {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */
+ {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 289 */
+ {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 290 */
+ {"inotify_init", NULL, 0, 0}, /* 291 */
+ {"inotify_add_watch", NULL, 0, 3}, /* 292 */
+ {"inotify_rm_watch", NULL, 0, 2}, /* 293 */
+ {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */
+ {"openat", NULL, 0, 4}, /* 295 */
+ {"mkdirat", NULL, 0, 3}, /* 296 */
+ {"mknodat", NULL, 0, 4}, /* 297 */
+ {"fchownat", NULL, 0, 5}, /* 298 */
+ {"futimesat", NULL, 0, 3}, /* 299 */
+ {"fstatat64", NULL, 0, 4}, /* 300 */
+ {"unlinkat", NULL, 0, 3}, /* 301 */
+ {"renameat", NULL, 0, 4}, /* 302 */
+ {"linkat", NULL, 0, 5}, /* 303 */
+ {"symlinkat", NULL, 0, 3}, /* 304 */
+ {"readlinkat", NULL, 0, 4}, /* 305 */
+ {"fchmodat", NULL, 0, 4}, /* 306 */
+ {"faccessat", NULL, 0, 4}, /* 307 */
+ {"pselect6", NULL, LX_SYS_EBPARG6, 6}, /* 308 */
+ {"ppoll", NULL, 0, 5}, /* 309 */
+ {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */
+ {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 311 */
+ {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 312 */
+ {"splice", NULL, NOSYS_NULL, 0}, /* 313 */
+ {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 314 */
+ {"tee", NULL, NOSYS_NULL, 0}, /* 315 */
+ {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */
+ {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */
+ {"getcpu", NULL, 0, 3}, /* 318 */
+ {"epoll_pwait", NULL, 0, 5}, /* 319 */
+ {"utimensat", NULL, 0, 4}, /* 320 */
+ {"signalfd", NULL, NOSYS_NULL, 0}, /* 321 */
+ {"timerfd_create", NULL, 0, 2}, /* 322 */
+ {"eventfd", NULL, 0, 1}, /* 323 */
+ {"fallocate", NULL, NOSYS_NULL, 0}, /* 324 */
+ {"timerfd_settime", NULL, 0, 4}, /* 325 */
+ {"timerfd_gettime", NULL, 0, 2}, /* 326 */
+ {"signalfd4", NULL, NOSYS_NULL, 0}, /* 327 */
+ {"eventfd2", NULL, 0, 2}, /* 328 */
+ {"epoll_create1", NULL, 0, 1}, /* 329 */
+ {"dup3", NULL, 0, 3}, /* 330 */
+ {"pipe2", lx_pipe2, 0, 2}, /* 331 */
+ {"inotify_init1", NULL, 0, 1}, /* 332 */
+ {"preadv", NULL, NOSYS_NULL, 0}, /* 333 */
+ {"pwritev", NULL, NOSYS_NULL, 0}, /* 334 */
+ {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */
+ {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */
+ {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 337 */
+ {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */
+ {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */
+ {"prlimit64", NULL, 0, 4}, /* 340 */
+ {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */
+ {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */
+ {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */
+ {"syncfs", NULL, NOSYS_NULL, 0}, /* 344 */
+ {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */
+ {"setns", NULL, NOSYS_NULL, 0}, /* 346 */
+ {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */
+ {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */
+ {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */
+ {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */
+ {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */
+ {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */
+ {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */
+ {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */
+ {"getrandom", NULL, NOSYS_NULL, 0}, /* 355 */
+ {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */
+ {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */
+ {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */
+};
+
+#if defined(_LP64)
+/*
+ * Linux defines system call numbers for 64-bit x86 in the file:
+ * arch/x86/syscalls/syscall_64.tbl
+ */
+lx_sysent_t lx_sysent64[] = {
+ {"read", lx_read, 0, 3}, /* 0 */
+ {"write", lx_write, 0, 3}, /* 1 */
+ {"open", NULL, 0, 3}, /* 2 */
+ {"close", NULL, 0, 1}, /* 3 */
+ {"stat", NULL, 0, 2}, /* 4 */
+ {"fstat", NULL, 0, 2}, /* 5 */
+ {"lstat", NULL, 0, 2}, /* 6 */
+ {"poll", NULL, 0, 3}, /* 7 */
+ {"lseek", NULL, 0, 3}, /* 8 */
+ {"mmap", NULL, 0, 6}, /* 9 */
+ {"mprotect", NULL, 0, 3}, /* 10 */
+ {"munmap", NULL, 0, 2}, /* 11 */
+ {"brk", lx_brk, 0, 1}, /* 12 */
+ {"rt_sigaction", NULL, 0, 4}, /* 13 */
+ {"rt_sigprocmask", NULL, 0, 4}, /* 14 */
+ {"rt_sigreturn", NULL, 0, 0}, /* 15 */
+ {"ioctl", lx_ioctl, 0, 3}, /* 16 */
+ {"pread64", NULL, 0, 4}, /* 17 */
+ {"pwrite64", NULL, 0, 4}, /* 18 */
+ {"readv", NULL, 0, 3}, /* 19 */
+ {"writev", NULL, 0, 3}, /* 20 */
+ {"access", NULL, 0, 2}, /* 21 */
+ {"pipe", lx_pipe, 0, 1}, /* 22 */
+ {"select", NULL, 0, 5}, /* 23 */
+ {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */
+ {"mremap", NULL, 0, 5}, /* 25 */
+ {"msync", NULL, 0, 3}, /* 26 */
+ {"mincore", NULL, 0, 3}, /* 27 */
+ {"madvise", NULL, 0, 3}, /* 28 */
+ {"shmget", NULL, 0, 3}, /* 29 */
+ {"shmat", NULL, 0, 4}, /* 30 */
+ {"shmctl", NULL, 0, 3}, /* 31 */
+ {"dup", NULL, 0, 1}, /* 32 */
+ {"dup2", NULL, 0, 2}, /* 33 */
+ {"pause", NULL, 0, 0}, /* 34 */
+ {"nanosleep", NULL, 0, 2}, /* 35 */
+ {"getitimer", NULL, 0, 2}, /* 36 */
+ {"alarm", NULL, 0, 1}, /* 37 */
+ {"setitimer", NULL, 0, 3}, /* 38 */
+ {"getpid", lx_getpid, 0, 0}, /* 39 */
+ {"sendfile", NULL, 0, 4}, /* 40 */
+ {"socket", NULL, 0, 3}, /* 41 */
+ {"connect", NULL, 0, 3}, /* 42 */
+ {"accept", NULL, 0, 3}, /* 43 */
+ {"sendto", NULL, 0, 6}, /* 44 */
+ {"recvfrom", NULL, 0, 6}, /* 45 */
+ {"sendmsg", NULL, 0, 3}, /* 46 */
+ {"recvmsg", NULL, 0, 3}, /* 47 */
+ {"shutdown", NULL, 0, 2}, /* 48 */
+ {"bind", NULL, 0, 3}, /* 49 */
+ {"listen", NULL, 0, 2}, /* 50 */
+ {"getsockname", NULL, 0, 3}, /* 51 */
+ {"getpeername", NULL, 0, 3}, /* 52 */
+ {"socketpair", NULL, 0, 4}, /* 53 */
+ {"setsockopt", NULL, 0, 5}, /* 54 */
+ {"getsockopt", NULL, 0, 5}, /* 55 */
+ {"clone", NULL, 0, 5}, /* 56 */
+ {"fork", NULL, 0, 0}, /* 57 */
+ {"vfork", NULL, 0, 0}, /* 58 */
+ {"execve", NULL, 0, 3}, /* 59 */
+ {"exit", NULL, 0, 1}, /* 60 */
+ {"wait4", lx_wait4, 0, 4}, /* 61 */
+ {"kill", lx_kill, 0, 2}, /* 62 */
+ {"uname", NULL, 0, 1}, /* 63 */
+ {"semget", NULL, 0, 3}, /* 64 */
+ {"semop", NULL, 0, 3}, /* 65 */
+ {"semctl", NULL, 0, 4}, /* 66 */
+ {"shmdt", NULL, 0, 1}, /* 67 */
+ {"msgget", NULL, 0, 2}, /* 68 */
+ {"msgsnd", NULL, 0, 4}, /* 69 */
+ {"msgrcv", NULL, 0, 5}, /* 70 */
+ {"msgctl", NULL, 0, 3}, /* 71 */
+ {"fcntl", NULL, 0, 3}, /* 72 */
+ {"flock", NULL, 0, 2}, /* 73 */
+ {"fsync", NULL, 0, 1}, /* 74 */
+ {"fdatasync", NULL, 0, 1}, /* 75 */
+ {"truncate", NULL, 0, 2}, /* 76 */
+ {"ftruncate", NULL, 0, 2}, /* 77 */
+ {"getdents", NULL, 0, 3}, /* 78 */
+ {"getcwd", NULL, 0, 2}, /* 79 */
+ {"chdir", NULL, 0, 1}, /* 80 */
+ {"fchdir", NULL, 0, 1}, /* 81 */
+ {"rename", NULL, 0, 2}, /* 82 */
+ {"mkdir", NULL, 0, 2}, /* 83 */
+ {"rmdir", NULL, 0, 1}, /* 84 */
+ {"creat", NULL, 0, 2}, /* 85 */
+ {"link", NULL, 0, 2}, /* 86 */
+ {"unlink", NULL, 0, 1}, /* 87 */
+ {"symlink", NULL, 0, 2}, /* 88 */
+ {"readlink", NULL, 0, 3}, /* 89 */
+ {"chmod", NULL, 0, 2}, /* 90 */
+ {"fchmod", NULL, 0, 2}, /* 91 */
+ {"chown", NULL, 0, 3}, /* 92 */
+ {"fchown", NULL, 0, 3}, /* 93 */
+ {"lchown", NULL, 0, 3}, /* 94 */
+ {"umask", NULL, 0, 1}, /* 95 */
+ {"gettimeofday", NULL, 0, 2}, /* 96 */
+ {"getrlimit", NULL, 0, 2}, /* 97 */
+ {"getrusage", NULL, 0, 2}, /* 98 */
+ {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */
+ {"times", NULL, 0, 1}, /* 100 */
+ {"ptrace", NULL, 0, 4}, /* 101 */
+ {"getuid", NULL, 0, 0}, /* 102 */
+ {"syslog", NULL, 0, 3}, /* 103 */
+ {"getgid", NULL, 0, 0}, /* 104 */
+ {"setuid", NULL, 0, 1}, /* 105 */
+ {"setgid", NULL, 0, 1}, /* 106 */
+ {"geteuid", NULL, 0, 0}, /* 107 */
+ {"getegid", NULL, 0, 0}, /* 108 */
+ {"setpgid", NULL, 0, 2}, /* 109 */
+ {"getppid", lx_getppid, 0, 0}, /* 110 */
+ {"getpgrp", NULL, 0, 0}, /* 111 */
+ {"setsid", NULL, 0, 0}, /* 112 */
+ {"setreuid", NULL, 0, 0}, /* 113 */
+ {"setregid", NULL, 0, 0}, /* 114 */
+ {"getgroups", NULL, 0, 2}, /* 115 */
+ {"setgroups", NULL, 0, 2}, /* 116 */
+ {"setresuid", lx_setresuid, 0, 3}, /* 117 */
+ {"getresuid", NULL, 0, 3}, /* 118 */
+ {"setresgid", lx_setresgid, 0, 3}, /* 119 */
+ {"getresgid", NULL, 0, 3}, /* 120 */
+ {"getpgid", NULL, 0, 1}, /* 121 */
+ {"setfsuid", NULL, 0, 1}, /* 122 */
+ {"setfsgid", NULL, 0, 1}, /* 123 */
+ {"getsid", NULL, 0, 1}, /* 124 */
+ {"capget", NULL, 0, 2}, /* 125 */
+ {"capset", NULL, 0, 2}, /* 126 */
+ {"rt_sigpending", NULL, 0, 2}, /* 127 */
+ {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */
+ {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */
+ {"rt_sigsuspend", NULL, 0, 2}, /* 130 */
+ {"sigaltstack", NULL, 0, 2}, /* 131 */
+ {"utime", NULL, 0, 2}, /* 132 */
+ {"mknod", NULL, 0, 3}, /* 133 */
+ {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */
+ {"personality", NULL, 0, 1}, /* 135 */
+ {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */
+ {"statfs", NULL, 0, 2}, /* 137 */
+ {"fstatfs", NULL, 0, 2}, /* 138 */
+ {"sysfs", NULL, 0, 3}, /* 139 */
+ {"getpriority", NULL, 0, 2}, /* 140 */
+ {"setpriority", NULL, 0, 3}, /* 141 */
+ {"sched_setparam", NULL, 0, 2}, /* 142 */
+ {"sched_getparam", NULL, 0, 2}, /* 143 */
+ {"sched_setscheduler", NULL, 0, 3}, /* 144 */
+ {"sched_getscheduler", NULL, 0, 1}, /* 145 */
+ {"sched_get_priority_max", NULL, 0, 1}, /* 146 */
+ {"sched_get_priority_min", NULL, 0, 1}, /* 147 */
+ {"sched_rr_get_interval", NULL, 0, 2}, /* 148 */
+ {"mlock", NULL, 0, 2}, /* 149 */
+ {"munlock", NULL, 0, 2}, /* 150 */
+ {"mlockall", NULL, 0, 1}, /* 151 */
+ {"munlockall", NULL, 0, 0}, /* 152 */
+ {"vhangup", NULL, 0, 0}, /* 153 */
+ {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */
+ {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */
+ {"sysctl", NULL, 0, 1}, /* 156 */
+ {"prctl", NULL, 0, 5}, /* 157 */
+ {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */
+ {"adjtimex", NULL, 0, 1}, /* 159 */
+ {"setrlimit", NULL, 0, 2}, /* 160 */
+ {"chroot", NULL, 0, 1}, /* 161 */
+ {"sync", NULL, 0, 0}, /* 162 */
+ {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */
+ {"settimeofday", NULL, 0, 2}, /* 164 */
+ {"mount", NULL, 0, 5}, /* 165 */
+ {"umount2", NULL, 0, 2}, /* 166 */
+ {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */
+ {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */
+ {"reboot", NULL, 0, 4}, /* 169 */
+ {"sethostname", NULL, 0, 2}, /* 170 */
+ {"setdomainname", NULL, 0, 2}, /* 171 */
+ {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */
+ {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */
+ {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */
+ {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */
+ {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */
+ {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */
+ {"query_module", NULL, 0, 5}, /* 178 */
+ {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */
+ {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */
+ {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */
+ {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */
+ {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */
+ {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */
+ {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */
+ {"gettid", lx_gettid, 0, 0}, /* 186 */
+ {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */
+ {"setxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 188 */
+ {"lsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 189 */
+ {"fsetxattr", NULL, NOSYS_NO_EQUIV, 0}, /* 190 */
+ {"getxattr", lx_xattr, 0, 4}, /* 191 */
+ {"lgetxattr", lx_xattr, 0, 4}, /* 192 */
+ {"fgetxattr", lx_xattr, 0, 4}, /* 193 */
+ {"listxattr", lx_xattr, 0, 3}, /* 194 */
+ {"llistxattr", lx_xattr, 0, 3}, /* 195 */
+ {"flistxattr", lx_xattr, 0, 3}, /* 196 */
+ {"removexattr", lx_xattr, 0, 2}, /* 197 */
+ {"lremovexattr", lx_xattr, 0, 2}, /* 198 */
+ {"fremovexattr", lx_xattr, 0, 2}, /* 199 */
+ {"tkill", lx_tkill, 0, 2}, /* 200 */
+ {"time", NULL, 0, 1}, /* 201 */
+ {"futex", lx_futex, 0, 6}, /* 202 */
+ {"sched_setaffinity", NULL, 0, 3}, /* 203 */
+ {"sched_getaffinity", NULL, 0, 3}, /* 204 */
+ {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */
+ {"io_setup", NULL, NOSYS_NO_EQUIV, 0}, /* 206 */
+ {"io_destroy", NULL, NOSYS_NO_EQUIV, 0}, /* 207 */
+ {"io_getevents", NULL, NOSYS_NO_EQUIV, 0}, /* 208 */
+ {"io_submit", NULL, NOSYS_NO_EQUIV, 0}, /* 209 */
+ {"io_cancel", NULL, NOSYS_NO_EQUIV, 0}, /* 210 */
+ {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */
+ {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */
+ {"epoll_create", NULL, 0, 1}, /* 213 */
+ {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */
+ {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */
+ {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */
+ {"getdents64", NULL, 0, 3}, /* 217 */
+ {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */
+ {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */
+ {"semtimedop", NULL, 0, 4}, /* 220 */
+ {"fadvise64", NULL, 0, 4}, /* 221 */
+ {"timer_create", NULL, 0, 3}, /* 222 */
+ {"timer_settime", NULL, 0, 4}, /* 223 */
+ {"timer_gettime", NULL, 0, 2}, /* 224 */
+ {"timer_getoverrun", NULL, 0, 1}, /* 225 */
+ {"timer_delete", NULL, 0, 1}, /* 226 */
+ {"clock_settime", NULL, 0, 2}, /* 227 */
+ {"clock_gettime", NULL, 0, 2}, /* 228 */
+ {"clock_getres", NULL, 0, 2}, /* 229 */
+ {"clock_nanosleep", NULL, 0, 4}, /* 230 */
+ {"exit_group", NULL, 0, 1}, /* 231 */
+ {"epoll_wait", NULL, 0, 4}, /* 232 */
+ {"epoll_ctl", NULL, 0, 4}, /* 233 */
+ {"tgkill", lx_tgkill, 0, 3}, /* 234 */
+ {"utimes", NULL, 0, 2}, /* 235 */
+ {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */
+ {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */
+ {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */
+ {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */
+ {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */
+ {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */
+ {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */
+ {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */
+ {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */
+ {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */
+ {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */
+ {"waitid", lx_waitid, 0, 4}, /* 247 */
+ {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */
+ {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */
+ {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */
+ {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 251 */
+ {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 252 */
+ {"inotify_init", NULL, 0, 0}, /* 253 */
+ {"inotify_add_watch", NULL, 0, 3}, /* 254 */
+ {"inotify_rm_watch", NULL, 0, 2}, /* 255 */
+ {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */
+ {"openat", NULL, 0, 4}, /* 257 */
+ {"mkdirat", NULL, 0, 3}, /* 258 */
+ {"mknodat", NULL, 0, 4}, /* 259 */
+ {"fchownat", NULL, 0, 5}, /* 260 */
+ {"futimesat", NULL, 0, 3}, /* 261 */
+ {"fstatat64", NULL, 0, 4}, /* 262 */
+ {"unlinkat", NULL, 0, 3}, /* 263 */
+ {"renameat", NULL, 0, 4}, /* 264 */
+ {"linkat", NULL, 0, 5}, /* 265 */
+ {"symlinkat", NULL, 0, 3}, /* 266 */
+ {"readlinkat", NULL, 0, 4}, /* 267 */
+ {"fchmodat", NULL, 0, 4}, /* 268 */
+ {"faccessat", NULL, 0, 4}, /* 269 */
+ {"pselect6", NULL, 0, 6}, /* 270 */
+ {"ppoll", NULL, 0, 5}, /* 271 */
+ {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */
+ {"set_robust_list", NULL, NOSYS_NULL, 0}, /* 273 */
+ {"get_robust_list", NULL, NOSYS_NULL, 0}, /* 274 */
+ {"splice", NULL, NOSYS_NULL, 0}, /* 275 */
+ {"tee", NULL, NOSYS_NULL, 0}, /* 276 */
+ {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 277 */
+ {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */
+ {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */
+ {"utimensat", NULL, 0, 4}, /* 280 */
+ {"epoll_pwait", NULL, 0, 5}, /* 281 */
+ {"signalfd", NULL, NOSYS_NULL, 0}, /* 282 */
+ {"timerfd_create", NULL, 0, 2}, /* 283 */
+ {"eventfd", NULL, 0, 1}, /* 284 */
+ {"fallocate", NULL, NOSYS_NULL, 0}, /* 285 */
+ {"timerfd_settime", NULL, 0, 4}, /* 286 */
+ {"timerfd_gettime", NULL, 0, 2}, /* 287 */
+ {"accept4", NULL, 0, 4}, /* 288 */
+ {"signalfd4", NULL, NOSYS_NULL, 0}, /* 289 */
+ {"eventfd2", NULL, 0, 2}, /* 290 */
+ {"epoll_create1", NULL, 0, 1}, /* 291 */
+ {"dup3", NULL, 0, 3}, /* 292 */
+ {"pipe2", lx_pipe2, 0, 2}, /* 293 */
+ {"inotify_init1", NULL, 0, 1}, /* 294 */
+ {"preadv", NULL, NOSYS_NULL, 0}, /* 295 */
+ {"pwritev", NULL, NOSYS_NULL, 0}, /* 296 */
+ {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */
+ {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */
+ {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */
+ {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */
+ {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */
+ {"prlimit64", NULL, 0, 4}, /* 302 */
+ {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */
+ {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */
+ {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */
+ {"syncfs", NULL, NOSYS_NULL, 0}, /* 306 */
+ {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */
+ {"setns", NULL, NOSYS_NULL, 0}, /* 308 */
+ {"getcpu", NULL, 0, 3}, /* 309 */
+ {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */
+ {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */
+ {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */
+ {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */
+ {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */
+ {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */
+ {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */
+ {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */
+ {"getrandom", NULL, NOSYS_NULL, 0}, /* 318 */
+ {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */
+ {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */
+ {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */
+ {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */
+
+ /* XXX TBD gap then x32 syscalls from 512 - 544 */
+};
+#endif
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
index e7f5ee9867..543373b5fa 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_brand.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -66,10 +66,7 @@ extern "C" {
/*
* This must be large enough for both the 32-bit table and 64-bit table.
*/
-#define LX_NSYSCALLS 352
-
-/* The number of In-Kernel Emulation functions */
-#define LX_N_IKE_FUNCS 29
+#define LX_NSYSCALLS 358
/*
* brand(2) subcommands
@@ -78,8 +75,8 @@ extern "C" {
* > 192 is reserved for in-kernel emulated system calls.
*/
#define B_LPID_TO_SPAIR 128
-#define B_SYSENTRY 129
-#define B_SYSRETURN 130
+#define B_GET_CURRENT_CONTEXT 129
+#define B_EMULATION_DONE 130
#define B_PTRACE_KERNEL 131
#define B_SET_AFFINITY_MASK 132
#define B_GET_AFFINITY_MASK 133
@@ -87,13 +84,16 @@ extern "C" {
#define B_PTRACE_STOP_FOR_OPT 135
#define B_UNSUPPORTED 136
#define B_STORE_ARGS 137
-#define B_CLR_NTV_SYSC_FLAG 138
-#define B_SIGNAL_RETURN 139
-#define B_UNWIND_NTV_SYSC_FLAG 140
+#define B_GETPID 138
+#define B_JUMP_TO_LINUX 139
+#define B_SET_THUNK_PID 140
#define B_EXIT_AS_SIG 141
#define B_HELPER_WAITID 142
-
-#define B_IKE_SYSCALL 192
+#define B_HELPER_CLONE 143
+#define B_HELPER_SETGROUPS 144
+#define B_HELPER_SIGQUEUE 145
+#define B_HELPER_TGSIGQUEUE 146
+#define B_SET_NATIVE_STACK 147
#ifndef _ASM
/*
@@ -157,6 +157,45 @@ typedef enum lx_ptrace_options {
/* Aux vector containing vDSO addr */
#define AT_SYSINFO_EHDR 33
+/*
+ * This table initialiser maps errno values from illumos to Linux numbers:
+ * the array index is the illumos errno and the element at that index is the
+ * corresponding Linux errno.  It is presently used in both the usermode and
+ * kernel emulation code, so it is defined here.
+ *
+ * Note that the two numbering schemes diverge from 35 onwards; for example,
+ * illumos ENOLCK (46) is Linux ENOLCK (37), while illumos ENOSTR (60) keeps
+ * the same number on Linux.
+ */
+/* BEGIN CSTYLED */
+#define LX_STOL_ERRNO_INIT { \
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, \
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, \
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, \
+ 30, 31, 32, 33, 34, 42, 43, 44, 45, 46, \
+ 47, 48, 49, 50, 51, 35, 37, 22, 38, 22, /* 49 */ \
+ 52, 53, 54, 55, 56, 57, 58, 59, 22, 22, \
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, \
+ 70, 71, 22, 22, 72, 22, 22, 74, 36, 75, \
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 38, \
+ 40, 85, 86, 39, 87, 88, 89, 90, 91, 92, /* 99 */ \
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, \
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, \
+ 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, \
+ 103, 104, 105, 106, 107, 22, 22, 22, 22, 22, \
+ 22, 22, 22, 108, 109, 110, 111, 112, 113, 114, /* 149 */ \
+ 115, 116 }
+/* END CSTYLED */
+
+/*
+ * Usermode emulation routines are run on an alternate stack allocated by
+ * the brand library. Every LWP in a process will incur this overhead beyond
+ * the regular thread stack:
+ */
+#define LX_NATIVE_STACK_PAGE_COUNT 64
+
+/*
+ * When returning in a new child process created with vfork(2) (or CLONE_VFORK)
+ * we discard some of the native stack to prevent corruption of the parent
+ * emulation state.
+ */
+#define LX_NATIVE_STACK_VFORK_GAP 0x3000
+
#ifndef _ASM
extern struct brand lx_brand;
@@ -164,18 +203,15 @@ extern struct brand lx_brand;
typedef struct lx_brand_registration {
uint_t lxbr_version; /* version number */
void *lxbr_handler; /* base address of handler */
- void *lxbr_tracehandler; /* base address of trace handler */
- void *lxbr_traceflag; /* address of trace flag */
} lx_brand_registration_t;
typedef struct lx_brand_registration32 {
uint_t lxbr_version; /* version number */
uint32_t lxbr_handler; /* base address of handler */
- uint32_t lxbr_tracehandler; /* base address of trace handler */
- uint32_t lxbr_traceflag; /* address of trace flag */
} lx_brand_registration32_t;
#ifdef __amd64
+
typedef struct lx_regs {
long lxr_fs;
long lxr_rdi;
@@ -198,7 +234,24 @@ typedef struct lx_regs {
long lxr_orig_rax;
} lx_regs_t;
+
+typedef struct lx_regs32 {
+ uint32_t lxr_gs;
+ uint32_t lxr_edi;
+ uint32_t lxr_esi;
+ uint32_t lxr_ebp;
+ uint32_t lxr_esp;
+ uint32_t lxr_ebx;
+ uint32_t lxr_edx;
+ uint32_t lxr_ecx;
+ uint32_t lxr_eax;
+ uint32_t lxr_eip;
+
+ uint32_t lxr_orig_eax;
+} lx_regs32_t;
+
#else /* ! __amd64 */
+
typedef struct lx_regs {
long lxr_gs;
long lxr_edi;
@@ -213,6 +266,91 @@ typedef struct lx_regs {
long lxr_orig_eax;
} lx_regs_t;
+
+#endif /* __amd64 */
+
+#ifdef __amd64
+/*
+ * The 64-bit native "user_regs_struct" Linux structure.
+ */
+typedef struct lx_user_regs {
+ long lxur_r15;
+ long lxur_r14;
+ long lxur_r13;
+ long lxur_r12;
+ long lxur_rbp;
+ long lxur_rbx;
+ long lxur_r11;
+ long lxur_r10;
+ long lxur_r9;
+ long lxur_r8;
+ long lxur_rax;
+ long lxur_rcx;
+ long lxur_rdx;
+ long lxur_rsi;
+ long lxur_rdi;
+ long lxur_orig_rax;
+ long lxur_rip;
+ long lxur_xcs;
+ long lxur_rflags;
+ long lxur_rsp;
+ long lxur_xss;
+ long lxur_xfs_base;
+ long lxur_xgs_base;
+ long lxur_xds;
+ long lxur_xes;
+ long lxur_xfs;
+ long lxur_xgs;
+} lx_user_regs_t;
+
+#if defined(_KERNEL) && defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit "user_regs_struct" Linux structure.
+ */
+typedef struct lx_user_regs32 {
+ int32_t lxur_ebx;
+ int32_t lxur_ecx;
+ int32_t lxur_edx;
+ int32_t lxur_esi;
+ int32_t lxur_edi;
+ int32_t lxur_ebp;
+ int32_t lxur_eax;
+ int32_t lxur_xds;
+ int32_t lxur_xes;
+ int32_t lxur_xfs;
+ int32_t lxur_xgs;
+ int32_t lxur_orig_eax;
+ int32_t lxur_eip;
+ int32_t lxur_xcs;
+ int32_t lxur_eflags;
+ int32_t lxur_esp;
+ int32_t lxur_xss;
+} lx_user_regs32_t;
+#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */
+
+#else /* !__amd64 */
+/*
+ * The 32-bit native "user_regs_struct" Linux structure.
+ */
+typedef struct lx_user_regs {
+ long lxur_ebx;
+ long lxur_ecx;
+ long lxur_edx;
+ long lxur_esi;
+ long lxur_edi;
+ long lxur_ebp;
+ long lxur_eax;
+ long lxur_xds;
+ long lxur_xes;
+ long lxur_xfs;
+ long lxur_xgs;
+ long lxur_orig_eax;
+ long lxur_eip;
+ long lxur_xcs;
+ long lxur_eflags;
+ long lxur_esp;
+ long lxur_xss;
+} lx_user_regs_t;
#endif /* __amd64 */
#endif /* _ASM */
@@ -240,12 +378,12 @@ typedef struct lx_elf_data64 {
} lx_elf_data64_t;
typedef struct lx_elf_data32 {
- int ed_phdr;
- int ed_phent;
- int ed_phnum;
- int ed_entry;
- int ed_base;
- int ed_ldentry;
+ uint32_t ed_phdr;
+ uint32_t ed_phent;
+ uint32_t ed_phnum;
+ uint32_t ed_entry;
+ uint32_t ed_base;
+ uint32_t ed_ldentry;
} lx_elf_data32_t;
#if defined(_LP64)
@@ -258,8 +396,6 @@ typedef lx_elf_data32_t lx_elf_data_t;
typedef struct lx_proc_data {
uintptr_t l_handler; /* address of user-space handler */
- uintptr_t l_tracehandler; /* address of user-space traced handler */
- uintptr_t l_traceflag; /* address of 32-bit tracing flag */
pid_t l_ppid; /* pid of originating parent proc */
uint64_t l_ptrace; /* process being observed with ptrace */
lx_elf_data_t l_elf_data; /* ELF data for linux executable */
@@ -281,6 +417,16 @@ typedef ulong_t lx_affmask_t[LX_AFF_ULONGS];
/* Max. length of kernel version string */
#define LX_VERS_MAX 16
+/*
+ * Flag values for uc_brand_data[0] in the ucontext_t:
+ */
+#define LX_UC_STACK_NATIVE 0x00001
+#define LX_UC_STACK_BRAND 0x00002
+#define LX_UC_RESTORE_NATIVE_SP 0x00010
+#define LX_UC_FRAME_IS_SYSCALL 0x00100
+#define LX_UC_RESTART_SYSCALL 0x01000
+#define LX_UC_IGNORE_LINK 0x10000
+
#ifdef _KERNEL
typedef struct lx_lwp_data lx_lwp_data_t;
@@ -303,7 +449,8 @@ typedef enum lx_ptrace_state {
LX_PTRACE_STOPPED = 0x10,
LX_PTRACE_PARENT_WAIT = 0x20,
LX_PTRACE_CLDPEND = 0x40,
- LX_PTRACE_CLONING = 0x80
+ LX_PTRACE_CLONING = 0x80,
+ LX_PTRACE_WAITPEND = 0x100
} lx_ptrace_state_t;
/*
@@ -343,11 +490,17 @@ typedef enum lx_ptrace_attach {
LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */
} lx_ptrace_attach_t;
+typedef enum lx_stack_mode {
+ LX_STACK_MODE_PREINIT = 0,
+ LX_STACK_MODE_INIT,
+ LX_STACK_MODE_NATIVE,
+ LX_STACK_MODE_BRAND
+} lx_stack_mode_t;
+
/*
* lx-specific data in the klwp_t
*/
struct lx_lwp_data {
- uint_t br_ntv_syscall; /* 1 = syscall from native libc */
uint_t br_lwp_flags; /* misc. flags */
klwp_t *br_lwp; /* back pointer to container lwp */
int br_signal; /* signal to send to parent when */
@@ -359,12 +512,6 @@ struct lx_lwp_data {
/* descriptors used by libc for TLS */
ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */
ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */
- /*
- * 64-bit thread-specific syscall mode state "stack". Bits tracking the
- * syscall mode are shifted on/off this int like a stack as we take
- * signals and return.
- */
- uint_t br_scms;
pid_t br_pid; /* converted pid for this thread */
pid_t br_tgid; /* thread group ID for this thread */
pid_t br_ppid; /* parent pid for this thread */
@@ -396,9 +543,30 @@ struct lx_lwp_data {
ushort_t br_ptrace_whatstop; /* stop sub-reason */
int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */
+ uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */
uint_t br_ptrace_event;
ulong_t br_ptrace_eventmsg;
+
+ int br_syscall_num; /* current system call number */
+ boolean_t br_syscall_restart; /* should restart on EINTR */
+
+ /*
+ * Store the LX_STACK_MODE for this LWP, and the current extent of the
+ * native (emulation) stack. This is similar, in principle, to the
+ * sigaltstack mechanism for signal handling. We also use this mode
+ * flag to determine how to process system calls from this LWP.
+ */
+ lx_stack_mode_t br_stack_mode;
+ uintptr_t br_ntv_stack;
+ uintptr_t br_ntv_stack_current;
+
+ /*
+ * If this pid is set, we return it with getpid(). This allows the
+ * thunking server to interpose on the pid returned to the Linux
+ * syslog software.
+ */
+ pid_t br_lx_thunk_pid;
};
/*
@@ -410,7 +578,6 @@ struct lx_lwp_data {
/* brand specific data */
typedef struct lx_zone_data {
char lxzd_kernel_version[LX_VERS_MAX];
- int lxzd_max_syscall;
} lx_zone_data_t;
#define BR_CPU_BOUND 0x0001
@@ -428,16 +595,61 @@ typedef struct lx_zone_data {
#define LX_ARGS(scall) ((struct lx_##scall##_args *)\
(ttolxlwp(curthread)->br_scall_args))
-void lx_brand_int80_callback(void);
-void lx_brand_syscall_callback(void);
-int64_t lx_emulate_syscall(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t);
+/*
+ * Determine the upper bound on the system call number:
+ */
+#if defined(_LP64)
+#define LX_MAX_SYSCALL(lwp) \
+ ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \
+ lx_nsysent64 : lx_nsysent32)
+#else
+#define LX_MAX_SYSCALL(lwp) lx_nsysent32
+#endif
extern char *lx_get_zone_kern_version(zone_t *);
+extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t);
+extern void lx_divert(klwp_t *, uintptr_t);
+extern int lx_runexe(klwp_t *, void *);
+extern void lx_switch_to_native(klwp_t *);
+extern int lx_regs_to_userregs(lx_lwp_data_t *, void *);
+extern int lx_uc_to_userregs(lx_lwp_data_t *, void *, void *);
+extern int lx_userregs_to_regs(lx_lwp_data_t *lwpd, void *);
+extern int lx_userregs_to_uc(lx_lwp_data_t *lwpd, void *, void *);
+
+extern int lx_syscall_enter(void);
+extern int lx_syscall_return(klwp_t *, int, long);
+
+extern void lx_trace_sysenter(int, uintptr_t *);
+extern void lx_trace_sysreturn(int, long);
+
+extern void lx_emulate_user(klwp_t *, int, uintptr_t *);
+#if defined(_SYSCALL32_IMPL)
+extern void lx_emulate_user32(klwp_t *, int, uintptr_t *);
+#endif
+
extern int lx_debug;
#define lx_print if (lx_debug) printf
+extern int lx_stol_errno[];
+
+/*
+ * In-Kernel Linux System Call Description.
+ */
+typedef struct lx_sysent {
+ char *sy_name;
+ long (*sy_callc)();
+ char sy_flags;
+ char sy_narg;
+} lx_sysent_t;
+
+#if defined(_LP64)
+extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+extern int lx_nsysent64;
+#endif
+extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+extern int lx_nsysent32;
+
#endif /* _KERNEL */
#endif /* _ASM */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h
index 7b77789c56..cc8c6d44f6 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_misc.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h
@@ -55,11 +55,16 @@ extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *,
int *);
extern void lx_ptrace_exit(proc_t *, klwp_t *);
extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *);
-extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t);
+extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t);
extern int lx_ptrace_set_clone_inherit(int, boolean_t);
extern int lx_sigcld_repost(proc_t *, sigqueue_t *);
extern int lx_issig_stop(proc_t *, klwp_t *);
+extern int lx_helper_clone(int64_t *, int, void *, void *, void *);
+extern int lx_helper_setgroups(int, gid_t *);
+extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *);
+extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *);
+
#endif
#ifdef __cplusplus
diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
new file mode 100644
index 0000000000..9f606b614f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LX_SIGINFO_H
+#define _LX_SIGINFO_H
+
+#include <sys/lx_types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_siginfo_t lsi_code values
+ *
+ * LX_SI_ASYNCNL: Sent by asynchronous name lookup completion
+ * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads
+ * LX_SI_TKILL: Sent by tkill
+ * LX_SI_SIGIO: Sent by queued SIGIO
+ * LX_SI_ASYNCIO: Sent by asynchronous I/O completion
+ * LX_SI_MESGQ: Sent by real time message queue state change
+ * LX_SI_TIMER: Sent by timer expiration
+ * LX_SI_QUEUE: Sent by sigqueue
+ * LX_SI_USER: Sent by kill, sigsend, raise, etc.
+ * LX_SI_KERNEL: Sent by kernel
+ * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to
+ * illumos errors, if there is no translation available, this value
+ * should be used. This value should have no meaning as an si_code in
+ * illumos or Linux.
+ *
+ * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by
+ * BrandZ.
+ */
+#define LX_SI_CODE_NOT_EXIST (-61)
+#define LX_SI_ASYNCNL (-60)
+#define LX_SI_DETHREAD (-7)
+#define LX_SI_TKILL (-6)
+#define LX_SI_SIGIO (-5)
+#define LX_SI_ASYNCIO (-4)
+#define LX_SI_MESGQ (-3)
+#define LX_SI_TIMER (-2)
+#define LX_SI_QUEUE (-1)
+#define LX_SI_USER (0)
+#define LX_SI_KERNEL (0x80)
+
+#define LX_SI_MAX_SIZE 128
+#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3)
+#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4)
+
+#if defined(_LP64)
+/*
+ * Because of the odd number (3) of ints before the union, we need to account
+ * for the smaller padding needed on x64 due to the union being offset to an 8
+ * byte boundary.
+ */
+#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64
+#else
+#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32
+#endif
+
+typedef struct lx_siginfo {
+ int lsi_signo;
+ int lsi_errno;
+ int lsi_code;
+ union {
+ int _pad[LX_SI_PAD_SIZE];
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ } _kill;
+
+ struct {
+ uint_t _timer1;
+ uint_t _timer2;
+ } _timer;
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ union sigval _sigval;
+ } _rt;
+
+ struct {
+ pid_t _pid;
+ lx_uid16_t _uid;
+ int _status;
+ clock_t _utime;
+ clock_t _stime;
+ } _sigchld;
+
+ struct {
+ void *_addr;
+ } _sigfault;
+
+ struct {
+ int _band;
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} lx_siginfo_t;
+
+#if defined(_KERNEL) && defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit "lx_siginfo_t" object.
+ */
+#pragma pack(4)
+typedef struct lx_siginfo32 {
+ int lsi_signo;
+ int lsi_errno;
+ int lsi_code;
+ union {
+ int _pad[LX_SI_PAD_SIZE_32];
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ } _kill;
+
+ struct {
+ uint_t _timer1;
+ uint_t _timer2;
+ } _timer;
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ union sigval32 _sigval;
+ } _rt;
+
+ struct {
+ pid32_t _pid;
+ lx_uid16_t _uid;
+ int _status;
+ clock32_t _utime;
+ clock32_t _stime;
+ } _sigchld;
+
+ struct {
+ caddr32_t _addr;
+ } _sigfault;
+
+ struct {
+ int _band;
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} lx_siginfo32_t;
+#pragma pack()
+#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */
+
+/*
+ * Shorthand accessors for the members of the _sifields union.  The _sigval
+ * member is a "union sigval", so lsi_int and lsi_ptr must use its POSIX
+ * member names: sival_int and sival_ptr.
+ */
+#define lsi_pid _sifields._kill._pid
+#define lsi_uid _sifields._kill._uid
+#define lsi_status _sifields._sigchld._status
+#define lsi_utime _sifields._sigchld._utime
+#define lsi_stime _sifields._sigchld._stime
+#define lsi_value _sifields._rt._sigval
+#define lsi_int _sifields._rt._sigval.sival_int
+#define lsi_ptr _sifields._rt._sigval.sival_ptr
+#define lsi_addr _sifields._sigfault._addr
+#define lsi_band _sifields._sigpoll._band
+#define lsi_fd _sifields._sigpoll._fd
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_SIGINFO_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
index 766aa91ef5..2d9abf2fe6 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
@@ -22,7 +22,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_LINUX_SYSCALLS_H
@@ -38,7 +38,6 @@ extern long lx_arch_prctl();
extern long lx_brk();
extern long lx_getpid();
extern long lx_getppid();
-extern long lx_clone();
extern long lx_kill();
extern long lx_tkill();
extern long lx_tgkill();
@@ -55,16 +54,20 @@ extern long lx_sched_getscheduler();
extern long lx_sched_rr_get_interval();
extern long lx_sched_setparam();
extern long lx_sched_setscheduler();
+extern long lx_sched_yield();
extern long lx_set_thread_area();
extern long lx_set_tid_address();
extern long lx_setresgid();
extern long lx_setresgid16();
extern long lx_setresuid();
extern long lx_setresuid16();
-extern long lx_sysinfo();
-extern long lx_setgroups();
-extern long lx_rt_sigqueueinfo();
-extern long lx_rt_tgsigqueueinfo();
+extern long lx_sysinfo32();
+extern long lx_sysinfo64();
+extern long lx_wait4();
+extern long lx_waitid();
+extern long lx_waitpid();
+extern long lx_write();
+extern long lx_xattr();
#endif /* _KERNEL */
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h
index d98c8bc586..922c412020 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_types.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_types.h
@@ -31,6 +31,8 @@
extern "C" {
#endif
+#ifndef _KERNEL
+
#define SHRT_MIN (-32768) /* min value of a "short int" */
#define SHRT_MAX 32767 /* max value of a "short int" */
#define USHRT_MAX 65535 /* max of "unsigned short int" */
@@ -46,6 +48,8 @@ extern "C" {
#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */
#endif
+#endif /* !_KERNEL */
+
#define LX_SYS_UTS_LN 65
struct lx_utsname {
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
index d73c5f100b..50cdeaeab9 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_clone.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
@@ -42,8 +42,8 @@
* linux cloned thread.
*/
/* ARGSUSED */
-long
-lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp)
+int
+lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp)
{
struct lx_lwp_data *lwpd = ttolxlwp(curthread);
struct lx_proc_data *lproc = ttolxproc(curthread);
@@ -85,19 +85,10 @@ lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp)
lx_set_gdt(entry, &lwpd->br_tls[tls_index]);
} else {
/*
- * For 64-bit, we need to set %fsbase -- which
- * requires us to save the native %fsbase and
- * set our LX %fsbase. Don't use rdmsr since
- * the value might get changed before we get to
- * this code. We use the value from the pcb
- * which the native libc should have already
- * setup via syslwp_private.
+ * Set the Linux %fsbase for this LWP. We will
+ * restore it the next time we return to Linux
+ * via setcontext()/lx_restorecontext().
*/
-#if defined(__amd64)
- pcb_t *pcb;
- pcb = (pcb_t *)&curthread->t_lwp->lwp_pcb;
- lwpd->br_ntv_fsbase = pcb->pcb_fsbase;
-#endif
lwpd->br_lx_fsbase = (uintptr_t)tls;
}
}
@@ -129,7 +120,9 @@ lx_clone(int flags, void *stkp, void *ptidp, void *tls, void *ctidp)
return (set_errno(EFAULT));
}
}
- return (lwpd->br_pid);
+
+ *rval = lwpd->br_pid;
+ return (0);
}
long
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
index 7f8abcd8d9..3dd3971e62 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_futex.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -40,6 +40,8 @@
#include <sys/condvar.h>
#include <sys/inttypes.h>
#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
#include <sys/lx_futex.h>
#include <sys/lx_impl.h>
@@ -277,10 +279,16 @@ futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout)
while ((fw.fw_woken == 0) && (err == 0)) {
ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index],
timeout, timechanged);
- if (ret < 0)
+ if (ret < 0) {
err = set_errno(ETIMEDOUT);
- else if (ret == 0)
+ } else if (ret == 0) {
+ /*
+ * According to signal(7), a futex(2) call with the
+ * FUTEX_WAIT operation is restartable.
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
err = set_errno(EINTR);
+ }
}
/*
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
index aa8b2b40e1..88b1792d3c 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
@@ -22,8 +22,9 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
#include <sys/zone.h>
#include <sys/types.h>
@@ -38,7 +39,7 @@
* return the pid
*/
long
-lx_getpid()
+lx_getpid(void)
{
lx_lwp_data_t *lwpd = ttolxlwp(curthread);
long rv;
@@ -46,8 +47,13 @@ lx_getpid()
if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) {
rv = 1;
} else {
- ASSERT(lwpd != NULL);
- rv = lwpd->br_tgid;
+ VERIFY(lwpd != NULL);
+
+ if (lwpd->br_lx_thunk_pid != 0) {
+ rv = lwpd->br_lx_thunk_pid;
+ } else {
+ rv = lwpd->br_tgid;
+ }
}
return (rv);
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c
index 5ca18b7556..baa41f52fa 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_id.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c
@@ -22,10 +22,9 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
#include <sys/types.h>
#include <sys/systm.h>
@@ -50,7 +49,7 @@ extern int setgroups(int, gid_t *);
/*
* This function is based on setreuid in common/syscall/uid.c and exists
- * because Solaris does not have a way to explicitly set the saved uid (suid)
+ * because illumos does not have a way to explicitly set the saved uid (suid)
* from any other system call.
*/
long
@@ -179,9 +178,9 @@ lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16)
long rval;
rval = lx_setresuid(
- LINUX_UID16_TO_UID32(ruid16),
- LINUX_UID16_TO_UID32(euid16),
- LINUX_UID16_TO_UID32(suid16));
+ LINUX_UID16_TO_UID32(ruid16),
+ LINUX_UID16_TO_UID32(euid16),
+ LINUX_UID16_TO_UID32(suid16));
return (rval);
}
@@ -274,19 +273,19 @@ lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16)
long rval;
rval = lx_setresgid(
- LINUX_GID16_TO_GID32(rgid16),
- LINUX_GID16_TO_GID32(egid16),
- LINUX_GID16_TO_GID32(sgid16));
+ LINUX_GID16_TO_GID32(rgid16),
+ LINUX_GID16_TO_GID32(egid16),
+ LINUX_GID16_TO_GID32(sgid16));
return (rval);
}
/*
- * Linux defines NGROUPS_MAX to be 32, but on Solaris it is only 16. We employ
+ * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ
* the terrible hack below so that tests may proceed, if only on DEBUG kernels.
*/
-long
-lx_setgroups(int ngroups, gid_t *grouplist)
+int
+lx_helper_setgroups(int ngroups, gid_t *grouplist)
{
#ifdef DEBUG
if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX)
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
index 2637f8f33d..8c6ac61ca7 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
@@ -1145,7 +1145,7 @@ lx_ioctl_fini()
vsd_destroy(&lx_ioctl_vsd);
}
-int
+long
lx_ioctl(int fdes, int cmd, intptr_t arg)
{
file_t *fp;
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
index e20e906d33..a5da7fe2df 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_kill.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
@@ -21,7 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
@@ -275,8 +275,8 @@ lx_kill(pid_t lx_pid, int lx_sig)
* queuable are sent through the sigqueue syscall via the user level function
* lx_rt_sigqueueinfo().
*/
-long
-lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
+int
+lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
{
proc_t *target_proc;
pid_t s_pid;
@@ -310,7 +310,7 @@ lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
}
/*
* We shouldn't have queuable signals here, those are sent elsewhere by
- * the useland handler for this emulated call.
+ * the usermode handler for this emulated call.
*/
if (!SI_CANQUEUE(kinfo.si_code)) {
return (set_errno(EINVAL));
@@ -341,8 +341,8 @@ lx_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
* Unlike the above function, this handles all system calls to rt_tgsigqueue
* regardless of si_code.
*/
-long
-lx_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo)
+int
+lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo)
{
id_t s_tid;
pid_t s_pid;
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
index cef549141e..fe354a8d54 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
@@ -24,7 +24,7 @@
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/zone.h>
@@ -45,6 +45,9 @@
#include <sys/brand.h>
#include <sys/lx_brand.h>
+#define LX_O_NONBLOCK 04000
+#define LX_O_CLOEXEC 02000000
+
/*
* Based on native pipe(2) system call, except that the pipe is half-duplex.
*/
@@ -174,7 +177,26 @@ lx_pipe(intptr_t arg)
* pipe2(2) system call.
*/
long
-lx_pipe2(intptr_t arg, int flags)
+lx_pipe2(intptr_t arg, int lxflags)
{
+ int flags = 0;
+
+ /*
+ * Validate allowed flags.
+ */
+ if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Convert from Linux flags to illumos flags.
+ */
+ if (lxflags & LX_O_NONBLOCK) {
+ flags |= FNONBLOCK;
+ }
+ if (lxflags & LX_O_CLOEXEC) {
+ flags |= FCLOEXEC;
+ }
+
return (lx_hd_pipe(arg, flags));
}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
index 57cc3e54d0..b21a81da48 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_rw.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
@@ -10,18 +10,21 @@
*/
/*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/vnode.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
/* uts/common/syscall/rw.c */
extern ssize_t read(int fdes, void *cbuf, size_t count);
+extern ssize_t write(int fdes, void *cbuf, size_t count);
-ssize_t
+long
lx_read(int fd, void *buf, size_t nbyte)
{
file_t *fp;
@@ -35,5 +38,23 @@ lx_read(int fd, void *buf, size_t nbyte)
if (t == VDIR)
return (set_errno(EISDIR));
+ /*
+ * If read(2) returns EINTR, we want to signal that restarting the
+ * system call is acceptable:
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+
return (read(fd, buf, nbyte));
}
+
+long
+lx_write(int fd, void *buf, size_t nbyte)
+{
+ /*
+ * If write(2) returns EINTR, we want to signal that restarting the
+ * system call is acceptable:
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+
+ return (write(fd, buf, nbyte));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
index bb91a752d2..4ebb7ff387 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_sched.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
@@ -22,8 +22,9 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
#include <sys/types.h>
#include <sys/systm.h>
@@ -38,8 +39,17 @@
#include <sys/lx_sched.h>
#include <sys/lx_brand.h>
+extern int yield();
extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t);
+long
+lx_sched_yield(void)
+{
+ yield();
+
+ return (0);
+}
+
int
lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp,
int64_t *rval)
@@ -169,13 +179,14 @@ lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param)
if (lwp->lwp_errno)
return (lwp->lwp_errno);
- if (strcmp(pcinfo.pc_clname, "TS") == 0)
+ if (strcmp(pcinfo.pc_clname, "TS") == 0) {
policy = LX_SCHED_OTHER;
- else if (strcmp(pcinfo.pc_clname, "RT") == 0)
+ } else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
- RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
- else
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ } else {
return (set_errno(EINVAL));
+ }
}
bzero(&pcinfo, sizeof (pcinfo));
@@ -195,7 +206,7 @@ lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param)
pcparm.pc_cid = pcinfo.pc_cid;
((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
- policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+ policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
break;
case LX_SCHED_OTHER:
@@ -263,7 +274,7 @@ lx_sched_getscheduler(l_pid_t pid)
policy = LX_SCHED_OTHER;
else if (strcmp(pcinfo.pc_clname, "RT") == 0)
policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
- RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
else
policy = set_errno(EINVAL);
@@ -316,7 +327,7 @@ lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param)
policy = LX_SCHED_OTHER;
else if (strcmp(pcinfo.pc_clname, "RT") == 0)
policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
- RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+ RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
else
return (set_errno(EINVAL));
@@ -337,7 +348,7 @@ lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param)
pcparm.pc_cid = pcinfo.pc_cid;
((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
- policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+ policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
break;
case LX_SCHED_OTHER:
@@ -416,11 +427,12 @@ lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param)
local_param.lx_sched_prio = 0;
else
local_param.lx_sched_prio = -(prio * 20) / scale;
- } else if (strcmp(pcinfo.pc_clname, "RT") == 0)
+ } else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
local_param.lx_sched_prio =
- ((rtparms_t *)pcparm.pc_clparms)->rt_pri;
- else
+ ((rtparms_t *)pcparm.pc_clparms)->rt_pri;
+ } else {
rv = set_errno(EINVAL);
+ }
if (rv == 0)
if (copyout(&local_param, param, sizeof (local_param)))
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
index 6151656cf0..449d5882d4 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
@@ -21,7 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <vm/anon.h>
@@ -30,7 +30,7 @@
#include <sys/zone.h>
#include <sys/time.h>
-struct lx_sysinfo {
+typedef struct lx_sysinfo {
int64_t si_uptime; /* Seconds since boot */
uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
uint64_t si_totalram; /* Total memory size */
@@ -44,28 +44,51 @@ struct lx_sysinfo {
uint64_t si_totalhigh; /* High memory size */
uint64_t si_freehigh; /* Avail high memory */
uint32_t si_mem_unit; /* Unit size of memory fields */
-};
+} lx_sysinfo_t;
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit usermode struct.
+ *
+ * lx_sysinfo32() populates this from an lx_sysinfo_t, one same-named field
+ * at a time, and copies it out in place of the native structure.  The
+ * "#pragma pack(4)" below caps member alignment at four bytes for the
+ * duration of the definition; "#pragma pack()" restores the default.
+ */
+#pragma pack(4)
+typedef struct lx_sysinfo32 {
+ int32_t si_uptime; /* Seconds since boot */
+ uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */
+ uint32_t si_totalram; /* Total memory size */
+ uint32_t si_freeram; /* Available memory */
+ uint32_t si_sharedram; /* Shared memory */
+ uint32_t si_bufferram; /* Buffer memory */
+ uint32_t si_totalswap; /* Total swap space */
+ uint32_t si_freeswap; /* Avail swap space */
+ uint16_t si_procs; /* Process count */
+ uint16_t si_pad; /* Padding */
+ uint32_t si_totalhigh; /* High memory size */
+ uint32_t si_freehigh; /* Avail high memory */
+ uint32_t si_mem_unit; /* Unit size of memory fields */
+ /*
+ * NOTE(review): trailing pad, presumably sizing the struct to match
+ * the Linux 32-bit sysinfo layout -- confirm against the Linux ABI.
+ */
+ char __si_pad[8];
+} lx_sysinfo32_t;
+#pragma pack()
+#endif
extern pgcnt_t swapfs_minfree;
-long
-lx_sysinfo(struct lx_sysinfo *sip)
+static void
+lx_sysinfo_common(lx_sysinfo_t *si)
{
- struct lx_sysinfo si;
zone_t *zone = curthread->t_procp->p_zone;
uint64_t zphysmem, zfreemem, ztotswap, zfreeswap;
- si.si_uptime = gethrestime_sec() - zone->zone_boot_time;
+ si->si_uptime = gethrestime_sec() - zone->zone_boot_time;
- si.si_loads[0] = zone->zone_hp_avenrun[0];
- si.si_loads[1] = zone->zone_hp_avenrun[1];
- si.si_loads[2] = zone->zone_hp_avenrun[2];
+ si->si_loads[0] = zone->zone_hp_avenrun[0];
+ si->si_loads[1] = zone->zone_hp_avenrun[1];
+ si->si_loads[2] = zone->zone_hp_avenrun[2];
/*
* In linux each thread looks like a process, so we conflate the
* two in this stat as well.
*/
- si.si_procs = (int32_t)zone->zone_nlwps;
+ si->si_procs = (int32_t)zone->zone_nlwps;
/*
* If memory or swap limits are set on the zone, use those, otherwise
@@ -111,30 +134,85 @@ lx_sysinfo(struct lx_sysinfo *sip)
* option.
*/
if (MAX(zphysmem, ztotswap) < 1024 * 1024) {
- si.si_totalram = ptob(zphysmem);
- si.si_freeram = ptob(zfreemem);
- si.si_totalswap = ptob(ztotswap);
- si.si_freeswap = ptob(zfreeswap);
- si.si_mem_unit = 1;
+ si->si_totalram = ptob(zphysmem);
+ si->si_freeram = ptob(zfreemem);
+ si->si_totalswap = ptob(ztotswap);
+ si->si_freeswap = ptob(zfreeswap);
+ si->si_mem_unit = 1;
} else {
- si.si_totalram = zphysmem;
- si.si_freeram = zfreemem;
- si.si_totalswap = ztotswap;
- si.si_freeswap = zfreeswap;
- si.si_mem_unit = PAGESIZE;
+ si->si_totalram = zphysmem;
+ si->si_freeram = zfreemem;
+ si->si_totalswap = ztotswap;
+ si->si_freeswap = zfreeswap;
+ si->si_mem_unit = PAGESIZE;
}
- si.si_bufferram = 0;
- si.si_sharedram = 0;
+ si->si_bufferram = 0;
+ si->si_sharedram = 0;
/*
* These two stats refer to high physical memory. If an
* application running in a Linux zone cares about this, then
* either it or we are broken.
*/
- si.si_totalhigh = 0;
- si.si_freehigh = 0;
+ si->si_totalhigh = 0;
+ si->si_freehigh = 0;
+}
+
+/*
+ * Gather the sysinfo statistics into a kernel lx_sysinfo_t and copy the
+ * structure, in its native layout, out to the user buffer at "sip".
+ *
+ * Returns 0 on success, or the result of set_errno(EFAULT) if the copyout
+ * to "sip" fails.
+ */
+long
+lx_sysinfo64(caddr_t sip)
+{
+	lx_sysinfo_t info;
+	int failed;
+
+	/*
+	 * Clear the whole structure up front so that any byte which
+	 * lx_sysinfo_common() does not assign reaches the user as zero.
+	 */
+	bzero(&info, sizeof (info));
+	lx_sysinfo_common(&info);
+
+	failed = copyout(&info, sip, sizeof (info));
+	if (failed == 0) {
+		return (0);
+	}
+
+	return (set_errno(EFAULT));
+}
+
+#if defined(_SYSCALL32_IMPL)
+long
+lx_sysinfo32(caddr_t sip)
+{
+ lx_sysinfo_t si;
+ lx_sysinfo32_t si32;
+ int i;
+
+ lx_sysinfo_common(&si);
+
+ /*
+ * Convert the lx_sysinfo_t into the legacy 32-bit view:
+ */
+ bzero(&si32, sizeof (si32));
+ si32.si_uptime = si.si_uptime;
+
+ for (i = 0; i < 3; i++) {
+ if ((si.si_loads[i]) > 0x7fffffff)
+ si32.si_loads[i] = 0x7fffffff;
+ else
+ si32.si_loads[i] = si.si_loads[i];
+ }
+
+ si32.si_procs = si.si_procs;
+ si32.si_totalram = si.si_totalram;
+ si32.si_freeram = si.si_freeram;
+ si32.si_totalswap = si.si_totalswap;
+ si32.si_freeswap = si.si_freeswap;
+ si32.si_mem_unit = si.si_mem_unit;
- if (copyout(&si, sip, sizeof (si)) != 0)
+ si32.si_bufferram = si.si_bufferram;
+ si32.si_sharedram = si.si_sharedram;
+
+ si32.si_totalhigh = si.si_totalhigh;
+ si32.si_freehigh = si.si_freehigh;
+
+ if (copyout(&si32, sip, sizeof (si32)) != 0) {
return (set_errno(EFAULT));
+ }
+
return (0);
}
+#endif
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
index b1528a37c5..c7c611412b 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
@@ -42,28 +42,34 @@ long
lx_arch_prctl(int code, ulong_t addr)
{
#if defined(__amd64)
- struct lx_lwp_data *llwp = ttolxlwp(curthread);
- pcb_t *pcb;
-
+ klwp_t *lwp = ttolwp(curthread);
+ lx_lwp_data_t *llwp = lwptolxlwp(lwp);
+ pcb_t *pcb = &lwp->lwp_pcb;
/* We currently only support [g|s]et_fs */
switch (code) {
case LX_ARCH_GET_FS:
if (copyout(&llwp->br_lx_fsbase, (void *)addr,
- sizeof (llwp->br_lx_fsbase)))
+ sizeof (llwp->br_lx_fsbase)) != 0) {
return (set_errno(EFAULT));
+ }
break;
+
case LX_ARCH_SET_FS:
llwp->br_lx_fsbase = addr;
- /*
- * Save current native libc fsbase. Don't use rdmsr since the
- * value might get changed before we get to this code. We
- * use the value from the pcb which the native libc should
- * have already setup via syslwp_private.
- */
- pcb = (pcb_t *)&curthread->t_lwp->lwp_pcb;
- llwp->br_ntv_fsbase = pcb->pcb_fsbase;
+
+ kpreempt_disable();
+ if (pcb->pcb_fsbase != llwp->br_lx_fsbase) {
+ pcb->pcb_fsbase = llwp->br_lx_fsbase;
+
+ /*
+ * Ensure we go out via update_sregs.
+ */
+ pcb->pcb_rupdate = 1;
+ }
+ kpreempt_enable();
break;
+
default:
return (set_errno(EINVAL));
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
index c3421858eb..7b10d2f90b 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/wait.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
@@ -66,21 +66,21 @@
* covers at least fork() and pthread_create().
*/
-#include <errno.h>
#include <sys/wait.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
#include <sys/lx_types.h>
-#include <sys/lx_signal.h>
-#include <sys/lx_debug.h>
-#include <sys/lx_misc.h>
-#include <sys/lx_syscall.h>
-#include <sys/syscall.h>
-#include <sys/times.h>
-#include <strings.h>
-#include <unistd.h>
-#include <assert.h>
+#include <sys/lx_siginfo.h>
+#include <lx_signum.h>
#include <lx_syscall.h>
/*
+ * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c":
+ */
+extern int waitid(idtype_t, id_t, k_siginfo_t *, int);
+extern int rusagesys(int, void *, void *, void *, void *);
+
+/*
* Convert between Linux options and Solaris options, returning -1 if any
* invalid flags are found.
*/
@@ -99,8 +99,6 @@
#define LX_P_PID 0x1
#define LX_P_GID 0x2
-extern long max_pid;
-
/*
* Split the passed waitpid/waitid options into two separate variables:
* those for the native illumos waitid(2), and the extra Linux-specific
@@ -149,17 +147,14 @@ lx_wstat(int code, int status)
stat = status << 8;
break;
case CLD_DUMPED:
- stat = stol_signo[status];
- assert(stat != -1);
- stat |= WCOREFLG;
+ stat = lx_stol_signo(status, SIGKILL) | WCOREFLG;
break;
case CLD_KILLED:
- stat = stol_signo[status];
- assert(stat != -1);
+ stat = lx_stol_signo(status, SIGKILL);
break;
case CLD_TRAPPED:
case CLD_STOPPED:
- stat = (stol_status(status) << 8) | WSTOPFLG;
+ stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG;
break;
case CLD_CONTINUED:
stat = WCONTFLG;
@@ -170,50 +165,62 @@ lx_wstat(int code, int status)
}
static int
-lx_waitid_helper(idtype_t idtype, id_t id, siginfo_t *sip, int native_options,
+lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options,
int extra_options)
{
+ lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+ int error;
+
/*
- * Call into our in-kernel waitid() wrapper:
+ * Our brand-specific waitid helper only understands a subset of
+ * the possible idtypes. Ensure we keep to that subset here:
*/
-restart:
- lx_had_sigchild = 0;
- if (syscall(SYS_brand, B_HELPER_WAITID, idtype, id, sip,
- native_options, extra_options) != 0) {
- if (errno == EINTR && (lx_had_sigchild ||
- lx_do_syscall_restart)) {
- /*
- * If we handled a SIGCLD while blocked in waitid(),
- * or the SA_RESTART flag was set, we should wait
- * again.
- */
- lx_debug("lx_waitid_helper() restarting due to"
- " interrupted system call");
- goto restart;
- }
- return (-1);
+ if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
+ return (EINVAL);
}
- return (0);
+ /*
+ * Enable the return of emulated ptrace(2) stop conditions
+ * through lx_waitid_helper, and stash the Linux-specific
+ * extra waitid() flags.
+ */
+ lwpd->br_waitid_emulate = B_TRUE;
+ lwpd->br_waitid_flags = extra_options;
+
+ if ((error = waitid(idtype, id, sip, native_options)) == EINTR) {
+ /*
+ * According to signal(7), the wait4(2), waitid(2), and
+ * waitpid(2) system calls are restartable.
+ */
+ ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+ }
+
+ lwpd->br_waitid_emulate = B_FALSE;
+ lwpd->br_waitid_flags = 0;
+
+ return (error);
}
long
lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
{
- siginfo_t info = { 0 };
- struct rusage ru = { 0 };
+ k_siginfo_t info = { 0 };
idtype_t idtype;
id_t id;
int status = 0;
pid_t pid = (pid_t)p1;
- int rval;
+ int error;
int native_options, extra_options;
+ int *statusp = (int *)p2;
+ void *rup = (void *)p4;
- if (ltos_options(p3, &native_options, &extra_options) == -1)
- return (-EINVAL);
+ if (ltos_options(p3, &native_options, &extra_options) == -1) {
+ return (set_errno(EINVAL));
+ }
- if (pid > max_pid)
- return (-ECHILD);
+ if (pid > maxpid) {
+ return (set_errno(ECHILD));
+ }
/*
* While not listed as a valid return code, Linux's wait4(2) does,
@@ -226,15 +233,34 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
*
* This will fail if the buffers in question are write-only.
*/
- if ((void *)p2 != NULL &&
- ((uucopy((void *)p2, &status, sizeof (status)) != 0) ||
- (uucopy(&status, (void *)p2, sizeof (status)) != 0)))
- return (-EFAULT);
-
- if ((void *)p4 != NULL) {
- if ((uucopy((void *)p4, &ru, sizeof (ru)) != 0) ||
- (uucopy(&ru, (void *)p4, sizeof (ru)) != 0))
- return (-EFAULT);
+ if (statusp != NULL) {
+ if (copyin(statusp, &status, sizeof (status)) != 0 ||
+ copyout(&status, statusp, sizeof (status)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
+
+ /*
+ * Do the same check for the "struct rusage" pointer, which differs
+ * in size for 32- and 64-bit processes.
+ */
+ if (rup != NULL) {
+ struct rusage ru;
+ void *krup = &ru;
+ size_t rusz = sizeof (ru);
+#if defined(_SYSCALL32_IMPL)
+ struct rusage32 ru32;
+
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ krup = &ru32;
+ rusz = sizeof (ru32);
+ }
+#endif
+
+ if (copyin(rup, krup, rusz) != 0 ||
+ copyout(krup, rup, rusz) != 0) {
+ return (set_errno(EFAULT));
+ }
}
if (pid < -1) {
@@ -245,24 +271,27 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
id = 0;
} else if (pid == 0) {
idtype = P_PGID;
- id = getpgrp();
+ mutex_enter(&pidlock);
+ id = curproc->p_pgrp;
+ mutex_exit(&pidlock);
} else {
idtype = P_PID;
id = pid;
}
- native_options |= WEXITED | WTRAPPED;
+ native_options |= (WEXITED | WTRAPPED);
- if (lx_waitid_helper(idtype, id, &info, native_options,
- extra_options) == -1) {
- return (-errno);
+ if ((error = lx_call_waitid(idtype, id, &info, native_options,
+ extra_options)) != 0) {
+ return (set_errno(error));
}
/*
* If the WNOHANG flag was specified and no child was found return 0.
*/
- if ((native_options & WNOHANG) && info.si_pid == 0)
+ if ((native_options & WNOHANG) && info.si_pid == 0) {
return (0);
+ }
status = lx_wstat(info.si_code, info.si_status);
@@ -273,11 +302,18 @@ lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
* should succeed on a Linux system. This, however, is rather
* unlikely since we tested the validity of both above.
*/
- if (p2 != NULL && uucopy(&status, (void *)p2, sizeof (status)) != 0)
- return (-EFAULT);
+ if (statusp != NULL) {
+ if (copyout(&status, statusp, sizeof (status)) != 0) {
+ return (set_errno(EFAULT));
+ }
+ }
- if (p4 != NULL && (rval = lx_getrusage(LX_RUSAGE_CHILDREN, p4)) != 0)
- return (rval);
+ if (rup != NULL) {
+ if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL,
+ NULL, NULL)) != 0) {
+ return (set_errno(error));
+ }
+ }
return (info.si_pid);
}
@@ -288,17 +324,116 @@ lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3)
return (lx_wait4(p1, p2, p3, NULL));
}
+/*
+ * Translate the native k_siginfo_t "sip" into a Linux lx_siginfo_t and copy
+ * the result out to the user address "lxsip".
+ *
+ * Returns 0 on success, or the result of set_errno(EFAULT) if the copyout
+ * fails.
+ */
+static int
+stol_ksiginfo(k_siginfo_t *sip, uintptr_t lxsip)
+{
+ lx_siginfo_t lsi;
+
+ /* Start from zero: only a subset of the fields is assigned below. */
+ bzero(&lsi, sizeof (lsi));
+ lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+ lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+ /*
+ * NOTE(review): si_errno indexes lx_stol_errno[] without a range check
+ * in this function -- confirm it can never exceed the table size.
+ */
+ lsi.lsi_errno = lx_stol_errno[sip->si_errno];
+
+ /*
+ * Which of the remaining fields are filled in depends on the
+ * translated (Linux) signal number.
+ */
+ switch (lsi.lsi_signo) {
+ case LX_SIGPOLL:
+ lsi.lsi_band = sip->si_band;
+ lsi.lsi_fd = sip->si_fd;
+ break;
+
+ case LX_SIGCHLD:
+ lsi.lsi_pid = sip->si_pid;
+ /*
+ * For si_code <= 0 or CLD_EXITED the status is passed through
+ * unchanged; for every other code it goes through
+ * lx_stol_status() first.
+ */
+ if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+ lsi.lsi_status = sip->si_status;
+ } else {
+ lsi.lsi_status = lx_stol_status(sip->si_status,
+ SIGKILL);
+ }
+ lsi.lsi_utime = sip->si_utime;
+ lsi.lsi_stime = sip->si_stime;
+ break;
+
+ case LX_SIGILL:
+ case LX_SIGBUS:
+ case LX_SIGFPE:
+ case LX_SIGSEGV:
+ lsi.lsi_addr = sip->si_addr;
+ break;
+
+ default:
+ /* Everything else carries the pid and the (narrowed) uid. */
+ lsi.lsi_pid = sip->si_pid;
+ lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+ }
+
+ if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * Variant of stol_ksiginfo() that produces an lx_siginfo32_t: translate the
+ * native k_siginfo_t "sip" and copy the 32-bit structure out to the user
+ * address "lxsip".  The field handling matches stol_ksiginfo() except that
+ * si_addr is narrowed to a caddr32_t.
+ *
+ * Returns 0 on success, or the result of set_errno(EFAULT) if the copyout
+ * fails.
+ */
+static int
+stol_ksiginfo32(k_siginfo_t *sip, uintptr_t lxsip)
+{
+ lx_siginfo32_t lsi;
+
+ /* Start from zero: only a subset of the fields is assigned below. */
+ bzero(&lsi, sizeof (lsi));
+ lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+ lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+ /*
+ * NOTE(review): si_errno indexes lx_stol_errno[] without a range check
+ * in this function -- confirm it can never exceed the table size.
+ */
+ lsi.lsi_errno = lx_stol_errno[sip->si_errno];
+
+ /*
+ * Which of the remaining fields are filled in depends on the
+ * translated (Linux) signal number.
+ */
+ switch (lsi.lsi_signo) {
+ case LX_SIGPOLL:
+ lsi.lsi_band = sip->si_band;
+ lsi.lsi_fd = sip->si_fd;
+ break;
+
+ case LX_SIGCHLD:
+ lsi.lsi_pid = sip->si_pid;
+ /*
+ * For si_code <= 0 or CLD_EXITED the status is passed through
+ * unchanged; for every other code it goes through
+ * lx_stol_status() first.
+ */
+ if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+ lsi.lsi_status = sip->si_status;
+ } else {
+ lsi.lsi_status = lx_stol_status(sip->si_status,
+ SIGKILL);
+ }
+ lsi.lsi_utime = sip->si_utime;
+ lsi.lsi_stime = sip->si_stime;
+ break;
+
+ case LX_SIGILL:
+ case LX_SIGBUS:
+ case LX_SIGFPE:
+ case LX_SIGSEGV:
+ /* Narrow the kernel pointer to the 32-bit address type. */
+ lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
+ break;
+
+ default:
+ /* Everything else carries the pid and the (narrowed) uid. */
+ lsi.lsi_pid = sip->si_pid;
+ lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+ }
+
+ if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
+#endif
+
long
lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt)
{
+ int error;
int native_options, extra_options;
- siginfo_t s_info = {0};
+ k_siginfo_t info = { 0 };
- if (ltos_options(opt, &native_options, &extra_options) == -1)
- return (-EINVAL);
+ if (ltos_options(opt, &native_options, &extra_options) == -1) {
+ return (set_errno(EINVAL));
+ }
- if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0)
- return (-EINVAL);
+ if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) {
+ return (set_errno(EINVAL));
+ }
switch (idtype) {
case LX_P_ALL:
@@ -311,17 +446,27 @@ lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt)
idtype = P_PGID;
break;
default:
- return (-EINVAL);
+ return (set_errno(EINVAL));
}
- if (lx_waitid_helper(idtype, id, &s_info, native_options,
- extra_options) == -1) {
- return (-errno);
+ if ((error = lx_call_waitid(idtype, id, &info, native_options,
+ extra_options)) != 0) {
+ return (set_errno(error));
}
- /* If the WNOHANG flag was specified and no child was found return 0. */
- if ((native_options & WNOHANG) && s_info.si_pid == 0)
+ /*
+ * If the WNOHANG flag was specified and no child was found return 0.
+ */
+ if ((native_options & WNOHANG) && info.si_pid == 0) {
return (0);
+ }
- return (stol_siginfo(&s_info, (lx_siginfo_t *)infop));
+#if defined(_SYSCALL32_IMPL)
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ return (stol_ksiginfo32(&info, infop));
+ } else
+#endif
+ {
+ return (stol_ksiginfo(&info, infop));
+ }
}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
index 39d6d0361b..ea23c3e4b8 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/xattr.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
@@ -10,9 +10,13 @@
*/
/*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+
/*
* *xattr() family of functions.
*
@@ -20,28 +24,8 @@
* than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1).
*/
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/lx_types.h>
-#include <sys/lx_syscall.h>
-
-long
-lx_xattr2(uintptr_t p1, uintptr_t p2)
-{
-
- return (-EOPNOTSUPP);
-}
-
long
-lx_xattr3(uintptr_t p1, uintptr_t p2, uintptr_t p3)
+lx_xattr(void)
{
-
- return (-EOPNOTSUPP);
-}
-
-long
-lx_xattr4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
-{
-
- return (-EOPNOTSUPP);
+ return (set_errno(EOPNOTSUPP));
}
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c
index 8b1338d578..0390434cfb 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.c
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.c
@@ -62,29 +62,41 @@ int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
/* sn1 brand */
struct brand_ops sn1_brops = {
- sn1_init_brand_data,
- sn1_free_brand_data,
- sn1_brandsys,
- sn1_setbrand,
- sn1_getattr,
- sn1_setattr,
- sn1_copy_procdata,
- sn1_proc_exit,
- sn1_exec,
- lwp_setrval,
- sn1_initlwp,
- sn1_forklwp,
- sn1_freelwp,
- sn1_lwpexit,
- sn1_elfexec,
- NULL,
- NULL,
- NULL,
- NSIG,
- NULL,
- NULL,
- NULL,
- NULL
+ sn1_init_brand_data, /* b_init_brand_data */
+ sn1_free_brand_data, /* b_free_brand_data */
+ sn1_brandsys, /* b_brandsys */
+ sn1_setbrand, /* b_setbrand */
+ sn1_getattr, /* b_getattr */
+ sn1_setattr, /* b_setattr */
+ sn1_copy_procdata, /* b_copy_procdata */
+ sn1_proc_exit, /* b_proc_exit */
+ sn1_exec, /* b_exec */
+ lwp_setrval, /* b_lwp_setrval */
+ sn1_initlwp, /* b_initlwp */
+ sn1_forklwp, /* b_forklwp */
+ sn1_freelwp, /* b_freelwp */
+ sn1_lwpexit, /* b_lwpexit */
+ sn1_elfexec, /* b_elfexec */
+ NULL, /* b_sigset_native_to_brand */
+ NULL, /* b_sigset_brand_to_native */
+ NULL, /* b_psig_to_proc */
+ NSIG, /* b_nsig */
+ NULL, /* b_exit_with_sig */
+ NULL, /* b_wait_filter */
+ NULL, /* b_native_exec */
+ NULL, /* b_ptrace_exectrap */
+ NULL, /* b_map32limit */
+ NULL, /* b_stop_notify */
+ NULL, /* b_waitid_helper */
+ NULL, /* b_sigcld_repost */
+ NULL, /* b_issig_stop */
+ NULL, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ NULL, /* b_savecontext32 */
+#endif
+ NULL, /* b_restorecontext */
+ NULL, /* b_sendsig_stack */
+ NULL /* b_sendsig */
};
#ifdef sparc
diff --git a/usr/src/uts/common/brand/sngl/sngl_brand.c b/usr/src/uts/common/brand/sngl/sngl_brand.c
index b04635c0f6..97d172d80e 100644
--- a/usr/src/uts/common/brand/sngl/sngl_brand.c
+++ b/usr/src/uts/common/brand/sngl/sngl_brand.c
@@ -63,29 +63,41 @@ int sngl_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
/* SNGL brand */
struct brand_ops sngl_brops = {
- sngl_init_brand_data,
- sngl_free_brand_data,
- sngl_brandsys,
- sngl_setbrand,
- sngl_getattr,
- sngl_setattr,
- sngl_copy_procdata,
- sngl_proc_exit,
- sngl_exec,
- lwp_setrval,
- sngl_initlwp,
- sngl_forklwp,
- sngl_freelwp,
- sngl_lwpexit,
- sngl_elfexec,
- NULL,
- NULL,
- NULL,
- NSIG,
- NULL,
- NULL,
- NULL,
- NULL
+ sngl_init_brand_data, /* b_init_brand_data */
+ sngl_free_brand_data, /* b_free_brand_data */
+ sngl_brandsys, /* b_brandsys */
+ sngl_setbrand, /* b_setbrand */
+ sngl_getattr, /* b_getattr */
+ sngl_setattr, /* b_setattr */
+ sngl_copy_procdata, /* b_copy_procdata */
+ sngl_proc_exit, /* b_proc_exit */
+ sngl_exec, /* b_exec */
+ lwp_setrval, /* b_lwp_setrval */
+ sngl_initlwp, /* b_initlwp */
+ sngl_forklwp, /* b_forklwp */
+ sngl_freelwp, /* b_freelwp */
+ sngl_lwpexit, /* b_lwpexit */
+ sngl_elfexec, /* b_elfexec */
+ NULL, /* b_sigset_native_to_brand */
+ NULL, /* b_sigset_brand_to_native */
+ NULL, /* b_psig_to_proc */
+ NSIG, /* b_nsig */
+ NULL, /* b_exit_with_sig */
+ NULL, /* b_wait_filter */
+ NULL, /* b_native_exec */
+ NULL, /* b_ptrace_exectrap */
+ NULL, /* b_map32limit */
+ NULL, /* b_stop_notify */
+ NULL, /* b_waitid_helper */
+ NULL, /* b_sigcld_repost */
+ NULL, /* b_issig_stop */
+ NULL, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ NULL, /* b_savecontext32 */
+#endif
+ NULL, /* b_restorecontext */
+ NULL, /* b_sendsig_stack */
+ NULL /* b_sendsig */
};
#ifdef __amd64
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c
index bedbaa53d3..b6a0eeadff 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.c
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.c
@@ -67,29 +67,41 @@ void s10_sigset_s10_to_native(sigset_t *);
/* s10 brand */
struct brand_ops s10_brops = {
- s10_init_brand_data,
- s10_free_brand_data,
- s10_brandsys,
- s10_setbrand,
- s10_getattr,
- s10_setattr,
- s10_copy_procdata,
- s10_proc_exit,
- s10_exec,
- lwp_setrval,
- s10_initlwp,
- s10_forklwp,
- s10_freelwp,
- s10_lwpexit,
- s10_elfexec,
- s10_sigset_native_to_s10,
- s10_sigset_s10_to_native,
- NULL,
- S10_NSIG,
- NULL,
- NULL,
- NULL,
- NULL
+ s10_init_brand_data, /* b_init_brand_data */
+ s10_free_brand_data, /* b_free_brand_data */
+ s10_brandsys, /* b_brandsys */
+ s10_setbrand, /* b_setbrand */
+ s10_getattr, /* b_getattr */
+ s10_setattr, /* b_setattr */
+ s10_copy_procdata, /* b_copy_procdata */
+ s10_proc_exit, /* b_proc_exit */
+ s10_exec, /* b_exec */
+ lwp_setrval, /* b_lwp_setrval */
+ s10_initlwp, /* b_initlwp */
+ s10_forklwp, /* b_forklwp */
+ s10_freelwp, /* b_freelwp */
+ s10_lwpexit, /* b_lwpexit */
+ s10_elfexec, /* b_elfexec */
+ s10_sigset_native_to_s10, /* b_sigset_native_to_brand */
+ s10_sigset_s10_to_native, /* b_sigset_brand_to_native */
+ NULL, /* b_psig_to_proc */
+ S10_NSIG, /* b_nsig */
+ NULL, /* b_exit_with_sig */
+ NULL, /* b_wait_filter */
+ NULL, /* b_native_exec */
+ NULL, /* b_ptrace_exectrap */
+ NULL, /* b_map32limit */
+ NULL, /* b_stop_notify */
+ NULL, /* b_waitid_helper */
+ NULL, /* b_sigcld_repost */
+ NULL, /* b_issig_stop */
+ NULL, /* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+ NULL, /* b_savecontext32 */
+#endif
+ NULL, /* b_restorecontext */
+ NULL, /* b_sendsig_stack */
+ NULL /* b_sendsig */
};
#ifdef sparc
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index e702fb211c..00258d1ced 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -362,6 +362,8 @@ exec_common(const char *fname, const char **argp, const char **envp,
* pending held signals remain held, so don't clear t_hold.
*/
mutex_enter(&p->p_lock);
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
lwp->lwp_oldcontext = 0;
lwp->lwp_ustack = 0;
lwp->lwp_old_stk_ctl = 0;
diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h
index b3abada863..394c38d3cf 100644
--- a/usr/src/uts/common/sys/brand.h
+++ b/usr/src/uts/common/sys/brand.h
@@ -137,6 +137,13 @@ struct brand_ops {
boolean_t *, int *);
int (*b_sigcld_repost)(proc_t *, sigqueue_t *);
int (*b_issig_stop)(proc_t *, klwp_t *);
+ void (*b_savecontext)(ucontext_t *);
+#if defined(_SYSCALL32_IMPL)
+ void (*b_savecontext32)(ucontext32_t *);
+#endif
+ void (*b_restorecontext)(ucontext_t *);
+ caddr_t (*b_sendsig_stack)(int);
+ void (*b_sendsig)(int);
};
/*
diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h
index 41b70f6a6e..2aceb3a0f6 100644
--- a/usr/src/uts/common/sys/klwp.h
+++ b/usr/src/uts/common/sys/klwp.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_KLWP_H
@@ -192,6 +192,8 @@ typedef struct _klwp {
struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */
void *lwp_brand; /* per-lwp brand data */
+ int (*lwp_brand_syscall)(void); /* brand syscall interposer */
+
struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */
} klwp_t;
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index 9922891e56..955d934311 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -1,6 +1,7 @@
\
\ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
\ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+\ Copyright 2015 Joyent, Inc.
\
\ CDDL HEADER START
\
@@ -147,6 +148,7 @@ _klwp
lwp_thread
lwp_procp
lwp_brand
+ lwp_brand_syscall
lwp_eosys
lwp_regs
lwp_arg
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index 823404b485..7b9b844768 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
@@ -503,6 +503,7 @@ noprod_sys_syscall:
movq T_LWP(%r15), %r14
ASSERT_NO_RUPDATE_PENDING(%r14)
+
ENABLE_INTR_FLAGS
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
@@ -516,6 +517,26 @@ noprod_sys_syscall:
incq %gs:CPU_STATS_SYS_SYSCALL
+ /*
+ * If our LWP has an alternate system call handler, run that instead of
+ * the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rdi
+ testq %rdi, %rdi
+ jz _syscall_no_brand
+
+ pushq %rax
+ call *%rdi
+
+ /*
+ * If the alternate handler returns 0, we skip straight to the return to
+ * usermode. Otherwise, we resume regular system call processing.
+ */
+ testl %eax, %eax
+ popq %rax
+ jz _syscall_after_brand
+
+_syscall_no_brand:
movw %ax, T_SYSNUM(%r15)
movzbl T_PRE_SYS(%r15), %ebx
ORL_SYSCALLTRACE(%ebx)
@@ -550,6 +571,8 @@ _syscall_invoke:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %r12d, %r12d /* lower 32-bits into %eax */
5:
+
+_syscall_after_brand:
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -795,6 +818,25 @@ _syscall32_save:
incq %gs:CPU_STATS_SYS_SYSCALL
/*
+ * If our lwp has an alternate system call handler, run that instead
+ * of the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rax
+ testq %rax, %rax
+ jz _syscall32_no_brand
+
+ movb $LWP_SYS, LWP_STATE(%r14)
+ call *%rax
+
+ /*
+ * If the alternate handler returns 0, we skip straight to the return
+ * to usermode. Otherwise, we resume regular system call processing.
+ */
+ testl %eax, %eax
+ jz _syscall32_after_brand
+
+_syscall32_no_brand:
+ /*
* Make some space for MAXSYSARGS (currently 8) 32-bit args placed
* into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
* more succinctly:
@@ -861,6 +903,8 @@ _syscall32_save:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %eax, %r12d /* lower 32-bits into %eax */
+_syscall32_after_brand:
+
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -1191,7 +1235,31 @@ sys_int80()
ENTRY_NP(brand_sys_int80)
SWAPGS /* kernel gsbase */
XPV_TRAP_POP
+
+ /*
+ * We first attempt to call the "b_int80" handler from the "struct
+ * brand_mach_ops" for this brand. If no handler function is installed
+ * for this brand, the BRAND_CALLBACK() macro returns here and we
+ * check the lwp for a "lwp_brand_syscall" handler.
+ */
BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
+
+ /*
+ * Check to see if this lwp provides "lwp_brand_syscall". If so, we
+ * will route this int80 through the regular system call handling path.
+ */
+ movq %r15, %gs:CPU_RTMP_R15
+ movq %gs:CPU_THREAD, %r15
+ movq T_LWP(%r15), %r15
+ movq LWP_BRAND_SYSCALL(%r15), %r15
+ testq %r15, %r15
+ movq %gs:CPU_RTMP_R15, %r15
+ jnz nopop_syscall_int
+
+ /*
+ * The brand provided neither a "b_int80", nor a "lwp_brand_syscall"
+ * function, and has thus opted out of handling this trap.
+ */
SWAPGS /* user gsbase */
jmp nopop_int80
diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files
index 0f058f262d..7eefb1c062 100644
--- a/usr/src/uts/intel/Makefile.files
+++ b/usr/src/uts/intel/Makefile.files
@@ -276,8 +276,8 @@ SN1_BRAND_OBJS = sn1_brand.o sn1_brand_asm.o
SNGL_BRAND_OBJS = sngl_brand.o sngl_brand_asm.o
S10_BRAND_OBJS = s10_brand.o s10_brand_asm.o
LX_BRAND_OBJS = \
+ lx_archdep.o \
lx_brand.o \
- lx_brand_asm.o \
lx_brk.o \
lx_clone.o \
lx_futex.o \
@@ -295,7 +295,9 @@ LX_BRAND_OBJS = \
lx_signum.o \
lx_syscall.o \
lx_sysinfo.o \
- lx_thread_area.o
+ lx_thread_area.o \
+ lx_wait.o \
+ lx_xattr.o
#
# special files
diff --git a/usr/src/uts/intel/brand/lx/lx_archdep.c b/usr/src/uts/intel/brand/lx/lx_archdep.c
new file mode 100644
index 0000000000..49f2f12172
--- /dev/null
+++ b/usr/src/uts/intel/brand/lx/lx_archdep.c
@@ -0,0 +1,1171 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * LX brand Intel-specific routines.
+ */
+
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/ddi.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/privregs.h>
+#include <sys/pcb.h>
+#include <sys/archsystm.h>
+#include <sys/stack.h>
+#include <sys/sdt.h>
+#include <sys/sysmacros.h>
+
+#define LX_REG(ucp, r) ((ucp)->uc_mcontext.gregs[(r)])
+
+extern int getsetcontext(int, void *);
+#if defined(_SYSCALL32_IMPL)
+extern int getsetcontext32(int, void *);
+#endif
+
+#if defined(__amd64)
+/*
+ * Copy "ucsz" bytes between the kernel buffer "kucp" and the address "ucp"
+ * in the address space of process "p", using uwrite() when "writing" is
+ * set and uread() otherwise. The transfer is split into pieces so that no
+ * single uread()/uwrite() call crosses a page boundary.
+ *
+ * The caller must hold p->p_lock; it is dropped for the duration of the
+ * I/O and is held again when this function returns. Returns 0 on success,
+ * or the first non-zero error reported by uread()/uwrite().
+ */
+static int
+lx_rw_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz, boolean_t writing)
+{
+ int error = 0;
+ size_t rem = ucsz;
+ off_t pos = 0;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ /*
+ * Grab P_PR_LOCK so that we can drop p_lock while doing I/O.
+ */
+ sprlock_proc(p);
+
+ /*
+ * Drop p_lock while we do I/O to avoid deadlock with the clock thread.
+ */
+ mutex_exit(&p->p_lock);
+ while (rem != 0) {
+ /*
+ * Limit this piece to what remains of the page containing
+ * "addr", or to the remaining byte count if that is smaller.
+ */
+ uintptr_t addr = (uintptr_t)ucp + pos;
+ size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET));
+
+ if (writing) {
+ error = uwrite(p, kucp + pos, len, addr);
+ } else {
+ error = uread(p, kucp + pos, len, addr);
+ }
+
+ /*
+ * Stop at the first failure; a partial transfer is not undone.
+ */
+ if (error != 0) {
+ break;
+ }
+
+ rem -= len;
+ pos += len;
+ }
+ mutex_enter(&p->p_lock);
+
+ /*
+ * NOTE(review): sprunlock() is entered with p_lock held and p_lock is
+ * re-acquired immediately afterwards, so sprunlock() presumably
+ * releases p_lock as it clears P_PR_LOCK -- confirm against its
+ * definition before reordering these calls.
+ */
+ sprunlock(p);
+ mutex_enter(&p->p_lock);
+
+ return (error);
+}
+
+/*
+ * Fetch "ucsz" bytes of ucontext from address "ucp" in process "p" into
+ * the kernel buffer "kucp". The target process need not be the current
+ * process. Returns whatever lx_rw_uc() returns: 0 on success, otherwise
+ * an error.
+ */
+static int
+lx_read_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz)
+{
+ const boolean_t writing = B_FALSE;
+
+ return (lx_rw_uc(p, ucp, kucp, ucsz, writing));
+}
+
+/*
+ * Store "ucsz" bytes of ucontext from the kernel buffer "kucp" to address
+ * "ucp" in process "p". The target process need not be the current
+ * process. Returns whatever lx_rw_uc() returns: 0 on success, otherwise
+ * an error.
+ */
+static int
+lx_write_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz)
+{
+ const boolean_t writing = B_TRUE;
+
+ return (lx_rw_uc(p, ucp, kucp, ucsz, writing));
+}
+#endif /* __amd64 */
+
+/*
+ * Load register state from a usermode "lx_user_regs_t" in the tracer
+ * and store it in the tracee ucontext_t.
+ *
+ * "uregsp" is an address in the calling (tracer) process; its layout is
+ * selected by the caller's data model (lx_user_regs_t for a 64-bit tracer,
+ * lx_user_regs32_t for a 32-bit tracer). "ucp" is the address of the
+ * ucontext within the process that owns "lwpd"; its layout is selected by
+ * the data model of that LWP. The ucontext is read, the general and
+ * segment registers are overwritten, and the result is written back, so
+ * all other ucontext fields are preserved.
+ *
+ * The ucontext accessors VERIFY that p_lock of the tracee process is held.
+ *
+ * Returns 0 on success, EFAULT if the tracer's register structure cannot
+ * be copied in, and EIO if the tracee ucontext cannot be read or written
+ * or if the tracer/tracee data model combination is not supported.
+ */
+int
+lx_userregs_to_uc(lx_lwp_data_t *lwpd, void *ucp, void *uregsp)
+{
+#if defined(__amd64)
+ klwp_t *lwp = lwpd->br_lwp;
+ proc_t *p = lwptoproc(lwp);
+
+ /*
+ * The outer switch is on the data model of the tracer (the caller);
+ * the inner checks are on the data model of the tracee LWP.
+ */
+ switch (get_udatamodel()) {
+ case DATAMODEL_LP64: {
+ lx_user_regs_t lxur;
+
+ if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ switch (lwp_getdatamodel(lwp)) {
+ case DATAMODEL_LP64: {
+ ucontext_t uc;
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ /*
+ * Note: we currently ignore "lxur_orig_rax" here, as
+ * this path should not be used for system call stops.
+ */
+ LX_REG(&uc, REG_R15) = lxur.lxur_r15;
+ LX_REG(&uc, REG_R14) = lxur.lxur_r14;
+ LX_REG(&uc, REG_R13) = lxur.lxur_r13;
+ LX_REG(&uc, REG_R12) = lxur.lxur_r12;
+ LX_REG(&uc, REG_RBP) = lxur.lxur_rbp;
+ LX_REG(&uc, REG_RBX) = lxur.lxur_rbx;
+ LX_REG(&uc, REG_R11) = lxur.lxur_r11;
+ LX_REG(&uc, REG_R10) = lxur.lxur_r10;
+ LX_REG(&uc, REG_R9) = lxur.lxur_r9;
+ LX_REG(&uc, REG_R8) = lxur.lxur_r8;
+ LX_REG(&uc, REG_RAX) = lxur.lxur_rax;
+ LX_REG(&uc, REG_RCX) = lxur.lxur_rcx;
+ LX_REG(&uc, REG_RDX) = lxur.lxur_rdx;
+ LX_REG(&uc, REG_RSI) = lxur.lxur_rsi;
+ LX_REG(&uc, REG_RDI) = lxur.lxur_rdi;
+ LX_REG(&uc, REG_RIP) = lxur.lxur_rip;
+ LX_REG(&uc, REG_CS) = lxur.lxur_xcs;
+ LX_REG(&uc, REG_RFL) = lxur.lxur_rflags;
+ LX_REG(&uc, REG_RSP) = lxur.lxur_rsp;
+ LX_REG(&uc, REG_SS) = lxur.lxur_xss;
+ LX_REG(&uc, REG_FSBASE) = lxur.lxur_xfs_base;
+ LX_REG(&uc, REG_GSBASE) = lxur.lxur_xgs_base;
+
+ LX_REG(&uc, REG_DS) = lxur.lxur_xds;
+ LX_REG(&uc, REG_ES) = lxur.lxur_xes;
+ LX_REG(&uc, REG_FS) = lxur.lxur_xfs;
+ LX_REG(&uc, REG_GS) = lxur.lxur_xgs;
+
+ if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ return (0);
+ }
+
+ case DATAMODEL_ILP32: {
+ ucontext32_t uc;
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ /*
+ * Note: we currently ignore "lxur_orig_eax" here, as
+ * this path should not be used for system call stops.
+ *
+ * A 64-bit tracer is writing to a 32-bit tracee: only
+ * the registers that exist in the 32-bit context are
+ * stored, each narrowed to 32 bits.
+ */
+ LX_REG(&uc, EBP) = (int32_t)lxur.lxur_rbp;
+ LX_REG(&uc, EBX) = (int32_t)lxur.lxur_rbx;
+ LX_REG(&uc, EAX) = (int32_t)lxur.lxur_rax;
+ LX_REG(&uc, ECX) = (int32_t)lxur.lxur_rcx;
+ LX_REG(&uc, EDX) = (int32_t)lxur.lxur_rdx;
+ LX_REG(&uc, ESI) = (int32_t)lxur.lxur_rsi;
+ LX_REG(&uc, EDI) = (int32_t)lxur.lxur_rdi;
+ LX_REG(&uc, EIP) = (int32_t)lxur.lxur_rip;
+ LX_REG(&uc, CS) = (int32_t)lxur.lxur_xcs;
+ LX_REG(&uc, EFL) = (int32_t)lxur.lxur_rflags;
+ LX_REG(&uc, UESP) = (int32_t)lxur.lxur_rsp;
+ LX_REG(&uc, SS) = (int32_t)lxur.lxur_xss;
+
+ LX_REG(&uc, DS) = (int32_t)lxur.lxur_xds;
+ LX_REG(&uc, ES) = (int32_t)lxur.lxur_xes;
+ LX_REG(&uc, FS) = (int32_t)lxur.lxur_xfs;
+ LX_REG(&uc, GS) = (int32_t)lxur.lxur_xgs;
+
+ if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+
+ return (EIO);
+ }
+
+ case DATAMODEL_ILP32: {
+ lx_user_regs32_t lxur;
+ ucontext32_t uc;
+
+ if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) {
+ /*
+ * The target is not a 32-bit LWP. We refuse to
+ * present truncated 64-bit registers to a 32-bit
+ * tracer.
+ */
+ return (EIO);
+ }
+
+ if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ /*
+ * Note: we currently ignore "lxur_orig_eax" here, as
+ * this path should not be used for system call stops.
+ */
+ LX_REG(&uc, EBX) = lxur.lxur_ebx;
+ LX_REG(&uc, ECX) = lxur.lxur_ecx;
+ LX_REG(&uc, EDX) = lxur.lxur_edx;
+ LX_REG(&uc, ESI) = lxur.lxur_esi;
+ LX_REG(&uc, EDI) = lxur.lxur_edi;
+ LX_REG(&uc, EBP) = lxur.lxur_ebp;
+ LX_REG(&uc, EAX) = lxur.lxur_eax;
+ LX_REG(&uc, EIP) = lxur.lxur_eip;
+ LX_REG(&uc, CS) = lxur.lxur_xcs;
+ LX_REG(&uc, EFL) = lxur.lxur_eflags;
+ LX_REG(&uc, UESP) = lxur.lxur_esp;
+ LX_REG(&uc, SS) = lxur.lxur_xss;
+
+ LX_REG(&uc, DS) = lxur.lxur_xds;
+ LX_REG(&uc, ES) = lxur.lxur_xes;
+ LX_REG(&uc, FS) = lxur.lxur_xfs;
+ LX_REG(&uc, GS) = lxur.lxur_xgs;
+
+ if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ /*
+ * The ucontext was updated successfully; report success, as
+ * the other tracer/tracee combinations above do.
+ */
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+#else
+ cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__);
+ exit(CLD_KILLED, SIGSYS);
+ return (EIO);
+#endif /* __amd64 */
+}
+
+/*
+ * Copy register state from a ucontext_t in the tracee to a usermode
+ * "lx_user_regs_t" in the tracer.
+ *
+ * "ucp" is the address of the ucontext within the process that owns
+ * "lwpd", laid out according to that LWP's data model. "uregsp" is an
+ * address in the calling (tracer) process; the structure written there is
+ * lx_user_regs_t for a 64-bit tracer and lx_user_regs32_t for a 32-bit
+ * tracer. The "orig_rax"/"orig_eax" field is always reported as zero on
+ * this path.
+ *
+ * The ucontext accessor VERIFYs that p_lock of the tracee process is held.
+ *
+ * Returns 0 on success, EIO if the tracee ucontext cannot be read or the
+ * data model combination is unsupported, and EFAULT if the result cannot
+ * be copied out to the tracer.
+ */
+int
+lx_uc_to_userregs(lx_lwp_data_t *lwpd, void *ucp, void *uregsp)
+{
+#if defined(__amd64)
+ klwp_t *lwp = lwpd->br_lwp;
+ proc_t *p = lwptoproc(lwp);
+
+ /*
+ * The outer switch is on the data model of the tracer (the caller);
+ * the inner checks are on the data model of the tracee LWP.
+ */
+ switch (get_udatamodel()) {
+ case DATAMODEL_LP64: {
+ lx_user_regs_t lxur;
+
+ switch (lwp_getdatamodel(lwp)) {
+ case DATAMODEL_LP64: {
+ ucontext_t uc;
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ lxur.lxur_r15 = LX_REG(&uc, REG_R15);
+ lxur.lxur_r14 = LX_REG(&uc, REG_R14);
+ lxur.lxur_r13 = LX_REG(&uc, REG_R13);
+ lxur.lxur_r12 = LX_REG(&uc, REG_R12);
+ lxur.lxur_rbp = LX_REG(&uc, REG_RBP);
+ lxur.lxur_rbx = LX_REG(&uc, REG_RBX);
+ lxur.lxur_r11 = LX_REG(&uc, REG_R11);
+ lxur.lxur_r10 = LX_REG(&uc, REG_R10);
+ lxur.lxur_r9 = LX_REG(&uc, REG_R9);
+ lxur.lxur_r8 = LX_REG(&uc, REG_R8);
+ lxur.lxur_rax = LX_REG(&uc, REG_RAX);
+ lxur.lxur_rcx = LX_REG(&uc, REG_RCX);
+ lxur.lxur_rdx = LX_REG(&uc, REG_RDX);
+ lxur.lxur_rsi = LX_REG(&uc, REG_RSI);
+ lxur.lxur_rdi = LX_REG(&uc, REG_RDI);
+ lxur.lxur_orig_rax = 0;
+ lxur.lxur_rip = LX_REG(&uc, REG_RIP);
+ lxur.lxur_xcs = LX_REG(&uc, REG_CS);
+ lxur.lxur_rflags = LX_REG(&uc, REG_RFL);
+ lxur.lxur_rsp = LX_REG(&uc, REG_RSP);
+ lxur.lxur_xss = LX_REG(&uc, REG_SS);
+ lxur.lxur_xfs_base = LX_REG(&uc, REG_FSBASE);
+ lxur.lxur_xgs_base = LX_REG(&uc, REG_GSBASE);
+
+ lxur.lxur_xds = LX_REG(&uc, REG_DS);
+ lxur.lxur_xes = LX_REG(&uc, REG_ES);
+ lxur.lxur_xfs = LX_REG(&uc, REG_FS);
+ lxur.lxur_xgs = LX_REG(&uc, REG_GS);
+
+ if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ case DATAMODEL_ILP32: {
+ ucontext32_t uc;
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ /*
+ * A 64-bit tracer is inspecting a 32-bit tracee:
+ * registers with no counterpart in the 32-bit context
+ * (%r8-%r15 and the %fs/%gs bases) are reported as
+ * zero.
+ */
+ lxur.lxur_r15 = 0;
+ lxur.lxur_r14 = 0;
+ lxur.lxur_r13 = 0;
+ lxur.lxur_r12 = 0;
+ lxur.lxur_rbp = LX_REG(&uc, EBP);
+ lxur.lxur_rbx = LX_REG(&uc, EBX);
+ lxur.lxur_r11 = 0;
+ lxur.lxur_r10 = 0;
+ lxur.lxur_r9 = 0;
+ lxur.lxur_r8 = 0;
+ lxur.lxur_rax = LX_REG(&uc, EAX);
+ lxur.lxur_rcx = LX_REG(&uc, ECX);
+ lxur.lxur_rdx = LX_REG(&uc, EDX);
+ lxur.lxur_rsi = LX_REG(&uc, ESI);
+ lxur.lxur_rdi = LX_REG(&uc, EDI);
+ lxur.lxur_orig_rax = 0;
+ lxur.lxur_rip = LX_REG(&uc, EIP);
+ lxur.lxur_xcs = LX_REG(&uc, CS);
+ lxur.lxur_rflags = LX_REG(&uc, EFL);
+ lxur.lxur_rsp = LX_REG(&uc, UESP);
+ lxur.lxur_xss = LX_REG(&uc, SS);
+ lxur.lxur_xfs_base = 0;
+ lxur.lxur_xgs_base = 0;
+
+ lxur.lxur_xds = LX_REG(&uc, DS);
+ lxur.lxur_xes = LX_REG(&uc, ES);
+ lxur.lxur_xfs = LX_REG(&uc, FS);
+ lxur.lxur_xgs = LX_REG(&uc, GS);
+
+ if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+ }
+
+ case DATAMODEL_ILP32: {
+ lx_user_regs32_t lxur;
+ ucontext32_t uc;
+
+ if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) {
+ /*
+ * The target is not a 32-bit LWP. We refuse to
+ * present truncated 64-bit registers to a 32-bit
+ * tracer.
+ */
+ return (EIO);
+ }
+
+ if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) {
+ return (EIO);
+ }
+
+ lxur.lxur_ebx = LX_REG(&uc, EBX);
+ lxur.lxur_ecx = LX_REG(&uc, ECX);
+ lxur.lxur_edx = LX_REG(&uc, EDX);
+ lxur.lxur_esi = LX_REG(&uc, ESI);
+ lxur.lxur_edi = LX_REG(&uc, EDI);
+ lxur.lxur_ebp = LX_REG(&uc, EBP);
+ lxur.lxur_eax = LX_REG(&uc, EAX);
+ lxur.lxur_orig_eax = 0;
+ lxur.lxur_eip = LX_REG(&uc, EIP);
+ lxur.lxur_xcs = LX_REG(&uc, CS);
+ lxur.lxur_eflags = LX_REG(&uc, EFL);
+ lxur.lxur_esp = LX_REG(&uc, UESP);
+ lxur.lxur_xss = LX_REG(&uc, SS);
+
+ lxur.lxur_xds = LX_REG(&uc, DS);
+ lxur.lxur_xes = LX_REG(&uc, ES);
+ lxur.lxur_xfs = LX_REG(&uc, FS);
+ lxur.lxur_xgs = LX_REG(&uc, GS);
+
+ if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+#else
+ cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__);
+ exit(CLD_KILLED, SIGSYS);
+ return (EIO);
+#endif
+}
+
+/*
+ * Load a usermode "lx_user_regs_t" into the register state of the target LWP.
+ *
+ * "uregsp" is an address in the calling (tracer) process, laid out as
+ * lx_user_regs_t for a 64-bit caller and lx_user_regs32_t for a 32-bit
+ * caller. General registers are written directly into the LWP's saved
+ * "struct regs"; the "orig_rax"/"orig_eax" field is stored as the LWP's
+ * br_syscall_num; the %fs/%gs bases (64-bit only) and the segment
+ * selectors are stored in the LWP's pcb, with pcb_rupdate set.
+ *
+ * The caller must hold p_lock for the process that owns the LWP.
+ *
+ * Returns 0 on success, EFAULT if the register structure cannot be copied
+ * in, and EIO if a 32-bit tracer targets a non-32-bit LWP or the caller's
+ * data model is not recognised.
+ */
+int
+lx_userregs_to_regs(lx_lwp_data_t *lwpd, void *uregsp)
+{
+ klwp_t *lwp = lwpd->br_lwp;
+ proc_t *p = lwptoproc(lwp);
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+#if defined(__amd64)
+ struct regs *rp = lwptoregs(lwp);
+ struct pcb *pcb = &lwp->lwp_pcb;
+
+ /*
+ * Select the incoming structure layout by the data model of the
+ * tracer (the caller).
+ */
+ switch (get_udatamodel()) {
+ case DATAMODEL_LP64: {
+ lx_user_regs_t lxur;
+
+ if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ rp->r_r15 = lxur.lxur_r15;
+ rp->r_r14 = lxur.lxur_r14;
+ rp->r_r13 = lxur.lxur_r13;
+ rp->r_r12 = lxur.lxur_r12;
+ rp->r_rbp = lxur.lxur_rbp;
+ rp->r_rbx = lxur.lxur_rbx;
+ rp->r_r11 = lxur.lxur_r11;
+ rp->r_r10 = lxur.lxur_r10;
+ rp->r_r9 = lxur.lxur_r9;
+ rp->r_r8 = lxur.lxur_r8;
+ rp->r_rax = lxur.lxur_rax;
+ rp->r_rcx = lxur.lxur_rcx;
+ rp->r_rdx = lxur.lxur_rdx;
+ rp->r_rsi = lxur.lxur_rsi;
+ rp->r_rdi = lxur.lxur_rdi;
+ lwpd->br_syscall_num = (int)lxur.lxur_orig_rax;
+ rp->r_rip = lxur.lxur_rip;
+ rp->r_cs = lxur.lxur_xcs;
+ rp->r_rfl = lxur.lxur_rflags;
+ rp->r_rsp = lxur.lxur_rsp;
+ rp->r_ss = lxur.lxur_xss;
+ pcb->pcb_fsbase = lxur.lxur_xfs_base;
+ pcb->pcb_gsbase = lxur.lxur_xgs_base;
+
+ /*
+ * Stage the segment selectors in the pcb and flag them for
+ * reload; preemption is disabled so the flag and the values
+ * are updated together.
+ */
+ kpreempt_disable();
+ pcb->pcb_rupdate = 1;
+ pcb->pcb_ds = lxur.lxur_xds;
+ pcb->pcb_es = lxur.lxur_xes;
+ pcb->pcb_fs = lxur.lxur_xfs;
+ pcb->pcb_gs = lxur.lxur_xgs;
+ kpreempt_enable();
+
+ return (0);
+ }
+
+ case DATAMODEL_ILP32: {
+ lx_user_regs32_t lxur;
+
+ if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) {
+ /*
+ * The target is not a 32-bit LWP. We refuse to
+ * present truncated 64-bit registers to a 32-bit
+ * tracer.
+ */
+ return (EIO);
+ }
+
+ if (copyin(uregsp, &lxur, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ rp->r_rbx = lxur.lxur_ebx;
+ rp->r_rcx = lxur.lxur_ecx;
+ rp->r_rdx = lxur.lxur_edx;
+ rp->r_rsi = lxur.lxur_esi;
+ rp->r_rdi = lxur.lxur_edi;
+ rp->r_rbp = lxur.lxur_ebp;
+ rp->r_rax = lxur.lxur_eax;
+ lwpd->br_syscall_num = (int)lxur.lxur_orig_eax;
+ rp->r_rip = lxur.lxur_eip;
+ rp->r_cs = lxur.lxur_xcs;
+ rp->r_rfl = lxur.lxur_eflags;
+ rp->r_rsp = lxur.lxur_esp;
+ rp->r_ss = lxur.lxur_xss;
+
+ /*
+ * As in the 64-bit case: stage the selectors in the pcb and
+ * flag them for reload with preemption disabled.
+ */
+ kpreempt_disable();
+ pcb->pcb_rupdate = 1;
+ pcb->pcb_ds = lxur.lxur_xds;
+ pcb->pcb_es = lxur.lxur_xes;
+ pcb->pcb_fs = lxur.lxur_xfs;
+ pcb->pcb_gs = lxur.lxur_xgs;
+ kpreempt_enable();
+
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+#else
+ cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__);
+ exit(CLD_KILLED, SIGSYS);
+ return (EIO);
+#endif /* __amd64 */
+}
+
+/*
+ * Copy the current LWP register state of the target LWP to a usermode
+ * "lx_user_regs_t".
+ *
+ * "uregsp" is an address in the calling (tracer) process; the structure
+ * written there is lx_user_regs_t for a 64-bit caller and
+ * lx_user_regs32_t for a 32-bit caller. The reported return-value
+ * register and "orig" register depend on the kind of ptrace stop recorded
+ * in br_ptrace_whatstop (see below).
+ *
+ * The caller must hold p_lock for the process that owns the LWP.
+ *
+ * Returns 0 on success, EFAULT if the result cannot be copied out, and
+ * EIO if a 32-bit tracer targets a non-32-bit LWP or the caller's data
+ * model is not recognised.
+ */
+int
+lx_regs_to_userregs(lx_lwp_data_t *lwpd, void *uregsp)
+{
+#if defined(__amd64)
+ klwp_t *lwp = lwpd->br_lwp;
+ struct regs *rp = lwptoregs(lwp);
+ proc_t *p = lwptoproc(lwp);
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+
+ struct pcb *pcb = &lwp->lwp_pcb;
+ long r0, orig_r0;
+
+ /*
+ * We must precisely emulate the "syscall-entry-stop" and
+ * "syscall-exit-stop" register appearance from the Linux kernel.
+ *
+ * At syscall entry the return-value register shows the Linux errno
+ * that ENOTSUP translates to, negated, and the "orig" register
+ * carries the system call number. At syscall exit the "orig"
+ * register still carries the system call number and the return-value
+ * register is the live %rax. For any other stop the "orig" register
+ * is zero.
+ */
+ switch (lwpd->br_ptrace_whatstop) {
+ case LX_PR_SYSENTRY:
+ orig_r0 = lwpd->br_syscall_num;
+ r0 = -lx_stol_errno[ENOTSUP];
+ break;
+ case LX_PR_SYSEXIT:
+ orig_r0 = lwpd->br_syscall_num;
+ r0 = rp->r_rax;
+ break;
+ default:
+ orig_r0 = 0;
+ r0 = rp->r_rax;
+ }
+
+ /*
+ * Select the outgoing structure layout by the data model of the
+ * tracer (the caller).
+ */
+ switch (get_udatamodel()) {
+ case DATAMODEL_LP64: {
+ lx_user_regs_t lxur;
+
+ lxur.lxur_r15 = rp->r_r15;
+ lxur.lxur_r14 = rp->r_r14;
+ lxur.lxur_r13 = rp->r_r13;
+ lxur.lxur_r12 = rp->r_r12;
+ lxur.lxur_rbp = rp->r_rbp;
+ lxur.lxur_rbx = rp->r_rbx;
+ lxur.lxur_r11 = rp->r_r11;
+ lxur.lxur_r10 = rp->r_r10;
+ lxur.lxur_r9 = rp->r_r9;
+ lxur.lxur_r8 = rp->r_r8;
+ lxur.lxur_rax = r0;
+ lxur.lxur_rcx = rp->r_rcx;
+ lxur.lxur_rdx = rp->r_rdx;
+ lxur.lxur_rsi = rp->r_rsi;
+ lxur.lxur_rdi = rp->r_rdi;
+ lxur.lxur_orig_rax = orig_r0;
+ lxur.lxur_rip = rp->r_rip;
+ lxur.lxur_xcs = rp->r_cs;
+ lxur.lxur_rflags = rp->r_rfl;
+ lxur.lxur_rsp = rp->r_rsp;
+ lxur.lxur_xss = rp->r_ss;
+ lxur.lxur_xfs_base = pcb->pcb_fsbase;
+ lxur.lxur_xgs_base = pcb->pcb_gsbase;
+
+ /*
+ * When pcb_rupdate is set the selectors staged in the pcb are
+ * reported; otherwise the values in the saved "struct regs"
+ * are used. Preemption is disabled so the flag and the values
+ * are sampled together.
+ */
+ kpreempt_disable();
+ if (pcb->pcb_rupdate == 1) {
+ lxur.lxur_xds = pcb->pcb_ds;
+ lxur.lxur_xes = pcb->pcb_es;
+ lxur.lxur_xfs = pcb->pcb_fs;
+ lxur.lxur_xgs = pcb->pcb_gs;
+ } else {
+ lxur.lxur_xds = rp->r_ds;
+ lxur.lxur_xes = rp->r_es;
+ lxur.lxur_xfs = rp->r_fs;
+ lxur.lxur_xgs = rp->r_gs;
+ }
+ kpreempt_enable();
+
+ if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ case DATAMODEL_ILP32: {
+ lx_user_regs32_t lxur;
+
+ if (lwp_getdatamodel(lwp) != DATAMODEL_ILP32) {
+ /*
+ * The target is not a 32-bit LWP. We refuse to
+ * present truncated 64-bit registers to a 32-bit
+ * tracer.
+ */
+ return (EIO);
+ }
+
+ lxur.lxur_ebx = (int32_t)rp->r_rbx;
+ lxur.lxur_ecx = (int32_t)rp->r_rcx;
+ lxur.lxur_edx = (int32_t)rp->r_rdx;
+ lxur.lxur_esi = (int32_t)rp->r_rsi;
+ lxur.lxur_edi = (int32_t)rp->r_rdi;
+ lxur.lxur_ebp = (int32_t)rp->r_rbp;
+ lxur.lxur_eax = (int32_t)r0;
+ lxur.lxur_orig_eax = (int32_t)orig_r0;
+ lxur.lxur_eip = (int32_t)rp->r_rip;
+ lxur.lxur_xcs = (int32_t)rp->r_cs;
+ lxur.lxur_eflags = (int32_t)rp->r_rfl;
+ lxur.lxur_esp = (int32_t)rp->r_rsp;
+ lxur.lxur_xss = (int32_t)rp->r_ss;
+
+ /*
+ * Same selector source choice as in the 64-bit case above.
+ */
+ kpreempt_disable();
+ if (pcb->pcb_rupdate == 1) {
+ lxur.lxur_xds = pcb->pcb_ds;
+ lxur.lxur_xes = pcb->pcb_es;
+ lxur.lxur_xfs = pcb->pcb_fs;
+ lxur.lxur_xgs = pcb->pcb_gs;
+ } else {
+ lxur.lxur_xds = rp->r_ds;
+ lxur.lxur_xes = rp->r_es;
+ lxur.lxur_xfs = rp->r_fs;
+ lxur.lxur_xgs = rp->r_gs;
+ }
+ kpreempt_enable();
+
+ if (copyout(&lxur, uregsp, sizeof (lxur)) != 0) {
+ return (EFAULT);
+ }
+
+ return (0);
+ }
+
+ default:
+ return (EIO);
+ }
+#else
+ cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__);
+ exit(CLD_KILLED, SIGSYS);
+ return (EIO);
+#endif /* __amd64 */
+}
+
+/*
+ * Revector the LWP to a Linux entrypoint on behalf of the B_JUMP_TO_LINUX
+ * brand system call: the registers, stack pointer and program counter are
+ * all taken from the user context at "ucp" by way of SETCONTEXT. The
+ * return value is that of getsetcontext() (or getsetcontext32() for an
+ * LWP that is not of the native data model).
+ */
+int
+lx_runexe(klwp_t *lwp, void *ucp)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ /*
+ * The only legitimate transitions into Linux code through this path
+ * start from the NATIVE or INIT stack modes.
+ */
+ VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_NATIVE ||
+ lwpd->br_stack_mode == LX_STACK_MODE_INIT);
+
+#if defined(__amd64)
+ if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE) {
+ return (getsetcontext32(SETCONTEXT, ucp));
+ }
+
+ /*
+ * Remember the %fsbase that native illumos code has established for
+ * this LWP before the new context replaces it.
+ */
+ lwpd->br_ntv_fsbase = lwp->lwp_pcb.pcb_fsbase;
+#endif
+
+ return (getsetcontext(SETCONTEXT, ucp));
+}
+
+/*
+ * The usermode emulation code is illumos library code. This routine ensures
+ * the segment registers are set up correctly for native illumos code. It
+ * should be called _after_ we have stored the outgoing Linux machine state
+ * but _before_ we return from the kernel to any illumos native code; e.g. the
+ * usermode emulation library, or any interposed signal handlers.
+ *
+ * See the comment on lwp_segregs_save() for how we handle the usermode
+ * registers when we come into the kernel and see update_sregs() for how we
+ * restore.
+ *
+ * On amd64 the work depends on the data model of the LWP: a 32-bit LWP has
+ * its %gs selector forced to LWPGS_SEL, and a 64-bit LWP has its %fsbase
+ * forced to the value recorded in br_ntv_fsbase (when that is non-zero).
+ * An unrecognised data model panics the system.
+ */
+void
+lx_switch_to_native(klwp_t *lwp)
+{
+#if defined(__amd64)
+ model_t datamodel = lwp_getdatamodel(lwp);
+
+ switch (datamodel) {
+ case DATAMODEL_ILP32: {
+ struct pcb *pcb = &lwp->lwp_pcb;
+
+ /*
+ * For 32-bit processes, we ensure that the correct %gs value
+ * is loaded:
+ */
+ kpreempt_disable();
+ if (pcb->pcb_rupdate == 1) {
+ /*
+ * If we are already flushing the segment registers,
+ * then ensure we are flushing the native %gs.
+ */
+ pcb->pcb_gs = LWPGS_SEL;
+ } else {
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * If we are not flushing the segment registers yet,
+ * only do so if %gs is not correct already:
+ */
+ if (rp->r_gs != LWPGS_SEL) {
+ pcb->pcb_gs = LWPGS_SEL;
+
+ /*
+ * Ensure we go out via update_sregs.
+ */
+ pcb->pcb_rupdate = 1;
+ }
+ }
+ kpreempt_enable();
+ break;
+ }
+
+ case DATAMODEL_LP64: {
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+ /*
+ * For 64-bit processes we ensure that the correct %fsbase
+ * value is loaded:
+ *
+ * NOTE(review): a zero br_ntv_fsbase is skipped here;
+ * presumably that means no native %fsbase has been recorded
+ * for this LWP yet -- confirm against where the field is set.
+ */
+ if (lwpd->br_ntv_fsbase != 0) {
+ struct pcb *pcb = &lwp->lwp_pcb;
+
+ kpreempt_disable();
+ if (pcb->pcb_fsbase != lwpd->br_ntv_fsbase) {
+ pcb->pcb_fsbase = lwpd->br_ntv_fsbase;
+
+ /*
+ * Ensure we go out via update_sregs.
+ */
+ pcb->pcb_rupdate = 1;
+ }
+ kpreempt_enable();
+ }
+ break;
+ }
+
+ default:
+ cmn_err(CE_PANIC, "unknown data model: %d", datamodel);
+ }
+#elif defined(__i386)
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * On a 32-bit kernel the native %gs selector is stored directly in
+ * the saved register set.
+ */
+ rp->r_gs = LWPGS_SEL;
+#else
+#error "unknown x86"
+#endif
+}
+
+#if defined(__amd64)
+/*
+ * Call frame for the 64-bit usermode emulation handler:
+ * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args)
+ *
+ * old sp: --------------------------------------------------------------
+ * | - ucontext_t (register state for emulation)
+ * | - uintptr_t[6] (system call arguments array)
+ * V --------------------------------------------------------------
+ * new sp: - bogus return address
+ *
+ * Arguments are passed in registers, per the AMD64 ABI: %rdi, %rsi and %rdx.
+ *
+ * This function builds that frame on the LWP's native stack, points the
+ * saved user registers at the handler recorded in the process's brand data
+ * (l_handler), and switches the LWP to LX_STACK_MODE_NATIVE. "args" is a
+ * kernel array of six system call arguments. If the native stack cannot
+ * be written, the process is terminated via exit(CLD_KILLED, SIGSEGV).
+ */
+void
+lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ label_t lab;
+ uintptr_t uc_addr;
+ uintptr_t args_addr;
+ uintptr_t top;
+ /*
+ * Variables used after on_fault() returns for a fault
+ * must be volatile.
+ */
+ volatile size_t frsz;
+ volatile uintptr_t sp;
+ volatile proc_t *p = lwptoproc(lwp);
+ volatile int watched;
+
+ /*
+ * We should not be able to get here unless we are running Linux
+ * code for a system call we cannot emulate in the kernel.
+ */
+ VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND);
+
+ /*
+ * The AMD64 ABI requires us to align the return address on the stack
+ * so that when the called function pushes %rbp, the stack is 16-byte
+ * aligned.
+ *
+ * This routine, like the amd64 version of sendsig(), depends on
+ * STACK_ALIGN being 16 and STACK_ENTRY_ALIGN being 8.
+ */
+#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8
+#error "lx_emulate_user() amd64 did not find the expected stack alignments"
+#endif
+
+ /*
+ * We begin at the current native stack pointer, and reserve space for
+ * the ucontext_t we are copying onto the stack, as well as the call
+ * arguments for the usermode emulation handler.
+ *
+ * We 16-byte align the entire frame, and then unalign it again by
+ * adding space for the return address.
+ */
+ frsz = SA(sizeof (ucontext_t)) + SA(6 * sizeof (uintptr_t)) +
+ sizeof (uintptr_t);
+ VERIFY((frsz & (STACK_ALIGN - 1UL)) == 8);
+ VERIFY((frsz & (STACK_ENTRY_ALIGN - 1UL)) == 0);
+
+ if (lwpd->br_ntv_stack == lwpd->br_ntv_stack_current) {
+ /*
+ * Nobody else is using the stack right now, so start at the
+ * top.
+ */
+ top = lwpd->br_ntv_stack_current;
+ } else {
+ /*
+ * Drop below the 128-byte reserved region of the stack frame
+ * we are interrupting.
+ */
+ top = lwpd->br_ntv_stack_current - STACK_RESERVE;
+ }
+ top = top & ~(STACK_ALIGN - 1);
+ sp = top - frsz;
+
+ /*
+ * Lay the frame out downwards from "top": the ucontext_t first, the
+ * argument array directly beneath it, and the return address slot at
+ * "sp".
+ */
+ uc_addr = top - SA(sizeof (ucontext_t));
+ args_addr = uc_addr - SA(6 * sizeof (uintptr_t));
+
+ /*
+ * Suspend write watchpoints over the frame while we populate it; they
+ * are re-enabled on both the success and the fault paths. A fault in
+ * any of the *_noerr() stores below is caught through on_fault() and
+ * diverts to "badstack".
+ */
+ watched = watch_disable_addr((caddr_t)sp, frsz, S_WRITE);
+ if (on_fault(&lab)) {
+ goto badstack;
+ }
+
+ /*
+ * Save the register state we preserved on the way into this brand
+ * system call and drop it on the native stack.
+ */
+ {
+ /*
+ * Note: the amd64 ucontext_t is 864 bytes.
+ */
+ ucontext_t uc;
+
+ /*
+ * We do not want to save the signal mask for an emulation
+ * context. Some emulated system calls alter the signal mask;
+ * restoring it when the emulation is complete would clobber
+ * those intentional side effects.
+ */
+ savecontext(&uc, NULL);
+
+ /*
+ * Mark this as a system call emulation context:
+ */
+ uc.uc_brand_data[0] = (void *)((uintptr_t)
+ uc.uc_brand_data[0] | LX_UC_FRAME_IS_SYSCALL);
+
+ copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc));
+ }
+
+ /*
+ * Record the new context as the LWP's "old context", firing a probe
+ * with both the previous and the new values.
+ */
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr);
+ lwp->lwp_oldcontext = (uintptr_t)uc_addr;
+
+ /*
+ * Copy the system call arguments out to userland:
+ */
+ copyout_noerr(args, (void *)(uintptr_t)args_addr,
+ 6 * sizeof (uintptr_t));
+
+ /*
+ * Drop the bogus return address on the stack.
+ */
+ suword64_noerr((void *)sp, 0);
+
+ no_fault();
+ if (watched) {
+ watch_enable_addr((caddr_t)sp, frsz, S_WRITE);
+ }
+
+ /*
+ * Pass the arguments to lx_emulate() in the appropriate registers.
+ */
+ rp->r_rdi = uc_addr;
+ rp->r_rsi = syscall_num;
+ rp->r_rdx = args_addr;
+
+ /*
+ * In order to be able to restore %edx, we need to JUSTRETURN.
+ */
+ lwp->lwp_eosys = JUSTRETURN;
+ curthread->t_post_sys = 1;
+ aston(curthread);
+
+ /*
+ * Set stack pointer and return address to the usermode emulation
+ * handler:
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ lx_lwp_set_native_stack_current(lwpd, sp);
+
+ /*
+ * Divert execution, on our return, to the usermode emulation stack
+ * and handler:
+ */
+ rp->r_fp = 0;
+ rp->r_sp = sp;
+ rp->r_pc = ptolxproc(p)->l_handler;
+
+ /*
+ * Fix up segment registers, etc.
+ */
+ lx_switch_to_native(lwp);
+
+ return;
+
+badstack:
+ /*
+ * The native stack could not be written: undo the fault handler and
+ * watchpoint state, then terminate the process.
+ */
+ no_fault();
+ if (watched) {
+ watch_enable_addr((caddr_t)sp, frsz, S_WRITE);
+ }
+
+#ifdef DEBUG
+ printf("lx_emulate_user: bad native stack cmd=%s, pid=%d, sp=0x%lx\n",
+ PTOU(p)->u_comm, p->p_pid, sp);
+#endif
+
+ exit(CLD_KILLED, SIGSEGV);
+}
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * Call frame for the 32-bit usermode emulation handler:
+ * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args)
+ *
+ * old sp: --------------------------------------------------------------
+ * | - ucontext_t (register state for emulation)
+ * | - uintptr_t[6] (system call arguments array)
+ * | --------------------------------------------------------------
+ * | - arg2: uintptr_t * (pointer to arguments array above)
+ * | - arg1: int (system call number)
+ * V - arg0: ucontext_t * (pointer to context above)
+ * new sp: - bogus return address
+ *
+ * "struct lx_emu_frame32" describes the lowest part of that frame, from
+ * the new stack pointer upwards: the return address followed by the three
+ * stack-passed arguments. The trailing comments give each member's byte
+ * offset from the new stack pointer.
+ */
+struct lx_emu_frame32 {
+ caddr32_t retaddr; /* 0 */
+ caddr32_t ucontextp; /* 4 */
+ int32_t syscall_num; /* 8 */
+ caddr32_t argsp; /* c */
+};
+
+/*
+ * This function arranges for the lwp to execute the usermode emulation handler
+ * for this system call. The mechanism is similar to signal handling, and this
+ * function is modelled on sendsig32().
+ *
+ * A ucontext32_t, the six system call arguments (narrowed to 32 bits) and
+ * a "struct lx_emu_frame32" are written to the LWP's native stack, the
+ * saved user registers are pointed at the handler recorded in the
+ * process's brand data (l_handler), and the LWP is switched to
+ * LX_STACK_MODE_NATIVE. "args" is a kernel array of six system call
+ * arguments. If the native stack cannot be written, the process is
+ * terminated via exit(CLD_KILLED, SIGSEGV).
+ */
+void
+lx_emulate_user32(klwp_t *lwp, int syscall_num, uintptr_t *args)
+{
+ lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+ struct regs *rp = lwptoregs(lwp);
+ label_t lab;
+ caddr32_t uc_addr;
+ caddr32_t args_addr;
+ caddr32_t top;
+ /*
+ * Variables used after on_fault() returns for a fault
+ * must be volatile.
+ */
+ volatile size_t frsz;
+ volatile caddr32_t sp;
+ volatile proc_t *p = lwptoproc(lwp);
+ volatile int watched;
+
+ /*
+ * We should not be able to get here unless we are running Linux
+ * code for a system call we cannot emulate in the kernel.
+ */
+ VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND);
+
+ /*
+ * We begin at the current native stack pointer, and reserve space for
+ * the ucontext_t we are copying onto the stack, as well as the call
+ * arguments for the usermode emulation handler.
+ */
+ frsz = SA32(sizeof (ucontext32_t)) + SA32(6 * sizeof (uint32_t)) +
+ SA32(sizeof (struct lx_emu_frame32));
+ VERIFY((frsz & (STACK_ALIGN32 - 1)) == 0);
+
+ top = (caddr32_t)(lwpd->br_ntv_stack_current & ~(STACK_ALIGN32 - 1));
+ sp = top - frsz;
+
+ /*
+ * Lay the frame out downwards from "top": the ucontext32_t first, the
+ * argument array directly beneath it, and the call frame at "sp".
+ */
+ uc_addr = top - SA32(sizeof (ucontext32_t));
+ args_addr = uc_addr - SA32(6 * sizeof (uint32_t));
+
+ /*
+ * Suspend write watchpoints over the frame while we populate it; they
+ * are re-enabled on both the success and the fault paths. A fault in
+ * any of the copyout_noerr() calls below is caught through on_fault()
+ * and diverts to "badstack".
+ */
+ watched = watch_disable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE);
+ if (on_fault(&lab)) {
+ goto badstack;
+ }
+
+ /*
+ * Save the register state we preserved on the way into this brand
+ * system call and drop it on the native stack.
+ */
+ {
+ /*
+ * Note: ucontext32_t is 512 bytes.
+ */
+ ucontext32_t uc;
+
+ /*
+ * We do not want to save the signal mask for an emulation
+ * context. Some emulated system calls alter the signal mask;
+ * restoring it when the emulation is complete would clobber
+ * those intentional side effects.
+ */
+ savecontext32(&uc, NULL);
+
+ /*
+ * Mark this as a system call emulation context:
+ */
+ uc.uc_brand_data[0] |= LX_UC_FRAME_IS_SYSCALL;
+ copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc));
+ }
+
+ /*
+ * Record the new context as the LWP's "old context", firing a probe
+ * with both the previous and the new values.
+ */
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr);
+ lwp->lwp_oldcontext = (uintptr_t)uc_addr;
+
+ /*
+ * Copy the system call arguments out to userland:
+ *
+ * The kernel holds them as uintptr_t; each is narrowed to 32 bits for
+ * the 32-bit handler.
+ */
+ {
+ uint32_t args32[6];
+
+ args32[0] = args[0];
+ args32[1] = args[1];
+ args32[2] = args[2];
+ args32[3] = args[3];
+ args32[4] = args[4];
+ args32[5] = args[5];
+
+ copyout_noerr(&args32, (void *)(uintptr_t)args_addr,
+ sizeof (args32));
+ }
+
+ /*
+ * Assemble the call frame on the stack.
+ */
+ {
+ struct lx_emu_frame32 frm;
+
+ frm.retaddr = 0;
+ frm.ucontextp = uc_addr;
+ frm.argsp = args_addr;
+ frm.syscall_num = syscall_num;
+
+ copyout_noerr(&frm, (void *)(uintptr_t)sp, sizeof (frm));
+ }
+
+ no_fault();
+ if (watched) {
+ watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE);
+ }
+
+ /*
+ * Set stack pointer and return address to the usermode emulation
+ * handler:
+ */
+ lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+ lx_lwp_set_native_stack_current(lwpd, sp);
+
+ /*
+ * Divert execution, on our return, to the usermode emulation stack
+ * and handler:
+ */
+ rp->r_fp = 0;
+ rp->r_sp = sp;
+ rp->r_pc = ptolxproc(p)->l_handler;
+
+ /*
+ * Fix up segment registers, etc.
+ */
+ lx_switch_to_native(lwp);
+
+ return;
+
+badstack:
+ /*
+ * The native stack could not be written: undo the fault handler and
+ * watchpoint state, then terminate the process.
+ */
+ no_fault();
+ if (watched) {
+ watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE);
+ }
+
+#ifdef DEBUG
+ printf("lx_emulate_user32: bad native stack cmd=%s, pid=%d, sp=0x%x\n",
+ PTOU(p)->u_comm, p->p_pid, sp);
+#endif
+
+ exit(CLD_KILLED, SIGSEGV);
+}
+#endif /* _SYSCALL32_IMPL */
+
+#else /* !__amd64 (__i386) */
+
+/*
+ * Variant of lx_emulate_user() for kernels not built for amd64. Usermode
+ * emulation is not supported there: a warning naming this function is
+ * logged and exit(CLD_KILLED, SIGSYS) is called. None of the arguments
+ * are used.
+ */
+void
+lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args)
+{
+ cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__);
+ exit(CLD_KILLED, SIGSYS);
+}
+
+#endif /* __amd64 */
diff --git a/usr/src/uts/intel/brand/lx/lx_brand_asm.s b/usr/src/uts/intel/brand/lx/lx_brand_asm.s
deleted file mode 100644
index 568d462c2c..0000000000
--- a/usr/src/uts/intel/brand/lx/lx_brand_asm.s
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright 2014 Joyent, Inc. All rights reserved.
- */
-
-#if defined(__lint)
-
-#include <sys/systm.h>
-
-#else /* __lint */
-
-#include <sys/controlregs.h>
-#include "genassym.h"
-#include "../common/brand_asm.h"
-
-#endif /* __lint */
-
-#ifdef __lint
-
-void
-lx_brand_int80_callback(void)
-{
-}
-
-void
-lx_brand_syscall_callback(void)
-{
-}
-
-#else /* __lint */
-
-#if defined(__amd64)
-
-/*
- * syscall handler for 32-bit Linux user processes:
- * See "64-BIT INTERPOSITION STACK" in brand_asm.h.
- */
-ENTRY(lx_brand_int80_callback)
- GET_PROCP(SP_REG, 0, %r15)
- movq P_ZONE(%r15), %r15 /* grab the zone pointer */
- /* grab the 'max syscall num' for this process from 'zone brand data' */
- movq ZONE_BRAND_DATA(%r15), %r15 /* grab the zone brand ptr */
- movl LXZD_MAX_SYSCALL(%r15), %r15d /* get the 'max sysnum' word */
- cmpq %r15, %rax /* is 0 <= syscall <= MAX? */
- jbe 0f /* yes, syscall is OK */
- xorl %eax, %eax /* no, zero syscall number */
-0:
-
-.lx_brand_int80_patch_point:
- jmp .lx_brand_int80_notrace
-
-.lx_brand_int80_notrace:
- CALC_TABLE_ADDR(%r15, L_HANDLER)
-1:
- movq %r15, %rax
- GET_V(%rsp, 0, V_SSP, %rsp) /* restore intr. stack pointer */
- xchgq (%rsp), %rax /* swap %rax and return addr */
- jmp sys_sysint_swapgs_iret
-
-.lx_brand_int80_trace:
- /*
- * If tracing is active, we vector to an alternate trace-enabling
- * handler table instead.
- */
- CALC_TABLE_ADDR(%r15, L_TRACEHANDLER)
- jmp 1b
-SET_SIZE(lx_brand_int80_callback)
-
-#define PATCH_POINT80 _CONST(.lx_brand_int80_patch_point + 1)
-#define PATCH_VAL80 _CONST(.lx_brand_int80_trace - .lx_brand_int80_notrace)
-
-ENTRY(lx_brand_int80_enable)
- movl $1, lx_systrace_brand_enabled(%rip)
- movq $PATCH_POINT80, %r8
- movb $PATCH_VAL80, (%r8)
- ret
-SET_SIZE(lx_brand_int80_enable)
-
-ENTRY(lx_brand_int80_disable)
- movq $PATCH_POINT80, %r8
- movb $0, (%r8)
- movl $0, lx_systrace_brand_enabled(%rip)
- ret
-SET_SIZE(lx_brand_int80_disable)
-
-
-/*
- * syscall handler for 64-bit user processes:
- *
- * We're running on the kernel's %gs.
- *
- * We return directly to userland, bypassing the update_sregs() logic, so
- * this routine must NOT do anything that could cause a context switch.
- *
- * %rax - syscall number
- *
- * See uts/i86pc/ml/syscall_asm_amd64.s for what happens before we get into
- * the following lx brand-specific codepath.
- *
- * As the comment on the BRAND_CALLBACK macro describes, when we're called, all
- * general registers, except for %r15, are as they were when the user process
- * made the system call. %r15 is available to the callback as a scratch
- * register. If the callback returns to the kernel path, %r15 does not have to
- * be restored to the user value since BRAND_CALLBACK does that. If we jump
- * out to the emulation we need to restore %r15 here.
- *
- * To 'return' to our user-space handler, we just need to place its address
- * into %rcx. The original return address is passed back in %rax.
- *
- * Since this is the common syscall path for all 64-bit code (both Linux and
- * native libc) in the branded zone (unlike the int80 path), we have to do a
- * bit more checking to see if interpositioning is in effect (i.e. syscalls
- * from the native ld.so.1 are not interposed since the emulation has not yet
- * been installed, or the emulation is in native syscall mode).
- */
-ENTRY(lx_brand_syscall_callback)
- /* callback prologue */
- GET_PROCP(SP_REG, 0, %r15)
- mov __P_BRAND_DATA(%r15), %r15 /* get p_brand_data */
- cmp $0, %r15 /* null ptr? */
- je 2f /* yes, take normal ret path */
- cmp $0, L_HANDLER(%r15) /* handler installed? */
- je 2f /* no, take normal ret path */
-
- /* check for native vs. Linux syscall */
- GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */
- movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */
- movl BR_NTV_SYSCALL(%r15), %r15d /* grab syscall src flag */
- cmp $1, %r15 /* check for native syscall */
- je 2f /* is native, stay in kernel */
-
- /* Linux syscall - subsequent emul. syscalls will use native mode */
- GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */
- movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */
- movl $1, BR_NTV_SYSCALL(%r15) /* set native syscall flag */
-
- /* check if we have to restore native fsbase */
- GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */
- movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */
- movq BR_NTV_FSBASE(%r15), %r15 /* grab native fsbase */
- cmp $0, %r15 /* native fsbase not saved? */
- je 3f /* yes, skip loading */
-
-#ifdef DEBUG
- /*
- * This block is basically similar to a large assert.
- *
- * In debug code we do some extra validation of the %fsbase register to
- * validate that we always have the expected Linux thread pointer and
- * not the native value. At this point we know that the lwp brand data
- * should contain the Linux %fsbase (from a Linux arch_prctl syscall)
- * since the native %fsbase check above is non-null. We also know that
- * we are making a Linux syscall from the other check above. We read
- * the %fsbase and compare to the saved Linux %fsbase in the lwp_brand
- * data. If we don't have the expected value, we save the incorrect
- * %fsbase value into the br_lx_fsbase member for later inspection and
- * change the syscall we are making into the Linux pivot_root syscall
- * (an obscure syscall which we don't support and which an app in the
- * zone cannot use). This allows us to see this error downstream via
- * DTrace and see the incorrect %fsbase value we had.
- */
- GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */
- movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */
- movq BR_LX_FSBASE(%r15), %r15 /* grab Linux fsbase */
-
- subq $24, %rsp /* make room for 3 regs */
- movq %rax, 0x0(%rsp) /* save regs used by rdmsr */
- movq %rcx, 0x8(%rsp)
- movq %rdx, 0x10(%rsp)
-
- movl $MSR_AMD_FSBASE, %ecx /* fsbase msr */
- rdmsr /* get fsbase to edx:eax */
-
- /* fix %edx; %eax lo already ok */
- shlq $32, %rdx
- or %rdx, %rax /* full value in %rax */
- cmp %rax, %r15 /* check if is lx fsbase */
- je 4f /* match, ok */
-
- movq %rax, %rdi /* pass bad fsbase as arg0 */
- movq $155, %rax /* fail! use pivot_root */
- jmp 5f
-
-4:
- movq 0x0(%rsp), %rax /* restore %rax */
-5:
- movq 0x8(%rsp), %rcx /* restore other regs */
- movq 0x10(%rsp), %rdx
- addq $24, %rsp
-
- /* reload r15 with the native value */
- GET_V(SP_REG, 0, V_LWP, %r15); /* get lwp pointer */
- movq LWP_BRAND(%r15), %r15 /* grab lx lwp data pointer */
- movq BR_NTV_FSBASE(%r15), %r15 /* grab native fsbase */
-#endif
-
- /*
- * Switch fsbase from Linux value back to native value. Also update pcb
- * so that if we service an interrupt we will restore the correct fsbase
- * in update_sregs().
- */
- subq $24, %rsp /* make room for 3 regs */
- movq %rax, 0x0(%rsp) /* save regs used by wrmsr */
- movq %rcx, 0x8(%rsp)
- movq %rdx, 0x10(%rsp)
- movq %r15, %rax /* native fsbase to %rax */
- movq %rax, %rdx /* setup regs for wrmsr */
- shrq $32, %rdx /* fix %edx; %eax already ok */
- movl $MSR_AMD_FSBASE, %ecx /* fsbase msr */
- wrmsr /* set fsbase from edx:eax */
- movq %rsp, %rdx /* use rdx as temp sp */
- addq $24, %rdx /* fix it back up */
- GET_V(%rdx, 0, V_LWP, %r15); /* get lwp pointer */
- movq %rax, LWP_PCB_FSBASE(%r15) /* save native fsbase in pcb */
- movq 0x0(%rsp), %rax /* restore regs */
- movq 0x8(%rsp), %rcx
- movq 0x10(%rsp), %rdx
- addq $24, %rsp
-
-3:
- /*
- * Linux syscall - validate syscall number.
- * If necessary, the Linux %fsbase has already been loaded above.
- */
- GET_PROCP(SP_REG, 0, %r15)
- movq P_ZONE(%r15), %r15 /* grab the zone pointer */
- /* grab the 'max syscall num' for this process from 'zone brand data' */
- movq ZONE_BRAND_DATA(%r15), %r15 /* grab the zone brand ptr */
- movl LXZD_MAX_SYSCALL(%r15), %r15d /* get the 'max sysnum' word */
- cmp %r15, %rax /* is 0 <= syscall <= MAX? */
- ja 2f /* no, take normal ret path */
-
-.lx_brand_syscall_patch_point:
- jmp .lx_brand_syscall_notrace
-.lx_brand_syscall_notrace:
-
- CALC_TABLE_ADDR(%r15, L_HANDLER)
-1:
- mov %rcx, %rax; /* save orig return addr in syscall_reg */
- mov %r15, %rcx; /* place new return addr in %rcx */
- mov %gs:CPU_RTMP_R15, %r15; /* restore scratch register */
- mov V_SSP(SP_REG), SP_REG /* restore user stack pointer */
- jmp nopop_sys_syscall_swapgs_sysretq
-
-2: /* no emulation, continue normal system call flow */
- retq
-
-.lx_brand_syscall_trace:
- /*
- * If tracing is active, we vector to an alternate trace-enabling
- * handler table instead.
- */
- CALC_TABLE_ADDR(%r15, L_TRACEHANDLER)
- jmp 1b
-SET_SIZE(lx_brand_syscall_callback)
-
-#define PATCH_POINT_SC _CONST(.lx_brand_syscall_patch_point + 1)
-#define PATCH_VAL_SC \
- _CONST(.lx_brand_syscall_trace - .lx_brand_syscall_notrace)
-
-ENTRY(lx_brand_syscall_enable)
- movl $1, lx_systrace_brand_enabled(%rip)
- movq $PATCH_POINT_SC, %r8
- movb $PATCH_VAL_SC, (%r8)
- ret
-SET_SIZE(lx_brand_syscall_enable)
-
-ENTRY(lx_brand_syscall_disable)
- movq $PATCH_POINT_SC, %r8
- movb $0, (%r8)
- movl $0, lx_systrace_brand_enabled(%rip)
- ret
-SET_SIZE(lx_brand_syscall_disable)
-
-
-#elif defined(__i386)
-
-/*
- * See "32-BIT INTERPOSITION STACK" in brand_asm.h.
- */
-ENTRY(lx_brand_int80_callback)
- GET_PROCP(SP_REG, 0, %ebx)
- movl P_ZONE(%ebx), %ebx /* grab the zone pointer */
- /* grab the 'max syscall num' for this process from 'zone brand data' */
- movl ZONE_BRAND_DATA(%ebx), %ebx /* grab the zone brand data */
- movl LXZD_MAX_SYSCALL(%ebx), %ebx /* get the max sysnum */
-
- cmpl %ebx, %eax /* is 0 <= syscall <= MAX? */
- jbe 0f /* yes, syscall is OK */
- xorl %eax, %eax /* no, zero syscall number */
-0:
-
-.lx_brand_int80_patch_point:
- jmp .lx_brand_int80_notrace
-
-.lx_brand_int80_notrace:
- CALC_TABLE_ADDR(%ebx, L_HANDLER)
-
-1:
- movl %ebx, %eax
- GET_V(%esp, 0, V_U_EBX, %ebx) /* restore scratch register */
- addl $V_END, %esp /* restore intr. stack ptr */
- xchgl (%esp), %eax /* swap new and orig. return addrs */
- jmp nopop_sys_rtt_syscall
-
-.lx_brand_int80_trace:
- CALC_TABLE_ADDR(%ebx, L_TRACEHANDLER)
- jmp 1b
-SET_SIZE(lx_brand_int80_callback)
-
-
-#define PATCH_POINT _CONST(.lx_brand_int80_patch_point + 1)
-#define PATCH_VAL _CONST(.lx_brand_int80_trace - .lx_brand_int80_notrace)
-
-ENTRY(lx_brand_int80_enable)
- pushl %ebx
- pushl %eax
- movl $1, lx_systrace_brand_enabled
- movl $PATCH_POINT, %ebx
- movl $PATCH_VAL, %eax
- movb %al, (%ebx)
- popl %eax
- popl %ebx
- ret
-SET_SIZE(lx_brand_int80_enable)
-
-ENTRY(lx_brand_int80_disable)
- pushl %ebx
- movl $PATCH_POINT, %ebx
- movb $0, (%ebx)
- movl $0, lx_systrace_brand_enabled
- popl %ebx
- ret
-SET_SIZE(lx_brand_int80_disable)
-
-#endif /* __i386 */
-#endif /* __lint */
diff --git a/usr/src/uts/intel/genassym/offsets.in b/usr/src/uts/intel/genassym/offsets.in
index 59763c1b4b..70221c02f9 100644
--- a/usr/src/uts/intel/genassym/offsets.in
+++ b/usr/src/uts/intel/genassym/offsets.in
@@ -21,7 +21,7 @@
\
\ Copyright 2010 Sun Microsystems, Inc. All rights reserved.
\ Use is subject to license terms.
-\ Copyright 2014 Joyent, Inc. All rights reserved.
+\ Copyright 2015 Joyent, Inc.
\
\
@@ -37,13 +37,7 @@
lx_proc_data
l_handler
- l_tracehandler
- l_traceflag
-
-lx_zone_data
- lxzd_max_syscall
lx_lwp_data
- br_ntv_syscall
br_lx_fsbase
br_ntv_fsbase
diff --git a/usr/src/uts/intel/ia32/os/archdep.c b/usr/src/uts/intel/ia32/os/archdep.c
index 42cc0d4d10..db4ccac06b 100644
--- a/usr/src/uts/intel/ia32/os/archdep.c
+++ b/usr/src/uts/intel/ia32/os/archdep.c
@@ -25,7 +25,7 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
*/
@@ -575,6 +575,13 @@ ucontext_32ton(const ucontext32_t *src, ucontext_t *dst)
if (src->uc_flags & UC_FPU)
fpregset_32ton(&src->uc_mcontext.fpregs,
&dst->uc_mcontext.fpregs);
+
+ /*
+ * Copy the brand-private data:
+ */
+ dst->uc_brand_data[0] = (void *)(uintptr_t)src->uc_brand_data[0];
+ dst->uc_brand_data[1] = (void *)(uintptr_t)src->uc_brand_data[1];
+ dst->uc_brand_data[2] = (void *)(uintptr_t)src->uc_brand_data[2];
}
#endif /* _SYSCALL32_IMPL */
@@ -633,7 +640,7 @@ static greg_t
fix_segreg(greg_t sr, int iscs, model_t datamodel)
{
kthread_t *t = curthread;
-
+
switch (sr &= 0xffff) {
case 0:
@@ -669,7 +676,7 @@ fix_segreg(greg_t sr, int iscs, model_t datamodel)
break;
}
- /*
+ /*
* Allow this process's brand to do any necessary segment register
* manipulation.
*/
diff --git a/usr/src/uts/intel/ia32/os/sendsig.c b/usr/src/uts/intel/ia32/os/sendsig.c
index 979c9e3294..f6c14324bc 100644
--- a/usr/src/uts/intel/ia32/os/sendsig.c
+++ b/usr/src/uts/intel/ia32/os/sendsig.c
@@ -20,6 +20,9 @@
*/
/*
+ * Copyright 2015 Joyent, Inc.
+ */
+/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -87,6 +90,8 @@
#include <sys/kdi.h>
#include <sys/contract_impl.h>
#include <sys/x86_archext.h>
+#include <sys/brand.h>
+#include <sys/sdt.h>
/*
* Construct the execution environment for the user's signal
@@ -186,7 +191,18 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
!(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));
- if (newstack) {
+ /*
+ * If this is a branded process, the brand may provide an alternate
+ * stack pointer for signal delivery:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) {
+ /*
+ * Use the stack pointer value provided by the brand,
+ * accounting for the 128-byte reserved region.
+ */
+ newstack = 0;
+ fp = BROP(p)->b_sendsig_stack(sig) - STACK_RESERVE;
+ } else if (newstack) {
fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN);
} else {
@@ -293,6 +309,8 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
kmem_free(tuc, sizeof (*tuc));
tuc = NULL;
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc);
lwp->lwp_oldcontext = (uintptr_t)uc;
if (newstack) {
@@ -342,6 +360,14 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
}
/*
+ * Allow the brand to perform additional book-keeping once the signal
+ * handling frame has been fully assembled:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) {
+ BROP(p)->b_sendsig(sig);
+ }
+
+ /*
* Don't set lwp_eosys here. sendsig() is called via psig() after
* lwp_eosys is handled, so setting it here would affect the next
* system call.
@@ -417,7 +443,17 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)())
newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
!(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));
- if (newstack) {
+ /*
+ * If this is a branded process, the brand may provide an alternate
+ * stack pointer for signal delivery:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) {
+ /*
+ * Use the stack pointer value provided by the brand:
+ */
+ newstack = 0;
+ fp = BROP(p)->b_sendsig_stack(sig);
+ } else if (newstack) {
fp = (caddr_t)(SA32((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
SA32(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN32);
} else if ((rp->r_ss & 0xffff) != UDS_SEL) {
@@ -432,8 +468,9 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)())
USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]);
else
fp = (caddr_t)rp->r_sp;
- } else
+ } else {
fp = (caddr_t)rp->r_sp;
+ }
/*
* Force proper stack pointer alignment, even in the face of a
@@ -511,6 +548,8 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)())
kmem_free(tuc, sizeof (*tuc));
tuc = NULL;
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc);
lwp->lwp_oldcontext = (uintptr_t)uc;
if (newstack) {
@@ -560,6 +599,14 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)())
}
/*
+ * Allow the brand to perform additional book-keeping once the signal
+ * handling frame has been fully assembled:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) {
+ BROP(p)->b_sendsig(sig);
+ }
+
+ /*
* Don't set lwp_eosys here. sendsig() is called via psig() after
* lwp_eosys is handled, so setting it here would affect the next
* system call.
@@ -637,7 +684,17 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
!(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));
- if (newstack) {
+ /*
+ * If this is a branded process, the brand may provide an alternate
+ * stack pointer for signal delivery:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) {
+ /*
+ * Use the stack pointer value provided by the brand:
+ */
+ newstack = 0;
+ fp = BROP(p)->b_sendsig_stack(sig);
+ } else if (newstack) {
fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN);
} else if ((rp->r_ss & 0xffff) != UDS_SEL) {
@@ -652,8 +709,9 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]);
else
fp = (caddr_t)rp->r_sp;
- } else
+ } else {
fp = (caddr_t)rp->r_sp;
+ }
/*
* Force proper stack pointer alignment, even in the face of a
@@ -731,6 +789,8 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
kmem_free(tuc, sizeof (*tuc));
tuc = NULL;
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc);
lwp->lwp_oldcontext = (uintptr_t)uc;
if (newstack) {
@@ -768,6 +828,14 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
}
/*
+ * Allow the brand to perform additional book-keeping once the signal
+ * handling frame has been fully assembled:
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) {
+ BROP(p)->b_sendsig(sig);
+ }
+
+ /*
* Don't set lwp_eosys here. sendsig() is called via psig() after
* lwp_eosys is handled, so setting it here would affect the next
* system call.
diff --git a/usr/src/uts/intel/ia32/syscall/getcontext.c b/usr/src/uts/intel/ia32/syscall/getcontext.c
index cb5a5b52ba..8f72b5da72 100644
--- a/usr/src/uts/intel/ia32/syscall/getcontext.c
+++ b/usr/src/uts/intel/ia32/syscall/getcontext.c
@@ -20,6 +20,9 @@
*/
/*
+ * Copyright 2015 Joyent, Inc.
+ */
+/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -46,6 +49,7 @@
#include <sys/schedctl.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
+#include <sys/sdt.h>
/*
* Save user context.
@@ -125,7 +129,23 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask)
else
ucp->uc_flags &= ~UC_FPU;
- sigktou(mask, &ucp->uc_sigmask);
+ if (mask != NULL) {
+ /*
+ * Save signal mask.
+ */
+ sigktou(mask, &ucp->uc_sigmask);
+ } else {
+ ucp->uc_flags &= ~UC_SIGMASK;
+ bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask));
+ }
+
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext != NULL) {
+ /*
+ * Allow the brand the chance to modify the context we
+ * saved:
+ */
+ BROP(p)->b_savecontext(ucp);
+ }
}
/*
@@ -136,7 +156,19 @@ restorecontext(ucontext_t *ucp)
{
kthread_t *t = curthread;
klwp_t *lwp = ttolwp(t);
+ proc_t *p = lwptoproc(lwp);
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_restorecontext != NULL) {
+ /*
+ * Allow the brand the chance to modify the context before
+ * we restore it:
+ */
+ BROP(p)->b_restorecontext(ucp);
+ }
+
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext,
+ uintptr_t, (uintptr_t)ucp->uc_link);
lwp->lwp_oldcontext = (uintptr_t)ucp->uc_link;
if (ucp->uc_flags & UC_STACK) {
@@ -184,6 +216,7 @@ getsetcontext(int flag, void *arg)
ucontext_t *ucp;
klwp_t *lwp = ttolwp(curthread);
stack_t dummy_stk;
+ proc_t *p = lwptoproc(lwp);
/*
* In future releases, when the ucontext structure grows,
@@ -228,6 +261,15 @@ getsetcontext(int flag, void *arg)
return (set_errno(EFAULT));
}
+ /*
+ * If this is a branded process, copy in the brand-private
+ * data:
+ */
+ if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data,
+ &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
restorecontext(&uc);
if ((uc.uc_flags & UC_STACK) && (lwp->lwp_ustack != 0))
@@ -311,7 +353,23 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask)
else
ucp->uc_flags &= ~UC_FPU;
- sigktou(mask, &ucp->uc_sigmask);
+ if (mask != NULL) {
+ /*
+ * Save signal mask.
+ */
+ sigktou(mask, &ucp->uc_sigmask);
+ } else {
+ ucp->uc_flags &= ~UC_SIGMASK;
+ bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask));
+ }
+
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext32 != NULL) {
+ /*
+ * Allow the brand the chance to modify the context we
+ * saved:
+ */
+ BROP(p)->b_savecontext32(ucp);
+ }
}
int
@@ -323,6 +381,7 @@ getsetcontext32(int flag, void *arg)
klwp_t *lwp = ttolwp(curthread);
caddr32_t ustack32;
stack32_t dummy_stk32;
+ proc_t *p = lwptoproc(lwp);
switch (flag) {
default:
@@ -354,6 +413,15 @@ getsetcontext32(int flag, void *arg)
return (set_errno(EFAULT));
}
+ /*
+ * If this is a branded process, copy in the brand-private
+ * data:
+ */
+ if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data,
+ &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) {
+ return (set_errno(EFAULT));
+ }
+
ucontext_32ton(&uc, &ucnat);
restorecontext(&ucnat);
diff --git a/usr/src/uts/intel/lx_brand/Makefile.rules b/usr/src/uts/intel/lx_brand/Makefile.rules
index e78bcb1827..0a83e15493 100644
--- a/usr/src/uts/intel/lx_brand/Makefile.rules
+++ b/usr/src/uts/intel/lx_brand/Makefile.rules
@@ -21,7 +21,7 @@
#
# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-# Copyright 2014 Joyent, Inc. All rights reserved.
+# Copyright 2015 Joyent, Inc.
#
#
@@ -44,15 +44,23 @@ $(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
+$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.s
- $(COMPILE.s) -o $@ $<
+ $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $<
$(OBJS_DIR_OBJ64)/%.o: $(LX_CMN)/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.s
- $(COMPILE.s) -o $@ $<
+ $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $<
$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/os/%.c
$(COMPILE.c) -o $@ $<
@@ -62,12 +70,16 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(LX_CMN)/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
$(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.s
- $(COMPILE.s) -o $@ $<
+ $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $<
#
# Section 1b: Lint `object' build rules.
diff --git a/usr/src/uts/intel/sys/ucontext.h b/usr/src/uts/intel/sys/ucontext.h
index acd6ddc99e..26f5923930 100644
--- a/usr/src/uts/intel/sys/ucontext.h
+++ b/usr/src/uts/intel/sys/ucontext.h
@@ -20,6 +20,9 @@
*/
/*
+ * Copyright 2015 Joyent, Inc.
+ */
+/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -82,9 +85,16 @@ struct __ucontext {
sigset_t uc_sigmask;
stack_t uc_stack;
mcontext_t uc_mcontext;
- long uc_filler[5]; /* see ABI spec for Intel386 */
+ /*
+ * The Intel386 ABI specification includes a 5-element array of longs
+ * called "uc_filler", padding the size of the struct to 512 bytes. To
+ * allow zone brands to communicate extra data right the way through
+ * the signal handling process, from sigacthandler to setcontext, we
+ * steal the first three of these longs as a brand-private member.
+ */
+ void *uc_brand_data[3];
+ long uc_filler[2];
};
-
#if defined(_SYSCALL32)
/* Kernel view of user ILP32 ucontext structure */
@@ -95,7 +105,8 @@ typedef struct ucontext32 {
sigset_t uc_sigmask;
stack32_t uc_stack;
mcontext32_t uc_mcontext;
- int32_t uc_filler[5];
+ caddr32_t uc_brand_data[3];
+ int32_t uc_filler[2];
} ucontext32_t;
#if defined(_KERNEL)
diff --git a/usr/src/uts/sparc/syscall/getcontext.c b/usr/src/uts/sparc/syscall/getcontext.c
index 437eef5e1a..fd0acaadf1 100644
--- a/usr/src/uts/sparc/syscall/getcontext.c
+++ b/usr/src/uts/sparc/syscall/getcontext.c
@@ -20,6 +20,9 @@
*/
/*
+ * Copyright 2015 Joyent, Inc.
+ */
+/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -110,10 +113,15 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask)
ucp->uc_flags &= ~UC_FPU;
ucp->uc_mcontext.gwins = (gwindows_t *)NULL;
- /*
- * Save signal mask.
- */
- sigktou(mask, &ucp->uc_sigmask);
+ if (mask != NULL) {
+ /*
+ * Save signal mask.
+ */
+ sigktou(mask, &ucp->uc_sigmask);
+ } else {
+ ucp->uc_flags &= ~UC_SIGMASK;
+ bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask));
+ }
}
@@ -412,11 +420,16 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask, struct fq32 *dfq)
ucp->uc_flags &= ~UC_FPU;
ucp->uc_mcontext.gwins = (caddr32_t)NULL;
- /*
- * Save signal mask (the 32- and 64-bit sigset_t structures are
- * identical).
- */
- sigktou(mask, (sigset_t *)&ucp->uc_sigmask);
+ if (mask != NULL) {
+ /*
+ * Save signal mask (the 32- and 64-bit sigset_t structures are
+ * identical).
+ */
+ sigktou(mask, (sigset_t *)&ucp->uc_sigmask);
+ } else {
+ ucp->uc_flags &= ~UC_SIGMASK;
+ bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask));
+ }
}
int