diff options
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2016-12-08 20:07:05 +0000 |
|---|---|---|
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2016-12-08 21:40:48 +0000 |
| commit | 83390469eaf76687ae434504ed3e41fdbe4ae3b6 (patch) | |
| tree | 011a030673def2c9511ad60cc9976fb987049b2f | |
| parent | 060157c37b10d81a8a264aeb85849663571caa8b (diff) | |
| download | illumos-joyent-83390469eaf76687ae434504ed3e41fdbe4ae3b6.tar.gz | |
OS-5805 chromium depends on CLONE_FS w/o full SHARED_AS
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Ryan Zezeski <ryan.zeseski@joyent.com>
Approved by: Patrick Mooney <patrick.mooney@joyent.com>
| -rw-r--r-- | usr/src/common/brand/lx/lx_syscall.h | 14 | ||||
| -rw-r--r-- | usr/src/lib/brand/lx/lx_brand/common/clone.c | 20 | ||||
| -rw-r--r-- | usr/src/lib/brand/lx/lx_brand/common/fork.c | 4 | ||||
| -rw-r--r-- | usr/src/lib/brand/lx/lx_brand/common/ptrace.c | 4 | ||||
| -rw-r--r-- | usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/os/lx_brand.c | 20 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/os/lx_misc.c | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/sys/lx_brand.h | 19 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/sys/lx_misc.h | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_clone.c | 372 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_miscsys.c | 87 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/lx/syscall/lx_umask.c | 27 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/sn1/sn1_brand.c | 3 | ||||
| -rw-r--r-- | usr/src/uts/common/brand/solaris10/s10_brand.c | 3 | ||||
| -rw-r--r-- | usr/src/uts/common/os/brand.c | 5 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/brand.h | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/syscall/chdir.c | 29 |
17 files changed, 596 insertions, 29 deletions
diff --git a/usr/src/common/brand/lx/lx_syscall.h b/usr/src/common/brand/lx/lx_syscall.h index 54fb196b5a..a0292023a0 100644 --- a/usr/src/common/brand/lx/lx_syscall.h +++ b/usr/src/common/brand/lx/lx_syscall.h @@ -88,6 +88,20 @@ extern "C" { #define LX_CLONE_DETACH 0x00400000 #define LX_CLONE_CHILD_SETTID 0x01000000 +#define SHARED_AS \ + (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND | \ + LX_CLONE_THREAD) + +/* + * Valid clone flags when not a full process or full thread (SHARED_AS). This + * can be expanded as additional clone-group support is added. + */ +#define LX_CLONE_GRP_SUBSET (LX_CLONE_FS) + +#define LX_IS_CLONE_GRP(X) ((X & SHARED_AS) != 0 && \ + (X & SHARED_AS) != SHARED_AS && \ + ((X & SHARED_AS) & ~LX_CLONE_GRP_SUBSET) == 0) + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/brand/lx/lx_brand/common/clone.c b/usr/src/lib/brand/lx/lx_brand/common/clone.c index 586c042a83..698d13f9d1 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/clone.c +++ b/usr/src/lib/brand/lx/lx_brand/common/clone.c @@ -53,10 +53,6 @@ #include <sys/debug.h> #include <lx_syscall.h> - -#define SHARED_AS \ - (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND \ - | LX_CLONE_THREAD) #define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK) #define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH) @@ -364,13 +360,15 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, uintptr_t p5) * Inform the in-kernel ptrace(2) subsystem that we are about to * emulate a fork(2), vfork(2) or clone(2) system call. */ - lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE)); + lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE), flags); /* - * Handle a fork(2) operation here. If this is not a fork, a new - * thread will be created after this block. + * Handle a fork(2) operation here. If this is not a fork, a new + * thread will be created after this block. 
We can also create a new + * clone-group here (when two or more processes share data represented + * by a subset of the SHARED_AS flags, but not a true thread). */ - if (IS_FORK(flags) || IS_VFORK(flags)) { + if (IS_FORK(flags) || IS_VFORK(flags) || LX_IS_CLONE_GRP(flags)) { if (flags & LX_CLONE_PARENT) { lx_unsupported("clone(2) only supports CLONE_PARENT " "for threads.\n"); @@ -571,11 +569,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, uintptr_t p5) } /* - * We have very restricted support.... only exactly these flags are - * supported + * A supported clone-group was handled above, so now it must be a + * true native thread, which means exactly these flags are supported */ if (((flags & SHARED_AS) != SHARED_AS)) { - lx_unsupported("clone(2) requires that all or none of " + lx_unsupported("clone(2) a thread requires that all or none of " "CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n", flags); return (-ENOTSUP); diff --git a/usr/src/lib/brand/lx/lx_brand/common/fork.c b/usr/src/lib/brand/lx/lx_brand/common/fork.c index 7a48f89c38..aa14267185 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/fork.c +++ b/usr/src/lib/brand/lx/lx_brand/common/fork.c @@ -53,7 +53,7 @@ lx_fork(void) * Inform the in-kernel ptrace(2) subsystem that we are about to * emulate fork(2). */ - lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE); + lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE, 0); /* * Suspend signal delivery, run the stack management prefork handler @@ -115,7 +115,7 @@ lx_vfork(void) * Inform the in-kernel ptrace(2) subsystem that we are about to * emulate vfork(2). 
*/ - lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE); + lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE, 0); /* * Suspend signal delivery, run the stack management prefork handler diff --git a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c index bb6e52a112..2c6f5041a1 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c +++ b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c @@ -93,12 +93,12 @@ lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg, * was passed to clone(2), inherit_flag should be B_TRUE. */ void -lx_ptrace_clone_begin(int option, boolean_t inherit_flag) +lx_ptrace_clone_begin(int option, boolean_t inherit_flag, int flags) { lx_debug("lx_ptrace_clone_begin(%d, %sPTRACE_CLONE)", option, inherit_flag ? "" : "!"); if (syscall(SYS_brand, B_PTRACE_CLONE_BEGIN, option, - inherit_flag) != 0) { + inherit_flag, flags) != 0) { lx_err_fatal("B_PTRACE_CLONE_BEGIN failed: %s", strerror(errno)); } diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h index 4bbde06bff..5879311cef 100644 --- a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h +++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h @@ -134,7 +134,7 @@ extern void lx_ptrace_init(); extern int lx_ptrace_wait(siginfo_t *); extern void lx_ptrace_fork(void); extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg, ucontext_t *); -extern void lx_ptrace_clone_begin(int, boolean_t); +extern void lx_ptrace_clone_begin(int, boolean_t, int); extern int lx_check_alloca(size_t); #define SAFE_ALLOCA(sz) (lx_check_alloca(sz) ? 
alloca(sz) : NULL) diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c index 33bab64751..4fbf8530bb 100644 --- a/usr/src/uts/common/brand/lx/os/lx_brand.c +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -245,6 +245,7 @@ static int lx_setid_clear(vattr_t *, cred_t *); static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, enum seg_rw); #endif +static void lx_clearbrand(proc_t *, boolean_t); typedef struct lx_zfs_ds { list_node_t ds_link; @@ -298,7 +299,8 @@ struct brand_ops lx_brops = { #else NULL, #endif - B_FALSE /* b_intp_parse_arg */ + B_FALSE, /* b_intp_parse_arg */ + lx_clearbrand /* b_clearbrand */ }; struct brand_mach_ops lx_mops = { @@ -333,6 +335,8 @@ lx_proc_exit(proc_t *p) lx_proc_data_t *lxpd; proc_t *cp; + lx_clone_grp_exit(p, B_FALSE); + mutex_enter(&p->p_lock); VERIFY((lxpd = ptolxproc(p)) != NULL); VERIFY(lxpd->l_ptrace == 0); @@ -544,6 +548,12 @@ lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, } #endif +static void +lx_clearbrand(proc_t *p, boolean_t lwps_ok) +{ + lx_clone_grp_exit(p, lwps_ok); +} + /* * This hook runs prior to sendsig() processing and allows us to nominate * an alternative stack pointer for delivery of the signal handling frame. @@ -1473,6 +1483,12 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); case B_PTRACE_CLONE_BEGIN: + /* + * Leverage ptrace brand call to create a clone group for this + * proc if necessary. + */ + lx_clone_grp_create((uint_t)arg3); + return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? 
B_FALSE : B_TRUE)); @@ -1869,6 +1885,8 @@ lx_copy_procdata(proc_t *cp, proc_t *pp) cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; + + bzero(cpd->l_clone_grps, sizeof (cpd->l_clone_grps)); } #if defined(_LP64) diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c index b8c9b52329..0025a1f105 100644 --- a/usr/src/uts/common/brand/lx/os/lx_misc.c +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -575,6 +575,13 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) * Flag so child doesn't ptrace-stop on syscall exit. */ dst->br_ptrace_flags |= LX_PTF_NOSTOP; + + if (src->br_clone_grp_flags != 0) { + lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp), + lwptoproc(dstlwp)); + /* clone group no longer pending on this thread */ + src->br_clone_grp_flags = 0; + } } /* diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h index 9be10aff98..9bb8bd290a 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_brand.h +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -296,6 +296,21 @@ typedef struct { uint64_t rlim_max; } lx_rlimit64_t; +typedef struct { + list_node_t lx_clgrpm_link; + proc_t *lx_clgrpm_pp; +} lx_clone_grp_member_t; + +typedef struct { + kmutex_t lx_clgrp_lock; /* protects cnt & member list */ + uint_t lx_clgrp_cnt; + list_t lx_clgrp_members; +} lx_clone_grp_t; + +/* Entries in the l_clone_grps clone-group array */ +#define LX_CLGRP_FS 0 +#define LX_CLGRP_MAX 1 + typedef struct lx_proc_data { uintptr_t l_handler; /* address of user-space handler */ pid_t l_ppid; /* pid of originating parent proc */ @@ -308,6 +323,9 @@ typedef struct lx_proc_data { int l_parent_deathsig; lx_proc_flags_t l_flags; + kmutex_t l_clone_grp_lock; /* protects the following member */ + lx_clone_grp_t *l_clone_grps[LX_CLGRP_MAX]; + lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS]; /* original start/end bounds of arg/env 
string data */ @@ -566,6 +584,7 @@ struct lx_lwp_data { uint64_t br_schd_period; /* emulated DEADLINE */ fwaiter_t br_fwaiter; /* futex upon which we're waiting */ + uint_t br_clone_grp_flags; /* pending clone group */ }; /* diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h index af073c3f5f..5e8cbe150d 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_misc.h +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -44,6 +44,13 @@ extern void lx_clear_gdt(int); extern longlong_t lx_nosys(); +extern void lx_clone_grp_create(uint_t); +extern void lx_clone_grp_enter(uint_t, proc_t *, proc_t *); +extern void lx_clone_grp_exit(proc_t *, boolean_t); +extern boolean_t lx_clone_grp_member(lx_proc_data_t *, uint_t); +extern int lx_clone_grp_walk(lx_proc_data_t *, uint_t, + int (*)(proc_t *, void *), void *); + extern greg_t lx_fixsegreg(greg_t, model_t); extern uintptr_t lx_fsbase(klwp_t *, uintptr_t); extern void lx_exit_with_sig(proc_t *, sigqueue_t *); diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c index 50cdeaeab9..4e00e90b1a 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_clone.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -21,9 +21,115 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ +/* + * [This comment omits the 'LX_' prefix on the clone flag names.] + * + * The vast majority of clone calls result in the creation of a new process or + * a new thread. Both of these map easily from Linux to our native code. For + * these calls, the user-level brand library uses a brand call to hook into the + * lx_helper_clone function for the required in-kernel support. 
+ * + * A fork will typically provide these clone flags: + * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID + * + * A new thread will use our SHARED_AS macro which has the flags: + * CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM + * + * In rare cases an application will attempt to use a subset of the SHARED_AS + * flags in order to implement some sharing between two processes without using + * a true thread. Because we do not have native support for this concept, the + * lx brand implements the notion of a 'clone-group'. This is a set of + * processes which share a subset of the allowed SHARED_AS flags. The lx brand + * syscalls implement the appropriate sharing for each flag. A clone-group is + * only instantiated in the rare case that a subset of the SHARED_AS flags are + * used with clone. + * + * The following set of flags could theoretically be supported, although most + * are not implemented at this time. The user-level brand library will validate + * that a supported subset of the flags are being used, or error if not. We + * also re-validate in the kernel. + * + * CLONE_FILES: share the file descriptor table + * CLONE_FS: share the filesystem information (root of the filesystem, the + * CWD, and the umask) + * CLONE_SIGHAND: share the table of signal handlers + * CLONE_THREAD: share the thread group + * CLONE_VM: share the address space + * + * At this time, only those flags defined in CLONE_GRP_SUBSET (CLONE_FS) are + * implemented. + * + * When a clone-group is in use, the lx_proc_data_t`l_clone_grps array will + * hold groups of processes sharing the attributes relevant to the clone flag. + * Each supported flag can have an associated group list in the array. + * + * On the first clone, a new lx_clone_grp_t struct will be created. This struct + * holds a pointer to each process in the group. A reference to that group is + * held in the appropriate slot in l_clone_grps. 
The struct is created for + * the parent process by lx_clone_grp_create() and then the child process will + * associate itself with the group(s) using lx_clone_grp_enter(). + * + * Each syscall acting upon attributes relevant to a clone-group must include + * logic to do so properly. The syscalls will use lx_clone_grp_member() to + * determine if clone-group handling is required, and use lx_clone_grp_walk() + * to walk the list of processes in the group and apply the provided callback + * to each process. + * + * The following example illustrates how a common clone group would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_FS + * When A clones B, a new clone group is created and saved in the LX_CLGRP_FS + * slot in the l_clone_grps array on both A and B. When B clones, since a group + * already exists, C is added to the group and the group is saved in the + * LX_CLGRP_FS slot on C. + * + * The following example illustrates how two common clone groups would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS|CLONE_THREAD + * A new clone group is created and saved in the LX_CLGRP_FS slot in the + * l_clone_grps array on both A and B. A second clone group is created and + * saved in the LX_CLGRP_THREAD slot on both A and B (note that LX_CLGRP_THREAD + * is not implemented at this time). + * + * The following example illustrates how different clone groups would be used, + * as processes clone with different sets of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_THREAD + * C clones D with CLONE_FS + * In this example, only A&B and C&D should share their FS information. B&C + * have to be in two clone groups. When A clones, a new clone group is created + * and saved in the LX_CLGRP_FS slot in the l_clone_grps array on both A and B. 
+ * When B clones, a new clone group is created and saved in the LX_CLGRP_THREAD + * slot on both B and C (note that LX_CLGRP_THREAD is not implemented at this + * time). When C clones, a new clone group is created and saved in the + * LX_CLGRP_FS slot on both C and D. + * + * When a process exits, it removes itself from any groups to which it belongs. + * When the last process exits a group, it is cleaned up. + * + * If clone-groups were commonly used, this implementation would be inefficient + * and unwieldy, but since they are so rare a straightforward list-based + * approach is adequate. + * + * During group creation, the l_clone_grp_lock is first taken to ensure only + * one group is created; otherwise, only the group's lx_clgrp_lock protects the + * list. + * + * Note: Despite the locking, there is still a subtle race that can occur in + * this code. This occurs if a process has two threads and one of them is about + * to execute a clone-group aware syscall (e.g. chdir), while the other thread + * is forking to create a new clone-group. In theory the child process could be + * created, but not yet in the group. The syscall in the first thread could + * thus miss the new process. For example, the first thread might chdir the + * parent, but since the child process was already created, but not yet in the + * clone-group, it would not be chdir-ed. + */ + + #include <sys/types.h> #include <sys/systm.h> #include <sys/errno.h> @@ -37,6 +143,270 @@ #include <sys/controlregs.h> /* + * We currently only support a single clone-group (CLONE_FS) but the design + * allows for future expansion by expanding the lx_proc_data_t`l_clone_grps + * array. + */ +static int +lx_clone_flag2grp(uint_t flag) +{ + if (flag & LX_CLONE_FS) + return (LX_CLGRP_FS); + + return (-1); +} + +/* + * Note: this function has the side effect of clearing the flags. 
+ */ +static int +lx_clone_flags_iter(uint_t *fp) +{ + if (*fp & LX_CLONE_FS) { + *fp &= ~LX_CLONE_FS; + return (LX_CLGRP_FS); + } + + return (-1); +} + +/* + * Set up the current process in the proper clone-group(s) and record the + * clone-group flags on the lwp so that we can join the child process to the + * group during lx_forklwp(). + */ +void +lx_clone_grp_create(uint_t flags) +{ + int offset; + lx_proc_data_t *plproc = ttolxproc(curthread); + lx_lwp_data_t *ldp = (lx_lwp_data_t *)ttolwp(curthread)->lwp_brand; + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + if (!LX_IS_CLONE_GRP(flags)) + return; + + ldp->br_clone_grp_flags = flags & LX_CLONE_GRP_SUBSET; + + cgps = plproc->l_clone_grps; + /* + * We take the top-level mutex during create to ensure we only create + * one group per flag. + */ + mutex_enter(&plproc->l_clone_grp_lock); + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * If we already have a clone-group list for this flag then + * nothing to do. + */ + if (cgp != NULL) + continue; + + /* + * Create a new clone-group. If it ever becomes an issue, we + * could preallocate this memory before taking + * l_clone_grp_lock. + */ + cgp = kmem_alloc(sizeof (lx_clone_grp_t), KM_SLEEP); + mutex_init(&cgp->lx_clgrp_lock, NULL, MUTEX_DEFAULT, NULL); + cgp->lx_clgrp_cnt = 1; + list_create(&cgp->lx_clgrp_members, + sizeof (lx_clone_grp_member_t), + offsetof(lx_clone_grp_member_t, lx_clgrpm_link)); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = curproc; + list_insert_tail(&cgp->lx_clgrp_members, mp); + + /* Attach group to our proc */ + plproc->l_clone_grps[offset] = cgp; + } + mutex_exit(&plproc->l_clone_grp_lock); +} + +/* + * Add the child process to the proper parent clone-group(s). + * + * Called from lx_forklwp, thus there is no need to have any locking for the + * destination proc. 
This is always run in the thread context of the source + * thread, and the destination thread is always newly created and not referred + * to from anywhere else. The source process should have already created the + * clone group(s) that we need to place the child into via lx_clone_grp_create. + */ +void +lx_clone_grp_enter(uint_t flags, proc_t *srcp, proc_t *dstp) +{ + int offset; + lx_proc_data_t *plproc = ptolxproc(srcp); + lx_proc_data_t *clproc = ptolxproc(dstp); + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + cgps = plproc->l_clone_grps; + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * Parent should already have a clone-group list for this flag. + * The child joins that group. + */ + VERIFY(cgp != NULL); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = dstp; + + mutex_enter(&cgp->lx_clgrp_lock); + list_insert_tail(&cgp->lx_clgrp_members, mp); + cgp->lx_clgrp_cnt++; + clproc->l_clone_grps[offset] = cgp; + mutex_exit(&cgp->lx_clgrp_lock); + } +} + +/* + * The process is exiting or we're exec-ing a native app. In the unlikely event + * it is in a clone-group, remove it from the group and perform any necessary + * cleanup. Normally we're called from lx_proc_exit(), so we know we're the + * last lwp in the process, but we can also be called from lx_clearbrand() when + * exec-ing a native application. In this case we know the lwp(s) are stopped + * (It is possible to have multiple lwps if we branded the process but the + * exec failed. Those lwps were just branded as part of the exec, and will + * be de-branded). 
+ */ +void +lx_clone_grp_exit(proc_t *p, boolean_t lwps_ok) +{ + int i; + lx_proc_data_t *plproc = ptolxproc(p); + lx_clone_grp_t **cgps; + + ASSERT(!MUTEX_HELD(&p->p_lock)); + ASSERT(plproc != NULL); + + if (!lwps_ok) + VERIFY(p->p_lwpcnt <= 1); + + cgps = plproc->l_clone_grps; + for (i = 0; i < LX_CLGRP_MAX; i++) { + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + boolean_t found; + + cgp = cgps[i]; + if (cgp == NULL) + continue; + + /* + * The rare case when this process belongs to a clone-group. + */ + + mutex_enter(&cgp->lx_clgrp_lock); + + /* First remove ourselves from the group. */ + found = B_FALSE; + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + if (mp->lx_clgrpm_pp == p) { + found = B_TRUE; + list_remove(&cgp->lx_clgrp_members, mp); + kmem_free(mp, sizeof (lx_clone_grp_member_t)); + ASSERT(cgp->lx_clgrp_cnt > 0); + cgp->lx_clgrp_cnt--; + plproc->l_clone_grps[i] = NULL; + break; + } + mp = list_next(&cgp->lx_clgrp_members, mp); + } + VERIFY(found); + + if (cgp->lx_clgrp_cnt > 0) { + mutex_exit(&cgp->lx_clgrp_lock); + continue; + } + + /* + * cgp->lx_clgrp_cnt == 0 + * + * We're the sole remaining member; finish cleanup now. + */ + ASSERT(plproc->l_clone_grps[i] == NULL); + mutex_exit(&cgp->lx_clgrp_lock); + + /* Delete the group since there are no more references to it. */ + VERIFY(list_is_empty(&cgp->lx_clgrp_members)); + + list_destroy(&cgp->lx_clgrp_members); + mutex_destroy(&cgp->lx_clgrp_lock); + kmem_free(cgp, sizeof (lx_clone_grp_t)); + } +} + +/* + * Return true in the rare case that the process is a member of a clone group + * with the specific flag set. Clone groups are only added to the array + * atomically until this process exits, so we don't need to take + * l_clone_grp_lock. 
+ */ +boolean_t +lx_clone_grp_member(lx_proc_data_t *dp, uint_t flag) +{ + int offset; + + if ((offset = lx_clone_flag2grp(flag)) == -1) + return (B_FALSE); + + if (dp->l_clone_grps[offset] != NULL) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Walk all of the processes in the clone-group list and apply the callback + * to each. Because we're holding the group list lock (lx_clgrp_lock) none of + * the processes can exit, but that is the only locking guarantee made by this + * function itself. + */ +int +lx_clone_grp_walk(lx_proc_data_t *dp, uint_t flag, int (*cb)(proc_t *, void *), + void *arg) +{ + int offset; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + int res, rv = 0; + + + ASSERT(dp != NULL); + /* We should not be called unless we belong to a group */ + VERIFY((offset = lx_clone_flag2grp(flag)) != -1); + VERIFY(dp->l_clone_grps[offset] != NULL); + + cgp = dp->l_clone_grps[offset]; + mutex_enter(&cgp->lx_clgrp_lock); + + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + res = cb(mp->lx_clgrpm_pp, arg); + /* return the first error we see, but try all procs */ + if (res != 0 && rv == 0) + rv = res; + mp = list_next(&cgp->lx_clgrp_members, mp); + } + + mutex_exit(&cgp->lx_clgrp_lock); + + return (rv); +} + + +/* * Our lwp has already been created at this point, so this routine is * responsible for setting up all the state needed to track this as a * linux cloned thread. 
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c index 87bb9bde1b..36d6886ecf 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c @@ -18,6 +18,7 @@ #include <sys/resource.h> #include <sys/uadmin.h> #include <sys/lx_misc.h> +#include <lx_syscall.h> #define LINUX_REBOOT_MAGIC1 0xfee1dead #define LINUX_REBOOT_MAGIC2 672274793 @@ -68,6 +69,18 @@ extern int getitimer(uint_t, struct itimerval *); extern int stime(time_t); /* From uts/common/syscall/uadmin.c */ extern int uadmin(int, int, uintptr_t); +/* From uts/common/syscall/chdir.c */ +extern int chdir_proc(proc_t *, vnode_t *, boolean_t, boolean_t); +/* From uts/common/fs/lookup.c */ +extern int lookupname(char *, enum uio_seg, int, vnode_t **, vnode_t **); +/* From uts/common/fs/fs_subr.c */ +extern int fs_need_estale_retry(int); + +/* The callback arguments when handling a FS clone group. */ +typedef struct { + vnode_t *lcfa_vp; + boolean_t lcfa_type; +} lx_clone_fs_arg_t; long lx_alarm(int seconds) @@ -75,15 +88,89 @@ lx_alarm(int seconds) return (alarm(seconds)); } +static int +lx_clone_fs_cb(proc_t *pp, void *arg) +{ + lx_clone_fs_arg_t *ap = (lx_clone_fs_arg_t *)arg; + int err; + + /* + * The initial lookupname() from lx_clone_fs_do_group() will have added + * a hold on the vnode to ensure its existence throughout the walk. We + * need to add another hold for each process in the group. + */ + VN_HOLD(ap->lcfa_vp); + if ((err = chdir_proc(pp, ap->lcfa_vp, ap->lcfa_type, B_TRUE)) != 0) { + /* if we failed, chdir_proc already did a rele on vp */ + return (err); + } + + return (0); +} + +/* + * Check to see if the process is in a CLONE_FS clone group. Return false + * if not (the normal case), otherwise perform the setup, do the group walk + * and return true. 
+ */ +static boolean_t +lx_clone_fs_do_group(char *path, boolean_t is_chroot, int *errp) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + vnode_t *vp; + lx_clone_fs_arg_t arg; + int err; + int estale_retry = 0; + + if (!lx_clone_grp_member(lproc, LX_CLONE_FS)) + return (B_FALSE); + + /* Handle the rare case of being in a CLONE_FS clone group */ + +retry: + err = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (err != 0) { + if (err == ESTALE && fs_need_estale_retry(estale_retry++)) + goto retry; + *errp = err; + return (B_TRUE); + } + + arg.lcfa_vp = vp; + arg.lcfa_type = is_chroot; + + /* + * We use the VN_HOLD from the lookup to guarantee vp exists for the + * entire walk. + */ + err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb, + (void *)&arg); + VN_RELE(vp); + *errp = err; + return (B_TRUE); +} + long lx_chdir(char *path) { + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_FALSE, &err)) + return ((err != 0) ? set_errno(err) : 0); + return (chdir(path)); } long lx_chroot(char *path) { + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_TRUE, &err)) + return ((err != 0) ? set_errno(err) : 0); + return (chroot(path)); } diff --git a/usr/src/uts/common/brand/lx/syscall/lx_umask.c b/usr/src/uts/common/brand/lx/syscall/lx_umask.c index 130af6c776..cb5e4ed232 100644 --- a/usr/src/uts/common/brand/lx/syscall/lx_umask.c +++ b/usr/src/uts/common/brand/lx/syscall/lx_umask.c @@ -14,12 +14,39 @@ */ #include <sys/types.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> /* From usr/src/uts/common/syscall/umask.c */ extern int umask(int); +/* + * Just do what umask() does, but for the given process. 
+ */ +static int +lx_clone_umask_cb(proc_t *pp, void *arg) +{ + mode_t cmask = (mode_t)(intptr_t)arg; + mode_t orig; + + orig = PTOU(pp)->u_cmask; + PTOU(pp)->u_cmask = (mode_t)(cmask & PERMMASK); + return ((int)orig); +} + long lx_umask(mode_t cmask) { + lx_proc_data_t *lproc = ttolxproc(curthread); + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_grp_member(lproc, LX_CLONE_FS)) { + int omask; + + omask = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_umask_cb, + (void *)(intptr_t)cmask); + return (omask); + } + return (umask(cmask)); } diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index 3d3acd6036..f31961b231 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -102,7 +102,8 @@ struct brand_ops sn1_brops = { NULL, /* b_sendsig */ NULL, /* b_setid_clear */ NULL, /* b_pagefault */ - B_TRUE /* b_intp_parse_arg */ + B_TRUE, /* b_intp_parse_arg */ + NULL /* b_clearbrand */ }; #ifdef sparc diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index 50997fba02..c49d605b00 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -107,7 +107,8 @@ struct brand_ops s10_brops = { NULL, /* b_sendsig */ NULL, /* b_setid_clear */ NULL, /* b_pagefault */ - B_TRUE /* b_intp_parse_arg */ + B_TRUE, /* b_intp_parse_arg */ + NULL /* b_clearbrand */ }; #ifdef sparc diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 02901d023d..62c3bbe2d6 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #include <sys/kmem.h> @@ -389,6 +389,9 @@ brand_clearbrand(proc_t *p, boolean_t lwps_ok) VERIFY(bp != NULL); VERIFY(PROC_IS_BRANDED(p)); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + mutex_enter(&p->p_lock); p->p_brand = &native_brand; brand_data = p->p_brand_data; diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index 231daa5e9d..e50c4e055a 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -149,6 +149,7 @@ struct execa; * b_setid_clear - Override setid_clear behavior * b_pagefault - Trap pagefault events * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + * b_clearbrand - Perform any actions necessary when clearing the brand. */ struct brand_ops { void (*b_init_brand_data)(zone_t *, kmutex_t *); @@ -198,6 +199,7 @@ struct brand_ops { int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, enum seg_rw); boolean_t b_intp_parse_arg; + void (*b_clearbrand)(proc_t *, boolean_t); }; /* diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c index 84c924f570..deb5532b50 100644 --- a/usr/src/uts/common/syscall/chdir.c +++ b/usr/src/uts/common/syscall/chdir.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -62,7 +63,7 @@ /* * Change current working directory ("."). 
*/ -static int chdirec(vnode_t *, int ischroot, int do_traverse); +static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse); int chdir(char *fname) @@ -78,7 +79,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 0, 1); + error = chdirec(vp, B_FALSE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -102,7 +103,7 @@ fchdir(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 0, 0); + error = chdirec(vp, B_FALSE, B_FALSE); if (error) return (set_errno(error)); return (0); @@ -125,7 +126,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 1, 1); + error = chdirec(vp, B_TRUE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -152,18 +153,18 @@ fchroot(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 1, 0); + error = chdirec(vp, B_TRUE, B_FALSE); if (error) return (set_errno(error)); return (0); } static int -chdirec(vnode_t *vp, int ischroot, int do_traverse) +chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot, + boolean_t do_traverse) { int error; vnode_t *oldvp; - proc_t *pp = curproc; vnode_t **vpp; refstr_t *cwd; int newcwd = 1; @@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse) if (ischroot) { struct vattr tattr; struct vattr rattr; - vnode_t *zonevp = curproc->p_zone->zone_rootvp; + vnode_t *zonevp = pp->p_zone->zone_rootvp; tattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL)) @@ -243,3 +244,15 @@ bad: VN_RELE(vp); return (error); } + +int +chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(pp, vp, ischroot, do_traverse)); +} + +static int +chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(curproc, vp, ischroot, do_traverse)); +} |
