summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2016-12-08 20:07:05 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2016-12-08 21:40:48 +0000
commit83390469eaf76687ae434504ed3e41fdbe4ae3b6 (patch)
tree011a030673def2c9511ad60cc9976fb987049b2f
parent060157c37b10d81a8a264aeb85849663571caa8b (diff)
downloadillumos-joyent-83390469eaf76687ae434504ed3e41fdbe4ae3b6.tar.gz
OS-5805 chromium depends on CLONE_FS w/o full SHARED_AS
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Reviewed by: Ryan Zezeski <ryan.zeseski@joyent.com> Approved by: Patrick Mooney <patrick.mooney@joyent.com>
-rw-r--r--usr/src/common/brand/lx/lx_syscall.h14
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/clone.c20
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/fork.c4
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/ptrace.c4
-rw-r--r--usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h2
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_brand.c20
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_misc.c7
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_brand.h19
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_misc.h7
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_clone.c372
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_miscsys.c87
-rw-r--r--usr/src/uts/common/brand/lx/syscall/lx_umask.c27
-rw-r--r--usr/src/uts/common/brand/sn1/sn1_brand.c3
-rw-r--r--usr/src/uts/common/brand/solaris10/s10_brand.c3
-rw-r--r--usr/src/uts/common/os/brand.c5
-rw-r--r--usr/src/uts/common/sys/brand.h2
-rw-r--r--usr/src/uts/common/syscall/chdir.c29
17 files changed, 596 insertions, 29 deletions
diff --git a/usr/src/common/brand/lx/lx_syscall.h b/usr/src/common/brand/lx/lx_syscall.h
index 54fb196b5a..a0292023a0 100644
--- a/usr/src/common/brand/lx/lx_syscall.h
+++ b/usr/src/common/brand/lx/lx_syscall.h
@@ -88,6 +88,20 @@ extern "C" {
#define LX_CLONE_DETACH 0x00400000
#define LX_CLONE_CHILD_SETTID 0x01000000
+#define SHARED_AS \
+ (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND | \
+ LX_CLONE_THREAD)
+
+/*
+ * Valid clone flags when not a full process or full thread (SHARED_AS). This
+ * can be expanded as additional clone-group support is added.
+ */
+#define LX_CLONE_GRP_SUBSET (LX_CLONE_FS)
+
+#define LX_IS_CLONE_GRP(X) ((X & SHARED_AS) != 0 && \
+ (X & SHARED_AS) != SHARED_AS && \
+ ((X & SHARED_AS) & ~LX_CLONE_GRP_SUBSET) == 0)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/brand/lx/lx_brand/common/clone.c b/usr/src/lib/brand/lx/lx_brand/common/clone.c
index 586c042a83..698d13f9d1 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/clone.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/clone.c
@@ -53,10 +53,6 @@
#include <sys/debug.h>
#include <lx_syscall.h>
-
-#define SHARED_AS \
- (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND \
- | LX_CLONE_THREAD)
#define CLONE_VFORK (LX_CLONE_VM | LX_CLONE_VFORK)
#define CLONE_TD (LX_CLONE_THREAD|LX_CLONE_DETACH)
@@ -364,13 +360,15 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, uintptr_t p5)
* Inform the in-kernel ptrace(2) subsystem that we are about to
* emulate a fork(2), vfork(2) or clone(2) system call.
*/
- lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE));
+ lx_ptrace_clone_begin(ptrace_event, !!(flags & LX_CLONE_PTRACE), flags);
/*
- * Handle a fork(2) operation here. If this is not a fork, a new
- * thread will be created after this block.
+ * Handle a fork(2) operation here. If this is not a fork, a new
+ * thread will be created after this block. We can also create a new
+ * clone-group here (when two or more processes share data represented
+ * by a subset of the SHARED_AS flags, but not a true thread).
*/
- if (IS_FORK(flags) || IS_VFORK(flags)) {
+ if (IS_FORK(flags) || IS_VFORK(flags) || LX_IS_CLONE_GRP(flags)) {
if (flags & LX_CLONE_PARENT) {
lx_unsupported("clone(2) only supports CLONE_PARENT "
"for threads.\n");
@@ -571,11 +569,11 @@ lx_clone(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, uintptr_t p5)
}
/*
- * We have very restricted support.... only exactly these flags are
- * supported
+ * A supported clone-group was handled above, so now it must be a
+ * true native thread, which means exactly these flags are supported
*/
if (((flags & SHARED_AS) != SHARED_AS)) {
- lx_unsupported("clone(2) requires that all or none of "
+ lx_unsupported("clone(2) a thread requires that all or none of "
"CLONE_VM/FS/FILES/THREAD/SIGHAND be set. (flags:0x%08X)\n",
flags);
return (-ENOTSUP);
diff --git a/usr/src/lib/brand/lx/lx_brand/common/fork.c b/usr/src/lib/brand/lx/lx_brand/common/fork.c
index 7a48f89c38..aa14267185 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/fork.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/fork.c
@@ -53,7 +53,7 @@ lx_fork(void)
* Inform the in-kernel ptrace(2) subsystem that we are about to
* emulate fork(2).
*/
- lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE);
+ lx_ptrace_clone_begin(LX_PTRACE_O_TRACEFORK, B_FALSE, 0);
/*
* Suspend signal delivery, run the stack management prefork handler
@@ -115,7 +115,7 @@ lx_vfork(void)
* Inform the in-kernel ptrace(2) subsystem that we are about to
* emulate vfork(2).
*/
- lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE);
+ lx_ptrace_clone_begin(LX_PTRACE_O_TRACEVFORK, B_FALSE, 0);
/*
* Suspend signal delivery, run the stack management prefork handler
diff --git a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
index bb6e52a112..2c6f5041a1 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/ptrace.c
@@ -93,12 +93,12 @@ lx_ptrace_stop_if_option(int option, boolean_t child, ulong_t msg,
* was passed to clone(2), inherit_flag should be B_TRUE.
*/
void
-lx_ptrace_clone_begin(int option, boolean_t inherit_flag)
+lx_ptrace_clone_begin(int option, boolean_t inherit_flag, int flags)
{
lx_debug("lx_ptrace_clone_begin(%d, %sPTRACE_CLONE)", option,
inherit_flag ? "" : "!");
if (syscall(SYS_brand, B_PTRACE_CLONE_BEGIN, option,
- inherit_flag) != 0) {
+ inherit_flag, flags) != 0) {
lx_err_fatal("B_PTRACE_CLONE_BEGIN failed: %s",
strerror(errno));
}
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
index 4bbde06bff..5879311cef 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_misc.h
@@ -134,7 +134,7 @@ extern void lx_ptrace_init();
extern int lx_ptrace_wait(siginfo_t *);
extern void lx_ptrace_fork(void);
extern void lx_ptrace_stop_if_option(int, boolean_t, ulong_t msg, ucontext_t *);
-extern void lx_ptrace_clone_begin(int, boolean_t);
+extern void lx_ptrace_clone_begin(int, boolean_t, int);
extern int lx_check_alloca(size_t);
#define SAFE_ALLOCA(sz) (lx_check_alloca(sz) ? alloca(sz) : NULL)
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
index 33bab64751..4fbf8530bb 100644
--- a/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -245,6 +245,7 @@ static int lx_setid_clear(vattr_t *, cred_t *);
static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
enum seg_rw);
#endif
+static void lx_clearbrand(proc_t *, boolean_t);
typedef struct lx_zfs_ds {
list_node_t ds_link;
@@ -298,7 +299,8 @@ struct brand_ops lx_brops = {
#else
NULL,
#endif
- B_FALSE /* b_intp_parse_arg */
+ B_FALSE, /* b_intp_parse_arg */
+ lx_clearbrand /* b_clearbrand */
};
struct brand_mach_ops lx_mops = {
@@ -333,6 +335,8 @@ lx_proc_exit(proc_t *p)
lx_proc_data_t *lxpd;
proc_t *cp;
+ lx_clone_grp_exit(p, B_FALSE);
+
mutex_enter(&p->p_lock);
VERIFY((lxpd = ptolxproc(p)) != NULL);
VERIFY(lxpd->l_ptrace == 0);
@@ -544,6 +548,12 @@ lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
}
#endif
+static void
+lx_clearbrand(proc_t *p, boolean_t lwps_ok)
+{
+ lx_clone_grp_exit(p, lwps_ok);
+}
+
/*
* This hook runs prior to sendsig() processing and allows us to nominate
* an alternative stack pointer for delivery of the signal handling frame.
@@ -1473,6 +1483,12 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
case B_PTRACE_CLONE_BEGIN:
+ /*
+ * Leverage ptrace brand call to create a clone group for this
+ * proc if necessary.
+ */
+ lx_clone_grp_create((uint_t)arg3);
+
return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
B_FALSE : B_TRUE));
@@ -1869,6 +1885,8 @@ lx_copy_procdata(proc_t *cp, proc_t *pp)
cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
+
+ bzero(cpd->l_clone_grps, sizeof (cpd->l_clone_grps));
}
#if defined(_LP64)
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
index b8c9b52329..0025a1f105 100644
--- a/usr/src/uts/common/brand/lx/os/lx_misc.c
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -575,6 +575,13 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
* Flag so child doesn't ptrace-stop on syscall exit.
*/
dst->br_ptrace_flags |= LX_PTF_NOSTOP;
+
+ if (src->br_clone_grp_flags != 0) {
+ lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
+ lwptoproc(dstlwp));
+ /* clone group no longer pending on this thread */
+ src->br_clone_grp_flags = 0;
+ }
}
/*
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
index 9be10aff98..9bb8bd290a 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_brand.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -296,6 +296,21 @@ typedef struct {
uint64_t rlim_max;
} lx_rlimit64_t;
+typedef struct {
+ list_node_t lx_clgrpm_link;
+ proc_t *lx_clgrpm_pp;
+} lx_clone_grp_member_t;
+
+typedef struct {
+ kmutex_t lx_clgrp_lock; /* protects cnt & member list */
+ uint_t lx_clgrp_cnt;
+ list_t lx_clgrp_members;
+} lx_clone_grp_t;
+
+/* Entries in the l_clone_grps clone-group array */
+#define LX_CLGRP_FS 0
+#define LX_CLGRP_MAX 1
+
typedef struct lx_proc_data {
uintptr_t l_handler; /* address of user-space handler */
pid_t l_ppid; /* pid of originating parent proc */
@@ -308,6 +323,9 @@ typedef struct lx_proc_data {
int l_parent_deathsig;
lx_proc_flags_t l_flags;
+ kmutex_t l_clone_grp_lock; /* protects the following member */
+ lx_clone_grp_t *l_clone_grps[LX_CLGRP_MAX];
+
lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS];
/* original start/end bounds of arg/env string data */
@@ -566,6 +584,7 @@ struct lx_lwp_data {
uint64_t br_schd_period; /* emulated DEADLINE */
fwaiter_t br_fwaiter; /* futex upon which we're waiting */
+ uint_t br_clone_grp_flags; /* pending clone group */
};
/*
diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h
index af073c3f5f..5e8cbe150d 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_misc.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h
@@ -44,6 +44,13 @@ extern void lx_clear_gdt(int);
extern longlong_t lx_nosys();
+extern void lx_clone_grp_create(uint_t);
+extern void lx_clone_grp_enter(uint_t, proc_t *, proc_t *);
+extern void lx_clone_grp_exit(proc_t *, boolean_t);
+extern boolean_t lx_clone_grp_member(lx_proc_data_t *, uint_t);
+extern int lx_clone_grp_walk(lx_proc_data_t *, uint_t,
+ int (*)(proc_t *, void *), void *);
+
extern greg_t lx_fixsegreg(greg_t, model_t);
extern uintptr_t lx_fsbase(klwp_t *, uintptr_t);
extern void lx_exit_with_sig(proc_t *, sigqueue_t *);
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
index 50cdeaeab9..4e00e90b1a 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_clone.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
@@ -21,9 +21,115 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
+/*
+ * [This comment omits the 'LX_' prefix on the clone flag names.]
+ *
+ * The vast majority of clone calls result in the creation of a new process or
+ * a new thread. Both of these map easily from Linux to our native code. For
+ * these calls, the user-level brand library uses a brand call to hook into the
+ * lx_helper_clone function for the required in-kernel support.
+ *
+ * A fork will typically provide these clone flags:
+ * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID
+ *
+ * A new thread will use our SHARED_AS macro which has the flags:
+ * CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM
+ *
+ * In rare cases an application will attempt to use a subset of the SHARED_AS
+ * flags in order to implement some sharing between two processes without using
+ * a true thread. Because we do not have native support for this concept, the
+ * lx brand implements the notion of a 'clone-group'. This is a set of
+ * processes which share a subset of the allowed SHARED_AS flags. The lx brand
+ * syscalls implement the appropriate sharing for each flag. A clone-group is
+ * only instantiated in the rare case that a subset of the SHARED_AS flags are
+ * used with clone.
+ *
+ * The following set of flags could theoretically be supported, although most
+ * are not implemented at this time. The user-level brand library will validate
+ * that a supported subset of the flags are being used, or error if not. We
+ * also re-validate in the kernel.
+ *
+ * CLONE_FILES: share the file descriptor table
+ * CLONE_FS: share the filesystem information (root of the filesystem, the
+ * CWD, and the umask)
+ * CLONE_SIGHAND: share the table of signal handlers
+ * CLONE_THREAD: share the thread group
+ * CLONE_VM: share the address space
+ *
+ * At this time, only those flags defined in CLONE_GRP_SUBSET (CLONE_FS) are
+ * implemented.
+ *
+ * When a clone-group is in use, the lx_proc_data_t`l_clone_grps array will
+ * hold groups of processes sharing the attributes relevant to the clone flag.
+ * Each supported flag can have an associated group list in the array.
+ *
+ * On the first clone, a new lx_clone_grp_t struct will be created. This struct
+ * holds a pointer to each process in the group. A reference to that group is
+ * held in the appropriate slot in l_clone_grps. The struct is created for
+ * the parent process by lx_clone_grp_create() and then the child process will
+ * associate itself with the group(s) using lx_clone_grp_enter().
+ *
+ * Each syscall acting upon attributes relevant to a clone-group must include
+ * logic to do so properly. The syscalls will use lx_clone_grp_member() to
+ * determine if clone-group handling is required, and use lx_clone_grp_walk()
+ * to walk the list of processes in the group and apply the provided callback
+ * to each process.
+ *
+ * The following example illustrates how a common clone group would be used,
+ * as processes clone with the same set of CLONE_* flags.
+ * A clones B with CLONE_FS
+ * B clones C with CLONE_FS
+ * When A clones B, a new clone group is created and saved in the LX_CLGRP_FS
+ * slot in the l_clone_grps array on both A and B. When B clones, since a group
+ * already exists, C is added to the group and the group is saved in the
+ * LX_CLGRP_FS slot on C.
+ *
+ * The following example illustrates how two common clone groups would be used,
+ * as processes clone with the same set of CLONE_* flags.
+ * A clones B with CLONE_FS|CLONE_THREAD
+ * A new clone group is created and saved in the LX_CLGRP_FS slot in the
+ * l_clone_grps array on both A and B. A second clone group is created and
+ * saved in the LX_CLGRP_THREAD slot on both A and B (note that LX_CLGRP_THREAD
+ * is not implemented at this time).
+ *
+ * The following example illustrates how different clone groups would be used,
+ * as processes clone with different sets of CLONE_* flags.
+ * A clones B with CLONE_FS
+ * B clones C with CLONE_THREAD
+ * C clones D with CLONE_FS
+ * In this example, only A&B and C&D should share their FS information. B&C
+ * have to be in two clone groups. When A clones, a new clone group is created
+ * and saved in the LX_CLGRP_FS slot in the l_clone_grps array on both A and B.
+ * When B clones, a new clone group is created and saved in the LX_CLGRP_THREAD
+ * slot on both B and C (note that LX_CLGRP_THREAD is not implemented at this
+ * time). When C clones, a new clone group is created and saved in the
+ * LX_CLGRP_FS slot on both C and D.
+ *
+ * When a process exits, it removes itself from any groups to which it belongs.
+ * When the last process exits a group, it is cleaned up.
+ *
+ * If clone-groups were commonly used, this implementation would be inefficient
+ * and unwieldy, but since they are so rare a straightforward list-based
+ * approach is adequate.
+ *
+ * During group creation, the l_clone_grp_lock is first taken to ensure only
+ * one group is created; otherwise, only the group's lx_clgrp_lock protects the
+ * list.
+ *
+ * Note: Despite the locking, there is still a subtle race that can occur in
+ * this code. This occurs if a process has two threads and one of them is about
+ * to execute a clone-group aware syscall (e.g. chdir), while the other thread
+ * is forking to create a new clone-group. In theory the child process could be
+ * created, but not yet in the group. The syscall in the first thread could
+ * thus miss the new process. For example, the first thread might chdir the
+ * parent, but since the child process was already created, but not yet in the
+ * clone-group, it would not be chdir-ed.
+ */
+
+
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
@@ -37,6 +143,270 @@
#include <sys/controlregs.h>
/*
+ * We currently only support a single clone-group (CLONE_FS) but the design
+ * allows for future expansion by expanding the lx_proc_data_t`l_clone_grps
+ * array.
+ */
+static int
+lx_clone_flag2grp(uint_t flag)
+{
+ if (flag & LX_CLONE_FS)
+ return (LX_CLGRP_FS);
+
+ return (-1);
+}
+
+/*
+ * Note: this function has the side effect of clearing the flags.
+ */
+static int
+lx_clone_flags_iter(uint_t *fp)
+{
+ if (*fp & LX_CLONE_FS) {
+ *fp &= ~LX_CLONE_FS;
+ return (LX_CLGRP_FS);
+ }
+
+ return (-1);
+}
+
+/*
+ * Setup the current process in the proper clone-group(s) and record the
+ * clone-group flags on the lwp so that we can join the child process to the
+ * group during lx_forklwp().
+ */
+void
+lx_clone_grp_create(uint_t flags)
+{
+ int offset;
+ lx_proc_data_t *plproc = ttolxproc(curthread);
+ lx_lwp_data_t *ldp = (lx_lwp_data_t *)ttolwp(curthread)->lwp_brand;
+ lx_clone_grp_t **cgps;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+
+ if (!LX_IS_CLONE_GRP(flags))
+ return;
+
+ ldp->br_clone_grp_flags = flags & LX_CLONE_GRP_SUBSET;
+
+ cgps = plproc->l_clone_grps;
+ /*
+ * We take the top-level mutex during create to ensure we only create
+ * one group per flag.
+ */
+ mutex_enter(&plproc->l_clone_grp_lock);
+ while ((offset = lx_clone_flags_iter(&flags)) != -1) {
+ cgp = cgps[offset];
+
+ /*
+ * If we already have a clone-group list for this flag then
+ * nothing to do.
+ */
+ if (cgp != NULL)
+ continue;
+
+ /*
+ * Create a new clone-group. If it ever becomes an issue, we
+ * could preallocate this memory before taking
+ * l_clone_grp_lock.
+ */
+ cgp = kmem_alloc(sizeof (lx_clone_grp_t), KM_SLEEP);
+ mutex_init(&cgp->lx_clgrp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cgp->lx_clgrp_cnt = 1;
+ list_create(&cgp->lx_clgrp_members,
+ sizeof (lx_clone_grp_member_t),
+ offsetof(lx_clone_grp_member_t, lx_clgrpm_link));
+
+ mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP);
+ mp->lx_clgrpm_pp = curproc;
+ list_insert_tail(&cgp->lx_clgrp_members, mp);
+
+ /* Attach group to our proc */
+ plproc->l_clone_grps[offset] = cgp;
+ }
+ mutex_exit(&plproc->l_clone_grp_lock);
+}
+
+/*
+ * Add the child process to the proper parent clone-group(s).
+ *
+ * Called from lx_forklwp, thus there is no need to have any locking for the
+ * destination proc. This is always run in the thread context of the source
+ * thread, and the destination thread is always newly created and not referred
+ * to from anywhere else. The source process should have already created the
+ * clone group(s) that we need to place the child into via lx_clone_grp_create.
+ */
+void
+lx_clone_grp_enter(uint_t flags, proc_t *srcp, proc_t *dstp)
+{
+ int offset;
+ lx_proc_data_t *plproc = ptolxproc(srcp);
+ lx_proc_data_t *clproc = ptolxproc(dstp);
+ lx_clone_grp_t **cgps;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+
+ cgps = plproc->l_clone_grps;
+ while ((offset = lx_clone_flags_iter(&flags)) != -1) {
+ cgp = cgps[offset];
+
+ /*
+ * Parent should already have a clone-group list for this flag.
+ * The child joins that group.
+ */
+ VERIFY(cgp != NULL);
+
+ mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP);
+ mp->lx_clgrpm_pp = dstp;
+
+ mutex_enter(&cgp->lx_clgrp_lock);
+ list_insert_tail(&cgp->lx_clgrp_members, mp);
+ cgp->lx_clgrp_cnt++;
+ clproc->l_clone_grps[offset] = cgp;
+ mutex_exit(&cgp->lx_clgrp_lock);
+ }
+}
+
+/*
+ * The process is exiting or we're exec-ing a native app. In the unlikely event
+ * it is in a clone-group, remove it from the group and perform any necessary
+ * cleanup. Normally we're called from lx_proc_exit(), so we know we're the
+ * last lwp in the process, but we can also be called from lx_clearbrand() when
+ * exec-ing a native application. In this case we know the lwp(s) are stopped
+ * (It is possible to have multiple lwps if we branded the process but the
+ * exec failed. Those lwps were just branded as part of the exec, and will
+ * be de-branded).
+ */
+void
+lx_clone_grp_exit(proc_t *p, boolean_t lwps_ok)
+{
+ int i;
+ lx_proc_data_t *plproc = ptolxproc(p);
+ lx_clone_grp_t **cgps;
+
+ ASSERT(!MUTEX_HELD(&p->p_lock));
+ ASSERT(plproc != NULL);
+
+ if (!lwps_ok)
+ VERIFY(p->p_lwpcnt <= 1);
+
+ cgps = plproc->l_clone_grps;
+ for (i = 0; i < LX_CLGRP_MAX; i++) {
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+ boolean_t found;
+
+ cgp = cgps[i];
+ if (cgp == NULL)
+ continue;
+
+ /*
+ * The rare case when this process belongs to a clone-group.
+ */
+
+ mutex_enter(&cgp->lx_clgrp_lock);
+
+ /* First remove ourselves from the group. */
+ found = B_FALSE;
+ mp = list_head(&cgp->lx_clgrp_members);
+ while (mp != NULL) {
+ if (mp->lx_clgrpm_pp == p) {
+ found = B_TRUE;
+ list_remove(&cgp->lx_clgrp_members, mp);
+ kmem_free(mp, sizeof (lx_clone_grp_member_t));
+ ASSERT(cgp->lx_clgrp_cnt > 0);
+ cgp->lx_clgrp_cnt--;
+ plproc->l_clone_grps[i] = NULL;
+ break;
+ }
+ mp = list_next(&cgp->lx_clgrp_members, mp);
+ }
+ VERIFY(found);
+
+ if (cgp->lx_clgrp_cnt > 0) {
+ mutex_exit(&cgp->lx_clgrp_lock);
+ continue;
+ }
+
+ /*
+ * cgp->lx_clgrp_cnt == 0
+ *
+ * We're the sole remaining member; finish cleanup now.
+ */
+ ASSERT(plproc->l_clone_grps[i] == NULL);
+ mutex_exit(&cgp->lx_clgrp_lock);
+
+ /* Delete the group since there are no more references to it. */
+ VERIFY(list_is_empty(&cgp->lx_clgrp_members));
+
+ list_destroy(&cgp->lx_clgrp_members);
+ mutex_destroy(&cgp->lx_clgrp_lock);
+ kmem_free(cgp, sizeof (lx_clone_grp_t));
+ }
+}
+
+/*
+ * Return true in the rare case that the process is a member of a clone group
+ * with the specific flag set. Clone groups are only added to the array
+ * atomically until this process exits, so we don't need to take
+ * l_clone_grp_lock.
+ */
+boolean_t
+lx_clone_grp_member(lx_proc_data_t *dp, uint_t flag)
+{
+ int offset;
+
+ if ((offset = lx_clone_flag2grp(flag)) == -1)
+ return (B_FALSE);
+
+ if (dp->l_clone_grps[offset] != NULL) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Walk all of the processes in the clone-group list and apply the callback
+ * to each. Because we're holding the group list lock (lx_clgrp_lock) none of
+ * the processes can exit, but that is the only locking guarantee made by this
+ * function itself.
+ */
+int
+lx_clone_grp_walk(lx_proc_data_t *dp, uint_t flag, int (*cb)(proc_t *, void *),
+ void *arg)
+{
+ int offset;
+ lx_clone_grp_t *cgp;
+ lx_clone_grp_member_t *mp;
+ int res, rv = 0;
+
+
+ ASSERT(dp != NULL);
+ /* We should not be called unless we belong to a group */
+ VERIFY((offset = lx_clone_flag2grp(flag)) != -1);
+ VERIFY(dp->l_clone_grps[offset] != NULL);
+
+ cgp = dp->l_clone_grps[offset];
+ mutex_enter(&cgp->lx_clgrp_lock);
+
+ mp = list_head(&cgp->lx_clgrp_members);
+ while (mp != NULL) {
+ res = cb(mp->lx_clgrpm_pp, arg);
+ /* return the first error we see, but try all procs */
+ if (res != 0 && rv == 0)
+ rv = res;
+ mp = list_next(&cgp->lx_clgrp_members, mp);
+ }
+
+ mutex_exit(&cgp->lx_clgrp_lock);
+
+ return (rv);
+}
+
+
+/*
* Our lwp has already been created at this point, so this routine is
* responsible for setting up all the state needed to track this as a
* linux cloned thread.
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c
index 87bb9bde1b..36d6886ecf 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c
@@ -18,6 +18,7 @@
#include <sys/resource.h>
#include <sys/uadmin.h>
#include <sys/lx_misc.h>
+#include <lx_syscall.h>
#define LINUX_REBOOT_MAGIC1 0xfee1dead
#define LINUX_REBOOT_MAGIC2 672274793
@@ -68,6 +69,18 @@ extern int getitimer(uint_t, struct itimerval *);
extern int stime(time_t);
/* From uts/common/syscall/uadmin.c */
extern int uadmin(int, int, uintptr_t);
+/* From uts/common/syscall/chdir.c */
+extern int chdir_proc(proc_t *, vnode_t *, boolean_t, boolean_t);
+/* From uts/common/fs/lookup.c */
+extern int lookupname(char *, enum uio_seg, int, vnode_t **, vnode_t **);
+/* From uts/common/fs/fs_subr.c */
+extern int fs_need_estale_retry(int);
+
+/* The callback arguments when handling a FS clone group. */
+typedef struct {
+ vnode_t *lcfa_vp;
+ boolean_t lcfa_type;
+} lx_clone_fs_arg_t;
long
lx_alarm(int seconds)
@@ -75,15 +88,89 @@ lx_alarm(int seconds)
return (alarm(seconds));
}
+static int
+lx_clone_fs_cb(proc_t *pp, void *arg)
+{
+ lx_clone_fs_arg_t *ap = (lx_clone_fs_arg_t *)arg;
+ int err;
+
+ /*
+ * The initial lookupname() from lx_clone_fs_do_group() will have added
+ * a hold on the vnode to ensure its existence throughout the walk. We
+ * need to add another hold for each process in the group.
+ */
+ VN_HOLD(ap->lcfa_vp);
+ if ((err = chdir_proc(pp, ap->lcfa_vp, ap->lcfa_type, B_TRUE)) != 0) {
+ /* if we failed, chdir_proc already did a rele on vp */
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Check to see if the process is in a CLONE_FS clone group. Return false
+ * if not (the normal case), otherwise perform the setup, do the group walk
+ * and return true.
+ */
+static boolean_t
+lx_clone_fs_do_group(char *path, boolean_t is_chroot, int *errp)
+{
+ lx_proc_data_t *lproc = ttolxproc(curthread);
+ vnode_t *vp;
+ lx_clone_fs_arg_t arg;
+ int err;
+ int estale_retry = 0;
+
+ if (!lx_clone_grp_member(lproc, LX_CLONE_FS))
+ return (B_FALSE);
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+
+retry:
+ err = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (err != 0) {
+ if (err == ESTALE && fs_need_estale_retry(estale_retry++))
+ goto retry;
+ *errp = err;
+ return (B_TRUE);
+ }
+
+ arg.lcfa_vp = vp;
+ arg.lcfa_type = is_chroot;
+
+ /*
+ * We use the VN_HOLD from the lookup to guarantee vp exists for the
+ * entire walk.
+ */
+ err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb,
+ (void *)&arg);
+ VN_RELE(vp);
+ *errp = err;
+ return (B_TRUE);
+}
+
long
lx_chdir(char *path)
{
+ int err;
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_fs_do_group(path, B_FALSE, &err))
+ return ((err != 0) ? set_errno(err) : 0);
+
return (chdir(path));
}
long
lx_chroot(char *path)
{
+ int err;
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_fs_do_group(path, B_TRUE, &err))
+ return ((err != 0) ? set_errno(err) : 0);
+
return (chroot(path));
}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_umask.c b/usr/src/uts/common/brand/lx/syscall/lx_umask.c
index 130af6c776..cb5e4ed232 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_umask.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_umask.c
@@ -14,12 +14,39 @@
*/
#include <sys/types.h>
+#include <sys/lx_misc.h>
+#include <lx_syscall.h>
/* From usr/src/uts/common/syscall/umask.c */
extern int umask(int);
+/*
+ * Just do what umask() does, but for the given process.
+ */
+static int
+lx_clone_umask_cb(proc_t *pp, void *arg)
+{
+ mode_t cmask = (mode_t)(intptr_t)arg;
+ mode_t orig;
+
+ orig = PTOU(pp)->u_cmask;
+ PTOU(pp)->u_cmask = (mode_t)(cmask & PERMMASK);
+ return ((int)orig);
+}
+
long
lx_umask(mode_t cmask)
{
+ lx_proc_data_t *lproc = ttolxproc(curthread);
+
+ /* Handle the rare case of being in a CLONE_FS clone group */
+ if (lx_clone_grp_member(lproc, LX_CLONE_FS)) {
+ int omask;
+
+ omask = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_umask_cb,
+ (void *)(intptr_t)cmask);
+ return (omask);
+ }
+
return (umask(cmask));
}
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c
index 3d3acd6036..f31961b231 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.c
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.c
@@ -102,7 +102,8 @@ struct brand_ops sn1_brops = {
NULL, /* b_sendsig */
NULL, /* b_setid_clear */
NULL, /* b_pagefault */
- B_TRUE /* b_intp_parse_arg */
+ B_TRUE, /* b_intp_parse_arg */
+ NULL /* b_clearbrand */
};
#ifdef sparc
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c
index 50997fba02..c49d605b00 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.c
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.c
@@ -107,7 +107,8 @@ struct brand_ops s10_brops = {
NULL, /* b_sendsig */
NULL, /* b_setid_clear */
NULL, /* b_pagefault */
- B_TRUE /* b_intp_parse_arg */
+ B_TRUE, /* b_intp_parse_arg */
+ NULL /* b_clearbrand */
};
#ifdef sparc
diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c
index 02901d023d..62c3bbe2d6 100644
--- a/usr/src/uts/common/os/brand.c
+++ b/usr/src/uts/common/os/brand.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
#include <sys/kmem.h>
@@ -389,6 +389,9 @@ brand_clearbrand(proc_t *p, boolean_t lwps_ok)
VERIFY(bp != NULL);
VERIFY(PROC_IS_BRANDED(p));
+ if (BROP(p)->b_clearbrand != NULL)
+ BROP(p)->b_clearbrand(p, lwps_ok);
+
mutex_enter(&p->p_lock);
p->p_brand = &native_brand;
brand_data = p->p_brand_data;
diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h
index 231daa5e9d..e50c4e055a 100644
--- a/usr/src/uts/common/sys/brand.h
+++ b/usr/src/uts/common/sys/brand.h
@@ -149,6 +149,7 @@ struct execa;
* b_setid_clear - Override setid_clear behavior
* b_pagefault - Trap pagefault events
* b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all)
+ * b_clearbrand - Perform any actions necessary when clearing the brand.
*/
struct brand_ops {
void (*b_init_brand_data)(zone_t *, kmutex_t *);
@@ -198,6 +199,7 @@ struct brand_ops {
int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type,
enum seg_rw);
boolean_t b_intp_parse_arg;
+ void (*b_clearbrand)(proc_t *, boolean_t);
};
/*
diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c
index 84c924f570..deb5532b50 100644
--- a/usr/src/uts/common/syscall/chdir.c
+++ b/usr/src/uts/common/syscall/chdir.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -62,7 +63,7 @@
/*
* Change current working directory (".").
*/
-static int chdirec(vnode_t *, int ischroot, int do_traverse);
+static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse);
int
chdir(char *fname)
@@ -78,7 +79,7 @@ lookup:
return (set_errno(error));
}
- error = chdirec(vp, 0, 1);
+ error = chdirec(vp, B_FALSE, B_TRUE);
if (error) {
if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
goto lookup;
@@ -102,7 +103,7 @@ fchdir(int fd)
vp = fp->f_vnode;
VN_HOLD(vp);
releasef(fd);
- error = chdirec(vp, 0, 0);
+ error = chdirec(vp, B_FALSE, B_FALSE);
if (error)
return (set_errno(error));
return (0);
@@ -125,7 +126,7 @@ lookup:
return (set_errno(error));
}
- error = chdirec(vp, 1, 1);
+ error = chdirec(vp, B_TRUE, B_TRUE);
if (error) {
if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
goto lookup;
@@ -152,18 +153,18 @@ fchroot(int fd)
vp = fp->f_vnode;
VN_HOLD(vp);
releasef(fd);
- error = chdirec(vp, 1, 0);
+ error = chdirec(vp, B_TRUE, B_FALSE);
if (error)
return (set_errno(error));
return (0);
}
static int
-chdirec(vnode_t *vp, int ischroot, int do_traverse)
+chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot,
+ boolean_t do_traverse)
{
int error;
vnode_t *oldvp;
- proc_t *pp = curproc;
vnode_t **vpp;
refstr_t *cwd;
int newcwd = 1;
@@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse)
if (ischroot) {
struct vattr tattr;
struct vattr rattr;
- vnode_t *zonevp = curproc->p_zone->zone_rootvp;
+ vnode_t *zonevp = pp->p_zone->zone_rootvp;
tattr.va_mask = AT_FSID|AT_NODEID;
if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL))
@@ -243,3 +244,15 @@ bad:
VN_RELE(vp);
return (error);
}
+
+int
+chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse)
+{
+ return (chdirec_common(pp, vp, ischroot, do_traverse));
+}
+
+static int
+chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse)
+{
+ return (chdirec_common(curproc, vp, ischroot, do_traverse));
+}