diff options
author | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
---|---|---|
committer | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
commit | f154da9e12608589e8d5f0508f908a0c3e88a1bb (patch) | |
tree | f8255d51e10c6f1e0ed69702200b966c9556a431 /src/syscall/exec_linux.go | |
parent | 8d8329ed5dfb9622c82a9fbec6fd99a580f9c9f6 (diff) | |
download | golang-upstream/1.4.tar.gz |
Imported Upstream version 1.4upstream/1.4
Diffstat (limited to 'src/syscall/exec_linux.go')
-rw-r--r-- | src/syscall/exec_linux.go | 363 |
1 files changed, 363 insertions, 0 deletions
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go new file mode 100644 index 000000000..042c20a46 --- /dev/null +++ b/src/syscall/exec_linux.go @@ -0,0 +1,363 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +package syscall + +import ( + "unsafe" +) + +// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. +// See user_namespaces(7). +type SysProcIDMap struct { + ContainerID int // Container ID. + HostID int // Host ID. + Size int // Size. +} + +type SysProcAttr struct { + Chroot string // Chroot. + Credential *Credential // Credential. + Ptrace bool // Enable tracing. + Setsid bool // Create session. + Setpgid bool // Set process group ID to new pid (SYSV setpgrp) + Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) + Noctty bool // Detach fd 0 from controlling terminal + Ctty int // Controlling TTY fd (Linux only) + Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) + Cloneflags uintptr // Flags for clone calls (Linux only) + UidMappings []SysProcIDMap // User ID mappings for user namespaces. + GidMappings []SysProcIDMap // Group ID mappings for user namespaces. +} + +// Implemented in runtime package. +func runtime_BeforeFork() +func runtime_AfterFork() + +// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. +// If a dup or exec fails, write the errno error to pipe. +// (Pipe is close-on-exec so if exec succeeds, it will be closed.) +// In the child, this function must not acquire any locks, because +// they might have been locked at the time of the fork. This means +// no rescheduling, no malloc calls, and no new stack segments. +// For the same reason compiler does not race instrument it. +// The calls to RawSyscall are okay because they are assembly +// functions that do not grow the stack. +func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { + // Declare all variables at top in case any + // declarations require heap allocation (e.g., err1). + var ( + r1 uintptr + err1 Errno + err2 Errno + nextfd int + i int + p [2]int + ) + + // Guard against side effects of shuffling fds below. + // Make sure that nextfd is beyond any currently open files so + // that we can't run the risk of overwriting any of them. + fd := make([]int, len(attr.Files)) + nextfd = len(attr.Files) + for i, ufd := range attr.Files { + if nextfd < int(ufd) { + nextfd = int(ufd) + } + fd[i] = int(ufd) + } + nextfd++ + + // Allocate another pipe for parent to child communication for + // synchronizing writing of User ID/Group ID mappings. + if sys.UidMappings != nil || sys.GidMappings != nil { + if err := forkExecPipe(p[:]); err != nil { + return 0, err.(Errno) + } + } + + // About to call fork. + // No more allocation or calls of non-assembly functions. + runtime_BeforeFork() + r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) + if err1 != 0 { + runtime_AfterFork() + return 0, err1 + } + + if r1 != 0 { + // parent; return PID + runtime_AfterFork() + pid = int(r1) + + if sys.UidMappings != nil || sys.GidMappings != nil { + Close(p[0]) + err := writeUidGidMappings(pid, sys) + if err != nil { + err2 = err.(Errno) + } + RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) + Close(p[1]) + } + + return pid, 0 + } + + // Fork succeeded, now in child. + + // Wait for User ID/Group ID mappings to be written. + if sys.UidMappings != nil || sys.GidMappings != nil { + if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { + goto childerror + } + r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) + if err1 != 0 { + goto childerror + } + if r1 != unsafe.Sizeof(err2) { + err1 = EINVAL + goto childerror + } + if err2 != 0 { + err1 = err2 + goto childerror + } + } + + // Parent death signal + if sys.Pdeathsig != 0 { + _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) + if err1 != 0 { + goto childerror + } + + // Signal self if parent is already dead. This might cause a + // duplicate signal in rare cases, but it won't matter when + // using SIGKILL. + r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0) + if r1 == 1 { + pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) + _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) + if err1 != 0 { + goto childerror + } + } + } + + // Enable tracing if requested. + if sys.Ptrace { + _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0) + if err1 != 0 { + goto childerror + } + } + + // Session ID + if sys.Setsid { + _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0) + if err1 != 0 { + goto childerror + } + } + + // Set process group + if sys.Setpgid { + _, _, err1 = RawSyscall(SYS_SETPGID, 0, 0, 0) + if err1 != 0 { + goto childerror + } + } + + // Chroot + if chroot != nil { + _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) + if err1 != 0 { + goto childerror + } + } + + // User and groups + if cred := sys.Credential; cred != nil { + ngroups := uintptr(len(cred.Groups)) + var groups unsafe.Pointer + if ngroups > 0 { + groups = unsafe.Pointer(&cred.Groups[0]) + } + _, _, err1 = RawSyscall(SYS_SETGROUPS, ngroups, uintptr(groups), 0) + if err1 != 0 { + goto childerror + } + _, _, err1 = RawSyscall(SYS_SETGID, uintptr(cred.Gid), 0, 0) + if err1 != 0 { + goto childerror + } + _, _, err1 = RawSyscall(SYS_SETUID, uintptr(cred.Uid), 0, 0) + if err1 != 0 { + goto childerror + } + } + + // Chdir + if dir != nil { + _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) + if err1 != 0 { + goto childerror + } + } + + // Pass 1: look for fd[i] < i and move those up above len(fd) + // so that pass 2 won't stomp on an fd it needs later. + if pipe < nextfd { + _, _, err1 = RawSyscall(SYS_DUP2, uintptr(pipe), uintptr(nextfd), 0) + if err1 != 0 { + goto childerror + } + RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) + pipe = nextfd + nextfd++ + } + for i = 0; i < len(fd); i++ { + if fd[i] >= 0 && fd[i] < int(i) { + _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0) + if err1 != 0 { + goto childerror + } + RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) + fd[i] = nextfd + nextfd++ + if nextfd == pipe { // don't stomp on pipe + nextfd++ + } + } + } + + // Pass 2: dup fd[i] down onto i. + for i = 0; i < len(fd); i++ { + if fd[i] == -1 { + RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) + continue + } + if fd[i] == int(i) { + // dup2(i, i) won't clear close-on-exec flag on Linux, + // probably not elsewhere either. + _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0) + if err1 != 0 { + goto childerror + } + continue + } + // The new fd is created NOT close-on-exec, + // which is exactly what we want. + _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(i), 0) + if err1 != 0 { + goto childerror + } + } + + // By convention, we don't close-on-exec the fds we are + // started with, so if len(fd) < 3, close 0, 1, 2 as needed. + // Programs that know they inherit fds >= 3 will need + // to set them close-on-exec. + for i = len(fd); i < 3; i++ { + RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) + } + + // Detach fd 0 from tty + if sys.Noctty { + _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0) + if err1 != 0 { + goto childerror + } + } + + // Set the controlling TTY to Ctty + if sys.Setctty && sys.Ctty >= 0 { + _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0) + if err1 != 0 { + goto childerror + } + } + + // Time to exec. + _, _, err1 = RawSyscall(SYS_EXECVE, + uintptr(unsafe.Pointer(argv0)), + uintptr(unsafe.Pointer(&argv[0])), + uintptr(unsafe.Pointer(&envv[0]))) + +childerror: + // send error code on pipe + RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) + for { + RawSyscall(SYS_EXIT, 253, 0, 0) + } +} + +// Try to open a pipe with O_CLOEXEC set on both file descriptors. +func forkExecPipe(p []int) (err error) { + err = Pipe2(p, O_CLOEXEC) + // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it + // might not be implemented. + if err == ENOSYS { + if err = Pipe(p); err != nil { + return + } + if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { + return + } + _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) + } + return +} + +// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. +func writeIDMappings(path string, idMap []SysProcIDMap) error { + fd, err := Open(path, O_RDWR, 0) + if err != nil { + return err + } + + data := "" + for _, im := range idMap { + data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" + } + + bytes, err := ByteSliceFromString(data) + if err != nil { + Close(fd) + return err + } + + if _, err := Write(fd, bytes); err != nil { + Close(fd) + return err + } + + if err := Close(fd); err != nil { + return err + } + + return nil +} + +// writeUidGidMappings writes User ID and Group ID mappings for user namespaces +// for a process and it is called from the parent process. +func writeUidGidMappings(pid int, sys *SysProcAttr) error { + if sys.UidMappings != nil { + uidf := "/proc/" + itoa(pid) + "/uid_map" + if err := writeIDMappings(uidf, sys.UidMappings); err != nil { + return err + } + } + + if sys.GidMappings != nil { + gidf := "/proc/" + itoa(pid) + "/gid_map" + if err := writeIDMappings(gidf, sys.GidMappings); err != nil { + return err + } + } + + return nil +} |