...
Run Format

Source file src/syscall/exec_linux.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build linux
     6	
     7	package syscall
     8	
     9	import (
    10		"runtime"
    11		"unsafe"
    12	)
    13	
// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
// See user_namespaces(7).
//
// Each value is written by writeIDMappings as one "ContainerID HostID Size"
// line of the child's /proc/PID/uid_map or /proc/PID/gid_map file.
type SysProcIDMap struct {
	ContainerID int // Container ID (first field of the map line).
	HostID      int // Host ID (second field of the map line).
	Size        int // Size (third field of the map line).
}
    21	
// SysProcAttr holds the optional, Linux-specific attributes that
// forkAndExecInChild applies to the child process between fork and exec.
type SysProcAttr struct {
	Chroot       string         // Chroot.
	Credential   *Credential    // Credential; if non-nil the child sets its groups, gid and uid from it.
	Ptrace       bool           // Enable tracing (the child issues PTRACE_TRACEME).
	Setsid       bool           // Create session.
	Setpgid      bool           // Set process group ID to Pgid, or, if Pgid == 0, to new pid.
	Setctty      bool           // Set controlling terminal to fd Ctty (only meaningful if Setsid is set)
	Noctty       bool           // Detach fd 0 from controlling terminal
	Ctty         int            // Controlling TTY fd
	Foreground   bool           // Place child's process group in foreground. (Implies Setpgid. Uses Ctty as fd of controlling TTY)
	Pgid         int            // Child's process group ID if Setpgid.
	Pdeathsig    Signal         // Signal that the process will get when its parent dies (Linux only)
	Cloneflags   uintptr        // Flags for clone calls (Linux only); ORed with SIGCHLD.
	Unshareflags uintptr        // Flags for unshare calls (Linux only)
	UidMappings  []SysProcIDMap // User ID mappings for user namespaces.
	GidMappings  []SysProcIDMap // Group ID mappings for user namespaces.
	// GidMappingsEnableSetgroups enables the setgroups syscall.
	// If false, then the setgroups syscall will be disabled for the child process.
	// This parameter is a no-op if GidMappings == nil. Otherwise, for unprivileged
	// users this should be set to false for the mappings to work.
	GidMappingsEnableSetgroups bool
}
    44	
// Implemented in runtime package.
// forkAndExecInChild calls runtime_BeforeFork immediately before the clone
// system call and runtime_AfterFork in the parent once clone has returned
// (on both the success and the failure path).
func runtime_BeforeFork()
func runtime_AfterFork()
    48	
    49	// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
    50	// If a dup or exec fails, write the errno error to pipe.
    51	// (Pipe is close-on-exec so if exec succeeds, it will be closed.)
    52	// In the child, this function must not acquire any locks, because
    53	// they might have been locked at the time of the fork. This means
    54	// no rescheduling, no malloc calls, and no new stack segments.
    55	// For the same reason compiler does not race instrument it.
    56	// The calls to RawSyscall are okay because they are assembly
    57	// functions that do not grow the stack.
    58	//go:norace
    59	func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
    60		// Declare all variables at top in case any
    61		// declarations require heap allocation (e.g., err1).
    62		var (
    63			r1     uintptr
    64			err1   Errno
    65			err2   Errno
    66			nextfd int
    67			i      int
    68			p      [2]int
    69		)
    70	
    71		// Record parent PID so child can test if it has died.
    72		ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
    73	
    74		// Guard against side effects of shuffling fds below.
    75		// Make sure that nextfd is beyond any currently open files so
    76		// that we can't run the risk of overwriting any of them.
    77		fd := make([]int, len(attr.Files))
    78		nextfd = len(attr.Files)
    79		for i, ufd := range attr.Files {
    80			if nextfd < int(ufd) {
    81				nextfd = int(ufd)
    82			}
    83			fd[i] = int(ufd)
    84		}
    85		nextfd++
    86	
    87		// Allocate another pipe for parent to child communication for
    88		// synchronizing writing of User ID/Group ID mappings.
    89		if sys.UidMappings != nil || sys.GidMappings != nil {
    90			if err := forkExecPipe(p[:]); err != nil {
    91				return 0, err.(Errno)
    92			}
    93		}
    94	
    95		// About to call fork.
    96		// No more allocation or calls of non-assembly functions.
    97		runtime_BeforeFork()
    98		if runtime.GOARCH == "s390x" {
    99			r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
   100		} else {
   101			r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
   102		}
   103		if err1 != 0 {
   104			runtime_AfterFork()
   105			return 0, err1
   106		}
   107	
   108		if r1 != 0 {
   109			// parent; return PID
   110			runtime_AfterFork()
   111			pid = int(r1)
   112	
   113			if sys.UidMappings != nil || sys.GidMappings != nil {
   114				Close(p[0])
   115				err := writeUidGidMappings(pid, sys)
   116				if err != nil {
   117					err2 = err.(Errno)
   118				}
   119				RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   120				Close(p[1])
   121			}
   122	
   123			return pid, 0
   124		}
   125	
   126		// Fork succeeded, now in child.
   127	
   128		// Wait for User ID/Group ID mappings to be written.
   129		if sys.UidMappings != nil || sys.GidMappings != nil {
   130			if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
   131				goto childerror
   132			}
   133			r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   134			if err1 != 0 {
   135				goto childerror
   136			}
   137			if r1 != unsafe.Sizeof(err2) {
   138				err1 = EINVAL
   139				goto childerror
   140			}
   141			if err2 != 0 {
   142				err1 = err2
   143				goto childerror
   144			}
   145		}
   146	
   147		// Enable tracing if requested.
   148		if sys.Ptrace {
   149			_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
   150			if err1 != 0 {
   151				goto childerror
   152			}
   153		}
   154	
   155		// Session ID
   156		if sys.Setsid {
   157			_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
   158			if err1 != 0 {
   159				goto childerror
   160			}
   161		}
   162	
   163		// Set process group
   164		if sys.Setpgid || sys.Foreground {
   165			// Place child in process group.
   166			_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
   167			if err1 != 0 {
   168				goto childerror
   169			}
   170		}
   171	
   172		if sys.Foreground {
   173			pgrp := int32(sys.Pgid)
   174			if pgrp == 0 {
   175				r1, _, err1 = RawSyscall(SYS_GETPID, 0, 0, 0)
   176				if err1 != 0 {
   177					goto childerror
   178				}
   179	
   180				pgrp = int32(r1)
   181			}
   182	
   183			// Place process group in foreground.
   184			_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
   185			if err1 != 0 {
   186				goto childerror
   187			}
   188		}
   189	
   190		// Chroot
   191		if chroot != nil {
   192			_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
   193			if err1 != 0 {
   194				goto childerror
   195			}
   196		}
   197	
   198		// Unshare
   199		if sys.Unshareflags != 0 {
   200			_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
   201			if err1 != 0 {
   202				goto childerror
   203			}
   204		}
   205	
   206		// User and groups
   207		if cred := sys.Credential; cred != nil {
   208			ngroups := uintptr(len(cred.Groups))
   209			groups := uintptr(0)
   210			if ngroups > 0 {
   211				groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
   212			}
   213			// Don't call setgroups in case of user namespace, gid mappings
   214			// and disabled setgroups, because otherwise unprivileged user namespace
   215			// will fail with any non-empty SysProcAttr.Credential.
   216			if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) {
   217				_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
   218				if err1 != 0 {
   219					goto childerror
   220				}
   221			}
   222			_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
   223			if err1 != 0 {
   224				goto childerror
   225			}
   226			_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
   227			if err1 != 0 {
   228				goto childerror
   229			}
   230		}
   231	
   232		// Chdir
   233		if dir != nil {
   234			_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
   235			if err1 != 0 {
   236				goto childerror
   237			}
   238		}
   239	
   240		// Parent death signal
   241		if sys.Pdeathsig != 0 {
   242			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
   243			if err1 != 0 {
   244				goto childerror
   245			}
   246	
   247			// Signal self if parent is already dead. This might cause a
   248			// duplicate signal in rare cases, but it won't matter when
   249			// using SIGKILL.
   250			r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0)
   251			if r1 != ppid {
   252				pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
   253				_, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
   254				if err1 != 0 {
   255					goto childerror
   256				}
   257			}
   258		}
   259	
   260		// Pass 1: look for fd[i] < i and move those up above len(fd)
   261		// so that pass 2 won't stomp on an fd it needs later.
   262		if pipe < nextfd {
   263			_, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
   264			if err1 != 0 {
   265				goto childerror
   266			}
   267			RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
   268			pipe = nextfd
   269			nextfd++
   270		}
   271		for i = 0; i < len(fd); i++ {
   272			if fd[i] >= 0 && fd[i] < int(i) {
   273				if nextfd == pipe { // don't stomp on pipe
   274					nextfd++
   275				}
   276				_, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
   277				if err1 != 0 {
   278					goto childerror
   279				}
   280				RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
   281				fd[i] = nextfd
   282				nextfd++
   283			}
   284		}
   285	
   286		// Pass 2: dup fd[i] down onto i.
   287		for i = 0; i < len(fd); i++ {
   288			if fd[i] == -1 {
   289				RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   290				continue
   291			}
   292			if fd[i] == int(i) {
   293				// dup2(i, i) won't clear close-on-exec flag on Linux,
   294				// probably not elsewhere either.
   295				_, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0)
   296				if err1 != 0 {
   297					goto childerror
   298				}
   299				continue
   300			}
   301			// The new fd is created NOT close-on-exec,
   302			// which is exactly what we want.
   303			_, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
   304			if err1 != 0 {
   305				goto childerror
   306			}
   307		}
   308	
   309		// By convention, we don't close-on-exec the fds we are
   310		// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
   311		// Programs that know they inherit fds >= 3 will need
   312		// to set them close-on-exec.
   313		for i = len(fd); i < 3; i++ {
   314			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   315		}
   316	
   317		// Detach fd 0 from tty
   318		if sys.Noctty {
   319			_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
   320			if err1 != 0 {
   321				goto childerror
   322			}
   323		}
   324	
   325		// Set the controlling TTY to Ctty
   326		if sys.Setctty {
   327			_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0)
   328			if err1 != 0 {
   329				goto childerror
   330			}
   331		}
   332	
   333		// Time to exec.
   334		_, _, err1 = RawSyscall(SYS_EXECVE,
   335			uintptr(unsafe.Pointer(argv0)),
   336			uintptr(unsafe.Pointer(&argv[0])),
   337			uintptr(unsafe.Pointer(&envv[0])))
   338	
   339	childerror:
   340		// send error code on pipe
   341		RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
   342		for {
   343			RawSyscall(SYS_EXIT, 253, 0, 0)
   344		}
   345	}
   346	
   347	// Try to open a pipe with O_CLOEXEC set on both file descriptors.
   348	func forkExecPipe(p []int) (err error) {
   349		err = Pipe2(p, O_CLOEXEC)
   350		// pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
   351		// might not be implemented.
   352		if err == ENOSYS {
   353			if err = Pipe(p); err != nil {
   354				return
   355			}
   356			if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
   357				return
   358			}
   359			_, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
   360		}
   361		return
   362	}
   363	
   364	// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
   365	func writeIDMappings(path string, idMap []SysProcIDMap) error {
   366		fd, err := Open(path, O_RDWR, 0)
   367		if err != nil {
   368			return err
   369		}
   370	
   371		data := ""
   372		for _, im := range idMap {
   373			data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n"
   374		}
   375	
   376		bytes, err := ByteSliceFromString(data)
   377		if err != nil {
   378			Close(fd)
   379			return err
   380		}
   381	
   382		if _, err := Write(fd, bytes); err != nil {
   383			Close(fd)
   384			return err
   385		}
   386	
   387		if err := Close(fd); err != nil {
   388			return err
   389		}
   390	
   391		return nil
   392	}
   393	
   394	// writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
   395	// and "allow" if enable is true.
   396	// This is needed since kernel 3.19, because you can't write gid_map without
   397	// disabling setgroups() system call.
   398	func writeSetgroups(pid int, enable bool) error {
   399		sgf := "/proc/" + itoa(pid) + "/setgroups"
   400		fd, err := Open(sgf, O_RDWR, 0)
   401		if err != nil {
   402			return err
   403		}
   404	
   405		var data []byte
   406		if enable {
   407			data = []byte("allow")
   408		} else {
   409			data = []byte("deny")
   410		}
   411	
   412		if _, err := Write(fd, data); err != nil {
   413			Close(fd)
   414			return err
   415		}
   416	
   417		return Close(fd)
   418	}
   419	
   420	// writeUidGidMappings writes User ID and Group ID mappings for user namespaces
   421	// for a process and it is called from the parent process.
   422	func writeUidGidMappings(pid int, sys *SysProcAttr) error {
   423		if sys.UidMappings != nil {
   424			uidf := "/proc/" + itoa(pid) + "/uid_map"
   425			if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
   426				return err
   427			}
   428		}
   429	
   430		if sys.GidMappings != nil {
   431			// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
   432			if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
   433				return err
   434			}
   435			gidf := "/proc/" + itoa(pid) + "/gid_map"
   436			if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
   437				return err
   438			}
   439		}
   440	
   441		return nil
   442	}
   443	

View as plain text