Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "runtime"
11 "unsafe"
12 )
13
14
15
// SysProcIDMap describes one ID range mapping for a user namespace.
// formatIDMappings renders each entry as "ContainerID HostID Size\n",
// and the result is written to a process's uid_map or gid_map file
// under /proc.
type SysProcIDMap struct {
	// ContainerID is the first field of the rendered mapping line.
	// NOTE(review): presumably the first ID of the range as seen inside
	// the namespace — confirm against user_namespaces(7).
	ContainerID int
	// HostID is the second field of the rendered mapping line.
	// NOTE(review): presumably the first ID of the range as seen outside
	// the namespace — confirm.
	HostID int
	// Size is the third field of the rendered mapping line.
	// NOTE(review): presumably the number of consecutive IDs mapped — confirm.
	Size int
}
21
// SysProcAttr holds the Linux-specific attributes that
// forkAndExecInChild1 applies to the child process between the clone
// system call and execve.
type SysProcAttr struct {
	// Chroot is not read directly in this file.
	// NOTE(review): presumably the source of the chroot argument handed
	// to forkAndExecInChild — confirm against the caller.
	Chroot string
	// Credential, when non-nil, makes the child call setgroups (unless
	// suppressed, see GidMappingsEnableSetgroups and
	// Credential.NoSetGroups), then setgid and setuid with its values.
	Credential *Credential
	// Ptrace makes the child call ptrace(PTRACE_TRACEME) just before
	// execve.
	Ptrace bool
	// Setsid makes the child call setsid.
	Setsid bool
	// Setpgid makes the child call setpgid(0, Pgid).
	Setpgid bool
	// Setctty makes the child issue the TIOCSCTTY ioctl on descriptor
	// Ctty. The ioctl runs after the child's descriptors have been
	// rearranged, so Ctty is interpreted in the child's final
	// descriptor numbering.
	Setctty bool
	// Noctty makes the child issue the TIOCNOTTY ioctl on descriptor 0.
	Noctty bool
	// Ctty is the terminal descriptor used by Setctty and Foreground.
	Ctty int
	// Foreground makes the child call setpgid(0, Pgid) and then issue
	// the TIOCSPGRP ioctl on Ctty with Pgid, or with the child's own
	// pid when Pgid is 0. The ioctl runs before the child's descriptors
	// are rearranged, so Ctty is interpreted in the numbering inherited
	// from the parent.
	Foreground bool
	// Pgid is the process group ID used by Setpgid and Foreground.
	Pgid int
	// Pdeathsig, when non-zero, is installed with
	// prctl(PR_SET_PDEATHSIG). If the parent pid has already changed by
	// then, the child sends the signal to itself.
	Pdeathsig Signal
	// Cloneflags is ORed into the flags passed to the clone system call.
	Cloneflags uintptr
	// Unshareflags, when non-zero, is passed to the unshare system call
	// in the child.
	Unshareflags uintptr
	// UidMappings is written to the child's uid_map file: by the parent
	// via /proc/<pid>/uid_map, or by the child itself via
	// /proc/self/uid_map when Unshareflags contains CLONE_NEWUSER.
	UidMappings []SysProcIDMap
	// GidMappings is written to the child's gid_map file, following the
	// same parent/child split as UidMappings.
	GidMappings []SysProcIDMap
	// GidMappingsEnableSetgroups selects what is written to the child's
	// setgroups file before gid_map: "allow" when true, "deny" when
	// false. When it is false, GidMappings is set and Credential.Groups
	// is empty, the child skips the setgroups system call.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capability numbers that the child adds to its
	// permitted and inheritable sets and then raises with
	// prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE).
	AmbientCaps []uintptr
}
59
// NUL-terminated byte strings whose addresses are passed straight to
// the mount system call in the forked child: none is the source
// argument and slash is the target argument.
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
64
65
// Fork hooks declared without a body in this file.
// NOTE(review): presumably implemented in the runtime package and
// linked in — confirm.
//
// Within this file runtime_BeforeFork is called immediately before the
// clone system call, runtime_AfterFork is called by the parent once
// forkAndExecInChild1 has returned with locked set, and
// runtime_AfterForkInChild is the first call made in the child.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
69
70
71
72
73
74
75
76
77
78
79
80 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
81
82
83 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
84 if locked {
85 runtime_AfterFork()
86 }
87 if err1 != 0 {
88 return 0, err1
89 }
90
91
92 pid = int(r1)
93
94 if sys.UidMappings != nil || sys.GidMappings != nil {
95 Close(p[0])
96 var err2 Errno
97
98
99 if sys.Unshareflags&CLONE_NEWUSER == 0 {
100 if err := writeUidGidMappings(pid, sys); err != nil {
101 err2 = err.(Errno)
102 }
103 }
104 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
105 Close(p[1])
106 }
107
108 return pid, 0
109 }
110
// _LINUX_CAPABILITY_VERSION_3 is the version value stored in
// capHeader.version before the capget and capset system calls made in
// forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
112
// capHeader is the header whose address is passed as the first argument
// to the capget and capset system calls.
// NOTE(review): presumably mirrors the kernel's capability header
// layout — confirm against linux/capability.h.
type capHeader struct {
	// version is set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	version uint32
	// pid is left at its zero value by the code in this file.
	pid int32
}
117
// capData is one 32-bit slice of the capability sets exchanged with the
// capget and capset system calls. forkAndExecInChild1 sets bits in
// permitted and inheritable for each requested ambient capability;
// effective is passed through as read.
type capData struct {
	effective, permitted, inheritable uint32
}
// caps bundles the header and data arguments for the capget and capset
// system calls. data is indexed by capToIndex, so data[0] holds
// capabilities 0-31 and data[1] holds capabilities 32-63.
type caps struct {
	hdr  capHeader
	data [2]capData
}
127
128
// capToIndex returns the index of the 32-bit word that holds capability
// number cap (cap divided by 32); it is used to index caps.data.
func capToIndex(cap uintptr) uintptr {
	return cap / 32
}
130
131
// capToMask returns the single-bit mask for capability number cap
// within its 32-bit word, i.e. bit (cap mod 32).
func capToMask(cap uintptr) uint32 {
	return uint32(1) << (cap % 32)
}
133
134
135
136
137
138
139
140
141
142
143
144 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
145
146 const (
147 PR_CAP_AMBIENT = 0x2f
148 PR_CAP_AMBIENT_RAISE = 0x2
149 )
150
151
152
153
154
155
156
157
158 var (
159 err2 Errno
160 nextfd int
161 i int
162 caps caps
163 fd1 uintptr
164 puid, psetgroups, pgid []byte
165 uidmap, setgroups, gidmap []byte
166 )
167
168 if sys.UidMappings != nil {
169 puid = []byte("/proc/self/uid_map\000")
170 uidmap = formatIDMappings(sys.UidMappings)
171 }
172
173 if sys.GidMappings != nil {
174 psetgroups = []byte("/proc/self/setgroups\000")
175 pgid = []byte("/proc/self/gid_map\000")
176
177 if sys.GidMappingsEnableSetgroups {
178 setgroups = []byte("allow\000")
179 } else {
180 setgroups = []byte("deny\000")
181 }
182 gidmap = formatIDMappings(sys.GidMappings)
183 }
184
185
186 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
187
188
189
190
191 fd := make([]int, len(attr.Files))
192 nextfd = len(attr.Files)
193 for i, ufd := range attr.Files {
194 if nextfd < int(ufd) {
195 nextfd = int(ufd)
196 }
197 fd[i] = int(ufd)
198 }
199 nextfd++
200
201
202
203 if sys.UidMappings != nil || sys.GidMappings != nil {
204 if err := forkExecPipe(p[:]); err != nil {
205 err1 = err.(Errno)
206 return
207 }
208 }
209
210 var hasRawVforkSyscall bool
211 switch runtime.GOARCH {
212 case "amd64", "arm64", "ppc64", "riscv64", "s390x":
213 hasRawVforkSyscall = true
214 }
215
216
217
218 runtime_BeforeFork()
219 locked = true
220 switch {
221 case hasRawVforkSyscall && (sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0):
222 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
223 case runtime.GOARCH == "s390x":
224 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
225 default:
226 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
227 }
228 if err1 != 0 || r1 != 0 {
229
230
231
232
233
234
235 return
236 }
237
238
239
240 runtime_AfterForkInChild()
241
242
243 if len(sys.AmbientCaps) > 0 {
244 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
245 if err1 != 0 {
246 goto childerror
247 }
248 }
249
250
251 if sys.UidMappings != nil || sys.GidMappings != nil {
252 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
253 goto childerror
254 }
255 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
256 if err1 != 0 {
257 goto childerror
258 }
259 if r1 != unsafe.Sizeof(err2) {
260 err1 = EINVAL
261 goto childerror
262 }
263 if err2 != 0 {
264 err1 = err2
265 goto childerror
266 }
267 }
268
269
270 if sys.Setsid {
271 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
272 if err1 != 0 {
273 goto childerror
274 }
275 }
276
277
278 if sys.Setpgid || sys.Foreground {
279
280 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
281 if err1 != 0 {
282 goto childerror
283 }
284 }
285
286 if sys.Foreground {
287 pgrp := int32(sys.Pgid)
288 if pgrp == 0 {
289 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
290
291 pgrp = int32(r1)
292 }
293
294
295 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
296 if err1 != 0 {
297 goto childerror
298 }
299 }
300
301
302 if sys.Unshareflags != 0 {
303 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
304 if err1 != 0 {
305 goto childerror
306 }
307
308 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
309 dirfd := int(_AT_FDCWD)
310 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
311 goto childerror
312 }
313 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
314 if err1 != 0 {
315 goto childerror
316 }
317 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
318 goto childerror
319 }
320
321 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
322 goto childerror
323 }
324 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
325 if err1 != 0 {
326 goto childerror
327 }
328 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
329 goto childerror
330 }
331 }
332
333 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
334 dirfd := int(_AT_FDCWD)
335 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
336 goto childerror
337 }
338 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
339 if err1 != 0 {
340 goto childerror
341 }
342 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
343 goto childerror
344 }
345 }
346
347
348
349
350
351
352
353
354 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
355 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
356 if err1 != 0 {
357 goto childerror
358 }
359 }
360 }
361
362
363 if chroot != nil {
364 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
365 if err1 != 0 {
366 goto childerror
367 }
368 }
369
370
371 if cred := sys.Credential; cred != nil {
372 ngroups := uintptr(len(cred.Groups))
373 groups := uintptr(0)
374 if ngroups > 0 {
375 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
376 }
377 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
378 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
379 if err1 != 0 {
380 goto childerror
381 }
382 }
383 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
384 if err1 != 0 {
385 goto childerror
386 }
387 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
388 if err1 != 0 {
389 goto childerror
390 }
391 }
392
393 if len(sys.AmbientCaps) != 0 {
394
395
396 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
397
398 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
399 goto childerror
400 }
401
402 for _, c := range sys.AmbientCaps {
403
404
405 caps.data[capToIndex(c)].permitted |= capToMask(c)
406 caps.data[capToIndex(c)].inheritable |= capToMask(c)
407 }
408
409 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
410 goto childerror
411 }
412
413 for _, c := range sys.AmbientCaps {
414 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
415 if err1 != 0 {
416 goto childerror
417 }
418 }
419 }
420
421
422 if dir != nil {
423 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
424 if err1 != 0 {
425 goto childerror
426 }
427 }
428
429
430 if sys.Pdeathsig != 0 {
431 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
432 if err1 != 0 {
433 goto childerror
434 }
435
436
437
438
439 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
440 if r1 != ppid {
441 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
442 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
443 if err1 != 0 {
444 goto childerror
445 }
446 }
447 }
448
449
450
451 if pipe < nextfd {
452 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
453 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
454 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
455 if err1 != 0 {
456 goto childerror
457 }
458 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
459 } else if err1 != 0 {
460 goto childerror
461 }
462 pipe = nextfd
463 nextfd++
464 }
465 for i = 0; i < len(fd); i++ {
466 if fd[i] >= 0 && fd[i] < int(i) {
467 if nextfd == pipe {
468 nextfd++
469 }
470 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
471 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
472 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
473 if err1 != 0 {
474 goto childerror
475 }
476 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
477 } else if err1 != 0 {
478 goto childerror
479 }
480 fd[i] = nextfd
481 nextfd++
482 }
483 }
484
485
486 for i = 0; i < len(fd); i++ {
487 if fd[i] == -1 {
488 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
489 continue
490 }
491 if fd[i] == int(i) {
492
493
494 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
495 if err1 != 0 {
496 goto childerror
497 }
498 continue
499 }
500
501
502 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
503 if err1 != 0 {
504 goto childerror
505 }
506 }
507
508
509
510
511
512 for i = len(fd); i < 3; i++ {
513 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
514 }
515
516
517 if sys.Noctty {
518 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
519 if err1 != 0 {
520 goto childerror
521 }
522 }
523
524
525 if sys.Setctty {
526 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
527 if err1 != 0 {
528 goto childerror
529 }
530 }
531
532
533
534
535 if sys.Ptrace {
536 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
537 if err1 != 0 {
538 goto childerror
539 }
540 }
541
542
543 _, _, err1 = RawSyscall(SYS_EXECVE,
544 uintptr(unsafe.Pointer(argv0)),
545 uintptr(unsafe.Pointer(&argv[0])),
546 uintptr(unsafe.Pointer(&envv[0])))
547
548 childerror:
549
550 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
551 for {
552 RawSyscall(SYS_EXIT, 253, 0, 0)
553 }
554 }
555
556
557 func forkExecPipe(p []int) (err error) {
558 err = Pipe2(p, O_CLOEXEC)
559
560
561 if err == ENOSYS {
562 if err = Pipe(p); err != nil {
563 return
564 }
565 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
566 return
567 }
568 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
569 }
570 return
571 }
572
573 func formatIDMappings(idMap []SysProcIDMap) []byte {
574 var data []byte
575 for _, im := range idMap {
576 data = append(data, []byte(itoa(im.ContainerID)+" "+itoa(im.HostID)+" "+itoa(im.Size)+"\n")...)
577 }
578 return data
579 }
580
581
582 func writeIDMappings(path string, idMap []SysProcIDMap) error {
583 fd, err := Open(path, O_RDWR, 0)
584 if err != nil {
585 return err
586 }
587
588 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
589 Close(fd)
590 return err
591 }
592
593 if err := Close(fd); err != nil {
594 return err
595 }
596
597 return nil
598 }
599
600
601
602
603
604 func writeSetgroups(pid int, enable bool) error {
605 sgf := "/proc/" + itoa(pid) + "/setgroups"
606 fd, err := Open(sgf, O_RDWR, 0)
607 if err != nil {
608 return err
609 }
610
611 var data []byte
612 if enable {
613 data = []byte("allow")
614 } else {
615 data = []byte("deny")
616 }
617
618 if _, err := Write(fd, data); err != nil {
619 Close(fd)
620 return err
621 }
622
623 return Close(fd)
624 }
625
626
627
628 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
629 if sys.UidMappings != nil {
630 uidf := "/proc/" + itoa(pid) + "/uid_map"
631 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
632 return err
633 }
634 }
635
636 if sys.GidMappings != nil {
637
638 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
639 return err
640 }
641 gidf := "/proc/" + itoa(pid) + "/gid_map"
642 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
643 return err
644 }
645 }
646
647 return nil
648 }
649
View as plain text