Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "runtime"
11 "unsafe"
12 )
13
14
15
// SysProcIDMap describes one contiguous range of a user-namespace ID mapping.
// formatIDMappings renders each entry as the line
// "ContainerID HostID Size\n" that is written to a uid_map or gid_map file.
type SysProcIDMap struct {
	// ContainerID is the first ID of the range as seen inside the namespace.
	ContainerID int
	// HostID is the first ID of the range as seen outside the namespace.
	HostID int
	// Size is the number of consecutive IDs covered by the range.
	Size int
}
21
22 type SysProcAttr struct {
23 Chroot string
24 Credential *Credential
25
26
27
28 Ptrace bool
29 Setsid bool
30
31
32 Setpgid bool
33
34
35
36
37 Setctty bool
38 Noctty bool
39 Ctty int
40
41
42
43
44
45 Foreground bool
46 Pgid int
47 Pdeathsig Signal
48 Cloneflags uintptr
49 Unshareflags uintptr
50 UidMappings []SysProcIDMap
51 GidMappings []SysProcIDMap
52
53
54
55
56 GidMappingsEnableSetgroups bool
57 AmbientCaps []uintptr
58 }
59
// NUL-terminated C strings handed to the mount syscall in the child, which
// cannot allocate: "none" is the mount source and "/" the mount target.
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
64
65
// Fork hooks. They are declared without bodies, so their implementations
// are supplied outside this file (presumably by package runtime — confirm).

// runtime_BeforeFork is called in the parent immediately before the clone.
func runtime_BeforeFork()

// runtime_AfterFork is called in the parent once the clone has returned,
// pairing with an earlier runtime_BeforeFork.
func runtime_AfterFork()

// runtime_AfterForkInChild is the first thing the child calls after the clone.
func runtime_AfterForkInChild()
69
70
71
72
73
74
75
76
77
78
79
80 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
81
82
83 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
84 if locked {
85 runtime_AfterFork()
86 }
87 if err1 != 0 {
88 return 0, err1
89 }
90
91
92 pid = int(r1)
93
94 if sys.UidMappings != nil || sys.GidMappings != nil {
95 Close(p[0])
96 var err2 Errno
97
98
99 if sys.Unshareflags&CLONE_NEWUSER == 0 {
100 if err := writeUidGidMappings(pid, sys); err != nil {
101 err2 = err.(Errno)
102 }
103 }
104 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
105 Close(p[1])
106 }
107
108 return pid, 0
109 }
110
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version
// before the capget and capset syscalls are issued.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
112
// capHeader is the header argument passed to the capget and capset syscalls.
type capHeader struct {
	// version is set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	version uint32
	// pid is left at zero by the code in this file.
	pid int32
}
117
// capData holds one 32-bit slice of each capability set exchanged with the
// capget and capset syscalls.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
123 type caps struct {
124 hdr capHeader
125 data [2]capData
126 }
127
128
// capToIndex returns which 32-bit capData element holds capability cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
130
131
// capToMask returns the single-bit mask for capability cap inside its
// 32-bit capData element.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := uint(cap % bitsPerWord)
	return uint32(1) << bit
}
133
134
135
136
137
138
139
140
141
142
143
144 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
145
146 const (
147 PR_CAP_AMBIENT = 0x2f
148 PR_CAP_AMBIENT_RAISE = 0x2
149 )
150
151
152
153
154
155
156
157
158 var (
159 err2 Errno
160 nextfd int
161 i int
162 caps caps
163 fd1 uintptr
164 puid, psetgroups, pgid []byte
165 uidmap, setgroups, gidmap []byte
166 )
167
168 if sys.UidMappings != nil {
169 puid = []byte("/proc/self/uid_map\000")
170 uidmap = formatIDMappings(sys.UidMappings)
171 }
172
173 if sys.GidMappings != nil {
174 psetgroups = []byte("/proc/self/setgroups\000")
175 pgid = []byte("/proc/self/gid_map\000")
176
177 if sys.GidMappingsEnableSetgroups {
178 setgroups = []byte("allow\000")
179 } else {
180 setgroups = []byte("deny\000")
181 }
182 gidmap = formatIDMappings(sys.GidMappings)
183 }
184
185
186 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
187
188
189
190
191 fd := make([]int, len(attr.Files))
192 nextfd = len(attr.Files)
193 for i, ufd := range attr.Files {
194 if nextfd < int(ufd) {
195 nextfd = int(ufd)
196 }
197 fd[i] = int(ufd)
198 }
199 nextfd++
200
201
202
203 if sys.UidMappings != nil || sys.GidMappings != nil {
204 if err := forkExecPipe(p[:]); err != nil {
205 err1 = err.(Errno)
206 return
207 }
208 }
209
210 hasRawVforkSyscall := runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "s390x" || runtime.GOARCH == "arm64"
211
212
213
214 runtime_BeforeFork()
215 locked = true
216 switch {
217 case hasRawVforkSyscall && (sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0):
218 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
219 case runtime.GOARCH == "s390x":
220 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
221 default:
222 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
223 }
224 if err1 != 0 || r1 != 0 {
225
226
227
228
229
230
231 return
232 }
233
234
235
236 runtime_AfterForkInChild()
237
238
239 if len(sys.AmbientCaps) > 0 {
240 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
241 if err1 != 0 {
242 goto childerror
243 }
244 }
245
246
247 if sys.UidMappings != nil || sys.GidMappings != nil {
248 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
249 goto childerror
250 }
251 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
252 if err1 != 0 {
253 goto childerror
254 }
255 if r1 != unsafe.Sizeof(err2) {
256 err1 = EINVAL
257 goto childerror
258 }
259 if err2 != 0 {
260 err1 = err2
261 goto childerror
262 }
263 }
264
265
266 if sys.Setsid {
267 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
268 if err1 != 0 {
269 goto childerror
270 }
271 }
272
273
274 if sys.Setpgid || sys.Foreground {
275
276 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
277 if err1 != 0 {
278 goto childerror
279 }
280 }
281
282 if sys.Foreground {
283 pgrp := int32(sys.Pgid)
284 if pgrp == 0 {
285 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
286
287 pgrp = int32(r1)
288 }
289
290
291 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
292 if err1 != 0 {
293 goto childerror
294 }
295 }
296
297
298 if sys.Unshareflags != 0 {
299 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
300 if err1 != 0 {
301 goto childerror
302 }
303
304 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
305 dirfd := int(_AT_FDCWD)
306 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
307 goto childerror
308 }
309 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
310 if err1 != 0 {
311 goto childerror
312 }
313 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
314 goto childerror
315 }
316
317 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
318 goto childerror
319 }
320 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
321 if err1 != 0 {
322 goto childerror
323 }
324 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
325 goto childerror
326 }
327 }
328
329 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
330 dirfd := int(_AT_FDCWD)
331 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
332 goto childerror
333 }
334 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
335 if err1 != 0 {
336 goto childerror
337 }
338 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
339 goto childerror
340 }
341 }
342
343
344
345
346
347
348
349
350 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
351 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
352 if err1 != 0 {
353 goto childerror
354 }
355 }
356 }
357
358
359 if chroot != nil {
360 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
361 if err1 != 0 {
362 goto childerror
363 }
364 }
365
366
367 if cred := sys.Credential; cred != nil {
368 ngroups := uintptr(len(cred.Groups))
369 groups := uintptr(0)
370 if ngroups > 0 {
371 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
372 }
373 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
374 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
375 if err1 != 0 {
376 goto childerror
377 }
378 }
379 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
380 if err1 != 0 {
381 goto childerror
382 }
383 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
384 if err1 != 0 {
385 goto childerror
386 }
387 }
388
389 if len(sys.AmbientCaps) != 0 {
390
391
392 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
393
394 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
395 goto childerror
396 }
397
398 for _, c := range sys.AmbientCaps {
399
400
401 caps.data[capToIndex(c)].permitted |= capToMask(c)
402 caps.data[capToIndex(c)].inheritable |= capToMask(c)
403 }
404
405 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
406 goto childerror
407 }
408
409 for _, c := range sys.AmbientCaps {
410 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
411 if err1 != 0 {
412 goto childerror
413 }
414 }
415 }
416
417
418 if dir != nil {
419 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
420 if err1 != 0 {
421 goto childerror
422 }
423 }
424
425
426 if sys.Pdeathsig != 0 {
427 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
428 if err1 != 0 {
429 goto childerror
430 }
431
432
433
434
435 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
436 if r1 != ppid {
437 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
438 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
439 if err1 != 0 {
440 goto childerror
441 }
442 }
443 }
444
445
446
447 if pipe < nextfd {
448 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
449 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
450 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
451 if err1 != 0 {
452 goto childerror
453 }
454 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
455 } else if err1 != 0 {
456 goto childerror
457 }
458 pipe = nextfd
459 nextfd++
460 }
461 for i = 0; i < len(fd); i++ {
462 if fd[i] >= 0 && fd[i] < int(i) {
463 if nextfd == pipe {
464 nextfd++
465 }
466 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
467 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
468 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
469 if err1 != 0 {
470 goto childerror
471 }
472 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
473 } else if err1 != 0 {
474 goto childerror
475 }
476 fd[i] = nextfd
477 nextfd++
478 }
479 }
480
481
482 for i = 0; i < len(fd); i++ {
483 if fd[i] == -1 {
484 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
485 continue
486 }
487 if fd[i] == int(i) {
488
489
490 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
491 if err1 != 0 {
492 goto childerror
493 }
494 continue
495 }
496
497
498 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
499 if err1 != 0 {
500 goto childerror
501 }
502 }
503
504
505
506
507
508 for i = len(fd); i < 3; i++ {
509 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
510 }
511
512
513 if sys.Noctty {
514 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
515 if err1 != 0 {
516 goto childerror
517 }
518 }
519
520
521 if sys.Setctty {
522 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
523 if err1 != 0 {
524 goto childerror
525 }
526 }
527
528
529
530
531 if sys.Ptrace {
532 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
533 if err1 != 0 {
534 goto childerror
535 }
536 }
537
538
539 _, _, err1 = RawSyscall(SYS_EXECVE,
540 uintptr(unsafe.Pointer(argv0)),
541 uintptr(unsafe.Pointer(&argv[0])),
542 uintptr(unsafe.Pointer(&envv[0])))
543
544 childerror:
545
546 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
547 for {
548 RawSyscall(SYS_EXIT, 253, 0, 0)
549 }
550 }
551
552
553 func forkExecPipe(p []int) (err error) {
554 err = Pipe2(p, O_CLOEXEC)
555
556
557 if err == ENOSYS {
558 if err = Pipe(p); err != nil {
559 return
560 }
561 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
562 return
563 }
564 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
565 }
566 return
567 }
568
569 func formatIDMappings(idMap []SysProcIDMap) []byte {
570 var data []byte
571 for _, im := range idMap {
572 data = append(data, []byte(itoa(im.ContainerID)+" "+itoa(im.HostID)+" "+itoa(im.Size)+"\n")...)
573 }
574 return data
575 }
576
577
578 func writeIDMappings(path string, idMap []SysProcIDMap) error {
579 fd, err := Open(path, O_RDWR, 0)
580 if err != nil {
581 return err
582 }
583
584 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
585 Close(fd)
586 return err
587 }
588
589 if err := Close(fd); err != nil {
590 return err
591 }
592
593 return nil
594 }
595
596
597
598
599
600 func writeSetgroups(pid int, enable bool) error {
601 sgf := "/proc/" + itoa(pid) + "/setgroups"
602 fd, err := Open(sgf, O_RDWR, 0)
603 if err != nil {
604 return err
605 }
606
607 var data []byte
608 if enable {
609 data = []byte("allow")
610 } else {
611 data = []byte("deny")
612 }
613
614 if _, err := Write(fd, data); err != nil {
615 Close(fd)
616 return err
617 }
618
619 return Close(fd)
620 }
621
622
623
624 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
625 if sys.UidMappings != nil {
626 uidf := "/proc/" + itoa(pid) + "/uid_map"
627 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
628 return err
629 }
630 }
631
632 if sys.GidMappings != nil {
633
634 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
635 return err
636 }
637 gidf := "/proc/" + itoa(pid) + "/gid_map"
638 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
639 return err
640 }
641 }
642
643 return nil
644 }
645
View as plain text