Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: TestArenaCollision flaky on windows-amd64-race builder #25698

Closed
bcmills opened this issue Jun 1, 2018 · 11 comments
Closed

runtime: TestArenaCollision flaky on windows-amd64-race builder #25698

bcmills opened this issue Jun 1, 2018 · 11 comments
Labels
FrozenDueToAge NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. OS-Windows
Milestone

Comments

@bcmills
Copy link
Contributor

bcmills commented Jun 1, 2018

Possibly related to #24133.

Examples:
https://build.golang.org/log/6d737e6c59f640a2f98ccb898b598d34a898d1d6
https://build.golang.org/log/9b656635a90cc9112f7c2c136d35bc8116af26c3

Detail:

--- FAIL: TestArenaCollision (0.04s)
    malloc_test.go:182: want "too many address space collisions", got:
        === RUN   TestArenaCollision
        ==2800==ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x080000000000 (error code: 87)
        ==2800==ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x084000000000 (error code: 87)
        runtime: newstack sp=0x22fe10 stack=[0xc0001ea000, 0xc0001ec000]
        	morebuf={pc:0x42030f sp:0x22fe20 lr:0x0}
        	sched={pc:0x479f26 sp:0x22fe18 lr:0x0 ctxt:0x0}
        runtime: gp=0xc000044480, goid=19, gp->status=0x2
         runtime: split stack overflow: 0x22fe10 < 0xc0001ea000
        fatal error: runtime: split stack overflow
        
        runtime stack:
        runtime.throw(0x80f8c9, 0x1d)
        	C:/workdir/go/src/runtime/panic.go:589 +0x71 fp=0x22fd78 sp=0x22fd48 pc=0x4648e1
        runtime.newstack()
        	C:/workdir/go/src/runtime/stack.go:955 +0x8ad fp=0x22ff10 sp=0x22fd78 pc=0x47e18d
        runtime.morestack()
        	C:/workdir/go/src/runtime/asm_amd64.s:429 +0x97 fp=0x22ff18 sp=0x22ff10 pc=0x495297
        
        goroutine 19 [running]:
        runtime.sigpanic()
        	C:/workdir/go/src/runtime/signal_windows.go:163 +0x176 fp=0x22fe20 sp=0x22fe18 pc=0x479f26
        created by testing.(*T).Run
        	C:/workdir/go/src/testing/testing.go:878 +0x658
        
        goroutine 1 [chan receive]:
        runtime.gopark(0x819930, 0xc00003e3b8, 0x170d, 0x3)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc0000719b8 sp=0xc000071998 pc=0x466670
        runtime.goparkunlock(0xc00003e3b8, 0x170d, 0x3)
        	C:/workdir/go/src/runtime/proc.go:304 +0x5a fp=0xc0000719e8 sp=0xc0000719b8 pc=0x46672a
        runtime.chanrecv(0xc00003e360, 0xc000079aff, 0xc000002101, 0x58a488)
        	C:/workdir/go/src/runtime/chan.go:520 +0x219 fp=0xc000071a78 sp=0xc0000719e8 pc=0x439da9
        runtime.chanrecv1(0xc00003e360, 0xc000079aff)
        	C:/workdir/go/src/runtime/chan.go:402 +0x2b fp=0xc000071aa8 sp=0xc000071a78 pc=0x439b3b
        testing.(*T).Run(0xc0000c4100, 0x808aff, 0x12, 0x81a528, 0xc000079b01)
        	C:/workdir/go/src/testing/testing.go:879 +0x690 fp=0xc000071bb0 sp=0xc000071aa8 pc=0x58a4c0
        testing.runTests.func1(0xc0000c4100)
        	C:/workdir/go/src/testing/testing.go:1119 +0xb0 fp=0xc000071c08 sp=0xc000071bb0 pc=0x590220
        testing.tRunner(0xc0000c4100, 0xc000079d38)
        	C:/workdir/go/src/testing/testing.go:827 +0x16a fp=0xc000071c60 sp=0xc000071c08 pc=0x589daa
        testing.runTests(0xc00004c780, 0xa727c0, 0x106, 0x106, 0x0)
        	C:/workdir/go/src/testing/testing.go:1117 +0x4ee fp=0xc000071d68 sp=0xc000071c60 pc=0x58c27e
        testing.(*M).Run(0xc0000ae200, 0x0)
        	C:/workdir/go/src/testing/testing.go:1034 +0x2f6 fp=0xc000071ec0 sp=0xc000071d68 pc=0x58abf6
        runtime_test.TestMain(0xc0000ae200)
        	C:/workdir/go/src/runtime/crash_test.go:28 +0x44 fp=0xc000071f10 sp=0xc000071ec0 pc=0x6f0994
        main.main()
        	_testmain.go:1016 +0x229 fp=0xc000071f98 sp=0xc000071f10 pc=0x76df89
        runtime.main()
        	C:/workdir/go/src/runtime/proc.go:201 +0x207 fp=0xc000071fe0 sp=0xc000071f98 pc=0x466257
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc000071fe8 sp=0xc000071fe0 pc=0x497301
        
        goroutine 2 [force gc (idle)]:
        runtime.gopark(0x819930, 0xa75420, 0x1410, 0x1)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc000027f80 sp=0xc000027f60 pc=0x466670
        runtime.goparkunlock(0xa75420, 0x1410, 0x1)
        	C:/workdir/go/src/runtime/proc.go:304 +0x5a fp=0xc000027fb0 sp=0xc000027f80 pc=0x46672a
        runtime.forcegchelper()
        	C:/workdir/go/src/runtime/proc.go:251 +0xc1 fp=0xc000027fe0 sp=0xc000027fb0 pc=0x4664d1
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc000027fe8 sp=0xc000027fe0 pc=0x497301
        created by runtime.init.4
        	C:/workdir/go/src/runtime/proc.go:240 +0x3c
        
        goroutine 3 [GC sweep wait]:
        runtime.gopark(0x819930, 0xa757c0, 0x45140c, 0x1)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc000029f80 sp=0xc000029f60 pc=0x466670
        runtime.goparkunlock(0xa757c0, 0x85140c, 0x1)
        	C:/workdir/go/src/runtime/proc.go:304 +0x5a fp=0xc000029fb0 sp=0xc000029f80 pc=0x46672a
        runtime.bgsweep(0xc000034000)
        	C:/workdir/go/src/runtime/mgcsweep.go:52 +0x9d fp=0xc000029fd8 sp=0xc000029fb0 pc=0x45667d
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc000029fe0 sp=0xc000029fd8 pc=0x497301
        created by runtime.gcenable
        	C:/workdir/go/src/runtime/mgc.go:216 +0x5f
        
        goroutine 18 [finalizer wait]:
        runtime.gopark(0x819930, 0xa96928, 0x20000140f, 0x1)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc00002bf28 sp=0xc00002bf08 pc=0x466670
        runtime.goparkunlock(0xa96928, 0xa7140f, 0x1)
        	C:/workdir/go/src/runtime/proc.go:304 +0x5a fp=0xc00002bf58 sp=0xc00002bf28 pc=0x46672a
        runtime.runfinq()
        	C:/workdir/go/src/runtime/mfinal.go:175 +0xa7 fp=0xc00002bfe0 sp=0xc00002bf58 pc=0x44dd57
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc00002bfe8 sp=0xc00002bfe0 pc=0x497301
        created by runtime.createfing
        	C:/workdir/go/src/runtime/mfinal.go:156 +0x68
        
        goroutine 20 [running]:
        	goroutine running on other thread; stack unavailable
        created by runtime.gcBgMarkStartWorkers
        	C:/workdir/go/src/runtime/mgc.go:1720 +0x7e
        
        goroutine 34 [GC worker (idle)]:
        runtime.gopark(0x819708, 0xc000052620, 0x1417, 0x0)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc0001e7f60 sp=0xc0001e7f40 pc=0x466670
        runtime.gcBgMarkWorker(0xc00001a500)
        	C:/workdir/go/src/runtime/mgc.go:1772 +0x111 fp=0xc0001e7fd8 sp=0xc0001e7f60 pc=0x451781
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc0001e7fe0 sp=0xc0001e7fd8 pc=0x497301
        created by runtime.gcBgMarkStartWorkers
        	C:/workdir/go/src/runtime/mgc.go:1720 +0x7e
        
        goroutine 21 [GC worker (idle)]:
        runtime.gopark(0x819708, 0xc000052630, 0x1417, 0x0)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc0003f3f60 sp=0xc0003f3f40 pc=0x466670
        runtime.gcBgMarkWorker(0xc00001ca00)
        	C:/workdir/go/src/runtime/mgc.go:1772 +0x111 fp=0xc0003f3fd8 sp=0xc0003f3f60 pc=0x451781
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc0003f3fe0 sp=0xc0003f3fd8 pc=0x497301
        created by runtime.gcBgMarkStartWorkers
        	C:/workdir/go/src/runtime/mgc.go:1720 +0x7e
        
        goroutine 22 [GC worker (idle)]:
        runtime.gopark(0x819708, 0xc00000a0d0, 0x1417, 0x0)
        	C:/workdir/go/src/runtime/proc.go:298 +0x100 fp=0xc0003f5f60 sp=0xc0003f5f40 pc=0x466670
        runtime.gcBgMarkWorker(0xc00001ef00)
        	C:/workdir/go/src/runtime/mgc.go:1772 +0x111 fp=0xc0003f5fd8 sp=0xc0003f5f60 pc=0x451781
        runtime.goexit()
        	C:/workdir/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xc0003f5fe0 sp=0xc0003f5fd8 pc=0x497301
        created by runtime.gcBgMarkStartWorkers
        	C:/workdir/go/src/runtime/mgc.go:1720 +0x7e
FAIL
FAIL	runtime	30.991s
@bcmills bcmills added the NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. label Jun 1, 2018
@bcmills bcmills added this to the Go1.11 milestone Jun 1, 2018
@alexbrainman
Copy link
Member

@bcmills I suspect these will be fixed once we update windows race syso files. @dvyukov tried to update windows race syso file (see https://go-review.googlesource.com/c/build/+/101137 ), but I don't think you need CL 101137 changes - you just need your gcc bin directory listed in your PATH.

Let me know, if you want me to try and build windows race syso file here. But I agree, that race build is broken at this moment, so this issue needs to be fixed before go1.11 is released.

Alex

@bcmills
Copy link
Contributor Author

bcmills commented Jun 4, 2018

The syso files were updated in https://golang.org/cl/112896. I had to run the Windows build several times due to test flakes, and I'm pretty sure TestArenaCollision was among them.

@alexbrainman
Copy link
Member

The syso files were updated in https://golang.org/cl/112896.

Good to know.

I'm pretty sure TestArenaCollision was among them.

I cannot help you much with this. I suppose you need to look at the source of ThreadSanitizer. Just quick look at the stack trace

   ==2800==ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x080000000000 (error code: 87)
  ==2800==ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x084000000000 (error code: 87)

My guess is that "error code: 87" is for ERROR_INVALID_PARAMETER (see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx ). And "failed to allocate" is probably because some of WIndows memory allocation APIs was called with wrong parameters (therefore ERROR_INVALID_PARAMETER). But you would have to reproduce this to be sure.

Alex

@gopherbot
Copy link

Change https://golang.org/cl/123175 mentions this issue: runtime: deflake TestArenaCollision on Windows with race detector

@aclements
Copy link
Member

Looking over the failures, they all have the same address, size, and error:

ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x080000000000 (error code: 87)
ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x084000000000 (error code: 87)

This message is printed by MmapFixedNoReserve and indicates an ERROR_INVALID_PARAMETER result from VirtualAlloc (as @alexbrainman said). This is weird because all of the parameters look completely reasonable.

But what strikes me as more weird is that these fixed addresses aren't in the tsan runtime's memory map, which ends at 0x07d000000000. The region just before this is "metainfo" aka "MetaShadow". If we assume it's trying to map the MetaShadow and back out the corresponding heap mapping, we get 0x020000000000 and 0x028000000000, which are both well outside where we'd expect to see the Go heap mapped. @dvyukov, any thoughts on this?

There's also the question of why this failure leads to split stack overflow. There I have a hunch: MmapFixedNoReserve returns 0, which MapShadow doesn't check for. I suspect the race runtime then tries to use the address we failed to map, which leads to an exception. However, since runtime.racecall only switched the SP to the g0 stack and didn't switch the current g (getg()) to g0, runtime.sigtramp will again switch SP to the g0 stack, clobber who-knows-what, and when it returns we go off the deep end.

@aclements
Copy link
Member

I was able to reproduce this quickly, though only once in 350 runs with:

VM=$(gomote create windows-amd64-race) && gomote push $VM && gomote run $VM go/src/make.bat && gomote run $VM go/bin/go test -c runtime -race

stress -p 1 gomote run $VM go/bin/runtime.test.exe -test.run TestArenaCollision

@aclements
Copy link
Member

Oh. The "ThreadSanitizer failed to allocate" always happens. It's just that most of the time it doesn't seem to matter, and sometimes we crash hard after that. E.g., here's a "successful" run (with a little extra debugging):

$ gomote run -e TEST_ARENA_COLLISION=1 $VM go/bin/runtime.test.exe -test.run TestArenaCollision -test.v
__tsan_map_shadow 0xc000000000 0x400000
__tsan_map_shadow done
=== RUN   TestArenaCollision
__tsan_map_shadow 0x1c000000000 0x400000
==432==ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x080000000000 (error code: 87)
==432==ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x084000000000 (error code: 87)
__tsan_map_shadow done
__tsan_map_shadow 0x2c000000000 0x400000
==432==ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x0c0000000000 (error code: 87)
==432==ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x08c000000000 (error code: 87)
__tsan_map_shadow done
fatal error: too many address space collisions for -race mode

In contrast, a "failed" run:

__tsan_map_shadow 0xc000000000 0x400000
__tsan_map_shadow done
=== RUN   TestArenaCollision
__tsan_map_shadow 0x1c000000000 0x400000
==2332==ERROR: ThreadSanitizer failed to allocate 0x000001000000 (16777216) bytes at 0x080000000000 (error code: 87)
==2332==ERROR: ThreadSanitizer failed to allocate 0x000000200000 (2097152) bytes at 0x084000000000 (error code: 87)
__tsan_map_shadow done
runtime: newstack sp=0x22fe90 stack=[0xc0000de000, 0xc0000e0000]
	morebuf={pc:0x22f780 sp:0x22fea0 lr:0x0}
	sched={pc:0x476586 sp:0x22fe98 lr:0x0 ctxt:0x0}
runtime: gp=0xc000042480, goid=19, gp->status=0x2
 runtime: split stack overflow: 0x22fe90 < 0xc0000de000
fatal error: runtime: split stack overflow

@aclements
Copy link
Member

aclements commented Jul 13, 2018

I think I understand.

If we go past even the first hint, we're already outside of TSAN's accepted heap space. The second hint is 0x01c000000000, which is outside TSAN's heap [0x00c000000000, 0x00e000000000). I'm not sure how we wound up at 0x080000000000, since you'd expect this to correspond to a MapShadow of (0x1c000000000 - 0xc000000000) // 8 * 4 + 0x076000000000 => 0x07e000000000.

I remember when I wrote this code I was thinking the hints were at 0x00cX00000000, not 0x0Xc000000000. We could just do that for race mode. Arguably this has always been wrong for race mode, since mallocinit could have picked a place for the old heap arena that still fell outside of TSAN's supported heap area.

@gopherbot
Copy link

Change https://golang.org/cl/123740 mentions this issue: runtime: handle Windows exceptions during racecalls better

@gopherbot
Copy link

Change https://golang.org/cl/123780 mentions this issue: runtime: don't create heap hints outside TSAN's supported heap

@golang golang locked and limited conversation to collaborators Jul 19, 2019
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
FrozenDueToAge NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. OS-Windows
Projects
None yet
Development

No branches or pull requests

5 participants