Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: static lock ranks fail on darwin/amd64 #66386

Open
rhysh opened this issue Mar 18, 2024 · 3 comments
Open

runtime: static lock ranks fail on darwin/amd64 #66386

rhysh opened this issue Mar 18, 2024 · 3 comments
Labels
compiler/runtime Issues related to the Go compiler and/or runtime. NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one.
Milestone

Comments

@rhysh
Copy link
Contributor

rhysh commented Mar 18, 2024

Go version

go version devel go1.23-0a6f05e30f Sun Mar 17 15:57:54 2024 +0000 darwin/amd64

Output of go env in your module/workspace:

GO111MODULE=''
GOARCH='amd64'
GOBIN=''
GOCACHE='/Users/rhysh/Library/Caches/go-build'
GOENV='/Users/rhysh/Library/Application Support/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='darwin'
GOINSECURE=''
GOMODCACHE='/Users/rhysh/work/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='darwin'
GOPATH='/Users/rhysh/work'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/darwin_amd64'
GOVCS=''
GOVERSION='devel go1.23-0a6f05e30f Sun Mar 17 15:57:54 2024 +0000'
GODEBUG=''
GCCGO='gccgo'
GOAMD64='v1'
AR='ar'
CC='clang'
CXX='clang++'
CGO_ENABLED='1'
GOMOD='/dev/null'
GOWORK=''
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -ffile-prefix-map=/var/folders/pw/d_qmtcrd3vs0890gvmrq8qx80000gn/T/go-build2482359617=/tmp/go-build -gno-record-gcc-switches -fno-common'

What did you do?

I visited build.golang.org, found the most recent commit that had succeeded on the linux/amd64 staticlockranking builders, and tried to replicate that success on my darwin/amd64 machine.

What did you see happen?

$ (cd ./src && ./make.bash)
Building Go cmd/dist using /Users/rhysh/sdk/go1.20.6. (go1.20.6 darwin/amd64)
Building Go toolchain1 using /Users/rhysh/sdk/go1.20.6.
Building Go bootstrap cmd/go (go_bootstrap) using Go toolchain1.
Building Go toolchain2 using go_bootstrap and Go toolchain1.
Building Go toolchain3 using go_bootstrap and Go toolchain2.
Building packages and commands for darwin/amd64.
---
Installed Go for darwin/amd64 in /usr/local/go
Installed commands in /usr/local/go/bin
*** You need to add /usr/local/go/bin to your PATH.

$ go version
go version devel go1.23-2e8d84f148 Mon Mar 18 18:31:42 2024 +0000 darwin/amd64

$ GOEXPERIMENT=staticlockranking go test runtime -quick -short
123145398722560  ======
0 : sysmon 1 0xe3ff118
1 : allp 22 0xe466100
2 : execR 19 0x0
fatal error: lock ordering problem

runtime stack:
runtime.throw({0xe00d961?, 0x0?})
	/usr/local/go/src/runtime/panic.go:1021 +0x5c fp=0x700005bf1d48 sp=0x700005bf1d18 pc=0xdc563dc
runtime.checkRanks(0xc000006540, 0x700005bf1dd0?, 0x7fff206ff6b1?)
	/usr/local/go/src/runtime/lockrank_on.go:162 +0x236 fp=0x700005bf1da8 sp=0x700005bf1d48 pc=0xdc26116
runtime.acquireLockRank.func1()
	/usr/local/go/src/runtime/lockrank_on.go:126 +0x78 fp=0x700005bf1dd0 sp=0x700005bf1da8 pc=0xdc25e98
runtime.acquireLockRank(0xc000006540?)
	/usr/local/go/src/runtime/lockrank_on.go:115 +0x3e fp=0x700005bf1e00 sp=0x700005bf1dd0 pc=0xdc25dfe
runtime.(*rwmutex).rlock(0xe466620)
	/usr/local/go/src/runtime/rwmutex.go:77 +0x33 fp=0x700005bf1e30 sp=0x700005bf1e00 pc=0xdc6b113
runtime.preemptM(0xe3fdd20)
	/usr/local/go/src/runtime/signal_unix.go:372 +0x1f fp=0x700005bf1e50 sp=0x700005bf1e30 pc=0xdc6f09f
runtime.preemptone(0xdc9b100?)
	/usr/local/go/src/runtime/proc.go:6174 +0x55 fp=0x700005bf1e68 sp=0x700005bf1e50 pc=0xdc66875
runtime.retake(0x4c98ac8d7cb1)
	/usr/local/go/src/runtime/proc.go:6077 +0xb6 fp=0x700005bf1ec8 sp=0x700005bf1e68 pc=0xdc66576
runtime.sysmon()
	/usr/local/go/src/runtime/proc.go:6016 +0x34f fp=0x700005bf1f30 sp=0x700005bf1ec8 pc=0xdc663af
runtime.mstart1()
	/usr/local/go/src/runtime/proc.go:1730 +0x93 fp=0x700005bf1f58 sp=0x700005bf1f30 pc=0xdc5ce33
runtime.mstart0()
	/usr/local/go/src/runtime/proc.go:1687 +0x6a fp=0x700005bf1f80 sp=0x700005bf1f58 pc=0xdc5cd8a
runtime.mstart()
	/usr/local/go/src/runtime/asm_amd64.s:394 +0x5 fp=0x700005bf1f88 sp=0x700005bf1f80 pc=0xdc97965

goroutine 1 gp=0xc0000061c0 m=nil [runnable, locked to thread]:
runtime.gcTrigger.test({0x0?, 0x0?, 0x0?})
	/usr/local/go/src/runtime/mgc.go:567 +0xdc fp=0xc000064c30 sp=0xc000064c28 pc=0xdc36fbc
runtime.mallocgc(0xe8, 0xe1444c0, 0x1)
	/usr/local/go/src/runtime/malloc.go:1307 +0x845 fp=0xc000064cb8 sp=0xc000064c30 pc=0xdc28965
runtime.newobject(0x0?)
	/usr/local/go/src/runtime/malloc.go:1390 +0x25 fp=0xc000064ce0 sp=0xc000064cb8 pc=0xdc28ba5
runtime.CallersFrames(...)
	/usr/local/go/src/runtime/symtab.go:80
runtime.Caller(0x1)
	/usr/local/go/src/runtime/extern.go:301 +0x75 fp=0xc000064e08 sp=0xc000064ce0 pc=0xdc20b55
runtime.lineNumber(...)
	/usr/local/go/src/runtime/symtabinl_test.go:102
runtime.init()
	/usr/local/go/src/runtime/symtabinl_test.go:108 +0x17e fp=0xc000064e20 sp=0xc000064e08 pc=0xdc189fe
runtime.doInit1(0xe3ca5c0)
	/usr/local/go/src/runtime/proc.go:7097 +0xe8 fp=0xc000064f50 sp=0xc000064e20 pc=0xdc682a8
runtime.doInit(...)
	/usr/local/go/src/runtime/proc.go:7064
runtime.main()
	/usr/local/go/src/runtime/proc.go:200 +0x11f fp=0xc000064fe0 sp=0xc000064f50 pc=0xdc59a5f
runtime.goexit({})
	/usr/local/go/src/runtime/asm_amd64.s:1695 +0x1 fp=0xc000064fe8 sp=0xc000064fe0 pc=0xdc998c1
FAIL	runtime	0.327s
FAIL

What did you expect to see?

I expected GOEXPERIMENT=staticlockranking to work on darwin, in addition to working on linux.

@gopherbot gopherbot added the compiler/runtime Issues related to the Go compiler and/or runtime. label Mar 18, 2024
@rhysh
Copy link
Contributor Author

rhysh commented Mar 18, 2024

Broken versions include go1.22.1, go1.21.8, and go1.20.14, but not go1.20.13. Looks like https://go.dev/cl/549536, "runtime: properly model rwmutex in lock ranking".

Maybe it's a missing edge in mklockrank.go?

@rhysh
Copy link
Contributor Author

rhysh commented Mar 18, 2024

Only GOOS=darwin and GOOS=ios interact with the execlock during preemptM, which is why this hasn't shown up on the build dashboard.

CC @mknyszek and @prattmic — you're both close to the builders (maybe we need one for GOOS=darwin + GOEXPERIMENT=staticlockranking?) and to the scheduler / lock ranking. I haven't dug in to how the exec lock interacts with the scheduler, and if the fix here is as simple as moving execR to the other side of sched and allp.

// preemptM sends a preemption request to mp. This request may be
// handled asynchronously and may be coalesced with other requests to
// the M. When the request is received, if the running G or P are
// marked for preemption and the goroutine is at an asynchronous
// safe-point, it will preempt the goroutine. It always atomically
// increments mp.preemptGen after handling a preemption request.
func preemptM(mp *m) {
// On Darwin, don't try to preempt threads during exec.
// Issue #41702.
if GOOS == "darwin" || GOOS == "ios" {
execLock.rlock()
}

@mknyszek mknyszek added this to the Backlog milestone Mar 20, 2024
@mknyszek mknyszek added the NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. label Mar 20, 2024
@rhysh
Copy link
Contributor Author

rhysh commented Mar 20, 2024

retake temporarily drops allpLock in a few cases; it could do that before calling preemptone on darwin and ios.

But forEachPInternal and stopTheWorldWithSema call preemptall while holding sched.lock. Those could make a point of calling execLock.rlock before lock(&sched.lock), provided that holding execR means it's safe to acquire execR again. (Is it?) The lock rank checker would also need to learn that new rule.

But (again), I also see that runtime.(*gcControllerState).enlistWorker can call preemptone while holding the gscan pseudo-lock (via markroot and scanstack). It looks like this goes deep, and may be hard to fix (and keep fixed) without a GOOS=darwin + GOEXPERIMENT=staticlockranking builder.

@mknyszek , do these represent real (rare) deadlock risks?

From a recent version of tip with un-committed local changes implementing the above (so don't trust the line numbers):

$ GOEXPERIMENT=staticlockranking go test runtime
123145339498496  ======
0 : gscan 42 0x0
1 : execR 19 0x0
fatal error: lock ordering problem

runtime stack:
runtime.throw({0x33f9521?, 0x700002376a88?})
	/usr/local/go/src/runtime/panic.go:1021 +0x5c fp=0x700002376a50 sp=0x700002376a20 pc=0x304143c
runtime.checkRanks(0xc0001921c0, 0x700002376b88?, 0x37a7f60?)
	/usr/local/go/src/runtime/lockrank_on.go:175 +0x276 fp=0x700002376ab0 sp=0x700002376a50 pc=0x3011156
runtime.acquireLockRank.func1()
	/usr/local/go/src/runtime/lockrank_on.go:126 +0x78 fp=0x700002376ad8 sp=0x700002376ab0 pc=0x3010e98
runtime.acquireLockRank(0x700002376b10?)
	/usr/local/go/src/runtime/lockrank_on.go:115 +0x3e fp=0x700002376b08 sp=0x700002376ad8 pc=0x3010dfe
runtime.(*rwmutex).rlock(0x3852620)
	/usr/local/go/src/runtime/rwmutex.go:77 +0x33 fp=0x700002376b38 sp=0x700002376b08 pc=0x30561b3
runtime.preemptM(0x37e9d20)
	/usr/local/go/src/runtime/signal_unix.go:372 +0x1f fp=0x700002376b58 sp=0x700002376b38 pc=0x305a13f
runtime.preemptone(0x700002376b90?)
	/usr/local/go/src/runtime/proc.go:6193 +0x55 fp=0x700002376b70 sp=0x700002376b58 pc=0x3051915
runtime.(*gcControllerState).enlistWorker(0xc000802000?)
	/usr/local/go/src/runtime/mgcpacer.go:723 +0xf5 fp=0x700002376ba0 sp=0x700002376b70 pc=0x3029f75
runtime.(*gcWork).put(0xc000041c58, 0xc00083fcb0)
	/usr/local/go/src/runtime/mgcwork.go:144 +0xfb fp=0x700002376bd0 sp=0x700002376ba0 pc=0x302f8fb
runtime.greyobject(0xc00083fcb0, 0xc001524e48?, 0x341e901?, 0xc000041c58?, 0x700002376d18?, 0x0?)
	/usr/local/go/src/runtime/mgcmark.go:1661 +0x18e fp=0x700002376c18 sp=0x700002376bd0 pc=0x3028d8e
runtime.scanblock(0xc001524e28, 0x8, 0x37b4380, 0xc000041c58, 0x700002376d18)
	/usr/local/go/src/runtime/mgcmark.go:1375 +0xf6 fp=0x700002376c78 sp=0x700002376c18 pc=0x30285f6
runtime.scanstack(0xc000949880, 0xc000041c58)
	/usr/local/go/src/runtime/mgcmark.go:914 +0x33a fp=0x700002376da8 sp=0x700002376c78 pc=0x302781a
runtime.markroot.func1()
	/usr/local/go/src/runtime/mgcmark.go:239 +0xb5 fp=0x700002376df8 sp=0x700002376da8 pc=0x30263f5
runtime.markroot(0xc000041c58, 0xa3c, 0x1)
	/usr/local/go/src/runtime/mgcmark.go:213 +0x1a8 fp=0x700002376ea0 sp=0x700002376df8 pc=0x3026088
runtime.gcDrain(0xc000041c58, 0x7)
	/usr/local/go/src/runtime/mgcmark.go:1198 +0x3d4 fp=0x700002376f08 sp=0x700002376ea0 pc=0x3028274
runtime.gcDrainMarkWorkerIdle(...)
	/usr/local/go/src/runtime/mgcmark.go:1112
runtime.gcBgMarkWorker.func2()
	/usr/local/go/src/runtime/mgc.go:1429 +0x6f fp=0x700002376f58 sp=0x700002376f08 pc=0x30242ef
runtime.systemstack(0xc0001921c0)
	/usr/local/go/src/runtime/asm_amd64.s:509 +0x4a fp=0x700002376f68 sp=0x700002376f58 pc=0x3082b0a

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
compiler/runtime Issues related to the Go compiler and/or runtime. NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one.
Projects
Development

No branches or pull requests

3 participants