Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x/build: cleanup failed reverse buildlets on coordinator #10682

Closed
crawshaw opened this issue May 4, 2015 · 4 comments
Closed

x/build: cleanup failed reverse buildlets on coordinator #10682

crawshaw opened this issue May 4, 2015 · 4 comments

Comments

@crawshaw
Copy link
Member

crawshaw commented May 4, 2015

If a reverse buildlet dies in the middle of the build, the coordinator health check isn't sent and stays on the dashboard in a stuck state. Set a Conn timeout?

@bradfitz
Copy link
Contributor

bradfitz commented May 4, 2015

What does health mean? TCP keep-alives would say the conn is alive at least. But a hung reverse buildlet mid-build is also bad.

@bradfitz
Copy link
Contributor

bradfitz commented May 7, 2015

Stack from such a a dead builder:

goroutine 642 [IO wait, 1 minutes]:
net.runtime_pollWait(0x7fd860ec4790, 0x72, 0xc2085da000)
    /home/bradfitz/go/src/runtime/netpoll.go:157 +0x63
net.(*pollDesc).Wait(0xc2085d8a00, 0x72, 0x0, 0x0)
    /home/bradfitz/go/src/net/fd_poll_runtime.go:73 +0x41
net.(*pollDesc).WaitRead(0xc2085d8a00, 0x0, 0x0)
    /home/bradfitz/go/src/net/fd_poll_runtime.go:78 +0x3d
net.(*netFD).Read(0xc2085d89a0, 0xc2085da000, 0x1000, 0x1000, 0x0, 0x7fd860eae050, 0xc20800a150)
    /home/bradfitz/go/src/net/fd_unix.go:232 +0x24d
net.(*conn).Read(0xc2085965a0, 0xc2085da000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/net/net.go:124 +0xe7
crypto/tls.(*block).readFromUntil(0xc2083b0900, 0x7fd860eb3ae8, 0xc2085965a0, 0x5, 0x0, 0x0)
    /home/bradfitz/go/src/crypto/tls/conn.go:455 +0xda
crypto/tls.(*Conn).readRecord(0xc208326580, 0xae0b17, 0x0, 0x0)
    /home/bradfitz/go/src/crypto/tls/conn.go:540 +0x2d2
crypto/tls.(*Conn).Read(0xc208326580, 0xc2085c1000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/crypto/tls/conn.go:901 +0x16a
net/http.(*liveSwitchReader).Read(0xc20866a228, 0xc2085c1000, 0x1000, 0x1000, 0x17, 0x0, 0x0)
    /home/bradfitz/go/src/net/http/server.go:212 +0xa7
io.(*LimitedReader).Read(0xc208383b20, 0xc2085c1000, 0x1000, 0x1000, 0x17, 0x0, 0x0)
    /home/bradfitz/go/src/io/io.go:427 +0xca
bufio.(*Reader).fill(0xc208705320)
    /home/bradfitz/go/src/bufio/bufio.go:97 +0x1d6
bufio.(*Reader).ReadSlice(0xc208705320, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/bufio/bufio.go:328 +0x246
net/http/internal.readLine(0xc208705320, 0x0, 0x0, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/net/http/internal/chunked.go:110 +0x4e
net/http/internal.(*chunkedReader).beginChunk(0xc2085c7860)
    /home/bradfitz/go/src/net/http/internal/chunked.go:47 +0x3c
net/http/internal.(*chunkedReader).Read(0xc2085c7860, 0xc2085ae000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/net/http/internal/chunked.go:77 +0xba
net/http.(*body).readLocked(0xc208b2d7c0, 0xc2085ae000, 0x8000, 0x8000, 0xffffffff, 0x0, 0x0)
    /home/bradfitz/go/src/net/http/transfer.go:585 +0x73
net/http.(*body).Read(0xc208b2d7c0, 0xc2085ae000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
    /home/bradfitz/go/src/net/http/transfer.go:580 +0x113
main.(*reverseLockedBody).Read(0xc2083c0500, 0xc2085ae000, 0x8000, 0x8000, 0x11, 0x0, 0x0)
    /home/bradfitz/src/golang.org/x/build/cmd/coordinator/reverse.go:393 +0x6f
io.copyBuffer(0x7fd860ec43b8, 0xc2086b6e00, 0x7fd860ed78e8, 0xc2083c0500, 0xc2085ae000, 0x8000, 0x8000, 0x20b9, 0x0, 0x0)
    /home/bradfitz/go/src/io/io.go:381 +0x24a
io.Copy(0x7fd860ec43b8, 0xc2086b6e00, 0x7fd860ed78e8, 0xc2083c0500, 0xc2083c0500, 0x0, 0x0)
    /home/bradfitz/go/src/io/io.go:351 +0x6b
golang.org/x/build/buildlet.(*Client).Exec(0xc208595740, 0xc2082c8df0, 0xf, 0x7fd860ec43b8, 0xc2086b6e00, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
    /home/bradfitz/src/golang.org/x/build/buildlet/buildletclient.go:264 +0x1026
main.(*buildStatus).build(0xc2086b6e00, 0x0, 0x0)
    /home/bradfitz/src/golang.org/x/build/cmd/coordinator/coordinator.go:997 +0xf72
main.(*buildStatus).start.func1(0xc2086b6e00)
    /home/bradfitz/src/golang.org/x/build/cmd/coordinator/coordinator.go:910 +0x33
created by main.(*buildStatus).start
    /home/bradfitz/src/golang.org/x/build/cmd/coordinator/coordinator.go:916 +0x7e

@bradfitz
Copy link
Contributor

bradfitz commented May 7, 2015

TCP keep-alive should already be enabled on this incoming TLS reverse connections:

func listenAndServeTLS() { 
        addr := ":443" 
        if *mode == "dev" {
                addr = ":8119"
        }
        ln, err := net.Listen("tcp", addr)
        if err != nil {
                log.Fatalf("net.Listen(%s): %v", addr, err)
        }
        serveTLS(ln)
}

func serveTLS(ln net.Listener) {
        certPEM, err := readGCSFile("farmer-cert.pem")
        if err != nil {
                log.Printf("cannot load TLS cert, skipping https: %v", err)
                return
        }
        keyPEM, err := readGCSFile("farmer-key.pem")
        if err != nil {
                log.Printf("cannot load TLS key, skipping https: %v", err)
                return
        }
        cert, err := tls.X509KeyPair(certPEM, keyPEM)
        if err != nil {
                log.Printf("bad TLS cert: %v", err)
                return
        }

        server := &http.Server{Addr: ln.Addr().String()}
        config := &tls.Config{
                NextProtos:   []string{"http/1.1"},
                Certificates: []tls.Certificate{cert},
        }       
        tlsLn := tls.NewListener(tcpKeepAliveListener{ln.(*net.TCPListener)}, config)
        log.Printf("Coordinator serving on: %v", tlsLn.Addr())
        if err := server.Serve(tlsLn); err != nil {
                log.Fatalf("serve https: %v", err)
        }
}       

type tcpKeepAliveListener struct {
        *net.TCPListener
}

func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) {
        tc, err := ln.AcceptTCP()
        if err != nil {
                return
        }
        tc.SetKeepAlive(true)
        tc.SetKeepAlivePeriod(3 * time.Minute)
        return tc, nil
}       

@gopherbot
Copy link

CL https://golang.org/cl/9851 mentions this issue.

@golang golang locked and limited conversation to collaborators Jun 25, 2016
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

No branches or pull requests

3 participants