cmd/compile: complicated bounds check elimination #16092

GordonBGood · 2016-06-17T08:16:30Z

1. go version go1.7beta1 windows/amd64

2.
set GOARCH=amd64
set GOBIN=
set GOEXE=.exe
set GOHOSTARCH=amd64
set GOHOSTOS=windows
set GOOS=windows
set GOPATH=F:\Go
set GORACE=
set GOROOT=F:\Go
set GOTOOLDIR=F:\Go\pkg\tool\windows_amd64
set CC=gcc
set GOGCCFLAGS=-m64 -mthreads -fmessage-length=0 -fdebug-prefix-map=C:\Users\Super\AppData\Local\Temp\go-build211527254=/tmp/go-build -gno-record-gcc-switches
set CXX=g++
set CGO_ENABLED=1

3. Runnable program:

// PrimeSpeed project PrimeSpeed.go
package main

import (
    "fmt"
    "math"
    "time"
    //    "unsafe"
)

func mkCLUT() [65536]byte {
    var arr [65536]byte
    for i := 0; i < 65536; i++ {
        var cnt byte = 0
        for v := (uint16)(i ^ 0xFFFF); v > 0; v &= v - 1 {
            cnt++
        }
        arr[i] = cnt
    }
    return arr
}

var cnstCLUT [65536]byte = mkCLUT()

func primesTest(top uint) int {
    lmtndx := (top - 3) >> 1
    lstw := lmtndx >> 5
    lmt := lstw + 1
    topsqrtndx := (int(math.Sqrt(float64(top))) - 3) >> 1
    cmpsts := make([]uint32, lstw+1)
    //    start := uintptr(unsafe.Pointer(&cmpsts[0]))
    //    step := unsafe.Sizeof(cmpsts[0])
    for i := 0; i <= topsqrtndx; i++ {
        if cmpsts[i>>5]&(uint32(1)<<uint(i)) == 0 {
            p := (uint(i) << 1) + 3
            for j := (p*p - 3) >> 1; j <= topi; j += p {
                cmpsts[j>>5] |= 1 << (j & 31)
            }
            //            p := uintptr((uint(i) << 1) + 3)
            //            lmt := uintptr(lmtndx)
            //            for j := (p*p - 3) >> 1; j <= lmt; j += p {
            //                *(*uint)(unsafe.Pointer(start + step*(j>>5))) |= 1 << (j & 31)
            //            }
        }
    }
    msk := uint32(0xFFFFFFFE) << (lmtndx & 31)
    cmpsts[lstw] |= msk
    cnt := 1
    for i := uint(0); i <= lstw; i++ {
        v := cmpsts[i]
        cnt += int(cnstCLUT[v&0xFFFF] + cnstCLUT[0xFFFF&(v>>16)])
    }
    return cnt
}

func main() {
    n := uint(262146)

    strt := time.Now()

    sum := 0
    for i := 0; i < 1000; i++ {
        sum += primesTest(n)
    }

    end := time.Now()

    fmt.Println("Found", sum, "primes up to", n, "in", end.Sub(strt), ".")
}

play.golang.org link: https://play.golang.org/p/_E5R5JAlGW

4. When "go tool compile -S PrimeSpeed.go > PrimeSpeed.s" is run, the inner tight composite number culling loop as quoted below:

    line 36     for j := (p*p - 3) >> 1; j <= topi; j += p {
    line 37         cmpsts[j>>5] |= 1 << (j & 31)
    line 38     }

looks like the following assembly code from PrimeSpeed.s:

    0x00f1 00241 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R8
    0x00f8 00248 (Main.go:37)   CMPQ    R8, DX
    0x00fb 00251 (Main.go:37)   JCC $0, 454
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R8*4), R10
    0x0105 00261 (Main.go:37)   MOVQ    CX, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
    0x010c 00268 (Main.go:37)   MOVL    R9, R12 **;; saves 1 to r12**
    0x010f 00271 (Main.go:37)   SHLL    CX, R9
    0x0112 00274 (Main.go:37)   ORL R10, R9
    0x0115 00277 (Main.go:37)   MOVL    R9, (AX)(R8*4)
    0x0119 00281 (Main.go:36)   LEAQ    3(R11)(DI*2), R8
    0x011e 00286 (Main.go:37)   MOVL    R12, R9 **;; restores 1 to r9 from r12**
    0x0121 00289 (Main.go:36)   CMPQ    R8, BX
    0x0124 00292 (Main.go:36)   JLS $0, 241

5. I expected to see:

    0x00f1 00241 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R8
    0x00f8 00248 (Main.go:37)   CMPQ    R8, DX ;; array bounds check, only if no -B option
    0x00fb 00251 (Main.go:37)   JCC $0, 454 ;; panic if out of bounds
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R8*4), R10
    0x0105 00261 (Main.go:37)   MOVQ    CX, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
                            (Main.go:37)    MOVL    $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
                            (Main.go:37)    SHLL    CX, R9
                            (Main.go:37)    ORL R10, R9
                            (Main.go:37)    MOVL    R9, (AX)(R8*4)
                            (Main.go:36)    LEAQ    3(R11)(DI*2), R8
                            (Main.go:36)    CMPQ    R8, BX
                            (Main.go:36)    JLS $0, 241

Even better, without recalculating p = 2 * i + 3 thus j += j + 2 * i + 3 inside the inner loop:
Includes changing order of instructions for processors without OOE:

    0x00f1 00241 (Main.go:37)   MOVQ    R8, R11
    0x0105 00261 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
    0x00f8 00248 (Main.go:37)   CMPQ    R11, DX ;; array bounds check, only if no -B option
    0x00fb 00251 (Main.go:37)   JCC $0, 454 ;; panic if out of bounds
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R11*4), R10
    0x010c 00268 (Main.go:37)   MOVL    $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
    0x010f 00271 (Main.go:37)   SHLL    CX, R9
    0x0112 00274 (Main.go:37)   ORL R10, R9
    0x0119 00281 (Main.go:36)   ADDL    R12, R8 **;; ADD PRE-CALCULATED 'p' in R12 to 'j'**
    0x0115 00277 (Main.go:37)   MOVL    R9, (AX)(R11*4)
    0x0121 00289 (Main.go:36)   CMPQ    R8, BX
    0x0124 00292 (Main.go:36)   JLS $0, 241

The following is the same loop without bounds checks generated for C/C++ with the Visual Studio compiler (intel assembler format):

    $Loop:
        mov edx, esi
        mov ecx, esi
        shr edx, 5
        and ecx, 31                 ; 0000001fH
        mov eax, 1
        add esi, edi                    ; add 'p' in edi to 'j'
        shl eax, cl
        or  DWORD PTR [ebx+edx*4], eax
        cmp esi, ebp                    ; ebp contains 'topi'
        jbe SHORT $Loop

Note that the above uses a total of seven registers for this inner loop, so the same code is generated for x86 and x64 compilations. Unfortunately, it takes another register to hold the upper array bound for a range check and the x86 architecture can only have seven available; however, it is possible to slightly change the code as follows:

    line 36            bnds := lstw + 1
    line 37            k := (p*p - 3) >> 1
    line 38            for j, w := j, j >> 5; w <= bnds; w = r >> 5 {
    line 39                cmpsts[w] |= 1 << (j & 31)
    line 40                j += p
    line 41            }

which for Visual Studio C/C++ generates the following same number of instructions in the loop:

    $Loop:
        mov ecx, edx
        mov eax, 1
        and ecx, 31                 ; 0000001fH
        add edx, edi                    ; add 'p' in edi to 'j'
        shl eax, cl
        or  DWORD PTR [ebx+esi*4], eax
    **$Start:**
        mov esi, edx
        shr esi, 5
        cmp esi, ebp                    **; ebp contains the array length, bnds**
        jb  SHORT $Loop             **; this is the same as doing an array bounds check**

and it can be seen that the array bounds check is now done at the same time as the loop completion check; it should be a simple matter to clue the compiler that 'bnds' contains the array length, perhaps by assigning it inside the loop as len(cmpsts) as is done for C# x86 code so that it recognizes that the bounds check is already done. The start point of the loop could be the line after the "or" line at the $Start: label or an external check could be implemented to ensure that the bounds check is done for the first loop before the array is accessed as is done for the Visual Studio C/C++ compiler.

As demonstrated above, the golang code runs slower than C/C++ code by almost a factor of two on some x86 processors and more than that factor for x86 processors. It also runs slightly slower than C#/Java for both architectures.

The text was updated successfully, but these errors were encountered:

quentinmit · 2016-06-17T19:37:34Z

/cc @randall77 @dr2chase

gopherbot · 2016-10-05T21:45:11Z

CL https://golang.org/cl/30471 mentions this issue.

randall77 · 2016-10-05T21:45:53Z

I have a fix out for the main bug, doing the immediate load of 1 in the right place.
For the LEAQ vs ADDQ case, it seems much lower priority - it's a single instruction either way. If we really cared we could disable folding into LEAQ when the args are in a shallower loop depth.

Getting rid of the bounds check is harder.

Better to just rematerialize them when needed instead of cross-register spilling or other techniques for keeping them in registers. This helps for amd64 code that does 1 << x. It is better to do loop: MOVQ $1, AX // materialize arg to SLLQ SLLQ CX, AX ... goto loop than to do MOVQ $1, AX // materialize outsize of loop loop: MOVQ AX, DX // save value that's about to be clobbered SLLQ CX, AX MOVQ DX, AX // move it back to the correct register goto loop Update #16092 Change-Id: If7ac290208f513061ebb0736e8a79dcb0ba338c0 Reviewed-on: https://go-review.googlesource.com/30471 TryBot-Result: Gobot Gobot <gobot@golang.org> Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>

rsc · 2016-10-21T04:22:16Z

The immediate load seems to be fixed. If you can send a simpler test case for the bounds check, please do so.

GordonBGood · 2016-10-22T02:50:26Z

If you can send a simpler test case for the bounds check, please do so.

Since version 1.7 is now stable and released, it seems that there is now built-in Bounds Check Elimination (BCE) which does most of what I was talking about as discussed in this Google document and this blog post although there are still many cases where it could eliminate the bounds check and doesn't such as #148058 and #17370 and likely others.

With this new BCE capability, I'm sure the loop can be re-written to eliminate bounds checks, or in any case the '-B' compiler option can turn them off; Alternatively, the use of unsafe pointers can also eliminate them so this isn't too much of any issue. Few languages would be smart enough to recognize that the bit index which can be shown to be 32 times the word index in this case will always be less than the array length with the bit index limit tested outside the loop or recognized by the compiler that it is used in the calculation of the slice size when the slice was created..

I downloaded the master and compiled it for x86_64 and the immediate load problem still seems to be there; perhaps the fix has not yet boon committed? Until I can confirm it is gone, I am going to leave this open.

golang code for this tight loop still seems to be about 2.5 times faster for some other languages such as Rust, Nim, Haskell, C/C++, etc. on a fast CPU without a cache bottleneck, so I will leave this open until there are some gains; until then array bounds checks are the least of the problems.

rasky · 2017-02-16T20:09:30Z

I've just checked again that the materialisation of 1 is fixed in Go 1.8.

The bound check issue is quite complicated, the compiler would have to prove there are no overflows involved. It sounds unlikely to get there.

As per the actual inner loop, it could be faster using the BTS instruction. Basically the hot loop should become:

MOVQ SI, CX
SHRQ $0x5, SI
CMPQ DX, SI
JAE 0x1087900
MOVL 0(R8)(SI*4), R9
BTS CX, R9
MOVL R9, 0(R8)(SI*4)

but this is basically a followup to #18943.

josharian · 2017-05-18T19:06:37Z

The code pasted intp the original post doesn't compile, but the playground link does. I've taken that code, deleted the commented out lines, and converted it into a benchmark. See below. @GordonBGood you might want to double-check that I've preserved the original intent of the benchmark.

Using tip at 638ebb0, tip is better than 1.8, which was better than 1.7. It's hard to figure out which of the many things discussed are still relevant, but since the trend is as least correct for 1.9, I'm moving this to 1.10.

name \ time/op  17          18          tip
PrimesTest-8    327µs ± 3%  306µs ± 2%  284µs ± 3%

package main

import (
	"math"
	"testing"
)

func mkCLUT() [65536]byte {
	var arr [65536]byte
	for i := 0; i < 65536; i++ {
		var cnt byte = 0
		for v := (uint16)(i ^ 0xFFFF); v > 0; v &= v - 1 {
			cnt++
		}
		arr[i] = cnt
	}
	return arr
}

var cnstCLUT [65536]byte = mkCLUT()

func primesTest(top uint) int {
	lmtndx := (top - 3) >> 1
	lstw := lmtndx >> 5
	topsqrtndx := (int(math.Sqrt(float64(top))) - 3) >> 1
	cmpsts := make([]uint32, lstw+1)
	for i := 0; i <= topsqrtndx; i++ {
		if cmpsts[i>>5]&(uint32(1)<<uint(i)) == 0 {
			p := (uint(i) << 1) + 3
			for j := (p*p - 3) >> 1; j <= lmtndx; j += p {
				cmpsts[j>>5] |= 1 << (j & 31)
			}
		}
	}
	msk := uint32(0xFFFFFFFE) << (lmtndx & 31)
	cmpsts[lstw] |= msk
	cnt := 1
	for i := uint(0); i <= lstw; i++ {
		v := cmpsts[i]
		cnt += int(cnstCLUT[v&0xFFFF] + cnstCLUT[0xFFFF&(v>>16)])
	}
	return cnt
}

var sink int

func BenchmarkPrimesTest(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink = primesTest(262146)
	}
}

GordonBGood · 2017-05-19T11:55:31Z

@josharian, yes, the new code tests what I intended to test, with the results calculated by a Look Up Table (LUT) taking a small part of the time to cull the composite numbers, the timing result now reflecting the time taken by the tight culling loop.

Also, with the range limited to that required by a 16 Kilobyte CPU L1 buffer size, it is testing raw CPU loop speed and not memory access speed, which testing of tight CPU loop speed is what is desired.

bradfitz · 2018-06-19T17:49:01Z

Out of curiosity, I measured where we're at now compared to Go 1.7: (now == tip 1caa062)

bradfitz@gdev:~/src/issue_16092$ benchstat 1.7 tip
name          old time/op  new time/op  delta
PrimesTest-4   399µs ± 1%   301µs ± 0%  -24.46%  (p=0.000 n=10+8)

bradfitz@gdev:~/src/issue_16092$ benchstat 1.8 tip
name          old time/op  new time/op  delta
PrimesTest-4   395µs ± 1%   301µs ± 0%  -23.66%  (p=0.000 n=9+8)

bradfitz@gdev:~/src/issue_16092$ benchstat 1.7 1.8
name          old time/op  new time/op  delta
PrimesTest-4   399µs ± 1%   395µs ± 1%  -1.06%  (p=0.000 n=10+9)

GordonBGood · 2018-07-17T11:06:55Z

This just goes to show that profiling without analysis is of limited use: What it does seem to show is a 25% improvement for current version for the particular CPU used. What it fails to consider is the runtime conditions and the impact on those results. The analysis in terms of clock cycles per cull loop is instructive: This algorithm when compiled efficiently with gcc C/C++, Nim, Rust, etc., takes about 5.5 clock cycles per cull (for about 200K cull operations) whether run on a high end modern CPU or a basic one such as the Intel Atom x5-Z8350 in my Windows tablet; thus a run time of about 0.3 second doesn't sound too bad assuming about a 3.5 GHz clock speed. However, I ran the benchmark on golang version 1.10.3 (which I don't know if includes the tweak or not) on my Atom taking about 13.75 clocks per cull! Even if new code would produce a 25% improvement, that would still take about 10.5 to 11 clocks per cull. For comparison, the same algorithm run with F# (which produces very unoptimized code for this loop) takes about this same number of clocks of 10.5 per cull. What's the difference between high end CPU's and my Atom other than amount and sophistication of caches unused by this benchmark (and obviously the price - grin)? High end processors have more sophisticated branch prediction, better Out of Order Execution (OOE), and simultaneous instruction execution than my poor Atom; this can serve to hide some of the deficiencies of crap code as in this case. I didn't bother doing the full autopsy on the code as I did in the OP, but it seems obvious that it is still producing poor quality native code just as F# does (for which I have also done the autopsy) other than tip has likely (finally) eliminated a redundant register load I identified in the OP. So it's too early to close this issue and I would say you likely have a long way to go before golang speed approaches the leaders of the pack, in many use cases by a factor of two or more as for this benchmark. It also shows the deficiencies of benchmarking/profiling in judging quality of code given modern high-end CPU's abilities to make the best of the bad. It also shows that benchmark results should be given in numbers of clock cycles to eliminate CPU clock speed, but that CPU model and generation can still skew results of benchmarks as here. A case could be made that all benchmarks should be verified on a lower end processor as I did.

…

On Tue, Jun 19, 2018, 11:50 Brad Fitzpatrick ***@***.***> wrote: Out of curiosity, I measured where we're at now compared to Go 1.7: (now == tip 1caa062 <1caa062> ) ***@***.***:~/src/issue_16092$ benchstat 1.7 tip name old time/op new time/op delta PrimesTest-4 399µs ± 1% 301µs ± 0% -24.46% (p=0.000 n=10+8) ***@***.***:~/src/issue_16092$ benchstat 1.8 tip name old time/op new time/op delta PrimesTest-4 395µs ± 1% 301µs ± 0% -23.66% (p=0.000 n=9+8) ***@***.***:~/src/issue_16092$ benchstat 1.7 1.8 name old time/op new time/op delta PrimesTest-4 399µs ± 1% 395µs ± 1% -1.06% (p=0.000 n=10+9) — You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub <#16092 (comment)>, or mute the thread <https://github.com/notifications/unsubscribe-auth/AKM2TV3uSQQN78yddXVWJWLdWjasnSD9ks5t-Tn3gaJpZM4I4IVh> .

GordonBGood · 2018-07-17T11:34:20Z

I should have clarified what I meant by "crap" native code produced by F# and golang: once array bounds checks are turned off or elided, the main restriction is not using read/modify/write instructions that can make the loop much more efficient as discussed previously. This is what is wrong with the F# loops, although in addition array bounds checks can't be turned off or easily elided for F# as they can be/could be in golang: F# never uses r/m/w as I assume golang still doesn't. Sophisticated simultaneous execution and OOE can mitigate this on high end processors just as advanced branch prediction can reduce the cost of bounds checks. On Tue, Jul 17, 2018, 05:06 W. Gordon Goodsman <gordonbgood@gmail.com> wrote:

…

This just goes to show that profiling without analysis is of limited use: What it does seem to show is a 25% improvement for current version for the particular CPU used. What it fails to consider is the runtime conditions and the impact on those results. The analysis in terms of clock cycles per cull loop is instructive: This algorithm when compiled efficiently with gcc C/C++, Nim, Rust, etc., takes about 5.5 clock cycles per cull (for about 200K cull operations) whether run on a high end modern CPU or a basic one such as the Intel Atom x5-Z8350 in my Windows tablet; thus a run time of about 0.3 second doesn't sound too bad assuming about a 3.5 GHz clock speed. However, I ran the benchmark on golang version 1.10.3 (which I don't know if includes the tweak or not) on my Atom taking about 13.75 clocks per cull! Even if new code would produce a 25% improvement, that would still take about 10.5 to 11 clocks per cull. For comparison, the same algorithm run with F# (which produces very unoptimized code for this loop) takes about this same number of clocks of 10.5 per cull. What's the difference between high end CPU's and my Atom other than amount and sophistication of caches unused by this benchmark (and obviously the price - grin)? High end processors have more sophisticated branch prediction, better Out of Order Execution (OOE), and simultaneous instruction execution than my poor Atom; this can serve to hide some of the deficiencies of crap code as in this case. I didn't bother doing the full autopsy on the code as I did in the OP, but it seems obvious that it is still producing poor quality native code just as F# does (for which I have also done the autopsy) other than tip has likely (finally) eliminated a redundant register load I identified in the OP. So it's too early to close this issue and I would say you likely have a long way to go before golang speed approaches the leaders of the pack, in many use cases by a factor of two or more as for this benchmark. It also shows the deficiencies of benchmarking/profiling in judging quality of code given modern high-end CPU's abilities to make the best of the bad. It also shows that benchmark results should be given in numbers of clock cycles to eliminate CPU clock speed, but that CPU model and generation can still skew results of benchmarks as here. A case could be made that all benchmarks should be verified on a lower end processor as I did. On Tue, Jun 19, 2018, 11:50 Brad Fitzpatrick ***@***.***> wrote: > Out of curiosity, I measured where we're at now compared to Go 1.7: (now > == tip 1caa062 > <1caa062> > ) > > ***@***.***:~/src/issue_16092$ benchstat 1.7 tip > name old time/op new time/op delta > PrimesTest-4 399µs ± 1% 301µs ± 0% -24.46% (p=0.000 n=10+8) > > ***@***.***:~/src/issue_16092$ benchstat 1.8 tip > name old time/op new time/op delta > PrimesTest-4 395µs ± 1% 301µs ± 0% -23.66% (p=0.000 n=9+8) > > ***@***.***:~/src/issue_16092$ benchstat 1.7 1.8 > name old time/op new time/op delta > PrimesTest-4 399µs ± 1% 395µs ± 1% -1.06% (p=0.000 n=10+9) > > — > You are receiving this because you were mentioned. > Reply to this email directly, view it on GitHub > <#16092 (comment)>, or mute > the thread > <https://github.com/notifications/unsubscribe-auth/AKM2TV3uSQQN78yddXVWJWLdWjasnSD9ks5t-Tn3gaJpZM4I4IVh> > . >

quentinmit changed the title ~~compiler: new backend does not use immediate load instruction...~~ cmd/compile: new backend does not use immediate load instruction... Jun 17, 2016

quentinmit added this to the Go1.7 milestone Jun 17, 2016

quentinmit added the NeedsDecision label Jun 17, 2016

randall77 modified the milestones: Go1.8, Go1.7 Jun 17, 2016

randall77 self-assigned this Jun 17, 2016

randall77 added Performance and removed NeedsDecision labels Jun 17, 2016

GordonBGood mentioned this issue Jun 27, 2016

cmd/compile: new backend; multiple efficiency issues... #16192

Open

quentinmit added the NeedsFix label Oct 11, 2016

rsc modified the milestones: Go1.9, Go1.8 Oct 21, 2016

rsc changed the title ~~cmd/compile: new backend does not use immediate load instruction...~~ cmd/compile: complicated bounds check elimination Oct 21, 2016

navytux mentioned this issue Feb 16, 2017

cmd/compile: checking len(data) and erroring out if too small does not help BCE #19126

Closed

josharian modified the milestones: Go1.10, Go1.9 May 18, 2017

bradfitz modified the milestones: Go1.10, Go1.11 Nov 28, 2017

bradfitz modified the milestones: Go1.11, Unplanned Jun 19, 2018

rsc unassigned randall77 Jun 23, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cmd/compile: complicated bounds check elimination #16092

cmd/compile: complicated bounds check elimination #16092

GordonBGood commented Jun 17, 2016 •

edited

Loading

quentinmit commented Jun 17, 2016

gopherbot commented Oct 5, 2016

randall77 commented Oct 5, 2016

rsc commented Oct 21, 2016

GordonBGood commented Oct 22, 2016 •

edited

Loading

rasky commented Feb 16, 2017 •

edited

Loading

josharian commented May 18, 2017

GordonBGood commented May 19, 2017

bradfitz commented Jun 19, 2018

GordonBGood commented Jul 17, 2018 via email

GordonBGood commented Jul 17, 2018 via email

cmd/compile: complicated bounds check elimination #16092

cmd/compile: complicated bounds check elimination #16092

Comments

GordonBGood commented Jun 17, 2016 • edited Loading

quentinmit commented Jun 17, 2016

gopherbot commented Oct 5, 2016

randall77 commented Oct 5, 2016

rsc commented Oct 21, 2016

GordonBGood commented Oct 22, 2016 • edited Loading

rasky commented Feb 16, 2017 • edited Loading

josharian commented May 18, 2017

GordonBGood commented May 19, 2017

bradfitz commented Jun 19, 2018

GordonBGood commented Jul 17, 2018 via email

GordonBGood commented Jul 17, 2018 via email

GordonBGood commented Jun 17, 2016 •

edited

Loading

GordonBGood commented Oct 22, 2016 •

edited

Loading

rasky commented Feb 16, 2017 •

edited

Loading