Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

proposal: bufio: add Reader.DiscardRunes(n int) and utf8: add RuneByteLen #47621

Closed
cafra opened this issue Aug 10, 2021 · 20 comments
Closed

proposal: bufio: add Reader.DiscardRunes(n int) and utf8: add RuneByteLen #47621

cafra opened this issue Aug 10, 2021 · 20 comments

Comments

@cafra
Copy link

cafra commented Aug 10, 2021

What version of Go are you using (go version)?

$ go version
go version go1.17.1 darwin/amd64

Does this issue reproduce with the latest release?

Yes

What operating system and processor architecture are you using (go env)?

go env Output
$ go env
GO111MODULE="on"
GOARCH="amd64"
GOBIN=""
GOCACHE="/Users/cz/Library/Caches/go-build"
GOENV="/Users/cz/Library/Application Support/go/env"
GOEXE=""
GOEXPERIMENT=""
GOFLAGS=""
GOHOSTARCH="amd64"
GOHOSTOS="darwin"
GOINSECURE=""
GOMODCACHE="/Users/cz/go/pkg/mod"
GONOPROXY="gitlab.alipay-inc.com/*"
GONOSUMDB="gitlab.alipay-inc.com/*"
GOOS="darwin"
GOPATH="/Users/cz/go"
GOPRIVATE="gitlab.alipay-inc.com/*"
GOPROXY="https://goproxy.io,direct"
GOROOT="/Users/cz/sdk/go1.17"
GOSUMDB="sum.golang.org"
GOTMPDIR=""
GOTOOLDIR="/Users/cz/sdk/go1.17/pkg/tool/darwin_amd64"
GOVCS=""
GOVERSION="go1.17"
GCCGO="gccgo"
AR="ar"
CC="clang"
CXX="clang++"
CGO_ENABLED="1"
GOMOD="/Users/cz/go/src/go.mod"
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -arch x86_64 -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -fdebug-prefix-map=/var/folders/3g/6n_rr7kx61d3dbbjbsn394fr0000gp/T/go-build180543292=/tmp/go-build -gno-record-gcc-switches -fno-common"

What did you do?

In mesh service traffic management, there are often businesses that need to parse request messages.
But the protocol analysis performance is the bottleneck.
Therefore, by adding the feature of extracting the corresponding value according to the rules in the decode of the hessian-go library, avoid parsing all request messages.
In the hessian protocol, the length of each data type records the length of characters but the bytes, that is, the length of rune.
For data that does not match to the rules, use bufio.DiscardRunes to skip useless data to improve the performance of parsing parameters.
Discard can only skip the length of byte, but there are problems with utf8 type data,
so the bufio.DiscardRunes and utf8.RuneByteLen is extended to meet the demand.

What did you expect to see?

bufio Reader add DiscardRunes(n int) for skipping byte data of specified length characters;
utf8 add RuneByteLen for getting the number of bytes of the first utf-8 encoding in bytes;

  • bufio code
// DiscardRunes skips the next n runes, returning the number of bytes discarded.
//
// If DiscardRunes skips fewer than n runes, it also returns an error.
// If 0 <= n <= b.Buffered(), DiscardRunes is guaranteed to succeed without
// reading from the underlying io.Reader.
func (b *Reader) DiscardRunes(n int) (discardedBytes int, err error) {
	if n < 0 {
		return 0, ErrNegativeCount
	}
	if n == 0 {
		return
	}
	for i := 0; i < n; i++ {
		for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil && b.w-b.r < len(b.buf) {
			b.fill() // b.w-b.r < len(buf) => buffer is not full
		}

		r, size := rune(b.buf[b.r]), 1
		if r >= utf8.RuneSelf {
			size = utf8.RuneByteLen(b.buf[b.r:b.w])
		}
		discardedBytes += size
		b.r += size
	}

	return discardedBytes, nil
}
  • utf8 code
// RuneByteLen returns the number of bytes of the first utf-8 encoding in p.
// If p is empty it returns 0. Otherwise, if
// the encoding is invalid, it returns 1.
// Both are impossible results for correct, non-empty UTF-8.
//
// an encoding is invalid if it is incorrect utf-8, encodes a rune that is
// out of range, or is not the shortest possible utf-8 encoding for the
// value. no other validation is performed.
func RuneByteLen(p []byte) (size int) {
	n := len(p)
	if n < 1 {
		return 0
	}
	p0 := p[0]
	x := first[p0]
	if x >= as {
		return 1
	}
	sz := int(x & 7)
	accept := acceptRanges[x>>4]
	if n < sz {
		return 1
	}
	b1 := p[1]
	if b1 < accept.lo || accept.hi < b1 {
		return 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return 2
	}
	b2 := p[2]
	if b2 < locb || hicb < b2 {
		return 1
	}
	if sz <= 3 {
		return 3
	}
	b3 := p[3]
	if b3 < locb || hicb < b3 {
		return 1
	}
	return 4
}
  • performance code
func BenchmarkDiscardVsRead(b *testing.B) {
	b.Run("DiscardRunes", func(b *testing.B) {
		data := strings.Repeat("中", 4097)
		for i := 0; i < b.N; i++ {
			buf := bytes.NewBufferString(data)
			b := NewReader(buf)
			b.DiscardRunes(4097)
		}
	})

	b.Run("DiscardRunesCompare(NoRuneByteLen)", func(b *testing.B) {
		data := strings.Repeat("中", 4097)
		for i := 0; i < b.N; i++ {
			buf := bytes.NewBufferString(data)
			b := NewReader(buf)
			b.DiscardRunesCompare(4097)
		}
	})

	b.Run("readRuneForDiscard", func(b *testing.B) {
		data := strings.Repeat("中", 4097)
		for i := 0; i < b.N; i++ {
			buf := bytes.NewBufferString(data)
			b := NewReader(buf)
			for i := 0; i < 4097; i++ {
				b.ReadRune()
			}
		}
	})
}


func (b *Reader) DiscardRunesCompare(n int) (discardedBytes int, err error) {
	if n < 0 {
		return 0, ErrNegativeCount
	}
	if n == 0 {
		return
	}
	for i := 0; i < n; i++ {
		for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil && b.w-b.r < len(b.buf) {
			b.fill() // b.w-b.r < len(buf) => buffer is not full
		}

		r, size := rune(b.buf[b.r]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(b.buf[b.r:b.w])
		}
		discardedBytes += size
		b.r += size
	}

	return discardedBytes, nil
}
  • performance data
goos: darwin
goarch: amd64
pkg: bufio
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
BenchmarkDiscardVsRead
BenchmarkDiscardVsRead/DiscardRunes-12        	   49599	     23714 ns/op
BenchmarkDiscardVsRead/DiscardRunesCompare-12    42514	     27172 ns/op
BenchmarkDiscardVsRead/readRuneForDiscard-12   	    37992	     31682 ns/op
PASS

What did you see instead?

DiscardRunes has 12% performance improvement
DiscardRunes + RuneByteLen has 31% performance improvement

Please post code as ordinary text or a link to the Go playground, not as an image. Images are hard to read. Thanks.

@seankhliao seankhliao changed the title Add discardRune(runeLen int) to the bufio package proposal: bufio: add Reader.DiscardRune(n int) Aug 10, 2021
@gopherbot gopherbot added this to the Proposal milestone Aug 10, 2021
@ianlancetaylor
Copy link
Contributor

Please post code as ordinary text or a link to the Go playground, not as an image. Images are hard to read. Thanks.

@ianlancetaylor
Copy link
Contributor

To clarify, I think you are suggesting something like this:

// DiscardRune skips the next n UTF-8 encoded runes, returning the number of bytes discarded.
//
// If Discard skips fewer than n runes, it also returns an error.
func (b *Reader) DiscardRune(n int) (discardedBytes int, err error)

Should this be DiscardRunes rather than DiscardRune?

@ianlancetaylor ianlancetaylor added this to Incoming in Proposals (old) Aug 10, 2021
@neild
Copy link
Contributor

neild commented Aug 10, 2021

Definitely DiscardRunes.

Since bufio.Reader already has a methods operating on runes, DiscardRunes doesn't seem prima facie unreasonable. It's perhaps a bit specialized, however, and seems very easy to misuse by discarding part of a grapheme cluster.

@cafra
Copy link
Author

cafra commented Aug 11, 2021

Please post code as ordinary text or a link to the Go playground, not as an image. Images are hard to read. Thanks.

The ordinary code has been pasted. And has been renamed to DiscardRunes

@cafra
Copy link
Author

cafra commented Aug 11, 2021

Definitely DiscardRunes.

Since bufio.Reader already has a methods operating on runes, DiscardRunes doesn't seem prima facie unreasonable. It's perhaps a bit specialized, however, and seems very easy to misuse by discarding part of a grapheme cluster.

func BenchmarkDiscardVsRead(b *testing.B) {
	b.Run("discardRunes", func(b *testing.B) {
		data:= strings.Repeat("中",4097)
		for i := 0; i < b.N; i++ {
			buf:=bytes.NewBufferString(data)
			b:=NewReader(buf)
			b.DiscardRunes(4097)
		}
	})

	b.Run("readRuneForDiscard", func(b *testing.B) {
		data:= strings.Repeat("中",4097)
		for i := 0; i < b.N; i++ {
			buf:=bytes.NewBufferString(data)
			b:=NewReader(buf)
			for i:=0;i<4097;i++{
				b.ReadRune()
			}
		}
	})
}
goos: darwin
goarch: amd64
pkg: bufio
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
BenchmarkDiscardVsRead
BenchmarkDiscardVsRead/discardRunes
BenchmarkDiscardVsRead/discardRunes-12         	   40064	     26675 ns/op
BenchmarkDiscardVsRead/readRuneForDiscard
BenchmarkDiscardVsRead/readRuneForDiscard-12   	   27902	     40588 ns/op
PASS

As in the bench test, although there is a rune operation,
ReadRune will execute batch calls to DecodeRune,
and there are a lot of bit operations inside DecodeRune,
such as rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx )<<6 | rune(b3&maskx).
In the discard scene, it is only necessary to obtain the byte length of the rune, and does not need to perform bit operations to obtain the real data.
At this point, if there are DiscardRunes, the performance can be improved by 30%.
In addition, the bufio package is not declared as an interface, so developers cannot provide their own implementation.

@gopherbot
Copy link

Change https://golang.org/cl/341390 mentions this issue: bufio: add Reader.DiscardRune(n int)

@taoyuanyuan
Copy link
Contributor

cool, I have the same scenario, good job!
Btw, ReadRune() and DiscardRunes() in pairs, very easy to identify them.

@martisch
Copy link
Contributor

If this needs to get supported: Instead of GetRuneBytes can the implementation reuse the existing DecodeRune? Its seems a lot of the logic would otherwise be duplicated.

@cafra
Copy link
Author

cafra commented Aug 16, 2021

If this needs to get supported: Instead of GetRuneBytes can the implementation reuse the existing DecodeRune? Its seems a lot of the logic would otherwise be duplicated.

GetRuneBytes has been deleted
Extend and reuse DecodeRune

@rsc
Copy link
Contributor

rsc commented Sep 1, 2021

This proposal has been added to the active column of the proposals project
and will now be reviewed at the weekly proposal review meetings.
— rsc for the proposal review group

@rsc rsc moved this from Incoming to Active in Proposals (old) Sep 1, 2021
@rsc
Copy link
Contributor

rsc commented Sep 8, 2021

It seems like a mistake for the protocol to use rune-length-prefixed strings, but we can't do anything about that.

Is it possible to speed up DecodeRune instead? It seems like GetRuneBytes in the code above is exactly utf8.DecodeRune, just without the shifts and ors. It's hard to believe that that's 30% of the time.

Why is this special case so much faster, and can we make the general case faster?

@cafra cafra changed the title proposal: bufio: add Reader.DiscardRune(n int) proposal: bufio: add Reader.DiscardRune(n int) and utf8: add RuneByteLen Sep 14, 2021
@cafra cafra changed the title proposal: bufio: add Reader.DiscardRune(n int) and utf8: add RuneByteLen proposal: bufio: add Reader.DiscardRunes(n int) and utf8: add RuneByteLen Sep 14, 2021
@cafra
Copy link
Author

cafra commented Sep 14, 2021

It seems like a mistake for the protocol to use rune-length-prefixed strings, but we can't do anything about that.

Is it possible to speed up DecodeRune instead? It seems like GetRuneBytes in the code above is exactly utf8.DecodeRune, just without the shifts and ors. It's hard to believe that that's 30% of the time.

Why is this special case so much faster, and can we make the general case faster?

  1. The hessian protocol is a cross-language encoding and decoding protocol. The java language is used more often and is used in many open source projects, such as sofaabolt-hessian and dubbo-hessian.
    As for why the agreement is set up in this way, I personally cannot comment on its good or bad

  2. In this scenario, the bit operation to get rune in DecodeRune is not needed, this operation is time-consuming and meaningless. Implementation code and test code have been updated above, I will also submit pr.

  3. The purpose of DiscardRunes is to obtain the byte length of each rune in a loop, and then move the r pointer of the reader. This process only moves the r index according to the number of bytes in the rune, so the performance is very good.
    In addition, this case may not be a special case, and other developers have also encountered similar needs.

@gopherbot
Copy link

Change https://golang.org/cl/349773 mentions this issue: bufio: add Reader.DiscardRunes for discarding rune of specified length;

@robpike
Copy link
Contributor

robpike commented Sep 15, 2021

This seems too special-purpose and poorly defined to be added to the standard library. If it's truly a Java service you're talking to, you might not want rune at all but something to do with UTF-16, which is a subtly different thing. Also what happens with surrogate pairs and other horrors?

I believe the logic is specific to the protocol, and should be implemented there. In the meantime, I agree that if we can speed up some of the operations in the library to speed up your solution without adding new functionality, that would be a good way to respond.

If the proposal does go forward, it will need to follow Go's conventions about invalid UTF-8, which are not applicable to Java anyway, another vote against.

@cafra
Copy link
Author

cafra commented Sep 15, 2021

should be implemented ther

This seems too special-purpose and poorly defined to be added to the standard library. If it's truly a Java service you're talking to, you might not want rune at all but something to do with UTF-16, which is a subtly different thing. Also what happens with surrogate pairs and other horrors?

I believe the logic is specific to the protocol, and should be implemented there. In the meantime, I agree that if we can speed up some of the operations in the library to speed up your solution without adding new functionality, that would be a good way to respond.

If the proposal does go forward, it will need to follow Go's conventions about invalid UTF-8, which are not applicable to Java anyway, another vote against.

Maybe what you said makes sense. But we need a solution.
Because the extension method discardRunes cannot be provided for bufio.reader, we can only submit pr to the standard library at present. Do you have an extension method?

Just submit the discardRune proposal, Abandon the utf8 proposal, is this okay?

@martisch
Copy link
Contributor

Since its very special use case: Could this be used when implemented as a new standalone Reader implementation outside the standard library or is there a special reason the standard library bufio Reader needs to be extended?

@rsc
Copy link
Contributor

rsc commented Sep 15, 2021

If there are reasonable performance optimizations possible in utf8.DecodeRune, let's apply them.
Adding utf8.RuneByteLen, which is DecodeRune with one result omitted doesn't make much sense.
Next we would need RuneByteLenInString.
The same for bufio.Reader.DiscardRunes.

Today we already have r.Peek(r.Buffered()) to see the entire buffered data,
and then it's possible to determine the number of bytes to drop and use r.Discard(n).
So the discard runes code can be written outside bufio but still interact efficiently with bufio.

It seems like we should probably decline to add this to the standard library.

@rsc
Copy link
Contributor

rsc commented Sep 22, 2021

Based on the discussion above, this proposal seems like a likely decline.
— rsc for the proposal review group

@rsc rsc moved this from Active to Likely Decline in Proposals (old) Sep 22, 2021
@cafra
Copy link
Author

cafra commented Sep 26, 2021

Thanks for the positive response and help of the go team, the current proposal can be closed, and the existing methods can be used to extend the required functions, because the current scenarios are indeed relatively few.

The following is the code, hope to help other brothers in need.

  • code
package sofahessian

import (
	"bufio"
	"errors"
	"unicode/utf8"
)

const (
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// These names of these constants are chosen to give nice alignment in the
	// table below. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)

var (
	ErrReaderBufIsEmpty = errors.New("sofa-hessian: reader.buf is empty")
)

// first is information about the first byte in a UTF-8 sequence.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}

// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}

// DiscardRunes skips the next n runes, returning the number of bytes discarded.
//
// If DiscardRunes skips fewer than n runes, it also returns an error.
// If 0 <= n <= b.Buffered(), DiscardRunes is guaranteed to succeed without
func DiscardRunes(reader *bufio.Reader, n int) (int, error) {
	if n <= 0 {
		return 0, bufio.ErrNegativeCount
	}
	totalDiscardBytes := 0
	runeCount := 0

	for runeCount < n {
		if reader.Buffered() == 0 {
			// Initial state/critical need to trigger fill
			_, err := reader.Peek(1)
			if err != nil {
				// The case of empty data
				return totalDiscardBytes, err
			}
		}
		bufLen := reader.Buffered()
		if bufLen == 0 {
			// Robust judgment to avoid infinite loop
			return totalDiscardBytes, ErrReaderBufIsEmpty
		}
		// Here peek will not open up new memory space, so there is no need to worry about memory loss when n is small
		buf, err := reader.Peek(bufLen)
		if err != nil {
			return totalDiscardBytes, err
		}
		byteCount := 0
		fullRuneFlag := true

		for byteCount < len(buf) {
			r, byteSize := rune(buf[byteCount]), 1
			if r >= utf8.RuneSelf {
				byteSize = RuneByteLen(buf[byteCount:])
				if byteSize == -1 {
					fullRuneFlag = false
					break
				}
			}
			byteCount += byteSize
			runeCount++
			if runeCount >= n {
				break
			}
		}
		discardBytes, err := reader.Discard(byteCount)
		if err != nil {
			return discardBytes, err
		}

		totalDiscardBytes += discardBytes
		if !fullRuneFlag {
			err = peekForFill(reader)
			if err != nil {
				return totalDiscardBytes, err
			}
		}
	}

	return totalDiscardBytes, nil
}

const UTF8MaxByte = 4

// peekForFill incremental peek triggers reader.buf fill
// Detailed description: When critical, that is, the remaining // byte is not a complete rune, fill needs to be triggered again,
// But the peek value cannot be evaluated, it can be incremented
func peekForFill(reader *bufio.Reader) error {
	for i := 0; i < UTF8MaxByte; i++ {
		bytes, err := reader.Peek(i + 1)
		if err != nil {
			return err
		}
		byteSize := RuneByteLen(bytes)
		if byteSize != -1 {
			return nil
		}
	}
	return nil
}

// RuneByteLen returns the number of bytes of the first utf-8 encoding in p.
// If p is empty it returns 0. Otherwise, if
// the encoding is invalid, it returns 1.
// Both are impossible results for correct, non-empty UTF-8.
//
// an encoding is invalid if it is incorrect utf-8, encodes a rune that is
// out of range, or is not the shortest possible utf-8 encoding for the
// value. no other validation is performed.
func RuneByteLen(p []byte) (size int) {
	n := len(p)
	if n < 1 {
		return 0
	}
	// Handling the case where critical byte is not a complete rune
	if !utf8.FullRune(p) {
		return -1
	}

	p0 := p[0]
	x := first[p0]
	if x >= as {
		return 1
	}
	sz := int(x & 7)
	accept := acceptRanges[x>>4]
	if n < sz {
		return 1
	}
	b1 := p[1]
	if b1 < accept.lo || accept.hi < b1 {
		return 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return 2
	}
	b2 := p[2]
	if b2 < locb || hicb < b2 {
		return 1
	}
	if sz <= 3 {
		return 3
	}
	b3 := p[3]
	if b3 < locb || hicb < b3 {
		return 1
	}
	return 4
}
  • test and bench code
package sofahessian

import (
	"bufio"
	"bytes"
	"fmt"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestDiscardRunes(t *testing.T) {
	type args struct {
		n int
	}
	tests := []struct {
		name               string
		fields             string
		args               args
		wantDiscardedBytes int
		wantErr            bool
	}{
		{
			name:               "nil",
			fields:             "",
			args:               struct{ n int }{n: 1},
			wantDiscardedBytes: 0,
			wantErr:            true,
		},
		{
			name:               "decode>buf",
			fields:             "中",
			args:               struct{ n int }{n: 2},
			wantDiscardedBytes: 3,
			wantErr:            true,
		},
		{
			name:               "中",
			fields:             "中",
			args:               struct{ n int }{n: 1},
			wantDiscardedBytes: 3,
			wantErr:            false,
		},
		{
			name:               "a中",
			fields:             "a中",
			args:               struct{ n int }{n: 2},
			wantDiscardedBytes: 4,
			wantErr:            false,
		},
		{
			name:               "a中b",
			fields:             "a中b",
			args:               struct{ n int }{n: 3},
			wantDiscardedBytes: 5,
			wantErr:            false,
		},
		{
			name:               "UTF8 need fill case",
			fields:             strings.Repeat("a", 4094) + "中",
			args:               struct{ n int }{n: 4095},
			wantDiscardedBytes: 4097,
			wantErr:            false,
		},
		{
			name:               "multi_chunk_4097",
			fields:             strings.Repeat("中", 4097),
			args:               struct{ n int }{n: 4097},
			wantDiscardedBytes: 4097 * 3,
			wantErr:            false,
		},
		{
			name:               "1024 a中b",
			fields:             strings.Repeat("a中b", 1024),
			args:               struct{ n int }{n: 1800},
			wantDiscardedBytes: 1800 - (1800 / 3) + (1800/3)*3,
			wantErr:            false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			buf := bytes.NewBufferString(tt.fields)
			reader := bufio.NewReader(buf)
			gotDiscardedBytes, err := DiscardRunes(reader, tt.args.n)
			if (err != nil) != tt.wantErr {
				t.Errorf("DiscardRunes() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if gotDiscardedBytes != tt.wantDiscardedBytes {
				t.Errorf("DiscardRunes() gotDiscardedBytes = %v, want %v", gotDiscardedBytes, tt.wantDiscardedBytes)
			}
		})
	}
}

/**
Performance testing should cover different data volume ranges
*/
var (
	assertSwitch = true
)

func BenchmarkDiscardVsReadJustAscii(b *testing.B) {
	caseList := map[int]string{
		1:      "The actual situation is less",
		5:      "The actual situation is not much",
		10:     "The actual situation is common",
		100:    "Large amount of data",
		1000:   "2 Large amount of data",
		10000:  "3 Large amount of data",
		100000: "4 Large amount of data",
	}

	for len, explanation := range caseList {
		caseName := fmt.Sprintf("Character length %v Description:%v", len, explanation)
		b.Run(caseName, func(b *testing.B) {
			expectByteLen := len
			discardRune := len
			data := strings.Repeat("a", len)
			buf := bytes.NewBufferString(data)
			reader := bufio.NewReader(buf)

			b.Run("ReadRuneForDiscard", func(b *testing.B) {
				b.ReportAllocs()
				for i := 0; i < b.N; i++ {
					buf.Reset()
					buf.Write([]byte(data))
					reader.Reset(buf)
					discard, err := ReadRuneForDiscard(reader, discardRune)

					if assertSwitch {
						assert.Nil(b, err)
						assert.Equal(b, expectByteLen, discard)
					}
				}
			})

			b.Run("DiscardRunes", func(b *testing.B) {
				b.ReportAllocs()
				for i := 0; i < b.N; i++ {
					buf.Reset()
					buf.Write([]byte(data))
					reader.Reset(buf)
					discard, err := DiscardRunes(reader, discardRune)
					if assertSwitch {
						assert.Nil(b, err)
						assert.Equal(b, expectByteLen, discard)
					}
				}
			})
		})
	}

}

func BenchmarkDiscardVsReadAsciiUTF8(b *testing.B) {
	caseList := map[int]string{
		1:     "The actual situation is less",
		5:     "The actual situation is not much",
		10:    "The actual situation is common",
		100:   "Large amount of data",
		1000:  "2 Large amount of data",
		10000: "3 Large amount of data",
	}
	//assertSwitch = true
	for len, explanation := range caseList {
		caseName := fmt.Sprintf("Character length %v Description:%v", len*10, explanation)
		b.Run(caseName, func(b *testing.B) {
			expectByteLen := 10 * len
			discardRune := 6 * len

			data := strings.Repeat("a中cd国e", len)
			buf := bytes.NewBufferString(data)
			reader := bufio.NewReader(buf)

			b.Run("ReadRuneForDiscard", func(b *testing.B) {
				b.ReportAllocs()
				for i := 0; i < b.N; i++ {
					buf.Reset()
					buf.Write([]byte(data))
					reader.Reset(buf)
					discard, err := ReadRuneForDiscard(reader, discardRune)

					if assertSwitch {
						assert.Nil(b, err)
						assert.Equal(b, expectByteLen, discard)
					}
				}
			})

			b.Run("DiscardRunes", func(b *testing.B) {
				b.ReportAllocs()
				for i := 0; i < b.N; i++ {
					buf.Reset()
					buf.Write([]byte(data))
					reader.Reset(buf)
					discard, err := DiscardRunes(reader, discardRune)
					if assertSwitch {
						assert.Nil(b, err)
						assert.Equal(b, expectByteLen, discard)
					}
				}
			})
		})
	}

}

func BenchmarkDiscardVsReadUTF(b *testing.B) {
	expectByteLen := 5
	discardRune := 3
	data := strings.Repeat("a中b", 100)
	buf := bytes.NewBufferString(data)
	reader := bufio.NewReader(buf)

	b.Run("DiscardRunes", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := DiscardRunes(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})

	b.Run("ReadRuneForDiscard", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := ReadRuneForDiscard(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})
}

func BenchmarkDiscardVsReadUTFMiddle(b *testing.B) {
	discardRune := 1800
	expectByteLen := 1800 - (1800 / 3) + (1800/3)*3
	data := strings.Repeat("a中b", 1024)
	buf := bytes.NewBufferString(data)
	reader := bufio.NewReader(buf)

	b.Run("DiscardRunes", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := DiscardRunes(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})

	b.Run("ReadRuneForDiscard", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := ReadRuneForDiscard(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})
}

func BenchmarkDiscardVsReadUTFNeedReaderFill(b *testing.B) {
	discardRune := 4098
	expectByteLen := 4098 - (4098 / 3) + (4098/3)*3
	data := strings.Repeat("a中b", 4096)
	buf := bytes.NewBufferString(data)
	reader := bufio.NewReader(buf)

	b.Run("DiscardRunes", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := DiscardRunes(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})

	b.Run("ReadRuneForDiscard", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf.Reset()
			buf.Write([]byte(data))
			reader.Reset(buf)

			discard, err := ReadRuneForDiscard(reader, discardRune)

			if assertSwitch {
				assert.Nil(b, err)
				assert.Equal(b, expectByteLen, discard)
			}
		}
	})
}

// ReadRuneForDiscard is just for compare with DiscardRunes.
func ReadRuneForDiscard(reader *bufio.Reader, n int) (discardedBytes int, err error) {
	for i := 0; i < n; i++ {
		_, size, err := reader.ReadRune()
		if err != nil {
			return size, err
		}
		discardedBytes += size
	}
	return discardedBytes, nil
}

@cafra cafra closed this as completed Sep 26, 2021
@rsc rsc moved this from Likely Decline to Declined in Proposals (old) Oct 6, 2021
@rsc
Copy link
Contributor

rsc commented Oct 6, 2021

This proposal has been declined as retracted.
— rsc for the proposal review group

@golang golang locked and limited conversation to collaborators Oct 6, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
No open projects
Development

Successfully merging a pull request may close this issue.

8 participants