-
Notifications
You must be signed in to change notification settings - Fork 18k
unicode/utf16: some performance optimization #6957
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Labels
FrozenDueToAge
Performance
Suggested
Issues that may be good for new contributors looking for work to do.
Milestone
Comments
// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package utf16 implements encoding and decoding of UTF-16 sequences. package utf16 // The conditions replacementChar==unicode.ReplacementChar and // maxRune==unicode.MaxRune are verified in the tests. // Defining them locally avoids this package depending on package unicode. const ( replacementChar = '\uFFFD' // Unicode replacement character maxRune = '\U0010FFFF' // Maximum valid Unicode code point. ) const ( // 0xd800-0xdc00 encodes the high 10 bits of a pair. // 0xdc00-0xe000 encodes the low 10 bits of a pair. // the value is those 20 bits plus 0x10000. surr1 = 0xd800 surr2 = 0xdc00 surr3 = 0xe000 surrSelf = 0x10000 ) // IsSurrogate returns true if the specified Unicode code point // can appear in a surrogate pair. func IsSurrogate(r rune) bool { return surr1 <= r && r < surr3 } // DecodeRune returns the UTF-16 decoding of a surrogate pair. // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns // the Unicode replacement code point U+FFFD. func DecodeRune(r1, r2 rune) rune { if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { return (r1-surr1)<<10 | (r2 - surr2) + surrSelf } return replacementChar } // EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. // If the rune is not a valid Unicode code point or does not need encoding, // EncodeRune returns U+FFFD, U+FFFD. func EncodeRune(r rune) (r1, r2 rune) { if r < surrSelf || r > maxRune { return replacementChar, replacementChar } r -= surrSelf return surr1 + r>>10, surr2 + r&0x3ff } // Encode returns the UTF-16 encoding of the Unicode code point sequence s. func Encode(s []rune) []uint16 { n := len(s) for _, v := range s { if v >= surrSelf { n++ } } a := make([]uint16, n) n = 0 var r1, r2 rune for _, v := range s { switch { case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: a[n] = uint16(v) n++ case surrSelf <= v && v <= maxRune: r1, r2 = EncodeRune(v) a[n] = uint16(r1) a[n+1] = uint16(r2) n += 2 default: a[n] = uint16(replacementChar) n++ } } return a[:n] } // Decode returns the Unicode code point sequence represented // by the UTF-16 encoding s. func Decode(s []uint16) []rune { sLen := len(s) a := make([]rune, sLen) n := 0 var r uint16 for i := 0; i < sLen; i++ { switch r = s[i]; { case r < surr1, surr3 <= r: // normal rune a[n] = rune(r) n++ case surr1 <= r && r < surr2 && i+1 < sLen && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence a[n] = DecodeRune(rune(r), rune(s[i+1])) i++ n++ default: // invalid surrogate sequence a[n] = replacementChar n++ } } return a[:n] } Attachments:
|
for the interested, the diff is: --- pkg/unicode/utf16/utf16.go 2013-08-18 14:53:21.000000000 -0400 +++ ~/Downloads/utf16.go 2013-12-15 16:57:24.000000000 -0500 @@ -36,7 +36,7 @@ // the Unicode replacement code point U+FFFD. func DecodeRune(r1, r2 rune) rune { if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { - return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + 0x10000 + return (r1-surr1)<<10 | (r2 - surr2) + surrSelf } return replacementChar } @@ -45,11 +45,11 @@ // If the rune is not a valid Unicode code point or does not need encoding, // EncodeRune returns U+FFFD, U+FFFD. func EncodeRune(r rune) (r1, r2 rune) { - if r < surrSelf || r > maxRune || IsSurrogate(r) { + if r < surrSelf || r > maxRune { return replacementChar, replacementChar } r -= surrSelf - return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff + return surr1 + r>>10, surr2 + r&0x3ff } // Encode returns the UTF-16 encoding of the Unicode code point sequence s. @@ -63,46 +63,49 @@ a := make([]uint16, n) n = 0 + var r1, r2 rune for _, v := range s { switch { - case v < 0, surr1 <= v && v < surr3, v > maxRune: - v = replacementChar - fallthrough - case v < surrSelf: + case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: a[n] = uint16(v) n++ - default: - r1, r2 := EncodeRune(v) + case surrSelf <= v && v <= maxRune: + r1, r2 = EncodeRune(v) a[n] = uint16(r1) a[n+1] = uint16(r2) n += 2 + default: + a[n] = uint16(replacementChar) + n++ } } - return a[0:n] + return a[:n] } // Decode returns the Unicode code point sequence represented // by the UTF-16 encoding s. func Decode(s []uint16) []rune { - a := make([]rune, len(s)) + sLen := len(s) + a := make([]rune, sLen) n := 0 - for i := 0; i < len(s); i++ { - switch r := s[i]; { - case surr1 <= r && r < surr2 && i+1 < len(s) && + var r uint16 + for i := 0; i < sLen; i++ { + switch r = s[i]; { + case r < surr1, surr3 <= r: + // normal rune + a[n] = rune(r) + n++ + case surr1 <= r && r < surr2 && i+1 < sLen && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence a[n] = DecodeRune(rune(r), rune(s[i+1])) i++ n++ - case surr1 <= r && r < surr3: + default: // invalid surrogate sequence a[n] = replacementChar n++ - default: - // normal rune - a[n] = rune(r) - n++ } } - return a[0:n] + return a[:n] } |
If you want to suggest performance optimization to unicode/utf16, please follow the steps here: http://golang.org/doc/contribute.html and send CL to golang-dev. I suggest you also add benchmarks to unicode/utf16 and demonstrate the performance differences in your CL description (use $GOROOT/misc/benchcmp to compare output of "go test -bench=. unicode/utf16") Labels changed: added release-none, repo-main, performance. |
thanks all of you, my English sucks, for here the process is not very understanding, can only do so much of that is, If you think I submitted a bit useful, please help me to submit and communication, thank you. We all want golang better. Also, my submit not modify the bug, but remove unnecessary statement, modify the judgment order of switch case. // Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package utf16 implements encoding and decoding of UTF-16 sequences. package utf16 // The conditions replacementChar==unicode.ReplacementChar and // maxRune==unicode.MaxRune are verified in the tests. // Defining them locally avoids this package depending on package unicode. const ( replacementChar = '\uFFFD' // Unicode replacement character maxRune = '\U0010FFFF' // Maximum valid Unicode code point. ) const ( // 0xd800-0xdc00 encodes the high 10 bits of a pair. // 0xdc00-0xe000 encodes the low 10 bits of a pair. // the value is those 20 bits plus 0x10000. surr1 = 0xd800 surr2 = 0xdc00 surr3 = 0xe000 surrSelf = 0x10000 ) // IsSurrogate returns true if the specified Unicode code point // can appear in a surrogate pair. func IsSurrogate(r rune) bool { return surr1 <= r && r < surr3 } // decodeRune returns the UTF-16 decoding of a surrogate pair. // the pair r1, r2 is a valid UTF-16 surrogate pair. func decodeRune(r1, r2 rune) rune { return (r1-surr1)<<10 | (r2 - surr2) + surrSelf } // DecodeRune returns the UTF-16 decoding of a surrogate pair. // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns // the Unicode replacement code point U+FFFD. func DecodeRune(r1, r2 rune) rune { if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { return decodeRune(r1, r2) } return replacementChar } // encodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. // surrSelf <= r && r <= maxRune func encodeRune(r rune) (r1, r2 rune) { r -= surrSelf return surr1 + r>>10, surr2 + r&0x3ff } // EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. // If the rune is not a valid Unicode code point or does not need encoding, // EncodeRune returns U+FFFD, U+FFFD. func EncodeRune(r rune) (r1, r2 rune) { if r < surrSelf || r > maxRune { return replacementChar, replacementChar } return encodeRune(r) } // Encode returns the UTF-16 encoding of the Unicode code point sequence s. func Encode(s []rune) []uint16 { n := len(s) for _, v := range s { if v >= surrSelf { n++ } } a := make([]uint16, n) n = 0 var r1, r2 rune for _, v := range s { switch { case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: a[n] = uint16(v) n++ case surrSelf <= v && v <= maxRune: r1, r2 = encodeRune(v) a[n] = uint16(r1) a[n+1] = uint16(r2) n += 2 default: a[n] = uint16(replacementChar) n++ } } return a[:n] } // Decode returns the Unicode code point sequence represented // by the UTF-16 encoding s. func Decode(s []uint16) []rune { sLen := len(s) a := make([]rune, sLen) n := 0 var r uint16 for i := 0; i < sLen; i++ { switch r = s[i]; { case r < surr1, surr3 <= r: // normal rune a[n] = rune(r) n++ case surr1 <= r && r < surr2 && i+1 < sLen && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence a[n] = decodeRune(rune(r), rune(s[i+1])) i++ n++ default: // invalid surrogate sequence a[n] = replacementChar n++ } } return a[:n] } |
Marking as accepted, but please note this has not been assigned to a release and may not be worked on for 1.3 or later. @chanxuehong please consider following the contribution guidelines and submitting a proposal. I think this improvement is valid, but we cannot for reasons of policy accept patches except via the Rietveld system. Labels changed: added suggested. Status changed to Accepted. |
CL https://golang.org/cl/19820 mentions this issue. |
gopherbot
pushed a commit
that referenced
this issue
Feb 23, 2016
For #6957 Change-Id: Ic497c12f33efc933e9fe81f6cd1b2a0a01abbabf Reviewed-on: https://go-review.googlesource.com/19820 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
CL https://golang.org/cl/19826 mentions this issue. |
gopherbot
pushed a commit
that referenced
this issue
Feb 23, 2016
name old time/op new time/op delta DecodeValidASCII-4 94.7ns ± 1% 87.4ns ± 1% -7.71% (p=0.000 n=10+9) DecodeValidJapaneseChars-4 91.0ns ± 2% 84.8ns ± 0% -6.77% (p=0.000 n=9+10) DecodeRune-4 16.5ns ± 0% 16.6ns ± 2% ~ (p=0.108 n=9+10) For #6957 Change-Id: I618c15c2a42ef7ec6a5cd163b7c3f1a65ca4ad01 Reviewed-on: https://go-review.googlesource.com/19826 Reviewed-by: Rob Pike <r@golang.org>
CL https://golang.org/cl/19891 mentions this issue. |
Sign up for free
to subscribe to this conversation on GitHub.
Already have an account?
Sign in.
Labels
FrozenDueToAge
Performance
Suggested
Issues that may be good for new contributors looking for work to do.
by chanxuehong:
The text was updated successfully, but these errors were encountered: