// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Note: the file data_test.go that is generated should not be checked in. //go:generate go run maketables.go triegen.go //go:generate go test -tags test // Package norm contains types and functions for normalizing Unicode strings. package norm // import "golang.org/x/text/unicode/norm" import ( "unicode/utf8" "golang.org/x/text/transform" ) // A Form denotes a canonical representation of Unicode code points. // The Unicode-defined normalization and equivalence forms are: // // NFC Unicode Normalization Form C // NFD Unicode Normalization Form D // NFKC Unicode Normalization Form KC // NFKD Unicode Normalization Form KD // // For a Form f, this documentation uses the notation f(x) to mean // the bytes or string x converted to the given form. // A position n in x is called a boundary if conversion to the form can // proceed independently on both sides: // // f(x) == append(f(x[0:n]), f(x[n:])...) // // References: https://unicode.org/reports/tr15/ and // https://unicode.org/notes/tn5/. type Form int const ( NFC Form = iota NFD NFKC NFKD ) // Bytes returns f(b). May return b if f(b) = b. func (f Form) Bytes(b []byte) []byte { src := inputBytes(b) ft := formTable[f] n, ok := ft.quickSpan(src, 0, len(b), true) if ok { return b } out := make([]byte, n, len(b)) copy(out, b[0:n]) rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} return doAppendInner(&rb, n) } // String returns f(s). func (f Form) String(s string) string { src := inputString(s) ft := formTable[f] n, ok := ft.quickSpan(src, 0, len(s), true) if ok { return s } out := make([]byte, n, len(s)) copy(out, s[0:n]) rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} return string(doAppendInner(&rb, n)) } // IsNormal returns true if b == f(b). func (f Form) IsNormal(b []byte) bool { src := inputBytes(b) ft := formTable[f] bp, ok := ft.quickSpan(src, 0, len(b), true) if ok { return true } rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} rb.setFlusher(nil, cmpNormalBytes) for bp < len(b) { rb.out = b[bp:] if bp = decomposeSegment(&rb, bp, true); bp < 0 { return false } bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) } return true } func cmpNormalBytes(rb *reorderBuffer) bool { b := rb.out for i := 0; i < rb.nrune; i++ { info := rb.rune[i] if int(info.size) > len(b) { return false } p := info.pos pe := p + info.size for ; p < pe; p++ { if b[0] != rb.byte[p] { return false } b = b[1:] } } return true } // IsNormalString returns true if s == f(s). func (f Form) IsNormalString(s string) bool { src := inputString(s) ft := formTable[f] bp, ok := ft.quickSpan(src, 0, len(s), true) if ok { return true } rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} rb.setFlusher(nil, func(rb *reorderBuffer) bool { for i := 0; i < rb.nrune; i++ { info := rb.rune[i] if bp+int(info.size) > len(s) { return false } p := info.pos pe := p + info.size for ; p < pe; p++ { if s[bp] != rb.byte[p] { return false } bp++ } } return true }) for bp < len(s) { if bp = decomposeSegment(&rb, bp, true); bp < 0 { return false } bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) } return true } // patchTail fixes a case where a rune may be incorrectly normalized // if it is followed by illegal continuation bytes. It returns the // patched buffer and whether the decomposition is still in progress. func patchTail(rb *reorderBuffer) bool { info, p := lastRuneStart(&rb.f, rb.out) if p == -1 || info.size == 0 { return true } end := p + int(info.size) extra := len(rb.out) - end if extra > 0 { // Potentially allocating memory. However, this only // happens with ill-formed UTF-8. x := make([]byte, 0) x = append(x, rb.out[len(rb.out)-extra:]...) rb.out = rb.out[:end] decomposeToLastBoundary(rb) rb.doFlush() rb.out = append(rb.out, x...) return false } buf := rb.out[p:] rb.out = rb.out[:p] decomposeToLastBoundary(rb) if s := rb.ss.next(info); s == ssStarter { rb.doFlush() rb.ss.first(info) } else if s == ssOverflow { rb.doFlush() rb.insertCGJ() rb.ss = 0 } rb.insertUnsafe(inputBytes(buf), 0, info) return true } func appendQuick(rb *reorderBuffer, i int) int { if rb.nsrc == i { return i } end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) rb.out = rb.src.appendSlice(rb.out, i, end) return end } // Append returns f(append(out, b...)). // The buffer out must be nil, empty, or equal to f(out). func (f Form) Append(out []byte, src ...byte) []byte { return f.doAppend(out, inputBytes(src), len(src)) } func (f Form) doAppend(out []byte, src input, n int) []byte { if n == 0 { return out } ft := formTable[f] // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer. if len(out) == 0 { p, _ := ft.quickSpan(src, 0, n, true) out = src.appendSlice(out, 0, p) if p == n { return out } rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} return doAppendInner(&rb, p) } rb := reorderBuffer{f: *ft, src: src, nsrc: n} return doAppend(&rb, out, 0) } func doAppend(rb *reorderBuffer, out []byte, p int) []byte { rb.setFlusher(out, appendFlush) src, n := rb.src, rb.nsrc doMerge := len(out) > 0 if q := src.skipContinuationBytes(p); q > p { // Move leading non-starters to destination. rb.out = src.appendSlice(rb.out, p, q) p = q doMerge = patchTail(rb) } fd := &rb.f if doMerge { var info Properties if p < n { info = fd.info(src, p) if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { if p == 0 { decomposeToLastBoundary(rb) } p = decomposeSegment(rb, p, true) } } if info.size == 0 { rb.doFlush() // Append incomplete UTF-8 encoding. return src.appendSlice(rb.out, p, n) } if rb.nrune > 0 { return doAppendInner(rb, p) } } p = appendQuick(rb, p) return doAppendInner(rb, p) } func doAppendInner(rb *reorderBuffer, p int) []byte { for n := rb.nsrc; p < n; { p = decomposeSegment(rb, p, true) p = appendQuick(rb, p) } return rb.out } // AppendString returns f(append(out, []byte(s))). // The buffer out must be nil, empty, or equal to f(out). func (f Form) AppendString(out []byte, src string) []byte { return f.doAppend(out, inputString(src), len(src)) } // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). // It is not guaranteed to return the largest such n. func (f Form) QuickSpan(b []byte) int { n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) return n } // Span implements transform.SpanningTransformer. It returns a boundary n such // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. func (f Form) Span(b []byte, atEOF bool) (n int, err error) { n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) if n < len(b) { if !ok { err = transform.ErrEndOfSpan } else { err = transform.ErrShortSrc } } return n, err } // SpanString returns a boundary n such that s[0:n] == f(s[0:n]). // It is not guaranteed to return the largest such n. func (f Form) SpanString(s string, atEOF bool) (n int, err error) { n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) if n < len(s) { if !ok { err = transform.ErrEndOfSpan } else { err = transform.ErrShortSrc } } return n, err } // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and // whether any non-normalized parts were found. If atEOF is false, n will // not point past the last segment if this segment might be become // non-normalized by appending other runes. func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { var lastCC uint8 ss := streamSafe(0) lastSegStart := i for n = end; i < n; { if j := src.skipASCII(i, n); i != j { i = j lastSegStart = i - 1 lastCC = 0 ss = 0 continue } info := f.info(src, i) if info.size == 0 { if atEOF { // include incomplete runes return n, true } return lastSegStart, true } // This block needs to be before the next, because it is possible to // have an overflow for runes that are starters (e.g. with U+FF9E). switch ss.next(info) { case ssStarter: lastSegStart = i case ssOverflow: return lastSegStart, false case ssSuccess: if lastCC > info.ccc { return lastSegStart, false } } if f.composing { if !info.isYesC() { break } } else { if !info.isYesD() { break } } lastCC = info.ccc i += int(info.size) } if i == n { if !atEOF { n = lastSegStart } return n, true } return lastSegStart, false } // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). // It is not guaranteed to return the largest such n. func (f Form) QuickSpanString(s string) int { n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) return n } // FirstBoundary returns the position i of the first boundary in b // or -1 if b contains no boundary. func (f Form) FirstBoundary(b []byte) int { return f.firstBoundary(inputBytes(b), len(b)) } func (f Form) firstBoundary(src input, nsrc int) int { i := src.skipContinuationBytes(0) if i >= nsrc { return -1 } fd := formTable[f] ss := streamSafe(0) // We should call ss.first here, but we can't as the first rune is // skipped already. This means FirstBoundary can't really determine // CGJ insertion points correctly. Luckily it doesn't have to. for { info := fd.info(src, i) if info.size == 0 { return -1 } if s := ss.next(info); s != ssSuccess { return i } i += int(info.size) if i >= nsrc { if !info.BoundaryAfter() && !ss.isMax() { return -1 } return nsrc } } } // FirstBoundaryInString returns the position i of the first boundary in s // or -1 if s contains no boundary. func (f Form) FirstBoundaryInString(s string) int { return f.firstBoundary(inputString(s), len(s)) } // NextBoundary reports the index of the boundary between the first and next // segment in b or -1 if atEOF is false and there are not enough bytes to // determine this boundary. func (f Form) NextBoundary(b []byte, atEOF bool) int { return f.nextBoundary(inputBytes(b), len(b), atEOF) } // NextBoundaryInString reports the index of the boundary between the first and // next segment in b or -1 if atEOF is false and there are not enough bytes to // determine this boundary. func (f Form) NextBoundaryInString(s string, atEOF bool) int { return f.nextBoundary(inputString(s), len(s), atEOF) } func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { if nsrc == 0 { if atEOF { return 0 } return -1 } fd := formTable[f] info := fd.info(src, 0) if info.size == 0 { if atEOF { return 1 } return -1 } ss := streamSafe(0) ss.first(info) for i := int(info.size); i < nsrc; i += int(info.size) { info = fd.info(src, i) if info.size == 0 { if atEOF { return i } return -1 } // TODO: Using streamSafe to determine the boundary isn't the same as // using BoundaryBefore. Determine which should be used. if s := ss.next(info); s != ssSuccess { return i } } if !atEOF && !info.BoundaryAfter() && !ss.isMax() { return -1 } return nsrc } // LastBoundary returns the position i of the last boundary in b // or -1 if b contains no boundary. func (f Form) LastBoundary(b []byte) int { return lastBoundary(formTable[f], b) } func lastBoundary(fd *formInfo, b []byte) int { i := len(b) info, p := lastRuneStart(fd, b) if p == -1 { return -1 } if info.size == 0 { // ends with incomplete rune if p == 0 { // starts with incomplete rune return -1 } i = p info, p = lastRuneStart(fd, b[:i]) if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter return i } } if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 return i } if info.BoundaryAfter() { return i } ss := streamSafe(0) v := ss.backwards(info) for i = p; i >= 0 && v != ssStarter; i = p { info, p = lastRuneStart(fd, b[:i]) if v = ss.backwards(info); v == ssOverflow { break } if p+int(info.size) != i { if p == -1 { // no boundary found return -1 } return i // boundary after an illegal UTF-8 encoding } } return i } // decomposeSegment scans the first segment in src into rb. It inserts 0x034f // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters // and returns the number of bytes consumed from src or iShortDst or iShortSrc. func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { // Force one character to be consumed. info := rb.f.info(rb.src, sp) if info.size == 0 { return 0 } if s := rb.ss.next(info); s == ssStarter { // TODO: this could be removed if we don't support merging. if rb.nrune > 0 { goto end } } else if s == ssOverflow { rb.insertCGJ() goto end } if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { return int(err) } for { sp += int(info.size) if sp >= rb.nsrc { if !atEOF && !info.BoundaryAfter() { return int(iShortSrc) } break } info = rb.f.info(rb.src, sp) if info.size == 0 { if !atEOF { return int(iShortSrc) } break } if s := rb.ss.next(info); s == ssStarter { break } else if s == ssOverflow { rb.insertCGJ() break } if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { return int(err) } } end: if !rb.doFlush() { return int(iShortDst) } return sp } // lastRuneStart returns the runeInfo and position of the last // rune in buf or the zero runeInfo and -1 if no rune was found. func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { p := len(buf) - 1 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { } if p < 0 { return Properties{}, -1 } return fd.info(inputBytes(buf), p), p } // decomposeToLastBoundary finds an open segment at the end of the buffer // and scans it into rb. Returns the buffer minus the last segment. func decomposeToLastBoundary(rb *reorderBuffer) { fd := &rb.f info, i := lastRuneStart(fd, rb.out) if int(info.size) != len(rb.out)-i { // illegal trailing continuation bytes return } if info.BoundaryAfter() { return } var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order padd := 0 ss := streamSafe(0) p := len(rb.out) for { add[padd] = info v := ss.backwards(info) if v == ssOverflow { // Note that if we have an overflow, it the string we are appending to // is not correctly normalized. In this case the behavior is undefined. break } padd++ p -= int(info.size) if v == ssStarter || p < 0 { break } info, i = lastRuneStart(fd, rb.out[:p]) if int(info.size) != p-i { break } } rb.ss = ss // Copy bytes for insertion as we may need to overwrite rb.out. var buf [maxBufferSize * utf8.UTFMax]byte cp := buf[:copy(buf[:], rb.out[p:])] rb.out = rb.out[:p] for padd--; padd >= 0; padd-- { info = add[padd] rb.insertUnsafe(inputBytes(cp), 0, info) cp = cp[info.size:] } }