// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import "encoding/binary"

// This file contains Form-specific logic and wrappers for data in tables.go.

// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
//
//	bits
//	15:    1 (inverse of NFD_QC bit of qcInfo)
//	13..8: qcInfo (see below). isYesD is always true (no decomposition).
//	 7..0: ccc (compressed CCC value).
//
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//
//	<header> <decomp_byte>* [<tccc> [<lccc>]]
//
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.

const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)

// Properties provides access to normalization properties of a rune.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	// index is the offset of the decomposition header in decomps. It is zero
	// when no decomposition is recorded (see Decomposition and compInfo).
	index uint16
}

// lookupFunc is the signature of the per-form lookup functions: it returns
// the Properties for input b at index i.
type lookupFunc func(b input, i int) Properties

// formInfo holds Form-specific functions and tables.
type formInfo struct { form Form composing, compatibility bool // form type info lookupFunc nextMain iterFunc } var formTable = []*formInfo{{ form: NFC, composing: true, compatibility: false, info: lookupInfoNFC, nextMain: nextComposed, }, { form: NFD, composing: false, compatibility: false, info: lookupInfoNFC, nextMain: nextDecomposed, }, { form: NFKC, composing: true, compatibility: true, info: lookupInfoNFKC, nextMain: nextComposed, }, { form: NFKD, composing: false, compatibility: true, info: lookupInfoNFKC, nextMain: nextDecomposed, }} // We do not distinguish between boundaries for NFC, NFD, etc. to avoid // unexpected behavior for the user. For example, in NFD, there is a boundary // after 'a'. However, 'a' might combine with modifiers, so from the application's // perspective it is not a good boundary. We will therefore always use the // boundaries for the combining variants. // BoundaryBefore returns true if this rune starts a new segment and // cannot combine with any rune on the left. func (p Properties) BoundaryBefore() bool { if p.ccc == 0 && !p.combinesBackward() { return true } // We assume that the CCC of the first character in a decomposition // is always non-zero if different from info.ccc and that we can return // false at this point. This is verified by maketables. return false } // BoundaryAfter returns true if runes cannot combine with or otherwise // interact with this or previous runes. func (p Properties) BoundaryAfter() bool { // TODO: loosen these conditions. return p.isInert() } // We pack quick check data in 4 bits: // // 5: Combines forward (0 == false, 1 == true) // 4..3: NFC_QC Yes(00), No (10), or Maybe (11) // 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. // 1..0: Number of trailing non-starters. // // When all 4 bits are zero, the character is inert, meaning it is never // influenced by normalization. 
type qcInfo uint8 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD func (p Properties) isInert() bool { return p.flags&qcInfoMask == 0 && p.ccc == 0 } func (p Properties) multiSegment() bool { return p.index >= firstMulti && p.index < endMulti } func (p Properties) nLeadingNonStarters() uint8 { return p.nLead } func (p Properties) nTrailingNonStarters() uint8 { return uint8(p.flags & 0x03) } // Decomposition returns the decomposition for the underlying rune // or nil if there is none. func (p Properties) Decomposition() []byte { // TODO: create the decomposition for Hangul? if p.index == 0 { return nil } i := p.index n := decomps[i] & headerLenMask i++ return decomps[i : i+uint16(n)] } // Size returns the length of UTF-8 encoding of the rune. func (p Properties) Size() int { return int(p.size) } // CCC returns the canonical combining class of the underlying rune. func (p Properties) CCC() uint8 { if p.index >= firstCCCZeroExcept { return 0 } return ccc[p.ccc] } // LeadCCC returns the CCC of the first rune in the decomposition. // If there is no decomposition, LeadCCC equals CCC. func (p Properties) LeadCCC() uint8 { return ccc[p.ccc] } // TrailCCC returns the CCC of the last rune in the decomposition. // If there is no decomposition, TrailCCC equals CCC. 
func (p Properties) TrailCCC() uint8 { return ccc[p.tccc] } func buildRecompMap() { recompMap = make(map[uint32]rune, len(recompMapPacked)/8) var buf [8]byte for i := 0; i < len(recompMapPacked); i += 8 { copy(buf[:], recompMapPacked[i:i+8]) key := binary.BigEndian.Uint32(buf[:4]) val := binary.BigEndian.Uint32(buf[4:]) recompMap[key] = rune(val) } } // Recomposition // We use 32-bit keys instead of 64-bit for the two codepoint keys. // This clips off the bits of three entries, but we know this will not // result in a collision. In the unlikely event that changes to // UnicodeData.txt introduce collisions, the compiler will catch it. // Note that the recomposition map for NFC and NFKC are identical. // combine returns the combined rune or 0 if it doesn't exist. // // The caller is responsible for calling // recompMapOnce.Do(buildRecompMap) sometime before this is called. func combine(a, b rune) rune { key := uint32(uint16(a))<<16 + uint32(uint16(b)) if recompMap == nil { panic("caller error") // see func comment } return recompMap[key] } func lookupInfoNFC(b input, i int) Properties { v, sz := b.charinfoNFC(i) return compInfo(v, sz) } func lookupInfoNFKC(b input, i int) Properties { v, sz := b.charinfoNFKC(i) return compInfo(v, sz) } // Properties returns properties for the first rune in s. func (f Form) Properties(s []byte) Properties { if f == NFC || f == NFD { return compInfo(nfcData.lookup(s)) } return compInfo(nfkcData.lookup(s)) } // PropertiesString returns properties for the first rune in s. func (f Form) PropertiesString(s string) Properties { if f == NFC || f == NFD { return compInfo(nfcData.lookupString(s)) } return compInfo(nfkcData.lookupString(s)) } // compInfo converts the information contained in v and sz // to a Properties. See the comment at the top of the file // for more information on the format. 
func compInfo(v uint16, sz int) Properties { if v == 0 { return Properties{size: uint8(sz)} } else if v >= 0x8000 { p := Properties{ size: uint8(sz), ccc: uint8(v), tccc: uint8(v), flags: qcInfo(v >> 8), } if p.ccc > 0 || p.combinesBackward() { p.nLead = uint8(p.flags & 0x3) } return p } // has decomposition h := decomps[v] f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 p := Properties{size: uint8(sz), flags: f, index: v} if v >= firstCCC { v += uint16(h&headerLenMask) + 1 c := decomps[v] p.tccc = c >> 2 p.flags |= qcInfo(c & 0x3) if v >= firstLeadingCCC { p.nLead = c & 0x3 if v >= firstStarterWithNLead { // We were tricked. Remove the decomposition. p.flags &= 0x03 p.index = 0 return p } p.ccc = decomps[v+1] } } return p }