...
Run Format

Source file src/unicode/letter.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package unicode provides data and functions to test some properties of
     6	// Unicode code points.
     7	package unicode
     8	
     9	// Tables are regenerated each time we update the Unicode version.
    10	//go:generate go run maketables.go -tables=all -output tables.go
    11	
    12	const (
    13		MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
    14		ReplacementChar = '\uFFFD'     // Represents invalid code points.
    15		MaxASCII        = '\u007F'     // maximum ASCII value.
    16		MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
    17	)
    18	
    19	// RangeTable defines a set of Unicode code points by listing the ranges of
    20	// code points within the set. The ranges are listed in two slices
    21	// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
    22	// The two slices must be in sorted order and non-overlapping.
    23	// Also, R32 should contain only values >= 0x10000 (1<<16).
    24	type RangeTable struct {
    25		R16         []Range16
    26		R32         []Range32
    27		LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
    28	}
    29	
    30	// Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi
    31	// inclusive and has the specified stride.
    32	type Range16 struct {
    33		Lo     uint16
    34		Hi     uint16
    35		Stride uint16
    36	}
    37	
    38	// Range32 represents of a range of Unicode code points and is used when one or
    39	// more of the values will not fit in 16 bits. The range runs from Lo to Hi
    40	// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
    41	type Range32 struct {
    42		Lo     uint32
    43		Hi     uint32
    44		Stride uint32
    45	}
    46	
    47	// CaseRange represents a range of Unicode code points for simple (one
    48	// code point to one code point) case conversion.
    49	// The range runs from Lo to Hi inclusive, with a fixed stride of 1.  Deltas
    50	// are the number to add to the code point to reach the code point for a
    51	// different case for that character. They may be negative. If zero, it
    52	// means the character is in the corresponding case. There is a special
    53	// case representing sequences of alternating corresponding Upper and Lower
    54	// pairs. It appears with a fixed Delta of
    55	//	{UpperLower, UpperLower, UpperLower}
    56	// The constant UpperLower has an otherwise impossible delta value.
    57	type CaseRange struct {
    58		Lo    uint32
    59		Hi    uint32
    60		Delta d
    61	}
    62	
    63	// SpecialCase represents language-specific case mappings such as Turkish.
    64	// Methods of SpecialCase customize (by overriding) the standard mappings.
    65	type SpecialCase []CaseRange
    66	
    67	// BUG(r): There is no mechanism for full case folding, that is, for
    68	// characters that involve multiple runes in the input or output.
    69	
    70	// Indices into the Delta arrays inside CaseRanges for case mapping.
    71	const (
    72		UpperCase = iota
    73		LowerCase
    74		TitleCase
    75		MaxCase
    76	)
    77	
    78	type d [MaxCase]rune // to make the CaseRanges text shorter
    79	
    80	// If the Delta field of a CaseRange is UpperLower, it means
    81	// this CaseRange represents a sequence of the form (say)
    82	// Upper Lower Upper Lower.
    83	const (
    84		UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
    85	)
    86	
    87	// linearMax is the maximum size table for linear search for non-Latin1 rune.
    88	// Derived by running 'go test -calibrate'.
    89	const linearMax = 18
    90	
    91	// is16 reports whether r is in the sorted slice of 16-bit ranges.
    92	func is16(ranges []Range16, r uint16) bool {
    93		if len(ranges) <= linearMax || r <= MaxLatin1 {
    94			for i := range ranges {
    95				range_ := &ranges[i]
    96				if r < range_.Lo {
    97					return false
    98				}
    99				if r <= range_.Hi {
   100					return (r-range_.Lo)%range_.Stride == 0
   101				}
   102			}
   103			return false
   104		}
   105	
   106		// binary search over ranges
   107		lo := 0
   108		hi := len(ranges)
   109		for lo < hi {
   110			m := lo + (hi-lo)/2
   111			range_ := &ranges[m]
   112			if range_.Lo <= r && r <= range_.Hi {
   113				return (r-range_.Lo)%range_.Stride == 0
   114			}
   115			if r < range_.Lo {
   116				hi = m
   117			} else {
   118				lo = m + 1
   119			}
   120		}
   121		return false
   122	}
   123	
   124	// is32 reports whether r is in the sorted slice of 32-bit ranges.
   125	func is32(ranges []Range32, r uint32) bool {
   126		if len(ranges) <= linearMax {
   127			for i := range ranges {
   128				range_ := &ranges[i]
   129				if r < range_.Lo {
   130					return false
   131				}
   132				if r <= range_.Hi {
   133					return (r-range_.Lo)%range_.Stride == 0
   134				}
   135			}
   136			return false
   137		}
   138	
   139		// binary search over ranges
   140		lo := 0
   141		hi := len(ranges)
   142		for lo < hi {
   143			m := lo + (hi-lo)/2
   144			range_ := ranges[m]
   145			if range_.Lo <= r && r <= range_.Hi {
   146				return (r-range_.Lo)%range_.Stride == 0
   147			}
   148			if r < range_.Lo {
   149				hi = m
   150			} else {
   151				lo = m + 1
   152			}
   153		}
   154		return false
   155	}
   156	
   157	// Is reports whether the rune is in the specified table of ranges.
   158	func Is(rangeTab *RangeTable, r rune) bool {
   159		r16 := rangeTab.R16
   160		if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
   161			return is16(r16, uint16(r))
   162		}
   163		r32 := rangeTab.R32
   164		if len(r32) > 0 && r >= rune(r32[0].Lo) {
   165			return is32(r32, uint32(r))
   166		}
   167		return false
   168	}
   169	
   170	func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
   171		r16 := rangeTab.R16
   172		if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
   173			return is16(r16[off:], uint16(r))
   174		}
   175		r32 := rangeTab.R32
   176		if len(r32) > 0 && r >= rune(r32[0].Lo) {
   177			return is32(r32, uint32(r))
   178		}
   179		return false
   180	}
   181	
   182	// IsUpper reports whether the rune is an upper case letter.
   183	func IsUpper(r rune) bool {
   184		// See comment in IsGraphic.
   185		if uint32(r) <= MaxLatin1 {
   186			return properties[uint8(r)]&pLmask == pLu
   187		}
   188		return isExcludingLatin(Upper, r)
   189	}
   190	
   191	// IsLower reports whether the rune is a lower case letter.
   192	func IsLower(r rune) bool {
   193		// See comment in IsGraphic.
   194		if uint32(r) <= MaxLatin1 {
   195			return properties[uint8(r)]&pLmask == pLl
   196		}
   197		return isExcludingLatin(Lower, r)
   198	}
   199	
   200	// IsTitle reports whether the rune is a title case letter.
   201	func IsTitle(r rune) bool {
   202		if r <= MaxLatin1 {
   203			return false
   204		}
   205		return isExcludingLatin(Title, r)
   206	}
   207	
   208	// to maps the rune using the specified case mapping.
   209	func to(_case int, r rune, caseRange []CaseRange) rune {
   210		if _case < 0 || MaxCase <= _case {
   211			return ReplacementChar // as reasonable an error as any
   212		}
   213		// binary search over ranges
   214		lo := 0
   215		hi := len(caseRange)
   216		for lo < hi {
   217			m := lo + (hi-lo)/2
   218			cr := caseRange[m]
   219			if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
   220				delta := cr.Delta[_case]
   221				if delta > MaxRune {
   222					// In an Upper-Lower sequence, which always starts with
   223					// an UpperCase letter, the real deltas always look like:
   224					//	{0, 1, 0}    UpperCase (Lower is next)
   225					//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
   226					// The characters at even offsets from the beginning of the
   227					// sequence are upper case; the ones at odd offsets are lower.
   228					// The correct mapping can be done by clearing or setting the low
   229					// bit in the sequence offset.
   230					// The constants UpperCase and TitleCase are even while LowerCase
   231					// is odd so we take the low bit from _case.
   232					return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1))
   233				}
   234				return r + delta
   235			}
   236			if r < rune(cr.Lo) {
   237				hi = m
   238			} else {
   239				lo = m + 1
   240			}
   241		}
   242		return r
   243	}
   244	
   245	// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
   246	func To(_case int, r rune) rune {
   247		return to(_case, r, CaseRanges)
   248	}
   249	
   250	// ToUpper maps the rune to upper case.
   251	func ToUpper(r rune) rune {
   252		if r <= MaxASCII {
   253			if 'a' <= r && r <= 'z' {
   254				r -= 'a' - 'A'
   255			}
   256			return r
   257		}
   258		return To(UpperCase, r)
   259	}
   260	
   261	// ToLower maps the rune to lower case.
   262	func ToLower(r rune) rune {
   263		if r <= MaxASCII {
   264			if 'A' <= r && r <= 'Z' {
   265				r += 'a' - 'A'
   266			}
   267			return r
   268		}
   269		return To(LowerCase, r)
   270	}
   271	
   272	// ToTitle maps the rune to title case.
   273	func ToTitle(r rune) rune {
   274		if r <= MaxASCII {
   275			if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
   276				r -= 'a' - 'A'
   277			}
   278			return r
   279		}
   280		return To(TitleCase, r)
   281	}
   282	
   283	// ToUpper maps the rune to upper case giving priority to the special mapping.
   284	func (special SpecialCase) ToUpper(r rune) rune {
   285		r1 := to(UpperCase, r, []CaseRange(special))
   286		if r1 == r {
   287			r1 = ToUpper(r)
   288		}
   289		return r1
   290	}
   291	
   292	// ToTitle maps the rune to title case giving priority to the special mapping.
   293	func (special SpecialCase) ToTitle(r rune) rune {
   294		r1 := to(TitleCase, r, []CaseRange(special))
   295		if r1 == r {
   296			r1 = ToTitle(r)
   297		}
   298		return r1
   299	}
   300	
   301	// ToLower maps the rune to lower case giving priority to the special mapping.
   302	func (special SpecialCase) ToLower(r rune) rune {
   303		r1 := to(LowerCase, r, []CaseRange(special))
   304		if r1 == r {
   305			r1 = ToLower(r)
   306		}
   307		return r1
   308	}
   309	
   310	// caseOrbit is defined in tables.go as []foldPair. Right now all the
   311	// entries fit in uint16, so use uint16.  If that changes, compilation
   312	// will fail (the constants in the composite literal will not fit in uint16)
   313	// and the types here can change to uint32.
   314	type foldPair struct {
   315		From uint16
   316		To   uint16
   317	}
   318	
   319	// SimpleFold iterates over Unicode code points equivalent under
   320	// the Unicode-defined simple case folding. Among the code points
   321	// equivalent to rune (including rune itself), SimpleFold returns the
   322	// smallest rune > r if one exists, or else the smallest rune >= 0.
   323	// If r is not a valid Unicode code point, SimpleFold(r) returns r.
   324	//
   325	// For example:
   326	//	SimpleFold('A') = 'a'
   327	//	SimpleFold('a') = 'A'
   328	//
   329	//	SimpleFold('K') = 'k'
   330	//	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
   331	//	SimpleFold('\u212A') = 'K'
   332	//
   333	//	SimpleFold('1') = '1'
   334	//
   335	//	SimpleFold(-2) = -2
   336	//
   337	func SimpleFold(r rune) rune {
   338		if r < 0 || r > MaxRune {
   339			return r
   340		}
   341	
   342		if int(r) < len(asciiFold) {
   343			return rune(asciiFold[r])
   344		}
   345	
   346		// Consult caseOrbit table for special cases.
   347		lo := 0
   348		hi := len(caseOrbit)
   349		for lo < hi {
   350			m := lo + (hi-lo)/2
   351			if rune(caseOrbit[m].From) < r {
   352				lo = m + 1
   353			} else {
   354				hi = m
   355			}
   356		}
   357		if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
   358			return rune(caseOrbit[lo].To)
   359		}
   360	
   361		// No folding specified. This is a one- or two-element
   362		// equivalence class containing rune and ToLower(rune)
   363		// and ToUpper(rune) if they are different from rune.
   364		if l := ToLower(r); l != r {
   365			return l
   366		}
   367		return ToUpper(r)
   368	}
   369	

View as plain text