...
Run Format

Source file src/unicode/utf8/utf8.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package utf8 implements functions and constants to support text encoded in
     6	// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7	package utf8
     8	
     9	// The conditions RuneError==unicode.ReplacementChar and
    10	// MaxRune==unicode.MaxRune are verified in the tests.
    11	// Defining them locally avoids this package depending on package unicode.
    12	
    13	// Numbers fundamental to the encoding.
    14	const (
    15		RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
    16		RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
    17		MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
    18		UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    19	)
    20	
    21	// Code points in the surrogate range are not valid for UTF-8.
    22	const (
    23		surrogateMin = 0xD800
    24		surrogateMax = 0xDFFF
    25	)
    26	
    27	const (
    28		t1 = 0x00 // 0000 0000
    29		tx = 0x80 // 1000 0000
    30		t2 = 0xC0 // 1100 0000
    31		t3 = 0xE0 // 1110 0000
    32		t4 = 0xF0 // 1111 0000
    33		t5 = 0xF8 // 1111 1000
    34	
    35		maskx = 0x3F // 0011 1111
    36		mask2 = 0x1F // 0001 1111
    37		mask3 = 0x0F // 0000 1111
    38		mask4 = 0x07 // 0000 0111
    39	
    40		rune1Max = 1<<7 - 1
    41		rune2Max = 1<<11 - 1
    42		rune3Max = 1<<16 - 1
    43	
    44		// The default lowest and highest continuation byte.
    45		locb = 0x80 // 1000 0000
    46		hicb = 0xBF // 1011 1111
    47	
    48		// These names of these constants are chosen to give nice alignment in the
    49		// table below. The first nibble is an index into acceptRanges or F for
    50		// special one-byte cases. The second nibble is the Rune length or the
    51		// Status for the special one-byte case.
    52		xx = 0xF1 // invalid: size 1
    53		as = 0xF0 // ASCII: size 1
    54		s1 = 0x02 // accept 0, size 2
    55		s2 = 0x13 // accept 1, size 3
    56		s3 = 0x03 // accept 0, size 3
    57		s4 = 0x23 // accept 2, size 3
    58		s5 = 0x34 // accept 3, size 4
    59		s6 = 0x04 // accept 0, size 4
    60		s7 = 0x44 // accept 4, size 4
    61	)
    62	
    63	// first is information about the first byte in a UTF-8 sequence.
    64	var first = [256]uint8{
    65		//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    66		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
    67		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
    68		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
    69		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
    70		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
    71		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
    72		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
    73		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
    74		//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    75		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
    76		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
    77		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
    78		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
    79		xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
    80		s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
    81		s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
    82		s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
    83	}
    84	
    85	// acceptRange gives the range of valid values for the second byte in a UTF-8
    86	// sequence.
    87	type acceptRange struct {
    88		lo uint8 // lowest value for second byte.
    89		hi uint8 // highest value for second byte.
    90	}
    91	
    92	var acceptRanges = [...]acceptRange{
    93		0: {locb, hicb},
    94		1: {0xA0, hicb},
    95		2: {locb, 0x9F},
    96		3: {0x90, hicb},
    97		4: {locb, 0x8F},
    98	}
    99	
   100	// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
   101	// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   102	func FullRune(p []byte) bool {
   103		n := len(p)
   104		if n == 0 {
   105			return false
   106		}
   107		x := first[p[0]]
   108		if n >= int(x&7) {
   109			return true // ASCII, invalid or valid.
   110		}
   111		// Must be short or invalid.
   112		accept := acceptRanges[x>>4]
   113		if n > 1 {
   114			if c := p[1]; c < accept.lo || accept.hi < c {
   115				return true
   116			} else if n > 2 && (p[2] < locb || hicb < p[2]) {
   117				return true
   118			}
   119		}
   120		return false
   121	}
   122	
   123	// FullRuneInString is like FullRune but its input is a string.
   124	func FullRuneInString(s string) bool {
   125		n := len(s)
   126		if n == 0 {
   127			return false
   128		}
   129		x := first[s[0]]
   130		if n >= int(x&7) {
   131			return true // ASCII, invalid, or valid.
   132		}
   133		// Must be short or invalid.
   134		accept := acceptRanges[x>>4]
   135		if n > 1 {
   136			if c := s[1]; c < accept.lo || accept.hi < c {
   137				return true
   138			} else if n > 2 && (s[2] < locb || hicb < s[2]) {
   139				return true
   140			}
   141		}
   142		return false
   143	}
   144	
   145	// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
   146	// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   147	// the encoding is invalid, it returns (RuneError, 1). Both are impossible
   148	// results for correct, non-empty UTF-8.
   149	//
   150	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   151	// out of range, or is not the shortest possible UTF-8 encoding for the
   152	// value. No other validation is performed.
   153	func DecodeRune(p []byte) (r rune, size int) {
   154		n := len(p)
   155		if n < 1 {
   156			return RuneError, 0
   157		}
   158		p0 := p[0]
   159		x := first[p0]
   160		if x >= as {
   161			// The following code simulates an additional check for x == xx and
   162			// handling the ASCII and invalid cases accordingly. This mask-and-or
   163			// approach prevents an additional branch.
   164			mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   165			return rune(p[0])&^mask | RuneError&mask, 1
   166		}
   167		sz := x & 7
   168		accept := acceptRanges[x>>4]
   169		if n < int(sz) {
   170			return RuneError, 1
   171		}
   172		b1 := p[1]
   173		if b1 < accept.lo || accept.hi < b1 {
   174			return RuneError, 1
   175		}
   176		if sz == 2 {
   177			return rune(p0&mask2)<<6 | rune(b1&maskx), 2
   178		}
   179		b2 := p[2]
   180		if b2 < locb || hicb < b2 {
   181			return RuneError, 1
   182		}
   183		if sz == 3 {
   184			return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
   185		}
   186		b3 := p[3]
   187		if b3 < locb || hicb < b3 {
   188			return RuneError, 1
   189		}
   190		return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
   191	}
   192	
   193	// DecodeRuneInString is like DecodeRune but its input is a string. If s is
   194	// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
   195	// returns (RuneError, 1). Both are impossible results for correct, non-empty
   196	// UTF-8.
   197	//
   198	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   199	// out of range, or is not the shortest possible UTF-8 encoding for the
   200	// value. No other validation is performed.
   201	func DecodeRuneInString(s string) (r rune, size int) {
   202		n := len(s)
   203		if n < 1 {
   204			return RuneError, 0
   205		}
   206		s0 := s[0]
   207		x := first[s0]
   208		if x >= as {
   209			// The following code simulates an additional check for x == xx and
   210			// handling the ASCII and invalid cases accordingly. This mask-and-or
   211			// approach prevents an additional branch.
   212			mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   213			return rune(s[0])&^mask | RuneError&mask, 1
   214		}
   215		sz := x & 7
   216		accept := acceptRanges[x>>4]
   217		if n < int(sz) {
   218			return RuneError, 1
   219		}
   220		s1 := s[1]
   221		if s1 < accept.lo || accept.hi < s1 {
   222			return RuneError, 1
   223		}
   224		if sz == 2 {
   225			return rune(s0&mask2)<<6 | rune(s1&maskx), 2
   226		}
   227		s2 := s[2]
   228		if s2 < locb || hicb < s2 {
   229			return RuneError, 1
   230		}
   231		if sz == 3 {
   232			return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
   233		}
   234		s3 := s[3]
   235		if s3 < locb || hicb < s3 {
   236			return RuneError, 1
   237		}
   238		return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
   239	}
   240	
   241	// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
   242	// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   243	// the encoding is invalid, it returns (RuneError, 1). Both are impossible
   244	// results for correct, non-empty UTF-8.
   245	//
   246	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   247	// out of range, or is not the shortest possible UTF-8 encoding for the
   248	// value. No other validation is performed.
   249	func DecodeLastRune(p []byte) (r rune, size int) {
   250		end := len(p)
   251		if end == 0 {
   252			return RuneError, 0
   253		}
   254		start := end - 1
   255		r = rune(p[start])
   256		if r < RuneSelf {
   257			return r, 1
   258		}
   259		// guard against O(n^2) behavior when traversing
   260		// backwards through strings with long sequences of
   261		// invalid UTF-8.
   262		lim := end - UTFMax
   263		if lim < 0 {
   264			lim = 0
   265		}
   266		for start--; start >= lim; start-- {
   267			if RuneStart(p[start]) {
   268				break
   269			}
   270		}
   271		if start < 0 {
   272			start = 0
   273		}
   274		r, size = DecodeRune(p[start:end])
   275		if start+size != end {
   276			return RuneError, 1
   277		}
   278		return r, size
   279	}
   280	
   281	// DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
   282	// s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
   283	// it returns (RuneError, 1). Both are impossible results for correct,
   284	// non-empty UTF-8.
   285	//
   286	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   287	// out of range, or is not the shortest possible UTF-8 encoding for the
   288	// value. No other validation is performed.
   289	func DecodeLastRuneInString(s string) (r rune, size int) {
   290		end := len(s)
   291		if end == 0 {
   292			return RuneError, 0
   293		}
   294		start := end - 1
   295		r = rune(s[start])
   296		if r < RuneSelf {
   297			return r, 1
   298		}
   299		// guard against O(n^2) behavior when traversing
   300		// backwards through strings with long sequences of
   301		// invalid UTF-8.
   302		lim := end - UTFMax
   303		if lim < 0 {
   304			lim = 0
   305		}
   306		for start--; start >= lim; start-- {
   307			if RuneStart(s[start]) {
   308				break
   309			}
   310		}
   311		if start < 0 {
   312			start = 0
   313		}
   314		r, size = DecodeRuneInString(s[start:end])
   315		if start+size != end {
   316			return RuneError, 1
   317		}
   318		return r, size
   319	}
   320	
   321	// RuneLen returns the number of bytes required to encode the rune.
   322	// It returns -1 if the rune is not a valid value to encode in UTF-8.
   323	func RuneLen(r rune) int {
   324		switch {
   325		case r < 0:
   326			return -1
   327		case r <= rune1Max:
   328			return 1
   329		case r <= rune2Max:
   330			return 2
   331		case surrogateMin <= r && r <= surrogateMax:
   332			return -1
   333		case r <= rune3Max:
   334			return 3
   335		case r <= MaxRune:
   336			return 4
   337		}
   338		return -1
   339	}
   340	
   341	// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
   342	// It returns the number of bytes written.
   343	func EncodeRune(p []byte, r rune) int {
   344		// Negative values are erroneous. Making it unsigned addresses the problem.
   345		switch i := uint32(r); {
   346		case i <= rune1Max:
   347			p[0] = byte(r)
   348			return 1
   349		case i <= rune2Max:
   350			_ = p[1] // eliminate bounds checks
   351			p[0] = t2 | byte(r>>6)
   352			p[1] = tx | byte(r)&maskx
   353			return 2
   354		case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
   355			r = RuneError
   356			fallthrough
   357		case i <= rune3Max:
   358			_ = p[2] // eliminate bounds checks
   359			p[0] = t3 | byte(r>>12)
   360			p[1] = tx | byte(r>>6)&maskx
   361			p[2] = tx | byte(r)&maskx
   362			return 3
   363		default:
   364			_ = p[3] // eliminate bounds checks
   365			p[0] = t4 | byte(r>>18)
   366			p[1] = tx | byte(r>>12)&maskx
   367			p[2] = tx | byte(r>>6)&maskx
   368			p[3] = tx | byte(r)&maskx
   369			return 4
   370		}
   371	}
   372	
   373	// RuneCount returns the number of runes in p. Erroneous and short
   374	// encodings are treated as single runes of width 1 byte.
   375	func RuneCount(p []byte) int {
   376		np := len(p)
   377		var n int
   378		for i := 0; i < np; {
   379			n++
   380			c := p[i]
   381			if c < RuneSelf {
   382				// ASCII fast path
   383				i++
   384				continue
   385			}
   386			x := first[c]
   387			if x == xx {
   388				i++ // invalid.
   389				continue
   390			}
   391			size := int(x & 7)
   392			if i+size > np {
   393				i++ // Short or invalid.
   394				continue
   395			}
   396			accept := acceptRanges[x>>4]
   397			if c := p[i+1]; c < accept.lo || accept.hi < c {
   398				size = 1
   399			} else if size == 2 {
   400			} else if c := p[i+2]; c < locb || hicb < c {
   401				size = 1
   402			} else if size == 3 {
   403			} else if c := p[i+3]; c < locb || hicb < c {
   404				size = 1
   405			}
   406			i += size
   407		}
   408		return n
   409	}
   410	
   411	// RuneCountInString is like RuneCount but its input is a string.
   412	func RuneCountInString(s string) (n int) {
   413		ns := len(s)
   414		for i := 0; i < ns; n++ {
   415			c := s[i]
   416			if c < RuneSelf {
   417				// ASCII fast path
   418				i++
   419				continue
   420			}
   421			x := first[c]
   422			if x == xx {
   423				i++ // invalid.
   424				continue
   425			}
   426			size := int(x & 7)
   427			if i+size > ns {
   428				i++ // Short or invalid.
   429				continue
   430			}
   431			accept := acceptRanges[x>>4]
   432			if c := s[i+1]; c < accept.lo || accept.hi < c {
   433				size = 1
   434			} else if size == 2 {
   435			} else if c := s[i+2]; c < locb || hicb < c {
   436				size = 1
   437			} else if size == 3 {
   438			} else if c := s[i+3]; c < locb || hicb < c {
   439				size = 1
   440			}
   441			i += size
   442		}
   443		return n
   444	}
   445	
   446	// RuneStart reports whether the byte could be the first byte of an encoded,
   447	// possibly invalid rune. Second and subsequent bytes always have the top two
   448	// bits set to 10.
   449	func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
   450	
   451	// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
   452	func Valid(p []byte) bool {
   453		n := len(p)
   454		for i := 0; i < n; {
   455			pi := p[i]
   456			if pi < RuneSelf {
   457				i++
   458				continue
   459			}
   460			x := first[pi]
   461			if x == xx {
   462				return false // Illegal starter byte.
   463			}
   464			size := int(x & 7)
   465			if i+size > n {
   466				return false // Short or invalid.
   467			}
   468			accept := acceptRanges[x>>4]
   469			if c := p[i+1]; c < accept.lo || accept.hi < c {
   470				return false
   471			} else if size == 2 {
   472			} else if c := p[i+2]; c < locb || hicb < c {
   473				return false
   474			} else if size == 3 {
   475			} else if c := p[i+3]; c < locb || hicb < c {
   476				return false
   477			}
   478			i += size
   479		}
   480		return true
   481	}
   482	
   483	// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
   484	func ValidString(s string) bool {
   485		n := len(s)
   486		for i := 0; i < n; {
   487			si := s[i]
   488			if si < RuneSelf {
   489				i++
   490				continue
   491			}
   492			x := first[si]
   493			if x == xx {
   494				return false // Illegal starter byte.
   495			}
   496			size := int(x & 7)
   497			if i+size > n {
   498				return false // Short or invalid.
   499			}
   500			accept := acceptRanges[x>>4]
   501			if c := s[i+1]; c < accept.lo || accept.hi < c {
   502				return false
   503			} else if size == 2 {
   504			} else if c := s[i+2]; c < locb || hicb < c {
   505				return false
   506			} else if size == 3 {
   507			} else if c := s[i+3]; c < locb || hicb < c {
   508				return false
   509			}
   510			i += size
   511		}
   512		return true
   513	}
   514	
   515	// ValidRune reports whether r can be legally encoded as UTF-8.
   516	// Code points that are out of range or a surrogate half are illegal.
   517	func ValidRune(r rune) bool {
   518		switch {
   519		case 0 <= r && r < surrogateMin:
   520			return true
   521		case surrogateMax < r && r <= MaxRune:
   522			return true
   523		}
   524		return false
   525	}
   526	

View as plain text