Run Format

Source file src/pkg/bytes/bytes.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bytes implements functions for the manipulation of byte slices.
     6	// It is analogous to the facilities of the strings package.
     7	package bytes
     8	
     9	import (
    10		"unicode"
    11		"unicode/utf8"
    12	)
    13	
    14	func equalPortable(a, b []byte) bool {
    15		if len(a) != len(b) {
    16			return false
    17		}
    18		for i, c := range a {
    19			if c != b[i] {
    20				return false
    21			}
    22		}
    23		return true
    24	}
    25	
    26	// explode splits s into a slice of UTF-8 sequences, one per Unicode character (still slices of bytes),
    27	// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
    28	func explode(s []byte, n int) [][]byte {
    29		if n <= 0 {
    30			n = len(s)
    31		}
    32		a := make([][]byte, n)
    33		var size int
    34		na := 0
    35		for len(s) > 0 {
    36			if na+1 >= n {
    37				a[na] = s
    38				na++
    39				break
    40			}
    41			_, size = utf8.DecodeRune(s)
    42			a[na] = s[0:size]
    43			s = s[size:]
    44			na++
    45		}
    46		return a[0:na]
    47	}
    48	
    49	// Count counts the number of non-overlapping instances of sep in s.
    50	func Count(s, sep []byte) int {
    51		n := len(sep)
    52		if n == 0 {
    53			return utf8.RuneCount(s) + 1
    54		}
    55		if n > len(s) {
    56			return 0
    57		}
    58		count := 0
    59		c := sep[0]
    60		i := 0
    61		t := s[:len(s)-n+1]
    62		for i < len(t) {
    63			if t[i] != c {
    64				o := IndexByte(t[i:], c)
    65				if o < 0 {
    66					break
    67				}
    68				i += o
    69			}
    70			if n == 1 || Equal(s[i:i+n], sep) {
    71				count++
    72				i += n
    73				continue
    74			}
    75			i++
    76		}
    77		return count
    78	}
    79	
    80	// Contains reports whether subslice is within b.
    81	func Contains(b, subslice []byte) bool {
    82		return Index(b, subslice) != -1
    83	}
    84	
    85	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
    86	func Index(s, sep []byte) int {
    87		n := len(sep)
    88		if n == 0 {
    89			return 0
    90		}
    91		if n > len(s) {
    92			return -1
    93		}
    94		c := sep[0]
    95		if n == 1 {
    96			return IndexByte(s, c)
    97		}
    98		i := 0
    99		t := s[:len(s)-n+1]
   100		for i < len(t) {
   101			if t[i] != c {
   102				o := IndexByte(t[i:], c)
   103				if o < 0 {
   104					break
   105				}
   106				i += o
   107			}
   108			if Equal(s[i:i+n], sep) {
   109				return i
   110			}
   111			i++
   112		}
   113		return -1
   114	}
   115	
   116	func indexBytePortable(s []byte, c byte) int {
   117		for i, b := range s {
   118			if b == c {
   119				return i
   120			}
   121		}
   122		return -1
   123	}
   124	
   125	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   126	func LastIndex(s, sep []byte) int {
   127		n := len(sep)
   128		if n == 0 {
   129			return len(s)
   130		}
   131		c := sep[0]
   132		for i := len(s) - n; i >= 0; i-- {
   133			if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
   134				return i
   135			}
   136		}
   137		return -1
   138	}
   139	
   140	// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
   141	// It returns the byte index of the first occurrence in s of the given rune.
   142	// It returns -1 if rune is not present in s.
   143	func IndexRune(s []byte, r rune) int {
   144		for i := 0; i < len(s); {
   145			r1, size := utf8.DecodeRune(s[i:])
   146			if r == r1 {
   147				return i
   148			}
   149			i += size
   150		}
   151		return -1
   152	}
   153	
   154	// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
   155	// It returns the byte index of the first occurrence in s of any of the Unicode
   156	// code points in chars.  It returns -1 if chars is empty or if there is no code
   157	// point in common.
   158	func IndexAny(s []byte, chars string) int {
   159		if len(chars) > 0 {
   160			var r rune
   161			var width int
   162			for i := 0; i < len(s); i += width {
   163				r = rune(s[i])
   164				if r < utf8.RuneSelf {
   165					width = 1
   166				} else {
   167					r, width = utf8.DecodeRune(s[i:])
   168				}
   169				for _, ch := range chars {
   170					if r == ch {
   171						return i
   172					}
   173				}
   174			}
   175		}
   176		return -1
   177	}
   178	
   179	// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
   180	// points.  It returns the byte index of the last occurrence in s of any of
   181	// the Unicode code points in chars.  It returns -1 if chars is empty or if
   182	// there is no code point in common.
   183	func LastIndexAny(s []byte, chars string) int {
   184		if len(chars) > 0 {
   185			for i := len(s); i > 0; {
   186				r, size := utf8.DecodeLastRune(s[0:i])
   187				i -= size
   188				for _, ch := range chars {
   189					if r == ch {
   190						return i
   191					}
   192				}
   193			}
   194		}
   195		return -1
   196	}
   197	
   198	// Generic split: splits after each instance of sep,
   199	// including sepSave bytes of sep in the subslices.
   200	func genSplit(s, sep []byte, sepSave, n int) [][]byte {
   201		if n == 0 {
   202			return nil
   203		}
   204		if len(sep) == 0 {
   205			return explode(s, n)
   206		}
   207		if n < 0 {
   208			n = Count(s, sep) + 1
   209		}
   210		c := sep[0]
   211		start := 0
   212		a := make([][]byte, n)
   213		na := 0
   214		for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
   215			if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
   216				a[na] = s[start : i+sepSave]
   217				na++
   218				start = i + len(sep)
   219				i += len(sep) - 1
   220			}
   221		}
   222		a[na] = s[start:]
   223		return a[0 : na+1]
   224	}
   225	
   226	// SplitN slices s into subslices separated by sep and returns a slice of
   227	// the subslices between those separators.
   228	// If sep is empty, SplitN splits after each UTF-8 sequence.
   229	// The count determines the number of subslices to return:
   230	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   231	//   n == 0: the result is nil (zero subslices)
   232	//   n < 0: all subslices
   233	func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
   234	
   235	// SplitAfterN slices s into subslices after each instance of sep and
   236	// returns a slice of those subslices.
   237	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   238	// The count determines the number of subslices to return:
   239	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   240	//   n == 0: the result is nil (zero subslices)
   241	//   n < 0: all subslices
   242	func SplitAfterN(s, sep []byte, n int) [][]byte {
   243		return genSplit(s, sep, len(sep), n)
   244	}
   245	
   246	// Split slices s into all subslices separated by sep and returns a slice of
   247	// the subslices between those separators.
   248	// If sep is empty, Split splits after each UTF-8 sequence.
   249	// It is equivalent to SplitN with a count of -1.
   250	func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
   251	
   252	// SplitAfter slices s into all subslices after each instance of sep and
   253	// returns a slice of those subslices.
   254	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   255	// It is equivalent to SplitAfterN with a count of -1.
   256	func SplitAfter(s, sep []byte) [][]byte {
   257		return genSplit(s, sep, len(sep), -1)
   258	}
   259	
   260	// Fields splits the slice s around each instance of one or more consecutive white space
   261	// characters, returning a slice of subslices of s or an empty list if s contains only white space.
   262	func Fields(s []byte) [][]byte {
   263		return FieldsFunc(s, unicode.IsSpace)
   264	}
   265	
   266	// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   267	// It splits the slice s at each run of code points c satisfying f(c) and
   268	// returns a slice of subslices of s.  If no code points in s satisfy f(c), an
   269	// empty slice is returned.
   270	func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
   271		n := 0
   272		inField := false
   273		for i := 0; i < len(s); {
   274			r, size := utf8.DecodeRune(s[i:])
   275			wasInField := inField
   276			inField = !f(r)
   277			if inField && !wasInField {
   278				n++
   279			}
   280			i += size
   281		}
   282	
   283		a := make([][]byte, n)
   284		na := 0
   285		fieldStart := -1
   286		for i := 0; i <= len(s) && na < n; {
   287			r, size := utf8.DecodeRune(s[i:])
   288			if fieldStart < 0 && size > 0 && !f(r) {
   289				fieldStart = i
   290				i += size
   291				continue
   292			}
   293			if fieldStart >= 0 && (size == 0 || f(r)) {
   294				a[na] = s[fieldStart:i]
   295				na++
   296				fieldStart = -1
   297			}
   298			if size == 0 {
   299				break
   300			}
   301			i += size
   302		}
   303		return a[0:na]
   304	}
   305	
   306	// Join concatenates the elements of s to create a new byte slice. The separator
   307	// sep is placed between elements in the resulting slice.
   308	func Join(s [][]byte, sep []byte) []byte {
   309		if len(s) == 0 {
   310			return []byte{}
   311		}
   312		if len(s) == 1 {
   313			// Just return a copy.
   314			return append([]byte(nil), s[0]...)
   315		}
   316		n := len(sep) * (len(s) - 1)
   317		for _, v := range s {
   318			n += len(v)
   319		}
   320	
   321		b := make([]byte, n)
   322		bp := copy(b, s[0])
   323		for _, v := range s[1:] {
   324			bp += copy(b[bp:], sep)
   325			bp += copy(b[bp:], v)
   326		}
   327		return b
   328	}
   329	
   330	// HasPrefix tests whether the byte slice s begins with prefix.
   331	func HasPrefix(s, prefix []byte) bool {
   332		return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
   333	}
   334	
   335	// HasSuffix tests whether the byte slice s ends with suffix.
   336	func HasSuffix(s, suffix []byte) bool {
   337		return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
   338	}
   339	
   340	// Map returns a copy of the byte slice s with all its characters modified
   341	// according to the mapping function. If mapping returns a negative value, the character is
   342	// dropped from the string with no replacement.  The characters in s and the
   343	// output are interpreted as UTF-8-encoded Unicode code points.
   344	func Map(mapping func(r rune) rune, s []byte) []byte {
   345		// In the worst case, the slice can grow when mapped, making
   346		// things unpleasant.  But it's so rare we barge in assuming it's
   347		// fine.  It could also shrink but that falls out naturally.
   348		maxbytes := len(s) // length of b
   349		nbytes := 0        // number of bytes encoded in b
   350		b := make([]byte, maxbytes)
   351		for i := 0; i < len(s); {
   352			wid := 1
   353			r := rune(s[i])
   354			if r >= utf8.RuneSelf {
   355				r, wid = utf8.DecodeRune(s[i:])
   356			}
   357			r = mapping(r)
   358			if r >= 0 {
   359				if nbytes+utf8.RuneLen(r) > maxbytes {
   360					// Grow the buffer.
   361					maxbytes = maxbytes*2 + utf8.UTFMax
   362					nb := make([]byte, maxbytes)
   363					copy(nb, b[0:nbytes])
   364					b = nb
   365				}
   366				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
   367			}
   368			i += wid
   369		}
   370		return b[0:nbytes]
   371	}
   372	
   373	// Repeat returns a new byte slice consisting of count copies of b.
   374	func Repeat(b []byte, count int) []byte {
   375		nb := make([]byte, len(b)*count)
   376		bp := 0
   377		for i := 0; i < count; i++ {
   378			bp += copy(nb[bp:], b)
   379		}
   380		return nb
   381	}
   382	
   383	// ToUpper returns a copy of the byte slice s with all Unicode letters mapped to their upper case.
   384	func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
   385	
   386	// ToLower returns a copy of the byte slice s with all Unicode letters mapped to their lower case.
   387	func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
   388	
   389	// ToTitle returns a copy of the byte slice s with all Unicode letters mapped to their title case.
   390	func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
   391	
   392	// ToUpperSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
   393	// upper case, giving priority to the special casing rules.
   394	func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte {
   395		return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
   396	}
   397	
   398	// ToLowerSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
   399	// lower case, giving priority to the special casing rules.
   400	func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte {
   401		return Map(func(r rune) rune { return _case.ToLower(r) }, s)
   402	}
   403	
   404	// ToTitleSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
   405	// title case, giving priority to the special casing rules.
   406	func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte {
   407		return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
   408	}
   409	
   410	// isSeparator reports whether the rune could mark a word boundary.
   411	// TODO: update when package unicode captures more of the properties.
   412	func isSeparator(r rune) bool {
   413		// ASCII alphanumerics and underscore are not separators
   414		if r <= 0x7F {
   415			switch {
   416			case '0' <= r && r <= '9':
   417				return false
   418			case 'a' <= r && r <= 'z':
   419				return false
   420			case 'A' <= r && r <= 'Z':
   421				return false
   422			case r == '_':
   423				return false
   424			}
   425			return true
   426		}
   427		// Letters and digits are not separators
   428		if unicode.IsLetter(r) || unicode.IsDigit(r) {
   429			return false
   430		}
   431		// Otherwise, all we can do for now is treat spaces as separators.
   432		return unicode.IsSpace(r)
   433	}
   434	
   435	// Title returns a copy of s with all Unicode letters that begin words
   436	// mapped to their title case.
   437	//
   438	// BUG: The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   439	func Title(s []byte) []byte {
   440		// Use a closure here to remember state.
   441		// Hackish but effective. Depends on Map scanning in order and calling
   442		// the closure once per rune.
   443		prev := ' '
   444		return Map(
   445			func(r rune) rune {
   446				if isSeparator(prev) {
   447					prev = r
   448					return unicode.ToTitle(r)
   449				}
   450				prev = r
   451				return r
   452			},
   453			s)
   454	}
   455	
   456	// TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
   457	// Unicode code points c that satisfy f(c).
   458	func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
   459		i := indexFunc(s, f, false)
   460		if i == -1 {
   461			return nil
   462		}
   463		return s[i:]
   464	}
   465	
   466	// TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
   467	// encoded Unicode code points c that satisfy f(c).
   468	func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
   469		i := lastIndexFunc(s, f, false)
   470		if i >= 0 && s[i] >= utf8.RuneSelf {
   471			_, wid := utf8.DecodeRune(s[i:])
   472			i += wid
   473		} else {
   474			i++
   475		}
   476		return s[0:i]
   477	}
   478	
   479	// TrimFunc returns a subslice of s by slicing off all leading and trailing
   480	// UTF-8-encoded Unicode code points c that satisfy f(c).
   481	func TrimFunc(s []byte, f func(r rune) bool) []byte {
   482		return TrimRightFunc(TrimLeftFunc(s, f), f)
   483	}
   484	
   485	// TrimPrefix returns s without the provided leading prefix string.
   486	// If s doesn't start with prefix, s is returned unchanged.
   487	func TrimPrefix(s, prefix []byte) []byte {
   488		if HasPrefix(s, prefix) {
   489			return s[len(prefix):]
   490		}
   491		return s
   492	}
   493	
   494	// TrimSuffix returns s without the provided trailing suffix string.
   495	// If s doesn't end with suffix, s is returned unchanged.
   496	func TrimSuffix(s, suffix []byte) []byte {
   497		if HasSuffix(s, suffix) {
   498			return s[:len(s)-len(suffix)]
   499		}
   500		return s
   501	}
   502	
   503	// IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   504	// It returns the byte index in s of the first Unicode
   505	// code point satisfying f(c), or -1 if none do.
   506	func IndexFunc(s []byte, f func(r rune) bool) int {
   507		return indexFunc(s, f, true)
   508	}
   509	
   510	// LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   511	// It returns the byte index in s of the last Unicode
   512	// code point satisfying f(c), or -1 if none do.
   513	func LastIndexFunc(s []byte, f func(r rune) bool) int {
   514		return lastIndexFunc(s, f, true)
   515	}
   516	
   517	// indexFunc is the same as IndexFunc except that if
   518	// truth==false, the sense of the predicate function is
   519	// inverted.
   520	func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
   521		start := 0
   522		for start < len(s) {
   523			wid := 1
   524			r := rune(s[start])
   525			if r >= utf8.RuneSelf {
   526				r, wid = utf8.DecodeRune(s[start:])
   527			}
   528			if f(r) == truth {
   529				return start
   530			}
   531			start += wid
   532		}
   533		return -1
   534	}
   535	
   536	// lastIndexFunc is the same as LastIndexFunc except that if
   537	// truth==false, the sense of the predicate function is
   538	// inverted.
   539	func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
   540		for i := len(s); i > 0; {
   541			r, size := rune(s[i-1]), 1
   542			if r >= utf8.RuneSelf {
   543				r, size = utf8.DecodeLastRune(s[0:i])
   544			}
   545			i -= size
   546			if f(r) == truth {
   547				return i
   548			}
   549		}
   550		return -1
   551	}
   552	
   553	func makeCutsetFunc(cutset string) func(r rune) bool {
   554		return func(r rune) bool {
   555			for _, c := range cutset {
   556				if c == r {
   557					return true
   558				}
   559			}
   560			return false
   561		}
   562	}
   563	
   564	// Trim returns a subslice of s by slicing off all leading and
   565	// trailing UTF-8-encoded Unicode code points contained in cutset.
   566	func Trim(s []byte, cutset string) []byte {
   567		return TrimFunc(s, makeCutsetFunc(cutset))
   568	}
   569	
   570	// TrimLeft returns a subslice of s by slicing off all leading
   571	// UTF-8-encoded Unicode code points contained in cutset.
   572	func TrimLeft(s []byte, cutset string) []byte {
   573		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   574	}
   575	
   576	// TrimRight returns a subslice of s by slicing off all trailing
   577	// UTF-8-encoded Unicode code points that are contained in cutset.
   578	func TrimRight(s []byte, cutset string) []byte {
   579		return TrimRightFunc(s, makeCutsetFunc(cutset))
   580	}
   581	
   582	// TrimSpace returns a subslice of s by slicing off all leading and
   583	// trailing white space, as defined by Unicode.
   584	func TrimSpace(s []byte) []byte {
   585		return TrimFunc(s, unicode.IsSpace)
   586	}
   587	
   588	// Runes returns a slice of runes (Unicode code points) equivalent to s.
   589	func Runes(s []byte) []rune {
   590		t := make([]rune, utf8.RuneCount(s))
   591		i := 0
   592		for len(s) > 0 {
   593			r, l := utf8.DecodeRune(s)
   594			t[i] = r
   595			i++
   596			s = s[l:]
   597		}
   598		return t
   599	}
   600	
   601	// Replace returns a copy of the slice s with the first n
   602	// non-overlapping instances of old replaced by new.
   603	// If n < 0, there is no limit on the number of replacements.
   604	func Replace(s, old, new []byte, n int) []byte {
   605		m := 0
   606		if n != 0 {
   607			// Compute number of replacements.
   608			m = Count(s, old)
   609		}
   610		if m == 0 {
   611			// Just return a copy.
   612			return append([]byte(nil), s...)
   613		}
   614		if n < 0 || m < n {
   615			n = m
   616		}
   617	
   618		// Apply replacements to buffer.
   619		t := make([]byte, len(s)+n*(len(new)-len(old)))
   620		w := 0
   621		start := 0
   622		for i := 0; i < n; i++ {
   623			j := start
   624			if len(old) == 0 {
   625				if i > 0 {
   626					_, wid := utf8.DecodeRune(s[start:])
   627					j += wid
   628				}
   629			} else {
   630				j += Index(s[start:], old)
   631			}
   632			w += copy(t[w:], s[start:j])
   633			w += copy(t[w:], new)
   634			start = j + len(old)
   635		}
   636		w += copy(t[w:], s[start:])
   637		return t[0:w]
   638	}
   639	
   640	// EqualFold reports whether s and t, interpreted as UTF-8 strings,
   641	// are equal under Unicode case-folding.
   642	func EqualFold(s, t []byte) bool {
   643		for len(s) != 0 && len(t) != 0 {
   644			// Extract first rune from each.
   645			var sr, tr rune
   646			if s[0] < utf8.RuneSelf {
   647				sr, s = rune(s[0]), s[1:]
   648			} else {
   649				r, size := utf8.DecodeRune(s)
   650				sr, s = r, s[size:]
   651			}
   652			if t[0] < utf8.RuneSelf {
   653				tr, t = rune(t[0]), t[1:]
   654			} else {
   655				r, size := utf8.DecodeRune(t)
   656				tr, t = r, t[size:]
   657			}
   658	
   659			// If they match, keep going; if not, return false.
   660	
   661			// Easy case.
   662			if tr == sr {
   663				continue
   664			}
   665	
   666			// Make sr < tr to simplify what follows.
   667			if tr < sr {
   668				tr, sr = sr, tr
   669			}
   670			// Fast check for ASCII.
   671			if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
   672				// ASCII, and sr is upper case.  tr must be lower case.
   673				if tr == sr+'a'-'A' {
   674					continue
   675				}
   676				return false
   677			}
   678	
   679			// General case.  SimpleFold(x) returns the next equivalent rune > x
   680			// or wraps around to smaller values.
   681			r := unicode.SimpleFold(sr)
   682			for r != sr && r < tr {
   683				r = unicode.SimpleFold(r)
   684			}
   685			if r == tr {
   686				continue
   687			}
   688			return false
   689		}
   690	
   691		// One string is empty.  Are both?
   692		return len(s) == len(t)
   693	}

View as plain text