The Go Programming Language

Source file src/pkg/bytes/bytes.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bytes implements functions for the manipulation of byte slices.
     6	// It is analogous to the facilities of the strings package.
     7	package bytes
     8	
     9	import (
    10		"unicode"
    11		"utf8"
    12	)
    13	
    14	// Compare returns an integer comparing the two byte arrays lexicographically.
    15	// The result will be 0 if a==b, -1 if a < b, and +1 if a > b
    16	func Compare(a, b []byte) int {
    17		m := len(a)
    18		if m > len(b) {
    19			m = len(b)
    20		}
    21		for i, ac := range a[0:m] {
    22			bc := b[i]
    23			switch {
    24			case ac > bc:
    25				return 1
    26			case ac < bc:
    27				return -1
    28			}
    29		}
    30		switch {
    31		case len(a) < len(b):
    32			return -1
    33		case len(a) > len(b):
    34			return 1
    35		}
    36		return 0
    37	}
    38	
    39	// Equal returns a boolean reporting whether a == b.
    40	func Equal(a, b []byte) bool {
    41		if len(a) != len(b) {
    42			return false
    43		}
    44		for i, c := range a {
    45			if c != b[i] {
    46				return false
    47			}
    48		}
    49		return true
    50	}
    51	
    52	// explode splits s into an array of UTF-8 sequences, one per Unicode character (still arrays of bytes),
    53	// up to a maximum of n byte arrays. Invalid UTF-8 sequences are chopped into individual bytes.
    54	func explode(s []byte, n int) [][]byte {
    55		if n <= 0 {
    56			n = len(s)
    57		}
    58		a := make([][]byte, n)
    59		var size int
    60		na := 0
    61		for len(s) > 0 {
    62			if na+1 >= n {
    63				a[na] = s
    64				na++
    65				break
    66			}
    67			_, size = utf8.DecodeRune(s)
    68			a[na] = s[0:size]
    69			s = s[size:]
    70			na++
    71		}
    72		return a[0:na]
    73	}
    74	
    75	// Count counts the number of non-overlapping instances of sep in s.
    76	func Count(s, sep []byte) int {
    77		if len(sep) == 0 {
    78			return utf8.RuneCount(s) + 1
    79		}
    80		c := sep[0]
    81		n := 0
    82		for i := 0; i+len(sep) <= len(s); i++ {
    83			if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
    84				n++
    85				i += len(sep) - 1
    86			}
    87		}
    88		return n
    89	}
    90	
    91	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
    92	func Index(s, sep []byte) int {
    93		n := len(sep)
    94		if n == 0 {
    95			return 0
    96		}
    97		c := sep[0]
    98		for i := 0; i+n <= len(s); i++ {
    99			if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
   100				return i
   101			}
   102		}
   103		return -1
   104	}
   105	
   106	func indexBytePortable(s []byte, c byte) int {
   107		for i, b := range s {
   108			if b == c {
   109				return i
   110			}
   111		}
   112		return -1
   113	}
   114	
   115	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   116	func LastIndex(s, sep []byte) int {
   117		n := len(sep)
   118		if n == 0 {
   119			return len(s)
   120		}
   121		c := sep[0]
   122		for i := len(s) - n; i >= 0; i-- {
   123			if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
   124				return i
   125			}
   126		}
   127		return -1
   128	}
   129	
   130	// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
   131	// It returns the byte index of the first occurrence in s of the given rune.
   132	// It returns -1 if rune is not present in s.
   133	func IndexRune(s []byte, rune int) int {
   134		for i := 0; i < len(s); {
   135			r, size := utf8.DecodeRune(s[i:])
   136			if r == rune {
   137				return i
   138			}
   139			i += size
   140		}
   141		return -1
   142	}
   143	
   144	// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
   145	// It returns the byte index of the first occurrence in s of any of the Unicode
   146	// code points in chars.  It returns -1 if chars is empty or if there is no code
   147	// point in common.
   148	func IndexAny(s []byte, chars string) int {
   149		if len(chars) > 0 {
   150			var rune, width int
   151			for i := 0; i < len(s); i += width {
   152				rune = int(s[i])
   153				if rune < utf8.RuneSelf {
   154					width = 1
   155				} else {
   156					rune, width = utf8.DecodeRune(s[i:])
   157				}
   158				for _, r := range chars {
   159					if rune == r {
   160						return i
   161					}
   162				}
   163			}
   164		}
   165		return -1
   166	}
   167	
   168	// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
   169	// points.  It returns the byte index of the last occurrence in s of any of
   170	// the Unicode code points in chars.  It returns -1 if chars is empty or if
   171	// there is no code point in common.
   172	func LastIndexAny(s []byte, chars string) int {
   173		if len(chars) > 0 {
   174			for i := len(s); i > 0; {
   175				rune, size := utf8.DecodeLastRune(s[0:i])
   176				i -= size
   177				for _, m := range chars {
   178					if rune == m {
   179						return i
   180					}
   181				}
   182			}
   183		}
   184		return -1
   185	}
   186	
   187	// Generic split: splits after each instance of sep,
   188	// including sepSave bytes of sep in the subarrays.
   189	func genSplit(s, sep []byte, sepSave, n int) [][]byte {
   190		if n == 0 {
   191			return nil
   192		}
   193		if len(sep) == 0 {
   194			return explode(s, n)
   195		}
   196		if n < 0 {
   197			n = Count(s, sep) + 1
   198		}
   199		c := sep[0]
   200		start := 0
   201		a := make([][]byte, n)
   202		na := 0
   203		for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
   204			if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
   205				a[na] = s[start : i+sepSave]
   206				na++
   207				start = i + len(sep)
   208				i += len(sep) - 1
   209			}
   210		}
   211		a[na] = s[start:]
   212		return a[0 : na+1]
   213	}
   214	
   215	// SplitN slices s into subslices separated by sep and returns a slice of
   216	// the subslices between those separators.
   217	// If sep is empty, SplitN splits after each UTF-8 sequence.
   218	// The count determines the number of subslices to return:
   219	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   220	//   n == 0: the result is nil (zero subslices)
   221	//   n < 0: all subslices
   222	func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
   223	
   224	// SplitAfterN slices s into subslices after each instance of sep and
   225	// returns a slice of those subslices.
   226	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   227	// The count determines the number of subslices to return:
   228	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   229	//   n == 0: the result is nil (zero subslices)
   230	//   n < 0: all subslices
   231	func SplitAfterN(s, sep []byte, n int) [][]byte {
   232		return genSplit(s, sep, len(sep), n)
   233	}
   234	
   235	// Split slices s into all subslices separated by sep and returns a slice of
   236	// the subslices between those separators.
   237	// If sep is empty, Split splits after each UTF-8 sequence.
   238	// It is equivalent to SplitN with a count of -1.
   239	func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
   240	
   241	// SplitAfter slices s into all subslices after each instance of sep and
   242	// returns a slice of those subslices.
   243	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   244	// It is equivalent to SplitAfterN with a count of -1.
   245	func SplitAfter(s, sep []byte) [][]byte {
   246		return genSplit(s, sep, len(sep), -1)
   247	}
   248	
   249	// Fields splits the array s around each instance of one or more consecutive white space
   250	// characters, returning a slice of subarrays of s or an empty list if s contains only white space.
   251	func Fields(s []byte) [][]byte {
   252		return FieldsFunc(s, unicode.IsSpace)
   253	}
   254	
   255	// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   256	// It splits the array s at each run of code points c satisfying f(c) and
   257	// returns a slice of subarrays of s.  If no code points in s satisfy f(c), an
   258	// empty slice is returned.
   259	func FieldsFunc(s []byte, f func(int) bool) [][]byte {
   260		n := 0
   261		inField := false
   262		for i := 0; i < len(s); {
   263			rune, size := utf8.DecodeRune(s[i:])
   264			wasInField := inField
   265			inField = !f(rune)
   266			if inField && !wasInField {
   267				n++
   268			}
   269			i += size
   270		}
   271	
   272		a := make([][]byte, n)
   273		na := 0
   274		fieldStart := -1
   275		for i := 0; i <= len(s) && na < n; {
   276			rune, size := utf8.DecodeRune(s[i:])
   277			if fieldStart < 0 && size > 0 && !f(rune) {
   278				fieldStart = i
   279				i += size
   280				continue
   281			}
   282			if fieldStart >= 0 && (size == 0 || f(rune)) {
   283				a[na] = s[fieldStart:i]
   284				na++
   285				fieldStart = -1
   286			}
   287			if size == 0 {
   288				break
   289			}
   290			i += size
   291		}
   292		return a[0:na]
   293	}
   294	
   295	// Join concatenates the elements of a to create a single byte array.   The separator
   296	// sep is placed between elements in the resulting array.
   297	func Join(a [][]byte, sep []byte) []byte {
   298		if len(a) == 0 {
   299			return []byte{}
   300		}
   301		if len(a) == 1 {
   302			return a[0]
   303		}
   304		n := len(sep) * (len(a) - 1)
   305		for i := 0; i < len(a); i++ {
   306			n += len(a[i])
   307		}
   308	
   309		b := make([]byte, n)
   310		bp := copy(b, a[0])
   311		for _, s := range a[1:] {
   312			bp += copy(b[bp:], sep)
   313			bp += copy(b[bp:], s)
   314		}
   315		return b
   316	}
   317	
   318	// HasPrefix tests whether the byte array s begins with prefix.
   319	func HasPrefix(s, prefix []byte) bool {
   320		return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
   321	}
   322	
   323	// HasSuffix tests whether the byte array s ends with suffix.
   324	func HasSuffix(s, suffix []byte) bool {
   325		return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
   326	}
   327	
   328	// Map returns a copy of the byte array s with all its characters modified
   329	// according to the mapping function. If mapping returns a negative value, the character is
   330	// dropped from the string with no replacement.  The characters in s and the
   331	// output are interpreted as UTF-8-encoded Unicode code points.
   332	func Map(mapping func(rune int) int, s []byte) []byte {
   333		// In the worst case, the array can grow when mapped, making
   334		// things unpleasant.  But it's so rare we barge in assuming it's
   335		// fine.  It could also shrink but that falls out naturally.
   336		maxbytes := len(s) // length of b
   337		nbytes := 0        // number of bytes encoded in b
   338		b := make([]byte, maxbytes)
   339		for i := 0; i < len(s); {
   340			wid := 1
   341			rune := int(s[i])
   342			if rune >= utf8.RuneSelf {
   343				rune, wid = utf8.DecodeRune(s[i:])
   344			}
   345			rune = mapping(rune)
   346			if rune >= 0 {
   347				if nbytes+utf8.RuneLen(rune) > maxbytes {
   348					// Grow the buffer.
   349					maxbytes = maxbytes*2 + utf8.UTFMax
   350					nb := make([]byte, maxbytes)
   351					copy(nb, b[0:nbytes])
   352					b = nb
   353				}
   354				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], rune)
   355			}
   356			i += wid
   357		}
   358		return b[0:nbytes]
   359	}
   360	
   361	// Repeat returns a new byte slice consisting of count copies of b.
   362	func Repeat(b []byte, count int) []byte {
   363		nb := make([]byte, len(b)*count)
   364		bp := 0
   365		for i := 0; i < count; i++ {
   366			for j := 0; j < len(b); j++ {
   367				nb[bp] = b[j]
   368				bp++
   369			}
   370		}
   371		return nb
   372	}
   373	
   374	// ToUpper returns a copy of the byte array s with all Unicode letters mapped to their upper case.
   375	func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
   376	
   377	// ToUpper returns a copy of the byte array s with all Unicode letters mapped to their lower case.
   378	func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
   379	
   380	// ToTitle returns a copy of the byte array s with all Unicode letters mapped to their title case.
   381	func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
   382	
   383	// ToUpperSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   384	// upper case, giving priority to the special casing rules.
   385	func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte {
   386		return Map(func(r int) int { return _case.ToUpper(r) }, s)
   387	}
   388	
   389	// ToLowerSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   390	// lower case, giving priority to the special casing rules.
   391	func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte {
   392		return Map(func(r int) int { return _case.ToLower(r) }, s)
   393	}
   394	
   395	// ToTitleSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   396	// title case, giving priority to the special casing rules.
   397	func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte {
   398		return Map(func(r int) int { return _case.ToTitle(r) }, s)
   399	}
   400	
   401	// isSeparator reports whether the rune could mark a word boundary.
   402	// TODO: update when package unicode captures more of the properties.
   403	func isSeparator(rune int) bool {
   404		// ASCII alphanumerics and underscore are not separators
   405		if rune <= 0x7F {
   406			switch {
   407			case '0' <= rune && rune <= '9':
   408				return false
   409			case 'a' <= rune && rune <= 'z':
   410				return false
   411			case 'A' <= rune && rune <= 'Z':
   412				return false
   413			case rune == '_':
   414				return false
   415			}
   416			return true
   417		}
   418		// Letters and digits are not separators
   419		if unicode.IsLetter(rune) || unicode.IsDigit(rune) {
   420			return false
   421		}
   422		// Otherwise, all we can do for now is treat spaces as separators.
   423		return unicode.IsSpace(rune)
   424	}
   425	
   426	// BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   427	
   428	// Title returns a copy of s with all Unicode letters that begin words
   429	// mapped to their title case.
   430	func Title(s []byte) []byte {
   431		// Use a closure here to remember state.
   432		// Hackish but effective. Depends on Map scanning in order and calling
   433		// the closure once per rune.
   434		prev := ' '
   435		return Map(
   436			func(r int) int {
   437				if isSeparator(prev) {
   438					prev = r
   439					return unicode.ToTitle(r)
   440				}
   441				prev = r
   442				return r
   443			},
   444			s)
   445	}
   446	
   447	// TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
   448	// Unicode code points c that satisfy f(c).
   449	func TrimLeftFunc(s []byte, f func(r int) bool) []byte {
   450		i := indexFunc(s, f, false)
   451		if i == -1 {
   452			return nil
   453		}
   454		return s[i:]
   455	}
   456	
   457	// TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
   458	// encoded Unicode code points c that satisfy f(c).
   459	func TrimRightFunc(s []byte, f func(r int) bool) []byte {
   460		i := lastIndexFunc(s, f, false)
   461		if i >= 0 && s[i] >= utf8.RuneSelf {
   462			_, wid := utf8.DecodeRune(s[i:])
   463			i += wid
   464		} else {
   465			i++
   466		}
   467		return s[0:i]
   468	}
   469	
   470	// TrimFunc returns a subslice of s by slicing off all leading and trailing
   471	// UTF-8-encoded Unicode code points c that satisfy f(c).
   472	func TrimFunc(s []byte, f func(r int) bool) []byte {
   473		return TrimRightFunc(TrimLeftFunc(s, f), f)
   474	}
   475	
   476	// IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   477	// It returns the byte index in s of the first Unicode
   478	// code point satisfying f(c), or -1 if none do.
   479	func IndexFunc(s []byte, f func(r int) bool) int {
   480		return indexFunc(s, f, true)
   481	}
   482	
   483	// LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   484	// It returns the byte index in s of the last Unicode
   485	// code point satisfying f(c), or -1 if none do.
   486	func LastIndexFunc(s []byte, f func(r int) bool) int {
   487		return lastIndexFunc(s, f, true)
   488	}
   489	
   490	// indexFunc is the same as IndexFunc except that if
   491	// truth==false, the sense of the predicate function is
   492	// inverted.
   493	func indexFunc(s []byte, f func(r int) bool, truth bool) int {
   494		start := 0
   495		for start < len(s) {
   496			wid := 1
   497			rune := int(s[start])
   498			if rune >= utf8.RuneSelf {
   499				rune, wid = utf8.DecodeRune(s[start:])
   500			}
   501			if f(rune) == truth {
   502				return start
   503			}
   504			start += wid
   505		}
   506		return -1
   507	}
   508	
   509	// lastIndexFunc is the same as LastIndexFunc except that if
   510	// truth==false, the sense of the predicate function is
   511	// inverted.
   512	func lastIndexFunc(s []byte, f func(r int) bool, truth bool) int {
   513		for i := len(s); i > 0; {
   514			rune, size := utf8.DecodeLastRune(s[0:i])
   515			i -= size
   516			if f(rune) == truth {
   517				return i
   518			}
   519		}
   520		return -1
   521	}
   522	
   523	func makeCutsetFunc(cutset string) func(rune int) bool {
   524		return func(rune int) bool {
   525			for _, c := range cutset {
   526				if c == rune {
   527					return true
   528				}
   529			}
   530			return false
   531		}
   532	}
   533	
   534	// Trim returns a subslice of s by slicing off all leading and
   535	// trailing UTF-8-encoded Unicode code points contained in cutset.
   536	func Trim(s []byte, cutset string) []byte {
   537		return TrimFunc(s, makeCutsetFunc(cutset))
   538	}
   539	
   540	// TrimLeft returns a subslice of s by slicing off all leading
   541	// UTF-8-encoded Unicode code points contained in cutset.
   542	func TrimLeft(s []byte, cutset string) []byte {
   543		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   544	}
   545	
   546	// TrimRight returns a subslice of s by slicing off all trailing
   547	// UTF-8-encoded Unicode code points that are contained in cutset.
   548	func TrimRight(s []byte, cutset string) []byte {
   549		return TrimRightFunc(s, makeCutsetFunc(cutset))
   550	}
   551	
   552	// TrimSpace returns a subslice of s by slicing off all leading and
   553	// trailing white space, as defined by Unicode.
   554	func TrimSpace(s []byte) []byte {
   555		return TrimFunc(s, unicode.IsSpace)
   556	}
   557	
   558	// Runes returns a slice of runes (Unicode code points) equivalent to s.
   559	func Runes(s []byte) []int {
   560		t := make([]int, utf8.RuneCount(s))
   561		i := 0
   562		for len(s) > 0 {
   563			r, l := utf8.DecodeRune(s)
   564			t[i] = r
   565			i++
   566			s = s[l:]
   567		}
   568		return t
   569	}
   570	
   571	// Replace returns a copy of the slice s with the first n
   572	// non-overlapping instances of old replaced by new.
   573	// If n < 0, there is no limit on the number of replacements.
   574	func Replace(s, old, new []byte, n int) []byte {
   575		if n == 0 {
   576			return s // avoid allocation
   577		}
   578		// Compute number of replacements.
   579		if m := Count(s, old); m == 0 {
   580			return s // avoid allocation
   581		} else if n <= 0 || m < n {
   582			n = m
   583		}
   584	
   585		// Apply replacements to buffer.
   586		t := make([]byte, len(s)+n*(len(new)-len(old)))
   587		w := 0
   588		start := 0
   589		for i := 0; i < n; i++ {
   590			j := start
   591			if len(old) == 0 {
   592				if i > 0 {
   593					_, wid := utf8.DecodeRune(s[start:])
   594					j += wid
   595				}
   596			} else {
   597				j += Index(s[start:], old)
   598			}
   599			w += copy(t[w:], s[start:j])
   600			w += copy(t[w:], new)
   601			start = j + len(old)
   602		}
   603		w += copy(t[w:], s[start:])
   604		return t[0:w]
   605	}

release.r60.3. Except as noted, this content is licensed under a Creative Commons Attribution 3.0 License.