...
Run Format

Source file src/pkg/strings/strings.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package strings implements simple functions to manipulate strings.
     6	package strings
     7	
     8	import (
     9		"unicode"
    10		"unicode/utf8"
    11	)
    12	
    13	// explode splits s into an array of UTF-8 sequences, one per Unicode character (still strings) up to a maximum of n (n < 0 means no limit).
    14	// Invalid UTF-8 sequences become correct encodings of U+FFF8.
    15	func explode(s string, n int) []string {
    16		if n == 0 {
    17			return nil
    18		}
    19		l := utf8.RuneCountInString(s)
    20		if n <= 0 || n > l {
    21			n = l
    22		}
    23		a := make([]string, n)
    24		var size int
    25		var ch rune
    26		i, cur := 0, 0
    27		for ; i+1 < n; i++ {
    28			ch, size = utf8.DecodeRuneInString(s[cur:])
    29			if ch == utf8.RuneError {
    30				a[i] = string(utf8.RuneError)
    31			} else {
    32				a[i] = s[cur : cur+size]
    33			}
    34			cur += size
    35		}
    36		// add the rest, if there is any
    37		if cur < len(s) {
    38			a[i] = s[cur:]
    39		}
    40		return a
    41	}
    42	
    43	// primeRK is the prime base used in Rabin-Karp algorithm.
    44	const primeRK = 16777619
    45	
    46	// hashstr returns the hash and the appropriate multiplicative
    47	// factor for use in Rabin-Karp algorithm.
    48	func hashstr(sep string) (uint32, uint32) {
    49		hash := uint32(0)
    50		for i := 0; i < len(sep); i++ {
    51			hash = hash*primeRK + uint32(sep[i])
    52	
    53		}
    54		var pow, sq uint32 = 1, primeRK
    55		for i := len(sep); i > 0; i >>= 1 {
    56			if i&1 != 0 {
    57				pow *= sq
    58			}
    59			sq *= sq
    60		}
    61		return hash, pow
    62	}
    63	
    64	// Count counts the number of non-overlapping instances of sep in s.
    65	func Count(s, sep string) int {
    66		n := 0
    67		// special cases
    68		switch {
    69		case len(sep) == 0:
    70			return utf8.RuneCountInString(s) + 1
    71		case len(sep) == 1:
    72			// special case worth making fast
    73			c := sep[0]
    74			for i := 0; i < len(s); i++ {
    75				if s[i] == c {
    76					n++
    77				}
    78			}
    79			return n
    80		case len(sep) > len(s):
    81			return 0
    82		case len(sep) == len(s):
    83			if sep == s {
    84				return 1
    85			}
    86			return 0
    87		}
    88		hashsep, pow := hashstr(sep)
    89		h := uint32(0)
    90		for i := 0; i < len(sep); i++ {
    91			h = h*primeRK + uint32(s[i])
    92		}
    93		lastmatch := 0
    94		if h == hashsep && s[:len(sep)] == sep {
    95			n++
    96			lastmatch = len(sep)
    97		}
    98		for i := len(sep); i < len(s); {
    99			h *= primeRK
   100			h += uint32(s[i])
   101			h -= pow * uint32(s[i-len(sep)])
   102			i++
   103			if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
   104				n++
   105				lastmatch = i
   106			}
   107		}
   108		return n
   109	}
   110	
   111	// Contains returns true if substr is within s.
   112	func Contains(s, substr string) bool {
   113		return Index(s, substr) >= 0
   114	}
   115	
   116	// ContainsAny returns true if any Unicode code points in chars are within s.
   117	func ContainsAny(s, chars string) bool {
   118		return IndexAny(s, chars) >= 0
   119	}
   120	
   121	// ContainsRune returns true if the Unicode code point r is within s.
   122	func ContainsRune(s string, r rune) bool {
   123		return IndexRune(s, r) >= 0
   124	}
   125	
   126	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
   127	func Index(s, sep string) int {
   128		n := len(sep)
   129		switch {
   130		case n == 0:
   131			return 0
   132		case n == 1:
   133			return IndexByte(s, sep[0])
   134		case n == len(s):
   135			if sep == s {
   136				return 0
   137			}
   138			return -1
   139		case n > len(s):
   140			return -1
   141		}
   142		// Hash sep.
   143		hashsep, pow := hashstr(sep)
   144		var h uint32
   145		for i := 0; i < n; i++ {
   146			h = h*primeRK + uint32(s[i])
   147		}
   148		if h == hashsep && s[:n] == sep {
   149			return 0
   150		}
   151		for i := n; i < len(s); {
   152			h *= primeRK
   153			h += uint32(s[i])
   154			h -= pow * uint32(s[i-n])
   155			i++
   156			if h == hashsep && s[i-n:i] == sep {
   157				return i - n
   158			}
   159		}
   160		return -1
   161	}
   162	
   163	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   164	func LastIndex(s, sep string) int {
   165		n := len(sep)
   166		if n == 0 {
   167			return len(s)
   168		}
   169		c := sep[0]
   170		if n == 1 {
   171			// special case worth making fast
   172			for i := len(s) - 1; i >= 0; i-- {
   173				if s[i] == c {
   174					return i
   175				}
   176			}
   177			return -1
   178		}
   179		// n > 1
   180		for i := len(s) - n; i >= 0; i-- {
   181			if s[i] == c && s[i:i+n] == sep {
   182				return i
   183			}
   184		}
   185		return -1
   186	}
   187	
   188	// IndexRune returns the index of the first instance of the Unicode code point
   189	// r, or -1 if rune is not present in s.
   190	func IndexRune(s string, r rune) int {
   191		switch {
   192		case r < 0x80:
   193			b := byte(r)
   194			for i := 0; i < len(s); i++ {
   195				if s[i] == b {
   196					return i
   197				}
   198			}
   199		default:
   200			for i, c := range s {
   201				if c == r {
   202					return i
   203				}
   204			}
   205		}
   206		return -1
   207	}
   208	
   209	// IndexAny returns the index of the first instance of any Unicode code point
   210	// from chars in s, or -1 if no Unicode code point from chars is present in s.
   211	func IndexAny(s, chars string) int {
   212		if len(chars) > 0 {
   213			for i, c := range s {
   214				for _, m := range chars {
   215					if c == m {
   216						return i
   217					}
   218				}
   219			}
   220		}
   221		return -1
   222	}
   223	
   224	// LastIndexAny returns the index of the last instance of any Unicode code
   225	// point from chars in s, or -1 if no Unicode code point from chars is
   226	// present in s.
   227	func LastIndexAny(s, chars string) int {
   228		if len(chars) > 0 {
   229			for i := len(s); i > 0; {
   230				rune, size := utf8.DecodeLastRuneInString(s[0:i])
   231				i -= size
   232				for _, m := range chars {
   233					if rune == m {
   234						return i
   235					}
   236				}
   237			}
   238		}
   239		return -1
   240	}
   241	
   242	// Generic split: splits after each instance of sep,
   243	// including sepSave bytes of sep in the subarrays.
   244	func genSplit(s, sep string, sepSave, n int) []string {
   245		if n == 0 {
   246			return nil
   247		}
   248		if sep == "" {
   249			return explode(s, n)
   250		}
   251		if n < 0 {
   252			n = Count(s, sep) + 1
   253		}
   254		c := sep[0]
   255		start := 0
   256		a := make([]string, n)
   257		na := 0
   258		for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
   259			if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) {
   260				a[na] = s[start : i+sepSave]
   261				na++
   262				start = i + len(sep)
   263				i += len(sep) - 1
   264			}
   265		}
   266		a[na] = s[start:]
   267		return a[0 : na+1]
   268	}
   269	
   270	// SplitN slices s into substrings separated by sep and returns a slice of
   271	// the substrings between those separators.
   272	// If sep is empty, SplitN splits after each UTF-8 sequence.
   273	// The count determines the number of substrings to return:
   274	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
   275	//   n == 0: the result is nil (zero substrings)
   276	//   n < 0: all substrings
   277	func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) }
   278	
   279	// SplitAfterN slices s into substrings after each instance of sep and
   280	// returns a slice of those substrings.
   281	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   282	// The count determines the number of substrings to return:
   283	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
   284	//   n == 0: the result is nil (zero substrings)
   285	//   n < 0: all substrings
   286	func SplitAfterN(s, sep string, n int) []string {
   287		return genSplit(s, sep, len(sep), n)
   288	}
   289	
   290	// Split slices s into all substrings separated by sep and returns a slice of
   291	// the substrings between those separators.
   292	// If sep is empty, Split splits after each UTF-8 sequence.
   293	// It is equivalent to SplitN with a count of -1.
   294	func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) }
   295	
   296	// SplitAfter slices s into all substrings after each instance of sep and
   297	// returns a slice of those substrings.
   298	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   299	// It is equivalent to SplitAfterN with a count of -1.
   300	func SplitAfter(s, sep string) []string {
   301		return genSplit(s, sep, len(sep), -1)
   302	}
   303	
   304	// Fields splits the string s around each instance of one or more consecutive white space
   305	// characters, as defined by unicode.IsSpace, returning an array of substrings of s or an
   306	// empty list if s contains only white space.
   307	func Fields(s string) []string {
   308		return FieldsFunc(s, unicode.IsSpace)
   309	}
   310	
   311	// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
   312	// and returns an array of slices of s. If all code points in s satisfy f(c) or the
   313	// string is empty, an empty slice is returned.
   314	func FieldsFunc(s string, f func(rune) bool) []string {
   315		// First count the fields.
   316		n := 0
   317		inField := false
   318		for _, rune := range s {
   319			wasInField := inField
   320			inField = !f(rune)
   321			if inField && !wasInField {
   322				n++
   323			}
   324		}
   325	
   326		// Now create them.
   327		a := make([]string, n)
   328		na := 0
   329		fieldStart := -1 // Set to -1 when looking for start of field.
   330		for i, rune := range s {
   331			if f(rune) {
   332				if fieldStart >= 0 {
   333					a[na] = s[fieldStart:i]
   334					na++
   335					fieldStart = -1
   336				}
   337			} else if fieldStart == -1 {
   338				fieldStart = i
   339			}
   340		}
   341		if fieldStart >= 0 { // Last field might end at EOF.
   342			a[na] = s[fieldStart:]
   343		}
   344		return a
   345	}
   346	
   347	// Join concatenates the elements of a to create a single string.   The separator string
   348	// sep is placed between elements in the resulting string.
   349	func Join(a []string, sep string) string {
   350		if len(a) == 0 {
   351			return ""
   352		}
   353		if len(a) == 1 {
   354			return a[0]
   355		}
   356		n := len(sep) * (len(a) - 1)
   357		for i := 0; i < len(a); i++ {
   358			n += len(a[i])
   359		}
   360	
   361		b := make([]byte, n)
   362		bp := copy(b, a[0])
   363		for _, s := range a[1:] {
   364			bp += copy(b[bp:], sep)
   365			bp += copy(b[bp:], s)
   366		}
   367		return string(b)
   368	}
   369	
   370	// HasPrefix tests whether the string s begins with prefix.
   371	func HasPrefix(s, prefix string) bool {
   372		return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
   373	}
   374	
   375	// HasSuffix tests whether the string s ends with suffix.
   376	func HasSuffix(s, suffix string) bool {
   377		return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
   378	}
   379	
   380	// Map returns a copy of the string s with all its characters modified
   381	// according to the mapping function. If mapping returns a negative value, the character is
   382	// dropped from the string with no replacement.
   383	func Map(mapping func(rune) rune, s string) string {
   384		// In the worst case, the string can grow when mapped, making
   385		// things unpleasant.  But it's so rare we barge in assuming it's
   386		// fine.  It could also shrink but that falls out naturally.
   387		maxbytes := len(s) // length of b
   388		nbytes := 0        // number of bytes encoded in b
   389		// The output buffer b is initialized on demand, the first
   390		// time a character differs.
   391		var b []byte
   392	
   393		for i, c := range s {
   394			r := mapping(c)
   395			if b == nil {
   396				if r == c {
   397					continue
   398				}
   399				b = make([]byte, maxbytes)
   400				nbytes = copy(b, s[:i])
   401			}
   402			if r >= 0 {
   403				wid := 1
   404				if r >= utf8.RuneSelf {
   405					wid = utf8.RuneLen(r)
   406				}
   407				if nbytes+wid > maxbytes {
   408					// Grow the buffer.
   409					maxbytes = maxbytes*2 + utf8.UTFMax
   410					nb := make([]byte, maxbytes)
   411					copy(nb, b[0:nbytes])
   412					b = nb
   413				}
   414				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
   415			}
   416		}
   417		if b == nil {
   418			return s
   419		}
   420		return string(b[0:nbytes])
   421	}
   422	
   423	// Repeat returns a new string consisting of count copies of the string s.
   424	func Repeat(s string, count int) string {
   425		b := make([]byte, len(s)*count)
   426		bp := 0
   427		for i := 0; i < count; i++ {
   428			bp += copy(b[bp:], s)
   429		}
   430		return string(b)
   431	}
   432	
   433	// ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
   434	func ToUpper(s string) string { return Map(unicode.ToUpper, s) }
   435	
   436	// ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
   437	func ToLower(s string) string { return Map(unicode.ToLower, s) }
   438	
   439	// ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
   440	func ToTitle(s string) string { return Map(unicode.ToTitle, s) }
   441	
   442	// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
   443	// upper case, giving priority to the special casing rules.
   444	func ToUpperSpecial(_case unicode.SpecialCase, s string) string {
   445		return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
   446	}
   447	
   448	// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
   449	// lower case, giving priority to the special casing rules.
   450	func ToLowerSpecial(_case unicode.SpecialCase, s string) string {
   451		return Map(func(r rune) rune { return _case.ToLower(r) }, s)
   452	}
   453	
   454	// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
   455	// title case, giving priority to the special casing rules.
   456	func ToTitleSpecial(_case unicode.SpecialCase, s string) string {
   457		return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
   458	}
   459	
   460	// isSeparator reports whether the rune could mark a word boundary.
   461	// TODO: update when package unicode captures more of the properties.
   462	func isSeparator(r rune) bool {
   463		// ASCII alphanumerics and underscore are not separators
   464		if r <= 0x7F {
   465			switch {
   466			case '0' <= r && r <= '9':
   467				return false
   468			case 'a' <= r && r <= 'z':
   469				return false
   470			case 'A' <= r && r <= 'Z':
   471				return false
   472			case r == '_':
   473				return false
   474			}
   475			return true
   476		}
   477		// Letters and digits are not separators
   478		if unicode.IsLetter(r) || unicode.IsDigit(r) {
   479			return false
   480		}
   481		// Otherwise, all we can do for now is treat spaces as separators.
   482		return unicode.IsSpace(r)
   483	}
   484	
   485	// Title returns a copy of the string s with all Unicode letters that begin words
   486	// mapped to their title case.
   487	//
   488	// BUG: The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   489	func Title(s string) string {
   490		// Use a closure here to remember state.
   491		// Hackish but effective. Depends on Map scanning in order and calling
   492		// the closure once per rune.
   493		prev := ' '
   494		return Map(
   495			func(r rune) rune {
   496				if isSeparator(prev) {
   497					prev = r
   498					return unicode.ToTitle(r)
   499				}
   500				prev = r
   501				return r
   502			},
   503			s)
   504	}
   505	
   506	// TrimLeftFunc returns a slice of the string s with all leading
   507	// Unicode code points c satisfying f(c) removed.
   508	func TrimLeftFunc(s string, f func(rune) bool) string {
   509		i := indexFunc(s, f, false)
   510		if i == -1 {
   511			return ""
   512		}
   513		return s[i:]
   514	}
   515	
   516	// TrimRightFunc returns a slice of the string s with all trailing
   517	// Unicode code points c satisfying f(c) removed.
   518	func TrimRightFunc(s string, f func(rune) bool) string {
   519		i := lastIndexFunc(s, f, false)
   520		if i >= 0 && s[i] >= utf8.RuneSelf {
   521			_, wid := utf8.DecodeRuneInString(s[i:])
   522			i += wid
   523		} else {
   524			i++
   525		}
   526		return s[0:i]
   527	}
   528	
   529	// TrimFunc returns a slice of the string s with all leading
   530	// and trailing Unicode code points c satisfying f(c) removed.
   531	func TrimFunc(s string, f func(rune) bool) string {
   532		return TrimRightFunc(TrimLeftFunc(s, f), f)
   533	}
   534	
   535	// IndexFunc returns the index into s of the first Unicode
   536	// code point satisfying f(c), or -1 if none do.
   537	func IndexFunc(s string, f func(rune) bool) int {
   538		return indexFunc(s, f, true)
   539	}
   540	
   541	// LastIndexFunc returns the index into s of the last
   542	// Unicode code point satisfying f(c), or -1 if none do.
   543	func LastIndexFunc(s string, f func(rune) bool) int {
   544		return lastIndexFunc(s, f, true)
   545	}
   546	
   547	// indexFunc is the same as IndexFunc except that if
   548	// truth==false, the sense of the predicate function is
   549	// inverted.
   550	func indexFunc(s string, f func(rune) bool, truth bool) int {
   551		start := 0
   552		for start < len(s) {
   553			wid := 1
   554			r := rune(s[start])
   555			if r >= utf8.RuneSelf {
   556				r, wid = utf8.DecodeRuneInString(s[start:])
   557			}
   558			if f(r) == truth {
   559				return start
   560			}
   561			start += wid
   562		}
   563		return -1
   564	}
   565	
   566	// lastIndexFunc is the same as LastIndexFunc except that if
   567	// truth==false, the sense of the predicate function is
   568	// inverted.
   569	func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
   570		for i := len(s); i > 0; {
   571			r, size := utf8.DecodeLastRuneInString(s[0:i])
   572			i -= size
   573			if f(r) == truth {
   574				return i
   575			}
   576		}
   577		return -1
   578	}
   579	
   580	func makeCutsetFunc(cutset string) func(rune) bool {
   581		return func(r rune) bool { return IndexRune(cutset, r) >= 0 }
   582	}
   583	
   584	// Trim returns a slice of the string s with all leading and
   585	// trailing Unicode code points contained in cutset removed.
   586	func Trim(s string, cutset string) string {
   587		if s == "" || cutset == "" {
   588			return s
   589		}
   590		return TrimFunc(s, makeCutsetFunc(cutset))
   591	}
   592	
   593	// TrimLeft returns a slice of the string s with all leading
   594	// Unicode code points contained in cutset removed.
   595	func TrimLeft(s string, cutset string) string {
   596		if s == "" || cutset == "" {
   597			return s
   598		}
   599		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   600	}
   601	
   602	// TrimRight returns a slice of the string s, with all trailing
   603	// Unicode code points contained in cutset removed.
   604	func TrimRight(s string, cutset string) string {
   605		if s == "" || cutset == "" {
   606			return s
   607		}
   608		return TrimRightFunc(s, makeCutsetFunc(cutset))
   609	}
   610	
   611	// TrimSpace returns a slice of the string s, with all leading
   612	// and trailing white space removed, as defined by Unicode.
   613	func TrimSpace(s string) string {
   614		return TrimFunc(s, unicode.IsSpace)
   615	}
   616	
   617	// TrimPrefix returns s without the provided leading prefix string.
   618	// If s doesn't start with prefix, s is returned unchanged.
   619	func TrimPrefix(s, prefix string) string {
   620		if HasPrefix(s, prefix) {
   621			return s[len(prefix):]
   622		}
   623		return s
   624	}
   625	
   626	// TrimSuffix returns s without the provided trailing suffix string.
   627	// If s doesn't end with suffix, s is returned unchanged.
   628	func TrimSuffix(s, suffix string) string {
   629		if HasSuffix(s, suffix) {
   630			return s[:len(s)-len(suffix)]
   631		}
   632		return s
   633	}
   634	
   635	// Replace returns a copy of the string s with the first n
   636	// non-overlapping instances of old replaced by new.
   637	// If n < 0, there is no limit on the number of replacements.
   638	func Replace(s, old, new string, n int) string {
   639		if old == new || n == 0 {
   640			return s // avoid allocation
   641		}
   642	
   643		// Compute number of replacements.
   644		if m := Count(s, old); m == 0 {
   645			return s // avoid allocation
   646		} else if n < 0 || m < n {
   647			n = m
   648		}
   649	
   650		// Apply replacements to buffer.
   651		t := make([]byte, len(s)+n*(len(new)-len(old)))
   652		w := 0
   653		start := 0
   654		for i := 0; i < n; i++ {
   655			j := start
   656			if len(old) == 0 {
   657				if i > 0 {
   658					_, wid := utf8.DecodeRuneInString(s[start:])
   659					j += wid
   660				}
   661			} else {
   662				j += Index(s[start:], old)
   663			}
   664			w += copy(t[w:], s[start:j])
   665			w += copy(t[w:], new)
   666			start = j + len(old)
   667		}
   668		w += copy(t[w:], s[start:])
   669		return string(t[0:w])
   670	}
   671	
   672	// EqualFold reports whether s and t, interpreted as UTF-8 strings,
   673	// are equal under Unicode case-folding.
   674	func EqualFold(s, t string) bool {
   675		for s != "" && t != "" {
   676			// Extract first rune from each string.
   677			var sr, tr rune
   678			if s[0] < utf8.RuneSelf {
   679				sr, s = rune(s[0]), s[1:]
   680			} else {
   681				r, size := utf8.DecodeRuneInString(s)
   682				sr, s = r, s[size:]
   683			}
   684			if t[0] < utf8.RuneSelf {
   685				tr, t = rune(t[0]), t[1:]
   686			} else {
   687				r, size := utf8.DecodeRuneInString(t)
   688				tr, t = r, t[size:]
   689			}
   690	
   691			// If they match, keep going; if not, return false.
   692	
   693			// Easy case.
   694			if tr == sr {
   695				continue
   696			}
   697	
   698			// Make sr < tr to simplify what follows.
   699			if tr < sr {
   700				tr, sr = sr, tr
   701			}
   702			// Fast check for ASCII.
   703			if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
   704				// ASCII, and sr is upper case.  tr must be lower case.
   705				if tr == sr+'a'-'A' {
   706					continue
   707				}
   708				return false
   709			}
   710	
   711			// General case.  SimpleFold(x) returns the next equivalent rune > x
   712			// or wraps around to smaller values.
   713			r := unicode.SimpleFold(sr)
   714			for r != sr && r < tr {
   715				r = unicode.SimpleFold(r)
   716			}
   717			if r == tr {
   718				continue
   719			}
   720			return false
   721		}
   722	
   723		// One string is empty.  Are both?
   724		return s == t
   725	}
   726	

View as plain text