...
Run Format

Source file src/strings/replace.go

Documentation: strings

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package strings
     6  
     7  import "io"
     8  
     9  // Replacer replaces a list of strings with replacements.
    10  // It is safe for concurrent use by multiple goroutines.
    11  type Replacer struct {
        	// r is the replacement algorithm that Replace and WriteString delegate to.
    12  	r replacer
    13  }
    14  
    15  // replacer is the interface that a replacement algorithm needs to implement.
    16  type replacer interface {
        	// Replace returns s with the replacements applied.
    17  	Replace(s string) string
        	// WriteString writes s to w with the replacements applied and reports
        	// the accumulated byte count and the first write error, if any.
    18  	WriteString(w io.Writer, s string) (n int, err error)
    19  }
    20  
    21  // NewReplacer returns a new Replacer from a list of old, new string
    22  // pairs. Replacements are performed in the order they appear in the
    23  // target string, without overlapping matches.
    24  func NewReplacer(oldnew ...string) *Replacer {
    25  	if len(oldnew)%2 == 1 {
    26  		panic("strings.NewReplacer: odd argument count")
    27  	}
    28  
    29  	if len(oldnew) == 2 && len(oldnew[0]) > 1 {
    30  		return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])}
    31  	}
    32  
    33  	allNewBytes := true
    34  	for i := 0; i < len(oldnew); i += 2 {
    35  		if len(oldnew[i]) != 1 {
    36  			return &Replacer{r: makeGenericReplacer(oldnew)}
    37  		}
    38  		if len(oldnew[i+1]) != 1 {
    39  			allNewBytes = false
    40  		}
    41  	}
    42  
    43  	if allNewBytes {
    44  		r := byteReplacer{}
    45  		for i := range r {
    46  			r[i] = byte(i)
    47  		}
    48  		// The first occurrence of old->new map takes precedence
    49  		// over the others with the same old string.
    50  		for i := len(oldnew) - 2; i >= 0; i -= 2 {
    51  			o := oldnew[i][0]
    52  			n := oldnew[i+1][0]
    53  			r[o] = n
    54  		}
    55  		return &Replacer{r: &r}
    56  	}
    57  
    58  	r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)}
    59  	// The first occurrence of old->new map takes precedence
    60  	// over the others with the same old string.
    61  	for i := len(oldnew) - 2; i >= 0; i -= 2 {
    62  		o := oldnew[i][0]
    63  		n := oldnew[i+1]
    64  		// To avoid counting repetitions multiple times.
    65  		if r.replacements[o] == nil {
    66  			// We need to use string([]byte{o}) instead of string(o),
    67  			// to avoid utf8 encoding of o.
    68  			// E. g. byte(150) produces string of length 2.
    69  			r.toReplace = append(r.toReplace, string([]byte{o}))
    70  		}
    71  		r.replacements[o] = []byte(n)
    72  
    73  	}
    74  	return &Replacer{r: &r}
    75  }
    76  
    77  // Replace returns a copy of s with all replacements performed.
    78  func (r *Replacer) Replace(s string) string {
    79  	return r.r.Replace(s)
    80  }
    81  
    82  // WriteString writes s to w with all replacements performed.
    83  func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
    84  	return r.r.WriteString(w, s)
    85  }
    86  
    87  // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
    88  // and values may be empty. For example, the trie containing keys "ax", "ay",
    89  // "bcbc", "x" and "xy" could have eight nodes:
    90  //
    91  //  n0  -
    92  //  n1  a-
    93  //  n2  .x+
    94  //  n3  .y+
    95  //  n4  b-
    96  //  n5  .cbc+
    97  //  n6  x+
    98  //  n7  .y+
    99  //
   100  // n0 is the root node, and its children are n1, n4 and n6; n1's children are
   101  // n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
   102  // with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
   103  // (marked with a trailing "+") are complete keys.
        //
        // Nodes are inserted with add and walked by genericReplacer.lookup.
   104  type trieNode struct {
   105  	// value is the value of the trie node's key/value pair. It is empty if
   106  	// this node is not a complete key.
        	// A complete key may have an empty value too, so code that needs to
        	// know whether a node is a complete key checks priority, not value.
   107  	value string
   108  	// priority is the priority (higher is more important) of the trie node's
   109  	// key/value pair; keys are not necessarily matched shortest- or longest-
   110  	// first. Priority is positive if this node is a complete key, and zero
   111  	// otherwise. In the example above, positive/zero priorities are marked
   112  	// with a trailing "+" or "-".
   113  	priority int
   114  
   115  	// A trie node may have zero, one or more child nodes:
   116  	//  * if the remaining fields are zero, there are no children.
   117  	//  * if prefix and next are non-zero, there is one child in next.
   118  	//  * if table is non-zero, it defines all the children.
   119  	//
   120  	// Prefixes are preferred over tables when there is one child, but the
   121  	// root node always uses a table for lookup efficiency.
   122  
   123  	// prefix is the difference in keys between this trie node and the next.
   124  	// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
   125  	// Node n5 has no children and so has zero prefix, next and table fields.
   126  	prefix string
   127  	next   *trieNode
   128  
   129  	// table is a lookup table indexed by the next byte in the key, after
   130  	// remapping that byte through genericReplacer.mapping to create a dense
   131  	// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
   132  	// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
   133  	// genericReplacer.tableSize will be 5. Node n0's table will be
   134  	// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
   135  	// 'a', 'b' and 'x'.
   136  	table []*trieNode
   137  }
   138  
   139  func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
   140  	if key == "" {
   141  		if t.priority == 0 {
   142  			t.value = val
   143  			t.priority = priority
   144  		}
   145  		return
   146  	}
   147  
   148  	if t.prefix != "" {
   149  		// Need to split the prefix among multiple nodes.
   150  		var n int // length of the longest common prefix
   151  		for ; n < len(t.prefix) && n < len(key); n++ {
   152  			if t.prefix[n] != key[n] {
   153  				break
   154  			}
   155  		}
   156  		if n == len(t.prefix) {
   157  			t.next.add(key[n:], val, priority, r)
   158  		} else if n == 0 {
   159  			// First byte differs, start a new lookup table here. Looking up
   160  			// what is currently t.prefix[0] will lead to prefixNode, and
   161  			// looking up key[0] will lead to keyNode.
   162  			var prefixNode *trieNode
   163  			if len(t.prefix) == 1 {
   164  				prefixNode = t.next
   165  			} else {
   166  				prefixNode = &trieNode{
   167  					prefix: t.prefix[1:],
   168  					next:   t.next,
   169  				}
   170  			}
   171  			keyNode := new(trieNode)
   172  			t.table = make([]*trieNode, r.tableSize)
   173  			t.table[r.mapping[t.prefix[0]]] = prefixNode
   174  			t.table[r.mapping[key[0]]] = keyNode
   175  			t.prefix = ""
   176  			t.next = nil
   177  			keyNode.add(key[1:], val, priority, r)
   178  		} else {
   179  			// Insert new node after the common section of the prefix.
   180  			next := &trieNode{
   181  				prefix: t.prefix[n:],
   182  				next:   t.next,
   183  			}
   184  			t.prefix = t.prefix[:n]
   185  			t.next = next
   186  			next.add(key[n:], val, priority, r)
   187  		}
   188  	} else if t.table != nil {
   189  		// Insert into existing table.
   190  		m := r.mapping[key[0]]
   191  		if t.table[m] == nil {
   192  			t.table[m] = new(trieNode)
   193  		}
   194  		t.table[m].add(key[1:], val, priority, r)
   195  	} else {
   196  		t.prefix = key
   197  		t.next = new(trieNode)
   198  		t.next.add("", val, priority, r)
   199  	}
   200  }
   201  
   202  func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
   203  	// Iterate down the trie to the end, and grab the value and keylen with
   204  	// the highest priority.
   205  	bestPriority := 0
   206  	node := &r.root
   207  	n := 0
   208  	for node != nil {
   209  		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
   210  			bestPriority = node.priority
   211  			val = node.value
   212  			keylen = n
   213  			found = true
   214  		}
   215  
   216  		if s == "" {
   217  			break
   218  		}
   219  		if node.table != nil {
   220  			index := r.mapping[s[0]]
   221  			if int(index) == r.tableSize {
   222  				break
   223  			}
   224  			node = node.table[index]
   225  			s = s[1:]
   226  			n++
   227  		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
   228  			n += len(node.prefix)
   229  			s = s[len(node.prefix):]
   230  			node = node.next
   231  		} else {
   232  			break
   233  		}
   234  	}
   235  	return
   236  }
   237  
   238  // genericReplacer is the fully generic algorithm.
   239  // It's used as a fallback when nothing faster can be used.
   240  type genericReplacer struct {
        	// root is the root node of the key trie. makeGenericReplacer always
        	// gives it a lookup table.
   241  	root trieNode
   242  	// tableSize is the size of a trie node's lookup table. It is the number
   243  	// of unique key bytes.
   244  	tableSize int
   245  	// mapping maps from key bytes to a dense index for trieNode.table.
        	// Bytes that occur in no key are mapped to byte(tableSize).
   246  	mapping [256]byte
   247  }
   248  
   249  func makeGenericReplacer(oldnew []string) *genericReplacer {
   250  	r := new(genericReplacer)
   251  	// Find each byte used, then assign them each an index.
   252  	for i := 0; i < len(oldnew); i += 2 {
   253  		key := oldnew[i]
   254  		for j := 0; j < len(key); j++ {
   255  			r.mapping[key[j]] = 1
   256  		}
   257  	}
   258  
   259  	for _, b := range r.mapping {
   260  		r.tableSize += int(b)
   261  	}
   262  
   263  	var index byte
   264  	for i, b := range r.mapping {
   265  		if b == 0 {
   266  			r.mapping[i] = byte(r.tableSize)
   267  		} else {
   268  			r.mapping[i] = index
   269  			index++
   270  		}
   271  	}
   272  	// Ensure root node uses a lookup table (for performance).
   273  	r.root.table = make([]*trieNode, r.tableSize)
   274  
   275  	for i := 0; i < len(oldnew); i += 2 {
   276  		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
   277  	}
   278  	return r
   279  }
   280  
   281  type appendSliceWriter []byte
   282  
   283  // Write writes to the buffer to satisfy io.Writer.
   284  func (w *appendSliceWriter) Write(p []byte) (int, error) {
   285  	*w = append(*w, p...)
   286  	return len(p), nil
   287  }
   288  
   289  // WriteString writes to the buffer without string->[]byte->string allocations.
   290  func (w *appendSliceWriter) WriteString(s string) (int, error) {
   291  	*w = append(*w, s...)
   292  	return len(s), nil
   293  }
   294  
   295  type stringWriterIface interface {
   296  	WriteString(string) (int, error)
   297  }
   298  
   299  type stringWriter struct {
   300  	w io.Writer
   301  }
   302  
   303  func (w stringWriter) WriteString(s string) (int, error) {
   304  	return w.w.Write([]byte(s))
   305  }
   306  
   307  func getStringWriter(w io.Writer) stringWriterIface {
   308  	sw, ok := w.(stringWriterIface)
   309  	if !ok {
   310  		sw = stringWriter{w}
   311  	}
   312  	return sw
   313  }
   314  
   315  func (r *genericReplacer) Replace(s string) string {
   316  	buf := make(appendSliceWriter, 0, len(s))
   317  	r.WriteString(&buf, s)
   318  	return string(buf)
   319  }
   320  
   321  func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   322  	sw := getStringWriter(w)
   323  	var last, wn int
   324  	var prevMatchEmpty bool
   325  	for i := 0; i <= len(s); {
   326  		// Fast path: s[i] is not a prefix of any pattern.
   327  		if i != len(s) && r.root.priority == 0 {
   328  			index := int(r.mapping[s[i]])
   329  			if index == r.tableSize || r.root.table[index] == nil {
   330  				i++
   331  				continue
   332  			}
   333  		}
   334  
   335  		// Ignore the empty match iff the previous loop found the empty match.
   336  		val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
   337  		prevMatchEmpty = match && keylen == 0
   338  		if match {
   339  			wn, err = sw.WriteString(s[last:i])
   340  			n += wn
   341  			if err != nil {
   342  				return
   343  			}
   344  			wn, err = sw.WriteString(val)
   345  			n += wn
   346  			if err != nil {
   347  				return
   348  			}
   349  			i += keylen
   350  			last = i
   351  			continue
   352  		}
   353  		i++
   354  	}
   355  	if last != len(s) {
   356  		wn, err = sw.WriteString(s[last:])
   357  		n += wn
   358  	}
   359  	return
   360  }
   361  
   362  // singleStringReplacer is the implementation that's used when there is only
   363  // one string to replace (and that string has more than one byte).
   364  type singleStringReplacer struct {
        	// finder searches the input for the pattern; makeSingleStringReplacer
        	// builds it with makeStringFinder.
   365  	finder *stringFinder
   366  	// value is the new string that replaces that pattern when it's found.
   367  	value string
   368  }
   369  
   370  func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
   371  	return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
   372  }
   373  
   374  func (r *singleStringReplacer) Replace(s string) string {
   375  	var buf []byte
   376  	i, matched := 0, false
   377  	for {
   378  		match := r.finder.next(s[i:])
   379  		if match == -1 {
   380  			break
   381  		}
   382  		matched = true
   383  		buf = append(buf, s[i:i+match]...)
   384  		buf = append(buf, r.value...)
   385  		i += match + len(r.finder.pattern)
   386  	}
   387  	if !matched {
   388  		return s
   389  	}
   390  	buf = append(buf, s[i:]...)
   391  	return string(buf)
   392  }
   393  
   394  func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   395  	sw := getStringWriter(w)
   396  	var i, wn int
   397  	for {
   398  		match := r.finder.next(s[i:])
   399  		if match == -1 {
   400  			break
   401  		}
   402  		wn, err = sw.WriteString(s[i : i+match])
   403  		n += wn
   404  		if err != nil {
   405  			return
   406  		}
   407  		wn, err = sw.WriteString(r.value)
   408  		n += wn
   409  		if err != nil {
   410  			return
   411  		}
   412  		i += match + len(r.finder.pattern)
   413  	}
   414  	wn, err = sw.WriteString(s[i:])
   415  	n += wn
   416  	return
   417  }
   418  
   419  // byteReplacer is the implementation that's used when all the "old"
   420  // and "new" values are single ASCII bytes.
   421  // The array contains replacement bytes indexed by old byte.
   422  type byteReplacer [256]byte
   423  
   424  func (r *byteReplacer) Replace(s string) string {
   425  	var buf []byte // lazily allocated
   426  	for i := 0; i < len(s); i++ {
   427  		b := s[i]
   428  		if r[b] != b {
   429  			if buf == nil {
   430  				buf = []byte(s)
   431  			}
   432  			buf[i] = r[b]
   433  		}
   434  	}
   435  	if buf == nil {
   436  		return s
   437  	}
   438  	return string(buf)
   439  }
   440  
   441  func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   442  	// TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation.
   443  	bufsize := 32 << 10
   444  	if len(s) < bufsize {
   445  		bufsize = len(s)
   446  	}
   447  	buf := make([]byte, bufsize)
   448  
   449  	for len(s) > 0 {
   450  		ncopy := copy(buf, s[:])
   451  		s = s[ncopy:]
   452  		for i, b := range buf[:ncopy] {
   453  			buf[i] = r[b]
   454  		}
   455  		wn, err := w.Write(buf[:ncopy])
   456  		n += wn
   457  		if err != nil {
   458  			return n, err
   459  		}
   460  	}
   461  	return n, nil
   462  }
   463  
   464  // byteStringReplacer is the implementation that's used when all the
   465  // "old" values are single bytes (not only ASCII) but the "new" values vary in size.
   466  type byteStringReplacer struct {
   467  	// replacements contains replacement byte slices indexed by old byte.
   468  	// A nil []byte means that the old byte should not be replaced.
   469  	replacements [256][]byte
   470  	// toReplace keeps a list of bytes to replace. Depending on length of toReplace
   471  	// and length of target string it may be faster to use Count, or a plain loop.
   472  	// We store single byte as a string, because Count takes a string.
   473  	toReplace []string
   474  }
   475  
   476  // countCutOff controls the ratio of a string length to a number of replacements
   477  // at which (*byteStringReplacer).Replace switches algorithms.
   478  // For strings whose ratio of length to replacements is at least that value,
   479  // we call Count for each replacement from toReplace.
   480  // For strings with a lower ratio we use a simple loop, because of Count overhead.
   481  // countCutOff is an empirically determined overhead multiplier.
   482  // TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
   483  const countCutOff = 8
   484  
   485  func (r *byteStringReplacer) Replace(s string) string {
   486  	newSize := len(s)
   487  	anyChanges := false
   488  	// Is it faster to use Count?
   489  	if len(r.toReplace)*countCutOff <= len(s) {
   490  		for _, x := range r.toReplace {
   491  			if c := Count(s, x); c != 0 {
   492  				// The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
   493  				newSize += c * (len(r.replacements[x[0]]) - 1)
   494  				anyChanges = true
   495  			}
   496  
   497  		}
   498  	} else {
   499  		for i := 0; i < len(s); i++ {
   500  			b := s[i]
   501  			if r.replacements[b] != nil {
   502  				// See above for explanation of -1
   503  				newSize += len(r.replacements[b]) - 1
   504  				anyChanges = true
   505  			}
   506  		}
   507  	}
   508  	if !anyChanges {
   509  		return s
   510  	}
   511  	buf := make([]byte, newSize)
   512  	j := 0
   513  	for i := 0; i < len(s); i++ {
   514  		b := s[i]
   515  		if r.replacements[b] != nil {
   516  			j += copy(buf[j:], r.replacements[b])
   517  		} else {
   518  			buf[j] = b
   519  			j++
   520  		}
   521  	}
   522  	return string(buf)
   523  }
   524  
   525  func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   526  	sw := getStringWriter(w)
   527  	last := 0
   528  	for i := 0; i < len(s); i++ {
   529  		b := s[i]
   530  		if r.replacements[b] == nil {
   531  			continue
   532  		}
   533  		if last != i {
   534  			nw, err := sw.WriteString(s[last:i])
   535  			n += nw
   536  			if err != nil {
   537  				return n, err
   538  			}
   539  		}
   540  		last = i + 1
   541  		nw, err := w.Write(r.replacements[b])
   542  		n += nw
   543  		if err != nil {
   544  			return n, err
   545  		}
   546  	}
   547  	if last != len(s) {
   548  		var nw int
   549  		nw, err = sw.WriteString(s[last:])
   550  		n += nw
   551  	}
   552  	return
   553  }
   554  

View as plain text