...

Source file src/strings/replace.go

Documentation: strings

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package strings
     6  
     7  import (
     8  	"io"
     9  	"sync"
    10  )
    11  
// Replacer replaces a list of strings with replacements.
// It is safe for concurrent use by multiple goroutines.
//
// The concrete replacement algorithm is built lazily: NewReplacer only
// records the old/new pairs, and the first call to Replace or WriteString
// runs buildOnce (through once) to select and construct it.
type Replacer struct {
	once   sync.Once // guards buildOnce method
	r      replacer  // algorithm chosen by build; assigned in buildOnce
	oldnew []string  // private copy of the NewReplacer arguments; set to nil by buildOnce
}
    19  
// replacer is the interface that a replacement algorithm needs to implement.
// Replacer forwards its exported Replace and WriteString methods to the
// replacer selected by build.
type replacer interface {
	// Replace returns a copy of s with all replacements performed.
	Replace(s string) string
	// WriteString writes s to w with all replacements performed. It returns
	// the number of bytes written and the first error reported by w, if any.
	WriteString(w io.Writer, s string) (n int, err error)
}
    25  
    26  // NewReplacer returns a new Replacer from a list of old, new string
    27  // pairs. Replacements are performed in the order they appear in the
    28  // target string, without overlapping matches.
    29  func NewReplacer(oldnew ...string) *Replacer {
    30  	if len(oldnew)%2 == 1 {
    31  		panic("strings.NewReplacer: odd argument count")
    32  	}
    33  	return &Replacer{oldnew: append([]string(nil), oldnew...)}
    34  }
    35  
    36  func (r *Replacer) buildOnce() {
    37  	r.r = r.build()
    38  	r.oldnew = nil
    39  }
    40  
    41  func (b *Replacer) build() replacer {
    42  	oldnew := b.oldnew
    43  	if len(oldnew) == 2 && len(oldnew[0]) > 1 {
    44  		return makeSingleStringReplacer(oldnew[0], oldnew[1])
    45  	}
    46  
    47  	allNewBytes := true
    48  	for i := 0; i < len(oldnew); i += 2 {
    49  		if len(oldnew[i]) != 1 {
    50  			return makeGenericReplacer(oldnew)
    51  		}
    52  		if len(oldnew[i+1]) != 1 {
    53  			allNewBytes = false
    54  		}
    55  	}
    56  
    57  	if allNewBytes {
    58  		r := byteReplacer{}
    59  		for i := range r {
    60  			r[i] = byte(i)
    61  		}
    62  		// The first occurrence of old->new map takes precedence
    63  		// over the others with the same old string.
    64  		for i := len(oldnew) - 2; i >= 0; i -= 2 {
    65  			o := oldnew[i][0]
    66  			n := oldnew[i+1][0]
    67  			r[o] = n
    68  		}
    69  		return &r
    70  	}
    71  
    72  	r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)}
    73  	// The first occurrence of old->new map takes precedence
    74  	// over the others with the same old string.
    75  	for i := len(oldnew) - 2; i >= 0; i -= 2 {
    76  		o := oldnew[i][0]
    77  		n := oldnew[i+1]
    78  		// To avoid counting repetitions multiple times.
    79  		if r.replacements[o] == nil {
    80  			// We need to use string([]byte{o}) instead of string(o),
    81  			// to avoid utf8 encoding of o.
    82  			// E. g. byte(150) produces string of length 2.
    83  			r.toReplace = append(r.toReplace, string([]byte{o}))
    84  		}
    85  		r.replacements[o] = []byte(n)
    86  
    87  	}
    88  	return &r
    89  }
    90  
    91  // Replace returns a copy of s with all replacements performed.
    92  func (r *Replacer) Replace(s string) string {
    93  	r.once.Do(r.buildOnce)
    94  	return r.r.Replace(s)
    95  }
    96  
    97  // WriteString writes s to w with all replacements performed.
    98  func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
    99  	r.once.Do(r.buildOnce)
   100  	return r.r.WriteString(w, s)
   101  }
   102  
// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
// and values may be empty. For example, the trie containing keys "ax", "ay",
// "bcbc", "x" and "xy" could have eight nodes:
//
//  n0  -
//  n1  a-
//  n2  .x+
//  n3  .y+
//  n4  b-
//  n5  .cbc+
//  n6  x+
//  n7  .y+
//
// n0 is the root node, and its children are n1, n4 and n6; n1's children are
// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
// (marked with a trailing "+") are complete keys.
type trieNode struct {
	// value is the value of the trie node's key/value pair. It is empty if
	// this node is not a complete key. Because a complete key may also have
	// an empty value, priority (not value) tells whether the node is a
	// complete key.
	value string
	// priority is the priority (higher is more important) of the trie node's
	// key/value pair; keys are not necessarily matched shortest- or longest-
	// first. Priority is positive if this node is a complete key, and zero
	// otherwise. In the example above, positive/zero priorities are marked
	// with a trailing "+" or "-".
	priority int

	// A trie node may have zero, one or more child nodes:
	//  * if the remaining fields are zero, there are no children.
	//  * if prefix and next are non-zero, there is one child in next.
	//  * if table is non-zero, it defines all the children.
	//
	// Prefixes are preferred over tables when there is one child, but the
	// root node always uses a table for lookup efficiency.

	// prefix is the difference in keys between this trie node and the next.
	// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
	// Node n5 has no children and so has zero prefix, next and table fields.
	prefix string
	next   *trieNode

	// table is a lookup table indexed by the next byte in the key, after
	// remapping that byte through genericReplacer.mapping to create a dense
	// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
	// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
	// genericReplacer.tableSize will be 5. Node n0's table will be
	// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
	// 'a', 'b' and 'x'. A nil entry means there is no child for that byte.
	table []*trieNode
}
   154  
   155  func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
   156  	if key == "" {
   157  		if t.priority == 0 {
   158  			t.value = val
   159  			t.priority = priority
   160  		}
   161  		return
   162  	}
   163  
   164  	if t.prefix != "" {
   165  		// Need to split the prefix among multiple nodes.
   166  		var n int // length of the longest common prefix
   167  		for ; n < len(t.prefix) && n < len(key); n++ {
   168  			if t.prefix[n] != key[n] {
   169  				break
   170  			}
   171  		}
   172  		if n == len(t.prefix) {
   173  			t.next.add(key[n:], val, priority, r)
   174  		} else if n == 0 {
   175  			// First byte differs, start a new lookup table here. Looking up
   176  			// what is currently t.prefix[0] will lead to prefixNode, and
   177  			// looking up key[0] will lead to keyNode.
   178  			var prefixNode *trieNode
   179  			if len(t.prefix) == 1 {
   180  				prefixNode = t.next
   181  			} else {
   182  				prefixNode = &trieNode{
   183  					prefix: t.prefix[1:],
   184  					next:   t.next,
   185  				}
   186  			}
   187  			keyNode := new(trieNode)
   188  			t.table = make([]*trieNode, r.tableSize)
   189  			t.table[r.mapping[t.prefix[0]]] = prefixNode
   190  			t.table[r.mapping[key[0]]] = keyNode
   191  			t.prefix = ""
   192  			t.next = nil
   193  			keyNode.add(key[1:], val, priority, r)
   194  		} else {
   195  			// Insert new node after the common section of the prefix.
   196  			next := &trieNode{
   197  				prefix: t.prefix[n:],
   198  				next:   t.next,
   199  			}
   200  			t.prefix = t.prefix[:n]
   201  			t.next = next
   202  			next.add(key[n:], val, priority, r)
   203  		}
   204  	} else if t.table != nil {
   205  		// Insert into existing table.
   206  		m := r.mapping[key[0]]
   207  		if t.table[m] == nil {
   208  			t.table[m] = new(trieNode)
   209  		}
   210  		t.table[m].add(key[1:], val, priority, r)
   211  	} else {
   212  		t.prefix = key
   213  		t.next = new(trieNode)
   214  		t.next.add("", val, priority, r)
   215  	}
   216  }
   217  
   218  func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
   219  	// Iterate down the trie to the end, and grab the value and keylen with
   220  	// the highest priority.
   221  	bestPriority := 0
   222  	node := &r.root
   223  	n := 0
   224  	for node != nil {
   225  		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
   226  			bestPriority = node.priority
   227  			val = node.value
   228  			keylen = n
   229  			found = true
   230  		}
   231  
   232  		if s == "" {
   233  			break
   234  		}
   235  		if node.table != nil {
   236  			index := r.mapping[s[0]]
   237  			if int(index) == r.tableSize {
   238  				break
   239  			}
   240  			node = node.table[index]
   241  			s = s[1:]
   242  			n++
   243  		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
   244  			n += len(node.prefix)
   245  			s = s[len(node.prefix):]
   246  			node = node.next
   247  		} else {
   248  			break
   249  		}
   250  	}
   251  	return
   252  }
   253  
// genericReplacer is the fully generic algorithm.
// It's used as a fallback when nothing faster can be used.
// Keys are stored in a trie of trieNodes rooted at root.
type genericReplacer struct {
	// root is the root of the key trie. makeGenericReplacer always gives it
	// a lookup table.
	root trieNode
	// tableSize is the size of a trie node's lookup table. It is the number
	// of unique key bytes.
	tableSize int
	// mapping maps from key bytes to a dense index for trieNode.table.
	// Bytes that occur in no key are mapped to byte(tableSize); lookup and
	// WriteString treat an index equal to tableSize as "no child".
	mapping [256]byte
}
   264  
   265  func makeGenericReplacer(oldnew []string) *genericReplacer {
   266  	r := new(genericReplacer)
   267  	// Find each byte used, then assign them each an index.
   268  	for i := 0; i < len(oldnew); i += 2 {
   269  		key := oldnew[i]
   270  		for j := 0; j < len(key); j++ {
   271  			r.mapping[key[j]] = 1
   272  		}
   273  	}
   274  
   275  	for _, b := range r.mapping {
   276  		r.tableSize += int(b)
   277  	}
   278  
   279  	var index byte
   280  	for i, b := range r.mapping {
   281  		if b == 0 {
   282  			r.mapping[i] = byte(r.tableSize)
   283  		} else {
   284  			r.mapping[i] = index
   285  			index++
   286  		}
   287  	}
   288  	// Ensure root node uses a lookup table (for performance).
   289  	r.root.table = make([]*trieNode, r.tableSize)
   290  
   291  	for i := 0; i < len(oldnew); i += 2 {
   292  		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
   293  	}
   294  	return r
   295  }
   296  
   297  type appendSliceWriter []byte
   298  
   299  // Write writes to the buffer to satisfy io.Writer.
   300  func (w *appendSliceWriter) Write(p []byte) (int, error) {
   301  	*w = append(*w, p...)
   302  	return len(p), nil
   303  }
   304  
   305  // WriteString writes to the buffer without string->[]byte->string allocations.
   306  func (w *appendSliceWriter) WriteString(s string) (int, error) {
   307  	*w = append(*w, s...)
   308  	return len(s), nil
   309  }
   310  
   311  type stringWriter struct {
   312  	w io.Writer
   313  }
   314  
   315  func (w stringWriter) WriteString(s string) (int, error) {
   316  	return w.w.Write([]byte(s))
   317  }
   318  
   319  func getStringWriter(w io.Writer) io.StringWriter {
   320  	sw, ok := w.(io.StringWriter)
   321  	if !ok {
   322  		sw = stringWriter{w}
   323  	}
   324  	return sw
   325  }
   326  
   327  func (r *genericReplacer) Replace(s string) string {
   328  	buf := make(appendSliceWriter, 0, len(s))
   329  	r.WriteString(&buf, s)
   330  	return string(buf)
   331  }
   332  
   333  func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   334  	sw := getStringWriter(w)
   335  	var last, wn int
   336  	var prevMatchEmpty bool
   337  	for i := 0; i <= len(s); {
   338  		// Fast path: s[i] is not a prefix of any pattern.
   339  		if i != len(s) && r.root.priority == 0 {
   340  			index := int(r.mapping[s[i]])
   341  			if index == r.tableSize || r.root.table[index] == nil {
   342  				i++
   343  				continue
   344  			}
   345  		}
   346  
   347  		// Ignore the empty match iff the previous loop found the empty match.
   348  		val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
   349  		prevMatchEmpty = match && keylen == 0
   350  		if match {
   351  			wn, err = sw.WriteString(s[last:i])
   352  			n += wn
   353  			if err != nil {
   354  				return
   355  			}
   356  			wn, err = sw.WriteString(val)
   357  			n += wn
   358  			if err != nil {
   359  				return
   360  			}
   361  			i += keylen
   362  			last = i
   363  			continue
   364  		}
   365  		i++
   366  	}
   367  	if last != len(s) {
   368  		wn, err = sw.WriteString(s[last:])
   369  		n += wn
   370  	}
   371  	return
   372  }
   373  
   374  // singleStringReplacer is the implementation that's used when there is only
   375  // one string to replace (and that string has more than one byte).
   376  type singleStringReplacer struct {
   377  	finder *stringFinder
   378  	// value is the new string that replaces that pattern when it's found.
   379  	value string
   380  }
   381  
   382  func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
   383  	return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
   384  }
   385  
   386  func (r *singleStringReplacer) Replace(s string) string {
   387  	var buf []byte
   388  	i, matched := 0, false
   389  	for {
   390  		match := r.finder.next(s[i:])
   391  		if match == -1 {
   392  			break
   393  		}
   394  		matched = true
   395  		buf = append(buf, s[i:i+match]...)
   396  		buf = append(buf, r.value...)
   397  		i += match + len(r.finder.pattern)
   398  	}
   399  	if !matched {
   400  		return s
   401  	}
   402  	buf = append(buf, s[i:]...)
   403  	return string(buf)
   404  }
   405  
   406  func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   407  	sw := getStringWriter(w)
   408  	var i, wn int
   409  	for {
   410  		match := r.finder.next(s[i:])
   411  		if match == -1 {
   412  			break
   413  		}
   414  		wn, err = sw.WriteString(s[i : i+match])
   415  		n += wn
   416  		if err != nil {
   417  			return
   418  		}
   419  		wn, err = sw.WriteString(r.value)
   420  		n += wn
   421  		if err != nil {
   422  			return
   423  		}
   424  		i += match + len(r.finder.pattern)
   425  	}
   426  	wn, err = sw.WriteString(s[i:])
   427  	n += wn
   428  	return
   429  }
   430  
   431  // byteReplacer is the implementation that's used when all the "old"
   432  // and "new" values are single ASCII bytes.
   433  // The array contains replacement bytes indexed by old byte.
   434  type byteReplacer [256]byte
   435  
   436  func (r *byteReplacer) Replace(s string) string {
   437  	var buf []byte // lazily allocated
   438  	for i := 0; i < len(s); i++ {
   439  		b := s[i]
   440  		if r[b] != b {
   441  			if buf == nil {
   442  				buf = []byte(s)
   443  			}
   444  			buf[i] = r[b]
   445  		}
   446  	}
   447  	if buf == nil {
   448  		return s
   449  	}
   450  	return string(buf)
   451  }
   452  
   453  func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   454  	// TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation.
   455  	bufsize := 32 << 10
   456  	if len(s) < bufsize {
   457  		bufsize = len(s)
   458  	}
   459  	buf := make([]byte, bufsize)
   460  
   461  	for len(s) > 0 {
   462  		ncopy := copy(buf, s)
   463  		s = s[ncopy:]
   464  		for i, b := range buf[:ncopy] {
   465  			buf[i] = r[b]
   466  		}
   467  		wn, err := w.Write(buf[:ncopy])
   468  		n += wn
   469  		if err != nil {
   470  			return n, err
   471  		}
   472  	}
   473  	return n, nil
   474  }
   475  
// byteStringReplacer is the implementation that's used when all the
// "old" values are single bytes but the "new" values vary in size.
type byteStringReplacer struct {
	// replacements contains replacement byte slices indexed by old byte.
	// A nil []byte means that the old byte should not be replaced.
	replacements [256][]byte
	// toReplace keeps a list of bytes to replace. Depending on length of toReplace
	// and length of target string it may be faster to use Count, or a plain loop.
	// We store single byte as a string, because Count takes a string.
	// Each byte to replace appears in the list once.
	toReplace []string
}
   487  
// countCutOff controls the ratio of a string length to a number of replacements
// at which (*byteStringReplacer).Replace switches algorithms.
// For strings whose ratio of length to replacements is at least that value,
// we call Count for each replacement from toReplace.
// For strings with a lower ratio we use a simple loop, because of Count overhead.
// countCutOff is an empirically determined overhead multiplier.
// TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
const countCutOff = 8
   496  
   497  func (r *byteStringReplacer) Replace(s string) string {
   498  	newSize := len(s)
   499  	anyChanges := false
   500  	// Is it faster to use Count?
   501  	if len(r.toReplace)*countCutOff <= len(s) {
   502  		for _, x := range r.toReplace {
   503  			if c := Count(s, x); c != 0 {
   504  				// The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
   505  				newSize += c * (len(r.replacements[x[0]]) - 1)
   506  				anyChanges = true
   507  			}
   508  
   509  		}
   510  	} else {
   511  		for i := 0; i < len(s); i++ {
   512  			b := s[i]
   513  			if r.replacements[b] != nil {
   514  				// See above for explanation of -1
   515  				newSize += len(r.replacements[b]) - 1
   516  				anyChanges = true
   517  			}
   518  		}
   519  	}
   520  	if !anyChanges {
   521  		return s
   522  	}
   523  	buf := make([]byte, newSize)
   524  	j := 0
   525  	for i := 0; i < len(s); i++ {
   526  		b := s[i]
   527  		if r.replacements[b] != nil {
   528  			j += copy(buf[j:], r.replacements[b])
   529  		} else {
   530  			buf[j] = b
   531  			j++
   532  		}
   533  	}
   534  	return string(buf)
   535  }
   536  
   537  func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   538  	sw := getStringWriter(w)
   539  	last := 0
   540  	for i := 0; i < len(s); i++ {
   541  		b := s[i]
   542  		if r.replacements[b] == nil {
   543  			continue
   544  		}
   545  		if last != i {
   546  			nw, err := sw.WriteString(s[last:i])
   547  			n += nw
   548  			if err != nil {
   549  				return n, err
   550  			}
   551  		}
   552  		last = i + 1
   553  		nw, err := w.Write(r.replacements[b])
   554  		n += nw
   555  		if err != nil {
   556  			return n, err
   557  		}
   558  	}
   559  	if last != len(s) {
   560  		var nw int
   561  		nw, err = sw.WriteString(s[last:])
   562  		n += nw
   563  	}
   564  	return
   565  }
   566  

View as plain text