Run Format

Source file src/pkg/strings/replace.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package strings
     6	
     7	import "io"
     8	
     9	// A Replacer replaces a list of strings with replacements.
    10	type Replacer struct {
    11		r replacer
    12	}
    13	
    14	// replacer is the interface that a replacement algorithm needs to implement.
    15	type replacer interface {
    16		Replace(s string) string
    17		WriteString(w io.Writer, s string) (n int, err error)
    18	}
    19	
    20	// byteBitmap represents bytes which are sought for replacement.
    21	// byteBitmap is 256 bits wide, with a bit set for each old byte to be
    22	// replaced.
    23	type byteBitmap [256 / 32]uint32
    24	
    25	func (m *byteBitmap) set(b byte) {
    26		m[b>>5] |= uint32(1 << (b & 31))
    27	}
    28	
    29	// NewReplacer returns a new Replacer from a list of old, new string pairs.
    30	// Replacements are performed in order, without overlapping matches.
    31	func NewReplacer(oldnew ...string) *Replacer {
    32		if len(oldnew)%2 == 1 {
    33			panic("strings.NewReplacer: odd argument count")
    34		}
    35	
    36		if len(oldnew) == 2 && len(oldnew[0]) > 1 {
    37			return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])}
    38		}
    39	
    40		allNewBytes := true
    41		for i := 0; i < len(oldnew); i += 2 {
    42			if len(oldnew[i]) != 1 {
    43				return &Replacer{r: makeGenericReplacer(oldnew)}
    44			}
    45			if len(oldnew[i+1]) != 1 {
    46				allNewBytes = false
    47			}
    48		}
    49	
    50		if allNewBytes {
    51			bb := &byteReplacer{}
    52			for i := 0; i < len(oldnew); i += 2 {
    53				o, n := oldnew[i][0], oldnew[i+1][0]
    54				if bb.old[o>>5]&uint32(1<<(o&31)) != 0 {
    55					// Later old->new maps do not override previous ones with the same old string.
    56					continue
    57				}
    58				bb.old.set(o)
    59				bb.new[o] = n
    60			}
    61			return &Replacer{r: bb}
    62		}
    63	
    64		bs := &byteStringReplacer{}
    65		for i := 0; i < len(oldnew); i += 2 {
    66			o, new := oldnew[i][0], oldnew[i+1]
    67			if bs.old[o>>5]&uint32(1<<(o&31)) != 0 {
    68				// Later old->new maps do not override previous ones with the same old string.
    69				continue
    70			}
    71			bs.old.set(o)
    72			bs.new[o] = []byte(new)
    73		}
    74		return &Replacer{r: bs}
    75	}
    76	
    77	// Replace returns a copy of s with all replacements performed.
    78	func (r *Replacer) Replace(s string) string {
    79		return r.r.Replace(s)
    80	}
    81	
    82	// WriteString writes s to w with all replacements performed.
    83	func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
    84		return r.r.WriteString(w, s)
    85	}
    86	
    87	// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
    88	// and values may be empty. For example, the trie containing keys "ax", "ay",
    89	// "bcbc", "x" and "xy" could have eight nodes:
    90	//
    91	//  n0  -
    92	//  n1  a-
    93	//  n2  .x+
    94	//  n3  .y+
    95	//  n4  b-
    96	//  n5  .cbc+
    97	//  n6  x+
    98	//  n7  .y+
    99	//
   100	// n0 is the root node, and its children are n1, n4 and n6; n1's children are
   101	// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
   102	// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
   103	// (marked with a trailing "+") are complete keys.
   104	type trieNode struct {
   105		// value is the value of the trie node's key/value pair. It is empty if
   106		// this node is not a complete key.
   107		value string
   108		// priority is the priority (higher is more important) of the trie node's
   109		// key/value pair; keys are not necessarily matched shortest- or longest-
   110		// first. Priority is positive if this node is a complete key, and zero
   111		// otherwise. In the example above, positive/zero priorities are marked
   112		// with a trailing "+" or "-".
   113		priority int
   114	
   115		// A trie node may have zero, one or more child nodes:
   116		//  * if the remaining fields are zero, there are no children.
   117		//  * if prefix and next are non-zero, there is one child in next.
   118		//  * if table is non-zero, it defines all the children.
   119		//
   120		// Prefixes are preferred over tables when there is one child, but the
   121		// root node always uses a table for lookup efficiency.
   122	
   123		// prefix is the difference in keys between this trie node and the next.
   124		// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
   125		// Node n5 has no children and so has zero prefix, next and table fields.
   126		prefix string
   127		next   *trieNode
   128	
   129		// table is a lookup table indexed by the next byte in the key, after
   130		// remapping that byte through genericReplacer.mapping to create a dense
   131		// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
   132		// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
   133		// genericReplacer.tableSize will be 5. Node n0's table will be
   134		// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
   135		// 'a', 'b' and 'x'.
   136		table []*trieNode
   137	}
   138	
   139	func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
   140		if key == "" {
   141			if t.priority == 0 {
   142				t.value = val
   143				t.priority = priority
   144			}
   145			return
   146		}
   147	
   148		if t.prefix != "" {
   149			// Need to split the prefix among multiple nodes.
   150			var n int // length of the longest common prefix
   151			for ; n < len(t.prefix) && n < len(key); n++ {
   152				if t.prefix[n] != key[n] {
   153					break
   154				}
   155			}
   156			if n == len(t.prefix) {
   157				t.next.add(key[n:], val, priority, r)
   158			} else if n == 0 {
   159				// First byte differs, start a new lookup table here. Looking up
   160				// what is currently t.prefix[0] will lead to prefixNode, and
   161				// looking up key[0] will lead to keyNode.
   162				var prefixNode *trieNode
   163				if len(t.prefix) == 1 {
   164					prefixNode = t.next
   165				} else {
   166					prefixNode = &trieNode{
   167						prefix: t.prefix[1:],
   168						next:   t.next,
   169					}
   170				}
   171				keyNode := new(trieNode)
   172				t.table = make([]*trieNode, r.tableSize)
   173				t.table[r.mapping[t.prefix[0]]] = prefixNode
   174				t.table[r.mapping[key[0]]] = keyNode
   175				t.prefix = ""
   176				t.next = nil
   177				keyNode.add(key[1:], val, priority, r)
   178			} else {
   179				// Insert new node after the common section of the prefix.
   180				next := &trieNode{
   181					prefix: t.prefix[n:],
   182					next:   t.next,
   183				}
   184				t.prefix = t.prefix[:n]
   185				t.next = next
   186				next.add(key[n:], val, priority, r)
   187			}
   188		} else if t.table != nil {
   189			// Insert into existing table.
   190			m := r.mapping[key[0]]
   191			if t.table[m] == nil {
   192				t.table[m] = new(trieNode)
   193			}
   194			t.table[m].add(key[1:], val, priority, r)
   195		} else {
   196			t.prefix = key
   197			t.next = new(trieNode)
   198			t.next.add("", val, priority, r)
   199		}
   200	}
   201	
   202	func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
   203		// Iterate down the trie to the end, and grab the value and keylen with
   204		// the highest priority.
   205		bestPriority := 0
   206		node := &r.root
   207		n := 0
   208		for node != nil {
   209			if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
   210				bestPriority = node.priority
   211				val = node.value
   212				keylen = n
   213				found = true
   214			}
   215	
   216			if s == "" {
   217				break
   218			}
   219			if node.table != nil {
   220				index := r.mapping[s[0]]
   221				if int(index) == r.tableSize {
   222					break
   223				}
   224				node = node.table[index]
   225				s = s[1:]
   226				n++
   227			} else if node.prefix != "" && HasPrefix(s, node.prefix) {
   228				n += len(node.prefix)
   229				s = s[len(node.prefix):]
   230				node = node.next
   231			} else {
   232				break
   233			}
   234		}
   235		return
   236	}
   237	
   238	// genericReplacer is the fully generic algorithm.
   239	// It's used as a fallback when nothing faster can be used.
   240	type genericReplacer struct {
   241		root trieNode
   242		// tableSize is the size of a trie node's lookup table. It is the number
   243		// of unique key bytes.
   244		tableSize int
   245		// mapping maps from key bytes to a dense index for trieNode.table.
   246		mapping [256]byte
   247	}
   248	
   249	func makeGenericReplacer(oldnew []string) *genericReplacer {
   250		r := new(genericReplacer)
   251		// Find each byte used, then assign them each an index.
   252		for i := 0; i < len(oldnew); i += 2 {
   253			key := oldnew[i]
   254			for j := 0; j < len(key); j++ {
   255				r.mapping[key[j]] = 1
   256			}
   257		}
   258	
   259		for _, b := range r.mapping {
   260			r.tableSize += int(b)
   261		}
   262	
   263		var index byte
   264		for i, b := range r.mapping {
   265			if b == 0 {
   266				r.mapping[i] = byte(r.tableSize)
   267			} else {
   268				r.mapping[i] = index
   269				index++
   270			}
   271		}
   272		// Ensure root node uses a lookup table (for performance).
   273		r.root.table = make([]*trieNode, r.tableSize)
   274	
   275		for i := 0; i < len(oldnew); i += 2 {
   276			r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
   277		}
   278		return r
   279	}
   280	
   281	type appendSliceWriter []byte
   282	
   283	// Write writes to the buffer to satisfy io.Writer.
   284	func (w *appendSliceWriter) Write(p []byte) (int, error) {
   285		*w = append(*w, p...)
   286		return len(p), nil
   287	}
   288	
   289	// WriteString writes to the buffer without string->[]byte->string allocations.
   290	func (w *appendSliceWriter) WriteString(s string) (int, error) {
   291		*w = append(*w, s...)
   292		return len(s), nil
   293	}
   294	
   295	type stringWriterIface interface {
   296		WriteString(string) (int, error)
   297	}
   298	
   299	type stringWriter struct {
   300		w io.Writer
   301	}
   302	
   303	func (w stringWriter) WriteString(s string) (int, error) {
   304		return w.w.Write([]byte(s))
   305	}
   306	
   307	func getStringWriter(w io.Writer) stringWriterIface {
   308		sw, ok := w.(stringWriterIface)
   309		if !ok {
   310			sw = stringWriter{w}
   311		}
   312		return sw
   313	}
   314	
   315	func (r *genericReplacer) Replace(s string) string {
   316		buf := make(appendSliceWriter, 0, len(s))
   317		r.WriteString(&buf, s)
   318		return string(buf)
   319	}
   320	
   321	func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   322		sw := getStringWriter(w)
   323		var last, wn int
   324		var prevMatchEmpty bool
   325		for i := 0; i <= len(s); {
   326			// Ignore the empty match iff the previous loop found the empty match.
   327			val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
   328			prevMatchEmpty = match && keylen == 0
   329			if match {
   330				wn, err = sw.WriteString(s[last:i])
   331				n += wn
   332				if err != nil {
   333					return
   334				}
   335				wn, err = sw.WriteString(val)
   336				n += wn
   337				if err != nil {
   338					return
   339				}
   340				i += keylen
   341				last = i
   342				continue
   343			}
   344			i++
   345		}
   346		if last != len(s) {
   347			wn, err = sw.WriteString(s[last:])
   348			n += wn
   349		}
   350		return
   351	}
   352	
   353	// singleStringReplacer is the implementation that's used when there is only
   354	// one string to replace (and that string has more than one byte).
   355	type singleStringReplacer struct {
   356		finder *stringFinder
   357		// value is the new string that replaces that pattern when it's found.
   358		value string
   359	}
   360	
   361	func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
   362		return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
   363	}
   364	
   365	func (r *singleStringReplacer) Replace(s string) string {
   366		var buf []byte
   367		i, matched := 0, false
   368		for {
   369			match := r.finder.next(s[i:])
   370			if match == -1 {
   371				break
   372			}
   373			matched = true
   374			buf = append(buf, s[i:i+match]...)
   375			buf = append(buf, r.value...)
   376			i += match + len(r.finder.pattern)
   377		}
   378		if !matched {
   379			return s
   380		}
   381		buf = append(buf, s[i:]...)
   382		return string(buf)
   383	}
   384	
   385	func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   386		sw := getStringWriter(w)
   387		var i, wn int
   388		for {
   389			match := r.finder.next(s[i:])
   390			if match == -1 {
   391				break
   392			}
   393			wn, err = sw.WriteString(s[i : i+match])
   394			n += wn
   395			if err != nil {
   396				return
   397			}
   398			wn, err = sw.WriteString(r.value)
   399			n += wn
   400			if err != nil {
   401				return
   402			}
   403			i += match + len(r.finder.pattern)
   404		}
   405		wn, err = sw.WriteString(s[i:])
   406		n += wn
   407		return
   408	}
   409	
   410	// byteReplacer is the implementation that's used when all the "old"
   411	// and "new" values are single ASCII bytes.
   412	type byteReplacer struct {
   413		// old has a bit set for each old byte that should be replaced.
   414		old byteBitmap
   415	
   416		// replacement byte, indexed by old byte. only valid if
   417		// corresponding old bit is set.
   418		new [256]byte
   419	}
   420	
   421	func (r *byteReplacer) Replace(s string) string {
   422		var buf []byte // lazily allocated
   423		for i := 0; i < len(s); i++ {
   424			b := s[i]
   425			if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
   426				if buf == nil {
   427					buf = []byte(s)
   428				}
   429				buf[i] = r.new[b]
   430			}
   431		}
   432		if buf == nil {
   433			return s
   434		}
   435		return string(buf)
   436	}
   437	
   438	func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   439		// TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation.
   440		bufsize := 32 << 10
   441		if len(s) < bufsize {
   442			bufsize = len(s)
   443		}
   444		buf := make([]byte, bufsize)
   445	
   446		for len(s) > 0 {
   447			ncopy := copy(buf, s[:])
   448			s = s[ncopy:]
   449			for i, b := range buf[:ncopy] {
   450				if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
   451					buf[i] = r.new[b]
   452				}
   453			}
   454			wn, err := w.Write(buf[:ncopy])
   455			n += wn
   456			if err != nil {
   457				return n, err
   458			}
   459		}
   460		return n, nil
   461	}
   462	
   463	// byteStringReplacer is the implementation that's used when all the
   464	// "old" values are single ASCII bytes but the "new" values vary in
   465	// size.
   466	type byteStringReplacer struct {
   467		// old has a bit set for each old byte that should be replaced.
   468		old byteBitmap
   469	
   470		// replacement string, indexed by old byte. only valid if
   471		// corresponding old bit is set.
   472		new [256][]byte
   473	}
   474	
   475	func (r *byteStringReplacer) Replace(s string) string {
   476		newSize := 0
   477		anyChanges := false
   478		for i := 0; i < len(s); i++ {
   479			b := s[i]
   480			if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
   481				anyChanges = true
   482				newSize += len(r.new[b])
   483			} else {
   484				newSize++
   485			}
   486		}
   487		if !anyChanges {
   488			return s
   489		}
   490		buf := make([]byte, newSize)
   491		bi := buf
   492		for i := 0; i < len(s); i++ {
   493			b := s[i]
   494			if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
   495				n := copy(bi[:], r.new[b])
   496				bi = bi[n:]
   497			} else {
   498				bi[0] = b
   499				bi = bi[1:]
   500			}
   501		}
   502		return string(buf)
   503	}
   504	
   505	// WriteString maintains one buffer that's at most 32KB.  The bytes in
   506	// s are enumerated and the buffer is filled.  If it reaches its
   507	// capacity or a byte has a replacement, the buffer is flushed to w.
   508	func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   509		// TODO(bradfitz): use io.WriteString with slices of s instead.
   510		bufsize := 32 << 10
   511		if len(s) < bufsize {
   512			bufsize = len(s)
   513		}
   514		buf := make([]byte, bufsize)
   515		bi := buf[:0]
   516	
   517		for i := 0; i < len(s); i++ {
   518			b := s[i]
   519			var new []byte
   520			if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
   521				new = r.new[b]
   522			} else {
   523				bi = append(bi, b)
   524			}
   525			if len(bi) == cap(bi) || (len(bi) > 0 && len(new) > 0) {
   526				nw, err := w.Write(bi)
   527				n += nw
   528				if err != nil {
   529					return n, err
   530				}
   531				bi = buf[:0]
   532			}
   533			if len(new) > 0 {
   534				nw, err := w.Write(new)
   535				n += nw
   536				if err != nil {
   537					return n, err
   538				}
   539			}
   540		}
   541		if len(bi) > 0 {
   542			nw, err := w.Write(bi)
   543			n += nw
   544			if err != nil {
   545				return n, err
   546			}
   547		}
   548		return n, nil
   549	}

View as plain text