backtrack.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // backtrack is a regular expression search with submatch
     6  // tracking for small regular expressions and texts. It allocates
     7  // a bit vector with (length of input) * (length of prog) bits,
     8  // to make sure it never explores the same (character position, instruction)
     9  // state multiple times. This limits the search to run in time linear in
    10  // the length of the test.
    11  //
    12  // backtrack is a fast replacement for the NFA code on small
    13  // regexps when onepass cannot be used.
    14  
    15  package regexp
    16  
    17  import (
    18  	"regexp/syntax"
    19  	"sync"
    20  )
    21  
    22  // A job is an entry on the backtracker's job stack. It holds
    23  // the instruction pc and the position in the input.
    24  type job struct {
    25  	pc  uint32
    26  	arg bool
    27  	pos int
    28  }
    29  
    30  const (
    31  	visitedBits        = 32
    32  	maxBacktrackProg   = 500        // len(prog.Inst) <= max
    33  	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
    34  )
    35  
    36  // bitState holds state for the backtracker.
    37  type bitState struct {
    38  	end      int
    39  	cap      []int
    40  	matchcap []int
    41  	jobs     []job
    42  	visited  []uint32
    43  
    44  	inputs inputs
    45  }
    46  
    47  var bitStatePool sync.Pool
    48  
    49  func newBitState() *bitState {
    50  	b, ok := bitStatePool.Get().(*bitState)
    51  	if !ok {
    52  		b = new(bitState)
    53  	}
    54  	return b
    55  }
    56  
    57  func freeBitState(b *bitState) {
    58  	b.inputs.clear()
    59  	bitStatePool.Put(b)
    60  }
    61  
    62  // maxBitStateLen returns the maximum length of a string to search with
    63  // the backtracker using prog.
    64  func maxBitStateLen(prog *syntax.Prog) int {
    65  	if !shouldBacktrack(prog) {
    66  		return 0
    67  	}
    68  	return maxBacktrackVector / len(prog.Inst)
    69  }
    70  
    71  // shouldBacktrack reports whether the program is too
    72  // long for the backtracker to run.
    73  func shouldBacktrack(prog *syntax.Prog) bool {
    74  	return len(prog.Inst) <= maxBacktrackProg
    75  }
    76  
    77  // reset resets the state of the backtracker.
    78  // end is the end position in the input.
    79  // ncap is the number of captures.
    80  func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
    81  	b.end = end
    82  
    83  	if cap(b.jobs) == 0 {
    84  		b.jobs = make([]job, 0, 256)
    85  	} else {
    86  		b.jobs = b.jobs[:0]
    87  	}
    88  
    89  	visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
    90  	if cap(b.visited) < visitedSize {
    91  		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
    92  	} else {
    93  		b.visited = b.visited[:visitedSize]
    94  		clear(b.visited) // set to 0
    95  	}
    96  
    97  	if cap(b.cap) < ncap {
    98  		b.cap = make([]int, ncap)
    99  	} else {
   100  		b.cap = b.cap[:ncap]
   101  	}
   102  	for i := range b.cap {
   103  		b.cap[i] = -1
   104  	}
   105  
   106  	if cap(b.matchcap) < ncap {
   107  		b.matchcap = make([]int, ncap)
   108  	} else {
   109  		b.matchcap = b.matchcap[:ncap]
   110  	}
   111  	for i := range b.matchcap {
   112  		b.matchcap[i] = -1
   113  	}
   114  }
   115  
   116  // shouldVisit reports whether the combination of (pc, pos) has not
   117  // been visited yet.
   118  func (b *bitState) shouldVisit(pc uint32, pos int) bool {
   119  	n := uint(int(pc)*(b.end+1) + pos)
   120  	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
   121  		return false
   122  	}
   123  	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
   124  	return true
   125  }
   126  
   127  // push pushes (pc, pos, arg) onto the job stack if it should be
   128  // visited.
   129  func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
   130  	// Only check shouldVisit when arg is false.
   131  	// When arg is true, we are continuing a previous visit.
   132  	if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
   133  		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
   134  	}
   135  }
   136  
   137  // tryBacktrack runs a backtracking search starting at pos.
   138  func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
   139  	longest := re.longest
   140  
   141  	b.push(re, pc, pos, false)
   142  	for len(b.jobs) > 0 {
   143  		l := len(b.jobs) - 1
   144  		// Pop job off the stack.
   145  		pc := b.jobs[l].pc
   146  		pos := b.jobs[l].pos
   147  		arg := b.jobs[l].arg
   148  		b.jobs = b.jobs[:l]
   149  
   150  		// Optimization: rather than push and pop,
   151  		// code that is going to Push and continue
   152  		// the loop simply updates ip, p, and arg
   153  		// and jumps to CheckAndLoop. We have to
   154  		// do the ShouldVisit check that Push
   155  		// would have, but we avoid the stack
   156  		// manipulation.
   157  		goto Skip
   158  	CheckAndLoop:
   159  		if !b.shouldVisit(pc, pos) {
   160  			continue
   161  		}
   162  	Skip:
   163  
   164  		inst := &re.prog.Inst[pc]
   165  
   166  		switch inst.Op {
   167  		default:
   168  			panic("bad inst")
   169  		case syntax.InstFail:
   170  			panic("unexpected InstFail")
   171  		case syntax.InstAlt:
   172  			// Cannot just
   173  			//   b.push(inst.Out, pos, false)
   174  			//   b.push(inst.Arg, pos, false)
   175  			// If during the processing of inst.Out, we encounter
   176  			// inst.Arg via another path, we want to process it then.
   177  			// Pushing it here will inhibit that. Instead, re-push
   178  			// inst with arg==true as a reminder to push inst.Arg out
   179  			// later.
   180  			if arg {
   181  				// Finished inst.Out; try inst.Arg.
   182  				arg = false
   183  				pc = inst.Arg
   184  				goto CheckAndLoop
   185  			} else {
   186  				b.push(re, pc, pos, true)
   187  				pc = inst.Out
   188  				goto CheckAndLoop
   189  			}
   190  
   191  		case syntax.InstAltMatch:
   192  			// One opcode consumes runes; the other leads to match.
   193  			switch re.prog.Inst[inst.Out].Op {
   194  			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   195  				// inst.Arg is the match.
   196  				b.push(re, inst.Arg, pos, false)
   197  				pc = inst.Arg
   198  				pos = b.end
   199  				goto CheckAndLoop
   200  			}
   201  			// inst.Out is the match - non-greedy
   202  			b.push(re, inst.Out, b.end, false)
   203  			pc = inst.Out
   204  			goto CheckAndLoop
   205  
   206  		case syntax.InstRune:
   207  			r, width := i.step(pos)
   208  			if !inst.MatchRune(r) {
   209  				continue
   210  			}
   211  			pos += width
   212  			pc = inst.Out
   213  			goto CheckAndLoop
   214  
   215  		case syntax.InstRune1:
   216  			r, width := i.step(pos)
   217  			if r != inst.Rune[0] {
   218  				continue
   219  			}
   220  			pos += width
   221  			pc = inst.Out
   222  			goto CheckAndLoop
   223  
   224  		case syntax.InstRuneAnyNotNL:
   225  			r, width := i.step(pos)
   226  			if r == '\n' || r == endOfText {
   227  				continue
   228  			}
   229  			pos += width
   230  			pc = inst.Out
   231  			goto CheckAndLoop
   232  
   233  		case syntax.InstRuneAny:
   234  			r, width := i.step(pos)
   235  			if r == endOfText {
   236  				continue
   237  			}
   238  			pos += width
   239  			pc = inst.Out
   240  			goto CheckAndLoop
   241  
   242  		case syntax.InstCapture:
   243  			if arg {
   244  				// Finished inst.Out; restore the old value.
   245  				b.cap[inst.Arg] = pos
   246  				continue
   247  			} else {
   248  				if inst.Arg < uint32(len(b.cap)) {
   249  					// Capture pos to register, but save old value.
   250  					b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
   251  					b.cap[inst.Arg] = pos
   252  				}
   253  				pc = inst.Out
   254  				goto CheckAndLoop
   255  			}
   256  
   257  		case syntax.InstEmptyWidth:
   258  			flag := i.context(pos)
   259  			if !flag.match(syntax.EmptyOp(inst.Arg)) {
   260  				continue
   261  			}
   262  			pc = inst.Out
   263  			goto CheckAndLoop
   264  
   265  		case syntax.InstNop:
   266  			pc = inst.Out
   267  			goto CheckAndLoop
   268  
   269  		case syntax.InstMatch:
   270  			// We found a match. If the caller doesn't care
   271  			// where the match is, no point going further.
   272  			if len(b.cap) == 0 {
   273  				return true
   274  			}
   275  
   276  			// Record best match so far.
   277  			// Only need to check end point, because this entire
   278  			// call is only considering one start position.
   279  			if len(b.cap) > 1 {
   280  				b.cap[1] = pos
   281  			}
   282  			if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
   283  				copy(b.matchcap, b.cap)
   284  			}
   285  
   286  			// If going for first match, we're done.
   287  			if !longest {
   288  				return true
   289  			}
   290  
   291  			// If we used the entire text, no longer match is possible.
   292  			if pos == b.end {
   293  				return true
   294  			}
   295  
   296  			// Otherwise, continue on in hope of a longer match.
   297  			continue
   298  		}
   299  	}
   300  
   301  	return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
   302  }
   303  
   304  // backtrack runs a backtracking search of prog on the input starting at pos.
   305  func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
   306  	startCond := re.cond
   307  	if startCond == ^syntax.EmptyOp(0) { // impossible
   308  		return nil
   309  	}
   310  	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   311  		// Anchored match, past beginning of text.
   312  		return nil
   313  	}
   314  
   315  	b := newBitState()
   316  	i, end := b.inputs.init(nil, ib, is)
   317  	b.reset(re.prog, end, ncap)
   318  
   319  	// Anchored search must start at the beginning of the input
   320  	if startCond&syntax.EmptyBeginText != 0 {
   321  		if len(b.cap) > 0 {
   322  			b.cap[0] = pos
   323  		}
   324  		if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   325  			freeBitState(b)
   326  			return nil
   327  		}
   328  	} else {
   329  
   330  		// Unanchored search, starting from each possible text position.
   331  		// Notice that we have to try the empty string at the end of
   332  		// the text, so the loop condition is pos <= end, not pos < end.
   333  		// This looks like it's quadratic in the size of the text,
   334  		// but we are not clearing visited between calls to TrySearch,
   335  		// so no work is duplicated and it ends up still being linear.
   336  		width := -1
   337  		for ; pos <= end && width != 0; pos += width {
   338  			if len(re.prefix) > 0 {
   339  				// Match requires literal prefix; fast search for it.
   340  				advance := i.index(re, pos)
   341  				if advance < 0 {
   342  					freeBitState(b)
   343  					return nil
   344  				}
   345  				pos += advance
   346  			}
   347  
   348  			if len(b.cap) > 0 {
   349  				b.cap[0] = pos
   350  			}
   351  			if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   352  				// Match must be leftmost; done.
   353  				goto Match
   354  			}
   355  			_, width = i.step(pos)
   356  		}
   357  		freeBitState(b)
   358  		return nil
   359  	}
   360  
   361  Match:
   362  	dstCap = append(dstCap, b.matchcap...)
   363  	freeBitState(b)
   364  	return dstCap
   365  }
   366
View as plain text