...
Run Format

Source file src/regexp/backtrack.go

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// backtrack is a regular expression search with submatch
     6	// tracking for small regular expressions and texts. It allocates
     7	// a bit vector with (length of input) * (length of prog) bits,
     8	// to make sure it never explores the same (character position, instruction)
     9	// state multiple times. This limits the search to run in time linear in
    10	// the length of the test.
    11	//
    12	// backtrack is a fast replacement for the NFA code on small
    13	// regexps when onepass cannot be used.
    14	
    15	package regexp
    16	
    17	import "regexp/syntax"
    18	
    19	// A job is an entry on the backtracker's job stack. It holds
    20	// the instruction pc and the position in the input.
    21	type job struct {
    22		pc  uint32
    23		arg int
    24		pos int
    25	}
    26	
    27	const (
    28		visitedBits        = 32
    29		maxBacktrackProg   = 500        // len(prog.Inst) <= max
    30		maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
    31	)
    32	
    33	// bitState holds state for the backtracker.
    34	type bitState struct {
    35		prog *syntax.Prog
    36	
    37		end     int
    38		cap     []int
    39		jobs    []job
    40		visited []uint32
    41	}
    42	
    43	var notBacktrack *bitState = nil
    44	
    45	// maxBitStateLen returns the maximum length of a string to search with
    46	// the backtracker using prog.
    47	func maxBitStateLen(prog *syntax.Prog) int {
    48		if !shouldBacktrack(prog) {
    49			return 0
    50		}
    51		return maxBacktrackVector / len(prog.Inst)
    52	}
    53	
    54	// newBitState returns a new bitState for the given prog,
    55	// or notBacktrack if the size of the prog exceeds the maximum size that
    56	// the backtracker will be run for.
    57	func newBitState(prog *syntax.Prog) *bitState {
    58		if !shouldBacktrack(prog) {
    59			return notBacktrack
    60		}
    61		return &bitState{
    62			prog: prog,
    63		}
    64	}
    65	
    66	// shouldBacktrack reports whether the program is too
    67	// long for the backtracker to run.
    68	func shouldBacktrack(prog *syntax.Prog) bool {
    69		return len(prog.Inst) <= maxBacktrackProg
    70	}
    71	
    72	// reset resets the state of the backtracker.
    73	// end is the end position in the input.
    74	// ncap is the number of captures.
    75	func (b *bitState) reset(end int, ncap int) {
    76		b.end = end
    77	
    78		if cap(b.jobs) == 0 {
    79			b.jobs = make([]job, 0, 256)
    80		} else {
    81			b.jobs = b.jobs[:0]
    82		}
    83	
    84		visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
    85		if cap(b.visited) < visitedSize {
    86			b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
    87		} else {
    88			b.visited = b.visited[:visitedSize]
    89			for i := range b.visited {
    90				b.visited[i] = 0
    91			}
    92		}
    93	
    94		if cap(b.cap) < ncap {
    95			b.cap = make([]int, ncap)
    96		} else {
    97			b.cap = b.cap[:ncap]
    98		}
    99		for i := range b.cap {
   100			b.cap[i] = -1
   101		}
   102	}
   103	
   104	// shouldVisit reports whether the combination of (pc, pos) has not
   105	// been visited yet.
   106	func (b *bitState) shouldVisit(pc uint32, pos int) bool {
   107		n := uint(int(pc)*(b.end+1) + pos)
   108		if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
   109			return false
   110		}
   111		b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
   112		return true
   113	}
   114	
   115	// push pushes (pc, pos, arg) onto the job stack if it should be
   116	// visited.
   117	func (b *bitState) push(pc uint32, pos int, arg int) {
   118		if b.prog.Inst[pc].Op == syntax.InstFail {
   119			return
   120		}
   121	
   122		// Only check shouldVisit when arg == 0.
   123		// When arg > 0, we are continuing a previous visit.
   124		if arg == 0 && !b.shouldVisit(pc, pos) {
   125			return
   126		}
   127	
   128		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
   129	}
   130	
   131	// tryBacktrack runs a backtracking search starting at pos.
   132	func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
   133		longest := m.re.longest
   134		m.matched = false
   135	
   136		b.push(pc, pos, 0)
   137		for len(b.jobs) > 0 {
   138			l := len(b.jobs) - 1
   139			// Pop job off the stack.
   140			pc := b.jobs[l].pc
   141			pos := b.jobs[l].pos
   142			arg := b.jobs[l].arg
   143			b.jobs = b.jobs[:l]
   144	
   145			// Optimization: rather than push and pop,
   146			// code that is going to Push and continue
   147			// the loop simply updates ip, p, and arg
   148			// and jumps to CheckAndLoop. We have to
   149			// do the ShouldVisit check that Push
   150			// would have, but we avoid the stack
   151			// manipulation.
   152			goto Skip
   153		CheckAndLoop:
   154			if !b.shouldVisit(pc, pos) {
   155				continue
   156			}
   157		Skip:
   158	
   159			inst := b.prog.Inst[pc]
   160	
   161			switch inst.Op {
   162			default:
   163				panic("bad inst")
   164			case syntax.InstFail:
   165				panic("unexpected InstFail")
   166			case syntax.InstAlt:
   167				// Cannot just
   168				//   b.push(inst.Out, pos, 0)
   169				//   b.push(inst.Arg, pos, 0)
   170				// If during the processing of inst.Out, we encounter
   171				// inst.Arg via another path, we want to process it then.
   172				// Pushing it here will inhibit that. Instead, re-push
   173				// inst with arg==1 as a reminder to push inst.Arg out
   174				// later.
   175				switch arg {
   176				case 0:
   177					b.push(pc, pos, 1)
   178					pc = inst.Out
   179					goto CheckAndLoop
   180				case 1:
   181					// Finished inst.Out; try inst.Arg.
   182					arg = 0
   183					pc = inst.Arg
   184					goto CheckAndLoop
   185				}
   186				panic("bad arg in InstAlt")
   187	
   188			case syntax.InstAltMatch:
   189				// One opcode consumes runes; the other leads to match.
   190				switch b.prog.Inst[inst.Out].Op {
   191				case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   192					// inst.Arg is the match.
   193					b.push(inst.Arg, pos, 0)
   194					pc = inst.Arg
   195					pos = b.end
   196					goto CheckAndLoop
   197				}
   198				// inst.Out is the match - non-greedy
   199				b.push(inst.Out, b.end, 0)
   200				pc = inst.Out
   201				goto CheckAndLoop
   202	
   203			case syntax.InstRune:
   204				r, width := i.step(pos)
   205				if !inst.MatchRune(r) {
   206					continue
   207				}
   208				pos += width
   209				pc = inst.Out
   210				goto CheckAndLoop
   211	
   212			case syntax.InstRune1:
   213				r, width := i.step(pos)
   214				if r != inst.Rune[0] {
   215					continue
   216				}
   217				pos += width
   218				pc = inst.Out
   219				goto CheckAndLoop
   220	
   221			case syntax.InstRuneAnyNotNL:
   222				r, width := i.step(pos)
   223				if r == '\n' || r == endOfText {
   224					continue
   225				}
   226				pos += width
   227				pc = inst.Out
   228				goto CheckAndLoop
   229	
   230			case syntax.InstRuneAny:
   231				r, width := i.step(pos)
   232				if r == endOfText {
   233					continue
   234				}
   235				pos += width
   236				pc = inst.Out
   237				goto CheckAndLoop
   238	
   239			case syntax.InstCapture:
   240				switch arg {
   241				case 0:
   242					if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
   243						// Capture pos to register, but save old value.
   244						b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
   245						b.cap[inst.Arg] = pos
   246					}
   247					pc = inst.Out
   248					goto CheckAndLoop
   249				case 1:
   250					// Finished inst.Out; restore the old value.
   251					b.cap[inst.Arg] = pos
   252					continue
   253	
   254				}
   255				panic("bad arg in InstCapture")
   256	
   257			case syntax.InstEmptyWidth:
   258				if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
   259					continue
   260				}
   261				pc = inst.Out
   262				goto CheckAndLoop
   263	
   264			case syntax.InstNop:
   265				pc = inst.Out
   266				goto CheckAndLoop
   267	
   268			case syntax.InstMatch:
   269				// We found a match. If the caller doesn't care
   270				// where the match is, no point going further.
   271				if len(b.cap) == 0 {
   272					m.matched = true
   273					return m.matched
   274				}
   275	
   276				// Record best match so far.
   277				// Only need to check end point, because this entire
   278				// call is only considering one start position.
   279				if len(b.cap) > 1 {
   280					b.cap[1] = pos
   281				}
   282				if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
   283					copy(m.matchcap, b.cap)
   284				}
   285				m.matched = true
   286	
   287				// If going for first match, we're done.
   288				if !longest {
   289					return m.matched
   290				}
   291	
   292				// If we used the entire text, no longer match is possible.
   293				if pos == b.end {
   294					return m.matched
   295				}
   296	
   297				// Otherwise, continue on in hope of a longer match.
   298				continue
   299			}
   300		}
   301	
   302		return m.matched
   303	}
   304	
   305	// backtrack runs a backtracking search of prog on the input starting at pos.
   306	func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
   307		if !i.canCheckPrefix() {
   308			panic("backtrack called for a RuneReader")
   309		}
   310	
   311		startCond := m.re.cond
   312		if startCond == ^syntax.EmptyOp(0) { // impossible
   313			return false
   314		}
   315		if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   316			// Anchored match, past beginning of text.
   317			return false
   318		}
   319	
   320		b := m.b
   321		b.reset(end, ncap)
   322	
   323		m.matchcap = m.matchcap[:ncap]
   324		for i := range m.matchcap {
   325			m.matchcap[i] = -1
   326		}
   327	
   328		// Anchored search must start at the beginning of the input
   329		if startCond&syntax.EmptyBeginText != 0 {
   330			if len(b.cap) > 0 {
   331				b.cap[0] = pos
   332			}
   333			return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
   334		}
   335	
   336		// Unanchored search, starting from each possible text position.
   337		// Notice that we have to try the empty string at the end of
   338		// the text, so the loop condition is pos <= end, not pos < end.
   339		// This looks like it's quadratic in the size of the text,
   340		// but we are not clearing visited between calls to TrySearch,
   341		// so no work is duplicated and it ends up still being linear.
   342		width := -1
   343		for ; pos <= end && width != 0; pos += width {
   344			if len(m.re.prefix) > 0 {
   345				// Match requires literal prefix; fast search for it.
   346				advance := i.index(m.re, pos)
   347				if advance < 0 {
   348					return false
   349				}
   350				pos += advance
   351			}
   352	
   353			if len(b.cap) > 0 {
   354				b.cap[0] = pos
   355			}
   356			if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
   357				// Match must be leftmost; done.
   358				return true
   359			}
   360			_, width = i.step(pos)
   361		}
   362		return false
   363	}
   364	

View as plain text