...
Run Format

Source file src/regexp/onepass.go

     1	// Copyright 2014 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package regexp
     6	
     7	import (
     8		"bytes"
     9		"regexp/syntax"
    10		"sort"
    11		"unicode"
    12	)
    13	
    14	// "One-pass" regexp execution.
    15	// Some regexps can be analyzed to determine that they never need
    16	// backtracking: they are guaranteed to run in one pass over the string
    17	// without bothering to save all the usual NFA state.
    18	// Detect those and execute them more quickly.
    19	
    20	// A onePassProg is a compiled one-pass regular expression program.
    21	// It is the same as syntax.Prog except for the use of onePassInst.
    22	type onePassProg struct {
    23		Inst   []onePassInst
    24		Start  int // index of start instruction
    25		NumCap int // number of InstCapture insts in re
    26	}
    27	
    28	// A onePassInst is a single instruction in a one-pass regular expression program.
    29	// It is the same as syntax.Inst except for the new 'Next' field.
    30	type onePassInst struct {
    31		syntax.Inst
    32		Next []uint32
    33	}
    34	
    35	// OnePassPrefix returns a literal string that all matches for the
    36	// regexp must start with. Complete is true if the prefix
    37	// is the entire match. Pc is the index of the last rune instruction
    38	// in the string. The OnePassPrefix skips over the mandatory
    39	// EmptyBeginText
    40	func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
    41		i := &p.Inst[p.Start]
    42		if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
    43			return "", i.Op == syntax.InstMatch, uint32(p.Start)
    44		}
    45		pc = i.Out
    46		i = &p.Inst[pc]
    47		for i.Op == syntax.InstNop {
    48			pc = i.Out
    49			i = &p.Inst[pc]
    50		}
    51		// Avoid allocation of buffer if prefix is empty.
    52		if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
    53			return "", i.Op == syntax.InstMatch, uint32(p.Start)
    54		}
    55	
    56		// Have prefix; gather characters.
    57		var buf bytes.Buffer
    58		for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
    59			buf.WriteRune(i.Rune[0])
    60			pc, i = i.Out, &p.Inst[i.Out]
    61		}
    62		if i.Op == syntax.InstEmptyWidth &&
    63			syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
    64			p.Inst[i.Out].Op == syntax.InstMatch {
    65			complete = true
    66		}
    67		return buf.String(), complete, pc
    68	}
    69	
    70	// OnePassNext selects the next actionable state of the prog, based on the input character.
    71	// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
    72	// One of the alternates may ultimately lead without input to end of line. If the instruction
    73	// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
    74	func onePassNext(i *onePassInst, r rune) uint32 {
    75		next := i.MatchRunePos(r)
    76		if next >= 0 {
    77			return i.Next[next]
    78		}
    79		if i.Op == syntax.InstAltMatch {
    80			return i.Out
    81		}
    82		return 0
    83	}
    84	
    85	func iop(i *syntax.Inst) syntax.InstOp {
    86		op := i.Op
    87		switch op {
    88		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    89			op = syntax.InstRune
    90		}
    91		return op
    92	}
    93	
    94	// Sparse Array implementation is used as a queueOnePass.
    95	type queueOnePass struct {
    96		sparse          []uint32
    97		dense           []uint32
    98		size, nextIndex uint32
    99	}
   100	
   101	func (q *queueOnePass) empty() bool {
   102		return q.nextIndex >= q.size
   103	}
   104	
   105	func (q *queueOnePass) next() (n uint32) {
   106		n = q.dense[q.nextIndex]
   107		q.nextIndex++
   108		return
   109	}
   110	
   111	func (q *queueOnePass) clear() {
   112		q.size = 0
   113		q.nextIndex = 0
   114	}
   115	
   116	func (q *queueOnePass) contains(u uint32) bool {
   117		if u >= uint32(len(q.sparse)) {
   118			return false
   119		}
   120		return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
   121	}
   122	
   123	func (q *queueOnePass) insert(u uint32) {
   124		if !q.contains(u) {
   125			q.insertNew(u)
   126		}
   127	}
   128	
   129	func (q *queueOnePass) insertNew(u uint32) {
   130		if u >= uint32(len(q.sparse)) {
   131			return
   132		}
   133		q.sparse[u] = q.size
   134		q.dense[q.size] = u
   135		q.size++
   136	}
   137	
   138	func newQueue(size int) (q *queueOnePass) {
   139		return &queueOnePass{
   140			sparse: make([]uint32, size),
   141			dense:  make([]uint32, size),
   142		}
   143	}
   144	
   145	// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
   146	// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
   147	// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
   148	// NextIp array with the single element mergeFailed is returned.
   149	// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
   150	const mergeFailed = uint32(0xffffffff)
   151	
   152	var (
   153		noRune = []rune{}
   154		noNext = []uint32{mergeFailed}
   155	)
   156	
   157	func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
   158		leftLen := len(*leftRunes)
   159		rightLen := len(*rightRunes)
   160		if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
   161			panic("mergeRuneSets odd length []rune")
   162		}
   163		var (
   164			lx, rx int
   165		)
   166		merged := make([]rune, 0)
   167		next := make([]uint32, 0)
   168		ok := true
   169		defer func() {
   170			if !ok {
   171				merged = nil
   172				next = nil
   173			}
   174		}()
   175	
   176		ix := -1
   177		extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
   178			if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
   179				return false
   180			}
   181			merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
   182			*newLow += 2
   183			ix += 2
   184			next = append(next, pc)
   185			return true
   186		}
   187	
   188		for lx < leftLen || rx < rightLen {
   189			switch {
   190			case rx >= rightLen:
   191				ok = extend(&lx, leftRunes, leftPC)
   192			case lx >= leftLen:
   193				ok = extend(&rx, rightRunes, rightPC)
   194			case (*rightRunes)[rx] < (*leftRunes)[lx]:
   195				ok = extend(&rx, rightRunes, rightPC)
   196			default:
   197				ok = extend(&lx, leftRunes, leftPC)
   198			}
   199			if !ok {
   200				return noRune, noNext
   201			}
   202		}
   203		return merged, next
   204	}
   205	
   206	// cleanupOnePass drops working memory, and restores certain shortcut instructions.
   207	func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
   208		for ix, instOriginal := range original.Inst {
   209			switch instOriginal.Op {
   210			case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
   211			case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
   212				prog.Inst[ix].Next = nil
   213			case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   214				prog.Inst[ix].Next = nil
   215				prog.Inst[ix] = onePassInst{Inst: instOriginal}
   216			}
   217		}
   218	}
   219	
   220	// onePassCopy creates a copy of the original Prog, as we'll be modifying it
   221	func onePassCopy(prog *syntax.Prog) *onePassProg {
   222		p := &onePassProg{
   223			Start:  prog.Start,
   224			NumCap: prog.NumCap,
   225		}
   226		for _, inst := range prog.Inst {
   227			p.Inst = append(p.Inst, onePassInst{Inst: inst})
   228		}
   229	
   230		// rewrites one or more common Prog constructs that enable some otherwise
   231		// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
   232		// ip A, that points to ips B & C.
   233		// A:BC + B:DA => A:BC + B:CD
   234		// A:BC + B:DC => A:DC + B:DC
   235		for pc := range p.Inst {
   236			switch p.Inst[pc].Op {
   237			default:
   238				continue
   239			case syntax.InstAlt, syntax.InstAltMatch:
   240				// A:Bx + B:Ay
   241				p_A_Other := &p.Inst[pc].Out
   242				p_A_Alt := &p.Inst[pc].Arg
   243				// make sure a target is another Alt
   244				instAlt := p.Inst[*p_A_Alt]
   245				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   246					p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
   247					instAlt = p.Inst[*p_A_Alt]
   248					if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   249						continue
   250					}
   251				}
   252				instOther := p.Inst[*p_A_Other]
   253				// Analyzing both legs pointing to Alts is for another day
   254				if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
   255					// too complicated
   256					continue
   257				}
   258				// simple empty transition loop
   259				// A:BC + B:DA => A:BC + B:DC
   260				p_B_Alt := &p.Inst[*p_A_Alt].Out
   261				p_B_Other := &p.Inst[*p_A_Alt].Arg
   262				patch := false
   263				if instAlt.Out == uint32(pc) {
   264					patch = true
   265				} else if instAlt.Arg == uint32(pc) {
   266					patch = true
   267					p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
   268				}
   269				if patch {
   270					*p_B_Alt = *p_A_Other
   271				}
   272	
   273				// empty transition to common target
   274				// A:BC + B:DC => A:DC + B:DC
   275				if *p_A_Other == *p_B_Alt {
   276					*p_A_Alt = *p_B_Other
   277				}
   278			}
   279		}
   280		return p
   281	}
   282	
   283	// runeSlice exists to permit sorting the case-folded rune sets.
   284	type runeSlice []rune
   285	
   286	func (p runeSlice) Len() int           { return len(p) }
   287	func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
   288	func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
   289	
   290	var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
   291	var anyRune = []rune{0, unicode.MaxRune}
   292	
   293	// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
   294	// the match engine can always tell which branch to take. The routine may modify
   295	// p if it is turned into a onepass Prog. If it isn't possible for this to be a
   296	// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
   297	// to the size of the Prog.
   298	func makeOnePass(p *onePassProg) *onePassProg {
   299		// If the machine is very long, it's not worth the time to check if we can use one pass.
   300		if len(p.Inst) >= 1000 {
   301			return notOnePass
   302		}
   303	
   304		var (
   305			instQueue    = newQueue(len(p.Inst))
   306			visitQueue   = newQueue(len(p.Inst))
   307			check        func(uint32, map[uint32]bool) bool
   308			onePassRunes = make([][]rune, len(p.Inst))
   309		)
   310	
   311		// check that paths from Alt instructions are unambiguous, and rebuild the new
   312		// program as a onepass program
   313		check = func(pc uint32, m map[uint32]bool) (ok bool) {
   314			ok = true
   315			inst := &p.Inst[pc]
   316			if visitQueue.contains(pc) {
   317				return
   318			}
   319			visitQueue.insert(pc)
   320			switch inst.Op {
   321			case syntax.InstAlt, syntax.InstAltMatch:
   322				ok = check(inst.Out, m) && check(inst.Arg, m)
   323				// check no-input paths to InstMatch
   324				matchOut := m[inst.Out]
   325				matchArg := m[inst.Arg]
   326				if matchOut && matchArg {
   327					ok = false
   328					break
   329				}
   330				// Match on empty goes in inst.Out
   331				if matchArg {
   332					inst.Out, inst.Arg = inst.Arg, inst.Out
   333					matchOut, matchArg = matchArg, matchOut
   334				}
   335				if matchOut {
   336					m[pc] = true
   337					inst.Op = syntax.InstAltMatch
   338				}
   339	
   340				// build a dispatch operator from the two legs of the alt.
   341				onePassRunes[pc], inst.Next = mergeRuneSets(
   342					&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
   343				if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
   344					ok = false
   345					break
   346				}
   347			case syntax.InstCapture, syntax.InstNop:
   348				ok = check(inst.Out, m)
   349				m[pc] = m[inst.Out]
   350				// pass matching runes back through these no-ops.
   351				onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   352				inst.Next = []uint32{}
   353				for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   354					inst.Next = append(inst.Next, inst.Out)
   355				}
   356			case syntax.InstEmptyWidth:
   357				ok = check(inst.Out, m)
   358				m[pc] = m[inst.Out]
   359				onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   360				inst.Next = []uint32{}
   361				for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   362					inst.Next = append(inst.Next, inst.Out)
   363				}
   364			case syntax.InstMatch, syntax.InstFail:
   365				m[pc] = inst.Op == syntax.InstMatch
   366				break
   367			case syntax.InstRune:
   368				m[pc] = false
   369				if len(inst.Next) > 0 {
   370					break
   371				}
   372				instQueue.insert(inst.Out)
   373				if len(inst.Rune) == 0 {
   374					onePassRunes[pc] = []rune{}
   375					inst.Next = []uint32{inst.Out}
   376					break
   377				}
   378				runes := make([]rune, 0)
   379				if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   380					r0 := inst.Rune[0]
   381					runes = append(runes, r0, r0)
   382					for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   383						runes = append(runes, r1, r1)
   384					}
   385					sort.Sort(runeSlice(runes))
   386				} else {
   387					runes = append(runes, inst.Rune...)
   388				}
   389				onePassRunes[pc] = runes
   390				inst.Next = []uint32{}
   391				for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   392					inst.Next = append(inst.Next, inst.Out)
   393				}
   394				inst.Op = syntax.InstRune
   395			case syntax.InstRune1:
   396				m[pc] = false
   397				if len(inst.Next) > 0 {
   398					break
   399				}
   400				instQueue.insert(inst.Out)
   401				runes := []rune{}
   402				// expand case-folded runes
   403				if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   404					r0 := inst.Rune[0]
   405					runes = append(runes, r0, r0)
   406					for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   407						runes = append(runes, r1, r1)
   408					}
   409					sort.Sort(runeSlice(runes))
   410				} else {
   411					runes = append(runes, inst.Rune[0], inst.Rune[0])
   412				}
   413				onePassRunes[pc] = runes
   414				inst.Next = []uint32{}
   415				for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   416					inst.Next = append(inst.Next, inst.Out)
   417				}
   418				inst.Op = syntax.InstRune
   419			case syntax.InstRuneAny:
   420				m[pc] = false
   421				if len(inst.Next) > 0 {
   422					break
   423				}
   424				instQueue.insert(inst.Out)
   425				onePassRunes[pc] = append([]rune{}, anyRune...)
   426				inst.Next = []uint32{inst.Out}
   427			case syntax.InstRuneAnyNotNL:
   428				m[pc] = false
   429				if len(inst.Next) > 0 {
   430					break
   431				}
   432				instQueue.insert(inst.Out)
   433				onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
   434				inst.Next = []uint32{}
   435				for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
   436					inst.Next = append(inst.Next, inst.Out)
   437				}
   438			}
   439			return
   440		}
   441	
   442		instQueue.clear()
   443		instQueue.insert(uint32(p.Start))
   444		m := make(map[uint32]bool, len(p.Inst))
   445		for !instQueue.empty() {
   446			visitQueue.clear()
   447			pc := instQueue.next()
   448			if !check(pc, m) {
   449				p = notOnePass
   450				break
   451			}
   452		}
   453		if p != notOnePass {
   454			for i := range p.Inst {
   455				p.Inst[i].Rune = onePassRunes[i]
   456			}
   457		}
   458		return p
   459	}
   460	
   461	var notOnePass *onePassProg = nil
   462	
   463	// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
   464	// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
   465	// Prog cannot be converted. For a one pass prog, the fundamental condition that must
   466	// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
   467	func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
   468		if prog.Start == 0 {
   469			return notOnePass
   470		}
   471		// onepass regexp is anchored
   472		if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
   473			syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
   474			return notOnePass
   475		}
   476		// every instruction leading to InstMatch must be EmptyEndText
   477		for _, inst := range prog.Inst {
   478			opOut := prog.Inst[inst.Out].Op
   479			switch inst.Op {
   480			default:
   481				if opOut == syntax.InstMatch {
   482					return notOnePass
   483				}
   484			case syntax.InstAlt, syntax.InstAltMatch:
   485				if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
   486					return notOnePass
   487				}
   488			case syntax.InstEmptyWidth:
   489				if opOut == syntax.InstMatch {
   490					if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
   491						continue
   492					}
   493					return notOnePass
   494				}
   495			}
   496		}
   497		// Creates a slightly optimized copy of the original Prog
   498		// that cleans up some Prog idioms that block valid onepass programs
   499		p = onePassCopy(prog)
   500	
   501		// checkAmbiguity on InstAlts, build onepass Prog if possible
   502		p = makeOnePass(p)
   503	
   504		if p != notOnePass {
   505			cleanupOnePass(p, prog)
   506		}
   507		return p
   508	}
   509	

View as plain text