...
Run Format

Source file src/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// https://golang.org/s/re2syntax, except for \C.
    11	// For an overview of the syntax, run
    12	//   go doc regexp/syntax
    13	//
    14	// The regexp implementation provided by this package is
    15	// guaranteed to run in time linear in the size of the input.
    16	// (This is a property not guaranteed by most open source
    17	// implementations of regular expressions.) For more information
    18	// about this property, see
    19	//	http://swtch.com/~rsc/regexp/regexp1.html
    20	// or any book about automata theory.
    21	//
    22	// All characters are UTF-8-encoded code points.
    23	//
    24	// There are 16 methods of Regexp that match a regular expression and identify
    25	// the matched text.  Their names are matched by this regular expression:
    26	//
    27	//	Find(All)?(String)?(Submatch)?(Index)?
    28	//
    29	// If 'All' is present, the routine matches successive non-overlapping
    30	// matches of the entire expression.  Empty matches abutting a preceding
    31	// match are ignored.  The return value is a slice containing the successive
    32	// return values of the corresponding non-'All' routine.  These routines take
    33	// an extra integer argument, n; if n >= 0, the function returns at most n
    34	// matches/submatches.
    35	//
    36	// If 'String' is present, the argument is a string; otherwise it is a slice
    37	// of bytes; return values are adjusted as appropriate.
    38	//
    39	// If 'Submatch' is present, the return value is a slice identifying the
    40	// successive submatches of the expression. Submatches are matches of
    41	// parenthesized subexpressions (also known as capturing groups) within the
    42	// regular expression, numbered from left to right in order of opening
    43	// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
    44	// the match of the first parenthesized subexpression, and so on.
    45	//
    46	// If 'Index' is present, matches and submatches are identified by byte index
    47	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    48	// the nth submatch.  The pair for n==0 identifies the match of the entire
    49	// expression.  If 'Index' is not present, the match is identified by the
    50	// text of the match/submatch.  If an index is negative, it means that
    51	// subexpression did not match any string in the input.
    52	//
    53	// There is also a subset of the methods that can be applied to text read
    54	// from a RuneReader:
    55	//
    56	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    57	//
    58	// This set may grow.  Note that regular expression matches may need to
    59	// examine text beyond the text returned by a match, so the methods that
    60	// match text from a RuneReader may read arbitrarily far into the input
    61	// before returning.
    62	//
    63	// (There are a few other methods that do not match this pattern.)
    64	//
    65	package regexp
    66	
    67	import (
    68		"bytes"
    69		"io"
    70		"regexp/syntax"
    71		"strconv"
    72		"strings"
    73		"sync"
    74		"unicode"
    75		"unicode/utf8"
    76	)
    77	
    78	var debug = false
    79	
    80	// Regexp is the representation of a compiled regular expression.
    81	// A Regexp is safe for concurrent use by multiple goroutines.
    82	type Regexp struct {
    83		// read-only after Compile
    84		expr           string         // as passed to Compile
    85		prog           *syntax.Prog   // compiled program
    86		onepass        *onePassProg   // onepass program or nil
    87		prefix         string         // required prefix in unanchored matches
    88		prefixBytes    []byte         // prefix, as a []byte
    89		prefixComplete bool           // prefix is the entire regexp
    90		prefixRune     rune           // first rune in prefix
    91		prefixEnd      uint32         // pc for last rune in prefix
    92		cond           syntax.EmptyOp // empty-width conditions required at start of match
    93		numSubexp      int
    94		subexpNames    []string
    95		longest        bool
    96	
    97		// cache of machines for running regexp
    98		mu      sync.Mutex
    99		machine []*machine
   100	}
   101	
   102	// String returns the source text used to compile the regular expression.
   103	func (re *Regexp) String() string {
   104		return re.expr
   105	}
   106	
   107	// Copy returns a new Regexp object copied from re.
   108	//
   109	// When using a Regexp in multiple goroutines, giving each goroutine
   110	// its own copy helps to avoid lock contention.
   111	func (re *Regexp) Copy() *Regexp {
   112		r := *re
   113		r.mu = sync.Mutex{}
   114		r.machine = nil
   115		return &r
   116	}
   117	
   118	// Compile parses a regular expression and returns, if successful,
   119	// a Regexp object that can be used to match against text.
   120	//
   121	// When matching against text, the regexp returns a match that
   122	// begins as early as possible in the input (leftmost), and among those
   123	// it chooses the one that a backtracking search would have found first.
   124	// This so-called leftmost-first matching is the same semantics
   125	// that Perl, Python, and other implementations use, although this
   126	// package implements it without the expense of backtracking.
   127	// For POSIX leftmost-longest matching, see CompilePOSIX.
   128	func Compile(expr string) (*Regexp, error) {
   129		return compile(expr, syntax.Perl, false)
   130	}
   131	
   132	// CompilePOSIX is like Compile but restricts the regular expression
   133	// to POSIX ERE (egrep) syntax and changes the match semantics to
   134	// leftmost-longest.
   135	//
   136	// That is, when matching against text, the regexp returns a match that
   137	// begins as early as possible in the input (leftmost), and among those
   138	// it chooses a match that is as long as possible.
   139	// This so-called leftmost-longest matching is the same semantics
   140	// that early regular expression implementations used and that POSIX
   141	// specifies.
   142	//
   143	// However, there can be multiple leftmost-longest matches, with different
   144	// submatch choices, and here this package diverges from POSIX.
   145	// Among the possible leftmost-longest matches, this package chooses
   146	// the one that a backtracking search would have found first, while POSIX
   147	// specifies that the match be chosen to maximize the length of the first
   148	// subexpression, then the second, and so on from left to right.
   149	// The POSIX rule is computationally prohibitive and not even well-defined.
   150	// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   151	func CompilePOSIX(expr string) (*Regexp, error) {
   152		return compile(expr, syntax.POSIX, true)
   153	}
   154	
   155	// Longest makes future searches prefer the leftmost-longest match.
   156	// That is, when matching against text, the regexp returns a match that
   157	// begins as early as possible in the input (leftmost), and among those
   158	// it chooses a match that is as long as possible.
   159	func (re *Regexp) Longest() {
   160		re.longest = true
   161	}
   162	
   163	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   164		re, err := syntax.Parse(expr, mode)
   165		if err != nil {
   166			return nil, err
   167		}
   168		maxCap := re.MaxCap()
   169		capNames := re.CapNames()
   170	
   171		re = re.Simplify()
   172		prog, err := syntax.Compile(re)
   173		if err != nil {
   174			return nil, err
   175		}
   176		regexp := &Regexp{
   177			expr:        expr,
   178			prog:        prog,
   179			onepass:     compileOnePass(prog),
   180			numSubexp:   maxCap,
   181			subexpNames: capNames,
   182			cond:        prog.StartCond(),
   183			longest:     longest,
   184		}
   185		if regexp.onepass == notOnePass {
   186			regexp.prefix, regexp.prefixComplete = prog.Prefix()
   187		} else {
   188			regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
   189		}
   190		if regexp.prefix != "" {
   191			// TODO(rsc): Remove this allocation by adding
   192			// IndexString to package bytes.
   193			regexp.prefixBytes = []byte(regexp.prefix)
   194			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   195		}
   196		return regexp, nil
   197	}
   198	
   199	// get returns a machine to use for matching re.
   200	// It uses the re's machine cache if possible, to avoid
   201	// unnecessary allocation.
   202	func (re *Regexp) get() *machine {
   203		re.mu.Lock()
   204		if n := len(re.machine); n > 0 {
   205			z := re.machine[n-1]
   206			re.machine = re.machine[:n-1]
   207			re.mu.Unlock()
   208			return z
   209		}
   210		re.mu.Unlock()
   211		z := progMachine(re.prog, re.onepass)
   212		z.re = re
   213		return z
   214	}
   215	
   216	// put returns a machine to the re's machine cache.
   217	// There is no attempt to limit the size of the cache, so it will
   218	// grow to the maximum number of simultaneous matches
   219	// run using re.  (The cache empties when re gets garbage collected.)
   220	func (re *Regexp) put(z *machine) {
   221		re.mu.Lock()
   222		re.machine = append(re.machine, z)
   223		re.mu.Unlock()
   224	}
   225	
   226	// MustCompile is like Compile but panics if the expression cannot be parsed.
   227	// It simplifies safe initialization of global variables holding compiled regular
   228	// expressions.
   229	func MustCompile(str string) *Regexp {
   230		regexp, error := Compile(str)
   231		if error != nil {
   232			panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
   233		}
   234		return regexp
   235	}
   236	
   237	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   238	// It simplifies safe initialization of global variables holding compiled regular
   239	// expressions.
   240	func MustCompilePOSIX(str string) *Regexp {
   241		regexp, error := CompilePOSIX(str)
   242		if error != nil {
   243			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
   244		}
   245		return regexp
   246	}
   247	
   248	func quote(s string) string {
   249		if strconv.CanBackquote(s) {
   250			return "`" + s + "`"
   251		}
   252		return strconv.Quote(s)
   253	}
   254	
   255	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   256	func (re *Regexp) NumSubexp() int {
   257		return re.numSubexp
   258	}
   259	
   260	// SubexpNames returns the names of the parenthesized subexpressions
   261	// in this Regexp.  The name for the first sub-expression is names[1],
   262	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   263	// Since the Regexp as a whole cannot be named, names[0] is always
   264	// the empty string.  The slice should not be modified.
   265	func (re *Regexp) SubexpNames() []string {
   266		return re.subexpNames
   267	}
   268	
   269	const endOfText rune = -1
   270	
   271	// input abstracts different representations of the input text. It provides
   272	// one-character lookahead.
   273	type input interface {
   274		step(pos int) (r rune, width int) // advance one rune
   275		canCheckPrefix() bool             // can we look ahead without losing info?
   276		hasPrefix(re *Regexp) bool
   277		index(re *Regexp, pos int) int
   278		context(pos int) syntax.EmptyOp
   279	}
   280	
   281	// inputString scans a string.
   282	type inputString struct {
   283		str string
   284	}
   285	
   286	func (i *inputString) step(pos int) (rune, int) {
   287		if pos < len(i.str) {
   288			c := i.str[pos]
   289			if c < utf8.RuneSelf {
   290				return rune(c), 1
   291			}
   292			return utf8.DecodeRuneInString(i.str[pos:])
   293		}
   294		return endOfText, 0
   295	}
   296	
   297	func (i *inputString) canCheckPrefix() bool {
   298		return true
   299	}
   300	
   301	func (i *inputString) hasPrefix(re *Regexp) bool {
   302		return strings.HasPrefix(i.str, re.prefix)
   303	}
   304	
   305	func (i *inputString) index(re *Regexp, pos int) int {
   306		return strings.Index(i.str[pos:], re.prefix)
   307	}
   308	
   309	func (i *inputString) context(pos int) syntax.EmptyOp {
   310		r1, r2 := endOfText, endOfText
   311		if pos > 0 && pos <= len(i.str) {
   312			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   313		}
   314		if pos < len(i.str) {
   315			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   316		}
   317		return syntax.EmptyOpContext(r1, r2)
   318	}
   319	
   320	// inputBytes scans a byte slice.
   321	type inputBytes struct {
   322		str []byte
   323	}
   324	
   325	func (i *inputBytes) step(pos int) (rune, int) {
   326		if pos < len(i.str) {
   327			c := i.str[pos]
   328			if c < utf8.RuneSelf {
   329				return rune(c), 1
   330			}
   331			return utf8.DecodeRune(i.str[pos:])
   332		}
   333		return endOfText, 0
   334	}
   335	
   336	func (i *inputBytes) canCheckPrefix() bool {
   337		return true
   338	}
   339	
   340	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   341		return bytes.HasPrefix(i.str, re.prefixBytes)
   342	}
   343	
   344	func (i *inputBytes) index(re *Regexp, pos int) int {
   345		return bytes.Index(i.str[pos:], re.prefixBytes)
   346	}
   347	
   348	func (i *inputBytes) context(pos int) syntax.EmptyOp {
   349		r1, r2 := endOfText, endOfText
   350		if pos > 0 && pos <= len(i.str) {
   351			r1, _ = utf8.DecodeLastRune(i.str[:pos])
   352		}
   353		if pos < len(i.str) {
   354			r2, _ = utf8.DecodeRune(i.str[pos:])
   355		}
   356		return syntax.EmptyOpContext(r1, r2)
   357	}
   358	
   359	// inputReader scans a RuneReader.
   360	type inputReader struct {
   361		r     io.RuneReader
   362		atEOT bool
   363		pos   int
   364	}
   365	
   366	func (i *inputReader) step(pos int) (rune, int) {
   367		if !i.atEOT && pos != i.pos {
   368			return endOfText, 0
   369	
   370		}
   371		r, w, err := i.r.ReadRune()
   372		if err != nil {
   373			i.atEOT = true
   374			return endOfText, 0
   375		}
   376		i.pos += w
   377		return r, w
   378	}
   379	
   380	func (i *inputReader) canCheckPrefix() bool {
   381		return false
   382	}
   383	
   384	func (i *inputReader) hasPrefix(re *Regexp) bool {
   385		return false
   386	}
   387	
   388	func (i *inputReader) index(re *Regexp, pos int) int {
   389		return -1
   390	}
   391	
   392	func (i *inputReader) context(pos int) syntax.EmptyOp {
   393		return 0
   394	}
   395	
   396	// LiteralPrefix returns a literal string that must begin any match
   397	// of the regular expression re.  It returns the boolean true if the
   398	// literal string comprises the entire regular expression.
   399	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   400		return re.prefix, re.prefixComplete
   401	}
   402	
   403	// MatchReader reports whether the Regexp matches the text read by the
   404	// RuneReader.
   405	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   406		return re.doExecute(r, nil, "", 0, 0) != nil
   407	}
   408	
   409	// MatchString reports whether the Regexp matches the string s.
   410	func (re *Regexp) MatchString(s string) bool {
   411		return re.doExecute(nil, nil, s, 0, 0) != nil
   412	}
   413	
   414	// Match reports whether the Regexp matches the byte slice b.
   415	func (re *Regexp) Match(b []byte) bool {
   416		return re.doExecute(nil, b, "", 0, 0) != nil
   417	}
   418	
   419	// MatchReader checks whether a textual regular expression matches the text
   420	// read by the RuneReader.  More complicated queries need to use Compile and
   421	// the full Regexp interface.
   422	func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
   423		re, err := Compile(pattern)
   424		if err != nil {
   425			return false, err
   426		}
   427		return re.MatchReader(r), nil
   428	}
   429	
   430	// MatchString checks whether a textual regular expression
   431	// matches a string.  More complicated queries need
   432	// to use Compile and the full Regexp interface.
   433	func MatchString(pattern string, s string) (matched bool, err error) {
   434		re, err := Compile(pattern)
   435		if err != nil {
   436			return false, err
   437		}
   438		return re.MatchString(s), nil
   439	}
   440	
   441	// Match checks whether a textual regular expression
   442	// matches a byte slice.  More complicated queries need
   443	// to use Compile and the full Regexp interface.
   444	func Match(pattern string, b []byte) (matched bool, err error) {
   445		re, err := Compile(pattern)
   446		if err != nil {
   447			return false, err
   448		}
   449		return re.Match(b), nil
   450	}
   451	
   452	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   453	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   454	// in Expand, so for instance $1 represents the text of the first submatch.
   455	func (re *Regexp) ReplaceAllString(src, repl string) string {
   456		n := 2
   457		if strings.Index(repl, "$") >= 0 {
   458			n = 2 * (re.numSubexp + 1)
   459		}
   460		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   461			return re.expand(dst, repl, nil, src, match)
   462		})
   463		return string(b)
   464	}
   465	
   466	// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
   467	// with the replacement string repl.  The replacement repl is substituted directly,
   468	// without using Expand.
   469	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   470		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   471			return append(dst, repl...)
   472		}))
   473	}
   474	
   475	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   476	// Regexp have been replaced by the return value of function repl applied
   477	// to the matched substring.  The replacement returned by repl is substituted
   478	// directly, without using Expand.
   479	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   480		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   481			return append(dst, repl(src[match[0]:match[1]])...)
   482		})
   483		return string(b)
   484	}
   485	
   486	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   487		lastMatchEnd := 0 // end position of the most recent match
   488		searchPos := 0    // position where we next look for a match
   489		var buf []byte
   490		var endPos int
   491		if bsrc != nil {
   492			endPos = len(bsrc)
   493		} else {
   494			endPos = len(src)
   495		}
   496		if nmatch > re.prog.NumCap {
   497			nmatch = re.prog.NumCap
   498		}
   499	
   500		for searchPos <= endPos {
   501			a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
   502			if len(a) == 0 {
   503				break // no more matches
   504			}
   505	
   506			// Copy the unmatched characters before this match.
   507			if bsrc != nil {
   508				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   509			} else {
   510				buf = append(buf, src[lastMatchEnd:a[0]]...)
   511			}
   512	
   513			// Now insert a copy of the replacement string, but not for a
   514			// match of the empty string immediately after another match.
   515			// (Otherwise, we get double replacement for patterns that
   516			// match both empty and nonempty strings.)
   517			if a[1] > lastMatchEnd || a[0] == 0 {
   518				buf = repl(buf, a)
   519			}
   520			lastMatchEnd = a[1]
   521	
   522			// Advance past this match; always advance at least one character.
   523			var width int
   524			if bsrc != nil {
   525				_, width = utf8.DecodeRune(bsrc[searchPos:])
   526			} else {
   527				_, width = utf8.DecodeRuneInString(src[searchPos:])
   528			}
   529			if searchPos+width > a[1] {
   530				searchPos += width
   531			} else if searchPos+1 > a[1] {
   532				// This clause is only needed at the end of the input
   533				// string.  In that case, DecodeRuneInString returns width=0.
   534				searchPos++
   535			} else {
   536				searchPos = a[1]
   537			}
   538		}
   539	
   540		// Copy the unmatched characters after the last match.
   541		if bsrc != nil {
   542			buf = append(buf, bsrc[lastMatchEnd:]...)
   543		} else {
   544			buf = append(buf, src[lastMatchEnd:]...)
   545		}
   546	
   547		return buf
   548	}
   549	
   550	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   551	// with the replacement text repl.  Inside repl, $ signs are interpreted as
   552	// in Expand, so for instance $1 represents the text of the first submatch.
   553	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   554		n := 2
   555		if bytes.IndexByte(repl, '$') >= 0 {
   556			n = 2 * (re.numSubexp + 1)
   557		}
   558		srepl := ""
   559		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   560			if len(srepl) != len(repl) {
   561				srepl = string(repl)
   562			}
   563			return re.expand(dst, srepl, src, "", match)
   564		})
   565		return b
   566	}
   567	
   568	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   569	// with the replacement bytes repl.  The replacement repl is substituted directly,
   570	// without using Expand.
   571	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   572		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   573			return append(dst, repl...)
   574		})
   575	}
   576	
   577	// ReplaceAllFunc returns a copy of src in which all matches of the
   578	// Regexp have been replaced by the return value of function repl applied
   579	// to the matched byte slice.  The replacement returned by repl is substituted
   580	// directly, without using Expand.
   581	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   582		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   583			return append(dst, repl(src[match[0]:match[1]])...)
   584		})
   585	}
   586	
   587	var specialBytes = []byte(`\.+*?()|[]{}^$`)
   588	
   589	func special(b byte) bool {
   590		return bytes.IndexByte(specialBytes, b) >= 0
   591	}
   592	
   593	// QuoteMeta returns a string that quotes all regular expression metacharacters
   594	// inside the argument text; the returned string is a regular expression matching
   595	// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
   596	func QuoteMeta(s string) string {
   597		b := make([]byte, 2*len(s))
   598	
   599		// A byte loop is correct because all metacharacters are ASCII.
   600		j := 0
   601		for i := 0; i < len(s); i++ {
   602			if special(s[i]) {
   603				b[j] = '\\'
   604				j++
   605			}
   606			b[j] = s[i]
   607			j++
   608		}
   609		return string(b[0:j])
   610	}
   611	
   612	// The number of capture values in the program may correspond
   613	// to fewer capturing expressions than are in the regexp.
   614	// For example, "(a){0}" turns into an empty program, so the
   615	// maximum capture in the program is 0 but we need to return
   616	// an expression for \1.  Pad appends -1s to the slice a as needed.
   617	func (re *Regexp) pad(a []int) []int {
   618		if a == nil {
   619			// No match.
   620			return nil
   621		}
   622		n := (1 + re.numSubexp) * 2
   623		for len(a) < n {
   624			a = append(a, -1)
   625		}
   626		return a
   627	}
   628	
   629	// Find matches in slice b if b is non-nil, otherwise find matches in string s.
   630	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   631		var end int
   632		if b == nil {
   633			end = len(s)
   634		} else {
   635			end = len(b)
   636		}
   637	
   638		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   639			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
   640			if len(matches) == 0 {
   641				break
   642			}
   643	
   644			accept := true
   645			if matches[1] == pos {
   646				// We've found an empty match.
   647				if matches[0] == prevMatchEnd {
   648					// We don't allow an empty match right
   649					// after a previous match, so ignore it.
   650					accept = false
   651				}
   652				var width int
   653				// TODO: use step()
   654				if b == nil {
   655					_, width = utf8.DecodeRuneInString(s[pos:end])
   656				} else {
   657					_, width = utf8.DecodeRune(b[pos:end])
   658				}
   659				if width > 0 {
   660					pos += width
   661				} else {
   662					pos = end + 1
   663				}
   664			} else {
   665				pos = matches[1]
   666			}
   667			prevMatchEnd = matches[1]
   668	
   669			if accept {
   670				deliver(re.pad(matches))
   671				i++
   672			}
   673		}
   674	}
   675	
   676	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   677	// A return value of nil indicates no match.
   678	func (re *Regexp) Find(b []byte) []byte {
   679		a := re.doExecute(nil, b, "", 0, 2)
   680		if a == nil {
   681			return nil
   682		}
   683		return b[a[0]:a[1]]
   684	}
   685	
   686	// FindIndex returns a two-element slice of integers defining the location of
   687	// the leftmost match in b of the regular expression.  The match itself is at
   688	// b[loc[0]:loc[1]].
   689	// A return value of nil indicates no match.
   690	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   691		a := re.doExecute(nil, b, "", 0, 2)
   692		if a == nil {
   693			return nil
   694		}
   695		return a[0:2]
   696	}
   697	
   698	// FindString returns a string holding the text of the leftmost match in s of the regular
   699	// expression.  If there is no match, the return value is an empty string,
   700	// but it will also be empty if the regular expression successfully matches
   701	// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
   702	// necessary to distinguish these cases.
   703	func (re *Regexp) FindString(s string) string {
   704		a := re.doExecute(nil, nil, s, 0, 2)
   705		if a == nil {
   706			return ""
   707		}
   708		return s[a[0]:a[1]]
   709	}
   710	
   711	// FindStringIndex returns a two-element slice of integers defining the
   712	// location of the leftmost match in s of the regular expression.  The match
   713	// itself is at s[loc[0]:loc[1]].
   714	// A return value of nil indicates no match.
   715	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   716		a := re.doExecute(nil, nil, s, 0, 2)
   717		if a == nil {
   718			return nil
   719		}
   720		return a[0:2]
   721	}
   722	
   723	// FindReaderIndex returns a two-element slice of integers defining the
   724	// location of the leftmost match of the regular expression in text read from
   725	// the RuneReader.  The match text was found in the input stream at
   726	// byte offset loc[0] through loc[1]-1.
   727	// A return value of nil indicates no match.
   728	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   729		a := re.doExecute(r, nil, "", 0, 2)
   730		if a == nil {
   731			return nil
   732		}
   733		return a[0:2]
   734	}
   735	
   736	// FindSubmatch returns a slice of slices holding the text of the leftmost
   737	// match of the regular expression in b and the matches, if any, of its
   738	// subexpressions, as defined by the 'Submatch' descriptions in the package
   739	// comment.
   740	// A return value of nil indicates no match.
   741	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   742		a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
   743		if a == nil {
   744			return nil
   745		}
   746		ret := make([][]byte, 1+re.numSubexp)
   747		for i := range ret {
   748			if 2*i < len(a) && a[2*i] >= 0 {
   749				ret[i] = b[a[2*i]:a[2*i+1]]
   750			}
   751		}
   752		return ret
   753	}
   754	
   755	// Expand appends template to dst and returns the result; during the
   756	// append, Expand replaces variables in the template with corresponding
   757	// matches drawn from src.  The match slice should have been returned by
   758	// FindSubmatchIndex.
   759	//
   760	// In the template, a variable is denoted by a substring of the form
   761	// $name or ${name}, where name is a non-empty sequence of letters,
   762	// digits, and underscores.  A purely numeric name like $1 refers to
   763	// the submatch with the corresponding index; other names refer to
   764	// capturing parentheses named with the (?P<name>...) syntax.  A
   765	// reference to an out of range or unmatched index or a name that is not
   766	// present in the regular expression is replaced with an empty slice.
   767	//
   768	// In the $name form, name is taken to be as long as possible: $1x is
   769	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   770	//
   771	// To insert a literal $ in the output, use $$ in the template.
   772	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   773		return re.expand(dst, string(template), src, "", match)
   774	}
   775	
   776	// ExpandString is like Expand but the template and source are strings.
   777	// It appends to and returns a byte slice in order to give the calling
   778	// code control over allocation.
   779	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   780		return re.expand(dst, template, nil, src, match)
   781	}
   782	
   783	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   784		for len(template) > 0 {
   785			i := strings.Index(template, "$")
   786			if i < 0 {
   787				break
   788			}
   789			dst = append(dst, template[:i]...)
   790			template = template[i:]
   791			if len(template) > 1 && template[1] == '$' {
   792				// Treat $$ as $.
   793				dst = append(dst, '$')
   794				template = template[2:]
   795				continue
   796			}
   797			name, num, rest, ok := extract(template)
   798			if !ok {
   799				// Malformed; treat $ as raw text.
   800				dst = append(dst, '$')
   801				template = template[1:]
   802				continue
   803			}
   804			template = rest
   805			if num >= 0 {
   806				if 2*num+1 < len(match) && match[2*num] >= 0 {
   807					if bsrc != nil {
   808						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   809					} else {
   810						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   811					}
   812				}
   813			} else {
   814				for i, namei := range re.subexpNames {
   815					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   816						if bsrc != nil {
   817							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   818						} else {
   819							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   820						}
   821						break
   822					}
   823				}
   824			}
   825		}
   826		dst = append(dst, template...)
   827		return dst
   828	}
   829	
   830	// extract returns the name from a leading "$name" or "${name}" in str.
   831	// If it is a number, extract returns num set to that number; otherwise num = -1.
   832	func extract(str string) (name string, num int, rest string, ok bool) {
   833		if len(str) < 2 || str[0] != '$' {
   834			return
   835		}
   836		brace := false
   837		if str[1] == '{' {
   838			brace = true
   839			str = str[2:]
   840		} else {
   841			str = str[1:]
   842		}
   843		i := 0
   844		for i < len(str) {
   845			rune, size := utf8.DecodeRuneInString(str[i:])
   846			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   847				break
   848			}
   849			i += size
   850		}
   851		if i == 0 {
   852			// empty name is not okay
   853			return
   854		}
   855		name = str[:i]
   856		if brace {
   857			if i >= len(str) || str[i] != '}' {
   858				// missing closing brace
   859				return
   860			}
   861			i++
   862		}
   863	
   864		// Parse number.
   865		num = 0
   866		for i := 0; i < len(name); i++ {
   867			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   868				num = -1
   869				break
   870			}
   871			num = num*10 + int(name[i]) - '0'
   872		}
   873		// Disallow leading zeros.
   874		if name[0] == '0' && len(name) > 1 {
   875			num = -1
   876		}
   877	
   878		rest = str[i:]
   879		ok = true
   880		return
   881	}
   882	
   883	// FindSubmatchIndex returns a slice holding the index pairs identifying the
   884	// leftmost match of the regular expression in b and the matches, if any, of
   885	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
   886	// in the package comment.
   887	// A return value of nil indicates no match.
   888	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
   889		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
   890	}
   891	
   892	// FindStringSubmatch returns a slice of strings holding the text of the
   893	// leftmost match of the regular expression in s and the matches, if any, of
   894	// its subexpressions, as defined by the 'Submatch' description in the
   895	// package comment.
   896	// A return value of nil indicates no match.
   897	func (re *Regexp) FindStringSubmatch(s string) []string {
   898		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
   899		if a == nil {
   900			return nil
   901		}
   902		ret := make([]string, 1+re.numSubexp)
   903		for i := range ret {
   904			if 2*i < len(a) && a[2*i] >= 0 {
   905				ret[i] = s[a[2*i]:a[2*i+1]]
   906			}
   907		}
   908		return ret
   909	}
   910	
   911	// FindStringSubmatchIndex returns a slice holding the index pairs
   912	// identifying the leftmost match of the regular expression in s and the
   913	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
   914	// 'Index' descriptions in the package comment.
   915	// A return value of nil indicates no match.
   916	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
   917		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
   918	}
   919	
   920	// FindReaderSubmatchIndex returns a slice holding the index pairs
   921	// identifying the leftmost match of the regular expression of text read by
   922	// the RuneReader, and the matches, if any, of its subexpressions, as defined
   923	// by the 'Submatch' and 'Index' descriptions in the package comment.  A
   924	// return value of nil indicates no match.
   925	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
   926		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
   927	}
   928	
   929	const startSize = 10 // The size at which to start a slice in the 'All' routines.
   930	
   931	// FindAll is the 'All' version of Find; it returns a slice of all successive
   932	// matches of the expression, as defined by the 'All' description in the
   933	// package comment.
   934	// A return value of nil indicates no match.
   935	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
   936		if n < 0 {
   937			n = len(b) + 1
   938		}
   939		result := make([][]byte, 0, startSize)
   940		re.allMatches("", b, n, func(match []int) {
   941			result = append(result, b[match[0]:match[1]])
   942		})
   943		if len(result) == 0 {
   944			return nil
   945		}
   946		return result
   947	}
   948	
   949	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
   950	// successive matches of the expression, as defined by the 'All' description
   951	// in the package comment.
   952	// A return value of nil indicates no match.
   953	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   954		if n < 0 {
   955			n = len(b) + 1
   956		}
   957		result := make([][]int, 0, startSize)
   958		re.allMatches("", b, n, func(match []int) {
   959			result = append(result, match[0:2])
   960		})
   961		if len(result) == 0 {
   962			return nil
   963		}
   964		return result
   965	}
   966	
   967	// FindAllString is the 'All' version of FindString; it returns a slice of all
   968	// successive matches of the expression, as defined by the 'All' description
   969	// in the package comment.
   970	// A return value of nil indicates no match.
   971	func (re *Regexp) FindAllString(s string, n int) []string {
   972		if n < 0 {
   973			n = len(s) + 1
   974		}
   975		result := make([]string, 0, startSize)
   976		re.allMatches(s, nil, n, func(match []int) {
   977			result = append(result, s[match[0]:match[1]])
   978		})
   979		if len(result) == 0 {
   980			return nil
   981		}
   982		return result
   983	}
   984	
   985	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   986	// slice of all successive matches of the expression, as defined by the 'All'
   987	// description in the package comment.
   988	// A return value of nil indicates no match.
   989	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   990		if n < 0 {
   991			n = len(s) + 1
   992		}
   993		result := make([][]int, 0, startSize)
   994		re.allMatches(s, nil, n, func(match []int) {
   995			result = append(result, match[0:2])
   996		})
   997		if len(result) == 0 {
   998			return nil
   999		}
  1000		return result
  1001	}
  1002	
  1003	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
  1004	// of all successive matches of the expression, as defined by the 'All'
  1005	// description in the package comment.
  1006	// A return value of nil indicates no match.
  1007	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
  1008		if n < 0 {
  1009			n = len(b) + 1
  1010		}
  1011		result := make([][][]byte, 0, startSize)
  1012		re.allMatches("", b, n, func(match []int) {
  1013			slice := make([][]byte, len(match)/2)
  1014			for j := range slice {
  1015				if match[2*j] >= 0 {
  1016					slice[j] = b[match[2*j]:match[2*j+1]]
  1017				}
  1018			}
  1019			result = append(result, slice)
  1020		})
  1021		if len(result) == 0 {
  1022			return nil
  1023		}
  1024		return result
  1025	}
  1026	
  1027	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
  1028	// a slice of all successive matches of the expression, as defined by the
  1029	// 'All' description in the package comment.
  1030	// A return value of nil indicates no match.
  1031	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  1032		if n < 0 {
  1033			n = len(b) + 1
  1034		}
  1035		result := make([][]int, 0, startSize)
  1036		re.allMatches("", b, n, func(match []int) {
  1037			result = append(result, match)
  1038		})
  1039		if len(result) == 0 {
  1040			return nil
  1041		}
  1042		return result
  1043	}
  1044	
  1045	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1046	// returns a slice of all successive matches of the expression, as defined by
  1047	// the 'All' description in the package comment.
  1048	// A return value of nil indicates no match.
  1049	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1050		if n < 0 {
  1051			n = len(s) + 1
  1052		}
  1053		result := make([][]string, 0, startSize)
  1054		re.allMatches(s, nil, n, func(match []int) {
  1055			slice := make([]string, len(match)/2)
  1056			for j := range slice {
  1057				if match[2*j] >= 0 {
  1058					slice[j] = s[match[2*j]:match[2*j+1]]
  1059				}
  1060			}
  1061			result = append(result, slice)
  1062		})
  1063		if len(result) == 0 {
  1064			return nil
  1065		}
  1066		return result
  1067	}
  1068	
  1069	// FindAllStringSubmatchIndex is the 'All' version of
  1070	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1071	// the expression, as defined by the 'All' description in the package
  1072	// comment.
  1073	// A return value of nil indicates no match.
  1074	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1075		if n < 0 {
  1076			n = len(s) + 1
  1077		}
  1078		result := make([][]int, 0, startSize)
  1079		re.allMatches(s, nil, n, func(match []int) {
  1080			result = append(result, match)
  1081		})
  1082		if len(result) == 0 {
  1083			return nil
  1084		}
  1085		return result
  1086	}
  1087	
  1088	// Split slices s into substrings separated by the expression and returns a slice of
  1089	// the substrings between those expression matches.
  1090	//
  1091	// The slice returned by this method consists of all the substrings of s
  1092	// not contained in the slice returned by FindAllString. When called on an expression
  1093	// that contains no metacharacters, it is equivalent to strings.SplitN.
  1094	//
  1095	// Example:
  1096	//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  1097	//   // s: ["", "b", "b", "c", "cadaaae"]
  1098	//
  1099	// The count determines the number of substrings to return:
  1100	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
  1101	//   n == 0: the result is nil (zero substrings)
  1102	//   n < 0: all substrings
  1103	func (re *Regexp) Split(s string, n int) []string {
  1104	
  1105		if n == 0 {
  1106			return nil
  1107		}
  1108	
  1109		if len(re.expr) > 0 && len(s) == 0 {
  1110			return []string{""}
  1111		}
  1112	
  1113		matches := re.FindAllStringIndex(s, n)
  1114		strings := make([]string, 0, len(matches))
  1115	
  1116		beg := 0
  1117		end := 0
  1118		for _, match := range matches {
  1119			if n > 0 && len(strings) >= n-1 {
  1120				break
  1121			}
  1122	
  1123			end = match[0]
  1124			if match[1] != 0 {
  1125				strings = append(strings, s[beg:end])
  1126			}
  1127			beg = match[1]
  1128		}
  1129	
  1130		if end != len(s) {
  1131			strings = append(strings, s[beg:])
  1132		}
  1133	
  1134		return strings
  1135	}
  1136	

View as plain text