Run Format

Source file src/pkg/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// http://code.google.com/p/re2/wiki/Syntax, except for \C.
    11	// For an overview of the syntax, run
    12	//   godoc regexp/syntax
    13	//
    14	// All characters are UTF-8-encoded code points.
    15	//
    16	// There are 16 methods of Regexp that match a regular expression and identify
    17	// the matched text.  Their names are matched by this regular expression:
    18	//
    19	//	Find(All)?(String)?(Submatch)?(Index)?
    20	//
    21	// If 'All' is present, the routine matches successive non-overlapping
    22	// matches of the entire expression.  Empty matches abutting a preceding
    23	// match are ignored.  The return value is a slice containing the successive
    24	// return values of the corresponding non-'All' routine.  These routines take
    25	// an extra integer argument, n; if n >= 0, the function returns at most n
    26	// matches/submatches.
    27	//
    28	// If 'String' is present, the argument is a string; otherwise it is a slice
    29	// of bytes; return values are adjusted as appropriate.
    30	//
    31	// If 'Submatch' is present, the return value is a slice identifying the
    32	// successive submatches of the expression. Submatches are matches of
    33	// parenthesized subexpressions (also known as capturing groups) within the
    34	// regular expression, numbered from left to right in order of opening
    35	// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
    36	// the match of the first parenthesized subexpression, and so on.
    37	//
    38	// If 'Index' is present, matches and submatches are identified by byte index
    39	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    40	// the nth submatch.  The pair for n==0 identifies the match of the entire
    41	// expression.  If 'Index' is not present, the match is identified by the
    42	// text of the match/submatch.  If an index is negative, it means that
    43	// subexpression did not match any string in the input.
    44	//
    45	// There is also a subset of the methods that can be applied to text read
    46	// from a RuneReader:
    47	//
    48	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    49	//
    50	// This set may grow.  Note that regular expression matches may need to
    51	// examine text beyond the text returned by a match, so the methods that
    52	// match text from a RuneReader may read arbitrarily far into the input
    53	// before returning.
    54	//
    55	// (There are a few other methods that do not match this pattern.)
    56	//
    57	package regexp
    58	
    59	import (
    60		"bytes"
    61		"io"
    62		"regexp/syntax"
    63		"strconv"
    64		"strings"
    65		"sync"
    66		"unicode"
    67		"unicode/utf8"
    68	)
    69	
    70	var debug = false
    71	
    72	// Regexp is the representation of a compiled regular expression.
    73	// The public interface is entirely through methods.
    74	// A Regexp is safe for concurrent use by multiple goroutines.
    75	type Regexp struct {
    76		// read-only after Compile
    77		expr           string         // as passed to Compile
    78		prog           *syntax.Prog   // compiled program
    79		prefix         string         // required prefix in unanchored matches
    80		prefixBytes    []byte         // prefix, as a []byte
    81		prefixComplete bool           // prefix is the entire regexp
    82		prefixRune     rune           // first rune in prefix
    83		cond           syntax.EmptyOp // empty-width conditions required at start of match
    84		numSubexp      int
    85		subexpNames    []string
    86		longest        bool
    87	
    88		// cache of machines for running regexp
    89		mu      sync.Mutex
    90		machine []*machine
    91	}
    92	
    93	// String returns the source text used to compile the regular expression.
    94	func (re *Regexp) String() string {
    95		return re.expr
    96	}
    97	
    98	// Compile parses a regular expression and returns, if successful,
    99	// a Regexp object that can be used to match against text.
   100	//
   101	// When matching against text, the regexp returns a match that
   102	// begins as early as possible in the input (leftmost), and among those
   103	// it chooses the one that a backtracking search would have found first.
   104	// This so-called leftmost-first matching is the same semantics
   105	// that Perl, Python, and other implementations use, although this
   106	// package implements it without the expense of backtracking.
   107	// For POSIX leftmost-longest matching, see CompilePOSIX.
   108	func Compile(expr string) (*Regexp, error) {
   109		return compile(expr, syntax.Perl, false)
   110	}
   111	
   112	// CompilePOSIX is like Compile but restricts the regular expression
   113	// to POSIX ERE (egrep) syntax and changes the match semantics to
   114	// leftmost-longest.
   115	//
   116	// That is, when matching against text, the regexp returns a match that
   117	// begins as early as possible in the input (leftmost), and among those
   118	// it chooses a match that is as long as possible.
   119	// This so-called leftmost-longest matching is the same semantics
   120	// that early regular expression implementations used and that POSIX
   121	// specifies.
   122	//
   123	// However, there can be multiple leftmost-longest matches, with different
   124	// submatch choices, and here this package diverges from POSIX.
   125	// Among the possible leftmost-longest matches, this package chooses
   126	// the one that a backtracking search would have found first, while POSIX
   127	// specifies that the match be chosen to maximize the length of the first
   128	// subexpression, then the second, and so on from left to right.
   129	// The POSIX rule is computationally prohibitive and not even well-defined.
   130	// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   131	func CompilePOSIX(expr string) (*Regexp, error) {
   132		return compile(expr, syntax.POSIX, true)
   133	}
   134	
   135	// Longest makes future searches prefer the leftmost-longest match.
   136	// That is, when matching against text, the regexp returns a match that
   137	// begins as early as possible in the input (leftmost), and among those
   138	// it chooses a match that is as long as possible.
   139	func (re *Regexp) Longest() {
   140		re.longest = true
   141	}
   142	
   143	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   144		re, err := syntax.Parse(expr, mode)
   145		if err != nil {
   146			return nil, err
   147		}
   148		maxCap := re.MaxCap()
   149		capNames := re.CapNames()
   150	
   151		re = re.Simplify()
   152		prog, err := syntax.Compile(re)
   153		if err != nil {
   154			return nil, err
   155		}
   156		regexp := &Regexp{
   157			expr:        expr,
   158			prog:        prog,
   159			numSubexp:   maxCap,
   160			subexpNames: capNames,
   161			cond:        prog.StartCond(),
   162			longest:     longest,
   163		}
   164		regexp.prefix, regexp.prefixComplete = prog.Prefix()
   165		if regexp.prefix != "" {
   166			// TODO(rsc): Remove this allocation by adding
   167			// IndexString to package bytes.
   168			regexp.prefixBytes = []byte(regexp.prefix)
   169			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   170		}
   171		return regexp, nil
   172	}
   173	
   174	// get returns a machine to use for matching re.
   175	// It uses the re's machine cache if possible, to avoid
   176	// unnecessary allocation.
   177	func (re *Regexp) get() *machine {
   178		re.mu.Lock()
   179		if n := len(re.machine); n > 0 {
   180			z := re.machine[n-1]
   181			re.machine = re.machine[:n-1]
   182			re.mu.Unlock()
   183			return z
   184		}
   185		re.mu.Unlock()
   186		z := progMachine(re.prog)
   187		z.re = re
   188		return z
   189	}
   190	
   191	// put returns a machine to the re's machine cache.
   192	// There is no attempt to limit the size of the cache, so it will
   193	// grow to the maximum number of simultaneous matches
   194	// run using re.  (The cache empties when re gets garbage collected.)
   195	func (re *Regexp) put(z *machine) {
   196		re.mu.Lock()
   197		re.machine = append(re.machine, z)
   198		re.mu.Unlock()
   199	}
   200	
   201	// MustCompile is like Compile but panics if the expression cannot be parsed.
   202	// It simplifies safe initialization of global variables holding compiled regular
   203	// expressions.
   204	func MustCompile(str string) *Regexp {
   205		regexp, error := Compile(str)
   206		if error != nil {
   207			panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
   208		}
   209		return regexp
   210	}
   211	
   212	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   213	// It simplifies safe initialization of global variables holding compiled regular
   214	// expressions.
   215	func MustCompilePOSIX(str string) *Regexp {
   216		regexp, error := CompilePOSIX(str)
   217		if error != nil {
   218			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
   219		}
   220		return regexp
   221	}
   222	
   223	func quote(s string) string {
   224		if strconv.CanBackquote(s) {
   225			return "`" + s + "`"
   226		}
   227		return strconv.Quote(s)
   228	}
   229	
   230	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   231	func (re *Regexp) NumSubexp() int {
   232		return re.numSubexp
   233	}
   234	
   235	// SubexpNames returns the names of the parenthesized subexpressions
   236	// in this Regexp.  The name for the first sub-expression is names[1],
   237	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   238	// Since the Regexp as a whole cannot be named, names[0] is always
   239	// the empty string.  The slice should not be modified.
   240	func (re *Regexp) SubexpNames() []string {
   241		return re.subexpNames
   242	}
   243	
   244	const endOfText rune = -1
   245	
   246	// input abstracts different representations of the input text. It provides
   247	// one-character lookahead.
   248	type input interface {
   249		step(pos int) (r rune, width int) // advance one rune
   250		canCheckPrefix() bool             // can we look ahead without losing info?
   251		hasPrefix(re *Regexp) bool
   252		index(re *Regexp, pos int) int
   253		context(pos int) syntax.EmptyOp
   254	}
   255	
   256	// inputString scans a string.
   257	type inputString struct {
   258		str string
   259	}
   260	
   261	func (i *inputString) step(pos int) (rune, int) {
   262		if pos < len(i.str) {
   263			c := i.str[pos]
   264			if c < utf8.RuneSelf {
   265				return rune(c), 1
   266			}
   267			return utf8.DecodeRuneInString(i.str[pos:])
   268		}
   269		return endOfText, 0
   270	}
   271	
   272	func (i *inputString) canCheckPrefix() bool {
   273		return true
   274	}
   275	
   276	func (i *inputString) hasPrefix(re *Regexp) bool {
   277		return strings.HasPrefix(i.str, re.prefix)
   278	}
   279	
   280	func (i *inputString) index(re *Regexp, pos int) int {
   281		return strings.Index(i.str[pos:], re.prefix)
   282	}
   283	
   284	func (i *inputString) context(pos int) syntax.EmptyOp {
   285		r1, r2 := endOfText, endOfText
   286		if pos > 0 && pos <= len(i.str) {
   287			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   288		}
   289		if pos < len(i.str) {
   290			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   291		}
   292		return syntax.EmptyOpContext(r1, r2)
   293	}
   294	
   295	// inputBytes scans a byte slice.
   296	type inputBytes struct {
   297		str []byte
   298	}
   299	
   300	func (i *inputBytes) step(pos int) (rune, int) {
   301		if pos < len(i.str) {
   302			c := i.str[pos]
   303			if c < utf8.RuneSelf {
   304				return rune(c), 1
   305			}
   306			return utf8.DecodeRune(i.str[pos:])
   307		}
   308		return endOfText, 0
   309	}
   310	
   311	func (i *inputBytes) canCheckPrefix() bool {
   312		return true
   313	}
   314	
   315	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   316		return bytes.HasPrefix(i.str, re.prefixBytes)
   317	}
   318	
   319	func (i *inputBytes) index(re *Regexp, pos int) int {
   320		return bytes.Index(i.str[pos:], re.prefixBytes)
   321	}
   322	
   323	func (i *inputBytes) context(pos int) syntax.EmptyOp {
   324		r1, r2 := endOfText, endOfText
   325		if pos > 0 && pos <= len(i.str) {
   326			r1, _ = utf8.DecodeLastRune(i.str[:pos])
   327		}
   328		if pos < len(i.str) {
   329			r2, _ = utf8.DecodeRune(i.str[pos:])
   330		}
   331		return syntax.EmptyOpContext(r1, r2)
   332	}
   333	
   334	// inputReader scans a RuneReader.
   335	type inputReader struct {
   336		r     io.RuneReader
   337		atEOT bool
   338		pos   int
   339	}
   340	
   341	func (i *inputReader) step(pos int) (rune, int) {
   342		if !i.atEOT && pos != i.pos {
   343			return endOfText, 0
   344	
   345		}
   346		r, w, err := i.r.ReadRune()
   347		if err != nil {
   348			i.atEOT = true
   349			return endOfText, 0
   350		}
   351		i.pos += w
   352		return r, w
   353	}
   354	
   355	func (i *inputReader) canCheckPrefix() bool {
   356		return false
   357	}
   358	
   359	func (i *inputReader) hasPrefix(re *Regexp) bool {
   360		return false
   361	}
   362	
   363	func (i *inputReader) index(re *Regexp, pos int) int {
   364		return -1
   365	}
   366	
   367	func (i *inputReader) context(pos int) syntax.EmptyOp {
   368		return 0
   369	}
   370	
   371	// LiteralPrefix returns a literal string that must begin any match
   372	// of the regular expression re.  It returns the boolean true if the
   373	// literal string comprises the entire regular expression.
   374	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   375		return re.prefix, re.prefixComplete
   376	}
   377	
   378	// MatchReader reports whether the Regexp matches the text read by the
   379	// RuneReader.
   380	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   381		return re.doExecute(r, nil, "", 0, 0) != nil
   382	}
   383	
   384	// MatchString reports whether the Regexp matches the string s.
   385	func (re *Regexp) MatchString(s string) bool {
   386		return re.doExecute(nil, nil, s, 0, 0) != nil
   387	}
   388	
   389	// Match reports whether the Regexp matches the byte slice b.
   390	func (re *Regexp) Match(b []byte) bool {
   391		return re.doExecute(nil, b, "", 0, 0) != nil
   392	}
   393	
   394	// MatchReader checks whether a textual regular expression matches the text
   395	// read by the RuneReader.  More complicated queries need to use Compile and
   396	// the full Regexp interface.
   397	func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
   398		re, err := Compile(pattern)
   399		if err != nil {
   400			return false, err
   401		}
   402		return re.MatchReader(r), nil
   403	}
   404	
   405	// MatchString checks whether a textual regular expression
   406	// matches a string.  More complicated queries need
   407	// to use Compile and the full Regexp interface.
   408	func MatchString(pattern string, s string) (matched bool, err error) {
   409		re, err := Compile(pattern)
   410		if err != nil {
   411			return false, err
   412		}
   413		return re.MatchString(s), nil
   414	}
   415	
   416	// Match checks whether a textual regular expression
   417	// matches a byte slice.  More complicated queries need
   418	// to use Compile and the full Regexp interface.
   419	func Match(pattern string, b []byte) (matched bool, err error) {
   420		re, err := Compile(pattern)
   421		if err != nil {
   422			return false, err
   423		}
   424		return re.Match(b), nil
   425	}
   426	
   427	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   428	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   429	// in Expand, so for instance $1 represents the text of the first submatch.
   430	func (re *Regexp) ReplaceAllString(src, repl string) string {
   431		n := 2
   432		if strings.Index(repl, "$") >= 0 {
   433			n = 2 * (re.numSubexp + 1)
   434		}
   435		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   436			return re.expand(dst, repl, nil, src, match)
   437		})
   438		return string(b)
   439	}
   440	
   441	// ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
   442	// with the replacement string repl.  The replacement repl is substituted directly,
   443	// without using Expand.
   444	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   445		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   446			return append(dst, repl...)
   447		}))
   448	}
   449	
   450	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   451	// Regexp have been replaced by the return value of function repl applied
   452	// to the matched substring.  The replacement returned by repl is substituted
   453	// directly, without using Expand.
   454	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   455		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   456			return append(dst, repl(src[match[0]:match[1]])...)
   457		})
   458		return string(b)
   459	}
   460	
   461	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   462		lastMatchEnd := 0 // end position of the most recent match
   463		searchPos := 0    // position where we next look for a match
   464		var buf []byte
   465		var endPos int
   466		if bsrc != nil {
   467			endPos = len(bsrc)
   468		} else {
   469			endPos = len(src)
   470		}
   471		for searchPos <= endPos {
   472			a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
   473			if len(a) == 0 {
   474				break // no more matches
   475			}
   476	
   477			// Copy the unmatched characters before this match.
   478			if bsrc != nil {
   479				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   480			} else {
   481				buf = append(buf, src[lastMatchEnd:a[0]]...)
   482			}
   483	
   484			// Now insert a copy of the replacement string, but not for a
   485			// match of the empty string immediately after another match.
   486			// (Otherwise, we get double replacement for patterns that
   487			// match both empty and nonempty strings.)
   488			if a[1] > lastMatchEnd || a[0] == 0 {
   489				buf = repl(buf, a)
   490			}
   491			lastMatchEnd = a[1]
   492	
   493			// Advance past this match; always advance at least one character.
   494			var width int
   495			if bsrc != nil {
   496				_, width = utf8.DecodeRune(bsrc[searchPos:])
   497			} else {
   498				_, width = utf8.DecodeRuneInString(src[searchPos:])
   499			}
   500			if searchPos+width > a[1] {
   501				searchPos += width
   502			} else if searchPos+1 > a[1] {
   503				// This clause is only needed at the end of the input
   504				// string.  In that case, DecodeRuneInString returns width=0.
   505				searchPos++
   506			} else {
   507				searchPos = a[1]
   508			}
   509		}
   510	
   511		// Copy the unmatched characters after the last match.
   512		if bsrc != nil {
   513			buf = append(buf, bsrc[lastMatchEnd:]...)
   514		} else {
   515			buf = append(buf, src[lastMatchEnd:]...)
   516		}
   517	
   518		return buf
   519	}
   520	
   521	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   522	// with the replacement text repl.  Inside repl, $ signs are interpreted as
   523	// in Expand, so for instance $1 represents the text of the first submatch.
   524	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   525		n := 2
   526		if bytes.IndexByte(repl, '$') >= 0 {
   527			n = 2 * (re.numSubexp + 1)
   528		}
   529		srepl := ""
   530		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   531			if len(srepl) != len(repl) {
   532				srepl = string(repl)
   533			}
   534			return re.expand(dst, srepl, src, "", match)
   535		})
   536		return b
   537	}
   538	
   539	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   540	// with the replacement bytes repl.  The replacement repl is substituted directly,
   541	// without using Expand.
   542	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   543		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   544			return append(dst, repl...)
   545		})
   546	}
   547	
   548	// ReplaceAllFunc returns a copy of src in which all matches of the
   549	// Regexp have been replaced by the return value of function repl applied
   550	// to the matched byte slice.  The replacement returned by repl is substituted
   551	// directly, without using Expand.
   552	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   553		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   554			return append(dst, repl(src[match[0]:match[1]])...)
   555		})
   556	}
   557	
   558	var specialBytes = []byte(`\.+*?()|[]{}^$`)
   559	
   560	func special(b byte) bool {
   561		return bytes.IndexByte(specialBytes, b) >= 0
   562	}
   563	
   564	// QuoteMeta returns a string that quotes all regular expression metacharacters
   565	// inside the argument text; the returned string is a regular expression matching
   566	// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
   567	func QuoteMeta(s string) string {
   568		b := make([]byte, 2*len(s))
   569	
   570		// A byte loop is correct because all metacharacters are ASCII.
   571		j := 0
   572		for i := 0; i < len(s); i++ {
   573			if special(s[i]) {
   574				b[j] = '\\'
   575				j++
   576			}
   577			b[j] = s[i]
   578			j++
   579		}
   580		return string(b[0:j])
   581	}
   582	
   583	// The number of capture values in the program may correspond
   584	// to fewer capturing expressions than are in the regexp.
   585	// For example, "(a){0}" turns into an empty program, so the
   586	// maximum capture in the program is 0 but we need to return
   587	// an expression for \1.  Pad appends -1s to the slice a as needed.
   588	func (re *Regexp) pad(a []int) []int {
   589		if a == nil {
   590			// No match.
   591			return nil
   592		}
   593		n := (1 + re.numSubexp) * 2
   594		for len(a) < n {
   595			a = append(a, -1)
   596		}
   597		return a
   598	}
   599	
   600	// Find matches in slice b if b is non-nil, otherwise find matches in string s.
   601	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   602		var end int
   603		if b == nil {
   604			end = len(s)
   605		} else {
   606			end = len(b)
   607		}
   608	
   609		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   610			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
   611			if len(matches) == 0 {
   612				break
   613			}
   614	
   615			accept := true
   616			if matches[1] == pos {
   617				// We've found an empty match.
   618				if matches[0] == prevMatchEnd {
   619					// We don't allow an empty match right
   620					// after a previous match, so ignore it.
   621					accept = false
   622				}
   623				var width int
   624				// TODO: use step()
   625				if b == nil {
   626					_, width = utf8.DecodeRuneInString(s[pos:end])
   627				} else {
   628					_, width = utf8.DecodeRune(b[pos:end])
   629				}
   630				if width > 0 {
   631					pos += width
   632				} else {
   633					pos = end + 1
   634				}
   635			} else {
   636				pos = matches[1]
   637			}
   638			prevMatchEnd = matches[1]
   639	
   640			if accept {
   641				deliver(re.pad(matches))
   642				i++
   643			}
   644		}
   645	}
   646	
   647	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   648	// A return value of nil indicates no match.
   649	func (re *Regexp) Find(b []byte) []byte {
   650		a := re.doExecute(nil, b, "", 0, 2)
   651		if a == nil {
   652			return nil
   653		}
   654		return b[a[0]:a[1]]
   655	}
   656	
   657	// FindIndex returns a two-element slice of integers defining the location of
   658	// the leftmost match in b of the regular expression.  The match itself is at
   659	// b[loc[0]:loc[1]].
   660	// A return value of nil indicates no match.
   661	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   662		a := re.doExecute(nil, b, "", 0, 2)
   663		if a == nil {
   664			return nil
   665		}
   666		return a[0:2]
   667	}
   668	
   669	// FindString returns a string holding the text of the leftmost match in s of the regular
   670	// expression.  If there is no match, the return value is an empty string,
   671	// but it will also be empty if the regular expression successfully matches
   672	// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
   673	// necessary to distinguish these cases.
   674	func (re *Regexp) FindString(s string) string {
   675		a := re.doExecute(nil, nil, s, 0, 2)
   676		if a == nil {
   677			return ""
   678		}
   679		return s[a[0]:a[1]]
   680	}
   681	
   682	// FindStringIndex returns a two-element slice of integers defining the
   683	// location of the leftmost match in s of the regular expression.  The match
   684	// itself is at s[loc[0]:loc[1]].
   685	// A return value of nil indicates no match.
   686	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   687		a := re.doExecute(nil, nil, s, 0, 2)
   688		if a == nil {
   689			return nil
   690		}
   691		return a[0:2]
   692	}
   693	
   694	// FindReaderIndex returns a two-element slice of integers defining the
   695	// location of the leftmost match of the regular expression in text read from
   696	// the RuneReader.  The match text was found in the input stream at
   697	// byte offset loc[0] through loc[1]-1.
   698	// A return value of nil indicates no match.
   699	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   700		a := re.doExecute(r, nil, "", 0, 2)
   701		if a == nil {
   702			return nil
   703		}
   704		return a[0:2]
   705	}
   706	
   707	// FindSubmatch returns a slice of slices holding the text of the leftmost
   708	// match of the regular expression in b and the matches, if any, of its
   709	// subexpressions, as defined by the 'Submatch' descriptions in the package
   710	// comment.
   711	// A return value of nil indicates no match.
   712	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   713		a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
   714		if a == nil {
   715			return nil
   716		}
   717		ret := make([][]byte, 1+re.numSubexp)
   718		for i := range ret {
   719			if 2*i < len(a) && a[2*i] >= 0 {
   720				ret[i] = b[a[2*i]:a[2*i+1]]
   721			}
   722		}
   723		return ret
   724	}
   725	
   726	// Expand appends template to dst and returns the result; during the
   727	// append, Expand replaces variables in the template with corresponding
   728	// matches drawn from src.  The match slice should have been returned by
   729	// FindSubmatchIndex.
   730	//
   731	// In the template, a variable is denoted by a substring of the form
   732	// $name or ${name}, where name is a non-empty sequence of letters,
   733	// digits, and underscores.  A purely numeric name like $1 refers to
   734	// the submatch with the corresponding index; other names refer to
   735	// capturing parentheses named with the (?P<name>...) syntax.  A
   736	// reference to an out of range or unmatched index or a name that is not
   737	// present in the regular expression is replaced with an empty slice.
   738	//
   739	// In the $name form, name is taken to be as long as possible: $1x is
   740	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   741	//
   742	// To insert a literal $ in the output, use $$ in the template.
   743	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   744		return re.expand(dst, string(template), src, "", match)
   745	}
   746	
   747	// ExpandString is like Expand but the template and source are strings.
   748	// It appends to and returns a byte slice in order to give the calling
   749	// code control over allocation.
   750	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   751		return re.expand(dst, template, nil, src, match)
   752	}
   753	
   754	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   755		for len(template) > 0 {
   756			i := strings.Index(template, "$")
   757			if i < 0 {
   758				break
   759			}
   760			dst = append(dst, template[:i]...)
   761			template = template[i:]
   762			if len(template) > 1 && template[1] == '$' {
   763				// Treat $$ as $.
   764				dst = append(dst, '$')
   765				template = template[2:]
   766				continue
   767			}
   768			name, num, rest, ok := extract(template)
   769			if !ok {
   770				// Malformed; treat $ as raw text.
   771				dst = append(dst, '$')
   772				template = template[1:]
   773				continue
   774			}
   775			template = rest
   776			if num >= 0 {
   777				if 2*num+1 < len(match) && match[2*num] >= 0 {
   778					if bsrc != nil {
   779						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   780					} else {
   781						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   782					}
   783				}
   784			} else {
   785				for i, namei := range re.subexpNames {
   786					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   787						if bsrc != nil {
   788							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   789						} else {
   790							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   791						}
   792						break
   793					}
   794				}
   795			}
   796		}
   797		dst = append(dst, template...)
   798		return dst
   799	}
   800	
   801	// extract returns the name from a leading "$name" or "${name}" in str.
   802	// If it is a number, extract returns num set to that number; otherwise num = -1.
   803	func extract(str string) (name string, num int, rest string, ok bool) {
   804		if len(str) < 2 || str[0] != '$' {
   805			return
   806		}
   807		brace := false
   808		if str[1] == '{' {
   809			brace = true
   810			str = str[2:]
   811		} else {
   812			str = str[1:]
   813		}
   814		i := 0
   815		for i < len(str) {
   816			rune, size := utf8.DecodeRuneInString(str[i:])
   817			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   818				break
   819			}
   820			i += size
   821		}
   822		if i == 0 {
   823			// empty name is not okay
   824			return
   825		}
   826		name = str[:i]
   827		if brace {
   828			if i >= len(str) || str[i] != '}' {
   829				// missing closing brace
   830				return
   831			}
   832			i++
   833		}
   834	
   835		// Parse number.
   836		num = 0
   837		for i := 0; i < len(name); i++ {
   838			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   839				num = -1
   840				break
   841			}
   842			num = num*10 + int(name[i]) - '0'
   843		}
   844		// Disallow leading zeros.
   845		if name[0] == '0' && len(name) > 1 {
   846			num = -1
   847		}
   848	
   849		rest = str[i:]
   850		ok = true
   851		return
   852	}
   853	
   854	// FindSubmatchIndex returns a slice holding the index pairs identifying the
   855	// leftmost match of the regular expression in b and the matches, if any, of
   856	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
   857	// in the package comment.
   858	// A return value of nil indicates no match.
   859	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
   860		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
   861	}
   862	
   863	// FindStringSubmatch returns a slice of strings holding the text of the
   864	// leftmost match of the regular expression in s and the matches, if any, of
   865	// its subexpressions, as defined by the 'Submatch' description in the
   866	// package comment.
   867	// A return value of nil indicates no match.
   868	func (re *Regexp) FindStringSubmatch(s string) []string {
   869		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
   870		if a == nil {
   871			return nil
   872		}
   873		ret := make([]string, 1+re.numSubexp)
   874		for i := range ret {
   875			if 2*i < len(a) && a[2*i] >= 0 {
   876				ret[i] = s[a[2*i]:a[2*i+1]]
   877			}
   878		}
   879		return ret
   880	}
   881	
   882	// FindStringSubmatchIndex returns a slice holding the index pairs
   883	// identifying the leftmost match of the regular expression in s and the
   884	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
   885	// 'Index' descriptions in the package comment.
   886	// A return value of nil indicates no match.
   887	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
   888		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
   889	}
   890	
   891	// FindReaderSubmatchIndex returns a slice holding the index pairs
   892	// identifying the leftmost match of the regular expression of text read by
   893	// the RuneReader, and the matches, if any, of its subexpressions, as defined
   894	// by the 'Submatch' and 'Index' descriptions in the package comment.  A
   895	// return value of nil indicates no match.
   896	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
   897		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
   898	}
   899	
   900	const startSize = 10 // The size at which to start a slice in the 'All' routines.
   901	
   902	// FindAll is the 'All' version of Find; it returns a slice of all successive
   903	// matches of the expression, as defined by the 'All' description in the
   904	// package comment.
   905	// A return value of nil indicates no match.
   906	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
   907		if n < 0 {
   908			n = len(b) + 1
   909		}
   910		result := make([][]byte, 0, startSize)
   911		re.allMatches("", b, n, func(match []int) {
   912			result = append(result, b[match[0]:match[1]])
   913		})
   914		if len(result) == 0 {
   915			return nil
   916		}
   917		return result
   918	}
   919	
   920	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
   921	// successive matches of the expression, as defined by the 'All' description
   922	// in the package comment.
   923	// A return value of nil indicates no match.
   924	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   925		if n < 0 {
   926			n = len(b) + 1
   927		}
   928		result := make([][]int, 0, startSize)
   929		re.allMatches("", b, n, func(match []int) {
   930			result = append(result, match[0:2])
   931		})
   932		if len(result) == 0 {
   933			return nil
   934		}
   935		return result
   936	}
   937	
   938	// FindAllString is the 'All' version of FindString; it returns a slice of all
   939	// successive matches of the expression, as defined by the 'All' description
   940	// in the package comment.
   941	// A return value of nil indicates no match.
   942	func (re *Regexp) FindAllString(s string, n int) []string {
   943		if n < 0 {
   944			n = len(s) + 1
   945		}
   946		result := make([]string, 0, startSize)
   947		re.allMatches(s, nil, n, func(match []int) {
   948			result = append(result, s[match[0]:match[1]])
   949		})
   950		if len(result) == 0 {
   951			return nil
   952		}
   953		return result
   954	}
   955	
   956	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   957	// slice of all successive matches of the expression, as defined by the 'All'
   958	// description in the package comment.
   959	// A return value of nil indicates no match.
   960	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   961		if n < 0 {
   962			n = len(s) + 1
   963		}
   964		result := make([][]int, 0, startSize)
   965		re.allMatches(s, nil, n, func(match []int) {
   966			result = append(result, match[0:2])
   967		})
   968		if len(result) == 0 {
   969			return nil
   970		}
   971		return result
   972	}
   973	
   974	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
   975	// of all successive matches of the expression, as defined by the 'All'
   976	// description in the package comment.
   977	// A return value of nil indicates no match.
   978	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   979		if n < 0 {
   980			n = len(b) + 1
   981		}
   982		result := make([][][]byte, 0, startSize)
   983		re.allMatches("", b, n, func(match []int) {
   984			slice := make([][]byte, len(match)/2)
   985			for j := range slice {
   986				if match[2*j] >= 0 {
   987					slice[j] = b[match[2*j]:match[2*j+1]]
   988				}
   989			}
   990			result = append(result, slice)
   991		})
   992		if len(result) == 0 {
   993			return nil
   994		}
   995		return result
   996	}
   997	
   998	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
   999	// a slice of all successive matches of the expression, as defined by the
  1000	// 'All' description in the package comment.
  1001	// A return value of nil indicates no match.
  1002	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  1003		if n < 0 {
  1004			n = len(b) + 1
  1005		}
  1006		result := make([][]int, 0, startSize)
  1007		re.allMatches("", b, n, func(match []int) {
  1008			result = append(result, match)
  1009		})
  1010		if len(result) == 0 {
  1011			return nil
  1012		}
  1013		return result
  1014	}
  1015	
  1016	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1017	// returns a slice of all successive matches of the expression, as defined by
  1018	// the 'All' description in the package comment.
  1019	// A return value of nil indicates no match.
  1020	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1021		if n < 0 {
  1022			n = len(s) + 1
  1023		}
  1024		result := make([][]string, 0, startSize)
  1025		re.allMatches(s, nil, n, func(match []int) {
  1026			slice := make([]string, len(match)/2)
  1027			for j := range slice {
  1028				if match[2*j] >= 0 {
  1029					slice[j] = s[match[2*j]:match[2*j+1]]
  1030				}
  1031			}
  1032			result = append(result, slice)
  1033		})
  1034		if len(result) == 0 {
  1035			return nil
  1036		}
  1037		return result
  1038	}
  1039	
  1040	// FindAllStringSubmatchIndex is the 'All' version of
  1041	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1042	// the expression, as defined by the 'All' description in the package
  1043	// comment.
  1044	// A return value of nil indicates no match.
  1045	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1046		if n < 0 {
  1047			n = len(s) + 1
  1048		}
  1049		result := make([][]int, 0, startSize)
  1050		re.allMatches(s, nil, n, func(match []int) {
  1051			result = append(result, match)
  1052		})
  1053		if len(result) == 0 {
  1054			return nil
  1055		}
  1056		return result
  1057	}
  1058	
  1059	// Split slices s into substrings separated by the expression and returns a slice of
  1060	// the substrings between those expression matches.
  1061	//
  1062	// The slice returned by this method consists of all the substrings of s
  1063	// not contained in the slice returned by FindAllString. When called on an expression
  1064	// that contains no metacharacters, it is equivalent to strings.SplitN.
  1065	//
  1066	// Example:
  1067	//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  1068	//   // s: ["", "b", "b", "c", "cadaaae"]
  1069	//
  1070	// The count determines the number of substrings to return:
  1071	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
  1072	//   n == 0: the result is nil (zero substrings)
  1073	//   n < 0: all substrings
  1074	func (re *Regexp) Split(s string, n int) []string {
  1075	
  1076		if n == 0 {
  1077			return nil
  1078		}
  1079	
  1080		if len(re.expr) > 0 && len(s) == 0 {
  1081			return []string{""}
  1082		}
  1083	
  1084		matches := re.FindAllStringIndex(s, n)
  1085		strings := make([]string, 0, len(matches))
  1086	
  1087		beg := 0
  1088		end := 0
  1089		for _, match := range matches {
  1090			if n > 0 && len(strings) >= n-1 {
  1091				break
  1092			}
  1093	
  1094			end = match[0]
  1095			if match[1] != 0 {
  1096				strings = append(strings, s[beg:end])
  1097			}
  1098			beg = match[1]
  1099		}
  1100	
  1101		if end != len(s) {
  1102			strings = append(strings, s[beg:])
  1103		}
  1104	
  1105		return strings
  1106	}

View as plain text