Run Format

Source file src/pkg/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// http://code.google.com/p/re2/wiki/Syntax, except for \C.
    11	// For an overview of the syntax, run
    12	//   godoc regexp/syntax
    13	//
    14	// All characters are UTF-8-encoded code points.
    15	//
    16	// There are 16 methods of Regexp that match a regular expression and identify
    17	// the matched text.  Their names are matched by this regular expression:
    18	//
    19	//	Find(All)?(String)?(Submatch)?(Index)?
    20	//
    21	// If 'All' is present, the routine matches successive non-overlapping
    22	// matches of the entire expression.  Empty matches abutting a preceding
    23	// match are ignored.  The return value is a slice containing the successive
    24	// return values of the corresponding non-'All' routine.  These routines take
    25	// an extra integer argument, n; if n >= 0, the function returns at most n
    26	// matches/submatches.
    27	//
    28	// If 'String' is present, the argument is a string; otherwise it is a slice
    29	// of bytes; return values are adjusted as appropriate.
    30	//
    31	// If 'Submatch' is present, the return value is a slice identifying the
    32	// successive submatches of the expression. Submatches are matches of
    33	// parenthesized subexpressions (also known as capturing groups) within the
    34	// regular expression, numbered from left to right in order of opening
    35	// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
    36	// the match of the first parenthesized subexpression, and so on.
    37	//
    38	// If 'Index' is present, matches and submatches are identified by byte index
    39	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    40	// the nth submatch.  The pair for n==0 identifies the match of the entire
    41	// expression.  If 'Index' is not present, the match is identified by the
    42	// text of the match/submatch.  If an index is negative, it means that
    43	// subexpression did not match any string in the input.
    44	//
    45	// There is also a subset of the methods that can be applied to text read
    46	// from a RuneReader:
    47	//
    48	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    49	//
    50	// This set may grow.  Note that regular expression matches may need to
    51	// examine text beyond the text returned by a match, so the methods that
    52	// match text from a RuneReader may read arbitrarily far into the input
    53	// before returning.
    54	//
    55	// (There are a few other methods that do not match this pattern.)
    56	//
    57	package regexp
    58	
    59	import (
    60		"bytes"
    61		"io"
    62		"regexp/syntax"
    63		"strconv"
    64		"strings"
    65		"sync"
    66		"unicode"
    67		"unicode/utf8"
    68	)
    69	
    70	var debug = false
    71	
    72	// Regexp is the representation of a compiled regular expression.
    73	// The public interface is entirely through methods.
    74	// A Regexp is safe for concurrent use by multiple goroutines.
    75	type Regexp struct {
    76		// read-only after Compile
    77		expr           string         // as passed to Compile
    78		prog           *syntax.Prog   // compiled program
    79		prefix         string         // required prefix in unanchored matches
    80		prefixBytes    []byte         // prefix, as a []byte
    81		prefixComplete bool           // prefix is the entire regexp
    82		prefixRune     rune           // first rune in prefix
    83		cond           syntax.EmptyOp // empty-width conditions required at start of match
    84		numSubexp      int
    85		subexpNames    []string
    86		longest        bool
    87	
    88		// cache of machines for running regexp
    89		mu      sync.Mutex
    90		machine []*machine
    91	}
    92	
    93	// String returns the source text used to compile the regular expression.
    94	func (re *Regexp) String() string {
    95		return re.expr
    96	}
    97	
    98	// Compile parses a regular expression and returns, if successful,
    99	// a Regexp object that can be used to match against text.
   100	//
   101	// When matching against text, the regexp returns a match that
   102	// begins as early as possible in the input (leftmost), and among those
   103	// it chooses the one that a backtracking search would have found first.
   104	// This so-called leftmost-first matching is the same semantics
   105	// that Perl, Python, and other implementations use, although this
   106	// package implements it without the expense of backtracking.
   107	// For POSIX leftmost-longest matching, see CompilePOSIX.
   108	func Compile(expr string) (*Regexp, error) {
   109		return compile(expr, syntax.Perl, false)
   110	}
   111	
   112	// CompilePOSIX is like Compile but restricts the regular expression
   113	// to POSIX ERE (egrep) syntax and changes the match semantics to
   114	// leftmost-longest.
   115	//
   116	// That is, when matching against text, the regexp returns a match that
   117	// begins as early as possible in the input (leftmost), and among those
   118	// it chooses a match that is as long as possible.
   119	// This so-called leftmost-longest matching is the same semantics
   120	// that early regular expression implementations used and that POSIX
   121	// specifies.
   122	//
   123	// However, there can be multiple leftmost-longest matches, with different
   124	// submatch choices, and here this package diverges from POSIX.
   125	// Among the possible leftmost-longest matches, this package chooses
   126	// the one that a backtracking search would have found first, while POSIX
   127	// specifies that the match be chosen to maximize the length of the first
   128	// subexpression, then the second, and so on from left to right.
   129	// The POSIX rule is computationally prohibitive and not even well-defined.
   130	// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   131	func CompilePOSIX(expr string) (*Regexp, error) {
   132		return compile(expr, syntax.POSIX, true)
   133	}
   134	
   135	// Longest makes future searches prefer the leftmost-longest match.
   136	// That is, when matching against text, the regexp returns a match that
   137	// begins as early as possible in the input (leftmost), and among those
   138	// it chooses a match that is as long as possible.
   139	func (re *Regexp) Longest() {
   140		re.longest = true
   141	}
   142	
   143	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   144		re, err := syntax.Parse(expr, mode)
   145		if err != nil {
   146			return nil, err
   147		}
   148		maxCap := re.MaxCap()
   149		capNames := re.CapNames()
   150	
   151		re = re.Simplify()
   152		prog, err := syntax.Compile(re)
   153		if err != nil {
   154			return nil, err
   155		}
   156		regexp := &Regexp{
   157			expr:        expr,
   158			prog:        prog,
   159			numSubexp:   maxCap,
   160			subexpNames: capNames,
   161			cond:        prog.StartCond(),
   162			longest:     longest,
   163		}
   164		regexp.prefix, regexp.prefixComplete = prog.Prefix()
   165		if regexp.prefix != "" {
   166			// TODO(rsc): Remove this allocation by adding
   167			// IndexString to package bytes.
   168			regexp.prefixBytes = []byte(regexp.prefix)
   169			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   170		}
   171		return regexp, nil
   172	}
   173	
   174	// get returns a machine to use for matching re.
   175	// It uses the re's machine cache if possible, to avoid
   176	// unnecessary allocation.
   177	func (re *Regexp) get() *machine {
   178		re.mu.Lock()
   179		if n := len(re.machine); n > 0 {
   180			z := re.machine[n-1]
   181			re.machine = re.machine[:n-1]
   182			re.mu.Unlock()
   183			return z
   184		}
   185		re.mu.Unlock()
   186		z := progMachine(re.prog)
   187		z.re = re
   188		return z
   189	}
   190	
   191	// put returns a machine to the re's machine cache.
   192	// There is no attempt to limit the size of the cache, so it will
   193	// grow to the maximum number of simultaneous matches
   194	// run using re.  (The cache empties when re gets garbage collected.)
   195	func (re *Regexp) put(z *machine) {
   196		re.mu.Lock()
   197		re.machine = append(re.machine, z)
   198		re.mu.Unlock()
   199	}
   200	
   201	// MustCompile is like Compile but panics if the expression cannot be parsed.
   202	// It simplifies safe initialization of global variables holding compiled regular
   203	// expressions.
   204	func MustCompile(str string) *Regexp {
   205		regexp, error := Compile(str)
   206		if error != nil {
   207			panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
   208		}
   209		return regexp
   210	}
   211	
   212	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   213	// It simplifies safe initialization of global variables holding compiled regular
   214	// expressions.
   215	func MustCompilePOSIX(str string) *Regexp {
   216		regexp, error := CompilePOSIX(str)
   217		if error != nil {
   218			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
   219		}
   220		return regexp
   221	}
   222	
   223	func quote(s string) string {
   224		if strconv.CanBackquote(s) {
   225			return "`" + s + "`"
   226		}
   227		return strconv.Quote(s)
   228	}
   229	
   230	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   231	func (re *Regexp) NumSubexp() int {
   232		return re.numSubexp
   233	}
   234	
   235	// SubexpNames returns the names of the parenthesized subexpressions
   236	// in this Regexp.  The name for the first sub-expression is names[1],
   237	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   238	// Since the Regexp as a whole cannot be named, names[0] is always
   239	// the empty string.  The slice should not be modified.
   240	func (re *Regexp) SubexpNames() []string {
   241		return re.subexpNames
   242	}
   243	
   244	const endOfText rune = -1
   245	
   246	// input abstracts different representations of the input text. It provides
   247	// one-character lookahead.
   248	type input interface {
   249		step(pos int) (r rune, width int) // advance one rune
   250		canCheckPrefix() bool             // can we look ahead without losing info?
   251		hasPrefix(re *Regexp) bool
   252		index(re *Regexp, pos int) int
   253		context(pos int) syntax.EmptyOp
   254	}
   255	
   256	// inputString scans a string.
   257	type inputString struct {
   258		str string
   259	}
   260	
   261	func (i *inputString) step(pos int) (rune, int) {
   262		if pos < len(i.str) {
   263			c := i.str[pos]
   264			if c < utf8.RuneSelf {
   265				return rune(c), 1
   266			}
   267			return utf8.DecodeRuneInString(i.str[pos:])
   268		}
   269		return endOfText, 0
   270	}
   271	
   272	func (i *inputString) canCheckPrefix() bool {
   273		return true
   274	}
   275	
   276	func (i *inputString) hasPrefix(re *Regexp) bool {
   277		return strings.HasPrefix(i.str, re.prefix)
   278	}
   279	
   280	func (i *inputString) index(re *Regexp, pos int) int {
   281		return strings.Index(i.str[pos:], re.prefix)
   282	}
   283	
   284	func (i *inputString) context(pos int) syntax.EmptyOp {
   285		r1, r2 := endOfText, endOfText
   286		if pos > 0 && pos <= len(i.str) {
   287			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   288		}
   289		if pos < len(i.str) {
   290			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   291		}
   292		return syntax.EmptyOpContext(r1, r2)
   293	}
   294	
   295	// inputBytes scans a byte slice.
   296	type inputBytes struct {
   297		str []byte
   298	}
   299	
   300	func (i *inputBytes) step(pos int) (rune, int) {
   301		if pos < len(i.str) {
   302			c := i.str[pos]
   303			if c < utf8.RuneSelf {
   304				return rune(c), 1
   305			}
   306			return utf8.DecodeRune(i.str[pos:])
   307		}
   308		return endOfText, 0
   309	}
   310	
   311	func (i *inputBytes) canCheckPrefix() bool {
   312		return true
   313	}
   314	
   315	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   316		return bytes.HasPrefix(i.str, re.prefixBytes)
   317	}
   318	
   319	func (i *inputBytes) index(re *Regexp, pos int) int {
   320		return bytes.Index(i.str[pos:], re.prefixBytes)
   321	}
   322	
   323	func (i *inputBytes) context(pos int) syntax.EmptyOp {
   324		r1, r2 := endOfText, endOfText
   325		if pos > 0 && pos <= len(i.str) {
   326			r1, _ = utf8.DecodeLastRune(i.str[:pos])
   327		}
   328		if pos < len(i.str) {
   329			r2, _ = utf8.DecodeRune(i.str[pos:])
   330		}
   331		return syntax.EmptyOpContext(r1, r2)
   332	}
   333	
   334	// inputReader scans a RuneReader.
   335	type inputReader struct {
   336		r     io.RuneReader
   337		atEOT bool
   338		pos   int
   339	}
   340	
   341	func (i *inputReader) step(pos int) (rune, int) {
   342		if !i.atEOT && pos != i.pos {
   343			return endOfText, 0
   344	
   345		}
   346		r, w, err := i.r.ReadRune()
   347		if err != nil {
   348			i.atEOT = true
   349			return endOfText, 0
   350		}
   351		i.pos += w
   352		return r, w
   353	}
   354	
   355	func (i *inputReader) canCheckPrefix() bool {
   356		return false
   357	}
   358	
   359	func (i *inputReader) hasPrefix(re *Regexp) bool {
   360		return false
   361	}
   362	
   363	func (i *inputReader) index(re *Regexp, pos int) int {
   364		return -1
   365	}
   366	
   367	func (i *inputReader) context(pos int) syntax.EmptyOp {
   368		return 0
   369	}
   370	
   371	// LiteralPrefix returns a literal string that must begin any match
   372	// of the regular expression re.  It returns the boolean true if the
   373	// literal string comprises the entire regular expression.
   374	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   375		return re.prefix, re.prefixComplete
   376	}
   377	
   378	// MatchReader returns whether the Regexp matches the text read by the
   379	// RuneReader.  The return value is a boolean: true for match, false for no
   380	// match.
   381	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   382		return re.doExecute(r, nil, "", 0, 0) != nil
   383	}
   384	
   385	// MatchString returns whether the Regexp matches the string s.
   386	// The return value is a boolean: true for match, false for no match.
   387	func (re *Regexp) MatchString(s string) bool {
   388		return re.doExecute(nil, nil, s, 0, 0) != nil
   389	}
   390	
   391	// Match returns whether the Regexp matches the byte slice b.
   392	// The return value is a boolean: true for match, false for no match.
   393	func (re *Regexp) Match(b []byte) bool {
   394		return re.doExecute(nil, b, "", 0, 0) != nil
   395	}
   396	
   397	// MatchReader checks whether a textual regular expression matches the text
   398	// read by the RuneReader.  More complicated queries need to use Compile and
   399	// the full Regexp interface.
   400	func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
   401		re, err := Compile(pattern)
   402		if err != nil {
   403			return false, err
   404		}
   405		return re.MatchReader(r), nil
   406	}
   407	
   408	// MatchString checks whether a textual regular expression
   409	// matches a string.  More complicated queries need
   410	// to use Compile and the full Regexp interface.
   411	func MatchString(pattern string, s string) (matched bool, err error) {
   412		re, err := Compile(pattern)
   413		if err != nil {
   414			return false, err
   415		}
   416		return re.MatchString(s), nil
   417	}
   418	
   419	// Match checks whether a textual regular expression
   420	// matches a byte slice.  More complicated queries need
   421	// to use Compile and the full Regexp interface.
   422	func Match(pattern string, b []byte) (matched bool, err error) {
   423		re, err := Compile(pattern)
   424		if err != nil {
   425			return false, err
   426		}
   427		return re.Match(b), nil
   428	}
   429	
   430	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   431	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   432	// in Expand, so for instance $1 represents the text of the first submatch.
   433	func (re *Regexp) ReplaceAllString(src, repl string) string {
   434		n := 2
   435		if strings.Index(repl, "$") >= 0 {
   436			n = 2 * (re.numSubexp + 1)
   437		}
   438		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   439			return re.expand(dst, repl, nil, src, match)
   440		})
   441		return string(b)
   442	}
   443	
   444	// ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
   445	// with the replacement string repl.  The replacement repl is substituted directly,
   446	// without using Expand.
   447	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   448		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   449			return append(dst, repl...)
   450		}))
   451	}
   452	
   453	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   454	// Regexp have been replaced by the return value of function repl applied
   455	// to the matched substring.  The replacement returned by repl is substituted
   456	// directly, without using Expand.
   457	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   458		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   459			return append(dst, repl(src[match[0]:match[1]])...)
   460		})
   461		return string(b)
   462	}
   463	
   464	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   465		lastMatchEnd := 0 // end position of the most recent match
   466		searchPos := 0    // position where we next look for a match
   467		var buf []byte
   468		var endPos int
   469		if bsrc != nil {
   470			endPos = len(bsrc)
   471		} else {
   472			endPos = len(src)
   473		}
   474		for searchPos <= endPos {
   475			a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
   476			if len(a) == 0 {
   477				break // no more matches
   478			}
   479	
   480			// Copy the unmatched characters before this match.
   481			if bsrc != nil {
   482				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   483			} else {
   484				buf = append(buf, src[lastMatchEnd:a[0]]...)
   485			}
   486	
   487			// Now insert a copy of the replacement string, but not for a
   488			// match of the empty string immediately after another match.
   489			// (Otherwise, we get double replacement for patterns that
   490			// match both empty and nonempty strings.)
   491			if a[1] > lastMatchEnd || a[0] == 0 {
   492				buf = repl(buf, a)
   493			}
   494			lastMatchEnd = a[1]
   495	
   496			// Advance past this match; always advance at least one character.
   497			var width int
   498			if bsrc != nil {
   499				_, width = utf8.DecodeRune(bsrc[searchPos:])
   500			} else {
   501				_, width = utf8.DecodeRuneInString(src[searchPos:])
   502			}
   503			if searchPos+width > a[1] {
   504				searchPos += width
   505			} else if searchPos+1 > a[1] {
   506				// This clause is only needed at the end of the input
   507				// string.  In that case, DecodeRuneInString returns width=0.
   508				searchPos++
   509			} else {
   510				searchPos = a[1]
   511			}
   512		}
   513	
   514		// Copy the unmatched characters after the last match.
   515		if bsrc != nil {
   516			buf = append(buf, bsrc[lastMatchEnd:]...)
   517		} else {
   518			buf = append(buf, src[lastMatchEnd:]...)
   519		}
   520	
   521		return buf
   522	}
   523	
   524	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   525	// with the replacement text repl.  Inside repl, $ signs are interpreted as
   526	// in Expand, so for instance $1 represents the text of the first submatch.
   527	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   528		n := 2
   529		if bytes.IndexByte(repl, '$') >= 0 {
   530			n = 2 * (re.numSubexp + 1)
   531		}
   532		srepl := ""
   533		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   534			if len(srepl) != len(repl) {
   535				srepl = string(repl)
   536			}
   537			return re.expand(dst, srepl, src, "", match)
   538		})
   539		return b
   540	}
   541	
   542	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   543	// with the replacement bytes repl.  The replacement repl is substituted directly,
   544	// without using Expand.
   545	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   546		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   547			return append(dst, repl...)
   548		})
   549	}
   550	
   551	// ReplaceAllFunc returns a copy of src in which all matches of the
   552	// Regexp have been replaced by the return value of function repl applied
   553	// to the matched byte slice.  The replacement returned by repl is substituted
   554	// directly, without using Expand.
   555	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   556		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   557			return append(dst, repl(src[match[0]:match[1]])...)
   558		})
   559	}
   560	
   561	var specialBytes = []byte(`\.+*?()|[]{}^$`)
   562	
   563	func special(b byte) bool {
   564		return bytes.IndexByte(specialBytes, b) >= 0
   565	}
   566	
   567	// QuoteMeta returns a string that quotes all regular expression metacharacters
   568	// inside the argument text; the returned string is a regular expression matching
   569	// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
   570	func QuoteMeta(s string) string {
   571		b := make([]byte, 2*len(s))
   572	
   573		// A byte loop is correct because all metacharacters are ASCII.
   574		j := 0
   575		for i := 0; i < len(s); i++ {
   576			if special(s[i]) {
   577				b[j] = '\\'
   578				j++
   579			}
   580			b[j] = s[i]
   581			j++
   582		}
   583		return string(b[0:j])
   584	}
   585	
   586	// The number of capture values in the program may correspond
   587	// to fewer capturing expressions than are in the regexp.
   588	// For example, "(a){0}" turns into an empty program, so the
   589	// maximum capture in the program is 0 but we need to return
   590	// an expression for \1.  Pad appends -1s to the slice a as needed.
   591	func (re *Regexp) pad(a []int) []int {
   592		if a == nil {
   593			// No match.
   594			return nil
   595		}
   596		n := (1 + re.numSubexp) * 2
   597		for len(a) < n {
   598			a = append(a, -1)
   599		}
   600		return a
   601	}
   602	
   603	// Find matches in slice b if b is non-nil, otherwise find matches in string s.
   604	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   605		var end int
   606		if b == nil {
   607			end = len(s)
   608		} else {
   609			end = len(b)
   610		}
   611	
   612		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   613			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
   614			if len(matches) == 0 {
   615				break
   616			}
   617	
   618			accept := true
   619			if matches[1] == pos {
   620				// We've found an empty match.
   621				if matches[0] == prevMatchEnd {
   622					// We don't allow an empty match right
   623					// after a previous match, so ignore it.
   624					accept = false
   625				}
   626				var width int
   627				// TODO: use step()
   628				if b == nil {
   629					_, width = utf8.DecodeRuneInString(s[pos:end])
   630				} else {
   631					_, width = utf8.DecodeRune(b[pos:end])
   632				}
   633				if width > 0 {
   634					pos += width
   635				} else {
   636					pos = end + 1
   637				}
   638			} else {
   639				pos = matches[1]
   640			}
   641			prevMatchEnd = matches[1]
   642	
   643			if accept {
   644				deliver(re.pad(matches))
   645				i++
   646			}
   647		}
   648	}
   649	
   650	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   651	// A return value of nil indicates no match.
   652	func (re *Regexp) Find(b []byte) []byte {
   653		a := re.doExecute(nil, b, "", 0, 2)
   654		if a == nil {
   655			return nil
   656		}
   657		return b[a[0]:a[1]]
   658	}
   659	
   660	// FindIndex returns a two-element slice of integers defining the location of
   661	// the leftmost match in b of the regular expression.  The match itself is at
   662	// b[loc[0]:loc[1]].
   663	// A return value of nil indicates no match.
   664	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   665		a := re.doExecute(nil, b, "", 0, 2)
   666		if a == nil {
   667			return nil
   668		}
   669		return a[0:2]
   670	}
   671	
   672	// FindString returns a string holding the text of the leftmost match in s of the regular
   673	// expression.  If there is no match, the return value is an empty string,
   674	// but it will also be empty if the regular expression successfully matches
   675	// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
   676	// necessary to distinguish these cases.
   677	func (re *Regexp) FindString(s string) string {
   678		a := re.doExecute(nil, nil, s, 0, 2)
   679		if a == nil {
   680			return ""
   681		}
   682		return s[a[0]:a[1]]
   683	}
   684	
   685	// FindStringIndex returns a two-element slice of integers defining the
   686	// location of the leftmost match in s of the regular expression.  The match
   687	// itself is at s[loc[0]:loc[1]].
   688	// A return value of nil indicates no match.
   689	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   690		a := re.doExecute(nil, nil, s, 0, 2)
   691		if a == nil {
   692			return nil
   693		}
   694		return a[0:2]
   695	}
   696	
   697	// FindReaderIndex returns a two-element slice of integers defining the
   698	// location of the leftmost match of the regular expression in text read from
   699	// the RuneReader.  The match text was found in the input stream at
   700	// byte offset loc[0] through loc[1]-1.
   701	// A return value of nil indicates no match.
   702	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   703		a := re.doExecute(r, nil, "", 0, 2)
   704		if a == nil {
   705			return nil
   706		}
   707		return a[0:2]
   708	}
   709	
   710	// FindSubmatch returns a slice of slices holding the text of the leftmost
   711	// match of the regular expression in b and the matches, if any, of its
   712	// subexpressions, as defined by the 'Submatch' descriptions in the package
   713	// comment.
   714	// A return value of nil indicates no match.
   715	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   716		a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
   717		if a == nil {
   718			return nil
   719		}
   720		ret := make([][]byte, 1+re.numSubexp)
   721		for i := range ret {
   722			if 2*i < len(a) && a[2*i] >= 0 {
   723				ret[i] = b[a[2*i]:a[2*i+1]]
   724			}
   725		}
   726		return ret
   727	}
   728	
   729	// Expand appends template to dst and returns the result; during the
   730	// append, Expand replaces variables in the template with corresponding
   731	// matches drawn from src.  The match slice should have been returned by
   732	// FindSubmatchIndex.
   733	//
   734	// In the template, a variable is denoted by a substring of the form
   735	// $name or ${name}, where name is a non-empty sequence of letters,
   736	// digits, and underscores.  A purely numeric name like $1 refers to
   737	// the submatch with the corresponding index; other names refer to
   738	// capturing parentheses named with the (?P<name>...) syntax.  A
   739	// reference to an out of range or unmatched index or a name that is not
   740	// present in the regular expression is replaced with an empty slice.
   741	//
   742	// In the $name form, name is taken to be as long as possible: $1x is
   743	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   744	//
   745	// To insert a literal $ in the output, use $$ in the template.
   746	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   747		return re.expand(dst, string(template), src, "", match)
   748	}
   749	
   750	// ExpandString is like Expand but the template and source are strings.
   751	// It appends to and returns a byte slice in order to give the calling
   752	// code control over allocation.
   753	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   754		return re.expand(dst, template, nil, src, match)
   755	}
   756	
   757	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   758		for len(template) > 0 {
   759			i := strings.Index(template, "$")
   760			if i < 0 {
   761				break
   762			}
   763			dst = append(dst, template[:i]...)
   764			template = template[i:]
   765			if len(template) > 1 && template[1] == '$' {
   766				// Treat $$ as $.
   767				dst = append(dst, '$')
   768				template = template[2:]
   769				continue
   770			}
   771			name, num, rest, ok := extract(template)
   772			if !ok {
   773				// Malformed; treat $ as raw text.
   774				dst = append(dst, '$')
   775				template = template[1:]
   776				continue
   777			}
   778			template = rest
   779			if num >= 0 {
   780				if 2*num+1 < len(match) && match[2*num] >= 0 {
   781					if bsrc != nil {
   782						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   783					} else {
   784						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   785					}
   786				}
   787			} else {
   788				for i, namei := range re.subexpNames {
   789					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   790						if bsrc != nil {
   791							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   792						} else {
   793							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   794						}
   795						break
   796					}
   797				}
   798			}
   799		}
   800		dst = append(dst, template...)
   801		return dst
   802	}
   803	
   804	// extract returns the name from a leading "$name" or "${name}" in str.
   805	// If it is a number, extract returns num set to that number; otherwise num = -1.
   806	func extract(str string) (name string, num int, rest string, ok bool) {
   807		if len(str) < 2 || str[0] != '$' {
   808			return
   809		}
   810		brace := false
   811		if str[1] == '{' {
   812			brace = true
   813			str = str[2:]
   814		} else {
   815			str = str[1:]
   816		}
   817		i := 0
   818		for i < len(str) {
   819			rune, size := utf8.DecodeRuneInString(str[i:])
   820			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   821				break
   822			}
   823			i += size
   824		}
   825		if i == 0 {
   826			// empty name is not okay
   827			return
   828		}
   829		name = str[:i]
   830		if brace {
   831			if i >= len(str) || str[i] != '}' {
   832				// missing closing brace
   833				return
   834			}
   835			i++
   836		}
   837	
   838		// Parse number.
   839		num = 0
   840		for i := 0; i < len(name); i++ {
   841			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   842				num = -1
   843				break
   844			}
   845			num = num*10 + int(name[i]) - '0'
   846		}
   847		// Disallow leading zeros.
   848		if name[0] == '0' && len(name) > 1 {
   849			num = -1
   850		}
   851	
   852		rest = str[i:]
   853		ok = true
   854		return
   855	}
   856	
   857	// FindSubmatchIndex returns a slice holding the index pairs identifying the
   858	// leftmost match of the regular expression in b and the matches, if any, of
   859	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
   860	// in the package comment.
   861	// A return value of nil indicates no match.
   862	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
   863		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
   864	}
   865	
   866	// FindStringSubmatch returns a slice of strings holding the text of the
   867	// leftmost match of the regular expression in s and the matches, if any, of
   868	// its subexpressions, as defined by the 'Submatch' description in the
   869	// package comment.
   870	// A return value of nil indicates no match.
   871	func (re *Regexp) FindStringSubmatch(s string) []string {
   872		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
   873		if a == nil {
   874			return nil
   875		}
   876		ret := make([]string, 1+re.numSubexp)
   877		for i := range ret {
   878			if 2*i < len(a) && a[2*i] >= 0 {
   879				ret[i] = s[a[2*i]:a[2*i+1]]
   880			}
   881		}
   882		return ret
   883	}
   884	
   885	// FindStringSubmatchIndex returns a slice holding the index pairs
   886	// identifying the leftmost match of the regular expression in s and the
   887	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
   888	// 'Index' descriptions in the package comment.
   889	// A return value of nil indicates no match.
   890	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
   891		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
   892	}
   893	
   894	// FindReaderSubmatchIndex returns a slice holding the index pairs
   895	// identifying the leftmost match of the regular expression of text read by
   896	// the RuneReader, and the matches, if any, of its subexpressions, as defined
   897	// by the 'Submatch' and 'Index' descriptions in the package comment.  A
   898	// return value of nil indicates no match.
   899	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
   900		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
   901	}
   902	
   903	const startSize = 10 // The size at which to start a slice in the 'All' routines.
   904	
   905	// FindAll is the 'All' version of Find; it returns a slice of all successive
   906	// matches of the expression, as defined by the 'All' description in the
   907	// package comment.
   908	// A return value of nil indicates no match.
   909	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
   910		if n < 0 {
   911			n = len(b) + 1
   912		}
   913		result := make([][]byte, 0, startSize)
   914		re.allMatches("", b, n, func(match []int) {
   915			result = append(result, b[match[0]:match[1]])
   916		})
   917		if len(result) == 0 {
   918			return nil
   919		}
   920		return result
   921	}
   922	
   923	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
   924	// successive matches of the expression, as defined by the 'All' description
   925	// in the package comment.
   926	// A return value of nil indicates no match.
   927	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   928		if n < 0 {
   929			n = len(b) + 1
   930		}
   931		result := make([][]int, 0, startSize)
   932		re.allMatches("", b, n, func(match []int) {
   933			result = append(result, match[0:2])
   934		})
   935		if len(result) == 0 {
   936			return nil
   937		}
   938		return result
   939	}
   940	
   941	// FindAllString is the 'All' version of FindString; it returns a slice of all
   942	// successive matches of the expression, as defined by the 'All' description
   943	// in the package comment.
   944	// A return value of nil indicates no match.
   945	func (re *Regexp) FindAllString(s string, n int) []string {
   946		if n < 0 {
   947			n = len(s) + 1
   948		}
   949		result := make([]string, 0, startSize)
   950		re.allMatches(s, nil, n, func(match []int) {
   951			result = append(result, s[match[0]:match[1]])
   952		})
   953		if len(result) == 0 {
   954			return nil
   955		}
   956		return result
   957	}
   958	
   959	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   960	// slice of all successive matches of the expression, as defined by the 'All'
   961	// description in the package comment.
   962	// A return value of nil indicates no match.
   963	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   964		if n < 0 {
   965			n = len(s) + 1
   966		}
   967		result := make([][]int, 0, startSize)
   968		re.allMatches(s, nil, n, func(match []int) {
   969			result = append(result, match[0:2])
   970		})
   971		if len(result) == 0 {
   972			return nil
   973		}
   974		return result
   975	}
   976	
   977	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
   978	// of all successive matches of the expression, as defined by the 'All'
   979	// description in the package comment.
   980	// A return value of nil indicates no match.
   981	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   982		if n < 0 {
   983			n = len(b) + 1
   984		}
   985		result := make([][][]byte, 0, startSize)
   986		re.allMatches("", b, n, func(match []int) {
   987			slice := make([][]byte, len(match)/2)
   988			for j := range slice {
   989				if match[2*j] >= 0 {
   990					slice[j] = b[match[2*j]:match[2*j+1]]
   991				}
   992			}
   993			result = append(result, slice)
   994		})
   995		if len(result) == 0 {
   996			return nil
   997		}
   998		return result
   999	}
  1000	
  1001	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
  1002	// a slice of all successive matches of the expression, as defined by the
  1003	// 'All' description in the package comment.
  1004	// A return value of nil indicates no match.
  1005	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  1006		if n < 0 {
  1007			n = len(b) + 1
  1008		}
  1009		result := make([][]int, 0, startSize)
  1010		re.allMatches("", b, n, func(match []int) {
  1011			result = append(result, match)
  1012		})
  1013		if len(result) == 0 {
  1014			return nil
  1015		}
  1016		return result
  1017	}
  1018	
  1019	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1020	// returns a slice of all successive matches of the expression, as defined by
  1021	// the 'All' description in the package comment.
  1022	// A return value of nil indicates no match.
  1023	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1024		if n < 0 {
  1025			n = len(s) + 1
  1026		}
  1027		result := make([][]string, 0, startSize)
  1028		re.allMatches(s, nil, n, func(match []int) {
  1029			slice := make([]string, len(match)/2)
  1030			for j := range slice {
  1031				if match[2*j] >= 0 {
  1032					slice[j] = s[match[2*j]:match[2*j+1]]
  1033				}
  1034			}
  1035			result = append(result, slice)
  1036		})
  1037		if len(result) == 0 {
  1038			return nil
  1039		}
  1040		return result
  1041	}
  1042	
  1043	// FindAllStringSubmatchIndex is the 'All' version of
  1044	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1045	// the expression, as defined by the 'All' description in the package
  1046	// comment.
  1047	// A return value of nil indicates no match.
  1048	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1049		if n < 0 {
  1050			n = len(s) + 1
  1051		}
  1052		result := make([][]int, 0, startSize)
  1053		re.allMatches(s, nil, n, func(match []int) {
  1054			result = append(result, match)
  1055		})
  1056		if len(result) == 0 {
  1057			return nil
  1058		}
  1059		return result
  1060	}
  1061	
  1062	// Split slices s into substrings separated by the expression and returns a slice of
  1063	// the substrings between those expression matches.
  1064	//
  1065	// The slice returned by this method consists of all the substrings of s
  1066	// not contained in the slice returned by FindAllString. When called on an expression
  1067	// that contains no metacharacters, it is equivalent to strings.SplitN.
  1068	//
  1069	// Example:
  1070	//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  1071	//   // s: ["", "b", "b", "c", "cadaaae"]
  1072	//
  1073	// The count determines the number of substrings to return:
  1074	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
  1075	//   n == 0: the result is nil (zero substrings)
  1076	//   n < 0: all substrings
  1077	func (re *Regexp) Split(s string, n int) []string {
  1078	
  1079		if n == 0 {
  1080			return nil
  1081		}
  1082	
  1083		if len(re.expr) > 0 && len(s) == 0 {
  1084			return []string{""}
  1085		}
  1086	
  1087		matches := re.FindAllStringIndex(s, n)
  1088		strings := make([]string, 0, len(matches))
  1089	
  1090		beg := 0
  1091		end := 0
  1092		for _, match := range matches {
  1093			if n > 0 && len(strings) >= n-1 {
  1094				break
  1095			}
  1096	
  1097			end = match[0]
  1098			if match[1] != 0 {
  1099				strings = append(strings, s[beg:end])
  1100			}
  1101			beg = match[1]
  1102		}
  1103	
  1104		if end != len(s) {
  1105			strings = append(strings, s[beg:])
  1106		}
  1107	
  1108		return strings
  1109	}

View as plain text