...
Run Format

Source file src/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// http://code.google.com/p/re2/wiki/Syntax, except for \C.
    11	// For an overview of the syntax, run
    12	//   godoc regexp/syntax
    13	//
    14	// The regexp implementation provided by this package is
    15	// guaranteed to run in time linear in the size of the input.
    16	// (This is a property not guaranteed by most open source
    17	// implementations of regular expressions.) For more information
    18	// about this property, see
    19	//	http://swtch.com/~rsc/regexp/regexp1.html
    20	// or any book about automata theory.
    21	//
    22	// All characters are UTF-8-encoded code points.
    23	//
    24	// There are 16 methods of Regexp that match a regular expression and identify
    25	// the matched text.  Their names are matched by this regular expression:
    26	//
    27	//	Find(All)?(String)?(Submatch)?(Index)?
    28	//
    29	// If 'All' is present, the routine matches successive non-overlapping
    30	// matches of the entire expression.  Empty matches abutting a preceding
    31	// match are ignored.  The return value is a slice containing the successive
    32	// return values of the corresponding non-'All' routine.  These routines take
    33	// an extra integer argument, n; if n >= 0, the function returns at most n
    34	// matches/submatches.
    35	//
    36	// If 'String' is present, the argument is a string; otherwise it is a slice
    37	// of bytes; return values are adjusted as appropriate.
    38	//
    39	// If 'Submatch' is present, the return value is a slice identifying the
    40	// successive submatches of the expression. Submatches are matches of
    41	// parenthesized subexpressions (also known as capturing groups) within the
    42	// regular expression, numbered from left to right in order of opening
    43	// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
    44	// the match of the first parenthesized subexpression, and so on.
    45	//
    46	// If 'Index' is present, matches and submatches are identified by byte index
    47	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    48	// the nth submatch.  The pair for n==0 identifies the match of the entire
    49	// expression.  If 'Index' is not present, the match is identified by the
    50	// text of the match/submatch.  If an index is negative, it means that
    51	// subexpression did not match any string in the input.
    52	//
    53	// There is also a subset of the methods that can be applied to text read
    54	// from a RuneReader:
    55	//
    56	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    57	//
    58	// This set may grow.  Note that regular expression matches may need to
    59	// examine text beyond the text returned by a match, so the methods that
    60	// match text from a RuneReader may read arbitrarily far into the input
    61	// before returning.
    62	//
    63	// (There are a few other methods that do not match this pattern.)
    64	//
    65	package regexp
    66	
    67	import (
    68		"bytes"
    69		"io"
    70		"regexp/syntax"
    71		"strconv"
    72		"strings"
    73		"sync"
    74		"unicode"
    75		"unicode/utf8"
    76	)
    77	
    78	var debug = false
    79	
    80	// Regexp is the representation of a compiled regular expression.
    81	// A Regexp is safe for concurrent use by multiple goroutines.
    82	type Regexp struct {
    83		// read-only after Compile
    84		expr           string         // as passed to Compile
    85		prog           *syntax.Prog   // compiled program
    86		onepass        *onePassProg   // onpass program or nil
    87		prefix         string         // required prefix in unanchored matches
    88		prefixBytes    []byte         // prefix, as a []byte
    89		prefixComplete bool           // prefix is the entire regexp
    90		prefixRune     rune           // first rune in prefix
    91		prefixEnd      uint32         // pc for last rune in prefix
    92		cond           syntax.EmptyOp // empty-width conditions required at start of match
    93		numSubexp      int
    94		subexpNames    []string
    95		longest        bool
    96	
    97		// cache of machines for running regexp
    98		mu      sync.Mutex
    99		machine []*machine
   100	}
   101	
   102	// String returns the source text used to compile the regular expression.
   103	func (re *Regexp) String() string {
   104		return re.expr
   105	}
   106	
   107	// Compile parses a regular expression and returns, if successful,
   108	// a Regexp object that can be used to match against text.
   109	//
   110	// When matching against text, the regexp returns a match that
   111	// begins as early as possible in the input (leftmost), and among those
   112	// it chooses the one that a backtracking search would have found first.
   113	// This so-called leftmost-first matching is the same semantics
   114	// that Perl, Python, and other implementations use, although this
   115	// package implements it without the expense of backtracking.
   116	// For POSIX leftmost-longest matching, see CompilePOSIX.
   117	func Compile(expr string) (*Regexp, error) {
   118		return compile(expr, syntax.Perl, false)
   119	}
   120	
   121	// CompilePOSIX is like Compile but restricts the regular expression
   122	// to POSIX ERE (egrep) syntax and changes the match semantics to
   123	// leftmost-longest.
   124	//
   125	// That is, when matching against text, the regexp returns a match that
   126	// begins as early as possible in the input (leftmost), and among those
   127	// it chooses a match that is as long as possible.
   128	// This so-called leftmost-longest matching is the same semantics
   129	// that early regular expression implementations used and that POSIX
   130	// specifies.
   131	//
   132	// However, there can be multiple leftmost-longest matches, with different
   133	// submatch choices, and here this package diverges from POSIX.
   134	// Among the possible leftmost-longest matches, this package chooses
   135	// the one that a backtracking search would have found first, while POSIX
   136	// specifies that the match be chosen to maximize the length of the first
   137	// subexpression, then the second, and so on from left to right.
   138	// The POSIX rule is computationally prohibitive and not even well-defined.
   139	// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   140	func CompilePOSIX(expr string) (*Regexp, error) {
   141		return compile(expr, syntax.POSIX, true)
   142	}
   143	
   144	// Longest makes future searches prefer the leftmost-longest match.
   145	// That is, when matching against text, the regexp returns a match that
   146	// begins as early as possible in the input (leftmost), and among those
   147	// it chooses a match that is as long as possible.
   148	func (re *Regexp) Longest() {
   149		re.longest = true
   150	}
   151	
   152	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   153		re, err := syntax.Parse(expr, mode)
   154		if err != nil {
   155			return nil, err
   156		}
   157		maxCap := re.MaxCap()
   158		capNames := re.CapNames()
   159	
   160		re = re.Simplify()
   161		prog, err := syntax.Compile(re)
   162		if err != nil {
   163			return nil, err
   164		}
   165		regexp := &Regexp{
   166			expr:        expr,
   167			prog:        prog,
   168			onepass:     compileOnePass(prog),
   169			numSubexp:   maxCap,
   170			subexpNames: capNames,
   171			cond:        prog.StartCond(),
   172			longest:     longest,
   173		}
   174		if regexp.onepass == notOnePass {
   175			regexp.prefix, regexp.prefixComplete = prog.Prefix()
   176		} else {
   177			regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
   178		}
   179		if regexp.prefix != "" {
   180			// TODO(rsc): Remove this allocation by adding
   181			// IndexString to package bytes.
   182			regexp.prefixBytes = []byte(regexp.prefix)
   183			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   184		}
   185		return regexp, nil
   186	}
   187	
   188	// get returns a machine to use for matching re.
   189	// It uses the re's machine cache if possible, to avoid
   190	// unnecessary allocation.
   191	func (re *Regexp) get() *machine {
   192		re.mu.Lock()
   193		if n := len(re.machine); n > 0 {
   194			z := re.machine[n-1]
   195			re.machine = re.machine[:n-1]
   196			re.mu.Unlock()
   197			return z
   198		}
   199		re.mu.Unlock()
   200		z := progMachine(re.prog, re.onepass)
   201		z.re = re
   202		return z
   203	}
   204	
   205	// put returns a machine to the re's machine cache.
   206	// There is no attempt to limit the size of the cache, so it will
   207	// grow to the maximum number of simultaneous matches
   208	// run using re.  (The cache empties when re gets garbage collected.)
   209	func (re *Regexp) put(z *machine) {
   210		re.mu.Lock()
   211		re.machine = append(re.machine, z)
   212		re.mu.Unlock()
   213	}
   214	
   215	// MustCompile is like Compile but panics if the expression cannot be parsed.
   216	// It simplifies safe initialization of global variables holding compiled regular
   217	// expressions.
   218	func MustCompile(str string) *Regexp {
   219		regexp, error := Compile(str)
   220		if error != nil {
   221			panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
   222		}
   223		return regexp
   224	}
   225	
   226	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   227	// It simplifies safe initialization of global variables holding compiled regular
   228	// expressions.
   229	func MustCompilePOSIX(str string) *Regexp {
   230		regexp, error := CompilePOSIX(str)
   231		if error != nil {
   232			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
   233		}
   234		return regexp
   235	}
   236	
   237	func quote(s string) string {
   238		if strconv.CanBackquote(s) {
   239			return "`" + s + "`"
   240		}
   241		return strconv.Quote(s)
   242	}
   243	
   244	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   245	func (re *Regexp) NumSubexp() int {
   246		return re.numSubexp
   247	}
   248	
   249	// SubexpNames returns the names of the parenthesized subexpressions
   250	// in this Regexp.  The name for the first sub-expression is names[1],
   251	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   252	// Since the Regexp as a whole cannot be named, names[0] is always
   253	// the empty string.  The slice should not be modified.
   254	func (re *Regexp) SubexpNames() []string {
   255		return re.subexpNames
   256	}
   257	
   258	const endOfText rune = -1
   259	
   260	// input abstracts different representations of the input text. It provides
   261	// one-character lookahead.
   262	type input interface {
   263		step(pos int) (r rune, width int) // advance one rune
   264		canCheckPrefix() bool             // can we look ahead without losing info?
   265		hasPrefix(re *Regexp) bool
   266		index(re *Regexp, pos int) int
   267		context(pos int) syntax.EmptyOp
   268	}
   269	
   270	// inputString scans a string.
   271	type inputString struct {
   272		str string
   273	}
   274	
   275	func (i *inputString) step(pos int) (rune, int) {
   276		if pos < len(i.str) {
   277			c := i.str[pos]
   278			if c < utf8.RuneSelf {
   279				return rune(c), 1
   280			}
   281			return utf8.DecodeRuneInString(i.str[pos:])
   282		}
   283		return endOfText, 0
   284	}
   285	
   286	func (i *inputString) canCheckPrefix() bool {
   287		return true
   288	}
   289	
   290	func (i *inputString) hasPrefix(re *Regexp) bool {
   291		return strings.HasPrefix(i.str, re.prefix)
   292	}
   293	
   294	func (i *inputString) index(re *Regexp, pos int) int {
   295		return strings.Index(i.str[pos:], re.prefix)
   296	}
   297	
   298	func (i *inputString) context(pos int) syntax.EmptyOp {
   299		r1, r2 := endOfText, endOfText
   300		if pos > 0 && pos <= len(i.str) {
   301			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   302		}
   303		if pos < len(i.str) {
   304			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   305		}
   306		return syntax.EmptyOpContext(r1, r2)
   307	}
   308	
   309	// inputBytes scans a byte slice.
   310	type inputBytes struct {
   311		str []byte
   312	}
   313	
   314	func (i *inputBytes) step(pos int) (rune, int) {
   315		if pos < len(i.str) {
   316			c := i.str[pos]
   317			if c < utf8.RuneSelf {
   318				return rune(c), 1
   319			}
   320			return utf8.DecodeRune(i.str[pos:])
   321		}
   322		return endOfText, 0
   323	}
   324	
   325	func (i *inputBytes) canCheckPrefix() bool {
   326		return true
   327	}
   328	
   329	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   330		return bytes.HasPrefix(i.str, re.prefixBytes)
   331	}
   332	
   333	func (i *inputBytes) index(re *Regexp, pos int) int {
   334		return bytes.Index(i.str[pos:], re.prefixBytes)
   335	}
   336	
   337	func (i *inputBytes) context(pos int) syntax.EmptyOp {
   338		r1, r2 := endOfText, endOfText
   339		if pos > 0 && pos <= len(i.str) {
   340			r1, _ = utf8.DecodeLastRune(i.str[:pos])
   341		}
   342		if pos < len(i.str) {
   343			r2, _ = utf8.DecodeRune(i.str[pos:])
   344		}
   345		return syntax.EmptyOpContext(r1, r2)
   346	}
   347	
   348	// inputReader scans a RuneReader.
   349	type inputReader struct {
   350		r     io.RuneReader
   351		atEOT bool
   352		pos   int
   353	}
   354	
   355	func (i *inputReader) step(pos int) (rune, int) {
   356		if !i.atEOT && pos != i.pos {
   357			return endOfText, 0
   358	
   359		}
   360		r, w, err := i.r.ReadRune()
   361		if err != nil {
   362			i.atEOT = true
   363			return endOfText, 0
   364		}
   365		i.pos += w
   366		return r, w
   367	}
   368	
   369	func (i *inputReader) canCheckPrefix() bool {
   370		return false
   371	}
   372	
   373	func (i *inputReader) hasPrefix(re *Regexp) bool {
   374		return false
   375	}
   376	
   377	func (i *inputReader) index(re *Regexp, pos int) int {
   378		return -1
   379	}
   380	
   381	func (i *inputReader) context(pos int) syntax.EmptyOp {
   382		return 0
   383	}
   384	
   385	// LiteralPrefix returns a literal string that must begin any match
   386	// of the regular expression re.  It returns the boolean true if the
   387	// literal string comprises the entire regular expression.
   388	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   389		return re.prefix, re.prefixComplete
   390	}
   391	
   392	// MatchReader reports whether the Regexp matches the text read by the
   393	// RuneReader.
   394	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   395		return re.doExecute(r, nil, "", 0, 0) != nil
   396	}
   397	
   398	// MatchString reports whether the Regexp matches the string s.
   399	func (re *Regexp) MatchString(s string) bool {
   400		return re.doExecute(nil, nil, s, 0, 0) != nil
   401	}
   402	
   403	// Match reports whether the Regexp matches the byte slice b.
   404	func (re *Regexp) Match(b []byte) bool {
   405		return re.doExecute(nil, b, "", 0, 0) != nil
   406	}
   407	
   408	// MatchReader checks whether a textual regular expression matches the text
   409	// read by the RuneReader.  More complicated queries need to use Compile and
   410	// the full Regexp interface.
   411	func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
   412		re, err := Compile(pattern)
   413		if err != nil {
   414			return false, err
   415		}
   416		return re.MatchReader(r), nil
   417	}
   418	
   419	// MatchString checks whether a textual regular expression
   420	// matches a string.  More complicated queries need
   421	// to use Compile and the full Regexp interface.
   422	func MatchString(pattern string, s string) (matched bool, err error) {
   423		re, err := Compile(pattern)
   424		if err != nil {
   425			return false, err
   426		}
   427		return re.MatchString(s), nil
   428	}
   429	
   430	// Match checks whether a textual regular expression
   431	// matches a byte slice.  More complicated queries need
   432	// to use Compile and the full Regexp interface.
   433	func Match(pattern string, b []byte) (matched bool, err error) {
   434		re, err := Compile(pattern)
   435		if err != nil {
   436			return false, err
   437		}
   438		return re.Match(b), nil
   439	}
   440	
   441	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   442	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   443	// in Expand, so for instance $1 represents the text of the first submatch.
   444	func (re *Regexp) ReplaceAllString(src, repl string) string {
   445		n := 2
   446		if strings.Index(repl, "$") >= 0 {
   447			n = 2 * (re.numSubexp + 1)
   448		}
   449		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   450			return re.expand(dst, repl, nil, src, match)
   451		})
   452		return string(b)
   453	}
   454	
   455	// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
   456	// with the replacement string repl.  The replacement repl is substituted directly,
   457	// without using Expand.
   458	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   459		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   460			return append(dst, repl...)
   461		}))
   462	}
   463	
   464	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   465	// Regexp have been replaced by the return value of function repl applied
   466	// to the matched substring.  The replacement returned by repl is substituted
   467	// directly, without using Expand.
   468	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   469		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   470			return append(dst, repl(src[match[0]:match[1]])...)
   471		})
   472		return string(b)
   473	}
   474	
   475	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   476		lastMatchEnd := 0 // end position of the most recent match
   477		searchPos := 0    // position where we next look for a match
   478		var buf []byte
   479		var endPos int
   480		if bsrc != nil {
   481			endPos = len(bsrc)
   482		} else {
   483			endPos = len(src)
   484		}
   485		for searchPos <= endPos {
   486			a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
   487			if len(a) == 0 {
   488				break // no more matches
   489			}
   490	
   491			// Copy the unmatched characters before this match.
   492			if bsrc != nil {
   493				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   494			} else {
   495				buf = append(buf, src[lastMatchEnd:a[0]]...)
   496			}
   497	
   498			// Now insert a copy of the replacement string, but not for a
   499			// match of the empty string immediately after another match.
   500			// (Otherwise, we get double replacement for patterns that
   501			// match both empty and nonempty strings.)
   502			if a[1] > lastMatchEnd || a[0] == 0 {
   503				buf = repl(buf, a)
   504			}
   505			lastMatchEnd = a[1]
   506	
   507			// Advance past this match; always advance at least one character.
   508			var width int
   509			if bsrc != nil {
   510				_, width = utf8.DecodeRune(bsrc[searchPos:])
   511			} else {
   512				_, width = utf8.DecodeRuneInString(src[searchPos:])
   513			}
   514			if searchPos+width > a[1] {
   515				searchPos += width
   516			} else if searchPos+1 > a[1] {
   517				// This clause is only needed at the end of the input
   518				// string.  In that case, DecodeRuneInString returns width=0.
   519				searchPos++
   520			} else {
   521				searchPos = a[1]
   522			}
   523		}
   524	
   525		// Copy the unmatched characters after the last match.
   526		if bsrc != nil {
   527			buf = append(buf, bsrc[lastMatchEnd:]...)
   528		} else {
   529			buf = append(buf, src[lastMatchEnd:]...)
   530		}
   531	
   532		return buf
   533	}
   534	
   535	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   536	// with the replacement text repl.  Inside repl, $ signs are interpreted as
   537	// in Expand, so for instance $1 represents the text of the first submatch.
   538	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   539		n := 2
   540		if bytes.IndexByte(repl, '$') >= 0 {
   541			n = 2 * (re.numSubexp + 1)
   542		}
   543		srepl := ""
   544		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   545			if len(srepl) != len(repl) {
   546				srepl = string(repl)
   547			}
   548			return re.expand(dst, srepl, src, "", match)
   549		})
   550		return b
   551	}
   552	
   553	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   554	// with the replacement bytes repl.  The replacement repl is substituted directly,
   555	// without using Expand.
   556	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   557		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   558			return append(dst, repl...)
   559		})
   560	}
   561	
   562	// ReplaceAllFunc returns a copy of src in which all matches of the
   563	// Regexp have been replaced by the return value of function repl applied
   564	// to the matched byte slice.  The replacement returned by repl is substituted
   565	// directly, without using Expand.
   566	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   567		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   568			return append(dst, repl(src[match[0]:match[1]])...)
   569		})
   570	}
   571	
   572	var specialBytes = []byte(`\.+*?()|[]{}^$`)
   573	
   574	func special(b byte) bool {
   575		return bytes.IndexByte(specialBytes, b) >= 0
   576	}
   577	
   578	// QuoteMeta returns a string that quotes all regular expression metacharacters
   579	// inside the argument text; the returned string is a regular expression matching
   580	// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
   581	func QuoteMeta(s string) string {
   582		b := make([]byte, 2*len(s))
   583	
   584		// A byte loop is correct because all metacharacters are ASCII.
   585		j := 0
   586		for i := 0; i < len(s); i++ {
   587			if special(s[i]) {
   588				b[j] = '\\'
   589				j++
   590			}
   591			b[j] = s[i]
   592			j++
   593		}
   594		return string(b[0:j])
   595	}
   596	
   597	// The number of capture values in the program may correspond
   598	// to fewer capturing expressions than are in the regexp.
   599	// For example, "(a){0}" turns into an empty program, so the
   600	// maximum capture in the program is 0 but we need to return
   601	// an expression for \1.  Pad appends -1s to the slice a as needed.
   602	func (re *Regexp) pad(a []int) []int {
   603		if a == nil {
   604			// No match.
   605			return nil
   606		}
   607		n := (1 + re.numSubexp) * 2
   608		for len(a) < n {
   609			a = append(a, -1)
   610		}
   611		return a
   612	}
   613	
   614	// Find matches in slice b if b is non-nil, otherwise find matches in string s.
   615	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   616		var end int
   617		if b == nil {
   618			end = len(s)
   619		} else {
   620			end = len(b)
   621		}
   622	
   623		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   624			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
   625			if len(matches) == 0 {
   626				break
   627			}
   628	
   629			accept := true
   630			if matches[1] == pos {
   631				// We've found an empty match.
   632				if matches[0] == prevMatchEnd {
   633					// We don't allow an empty match right
   634					// after a previous match, so ignore it.
   635					accept = false
   636				}
   637				var width int
   638				// TODO: use step()
   639				if b == nil {
   640					_, width = utf8.DecodeRuneInString(s[pos:end])
   641				} else {
   642					_, width = utf8.DecodeRune(b[pos:end])
   643				}
   644				if width > 0 {
   645					pos += width
   646				} else {
   647					pos = end + 1
   648				}
   649			} else {
   650				pos = matches[1]
   651			}
   652			prevMatchEnd = matches[1]
   653	
   654			if accept {
   655				deliver(re.pad(matches))
   656				i++
   657			}
   658		}
   659	}
   660	
   661	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   662	// A return value of nil indicates no match.
   663	func (re *Regexp) Find(b []byte) []byte {
   664		a := re.doExecute(nil, b, "", 0, 2)
   665		if a == nil {
   666			return nil
   667		}
   668		return b[a[0]:a[1]]
   669	}
   670	
   671	// FindIndex returns a two-element slice of integers defining the location of
   672	// the leftmost match in b of the regular expression.  The match itself is at
   673	// b[loc[0]:loc[1]].
   674	// A return value of nil indicates no match.
   675	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   676		a := re.doExecute(nil, b, "", 0, 2)
   677		if a == nil {
   678			return nil
   679		}
   680		return a[0:2]
   681	}
   682	
   683	// FindString returns a string holding the text of the leftmost match in s of the regular
   684	// expression.  If there is no match, the return value is an empty string,
   685	// but it will also be empty if the regular expression successfully matches
   686	// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
   687	// necessary to distinguish these cases.
   688	func (re *Regexp) FindString(s string) string {
   689		a := re.doExecute(nil, nil, s, 0, 2)
   690		if a == nil {
   691			return ""
   692		}
   693		return s[a[0]:a[1]]
   694	}
   695	
   696	// FindStringIndex returns a two-element slice of integers defining the
   697	// location of the leftmost match in s of the regular expression.  The match
   698	// itself is at s[loc[0]:loc[1]].
   699	// A return value of nil indicates no match.
   700	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   701		a := re.doExecute(nil, nil, s, 0, 2)
   702		if a == nil {
   703			return nil
   704		}
   705		return a[0:2]
   706	}
   707	
   708	// FindReaderIndex returns a two-element slice of integers defining the
   709	// location of the leftmost match of the regular expression in text read from
   710	// the RuneReader.  The match text was found in the input stream at
   711	// byte offset loc[0] through loc[1]-1.
   712	// A return value of nil indicates no match.
   713	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   714		a := re.doExecute(r, nil, "", 0, 2)
   715		if a == nil {
   716			return nil
   717		}
   718		return a[0:2]
   719	}
   720	
   721	// FindSubmatch returns a slice of slices holding the text of the leftmost
   722	// match of the regular expression in b and the matches, if any, of its
   723	// subexpressions, as defined by the 'Submatch' descriptions in the package
   724	// comment.
   725	// A return value of nil indicates no match.
   726	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   727		a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
   728		if a == nil {
   729			return nil
   730		}
   731		ret := make([][]byte, 1+re.numSubexp)
   732		for i := range ret {
   733			if 2*i < len(a) && a[2*i] >= 0 {
   734				ret[i] = b[a[2*i]:a[2*i+1]]
   735			}
   736		}
   737		return ret
   738	}
   739	
   740	// Expand appends template to dst and returns the result; during the
   741	// append, Expand replaces variables in the template with corresponding
   742	// matches drawn from src.  The match slice should have been returned by
   743	// FindSubmatchIndex.
   744	//
   745	// In the template, a variable is denoted by a substring of the form
   746	// $name or ${name}, where name is a non-empty sequence of letters,
   747	// digits, and underscores.  A purely numeric name like $1 refers to
   748	// the submatch with the corresponding index; other names refer to
   749	// capturing parentheses named with the (?P<name>...) syntax.  A
   750	// reference to an out of range or unmatched index or a name that is not
   751	// present in the regular expression is replaced with an empty slice.
   752	//
   753	// In the $name form, name is taken to be as long as possible: $1x is
   754	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   755	//
   756	// To insert a literal $ in the output, use $$ in the template.
   757	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   758		return re.expand(dst, string(template), src, "", match)
   759	}
   760	
   761	// ExpandString is like Expand but the template and source are strings.
   762	// It appends to and returns a byte slice in order to give the calling
   763	// code control over allocation.
   764	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   765		return re.expand(dst, template, nil, src, match)
   766	}
   767	
   768	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   769		for len(template) > 0 {
   770			i := strings.Index(template, "$")
   771			if i < 0 {
   772				break
   773			}
   774			dst = append(dst, template[:i]...)
   775			template = template[i:]
   776			if len(template) > 1 && template[1] == '$' {
   777				// Treat $$ as $.
   778				dst = append(dst, '$')
   779				template = template[2:]
   780				continue
   781			}
   782			name, num, rest, ok := extract(template)
   783			if !ok {
   784				// Malformed; treat $ as raw text.
   785				dst = append(dst, '$')
   786				template = template[1:]
   787				continue
   788			}
   789			template = rest
   790			if num >= 0 {
   791				if 2*num+1 < len(match) && match[2*num] >= 0 {
   792					if bsrc != nil {
   793						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   794					} else {
   795						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   796					}
   797				}
   798			} else {
   799				for i, namei := range re.subexpNames {
   800					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   801						if bsrc != nil {
   802							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   803						} else {
   804							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   805						}
   806						break
   807					}
   808				}
   809			}
   810		}
   811		dst = append(dst, template...)
   812		return dst
   813	}
   814	
   815	// extract returns the name from a leading "$name" or "${name}" in str.
   816	// If it is a number, extract returns num set to that number; otherwise num = -1.
   817	func extract(str string) (name string, num int, rest string, ok bool) {
   818		if len(str) < 2 || str[0] != '$' {
   819			return
   820		}
   821		brace := false
   822		if str[1] == '{' {
   823			brace = true
   824			str = str[2:]
   825		} else {
   826			str = str[1:]
   827		}
   828		i := 0
   829		for i < len(str) {
   830			rune, size := utf8.DecodeRuneInString(str[i:])
   831			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   832				break
   833			}
   834			i += size
   835		}
   836		if i == 0 {
   837			// empty name is not okay
   838			return
   839		}
   840		name = str[:i]
   841		if brace {
   842			if i >= len(str) || str[i] != '}' {
   843				// missing closing brace
   844				return
   845			}
   846			i++
   847		}
   848	
   849		// Parse number.
   850		num = 0
   851		for i := 0; i < len(name); i++ {
   852			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   853				num = -1
   854				break
   855			}
   856			num = num*10 + int(name[i]) - '0'
   857		}
   858		// Disallow leading zeros.
   859		if name[0] == '0' && len(name) > 1 {
   860			num = -1
   861		}
   862	
   863		rest = str[i:]
   864		ok = true
   865		return
   866	}
   867	
   868	// FindSubmatchIndex returns a slice holding the index pairs identifying the
   869	// leftmost match of the regular expression in b and the matches, if any, of
   870	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
   871	// in the package comment.
   872	// A return value of nil indicates no match.
   873	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
   874		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
   875	}
   876	
   877	// FindStringSubmatch returns a slice of strings holding the text of the
   878	// leftmost match of the regular expression in s and the matches, if any, of
   879	// its subexpressions, as defined by the 'Submatch' description in the
   880	// package comment.
   881	// A return value of nil indicates no match.
   882	func (re *Regexp) FindStringSubmatch(s string) []string {
   883		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
   884		if a == nil {
   885			return nil
   886		}
   887		ret := make([]string, 1+re.numSubexp)
   888		for i := range ret {
   889			if 2*i < len(a) && a[2*i] >= 0 {
   890				ret[i] = s[a[2*i]:a[2*i+1]]
   891			}
   892		}
   893		return ret
   894	}
   895	
   896	// FindStringSubmatchIndex returns a slice holding the index pairs
   897	// identifying the leftmost match of the regular expression in s and the
   898	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
   899	// 'Index' descriptions in the package comment.
   900	// A return value of nil indicates no match.
   901	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
   902		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
   903	}
   904	
   905	// FindReaderSubmatchIndex returns a slice holding the index pairs
   906	// identifying the leftmost match of the regular expression of text read by
   907	// the RuneReader, and the matches, if any, of its subexpressions, as defined
   908	// by the 'Submatch' and 'Index' descriptions in the package comment.  A
   909	// return value of nil indicates no match.
   910	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
   911		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
   912	}
   913	
   914	const startSize = 10 // The size at which to start a slice in the 'All' routines.
   915	
   916	// FindAll is the 'All' version of Find; it returns a slice of all successive
   917	// matches of the expression, as defined by the 'All' description in the
   918	// package comment.
   919	// A return value of nil indicates no match.
   920	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
   921		if n < 0 {
   922			n = len(b) + 1
   923		}
   924		result := make([][]byte, 0, startSize)
   925		re.allMatches("", b, n, func(match []int) {
   926			result = append(result, b[match[0]:match[1]])
   927		})
   928		if len(result) == 0 {
   929			return nil
   930		}
   931		return result
   932	}
   933	
   934	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
   935	// successive matches of the expression, as defined by the 'All' description
   936	// in the package comment.
   937	// A return value of nil indicates no match.
   938	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   939		if n < 0 {
   940			n = len(b) + 1
   941		}
   942		result := make([][]int, 0, startSize)
   943		re.allMatches("", b, n, func(match []int) {
   944			result = append(result, match[0:2])
   945		})
   946		if len(result) == 0 {
   947			return nil
   948		}
   949		return result
   950	}
   951	
   952	// FindAllString is the 'All' version of FindString; it returns a slice of all
   953	// successive matches of the expression, as defined by the 'All' description
   954	// in the package comment.
   955	// A return value of nil indicates no match.
   956	func (re *Regexp) FindAllString(s string, n int) []string {
   957		if n < 0 {
   958			n = len(s) + 1
   959		}
   960		result := make([]string, 0, startSize)
   961		re.allMatches(s, nil, n, func(match []int) {
   962			result = append(result, s[match[0]:match[1]])
   963		})
   964		if len(result) == 0 {
   965			return nil
   966		}
   967		return result
   968	}
   969	
   970	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   971	// slice of all successive matches of the expression, as defined by the 'All'
   972	// description in the package comment.
   973	// A return value of nil indicates no match.
   974	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   975		if n < 0 {
   976			n = len(s) + 1
   977		}
   978		result := make([][]int, 0, startSize)
   979		re.allMatches(s, nil, n, func(match []int) {
   980			result = append(result, match[0:2])
   981		})
   982		if len(result) == 0 {
   983			return nil
   984		}
   985		return result
   986	}
   987	
   988	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
   989	// of all successive matches of the expression, as defined by the 'All'
   990	// description in the package comment.
   991	// A return value of nil indicates no match.
   992	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   993		if n < 0 {
   994			n = len(b) + 1
   995		}
   996		result := make([][][]byte, 0, startSize)
   997		re.allMatches("", b, n, func(match []int) {
   998			slice := make([][]byte, len(match)/2)
   999			for j := range slice {
  1000				if match[2*j] >= 0 {
  1001					slice[j] = b[match[2*j]:match[2*j+1]]
  1002				}
  1003			}
  1004			result = append(result, slice)
  1005		})
  1006		if len(result) == 0 {
  1007			return nil
  1008		}
  1009		return result
  1010	}
  1011	
  1012	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
  1013	// a slice of all successive matches of the expression, as defined by the
  1014	// 'All' description in the package comment.
  1015	// A return value of nil indicates no match.
  1016	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  1017		if n < 0 {
  1018			n = len(b) + 1
  1019		}
  1020		result := make([][]int, 0, startSize)
  1021		re.allMatches("", b, n, func(match []int) {
  1022			result = append(result, match)
  1023		})
  1024		if len(result) == 0 {
  1025			return nil
  1026		}
  1027		return result
  1028	}
  1029	
  1030	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1031	// returns a slice of all successive matches of the expression, as defined by
  1032	// the 'All' description in the package comment.
  1033	// A return value of nil indicates no match.
  1034	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1035		if n < 0 {
  1036			n = len(s) + 1
  1037		}
  1038		result := make([][]string, 0, startSize)
  1039		re.allMatches(s, nil, n, func(match []int) {
  1040			slice := make([]string, len(match)/2)
  1041			for j := range slice {
  1042				if match[2*j] >= 0 {
  1043					slice[j] = s[match[2*j]:match[2*j+1]]
  1044				}
  1045			}
  1046			result = append(result, slice)
  1047		})
  1048		if len(result) == 0 {
  1049			return nil
  1050		}
  1051		return result
  1052	}
  1053	
  1054	// FindAllStringSubmatchIndex is the 'All' version of
  1055	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1056	// the expression, as defined by the 'All' description in the package
  1057	// comment.
  1058	// A return value of nil indicates no match.
  1059	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1060		if n < 0 {
  1061			n = len(s) + 1
  1062		}
  1063		result := make([][]int, 0, startSize)
  1064		re.allMatches(s, nil, n, func(match []int) {
  1065			result = append(result, match)
  1066		})
  1067		if len(result) == 0 {
  1068			return nil
  1069		}
  1070		return result
  1071	}
  1072	
  1073	// Split slices s into substrings separated by the expression and returns a slice of
  1074	// the substrings between those expression matches.
  1075	//
  1076	// The slice returned by this method consists of all the substrings of s
  1077	// not contained in the slice returned by FindAllString. When called on an expression
  1078	// that contains no metacharacters, it is equivalent to strings.SplitN.
  1079	//
  1080	// Example:
  1081	//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  1082	//   // s: ["", "b", "b", "c", "cadaaae"]
  1083	//
  1084	// The count determines the number of substrings to return:
  1085	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
  1086	//   n == 0: the result is nil (zero substrings)
  1087	//   n < 0: all substrings
  1088	func (re *Regexp) Split(s string, n int) []string {
  1089	
  1090		if n == 0 {
  1091			return nil
  1092		}
  1093	
  1094		if len(re.expr) > 0 && len(s) == 0 {
  1095			return []string{""}
  1096		}
  1097	
  1098		matches := re.FindAllStringIndex(s, n)
  1099		strings := make([]string, 0, len(matches))
  1100	
  1101		beg := 0
  1102		end := 0
  1103		for _, match := range matches {
  1104			if n > 0 && len(strings) >= n-1 {
  1105				break
  1106			}
  1107	
  1108			end = match[0]
  1109			if match[1] != 0 {
  1110				strings = append(strings, s[beg:end])
  1111			}
  1112			beg = match[1]
  1113		}
  1114	
  1115		if end != len(s) {
  1116			strings = append(strings, s[beg:])
  1117		}
  1118	
  1119		return strings
  1120	}
  1121	

View as plain text