Source file src/regexp/syntax/regexp.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Note to implementers:
     8  // In this package, re is always a *Regexp and r is always a rune.
     9  
    10  import (
    11  	"strconv"
    12  	"strings"
    13  	"unicode"
    14  )
    15  
    16  // A Regexp is a node in a regular expression syntax tree.
    17  type Regexp struct {
    18  	Op       Op // operator
    19  	Flags    Flags
    20  	Sub      []*Regexp  // subexpressions, if any
    21  	Sub0     [1]*Regexp // storage for short Sub
    22  	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    23  	Rune0    [2]rune    // storage for short Rune
    24  	Min, Max int        // min, max for OpRepeat
    25  	Cap      int        // capturing index, for OpCapture
    26  	Name     string     // capturing name, for OpCapture
    27  }
    28  
    29  //go:generate stringer -type Op -trimprefix Op
    30  
    31  // An Op is a single regular expression operator.
    32  type Op uint8
    33  
    34  // Operators are listed in precedence order, tightest binding to weakest.
    35  // Character class operators are listed simplest to most complex
    36  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    37  
    38  const (
    39  	OpNoMatch        Op = 1 + iota // matches no strings
    40  	OpEmptyMatch                   // matches empty string
    41  	OpLiteral                      // matches Runes sequence
    42  	OpCharClass                    // matches Runes interpreted as range pair list
    43  	OpAnyCharNotNL                 // matches any character except newline
    44  	OpAnyChar                      // matches any character
    45  	OpBeginLine                    // matches empty string at beginning of line
    46  	OpEndLine                      // matches empty string at end of line
    47  	OpBeginText                    // matches empty string at beginning of text
    48  	OpEndText                      // matches empty string at end of text
    49  	OpWordBoundary                 // matches word boundary `\b`
    50  	OpNoWordBoundary               // matches word non-boundary `\B`
    51  	OpCapture                      // capturing subexpression with index Cap, optional name Name
    52  	OpStar                         // matches Sub[0] zero or more times
    53  	OpPlus                         // matches Sub[0] one or more times
    54  	OpQuest                        // matches Sub[0] zero or one times
    55  	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    56  	OpConcat                       // matches concatenation of Subs
    57  	OpAlternate                    // matches alternation of Subs
    58  )
    59  
    60  const opPseudo Op = 128 // where pseudo-ops start
    61  
    62  // Equal reports whether x and y have identical structure.
    63  func (x *Regexp) Equal(y *Regexp) bool {
    64  	if x == nil || y == nil {
    65  		return x == y
    66  	}
    67  	if x.Op != y.Op {
    68  		return false
    69  	}
    70  	switch x.Op {
    71  	case OpEndText:
    72  		// The parse flags remember whether this is \z or \Z.
    73  		if x.Flags&WasDollar != y.Flags&WasDollar {
    74  			return false
    75  		}
    76  
    77  	case OpLiteral, OpCharClass:
    78  		if len(x.Rune) != len(y.Rune) {
    79  			return false
    80  		}
    81  		for i, r := range x.Rune {
    82  			if r != y.Rune[i] {
    83  				return false
    84  			}
    85  		}
    86  
    87  	case OpAlternate, OpConcat:
    88  		if len(x.Sub) != len(y.Sub) {
    89  			return false
    90  		}
    91  		for i, sub := range x.Sub {
    92  			if !sub.Equal(y.Sub[i]) {
    93  				return false
    94  			}
    95  		}
    96  
    97  	case OpStar, OpPlus, OpQuest:
    98  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    99  			return false
   100  		}
   101  
   102  	case OpRepeat:
   103  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   104  			return false
   105  		}
   106  
   107  	case OpCapture:
   108  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   109  			return false
   110  		}
   111  	}
   112  	return true
   113  }
   114  
   115  // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
   116  type printFlags uint8
   117  
   118  const (
   119  	flagI    printFlags = 1 << iota // (?i:
   120  	flagM                           // (?m:
   121  	flagS                           // (?s:
   122  	flagOff                         // )
   123  	flagPrec                        // (?: )
   124  	negShift = 5                    // flagI<<negShift is (?-i:
   125  )
   126  
   127  // addSpan enables the flags f around start..last,
   128  // by setting flags[start] = f and flags[last] = flagOff.
   129  func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
   130  	if *flags == nil {
   131  		*flags = make(map[*Regexp]printFlags)
   132  	}
   133  	(*flags)[start] = f
   134  	(*flags)[last] |= flagOff // maybe start==last
   135  }
   136  
   137  // calcFlags calculates the flags to print around each subexpression in re,
   138  // storing that information in (*flags)[sub] for each affected subexpression.
   139  // The first time an entry needs to be written to *flags, calcFlags allocates the map.
   140  // calcFlags also calculates the flags that must be active or can't be active
   141  // around re and returns those flags.
   142  func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
   143  	switch re.Op {
   144  	default:
   145  		return 0, 0
   146  
   147  	case OpLiteral:
   148  		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
   149  		// according to whether (?i) is active.
   150  		// If literal is not fold-sensitive, return 0, 0.
   151  		for _, r := range re.Rune {
   152  			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
   153  				if re.Flags&FoldCase != 0 {
   154  					return flagI, 0
   155  				} else {
   156  					return 0, flagI
   157  				}
   158  			}
   159  		}
   160  		return 0, 0
   161  
   162  	case OpCharClass:
   163  		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
   164  		// If literal is not fold-sensitive, return 0, 0.
   165  		for i := 0; i < len(re.Rune); i += 2 {
   166  			lo := max(minFold, re.Rune[i])
   167  			hi := min(maxFold, re.Rune[i+1])
   168  			for r := lo; r <= hi; r++ {
   169  				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
   170  					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
   171  						return 0, flagI
   172  					}
   173  				}
   174  			}
   175  		}
   176  		return 0, 0
   177  
   178  	case OpAnyCharNotNL: // (?-s).
   179  		return 0, flagS
   180  
   181  	case OpAnyChar: // (?s).
   182  		return flagS, 0
   183  
   184  	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
   185  		return flagM, 0
   186  
   187  	case OpEndText:
   188  		if re.Flags&WasDollar != 0 { // (?-m)$
   189  			return 0, flagM
   190  		}
   191  		return 0, 0
   192  
   193  	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
   194  		return calcFlags(re.Sub[0], flags)
   195  
   196  	case OpConcat, OpAlternate:
   197  		// Gather the must and cant for each subexpression.
   198  		// When we find a conflicting subexpression, insert the necessary
   199  		// flags around the previously identified span and start over.
   200  		var must, cant, allCant printFlags
   201  		start := 0
   202  		last := 0
   203  		did := false
   204  		for i, sub := range re.Sub {
   205  			subMust, subCant := calcFlags(sub, flags)
   206  			if must&subCant != 0 || subMust&cant != 0 {
   207  				if must != 0 {
   208  					addSpan(re.Sub[start], re.Sub[last], must, flags)
   209  				}
   210  				must = 0
   211  				cant = 0
   212  				start = i
   213  				did = true
   214  			}
   215  			must |= subMust
   216  			cant |= subCant
   217  			allCant |= subCant
   218  			if subMust != 0 {
   219  				last = i
   220  			}
   221  			if must == 0 && start == i {
   222  				start++
   223  			}
   224  		}
   225  		if !did {
   226  			// No conflicts: pass the accumulated must and cant upward.
   227  			return must, cant
   228  		}
   229  		if must != 0 {
   230  			// Conflicts found; need to finish final span.
   231  			addSpan(re.Sub[start], re.Sub[last], must, flags)
   232  		}
   233  		return 0, allCant
   234  	}
   235  }
   236  
   237  // writeRegexp writes the Perl syntax for the regular expression re to b.
   238  func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
   239  	f |= flags[re]
   240  	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
   241  		// flagPrec is redundant with other flags being added and terminated
   242  		f &^= flagPrec
   243  	}
   244  	if f&^(flagOff|flagPrec) != 0 {
   245  		b.WriteString(`(?`)
   246  		if f&flagI != 0 {
   247  			b.WriteString(`i`)
   248  		}
   249  		if f&flagM != 0 {
   250  			b.WriteString(`m`)
   251  		}
   252  		if f&flagS != 0 {
   253  			b.WriteString(`s`)
   254  		}
   255  		if f&((flagM|flagS)<<negShift) != 0 {
   256  			b.WriteString(`-`)
   257  			if f&(flagM<<negShift) != 0 {
   258  				b.WriteString(`m`)
   259  			}
   260  			if f&(flagS<<negShift) != 0 {
   261  				b.WriteString(`s`)
   262  			}
   263  		}
   264  		b.WriteString(`:`)
   265  	}
   266  	if f&flagOff != 0 {
   267  		defer b.WriteString(`)`)
   268  	}
   269  	if f&flagPrec != 0 {
   270  		b.WriteString(`(?:`)
   271  		defer b.WriteString(`)`)
   272  	}
   273  
   274  	switch re.Op {
   275  	default:
   276  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   277  	case OpNoMatch:
   278  		b.WriteString(`[^\x00-\x{10FFFF}]`)
   279  	case OpEmptyMatch:
   280  		b.WriteString(`(?:)`)
   281  	case OpLiteral:
   282  		for _, r := range re.Rune {
   283  			escape(b, r, false)
   284  		}
   285  	case OpCharClass:
   286  		if len(re.Rune)%2 != 0 {
   287  			b.WriteString(`[invalid char class]`)
   288  			break
   289  		}
   290  		b.WriteRune('[')
   291  		if len(re.Rune) == 0 {
   292  			b.WriteString(`^\x00-\x{10FFFF}`)
   293  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
   294  			// Contains 0 and MaxRune. Probably a negated class.
   295  			// Print the gaps.
   296  			b.WriteRune('^')
   297  			for i := 1; i < len(re.Rune)-1; i += 2 {
   298  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   299  				escape(b, lo, lo == '-')
   300  				if lo != hi {
   301  					if hi != lo+1 {
   302  						b.WriteRune('-')
   303  					}
   304  					escape(b, hi, hi == '-')
   305  				}
   306  			}
   307  		} else {
   308  			for i := 0; i < len(re.Rune); i += 2 {
   309  				lo, hi := re.Rune[i], re.Rune[i+1]
   310  				escape(b, lo, lo == '-')
   311  				if lo != hi {
   312  					if hi != lo+1 {
   313  						b.WriteRune('-')
   314  					}
   315  					escape(b, hi, hi == '-')
   316  				}
   317  			}
   318  		}
   319  		b.WriteRune(']')
   320  	case OpAnyCharNotNL, OpAnyChar:
   321  		b.WriteString(`.`)
   322  	case OpBeginLine:
   323  		b.WriteString(`^`)
   324  	case OpEndLine:
   325  		b.WriteString(`$`)
   326  	case OpBeginText:
   327  		b.WriteString(`\A`)
   328  	case OpEndText:
   329  		if re.Flags&WasDollar != 0 {
   330  			b.WriteString(`$`)
   331  		} else {
   332  			b.WriteString(`\z`)
   333  		}
   334  	case OpWordBoundary:
   335  		b.WriteString(`\b`)
   336  	case OpNoWordBoundary:
   337  		b.WriteString(`\B`)
   338  	case OpCapture:
   339  		if re.Name != "" {
   340  			b.WriteString(`(?P<`)
   341  			b.WriteString(re.Name)
   342  			b.WriteRune('>')
   343  		} else {
   344  			b.WriteRune('(')
   345  		}
   346  		if re.Sub[0].Op != OpEmptyMatch {
   347  			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
   348  		}
   349  		b.WriteRune(')')
   350  	case OpStar, OpPlus, OpQuest, OpRepeat:
   351  		p := printFlags(0)
   352  		sub := re.Sub[0]
   353  		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   354  			p = flagPrec
   355  		}
   356  		writeRegexp(b, sub, p, flags)
   357  
   358  		switch re.Op {
   359  		case OpStar:
   360  			b.WriteRune('*')
   361  		case OpPlus:
   362  			b.WriteRune('+')
   363  		case OpQuest:
   364  			b.WriteRune('?')
   365  		case OpRepeat:
   366  			b.WriteRune('{')
   367  			b.WriteString(strconv.Itoa(re.Min))
   368  			if re.Max != re.Min {
   369  				b.WriteRune(',')
   370  				if re.Max >= 0 {
   371  					b.WriteString(strconv.Itoa(re.Max))
   372  				}
   373  			}
   374  			b.WriteRune('}')
   375  		}
   376  		if re.Flags&NonGreedy != 0 {
   377  			b.WriteRune('?')
   378  		}
   379  	case OpConcat:
   380  		for _, sub := range re.Sub {
   381  			p := printFlags(0)
   382  			if sub.Op == OpAlternate {
   383  				p = flagPrec
   384  			}
   385  			writeRegexp(b, sub, p, flags)
   386  		}
   387  	case OpAlternate:
   388  		for i, sub := range re.Sub {
   389  			if i > 0 {
   390  				b.WriteRune('|')
   391  			}
   392  			writeRegexp(b, sub, 0, flags)
   393  		}
   394  	}
   395  }
   396  
   397  func (re *Regexp) String() string {
   398  	var b strings.Builder
   399  	var flags map[*Regexp]printFlags
   400  	must, cant := calcFlags(re, &flags)
   401  	must |= (cant &^ flagI) << negShift
   402  	if must != 0 {
   403  		must |= flagOff
   404  	}
   405  	writeRegexp(&b, re, must, flags)
   406  	return b.String()
   407  }
   408  
   409  const meta = `\.+*?()|[]{}^$`
   410  
   411  func escape(b *strings.Builder, r rune, force bool) {
   412  	if unicode.IsPrint(r) {
   413  		if strings.ContainsRune(meta, r) || force {
   414  			b.WriteRune('\\')
   415  		}
   416  		b.WriteRune(r)
   417  		return
   418  	}
   419  
   420  	switch r {
   421  	case '\a':
   422  		b.WriteString(`\a`)
   423  	case '\f':
   424  		b.WriteString(`\f`)
   425  	case '\n':
   426  		b.WriteString(`\n`)
   427  	case '\r':
   428  		b.WriteString(`\r`)
   429  	case '\t':
   430  		b.WriteString(`\t`)
   431  	case '\v':
   432  		b.WriteString(`\v`)
   433  	default:
   434  		if r < 0x100 {
   435  			b.WriteString(`\x`)
   436  			s := strconv.FormatInt(int64(r), 16)
   437  			if len(s) == 1 {
   438  				b.WriteRune('0')
   439  			}
   440  			b.WriteString(s)
   441  			break
   442  		}
   443  		b.WriteString(`\x{`)
   444  		b.WriteString(strconv.FormatInt(int64(r), 16))
   445  		b.WriteString(`}`)
   446  	}
   447  }
   448  
   449  // MaxCap walks the regexp to find the maximum capture index.
   450  func (re *Regexp) MaxCap() int {
   451  	m := 0
   452  	if re.Op == OpCapture {
   453  		m = re.Cap
   454  	}
   455  	for _, sub := range re.Sub {
   456  		if n := sub.MaxCap(); m < n {
   457  			m = n
   458  		}
   459  	}
   460  	return m
   461  }
   462  
   463  // CapNames walks the regexp to find the names of capturing groups.
   464  func (re *Regexp) CapNames() []string {
   465  	names := make([]string, re.MaxCap()+1)
   466  	re.capNames(names)
   467  	return names
   468  }
   469  
   470  func (re *Regexp) capNames(names []string) {
   471  	if re.Op == OpCapture {
   472  		names[re.Cap] = re.Name
   473  	}
   474  	for _, sub := range re.Sub {
   475  		sub.capNames(names)
   476  	}
   477  }
   478  

View as plain text