// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package syntax // Note to implementers: // In this package, re is always a *Regexp and r is always a rune. import ( "strconv" "strings" "unicode" ) // A Regexp is a node in a regular expression syntax tree. type Regexp struct { Op Op // operator Flags Flags Sub []*Regexp // subexpressions, if any Sub0 [1]*Regexp // storage for short Sub Rune []rune // matched runes, for OpLiteral, OpCharClass Rune0 [2]rune // storage for short Rune Min, Max int // min, max for OpRepeat Cap int // capturing index, for OpCapture Name string // capturing name, for OpCapture } //go:generate stringer -type Op -trimprefix Op // An Op is a single regular expression operator. type Op uint8 // Operators are listed in precedence order, tightest binding to weakest. // Character class operators are listed simplest to most complex // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). const ( OpNoMatch Op = 1 + iota // matches no strings OpEmptyMatch // matches empty string OpLiteral // matches Runes sequence OpCharClass // matches Runes interpreted as range pair list OpAnyCharNotNL // matches any character except newline OpAnyChar // matches any character OpBeginLine // matches empty string at beginning of line OpEndLine // matches empty string at end of line OpBeginText // matches empty string at beginning of text OpEndText // matches empty string at end of text OpWordBoundary // matches word boundary `\b` OpNoWordBoundary // matches word non-boundary `\B` OpCapture // capturing subexpression with index Cap, optional name Name OpStar // matches Sub[0] zero or more times OpPlus // matches Sub[0] one or more times OpQuest // matches Sub[0] zero or one times OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) OpConcat // matches concatenation of Subs OpAlternate // matches alternation of Subs ) const opPseudo Op = 128 // where pseudo-ops start // Equal reports whether x and y have identical structure. func (x *Regexp) Equal(y *Regexp) bool { if x == nil || y == nil { return x == y } if x.Op != y.Op { return false } switch x.Op { case OpEndText: // The parse flags remember whether this is \z or \Z. if x.Flags&WasDollar != y.Flags&WasDollar { return false } case OpLiteral, OpCharClass: if len(x.Rune) != len(y.Rune) { return false } for i, r := range x.Rune { if r != y.Rune[i] { return false } } case OpAlternate, OpConcat: if len(x.Sub) != len(y.Sub) { return false } for i, sub := range x.Sub { if !sub.Equal(y.Sub[i]) { return false } } case OpStar, OpPlus, OpQuest: if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { return false } case OpRepeat: if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { return false } case OpCapture: if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { return false } } return true } // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. type printFlags uint8 const ( flagI printFlags = 1 << iota // (?i: flagM // (?m: flagS // (?s: flagOff // ) flagPrec // (?: ) negShift = 5 // flagI<") case OpNoMatch: b.WriteString(`[^\x00-\x{10FFFF}]`) case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: for _, r := range re.Rune { escape(b, r, false) } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) break } b.WriteRune('[') if len(re.Rune) == 0 { b.WriteString(`^\x00-\x{10FFFF}`) } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { // Contains 0 and MaxRune. Probably a negated class. // Print the gaps. b.WriteRune('^') for i := 1; i < len(re.Rune)-1; i += 2 { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { if hi != lo+1 { b.WriteRune('-') } escape(b, hi, hi == '-') } } } else { for i := 0; i < len(re.Rune); i += 2 { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { if hi != lo+1 { b.WriteRune('-') } escape(b, hi, hi == '-') } } } b.WriteRune(']') case OpAnyCharNotNL, OpAnyChar: b.WriteString(`.`) case OpBeginLine: b.WriteString(`^`) case OpEndLine: b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { b.WriteString(`$`) } else { b.WriteString(`\z`) } case OpWordBoundary: b.WriteString(`\b`) case OpNoWordBoundary: b.WriteString(`\B`) case OpCapture: if re.Name != "" { b.WriteString(`(?P<`) b.WriteString(re.Name) b.WriteRune('>') } else { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: p := printFlags(0) sub := re.Sub[0] if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { p = flagPrec } writeRegexp(b, sub, p, flags) switch re.Op { case OpStar: b.WriteRune('*') case OpPlus: b.WriteRune('+') case OpQuest: b.WriteRune('?') case OpRepeat: b.WriteRune('{') b.WriteString(strconv.Itoa(re.Min)) if re.Max != re.Min { b.WriteRune(',') if re.Max >= 0 { b.WriteString(strconv.Itoa(re.Max)) } } b.WriteRune('}') } if re.Flags&NonGreedy != 0 { b.WriteRune('?') } case OpConcat: for _, sub := range re.Sub { p := printFlags(0) if sub.Op == OpAlternate { p = flagPrec } writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder var flags map[*Regexp]printFlags must, cant := calcFlags(re, &flags) must |= (cant &^ flagI) << negShift if must != 0 { must |= flagOff } writeRegexp(&b, re, must, flags) return b.String() } const meta = `\.+*?()|[]{}^$` func escape(b *strings.Builder, r rune, force bool) { if unicode.IsPrint(r) { if strings.ContainsRune(meta, r) || force { b.WriteRune('\\') } b.WriteRune(r) return } switch r { case '\a': b.WriteString(`\a`) case '\f': b.WriteString(`\f`) case '\n': b.WriteString(`\n`) case '\r': b.WriteString(`\r`) case '\t': b.WriteString(`\t`) case '\v': b.WriteString(`\v`) default: if r < 0x100 { b.WriteString(`\x`) s := strconv.FormatInt(int64(r), 16) if len(s) == 1 { b.WriteRune('0') } b.WriteString(s) break } b.WriteString(`\x{`) b.WriteString(strconv.FormatInt(int64(r), 16)) b.WriteString(`}`) } } // MaxCap walks the regexp to find the maximum capture index. func (re *Regexp) MaxCap() int { m := 0 if re.Op == OpCapture { m = re.Cap } for _, sub := range re.Sub { if n := sub.MaxCap(); m < n { m = n } } return m } // CapNames walks the regexp to find the names of capturing groups. func (re *Regexp) CapNames() []string { names := make([]string, re.MaxCap()+1) re.capNames(names) return names } func (re *Regexp) capNames(names []string) { if re.Op == OpCapture { names[re.Cap] = re.Name } for _, sub := range re.Sub { sub.capNames(names) } }