diff options
| author | Ondřej Surý <ondrej@sury.org> | 2012-01-30 15:38:19 +0100 | 
|---|---|---|
| committer | Ondřej Surý <ondrej@sury.org> | 2012-01-30 16:31:40 +0100 | 
| commit | 3e609a8872e69b2df7447de5f308a5d4340f1cbf (patch) | |
| tree | f49969bc8ba81dd1d3869bd6747ca8b8b7d403fc /src/pkg/regexp/syntax/regexp.go | |
| parent | e0329b098725bc398083bfb04691a257d8ee6a25 (diff) | |
| download | golang-3e609a8872e69b2df7447de5f308a5d4340f1cbf.tar.gz | |
Imported Upstream version 2012.01.27
Diffstat (limited to 'src/pkg/regexp/syntax/regexp.go')
| -rw-r--r-- | src/pkg/regexp/syntax/regexp.go | 321 | 
1 files changed, 321 insertions, 0 deletions
| diff --git a/src/pkg/regexp/syntax/regexp.go b/src/pkg/regexp/syntax/regexp.go new file mode 100644 index 000000000..668a07764 --- /dev/null +++ b/src/pkg/regexp/syntax/regexp.go @@ -0,0 +1,321 @@ +// Copyright 2011 The Go Authors.  All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syntax parses regular expressions into syntax trees. +// WORK IN PROGRESS. +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( +	"bytes" +	"strconv" +	"strings" +	"unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { +	Op       Op // operator +	Flags    Flags +	Sub      []*Regexp  // subexpressions, if any +	Sub0     [1]*Regexp // storage for short Sub +	Rune     []rune     // matched runes, for OpLiteral, OpCharClass +	Rune0    [2]rune    // storage for short Rune +	Min, Max int        // min, max for OpRepeat +	Cap      int        // capturing index, for OpCapture +	Name     string     // capturing name, for OpCapture +} + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). + +const ( +	OpNoMatch        Op = 1 + iota // matches no strings +	OpEmptyMatch                   // matches empty string +	OpLiteral                      // matches Runes sequence +	OpCharClass                    // matches Runes interpreted as range pair list +	OpAnyCharNotNL                 // matches any character +	OpAnyChar                      // matches any character +	OpBeginLine                    // matches empty string at beginning of line +	OpEndLine                      // matches empty string at end of line +	OpBeginText                    // matches empty string at beginning of text +	OpEndText                      // matches empty string at end of text +	OpWordBoundary                 // matches word boundary `\b` +	OpNoWordBoundary               // matches word non-boundary `\B` +	OpCapture                      // capturing subexpression with index Cap, optional name Name +	OpStar                         // matches Sub[0] zero or more times +	OpPlus                         // matches Sub[0] one or more times +	OpQuest                        // matches Sub[0] zero or one times +	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) +	OpConcat                       // matches concatenation of Subs +	OpAlternate                    // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// Equal returns true if x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { +	if x == nil || y == nil { +		return x == y +	} +	if x.Op != y.Op { +		return false +	} +	switch x.Op { +	case OpEndText: +		// The parse flags remember whether this is \z or \Z. +		if x.Flags&WasDollar != y.Flags&WasDollar { +			return false +		} + +	case OpLiteral, OpCharClass: +		if len(x.Rune) != len(y.Rune) { +			return false +		} +		for i, r := range x.Rune { +			if r != y.Rune[i] { +				return false +			} +		} + +	case OpAlternate, OpConcat: +		if len(x.Sub) != len(y.Sub) { +			return false +		} +		for i, sub := range x.Sub { +			if !sub.Equal(y.Sub[i]) { +				return false +			} +		} + +	case OpStar, OpPlus, OpQuest: +		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { +			return false +		} + +	case OpRepeat: +		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { +			return false +		} + +	case OpCapture: +		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { +			return false +		} +	} +	return true +} + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *bytes.Buffer, re *Regexp) { +	switch re.Op { +	default: +		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") +	case OpNoMatch: +		b.WriteString(`[^\x00-\x{10FFFF}]`) +	case OpEmptyMatch: +		b.WriteString(`(?:)`) +	case OpLiteral: +		if re.Flags&FoldCase != 0 { +			b.WriteString(`(?i:`) +		} +		for _, r := range re.Rune { +			escape(b, r, false) +		} +		if re.Flags&FoldCase != 0 { +			b.WriteString(`)`) +		} +	case OpCharClass: +		if len(re.Rune)%2 != 0 { +			b.WriteString(`[invalid char class]`) +			break +		} +		b.WriteRune('[') +		if len(re.Rune) == 0 { +			b.WriteString(`^\x00-\x{10FFFF}`) +		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { +			// Contains 0 and MaxRune.  Probably a negated class. +			// Print the gaps. +			b.WriteRune('^') +			for i := 1; i < len(re.Rune)-1; i += 2 { +				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 +				escape(b, lo, lo == '-') +				if lo != hi { +					b.WriteRune('-') +					escape(b, hi, hi == '-') +				} +			} +		} else { +			for i := 0; i < len(re.Rune); i += 2 { +				lo, hi := re.Rune[i], re.Rune[i+1] +				escape(b, lo, lo == '-') +				if lo != hi { +					b.WriteRune('-') +					escape(b, hi, hi == '-') +				} +			} +		} +		b.WriteRune(']') +	case OpAnyCharNotNL: +		b.WriteString(`(?-s:.)`) +	case OpAnyChar: +		b.WriteString(`(?s:.)`) +	case OpBeginLine: +		b.WriteRune('^') +	case OpEndLine: +		b.WriteRune('$') +	case OpBeginText: +		b.WriteString(`\A`) +	case OpEndText: +		if re.Flags&WasDollar != 0 { +			b.WriteString(`(?-m:$)`) +		} else { +			b.WriteString(`\z`) +		} +	case OpWordBoundary: +		b.WriteString(`\b`) +	case OpNoWordBoundary: +		b.WriteString(`\B`) +	case OpCapture: +		if re.Name != "" { +			b.WriteString(`(?P<`) +			b.WriteString(re.Name) +			b.WriteRune('>') +		} else { +			b.WriteRune('(') +		} +		if re.Sub[0].Op != OpEmptyMatch { +			writeRegexp(b, re.Sub[0]) +		} +		b.WriteRune(')') +	case OpStar, OpPlus, OpQuest, OpRepeat: +		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { +			b.WriteString(`(?:`) +			writeRegexp(b, sub) +			b.WriteString(`)`) +		} else { +			writeRegexp(b, sub) +		} +		switch re.Op { +		case OpStar: +			b.WriteRune('*') +		case OpPlus: +			b.WriteRune('+') +		case OpQuest: +			b.WriteRune('?') +		case OpRepeat: +			b.WriteRune('{') +			b.WriteString(strconv.Itoa(re.Min)) +			if re.Max != re.Min { +				b.WriteRune(',') +				if re.Max >= 0 { +					b.WriteString(strconv.Itoa(re.Max)) +				} +			} +			b.WriteRune('}') +		} +		if re.Flags&NonGreedy != 0 { +			b.WriteRune('?') +		} +	case OpConcat: +		for _, sub := range re.Sub { +			if sub.Op == OpAlternate { +				b.WriteString(`(?:`) +				writeRegexp(b, sub) +				b.WriteString(`)`) +			} else { +				writeRegexp(b, sub) +			} +		} +	case OpAlternate: +		for i, sub := range re.Sub { +			if i > 0 { +				b.WriteRune('|') +			} +			writeRegexp(b, sub) +		} +	} +} + +func (re *Regexp) String() string { +	var b bytes.Buffer +	writeRegexp(&b, re) +	return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *bytes.Buffer, r rune, force bool) { +	if unicode.IsPrint(r) { +		if strings.IndexRune(meta, r) >= 0 || force { +			b.WriteRune('\\') +		} +		b.WriteRune(r) +		return +	} + +	switch r { +	case '\a': +		b.WriteString(`\a`) +	case '\f': +		b.WriteString(`\f`) +	case '\n': +		b.WriteString(`\n`) +	case '\r': +		b.WriteString(`\r`) +	case '\t': +		b.WriteString(`\t`) +	case '\v': +		b.WriteString(`\v`) +	default: +		if r < 0x100 { +			b.WriteString(`\x`) +			s := strconv.FormatInt(int64(r), 16) +			if len(s) == 1 { +				b.WriteRune('0') +			} +			b.WriteString(s) +			break +		} +		b.WriteString(`\x{`) +		b.WriteString(strconv.FormatInt(int64(r), 16)) +		b.WriteString(`}`) +	} +} + +// MaxCap walks the regexp to find the maximum capture index. +func (re *Regexp) MaxCap() int { +	m := 0 +	if re.Op == OpCapture { +		m = re.Cap +	} +	for _, sub := range re.Sub { +		if n := sub.MaxCap(); m < n { +			m = n +		} +	} +	return m +} + +// CapNames walks the regexp to find the names of capturing groups. +func (re *Regexp) CapNames() []string { +	names := make([]string, re.MaxCap()+1) +	re.capNames(names) +	return names +} + +func (re *Regexp) capNames(names []string) { +	if re.Op == OpCapture { +		names[re.Cap] = re.Name +	} +	for _, sub := range re.Sub { +		sub.capNames(names) +	} +} | 
