summaryrefslogtreecommitdiff
path: root/src/pkg/regexp/syntax
diff options
context:
space:
mode:
authorTianon Gravi <admwiggin@gmail.com>2015-01-15 11:54:00 -0700
committerTianon Gravi <admwiggin@gmail.com>2015-01-15 11:54:00 -0700
commitf154da9e12608589e8d5f0508f908a0c3e88a1bb (patch)
treef8255d51e10c6f1e0ed69702200b966c9556a431 /src/pkg/regexp/syntax
parent8d8329ed5dfb9622c82a9fbec6fd99a580f9c9f6 (diff)
downloadgolang-upstream/1.4.tar.gz
Imported Upstream version 1.4upstream/1.4
Diffstat (limited to 'src/pkg/regexp/syntax')
-rw-r--r--src/pkg/regexp/syntax/compile.go289
-rw-r--r--src/pkg/regexp/syntax/doc.go131
-rwxr-xr-xsrc/pkg/regexp/syntax/make_perl_groups.pl107
-rw-r--r--src/pkg/regexp/syntax/parse.go1863
-rw-r--r--src/pkg/regexp/syntax/parse_test.go559
-rw-r--r--src/pkg/regexp/syntax/perl_groups.go134
-rw-r--r--src/pkg/regexp/syntax/prog.go345
-rw-r--r--src/pkg/regexp/syntax/prog_test.go114
-rw-r--r--src/pkg/regexp/syntax/regexp.go319
-rw-r--r--src/pkg/regexp/syntax/simplify.go151
-rw-r--r--src/pkg/regexp/syntax/simplify_test.go151
11 files changed, 0 insertions, 4163 deletions
diff --git a/src/pkg/regexp/syntax/compile.go b/src/pkg/regexp/syntax/compile.go
deleted file mode 100644
index 95f6f1569..000000000
--- a/src/pkg/regexp/syntax/compile.go
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import "unicode"
-
-// A patchList is a list of instruction pointers that need to be filled in (patched).
-// Because the pointers haven't been filled in yet, we can reuse their storage
-// to hold the list. It's kind of sleazy, but works well in practice.
-// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
-//
-// These aren't really pointers: they're integers, so we can reinterpret them
-// this way without using package unsafe. A value l denotes
-// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
-// l == 0 denotes the empty list, okay because we start every program
-// with a fail instruction, so we'll never want to point at its output link.
-type patchList uint32
-
-func (l patchList) next(p *Prog) patchList {
- i := &p.Inst[l>>1]
- if l&1 == 0 {
- return patchList(i.Out)
- }
- return patchList(i.Arg)
-}
-
-func (l patchList) patch(p *Prog, val uint32) {
- for l != 0 {
- i := &p.Inst[l>>1]
- if l&1 == 0 {
- l = patchList(i.Out)
- i.Out = val
- } else {
- l = patchList(i.Arg)
- i.Arg = val
- }
- }
-}
-
-func (l1 patchList) append(p *Prog, l2 patchList) patchList {
- if l1 == 0 {
- return l2
- }
- if l2 == 0 {
- return l1
- }
-
- last := l1
- for {
- next := last.next(p)
- if next == 0 {
- break
- }
- last = next
- }
-
- i := &p.Inst[last>>1]
- if last&1 == 0 {
- i.Out = uint32(l2)
- } else {
- i.Arg = uint32(l2)
- }
- return l1
-}
-
-// A frag represents a compiled program fragment.
-type frag struct {
- i uint32 // index of first instruction
- out patchList // where to record end instruction
-}
-
-type compiler struct {
- p *Prog
-}
-
-// Compile compiles the regexp into a program to be executed.
-// The regexp should have been simplified already (returned from re.Simplify).
-func Compile(re *Regexp) (*Prog, error) {
- var c compiler
- c.init()
- f := c.compile(re)
- f.out.patch(c.p, c.inst(InstMatch).i)
- c.p.Start = int(f.i)
- return c.p, nil
-}
-
-func (c *compiler) init() {
- c.p = new(Prog)
- c.p.NumCap = 2 // implicit ( and ) for whole match $0
- c.inst(InstFail)
-}
-
-var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
-var anyRune = []rune{0, unicode.MaxRune}
-
-func (c *compiler) compile(re *Regexp) frag {
- switch re.Op {
- case OpNoMatch:
- return c.fail()
- case OpEmptyMatch:
- return c.nop()
- case OpLiteral:
- if len(re.Rune) == 0 {
- return c.nop()
- }
- var f frag
- for j := range re.Rune {
- f1 := c.rune(re.Rune[j:j+1], re.Flags)
- if j == 0 {
- f = f1
- } else {
- f = c.cat(f, f1)
- }
- }
- return f
- case OpCharClass:
- return c.rune(re.Rune, re.Flags)
- case OpAnyCharNotNL:
- return c.rune(anyRuneNotNL, 0)
- case OpAnyChar:
- return c.rune(anyRune, 0)
- case OpBeginLine:
- return c.empty(EmptyBeginLine)
- case OpEndLine:
- return c.empty(EmptyEndLine)
- case OpBeginText:
- return c.empty(EmptyBeginText)
- case OpEndText:
- return c.empty(EmptyEndText)
- case OpWordBoundary:
- return c.empty(EmptyWordBoundary)
- case OpNoWordBoundary:
- return c.empty(EmptyNoWordBoundary)
- case OpCapture:
- bra := c.cap(uint32(re.Cap << 1))
- sub := c.compile(re.Sub[0])
- ket := c.cap(uint32(re.Cap<<1 | 1))
- return c.cat(c.cat(bra, sub), ket)
- case OpStar:
- return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
- case OpPlus:
- return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
- case OpQuest:
- return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
- case OpConcat:
- if len(re.Sub) == 0 {
- return c.nop()
- }
- var f frag
- for i, sub := range re.Sub {
- if i == 0 {
- f = c.compile(sub)
- } else {
- f = c.cat(f, c.compile(sub))
- }
- }
- return f
- case OpAlternate:
- var f frag
- for _, sub := range re.Sub {
- f = c.alt(f, c.compile(sub))
- }
- return f
- }
- panic("regexp: unhandled case in compile")
-}
-
-func (c *compiler) inst(op InstOp) frag {
- // TODO: impose length limit
- f := frag{i: uint32(len(c.p.Inst))}
- c.p.Inst = append(c.p.Inst, Inst{Op: op})
- return f
-}
-
-func (c *compiler) nop() frag {
- f := c.inst(InstNop)
- f.out = patchList(f.i << 1)
- return f
-}
-
-func (c *compiler) fail() frag {
- return frag{}
-}
-
-func (c *compiler) cap(arg uint32) frag {
- f := c.inst(InstCapture)
- f.out = patchList(f.i << 1)
- c.p.Inst[f.i].Arg = arg
-
- if c.p.NumCap < int(arg)+1 {
- c.p.NumCap = int(arg) + 1
- }
- return f
-}
-
-func (c *compiler) cat(f1, f2 frag) frag {
- // concat of failure is failure
- if f1.i == 0 || f2.i == 0 {
- return frag{}
- }
-
- // TODO: elide nop
-
- f1.out.patch(c.p, f2.i)
- return frag{f1.i, f2.out}
-}
-
-func (c *compiler) alt(f1, f2 frag) frag {
- // alt of failure is other
- if f1.i == 0 {
- return f2
- }
- if f2.i == 0 {
- return f1
- }
-
- f := c.inst(InstAlt)
- i := &c.p.Inst[f.i]
- i.Out = f1.i
- i.Arg = f2.i
- f.out = f1.out.append(c.p, f2.out)
- return f
-}
-
-func (c *compiler) quest(f1 frag, nongreedy bool) frag {
- f := c.inst(InstAlt)
- i := &c.p.Inst[f.i]
- if nongreedy {
- i.Arg = f1.i
- f.out = patchList(f.i << 1)
- } else {
- i.Out = f1.i
- f.out = patchList(f.i<<1 | 1)
- }
- f.out = f.out.append(c.p, f1.out)
- return f
-}
-
-func (c *compiler) star(f1 frag, nongreedy bool) frag {
- f := c.inst(InstAlt)
- i := &c.p.Inst[f.i]
- if nongreedy {
- i.Arg = f1.i
- f.out = patchList(f.i << 1)
- } else {
- i.Out = f1.i
- f.out = patchList(f.i<<1 | 1)
- }
- f1.out.patch(c.p, f.i)
- return f
-}
-
-func (c *compiler) plus(f1 frag, nongreedy bool) frag {
- return frag{f1.i, c.star(f1, nongreedy).out}
-}
-
-func (c *compiler) empty(op EmptyOp) frag {
- f := c.inst(InstEmptyWidth)
- c.p.Inst[f.i].Arg = uint32(op)
- f.out = patchList(f.i << 1)
- return f
-}
-
-func (c *compiler) rune(r []rune, flags Flags) frag {
- f := c.inst(InstRune)
- i := &c.p.Inst[f.i]
- i.Rune = r
- flags &= FoldCase // only relevant flag is FoldCase
- if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
- // and sometimes not even that
- flags &^= FoldCase
- }
- i.Arg = uint32(flags)
- f.out = patchList(f.i << 1)
-
- // Special cases for exec machine.
- switch {
- case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
- i.Op = InstRune1
- case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
- i.Op = InstRuneAny
- case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
- i.Op = InstRuneAnyNotNL
- }
-
- return f
-}
diff --git a/src/pkg/regexp/syntax/doc.go b/src/pkg/regexp/syntax/doc.go
deleted file mode 100644
index 8e72c90d3..000000000
--- a/src/pkg/regexp/syntax/doc.go
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
-
-/*
-Package syntax parses regular expressions into parse trees and compiles
-parse trees into programs. Most clients of regular expressions will use the
-facilities of package regexp (such as Compile and Match) instead of this package.
-
-Syntax
-
-The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
-Parts of the syntax can be disabled by passing alternate flags to Parse.
-
-
-Single characters:
- . any character, possibly including newline (flag s=true)
- [xyz] character class
- [^xyz] negated character class
- \d Perl character class
- \D negated Perl character class
- [:alpha:] ASCII character class
- [:^alpha:] negated ASCII character class
- \pN Unicode character class (one-letter name)
- \p{Greek} Unicode character class
- \PN negated Unicode character class (one-letter name)
- \P{Greek} negated Unicode character class
-
-Composites:
- xy x followed by y
- x|y x or y (prefer x)
-
-Repetitions:
- x* zero or more x, prefer more
- x+ one or more x, prefer more
- x? zero or one x, prefer one
- x{n,m} n or n+1 or ... or m x, prefer more
- x{n,} n or more x, prefer more
- x{n} exactly n x
- x*? zero or more x, prefer fewer
- x+? one or more x, prefer fewer
- x?? zero or one x, prefer zero
- x{n,m}? n or n+1 or ... or m x, prefer fewer
- x{n,}? n or more x, prefer fewer
- x{n}? exactly n x
-
-Implementation restriction: The counting forms x{n} etc. (but not the other
-forms x* etc.) have an upper limit of n=1000. Negative or higher explicit
-counts yield the parse error ErrInvalidRepeatSize.
-
-Grouping:
- (re) numbered capturing group (submatch)
- (?P<name>re) named & numbered capturing group (submatch)
- (?:re) non-capturing group (submatch)
- (?flags) set flags within current group; non-capturing
- (?flags:re) set flags during re; non-capturing
-
- Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
-
- i case-insensitive (default false)
- m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
- s let . match \n (default false)
- U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
-
-Empty strings:
- ^ at beginning of text or line (flag m=true)
- $ at end of text (like \z not \Z) or line (flag m=true)
- \A at beginning of text
- \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
- \B not an ASCII word boundary
- \z at end of text
-
-Escape sequences:
- \a bell (== \007)
- \f form feed (== \014)
- \t horizontal tab (== \011)
- \n newline (== \012)
- \r carriage return (== \015)
- \v vertical tab character (== \013)
- \* literal *, for any punctuation character *
- \123 octal character code (up to three digits)
- \x7F hex character code (exactly two digits)
- \x{10FFFF} hex character code
- \Q...\E literal text ... even if ... has punctuation
-
-Character class elements:
- x single character
- A-Z character range (inclusive)
- \d Perl character class
- [:foo:] ASCII character class foo
- \p{Foo} Unicode character class Foo
- \pF Unicode character class F (one-letter name)
-
-Named character classes as character class elements:
- [\d] digits (== \d)
- [^\d] not digits (== \D)
- [\D] not digits (== \D)
- [^\D] not not digits (== \d)
- [[:name:]] named ASCII class inside character class (== [:name:])
- [^[:name:]] named ASCII class inside negated character class (== [:^name:])
- [\p{Name}] named Unicode property inside character class (== \p{Name})
- [^\p{Name}] named Unicode property inside negated character class (== \P{Name})
-
-Perl character classes:
- \d digits (== [0-9])
- \D not digits (== [^0-9])
- \s whitespace (== [\t\n\f\r ])
- \S not whitespace (== [^\t\n\f\r ])
- \w ASCII word characters (== [0-9A-Za-z_])
- \W not ASCII word characters (== [^0-9A-Za-z_])
-
-ASCII character classes:
- [:alnum:] alphanumeric (== [0-9A-Za-z])
- [:alpha:] alphabetic (== [A-Za-z])
- [:ascii:] ASCII (== [\x00-\x7F])
- [:blank:] blank (== [\t ])
- [:cntrl:] control (== [\x00-\x1F\x7F])
- [:digit:] digits (== [0-9])
- [:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
- [:lower:] lower case (== [a-z])
- [:print:] printable (== [ -~] == [ [:graph:]])
- [:punct:] punctuation (== [!-/:-@[-`{-~])
- [:space:] whitespace (== [\t\n\v\f\r ])
- [:upper:] upper case (== [A-Z])
- [:word:] word characters (== [0-9A-Za-z_])
- [:xdigit:] hex digit (== [0-9A-Fa-f])
-
-*/
-package syntax
diff --git a/src/pkg/regexp/syntax/make_perl_groups.pl b/src/pkg/regexp/syntax/make_perl_groups.pl
deleted file mode 100755
index 90040fcb4..000000000
--- a/src/pkg/regexp/syntax/make_perl_groups.pl
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/perl
-# Copyright 2008 The Go Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Modified version of RE2's make_perl_groups.pl.
-
-# Generate table entries giving character ranges
-# for POSIX/Perl character classes. Rather than
-# figure out what the definition is, it is easier to ask
-# Perl about each letter from 0-128 and write down
-# its answer.
-
-@posixclasses = (
- "[:alnum:]",
- "[:alpha:]",
- "[:ascii:]",
- "[:blank:]",
- "[:cntrl:]",
- "[:digit:]",
- "[:graph:]",
- "[:lower:]",
- "[:print:]",
- "[:punct:]",
- "[:space:]",
- "[:upper:]",
- "[:word:]",
- "[:xdigit:]",
-);
-
-@perlclasses = (
- "\\d",
- "\\s",
- "\\w",
-);
-
-sub ComputeClass($) {
- my @ranges;
- my ($class) = @_;
- my $regexp = "[$class]";
- my $start = -1;
- for (my $i=0; $i<=129; $i++) {
- if ($i == 129) { $i = 256; }
- if ($i <= 128 && chr($i) =~ $regexp) {
- if ($start < 0) {
- $start = $i;
- }
- } else {
- if ($start >= 0) {
- push @ranges, [$start, $i-1];
- }
- $start = -1;
- }
- }
- return @ranges;
-}
-
-sub PrintClass($$@) {
- my ($cname, $name, @ranges) = @_;
- print "var code$cname = []rune{ /* $name */\n";
- for (my $i=0; $i<@ranges; $i++) {
- my @a = @{$ranges[$i]};
- printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
- }
- print "}\n\n";
- my $n = @ranges;
- $negname = $name;
- if ($negname =~ /:/) {
- $negname =~ s/:/:^/;
- } else {
- $negname =~ y/a-z/A-Z/;
- }
- return "\t`$name`: {+1, code$cname},\n" .
- "\t`$negname`: {-1, code$cname},\n";
-}
-
-my $gen = 0;
-
-sub PrintClasses($@) {
- my ($cname, @classes) = @_;
- my @entries;
- foreach my $cl (@classes) {
- my @ranges = ComputeClass($cl);
- push @entries, PrintClass(++$gen, $cl, @ranges);
- }
- print "var ${cname}Group = map[string]charGroup{\n";
- foreach my $e (@entries) {
- print $e;
- }
- print "}\n";
- my $count = @entries;
-}
-
-print <<EOF;
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
-// make_perl_groups.pl >perl_groups.go
-
-package syntax
-
-EOF
-
-PrintClasses("perl", @perlclasses);
-PrintClasses("posix", @posixclasses);
diff --git a/src/pkg/regexp/syntax/parse.go b/src/pkg/regexp/syntax/parse.go
deleted file mode 100644
index cb25dca39..000000000
--- a/src/pkg/regexp/syntax/parse.go
+++ /dev/null
@@ -1,1863 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import (
- "sort"
- "strings"
- "unicode"
- "unicode/utf8"
-)
-
-// An Error describes a failure to parse a regular expression
-// and gives the offending expression.
-type Error struct {
- Code ErrorCode
- Expr string
-}
-
-func (e *Error) Error() string {
- return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
-}
-
-// An ErrorCode describes a failure to parse a regular expression.
-type ErrorCode string
-
-const (
- // Unexpected error
- ErrInternalError ErrorCode = "regexp/syntax: internal error"
-
- // Parse errors
- ErrInvalidCharClass ErrorCode = "invalid character class"
- ErrInvalidCharRange ErrorCode = "invalid character class range"
- ErrInvalidEscape ErrorCode = "invalid escape sequence"
- ErrInvalidNamedCapture ErrorCode = "invalid named capture"
- ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
- ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
- ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
- ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
- ErrMissingBracket ErrorCode = "missing closing ]"
- ErrMissingParen ErrorCode = "missing closing )"
- ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
- ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
- ErrUnexpectedParen ErrorCode = "unexpected )"
-)
-
-func (e ErrorCode) String() string {
- return string(e)
-}
-
-// Flags control the behavior of the parser and record information about regexp context.
-type Flags uint16
-
-const (
- FoldCase Flags = 1 << iota // case-insensitive match
- Literal // treat pattern as literal string
- ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
- DotNL // allow . to match newline
- OneLine // treat ^ and $ as only matching at beginning and end of text
- NonGreedy // make repetition operators default to non-greedy
- PerlX // allow Perl extensions
- UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
- WasDollar // regexp OpEndText was $, not \z
- Simple // regexp contains no counted repetition
-
- MatchNL = ClassNL | DotNL
-
- Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
- POSIX Flags = 0 // POSIX syntax
-)
-
-// Pseudo-ops for parsing stack.
-const (
- opLeftParen = opPseudo + iota
- opVerticalBar
-)
-
-type parser struct {
- flags Flags // parse mode flags
- stack []*Regexp // stack of parsed expressions
- free *Regexp
- numCap int // number of capturing groups seen
- wholeRegexp string
- tmpClass []rune // temporary char class work space
-}
-
-func (p *parser) newRegexp(op Op) *Regexp {
- re := p.free
- if re != nil {
- p.free = re.Sub0[0]
- *re = Regexp{}
- } else {
- re = new(Regexp)
- }
- re.Op = op
- return re
-}
-
-func (p *parser) reuse(re *Regexp) {
- re.Sub0[0] = p.free
- p.free = re
-}
-
-// Parse stack manipulation.
-
-// push pushes the regexp re onto the parse stack and returns the regexp.
-func (p *parser) push(re *Regexp) *Regexp {
- if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
- // Single rune.
- if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
- return nil
- }
- re.Op = OpLiteral
- re.Rune = re.Rune[:1]
- re.Flags = p.flags &^ FoldCase
- } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
- re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
- unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
- unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
- re.Op == OpCharClass && len(re.Rune) == 2 &&
- re.Rune[0]+1 == re.Rune[1] &&
- unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
- unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
- // Case-insensitive rune like [Aa] or [Δδ].
- if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
- return nil
- }
-
- // Rewrite as (case-insensitive) literal.
- re.Op = OpLiteral
- re.Rune = re.Rune[:1]
- re.Flags = p.flags | FoldCase
- } else {
- // Incremental concatenation.
- p.maybeConcat(-1, 0)
- }
-
- p.stack = append(p.stack, re)
- return re
-}
-
-// maybeConcat implements incremental concatenation
-// of literal runes into string nodes. The parser calls this
-// before each push, so only the top fragment of the stack
-// might need processing. Since this is called before a push,
-// the topmost literal is no longer subject to operators like *
-// (Otherwise ab* would turn into (ab)*.)
-// If r >= 0 and there's a node left over, maybeConcat uses it
-// to push r with the given flags.
-// maybeConcat reports whether r was pushed.
-func (p *parser) maybeConcat(r rune, flags Flags) bool {
- n := len(p.stack)
- if n < 2 {
- return false
- }
-
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
- return false
- }
-
- // Push re1 into re2.
- re2.Rune = append(re2.Rune, re1.Rune...)
-
- // Reuse re1 if possible.
- if r >= 0 {
- re1.Rune = re1.Rune0[:1]
- re1.Rune[0] = r
- re1.Flags = flags
- return true
- }
-
- p.stack = p.stack[:n-1]
- p.reuse(re1)
- return false // did not push r
-}
-
-// newLiteral returns a new OpLiteral Regexp with the given flags
-func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
- re := p.newRegexp(OpLiteral)
- re.Flags = flags
- if flags&FoldCase != 0 {
- r = minFoldRune(r)
- }
- re.Rune0[0] = r
- re.Rune = re.Rune0[:1]
- return re
-}
-
-// minFoldRune returns the minimum rune fold-equivalent to r.
-func minFoldRune(r rune) rune {
- if r < minFold || r > maxFold {
- return r
- }
- min := r
- r0 := r
- for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
- if min > r {
- min = r
- }
- }
- return min
-}
-
-// literal pushes a literal regexp for the rune r on the stack
-// and returns that regexp.
-func (p *parser) literal(r rune) {
- p.push(p.newLiteral(r, p.flags))
-}
-
-// op pushes a regexp with the given op onto the stack
-// and returns that regexp.
-func (p *parser) op(op Op) *Regexp {
- re := p.newRegexp(op)
- re.Flags = p.flags
- return p.push(re)
-}
-
-// repeat replaces the top stack element with itself repeated according to op, min, max.
-// before is the regexp suffix starting at the repetition operator.
-// after is the regexp suffix following after the repetition operator.
-// repeat returns an updated 'after' and an error, if any.
-func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
- flags := p.flags
- if p.flags&PerlX != 0 {
- if len(after) > 0 && after[0] == '?' {
- after = after[1:]
- flags ^= NonGreedy
- }
- if lastRepeat != "" {
- // In Perl it is not allowed to stack repetition operators:
- // a** is a syntax error, not a doubled star, and a++ means
- // something else entirely, which we don't support!
- return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
- }
- }
- n := len(p.stack)
- if n == 0 {
- return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
- }
- sub := p.stack[n-1]
- if sub.Op >= opPseudo {
- return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
- }
- re := p.newRegexp(op)
- re.Min = min
- re.Max = max
- re.Flags = flags
- re.Sub = re.Sub0[:1]
- re.Sub[0] = sub
- p.stack[n-1] = re
- return after, nil
-}
-
-// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
-func (p *parser) concat() *Regexp {
- p.maybeConcat(-1, 0)
-
- // Scan down to find pseudo-operator | or (.
- i := len(p.stack)
- for i > 0 && p.stack[i-1].Op < opPseudo {
- i--
- }
- subs := p.stack[i:]
- p.stack = p.stack[:i]
-
- // Empty concatenation is special case.
- if len(subs) == 0 {
- return p.push(p.newRegexp(OpEmptyMatch))
- }
-
- return p.push(p.collapse(subs, OpConcat))
-}
-
-// alternate replaces the top of the stack (above the topmost '(') with its alternation.
-func (p *parser) alternate() *Regexp {
- // Scan down to find pseudo-operator (.
- // There are no | above (.
- i := len(p.stack)
- for i > 0 && p.stack[i-1].Op < opPseudo {
- i--
- }
- subs := p.stack[i:]
- p.stack = p.stack[:i]
-
- // Make sure top class is clean.
- // All the others already are (see swapVerticalBar).
- if len(subs) > 0 {
- cleanAlt(subs[len(subs)-1])
- }
-
- // Empty alternate is special case
- // (shouldn't happen but easy to handle).
- if len(subs) == 0 {
- return p.push(p.newRegexp(OpNoMatch))
- }
-
- return p.push(p.collapse(subs, OpAlternate))
-}
-
-// cleanAlt cleans re for eventual inclusion in an alternation.
-func cleanAlt(re *Regexp) {
- switch re.Op {
- case OpCharClass:
- re.Rune = cleanClass(&re.Rune)
- if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
- re.Rune = nil
- re.Op = OpAnyChar
- return
- }
- if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
- re.Rune = nil
- re.Op = OpAnyCharNotNL
- return
- }
- if cap(re.Rune)-len(re.Rune) > 100 {
- // re.Rune will not grow any more.
- // Make a copy or inline to reclaim storage.
- re.Rune = append(re.Rune0[:0], re.Rune...)
- }
- }
-}
-
-// collapse returns the result of applying op to sub.
-// If sub contains op nodes, they all get hoisted up
-// so that there is never a concat of a concat or an
-// alternate of an alternate.
-func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
- if len(subs) == 1 {
- return subs[0]
- }
- re := p.newRegexp(op)
- re.Sub = re.Sub0[:0]
- for _, sub := range subs {
- if sub.Op == op {
- re.Sub = append(re.Sub, sub.Sub...)
- p.reuse(sub)
- } else {
- re.Sub = append(re.Sub, sub)
- }
- }
- if op == OpAlternate {
- re.Sub = p.factor(re.Sub, re.Flags)
- if len(re.Sub) == 1 {
- old := re
- re = re.Sub[0]
- p.reuse(old)
- }
- }
- return re
-}
-
-// factor factors common prefixes from the alternation list sub.
-// It returns a replacement list that reuses the same storage and
-// frees (passes to p.reuse) any removed *Regexps.
-//
-// For example,
-// ABC|ABD|AEF|BCX|BCY
-// simplifies by literal prefix extraction to
-// A(B(C|D)|EF)|BC(X|Y)
-// which simplifies by character class introduction to
-// A(B[CD]|EF)|BC[XY]
-//
-func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
- if len(sub) < 2 {
- return sub
- }
-
- // Round 1: Factor out common literal prefixes.
- var str []rune
- var strflags Flags
- start := 0
- out := sub[:0]
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that all begin
- // with str as modified by strflags.
- var istr []rune
- var iflags Flags
- if i < len(sub) {
- istr, iflags = p.leadingString(sub[i])
- if iflags == strflags {
- same := 0
- for same < len(str) && same < len(istr) && str[same] == istr[same] {
- same++
- }
- if same > 0 {
- // Matches at least one rune in current range.
- // Keep going around.
- str = str[:same]
- continue
- }
- }
- }
-
- // Found end of a run with common leading literal string:
- // sub[start:i] all begin with str[0:len(str)], but sub[i]
- // does not even begin with str[0].
- //
- // Factor out common string and append factored expression to out.
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- // Just one: don't bother factoring.
- out = append(out, sub[start])
- } else {
- // Construct factored form: prefix(suffix1|suffix2|...)
- prefix := p.newRegexp(OpLiteral)
- prefix.Flags = strflags
- prefix.Rune = append(prefix.Rune[:0], str...)
-
- for j := start; j < i; j++ {
- sub[j] = p.removeLeadingString(sub[j], len(str))
- }
- suffix := p.collapse(sub[start:i], OpAlternate) // recurse
-
- re := p.newRegexp(OpConcat)
- re.Sub = append(re.Sub[:0], prefix, suffix)
- out = append(out, re)
- }
-
- // Prepare for next iteration.
- start = i
- str = istr
- strflags = iflags
- }
- sub = out
-
- // Round 2: Factor out common complex prefixes,
- // just the first piece of each concatenation,
- // whatever it is. This is good enough a lot of the time.
- start = 0
- out = sub[:0]
- var first *Regexp
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
- var ifirst *Regexp
- if i < len(sub) {
- ifirst = p.leadingRegexp(sub[i])
- if first != nil && first.Equal(ifirst) {
- continue
- }
- }
-
- // Found end of a run with common leading regexp:
- // sub[start:i] all begin with first but sub[i] does not.
- //
- // Factor out common regexp and append factored expression to out.
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- // Just one: don't bother factoring.
- out = append(out, sub[start])
- } else {
- // Construct factored form: prefix(suffix1|suffix2|...)
- prefix := first
- for j := start; j < i; j++ {
- reuse := j != start // prefix came from sub[start]
- sub[j] = p.removeLeadingRegexp(sub[j], reuse)
- }
- suffix := p.collapse(sub[start:i], OpAlternate) // recurse
-
- re := p.newRegexp(OpConcat)
- re.Sub = append(re.Sub[:0], prefix, suffix)
- out = append(out, re)
- }
-
- // Prepare for next iteration.
- start = i
- first = ifirst
- }
- sub = out
-
- // Round 3: Collapse runs of single literals into character classes.
- start = 0
- out = sub[:0]
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that are either
- // literal runes or character classes.
- if i < len(sub) && isCharClass(sub[i]) {
- continue
- }
-
- // sub[i] is not a char or char class;
- // emit char class for sub[start:i]...
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- out = append(out, sub[start])
- } else {
- // Make new char class.
- // Start with most complex regexp in sub[start].
- max := start
- for j := start + 1; j < i; j++ {
- if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
- max = j
- }
- }
- sub[start], sub[max] = sub[max], sub[start]
-
- for j := start + 1; j < i; j++ {
- mergeCharClass(sub[start], sub[j])
- p.reuse(sub[j])
- }
- cleanAlt(sub[start])
- out = append(out, sub[start])
- }
-
- // ... and then emit sub[i].
- if i < len(sub) {
- out = append(out, sub[i])
- }
- start = i + 1
- }
- sub = out
-
- // Round 4: Collapse runs of empty matches into a single empty match.
- start = 0
- out = sub[:0]
- for i := range sub {
- if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
- continue
- }
- out = append(out, sub[i])
- }
- sub = out
-
- return sub
-}
-
-// leadingString returns the leading literal string that re begins with.
-// The string refers to storage in re or its children.
-func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- re = re.Sub[0]
- }
- if re.Op != OpLiteral {
- return nil, 0
- }
- return re.Rune, re.Flags & FoldCase
-}
-
-// removeLeadingString removes the first n leading runes
-// from the beginning of re. It returns the replacement for re.
-func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- // Removing a leading string in a concatenation
- // might simplify the concatenation.
- sub := re.Sub[0]
- sub = p.removeLeadingString(sub, n)
- re.Sub[0] = sub
- if sub.Op == OpEmptyMatch {
- p.reuse(sub)
- switch len(re.Sub) {
- case 0, 1:
- // Impossible but handle.
- re.Op = OpEmptyMatch
- re.Sub = nil
- case 2:
- old := re
- re = re.Sub[1]
- p.reuse(old)
- default:
- copy(re.Sub, re.Sub[1:])
- re.Sub = re.Sub[:len(re.Sub)-1]
- }
- }
- return re
- }
-
- if re.Op == OpLiteral {
- re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
- if len(re.Rune) == 0 {
- re.Op = OpEmptyMatch
- }
- }
- return re
-}
-
-// leadingRegexp returns the leading regexp that re begins with.
-// The regexp refers to storage in re or its children.
-func (p *parser) leadingRegexp(re *Regexp) *Regexp {
- if re.Op == OpEmptyMatch {
- return nil
- }
- if re.Op == OpConcat && len(re.Sub) > 0 {
- sub := re.Sub[0]
- if sub.Op == OpEmptyMatch {
- return nil
- }
- return sub
- }
- return re
-}
-
-// removeLeadingRegexp removes the leading regexp in re.
-// It returns the replacement for re.
-// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
-func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- if reuse {
- p.reuse(re.Sub[0])
- }
- re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
- switch len(re.Sub) {
- case 0:
- re.Op = OpEmptyMatch
- re.Sub = nil
- case 1:
- old := re
- re = re.Sub[0]
- p.reuse(old)
- }
- return re
- }
- if reuse {
- p.reuse(re)
- }
- return p.newRegexp(OpEmptyMatch)
-}
-
-func literalRegexp(s string, flags Flags) *Regexp {
- re := &Regexp{Op: OpLiteral}
- re.Flags = flags
- re.Rune = re.Rune0[:0] // use local storage for small strings
- for _, c := range s {
- if len(re.Rune) >= cap(re.Rune) {
- // string is too long to fit in Rune0. let Go handle it
- re.Rune = []rune(s)
- break
- }
- re.Rune = append(re.Rune, c)
- }
- return re
-}
-
-// Parsing.
-
-// Parse parses a regular expression string s, controlled by the specified
-// Flags, and returns a regular expression parse tree. The syntax is
-// described in the top-level comment.
-func Parse(s string, flags Flags) (*Regexp, error) {
- if flags&Literal != 0 {
- // Trivial parser for literal string.
- if err := checkUTF8(s); err != nil {
- return nil, err
- }
- return literalRegexp(s, flags), nil
- }
-
- // Otherwise, must do real work.
- var (
- p parser
- err error
- c rune
- op Op
- lastRepeat string
- )
- p.flags = flags
- p.wholeRegexp = s
- t := s
- for t != "" {
- repeat := ""
- BigSwitch:
- switch t[0] {
- default:
- if c, t, err = nextRune(t); err != nil {
- return nil, err
- }
- p.literal(c)
-
- case '(':
- if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
- // Flag changes and non-capturing groups.
- if t, err = p.parsePerlFlags(t); err != nil {
- return nil, err
- }
- break
- }
- p.numCap++
- p.op(opLeftParen).Cap = p.numCap
- t = t[1:]
- case '|':
- if err = p.parseVerticalBar(); err != nil {
- return nil, err
- }
- t = t[1:]
- case ')':
- if err = p.parseRightParen(); err != nil {
- return nil, err
- }
- t = t[1:]
- case '^':
- if p.flags&OneLine != 0 {
- p.op(OpBeginText)
- } else {
- p.op(OpBeginLine)
- }
- t = t[1:]
- case '$':
- if p.flags&OneLine != 0 {
- p.op(OpEndText).Flags |= WasDollar
- } else {
- p.op(OpEndLine)
- }
- t = t[1:]
- case '.':
- if p.flags&DotNL != 0 {
- p.op(OpAnyChar)
- } else {
- p.op(OpAnyCharNotNL)
- }
- t = t[1:]
- case '[':
- if t, err = p.parseClass(t); err != nil {
- return nil, err
- }
- case '*', '+', '?':
- before := t
- switch t[0] {
- case '*':
- op = OpStar
- case '+':
- op = OpPlus
- case '?':
- op = OpQuest
- }
- after := t[1:]
- if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
- return nil, err
- }
- repeat = before
- t = after
- case '{':
- op = OpRepeat
- before := t
- min, max, after, ok := p.parseRepeat(t)
- if !ok {
- // If the repeat cannot be parsed, { is a literal.
- p.literal('{')
- t = t[1:]
- break
- }
- if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
- // Numbers were too big, or max is present and min > max.
- return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
- }
- if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
- return nil, err
- }
- repeat = before
- t = after
- case '\\':
- if p.flags&PerlX != 0 && len(t) >= 2 {
- switch t[1] {
- case 'A':
- p.op(OpBeginText)
- t = t[2:]
- break BigSwitch
- case 'b':
- p.op(OpWordBoundary)
- t = t[2:]
- break BigSwitch
- case 'B':
- p.op(OpNoWordBoundary)
- t = t[2:]
- break BigSwitch
- case 'C':
- // any byte; not supported
- return nil, &Error{ErrInvalidEscape, t[:2]}
- case 'Q':
- // \Q ... \E: the ... is always literals
- var lit string
- if i := strings.Index(t, `\E`); i < 0 {
- lit = t[2:]
- t = ""
- } else {
- lit = t[2:i]
- t = t[i+2:]
- }
- p.push(literalRegexp(lit, p.flags))
- break BigSwitch
- case 'z':
- p.op(OpEndText)
- t = t[2:]
- break BigSwitch
- }
- }
-
- re := p.newRegexp(OpCharClass)
- re.Flags = p.flags
-
- // Look for Unicode character group like \p{Han}
- if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
- r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
- if err != nil {
- return nil, err
- }
- if r != nil {
- re.Rune = r
- t = rest
- p.push(re)
- break BigSwitch
- }
- }
-
- // Perl character class escape.
- if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
- re.Rune = r
- t = rest
- p.push(re)
- break BigSwitch
- }
- p.reuse(re)
-
- // Ordinary single-character escape.
- if c, t, err = p.parseEscape(t); err != nil {
- return nil, err
- }
- p.literal(c)
- }
- lastRepeat = repeat
- }
-
- p.concat()
- if p.swapVerticalBar() {
- // pop vertical bar
- p.stack = p.stack[:len(p.stack)-1]
- }
- p.alternate()
-
- n := len(p.stack)
- if n != 1 {
- return nil, &Error{ErrMissingParen, s}
- }
- return p.stack[0], nil
-}
-
-// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
-// If s is not of that form, it returns ok == false.
-// If s has the right form but the values are too big, it returns min == -1, ok == true.
-func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
- if s == "" || s[0] != '{' {
- return
- }
- s = s[1:]
- var ok1 bool
- if min, s, ok1 = p.parseInt(s); !ok1 {
- return
- }
- if s == "" {
- return
- }
- if s[0] != ',' {
- max = min
- } else {
- s = s[1:]
- if s == "" {
- return
- }
- if s[0] == '}' {
- max = -1
- } else if max, s, ok1 = p.parseInt(s); !ok1 {
- return
- } else if max < 0 {
- // parseInt found too big a number
- min = -1
- }
- }
- if s == "" || s[0] != '}' {
- return
- }
- rest = s[1:]
- ok = true
- return
-}
-
-// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
-// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
-// The caller must have ensured that s begins with "(?".
-func (p *parser) parsePerlFlags(s string) (rest string, err error) {
- t := s
-
- // Check for named captures, first introduced in Python's regexp library.
- // As usual, there are three slightly different syntaxes:
- //
- // (?P<name>expr) the original, introduced by Python
- // (?<name>expr) the .NET alteration, adopted by Perl 5.10
- // (?'name'expr) another .NET alteration, adopted by Perl 5.10
- //
- // Perl 5.10 gave in and implemented the Python version too,
- // but they claim that the last two are the preferred forms.
- // PCRE and languages based on it (specifically, PHP and Ruby)
- // support all three as well. EcmaScript 4 uses only the Python form.
- //
- // In both the open source world (via Code Search) and the
- // Google source tree, (?P<expr>name) is the dominant form,
- // so that's the one we implement. One is enough.
- if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
- // Pull out name.
- end := strings.IndexRune(t, '>')
- if end < 0 {
- if err = checkUTF8(t); err != nil {
- return "", err
- }
- return "", &Error{ErrInvalidNamedCapture, s}
- }
-
- capture := t[:end+1] // "(?P<name>"
- name := t[4:end] // "name"
- if err = checkUTF8(name); err != nil {
- return "", err
- }
- if !isValidCaptureName(name) {
- return "", &Error{ErrInvalidNamedCapture, capture}
- }
-
- // Like ordinary capture, but named.
- p.numCap++
- re := p.op(opLeftParen)
- re.Cap = p.numCap
- re.Name = name
- return t[end+1:], nil
- }
-
- // Non-capturing group. Might also twiddle Perl flags.
- var c rune
- t = t[2:] // skip (?
- flags := p.flags
- sign := +1
- sawFlag := false
-Loop:
- for t != "" {
- if c, t, err = nextRune(t); err != nil {
- return "", err
- }
- switch c {
- default:
- break Loop
-
- // Flags.
- case 'i':
- flags |= FoldCase
- sawFlag = true
- case 'm':
- flags &^= OneLine
- sawFlag = true
- case 's':
- flags |= DotNL
- sawFlag = true
- case 'U':
- flags |= NonGreedy
- sawFlag = true
-
- // Switch to negation.
- case '-':
- if sign < 0 {
- break Loop
- }
- sign = -1
- // Invert flags so that | above turn into &^ and vice versa.
- // We'll invert flags again before using it below.
- flags = ^flags
- sawFlag = false
-
- // End of flags, starting group or not.
- case ':', ')':
- if sign < 0 {
- if !sawFlag {
- break Loop
- }
- flags = ^flags
- }
- if c == ':' {
- // Open new group
- p.op(opLeftParen)
- }
- p.flags = flags
- return t, nil
- }
- }
-
- return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
-}
-
-// isValidCaptureName reports whether name
-// is a valid capture name: [A-Za-z0-9_]+.
-// PCRE limits names to 32 bytes.
-// Python rejects names starting with digits.
-// We don't enforce either of those.
-func isValidCaptureName(name string) bool {
- if name == "" {
- return false
- }
- for _, c := range name {
- if c != '_' && !isalnum(c) {
- return false
- }
- }
- return true
-}
-
-// parseInt parses a decimal integer.
-func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
- if s == "" || s[0] < '0' || '9' < s[0] {
- return
- }
- // Disallow leading zeros.
- if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
- return
- }
- t := s
- for s != "" && '0' <= s[0] && s[0] <= '9' {
- s = s[1:]
- }
- rest = s
- ok = true
- // Have digits, compute value.
- t = t[:len(t)-len(s)]
- for i := 0; i < len(t); i++ {
- // Avoid overflow.
- if n >= 1e8 {
- n = -1
- break
- }
- n = n*10 + int(t[i]) - '0'
- }
- return
-}
-
-// can this be represented as a character class?
-// single-rune literal string, char class, ., and .|\n.
-func isCharClass(re *Regexp) bool {
- return re.Op == OpLiteral && len(re.Rune) == 1 ||
- re.Op == OpCharClass ||
- re.Op == OpAnyCharNotNL ||
- re.Op == OpAnyChar
-}
-
-// does re match r?
-func matchRune(re *Regexp, r rune) bool {
- switch re.Op {
- case OpLiteral:
- return len(re.Rune) == 1 && re.Rune[0] == r
- case OpCharClass:
- for i := 0; i < len(re.Rune); i += 2 {
- if re.Rune[i] <= r && r <= re.Rune[i+1] {
- return true
- }
- }
- return false
- case OpAnyCharNotNL:
- return r != '\n'
- case OpAnyChar:
- return true
- }
- return false
-}
-
-// parseVerticalBar handles a | in the input.
-func (p *parser) parseVerticalBar() error {
- p.concat()
-
- // The concatenation we just parsed is on top of the stack.
- // If it sits above an opVerticalBar, swap it below
- // (things below an opVerticalBar become an alternation).
- // Otherwise, push a new vertical bar.
- if !p.swapVerticalBar() {
- p.op(opVerticalBar)
- }
-
- return nil
-}
-
-// mergeCharClass makes dst = dst|src.
-// The caller must ensure that dst.Op >= src.Op,
-// to reduce the amount of copying.
-func mergeCharClass(dst, src *Regexp) {
- switch dst.Op {
- case OpAnyChar:
- // src doesn't add anything.
- case OpAnyCharNotNL:
- // src might add \n
- if matchRune(src, '\n') {
- dst.Op = OpAnyChar
- }
- case OpCharClass:
- // src is simpler, so either literal or char class
- if src.Op == OpLiteral {
- dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
- } else {
- dst.Rune = appendClass(dst.Rune, src.Rune)
- }
- case OpLiteral:
- // both literal
- if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
- break
- }
- dst.Op = OpCharClass
- dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
- dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
- }
-}
-
-// If the top of the stack is an element followed by an opVerticalBar
-// swapVerticalBar swaps the two and returns true.
-// Otherwise it returns false.
-func (p *parser) swapVerticalBar() bool {
- // If above and below vertical bar are literal or char class,
- // can merge into a single char class.
- n := len(p.stack)
- if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
- re1 := p.stack[n-1]
- re3 := p.stack[n-3]
- // Make re3 the more complex of the two.
- if re1.Op > re3.Op {
- re1, re3 = re3, re1
- p.stack[n-3] = re3
- }
- mergeCharClass(re3, re1)
- p.reuse(re1)
- p.stack = p.stack[:n-1]
- return true
- }
-
- if n >= 2 {
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- if re2.Op == opVerticalBar {
- if n >= 3 {
- // Now out of reach.
- // Clean opportunistically.
- cleanAlt(p.stack[n-3])
- }
- p.stack[n-2] = re1
- p.stack[n-1] = re2
- return true
- }
- }
- return false
-}
-
-// parseRightParen handles a ) in the input.
-func (p *parser) parseRightParen() error {
- p.concat()
- if p.swapVerticalBar() {
- // pop vertical bar
- p.stack = p.stack[:len(p.stack)-1]
- }
- p.alternate()
-
- n := len(p.stack)
- if n < 2 {
- return &Error{ErrUnexpectedParen, p.wholeRegexp}
- }
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- p.stack = p.stack[:n-2]
- if re2.Op != opLeftParen {
- return &Error{ErrUnexpectedParen, p.wholeRegexp}
- }
- // Restore flags at time of paren.
- p.flags = re2.Flags
- if re2.Cap == 0 {
- // Just for grouping.
- p.push(re1)
- } else {
- re2.Op = OpCapture
- re2.Sub = re2.Sub0[:1]
- re2.Sub[0] = re1
- p.push(re2)
- }
- return nil
-}
-
-// parseEscape parses an escape sequence at the beginning of s
-// and returns the rune.
-func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
- t := s[1:]
- if t == "" {
- return 0, "", &Error{ErrTrailingBackslash, ""}
- }
- c, t, err := nextRune(t)
- if err != nil {
- return 0, "", err
- }
-
-Switch:
- switch c {
- default:
- if c < utf8.RuneSelf && !isalnum(c) {
- // Escaped non-word characters are always themselves.
- // PCRE is not quite so rigorous: it accepts things like
- // \q, but we don't. We once rejected \_, but too many
- // programs and people insist on using it, so allow \_.
- return c, t, nil
- }
-
- // Octal escapes.
- case '1', '2', '3', '4', '5', '6', '7':
- // Single non-zero digit is a backreference; not supported
- if t == "" || t[0] < '0' || t[0] > '7' {
- break
- }
- fallthrough
- case '0':
- // Consume up to three octal digits; already have one.
- r = c - '0'
- for i := 1; i < 3; i++ {
- if t == "" || t[0] < '0' || t[0] > '7' {
- break
- }
- r = r*8 + rune(t[0]) - '0'
- t = t[1:]
- }
- return r, t, nil
-
- // Hexadecimal escapes.
- case 'x':
- if t == "" {
- break
- }
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- if c == '{' {
- // Any number of digits in braces.
- // Perl accepts any text at all; it ignores all text
- // after the first non-hex digit. We require only hex digits,
- // and at least one.
- nhex := 0
- r = 0
- for {
- if t == "" {
- break Switch
- }
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- if c == '}' {
- break
- }
- v := unhex(c)
- if v < 0 {
- break Switch
- }
- r = r*16 + v
- if r > unicode.MaxRune {
- break Switch
- }
- nhex++
- }
- if nhex == 0 {
- break Switch
- }
- return r, t, nil
- }
-
- // Easy case: two hex digits.
- x := unhex(c)
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- y := unhex(c)
- if x < 0 || y < 0 {
- break
- }
- return x*16 + y, t, nil
-
- // C escapes. There is no case 'b', to avoid misparsing
- // the Perl word-boundary \b as the C backspace \b
- // when in POSIX mode. In Perl, /\b/ means word-boundary
- // but /[\b]/ means backspace. We don't support that.
- // If you want a backspace, embed a literal backspace
- // character or use \x08.
- case 'a':
- return '\a', t, err
- case 'f':
- return '\f', t, err
- case 'n':
- return '\n', t, err
- case 'r':
- return '\r', t, err
- case 't':
- return '\t', t, err
- case 'v':
- return '\v', t, err
- }
- return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
-}
-
-// parseClassChar parses a character class character at the beginning of s
-// and returns it.
-func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
- if s == "" {
- return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
- }
-
- // Allow regular escape sequences even though
- // many need not be escaped in this context.
- if s[0] == '\\' {
- return p.parseEscape(s)
- }
-
- return nextRune(s)
-}
-
-type charGroup struct {
- sign int
- class []rune
-}
-
-// parsePerlClassEscape parses a leading Perl character class escape like \d
-// from the beginning of s. If one is present, it appends the characters to r
-// and returns the new slice r and the remainder of the string.
-func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
- if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
- return
- }
- g := perlGroup[s[0:2]]
- if g.sign == 0 {
- return
- }
- return p.appendGroup(r, g), s[2:]
-}
-
-// parseNamedClass parses a leading POSIX named character class like [:alnum:]
-// from the beginning of s. If one is present, it appends the characters to r
-// and returns the new slice r and the remainder of the string.
-func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
- if len(s) < 2 || s[0] != '[' || s[1] != ':' {
- return
- }
-
- i := strings.Index(s[2:], ":]")
- if i < 0 {
- return
- }
- i += 2
- name, s := s[0:i+2], s[i+2:]
- g := posixGroup[name]
- if g.sign == 0 {
- return nil, "", &Error{ErrInvalidCharRange, name}
- }
- return p.appendGroup(r, g), s, nil
-}
-
-func (p *parser) appendGroup(r []rune, g charGroup) []rune {
- if p.flags&FoldCase == 0 {
- if g.sign < 0 {
- r = appendNegatedClass(r, g.class)
- } else {
- r = appendClass(r, g.class)
- }
- } else {
- tmp := p.tmpClass[:0]
- tmp = appendFoldedClass(tmp, g.class)
- p.tmpClass = tmp
- tmp = cleanClass(&p.tmpClass)
- if g.sign < 0 {
- r = appendNegatedClass(r, tmp)
- } else {
- r = appendClass(r, tmp)
- }
- }
- return r
-}
-
-var anyTable = &unicode.RangeTable{
- R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
- R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
-}
-
-// unicodeTable returns the unicode.RangeTable identified by name
-// and the table of additional fold-equivalent code points.
-func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
- // Special case: "Any" means any.
- if name == "Any" {
- return anyTable, anyTable
- }
- if t := unicode.Categories[name]; t != nil {
- return t, unicode.FoldCategory[name]
- }
- if t := unicode.Scripts[name]; t != nil {
- return t, unicode.FoldScript[name]
- }
- return nil, nil
-}
-
-// parseUnicodeClass parses a leading Unicode character class like \p{Han}
-// from the beginning of s. If one is present, it appends the characters to r
-// and returns the new slice r and the remainder of the string.
-func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
- if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
- return
- }
-
- // Committed to parse or return error.
- sign := +1
- if s[1] == 'P' {
- sign = -1
- }
- t := s[2:]
- c, t, err := nextRune(t)
- if err != nil {
- return
- }
- var seq, name string
- if c != '{' {
- // Single-letter name.
- seq = s[:len(s)-len(t)]
- name = seq[2:]
- } else {
- // Name is in braces.
- end := strings.IndexRune(s, '}')
- if end < 0 {
- if err = checkUTF8(s); err != nil {
- return
- }
- return nil, "", &Error{ErrInvalidCharRange, s}
- }
- seq, t = s[:end+1], s[end+1:]
- name = s[3:end]
- if err = checkUTF8(name); err != nil {
- return
- }
- }
-
- // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
- if name != "" && name[0] == '^' {
- sign = -sign
- name = name[1:]
- }
-
- tab, fold := unicodeTable(name)
- if tab == nil {
- return nil, "", &Error{ErrInvalidCharRange, seq}
- }
-
- if p.flags&FoldCase == 0 || fold == nil {
- if sign > 0 {
- r = appendTable(r, tab)
- } else {
- r = appendNegatedTable(r, tab)
- }
- } else {
- // Merge and clean tab and fold in a temporary buffer.
- // This is necessary for the negative case and just tidy
- // for the positive case.
- tmp := p.tmpClass[:0]
- tmp = appendTable(tmp, tab)
- tmp = appendTable(tmp, fold)
- p.tmpClass = tmp
- tmp = cleanClass(&p.tmpClass)
- if sign > 0 {
- r = appendClass(r, tmp)
- } else {
- r = appendNegatedClass(r, tmp)
- }
- }
- return r, t, nil
-}
-
-// parseClass parses a character class at the beginning of s
-// and pushes it onto the parse stack.
-func (p *parser) parseClass(s string) (rest string, err error) {
- t := s[1:] // chop [
- re := p.newRegexp(OpCharClass)
- re.Flags = p.flags
- re.Rune = re.Rune0[:0]
-
- sign := +1
- if t != "" && t[0] == '^' {
- sign = -1
- t = t[1:]
-
- // If character class does not match \n, add it here,
- // so that negation later will do the right thing.
- if p.flags&ClassNL == 0 {
- re.Rune = append(re.Rune, '\n', '\n')
- }
- }
-
- class := re.Rune
- first := true // ] and - are okay as first char in class
- for t == "" || t[0] != ']' || first {
- // POSIX: - is only okay unescaped as first or last in class.
- // Perl: - is okay anywhere.
- if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
- _, size := utf8.DecodeRuneInString(t[1:])
- return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
- }
- first = false
-
- // Look for POSIX [:alnum:] etc.
- if len(t) > 2 && t[0] == '[' && t[1] == ':' {
- nclass, nt, err := p.parseNamedClass(t, class)
- if err != nil {
- return "", err
- }
- if nclass != nil {
- class, t = nclass, nt
- continue
- }
- }
-
- // Look for Unicode character group like \p{Han}.
- nclass, nt, err := p.parseUnicodeClass(t, class)
- if err != nil {
- return "", err
- }
- if nclass != nil {
- class, t = nclass, nt
- continue
- }
-
- // Look for Perl character class symbols (extension).
- if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
- class, t = nclass, nt
- continue
- }
-
- // Single character or simple range.
- rng := t
- var lo, hi rune
- if lo, t, err = p.parseClassChar(t, s); err != nil {
- return "", err
- }
- hi = lo
- // [a-] means (a|-) so check for final ].
- if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
- t = t[1:]
- if hi, t, err = p.parseClassChar(t, s); err != nil {
- return "", err
- }
- if hi < lo {
- rng = rng[:len(rng)-len(t)]
- return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
- }
- }
- if p.flags&FoldCase == 0 {
- class = appendRange(class, lo, hi)
- } else {
- class = appendFoldedRange(class, lo, hi)
- }
- }
- t = t[1:] // chop ]
-
- // Use &re.Rune instead of &class to avoid allocation.
- re.Rune = class
- class = cleanClass(&re.Rune)
- if sign < 0 {
- class = negateClass(class)
- }
- re.Rune = class
- p.push(re)
- return t, nil
-}
-
-// cleanClass sorts the ranges (pairs of elements of r),
-// merges them, and eliminates duplicates.
-func cleanClass(rp *[]rune) []rune {
-
- // Sort by lo increasing, hi decreasing to break ties.
- sort.Sort(ranges{rp})
-
- r := *rp
- if len(r) < 2 {
- return r
- }
-
- // Merge abutting, overlapping.
- w := 2 // write index
- for i := 2; i < len(r); i += 2 {
- lo, hi := r[i], r[i+1]
- if lo <= r[w-1]+1 {
- // merge with previous range
- if hi > r[w-1] {
- r[w-1] = hi
- }
- continue
- }
- // new disjoint range
- r[w] = lo
- r[w+1] = hi
- w += 2
- }
-
- return r[:w]
-}
-
-// appendLiteral returns the result of appending the literal x to the class r.
-func appendLiteral(r []rune, x rune, flags Flags) []rune {
- if flags&FoldCase != 0 {
- return appendFoldedRange(r, x, x)
- }
- return appendRange(r, x, x)
-}
-
-// appendRange returns the result of appending the range lo-hi to the class r.
-func appendRange(r []rune, lo, hi rune) []rune {
- // Expand last range or next to last range if it overlaps or abuts.
- // Checking two ranges helps when appending case-folded
- // alphabets, so that one range can be expanding A-Z and the
- // other expanding a-z.
- n := len(r)
- for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
- if n >= i {
- rlo, rhi := r[n-i], r[n-i+1]
- if lo <= rhi+1 && rlo <= hi+1 {
- if lo < rlo {
- r[n-i] = lo
- }
- if hi > rhi {
- r[n-i+1] = hi
- }
- return r
- }
- }
- }
-
- return append(r, lo, hi)
-}
-
-const (
- // minimum and maximum runes involved in folding.
- // checked during test.
- minFold = 0x0041
- maxFold = 0x1044f
-)
-
-// appendFoldedRange returns the result of appending the range lo-hi
-// and its case folding-equivalent runes to the class r.
-func appendFoldedRange(r []rune, lo, hi rune) []rune {
- // Optimizations.
- if lo <= minFold && hi >= maxFold {
- // Range is full: folding can't add more.
- return appendRange(r, lo, hi)
- }
- if hi < minFold || lo > maxFold {
- // Range is outside folding possibilities.
- return appendRange(r, lo, hi)
- }
- if lo < minFold {
- // [lo, minFold-1] needs no folding.
- r = appendRange(r, lo, minFold-1)
- lo = minFold
- }
- if hi > maxFold {
- // [maxFold+1, hi] needs no folding.
- r = appendRange(r, maxFold+1, hi)
- hi = maxFold
- }
-
- // Brute force. Depend on appendRange to coalesce ranges on the fly.
- for c := lo; c <= hi; c++ {
- r = appendRange(r, c, c)
- f := unicode.SimpleFold(c)
- for f != c {
- r = appendRange(r, f, f)
- f = unicode.SimpleFold(f)
- }
- }
- return r
-}
-
-// appendClass returns the result of appending the class x to the class r.
-// It assume x is clean.
-func appendClass(r []rune, x []rune) []rune {
- for i := 0; i < len(x); i += 2 {
- r = appendRange(r, x[i], x[i+1])
- }
- return r
-}
-
-// appendFolded returns the result of appending the case folding of the class x to the class r.
-func appendFoldedClass(r []rune, x []rune) []rune {
- for i := 0; i < len(x); i += 2 {
- r = appendFoldedRange(r, x[i], x[i+1])
- }
- return r
-}
-
-// appendNegatedClass returns the result of appending the negation of the class x to the class r.
-// It assumes x is clean.
-func appendNegatedClass(r []rune, x []rune) []rune {
- nextLo := '\u0000'
- for i := 0; i < len(x); i += 2 {
- lo, hi := x[i], x[i+1]
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- }
- if nextLo <= unicode.MaxRune {
- r = appendRange(r, nextLo, unicode.MaxRune)
- }
- return r
-}
-
-// appendTable returns the result of appending x to the class r.
-func appendTable(r []rune, x *unicode.RangeTable) []rune {
- for _, xr := range x.R16 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- r = appendRange(r, lo, hi)
- continue
- }
- for c := lo; c <= hi; c += stride {
- r = appendRange(r, c, c)
- }
- }
- for _, xr := range x.R32 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- r = appendRange(r, lo, hi)
- continue
- }
- for c := lo; c <= hi; c += stride {
- r = appendRange(r, c, c)
- }
- }
- return r
-}
-
-// appendNegatedTable returns the result of appending the negation of x to the class r.
-func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
- nextLo := '\u0000' // lo end of next class to add
- for _, xr := range x.R16 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- continue
- }
- for c := lo; c <= hi; c += stride {
- if nextLo <= c-1 {
- r = appendRange(r, nextLo, c-1)
- }
- nextLo = c + 1
- }
- }
- for _, xr := range x.R32 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- continue
- }
- for c := lo; c <= hi; c += stride {
- if nextLo <= c-1 {
- r = appendRange(r, nextLo, c-1)
- }
- nextLo = c + 1
- }
- }
- if nextLo <= unicode.MaxRune {
- r = appendRange(r, nextLo, unicode.MaxRune)
- }
- return r
-}
-
-// negateClass overwrites r and returns r's negation.
-// It assumes the class r is already clean.
-func negateClass(r []rune) []rune {
- nextLo := '\u0000' // lo end of next class to add
- w := 0 // write index
- for i := 0; i < len(r); i += 2 {
- lo, hi := r[i], r[i+1]
- if nextLo <= lo-1 {
- r[w] = nextLo
- r[w+1] = lo - 1
- w += 2
- }
- nextLo = hi + 1
- }
- r = r[:w]
- if nextLo <= unicode.MaxRune {
- // It's possible for the negation to have one more
- // range - this one - than the original class, so use append.
- r = append(r, nextLo, unicode.MaxRune)
- }
- return r
-}
-
-// ranges implements sort.Interface on a []rune.
-// The choice of receiver type definition is strange
-// but avoids an allocation since we already have
-// a *[]rune.
-type ranges struct {
- p *[]rune
-}
-
-func (ra ranges) Less(i, j int) bool {
- p := *ra.p
- i *= 2
- j *= 2
- return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
-}
-
-func (ra ranges) Len() int {
- return len(*ra.p) / 2
-}
-
-func (ra ranges) Swap(i, j int) {
- p := *ra.p
- i *= 2
- j *= 2
- p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
-}
-
-func checkUTF8(s string) error {
- for s != "" {
- rune, size := utf8.DecodeRuneInString(s)
- if rune == utf8.RuneError && size == 1 {
- return &Error{Code: ErrInvalidUTF8, Expr: s}
- }
- s = s[size:]
- }
- return nil
-}
-
-func nextRune(s string) (c rune, t string, err error) {
- c, size := utf8.DecodeRuneInString(s)
- if c == utf8.RuneError && size == 1 {
- return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
- }
- return c, s[size:], nil
-}
-
-func isalnum(c rune) bool {
- return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
-}
-
-func unhex(c rune) rune {
- if '0' <= c && c <= '9' {
- return c - '0'
- }
- if 'a' <= c && c <= 'f' {
- return c - 'a' + 10
- }
- if 'A' <= c && c <= 'F' {
- return c - 'A' + 10
- }
- return -1
-}
diff --git a/src/pkg/regexp/syntax/parse_test.go b/src/pkg/regexp/syntax/parse_test.go
deleted file mode 100644
index f3089294c..000000000
--- a/src/pkg/regexp/syntax/parse_test.go
+++ /dev/null
@@ -1,559 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import (
- "bytes"
- "fmt"
- "testing"
- "unicode"
-)
-
-type parseTest struct {
- Regexp string
- Dump string
-}
-
-var parseTests = []parseTest{
- // Base cases
- {`a`, `lit{a}`},
- {`a.`, `cat{lit{a}dot{}}`},
- {`a.b`, `cat{lit{a}dot{}lit{b}}`},
- {`ab`, `str{ab}`},
- {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
- {`abc`, `str{abc}`},
- {`a|^`, `alt{lit{a}bol{}}`},
- {`a|b`, `cc{0x61-0x62}`},
- {`(a)`, `cap{lit{a}}`},
- {`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
- {`a*`, `star{lit{a}}`},
- {`a+`, `plus{lit{a}}`},
- {`a?`, `que{lit{a}}`},
- {`a{2}`, `rep{2,2 lit{a}}`},
- {`a{2,3}`, `rep{2,3 lit{a}}`},
- {`a{2,}`, `rep{2,-1 lit{a}}`},
- {`a*?`, `nstar{lit{a}}`},
- {`a+?`, `nplus{lit{a}}`},
- {`a??`, `nque{lit{a}}`},
- {`a{2}?`, `nrep{2,2 lit{a}}`},
- {`a{2,3}?`, `nrep{2,3 lit{a}}`},
- {`a{2,}?`, `nrep{2,-1 lit{a}}`},
- // Malformed { } are treated as literals.
- {`x{1001`, `str{x{1001}`},
- {`x{9876543210`, `str{x{9876543210}`},
- {`x{9876543210,`, `str{x{9876543210,}`},
- {`x{2,1`, `str{x{2,1}`},
- {`x{1,9876543210`, `str{x{1,9876543210}`},
- {``, `emp{}`},
- {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
- {`|x|`, `alt{emp{}lit{x}emp{}}`},
- {`.`, `dot{}`},
- {`^`, `bol{}`},
- {`$`, `eol{}`},
- {`\|`, `lit{|}`},
- {`\(`, `lit{(}`},
- {`\)`, `lit{)}`},
- {`\*`, `lit{*}`},
- {`\+`, `lit{+}`},
- {`\?`, `lit{?}`},
- {`{`, `lit{{}`},
- {`}`, `lit{}}`},
- {`\.`, `lit{.}`},
- {`\^`, `lit{^}`},
- {`\$`, `lit{$}`},
- {`\\`, `lit{\}`},
- {`[ace]`, `cc{0x61 0x63 0x65}`},
- {`[abc]`, `cc{0x61-0x63}`},
- {`[a-z]`, `cc{0x61-0x7a}`},
- {`[a]`, `lit{a}`},
- {`\-`, `lit{-}`},
- {`-`, `lit{-}`},
- {`\_`, `lit{_}`},
- {`abc`, `str{abc}`},
- {`abc|def`, `alt{str{abc}str{def}}`},
- {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
-
- // Posix and Perl extensions
- {`[[:lower:]]`, `cc{0x61-0x7a}`},
- {`[a-z]`, `cc{0x61-0x7a}`},
- {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
- {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
- {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
- {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
- {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
- {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
- {`\d`, `cc{0x30-0x39}`},
- {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
- {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
- {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
- {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
- {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
- {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
- {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
- {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
- // { `\C`, `byte{}` }, // probably never
-
- // Unicode, negatives, and a double negative.
- {`\p{Braille}`, `cc{0x2800-0x28ff}`},
- {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
- {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
- {`\P{^Braille}`, `cc{0x2800-0x28ff}`},
- {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
- {`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
- {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
- {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
- {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
- {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
- {`\p{Lu}`, mkCharClass(unicode.IsUpper)},
- {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
- {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
- {`\p{Any}`, `dot{}`},
- {`\p{^Any}`, `cc{}`},
-
- // Hex, octal.
- {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
- {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
-
- // More interesting regular expressions.
- {`a{,2}`, `str{a{,2}}`},
- {`\.\^\$\\`, `str{.^$\}`},
- {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
- {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
- {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
- {`a*{`, `cat{star{lit{a}}lit{{}}`},
-
- // Test precedences
- {`(?:ab)*`, `star{str{ab}}`},
- {`(ab)*`, `star{cap{str{ab}}}`},
- {`ab|cd`, `alt{str{ab}str{cd}}`},
- {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
-
- // Test flattening.
- {`(?:a)`, `lit{a}`},
- {`(?:ab)(?:cd)`, `str{abcd}`},
- {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
- {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
- {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
- {`a|.`, `dot{}`},
- {`.|a`, `dot{}`},
- {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
- {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
-
- // Test Perl quoted literals
- {`\Q+|*?{[\E`, `str{+|*?{[}`},
- {`\Q+\E+`, `plus{lit{+}}`},
- {`\Q\\E`, `lit{\}`},
- {`\Q\\\E`, `str{\\}`},
-
- // Test Perl \A and \z
- {`(?m)^`, `bol{}`},
- {`(?m)$`, `eol{}`},
- {`(?-m)^`, `bot{}`},
- {`(?-m)$`, `eot{}`},
- {`(?m)\A`, `bot{}`},
- {`(?m)\z`, `eot{\z}`},
- {`(?-m)\A`, `bot{}`},
- {`(?-m)\z`, `eot{\z}`},
-
- // Test named captures
- {`(?P<name>a)`, `cap{name:lit{a}}`},
-
- // Case-folded literals
- {`[Aa]`, `litfold{A}`},
- {`[\x{100}\x{101}]`, `litfold{Ā}`},
- {`[Δδ]`, `litfold{Δ}`},
-
- // Strings
- {`abcde`, `str{abcde}`},
- {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
-
- // Factoring.
- {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
- {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
-
- // Bug fixes.
- {`(?:.)`, `dot{}`},
- {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
- {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
- {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
- {`(?:A|a)`, `litfold{A}`},
- {`A|(?:A|a)`, `litfold{A}`},
- {`(?s).`, `dot{}`},
- {`(?-s).`, `dnl{}`},
- {`(?:(?:^).)`, `cat{bol{}dot{}}`},
- {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
-
- // RE2 prefix_tests
- {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
- {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
- {`abc|abd|aef|bcx|bcy`,
- `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
- `cat{str{bc}cc{0x78-0x79}}}`},
- {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
- {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
- {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
- {`(?:xx|yy)c|(?:xx|yy)d`,
- `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`},
- {`x{2}|x{2}[0-9]`,
- `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
- {`x{2}y|x{2}[0-9]y`,
- `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
-}
-
-const testFlags = MatchNL | PerlX | UnicodeGroups
-
-func TestParseSimple(t *testing.T) {
- testParseDump(t, parseTests, testFlags)
-}
-
-var foldcaseTests = []parseTest{
- {`AbCdE`, `strfold{ABCDE}`},
- {`[Aa]`, `litfold{A}`},
- {`a`, `litfold{A}`},
-
- // 0x17F is an old English long s (looks like an f) and folds to s.
- // 0x212A is the Kelvin symbol and folds to k.
- {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
- {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
- {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
-}
-
-func TestParseFoldCase(t *testing.T) {
- testParseDump(t, foldcaseTests, FoldCase)
-}
-
-var literalTests = []parseTest{
- {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
-}
-
-func TestParseLiteral(t *testing.T) {
- testParseDump(t, literalTests, Literal)
-}
-
-var matchnlTests = []parseTest{
- {`.`, `dot{}`},
- {"\n", "lit{\n}"},
- {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
- {`[a\n]`, `cc{0xa 0x61}`},
-}
-
-func TestParseMatchNL(t *testing.T) {
- testParseDump(t, matchnlTests, MatchNL)
-}
-
-var nomatchnlTests = []parseTest{
- {`.`, `dnl{}`},
- {"\n", "lit{\n}"},
- {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
- {`[a\n]`, `cc{0xa 0x61}`},
-}
-
-func TestParseNoMatchNL(t *testing.T) {
- testParseDump(t, nomatchnlTests, 0)
-}
-
-// Test Parse -> Dump.
-func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
- for _, tt := range tests {
- re, err := Parse(tt.Regexp, flags)
- if err != nil {
- t.Errorf("Parse(%#q): %v", tt.Regexp, err)
- continue
- }
- d := dump(re)
- if d != tt.Dump {
- t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
- }
- }
-}
-
-// dump prints a string representation of the regexp showing
-// the structure explicitly.
-func dump(re *Regexp) string {
- var b bytes.Buffer
- dumpRegexp(&b, re)
- return b.String()
-}
-
-var opNames = []string{
- OpNoMatch: "no",
- OpEmptyMatch: "emp",
- OpLiteral: "lit",
- OpCharClass: "cc",
- OpAnyCharNotNL: "dnl",
- OpAnyChar: "dot",
- OpBeginLine: "bol",
- OpEndLine: "eol",
- OpBeginText: "bot",
- OpEndText: "eot",
- OpWordBoundary: "wb",
- OpNoWordBoundary: "nwb",
- OpCapture: "cap",
- OpStar: "star",
- OpPlus: "plus",
- OpQuest: "que",
- OpRepeat: "rep",
- OpConcat: "cat",
- OpAlternate: "alt",
-}
-
-// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
-// It is used during testing to distinguish between parses that might print
-// the same using re's String method.
-func dumpRegexp(b *bytes.Buffer, re *Regexp) {
- if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
- fmt.Fprintf(b, "op%d", re.Op)
- } else {
- switch re.Op {
- default:
- b.WriteString(opNames[re.Op])
- case OpStar, OpPlus, OpQuest, OpRepeat:
- if re.Flags&NonGreedy != 0 {
- b.WriteByte('n')
- }
- b.WriteString(opNames[re.Op])
- case OpLiteral:
- if len(re.Rune) > 1 {
- b.WriteString("str")
- } else {
- b.WriteString("lit")
- }
- if re.Flags&FoldCase != 0 {
- for _, r := range re.Rune {
- if unicode.SimpleFold(r) != r {
- b.WriteString("fold")
- break
- }
- }
- }
- }
- }
- b.WriteByte('{')
- switch re.Op {
- case OpEndText:
- if re.Flags&WasDollar == 0 {
- b.WriteString(`\z`)
- }
- case OpLiteral:
- for _, r := range re.Rune {
- b.WriteRune(r)
- }
- case OpConcat, OpAlternate:
- for _, sub := range re.Sub {
- dumpRegexp(b, sub)
- }
- case OpStar, OpPlus, OpQuest:
- dumpRegexp(b, re.Sub[0])
- case OpRepeat:
- fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
- dumpRegexp(b, re.Sub[0])
- case OpCapture:
- if re.Name != "" {
- b.WriteString(re.Name)
- b.WriteByte(':')
- }
- dumpRegexp(b, re.Sub[0])
- case OpCharClass:
- sep := ""
- for i := 0; i < len(re.Rune); i += 2 {
- b.WriteString(sep)
- sep = " "
- lo, hi := re.Rune[i], re.Rune[i+1]
- if lo == hi {
- fmt.Fprintf(b, "%#x", lo)
- } else {
- fmt.Fprintf(b, "%#x-%#x", lo, hi)
- }
- }
- }
- b.WriteByte('}')
-}
-
-func mkCharClass(f func(rune) bool) string {
- re := &Regexp{Op: OpCharClass}
- lo := rune(-1)
- for i := rune(0); i <= unicode.MaxRune; i++ {
- if f(i) {
- if lo < 0 {
- lo = i
- }
- } else {
- if lo >= 0 {
- re.Rune = append(re.Rune, lo, i-1)
- lo = -1
- }
- }
- }
- if lo >= 0 {
- re.Rune = append(re.Rune, lo, unicode.MaxRune)
- }
- return dump(re)
-}
-
-func isUpperFold(r rune) bool {
- if unicode.IsUpper(r) {
- return true
- }
- c := unicode.SimpleFold(r)
- for c != r {
- if unicode.IsUpper(c) {
- return true
- }
- c = unicode.SimpleFold(c)
- }
- return false
-}
-
-func TestFoldConstants(t *testing.T) {
- last := rune(-1)
- for i := rune(0); i <= unicode.MaxRune; i++ {
- if unicode.SimpleFold(i) == i {
- continue
- }
- if last == -1 && minFold != i {
- t.Errorf("minFold=%#U should be %#U", minFold, i)
- }
- last = i
- }
- if maxFold != last {
- t.Errorf("maxFold=%#U should be %#U", maxFold, last)
- }
-}
-
-func TestAppendRangeCollapse(t *testing.T) {
- // AppendRange should collapse each of the new ranges
- // into the earlier ones (it looks back two ranges), so that
- // the slice never grows very large.
- // Note that we are not calling cleanClass.
- var r []rune
- for i := rune('A'); i <= 'Z'; i++ {
- r = appendRange(r, i, i)
- r = appendRange(r, i+'a'-'A', i+'a'-'A')
- }
- if string(r) != "AZaz" {
- t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
- }
-}
-
-var invalidRegexps = []string{
- `(`,
- `)`,
- `(a`,
- `a)`,
- `(a))`,
- `(a|b|`,
- `a|b|)`,
- `(a|b|))`,
- `(a|b`,
- `a|b)`,
- `(a|b))`,
- `[a-z`,
- `([a-z)`,
- `[a-z)`,
- `([a-z]))`,
- `x{1001}`,
- `x{9876543210}`,
- `x{2,1}`,
- `x{1,9876543210}`,
- "\xff", // Invalid UTF-8
- "[\xff]",
- "[\\\xff]",
- "\\\xff",
- `(?P<name>a`,
- `(?P<name>`,
- `(?P<name`,
- `(?P<x y>a)`,
- `(?P<>a)`,
- `[a-Z]`,
- `(?i)[a-Z]`,
- `a{100000}`,
- `a{100000,}`,
-}
-
-var onlyPerl = []string{
- `[a-b-c]`,
- `\Qabc\E`,
- `\Q*+?{[\E`,
- `\Q\\E`,
- `\Q\\\E`,
- `\Q\\\\E`,
- `\Q\\\\\E`,
- `(?:a)`,
- `(?P<name>a)`,
-}
-
-var onlyPOSIX = []string{
- "a++",
- "a**",
- "a?*",
- "a+*",
- "a{1}*",
- ".{1}{2}.{3}",
-}
-
-func TestParseInvalidRegexps(t *testing.T) {
- for _, regexp := range invalidRegexps {
- if re, err := Parse(regexp, Perl); err == nil {
- t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
- }
- if re, err := Parse(regexp, POSIX); err == nil {
- t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
- }
- }
- for _, regexp := range onlyPerl {
- if _, err := Parse(regexp, Perl); err != nil {
- t.Errorf("Parse(%#q, Perl): %v", regexp, err)
- }
- if re, err := Parse(regexp, POSIX); err == nil {
- t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
- }
- }
- for _, regexp := range onlyPOSIX {
- if re, err := Parse(regexp, Perl); err == nil {
- t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
- }
- if _, err := Parse(regexp, POSIX); err != nil {
- t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
- }
- }
-}
-
-func TestToStringEquivalentParse(t *testing.T) {
- for _, tt := range parseTests {
- re, err := Parse(tt.Regexp, testFlags)
- if err != nil {
- t.Errorf("Parse(%#q): %v", tt.Regexp, err)
- continue
- }
- d := dump(re)
- if d != tt.Dump {
- t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
- continue
- }
-
- s := re.String()
- if s != tt.Regexp {
- // If ToString didn't return the original regexp,
- // it must have found one with fewer parens.
- // Unfortunately we can't check the length here, because
- // ToString produces "\\{" for a literal brace,
- // but "{" is a shorter equivalent in some contexts.
- nre, err := Parse(s, testFlags)
- if err != nil {
- t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
- continue
- }
- nd := dump(nre)
- if d != nd {
- t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
- }
-
- ns := nre.String()
- if s != ns {
- t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
- }
- }
- }
-}
diff --git a/src/pkg/regexp/syntax/perl_groups.go b/src/pkg/regexp/syntax/perl_groups.go
deleted file mode 100644
index effe4e686..000000000
--- a/src/pkg/regexp/syntax/perl_groups.go
+++ /dev/null
@@ -1,134 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
-// make_perl_groups.pl >perl_groups.go
-
-package syntax
-
-var code1 = []rune{ /* \d */
- 0x30, 0x39,
-}
-
-var code2 = []rune{ /* \s */
- 0x9, 0xa,
- 0xc, 0xd,
- 0x20, 0x20,
-}
-
-var code3 = []rune{ /* \w */
- 0x30, 0x39,
- 0x41, 0x5a,
- 0x5f, 0x5f,
- 0x61, 0x7a,
-}
-
-var perlGroup = map[string]charGroup{
- `\d`: {+1, code1},
- `\D`: {-1, code1},
- `\s`: {+1, code2},
- `\S`: {-1, code2},
- `\w`: {+1, code3},
- `\W`: {-1, code3},
-}
-var code4 = []rune{ /* [:alnum:] */
- 0x30, 0x39,
- 0x41, 0x5a,
- 0x61, 0x7a,
-}
-
-var code5 = []rune{ /* [:alpha:] */
- 0x41, 0x5a,
- 0x61, 0x7a,
-}
-
-var code6 = []rune{ /* [:ascii:] */
- 0x0, 0x7f,
-}
-
-var code7 = []rune{ /* [:blank:] */
- 0x9, 0x9,
- 0x20, 0x20,
-}
-
-var code8 = []rune{ /* [:cntrl:] */
- 0x0, 0x1f,
- 0x7f, 0x7f,
-}
-
-var code9 = []rune{ /* [:digit:] */
- 0x30, 0x39,
-}
-
-var code10 = []rune{ /* [:graph:] */
- 0x21, 0x7e,
-}
-
-var code11 = []rune{ /* [:lower:] */
- 0x61, 0x7a,
-}
-
-var code12 = []rune{ /* [:print:] */
- 0x20, 0x7e,
-}
-
-var code13 = []rune{ /* [:punct:] */
- 0x21, 0x2f,
- 0x3a, 0x40,
- 0x5b, 0x60,
- 0x7b, 0x7e,
-}
-
-var code14 = []rune{ /* [:space:] */
- 0x9, 0xd,
- 0x20, 0x20,
-}
-
-var code15 = []rune{ /* [:upper:] */
- 0x41, 0x5a,
-}
-
-var code16 = []rune{ /* [:word:] */
- 0x30, 0x39,
- 0x41, 0x5a,
- 0x5f, 0x5f,
- 0x61, 0x7a,
-}
-
-var code17 = []rune{ /* [:xdigit:] */
- 0x30, 0x39,
- 0x41, 0x46,
- 0x61, 0x66,
-}
-
-var posixGroup = map[string]charGroup{
- `[:alnum:]`: {+1, code4},
- `[:^alnum:]`: {-1, code4},
- `[:alpha:]`: {+1, code5},
- `[:^alpha:]`: {-1, code5},
- `[:ascii:]`: {+1, code6},
- `[:^ascii:]`: {-1, code6},
- `[:blank:]`: {+1, code7},
- `[:^blank:]`: {-1, code7},
- `[:cntrl:]`: {+1, code8},
- `[:^cntrl:]`: {-1, code8},
- `[:digit:]`: {+1, code9},
- `[:^digit:]`: {-1, code9},
- `[:graph:]`: {+1, code10},
- `[:^graph:]`: {-1, code10},
- `[:lower:]`: {+1, code11},
- `[:^lower:]`: {-1, code11},
- `[:print:]`: {+1, code12},
- `[:^print:]`: {-1, code12},
- `[:punct:]`: {+1, code13},
- `[:^punct:]`: {-1, code13},
- `[:space:]`: {+1, code14},
- `[:^space:]`: {-1, code14},
- `[:upper:]`: {+1, code15},
- `[:^upper:]`: {-1, code15},
- `[:word:]`: {+1, code16},
- `[:^word:]`: {-1, code16},
- `[:xdigit:]`: {+1, code17},
- `[:^xdigit:]`: {-1, code17},
-}
diff --git a/src/pkg/regexp/syntax/prog.go b/src/pkg/regexp/syntax/prog.go
deleted file mode 100644
index 29bd282d0..000000000
--- a/src/pkg/regexp/syntax/prog.go
+++ /dev/null
@@ -1,345 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import (
- "bytes"
- "strconv"
- "unicode"
-)
-
-// Compiled program.
-// May not belong in this package, but convenient for now.
-
-// A Prog is a compiled regular expression program.
-type Prog struct {
- Inst []Inst
- Start int // index of start instruction
- NumCap int // number of InstCapture insts in re
-}
-
-// An InstOp is an instruction opcode.
-type InstOp uint8
-
-const (
- InstAlt InstOp = iota
- InstAltMatch
- InstCapture
- InstEmptyWidth
- InstMatch
- InstFail
- InstNop
- InstRune
- InstRune1
- InstRuneAny
- InstRuneAnyNotNL
-)
-
-var instOpNames = []string{
- "InstAlt",
- "InstAltMatch",
- "InstCapture",
- "InstEmptyWidth",
- "InstMatch",
- "InstFail",
- "InstNop",
- "InstRune",
- "InstRune1",
- "InstRuneAny",
- "InstRuneAnyNotNL",
-}
-
-func (i InstOp) String() string {
- if uint(i) >= uint(len(instOpNames)) {
- return ""
- }
- return instOpNames[i]
-}
-
-// An EmptyOp specifies a kind or mixture of zero-width assertions.
-type EmptyOp uint8
-
-const (
- EmptyBeginLine EmptyOp = 1 << iota
- EmptyEndLine
- EmptyBeginText
- EmptyEndText
- EmptyWordBoundary
- EmptyNoWordBoundary
-)
-
-// EmptyOpContext returns the zero-width assertions
-// satisfied at the position between the runes r1 and r2.
-// Passing r1 == -1 indicates that the position is
-// at the beginning of the text.
-// Passing r2 == -1 indicates that the position is
-// at the end of the text.
-func EmptyOpContext(r1, r2 rune) EmptyOp {
- var op EmptyOp = EmptyNoWordBoundary
- var boundary byte
- switch {
- case IsWordChar(r1):
- boundary = 1
- case r1 == '\n':
- op |= EmptyBeginLine
- case r1 < 0:
- op |= EmptyBeginText | EmptyBeginLine
- }
- switch {
- case IsWordChar(r2):
- boundary ^= 1
- case r2 == '\n':
- op |= EmptyEndLine
- case r2 < 0:
- op |= EmptyEndText | EmptyEndLine
- }
- if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
- op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
- }
- return op
-}
-
-// IsWordChar reports whether r is consider a ``word character''
-// during the evaluation of the \b and \B zero-width assertions.
-// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
-func IsWordChar(r rune) bool {
- return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
-}
-
-// An Inst is a single instruction in a regular expression program.
-type Inst struct {
- Op InstOp
- Out uint32 // all but InstMatch, InstFail
- Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
- Rune []rune
-}
-
-func (p *Prog) String() string {
- var b bytes.Buffer
- dumpProg(&b, p)
- return b.String()
-}
-
-// skipNop follows any no-op or capturing instructions
-// and returns the resulting pc.
-func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
- i := &p.Inst[pc]
- for i.Op == InstNop || i.Op == InstCapture {
- pc = i.Out
- i = &p.Inst[pc]
- }
- return i, pc
-}
-
-// op returns i.Op but merges all the Rune special cases into InstRune
-func (i *Inst) op() InstOp {
- op := i.Op
- switch op {
- case InstRune1, InstRuneAny, InstRuneAnyNotNL:
- op = InstRune
- }
- return op
-}
-
-// Prefix returns a literal string that all matches for the
-// regexp must start with. Complete is true if the prefix
-// is the entire match.
-func (p *Prog) Prefix() (prefix string, complete bool) {
- i, _ := p.skipNop(uint32(p.Start))
-
- // Avoid allocation of buffer if prefix is empty.
- if i.op() != InstRune || len(i.Rune) != 1 {
- return "", i.Op == InstMatch
- }
-
- // Have prefix; gather characters.
- var buf bytes.Buffer
- for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
- buf.WriteRune(i.Rune[0])
- i, _ = p.skipNop(i.Out)
- }
- return buf.String(), i.Op == InstMatch
-}
-
-// StartCond returns the leading empty-width conditions that must
-// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
-func (p *Prog) StartCond() EmptyOp {
- var flag EmptyOp
- pc := uint32(p.Start)
- i := &p.Inst[pc]
-Loop:
- for {
- switch i.Op {
- case InstEmptyWidth:
- flag |= EmptyOp(i.Arg)
- case InstFail:
- return ^EmptyOp(0)
- case InstCapture, InstNop:
- // skip
- default:
- break Loop
- }
- pc = i.Out
- i = &p.Inst[pc]
- }
- return flag
-}
-
-const noMatch = -1
-
-// MatchRune returns true if the instruction matches (and consumes) r.
-// It should only be called when i.Op == InstRune.
-func (i *Inst) MatchRune(r rune) bool {
- return i.MatchRunePos(r) != noMatch
-}
-
-// MatchRunePos checks whether the instruction matches (and consumes) r.
-// If so, MatchRunePos returns the index of the matching rune pair
-// (or, when len(i.Rune) == 1, rune singleton).
-// If not, MatchRunePos returns -1.
-// MatchRunePos should only be called when i.Op == InstRune.
-func (i *Inst) MatchRunePos(r rune) int {
- rune := i.Rune
-
- // Special case: single-rune slice is from literal string, not char class.
- if len(rune) == 1 {
- r0 := rune[0]
- if r == r0 {
- return 0
- }
- if Flags(i.Arg)&FoldCase != 0 {
- for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
- if r == r1 {
- return 0
- }
- }
- }
- return noMatch
- }
-
- // Peek at the first few pairs.
- // Should handle ASCII well.
- for j := 0; j < len(rune) && j <= 8; j += 2 {
- if r < rune[j] {
- return noMatch
- }
- if r <= rune[j+1] {
- return j / 2
- }
- }
-
- // Otherwise binary search.
- lo := 0
- hi := len(rune) / 2
- for lo < hi {
- m := lo + (hi-lo)/2
- if c := rune[2*m]; c <= r {
- if r <= rune[2*m+1] {
- return m
- }
- lo = m + 1
- } else {
- hi = m
- }
- }
- return noMatch
-}
-
-// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
-// Since we act on runes, it would be easy to support Unicode here.
-func wordRune(r rune) bool {
- return r == '_' ||
- ('A' <= r && r <= 'Z') ||
- ('a' <= r && r <= 'z') ||
- ('0' <= r && r <= '9')
-}
-
-// MatchEmptyWidth returns true if the instruction matches
-// an empty string between the runes before and after.
-// It should only be called when i.Op == InstEmptyWidth.
-func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
- switch EmptyOp(i.Arg) {
- case EmptyBeginLine:
- return before == '\n' || before == -1
- case EmptyEndLine:
- return after == '\n' || after == -1
- case EmptyBeginText:
- return before == -1
- case EmptyEndText:
- return after == -1
- case EmptyWordBoundary:
- return wordRune(before) != wordRune(after)
- case EmptyNoWordBoundary:
- return wordRune(before) == wordRune(after)
- }
- panic("unknown empty width arg")
-}
-
-func (i *Inst) String() string {
- var b bytes.Buffer
- dumpInst(&b, i)
- return b.String()
-}
-
-func bw(b *bytes.Buffer, args ...string) {
- for _, s := range args {
- b.WriteString(s)
- }
-}
-
-func dumpProg(b *bytes.Buffer, p *Prog) {
- for j := range p.Inst {
- i := &p.Inst[j]
- pc := strconv.Itoa(j)
- if len(pc) < 3 {
- b.WriteString(" "[len(pc):])
- }
- if j == p.Start {
- pc += "*"
- }
- bw(b, pc, "\t")
- dumpInst(b, i)
- bw(b, "\n")
- }
-}
-
-func u32(i uint32) string {
- return strconv.FormatUint(uint64(i), 10)
-}
-
-func dumpInst(b *bytes.Buffer, i *Inst) {
- switch i.Op {
- case InstAlt:
- bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
- case InstAltMatch:
- bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
- case InstCapture:
- bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
- case InstEmptyWidth:
- bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
- case InstMatch:
- bw(b, "match")
- case InstFail:
- bw(b, "fail")
- case InstNop:
- bw(b, "nop -> ", u32(i.Out))
- case InstRune:
- if i.Rune == nil {
- // shouldn't happen
- bw(b, "rune <nil>")
- }
- bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
- if Flags(i.Arg)&FoldCase != 0 {
- bw(b, "/i")
- }
- bw(b, " -> ", u32(i.Out))
- case InstRune1:
- bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
- case InstRuneAny:
- bw(b, "any -> ", u32(i.Out))
- case InstRuneAnyNotNL:
- bw(b, "anynotnl -> ", u32(i.Out))
- }
-}
diff --git a/src/pkg/regexp/syntax/prog_test.go b/src/pkg/regexp/syntax/prog_test.go
deleted file mode 100644
index 50bfa3d4b..000000000
--- a/src/pkg/regexp/syntax/prog_test.go
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import "testing"
-
-var compileTests = []struct {
- Regexp string
- Prog string
-}{
- {"a", ` 0 fail
- 1* rune1 "a" -> 2
- 2 match
-`},
- {"[A-M][n-z]", ` 0 fail
- 1* rune "AM" -> 2
- 2 rune "nz" -> 3
- 3 match
-`},
- {"", ` 0 fail
- 1* nop -> 2
- 2 match
-`},
- {"a?", ` 0 fail
- 1 rune1 "a" -> 3
- 2* alt -> 1, 3
- 3 match
-`},
- {"a??", ` 0 fail
- 1 rune1 "a" -> 3
- 2* alt -> 3, 1
- 3 match
-`},
- {"a+", ` 0 fail
- 1* rune1 "a" -> 2
- 2 alt -> 1, 3
- 3 match
-`},
- {"a+?", ` 0 fail
- 1* rune1 "a" -> 2
- 2 alt -> 3, 1
- 3 match
-`},
- {"a*", ` 0 fail
- 1 rune1 "a" -> 2
- 2* alt -> 1, 3
- 3 match
-`},
- {"a*?", ` 0 fail
- 1 rune1 "a" -> 2
- 2* alt -> 3, 1
- 3 match
-`},
- {"a+b+", ` 0 fail
- 1* rune1 "a" -> 2
- 2 alt -> 1, 3
- 3 rune1 "b" -> 4
- 4 alt -> 3, 5
- 5 match
-`},
- {"(a+)(b+)", ` 0 fail
- 1* cap 2 -> 2
- 2 rune1 "a" -> 3
- 3 alt -> 2, 4
- 4 cap 3 -> 5
- 5 cap 4 -> 6
- 6 rune1 "b" -> 7
- 7 alt -> 6, 8
- 8 cap 5 -> 9
- 9 match
-`},
- {"a+|b+", ` 0 fail
- 1 rune1 "a" -> 2
- 2 alt -> 1, 6
- 3 rune1 "b" -> 4
- 4 alt -> 3, 6
- 5* alt -> 1, 3
- 6 match
-`},
- {"A[Aa]", ` 0 fail
- 1* rune1 "A" -> 2
- 2 rune "A"/i -> 3
- 3 match
-`},
- {"(?:(?:^).)", ` 0 fail
- 1* empty 4 -> 2
- 2 anynotnl -> 3
- 3 match
-`},
-}
-
-func TestCompile(t *testing.T) {
- for _, tt := range compileTests {
- re, _ := Parse(tt.Regexp, Perl)
- p, _ := Compile(re)
- s := p.String()
- if s != tt.Prog {
- t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
- }
- }
-}
-
-func BenchmarkEmptyOpContext(b *testing.B) {
- for i := 0; i < b.N; i++ {
- var r1 rune = -1
- for _, r2 := range "foo, bar, baz\nsome input text.\n" {
- EmptyOpContext(r1, r2)
- r1 = r2
- }
- EmptyOpContext(r1, -1)
- }
-}
diff --git a/src/pkg/regexp/syntax/regexp.go b/src/pkg/regexp/syntax/regexp.go
deleted file mode 100644
index 329a90e01..000000000
--- a/src/pkg/regexp/syntax/regexp.go
+++ /dev/null
@@ -1,319 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-// Note to implementers:
-// In this package, re is always a *Regexp and r is always a rune.
-
-import (
- "bytes"
- "strconv"
- "strings"
- "unicode"
-)
-
-// A Regexp is a node in a regular expression syntax tree.
-type Regexp struct {
- Op Op // operator
- Flags Flags
- Sub []*Regexp // subexpressions, if any
- Sub0 [1]*Regexp // storage for short Sub
- Rune []rune // matched runes, for OpLiteral, OpCharClass
- Rune0 [2]rune // storage for short Rune
- Min, Max int // min, max for OpRepeat
- Cap int // capturing index, for OpCapture
- Name string // capturing name, for OpCapture
-}
-
-// An Op is a single regular expression operator.
-type Op uint8
-
-// Operators are listed in precedence order, tightest binding to weakest.
-// Character class operators are listed simplest to most complex
-// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
-
-const (
- OpNoMatch Op = 1 + iota // matches no strings
- OpEmptyMatch // matches empty string
- OpLiteral // matches Runes sequence
- OpCharClass // matches Runes interpreted as range pair list
- OpAnyCharNotNL // matches any character
- OpAnyChar // matches any character
- OpBeginLine // matches empty string at beginning of line
- OpEndLine // matches empty string at end of line
- OpBeginText // matches empty string at beginning of text
- OpEndText // matches empty string at end of text
- OpWordBoundary // matches word boundary `\b`
- OpNoWordBoundary // matches word non-boundary `\B`
- OpCapture // capturing subexpression with index Cap, optional name Name
- OpStar // matches Sub[0] zero or more times
- OpPlus // matches Sub[0] one or more times
- OpQuest // matches Sub[0] zero or one times
- OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
- OpConcat // matches concatenation of Subs
- OpAlternate // matches alternation of Subs
-)
-
-const opPseudo Op = 128 // where pseudo-ops start
-
-// Equal returns true if x and y have identical structure.
-func (x *Regexp) Equal(y *Regexp) bool {
- if x == nil || y == nil {
- return x == y
- }
- if x.Op != y.Op {
- return false
- }
- switch x.Op {
- case OpEndText:
- // The parse flags remember whether this is \z or \Z.
- if x.Flags&WasDollar != y.Flags&WasDollar {
- return false
- }
-
- case OpLiteral, OpCharClass:
- if len(x.Rune) != len(y.Rune) {
- return false
- }
- for i, r := range x.Rune {
- if r != y.Rune[i] {
- return false
- }
- }
-
- case OpAlternate, OpConcat:
- if len(x.Sub) != len(y.Sub) {
- return false
- }
- for i, sub := range x.Sub {
- if !sub.Equal(y.Sub[i]) {
- return false
- }
- }
-
- case OpStar, OpPlus, OpQuest:
- if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
-
- case OpRepeat:
- if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
-
- case OpCapture:
- if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
- }
- return true
-}
-
-// writeRegexp writes the Perl syntax for the regular expression re to b.
-func writeRegexp(b *bytes.Buffer, re *Regexp) {
- switch re.Op {
- default:
- b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
- case OpNoMatch:
- b.WriteString(`[^\x00-\x{10FFFF}]`)
- case OpEmptyMatch:
- b.WriteString(`(?:)`)
- case OpLiteral:
- if re.Flags&FoldCase != 0 {
- b.WriteString(`(?i:`)
- }
- for _, r := range re.Rune {
- escape(b, r, false)
- }
- if re.Flags&FoldCase != 0 {
- b.WriteString(`)`)
- }
- case OpCharClass:
- if len(re.Rune)%2 != 0 {
- b.WriteString(`[invalid char class]`)
- break
- }
- b.WriteRune('[')
- if len(re.Rune) == 0 {
- b.WriteString(`^\x00-\x{10FFFF}`)
- } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
- // Contains 0 and MaxRune. Probably a negated class.
- // Print the gaps.
- b.WriteRune('^')
- for i := 1; i < len(re.Rune)-1; i += 2 {
- lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
- escape(b, lo, lo == '-')
- if lo != hi {
- b.WriteRune('-')
- escape(b, hi, hi == '-')
- }
- }
- } else {
- for i := 0; i < len(re.Rune); i += 2 {
- lo, hi := re.Rune[i], re.Rune[i+1]
- escape(b, lo, lo == '-')
- if lo != hi {
- b.WriteRune('-')
- escape(b, hi, hi == '-')
- }
- }
- }
- b.WriteRune(']')
- case OpAnyCharNotNL:
- b.WriteString(`(?-s:.)`)
- case OpAnyChar:
- b.WriteString(`(?s:.)`)
- case OpBeginLine:
- b.WriteRune('^')
- case OpEndLine:
- b.WriteRune('$')
- case OpBeginText:
- b.WriteString(`\A`)
- case OpEndText:
- if re.Flags&WasDollar != 0 {
- b.WriteString(`(?-m:$)`)
- } else {
- b.WriteString(`\z`)
- }
- case OpWordBoundary:
- b.WriteString(`\b`)
- case OpNoWordBoundary:
- b.WriteString(`\B`)
- case OpCapture:
- if re.Name != "" {
- b.WriteString(`(?P<`)
- b.WriteString(re.Name)
- b.WriteRune('>')
- } else {
- b.WriteRune('(')
- }
- if re.Sub[0].Op != OpEmptyMatch {
- writeRegexp(b, re.Sub[0])
- }
- b.WriteRune(')')
- case OpStar, OpPlus, OpQuest, OpRepeat:
- if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
- b.WriteString(`(?:`)
- writeRegexp(b, sub)
- b.WriteString(`)`)
- } else {
- writeRegexp(b, sub)
- }
- switch re.Op {
- case OpStar:
- b.WriteRune('*')
- case OpPlus:
- b.WriteRune('+')
- case OpQuest:
- b.WriteRune('?')
- case OpRepeat:
- b.WriteRune('{')
- b.WriteString(strconv.Itoa(re.Min))
- if re.Max != re.Min {
- b.WriteRune(',')
- if re.Max >= 0 {
- b.WriteString(strconv.Itoa(re.Max))
- }
- }
- b.WriteRune('}')
- }
- if re.Flags&NonGreedy != 0 {
- b.WriteRune('?')
- }
- case OpConcat:
- for _, sub := range re.Sub {
- if sub.Op == OpAlternate {
- b.WriteString(`(?:`)
- writeRegexp(b, sub)
- b.WriteString(`)`)
- } else {
- writeRegexp(b, sub)
- }
- }
- case OpAlternate:
- for i, sub := range re.Sub {
- if i > 0 {
- b.WriteRune('|')
- }
- writeRegexp(b, sub)
- }
- }
-}
-
-func (re *Regexp) String() string {
- var b bytes.Buffer
- writeRegexp(&b, re)
- return b.String()
-}
-
-const meta = `\.+*?()|[]{}^$`
-
-func escape(b *bytes.Buffer, r rune, force bool) {
- if unicode.IsPrint(r) {
- if strings.IndexRune(meta, r) >= 0 || force {
- b.WriteRune('\\')
- }
- b.WriteRune(r)
- return
- }
-
- switch r {
- case '\a':
- b.WriteString(`\a`)
- case '\f':
- b.WriteString(`\f`)
- case '\n':
- b.WriteString(`\n`)
- case '\r':
- b.WriteString(`\r`)
- case '\t':
- b.WriteString(`\t`)
- case '\v':
- b.WriteString(`\v`)
- default:
- if r < 0x100 {
- b.WriteString(`\x`)
- s := strconv.FormatInt(int64(r), 16)
- if len(s) == 1 {
- b.WriteRune('0')
- }
- b.WriteString(s)
- break
- }
- b.WriteString(`\x{`)
- b.WriteString(strconv.FormatInt(int64(r), 16))
- b.WriteString(`}`)
- }
-}
-
-// MaxCap walks the regexp to find the maximum capture index.
-func (re *Regexp) MaxCap() int {
- m := 0
- if re.Op == OpCapture {
- m = re.Cap
- }
- for _, sub := range re.Sub {
- if n := sub.MaxCap(); m < n {
- m = n
- }
- }
- return m
-}
-
-// CapNames walks the regexp to find the names of capturing groups.
-func (re *Regexp) CapNames() []string {
- names := make([]string, re.MaxCap()+1)
- re.capNames(names)
- return names
-}
-
-func (re *Regexp) capNames(names []string) {
- if re.Op == OpCapture {
- names[re.Cap] = re.Name
- }
- for _, sub := range re.Sub {
- sub.capNames(names)
- }
-}
diff --git a/src/pkg/regexp/syntax/simplify.go b/src/pkg/regexp/syntax/simplify.go
deleted file mode 100644
index 72390417b..000000000
--- a/src/pkg/regexp/syntax/simplify.go
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-// Simplify returns a regexp equivalent to re but without counted repetitions
-// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
-// The resulting regexp will execute correctly but its string representation
-// will not produce the same parse tree, because capturing parentheses
-// may have been duplicated or removed. For example, the simplified form
-// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
-// The returned regexp may share structure with or be the original.
-func (re *Regexp) Simplify() *Regexp {
- if re == nil {
- return nil
- }
- switch re.Op {
- case OpCapture, OpConcat, OpAlternate:
- // Simplify children, building new Regexp if children change.
- nre := re
- for i, sub := range re.Sub {
- nsub := sub.Simplify()
- if nre == re && nsub != sub {
- // Start a copy.
- nre = new(Regexp)
- *nre = *re
- nre.Rune = nil
- nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
- }
- if nre != re {
- nre.Sub = append(nre.Sub, nsub)
- }
- }
- return nre
-
- case OpStar, OpPlus, OpQuest:
- sub := re.Sub[0].Simplify()
- return simplify1(re.Op, re.Flags, sub, re)
-
- case OpRepeat:
- // Special special case: x{0} matches the empty string
- // and doesn't even need to consider x.
- if re.Min == 0 && re.Max == 0 {
- return &Regexp{Op: OpEmptyMatch}
- }
-
- // The fun begins.
- sub := re.Sub[0].Simplify()
-
- // x{n,} means at least n matches of x.
- if re.Max == -1 {
- // Special case: x{0,} is x*.
- if re.Min == 0 {
- return simplify1(OpStar, re.Flags, sub, nil)
- }
-
- // Special case: x{1,} is x+.
- if re.Min == 1 {
- return simplify1(OpPlus, re.Flags, sub, nil)
- }
-
- // General case: x{4,} is xxxx+.
- nre := &Regexp{Op: OpConcat}
- nre.Sub = nre.Sub0[:0]
- for i := 0; i < re.Min-1; i++ {
- nre.Sub = append(nre.Sub, sub)
- }
- nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
- return nre
- }
-
- // Special case x{0} handled above.
-
- // Special case: x{1} is just x.
- if re.Min == 1 && re.Max == 1 {
- return sub
- }
-
- // General case: x{n,m} means n copies of x and m copies of x?
- // The machine will do less work if we nest the final m copies,
- // so that x{2,5} = xx(x(x(x)?)?)?
-
- // Build leading prefix: xx.
- var prefix *Regexp
- if re.Min > 0 {
- prefix = &Regexp{Op: OpConcat}
- prefix.Sub = prefix.Sub0[:0]
- for i := 0; i < re.Min; i++ {
- prefix.Sub = append(prefix.Sub, sub)
- }
- }
-
- // Build and attach suffix: (x(x(x)?)?)?
- if re.Max > re.Min {
- suffix := simplify1(OpQuest, re.Flags, sub, nil)
- for i := re.Min + 1; i < re.Max; i++ {
- nre2 := &Regexp{Op: OpConcat}
- nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
- suffix = simplify1(OpQuest, re.Flags, nre2, nil)
- }
- if prefix == nil {
- return suffix
- }
- prefix.Sub = append(prefix.Sub, suffix)
- }
- if prefix != nil {
- return prefix
- }
-
- // Some degenerate case like min > max or min < max < 0.
- // Handle as impossible match.
- return &Regexp{Op: OpNoMatch}
- }
-
- return re
-}
-
-// simplify1 implements Simplify for the unary OpStar,
-// OpPlus, and OpQuest operators. It returns the simple regexp
-// equivalent to
-//
-// Regexp{Op: op, Flags: flags, Sub: {sub}}
-//
-// under the assumption that sub is already simple, and
-// without first allocating that structure. If the regexp
-// to be returned turns out to be equivalent to re, simplify1
-// returns re instead.
-//
-// simplify1 is factored out of Simplify because the implementation
-// for other operators generates these unary expressions.
-// Letting them call simplify1 makes sure the expressions they
-// generate are simple.
-func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
- // Special case: repeat the empty string as much as
- // you want, but it's still the empty string.
- if sub.Op == OpEmptyMatch {
- return sub
- }
- // The operators are idempotent if the flags match.
- if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
- return sub
- }
- if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
- return re
- }
-
- re = &Regexp{Op: op, Flags: flags}
- re.Sub = append(re.Sub0[:0], sub)
- return re
-}
diff --git a/src/pkg/regexp/syntax/simplify_test.go b/src/pkg/regexp/syntax/simplify_test.go
deleted file mode 100644
index 879eff5be..000000000
--- a/src/pkg/regexp/syntax/simplify_test.go
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package syntax
-
-import "testing"
-
-var simplifyTests = []struct {
- Regexp string
- Simple string
-}{
- // Already-simple constructs
- {`a`, `a`},
- {`ab`, `ab`},
- {`a|b`, `[a-b]`},
- {`ab|cd`, `ab|cd`},
- {`(ab)*`, `(ab)*`},
- {`(ab)+`, `(ab)+`},
- {`(ab)?`, `(ab)?`},
- {`.`, `(?s:.)`},
- {`^`, `^`},
- {`$`, `$`},
- {`[ac]`, `[ac]`},
- {`[^ac]`, `[^ac]`},
-
- // Posix character classes
- {`[[:alnum:]]`, `[0-9A-Za-z]`},
- {`[[:alpha:]]`, `[A-Za-z]`},
- {`[[:blank:]]`, `[\t ]`},
- {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
- {`[[:digit:]]`, `[0-9]`},
- {`[[:graph:]]`, `[!-~]`},
- {`[[:lower:]]`, `[a-z]`},
- {`[[:print:]]`, `[ -~]`},
- {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
- {`[[:space:]]`, `[\t-\r ]`},
- {`[[:upper:]]`, `[A-Z]`},
- {`[[:xdigit:]]`, `[0-9A-Fa-f]`},
-
- // Perl character classes
- {`\d`, `[0-9]`},
- {`\s`, `[\t-\n\f-\r ]`},
- {`\w`, `[0-9A-Z_a-z]`},
- {`\D`, `[^0-9]`},
- {`\S`, `[^\t-\n\f-\r ]`},
- {`\W`, `[^0-9A-Z_a-z]`},
- {`[\d]`, `[0-9]`},
- {`[\s]`, `[\t-\n\f-\r ]`},
- {`[\w]`, `[0-9A-Z_a-z]`},
- {`[\D]`, `[^0-9]`},
- {`[\S]`, `[^\t-\n\f-\r ]`},
- {`[\W]`, `[^0-9A-Z_a-z]`},
-
- // Posix repetitions
- {`a{1}`, `a`},
- {`a{2}`, `aa`},
- {`a{5}`, `aaaaa`},
- {`a{0,1}`, `a?`},
- // The next three are illegible because Simplify inserts (?:)
- // parens instead of () parens to avoid creating extra
- // captured subexpressions. The comments show a version with fewer parens.
- {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
- {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
- {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
- {`a{0,2}`, `(?:aa?)?`}, // (aa?)?
- {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
- {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
- {`a{0,}`, `a*`},
- {`a{1,}`, `a+`},
- {`a{2,}`, `aa+`},
- {`a{5,}`, `aaaaa+`},
-
- // Test that operators simplify their arguments.
- {`(?:a{1,}){1,}`, `a+`},
- {`(a{1,}b{1,})`, `(a+b+)`},
- {`a{1,}|b{1,}`, `a+|b+`},
- {`(?:a{1,})*`, `(?:a+)*`},
- {`(?:a{1,})+`, `a+`},
- {`(?:a{1,})?`, `(?:a+)?`},
- {``, `(?:)`},
- {`a{0}`, `(?:)`},
-
- // Character class simplification
- {`[ab]`, `[a-b]`},
- {`[a-za-za-z]`, `[a-z]`},
- {`[A-Za-zA-Za-z]`, `[A-Za-z]`},
- {`[ABCDEFGH]`, `[A-H]`},
- {`[AB-CD-EF-GH]`, `[A-H]`},
- {`[W-ZP-XE-R]`, `[E-Z]`},
- {`[a-ee-gg-m]`, `[a-m]`},
- {`[a-ea-ha-m]`, `[a-m]`},
- {`[a-ma-ha-e]`, `[a-m]`},
- {`[a-zA-Z0-9 -~]`, `[ -~]`},
-
- // Empty character classes
- {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
-
- // Full character classes
- {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
-
- // Unicode case folding.
- {`(?i)A`, `(?i:A)`},
- {`(?i)a`, `(?i:A)`},
- {`(?i)[A]`, `(?i:A)`},
- {`(?i)[a]`, `(?i:A)`},
- {`(?i)K`, `(?i:K)`},
- {`(?i)k`, `(?i:K)`},
- {`(?i)\x{212a}`, "(?i:K)"},
- {`(?i)[K]`, "[Kk\u212A]"},
- {`(?i)[k]`, "[Kk\u212A]"},
- {`(?i)[\x{212a}]`, "[Kk\u212A]"},
- {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
- {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
- {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
-
- // Empty string as a regular expression.
- // The empty string must be preserved inside parens in order
- // to make submatches work right, so these tests are less
- // interesting than they might otherwise be. String inserts
- // explicit (?:) in place of non-parenthesized empty strings,
- // to make them easier to spot for other parsers.
- {`(a|b|)`, `([a-b]|(?:))`},
- {`(|)`, `()`},
- {`a()`, `a()`},
- {`(()|())`, `(()|())`},
- {`(a|)`, `(a|(?:))`},
- {`ab()cd()`, `ab()cd()`},
- {`()`, `()`},
- {`()*`, `()*`},
- {`()+`, `()+`},
- {`()?`, `()?`},
- {`(){0}`, `(?:)`},
- {`(){1}`, `()`},
- {`(){1,}`, `()+`},
- {`(){0,2}`, `(?:()()?)?`},
-}
-
-func TestSimplify(t *testing.T) {
- for _, tt := range simplifyTests {
- re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
- if err != nil {
- t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
- continue
- }
- s := re.Simplify().String()
- if s != tt.Simple {
- t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
- }
- }
-}