diff options
Diffstat (limited to 'src/pkg/exp')
22 files changed, 6470 insertions, 1 deletions
diff --git a/src/pkg/exp/ogle/cmd.go b/src/pkg/exp/ogle/cmd.go index a8db523ea..ff0d24c69 100644 --- a/src/pkg/exp/ogle/cmd.go +++ b/src/pkg/exp/ogle/cmd.go @@ -154,7 +154,7 @@ func cmdLoad(args []byte) os.Error { } println("Attached to", pid) } else { - parts := strings.Split(path, " ", -1) + parts := strings.Split(path, " ") if len(parts) == 0 { fname = "" } else { diff --git a/src/pkg/exp/regexp/syntax/Makefile b/src/pkg/exp/regexp/syntax/Makefile new file mode 100644 index 000000000..97d4ad6ca --- /dev/null +++ b/src/pkg/exp/regexp/syntax/Makefile @@ -0,0 +1,16 @@ +# Copyright 2011 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../../../../Make.inc + +TARG=exp/regexp/syntax +GOFILES=\ + compile.go\ + parse.go\ + perl_groups.go\ + prog.go\ + regexp.go\ + simplify.go\ + +include ../../../../Make.pkg diff --git a/src/pkg/exp/regexp/syntax/compile.go b/src/pkg/exp/regexp/syntax/compile.go new file mode 100644 index 000000000..ec9556fde --- /dev/null +++ b/src/pkg/exp/regexp/syntax/compile.go @@ -0,0 +1,264 @@ +package syntax + +import ( + "os" + "unicode" +) + +// A patchList is a list of instruction pointers that need to be filled in (patched). +// Because the pointers haven't been filled in yet, we can reuse their storage +// to hold the list. It's kind of sleazy, but works well in practice. +// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// These aren't really pointers: they're integers, so we can reinterpret them +// this way without using package unsafe. A value l denotes +// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). +// l == 0 denotes the empty list, okay because we start every program +// with a fail instruction, so we'll never want to point at its output link. +type patchList uint32 + +func (l patchList) next(p *Prog) patchList { + i := &p.Inst[l>>1] + if l&1 == 0 { + return patchList(i.Out) + } + return patchList(i.Arg) +} + +func (l patchList) patch(p *Prog, val uint32) { + for l != 0 { + i := &p.Inst[l>>1] + if l&1 == 0 { + l = patchList(i.Out) + i.Out = val + } else { + l = patchList(i.Arg) + i.Arg = val + } + } +} + +func (l1 patchList) append(p *Prog, l2 patchList) patchList { + if l1 == 0 { + return l2 + } + if l2 == 0 { + return l1 + } + + last := l1 + for { + next := last.next(p) + if next == 0 { + break + } + last = next + } + + i := &p.Inst[last>>1] + if last&1 == 0 { + i.Out = uint32(l2) + } else { + i.Arg = uint32(l2) + } + return l1 +} + +// A frag represents a compiled program fragment. +type frag struct { + i uint32 // index of first instruction + out patchList // where to record end instruction +} + +type compiler struct { + p *Prog +} + +// Compile compiles the regexp into a program to be executed. +func Compile(re *Regexp) (*Prog, os.Error) { + var c compiler + c.init() + f := c.compile(re) + f.out.patch(c.p, c.inst(InstMatch).i) + c.p.Start = int(f.i) + return c.p, nil +} + +func (c *compiler) init() { + c.p = new(Prog) + c.inst(InstFail) +} + +var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune} +var anyRune = []int{0, unicode.MaxRune} + +func (c *compiler) compile(re *Regexp) frag { + switch re.Op { + case OpNoMatch: + return c.fail() + case OpEmptyMatch: + return c.nop() + case OpLiteral: + if len(re.Rune) == 0 { + return c.nop() + } + var f frag + for j := range re.Rune { + f1 := c.rune(re.Rune[j : j+1]) + if j == 0 { + f = f1 + } else { + f = c.cat(f, f1) + } + } + return f + case OpCharClass: + return c.rune(re.Rune) + case OpAnyCharNotNL: + return c.rune(anyRuneNotNL) + case OpAnyChar: + return c.rune(anyRune) + case OpBeginLine: + return c.empty(EmptyBeginLine) + case OpEndLine: + return c.empty(EmptyEndLine) + case OpBeginText: + return c.empty(EmptyBeginText) + case OpEndText: + return c.empty(EmptyEndText) + case OpWordBoundary: + return c.empty(EmptyWordBoundary) + case OpNoWordBoundary: + return c.empty(EmptyNoWordBoundary) + case OpCapture: + bra := c.cap(uint32(re.Cap << 1)) + sub := c.compile(re.Sub[0]) + ket := c.cap(uint32(re.Cap<<1 | 1)) + return c.cat(c.cat(bra, sub), ket) + case OpStar: + return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpPlus: + return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpQuest: + return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) + case OpConcat: + if len(re.Sub) == 0 { + return c.nop() + } + var f frag + for i, sub := range re.Sub { + if i == 0 { + f = c.compile(sub) + } else { + f = c.cat(f, c.compile(sub)) + } + } + return f + case OpAlternate: + var f frag + for _, sub := range re.Sub { + f = c.alt(f, c.compile(sub)) + } + return f + } + panic("regexp: unhandled case in compile") +} + +func (c *compiler) inst(op InstOp) frag { + // TODO: impose length limit + f := frag{i: uint32(len(c.p.Inst))} + c.p.Inst = append(c.p.Inst, Inst{Op: op}) + return f +} + +func (c *compiler) nop() frag { + f := c.inst(InstNop) + f.out = patchList(f.i << 1) + return f +} + +func (c *compiler) fail() frag { + return frag{} +} + +func (c *compiler) cap(arg uint32) frag { + f := c.inst(InstCapture) + f.out = patchList(f.i << 1) + c.p.Inst[f.i].Arg = arg + return f +} + +func (c *compiler) cat(f1, f2 frag) frag { + // concat of failure is failure + if f1.i == 0 || f2.i == 0 { + return frag{} + } + + // TODO: elide nop + + f1.out.patch(c.p, f2.i) + return frag{f1.i, f2.out} +} + +func (c *compiler) alt(f1, f2 frag) frag { + // alt of failure is other + if f1.i == 0 { + return f2 + } + if f2.i == 0 { + return f1 + } + + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + i.Out = f1.i + i.Arg = f2.i + f.out = f1.out.append(c.p, f2.out) + return f +} + +func (c *compiler) quest(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = patchList(f.i << 1) + } else { + i.Out = f1.i + f.out = patchList(f.i<<1 | 1) + } + f.out = f.out.append(c.p, f1.out) + return f +} + +func (c *compiler) star(f1 frag, nongreedy bool) frag { + f := c.inst(InstAlt) + i := &c.p.Inst[f.i] + if nongreedy { + i.Arg = f1.i + f.out = patchList(f.i << 1) + } else { + i.Out = f1.i + f.out = patchList(f.i<<1 | 1) + } + f1.out.patch(c.p, f.i) + return f +} + +func (c *compiler) plus(f1 frag, nongreedy bool) frag { + return frag{f1.i, c.star(f1, nongreedy).out} +} + +func (c *compiler) empty(op EmptyOp) frag { + f := c.inst(InstEmptyWidth) + c.p.Inst[f.i].Arg = uint32(op) + f.out = patchList(f.i << 1) + return f +} + +func (c *compiler) rune(rune []int) frag { + f := c.inst(InstRune) + c.p.Inst[f.i].Rune = rune + f.out = patchList(f.i << 1) + return f +} diff --git a/src/pkg/exp/regexp/syntax/make_perl_groups.pl b/src/pkg/exp/regexp/syntax/make_perl_groups.pl new file mode 100755 index 000000000..6d1b84b10 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/make_perl_groups.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl +# Copyright 2008 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Modified version of RE2's make_perl_groups.pl. + +# Generate table entries giving character ranges +# for POSIX/Perl character classes. Rather than +# figure out what the definition is, it is easier to ask +# Perl about each letter from 0-128 and write down +# its answer. + +@posixclasses = ( + "[:alnum:]", + "[:alpha:]", + "[:ascii:]", + "[:blank:]", + "[:cntrl:]", + "[:digit:]", + "[:graph:]", + "[:lower:]", + "[:print:]", + "[:punct:]", + "[:space:]", + "[:upper:]", + "[:word:]", + "[:xdigit:]", +); + +@perlclasses = ( + "\\d", + "\\s", + "\\w", +); + +sub ComputeClass($) { + my @ranges; + my ($class) = @_; + my $regexp = "[$class]"; + my $start = -1; + for (my $i=0; $i<=129; $i++) { + if ($i == 129) { $i = 256; } + if ($i <= 128 && chr($i) =~ $regexp) { + if ($start < 0) { + $start = $i; + } + } else { + if ($start >= 0) { + push @ranges, [$start, $i-1]; + } + $start = -1; + } + } + return @ranges; +} + +sub PrintClass($$@) { + my ($cname, $name, @ranges) = @_; + print "var code$cname = []int{ /* $name */\n"; + for (my $i=0; $i<@ranges; $i++) { + my @a = @{$ranges[$i]}; + printf "\t0x%x, 0x%x,\n", $a[0], $a[1]; + } + print "}\n\n"; + my $n = @ranges; + $negname = $name; + if ($negname =~ /:/) { + $negname =~ s/:/:^/; + } else { + $negname =~ y/a-z/A-Z/; + } + return "\t`$name`: {+1, code$cname},\n" . + "\t`$negname`: {-1, code$cname},\n"; +} + +my $gen = 0; + +sub PrintClasses($@) { + my ($cname, @classes) = @_; + my @entries; + foreach my $cl (@classes) { + my @ranges = ComputeClass($cl); + push @entries, PrintClass(++$gen, $cl, @ranges); + } + print "var ${cname}Group = map[string]charGroup{\n"; + foreach my $e (@entries) { + print $e; + } + print "}\n"; + my $count = @entries; +} + +print <<EOF; +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.go + +package syntax + +EOF + +PrintClasses("perl", @perlclasses); +PrintClasses("posix", @posixclasses); diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go new file mode 100644 index 000000000..b6c91f7e1 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/parse.go @@ -0,0 +1,1798 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "os" + "sort" + "strings" + "unicode" + "utf8" +) + +// An Error describes a failure to parse a regular expression +// and gives the offending expression. +type Error struct { + Code ErrorCode + Expr string +} + +func (e *Error) String() string { + return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" +} + +// An ErrorCode describes a failure to parse a regular expression. +type ErrorCode string + +const ( + // Unexpected error + ErrInternalError ErrorCode = "regexp/syntax: internal error" + + // Parse errors + ErrInvalidCharClass ErrorCode = "invalid character class" + ErrInvalidCharRange ErrorCode = "invalid character class range" + ErrInvalidEscape ErrorCode = "invalid escape sequence" + ErrInvalidNamedCapture ErrorCode = "invalid named capture" + ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" + ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" + ErrInvalidRepeatSize ErrorCode = "invalid repeat count" + ErrInvalidUTF8 ErrorCode = "invalid UTF-8" + ErrMissingBracket ErrorCode = "missing closing ]" + ErrMissingParen ErrorCode = "missing closing )" + ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" + ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" +) + +func (e ErrorCode) String() string { + return string(e) +} + +// Flags control the behavior of the parser and record information about regexp context. +type Flags uint16 + +const ( + FoldCase Flags = 1 << iota // case-insensitive match + Literal // treat pattern as literal string + ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline + DotNL // allow . to match newline + OneLine // treat ^ and $ as only matching at beginning and end of text + NonGreedy // make repetition operators default to non-greedy + PerlX // allow Perl extensions + UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation + WasDollar // regexp OpEndText was $, not \z + Simple // regexp contains no counted repetition + + MatchNL = ClassNL | DotNL + + Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible + POSIX Flags = 0 // POSIX syntax +) + +// Pseudo-ops for parsing stack. +const ( + opLeftParen = opPseudo + iota + opVerticalBar +) + +type parser struct { + flags Flags // parse mode flags + stack []*Regexp // stack of parsed expressions + free *Regexp + numCap int // number of capturing groups seen + wholeRegexp string + tmpClass []int // temporary char class work space +} + +func (p *parser) newRegexp(op Op) *Regexp { + re := p.free + if re != nil { + p.free = re.Sub0[0] + *re = Regexp{} + } else { + re = new(Regexp) + } + re.Op = op + return re +} + +func (p *parser) reuse(re *Regexp) { + re.Sub0[0] = p.free + p.free = re +} + +// Parse stack manipulation. + +// push pushes the regexp re onto the parse stack and returns the regexp. +func (p *parser) push(re *Regexp) *Regexp { + if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { + // Single rune. + if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { + return nil + } + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags &^ FoldCase + } else if re.Op == OpCharClass && len(re.Rune) == 4 && + re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && + unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || + re.Op == OpCharClass && len(re.Rune) == 2 && + re.Rune[0]+1 == re.Rune[1] && + unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && + unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { + // Case-insensitive rune like [Aa] or [Δδ]. + if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { + return nil + } + + // Rewrite as (case-insensitive) literal. + re.Op = OpLiteral + re.Rune = re.Rune[:1] + re.Flags = p.flags | FoldCase + } else { + // Incremental concatenation. + p.maybeConcat(-1, 0) + } + + p.stack = append(p.stack, re) + return re +} + +// maybeConcat implements incremental concatenation +// of literal runes into string nodes. The parser calls this +// before each push, so only the top fragment of the stack +// might need processing. Since this is called before a push, +// the topmost literal is no longer subject to operators like * +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0 and there's a node left over, maybeConcat uses it +// to push r with the given flags. +// maybeConcat reports whether r was pushed. +func (p *parser) maybeConcat(r int, flags Flags) bool { + n := len(p.stack) + if n < 2 { + return false + } + + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { + return false + } + + // Push re1 into re2. + re2.Rune = append(re2.Rune, re1.Rune...) + + // Reuse re1 if possible. + if r >= 0 { + re1.Rune = re1.Rune0[:1] + re1.Rune[0] = r + re1.Flags = flags + return true + } + + p.stack = p.stack[:n-1] + p.reuse(re1) + return false // did not push r +} + +// newLiteral returns a new OpLiteral Regexp with the given flags +func (p *parser) newLiteral(r int, flags Flags) *Regexp { + re := p.newRegexp(OpLiteral) + re.Flags = flags + re.Rune0[0] = r + re.Rune = re.Rune0[:1] + return re +} + +// literal pushes a literal regexp for the rune r on the stack +// and returns that regexp. +func (p *parser) literal(r int) { + p.push(p.newLiteral(r, p.flags)) +} + +// op pushes a regexp with the given op onto the stack +// and returns that regexp. +func (p *parser) op(op Op) *Regexp { + re := p.newRegexp(op) + re.Flags = p.flags + return p.push(re) +} + +// repeat replaces the top stack element with itself repeated +// according to op. +func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (string, os.Error) { + flags := p.flags + if p.flags&PerlX != 0 { + if len(t) > 0 && t[0] == '?' { + t = t[1:] + flags ^= NonGreedy + } + if lastRepeat != "" { + // In Perl it is not allowed to stack repetition operators: + // a** is a syntax error, not a doubled star, and a++ means + // something else entirely, which we don't support! + return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]} + } + } + n := len(p.stack) + if n == 0 { + return "", &Error{ErrMissingRepeatArgument, opstr} + } + sub := p.stack[n-1] + re := p.newRegexp(op) + re.Min = min + re.Max = max + re.Flags = flags + re.Sub = re.Sub0[:1] + re.Sub[0] = sub + p.stack[n-1] = re + return t, nil +} + +// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. +func (p *parser) concat() *Regexp { + p.maybeConcat(-1, 0) + + // Scan down to find pseudo-operator | or (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Empty concatenation is special case. + if len(subs) == 0 { + return p.push(p.newRegexp(OpEmptyMatch)) + } + + return p.push(p.collapse(subs, OpConcat)) +} + +// alternate replaces the top of the stack (above the topmost '(') with its alternation. +func (p *parser) alternate() *Regexp { + // Scan down to find pseudo-operator (. + // There are no | above (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + subs := p.stack[i:] + p.stack = p.stack[:i] + + // Make sure top class is clean. + // All the others already are (see swapVerticalBar). + if len(subs) > 0 { + cleanAlt(subs[len(subs)-1]) + } + + // Empty alternate is special case + // (shouldn't happen but easy to handle). + if len(subs) == 0 { + return p.push(p.newRegexp(OpNoMatch)) + } + + return p.push(p.collapse(subs, OpAlternate)) +} + +// cleanAlt cleans re for eventual inclusion in an alternation. +func cleanAlt(re *Regexp) { + switch re.Op { + case OpCharClass: + re.Rune = cleanClass(&re.Rune) + if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyChar + return + } + if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { + re.Rune = nil + re.Op = OpAnyCharNotNL + return + } + if cap(re.Rune)-len(re.Rune) > 100 { + // re.Rune will not grow any more. + // Make a copy or inline to reclaim storage. + re.Rune = append(re.Rune0[:0], re.Rune...) + } + } +} + +// collapse returns the result of applying op to sub. +// If sub contains op nodes, they all get hoisted up +// so that there is never a concat of a concat or an +// alternate of an alternate. +func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { + if len(subs) == 1 { + return subs[0] + } + re := p.newRegexp(op) + re.Sub = re.Sub0[:0] + for _, sub := range subs { + if sub.Op == op { + re.Sub = append(re.Sub, sub.Sub...) + p.reuse(sub) + } else { + re.Sub = append(re.Sub, sub) + } + } + if op == OpAlternate { + re.Sub = p.factor(re.Sub, re.Flags) + if len(re.Sub) == 1 { + old := re + re = re.Sub[0] + p.reuse(old) + } + } + return re +} + +// factor factors common prefixes from the alternation list sub. +// It returns a replacement list that reuses the same storage and +// frees (passes to p.reuse) any removed *Regexps. +// +// For example, +// ABC|ABD|AEF|BCX|BCY +// simplifies by literal prefix extraction to +// A(B(C|D)|EF)|BC(X|Y) +// which simplifies by character class introduction to +// A(B[CD]|EF)|BC[XY] +// +func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { + if len(sub) < 2 { + return sub + } + + // Round 1: Factor out common literal prefixes. + var str []int + var strflags Flags + start := 0 + out := sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var istr []int + var iflags Flags + if i < len(sub) { + istr, iflags = p.leadingString(sub[i]) + if iflags == strflags { + same := 0 + for same < len(str) && same < len(istr) && str[same] == istr[same] { + same++ + } + if same > 0 { + // Matches at least one rune in current range. + // Keep going around. + str = str[:same] + continue + } + } + } + + // Found end of a run with common leading literal string: + // sub[start:i] all begin with str[0:len(str)], but sub[i] + // does not even begin with str[0]. + // + // Factor out common string and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := p.newRegexp(OpLiteral) + prefix.Flags = strflags + prefix.Rune = append(prefix.Rune[:0], str...) + + for j := start; j < i; j++ { + sub[j] = p.removeLeadingString(sub[j], len(str)) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + str = istr + strflags = iflags + } + sub = out + + // Round 2: Factor out common complex prefixes, + // just the first piece of each concatenation, + // whatever it is. This is good enough a lot of the time. + start = 0 + out = sub[:0] + var first *Regexp + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var ifirst *Regexp + if i < len(sub) { + ifirst = p.leadingRegexp(sub[i]) + if first != nil && first.Equal(ifirst) { + continue + } + } + + // Found end of a run with common leading regexp: + // sub[start:i] all begin with first but sub[i] does not. + // + // Factor out common regexp and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := first + + for j := start; j < i; j++ { + reuse := j != start // prefix came from sub[start] + sub[j] = p.removeLeadingRegexp(sub[j], reuse) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + first = ifirst + } + sub = out + + // Round 3: Collapse runs of single literals into character classes. + start = 0 + out = sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that are either + // literal runes or character classes. + if i < len(sub) && isCharClass(sub[i]) { + continue + } + + // sub[i] is not a char or char class; + // emit char class for sub[start:i]... + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + out = append(out, sub[start]) + } else { + // Make new char class. + // Start with most complex regexp in sub[start]. + max := start + for j := start + 1; j < i; j++ { + if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { + max = j + } + } + sub[start], sub[max] = sub[max], sub[start] + + for j := start + 1; j < i; j++ { + mergeCharClass(sub[start], sub[j]) + p.reuse(sub[j]) + } + cleanAlt(sub[start]) + out = append(out, sub[start]) + } + + // ... and then emit sub[i]. + if i < len(sub) { + out = append(out, sub[i]) + } + start = i + 1 + } + sub = out + + // Round 4: Collapse runs of empty matches into a single empty match. + start = 0 + out = sub[:0] + for i := range sub { + if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { + continue + } + out = append(out, sub[i]) + } + sub = out + + return sub +} + +// leadingString returns the leading literal string that re begins with. +// The string refers to storage in re or its children. +func (p *parser) leadingString(re *Regexp) ([]int, Flags) { + if re.Op == OpConcat && len(re.Sub) > 0 { + re = re.Sub[0] + } + if re.Op != OpLiteral { + return nil, 0 + } + return re.Rune, re.Flags & FoldCase +} + +// removeLeadingString removes the first n leading runes +// from the beginning of re. It returns the replacement for re. +func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + // Removing a leading string in a concatenation + // might simplify the concatenation. + sub := re.Sub[0] + sub = p.removeLeadingString(sub, n) + re.Sub[0] = sub + if sub.Op == OpEmptyMatch { + p.reuse(sub) + switch len(re.Sub) { + case 0, 1: + // Impossible but handle. + re.Op = OpEmptyMatch + re.Sub = nil + case 2: + old := re + re = re.Sub[1] + p.reuse(old) + default: + copy(re.Sub, re.Sub[1:]) + re.Sub = re.Sub[:len(re.Sub)-1] + } + } + return re + } + + if re.Op == OpLiteral { + re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] + if len(re.Rune) == 0 { + re.Op = OpEmptyMatch + } + } + return re +} + +// leadingRegexp returns the leading regexp that re begins with. +// The regexp refers to storage in re or its children. +func (p *parser) leadingRegexp(re *Regexp) *Regexp { + if re.Op == OpEmptyMatch { + return nil + } + if re.Op == OpConcat && len(re.Sub) > 0 { + sub := re.Sub[0] + if sub.Op == OpEmptyMatch { + return nil + } + return sub + } + return re +} + +// removeLeadingRegexp removes the leading regexp in re. +// It returns the replacement for re. +// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. +func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + if reuse { + p.reuse(re.Sub[0]) + } + re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] + switch len(re.Sub) { + case 0: + re.Op = OpEmptyMatch + re.Sub = nil + case 1: + old := re + re = re.Sub[0] + p.reuse(old) + } + return re + } + re.Op = OpEmptyMatch + return re +} + +func literalRegexp(s string, flags Flags) *Regexp { + re := &Regexp{Op: OpLiteral} + re.Flags = flags + re.Rune = re.Rune0[:0] // use local storage for small strings + for _, c := range s { + if len(re.Rune) >= cap(re.Rune) { + // string is too long to fit in Rune0. let Go handle it + re.Rune = []int(s) + break + } + re.Rune = append(re.Rune, c) + } + return re +} + +// Parsing. + +func Parse(s string, flags Flags) (*Regexp, os.Error) { + if flags&Literal != 0 { + // Trivial parser for literal string. + if err := checkUTF8(s); err != nil { + return nil, err + } + return literalRegexp(s, flags), nil + } + + // Otherwise, must do real work. + var ( + p parser + err os.Error + c int + op Op + lastRepeat string + min, max int + ) + p.flags = flags + p.wholeRegexp = s + t := s + for t != "" { + repeat := "" + BigSwitch: + switch t[0] { + default: + if c, t, err = nextRune(t); err != nil { + return nil, err + } + p.literal(c) + + case '(': + if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { + // Flag changes and non-capturing groups. + if t, err = p.parsePerlFlags(t); err != nil { + return nil, err + } + break + } + p.numCap++ + p.op(opLeftParen).Cap = p.numCap + t = t[1:] + case '|': + if err = p.parseVerticalBar(); err != nil { + return nil, err + } + t = t[1:] + case ')': + if err = p.parseRightParen(); err != nil { + return nil, err + } + t = t[1:] + case '^': + if p.flags&OneLine != 0 { + p.op(OpBeginText) + } else { + p.op(OpBeginLine) + } + t = t[1:] + case '$': + if p.flags&OneLine != 0 { + p.op(OpEndText).Flags |= WasDollar + } else { + p.op(OpEndLine) + } + t = t[1:] + case '.': + if p.flags&DotNL != 0 { + p.op(OpAnyChar) + } else { + p.op(OpAnyCharNotNL) + } + t = t[1:] + case '[': + if t, err = p.parseClass(t); err != nil { + return nil, err + } + case '*', '+', '?': + switch t[0] { + case '*': + op = OpStar + case '+': + op = OpPlus + case '?': + op = OpQuest + } + if t, err = p.repeat(op, min, max, t[:1], t[1:], lastRepeat); err != nil { + return nil, err + } + case '{': + op = OpRepeat + min, max, tt, ok := p.parseRepeat(t) + if !ok { + // If the repeat cannot be parsed, { is a literal. + p.literal('{') + t = t[1:] + break + } + if t, err = p.repeat(op, min, max, t[:len(t)-len(tt)], tt, lastRepeat); err != nil { + return nil, err + } + case '\\': + if p.flags&PerlX != 0 && len(t) >= 2 { + switch t[1] { + case 'A': + p.op(OpBeginText) + t = t[2:] + break BigSwitch + case 'b': + p.op(OpWordBoundary) + t = t[2:] + break BigSwitch + case 'B': + p.op(OpNoWordBoundary) + t = t[2:] + break BigSwitch + case 'C': + // any byte; not supported + return nil, &Error{ErrInvalidEscape, t[:2]} + case 'Q': + // \Q ... \E: the ... is always literals + var lit string + if i := strings.Index(t, `\E`); i < 0 { + lit = t[2:] + t = "" + } else { + lit = t[2:i] + t = t[i+2:] + } + p.push(literalRegexp(lit, p.flags)) + break BigSwitch + case 'z': + p.op(OpEndText) + t = t[2:] + break BigSwitch + } + } + + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + + // Look for Unicode character group like \p{Han} + if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { + r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) + if err != nil { + return nil, err + } + if r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + } + + // Perl character class escape. + if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { + re.Rune = r + t = rest + p.push(re) + break BigSwitch + } + p.reuse(re) + + // Ordinary single-character escape. + if c, t, err = p.parseEscape(t); err != nil { + return nil, err + } + p.literal(c) + } + lastRepeat = repeat + } + + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n != 1 { + return nil, &Error{ErrMissingParen, s} + } + return p.stack[0], nil +} + +// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. +// If s is not of that form, it returns ok == false. +func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { + if s == "" || s[0] != '{' { + return + } + s = s[1:] + if min, s, ok = p.parseInt(s); !ok { + return + } + if s == "" { + return + } + if s[0] != ',' { + max = min + } else { + s = s[1:] + if s == "" { + return + } + if s[0] == '}' { + max = -1 + } else if max, s, ok = p.parseInt(s); !ok { + return + } + } + if s == "" || s[0] != '}' { + return + } + rest = s[1:] + ok = true + return +} + +// parsePerlFlags parses a Perl flag setting or non-capturing group or both, +// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. +// The caller must have ensured that s begins with "(?". +func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) { + t := s + + // Check for named captures, first introduced in Python's regexp library. + // As usual, there are three slightly different syntaxes: + // + // (?P<name>expr) the original, introduced by Python + // (?<name>expr) the .NET alteration, adopted by Perl 5.10 + // (?'name'expr) another .NET alteration, adopted by Perl 5.10 + // + // Perl 5.10 gave in and implemented the Python version too, + // but they claim that the last two are the preferred forms. + // PCRE and languages based on it (specifically, PHP and Ruby) + // support all three as well. EcmaScript 4 uses only the Python form. + // + // In both the open source world (via Code Search) and the + // Google source tree, (?P<expr>name) is the dominant form, + // so that's the one we implement. One is enough. + if len(t) > 4 && t[2] == 'P' && t[3] == '<' { + // Pull out name. + end := strings.IndexRune(t, '>') + if end < 0 { + if err = checkUTF8(t); err != nil { + return "", err + } + return "", &Error{ErrInvalidNamedCapture, s} + } + + capture := t[:end+1] // "(?P<name>" + name := t[4:end] // "name" + if err = checkUTF8(name); err != nil { + return "", err + } + if !isValidCaptureName(name) { + return "", &Error{ErrInvalidNamedCapture, capture} + } + + // Like ordinary capture, but named. + p.numCap++ + re := p.op(opLeftParen) + re.Cap = p.numCap + re.Name = name + return t[end+1:], nil + } + + // Non-capturing group. Might also twiddle Perl flags. + var c int + t = t[2:] // skip (? + flags := p.flags + sign := +1 + sawFlag := false +Loop: + for t != "" { + if c, t, err = nextRune(t); err != nil { + return "", err + } + switch c { + default: + break Loop + + // Flags. + case 'i': + flags |= FoldCase + sawFlag = true + case 'm': + flags &^= OneLine + sawFlag = true + case 's': + flags |= DotNL + sawFlag = true + case 'U': + flags |= NonGreedy + sawFlag = true + + // Switch to negation. + case '-': + if sign < 0 { + break Loop + } + sign = -1 + // Invert flags so that | above turn into &^ and vice versa. + // We'll invert flags again before using it below. + flags = ^flags + sawFlag = false + + // End of flags, starting group or not. + case ':', ')': + if sign < 0 { + if !sawFlag { + break Loop + } + flags = ^flags + } + if c == ':' { + // Open new group + p.op(opLeftParen) + } + p.flags = flags + return t, nil + } + } + + return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} +} + +// isValidCaptureName reports whether name +// is a valid capture name: [A-Za-z0-9_]+. +// PCRE limits names to 32 bytes. +// Python rejects names starting with digits. +// We don't enforce either of those. +func isValidCaptureName(name string) bool { + if name == "" { + return false + } + for _, c := range name { + if c != '_' && !isalnum(c) { + return false + } + } + return true +} + +// parseInt parses a decimal integer. +func (p *parser) parseInt(s string) (n int, rest string, ok bool) { + if s == "" || s[0] < '0' || '9' < s[0] { + return + } + // Disallow leading zeros. + if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { + return + } + for s != "" && '0' <= s[0] && s[0] <= '9' { + // Avoid overflow. + if n >= 1e8 { + return + } + n = n*10 + int(s[0]) - '0' + s = s[1:] + } + rest = s + ok = true + return +} + +// can this be represented as a character class? +// single-rune literal string, char class, ., and .|\n. +func isCharClass(re *Regexp) bool { + return re.Op == OpLiteral && len(re.Rune) == 1 || + re.Op == OpCharClass || + re.Op == OpAnyCharNotNL || + re.Op == OpAnyChar +} + +// does re match r? +func matchRune(re *Regexp, r int) bool { + switch re.Op { + case OpLiteral: + return len(re.Rune) == 1 && re.Rune[0] == r + case OpCharClass: + for i := 0; i < len(re.Rune); i += 2 { + if re.Rune[i] <= r && r <= re.Rune[i+1] { + return true + } + } + return false + case OpAnyCharNotNL: + return r != '\n' + case OpAnyChar: + return true + } + return false +} + +// parseVerticalBar handles a | in the input. +func (p *parser) parseVerticalBar() os.Error { + p.concat() + + // The concatenation we just parsed is on top of the stack. + // If it sits above an opVerticalBar, swap it below + // (things below an opVerticalBar become an alternation). + // Otherwise, push a new vertical bar. + if !p.swapVerticalBar() { + p.op(opVerticalBar) + } + + return nil +} + +// mergeCharClass makes dst = dst|src. +// The caller must ensure that dst.Op >= src.Op, +// to reduce the amount of copying. +func mergeCharClass(dst, src *Regexp) { + switch dst.Op { + case OpAnyChar: + // src doesn't add anything. + case OpAnyCharNotNL: + // src might add \n + if matchRune(src, '\n') { + dst.Op = OpAnyChar + } + case OpCharClass: + // src is simpler, so either literal or char class + if src.Op == OpLiteral { + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } else { + dst.Rune = appendClass(dst.Rune, src.Rune) + } + case OpLiteral: + // both literal + if src.Rune[0] == dst.Rune[0] { + break + } + dst.Op = OpCharClass + dst.Rune = append(dst.Rune, dst.Rune[0]) + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } +} + +// If the top of the stack is an element followed by an opVerticalBar +// swapVerticalBar swaps the two and returns true. +// Otherwise it returns false. +func (p *parser) swapVerticalBar() bool { + // If above and below vertical bar are literal or char class, + // can merge into a single char class. + n := len(p.stack) + if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { + re1 := p.stack[n-1] + re3 := p.stack[n-3] + // Make re3 the more complex of the two. + if re1.Op > re3.Op { + re1, re3 = re3, re1 + p.stack[n-3] = re3 + } + mergeCharClass(re3, re1) + p.reuse(re1) + p.stack = p.stack[:n-1] + return true + } + + if n >= 2 { + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re2.Op == opVerticalBar { + if n >= 3 { + // Now out of reach. + // Clean opportunistically. + cleanAlt(p.stack[n-3]) + } + p.stack[n-2] = re1 + p.stack[n-1] = re2 + return true + } + } + return false +} + +// parseRightParen handles a ) in the input. +func (p *parser) parseRightParen() os.Error { + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n < 2 { + return &Error{ErrInternalError, ""} + } + re1 := p.stack[n-1] + re2 := p.stack[n-2] + p.stack = p.stack[:n-2] + if re2.Op != opLeftParen { + return &Error{ErrMissingParen, p.wholeRegexp} + } + if re2.Cap == 0 { + // Just for grouping. + p.push(re1) + } else { + re2.Op = OpCapture + re2.Sub = re2.Sub0[:1] + re2.Sub[0] = re1 + p.push(re2) + } + return nil +} + +// parseEscape parses an escape sequence at the beginning of s +// and returns the rune. +func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) { + t := s[1:] + if t == "" { + return 0, "", &Error{ErrTrailingBackslash, ""} + } + c, t, err := nextRune(t) + if err != nil { + return 0, "", err + } + +Switch: + switch c { + default: + if c < utf8.RuneSelf && !isalnum(c) { + // Escaped non-word characters are always themselves. + // PCRE is not quite so rigorous: it accepts things like + // \q, but we don't. We once rejected \_, but too many + // programs and people insist on using it, so allow \_. + return c, t, nil + } + + // Octal escapes. + case '1', '2', '3', '4', '5', '6', '7': + // Single non-zero digit is a backreference; not supported + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + fallthrough + case '0': + // Consume up to three octal digits; already have one. + r = c - '0' + for i := 1; i < 3; i++ { + if t == "" || t[0] < '0' || t[0] > '7' { + break + } + r = r*8 + int(t[0]) - '0' + t = t[1:] + } + return r, t, nil + + // Hexadecimal escapes. + case 'x': + if t == "" { + break + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '{' { + // Any number of digits in braces. + // Perl accepts any text at all; it ignores all text + // after the first non-hex digit. We require only hex digits, + // and at least one. + nhex := 0 + r = 0 + for { + if t == "" { + break Switch + } + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + if c == '}' { + break + } + v := unhex(c) + if v < 0 { + break Switch + } + r = r*16 + v + if r > unicode.MaxRune { + break Switch + } + nhex++ + } + if nhex == 0 { + break Switch + } + return r, t, nil + } + + // Easy case: two hex digits. + x := unhex(c) + if c, t, err = nextRune(t); err != nil { + return 0, "", err + } + y := unhex(c) + if x < 0 || y < 0 { + break + } + return x*16 + y, t, nil + + // C escapes. There is no case 'b', to avoid misparsing + // the Perl word-boundary \b as the C backspace \b + // when in POSIX mode. In Perl, /\b/ means word-boundary + // but /[\b]/ means backspace. We don't support that. + // If you want a backspace, embed a literal backspace + // character or use \x08. + case 'a': + return '\a', t, err + case 'f': + return '\f', t, err + case 'n': + return '\n', t, err + case 'r': + return '\r', t, err + case 't': + return '\t', t, err + case 'v': + return '\v', t, err + } + return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} +} + +// parseClassChar parses a character class character at the beginning of s +// and returns it. +func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) { + if s == "" { + return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} + } + + // Allow regular escape sequences even though + // many need not be escaped in this context. + if s[0] == '\\' { + return p.parseEscape(s) + } + + return nextRune(s) +} + +type charGroup struct { + sign int + class []int +} + +// parsePerlClassEscape parses a leading Perl character class escape like \d +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) { + if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { + return + } + g := perlGroup[s[0:2]] + if g.sign == 0 { + return + } + return p.appendGroup(r, g), s[2:] +} + +// parseNamedClass parses a leading POSIX named character class like [:alnum:] +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) { + if len(s) < 2 || s[0] != '[' || s[1] != ':' { + return + } + + i := strings.Index(s[2:], ":]") + if i < 0 { + return + } + i += 2 + name, s := s[0:i+2], s[i+2:] + g := posixGroup[name] + if g.sign == 0 { + return nil, "", &Error{ErrInvalidCharRange, name} + } + return p.appendGroup(r, g), s, nil +} + +func (p *parser) appendGroup(r []int, g charGroup) []int { + if p.flags&FoldCase == 0 { + if g.sign < 0 { + r = appendNegatedClass(r, g.class) + } else { + r = appendClass(r, g.class) + } + } else { + tmp := p.tmpClass[:0] + tmp = appendFoldedClass(tmp, g.class) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if g.sign < 0 { + r = appendNegatedClass(r, tmp) + } else { + r = appendClass(r, tmp) + } + } + return r +} + +// unicodeTable returns the unicode.RangeTable identified by name +// and the table of additional fold-equivalent code points. +func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { + if t := unicode.Categories[name]; t != nil { + return t, unicode.FoldCategory[name] + } + if t := unicode.Scripts[name]; t != nil { + return t, unicode.FoldScript[name] + } + return nil, nil +} + +// parseUnicodeClass parses a leading Unicode character class like \p{Han} +// from the beginning of s. If one is present, it appends the characters to r +// and returns the new slice r and the remainder of the string. +func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) { + if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { + return + } + + // Committed to parse or return error. + sign := +1 + if s[1] == 'P' { + sign = -1 + } + t := s[2:] + c, t, err := nextRune(t) + if err != nil { + return + } + var seq, name string + if c != '{' { + // Single-letter name. + seq = s[:len(s)-len(t)] + name = seq[2:] + } else { + // Name is in braces. + end := strings.IndexRune(s, '}') + if end < 0 { + if err = checkUTF8(s); err != nil { + return + } + return nil, "", &Error{ErrInvalidCharRange, s} + } + seq, t = s[:end+1], s[end+1:] + name = s[3:end] + if err = checkUTF8(name); err != nil { + return + } + } + + // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. + if name != "" && name[0] == '^' { + sign = -sign + name = name[1:] + } + + tab, fold := unicodeTable(name) + if tab == nil { + return nil, "", &Error{ErrInvalidCharRange, seq} + } + + if p.flags&FoldCase == 0 || fold == nil { + if sign > 0 { + r = appendTable(r, tab) + } else { + r = appendNegatedTable(r, tab) + } + } else { + // Merge and clean tab and fold in a temporary buffer. + // This is necessary for the negative case and just tidy + // for the positive case. + tmp := p.tmpClass[:0] + tmp = appendTable(tmp, tab) + tmp = appendTable(tmp, fold) + p.tmpClass = tmp + tmp = cleanClass(&p.tmpClass) + if sign > 0 { + r = appendClass(r, tmp) + } else { + r = appendNegatedClass(r, tmp) + } + } + return r, t, nil +} + +// parseClass parses a character class at the beginning of s +// and pushes it onto the parse stack. +func (p *parser) parseClass(s string) (rest string, err os.Error) { + t := s[1:] // chop [ + re := p.newRegexp(OpCharClass) + re.Flags = p.flags + re.Rune = re.Rune0[:0] + + sign := +1 + if t != "" && t[0] == '^' { + sign = -1 + t = t[1:] + + // If character class does not match \n, add it here, + // so that negation later will do the right thing. + if p.flags&ClassNL == 0 { + re.Rune = append(re.Rune, '\n', '\n') + } + } + + class := re.Rune + first := true // ] and - are okay as first char in class + for t == "" || t[0] != ']' || first { + // POSIX: - is only okay unescaped as first or last in class. + // Perl: - is okay anywhere. + if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { + _, size := utf8.DecodeRuneInString(t[1:]) + return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} + } + first = false + + // Look for POSIX [:alnum:] etc. + if len(t) > 2 && t[0] == '[' && t[1] == ':' { + nclass, nt, err := p.parseNamedClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + } + + // Look for Unicode character group like \p{Han}. + nclass, nt, err := p.parseUnicodeClass(t, class) + if err != nil { + return "", err + } + if nclass != nil { + class, t = nclass, nt + continue + } + + // Look for Perl character class symbols (extension). + if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { + class, t = nclass, nt + continue + } + + // Single character or simple range. + rng := t + var lo, hi int + if lo, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + hi = lo + // [a-] means (a|-) so check for final ]. + if len(t) >= 2 && t[0] == '-' && t[1] != ']' { + t = t[1:] + if hi, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + if hi < lo { + rng = rng[:len(rng)-len(t)] + return "", &Error{Code: ErrInvalidCharRange, Expr: rng} + } + } + if p.flags&FoldCase == 0 { + class = appendRange(class, lo, hi) + } else { + class = appendFoldedRange(class, lo, hi) + } + } + t = t[1:] // chop ] + + // Use &re.Rune instead of &class to avoid allocation. + re.Rune = class + class = cleanClass(&re.Rune) + if sign < 0 { + class = negateClass(class) + } + re.Rune = class + p.push(re) + return t, nil +} + +// cleanClass sorts the ranges (pairs of elements of r), +// merges them, and eliminates duplicates. +func cleanClass(rp *[]int) []int { + + // Sort by lo increasing, hi decreasing to break ties. + sort.Sort(ranges{rp}) + + r := *rp + if len(r) < 2 { + return r + } + + // Merge abutting, overlapping. + w := 2 // write index + for i := 2; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if lo <= r[w-1]+1 { + // merge with previous range + if hi > r[w-1] { + r[w-1] = hi + } + continue + } + // new disjoint range + r[w] = lo + r[w+1] = hi + w += 2 + } + + return r[:w] +} + +// appendRange returns the result of appending the range lo-hi to the class r. +func appendRange(r []int, lo, hi int) []int { + // Expand last range or next to last range if it overlaps or abuts. + // Checking two ranges helps when appending case-folded + // alphabets, so that one range can be expanding A-Z and the + // other expanding a-z. + n := len(r) + for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 + if n >= i { + rlo, rhi := r[n-i], r[n-i+1] + if lo <= rhi+1 && rlo <= hi+1 { + if lo < rlo { + r[n-i] = lo + } + if hi > rhi { + r[n-i+1] = hi + } + return r + } + } + } + + return append(r, lo, hi) +} + +const ( + // minimum and maximum runes involved in folding. + // checked during test. + minFold = 0x0041 + maxFold = 0x1044f +) + +// appendFoldedRange returns the result of appending the range lo-hi +// and its case folding-equivalent runes to the class r. +func appendFoldedRange(r []int, lo, hi int) []int { + // Optimizations. + if lo <= minFold && hi >= maxFold { + // Range is full: folding can't add more. + return appendRange(r, lo, hi) + } + if hi < minFold || lo > maxFold { + // Range is outside folding possibilities. + return appendRange(r, lo, hi) + } + if lo < minFold { + // [lo, minFold-1] needs no folding. + r = appendRange(r, lo, minFold-1) + lo = minFold + } + if hi > maxFold { + // [maxFold+1, hi] needs no folding. + r = appendRange(r, maxFold+1, hi) + hi = maxFold + } + + // Brute force. Depend on appendRange to coalesce ranges on the fly. + for c := lo; c <= hi; c++ { + r = appendRange(r, c, c) + f := unicode.SimpleFold(c) + for f != c { + r = appendRange(r, f, f) + f = unicode.SimpleFold(f) + } + } + return r +} + +// appendClass returns the result of appending the class x to the class r. +// It assume x is clean. +func appendClass(r []int, x []int) []int { + for i := 0; i < len(x); i += 2 { + r = appendRange(r, x[i], x[i+1]) + } + return r +} + +// appendFolded returns the result of appending the case folding of the class x to the class r. +func appendFoldedClass(r []int, x []int) []int { + for i := 0; i < len(x); i += 2 { + r = appendFoldedRange(r, x[i], x[i+1]) + } + return r +} + +// appendNegatedClass returns the result of appending the negation of the class x to the class r. +// It assumes x is clean. +func appendNegatedClass(r []int, x []int) []int { + nextLo := 0 + for i := 0; i < len(x); i += 2 { + lo, hi := x[i], x[i+1] + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// appendTable returns the result of appending x to the class r. +func appendTable(r []int, x *unicode.RangeTable) []int { + for _, xr := range x.R16 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + for _, xr := range x.R32 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + r = appendRange(r, lo, hi) + continue + } + for c := lo; c <= hi; c += stride { + r = appendRange(r, c, c) + } + } + return r +} + +// appendNegatedTable returns the result of appending the negation of x to the class r. +func appendNegatedTable(r []int, x *unicode.RangeTable) []int { + nextLo := 0 // lo end of next class to add + for _, xr := range x.R16 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + for _, xr := range x.R32 { + lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride) + if stride == 1 { + if nextLo <= lo-1 { + r = appendRange(r, nextLo, lo-1) + } + nextLo = hi + 1 + continue + } + for c := lo; c <= hi; c += stride { + if nextLo <= c-1 { + r = appendRange(r, nextLo, c-1) + } + nextLo = c + 1 + } + } + if nextLo <= unicode.MaxRune { + r = appendRange(r, nextLo, unicode.MaxRune) + } + return r +} + +// negateClass overwrites r and returns r's negation. +// It assumes the class r is already clean. +func negateClass(r []int) []int { + nextLo := 0 // lo end of next class to add + w := 0 // write index + for i := 0; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if nextLo <= lo-1 { + r[w] = nextLo + r[w+1] = lo - 1 + w += 2 + } + nextLo = hi + 1 + } + r = r[:w] + if nextLo <= unicode.MaxRune { + // It's possible for the negation to have one more + // range - this one - than the original class, so use append. + r = append(r, nextLo, unicode.MaxRune) + } + return r +} + +// ranges implements sort.Interface on a []rune. +// The choice of receiver type definition is strange +// but avoids an allocation since we already have +// a *[]int. +type ranges struct { + p *[]int +} + +func (ra ranges) Less(i, j int) bool { + p := *ra.p + i *= 2 + j *= 2 + return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] +} + +func (ra ranges) Len() int { + return len(*ra.p) / 2 +} + +func (ra ranges) Swap(i, j int) { + p := *ra.p + i *= 2 + j *= 2 + p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] +} + + +func checkUTF8(s string) os.Error { + for s != "" { + rune, size := utf8.DecodeRuneInString(s) + if rune == utf8.RuneError && size == 1 { + return &Error{Code: ErrInvalidUTF8, Expr: s} + } + s = s[size:] + } + return nil +} + +func nextRune(s string) (c int, t string, err os.Error) { + c, size := utf8.DecodeRuneInString(s) + if c == utf8.RuneError && size == 1 { + return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} + } + return c, s[size:], nil +} + +func isalnum(c int) bool { + return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' +} + +func unhex(c int) int { + if '0' <= c && c <= '9' { + return c - '0' + } + if 'a' <= c && c <= 'f' { + return c - 'a' + 10 + } + if 'A' <= c && c <= 'F' { + return c - 'A' + 10 + } + return -1 +} diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go new file mode 100644 index 000000000..779b9afde --- /dev/null +++ b/src/pkg/exp/regexp/syntax/parse_test.go @@ -0,0 +1,350 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "bytes" + "fmt" + "testing" + "unicode" +) + +var parseTests = []struct { + Regexp string + Dump string +}{ + // Base cases + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + {``, `emp{}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, + + // Posix and Perl extensions + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never + + // Unicode, negatives, and a double negative. + {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, + + // Hex, octal. + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, + + // More interesting regular expressions. + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, + + // Test precedences + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, + + // Test flattening. + {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, + + // Test Perl quoted literals + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, + + // Test Perl \A and \z + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, + + // Test named captures + {`(?P<name>a)`, `cap{name:lit{a}}`}, + + // Case-folded literals + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, + + // Strings + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +// Test Parse -> Dump. +func TestParseDump(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + } + } +} + +// dump prints a string representation of the regexp showing +// the structure explicitly. +func dump(re *Regexp) string { + var b bytes.Buffer + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *bytes.Buffer, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.SimpleFold(r) != r { + b.WriteString("fold") + break + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} + +func mkCharClass(f func(int) bool) string { + re := &Regexp{Op: OpCharClass} + lo := -1 + for i := 0; i <= unicode.MaxRune; i++ { + if f(i) { + if lo < 0 { + lo = i + } + } else { + if lo >= 0 { + re.Rune = append(re.Rune, lo, i-1) + lo = -1 + } + } + } + if lo >= 0 { + re.Rune = append(re.Rune, lo, unicode.MaxRune) + } + return dump(re) +} + +func isUpperFold(rune int) bool { + if unicode.IsUpper(rune) { + return true + } + c := unicode.SimpleFold(rune) + for c != rune { + if unicode.IsUpper(c) { + return true + } + c = unicode.SimpleFold(c) + } + return false +} + +func TestFoldConstants(t *testing.T) { + last := -1 + for i := 0; i <= unicode.MaxRune; i++ { + if unicode.SimpleFold(i) == i { + continue + } + if last == -1 && minFold != i { + t.Errorf("minFold=%#U should be %#U", minFold, i) + } + last = i + } + if maxFold != last { + t.Errorf("maxFold=%#U should be %#U", maxFold, last) + } +} + +func TestAppendRangeCollapse(t *testing.T) { + // AppendRange should collapse each of the new ranges + // into the earlier ones (it looks back two ranges), so that + // the slice never grows very large. + // Note that we are not calling cleanClass. + var r []int + for i := 'A'; i <= 'Z'; i++ { + r = appendRange(r, i, i) + r = appendRange(r, i+'a'-'A', i+'a'-'A') + } + if string(r) != "AZaz" { + t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) + } +} diff --git a/src/pkg/exp/regexp/syntax/perl_groups.go b/src/pkg/exp/regexp/syntax/perl_groups.go new file mode 100644 index 000000000..05b392c40 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/perl_groups.go @@ -0,0 +1,130 @@ +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.go + +package syntax + +var code1 = []int{ /* \d */ + 0x30, 0x39, +} + +var code2 = []int{ /* \s */ + 0x9, 0xa, + 0xc, 0xd, + 0x20, 0x20, +} + +var code3 = []int{ /* \w */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var perlGroup = map[string]charGroup{ + `\d`: {+1, code1}, + `\D`: {-1, code1}, + `\s`: {+1, code2}, + `\S`: {-1, code2}, + `\w`: {+1, code3}, + `\W`: {-1, code3}, +} +var code4 = []int{ /* [:alnum:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code5 = []int{ /* [:alpha:] */ + 0x41, 0x5a, + 0x61, 0x7a, +} + +var code6 = []int{ /* [:ascii:] */ + 0x0, 0x7f, +} + +var code7 = []int{ /* [:blank:] */ + 0x9, 0x9, + 0x20, 0x20, +} + +var code8 = []int{ /* [:cntrl:] */ + 0x0, 0x1f, + 0x7f, 0x7f, +} + +var code9 = []int{ /* [:digit:] */ + 0x30, 0x39, +} + +var code10 = []int{ /* [:graph:] */ + 0x21, 0x7e, +} + +var code11 = []int{ /* [:lower:] */ + 0x61, 0x7a, +} + +var code12 = []int{ /* [:print:] */ + 0x20, 0x7e, +} + +var code13 = []int{ /* [:punct:] */ + 0x21, 0x2f, + 0x3a, 0x40, + 0x5b, 0x60, + 0x7b, 0x7e, +} + +var code14 = []int{ /* [:space:] */ + 0x9, 0xd, + 0x20, 0x20, +} + +var code15 = []int{ /* [:upper:] */ + 0x41, 0x5a, +} + +var code16 = []int{ /* [:word:] */ + 0x30, 0x39, + 0x41, 0x5a, + 0x5f, 0x5f, + 0x61, 0x7a, +} + +var code17 = []int{ /* [:xdigit:] */ + 0x30, 0x39, + 0x41, 0x46, + 0x61, 0x66, +} + +var posixGroup = map[string]charGroup{ + `[:alnum:]`: {+1, code4}, + `[:^alnum:]`: {-1, code4}, + `[:alpha:]`: {+1, code5}, + `[:^alpha:]`: {-1, code5}, + `[:ascii:]`: {+1, code6}, + `[:^ascii:]`: {-1, code6}, + `[:blank:]`: {+1, code7}, + `[:^blank:]`: {-1, code7}, + `[:cntrl:]`: {+1, code8}, + `[:^cntrl:]`: {-1, code8}, + `[:digit:]`: {+1, code9}, + `[:^digit:]`: {-1, code9}, + `[:graph:]`: {+1, code10}, + `[:^graph:]`: {-1, code10}, + `[:lower:]`: {+1, code11}, + `[:^lower:]`: {-1, code11}, + `[:print:]`: {+1, code12}, + `[:^print:]`: {-1, code12}, + `[:punct:]`: {+1, code13}, + `[:^punct:]`: {-1, code13}, + `[:space:]`: {+1, code14}, + `[:^space:]`: {-1, code14}, + `[:upper:]`: {+1, code15}, + `[:^upper:]`: {-1, code15}, + `[:word:]`: {+1, code16}, + `[:^word:]`: {-1, code16}, + `[:xdigit:]`: {+1, code17}, + `[:^xdigit:]`: {-1, code17}, +} diff --git a/src/pkg/exp/regexp/syntax/prog.go b/src/pkg/exp/regexp/syntax/prog.go new file mode 100644 index 000000000..6eeb3da0c --- /dev/null +++ b/src/pkg/exp/regexp/syntax/prog.go @@ -0,0 +1,182 @@ +package syntax + +import ( + "bytes" + "strconv" +) + +// Compiled program. +// May not belong in this package, but convenient for now. + +// A Prog is a compiled regular expression program. +type Prog struct { + Inst []Inst + Start int // index of start instruction +} + +// An InstOp is an instruction opcode. +type InstOp uint8 + +const ( + InstAlt InstOp = iota + InstAltMatch + InstCapture + InstEmptyWidth + InstMatch + InstFail + InstNop + InstRune +) + +// An EmptyOp specifies a kind or mixture of zero-width assertions. +type EmptyOp uint8 + +const ( + EmptyBeginLine EmptyOp = 1 << iota + EmptyEndLine + EmptyBeginText + EmptyEndText + EmptyWordBoundary + EmptyNoWordBoundary +) + +// An Inst is a single instruction in a regular expression program. +type Inst struct { + Op InstOp + Out uint32 // all but InstMatch, InstFail + Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth + Rune []int +} + +func (p *Prog) String() string { + var b bytes.Buffer + dumpProg(&b, p) + return b.String() +} + +// MatchRune returns true if the instruction matches (and consumes) r. +// It should only be called when i.Op == InstRune. +func (i *Inst) MatchRune(r int) bool { + rune := i.Rune + + // Special case: single-rune slice is from literal string, not char class. + // TODO: Case folding. + if len(rune) == 1 { + return r == rune[0] + } + + // Peek at the first few pairs. + // Should handle ASCII well. + for j := 0; j < len(rune) && j <= 8; j += 2 { + if r < rune[j] { + return false + } + if r <= rune[j+1] { + return true + } + } + + // Otherwise binary search. + lo := 0 + hi := len(rune) / 2 + for lo < hi { + m := lo + (hi-lo)/2 + if c := rune[2*m]; c <= r { + if r <= rune[2*m+1] { + return true + } + lo = m + 1 + } else { + hi = m + } + } + return false +} + +// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. +// Since we act on runes, it would be easy to support Unicode here. +func wordRune(rune int) bool { + return rune == '_' || + ('A' <= rune && rune <= 'Z') || + ('a' <= rune && rune <= 'z') || + ('0' <= rune && rune <= '9') +} + +// MatchEmptyWidth returns true if the instruction matches +// an empty string between the runes before and after. +// It should only be called when i.Op == InstEmptyWidth. +func (i *Inst) MatchEmptyWidth(before int, after int) bool { + switch EmptyOp(i.Arg) { + case EmptyBeginLine: + return before == '\n' || before == -1 + case EmptyEndLine: + return after == '\n' || after == -1 + case EmptyBeginText: + return before == -1 + case EmptyEndText: + return after == -1 + case EmptyWordBoundary: + return wordRune(before) != wordRune(after) + case EmptyNoWordBoundary: + return wordRune(before) == wordRune(after) + } + panic("unknown empty width arg") +} + + +func (i *Inst) String() string { + var b bytes.Buffer + dumpInst(&b, i) + return b.String() +} + +func bw(b *bytes.Buffer, args ...string) { + for _, s := range args { + b.WriteString(s) + } +} + +func dumpProg(b *bytes.Buffer, p *Prog) { + for j := range p.Inst { + i := &p.Inst[j] + pc := strconv.Itoa(j) + if len(pc) < 3 { + b.WriteString(" "[len(pc):]) + } + if j == p.Start { + pc += "*" + } + bw(b, pc, "\t") + dumpInst(b, i) + bw(b, "\n") + } +} + +func u32(i uint32) string { + return strconv.Uitoa64(uint64(i)) +} + +func dumpInst(b *bytes.Buffer, i *Inst) { + switch i.Op { + case InstAlt: + bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstAltMatch: + bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) + case InstCapture: + bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) + case InstEmptyWidth: + bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) + case InstMatch: + bw(b, "match") + case InstFail: + bw(b, "fail") + case InstNop: + bw(b, "nop -> ", u32(i.Out)) + case InstRune: + if i.Rune == nil { + // shouldn't happen + bw(b, "rune <nil>") + } + bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) + } +} diff --git a/src/pkg/exp/regexp/syntax/prog_test.go b/src/pkg/exp/regexp/syntax/prog_test.go new file mode 100644 index 000000000..7be4281c2 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/prog_test.go @@ -0,0 +1,91 @@ +package syntax + +import ( + "testing" +) + +var compileTests = []struct { + Regexp string + Prog string +}{ + {"a", ` 0 fail + 1* rune "a" -> 2 + 2 match +`}, + {"[A-M][n-z]", ` 0 fail + 1* rune "AM" -> 2 + 2 rune "nz" -> 3 + 3 match +`}, + {"", ` 0 fail + 1* nop -> 2 + 2 match +`}, + {"a?", ` 0 fail + 1 rune "a" -> 3 + 2* alt -> 1, 3 + 3 match +`}, + {"a??", ` 0 fail + 1 rune "a" -> 3 + 2* alt -> 3, 1 + 3 match +`}, + {"a+", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 1, 3 + 3 match +`}, + {"a+?", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 3, 1 + 3 match +`}, + {"a*", ` 0 fail + 1 rune "a" -> 2 + 2* alt -> 1, 3 + 3 match +`}, + {"a*?", ` 0 fail + 1 rune "a" -> 2 + 2* alt -> 3, 1 + 3 match +`}, + {"a+b+", ` 0 fail + 1* rune "a" -> 2 + 2 alt -> 1, 3 + 3 rune "b" -> 4 + 4 alt -> 3, 5 + 5 match +`}, + {"(a+)(b+)", ` 0 fail + 1* cap 2 -> 2 + 2 rune "a" -> 3 + 3 alt -> 2, 4 + 4 cap 3 -> 5 + 5 cap 4 -> 6 + 6 rune "b" -> 7 + 7 alt -> 6, 8 + 8 cap 5 -> 9 + 9 match +`}, + {"a+|b+", ` 0 fail + 1 rune "a" -> 2 + 2 alt -> 1, 6 + 3 rune "b" -> 4 + 4 alt -> 3, 6 + 5* alt -> 1, 3 + 6 match +`}, +} + +func TestCompile(t *testing.T) { + for _, tt := range compileTests { + re, _ := Parse(tt.Regexp, Perl) + p, _ := Compile(re) + s := p.String() + if s != tt.Prog { + t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) + } + } +} diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go new file mode 100644 index 000000000..00a4addef --- /dev/null +++ b/src/pkg/exp/regexp/syntax/regexp.go @@ -0,0 +1,284 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syntax parses regular expressions into syntax trees. +// WORK IN PROGRESS. +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( + "bytes" + "strconv" + "strings" + "unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { + Op Op // operator + Flags Flags + Sub []*Regexp // subexpressions, if any + Sub0 [1]*Regexp // storage for short Sub + Rune []int // matched runes, for OpLiteral, OpCharClass + Rune0 [2]int // storage for short Rune + Min, Max int // min, max for OpRepeat + Cap int // capturing index, for OpCapture + Name string // capturing name, for OpCapture +} + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. +// Character class operators are listed simplest to most complex +// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). + +const ( + OpNoMatch Op = 1 + iota // matches no strings + OpEmptyMatch // matches empty string + OpLiteral // matches Runes sequence + OpCharClass // matches Runes interpreted as range pair list + OpAnyCharNotNL // matches any character + OpAnyChar // matches any character + OpBeginLine // matches empty string at beginning of line + OpEndLine // matches empty string at end of line + OpBeginText // matches empty string at beginning of text + OpEndText // matches empty string at end of text + OpWordBoundary // matches word boundary `\b` + OpNoWordBoundary // matches word non-boundary `\B` + OpCapture // capturing subexpression with index Cap, optional name Name + OpStar // matches Sub[0] zero or more times + OpPlus // matches Sub[0] one or more times + OpQuest // matches Sub[0] zero or one times + OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) + OpConcat // matches concatenation of Subs + OpAlternate // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// Equal returns true if x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { + if x == nil || y == nil { + return x == y + } + if x.Op != y.Op { + return false + } + switch x.Op { + case OpEndText: + // The parse flags remember whether this is \z or \Z. + if x.Flags&WasDollar != y.Flags&WasDollar { + return false + } + + case OpLiteral, OpCharClass: + if len(x.Rune) != len(y.Rune) { + return false + } + for i, r := range x.Rune { + if r != y.Rune[i] { + return false + } + } + + case OpAlternate, OpConcat: + if len(x.Sub) != len(y.Sub) { + return false + } + for i, sub := range x.Sub { + if !sub.Equal(y.Sub[i]) { + return false + } + } + + case OpStar, OpPlus, OpQuest: + if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpRepeat: + if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpCapture: + if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + } + return true +} + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *bytes.Buffer, re *Regexp) { + switch re.Op { + default: + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") + case OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case OpEmptyMatch: + b.WriteString(`(?:)`) + case OpLiteral: + if re.Flags&FoldCase != 0 { + b.WriteString(`(?i:`) + } + for _, r := range re.Rune { + escape(b, r, false) + } + if re.Flags&FoldCase != 0 { + b.WriteString(`)`) + } + case OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. + b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case OpAnyCharNotNL: + b.WriteString(`[^\n]`) + case OpAnyChar: + b.WriteRune('.') + case OpBeginLine: + b.WriteRune('^') + case OpEndLine: + b.WriteRune('$') + case OpBeginText: + b.WriteString(`\A`) + case OpEndText: + b.WriteString(`\z`) + case OpWordBoundary: + b.WriteString(`\b`) + case OpNoWordBoundary: + b.WriteString(`\B`) + case OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + if re.Sub[0].Op != OpEmptyMatch { + writeRegexp(b, re.Sub[0]) + } + b.WriteRune(')') + case OpStar, OpPlus, OpQuest, OpRepeat: + if sub := re.Sub[0]; sub.Op > OpCapture { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + switch re.Op { + case OpStar: + b.WriteRune('*') + case OpPlus: + b.WriteRune('+') + case OpQuest: + b.WriteRune('?') + case OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + case OpConcat: + for _, sub := range re.Sub { + if sub.Op == OpAlternate { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + } + case OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub) + } + } +} + +func (re *Regexp) String() string { + var b bytes.Buffer + writeRegexp(&b, re) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *bytes.Buffer, r int, force bool) { + if unicode.IsPrint(r) { + if strings.IndexRune(meta, r) >= 0 || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.Itob(r, 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\x{`) + b.WriteString(strconv.Itob(r, 16)) + b.WriteString(`}`) + } +} diff --git a/src/pkg/exp/regexp/syntax/simplify.go b/src/pkg/exp/regexp/syntax/simplify.go new file mode 100644 index 000000000..72390417b --- /dev/null +++ b/src/pkg/exp/regexp/syntax/simplify.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Simplify returns a regexp equivalent to re but without counted repetitions +// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. +// The resulting regexp will execute correctly but its string representation +// will not produce the same parse tree, because capturing parentheses +// may have been duplicated or removed. For example, the simplified form +// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. +// The returned regexp may share structure with or be the original. +func (re *Regexp) Simplify() *Regexp { + if re == nil { + return nil + } + switch re.Op { + case OpCapture, OpConcat, OpAlternate: + // Simplify children, building new Regexp if children change. + nre := re + for i, sub := range re.Sub { + nsub := sub.Simplify() + if nre == re && nsub != sub { + // Start a copy. + nre = new(Regexp) + *nre = *re + nre.Rune = nil + nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) + } + if nre != re { + nre.Sub = append(nre.Sub, nsub) + } + } + return nre + + case OpStar, OpPlus, OpQuest: + sub := re.Sub[0].Simplify() + return simplify1(re.Op, re.Flags, sub, re) + + case OpRepeat: + // Special special case: x{0} matches the empty string + // and doesn't even need to consider x. + if re.Min == 0 && re.Max == 0 { + return &Regexp{Op: OpEmptyMatch} + } + + // The fun begins. + sub := re.Sub[0].Simplify() + + // x{n,} means at least n matches of x. + if re.Max == -1 { + // Special case: x{0,} is x*. + if re.Min == 0 { + return simplify1(OpStar, re.Flags, sub, nil) + } + + // Special case: x{1,} is x+. + if re.Min == 1 { + return simplify1(OpPlus, re.Flags, sub, nil) + } + + // General case: x{4,} is xxxx+. + nre := &Regexp{Op: OpConcat} + nre.Sub = nre.Sub0[:0] + for i := 0; i < re.Min-1; i++ { + nre.Sub = append(nre.Sub, sub) + } + nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) + return nre + } + + // Special case x{0} handled above. + + // Special case: x{1} is just x. + if re.Min == 1 && re.Max == 1 { + return sub + } + + // General case: x{n,m} means n copies of x and m copies of x? + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. + var prefix *Regexp + if re.Min > 0 { + prefix = &Regexp{Op: OpConcat} + prefix.Sub = prefix.Sub0[:0] + for i := 0; i < re.Min; i++ { + prefix.Sub = append(prefix.Sub, sub) + } + } + + // Build and attach suffix: (x(x(x)?)?)? + if re.Max > re.Min { + suffix := simplify1(OpQuest, re.Flags, sub, nil) + for i := re.Min + 1; i < re.Max; i++ { + nre2 := &Regexp{Op: OpConcat} + nre2.Sub = append(nre2.Sub0[:0], sub, suffix) + suffix = simplify1(OpQuest, re.Flags, nre2, nil) + } + if prefix == nil { + return suffix + } + prefix.Sub = append(prefix.Sub, suffix) + } + if prefix != nil { + return prefix + } + + // Some degenerate case like min > max or min < max < 0. + // Handle as impossible match. + return &Regexp{Op: OpNoMatch} + } + + return re +} + +// simplify1 implements Simplify for the unary OpStar, +// OpPlus, and OpQuest operators. It returns the simple regexp +// equivalent to +// +// Regexp{Op: op, Flags: flags, Sub: {sub}} +// +// under the assumption that sub is already simple, and +// without first allocating that structure. If the regexp +// to be returned turns out to be equivalent to re, simplify1 +// returns re instead. +// +// simplify1 is factored out of Simplify because the implementation +// for other operators generates these unary expressions. +// Letting them call simplify1 makes sure the expressions they +// generate are simple. +func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if sub.Op == OpEmptyMatch { + return sub + } + // The operators are idempotent if the flags match. + if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { + return sub + } + if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { + return re + } + + re = &Regexp{Op: op, Flags: flags} + re.Sub = append(re.Sub0[:0], sub) + return re +} diff --git a/src/pkg/exp/regexp/syntax/simplify_test.go b/src/pkg/exp/regexp/syntax/simplify_test.go new file mode 100644 index 000000000..c8cec2183 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/simplify_test.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "testing" + +var simplifyTests = []struct { + Regexp string + Simple string +}{ + // Already-simple constructs + {`a`, `a`}, + {`ab`, `ab`}, + {`a|b`, `[a-b]`}, + {`ab|cd`, `ab|cd`}, + {`(ab)*`, `(ab)*`}, + {`(ab)+`, `(ab)+`}, + {`(ab)?`, `(ab)?`}, + {`.`, `.`}, + {`^`, `^`}, + {`$`, `$`}, + {`[ac]`, `[ac]`}, + {`[^ac]`, `[^ac]`}, + + // Posix character classes + {`[[:alnum:]]`, `[0-9A-Za-z]`}, + {`[[:alpha:]]`, `[A-Za-z]`}, + {`[[:blank:]]`, `[\t ]`}, + {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, + {`[[:digit:]]`, `[0-9]`}, + {`[[:graph:]]`, `[!-~]`}, + {`[[:lower:]]`, `[a-z]`}, + {`[[:print:]]`, `[ -~]`}, + {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, + {`[[:space:]]`, `[\t-\r ]`}, + {`[[:upper:]]`, `[A-Z]`}, + {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, + + // Perl character classes + {`\d`, `[0-9]`}, + {`\s`, `[\t-\n\f-\r ]`}, + {`\w`, `[0-9A-Z_a-z]`}, + {`\D`, `[^0-9]`}, + {`\S`, `[^\t-\n\f-\r ]`}, + {`\W`, `[^0-9A-Z_a-z]`}, + {`[\d]`, `[0-9]`}, + {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\w]`, `[0-9A-Z_a-z]`}, + {`[\D]`, `[^0-9]`}, + {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\W]`, `[^0-9A-Z_a-z]`}, + + // Posix repetitions + {`a{1}`, `a`}, + {`a{2}`, `aa`}, + {`a{5}`, `aaaaa`}, + {`a{0,1}`, `a?`}, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? + {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? + {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,2}`, `(?:aa?)?`}, // (aa?)? + {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? + {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,}`, `a*`}, + {`a{1,}`, `a+`}, + {`a{2,}`, `aa+`}, + {`a{5,}`, `aaaaa+`}, + + // Test that operators simplify their arguments. + {`(?:a{1,}){1,}`, `a+`}, + {`(a{1,}b{1,})`, `(a+b+)`}, + {`a{1,}|b{1,}`, `a+|b+`}, + {`(?:a{1,})*`, `(?:a+)*`}, + {`(?:a{1,})+`, `a+`}, + {`(?:a{1,})?`, `(?:a+)?`}, + {``, `(?:)`}, + {`a{0}`, `(?:)`}, + + // Character class simplification + {`[ab]`, `[a-b]`}, + {`[a-za-za-z]`, `[a-z]`}, + {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, + {`[ABCDEFGH]`, `[A-H]`}, + {`[AB-CD-EF-GH]`, `[A-H]`}, + {`[W-ZP-XE-R]`, `[E-Z]`}, + {`[a-ee-gg-m]`, `[a-m]`}, + {`[a-ea-ha-m]`, `[a-m]`}, + {`[a-ma-ha-e]`, `[a-m]`}, + {`[a-zA-Z0-9 -~]`, `[ -~]`}, + + // Empty character classes + {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, + + // Full character classes + {`[[:cntrl:][:^cntrl:]]`, `.`}, + + // Unicode case folding. + {`(?i)A`, `(?i:A)`}, + {`(?i)a`, `(?i:a)`}, + {`(?i)[A]`, `(?i:A)`}, + {`(?i)[a]`, `(?i:A)`}, + {`(?i)K`, `(?i:K)`}, + {`(?i)k`, `(?i:k)`}, + {`(?i)\x{212a}`, "(?i:\u212A)"}, + {`(?i)[K]`, "[Kk\u212A]"}, + {`(?i)[k]`, "[Kk\u212A]"}, + {`(?i)[\x{212a}]`, "[Kk\u212A]"}, + {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, + {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, + {`(?i)[\x00-\x{10FFFF}]`, `.`}, + + // Empty string as a regular expression. + // The empty string must be preserved inside parens in order + // to make submatches work right, so these tests are less + // interesting than they might otherwise be. String inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + {`(a|b|)`, `([a-b]|(?:))`}, + {`(|)`, `()`}, + {`a()`, `a()`}, + {`(()|())`, `(()|())`}, + {`(a|)`, `(a|(?:))`}, + {`ab()cd()`, `ab()cd()`}, + {`()`, `()`}, + {`()*`, `()*`}, + {`()+`, `()+`}, + {`()?`, `()?`}, + {`(){0}`, `(?:)`}, + {`(){1}`, `()`}, + {`(){1,}`, `()+`}, + {`(){0,2}`, `(?:()()?)?`}, +} + +func TestSimplify(t *testing.T) { + for _, tt := range simplifyTests { + re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) + if err != nil { + t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) + continue + } + s := re.Simplify().String() + if s != tt.Simple { + t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) + } + } +} diff --git a/src/pkg/exp/template/Makefile b/src/pkg/exp/template/Makefile new file mode 100644 index 000000000..8550b0d52 --- /dev/null +++ b/src/pkg/exp/template/Makefile @@ -0,0 +1,15 @@ +# Copyright 2011 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../../../Make.inc + +TARG=exp/template +GOFILES=\ + exec.go\ + funcs.go\ + lex.go\ + parse.go\ + set.go\ + +include ../../../Make.pkg diff --git a/src/pkg/exp/template/exec.go b/src/pkg/exp/template/exec.go new file mode 100644 index 000000000..fb0a9e621 --- /dev/null +++ b/src/pkg/exp/template/exec.go @@ -0,0 +1,508 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "fmt" + "io" + "os" + "reflect" + "strings" + "unicode" + "utf8" +) + +// state represents the state of an execution. It's not part of the +// template so that multiple executions of the same template +// can execute in parallel. +type state struct { + tmpl *Template + wr io.Writer + set *Set + line int // line number for errors +} + +// errorf formats the error and terminates processing. +func (s *state) errorf(format string, args ...interface{}) { + format = fmt.Sprintf("template: %s:%d: %s", s.tmpl.name, s.line, format) + panic(fmt.Errorf(format, args...)) +} + +// error terminates processing. +func (s *state) error(err os.Error) { + s.errorf("%s", err) +} + +// Execute applies a parsed template to the specified data object, +// writing the output to wr. +func (t *Template) Execute(wr io.Writer, data interface{}) os.Error { + return t.ExecuteInSet(wr, data, nil) +} + +// ExecuteInSet applies a parsed template to the specified data object, +// writing the output to wr. Nested template invocations will be resolved +// from the specified set. +func (t *Template) ExecuteInSet(wr io.Writer, data interface{}, set *Set) (err os.Error) { + defer t.recover(&err) + state := &state{ + tmpl: t, + wr: wr, + set: set, + line: 1, + } + if t.root == nil { + state.errorf("must be parsed before execution") + } + state.walk(reflect.ValueOf(data), t.root) + return +} + +// Walk functions step through the major pieces of the template structure, +// generating output as they go. +func (s *state) walk(data reflect.Value, n node) { + switch n := n.(type) { + case *actionNode: + s.line = n.line + s.printValue(n, s.evalPipeline(data, n.pipeline)) + case *listNode: + for _, node := range n.nodes { + s.walk(data, node) + } + case *ifNode: + s.line = n.line + s.walkIfOrWith(nodeIf, data, n.pipeline, n.list, n.elseList) + case *rangeNode: + s.line = n.line + s.walkRange(data, n) + case *textNode: + if _, err := s.wr.Write(n.text); err != nil { + s.error(err) + } + case *templateNode: + s.line = n.line + s.walkTemplate(data, n) + case *withNode: + s.line = n.line + s.walkIfOrWith(nodeWith, data, n.pipeline, n.list, n.elseList) + default: + s.errorf("unknown node: %s", n) + } +} + +// walkIfOrWith walks an 'if' or 'with' node. The two control structures +// are identical in behavior except that 'with' sets dot. +func (s *state) walkIfOrWith(typ nodeType, data reflect.Value, pipe []*commandNode, list, elseList *listNode) { + val := s.evalPipeline(data, pipe) + truth, ok := isTrue(val) + if !ok { + s.errorf("if/with can't use value of type %T", val.Interface()) + } + if truth { + if typ == nodeWith { + data = val + } + s.walk(data, list) + } else if elseList != nil { + s.walk(data, elseList) + } +} + +// isTrue returns whether the value is 'true', in the sense of not the zero of its type, +// and whether the value has a meaningful truth value. +func isTrue(val reflect.Value) (truth, ok bool) { + switch val.Kind() { + case reflect.Array, reflect.Map, reflect.Slice, reflect.String: + truth = val.Len() > 0 + case reflect.Bool: + truth = val.Bool() + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + truth = val.Int() != 0 + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + truth = val.Uint() != 0 + case reflect.Float32, reflect.Float64: + truth = val.Float() != 0 + case reflect.Complex64, reflect.Complex128: + truth = val.Complex() != 0 + case reflect.Chan, reflect.Func, reflect.Ptr: + truth = !val.IsNil() + default: + return + } + return truth, true +} + +func (s *state) walkRange(data reflect.Value, r *rangeNode) { + val := s.evalPipeline(data, r.pipeline) + switch val.Kind() { + case reflect.Array, reflect.Slice: + if val.Len() == 0 { + break + } + for i := 0; i < val.Len(); i++ { + s.walk(val.Index(i), r.list) + } + return + case reflect.Map: + if val.Len() == 0 { + break + } + for _, key := range val.MapKeys() { + s.walk(val.MapIndex(key), r.list) + } + return + default: + s.errorf("range can't iterate over value of type %T", val.Interface()) + } + if r.elseList != nil { + s.walk(data, r.elseList) + } +} + +func (s *state) walkTemplate(data reflect.Value, t *templateNode) { + name := s.evalArg(data, reflect.TypeOf("string"), t.name).String() + if s.set == nil { + s.errorf("no set defined in which to invoke template named %q", name) + } + tmpl := s.set.tmpl[name] + if tmpl == nil { + s.errorf("template %q not in set", name) + } + data = s.evalPipeline(data, t.pipeline) + newState := *s + newState.tmpl = tmpl + newState.walk(data, tmpl.root) +} + +// Eval functions evaluate pipelines, commands, and their elements and extract +// values from the data structure by examining fields, calling methods, and so on. +// The printing of those values happens only through walk functions. + +func (s *state) evalPipeline(data reflect.Value, pipe []*commandNode) reflect.Value { + value := reflect.Value{} + for _, cmd := range pipe { + value = s.evalCommand(data, cmd, value) // previous value is this one's final arg. + // If the object has type interface{}, dig down one level to the thing inside. + if value.Kind() == reflect.Interface && value.Type().NumMethod() == 0 { + value = reflect.ValueOf(value.Interface()) // lovely! + } + } + return value +} + +func (s *state) evalCommand(data reflect.Value, cmd *commandNode, final reflect.Value) reflect.Value { + firstWord := cmd.args[0] + switch n := firstWord.(type) { + case *fieldNode: + return s.evalFieldNode(data, n, cmd.args, final) + case *identifierNode: + return s.evalFieldOrCall(data, n.ident, cmd.args, final) + } + if len(cmd.args) > 1 || final.IsValid() { + s.errorf("can't give argument to non-function %s", cmd.args[0]) + } + switch word := cmd.args[0].(type) { + case *dotNode: + return data + case *boolNode: + return reflect.ValueOf(word.true) + case *numberNode: + // These are ideal constants but we don't know the type + // and we have no context. (If it was a method argument, + // we'd know what we need.) The syntax guides us to some extent. + switch { + case word.isComplex: + return reflect.ValueOf(word.complex128) // incontrovertible. + case word.isFloat && strings.IndexAny(word.text, ".eE") >= 0: + return reflect.ValueOf(word.float64) + case word.isInt: + return reflect.ValueOf(word.int64) + case word.isUint: + return reflect.ValueOf(word.uint64) + } + case *stringNode: + return reflect.ValueOf(word.text) + } + s.errorf("can't handle command %q", firstWord) + panic("not reached") +} + +func (s *state) evalFieldNode(data reflect.Value, field *fieldNode, args []node, final reflect.Value) reflect.Value { + // Up to the last entry, it must be a field. + n := len(field.ident) + for i := 0; i < n-1; i++ { + data = s.evalField(data, field.ident[i]) + } + // Now it can be a field or method and if a method, gets arguments. + return s.evalFieldOrCall(data, field.ident[n-1], args, final) +} + +// Is this an exported - upper case - name? +func isExported(name string) bool { + rune, _ := utf8.DecodeRuneInString(name) + return unicode.IsUpper(rune) +} + +func (s *state) evalField(data reflect.Value, fieldName string) reflect.Value { + var isNil bool + if data, isNil = indirect(data); isNil { + s.errorf("%s is nil pointer", fieldName) + } + switch data.Kind() { + case reflect.Struct: + // Is it a field? + field := data.FieldByName(fieldName) + // TODO: look higher up the tree if we can't find it here. Also unexported fields + // might succeed higher up, as map keys. + if field.IsValid() && isExported(fieldName) { // valid and exported + return field + } + s.errorf("%s has no exported field %q", data.Type(), fieldName) + default: + s.errorf("can't evaluate field %s of type %s", fieldName, data.Type()) + } + panic("not reached") +} + +func (s *state) evalFieldOrCall(data reflect.Value, fieldName string, args []node, final reflect.Value) reflect.Value { + // Is it a function? + if function, ok := findFunction(fieldName, s.tmpl, s.set); ok { + return s.evalCall(data, function, fieldName, false, args, final) + } + ptr := data + for data.Kind() == reflect.Ptr && !data.IsNil() { + ptr, data = data, reflect.Indirect(data) + } + // Is it a method? We use the pointer because it has value methods too. + if method, ok := methodByName(ptr.Type(), fieldName); ok { + return s.evalCall(ptr, method.Func, fieldName, true, args, final) + } + if len(args) > 1 || final.IsValid() { + s.errorf("%s is not a method but has arguments", fieldName) + } + switch data.Kind() { + case reflect.Struct: + return s.evalField(data, fieldName) + default: + s.errorf("can't handle evaluation of field %s of type %s", fieldName, data.Type()) + } + panic("not reached") +} + +// TODO: delete when reflect's own MethodByName is released. +func methodByName(typ reflect.Type, name string) (reflect.Method, bool) { + for i := 0; i < typ.NumMethod(); i++ { + if typ.Method(i).Name == name { + return typ.Method(i), true + } + } + return reflect.Method{}, false +} + +var ( + osErrorType = reflect.TypeOf(new(os.Error)).Elem() +) + +func (s *state) evalCall(v, fun reflect.Value, name string, isMethod bool, args []node, final reflect.Value) reflect.Value { + typ := fun.Type() + if !isMethod && len(args) > 0 { // Args will be nil if it's a niladic call in an argument list + args = args[1:] // first arg is name of function; not used in call. + } + numIn := len(args) + if final.IsValid() { + numIn++ + } + numFixed := len(args) + if typ.IsVariadic() { + numFixed = typ.NumIn() - 1 // last arg is the variadic one. + if numIn < numFixed { + s.errorf("wrong number of args for %s: want at least %d got %d", name, typ.NumIn()-1, len(args)) + } + } else if numIn < typ.NumIn()-1 || !typ.IsVariadic() && numIn != typ.NumIn() { + s.errorf("wrong number of args for %s: want %d got %d", name, typ.NumIn(), len(args)) + } + if !goodFunc(typ) { + s.errorf("can't handle multiple results from method/function %q", name) + } + // Build the arg list. + argv := make([]reflect.Value, numIn) + // First arg is the receiver. + i := 0 + if isMethod { + argv[0] = v + i++ + } + // Others must be evaluated. Fixed args first. + for ; i < numFixed; i++ { + argv[i] = s.evalArg(v, typ.In(i), args[i]) + } + // And now the ... args. + if typ.IsVariadic() { + argType := typ.In(typ.NumIn() - 1).Elem() // Argument is a slice. + for ; i < len(args); i++ { + argv[i] = s.evalArg(v, argType, args[i]) + } + } + // Add final value if necessary. + if final.IsValid() { + argv[len(args)] = final + } + result := fun.Call(argv) + // If we have an os.Error that is not nil, stop execution and return that error to the caller. + if len(result) == 2 && !result[1].IsNil() { + s.error(result[1].Interface().(os.Error)) + } + return result[0] +} + +func (s *state) evalArg(data reflect.Value, typ reflect.Type, n node) reflect.Value { + if field, ok := n.(*fieldNode); ok { + value := s.evalFieldNode(data, field, []node{n}, reflect.Value{}) + if !value.Type().AssignableTo(typ) { + s.errorf("wrong type for value; expected %s; got %s", typ, value.Type()) + } + return value + } + switch typ.Kind() { + case reflect.Bool: + return s.evalBool(typ, n) + case reflect.String: + return s.evalString(typ, n) + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return s.evalInteger(typ, n) + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + return s.evalUnsignedInteger(typ, n) + case reflect.Float32, reflect.Float64: + return s.evalFloat(typ, n) + case reflect.Complex64, reflect.Complex128: + return s.evalComplex(typ, n) + case reflect.Interface: + if typ.NumMethod() == 0 { + return s.evalEmptyInterface(data, typ, n) + } + } + s.errorf("can't handle %s for arg of type %s", n, typ) + panic("not reached") +} + +func (s *state) evalBool(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*boolNode); ok { + value := reflect.New(typ).Elem() + value.SetBool(n.true) + return value + } + s.errorf("expected bool; found %s", n) + panic("not reached") +} + +func (s *state) evalString(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*stringNode); ok { + value := reflect.New(typ).Elem() + value.SetString(n.text) + return value + } + s.errorf("expected string; found %s", n) + panic("not reached") +} + +func (s *state) evalInteger(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*numberNode); ok && n.isInt { + value := reflect.New(typ).Elem() + value.SetInt(n.int64) + return value + } + s.errorf("expected integer; found %s", n) + panic("not reached") +} + +func (s *state) evalUnsignedInteger(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*numberNode); ok && n.isUint { + value := reflect.New(typ).Elem() + value.SetUint(n.uint64) + return value + } + s.errorf("expected unsigned integer; found %s", n) + panic("not reached") +} + +func (s *state) evalFloat(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*numberNode); ok && n.isFloat { + value := reflect.New(typ).Elem() + value.SetFloat(n.float64) + return value + } + s.errorf("expected float; found %s", n) + panic("not reached") +} + +func (s *state) evalComplex(typ reflect.Type, n node) reflect.Value { + if n, ok := n.(*numberNode); ok && n.isComplex { + value := reflect.New(typ).Elem() + value.SetComplex(n.complex128) + return value + } + s.errorf("expected complex; found %s", n) + panic("not reached") +} + +func (s *state) evalEmptyInterface(data reflect.Value, typ reflect.Type, n node) reflect.Value { + switch n := n.(type) { + case *boolNode: + return reflect.ValueOf(n.true) + case *dotNode: + return data + case *fieldNode: + return s.evalFieldNode(data, n, nil, reflect.Value{}) + case *identifierNode: + return s.evalFieldOrCall(data, n.ident, nil, reflect.Value{}) + case *numberNode: + if n.isComplex { + return reflect.ValueOf(n.complex128) + } + if n.isInt { + return reflect.ValueOf(n.int64) + } + if n.isUint { + return reflect.ValueOf(n.uint64) + } + if n.isFloat { + return reflect.ValueOf(n.float64) + } + case *stringNode: + return reflect.ValueOf(n.text) + } + s.errorf("can't handle assignment of %s to empty interface argument", n) + panic("not reached") +} + +// indirect returns the item at the end of indirection, and a bool to indicate if it's nil. +func indirect(v reflect.Value) (rv reflect.Value, isNil bool) { + for v.Kind() == reflect.Ptr { + if v.IsNil() { + return v, true + } + v = v.Elem() + } + return v, false +} + +// printValue writes the textual representation of the value to the output of +// the template. +func (s *state) printValue(n node, v reflect.Value) { + if !v.IsValid() { + fmt.Fprint(s.wr, "<no value>") + return + } + switch v.Kind() { + case reflect.Ptr: + var isNil bool + if v, isNil = indirect(v); isNil { + fmt.Fprint(s.wr, "<nil>") + return + } + case reflect.Chan, reflect.Func, reflect.Interface: + s.errorf("can't print %s of type %s", n, v.Type()) + } + fmt.Fprint(s.wr, v.Interface()) +} diff --git a/src/pkg/exp/template/exec_test.go b/src/pkg/exp/template/exec_test.go new file mode 100644 index 000000000..86b958e84 --- /dev/null +++ b/src/pkg/exp/template/exec_test.go @@ -0,0 +1,342 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "bytes" + "fmt" + "os" + "sort" + "strings" + "testing" +) + +// T has lots of interesting pieces to use to test execution. +type T struct { + // Basics + I int + U16 uint16 + X string + FloatZero float64 + ComplexZero float64 + // Nested structs. + U *U + // Slices + SI []int + SIEmpty []int + SB []bool + // Maps + MSI map[string]int + MSIone map[string]int // one element, for deterministic output + MSIEmpty map[string]int + SMSI []map[string]int + // Empty interfaces; used to see if we can dig inside one. + Empty0 interface{} // nil + Empty1 interface{} + Empty2 interface{} + Empty3 interface{} + Empty4 interface{} + // Pointers + PI *int + PSI *[]int + NIL *int +} + +var tVal = &T{ + I: 17, + U16: 16, + X: "x", + U: &U{"v"}, + SI: []int{3, 4, 5}, + SB: []bool{true, false}, + MSI: map[string]int{"one": 1, "two": 2, "three": 3}, + MSIone: map[string]int{"one": 1}, + SMSI: []map[string]int{ + {"one": 1, "two": 2}, + {"eleven": 11, "twelve": 12}, + }, + Empty1: 3, + Empty2: "empty2", + Empty3: []int{7, 8}, + Empty4: &U{"v"}, + PI: newInt(23), + PSI: newIntSlice(21, 22, 23), +} + +// Helpers for creation. +func newInt(n int) *int { + p := new(int) + *p = n + return p +} + +func newIntSlice(n ...int) *[]int { + p := new([]int) + *p = make([]int, len(n)) + copy(*p, n) + return p +} + +// Simple methods with and without arguments. +func (t *T) Method0() string { + return "resultOfMethod0" +} + +func (t *T) Method1(a int) int { + return a +} + +func (t *T) Method2(a uint16, b string) string { + return fmt.Sprintf("Method2: %d %s", a, b) +} + +func (t *T) MAdd(a int, b []int) []int { + v := make([]int, len(b)) + for i, x := range b { + v[i] = x + a + } + return v +} + +// MSort is used to sort map keys for stable output. (Nice trick!) +func (t *T) MSort(m map[string]int) []string { + keys := make([]string, len(m)) + i := 0 + for k := range m { + keys[i] = k + i++ + } + sort.Strings(keys) + return keys +} + +// EPERM returns a value and an os.Error according to its argument. +func (t *T) EPERM(error bool) (bool, os.Error) { + if error { + return true, os.EPERM + } + return false, nil +} + +type U struct { + V string +} + +type execTest struct { + name string + input string + output string + data interface{} + ok bool +} + +var execTests = []execTest{ + // Trivial cases. + {"empty", "", "", nil, true}, + {"text", "some text", "some text", nil, true}, + + // Fields of structs. + {".X", "-{{.X}}-", "-x-", tVal, true}, + {".U.V", "-{{.U.V}}-", "-v-", tVal, true}, + + // Dots of all kinds to test basic evaluation. + {"dot int", "<{{.}}>", "<13>", 13, true}, + {"dot uint", "<{{.}}>", "<14>", uint(14), true}, + {"dot float", "<{{.}}>", "<15.1>", 15.1, true}, + {"dot bool", "<{{.}}>", "<true>", true, true}, + {"dot complex", "<{{.}}>", "<(16.2-17i)>", 16.2 - 17i, true}, + {"dot string", "<{{.}}>", "<hello>", "hello", true}, + {"dot slice", "<{{.}}>", "<[-1 -2 -3]>", []int{-1, -2, -3}, true}, + {"dot map", "<{{.}}>", "<map[two:22 one:11]>", map[string]int{"one": 11, "two": 22}, true}, + {"dot struct", "<{{.}}>", "<{7 seven}>", struct { + a int + b string + }{7, "seven"}, true}, + + // Pointers. + {"*int", "{{.PI}}", "23", tVal, true}, + {"*[]int", "{{.PSI}}", "[21 22 23]", tVal, true}, + {"*[]int[1]", "{{index .PSI 1}}", "22", tVal, true}, + {"NIL", "{{.NIL}}", "<nil>", tVal, true}, + + // Emtpy interfaces holding values. + {"empty nil", "{{.Empty0}}", "<no value>", tVal, true}, + {"empty with int", "{{.Empty1}}", "3", tVal, true}, + {"empty with string", "{{.Empty2}}", "empty2", tVal, true}, + {"empty with slice", "{{.Empty3}}", "[7 8]", tVal, true}, + {"empty with struct", "{{.Empty4}}", "{v}", tVal, true}, + + // Method calls. + {".Method0", "-{{.Method0}}-", "-resultOfMethod0-", tVal, true}, + {".Method1(1234)", "-{{.Method1 1234}}-", "-1234-", tVal, true}, + {".Method1(.I)", "-{{.Method1 .I}}-", "-17-", tVal, true}, + {".Method2(3, .X)", "-{{.Method2 3 .X}}-", "-Method2: 3 x-", tVal, true}, + {".Method2(.U16, `str`)", "-{{.Method2 .U16 `str`}}-", "-Method2: 16 str-", tVal, true}, + + // Pipelines. + {"pipeline", "-{{.Method0 | .Method2 .U16}}-", "-Method2: 16 resultOfMethod0-", tVal, true}, + + // If. + {"if true", "{{if true}}TRUE{{end}}", "TRUE", tVal, true}, + {"if false", "{{if false}}TRUE{{else}}FALSE{{end}}", "FALSE", tVal, true}, + {"if 1", "{{if 1}}NON-ZERO{{else}}ZERO{{end}}", "NON-ZERO", tVal, true}, + {"if 0", "{{if 0}}NON-ZERO{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"if 1.5", "{{if 1.5}}NON-ZERO{{else}}ZERO{{end}}", "NON-ZERO", tVal, true}, + {"if 0.0", "{{if .FloatZero}}NON-ZERO{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"if 1.5i", "{{if 1.5i}}NON-ZERO{{else}}ZERO{{end}}", "NON-ZERO", tVal, true}, + {"if 0.0i", "{{if .ComplexZero}}NON-ZERO{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"if emptystring", "{{if ``}}NON-EMPTY{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"if string", "{{if `notempty`}}NON-EMPTY{{else}}EMPTY{{end}}", "NON-EMPTY", tVal, true}, + {"if emptyslice", "{{if .SIEmpty}}NON-EMPTY{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"if slice", "{{if .SI}}NON-EMPTY{{else}}EMPTY{{end}}", "NON-EMPTY", tVal, true}, + {"if emptymap", "{{if .MSIEmpty}}NON-EMPTY{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"if map", "{{if .MSI}}NON-EMPTY{{else}}EMPTY{{end}}", "NON-EMPTY", tVal, true}, + + // Printf. + {"printf", `{{printf "hello, printf"}}`, "hello, printf", tVal, true}, + {"printf int", `{{printf "%04x" 127}}`, "007f", tVal, true}, + {"printf float", `{{printf "%g" 3.5}}`, "3.5", tVal, true}, + {"printf complex", `{{printf "%g" 1+7i}}`, "(1+7i)", tVal, true}, + {"printf string", `{{printf "%s" "hello"}}`, "hello", tVal, true}, + {"printf function", `{{printf "%#q" gopher}}`, "`gopher`", tVal, true}, + {"printf field", `{{printf "%s" .U.V}}`, "v", tVal, true}, + {"printf method", `{{printf "%s" .Method0}}`, "resultOfMethod0", tVal, true}, + {"printf lots", `{{printf "%d %s %g %s" 127 "hello" 7-3i .Method0}}`, "127 hello (7-3i) resultOfMethod0", tVal, true}, + + // HTML. + {"html", `{{html "<script>alert(\"XSS\");</script>"}}`, + "<script>alert("XSS");</script>", nil, true}, + {"html pipeline", `{{printf "<script>alert(\"XSS\");</script>" | html}}`, + "<script>alert("XSS");</script>", nil, true}, + + // JavaScript. + {"js", `{{js .}}`, `It\'d be nice.`, `It'd be nice.`, true}, + + // Booleans + {"not", "{{not true}} {{not false}}", "false true", nil, true}, + {"and", "{{and 0 0}} {{and 1 0}} {{and 0 1}} {{and 1 1}}", "false false false true", nil, true}, + {"or", "{{or 0 0}} {{or 1 0}} {{or 0 1}} {{or 1 1}}", "false true true true", nil, true}, + {"boolean if", "{{if and true 1 `hi`}}TRUE{{else}}FALSE{{end}}", "TRUE", tVal, true}, + {"boolean if not", "{{if and true 1 `hi` | not}}TRUE{{else}}FALSE{{end}}", "FALSE", nil, true}, + + // Indexing. + {"slice[0]", "{{index .SI 0}}", "3", tVal, true}, + {"slice[1]", "{{index .SI 1}}", "4", tVal, true}, + {"slice[HUGE]", "{{index .SI 10}}", "", tVal, false}, + {"slice[WRONG]", "{{index .SI `hello`}}", "", tVal, false}, + {"map[one]", "{{index .MSI `one`}}", "1", tVal, true}, + {"map[two]", "{{index .MSI `two`}}", "2", tVal, true}, + {"map[NO]", "{{index .MSI `XXX`}}", "", tVal, false}, + {"map[WRONG]", "{{index .MSI 10}}", "", tVal, false}, + {"double index", "{{index .SMSI 1 `eleven`}}", "11", tVal, true}, + + // With. + {"with true", "{{with true}}{{.}}{{end}}", "true", tVal, true}, + {"with false", "{{with false}}{{.}}{{else}}FALSE{{end}}", "FALSE", tVal, true}, + {"with 1", "{{with 1}}{{.}}{{else}}ZERO{{end}}", "1", tVal, true}, + {"with 0", "{{with 0}}{{.}}{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"with 1.5", "{{with 1.5}}{{.}}{{else}}ZERO{{end}}", "1.5", tVal, true}, + {"with 0.0", "{{with .FloatZero}}{{.}}{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"with 1.5i", "{{with 1.5i}}{{.}}{{else}}ZERO{{end}}", "(0+1.5i)", tVal, true}, + {"with 0.0i", "{{with .ComplexZero}}{{.}}{{else}}ZERO{{end}}", "ZERO", tVal, true}, + {"with emptystring", "{{with ``}}{{.}}{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"with string", "{{with `notempty`}}{{.}}{{else}}EMPTY{{end}}", "notempty", tVal, true}, + {"with emptyslice", "{{with .SIEmpty}}{{.}}{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"with slice", "{{with .SI}}{{.}}{{else}}EMPTY{{end}}", "[3 4 5]", tVal, true}, + {"with emptymap", "{{with .MSIEmpty}}{{.}}{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"with map", "{{with .MSIone}}{{.}}{{else}}EMPTY{{end}}", "map[one:1]", tVal, true}, + {"with empty interface, struct field", "{{with .Empty4}}{{.V}}{{end}}", "v", tVal, true}, + + // Range. + {"range []int", "{{range .SI}}-{{.}}-{{end}}", "-3--4--5-", tVal, true}, + {"range empty no else", "{{range .SIEmpty}}-{{.}}-{{end}}", "", tVal, true}, + {"range []int else", "{{range .SI}}-{{.}}-{{else}}EMPTY{{end}}", "-3--4--5-", tVal, true}, + {"range empty else", "{{range .SIEmpty}}-{{.}}-{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"range []bool", "{{range .SB}}-{{.}}-{{end}}", "-true--false-", tVal, true}, + {"range []int method", "{{range .SI | .MAdd .I}}-{{.}}-{{end}}", "-20--21--22-", tVal, true}, + {"range map", "{{range .MSI | .MSort}}-{{.}}-{{end}}", "-one--three--two-", tVal, true}, + {"range empty map no else", "{{range .MSIEmpty}}-{{.}}-{{end}}", "", tVal, true}, + {"range map else", "{{range .MSI | .MSort}}-{{.}}-{{else}}EMPTY{{end}}", "-one--three--two-", tVal, true}, + {"range empty map else", "{{range .MSIEmpty}}-{{.}}-{{else}}EMPTY{{end}}", "EMPTY", tVal, true}, + {"range empty interface", "{{range .Empty3}}-{{.}}-{{else}}EMPTY{{end}}", "-7--8-", tVal, true}, + + // Error handling. + {"error method, error", "{{.EPERM true}}", "", tVal, false}, + {"error method, no error", "{{.EPERM false}}", "false", tVal, true}, +} + +func gopher() string { + return "gopher" +} + +func testExecute(execTests []execTest, set *Set, t *testing.T) { + b := new(bytes.Buffer) + funcs := FuncMap{"gopher": gopher} + for _, test := range execTests { + tmpl := New(test.name).Funcs(funcs) + err := tmpl.Parse(test.input) + if err != nil { + t.Errorf("%s: parse error: %s", test.name, err) + continue + } + b.Reset() + err = tmpl.ExecuteInSet(b, test.data, set) + switch { + case !test.ok && err == nil: + t.Errorf("%s: expected error; got none", test.name) + continue + case test.ok && err != nil: + t.Errorf("%s: unexpected execute error: %s", test.name, err) + continue + case !test.ok && err != nil: + // expected error, got one + if *debug { + fmt.Printf("%s: %s\n\t%s\n", test.name, test.input, err) + } + } + result := b.String() + if result != test.output { + t.Errorf("%s: expected\n\t%q\ngot\n\t%q", test.name, test.output, result) + } + } +} + +func TestExecute(t *testing.T) { + testExecute(execTests, nil, t) +} + +// Check that an error from a method flows back to the top. +func TestExecuteError(t *testing.T) { + b := new(bytes.Buffer) + tmpl := New("error") + err := tmpl.Parse("{{.EPERM true}}") + if err != nil { + t.Fatalf("parse error: %s", err) + } + err = tmpl.Execute(b, tVal) + if err == nil { + t.Errorf("expected error; got none") + } else if !strings.Contains(err.String(), os.EPERM.String()) { + t.Errorf("expected os.EPERM; got %s", err) + } +} + +func TestJSEscaping(t *testing.T) { + testCases := []struct { + in, exp string + }{ + {`a`, `a`}, + {`'foo`, `\'foo`}, + {`Go "jump" \`, `Go \"jump\" \\`}, + {`Yukihiro says "今日は世界"`, `Yukihiro says \"今日は世界\"`}, + {"unprintable \uFDFF", `unprintable \uFDFF`}, + } + for _, tc := range testCases { + s := JSEscapeString(tc.in) + if s != tc.exp { + t.Errorf("JS escaping [%s] got [%s] want [%s]", tc.in, s, tc.exp) + } + } +} diff --git a/src/pkg/exp/template/funcs.go b/src/pkg/exp/template/funcs.go new file mode 100644 index 000000000..66be40fd4 --- /dev/null +++ b/src/pkg/exp/template/funcs.go @@ -0,0 +1,294 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "bytes" + "fmt" + "io" + "os" + "reflect" + "strings" + "unicode" + "utf8" +) + +// FuncMap is the type of the map defining the mapping from names to functions. +// Each function must have either a single return value, or two return values of +// which the second has type os.Error. +type FuncMap map[string]interface{} + +var funcs = map[string]reflect.Value{ + "and": reflect.ValueOf(and), + "html": reflect.ValueOf(HTMLEscaper), + "index": reflect.ValueOf(index), + "js": reflect.ValueOf(JSEscaper), + "not": reflect.ValueOf(not), + "or": reflect.ValueOf(or), + "printf": reflect.ValueOf(fmt.Sprintf), +} + +// addFuncs adds to values the functions in funcs, converting them to reflect.Values. +func addFuncs(values map[string]reflect.Value, funcMap FuncMap) { + for name, fn := range funcMap { + v := reflect.ValueOf(fn) + if v.Kind() != reflect.Func { + panic("value for " + name + " not a function") + } + if !goodFunc(v.Type()) { + panic(fmt.Errorf("can't handle multiple results from method/function %q", name)) + } + values[name] = v + } +} + +// goodFunc checks that the function or method has the right result signature. +func goodFunc(typ reflect.Type) bool { + // We allow functions with 1 result or 2 results where the second is an os.Error. + switch { + case typ.NumOut() == 1: + return true + case typ.NumOut() == 2 && typ.Out(1) == osErrorType: + return true + } + return false +} + +// findFunction looks for a function in the template, set, and global map. +func findFunction(name string, tmpl *Template, set *Set) (reflect.Value, bool) { + if tmpl != nil { + if fn := tmpl.funcs[name]; fn.IsValid() { + return fn, true + } + } + if set != nil { + if fn := set.funcs[name]; fn.IsValid() { + return fn, true + } + } + if fn := funcs[name]; fn.IsValid() { + return fn, true + } + return reflect.Value{}, false +} + +// Indexing. + +// index returns the result of indexing its first argument by the following +// arguments. Thus "index x 1 2 3" is, in Go syntax, x[1][2][3]. Each +// indexed item must be a map, slice, or array. +func index(item interface{}, indices ...interface{}) (interface{}, os.Error) { + v := reflect.ValueOf(item) + for _, i := range indices { + index := reflect.ValueOf(i) + var isNil bool + if v, isNil = indirect(v); isNil { + return nil, fmt.Errorf("index of nil pointer") + } + switch v.Kind() { + case reflect.Array, reflect.Slice: + var x int64 + switch index.Kind() { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + x = index.Int() + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + x = int64(index.Uint()) + default: + return nil, fmt.Errorf("cannot index slice/array with type %s", index.Type()) + } + if x < 0 || x >= int64(v.Len()) { + return nil, fmt.Errorf("index out of range: %d", x) + } + v = v.Index(int(x)) + case reflect.Map: + if !index.Type().AssignableTo(v.Type().Key()) { + return nil, fmt.Errorf("%s is not index type for %s", index.Type(), v.Type()) + } + v = v.MapIndex(index) + if !v.IsValid() { + return nil, fmt.Errorf("index %v not present in map", index.Interface()) + } + default: + return nil, fmt.Errorf("can't index item of type %s", index.Type()) + } + } + return v.Interface(), nil +} + +// Boolean logic. + +// and returns the Boolean AND of its arguments. +func and(arg0 interface{}, args ...interface{}) (truth bool) { + truth, _ = isTrue(reflect.ValueOf(arg0)) + for i := 0; truth && i < len(args); i++ { + truth, _ = isTrue(reflect.ValueOf(args[i])) + } + return +} + +// or returns the Boolean OR of its arguments. +func or(arg0 interface{}, args ...interface{}) (truth bool) { + truth, _ = isTrue(reflect.ValueOf(arg0)) + for i := 0; !truth && i < len(args); i++ { + truth, _ = isTrue(reflect.ValueOf(args[i])) + } + return +} + +// not returns the Boolean negation of its argument. +func not(arg interface{}) (truth bool) { + truth, _ = isTrue(reflect.ValueOf(arg)) + return !truth +} + +// HTML escaping. + +var ( + htmlQuot = []byte(""") // shorter than """ + htmlApos = []byte("'") // shorter than "'" + htmlAmp = []byte("&") + htmlLt = []byte("<") + htmlGt = []byte(">") +) + +// HTMLEscape writes to w the escaped HTML equivalent of the plain text data b. +func HTMLEscape(w io.Writer, b []byte) { + last := 0 + for i, c := range b { + var html []byte + switch c { + case '"': + html = htmlQuot + case '\'': + html = htmlApos + case '&': + html = htmlAmp + case '<': + html = htmlLt + case '>': + html = htmlGt + default: + continue + } + w.Write(b[last:i]) + w.Write(html) + last = i + 1 + } + w.Write(b[last:]) +} + +// HTMLEscapeString returns the escaped HTML equivalent of the plain text data s. +func HTMLEscapeString(s string) string { + // Avoid allocation if we can. + if strings.IndexAny(s, `'"&<>`) < 0 { + return s + } + var b bytes.Buffer + HTMLEscape(&b, []byte(s)) + return b.String() +} + +// HTMLEscaper returns the escaped HTML equivalent of the textual +// representation of its arguments. +func HTMLEscaper(args ...interface{}) string { + ok := false + var s string + if len(args) == 1 { + s, ok = args[0].(string) + } + if !ok { + s = fmt.Sprint(args...) + } + return HTMLEscapeString(s) +} + +// JavaScript escaping. + +var ( + jsLowUni = []byte(`\u00`) + hex = []byte("0123456789ABCDEF") + + jsBackslash = []byte(`\\`) + jsApos = []byte(`\'`) + jsQuot = []byte(`\"`) +) + + +// JSEscape writes to w the escaped JavaScript equivalent of the plain text data b. +func JSEscape(w io.Writer, b []byte) { + last := 0 + for i := 0; i < len(b); i++ { + c := b[i] + + if ' ' <= c && c < utf8.RuneSelf && c != '\\' && c != '"' && c != '\'' { + // fast path: nothing to do + continue + } + w.Write(b[last:i]) + + if c < utf8.RuneSelf { + // Quotes and slashes get quoted. + // Control characters get written as \u00XX. + switch c { + case '\\': + w.Write(jsBackslash) + case '\'': + w.Write(jsApos) + case '"': + w.Write(jsQuot) + default: + w.Write(jsLowUni) + t, b := c>>4, c&0x0f + w.Write(hex[t : t+1]) + w.Write(hex[b : b+1]) + } + } else { + // Unicode rune. + rune, size := utf8.DecodeRune(b[i:]) + if unicode.IsPrint(rune) { + w.Write(b[i : i+size]) + } else { + // TODO(dsymonds): Do this without fmt? + fmt.Fprintf(w, "\\u%04X", rune) + } + i += size - 1 + } + last = i + 1 + } + w.Write(b[last:]) +} + +// JSEscapeString returns the escaped JavaScript equivalent of the plain text data s. +func JSEscapeString(s string) string { + // Avoid allocation if we can. + if strings.IndexFunc(s, jsIsSpecial) < 0 { + return s + } + var b bytes.Buffer + JSEscape(&b, []byte(s)) + return b.String() +} + +func jsIsSpecial(rune int) bool { + switch rune { + case '\\', '\'', '"': + return true + } + return rune < ' ' || utf8.RuneSelf <= rune +} + +// JSEscaper returns the escaped JavaScript equivalent of the textual +// representation of its arguments. +func JSEscaper(args ...interface{}) string { + ok := false + var s string + if len(args) == 1 { + s, ok = args[0].(string) + } + if !ok { + s = fmt.Sprint(args...) + } + return JSEscapeString(s) +} diff --git a/src/pkg/exp/template/lex.go b/src/pkg/exp/template/lex.go new file mode 100644 index 000000000..d78152979 --- /dev/null +++ b/src/pkg/exp/template/lex.go @@ -0,0 +1,431 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "fmt" + "strings" + "unicode" + "utf8" +) + +// item represents a token or text string returned from the scanner. +type item struct { + typ itemType + val string +} + +func (i item) String() string { + switch { + case i.typ == itemEOF: + return "EOF" + case i.typ == itemError: + return i.val + case i.typ > itemKeyword: + return fmt.Sprintf("<%s>", i.val) + case len(i.val) > 10: + return fmt.Sprintf("%.10q...", i.val) + } + return fmt.Sprintf("%q", i.val) +} + +// itemType identifies the type of lex items. +type itemType int + +const ( + itemError itemType = iota // error occurred; value is text of error + itemBool // boolean constant + itemComplex // complex constant (1+2i); imaginary is just a number + itemEOF + itemField // alphanumeric identifier, starting with '.', possibly chained ('.x.y') + itemIdentifier // alphanumeric identifier + itemLeftDelim // left action delimiter + itemNumber // simple number, including imaginary + itemPipe // pipe symbol + itemRawString // raw quoted string (includes quotes) + itemRightDelim // right action delimiter + itemString // quoted string (includes quotes) + itemText // plain text + // Keywords appear after all the rest. + itemKeyword // used only to delimit the keywords + itemDot // the cursor, spelled '.'. + itemDefine // define keyword + itemElse // else keyword + itemEnd // end keyword + itemIf // if keyword + itemRange // range keyword + itemTemplate // template keyword + itemWith // with keyword +) + +// Make the types prettyprint. +var itemName = map[itemType]string{ + itemError: "error", + itemBool: "bool", + itemComplex: "complex", + itemEOF: "EOF", + itemField: "field", + itemIdentifier: "identifier", + itemLeftDelim: "left delim", + itemNumber: "number", + itemPipe: "pipe", + itemRawString: "raw string", + itemRightDelim: "right delim", + itemString: "string", + // keywords + itemDot: ".", + itemDefine: "define", + itemElse: "else", + itemIf: "if", + itemEnd: "end", + itemRange: "range", + itemTemplate: "template", + itemWith: "with", +} + +func (i itemType) String() string { + s := itemName[i] + if s == "" { + return fmt.Sprintf("item%d", int(i)) + } + return s +} + +var key = map[string]itemType{ + ".": itemDot, + "define": itemDefine, + "else": itemElse, + "end": itemEnd, + "if": itemIf, + "range": itemRange, + "template": itemTemplate, + "with": itemWith, +} + +const eof = -1 + +// stateFn represents the state of the scanner as a function that returns the next state. +type stateFn func(*lexer) stateFn + +// lexer holds the state of the scanner. +type lexer struct { + name string // the name of the input; used only for error reports. + input string // the string being scanned. + state stateFn // the next lexing function to enter + pos int // current position in the input. + start int // start position of this item. + width int // width of last rune read from input. + items chan item // channel of scanned items. +} + +// next returns the next rune in the input. +func (l *lexer) next() (rune int) { + if l.pos >= len(l.input) { + l.width = 0 + return eof + } + rune, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) + l.pos += l.width + return rune +} + +// peek returns but does not consume the next rune in the input. +func (l *lexer) peek() int { + rune := l.next() + l.backup() + return rune +} + +// backup steps back one rune. Can only be called once per call of next. +func (l *lexer) backup() { + l.pos -= l.width +} + +// emit passes an item back to the client. +func (l *lexer) emit(t itemType) { + l.items <- item{t, l.input[l.start:l.pos]} + l.start = l.pos +} + +// ignore skips over the pending input before this point. +func (l *lexer) ignore() { + l.start = l.pos +} + +// accept consumes the next rune if it's from the valid set. +func (l *lexer) accept(valid string) bool { + if strings.IndexRune(valid, l.next()) >= 0 { + return true + } + l.backup() + return false +} + +// acceptRun consumes a run of runes from the valid set. +func (l *lexer) acceptRun(valid string) { + for strings.IndexRune(valid, l.next()) >= 0 { + } + l.backup() +} + +// lineNumber reports which line we're on. Doing it this way +// means we don't have to worry about peek double counting. +func (l *lexer) lineNumber() int { + return 1 + strings.Count(l.input[:l.pos], "\n") +} + +// error returns an error token and terminates the scan by passing +// back a nil pointer that will be the next state, terminating l.run. +func (l *lexer) errorf(format string, args ...interface{}) stateFn { + l.items <- item{itemError, fmt.Sprintf(format, args...)} + return nil +} + +// nextItem returns the next item from the input. +func (l *lexer) nextItem() item { + for { + select { + case item := <-l.items: + return item + default: + l.state = l.state(l) + } + } + panic("not reached") +} + +// lex creates a new scanner for the input string. +func lex(name, input string) *lexer { + l := &lexer{ + name: name, + input: input, + state: lexText, + items: make(chan item, 2), // Two items of buffering is sufficient for all state functions + } + return l +} + +// state functions + +const ( + leftDelim = "{{" + rightDelim = "}}" + leftComment = "{{/*" + rightComment = "*/}}" +) + +// lexText scans until an opening action delimiter, "{{". +func lexText(l *lexer) stateFn { + for { + if strings.HasPrefix(l.input[l.pos:], leftDelim) { + if l.pos > l.start { + l.emit(itemText) + } + return lexLeftDelim + } + if l.next() == eof { + break + } + } + // Correctly reached EOF. + if l.pos > l.start { + l.emit(itemText) + } + l.emit(itemEOF) + return nil +} + +// lexLeftDelim scans the left delimiter, which is known to be present. +func lexLeftDelim(l *lexer) stateFn { + if strings.HasPrefix(l.input[l.pos:], leftComment) { + return lexComment + } + l.pos += len(leftDelim) + l.emit(itemLeftDelim) + return lexInsideAction +} + +// lexComment scans a comment. The left comment marker is known to be present. +func lexComment(l *lexer) stateFn { + i := strings.Index(l.input[l.pos:], rightComment) + if i < 0 { + return l.errorf("unclosed comment") + } + l.pos += i + len(rightComment) + l.ignore() + return lexText +} + +// lexRightDelim scans the right delimiter, which is known to be present. +func lexRightDelim(l *lexer) stateFn { + l.pos += len(rightDelim) + l.emit(itemRightDelim) + return lexText +} + +// lexInsideAction scans the elements inside action delimiters. +func lexInsideAction(l *lexer) stateFn { + // Either number, quoted string, or identifier. + // Spaces separate and are ignored. + // Pipe symbols separate and are emitted. + for { + if strings.HasPrefix(l.input[l.pos:], rightDelim) { + return lexRightDelim + } + switch r := l.next(); { + case r == eof || r == '\n': + return l.errorf("unclosed action") + case isSpace(r): + l.ignore() + case r == '|': + l.emit(itemPipe) + case r == '"': + return lexQuote + case r == '`': + return lexRawQuote + case r == '.': + // special look-ahead for ".field" so we don't break l.backup(). + if l.pos < len(l.input) { + r := l.input[l.pos] + if r < '0' || '9' < r { + return lexIdentifier // itemDot comes from the keyword table. + } + } + fallthrough // '.' can start a number. + case r == '+' || r == '-' || ('0' <= r && r <= '9'): + l.backup() + return lexNumber + case isAlphaNumeric(r): + l.backup() + return lexIdentifier + default: + return l.errorf("unrecognized character in action: %#U", r) + } + } + return nil +} + +// lexIdentifier scans an alphanumeric or field. +func lexIdentifier(l *lexer) stateFn { +Loop: + for { + switch r := l.next(); { + case isAlphaNumeric(r): + // absorb. + case r == '.' && l.input[l.start] == '.': + // field chaining; absorb into one token. + default: + l.backup() + word := l.input[l.start:l.pos] + switch { + case key[word] > itemKeyword: + l.emit(key[word]) + case word[0] == '.': + l.emit(itemField) + case word == "true", word == "false": + l.emit(itemBool) + default: + l.emit(itemIdentifier) + } + break Loop + } + } + return lexInsideAction +} + +// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This +// isn't a perfect number scanner - for instance it accepts "." and "0x0.2" +// and "089" - but when it's wrong the input is invalid and the parser (via +// strconv) will notice. +func lexNumber(l *lexer) stateFn { + if !l.scanNumber() { + return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) + } + if sign := l.peek(); sign == '+' || sign == '-' { + // Complex: 1+2i. No spaces, must end in 'i'. + if !l.scanNumber() || l.input[l.pos-1] != 'i' { + return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) + } + l.emit(itemComplex) + } else { + l.emit(itemNumber) + } + return lexInsideAction +} + +func (l *lexer) scanNumber() bool { + // Optional leading sign. + l.accept("+-") + // Is it hex? + digits := "0123456789" + if l.accept("0") && l.accept("xX") { + digits = "0123456789abcdefABCDEF" + } + l.acceptRun(digits) + if l.accept(".") { + l.acceptRun(digits) + } + if l.accept("eE") { + l.accept("+-") + l.acceptRun("0123456789") + } + // Is it imaginary? + l.accept("i") + // Next thing mustn't be alphanumeric. + if isAlphaNumeric(l.peek()) { + l.next() + return false + } + return true +} + +// lexQuote scans a quoted string. +func lexQuote(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case '\\': + if r := l.next(); r != eof && r != '\n' { + break + } + fallthrough + case eof, '\n': + return l.errorf("unterminated quoted string") + case '"': + break Loop + } + } + l.emit(itemString) + return lexInsideAction +} + +// lexRawQuote scans a raw quoted string. +func lexRawQuote(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case eof, '\n': + return l.errorf("unterminated raw quoted string") + case '`': + break Loop + } + } + l.emit(itemRawString) + return lexInsideAction +} + +// isSpace reports whether r is a space character. +func isSpace(r int) bool { + switch r { + case ' ', '\t', '\n', '\r': + return true + } + return false +} + +// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. +func isAlphaNumeric(r int) bool { + return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) +} diff --git a/src/pkg/exp/template/lex_test.go b/src/pkg/exp/template/lex_test.go new file mode 100644 index 000000000..256ec04d8 --- /dev/null +++ b/src/pkg/exp/template/lex_test.go @@ -0,0 +1,153 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "reflect" + "testing" +) + +type lexTest struct { + name string + input string + items []item +} + +var ( + tEOF = item{itemEOF, ""} + tLeft = item{itemLeftDelim, "{{"} + tRight = item{itemRightDelim, "}}"} + tRange = item{itemRange, "range"} + tPipe = item{itemPipe, "|"} + tFor = item{itemIdentifier, "for"} + tQuote = item{itemString, `"abc \n\t\" "`} + raw = "`" + `abc\n\t\" ` + "`" + tRawQuote = item{itemRawString, raw} +) + +var lexTests = []lexTest{ + {"empty", "", []item{tEOF}}, + {"spaces", " \t\n", []item{{itemText, " \t\n"}, tEOF}}, + {"text", `now is the time`, []item{{itemText, "now is the time"}, tEOF}}, + {"text with comment", "hello-{{/* this is a comment */}}-world", []item{ + {itemText, "hello-"}, + {itemText, "-world"}, + tEOF, + }}, + {"empty action", `{{}}`, []item{tLeft, tRight, tEOF}}, + {"for", `{{for }}`, []item{tLeft, tFor, tRight, tEOF}}, + {"quote", `{{"abc \n\t\" "}}`, []item{tLeft, tQuote, tRight, tEOF}}, + {"raw quote", "{{" + raw + "}}", []item{tLeft, tRawQuote, tRight, tEOF}}, + {"numbers", "{{1 02 0x14 -7.2i 1e3 +1.2e-4 4.2i 1+2i}}", []item{ + tLeft, + {itemNumber, "1"}, + {itemNumber, "02"}, + {itemNumber, "0x14"}, + {itemNumber, "-7.2i"}, + {itemNumber, "1e3"}, + {itemNumber, "+1.2e-4"}, + {itemNumber, "4.2i"}, + {itemComplex, "1+2i"}, + tRight, + tEOF, + }}, + {"bools", "{{true false}}", []item{ + tLeft, + {itemBool, "true"}, + {itemBool, "false"}, + tRight, + tEOF, + }}, + {"dot", "{{.}}", []item{ + tLeft, + {itemDot, "."}, + tRight, + tEOF, + }}, + {"dots", "{{.x . .2 .x.y}}", []item{ + tLeft, + {itemField, ".x"}, + {itemDot, "."}, + {itemNumber, ".2"}, + {itemField, ".x.y"}, + tRight, + tEOF, + }}, + {"keywords", "{{range if else end with}}", []item{ + tLeft, + {itemRange, "range"}, + {itemIf, "if"}, + {itemElse, "else"}, + {itemEnd, "end"}, + {itemWith, "with"}, + tRight, + tEOF, + }}, + {"pipeline", `intro {{echo hi 1.2 |noargs|args 1 "hi"}} outro`, []item{ + {itemText, "intro "}, + tLeft, + {itemIdentifier, "echo"}, + {itemIdentifier, "hi"}, + {itemNumber, "1.2"}, + tPipe, + {itemIdentifier, "noargs"}, + tPipe, + {itemIdentifier, "args"}, + {itemNumber, "1"}, + {itemString, `"hi"`}, + tRight, + {itemText, " outro"}, + tEOF, + }}, + // errors + {"badchar", "#{{#}}", []item{ + {itemText, "#"}, + tLeft, + {itemError, "unrecognized character in action: U+0023 '#'"}, + }}, + {"unclosed action", "{{\n}}", []item{ + tLeft, + {itemError, "unclosed action"}, + }}, + {"EOF in action", "{{range", []item{ + tLeft, + tRange, + {itemError, "unclosed action"}, + }}, + {"unclosed quote", "{{\"\n\"}}", []item{ + tLeft, + {itemError, "unterminated quoted string"}, + }}, + {"unclosed raw quote", "{{`xx\n`}}", []item{ + tLeft, + {itemError, "unterminated raw quoted string"}, + }}, + {"bad number", "{{3k}}", []item{ + tLeft, + {itemError, `bad number syntax: "3k"`}, + }}, +} + +// collect gathers the emitted items into a slice. +func collect(t *lexTest) (items []item) { + l := lex(t.name, t.input) + for { + item := l.nextItem() + items = append(items, item) + if item.typ == itemEOF || item.typ == itemError { + break + } + } + return +} + +func TestLex(t *testing.T) { + for _, test := range lexTests { + items := collect(&test) + if !reflect.DeepEqual(items, test.items) { + t.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items) + } + } +} diff --git a/src/pkg/exp/template/parse.go b/src/pkg/exp/template/parse.go new file mode 100644 index 000000000..8b2d60207 --- /dev/null +++ b/src/pkg/exp/template/parse.go @@ -0,0 +1,783 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "bytes" + "fmt" + "os" + "reflect" + "runtime" + "strconv" + "strings" + "unicode" +) + +// Template is the representation of a parsed template. +type Template struct { + name string + root *listNode + funcs map[string]reflect.Value + // Parsing only; cleared after parse. + set *Set + lex *lexer + token item // token lookahead for parser + havePeek bool +} + +// next returns the next token. +func (t *Template) next() item { + if t.havePeek { + t.havePeek = false + } else { + t.token = t.lex.nextItem() + } + return t.token +} + +// backup backs the input stream up one token. +func (t *Template) backup() { + t.havePeek = true +} + +// peek returns but does not consume the next token. +func (t *Template) peek() item { + if t.havePeek { + return t.token + } + t.token = t.lex.nextItem() + t.havePeek = true + return t.token +} + +// A node is an element in the parse tree. The interface is trivial. +type node interface { + typ() nodeType + String() string +} + +type nodeType int + +func (t nodeType) typ() nodeType { + return t +} + +const ( + nodeText nodeType = iota + nodeAction + nodeCommand + nodeDot + nodeElse + nodeEnd + nodeField + nodeIdentifier + nodeIf + nodeList + nodeNumber + nodeRange + nodeString + nodeTemplate + nodeWith +) + +// Nodes. + +// listNode holds a sequence of nodes. +type listNode struct { + nodeType + nodes []node +} + +func newList() *listNode { + return &listNode{nodeType: nodeList} +} + +func (l *listNode) append(n node) { + l.nodes = append(l.nodes, n) +} + +func (l *listNode) String() string { + b := new(bytes.Buffer) + fmt.Fprint(b, "[") + for _, n := range l.nodes { + fmt.Fprint(b, n) + } + fmt.Fprint(b, "]") + return b.String() +} + +// textNode holds plain text. +type textNode struct { + nodeType + text []byte +} + +func newText(text string) *textNode { + return &textNode{nodeType: nodeText, text: []byte(text)} +} + +func (t *textNode) String() string { + return fmt.Sprintf("(text: %q)", t.text) +} + +// actionNode holds an action (something bounded by delimiters). +type actionNode struct { + nodeType + line int + pipeline []*commandNode +} + +func newAction(line int, pipeline []*commandNode) *actionNode { + return &actionNode{nodeType: nodeAction, line: line, pipeline: pipeline} +} + +func (a *actionNode) append(command *commandNode) { + a.pipeline = append(a.pipeline, command) +} + +func (a *actionNode) String() string { + return fmt.Sprintf("(action: %v)", a.pipeline) +} + +// commandNode holds a command (a pipeline inside an evaluating action). +type commandNode struct { + nodeType + args []node // identifier, string, or number +} + +func newCommand() *commandNode { + return &commandNode{nodeType: nodeCommand} +} + +func (c *commandNode) append(arg node) { + c.args = append(c.args, arg) +} + +func (c *commandNode) String() string { + return fmt.Sprintf("(command: %v)", c.args) +} + +// identifierNode holds an identifier. +type identifierNode struct { + nodeType + ident string +} + +func newIdentifier(ident string) *identifierNode { + return &identifierNode{nodeType: nodeIdentifier, ident: ident} +} + +func (i *identifierNode) String() string { + return fmt.Sprintf("I=%s", i.ident) +} + +// dotNode holds the special identifier '.'. It is represented by a nil pointer. +type dotNode bool + +func newDot() *dotNode { + return nil +} + +func (d *dotNode) typ() nodeType { + return nodeDot +} + +func (d *dotNode) String() string { + return "{{<.>}}" +} + +// fieldNode holds a field (identifier starting with '.'). +// The names may be chained ('.x.y'). +// The period is dropped from each ident. +type fieldNode struct { + nodeType + ident []string +} + +func newField(ident string) *fieldNode { + return &fieldNode{nodeType: nodeField, ident: strings.Split(ident[1:], ".")} // [1:] to drop leading period +} + +func (f *fieldNode) String() string { + return fmt.Sprintf("F=%s", f.ident) +} + +// boolNode holds a boolean constant. +type boolNode struct { + nodeType + true bool +} + +func newBool(true bool) *boolNode { + return &boolNode{nodeType: nodeString, true: true} +} + +func (b *boolNode) String() string { + if b.true { + return fmt.Sprintf("B=true") + } + return fmt.Sprintf("B=false") +} + +// numberNode holds a number, signed or unsigned integer, floating, or complex. +// The value is parsed and stored under all the types that can represent the value. +// This simulates in a small amount of code the behavior of Go's ideal constants. +type numberNode struct { + nodeType + isInt bool // number has an integral value + isUint bool // number has an unsigned integral value + isFloat bool // number has a floating-point value + isComplex bool // number is complex + int64 // the signed integer value + uint64 // the unsigned integer value + float64 // the floating-point value + complex128 // the complex value + text string +} + +func newNumber(text string, isComplex bool) (*numberNode, os.Error) { + n := &numberNode{nodeType: nodeNumber, text: text} + if isComplex { + // fmt.Sscan can parse the pair, so let it do the work. + if _, err := fmt.Sscan(text, &n.complex128); err != nil { + return nil, err + } + n.isComplex = true + n.simplifyComplex() + return n, nil + } + // Imaginary constants can only be complex unless they are zero. + if len(text) > 0 && text[len(text)-1] == 'i' { + f, err := strconv.Atof64(text[:len(text)-1]) + if err == nil { + n.isComplex = true + n.complex128 = complex(0, f) + n.simplifyComplex() + return n, nil + } + } + // Do integer test first so we get 0x123 etc. + u, err := strconv.Btoui64(text, 0) // will fail for -0; fixed below. + if err == nil { + n.isUint = true + n.uint64 = u + } + i, err := strconv.Btoi64(text, 0) + if err == nil { + n.isInt = true + n.int64 = i + if i == 0 { + n.isUint = true // in case of -0. + n.uint64 = u + } + } + // If an integer extraction succeeded, promote the float. + if n.isInt { + n.isFloat = true + n.float64 = float64(n.int64) + } else if n.isUint { + n.isFloat = true + n.float64 = float64(n.uint64) + } else { + f, err := strconv.Atof64(text) + if err == nil { + n.isFloat = true + n.float64 = f + // If a floating-point extraction succeeded, extract the int if needed. + if !n.isInt && float64(int64(f)) == f { + n.isInt = true + n.int64 = int64(f) + } + if !n.isUint && float64(uint64(f)) == f { + n.isUint = true + n.uint64 = uint64(f) + } + } + } + if !n.isInt && !n.isUint && !n.isFloat { + return nil, fmt.Errorf("illegal number syntax: %q", text) + } + return n, nil +} + +// simplifyComplex pulls out any other types that are represented by the complex number. +// These all require that the imaginary part be zero. +func (n *numberNode) simplifyComplex() { + n.isFloat = imag(n.complex128) == 0 + if n.isFloat { + n.float64 = real(n.complex128) + n.isInt = float64(int64(n.float64)) == n.float64 + if n.isInt { + n.int64 = int64(n.float64) + } + n.isUint = float64(uint64(n.float64)) == n.float64 + if n.isUint { + n.uint64 = uint64(n.float64) + } + } +} + +func (n *numberNode) String() string { + return fmt.Sprintf("N=%s", n.text) +} + +// stringNode holds a quoted string. +type stringNode struct { + nodeType + text string +} + +func newString(text string) *stringNode { + return &stringNode{nodeType: nodeString, text: text} +} + +func (s *stringNode) String() string { + return fmt.Sprintf("S=%#q", s.text) +} + +// endNode represents an {{end}} action. It is represented by a nil pointer. +type endNode bool + +func newEnd() *endNode { + return nil +} + +func (e *endNode) typ() nodeType { + return nodeEnd +} + +func (e *endNode) String() string { + return "{{end}}" +} + +// elseNode represents an {{else}} action. +type elseNode struct { + nodeType + line int +} + +func newElse(line int) *elseNode { + return &elseNode{nodeType: nodeElse, line: line} +} + +func (e *elseNode) typ() nodeType { + return nodeElse +} + +func (e *elseNode) String() string { + return "{{else}}" +} +// ifNode represents an {{if}} action and its commands. +// TODO: what should evaluation look like? is a pipeline enough? +type ifNode struct { + nodeType + line int + pipeline []*commandNode + list *listNode + elseList *listNode +} + +func newIf(line int, pipeline []*commandNode, list, elseList *listNode) *ifNode { + return &ifNode{nodeType: nodeIf, line: line, pipeline: pipeline, list: list, elseList: elseList} +} + +func (i *ifNode) String() string { + if i.elseList != nil { + return fmt.Sprintf("({{if %s}} %s {{else}} %s)", i.pipeline, i.list, i.elseList) + } + return fmt.Sprintf("({{if %s}} %s)", i.pipeline, i.list) +} + +// rangeNode represents a {{range}} action and its commands. +type rangeNode struct { + nodeType + line int + pipeline []*commandNode + list *listNode + elseList *listNode +} + +func newRange(line int, pipeline []*commandNode, list, elseList *listNode) *rangeNode { + return &rangeNode{nodeType: nodeRange, line: line, pipeline: pipeline, list: list, elseList: elseList} +} + +func (r *rangeNode) String() string { + if r.elseList != nil { + return fmt.Sprintf("({{range %s}} %s {{else}} %s)", r.pipeline, r.list, r.elseList) + } + return fmt.Sprintf("({{range %s}} %s)", r.pipeline, r.list) +} + +// templateNode represents a {{template}} action. +type templateNode struct { + nodeType + line int + name node + pipeline []*commandNode +} + +func newTemplate(line int, name node, pipeline []*commandNode) *templateNode { + return &templateNode{nodeType: nodeTemplate, line: line, name: name, pipeline: pipeline} +} + +func (t *templateNode) String() string { + return fmt.Sprintf("{{template %s %s}}", t.name, t.pipeline) +} + +// withNode represents a {{with}} action and its commands. +type withNode struct { + nodeType + line int + pipeline []*commandNode + list *listNode + elseList *listNode +} + +func newWith(line int, pipeline []*commandNode, list, elseList *listNode) *withNode { + return &withNode{nodeType: nodeWith, line: line, pipeline: pipeline, list: list, elseList: elseList} +} + +func (w *withNode) String() string { + if w.elseList != nil { + return fmt.Sprintf("({{with %s}} %s {{else}} %s)", w.pipeline, w.list, w.elseList) + } + return fmt.Sprintf("({{with %s}} %s)", w.pipeline, w.list) +} + + +// Parsing. + +// New allocates a new template with the given name. +func New(name string) *Template { + return &Template{ + name: name, + funcs: make(map[string]reflect.Value), + } +} + +// Funcs adds to the template's function map the elements of the +// argument map. It panics if a value in the map is not a function +// with appropriate return type. +// The return value is the template, so calls can be chained. +func (t *Template) Funcs(funcMap FuncMap) *Template { + addFuncs(t.funcs, funcMap) + return t +} + +// errorf formats the error and terminates processing. +func (t *Template) errorf(format string, args ...interface{}) { + t.root = nil + format = fmt.Sprintf("template: %s:%d: %s", t.name, t.lex.lineNumber(), format) + panic(fmt.Errorf(format, args...)) +} + +// error terminates processing. +func (t *Template) error(err os.Error) { + t.errorf("%s", err) +} + +// expect consumes the next token and guarantees it has the required type. +func (t *Template) expect(expected itemType, context string) item { + token := t.next() + if token.typ != expected { + t.errorf("expected %s in %s; got %s", expected, context, token) + } + return token +} + +// unexpected complains about the token and terminates processing. +func (t *Template) unexpected(token item, context string) { + t.errorf("unexpected %s in %s", token, context) +} + +// recover is the handler that turns panics into returns from the top +// level of Parse or Execute. +func (t *Template) recover(errp *os.Error) { + e := recover() + if e != nil { + if _, ok := e.(runtime.Error); ok { + panic(e) + } + t.stopParse() + *errp = e.(os.Error) + } + return +} + +// startParse starts the template parsing from the lexer. +func (t *Template) startParse(set *Set, lex *lexer) { + t.root = nil + t.set = set + t.lex = lex +} + +// stopParse terminates parsing. +func (t *Template) stopParse() { + t.set, t.lex = nil, nil +} + +// atEOF returns true if, possibly after spaces, we're at EOF. +func (t *Template) atEOF() bool { + for { + token := t.peek() + switch token.typ { + case itemEOF: + return true + case itemText: + for _, r := range token.val { + if !unicode.IsSpace(r) { + return false + } + } + t.next() // skip spaces. + continue + } + break + } + return false +} + +// Parse parses the template definition string to construct an internal representation +// of the template for execution. +func (t *Template) Parse(s string) (err os.Error) { + t.startParse(nil, lex(t.name, s)) + defer t.recover(&err) + t.parse(true) + t.stopParse() + return +} + +// ParseInSet parses the template definition string to construct an internal representation +// of the template for execution. Function bindings are checked against those in the set. +func (t *Template) ParseInSet(s string, set *Set) (err os.Error) { + t.startParse(set, lex(t.name, s)) + defer t.recover(&err) + t.parse(true) + t.stopParse() + return +} + +// parse is the helper for Parse. It triggers an error if we expect EOF but don't reach it. +func (t *Template) parse(toEOF bool) (next node) { + t.root, next = t.itemList(true) + if toEOF && next != nil { + t.errorf("unexpected %s", next) + } + return next +} + +// itemList: +// textOrAction* +// Terminates at EOF and at {{end}} or {{else}}, which is returned separately. +// The toEOF flag tells whether we expect to reach EOF. +func (t *Template) itemList(toEOF bool) (list *listNode, next node) { + list = newList() + for t.peek().typ != itemEOF { + n := t.textOrAction() + switch n.typ() { + case nodeEnd, nodeElse: + return list, n + } + list.append(n) + } + if !toEOF { + t.unexpected(t.next(), "input") + } + return list, nil +} + +// textOrAction: +// text | action +func (t *Template) textOrAction() node { + switch token := t.next(); token.typ { + case itemText: + return newText(token.val) + case itemLeftDelim: + return t.action() + default: + t.unexpected(token, "input") + } + return nil +} + +// Action: +// control +// command ("|" command)* +// Left delim is past. Now get actions. +// First word could be a keyword such as range. +func (t *Template) action() (n node) { + switch token := t.next(); token.typ { + case itemElse: + return t.elseControl() + case itemEnd: + return t.endControl() + case itemIf: + return t.ifControl() + case itemRange: + return t.rangeControl() + case itemTemplate: + return t.templateControl() + case itemWith: + return t.withControl() + } + t.backup() + return newAction(t.lex.lineNumber(), t.pipeline("command")) +} + +// Pipeline: +// field or command +// pipeline "|" pipeline +func (t *Template) pipeline(context string) (pipe []*commandNode) { + for { + switch token := t.next(); token.typ { + case itemRightDelim: + if len(pipe) == 0 { + t.errorf("missing value for %s", context) + } + return + case itemBool, itemComplex, itemDot, itemField, itemIdentifier, itemNumber, itemRawString, itemString: + t.backup() + pipe = append(pipe, t.command()) + default: + t.unexpected(token, context) + } + } + return +} + +func (t *Template) parseControl(context string) (lineNum int, pipe []*commandNode, list, elseList *listNode) { + lineNum = t.lex.lineNumber() + pipe = t.pipeline(context) + var next node + list, next = t.itemList(false) + switch next.typ() { + case nodeEnd: //done + case nodeElse: + elseList, next = t.itemList(false) + if next.typ() != nodeEnd { + t.errorf("expected end; found %s", next) + } + elseList = elseList + } + return lineNum, pipe, list, elseList +} + +// If: +// {{if pipeline}} itemList {{end}} +// {{if pipeline}} itemList {{else}} itemList {{end}} +// If keyword is past. +func (t *Template) ifControl() node { + return newIf(t.parseControl("if")) +} + +// Range: +// {{range pipeline}} itemList {{end}} +// {{range pipeline}} itemList {{else}} itemList {{end}} +// Range keyword is past. +func (t *Template) rangeControl() node { + return newRange(t.parseControl("range")) +} + +// With: +// {{with pipeline}} itemList {{end}} +// {{with pipeline}} itemList {{else}} itemList {{end}} +// If keyword is past. +func (t *Template) withControl() node { + return newWith(t.parseControl("with")) +} + + +// End: +// {{end}} +// End keyword is past. +func (t *Template) endControl() node { + t.expect(itemRightDelim, "end") + return newEnd() +} + +// Else: +// {{else}} +// Else keyword is past. +func (t *Template) elseControl() node { + t.expect(itemRightDelim, "else") + return newElse(t.lex.lineNumber()) +} + +// Template: +// {{template stringValue pipeline}} +// Template keyword is past. The name must be something that can evaluate +// to a string. +func (t *Template) templateControl() node { + var name node + switch token := t.next(); token.typ { + case itemIdentifier: + if _, ok := findFunction(token.val, t, t.set); !ok { + t.errorf("function %q not defined", token.val) + } + name = newIdentifier(token.val) + case itemDot: + name = newDot() + case itemField: + name = newField(token.val) + case itemString, itemRawString: + s, err := strconv.Unquote(token.val) + if err != nil { + t.error(err) + } + name = newString(s) + default: + t.unexpected(token, "template invocation") + } + pipeline := t.pipeline("template") + return newTemplate(t.lex.lineNumber(), name, pipeline) +} + +// command: +// space-separated arguments up to a pipeline character or right delimiter. +// we consume the pipe character but leave the right delim to terminate the action. +func (t *Template) command() *commandNode { + cmd := newCommand() +Loop: + for { + switch token := t.next(); token.typ { + case itemRightDelim: + t.backup() + break Loop + case itemPipe: + break Loop + case itemError: + t.errorf("%s", token.val) + case itemIdentifier: + if _, ok := findFunction(token.val, t, t.set); !ok { + t.errorf("function %q not defined", token.val) + } + cmd.append(newIdentifier(token.val)) + case itemDot: + cmd.append(newDot()) + case itemField: + cmd.append(newField(token.val)) + case itemBool: + cmd.append(newBool(token.val == "true")) + case itemComplex, itemNumber: + number, err := newNumber(token.val, token.typ == itemComplex) + if err != nil { + t.error(err) + } + cmd.append(number) + case itemString, itemRawString: + s, err := strconv.Unquote(token.val) + if err != nil { + t.error(err) + } + cmd.append(newString(s)) + default: + t.unexpected(token, "command") + } + } + if len(cmd.args) == 0 { + t.errorf("empty command") + } + return cmd +} diff --git a/src/pkg/exp/template/parse_test.go b/src/pkg/exp/template/parse_test.go new file mode 100644 index 000000000..71580f8b6 --- /dev/null +++ b/src/pkg/exp/template/parse_test.go @@ -0,0 +1,207 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "flag" + "fmt" + "testing" +) + +var debug = flag.Bool("debug", false, "show the errors produced by the tests") + +type numberTest struct { + text string + isInt bool + isUint bool + isFloat bool + isComplex bool + int64 + uint64 + float64 + complex128 +} + +var numberTests = []numberTest{ + // basics + {"0", true, true, true, false, 0, 0, 0, 0}, + {"-0", true, true, true, false, 0, 0, 0, 0}, // check that -0 is a uint. + {"73", true, true, true, false, 73, 73, 73, 0}, + {"-73", true, false, true, false, -73, 0, -73, 0}, + {"+73", true, false, true, false, 73, 0, 73, 0}, + {"100", true, true, true, false, 100, 100, 100, 0}, + {"1e9", true, true, true, false, 1e9, 1e9, 1e9, 0}, + {"-1e9", true, false, true, false, -1e9, 0, -1e9, 0}, + {"-1.2", false, false, true, false, 0, 0, -1.2, 0}, + {"1e19", false, true, true, false, 0, 1e19, 1e19, 0}, + {"-1e19", false, false, true, false, 0, 0, -1e19, 0}, + {"4i", false, false, false, true, 0, 0, 0, 4i}, + {"-1.2+4.2i", false, false, false, true, 0, 0, 0, -1.2 + 4.2i}, + // complex with 0 imaginary are float (and maybe integer) + {"0i", true, true, true, true, 0, 0, 0, 0}, + {"-1.2+0i", false, false, true, true, 0, 0, -1.2, -1.2}, + {"-12+0i", true, false, true, true, -12, 0, -12, -12}, + {"13+0i", true, true, true, true, 13, 13, 13, 13}, + // funny bases + {"0123", true, true, true, false, 0123, 0123, 0123, 0}, + {"-0x0", true, true, true, false, 0, 0, 0, 0}, + {"0xdeadbeef", true, true, true, false, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0}, + // some broken syntax + {text: "+-2"}, + {text: "0x123."}, + {text: "1e."}, + {text: "0xi."}, + {text: "1+2."}, +} + +func TestNumberParse(t *testing.T) { + for _, test := range numberTests { + // If fmt.Sscan thinks it's complex, it's complex. We can't trust the output + // because imaginary comes out as a number. + var c complex128 + _, err := fmt.Sscan(test.text, &c) + n, err := newNumber(test.text, err == nil) + ok := test.isInt || test.isUint || test.isFloat || test.isComplex + if ok && err != nil { + t.Errorf("unexpected error for %q", test.text) + continue + } + if !ok && err == nil { + t.Errorf("expected error for %q", test.text) + continue + } + if !ok { + continue + } + if n.isComplex != test.isComplex { + t.Errorf("complex incorrect for %q; should be %t", test.text, test.isComplex) + } + if test.isInt { + if !n.isInt { + t.Errorf("expected integer for %q", test.text) + } + if n.int64 != test.int64 { + t.Errorf("int64 for %q should be %d is %d", test.text, test.int64, n.int64) + } + } else if n.isInt { + t.Errorf("did not expect integer for %q", test.text) + } + if test.isUint { + if !n.isUint { + t.Errorf("expected unsigned integer for %q", test.text) + } + if n.uint64 != test.uint64 { + t.Errorf("uint64 for %q should be %d is %d", test.text, test.uint64, n.uint64) + } + } else if n.isUint { + t.Errorf("did not expect unsigned integer for %q", test.text) + } + if test.isFloat { + if !n.isFloat { + t.Errorf("expected float for %q", test.text) + } + if n.float64 != test.float64 { + t.Errorf("float64 for %q should be %g is %g", test.text, test.float64, n.float64) + } + } else if n.isFloat { + t.Errorf("did not expect float for %q", test.text) + } + if test.isComplex { + if !n.isComplex { + t.Errorf("expected complex for %q", test.text) + } + if n.complex128 != test.complex128 { + t.Errorf("complex128 for %q should be %g is %g", test.text, test.complex128, n.complex128) + } + } else if n.isComplex { + t.Errorf("did not expect complex for %q", test.text) + } + } +} + +type parseTest struct { + name string + input string + ok bool + result string +} + +const ( + noError = true + hasError = false +) + +var parseTests = []parseTest{ + {"empty", "", noError, + `[]`}, + {"spaces", " \t\n", noError, + `[(text: " \t\n")]`}, + {"text", "some text", noError, + `[(text: "some text")]`}, + {"emptyAction", "{{}}", hasError, + `[(action: [])]`}, + {"field", "{{.X}}", noError, + `[(action: [(command: [F=[X]])])]`}, + {"simple command", "{{printf}}", noError, + `[(action: [(command: [I=printf])])]`}, + {"multi-word command", "{{printf `%d` 23}}", noError, + "[(action: [(command: [I=printf S=`%d` N=23])])]"}, + {"pipeline", "{{.X|.Y}}", noError, + `[(action: [(command: [F=[X]]) (command: [F=[Y]])])]`}, + {"simple if", "{{if .X}}hello{{end}}", noError, + `[({{if [(command: [F=[X]])]}} [(text: "hello")])]`}, + {"if with else", "{{if .X}}true{{else}}false{{end}}", noError, + `[({{if [(command: [F=[X]])]}} [(text: "true")] {{else}} [(text: "false")])]`}, + {"simple range", "{{range .X}}hello{{end}}", noError, + `[({{range [(command: [F=[X]])]}} [(text: "hello")])]`}, + {"chained field range", "{{range .X.Y.Z}}hello{{end}}", noError, + `[({{range [(command: [F=[X Y Z]])]}} [(text: "hello")])]`}, + {"nested range", "{{range .X}}hello{{range .Y}}goodbye{{end}}{{end}}", noError, + `[({{range [(command: [F=[X]])]}} [(text: "hello")({{range [(command: [F=[Y]])]}} [(text: "goodbye")])])]`}, + {"range with else", "{{range .X}}true{{else}}false{{end}}", noError, + `[({{range [(command: [F=[X]])]}} [(text: "true")] {{else}} [(text: "false")])]`}, + {"range over pipeline", "{{range .X|.M}}true{{else}}false{{end}}", noError, + `[({{range [(command: [F=[X]]) (command: [F=[M]])]}} [(text: "true")] {{else}} [(text: "false")])]`}, + {"range []int", "{{range .SI}}{{.}}{{end}}", noError, + `[({{range [(command: [F=[SI]])]}} [(action: [(command: [{{<.>}}])])])]`}, + {"constants", "{{range .SI 1 -3.2i true false }}{{end}}", noError, + `[({{range [(command: [F=[SI] N=1 N=-3.2i B=true B=false])]}} [])]`}, + {"template", "{{template `x` .Y}}", noError, + "[{{template S=`x` [(command: [F=[Y]])]}}]"}, + {"with", "{{with .X}}hello{{end}}", noError, + `[({{with [(command: [F=[X]])]}} [(text: "hello")])]`}, + {"with with else", "{{with .X}}hello{{else}}goodbye{{end}}", noError, + `[({{with [(command: [F=[X]])]}} [(text: "hello")] {{else}} [(text: "goodbye")])]`}, + // Errors. + {"unclosed action", "hello{{range", hasError, ""}, + {"missing end", "hello{{range .x}}", hasError, ""}, + {"missing end after else", "hello{{range .x}}{{else}}", hasError, ""}, + {"undefined function", "hello{{undefined}}", hasError, ""}, +} + +func TestParse(t *testing.T) { + for _, test := range parseTests { + tmpl := New(test.name) + err := tmpl.Parse(test.input) + switch { + case err == nil && !test.ok: + t.Errorf("%q: expected error; got none", test.name) + continue + case err != nil && test.ok: + t.Errorf("%q: unexpected error: %v", test.name, err) + continue + case err != nil && !test.ok: + // expected error, got one + if *debug { + fmt.Printf("%s: %s\n\t%s\n", test.name, test.input, err) + } + continue + } + result := tmpl.root.String() + if result != test.result { + t.Errorf("%s=(%q): got\n\t%v\nexpected\n\t%v", test.name, test.input, result, test.result) + } + } +} diff --git a/src/pkg/exp/template/set.go b/src/pkg/exp/template/set.go new file mode 100644 index 000000000..492e270e1 --- /dev/null +++ b/src/pkg/exp/template/set.go @@ -0,0 +1,115 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "fmt" + "io" + "os" + "reflect" + "runtime" + "strconv" +) + +// Set holds a set of related templates that can refer to one another by name. +// A template may be a member of multiple sets. +type Set struct { + tmpl map[string]*Template + funcs map[string]reflect.Value +} + +// NewSet allocates a new, empty template set. +func NewSet() *Set { + return &Set{ + tmpl: make(map[string]*Template), + funcs: make(map[string]reflect.Value), + } +} + +// Funcs adds to the set's function map the elements of the +// argument map. It panics if a value in the map is not a function +// with appropriate return type. +// The return value is the set, so calls can be chained. +func (s *Set) Funcs(funcMap FuncMap) *Set { + addFuncs(s.funcs, funcMap) + return s +} + +// Add adds the argument templates to the set. It panics if the call +// attempts to reuse a name defined in the template. +// The return value is the set, so calls can be chained. +func (s *Set) Add(templates ...*Template) *Set { + for _, t := range templates { + if _, ok := s.tmpl[t.name]; ok { + panic(fmt.Errorf("template: %q already defined in set", t.name)) + } + s.tmpl[t.name] = t + } + return s +} + +// Template returns the template with the given name in the set, +// or nil if there is no such template. +func (s *Set) Template(name string) *Template { + return s.tmpl[name] +} + +// Execute looks for the named template in the set and then applies that +// template to the specified data object, writing the output to wr. Nested +// template invocations will be resolved from the set. +func (s *Set) Execute(name string, wr io.Writer, data interface{}) os.Error { + tmpl := s.tmpl[name] + if tmpl == nil { + return fmt.Errorf("template: no template %q in set", name) + } + return tmpl.ExecuteInSet(wr, data, s) +} + +// recover is the handler that turns panics into returns from the top +// level of Parse. +func (s *Set) recover(errp *os.Error) { + e := recover() + if e != nil { + if _, ok := e.(runtime.Error); ok { + panic(e) + } + s.tmpl = nil + *errp = e.(os.Error) + } + return +} + +// Parse parses the file into a set of named templates. +func (s *Set) Parse(text string) (err os.Error) { + defer s.recover(&err) + lex := lex("set", text) + const context = "define clause" + for { + t := New("set") // name will be updated once we know it. + t.startParse(s, lex) + // Expect EOF or "{{ define name }}". + if t.atEOF() { + return + } + t.expect(itemLeftDelim, context) + t.expect(itemDefine, context) + name := t.expect(itemString, context) + t.name, err = strconv.Unquote(name.val) + if err != nil { + t.error(err) + } + t.expect(itemRightDelim, context) + end := t.parse(false) + if end == nil { + t.errorf("unexpected EOF in %s", context) + } + if end.typ() != nodeEnd { + t.errorf("unexpected %s in %s", end, context) + } + t.stopParse() + s.tmpl[t.name] = t + } + return nil +} diff --git a/src/pkg/exp/template/set_test.go b/src/pkg/exp/template/set_test.go new file mode 100644 index 000000000..c0115ec0a --- /dev/null +++ b/src/pkg/exp/template/set_test.go @@ -0,0 +1,101 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package template + +import ( + "fmt" + "testing" +) + +type setParseTest struct { + name string + input string + ok bool + names []string + results []string +} + +var setParseTests = []setParseTest{ + {"empty", "", noError, + nil, + nil}, + {"one", `{{define "foo"}} FOO {{end}}`, noError, + []string{"foo"}, + []string{`[(text: " FOO ")]`}}, + {"two", `{{define "foo"}} FOO {{end}}{{define "bar"}} BAR {{end}}`, noError, + []string{"foo", "bar"}, + []string{`[(text: " FOO ")]`, `[(text: " BAR ")]`}}, + // errors + {"missing end", `{{define "foo"}} FOO `, hasError, + nil, + nil}, + {"malformed name", `{{define "foo}} FOO `, hasError, + nil, + nil}, +} + +func TestSetParse(t *testing.T) { + for _, test := range setParseTests { + set := NewSet() + err := set.Parse(test.input) + switch { + case err == nil && !test.ok: + t.Errorf("%q: expected error; got none", test.name) + continue + case err != nil && test.ok: + t.Errorf("%q: unexpected error: %v", test.name, err) + continue + case err != nil && !test.ok: + // expected error, got one + if *debug { + fmt.Printf("%s: %s\n\t%s\n", test.name, test.input, err) + } + continue + } + if len(set.tmpl) != len(test.names) { + t.Errorf("%s: wrong number of templates; wanted %d got %d", test.name, len(test.names), len(set.tmpl)) + continue + } + for i, name := range test.names { + tmpl, ok := set.tmpl[name] + if !ok { + t.Errorf("%s: can't find template %q", test.name, name) + continue + } + result := tmpl.root.String() + if result != test.results[i] { + t.Errorf("%s=(%q): got\n\t%v\nexpected\n\t%v", test.name, test.input, result, test.results[i]) + } + } + } +} + + +var setExecTests = []execTest{ + {"empty", "", "", nil, true}, + {"text", "some text", "some text", nil, true}, + {"invoke text", `{{template "text" .SI}}`, "TEXT", tVal, true}, + {"invoke dot int", `{{template "dot" .I}}`, "17", tVal, true}, + {"invoke dot []int", `{{template "dot" .SI}}`, "[3 4 5]", tVal, true}, + {"invoke dotV", `{{template "dotV" .U}}`, "v", tVal, true}, + {"invoke nested int", `{{template "nested" .I}}`, "17", tVal, true}, +} + +const setText = ` + {{define "text"}}TEXT{{end}} + {{define "dotV"}}{{.V}}{{end}} + {{define "dot"}}{{.}}{{end}} + {{define "nested"}}{{template "dot" .}}{{end}} +` + +func TestSetExecute(t *testing.T) { + // Declare a set with a couple of templates first. + set := NewSet() + err := set.Parse(setText) + if err != nil { + t.Fatalf("error parsing set: %s", err) + } + testExecute(setExecTests, set, t) +} |