summaryrefslogtreecommitdiff
path: root/src/pkg
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2009-08-11 13:54:47 -0700
committerRob Pike <r@golang.org>2009-08-11 13:54:47 -0700
commit84fefa79c535d19e45ba9828862892e18cd40eb7 (patch)
treecd9d6159de384b3bb7b81120bc18058fb7d76a22 /src/pkg
parent40e692180efc8fdff513ffce47f6fdb8463dc314 (diff)
downloadgolang-84fefa79c535d19e45ba9828862892e18cd40eb7.tar.gz
make a simpler regexp implementation with fewer dependencies and put it inside testing.
remove "regexp." from regexp tests. R=rsc DELTA=1173 (1152 added, 1 deleted, 20 changed) OCL=33028 CL=33037
Diffstat (limited to 'src/pkg')
-rw-r--r--src/pkg/regexp/all_test.go28
-rw-r--r--src/pkg/testing/Makefile15
-rw-r--r--src/pkg/testing/regexp.go863
-rw-r--r--src/pkg/testing/regexp_test.go281
-rw-r--r--src/pkg/testing/testing.go8
5 files changed, 1174 insertions, 21 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 72355e91b..c985dad42 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -37,18 +37,18 @@ type stringError struct {
err os.Error;
}
var bad_re = []stringError{
- stringError{ `*`, regexp.ErrBareClosure },
- stringError{ `(abc`, regexp.ErrUnmatchedLpar },
- stringError{ `abc)`, regexp.ErrUnmatchedRpar },
- stringError{ `x[a-z`, regexp.ErrUnmatchedLbkt },
- stringError{ `abc]`, regexp.ErrUnmatchedRbkt },
- stringError{ `[z-a]`, regexp.ErrBadRange },
- stringError{ `abc\`, regexp.ErrExtraneousBackslash },
- stringError{ `a**`, regexp.ErrBadClosure },
- stringError{ `a*+`, regexp.ErrBadClosure },
- stringError{ `a??`, regexp.ErrBadClosure },
- stringError{ `*`, regexp.ErrBareClosure },
- stringError{ `\x`, regexp.ErrBadBackslash },
+ stringError{ `*`, ErrBareClosure },
+ stringError{ `(abc`, ErrUnmatchedLpar },
+ stringError{ `abc)`, ErrUnmatchedRpar },
+ stringError{ `x[a-z`, ErrUnmatchedLbkt },
+ stringError{ `abc]`, ErrUnmatchedRbkt },
+ stringError{ `[z-a]`, ErrBadRange },
+ stringError{ `abc\`, ErrExtraneousBackslash },
+ stringError{ `a**`, ErrBadClosure },
+ stringError{ `a*+`, ErrBadClosure },
+ stringError{ `a??`, ErrBadClosure },
+ stringError{ `*`, ErrBareClosure },
+ stringError{ `\x`, ErrBadBackslash },
}
type vec []int;
@@ -88,8 +88,8 @@ var matches = []tester {
tester{ `a*(|(b))c*`, "aacc", vec{0,4, 2,2, -1,-1} },
}
-func compileTest(t *testing.T, expr string, error os.Error) *regexp.Regexp {
- re, err := regexp.Compile(expr);
+func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
+ re, err := Compile(expr);
if err != error {
t.Error("compiling `", expr, "`; unexpected error: ", err.String());
}
diff --git a/src/pkg/testing/Makefile b/src/pkg/testing/Makefile
index 493eba6f2..eab8a4cf0 100644
--- a/src/pkg/testing/Makefile
+++ b/src/pkg/testing/Makefile
@@ -2,6 +2,7 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
+
# DO NOT EDIT. Automatically generated by gobuild.
# gobuild -m >Makefile
@@ -20,7 +21,7 @@ test: packages
coverage: packages
gotest
- 6cov -g `pwd` | grep -v '_test\.go:'
+ 6cov -g $$(pwd) | grep -v '_test\.go:'
%.$O: %.go
$(GC) -I_obj $*.go
@@ -32,16 +33,23 @@ coverage: packages
$(AS) $*.s
O1=\
+ regexp.$O\
+
+O2=\
testing.$O\
-phases: a1
+phases: a1 a2
_obj$D/testing.a: phases
a1: $(O1)
- $(AR) grc _obj$D/testing.a testing.$O
+ $(AR) grc _obj$D/testing.a regexp.$O
rm -f $(O1)
+a2: $(O2)
+ $(AR) grc _obj$D/testing.a testing.$O
+ rm -f $(O2)
+
newpkg: clean
mkdir -p _obj$D
@@ -49,6 +57,7 @@ newpkg: clean
$(O1): newpkg
$(O2): a1
+$(O3): a2
nuke: clean
rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/testing.a
diff --git a/src/pkg/testing/regexp.go b/src/pkg/testing/regexp.go
new file mode 100644
index 000000000..ae698264f
--- /dev/null
+++ b/src/pkg/testing/regexp.go
@@ -0,0 +1,863 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The testing package implements a simple regular expression library.
+// It is a reduced version of the regular expression package suitable
+// for use in tests; it avoids many dependencies.
+//
+// The syntax of the regular expressions accepted is:
+//
+// regexp:
+// concatenation { '|' concatenation }
+// concatenation:
+// { closure }
+// closure:
+// term [ '*' | '+' | '?' ]
+// term:
+// '^'
+// '$'
+// '.'
+// character
+// '[' [ '^' ] character-ranges ']'
+// '(' regexp ')'
+//
+package testing
+
+import (
+ "runtime";
+ "utf8";
+)
+
+var debug = false;
+
+// Error codes returned by failures to parse an expression.
+var (
+ ErrInternal = "internal error";
+ ErrUnmatchedLpar = "unmatched ''";
+ ErrUnmatchedRpar = "unmatched ''";
+ ErrUnmatchedLbkt = "unmatched '['";
+ ErrUnmatchedRbkt = "unmatched ']'";
+ ErrBadRange = "bad range in character class";
+ ErrExtraneousBackslash = "extraneous backslash";
+ ErrBadClosure = "repeated closure **, ++, etc.";
+ ErrBareClosure = "closure applies to nothing";
+ ErrBadBackslash = "illegal backslash escape";
+)
+
+// An instruction executed by the NFA
+type instr interface {
+ kind() int; // the type of this instruction: _CHAR, _ANY, etc.
+ next() instr; // the instruction to execute after this one
+ setNext(i instr);
+ index() int;
+ setIndex(i int);
+ print();
+}
+
+// Fields and methods common to all instructions
+type common struct {
+ _next instr;
+ _index int;
+}
+
+func (c *common) next() instr { return c._next }
+func (c *common) setNext(i instr) { c._next = i }
+func (c *common) index() int { return c._index }
+func (c *common) setIndex(i int) { c._index = i }
+
+// The representation of a compiled regular expression.
+// The public interface is entirely through methods.
+type Regexp struct {
+ expr string; // the original expression
+ ch chan<- *Regexp; // reply channel when we're done
+ error string; // compile- or run-time error; nil if OK
+ inst []instr;
+ start instr;
+ nbra int; // number of brackets in expression, for subexpressions
+}
+
+const (
+ _START // beginning of program
+ = iota;
+ _END; // end of program: success
+ _BOT; // '^' beginning of text
+ _EOT; // '$' end of text
+ _CHAR; // 'a' regular character
+ _CHARCLASS; // [a-z] character class
+ _ANY; // '.' any character including newline
+ _NOTNL; // [^\n] special case: any character but newline
+ _BRA; // '(' parenthesized expression
+ _EBRA; // ')'; end of '(' parenthesized expression
+ _ALT; // '|' alternation
+ _NOP; // do nothing; makes it easy to link without patching
+)
+
+// --- START start of program
+type _Start struct {
+ common
+}
+
+func (start *_Start) kind() int { return _START }
+func (start *_Start) print() { print("start") }
+
+// --- END end of program
+type _End struct {
+ common
+}
+
+func (end *_End) kind() int { return _END }
+func (end *_End) print() { print("end") }
+
+// --- BOT beginning of text
+type _Bot struct {
+ common
+}
+
+func (bot *_Bot) kind() int { return _BOT }
+func (bot *_Bot) print() { print("bot") }
+
+// --- EOT end of text
+type _Eot struct {
+ common
+}
+
+func (eot *_Eot) kind() int { return _EOT }
+func (eot *_Eot) print() { print("eot") }
+
+// --- CHAR a regular character
+type _Char struct {
+ common;
+ char int;
+}
+
+func (char *_Char) kind() int { return _CHAR }
+func (char *_Char) print() { print("char ", string(char.char)) }
+
+func newChar(char int) *_Char {
+ c := new(_Char);
+ c.char = char;
+ return c;
+}
+
+// --- CHARCLASS [a-z]
+
+type _CharClass struct {
+ common;
+ char int;
+ negate bool; // is character class negated? ([^a-z])
+ // stored pairwise: [a-z] is (a,z); x is (x,x):
+ ranges []int;
+}
+
+func (cclass *_CharClass) kind() int { return _CHARCLASS }
+
+func (cclass *_CharClass) print() {
+ print("charclass");
+ if cclass.negate {
+ print(" (negated)");
+ }
+ for i := 0; i < len(cclass.ranges); i += 2 {
+ l := cclass.ranges[i];
+ r := cclass.ranges[i+1];
+ if l == r {
+ print(" [", string(l), "]");
+ } else {
+ print(" [", string(l), "-", string(r), "]");
+ }
+ }
+}
+
+func (cclass *_CharClass) addRange(a, b int) {
+ // range is a through b inclusive
+ n := len(cclass.ranges);
+ if n >= cap(cclass.ranges) {
+ nr := make([]int, n, 2*n);
+ for i, j := range nr {
+ nr[i] = j
+ }
+ cclass.ranges = nr;
+ }
+ cclass.ranges = cclass.ranges[0:n+2];
+ cclass.ranges[n] = a;
+ n++;
+ cclass.ranges[n] = b;
+ n++;
+}
+
+func (cclass *_CharClass) matches(c int) bool {
+ for i := 0; i < len(cclass.ranges); i = i+2 {
+ min := cclass.ranges[i];
+ max := cclass.ranges[i+1];
+ if min <= c && c <= max {
+ return !cclass.negate
+ }
+ }
+ return cclass.negate
+}
+
+func newCharClass() *_CharClass {
+ c := new(_CharClass);
+ c.ranges = make([]int, 0, 20);
+ return c;
+}
+
+// --- ANY any character
+type _Any struct {
+ common
+}
+
+func (any *_Any) kind() int { return _ANY }
+func (any *_Any) print() { print("any") }
+
+// --- NOTNL any character but newline
+type _NotNl struct {
+ common
+}
+
+func (notnl *_NotNl) kind() int { return _NOTNL }
+func (notnl *_NotNl) print() { print("notnl") }
+
+// --- BRA parenthesized expression
+type _Bra struct {
+ common;
+ n int; // subexpression number
+}
+
+func (bra *_Bra) kind() int { return _BRA }
+func (bra *_Bra) print() { print("bra", bra.n); }
+
+// --- EBRA end of parenthesized expression
+type _Ebra struct {
+ common;
+ n int; // subexpression number
+}
+
+func (ebra *_Ebra) kind() int { return _EBRA }
+func (ebra *_Ebra) print() { print("ebra ", ebra.n); }
+
+// --- ALT alternation
+type _Alt struct {
+ common;
+ left instr; // other branch
+}
+
+func (alt *_Alt) kind() int { return _ALT }
+func (alt *_Alt) print() { print("alt(", alt.left.index(), ")"); }
+
+// --- NOP no operation
+type _Nop struct {
+ common
+}
+
+func (nop *_Nop) kind() int { return _NOP }
+func (nop *_Nop) print() { print("nop") }
+
+// report error and exit compiling/executing goroutine
+func (re *Regexp) setError(err string) {
+ re.error = err;
+ re.ch <- re;
+ runtime.Goexit();
+}
+
+func (re *Regexp) add(i instr) instr {
+ n := len(re.inst);
+ i.setIndex(len(re.inst));
+ if n >= cap(re.inst) {
+ ni := make([]instr, n, 2*n);
+ for i, j := range ni {
+ ni[i] = j
+ }
+ re.inst = ni;
+ }
+ re.inst = re.inst[0:n+1];
+ re.inst[n] = i;
+ return i;
+}
+
+type parser struct {
+ re *Regexp;
+ nlpar int; // number of unclosed lpars
+ pos int;
+ ch int;
+}
+
+const endOfFile = -1
+
+func (p *parser) c() int {
+ return p.ch;
+}
+
+func (p *parser) nextc() int {
+ if p.pos >= len(p.re.expr) {
+ p.ch = endOfFile
+ } else {
+ c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:len(p.re.expr)]);
+ p.ch = c;
+ p.pos += w;
+ }
+ return p.ch;
+}
+
+func newParser(re *Regexp) *parser {
+ p := new(parser);
+ p.re = re;
+ p.nextc(); // load p.ch
+ return p;
+}
+
+func (p *parser) regexp() (start, end instr)
+
+var iNULL instr
+
+func special(c int) bool {
+ s := `\.+*?()|[]^$`;
+ for i := 0; i < len(s); i++ {
+ if c == int(s[i]) {
+ return true
+ }
+ }
+ return false
+}
+
+func specialcclass(c int) bool {
+ s := `\-[]`;
+ for i := 0; i < len(s); i++ {
+ if c == int(s[i]) {
+ return true
+ }
+ }
+ return false
+}
+
+func (p *parser) charClass() instr {
+ cc := newCharClass();
+ if p.c() == '^' {
+ cc.negate = true;
+ p.nextc();
+ }
+ left := -1;
+ for {
+ switch c := p.c(); c {
+ case ']', endOfFile:
+ if left >= 0 {
+ p.re.setError(ErrBadRange);
+ }
+ // Is it [^\n]?
+ if cc.negate && len(cc.ranges) == 2 &&
+ cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
+ nl := new(_NotNl);
+ p.re.add(nl);
+ return nl;
+ }
+ p.re.add(cc);
+ return cc;
+ case '-': // do this before backslash processing
+ p.re.setError(ErrBadRange);
+ case '\\':
+ c = p.nextc();
+ switch {
+ case c == endOfFile:
+ p.re.setError(ErrExtraneousBackslash);
+ case c == 'n':
+ c = '\n';
+ case specialcclass(c):
+ // c is as delivered
+ default:
+ p.re.setError(ErrBadBackslash);
+ }
+ fallthrough;
+ default:
+ p.nextc();
+ switch {
+ case left < 0: // first of pair
+ if p.c() == '-' { // range
+ p.nextc();
+ left = c;
+ } else { // single char
+ cc.addRange(c, c);
+ }
+ case left <= c: // second of pair
+ cc.addRange(left, c);
+ left = -1;
+ default:
+ p.re.setError(ErrBadRange);
+ }
+ }
+ }
+ return iNULL
+}
+
+func (p *parser) term() (start, end instr) {
+ switch c := p.c(); c {
+ case '|', endOfFile:
+ return iNULL, iNULL;
+ case '*', '+':
+ p.re.setError(ErrBareClosure);
+ case ')':
+ if p.nlpar == 0 {
+ p.re.setError(ErrUnmatchedRpar);
+ }
+ return iNULL, iNULL;
+ case ']':
+ p.re.setError(ErrUnmatchedRbkt);
+ case '^':
+ p.nextc();
+ start = p.re.add(new(_Bot));
+ return start, start;
+ case '$':
+ p.nextc();
+ start = p.re.add(new(_Eot));
+ return start, start;
+ case '.':
+ p.nextc();
+ start = p.re.add(new(_Any));
+ return start, start;
+ case '[':
+ p.nextc();
+ start = p.charClass();
+ if p.c() != ']' {
+ p.re.setError(ErrUnmatchedLbkt);
+ }
+ p.nextc();
+ return start, start;
+ case '(':
+ p.nextc();
+ p.nlpar++;
+ p.re.nbra++; // increment first so first subexpr is \1
+ nbra := p.re.nbra;
+ start, end = p.regexp();
+ if p.c() != ')' {
+ p.re.setError(ErrUnmatchedLpar);
+ }
+ p.nlpar--;
+ p.nextc();
+ bra := new(_Bra);
+ p.re.add(bra);
+ ebra := new(_Ebra);
+ p.re.add(ebra);
+ bra.n = nbra;
+ ebra.n = nbra;
+ if start == iNULL {
+ if end == iNULL {
+ p.re.setError(ErrInternal)
+ }
+ start = ebra
+ } else {
+ end.setNext(ebra);
+ }
+ bra.setNext(start);
+ return bra, ebra;
+ case '\\':
+ c = p.nextc();
+ switch {
+ case c == endOfFile:
+ p.re.setError(ErrExtraneousBackslash);
+ case c == 'n':
+ c = '\n';
+ case special(c):
+ // c is as delivered
+ default:
+ p.re.setError(ErrBadBackslash);
+ }
+ fallthrough;
+ default:
+ p.nextc();
+ start = newChar(c);
+ p.re.add(start);
+ return start, start
+ }
+ panic("unreachable");
+}
+
+func (p *parser) closure() (start, end instr) {
+ start, end = p.term();
+ if start == iNULL {
+ return
+ }
+ switch p.c() {
+ case '*':
+ // (start,end)*:
+ alt := new(_Alt);
+ p.re.add(alt);
+ end.setNext(alt); // after end, do alt
+ alt.left = start; // alternate brach: return to start
+ start = alt; // alt becomes new (start, end)
+ end = alt;
+ case '+':
+ // (start,end)+:
+ alt := new(_Alt);
+ p.re.add(alt);
+ end.setNext(alt); // after end, do alt
+ alt.left = start; // alternate brach: return to start
+ end = alt; // start is unchanged; end is alt
+ case '?':
+ // (start,end)?:
+ alt := new(_Alt);
+ p.re.add(alt);
+ nop := new(_Nop);
+ p.re.add(nop);
+ alt.left = start; // alternate branch is start
+ alt.setNext(nop); // follow on to nop
+ end.setNext(nop); // after end, go to nop
+ start = alt; // start is now alt
+ end = nop; // end is nop pointed to by both branches
+ default:
+ return
+ }
+ switch p.nextc() {
+ case '*', '+', '?':
+ p.re.setError(ErrBadClosure);
+ }
+ return
+}
+
+func (p *parser) concatenation() (start, end instr) {
+ start, end = iNULL, iNULL;
+ for {
+ nstart, nend := p.closure();
+ switch {
+ case nstart == iNULL: // end of this concatenation
+ if start == iNULL { // this is the empty string
+ nop := p.re.add(new(_Nop));
+ return nop, nop;
+ }
+ return;
+ case start == iNULL: // this is first element of concatenation
+ start, end = nstart, nend;
+ default:
+ end.setNext(nstart);
+ end = nend;
+ }
+ }
+ panic("unreachable");
+}
+
+func (p *parser) regexp() (start, end instr) {
+ start, end = p.concatenation();
+ for {
+ switch p.c() {
+ default:
+ return;
+ case '|':
+ p.nextc();
+ nstart, nend := p.concatenation();
+ alt := new(_Alt);
+ p.re.add(alt);
+ alt.left = start;
+ alt.setNext(nstart);
+ nop := new(_Nop);
+ p.re.add(nop);
+ end.setNext(nop);
+ nend.setNext(nop);
+ start, end = alt, nop;
+ }
+ }
+ panic("unreachable");
+}
+
+func unNop(i instr) instr {
+ for i.kind() == _NOP {
+ i = i.next()
+ }
+ return i
+}
+
+func (re *Regexp) eliminateNops() {
+ for i := 0; i < len(re.inst); i++ {
+ inst := re.inst[i];
+ if inst.kind() == _END {
+ continue
+ }
+ inst.setNext(unNop(inst.next()));
+ if inst.kind() == _ALT {
+ alt := inst.(*_Alt);
+ alt.left = unNop(alt.left);
+ }
+ }
+}
+
+func (re *Regexp) dump() {
+ for i := 0; i < len(re.inst); i++ {
+ inst := re.inst[i];
+ print(inst.index(), ": ");
+ inst.print();
+ if inst.kind() != _END {
+ print(" -> ", inst.next().index())
+ }
+ print("\n");
+ }
+}
+
+func (re *Regexp) doParse() {
+ p := newParser(re);
+ start := new(_Start);
+ re.add(start);
+ s, e := p.regexp();
+ start.setNext(s);
+ re.start = start;
+ e.setNext(re.add(new(_End)));
+
+ if debug {
+ re.dump();
+ println();
+ }
+
+ re.eliminateNops();
+
+ if debug {
+ re.dump();
+ println();
+ }
+}
+
+
+func compiler(str string, ch chan *Regexp) {
+ re := new(Regexp);
+ re.expr = str;
+ re.inst = make([]instr, 0, 20);
+ re.ch = ch;
+ re.doParse();
+ ch <- re;
+}
+
+// CompileRegexp parses a regular expression and returns, if successful, a Regexp
+// object that can be used to match against text.
+func CompileRegexp(str string) (regexp *Regexp, error string) {
+ // Compile in a separate goroutine and wait for the result.
+ ch := make(chan *Regexp);
+ go compiler(str, ch);
+ re := <-ch;
+ return re, re.error
+}
+
+type state struct {
+ inst instr; // next instruction to execute
+ match []int; // pairs of bracketing submatches. 0th is start,end
+}
+
+// Append new state to to-do list. Leftmost-longest wins so avoid
+// adding a state that's already active.
+func addState(s []state, inst instr, match []int) []state {
+ index := inst.index();
+ l := len(s);
+ pos := match[0];
+ // TODO: Once the state is a vector and we can do insert, have inputs always
+ // go in order correctly and this "earlier" test is never necessary,
+ for i := 0; i < l; i++ {
+ if s[i].inst.index() == index && // same instruction
+ s[i].match[0] < pos { // earlier match already going; lefmost wins
+ return s
+ }
+ }
+ if l == cap(s) {
+ s1 := make([]state, 2*l)[0:l];
+ for i := 0; i < l; i++ {
+ s1[i] = s[i];
+ }
+ s = s1;
+ }
+ s = s[0:l+1];
+ s[l].inst = inst;
+ s[l].match = match;
+ return s;
+}
+
+// Accepts either string or bytes - the logic is identical either way.
+// If bytes == nil, scan str.
+func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
+ var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
+ s[0] = make([]state, 10)[0:0];
+ s[1] = make([]state, 10)[0:0];
+ in, out := 0, 1;
+ var final state;
+ found := false;
+ end := len(str);
+ if bytes != nil {
+ end = len(bytes)
+ }
+ for pos <= end {
+ if !found {
+ // prime the pump if we haven't seen a match yet
+ match := make([]int, 2*(re.nbra+1));
+ for i := 0; i < len(match); i++ {
+ match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
+ }
+ match[0] = pos;
+ s[out] = addState(s[out], re.start.next(), match);
+ }
+ in, out = out, in; // old out state is new in state
+ s[out] = s[out][0:0]; // clear out state
+ if len(s[in]) == 0 {
+ // machine has completed
+ break;
+ }
+ charwidth := 1;
+ c := endOfFile;
+ if pos < end {
+ if bytes == nil {
+ c, charwidth = utf8.DecodeRuneInString(str[pos:end]);
+ } else {
+ c, charwidth = utf8.DecodeRune(bytes[pos:end]);
+ }
+ }
+ for i := 0; i < len(s[in]); i++ {
+ st := s[in][i];
+ switch s[in][i].inst.kind() {
+ case _BOT:
+ if pos == 0 {
+ s[in] = addState(s[in], st.inst.next(), st.match)
+ }
+ case _EOT:
+ if pos == end {
+ s[in] = addState(s[in], st.inst.next(), st.match)
+ }
+ case _CHAR:
+ if c == st.inst.(*_Char).char {
+ s[out] = addState(s[out], st.inst.next(), st.match)
+ }
+ case _CHARCLASS:
+ if st.inst.(*_CharClass).matches(c) {
+ s[out] = addState(s[out], st.inst.next(), st.match)
+ }
+ case _ANY:
+ if c != endOfFile {
+ s[out] = addState(s[out], st.inst.next(), st.match)
+ }
+ case _NOTNL:
+ if c != endOfFile && c != '\n' {
+ s[out] = addState(s[out], st.inst.next(), st.match)
+ }
+ case _BRA:
+ n := st.inst.(*_Bra).n;
+ st.match[2*n] = pos;
+ s[in] = addState(s[in], st.inst.next(), st.match);
+ case _EBRA:
+ n := st.inst.(*_Ebra).n;
+ st.match[2*n+1] = pos;
+ s[in] = addState(s[in], st.inst.next(), st.match);
+ case _ALT:
+ s[in] = addState(s[in], st.inst.(*_Alt).left, st.match);
+ // give other branch a copy of this match vector
+ s1 := make([]int, 2*(re.nbra+1));
+ for i := 0; i < len(s1); i++ {
+ s1[i] = st.match[i]
+ }
+ s[in] = addState(s[in], st.inst.next(), s1);
+ case _END:
+ // choose leftmost longest
+ if !found || // first
+ st.match[0] < final.match[0] || // leftmost
+ (st.match[0] == final.match[0] && pos > final.match[1]) { // longest
+ final = st;
+ final.match[1] = pos;
+ }
+ found = true;
+ default:
+ st.inst.print();
+ panic("unknown instruction in execute");
+ }
+ }
+ pos += charwidth;
+ }
+ return final.match;
+}
+
+
+// ExecuteString matches the Regexp against the string s.
+// The return value is an array of integers, in pairs, identifying the positions of
+// substrings matched by the expression.
+// s[a[0]:a[1]] is the substring matched by the entire expression.
+// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression.
+// A negative value means the subexpression did not match any element of the string.
+// An empty array means "no match".
+func (re *Regexp) ExecuteString(s string) (a []int) {
+ return re.doExecute(s, nil, 0)
+}
+
+
+// Execute matches the Regexp against the byte slice b.
+// The return value is an array of integers, in pairs, identifying the positions of
+// subslices matched by the expression.
+// b[a[0]:a[1]] is the subslice matched by the entire expression.
+// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression.
+// A negative value means the subexpression did not match any element of the slice.
+// An empty array means "no match".
+func (re *Regexp) Execute(b []byte) (a []int) {
+ return re.doExecute("", b, 0)
+}
+
+
+// MatchString returns whether the Regexp matches the string s.
+// The return value is a boolean: true for match, false for no match.
+func (re *Regexp) MatchString(s string) bool {
+ return len(re.doExecute(s, nil, 0)) > 0
+}
+
+
+// Match returns whether the Regexp matches the byte slice b.
+// The return value is a boolean: true for match, false for no match.
+func (re *Regexp) Match(b []byte) bool {
+ return len(re.doExecute("", b, 0)) > 0
+}
+
+
+// MatchStrings matches the Regexp against the string s.
+// The return value is an array of strings matched by the expression.
+// a[0] is the substring matched by the entire expression.
+// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression.
+// An empty array means ``no match''.
+func (re *Regexp) MatchStrings(s string) (a []string) {
+ r := re.doExecute(s, nil, 0);
+ if r == nil {
+ return nil
+ }
+ a = make([]string, len(r)/2);
+ for i := 0; i < len(r); i += 2 {
+ if r[i] != -1 { // -1 means no match for this subexpression
+ a[i/2] = s[r[i] : r[i+1]]
+ }
+ }
+ return
+}
+
+// MatchSlices matches the Regexp against the byte slice b.
+// The return value is an array of subslices matched by the expression.
+// a[0] is the subslice matched by the entire expression.
+// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression.
+// An empty array means ``no match''.
+func (re *Regexp) MatchSlices(b []byte) (a [][]byte) {
+ r := re.doExecute("", b, 0);
+ if r == nil {
+ return nil
+ }
+ a = make([][]byte, len(r)/2);
+ for i := 0; i < len(r); i += 2 {
+ if r[i] != -1 { // -1 means no match for this subexpression
+ a[i/2] = b[r[i] : r[i+1]]
+ }
+ }
+ return
+}
+
+// MatchString checks whether a textual regular expression
+// matches a string. More complicated queries need
+// to use Compile and the full Regexp interface.
+func MatchString(pattern string, s string) (matched bool, error string) {
+ re, err := CompileRegexp(pattern);
+ if err != "" {
+ return false, err
+ }
+ return re.MatchString(s), ""
+}
+
+// Match checks whether a textual regular expression
+// matches a byte slice. More complicated queries need
+// to use Compile and the full Regexp interface.
+func Match(pattern string, b []byte) (matched bool, error string) {
+ re, err := CompileRegexp(pattern);
+ if err != "" {
+ return false, err
+ }
+ return re.Match(b), ""
+}
diff --git a/src/pkg/testing/regexp_test.go b/src/pkg/testing/regexp_test.go
new file mode 100644
index 000000000..d72ca19d4
--- /dev/null
+++ b/src/pkg/testing/regexp_test.go
@@ -0,0 +1,281 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package testing
+
+import (
+ "strings";
+ "testing";
+)
+
+var good_re = []string{
+ ``,
+ `.`,
+ `^.$`,
+ `a`,
+ `a*`,
+ `a+`,
+ `a?`,
+ `a|b`,
+ `a*|b*`,
+ `(a*|b)(c*|d)`,
+ `[a-z]`,
+ `[a-abc-c\-\]\[]`,
+ `[a-z]+`,
+ `[]`,
+ `[abc]`,
+ `[^1234]`,
+ `[^\n]`,
+}
+
+// TODO: nice to do this with a map
+type stringError struct {
+ re string;
+ err string;
+}
+var bad_re = []stringError{
+ stringError{ `*`, ErrBareClosure },
+ stringError{ `(abc`, ErrUnmatchedLpar },
+ stringError{ `abc)`, ErrUnmatchedRpar },
+ stringError{ `x[a-z`, ErrUnmatchedLbkt },
+ stringError{ `abc]`, ErrUnmatchedRbkt },
+ stringError{ `[z-a]`, ErrBadRange },
+ stringError{ `abc\`, ErrExtraneousBackslash },
+ stringError{ `a**`, ErrBadClosure },
+ stringError{ `a*+`, ErrBadClosure },
+ stringError{ `a??`, ErrBadClosure },
+ stringError{ `*`, ErrBareClosure },
+ stringError{ `\x`, ErrBadBackslash },
+}
+
+type vec []int;
+
+type tester struct {
+ re string;
+ text string;
+ match vec;
+}
+
+var matches = []tester {
+ tester{ ``, "", vec{0,0} },
+ tester{ `a`, "a", vec{0,1} },
+ tester{ `x`, "y", vec{} },
+ tester{ `b`, "abc", vec{1,2} },
+ tester{ `.`, "a", vec{0,1} },
+ tester{ `.*`, "abcdef", vec{0,6} },
+ tester{ `^abcd$`, "abcd", vec{0,4} },
+ tester{ `^bcd'`, "abcdef", vec{} },
+ tester{ `^abcd$`, "abcde", vec{} },
+ tester{ `a+`, "baaab", vec{1,4} },
+ tester{ `a*`, "baaab", vec{0,0} },
+ tester{ `[a-z]+`, "abcd", vec{0,4} },
+ tester{ `[^a-z]+`, "ab1234cd", vec{2,6} },
+ tester{ `[a\-\]z]+`, "az]-bcz", vec{0,4} },
+ tester{ `[^\n]+`, "abcd\n", vec{0,4} },
+ tester{ `[日本語]+`, "日本語日本語", vec{0,18} },
+ tester{ `()`, "", vec{0,0, 0,0} },
+ tester{ `(a)`, "a", vec{0,1, 0,1} },
+ tester{ `(.)(.)`, "日a", vec{0,4, 0,3, 3,4} },
+ tester{ `(.*)`, "", vec{0,0, 0,0} },
+ tester{ `(.*)`, "abcd", vec{0,4, 0,4} },
+ tester{ `(..)(..)`, "abcd", vec{0,4, 0,2, 2,4} },
+ tester{ `(([^xyz]*)(d))`, "abcd", vec{0,4, 0,4, 0,3, 3,4} },
+ tester{ `((a|b|c)*(d))`, "abcd", vec{0,4, 0,4, 2,3, 3,4} },
+ tester{ `(((a|b|c)*)(d))`, "abcd", vec{0,4, 0,4, 0,3, 2,3, 3,4} },
+ tester{ `a*(|(b))c*`, "aacc", vec{0,4, 2,2, -1,-1} },
+}
+
+func compileTest(t *testing.T, expr string, error string) *Regexp {
+ re, err := CompileRegexp(expr);
+ if err != error {
+ t.Error("compiling `", expr, "`; unexpected error: ", err);
+ }
+ return re
+}
+
+func printVec(t *testing.T, m []int) {
+ l := len(m);
+ if l == 0 {
+ t.Log("\t<no match>");
+ } else {
+ for i := 0; i < l; i = i+2 {
+ t.Log("\t", m[i], ",", m[i+1])
+ }
+ }
+}
+
+func printStrings(t *testing.T, m []string) {
+ l := len(m);
+ if l == 0 {
+ t.Log("\t<no match>");
+ } else {
+ for i := 0; i < l; i = i+2 {
+ t.Logf("\t%q", m[i])
+ }
+ }
+}
+
+func printBytes(t *testing.T, b [][]byte) {
+ l := len(b);
+ if l == 0 {
+ t.Log("\t<no match>");
+ } else {
+ for i := 0; i < l; i = i+2 {
+ t.Logf("\t%q", b[i])
+ }
+ }
+}
+
+func equal(m1, m2 []int) bool {
+ l := len(m1);
+ if l != len(m2) {
+ return false
+ }
+ for i := 0; i < l; i++ {
+ if m1[i] != m2[i] {
+ return false
+ }
+ }
+ return true
+}
+
+func equalStrings(m1, m2 []string) bool {
+ l := len(m1);
+ if l != len(m2) {
+ return false
+ }
+ for i := 0; i < l; i++ {
+ if m1[i] != m2[i] {
+ return false
+ }
+ }
+ return true
+}
+
+func equalBytes(m1 [][]byte, m2 []string) bool {
+ l := len(m1);
+ if l != len(m2) {
+ return false
+ }
+ for i := 0; i < l; i++ {
+ if string(m1[i]) != m2[i] {
+ return false
+ }
+ }
+ return true
+}
+
+func executeTest(t *testing.T, expr string, str string, match []int) {
+ re := compileTest(t, expr, "");
+ if re == nil {
+ return
+ }
+ m := re.ExecuteString(str);
+ if !equal(m, match) {
+ t.Error("ExecuteString failure on `", expr, "` matching `", str, "`:");
+ printVec(t, m);
+ t.Log("should be:");
+ printVec(t, match);
+ }
+ // now try bytes
+ m = re.Execute(strings.Bytes(str));
+ if !equal(m, match) {
+ t.Error("Execute failure on `", expr, "` matching `", str, "`:");
+ printVec(t, m);
+ t.Log("should be:");
+ printVec(t, match);
+ }
+}
+
+func TestGoodCompile(t *testing.T) {
+ for i := 0; i < len(good_re); i++ {
+ compileTest(t, good_re[i], "");
+ }
+}
+
+func TestBadCompile(t *testing.T) {
+ for i := 0; i < len(bad_re); i++ {
+ compileTest(t, bad_re[i].re, bad_re[i].err)
+ }
+}
+
+func TestExecute(t *testing.T) {
+ for i := 0; i < len(matches); i++ {
+ test := &matches[i];
+ executeTest(t, test.re, test.text, test.match)
+ }
+}
+
+func matchTest(t *testing.T, expr string, str string, match []int) {
+ re := compileTest(t, expr, "");
+ if re == nil {
+ return
+ }
+ m := re.MatchString(str);
+ if m != (len(match) > 0) {
+ t.Error("MatchString failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0);
+ }
+ // now try bytes
+ m = re.Match(strings.Bytes(str));
+ if m != (len(match) > 0) {
+ t.Error("Match failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0);
+ }
+}
+
+func TestMatch(t *testing.T) {
+ for i := 0; i < len(matches); i++ {
+ test := &matches[i];
+ matchTest(t, test.re, test.text, test.match)
+ }
+}
+
+func matchStringsTest(t *testing.T, expr string, str string, match []int) {
+ re := compileTest(t, expr, "");
+ if re == nil {
+ return
+ }
+ strs := make([]string, len(match)/2);
+ for i := 0; i < len(match); i++ {
+ strs[i/2] = str[match[i] : match[i+1]]
+ }
+ m := re.MatchStrings(str);
+ if !equalStrings(m, strs) {
+ t.Error("MatchStrings failure on `", expr, "` matching `", str, "`:");
+ printStrings(t, m);
+ t.Log("should be:");
+ printStrings(t, strs);
+ }
+ // now try bytes
+ s := re.MatchSlices(strings.Bytes(str));
+ if !equalBytes(s, strs) {
+ t.Error("MatchSlices failure on `", expr, "` matching `", str, "`:");
+ printBytes(t, s);
+ t.Log("should be:");
+ printStrings(t, strs);
+ }
+}
+
+func TestMatchStrings(t *testing.T) {
+ for i := 0; i < len(matches); i++ {
+ test := &matches[i];
+ matchTest(t, test.re, test.text, test.match)
+ }
+}
+
+func matchFunctionTest(t *testing.T, expr string, str string, match []int) {
+ m, err := MatchString(expr, str);
+ if err == "" {
+ return
+ }
+ if m != (len(match) > 0) {
+ t.Error("function Match failure on `", expr, "` matching `", str, "`:", m, "should be", len(match) > 0);
+ }
+}
+
+func TestMatchFunction(t *testing.T) {
+ for i := 0; i < len(matches); i++ {
+ test := &matches[i];
+ matchFunctionTest(t, test.re, test.text, test.match)
+ }
+}
diff --git a/src/pkg/testing/testing.go b/src/pkg/testing/testing.go
index e4608ef72..276129393 100644
--- a/src/pkg/testing/testing.go
+++ b/src/pkg/testing/testing.go
@@ -15,8 +15,8 @@ import (
"flag";
"fmt";
"os";
- "regexp";
"runtime";
+ "testing";
)
// Report as tests are run; default is silent for success.
@@ -122,9 +122,9 @@ func Main(tests []Test) {
if len(tests) == 0 {
println("testing: warning: no tests to run");
}
- re, err := regexp.Compile(*match);
- if err != nil {
- println("invalid regexp for -match:", err.String());
+ re, err := CompileRegexp(*match);
+ if err != "" {
+ println("invalid regexp for -match:", err);
os.Exit(1);
}
for i := 0; i < len(tests); i++ {