summaryrefslogtreecommitdiff
path: root/src/lib/regexp
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2008-10-14 19:22:17 -0700
committerRob Pike <r@golang.org>2008-10-14 19:22:17 -0700
commite1033402ca1125d3517e79f1dbabae9b763ec90c (patch)
treef06f7331ad0775154f3e460e2785d9ad897120e1 /src/lib/regexp
parente36cd17c697c44d238ae9bd15dfafda2b8a4d209 (diff)
downloadgolang-e1033402ca1125d3517e79f1dbabae9b763ec90c.tar.gz
move regexp to lib
next cl will update names and add to build R=rsc DELTA=1876 (938 added, 938 deleted, 0 changed) OCL=17149 CL=17166
Diffstat (limited to 'src/lib/regexp')
-rw-r--r--src/lib/regexp/Makefile20
-rw-r--r--src/lib/regexp/main.go160
-rw-r--r--src/lib/regexp/regexp.go767
3 files changed, 947 insertions, 0 deletions
diff --git a/src/lib/regexp/Makefile b/src/lib/regexp/Makefile
new file mode 100644
index 000000000..ac466a0f1
--- /dev/null
+++ b/src/lib/regexp/Makefile
@@ -0,0 +1,20 @@
+# Copyright 2009 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+A=6
+G=$(A)g
+L=$(A)l
+
+all: main
+
+main: main.6
+ $L -o main main.6
+
+main.6: regexp.6
+
+clean:
+ rm -f *.6 main
+
+%.6: %.go
+ $G $<
diff --git a/src/lib/regexp/main.go b/src/lib/regexp/main.go
new file mode 100644
index 000000000..c89f9b557
--- /dev/null
+++ b/src/lib/regexp/main.go
@@ -0,0 +1,160 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "os";
+ "regexp";
+)
+
+var good_re = []string{
+ ``,
+ `.`,
+ `^.$`,
+ `a`,
+ `a*`,
+ `a+`,
+ `a?`,
+ `a|b`,
+ `a*|b*`,
+ `(a*|b)(c*|d)`,
+ `[a-z]`,
+ `[a-abc-c\-\]\[]`,
+ `[a-z]+`,
+ `[]`,
+ `[abc]`,
+ `[^1234]`,
+}
+
+// TODO: nice to do this with a map but we don't have an iterator
+type StringError struct {
+ re string;
+ err *os.Error;
+}
+var bad_re = []StringError{
+ StringError{ `*`, regexp.ErrBareClosure },
+ StringError{ `(abc`, regexp.ErrUnmatchedLpar },
+ StringError{ `abc)`, regexp.ErrUnmatchedRpar },
+ StringError{ `x[a-z`, regexp.ErrUnmatchedLbkt },
+ StringError{ `abc]`, regexp.ErrUnmatchedRbkt },
+ StringError{ `[z-a]`, regexp.ErrBadRange },
+ StringError{ `abc\`, regexp.ErrExtraneousBackslash },
+ StringError{ `a**`, regexp.ErrBadClosure },
+ StringError{ `a*+`, regexp.ErrBadClosure },
+ StringError{ `a??`, regexp.ErrBadClosure },
+ StringError{ `*`, regexp.ErrBareClosure },
+ StringError{ `\x`, regexp.ErrBadBackslash },
+}
+
+type Vec [20]int;
+
+type Tester struct {
+ re string;
+ text string;
+ match Vec;
+}
+
+const END = -1000
+
+var matches = []Tester {
+ Tester{ ``, "", Vec{0,0, END} },
+ Tester{ `a`, "a", Vec{0,1, END} },
+ Tester{ `b`, "abc", Vec{1,2, END} },
+ Tester{ `.`, "a", Vec{0,1, END} },
+ Tester{ `.*`, "abcdef", Vec{0,6, END} },
+ Tester{ `^abcd$`, "abcd", Vec{0,4, END} },
+ Tester{ `^bcd'`, "abcdef", Vec{END} },
+ Tester{ `^abcd$`, "abcde", Vec{END} },
+ Tester{ `a+`, "baaab", Vec{1,4, END} },
+ Tester{ `a*`, "baaab", Vec{0,0, END} },
+ Tester{ `[a-z]+`, "abcd", Vec{0,4, END} },
+ Tester{ `[^a-z]+`, "ab1234cd", Vec{2,6, END} },
+ Tester{ `[a\-\]z]+`, "az]-bcz", Vec{0,4, END} },
+ Tester{ `[日本語]+`, "日本語日本語", Vec{0,18, END} },
+ Tester{ `()`, "", Vec{0,0, 0,0, END} },
+ Tester{ `(a)`, "a", Vec{0,1, 0,1, END} },
+ Tester{ `(.)(.)`, "日a", Vec{0,4, 0,3, 3,4, END} },
+ Tester{ `(.*)`, "", Vec{0,0, 0,0, END} },
+ Tester{ `(.*)`, "abcd", Vec{0,4, 0,4, END} },
+ Tester{ `(..)(..)`, "abcd", Vec{0,4, 0,2, 2,4, END} },
+ Tester{ `(([^xyz]*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 3,4, END} },
+ Tester{ `((a|b|c)*(d))`, "abcd", Vec{0,4, 0,4, 2,3, 3,4, END} },
+ Tester{ `(((a|b|c)*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 2,3, 3,4, END} },
+ Tester{ `a*(|(b))c*`, "aacc", Vec{0,4, 2,2, -1,-1, END} },
+}
+
+func Compile(expr string, error *os.Error) regexp.Regexp {
+ re, err := regexp.Compile(expr);
+ if err != error {
+ print("compiling `", expr, "`; unexpected error: ", err.String(), "\n");
+ sys.exit(1);
+ }
+ return re
+}
+
+func MarkedLen(m *[] int) int {
+ if m == nil {
+ return 0
+ }
+ var i int;
+ for i = 0; i < len(m) && m[i] != END; i = i+2 {
+ }
+ return i
+}
+
+func PrintVec(m *[] int) {
+ l := MarkedLen(m);
+ if l == 0 {
+ print("<no match>");
+ } else {
+ for i := 0; i < l && m[i] != END; i = i+2 {
+ print(m[i], ",", m[i+1], " ")
+ }
+ }
+}
+
+func Equal(m1, m2 *[]int) bool {
+ l := MarkedLen(m1);
+ if l != MarkedLen(m2) {
+ return false
+ }
+ for i := 0; i < l; i++ {
+ if m1[i] != m2[i] {
+ return false
+ }
+ }
+ return true
+}
+
+func Match(expr string, str string, match *[]int) {
+ re := Compile(expr, nil);
+ m := re.Execute(str);
+ if !Equal(m, match) {
+ print("failure on `", expr, "` matching `", str, "`:\n");
+ PrintVec(m);
+ print("\nshould be:\n");
+ PrintVec(match);
+ print("\n");
+ sys.exit(1);
+ }
+}
+
+func main() {
+ //regexp.debug = true;
+ if sys.argc() > 1 {
+ Compile(sys.argv(1), nil);
+ sys.exit(0);
+ }
+ for i := 0; i < len(good_re); i++ {
+ Compile(good_re[i], nil);
+ }
+ for i := 0; i < len(bad_re); i++ {
+ Compile(bad_re[i].re, bad_re[i].err)
+ }
+ for i := 0; i < len(matches); i++ {
+ t := &matches[i];
+ Match(t.re, t.text, &t.match)
+ }
+}
diff --git a/src/lib/regexp/regexp.go b/src/lib/regexp/regexp.go
new file mode 100644
index 000000000..6535e6ef4
--- /dev/null
+++ b/src/lib/regexp/regexp.go
@@ -0,0 +1,767 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression library.
+
+package regexp
+
+import (
+ "os";
+ "vector";
+)
+
+export var debug = false;
+
+
+export var ErrUnimplemented = os.NewError("unimplemented");
+export var ErrInternal = os.NewError("internal error");
+export var ErrUnmatchedLpar = os.NewError("unmatched '('");
+export var ErrUnmatchedRpar = os.NewError("unmatched ')'");
+export var ErrUnmatchedLbkt = os.NewError("unmatched '['");
+export var ErrUnmatchedRbkt = os.NewError("unmatched ']'");
+export var ErrBadRange = os.NewError("bad range in character class");
+export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
+export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
+export var ErrBareClosure = os.NewError("closure applies to nothing");
+export var ErrBadBackslash = os.NewError("illegal backslash escape");
+
+// An instruction executed by the NFA
+type Inst interface {
+ Type() int; // the type of this instruction: CHAR, ANY, etc.
+ Next() Inst; // the instruction to execute after this one
+ SetNext(i Inst);
+ Index() int;
+ SetIndex(i int);
+ Print();
+}
+
+type RE struct {
+ expr string; // the original expression
+ ch *chan<- *RE; // reply channel when we're done
+ error *os.Error; // compile- or run-time error; nil if OK
+ inst *vector.Vector;
+ start Inst;
+ nbra int; // number of brackets in expression, for subexpressions
+}
+
+const (
+ START // beginning of program
+ = iota;
+ END; // end of program: success
+ BOT; // '^' beginning of text
+ EOT; // '$' end of text
+ CHAR; // 'a' regular character
+ CHARCLASS; // [a-z] character class
+ ANY; // '.' any character
+ BRA; // '(' parenthesized expression
+ EBRA; // ')'; end of '(' parenthesized expression
+ ALT; // '|' alternation
+ NOP; // do nothing; makes it easy to link without patching
+)
+
+// --- START start of program
+type Start struct {
+ next Inst;
+ index int;
+}
+
+func (start *Start) Type() int { return START }
+func (start *Start) Next() Inst { return start.next }
+func (start *Start) SetNext(i Inst) { start.next = i }
+func (start *Start) Index() int { return start.index }
+func (start *Start) SetIndex(i int) { start.index = i }
+func (start *Start) Print() { print("start") }
+
+// --- END end of program
+type End struct {
+ next Inst;
+ index int;
+}
+
+func (end *End) Type() int { return END }
+func (end *End) Next() Inst { return end.next }
+func (end *End) SetNext(i Inst) { end.next = i }
+func (end *End) Index() int { return end.index }
+func (end *End) SetIndex(i int) { end.index = i }
+func (end *End) Print() { print("end") }
+
+// --- BOT beginning of text
+type Bot struct {
+ next Inst;
+ index int;
+}
+
+func (bot *Bot) Type() int { return BOT }
+func (bot *Bot) Next() Inst { return bot.next }
+func (bot *Bot) SetNext(i Inst) { bot.next = i }
+func (bot *Bot) Index() int { return bot.index }
+func (bot *Bot) SetIndex(i int) { bot.index = i }
+func (bot *Bot) Print() { print("bot") }
+
+// --- EOT end of text
+type Eot struct {
+ next Inst;
+ index int;
+}
+
+func (eot *Eot) Type() int { return EOT }
+func (eot *Eot) Next() Inst { return eot.next }
+func (eot *Eot) SetNext(i Inst) { eot.next = i }
+func (eot *Eot) Index() int { return eot.index }
+func (eot *Eot) SetIndex(i int) { eot.index = i }
+func (eot *Eot) Print() { print("eot") }
+
+// --- CHAR a regular character
+type Char struct {
+ next Inst;
+ index int;
+ char int;
+}
+
+func (char *Char) Type() int { return CHAR }
+func (char *Char) Next() Inst { return char.next }
+func (char *Char) SetNext(i Inst) { char.next = i }
+func (char *Char) Index() int { return char.index }
+func (char *Char) SetIndex(i int) { char.index = i }
+func (char *Char) Print() { print("char ", string(char.char)) }
+
+func NewChar(char int) *Char {
+ c := new(Char);
+ c.char = char;
+ return c;
+}
+
+// --- CHARCLASS [a-z]
+
+type CClassChar int; // BUG: Shouldn't be necessary but 6g won't put ints into vectors
+
+type CharClass struct {
+ next Inst;
+ index int;
+ char int;
+ negate bool; // is character class negated? ([^a-z])
+ // Vector of CClassChar, stored pairwise: [a-z] is (a,z); x is (x,x):
+ ranges *vector.Vector;
+}
+
+func (cclass *CharClass) Type() int { return CHARCLASS }
+func (cclass *CharClass) Next() Inst { return cclass.next }
+func (cclass *CharClass) SetNext(i Inst) { cclass.next = i }
+func (cclass *CharClass) Index() int { return cclass.index }
+func (cclass *CharClass) SetIndex(i int) { cclass.index = i }
+func (cclass *CharClass) Print() {
+ print("charclass");
+ if cclass.negate {
+ print(" (negated)");
+ }
+ for i := 0; i < cclass.ranges.Len(); i += 2 {
+ l := cclass.ranges.At(i).(CClassChar);
+ r := cclass.ranges.At(i+1).(CClassChar);
+ if l == r {
+ print(" [", string(l), "]");
+ } else {
+ print(" [", string(l), "-", string(r), "]");
+ }
+ }
+}
+
+func (cclass *CharClass) AddRange(a, b CClassChar) {
+ // range is a through b inclusive
+ cclass.ranges.Append(a);
+ cclass.ranges.Append(b);
+}
+
+func (cclass *CharClass) Matches(c int) bool {
+ for i := 0; i < cclass.ranges.Len(); i = i+2 {
+ min := cclass.ranges.At(i).(CClassChar);
+ max := cclass.ranges.At(i+1).(CClassChar);
+ if min <= c && c <= max {
+ return !cclass.negate
+ }
+ }
+ return cclass.negate
+}
+
+func NewCharClass() *CharClass {
+ c := new(CharClass);
+ c.ranges = vector.New();
+ return c;
+}
+
+// --- ANY any character
+type Any struct {
+ next Inst;
+ index int;
+}
+
+func (any *Any) Type() int { return ANY }
+func (any *Any) Next() Inst { return any.next }
+func (any *Any) SetNext(i Inst) { any.next = i }
+func (any *Any) Index() int { return any.index }
+func (any *Any) SetIndex(i int) { any.index = i }
+func (any *Any) Print() { print("any") }
+
+// --- BRA parenthesized expression
+type Bra struct {
+ next Inst;
+ index int;
+ n int; // subexpression number
+}
+
+func (bra *Bra) Type() int { return BRA }
+func (bra *Bra) Next() Inst { return bra.next }
+func (bra *Bra) SetNext(i Inst) { bra.next = i }
+func (bra *Bra) Index() int { return bra.index }
+func (bra *Bra) SetIndex(i int) { bra.index = i }
+func (bra *Bra) Print() { print("bra"); }
+
+// --- EBRA end of parenthesized expression
+type Ebra struct {
+ next Inst;
+ index int;
+ n int; // subexpression number
+}
+
+func (ebra *Ebra) Type() int { return EBRA }
+func (ebra *Ebra) Next() Inst { return ebra.next }
+func (ebra *Ebra) SetNext(i Inst) { ebra.next = i }
+func (ebra *Ebra) Index() int { return ebra.index }
+func (ebra *Ebra) SetIndex(i int) { ebra.index = i }
+func (ebra *Ebra) Print() { print("ebra ", ebra.n); }
+
+// --- ALT alternation
+type Alt struct {
+ next Inst;
+ index int;
+ left Inst; // other branch
+}
+
+func (alt *Alt) Type() int { return ALT }
+func (alt *Alt) Next() Inst { return alt.next }
+func (alt *Alt) SetNext(i Inst) { alt.next = i }
+func (alt *Alt) Index() int { return alt.index }
+func (alt *Alt) SetIndex(i int) { alt.index = i }
+func (alt *Alt) Print() { print("alt(", alt.left.Index(), ")"); }
+
+// --- NOP no operation
+type Nop struct {
+ next Inst;
+ index int;
+}
+
+func (nop *Nop) Type() int { return NOP }
+func (nop *Nop) Next() Inst { return nop.next }
+func (nop *Nop) SetNext(i Inst) { nop.next = i }
+func (nop *Nop) Index() int { return nop.index }
+func (nop *Nop) SetIndex(i int) { nop.index = i }
+func (nop *Nop) Print() { print("nop") }
+
+// report error and exit compiling/executing goroutine
+func (re *RE) Error(err *os.Error) {
+ re.error = err;
+ re.ch <- re;
+ sys.goexit();
+}
+
+func (re *RE) Add(i Inst) Inst {
+ i.SetIndex(re.inst.Len());
+ re.inst.Append(i);
+ return i;
+}
+
+type Parser struct {
+ re *RE;
+ nlpar int; // number of unclosed lpars
+ pos int;
+ ch int;
+}
+
+const EOF = -1
+
+func (p *Parser) c() int {
+ return p.ch;
+}
+
+func (p *Parser) nextc() int {
+ if p.pos >= len(p.re.expr) {
+ p.ch = EOF
+ } else {
+ c, w := sys.stringtorune(p.re.expr, p.pos);
+ p.ch = c;
+ p.pos += w;
+ }
+ return p.ch;
+}
+
+func NewParser(re *RE) *Parser {
+ parser := new(Parser);
+ parser.re = re;
+ parser.nextc(); // load p.ch
+ return parser;
+}
+
+/*
+
+Grammar:
+ regexp:
+ concatenation { '|' concatenation }
+ concatenation:
+ { closure }
+ closure:
+ term [ '*' | '+' | '?' ]
+ term:
+ '^'
+ '$'
+ '.'
+ character
+ '[' character-ranges ']'
+ '(' regexp ')'
+
+*/
+
+func (p *Parser) Regexp() (start, end Inst)
+
+var NULL Inst
+type BUGinter interface{}
+
+func special(c int) bool {
+ s := `\.+*?()|[]`;
+ for i := 0; i < len(s); i++ {
+ if c == int(s[i]) {
+ return true
+ }
+ }
+ return false
+}
+
+func specialcclass(c int) bool {
+ s := `\-[]`;
+ for i := 0; i < len(s); i++ {
+ if c == int(s[i]) {
+ return true
+ }
+ }
+ return false
+}
+
+func (p *Parser) CharClass() Inst {
+ cc := NewCharClass();
+ p.re.Add(cc);
+ if p.c() == '^' {
+ cc.negate = true;
+ p.nextc();
+ }
+ left := -1;
+ for {
+ switch c := p.c(); c {
+ case ']', EOF:
+ if left >= 0 {
+ p.re.Error(ErrBadRange);
+ }
+ return cc;
+ case '-': // do this before backslash processing
+ p.re.Error(ErrBadRange);
+ case '\\':
+ c = p.nextc();
+ switch {
+ case c == EOF:
+ p.re.Error(ErrExtraneousBackslash);
+ case c == 'n':
+ c = '\n';
+ case specialcclass(c):
+ // c is as delivered
+ default:
+ p.re.Error(ErrBadBackslash);
+ }
+ fallthrough;
+ default:
+ p.nextc();
+ switch {
+ case left < 0: // first of pair
+ if p.c() == '-' { // range
+ p.nextc();
+ left = c;
+ } else { // single char
+ cc.AddRange(c, c);
+ }
+ case left <= c: // second of pair
+ cc.AddRange(left, c);
+ left = -1;
+ default:
+ p.re.Error(ErrBadRange);
+ }
+ }
+ }
+ return NULL
+}
+
+func (p *Parser) Term() (start, end Inst) {
+ switch c := p.c(); c {
+ case '|', EOF:
+ return NULL, NULL;
+ case '*', '+', '|':
+ p.re.Error(ErrBareClosure);
+ case ')':
+ if p.nlpar == 0 {
+ p.re.Error(ErrUnmatchedRpar);
+ }
+ return NULL, NULL;
+ case ']':
+ p.re.Error(ErrUnmatchedRbkt);
+ case '^':
+ p.nextc();
+ start = p.re.Add(new(Bot));
+ return start, start;
+ case '$':
+ p.nextc();
+ start = p.re.Add(new(Eot));
+ return start, start;
+ case '.':
+ p.nextc();
+ start = p.re.Add(new(Any));
+ return start, start;
+ case '[':
+ p.nextc();
+ start = p.CharClass();
+ if p.c() != ']' {
+ p.re.Error(ErrUnmatchedLbkt);
+ }
+ p.nextc();
+ return start, start;
+ case '(':
+ p.nextc();
+ p.nlpar++;
+ p.re.nbra++; // increment first so first subexpr is \1
+ nbra := p.re.nbra;
+ start, end = p.Regexp();
+ if p.c() != ')' {
+ p.re.Error(ErrUnmatchedLpar);
+ }
+ p.nlpar--;
+ p.nextc();
+ bra := new(Bra);
+ p.re.Add(bra);
+ ebra := new(Ebra);
+ p.re.Add(ebra);
+ bra.n = nbra;
+ ebra.n = nbra;
+ if start == NULL {
+ if end == NULL { p.re.Error(ErrInternal) }
+ start = ebra
+ } else {
+ end.SetNext(ebra);
+ }
+ bra.SetNext(start);
+ return bra, ebra;
+ case '\\':
+ c = p.nextc();
+ switch {
+ case c == EOF:
+ p.re.Error(ErrExtraneousBackslash);
+ case c == 'n':
+ c = '\n';
+ case special(c):
+ // c is as delivered
+ default:
+ p.re.Error(ErrBadBackslash);
+ }
+ fallthrough;
+ default:
+ p.nextc();
+ start = NewChar(c);
+ p.re.Add(start);
+ return start, start
+ }
+ panic("unreachable");
+}
+
+func (p *Parser) Closure() (start, end Inst) {
+ start, end = p.Term();
+ if start == NULL {
+ return
+ }
+ switch p.c() {
+ case '*':
+ // (start,end)*:
+ alt := new(Alt);
+ p.re.Add(alt);
+ end.SetNext(alt); // after end, do alt
+ alt.left = start; // alternate brach: return to start
+ start = alt; // alt becomes new (start, end)
+ end = alt;
+ case '+':
+ // (start,end)+:
+ alt := new(Alt);
+ p.re.Add(alt);
+ end.SetNext(alt); // after end, do alt
+ alt.left = start; // alternate brach: return to start
+ end = alt; // start is unchanged; end is alt
+ case '?':
+ // (start,end)?:
+ alt := new(Alt);
+ p.re.Add(alt);
+ nop := new(Nop);
+ p.re.Add(nop);
+ alt.left = start; // alternate branch is start
+ alt.next = nop; // follow on to nop
+ end.SetNext(nop); // after end, go to nop
+ start = alt; // start is now alt
+ end = nop; // end is nop pointed to by both branches
+ default:
+ return
+ }
+ switch p.nextc() {
+ case '*', '+', '?':
+ p.re.Error(ErrBadClosure);
+ }
+ return
+}
+
+func (p *Parser) Concatenation() (start, end Inst) {
+ start, end = NULL, NULL;
+ for {
+ nstart, nend := p.Closure();
+ switch {
+ case nstart == NULL: // end of this concatenation
+ if start == NULL { // this is the empty string
+ nop := p.re.Add(new(Nop));
+ return nop, nop;
+ }
+ return;
+ case start == NULL: // this is first element of concatenation
+ start, end = nstart, nend;
+ default:
+ end.SetNext(nstart);
+ end = nend;
+ }
+ }
+ panic("unreachable");
+}
+
+func (p *Parser) Regexp() (start, end Inst) {
+ start, end = p.Concatenation();
+ for {
+ switch p.c() {
+ default:
+ return;
+ case '|':
+ p.nextc();
+ nstart, nend := p.Concatenation();
+ alt := new(Alt);
+ p.re.Add(alt);
+ alt.left = start;
+ alt.next = nstart;
+ nop := new(Nop);
+ p.re.Add(nop);
+ end.SetNext(nop);
+ nend.SetNext(nop);
+ start, end = alt, nop;
+ }
+ }
+ panic("unreachable");
+}
+
+func UnNop(i Inst) Inst {
+ for i.Type() == NOP {
+ i = i.Next()
+ }
+ return i
+}
+
+func (re *RE) EliminateNops() {
+ for i := 0; i < re.inst.Len(); i++ {
+ inst := re.inst.At(i).(Inst);
+ if inst.Type() == END {
+ continue
+ }
+ inst.SetNext(UnNop(inst.Next()));
+ if inst.Type() == ALT {
+ alt := inst.(*Alt);
+ alt.left = UnNop(alt.left);
+ }
+ }
+}
+
+func (re *RE) Dump() {
+ for i := 0; i < re.inst.Len(); i++ {
+ inst := re.inst.At(i).(Inst);
+ print(inst.Index(), ": ");
+ inst.Print();
+ if inst.Type() != END {
+ print(" -> ", inst.Next().Index())
+ }
+ print("\n");
+ }
+}
+
+func (re *RE) DoParse() {
+ parser := NewParser(re);
+ start := new(Start);
+ re.Add(start);
+ s, e := parser.Regexp();
+ start.next = s;
+ re.start = start;
+ e.SetNext(re.Add(new(End)));
+
+ if debug {
+ re.Dump();
+ println();
+ }
+
+ re.EliminateNops();
+
+ if debug {
+ re.Dump();
+ println();
+ }
+}
+
+
+func Compiler(str string, ch *chan *RE) {
+ re := new(RE);
+ re.expr = str;
+ re.inst = vector.New();
+ re.ch = ch;
+ re.DoParse();
+ ch <- re;
+}
+
+// Public interface has only execute functionality (not yet implemented)
+export type Regexp interface {
+ Execute(s string) *[]int
+}
+
+// Compile in separate goroutine; wait for result
+export func Compile(str string) (regexp Regexp, error *os.Error) {
+ ch := new(chan *RE);
+ go Compiler(str, ch);
+ re := <-ch;
+ return re, re.error
+}
+
+type State struct {
+ inst Inst; // next instruction to execute
+ match *[]int; // pairs of bracketing submatches. 0th is start,end
+}
+
+// Append new state to to-do list. Leftmost-longest wins so avoid
+// adding a state that's already active.
+func AddState(s *[]State, inst Inst, match *[]int) *[]State {
+ index := inst.Index();
+ l := len(s);
+ pos := match[0];
+ // TODO: Once the state is a vector and we can do insert, have inputs always
+ // go in order correctly and this "earlier" test is never necessary,
+ for i := 0; i < l; i++ {
+ if s[i].inst.Index() == index && // same instruction
+ s[i].match[0] < pos { // earlier match already going; lefmost wins
+ return s
+ }
+ }
+ if l == cap(s) {
+ s1 := new([]State, 2*l)[0:l];
+ for i := 0; i < l; i++ {
+ s1[i] = s[i];
+ }
+ s = s1;
+ }
+ s = s[0:l+1];
+ s[l].inst = inst;
+ s[l].match = match;
+ return s;
+}
+
+func (re *RE) DoExecute(str string, pos int) *[]int {
+ var s [2]*[]State; // TODO: use a vector when State values (not ptrs) can be vector elements
+ s[0] = new([]State, 10)[0:0];
+ s[1] = new([]State, 10)[0:0];
+ in, out := 0, 1;
+ var final State;
+ found := false;
+ for pos <= len(str) {
+ if !found {
+ // prime the pump if we haven't seen a match yet
+ match := new([]int, 2*(re.nbra+1));
+ for i := 0; i < len(match); i++ {
+ match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
+ }
+ match[0] = pos;
+ s[out] = AddState(s[out], re.start.Next(), match);
+ }
+ in, out = out, in; // old out state is new in state
+ s[out] = s[out][0:0]; // clear out state
+ if len(s[in]) == 0 {
+ // machine has completed
+ break;
+ }
+ charwidth := 1;
+ c := EOF;
+ if pos < len(str) {
+ c, charwidth = sys.stringtorune(str, pos);
+ }
+ for i := 0; i < len(s[in]); i++ {
+ state := s[in][i];
+ switch s[in][i].inst.Type() {
+ case BOT:
+ if pos == 0 {
+ s[in] = AddState(s[in], state.inst.Next(), state.match)
+ }
+ case EOT:
+ if pos == len(str) {
+ s[in] = AddState(s[in], state.inst.Next(), state.match)
+ }
+ case CHAR:
+ if c == state.inst.(*Char).char {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case CHARCLASS:
+ if state.inst.(*CharClass).Matches(c) {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case ANY:
+ if c != EOF {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case BRA:
+ n := state.inst.(*Bra).n;
+ state.match[2*n] = pos;
+ s[in] = AddState(s[in], state.inst.Next(), state.match);
+ case EBRA:
+ n := state.inst.(*Ebra).n;
+ state.match[2*n+1] = pos;
+ s[in] = AddState(s[in], state.inst.Next(), state.match);
+ case ALT:
+ s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
+ // give other branch a copy of this match vector
+ s1 := new([]int, 2*(re.nbra+1));
+ for i := 0; i < len(s1); i++ {
+ s1[i] = state.match[i]
+ }
+ s[in] = AddState(s[in], state.inst.Next(), s1);
+ case END:
+ // choose leftmost longest
+ if !found || // first
+ state.match[0] < final.match[0] || // leftmost
+ (state.match[0] == final.match[0] && pos > final.match[1]) { // longest
+ final = state;
+ final.match[1] = pos;
+ }
+ found = true;
+ default:
+ state.inst.Print();
+ panic("unknown instruction in execute");
+ }
+ }
+ pos += charwidth;
+ }
+ if !found {
+ return nil
+ }
+ return final.match;
+}
+
+
+func (re *RE) Execute(s string) *[]int {
+ return re.DoExecute(s, 0)
+}