summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2008-10-14 16:32:43 -0700
committerRob Pike <r@golang.org>2008-10-14 16:32:43 -0700
commit6c508518ffd95d9badac7a017e528ce6b64354f9 (patch)
treea7af7692aecfebb223535f3a2ffb207657204b5a
parent8a856d806e1342d09121603de719f0e2fb6b65d6 (diff)
downloadgolang-6c508518ffd95d9badac7a017e528ce6b64354f9.tar.gz
implement matching
clean up interface equality hack still needs more tests; checking in for gccgo testing R=rsc DELTA=304 (261 added, 14 deleted, 29 changed) OCL=17128 CL=17135
-rw-r--r--usr/r/regexp/main.go129
-rw-r--r--usr/r/regexp/regexp.go196
2 files changed, 286 insertions, 39 deletions
diff --git a/usr/r/regexp/main.go b/usr/r/regexp/main.go
index d723d87a9..25ec07ade 100644
--- a/usr/r/regexp/main.go
+++ b/usr/r/regexp/main.go
@@ -9,14 +9,131 @@ import (
"regexp";
)
+var good_re = []string{
+ ``
+, `.`
+, `^.$`
+, `a`
+, `a*`
+, `a+`
+, `a?`
+, `a|b`
+, `a*|b*`
+, `(a*|b)(c*|d)`
+, `[a-z]`
+, `[a-abc-c\-\]\[]`
+, `[a-z]+`
+, `[]`
+, `[abc]`
+, `[^1234]`
+}
+
+// TODO: nice to do this with a map but we don't have an iterator
+type StringError struct {
+ re string;
+ err *os.Error;
+}
+var bad_re = []StringError{
+ StringError{ `*`, regexp.ErrBareClosure },
+ StringError{ `(abc`, regexp.ErrUnmatchedLpar },
+ StringError{ `abc)`, regexp.ErrUnmatchedRpar },
+ StringError{ `x[a-z`, regexp.ErrUnmatchedLbkt },
+ StringError{ `abc]`, regexp.ErrUnmatchedRbkt },
+ StringError{ `[z-a]`, regexp.ErrBadRange },
+ StringError{ `abc\`, regexp.ErrExtraneousBackslash },
+ StringError{ `a**`, regexp.ErrBadClosure },
+ StringError{ `a*+`, regexp.ErrBadClosure },
+ StringError{ `a??`, regexp.ErrBadClosure },
+ StringError{ `*`, regexp.ErrBareClosure },
+ StringError{ `\x`, regexp.ErrBadBackslash }
+}
+
+type Vec [20]int;
+
+type Tester struct {
+ re string;
+ text string;
+ match Vec;
+}
+
+var matches = []Tester {
+ Tester{ ``, "", Vec{0,0, -1,-1} },
+ Tester{ `a`, "a", Vec{0,1, -1,-1} },
+ Tester{ `b`, "abc", Vec{1,2, -1,-1} },
+ Tester{ `.`, "a", Vec{0,1, -1,-1} },
+ Tester{ `.*`, "abcdef", Vec{0,6, -1,-1} },
+ Tester{ `^abcd$`, "abcd", Vec{0,4, -1,-1} },
+ Tester{ `^bcd'`, "abcdef", Vec{-1,-1} },
+ Tester{ `^abcd$`, "abcde", Vec{-1,-1} },
+ Tester{ `a+`, "baaab", Vec{1, 4, -1,-1} },
+ Tester{ `a*`, "baaab", Vec{0, 0, -1,-1} }
+}
+
+func Compile(expr string, error *os.Error) regexp.Regexp {
+ re, err := regexp.Compile(expr);
+ if err != error {
+ print("compiling `", expr, "`; unexpected error: ", err.String(), "\n");
+ sys.exit(1);
+ }
+ return re
+}
+
+func MarkedLen(m *[] int) int {
+ if m == nil {
+ return 0
+ }
+ var i int;
+ for i = 0; i < len(m) && m[i] >= 0; i = i+2 {
+ }
+ return i
+}
+
+func PrintVec(m *[] int) {
+ l := MarkedLen(m);
+ for i := 0; i < l && m[i] >= 0; i = i+2 {
+ print(m[i], ",", m[i+1], " ")
+ }
+}
+
+func Equal(m1, m2 *[]int) bool {
+ l := MarkedLen(m1);
+ if l != MarkedLen(m2) {
+ return false
+ }
+ for i := 0; i < l; i++ {
+ if m1[i] != m2[i] {
+ return false
+ }
+ }
+ return true
+}
+
+func Match(expr string, str string, match *[]int) {
+ re := Compile(expr, nil);
+ m := re.Execute(str);
+ if !Equal(m, match) {
+ print("failure on `", expr, "` matching `", str, "`:\n");
+ PrintVec(m);
+ print("\nshould be:\n");
+ PrintVec(match);
+ print("\n");
+ sys.exit(1);
+ }
+}
+
func main() {
- str := "a*b*c*";
if sys.argc() > 1 {
- str = sys.argv(1);
+ Compile(sys.argv(1), nil);
+ sys.exit(0);
}
- re, err := regexp.Compile(str);
- if err != nil {
- print("error: ", err.String(), "\n");
- sys.exit(1);
+ for i := 0; i < len(good_re); i++ {
+ Compile(good_re[i], nil);
+ }
+ for i := 0; i < len(bad_re); i++ {
+ Compile(bad_re[i].re, bad_re[i].err)
+ }
+ for i := 0; i < len(matches); i++ {
+ t := &matches[i];
+ Match(t.re, t.text, &t.match)
}
}
diff --git a/usr/r/regexp/regexp.go b/usr/r/regexp/regexp.go
index c491b262a..0a6fd3113 100644
--- a/usr/r/regexp/regexp.go
+++ b/usr/r/regexp/regexp.go
@@ -11,6 +11,8 @@ import (
"vector";
)
+export var debug = false;
+
export var ErrUnimplemented = os.NewError("unimplemented");
export var ErrInternal = os.NewError("internal error");
@@ -20,7 +22,6 @@ export var ErrUnmatchedLbkt = os.NewError("unmatched '['");
export var ErrUnmatchedRbkt = os.NewError("unmatched ']'");
export var ErrBadRange = os.NewError("bad range in character class");
export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
-export var ErrEmpty = os.NewError("empty subexpression or alternation");
export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
export var ErrBareClosure = os.NewError("closure applies to nothing");
export var ErrBadBackslash = os.NewError("illegal backslash escape");
@@ -41,10 +42,11 @@ type RE struct {
error *os.Error; // compile- or run-time error; nil if OK
inst *vector.Vector;
start Inst;
+ nbra int; // number of brackets in expression, for subexpressions
}
const (
- START // beginning of program: indexer to start
+ START // beginning of program
= iota;
END; // end of program: success
BOT; // '^' beginning of text
@@ -113,8 +115,8 @@ func (eot *Eot) Print() { print("eot") }
// --- CHAR a regular character
type Char struct {
next Inst;
- char int;
index int;
+ char int;
}
func (char *Char) Type() int { return CHAR }
@@ -143,7 +145,7 @@ type CharClass struct {
ranges *vector.Vector;
}
-func (cclass *CharClass) Type() int { return CHAR }
+func (cclass *CharClass) Type() int { return CHARCLASS }
func (cclass *CharClass) Next() Inst { return cclass.next }
func (cclass *CharClass) SetNext(i Inst) { cclass.next = i }
func (cclass *CharClass) Index() int { return cclass.index }
@@ -170,6 +172,17 @@ func (cclass *CharClass) AddRange(a, b CClassChar) {
cclass.ranges.Append(b);
}
+func (cclass *CharClass) Matches(c int) bool {
+ for i := 0; i < cclass.ranges.Len(); i = i+2 {
+ min := cclass.ranges.At(i).(CClassChar);
+ max := cclass.ranges.At(i+1).(CClassChar);
+ if min <= c && c <= max {
+ return !cclass.negate
+ }
+ }
+ return cclass.negate
+}
+
func NewCharClass() *CharClass {
c := new(CharClass);
c.ranges = vector.New();
@@ -210,7 +223,7 @@ type Ebra struct {
n int; // subexpression number
}
-func (ebra *Ebra) Type() int { return BRA }
+func (ebra *Ebra) Type() int { return EBRA }
func (ebra *Ebra) Next() Inst { return ebra.next }
func (ebra *Ebra) SetNext(i Inst) { ebra.next = i }
func (ebra *Ebra) Index() int { return ebra.index }
@@ -259,7 +272,6 @@ func (re *RE) Add(i Inst) Inst {
type Parser struct {
re *RE;
- nbra int; // number of brackets in expression, for subexpressions
nlpar int; // number of unclosed lpars
pos int;
ch int;
@@ -314,16 +326,6 @@ func (p *Parser) Regexp() (start, end Inst)
var NULL Inst
type BUGinter interface{}
-// same as i == NULL. TODO: remove when 6g lets me do i == NULL
-func isNULL(i Inst) bool {
- return sys.BUG_intereq(i.(BUGinter), NULL.(BUGinter))
-}
-
-// same as i == j. TODO: remove when 6g lets me do i == j
-func isEQ(i,j Inst) bool {
- return sys.BUG_intereq(i.(BUGinter), j.(BUGinter))
-}
-
func special(c int) bool {
s := `\.+*?()|[]`;
for i := 0; i < len(s); i++ {
@@ -437,15 +439,15 @@ func (p *Parser) Term() (start, end Inst) {
}
p.nlpar--;
p.nextc();
- p.nbra++;
bra := new(Bra);
p.re.Add(bra);
ebra := new(Ebra);
p.re.Add(ebra);
- bra.n = p.nbra;
- ebra.n = p.nbra;
- if isNULL(start) {
- if !isNULL(end) { p.re.Error(ErrInternal) }
+ p.re.nbra++; // increment first so first subexpr is \1
+ bra.n = p.re.nbra;
+ ebra.n = p.re.nbra;
+ if start == NULL {
+ if end == NULL { p.re.Error(ErrInternal) }
start = ebra
} else {
end.SetNext(ebra);
@@ -476,7 +478,7 @@ func (p *Parser) Term() (start, end Inst) {
func (p *Parser) Closure() (start, end Inst) {
start, end = p.Term();
- if isNULL(start) {
+ if start == NULL {
return start, end
}
switch p.c() {
@@ -521,13 +523,13 @@ func (p *Parser) Concatenation() (start, end Inst) {
for {
nstart, nend := p.Closure();
switch {
- case isNULL(nstart): // end of this concatenation
- if isNULL(start) { // this is the empty string
+ case nstart == NULL: // end of this concatenation
+ if start == NULL { // this is the empty string
nop := p.re.Add(new(Nop));
return nop, nop;
}
return start, end;
- case isNULL(start): // this is first element of concatenation
+ case start == NULL: // this is first element of concatenation
start, end = nstart, nend;
default:
end.SetNext(nstart);
@@ -602,17 +604,20 @@ func (re *RE) DoParse() {
re.start = start;
e.SetNext(re.Add(new(End)));
- re.Dump();
- println();
+ if debug {
+ re.Dump();
+ println();
+ }
re.EliminateNops();
- re.Dump();
- println();
-
- re.Error(ErrUnimplemented);
+ if debug {
+ re.Dump();
+ println();
+ }
}
+
func Compiler(str string, ch *chan *RE) {
re := new(RE);
re.expr = str;
@@ -624,13 +629,138 @@ func Compiler(str string, ch *chan *RE) {
// Public interface has only execute functionality (not yet implemented)
export type Regexp interface {
- // Execute() bool
+ Execute(s string) *[]int
}
-// compile in separate goroutine; wait for result
+// Compile in separate goroutine; wait for result
export func Compile(str string) (regexp Regexp, error *os.Error) {
ch := new(chan *RE);
go Compiler(str, ch);
re := <-ch;
return re, re.error
}
+
+type State struct {
+ inst Inst; // next instruction to execute
+ match *[]int; // pairs of bracketing submatches. 0th is start,end
+}
+
+// Append new state to to-do list. Leftmost-longest wins so avoid
+// adding a state that's already active.
+func AddState(s *[]State, inst Inst, match *[]int) *[]State {
+ index := inst.Index();
+ l := len(s);
+ pos := match[0];
+ // TODO: Once the state is a vector and we can do insert, have inputs always
+ // go in order correctly and this "earlier" test is never necessary,
+ for i := 0; i < l; i++ {
+ if s[i].inst.Index() == index && // same instruction
+ s[i].match[0] < pos { // earlier match already going; lefmost wins
+ return s
+ }
+ }
+ if l == cap(s) {
+ s1 := new([]State, 2*l)[0:l];
+ for i := 0; i < l; i++ {
+ s1[i] = s[i];
+ }
+ s = s1;
+ }
+ s = s[0:l+1];
+ s[l].inst = inst;
+ s[l].match = match;
+ return s;
+}
+
+func (re *RE) DoExecute(str string, pos int) *[]int {
+ var s [2]*[]State; // TODO: use a vector when State values (not ptrs) can be vector elements
+ s[0] = new([]State, 10)[0:0];
+ s[1] = new([]State, 10)[0:0];
+ in, out := 0, 1;
+ var final State;
+ found := false;
+ for pos <= len(str) {
+ if !found {
+ // prime the pump if we haven't seen a match yet
+ match := new([]int, 2*(re.nbra+1));
+ match[0] = pos;
+ s[out] = AddState(s[out], re.start.Next(), match);
+ }
+ in, out = out, in; // old out state is new in state
+ s[out] = s[out][0:0]; // clear out state
+ if len(s[in]) == 0 {
+ // machine has completed
+ break;
+ }
+ c := EOF;
+ if pos < len(str) {
+ c = int(str[pos])
+ }
+//println("position ", pos, "char", string(c), "in", in, "out", out, "len in", len(s[in]));
+ for i := 0; i < len(s[in]); i++ {
+ state := s[in][i];
+//state.inst.Print(); print("\n");
+ switch s[in][i].inst.Type() {
+ case BOT:
+ if pos == 0 {
+ s[in] = AddState(s[in], state.inst.Next(), state.match)
+ }
+ case EOT:
+ if pos == len(str) {
+ s[in] = AddState(s[in], state.inst.Next(), state.match)
+ }
+ case CHAR:
+ if c == state.inst.(*Char).char {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case CHARCLASS:
+ if state.inst.(*CharClass).Matches(c) {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case ANY:
+ if c != EOF {
+ s[out] = AddState(s[out], state.inst.Next(), state.match)
+ }
+ case BRA:
+ n := state.inst.(*Bra).n;
+ state.match[2*n] = pos;
+ s[in] = AddState(s[in], state.inst.Next(), state.match);
+ case EBRA:
+ n := state.inst.(*Ebra).n;
+ state.match[2*n+1] = pos;
+ s[in] = AddState(s[in], state.inst.Next(), state.match);
+ case ALT:
+ s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
+ // give other branch a copy of this match vector
+ s1 := new([]int, 2*(re.nbra+1));
+ for i := 0; i < len(s1); i++ {
+ s1[i] = state.match[i]
+ }
+ s[in] = AddState(s[in], state.inst.Next(), s1);
+ case END:
+ // choose leftmost longest
+ if !found || // first
+ state.match[0] < final.match[0] || // leftmost
+ (state.match[0] == final.match[0] && pos > final.match[1]) { // longest
+ final = state;
+ final.match[1] = pos;
+ }
+ found = true;
+ default:
+ state.inst.Print();
+ panic("unknown instruction in execute");
+ }
+ }
+ pos++;
+ }
+ if !found {
+ return nil
+ }
+//if found { println("found: from ", final.match[0], "to", final.match[1] )}
+ return final.match;
+}
+
+
+func (re *RE) Execute(s string) *[]int {
+ return re.DoExecute(s, 0)
+}