diff options
author | Rob Pike <r@golang.org> | 2008-10-14 19:22:17 -0700 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2008-10-14 19:22:17 -0700 |
commit | e1033402ca1125d3517e79f1dbabae9b763ec90c (patch) | |
tree | f06f7331ad0775154f3e460e2785d9ad897120e1 /src/lib/regexp | |
parent | e36cd17c697c44d238ae9bd15dfafda2b8a4d209 (diff) | |
download | golang-e1033402ca1125d3517e79f1dbabae9b763ec90c.tar.gz |
move regexp to lib
next cl will update names and add to build
R=rsc
DELTA=1876 (938 added, 938 deleted, 0 changed)
OCL=17149
CL=17166
Diffstat (limited to 'src/lib/regexp')
-rw-r--r-- | src/lib/regexp/Makefile | 20 | ||||
-rw-r--r-- | src/lib/regexp/main.go | 160 | ||||
-rw-r--r-- | src/lib/regexp/regexp.go | 767 |
3 files changed, 947 insertions, 0 deletions
diff --git a/src/lib/regexp/Makefile b/src/lib/regexp/Makefile new file mode 100644 index 000000000..ac466a0f1 --- /dev/null +++ b/src/lib/regexp/Makefile @@ -0,0 +1,20 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +A=6 +G=$(A)g +L=$(A)l + +all: main + +main: main.6 + $L -o main main.6 + +main.6: regexp.6 + +clean: + rm -f *.6 main + +%.6: %.go + $G $< diff --git a/src/lib/regexp/main.go b/src/lib/regexp/main.go new file mode 100644 index 000000000..c89f9b557 --- /dev/null +++ b/src/lib/regexp/main.go @@ -0,0 +1,160 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os"; + "regexp"; +) + +var good_re = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[]`, + `[abc]`, + `[^1234]`, +} + +// TODO: nice to do this with a map but we don't have an iterator +type StringError struct { + re string; + err *os.Error; +} +var bad_re = []StringError{ + StringError{ `*`, regexp.ErrBareClosure }, + StringError{ `(abc`, regexp.ErrUnmatchedLpar }, + StringError{ `abc)`, regexp.ErrUnmatchedRpar }, + StringError{ `x[a-z`, regexp.ErrUnmatchedLbkt }, + StringError{ `abc]`, regexp.ErrUnmatchedRbkt }, + StringError{ `[z-a]`, regexp.ErrBadRange }, + StringError{ `abc\`, regexp.ErrExtraneousBackslash }, + StringError{ `a**`, regexp.ErrBadClosure }, + StringError{ `a*+`, regexp.ErrBadClosure }, + StringError{ `a??`, regexp.ErrBadClosure }, + StringError{ `*`, regexp.ErrBareClosure }, + StringError{ `\x`, regexp.ErrBadBackslash }, +} + +type Vec [20]int; + +type Tester struct { + re string; + text string; + match Vec; +} + +const END = -1000 + +var matches = []Tester { + Tester{ ``, "", Vec{0,0, END} }, + Tester{ `a`, "a", Vec{0,1, END} }, + Tester{ `b`, "abc", Vec{1,2, END} }, + Tester{ `.`, "a", Vec{0,1, END} }, + Tester{ `.*`, "abcdef", Vec{0,6, END} }, + Tester{ `^abcd$`, "abcd", Vec{0,4, END} }, + Tester{ `^bcd'`, "abcdef", Vec{END} }, + Tester{ `^abcd$`, "abcde", Vec{END} }, + Tester{ `a+`, "baaab", Vec{1,4, END} }, + Tester{ `a*`, "baaab", Vec{0,0, END} }, + Tester{ `[a-z]+`, "abcd", Vec{0,4, END} }, + Tester{ `[^a-z]+`, "ab1234cd", Vec{2,6, END} }, + Tester{ `[a\-\]z]+`, "az]-bcz", Vec{0,4, END} }, + Tester{ `[日本語]+`, "日本語日本語", Vec{0,18, END} }, + Tester{ `()`, "", Vec{0,0, 0,0, END} }, + Tester{ `(a)`, "a", Vec{0,1, 0,1, END} }, + Tester{ `(.)(.)`, "日a", Vec{0,4, 0,3, 3,4, END} }, + Tester{ `(.*)`, "", Vec{0,0, 0,0, END} }, + Tester{ `(.*)`, "abcd", Vec{0,4, 0,4, END} }, + Tester{ `(..)(..)`, "abcd", Vec{0,4, 0,2, 2,4, END} }, + Tester{ `(([^xyz]*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 3,4, END} }, + Tester{ `((a|b|c)*(d))`, "abcd", Vec{0,4, 0,4, 2,3, 3,4, END} }, + Tester{ `(((a|b|c)*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 2,3, 3,4, END} }, + Tester{ `a*(|(b))c*`, "aacc", Vec{0,4, 2,2, -1,-1, END} }, +} + +func Compile(expr string, error *os.Error) regexp.Regexp { + re, err := regexp.Compile(expr); + if err != error { + print("compiling `", expr, "`; unexpected error: ", err.String(), "\n"); + sys.exit(1); + } + return re +} + +func MarkedLen(m *[] int) int { + if m == nil { + return 0 + } + var i int; + for i = 0; i < len(m) && m[i] != END; i = i+2 { + } + return i +} + +func PrintVec(m *[] int) { + l := MarkedLen(m); + if l == 0 { + print("<no match>"); + } else { + for i := 0; i < l && m[i] != END; i = i+2 { + print(m[i], ",", m[i+1], " ") + } + } +} + +func Equal(m1, m2 *[]int) bool { + l := MarkedLen(m1); + if l != MarkedLen(m2) { + return false + } + for i := 0; i < l; i++ { + if m1[i] != m2[i] { + return false + } + } + return true +} + +func Match(expr string, str string, match *[]int) { + re := Compile(expr, nil); + m := re.Execute(str); + if !Equal(m, match) { + print("failure on `", expr, "` matching `", str, "`:\n"); + PrintVec(m); + print("\nshould be:\n"); + PrintVec(match); + print("\n"); + sys.exit(1); + } +} + +func main() { + //regexp.debug = true; + if sys.argc() > 1 { + Compile(sys.argv(1), nil); + sys.exit(0); + } + for i := 0; i < len(good_re); i++ { + Compile(good_re[i], nil); + } + for i := 0; i < len(bad_re); i++ { + Compile(bad_re[i].re, bad_re[i].err) + } + for i := 0; i < len(matches); i++ { + t := &matches[i]; + Match(t.re, t.text, &t.match) + } +} diff --git a/src/lib/regexp/regexp.go b/src/lib/regexp/regexp.go new file mode 100644 index 000000000..6535e6ef4 --- /dev/null +++ b/src/lib/regexp/regexp.go @@ -0,0 +1,767 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression library. + +package regexp + +import ( + "os"; + "vector"; +) + +export var debug = false; + + +export var ErrUnimplemented = os.NewError("unimplemented"); +export var ErrInternal = os.NewError("internal error"); +export var ErrUnmatchedLpar = os.NewError("unmatched '('"); +export var ErrUnmatchedRpar = os.NewError("unmatched ')'"); +export var ErrUnmatchedLbkt = os.NewError("unmatched '['"); +export var ErrUnmatchedRbkt = os.NewError("unmatched ']'"); +export var ErrBadRange = os.NewError("bad range in character class"); +export var ErrExtraneousBackslash = os.NewError("extraneous backslash"); +export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)"); +export var ErrBareClosure = os.NewError("closure applies to nothing"); +export var ErrBadBackslash = os.NewError("illegal backslash escape"); + +// An instruction executed by the NFA +type Inst interface { + Type() int; // the type of this instruction: CHAR, ANY, etc. + Next() Inst; // the instruction to execute after this one + SetNext(i Inst); + Index() int; + SetIndex(i int); + Print(); +} + +type RE struct { + expr string; // the original expression + ch *chan<- *RE; // reply channel when we're done + error *os.Error; // compile- or run-time error; nil if OK + inst *vector.Vector; + start Inst; + nbra int; // number of brackets in expression, for subexpressions +} + +const ( + START // beginning of program + = iota; + END; // end of program: success + BOT; // '^' beginning of text + EOT; // '$' end of text + CHAR; // 'a' regular character + CHARCLASS; // [a-z] character class + ANY; // '.' any character + BRA; // '(' parenthesized expression + EBRA; // ')'; end of '(' parenthesized expression + ALT; // '|' alternation + NOP; // do nothing; makes it easy to link without patching +) + +// --- START start of program +type Start struct { + next Inst; + index int; +} + +func (start *Start) Type() int { return START } +func (start *Start) Next() Inst { return start.next } +func (start *Start) SetNext(i Inst) { start.next = i } +func (start *Start) Index() int { return start.index } +func (start *Start) SetIndex(i int) { start.index = i } +func (start *Start) Print() { print("start") } + +// --- END end of program +type End struct { + next Inst; + index int; +} + +func (end *End) Type() int { return END } +func (end *End) Next() Inst { return end.next } +func (end *End) SetNext(i Inst) { end.next = i } +func (end *End) Index() int { return end.index } +func (end *End) SetIndex(i int) { end.index = i } +func (end *End) Print() { print("end") } + +// --- BOT beginning of text +type Bot struct { + next Inst; + index int; +} + +func (bot *Bot) Type() int { return BOT } +func (bot *Bot) Next() Inst { return bot.next } +func (bot *Bot) SetNext(i Inst) { bot.next = i } +func (bot *Bot) Index() int { return bot.index } +func (bot *Bot) SetIndex(i int) { bot.index = i } +func (bot *Bot) Print() { print("bot") } + +// --- EOT end of text +type Eot struct { + next Inst; + index int; +} + +func (eot *Eot) Type() int { return EOT } +func (eot *Eot) Next() Inst { return eot.next } +func (eot *Eot) SetNext(i Inst) { eot.next = i } +func (eot *Eot) Index() int { return eot.index } +func (eot *Eot) SetIndex(i int) { eot.index = i } +func (eot *Eot) Print() { print("eot") } + +// --- CHAR a regular character +type Char struct { + next Inst; + index int; + char int; +} + +func (char *Char) Type() int { return CHAR } +func (char *Char) Next() Inst { return char.next } +func (char *Char) SetNext(i Inst) { char.next = i } +func (char *Char) Index() int { return char.index } +func (char *Char) SetIndex(i int) { char.index = i } +func (char *Char) Print() { print("char ", string(char.char)) } + +func NewChar(char int) *Char { + c := new(Char); + c.char = char; + return c; +} + +// --- CHARCLASS [a-z] + +type CClassChar int; // BUG: Shouldn't be necessary but 6g won't put ints into vectors + +type CharClass struct { + next Inst; + index int; + char int; + negate bool; // is character class negated? ([^a-z]) + // Vector of CClassChar, stored pairwise: [a-z] is (a,z); x is (x,x): + ranges *vector.Vector; +} + +func (cclass *CharClass) Type() int { return CHARCLASS } +func (cclass *CharClass) Next() Inst { return cclass.next } +func (cclass *CharClass) SetNext(i Inst) { cclass.next = i } +func (cclass *CharClass) Index() int { return cclass.index } +func (cclass *CharClass) SetIndex(i int) { cclass.index = i } +func (cclass *CharClass) Print() { + print("charclass"); + if cclass.negate { + print(" (negated)"); + } + for i := 0; i < cclass.ranges.Len(); i += 2 { + l := cclass.ranges.At(i).(CClassChar); + r := cclass.ranges.At(i+1).(CClassChar); + if l == r { + print(" [", string(l), "]"); + } else { + print(" [", string(l), "-", string(r), "]"); + } + } +} + +func (cclass *CharClass) AddRange(a, b CClassChar) { + // range is a through b inclusive + cclass.ranges.Append(a); + cclass.ranges.Append(b); +} + +func (cclass *CharClass) Matches(c int) bool { + for i := 0; i < cclass.ranges.Len(); i = i+2 { + min := cclass.ranges.At(i).(CClassChar); + max := cclass.ranges.At(i+1).(CClassChar); + if min <= c && c <= max { + return !cclass.negate + } + } + return cclass.negate +} + +func NewCharClass() *CharClass { + c := new(CharClass); + c.ranges = vector.New(); + return c; +} + +// --- ANY any character +type Any struct { + next Inst; + index int; +} + +func (any *Any) Type() int { return ANY } +func (any *Any) Next() Inst { return any.next } +func (any *Any) SetNext(i Inst) { any.next = i } +func (any *Any) Index() int { return any.index } +func (any *Any) SetIndex(i int) { any.index = i } +func (any *Any) Print() { print("any") } + +// --- BRA parenthesized expression +type Bra struct { + next Inst; + index int; + n int; // subexpression number +} + +func (bra *Bra) Type() int { return BRA } +func (bra *Bra) Next() Inst { return bra.next } +func (bra *Bra) SetNext(i Inst) { bra.next = i } +func (bra *Bra) Index() int { return bra.index } +func (bra *Bra) SetIndex(i int) { bra.index = i } +func (bra *Bra) Print() { print("bra"); } + +// --- EBRA end of parenthesized expression +type Ebra struct { + next Inst; + index int; + n int; // subexpression number +} + +func (ebra *Ebra) Type() int { return EBRA } +func (ebra *Ebra) Next() Inst { return ebra.next } +func (ebra *Ebra) SetNext(i Inst) { ebra.next = i } +func (ebra *Ebra) Index() int { return ebra.index } +func (ebra *Ebra) SetIndex(i int) { ebra.index = i } +func (ebra *Ebra) Print() { print("ebra ", ebra.n); } + +// --- ALT alternation +type Alt struct { + next Inst; + index int; + left Inst; // other branch +} + +func (alt *Alt) Type() int { return ALT } +func (alt *Alt) Next() Inst { return alt.next } +func (alt *Alt) SetNext(i Inst) { alt.next = i } +func (alt *Alt) Index() int { return alt.index } +func (alt *Alt) SetIndex(i int) { alt.index = i } +func (alt *Alt) Print() { print("alt(", alt.left.Index(), ")"); } + +// --- NOP no operation +type Nop struct { + next Inst; + index int; +} + +func (nop *Nop) Type() int { return NOP } +func (nop *Nop) Next() Inst { return nop.next } +func (nop *Nop) SetNext(i Inst) { nop.next = i } +func (nop *Nop) Index() int { return nop.index } +func (nop *Nop) SetIndex(i int) { nop.index = i } +func (nop *Nop) Print() { print("nop") } + +// report error and exit compiling/executing goroutine +func (re *RE) Error(err *os.Error) { + re.error = err; + re.ch <- re; + sys.goexit(); +} + +func (re *RE) Add(i Inst) Inst { + i.SetIndex(re.inst.Len()); + re.inst.Append(i); + return i; +} + +type Parser struct { + re *RE; + nlpar int; // number of unclosed lpars + pos int; + ch int; +} + +const EOF = -1 + +func (p *Parser) c() int { + return p.ch; +} + +func (p *Parser) nextc() int { + if p.pos >= len(p.re.expr) { + p.ch = EOF + } else { + c, w := sys.stringtorune(p.re.expr, p.pos); + p.ch = c; + p.pos += w; + } + return p.ch; +} + +func NewParser(re *RE) *Parser { + parser := new(Parser); + parser.re = re; + parser.nextc(); // load p.ch + return parser; +} + +/* + +Grammar: + regexp: + concatenation { '|' concatenation } + concatenation: + { closure } + closure: + term [ '*' | '+' | '?' ] + term: + '^' + '$' + '.' + character + '[' character-ranges ']' + '(' regexp ')' + +*/ + +func (p *Parser) Regexp() (start, end Inst) + +var NULL Inst +type BUGinter interface{} + +func special(c int) bool { + s := `\.+*?()|[]`; + for i := 0; i < len(s); i++ { + if c == int(s[i]) { + return true + } + } + return false +} + +func specialcclass(c int) bool { + s := `\-[]`; + for i := 0; i < len(s); i++ { + if c == int(s[i]) { + return true + } + } + return false +} + +func (p *Parser) CharClass() Inst { + cc := NewCharClass(); + p.re.Add(cc); + if p.c() == '^' { + cc.negate = true; + p.nextc(); + } + left := -1; + for { + switch c := p.c(); c { + case ']', EOF: + if left >= 0 { + p.re.Error(ErrBadRange); + } + return cc; + case '-': // do this before backslash processing + p.re.Error(ErrBadRange); + case '\\': + c = p.nextc(); + switch { + case c == EOF: + p.re.Error(ErrExtraneousBackslash); + case c == 'n': + c = '\n'; + case specialcclass(c): + // c is as delivered + default: + p.re.Error(ErrBadBackslash); + } + fallthrough; + default: + p.nextc(); + switch { + case left < 0: // first of pair + if p.c() == '-' { // range + p.nextc(); + left = c; + } else { // single char + cc.AddRange(c, c); + } + case left <= c: // second of pair + cc.AddRange(left, c); + left = -1; + default: + p.re.Error(ErrBadRange); + } + } + } + return NULL +} + +func (p *Parser) Term() (start, end Inst) { + switch c := p.c(); c { + case '|', EOF: + return NULL, NULL; + case '*', '+', '|': + p.re.Error(ErrBareClosure); + case ')': + if p.nlpar == 0 { + p.re.Error(ErrUnmatchedRpar); + } + return NULL, NULL; + case ']': + p.re.Error(ErrUnmatchedRbkt); + case '^': + p.nextc(); + start = p.re.Add(new(Bot)); + return start, start; + case '$': + p.nextc(); + start = p.re.Add(new(Eot)); + return start, start; + case '.': + p.nextc(); + start = p.re.Add(new(Any)); + return start, start; + case '[': + p.nextc(); + start = p.CharClass(); + if p.c() != ']' { + p.re.Error(ErrUnmatchedLbkt); + } + p.nextc(); + return start, start; + case '(': + p.nextc(); + p.nlpar++; + p.re.nbra++; // increment first so first subexpr is \1 + nbra := p.re.nbra; + start, end = p.Regexp(); + if p.c() != ')' { + p.re.Error(ErrUnmatchedLpar); + } + p.nlpar--; + p.nextc(); + bra := new(Bra); + p.re.Add(bra); + ebra := new(Ebra); + p.re.Add(ebra); + bra.n = nbra; + ebra.n = nbra; + if start == NULL { + if end == NULL { p.re.Error(ErrInternal) } + start = ebra + } else { + end.SetNext(ebra); + } + bra.SetNext(start); + return bra, ebra; + case '\\': + c = p.nextc(); + switch { + case c == EOF: + p.re.Error(ErrExtraneousBackslash); + case c == 'n': + c = '\n'; + case special(c): + // c is as delivered + default: + p.re.Error(ErrBadBackslash); + } + fallthrough; + default: + p.nextc(); + start = NewChar(c); + p.re.Add(start); + return start, start + } + panic("unreachable"); +} + +func (p *Parser) Closure() (start, end Inst) { + start, end = p.Term(); + if start == NULL { + return + } + switch p.c() { + case '*': + // (start,end)*: + alt := new(Alt); + p.re.Add(alt); + end.SetNext(alt); // after end, do alt + alt.left = start; // alternate brach: return to start + start = alt; // alt becomes new (start, end) + end = alt; + case '+': + // (start,end)+: + alt := new(Alt); + p.re.Add(alt); + end.SetNext(alt); // after end, do alt + alt.left = start; // alternate brach: return to start + end = alt; // start is unchanged; end is alt + case '?': + // (start,end)?: + alt := new(Alt); + p.re.Add(alt); + nop := new(Nop); + p.re.Add(nop); + alt.left = start; // alternate branch is start + alt.next = nop; // follow on to nop + end.SetNext(nop); // after end, go to nop + start = alt; // start is now alt + end = nop; // end is nop pointed to by both branches + default: + return + } + switch p.nextc() { + case '*', '+', '?': + p.re.Error(ErrBadClosure); + } + return +} + +func (p *Parser) Concatenation() (start, end Inst) { + start, end = NULL, NULL; + for { + nstart, nend := p.Closure(); + switch { + case nstart == NULL: // end of this concatenation + if start == NULL { // this is the empty string + nop := p.re.Add(new(Nop)); + return nop, nop; + } + return; + case start == NULL: // this is first element of concatenation + start, end = nstart, nend; + default: + end.SetNext(nstart); + end = nend; + } + } + panic("unreachable"); +} + +func (p *Parser) Regexp() (start, end Inst) { + start, end = p.Concatenation(); + for { + switch p.c() { + default: + return; + case '|': + p.nextc(); + nstart, nend := p.Concatenation(); + alt := new(Alt); + p.re.Add(alt); + alt.left = start; + alt.next = nstart; + nop := new(Nop); + p.re.Add(nop); + end.SetNext(nop); + nend.SetNext(nop); + start, end = alt, nop; + } + } + panic("unreachable"); +} + +func UnNop(i Inst) Inst { + for i.Type() == NOP { + i = i.Next() + } + return i +} + +func (re *RE) EliminateNops() { + for i := 0; i < re.inst.Len(); i++ { + inst := re.inst.At(i).(Inst); + if inst.Type() == END { + continue + } + inst.SetNext(UnNop(inst.Next())); + if inst.Type() == ALT { + alt := inst.(*Alt); + alt.left = UnNop(alt.left); + } + } +} + +func (re *RE) Dump() { + for i := 0; i < re.inst.Len(); i++ { + inst := re.inst.At(i).(Inst); + print(inst.Index(), ": "); + inst.Print(); + if inst.Type() != END { + print(" -> ", inst.Next().Index()) + } + print("\n"); + } +} + +func (re *RE) DoParse() { + parser := NewParser(re); + start := new(Start); + re.Add(start); + s, e := parser.Regexp(); + start.next = s; + re.start = start; + e.SetNext(re.Add(new(End))); + + if debug { + re.Dump(); + println(); + } + + re.EliminateNops(); + + if debug { + re.Dump(); + println(); + } +} + + +func Compiler(str string, ch *chan *RE) { + re := new(RE); + re.expr = str; + re.inst = vector.New(); + re.ch = ch; + re.DoParse(); + ch <- re; +} + +// Public interface has only execute functionality (not yet implemented) +export type Regexp interface { + Execute(s string) *[]int +} + +// Compile in separate goroutine; wait for result +export func Compile(str string) (regexp Regexp, error *os.Error) { + ch := new(chan *RE); + go Compiler(str, ch); + re := <-ch; + return re, re.error +} + +type State struct { + inst Inst; // next instruction to execute + match *[]int; // pairs of bracketing submatches. 0th is start,end +} + +// Append new state to to-do list. Leftmost-longest wins so avoid +// adding a state that's already active. +func AddState(s *[]State, inst Inst, match *[]int) *[]State { + index := inst.Index(); + l := len(s); + pos := match[0]; + // TODO: Once the state is a vector and we can do insert, have inputs always + // go in order correctly and this "earlier" test is never necessary, + for i := 0; i < l; i++ { + if s[i].inst.Index() == index && // same instruction + s[i].match[0] < pos { // earlier match already going; lefmost wins + return s + } + } + if l == cap(s) { + s1 := new([]State, 2*l)[0:l]; + for i := 0; i < l; i++ { + s1[i] = s[i]; + } + s = s1; + } + s = s[0:l+1]; + s[l].inst = inst; + s[l].match = match; + return s; +} + +func (re *RE) DoExecute(str string, pos int) *[]int { + var s [2]*[]State; // TODO: use a vector when State values (not ptrs) can be vector elements + s[0] = new([]State, 10)[0:0]; + s[1] = new([]State, 10)[0:0]; + in, out := 0, 1; + var final State; + found := false; + for pos <= len(str) { + if !found { + // prime the pump if we haven't seen a match yet + match := new([]int, 2*(re.nbra+1)); + for i := 0; i < len(match); i++ { + match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac" + } + match[0] = pos; + s[out] = AddState(s[out], re.start.Next(), match); + } + in, out = out, in; // old out state is new in state + s[out] = s[out][0:0]; // clear out state + if len(s[in]) == 0 { + // machine has completed + break; + } + charwidth := 1; + c := EOF; + if pos < len(str) { + c, charwidth = sys.stringtorune(str, pos); + } + for i := 0; i < len(s[in]); i++ { + state := s[in][i]; + switch s[in][i].inst.Type() { + case BOT: + if pos == 0 { + s[in] = AddState(s[in], state.inst.Next(), state.match) + } + case EOT: + if pos == len(str) { + s[in] = AddState(s[in], state.inst.Next(), state.match) + } + case CHAR: + if c == state.inst.(*Char).char { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case CHARCLASS: + if state.inst.(*CharClass).Matches(c) { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case ANY: + if c != EOF { + s[out] = AddState(s[out], state.inst.Next(), state.match) + } + case BRA: + n := state.inst.(*Bra).n; + state.match[2*n] = pos; + s[in] = AddState(s[in], state.inst.Next(), state.match); + case EBRA: + n := state.inst.(*Ebra).n; + state.match[2*n+1] = pos; + s[in] = AddState(s[in], state.inst.Next(), state.match); + case ALT: + s[in] = AddState(s[in], state.inst.(*Alt).left, state.match); + // give other branch a copy of this match vector + s1 := new([]int, 2*(re.nbra+1)); + for i := 0; i < len(s1); i++ { + s1[i] = state.match[i] + } + s[in] = AddState(s[in], state.inst.Next(), s1); + case END: + // choose leftmost longest + if !found || // first + state.match[0] < final.match[0] || // leftmost + (state.match[0] == final.match[0] && pos > final.match[1]) { // longest + final = state; + final.match[1] = pos; + } + found = true; + default: + state.inst.Print(); + panic("unknown instruction in execute"); + } + } + pos += charwidth; + } + if !found { + return nil + } + return final.match; +} + + +func (re *RE) Execute(s string) *[]int { + return re.DoExecute(s, 0) +} |