summaryrefslogtreecommitdiff
path: root/src/pkg/regexp
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2009-11-14 12:23:24 -0800
committerRob Pike <r@golang.org>2009-11-14 12:23:24 -0800
commit995ef119197b8d9a71f2e8c46388cf4c59e0faaf (patch)
tree55781db4cd00616d878a58b0229133037d0335db /src/pkg/regexp
parentf9496250ad7138ef6885522b116e16225cac8c8b (diff)
downloadgolang-995ef119197b8d9a71f2e8c46388cf4c59e0faaf.tar.gz
move evaluation of null-matching instructions one iteration earlier.
performance hit of about 20% but more intuitive results for submatches. we need a good regexp package at some point. Fixes issue 110. R=rsc CC=golang-dev http://codereview.appspot.com/152131
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r--src/pkg/regexp/all_test.go16
-rw-r--r--src/pkg/regexp/regexp.go78
2 files changed, 53 insertions, 41 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 04453a9d5..a9f23d70c 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -66,6 +66,8 @@ var matches = []tester{
tester{`b`, "abc", vec{1, 2}},
tester{`.`, "a", vec{0, 1}},
tester{`.*`, "abcdef", vec{0, 6}},
+ tester{`^`, "abcde", vec{0, 0}},
+ tester{`$`, "abcde", vec{5, 5}},
tester{`^abcd$`, "abcd", vec{0, 4}},
tester{`^bcd'`, "abcdef", vec{}},
tester{`^abcd$`, "abcde", vec{}},
@@ -86,6 +88,7 @@ var matches = []tester{
tester{`((a|b|c)*(d))`, "abcd", vec{0, 4, 0, 4, 2, 3, 3, 4}},
tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}},
tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}},
+ tester{`(.*).*`, "ab", vec{0, 2, 0, 2}},
}
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
@@ -182,12 +185,12 @@ func matchTest(t *testing.T, expr string, str string, match []int) {
}
m := re.MatchString(str);
if m != (len(match) > 0) {
- t.Errorf("MatchString failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
+ t.Errorf("MatchString failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
}
// now try bytes
m = re.Match(strings.Bytes(str));
if m != (len(match) > 0) {
- t.Errorf("Match failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0)
+ t.Errorf("Match failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0)
}
}
@@ -377,14 +380,7 @@ var matchCases = []matchCase{
}
func printStringSlice(t *testing.T, s []string) {
- l := len(s);
- if l == 0 {
- t.Log("\t<empty>")
- } else {
- for i := 0; i < l; i++ {
- t.Logf("\t%q", s[i])
- }
- }
+ t.Logf("%#v", s)
}
func TestAllMatches(t *testing.T) {
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index ab9465b7d..89300be96 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -639,15 +639,37 @@ type state struct {
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
-func addState(s []state, inst instr, match []int) []state {
+func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
+ switch inst.kind() {
+ case _BOT:
+ if pos == 0 {
+ s = re.addState(s, inst.next(), match, pos, end)
+ }
+ return s;
+ case _EOT:
+ if pos == end {
+ s = re.addState(s, inst.next(), match, pos, end)
+ }
+ return s;
+ case _BRA:
+ n := inst.(*_Bra).n;
+ match[2*n] = pos;
+ s = re.addState(s, inst.next(), match, pos, end);
+ return s;
+ case _EBRA:
+ n := inst.(*_Ebra).n;
+ match[2*n+1] = pos;
+ s = re.addState(s, inst.next(), match, pos, end);
+ return s;
+ }
index := inst.index();
l := len(s);
- pos := match[0];
+ begin := match[0];
// TODO: Once the state is a vector and we can do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary,
for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction
- s[i].match[0] < pos { // earlier match already going; lefmost wins
+ s[i].match[0] <= begin { // earlier match already going; lefmost wins
return s
}
}
@@ -661,6 +683,19 @@ func addState(s []state, inst instr, match []int) []state {
s = s[0 : l+1];
s[l].inst = inst;
s[l].match = match;
+ if inst.kind() == _ALT {
+ s1 := make([]int, 2*(re.nbra+1));
+ for i := 0; i < len(s1); i++ {
+ s1[i] = match[i]
+ }
+ s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
+ // give other branch a copy of this match vector
+ s1 = make([]int, 2*(re.nbra+1));
+ for i := 0; i < len(s1); i++ {
+ s1[i] = match[i]
+ }
+ s = re.addState(s, inst.next(), s1, pos, end);
+ }
return s;
}
@@ -685,11 +720,11 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos;
- s[out] = addState(s[out], re.start.next(), match);
+ s[out] = re.addState(s[out], re.start.next(), match, pos, end);
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
- if len(s[in]) == 0 {
+ if found && len(s[in]) == 0 {
// machine has completed
break
}
@@ -702,56 +737,38 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
c, charwidth = utf8.DecodeRune(bytes[pos:end])
}
}
+ pos += charwidth;
for i := 0; i < len(s[in]); i++ {
st := s[in][i];
switch s[in][i].inst.kind() {
case _BOT:
- if pos == 0 {
- s[in] = addState(s[in], st.inst.next(), st.match)
- }
case _EOT:
- if pos == end {
- s[in] = addState(s[in], st.inst.next(), st.match)
- }
case _CHAR:
if c == st.inst.(*_Char).char {
- s[out] = addState(s[out], st.inst.next(), st.match)
+ s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) {
- s[out] = addState(s[out], st.inst.next(), st.match)
+ s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _ANY:
if c != endOfFile {
- s[out] = addState(s[out], st.inst.next(), st.match)
+ s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _NOTNL:
if c != endOfFile && c != '\n' {
- s[out] = addState(s[out], st.inst.next(), st.match)
+ s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _BRA:
- n := st.inst.(*_Bra).n;
- st.match[2*n] = pos;
- s[in] = addState(s[in], st.inst.next(), st.match);
case _EBRA:
- n := st.inst.(*_Ebra).n;
- st.match[2*n+1] = pos;
- s[in] = addState(s[in], st.inst.next(), st.match);
case _ALT:
- s[in] = addState(s[in], st.inst.(*_Alt).left, st.match);
- // give other branch a copy of this match vector
- s1 := make([]int, 2*(re.nbra+1));
- for i := 0; i < len(s1); i++ {
- s1[i] = st.match[i]
- }
- s[in] = addState(s[in], st.inst.next(), s1);
case _END:
// choose leftmost longest
if !found || // first
st.match[0] < final.match[0] || // leftmost
- (st.match[0] == final.match[0] && pos > final.match[1]) { // longest
+ (st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
final = st;
- final.match[1] = pos;
+ final.match[1] = pos - charwidth;
}
found = true;
default:
@@ -759,7 +776,6 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
panic("unknown instruction in execute");
}
}
- pos += charwidth;
}
return final.match;
}