summaryrefslogtreecommitdiff
path: root/src/pkg/regexp
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2009-11-19 23:12:01 -0800
committerRob Pike <r@golang.org>2009-11-19 23:12:01 -0800
commit7e51eb4a6b371c20486adfa507cfd164ef0c5c0c (patch)
treef36ce42b4b36a76bb2cf8a7bbd2d6ca9bbad9e27 /src/pkg/regexp
parent35fad56a1ec09f32c18c3315479ddb277bb112a9 (diff)
downloadgolang-7e51eb4a6b371c20486adfa507cfd164ef0c5c0c.tar.gz
add a match arena to regexp to avoid generating garbage.
simple regexps run 20x faster. the regex-dna benchmark goes 3x faster. R=rsc CC=golang-dev http://codereview.appspot.com/156108
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r--src/pkg/regexp/all_test.go27
-rw-r--r--src/pkg/regexp/regexp.go152
2 files changed, 133 insertions, 46 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 522324871..fb6f3a030 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -60,6 +60,7 @@ type tester struct {
}
var matches = []tester{
+ tester{`^abcdefg`, "abcdefg", vec{0, 7}},
tester{`a+`, "baaab", vec{1, 4}},
tester{"abcd..", "abcdef", vec{0, 6}},
tester{``, "", vec{0, 0}},
@@ -450,3 +451,29 @@ func TestAllMatches(t *testing.T) {
}
}
}
+
+func BenchmarkLiteral(b *testing.B) {
+ x := strings.Repeat("x", 50);
+ b.StopTimer();
+ re, _ := Compile(x);
+ b.StartTimer();
+ for i := 0; i < b.N; i++ {
+ if !re.MatchString(x) {
+ println("no match!");
+ break;
+ }
+ }
+}
+
+func BenchmarkNotLiteral(b *testing.B) {
+ x := strings.Repeat("x", 49);
+ b.StopTimer();
+ re, _ := Compile("^" + x);
+ b.StartTimer();
+ for i := 0; i < b.N; i++ {
+ if !re.MatchString(x) {
+ println("no match!");
+ break;
+ }
+ }
+}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index 4a2e70dea..a58fbf44f 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error {
return p.error;
}
-// return regular text at the beginning of str
+// Extract regular text from the beginning of the pattern.
+// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
var b []byte;
var utf = make([]byte, utf8.UTFMax);
@@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp {
return regexp;
}
+// The match arena allows us to reduce the garbage generated by tossing
+// match vectors away as we execute. Matches are ref counted and returned
+// to a free list when no longer active. Increases a simple benchmark by 22X.
+type matchArena struct {
+ head *matchVec;
+ len int; // length of match vector
+}
+
+type matchVec struct {
+ m []int; // pairs of bracketing submatches. 0th is start,end
+ ref int;
+ next *matchVec;
+}
+
+func (a *matchArena) new() *matchVec {
+ if a.head == nil {
+ const N = 10;
+ block := make([]matchVec, N);
+ for i := 0; i < N; i++ {
+ b := &block[i];
+ b.next = a.head;
+ a.head = b;
+ }
+ }
+ m := a.head;
+ a.head = m.next;
+ m.ref = 0;
+ if m.m == nil {
+ m.m = make([]int, a.len)
+ }
+ return m;
+}
+
+func (a *matchArena) free(m *matchVec) {
+ m.ref--;
+ if m.ref == 0 {
+ m.next = a.head;
+ a.head = m;
+ }
+}
+
+func (a *matchArena) copy(m *matchVec) *matchVec {
+ m1 := a.new();
+ copy(m1.m, m.m);
+ return m1;
+}
+
+func (a *matchArena) noMatch() *matchVec {
+ m := a.new();
+ for i := range m.m {
+ m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
+ }
+ m.ref = 1;
+ return m;
+}
+
type state struct {
inst instr; // next instruction to execute
- match []int; // pairs of bracketing submatches. 0th is start,end
+ match *matchVec;
}
// Append new state to to-do list. Leftmost-longest wins so avoid
-// adding a state that's already active.
-func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state {
+// adding a state that's already active. The matchVec will be inc-ref'ed
+// if it is assigned to a state.
+func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state {
switch inst.kind() {
case _BOT:
if pos == 0 {
- s = re.addState(s, inst.next(), match, pos, end)
+ s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _EOT:
if pos == end {
- s = re.addState(s, inst.next(), match, pos, end)
+ s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _BRA:
n := inst.(*_Bra).n;
- match[2*n] = pos;
- s = re.addState(s, inst.next(), match, pos, end);
+ match.m[2*n] = pos;
+ s = a.addState(s, inst.next(), match, pos, end);
return s;
case _EBRA:
n := inst.(*_Ebra).n;
- match[2*n+1] = pos;
- s = re.addState(s, inst.next(), match, pos, end);
+ match.m[2*n+1] = pos;
+ s = a.addState(s, inst.next(), match, pos, end);
return s;
}
index := inst.index();
l := len(s);
- begin := match[0];
- // TODO: Once the state is a vector and we can do insert, have inputs always
- // go in order correctly and this "earlier" test is never necessary,
+ begin := match.m[0];
+ // TODO: If the state were a vector and we could do insert, have inputs always
+ // go in order correctly and this "earlier" test is not necessary,
for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction
- s[i].match[0] <= begin { // earlier match already going; lefmost wins
+ s[i].match.m[0] <= begin { // earlier match already going; lefmost wins
return s
}
}
@@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
s = s[0 : l+1];
s[l].inst = inst;
s[l].match = match;
+ match.ref++;
if inst.kind() == _ALT {
- s1 := make([]int, 2*(re.nbra+1));
- copy(s1, match);
- s = re.addState(s, inst.(*_Alt).left, s1, pos, end);
+ s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end);
// give other branch a copy of this match vector
- s1 = make([]int, 2*(re.nbra+1));
- copy(s1, match);
- s = re.addState(s, inst.next(), s1, pos, end);
+ s = a.addState(s, inst.next(), a.copy(match), pos, end);
}
return s;
}
-func noMatch(nbra int) []int {
- match := make([]int, 2*(nbra+1));
- for i := range match {
- match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
- }
- return match;
-}
-
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
- var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
+ var s [2][]state;
s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0];
in, out := 0, 1;
@@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
pos += advance + len(re.prefix);
}
+ arena := &matchArena{nil, 2 * (re.nbra + 1)};
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
- match := noMatch(re.nbra);
- match[0] = pos;
- s[out] = re.addState(s[out], re.start.next(), match, pos, end);
+ match := arena.noMatch();
+ match.m[0] = pos;
+ s[out] = arena.addState(s[out], re.start.next(), match, pos, end);
+ arena.free(match); // if addState saved it, ref was incremented
}
in, out = out, in; // old out state is new in state
- s[out] = s[out][0:0]; // clear out state
+ // clear out old state
+ old := s[out];
+ for _, state := range old {
+ arena.free(state.match)
+ }
+ s[out] = old[0:0]; // truncate state vector
if found && len(s[in]) == 0 {
// machine has completed
break
@@ -791,26 +845,25 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
}
pos += charwidth;
- for i := 0; i < len(s[in]); i++ {
- st := s[in][i];
- switch s[in][i].inst.kind() {
+ for _, st := range s[in] {
+ switch st.inst.kind() {
case _BOT:
case _EOT:
case _CHAR:
if c == st.inst.(*_Char).char {
- s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) {
- s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _ANY:
if c != endOfFile {
- s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _NOTNL:
if c != endOfFile && c != '\n' {
- s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end)
+ s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
}
case _BRA:
case _EBRA:
@@ -818,10 +871,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
case _END:
// choose leftmost longest
if !found || // first
- st.match[0] < final.match[0] || // leftmost
- (st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest
+ st.match.m[0] < final.match.m[0] || // leftmost
+ (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
+ if final.match != nil {
+ arena.free(final.match)
+ }
final = st;
- final.match[1] = pos - charwidth;
+ final.match.ref++;
+ final.match.m[1] = pos - charwidth;
}
found = true;
default:
@@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
}
}
}
+ if final.match == nil {
+ return nil
+ }
// if match found, back up start of match by width of prefix.
- if re.prefix != "" && len(final.match) > 0 {
- final.match[0] -= len(re.prefix)
+ if re.prefix != "" && len(final.match.m) > 0 {
+ final.match.m[0] -= len(re.prefix)
}
- return final.match;
+ return final.match.m;
}