diff options
author | Rob Pike <r@golang.org> | 2009-11-19 23:12:01 -0800 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2009-11-19 23:12:01 -0800 |
commit | 7e51eb4a6b371c20486adfa507cfd164ef0c5c0c (patch) | |
tree | f36ce42b4b36a76bb2cf8a7bbd2d6ca9bbad9e27 /src/pkg/regexp/regexp.go | |
parent | 35fad56a1ec09f32c18c3315479ddb277bb112a9 (diff) | |
download | golang-7e51eb4a6b371c20486adfa507cfd164ef0c5c0c.tar.gz |
add a match arena to regexp to avoid generating garbage.
simple regexps run 20x faster.
the regex-dna benchmark goes 3x faster.
R=rsc
CC=golang-dev
http://codereview.appspot.com/156108
Diffstat (limited to 'src/pkg/regexp/regexp.go')
-rw-r--r-- | src/pkg/regexp/regexp.go | 152 |
1 files changed, 106 insertions, 46 deletions
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 4a2e70dea..a58fbf44f 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -626,7 +626,8 @@ func (re *Regexp) doParse() os.Error { return p.error; } -// return regular text at the beginning of str +// Extract regular text from the beginning of the pattern. +// That text can be used by doExecute to speed up matching. func (re *Regexp) setPrefix() { var b []byte; var utf = make([]byte, utf8.UTFMax); @@ -673,44 +674,101 @@ func MustCompile(str string) *Regexp { return regexp; } +// The match arena allows us to reduce the garbage generated by tossing +// match vectors away as we execute. Matches are ref counted and returned +// to a free list when no longer active. Increases a simple benchmark by 22X. +type matchArena struct { + head *matchVec; + len int; // length of match vector +} + +type matchVec struct { + m []int; // pairs of bracketing submatches. 0th is start,end + ref int; + next *matchVec; +} + +func (a *matchArena) new() *matchVec { + if a.head == nil { + const N = 10; + block := make([]matchVec, N); + for i := 0; i < N; i++ { + b := &block[i]; + b.next = a.head; + a.head = b; + } + } + m := a.head; + a.head = m.next; + m.ref = 0; + if m.m == nil { + m.m = make([]int, a.len) + } + return m; +} + +func (a *matchArena) free(m *matchVec) { + m.ref--; + if m.ref == 0 { + m.next = a.head; + a.head = m; + } +} + +func (a *matchArena) copy(m *matchVec) *matchVec { + m1 := a.new(); + copy(m1.m, m.m); + return m1; +} + +func (a *matchArena) noMatch() *matchVec { + m := a.new(); + for i := range m.m { + m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac" + } + m.ref = 1; + return m; +} + type state struct { inst instr; // next instruction to execute - match []int; // pairs of bracketing submatches. 0th is start,end + match *matchVec; } // Append new state to to-do list. Leftmost-longest wins so avoid -// adding a state that's already active. -func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []state { +// adding a state that's already active. The matchVec will be inc-ref'ed +// if it is assigned to a state. +func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state { switch inst.kind() { case _BOT: if pos == 0 { - s = re.addState(s, inst.next(), match, pos, end) + s = a.addState(s, inst.next(), match, pos, end) } return s; case _EOT: if pos == end { - s = re.addState(s, inst.next(), match, pos, end) + s = a.addState(s, inst.next(), match, pos, end) } return s; case _BRA: n := inst.(*_Bra).n; - match[2*n] = pos; - s = re.addState(s, inst.next(), match, pos, end); + match.m[2*n] = pos; + s = a.addState(s, inst.next(), match, pos, end); return s; case _EBRA: n := inst.(*_Ebra).n; - match[2*n+1] = pos; - s = re.addState(s, inst.next(), match, pos, end); + match.m[2*n+1] = pos; + s = a.addState(s, inst.next(), match, pos, end); return s; } index := inst.index(); l := len(s); - begin := match[0]; - // TODO: Once the state is a vector and we can do insert, have inputs always - // go in order correctly and this "earlier" test is never necessary, + begin := match.m[0]; + // TODO: If the state were a vector and we could do insert, have inputs always + // go in order correctly and this "earlier" test is not necessary, for i := 0; i < l; i++ { if s[i].inst.index() == index && // same instruction - s[i].match[0] <= begin { // earlier match already going; lefmost wins + s[i].match.m[0] <= begin { // earlier match already going; lefmost wins return s } } @@ -722,30 +780,19 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s s = s[0 : l+1]; s[l].inst = inst; s[l].match = match; + match.ref++; if inst.kind() == _ALT { - s1 := make([]int, 2*(re.nbra+1)); - copy(s1, match); - s = re.addState(s, inst.(*_Alt).left, s1, pos, end); + s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end); // give other branch a copy of this match vector - s1 = make([]int, 2*(re.nbra+1)); - copy(s1, match); - s = re.addState(s, inst.next(), s1, pos, end); + s = a.addState(s, inst.next(), a.copy(match), pos, end); } return s; } -func noMatch(nbra int) []int { - match := make([]int, 2*(nbra+1)); - for i := range match { - match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac" - } - return match; -} - // Accepts either string or bytes - the logic is identical either way. // If bytes == nil, scan str. func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { - var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements + var s [2][]state; s[0] = make([]state, 10)[0:0]; s[1] = make([]state, 10)[0:0]; in, out := 0, 1; @@ -768,15 +815,22 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { } pos += advance + len(re.prefix); } + arena := &matchArena{nil, 2 * (re.nbra + 1)}; for pos <= end { if !found { // prime the pump if we haven't seen a match yet - match := noMatch(re.nbra); - match[0] = pos; - s[out] = re.addState(s[out], re.start.next(), match, pos, end); + match := arena.noMatch(); + match.m[0] = pos; + s[out] = arena.addState(s[out], re.start.next(), match, pos, end); + arena.free(match); // if addState saved it, ref was incremented } in, out = out, in; // old out state is new in state - s[out] = s[out][0:0]; // clear out state + // clear out old state + old := s[out]; + for _, state := range old { + arena.free(state.match) + } + s[out] = old[0:0]; // truncate state vector if found && len(s[in]) == 0 { // machine has completed break @@ -791,26 +845,25 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { } } pos += charwidth; - for i := 0; i < len(s[in]); i++ { - st := s[in][i]; - switch s[in][i].inst.kind() { + for _, st := range s[in] { + switch st.inst.kind() { case _BOT: case _EOT: case _CHAR: if c == st.inst.(*_Char).char { - s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end) } case _CHARCLASS: if st.inst.(*_CharClass).matches(c) { - s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end) } case _ANY: if c != endOfFile { - s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end) } case _NOTNL: if c != endOfFile && c != '\n' { - s[out] = re.addState(s[out], st.inst.next(), st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end) } case _BRA: case _EBRA: @@ -818,10 +871,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { case _END: // choose leftmost longest if !found || // first - st.match[0] < final.match[0] || // leftmost - (st.match[0] == final.match[0] && pos-charwidth > final.match[1]) { // longest + st.match.m[0] < final.match.m[0] || // leftmost + (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest + if final.match != nil { + arena.free(final.match) + } final = st; - final.match[1] = pos - charwidth; + final.match.ref++; + final.match.m[1] = pos - charwidth; } found = true; default: @@ -830,11 +887,14 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { } } } + if final.match == nil { + return nil + } // if match found, back up start of match by width of prefix. - if re.prefix != "" && len(final.match) > 0 { - final.match[0] -= len(re.prefix) + if re.prefix != "" && len(final.match.m) > 0 { + final.match.m[0] -= len(re.prefix) } - return final.match; + return final.match.m; } |