diff options
| author | Kyle Consalus <consalus@gmail.com> | 2010-06-02 23:04:44 -0700 | 
|---|---|---|
| committer | Kyle Consalus <consalus@gmail.com> | 2010-06-02 23:04:44 -0700 | 
| commit | 782ef98ee931b10e8454928455f93cf5859ac361 (patch) | |
| tree | f60b39fd63f570b1ad794fdcea8d6ac315820c64 | |
| parent | 164ca008324c94078c26d32252bf314b5c933936 (diff) | |
| download | golang-782ef98ee931b10e8454928455f93cf5859ac361.tar.gz | |
	Optimization to regexp _CharClass: keep track of overall range of
    charclass to avoid unnecessarily iterating over ranges.
    Also, use the fact that IntVector is an []int to avoid method calls.
    On my machine, this brings us from ~27500 ns/op to ~17500 ns/op in the benchmark I've added (it is also faster in the case where a range check
    doesn't help, added a benchmark for this too.)
    I'd also like to propose that "[]", and "[^]" be disallowed. They aren't useful as far as I can tell, they aren't widely supported, and they make reasoning about character classes a bit more complicated.
R=r
CC=golang-dev
http://codereview.appspot.com/1495041
Committer: Rob Pike <r@golang.org>
| -rw-r--r-- | src/pkg/regexp/all_test.go | 36 | ||||
| -rw-r--r-- | src/pkg/regexp/regexp.go | 22 | 
2 files changed, 48 insertions, 10 deletions
| diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 62dad3aa0..fd7ee2acb 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -519,7 +519,7 @@ var numSubexpCases = []numSubexpCase{  func TestNumSubexp(t *testing.T) {  	for _, c := range numSubexpCases { -		re, _ := Compile(c.input) +		re := MustCompile(c.input)  		n := re.NumSubexp()  		if n != c.expected {  			t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) @@ -530,7 +530,7 @@ func TestNumSubexp(t *testing.T) {  func BenchmarkLiteral(b *testing.B) {  	x := strings.Repeat("x", 50)  	b.StopTimer() -	re, _ := Compile(x) +	re := MustCompile(x)  	b.StartTimer()  	for i := 0; i < b.N; i++ {  		if !re.MatchString(x) { @@ -543,7 +543,35 @@ func BenchmarkLiteral(b *testing.B) {  func BenchmarkNotLiteral(b *testing.B) {  	x := strings.Repeat("x", 49)  	b.StopTimer() -	re, _ := Compile("^" + x) +	re := MustCompile("^" + x) +	b.StartTimer() +	for i := 0; i < b.N; i++ { +		if !re.MatchString(x) { +			println("no match!") +			break +		} +	} +} + +func BenchmarkMatchClass(b *testing.B) { +	b.StopTimer() +	x := strings.Repeat("xxxx", 20) + "w" +	re := MustCompile("[abcdw]") +	b.StartTimer() +	for i := 0; i < b.N; i++ { +		if !re.MatchString(x) { +			println("no match!") +			break +		} +	} +} + +func BenchmarkMatchClass_InRange(b *testing.B) { +	b.StopTimer() +	// 'b' is betwen 'a' and 'c', so the charclass +	// range checking is no help here. +	x := strings.Repeat("bbbb", 20) + "c" +	re := MustCompile("[ac]")  	b.StartTimer()  	for i := 0; i < b.N; i++ {  		if !re.MatchString(x) { @@ -556,7 +584,7 @@ func BenchmarkNotLiteral(b *testing.B) {  func BenchmarkReplaceAll(b *testing.B) {  	x := "abcdefghijklmnopqrstuvwxyz"  	b.StopTimer() -	re, _ := Compile("[cjrw]") +	re := MustCompile("[cjrw]")  	b.StartTimer()  	for i := 0; i < b.N; i++ {  		re.ReplaceAllString(x, "") diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index cdd5cacdd..edf91531d 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -152,10 +152,10 @@ func newChar(char int) *_Char {  type _CharClass struct {  	common -	char   int  	negate bool // is character class negated? ([^a-z])  	// vector of int, stored pairwise: [a-z] is (a,z); x is (x,x): -	ranges *vector.IntVector +	ranges     *vector.IntVector +	cmin, cmax int  }  func (cclass *_CharClass) kind() int { return _CHARCLASS } @@ -180,13 +180,21 @@ func (cclass *_CharClass) addRange(a, b int) {  	// range is a through b inclusive  	cclass.ranges.Push(a)  	cclass.ranges.Push(b) +	if a < cclass.cmin { +		cclass.cmin = a +	} +	if b > cclass.cmax { +		cclass.cmax = b +	}  }  func (cclass *_CharClass) matches(c int) bool { -	for i := 0; i < cclass.ranges.Len(); i = i + 2 { -		min := cclass.ranges.At(i) -		max := cclass.ranges.At(i + 1) -		if min <= c && c <= max { +	if c < cclass.cmin || c > cclass.cmax { +		return cclass.negate +	} +	ranges := []int(*cclass.ranges) +	for i := 0; i < len(ranges); i = i + 2 { +		if ranges[i] <= c && c <= ranges[i+1] {  			return !cclass.negate  		}  	} @@ -196,6 +204,8 @@ func (cclass *_CharClass) matches(c int) bool {  func newCharClass() *_CharClass {  	c := new(_CharClass)  	c.ranges = new(vector.IntVector) +	c.cmin = 0x10FFFF + 1 // MaxRune + 1 +	c.cmax = -1  	return c  } | 
