summaryrefslogtreecommitdiff
path: root/src/pkg/regexp
diff options
context:
space:
mode:
authorRob Pike <r@golang.org>2009-11-19 16:45:50 -0800
committerRob Pike <r@golang.org>2009-11-19 16:45:50 -0800
commit2ccc02b444962f25e2aa195bf1490ea052cb9fdf (patch)
tree5434a9f690a194d749fd0bdf68e50774a0842687 /src/pkg/regexp
parentc73996254551b131642465145fcc4c8cad3a0b44 (diff)
downloadgolang-2ccc02b444962f25e2aa195bf1490ea052cb9fdf.tar.gz
two easy optimizations for regexp:
1) if char class contains a single character, make it a single character. (this is used to quote, e.g. [.] rather than \. 2) if regexp begins with ordinary text substring, use plain string match to start engine R=rsc CC=golang-dev http://codereview.appspot.com/157095
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r--src/pkg/regexp/all_test.go5
-rw-r--r--src/pkg/regexp/regexp.go91
2 files changed, 83 insertions, 13 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index a9f23d70c..522324871 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -60,6 +60,8 @@ type tester struct {
}
var matches = []tester{
+ tester{`a+`, "baaab", vec{1, 4}},
+ tester{"abcd..", "abcdef", vec{0, 6}},
tester{``, "", vec{0, 0}},
tester{`a`, "a", vec{0, 1}},
tester{`x`, "y", vec{}},
@@ -78,6 +80,8 @@ var matches = []tester{
tester{`[a\-\]z]+`, "az]-bcz", vec{0, 4}},
tester{`[^\n]+`, "abcd\n", vec{0, 4}},
tester{`[日本語]+`, "日本語日本語", vec{0, 18}},
+ tester{`日本語+`, "日本語", vec{0, 9}},
+ tester{`日本語+`, "日本語語語語", vec{0, 18}},
tester{`()`, "", vec{0, 0, 0, 0}},
tester{`(a)`, "a", vec{0, 1, 0, 1}},
tester{`(.)(.)`, "日a", vec{0, 4, 0, 3, 3, 4}},
@@ -89,6 +93,7 @@ var matches = []tester{
tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}},
tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}},
tester{`(.*).*`, "ab", vec{0, 2, 0, 2}},
+ tester{`[.]`, ".", vec{0, 1}},
}
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index b70fa9479..4a2e70dea 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -27,6 +27,7 @@ import (
"container/vector";
"io";
"os";
+ "strings";
"utf8";
)
@@ -70,10 +71,12 @@ func (c *common) setIndex(i int) { c._index = i }
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
type Regexp struct {
- expr string; // the original expression
- inst *vector.Vector;
- start instr;
- nbra int; // number of brackets in expression, for subexpressions
+ expr string; // the original expression
+ prefix string; // initial plain text string
+ prefixBytes []byte; // initial plain text bytes
+ inst *vector.Vector;
+ start instr;
+ nbra int; // number of brackets in expression, for subexpressions
}
const (
@@ -315,6 +318,12 @@ func (p *parser) charClass() instr {
p.re.add(nl);
return nl;
}
+ // Special common case: "[a]" -> "a"
+ if !cc.negate && cc.ranges.Len() == 2 && cc.ranges.At(0) == cc.ranges.At(1) {
+ c := newChar(cc.ranges.At(0));
+ p.re.add(c);
+ return c;
+ }
p.re.add(cc);
return cc;
case '-': // do this before backslash processing
@@ -573,6 +582,7 @@ func (re *Regexp) eliminateNops() {
}
func (re *Regexp) dump() {
+ print("prefix <", re.prefix, ">\n");
for i := 0; i < re.inst.Len(); i++ {
inst := re.inst.At(i).(instr);
print(inst.index(), ": ");
@@ -606,9 +616,42 @@ func (re *Regexp) doParse() os.Error {
re.dump();
println();
}
+ if p.error == nil {
+ re.setPrefix();
+ if debug {
+ re.dump();
+ println();
+ }
+ }
return p.error;
}
+// return regular text at the beginning of str
+func (re *Regexp) setPrefix() {
+ var b []byte;
+ var utf = make([]byte, utf8.UTFMax);
+ // First instruction is start; skip that.
+ i := re.inst.At(0).(instr).next().index();
+ for i < re.inst.Len() {
+ inst := re.inst.At(i).(instr);
+ // stop if this is not a char
+ if inst.kind() != _CHAR {
+ break
+ }
+ // stop if this char starts a closure; liberal but easy test: is an ALT next?
+ if re.inst.At(inst.next().index()).(instr).kind() == _ALT {
+ break
+ }
+ n := utf8.EncodeRune(inst.(*_Char).char, utf);
+ b = bytes.Add(b, utf[0:n]);
+ i = inst.next().index();
+ }
+ // point start instruction to first non-CHAR
+ re.inst.At(0).(instr).setNext(re.inst.At(i).(instr));
+ re.prefixBytes = b;
+ re.prefix = string(b);
+}
+
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
func Compile(str string) (regexp *Regexp, error os.Error) {
@@ -691,9 +734,17 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
return s;
}
+func noMatch(nbra int) []int {
+ match := make([]int, 2*(nbra+1));
+ for i := range match {
+ match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
+ }
+ return match;
+}
+
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
-func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
+func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0];
@@ -701,16 +752,26 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
var final state;
found := false;
end := len(str);
- if bytes != nil {
- end = len(bytes)
+ if bytestr != nil {
+ end = len(bytestr)
+ }
+ // fast check for initial plain substring
+ if re.prefix != "" {
+ var advance int;
+ if bytestr == nil {
+ advance = strings.Index(str[pos:len(str)], re.prefix)
+ } else {
+ advance = bytes.Index(bytestr[pos:len(bytestr)], re.prefixBytes)
+ }
+ if advance == -1 {
+ return []int{}
+ }
+ pos += advance + len(re.prefix);
}
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
- match := make([]int, 2*(re.nbra+1));
- for i := 0; i < len(match); i++ {
- match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
- }
+ match := noMatch(re.nbra);
match[0] = pos;
s[out] = re.addState(s[out], re.start.next(), match, pos, end);
}
@@ -723,10 +784,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
charwidth := 1;
c := endOfFile;
if pos < end {
- if bytes == nil {
+ if bytestr == nil {
c, charwidth = utf8.DecodeRuneInString(str[pos:end])
} else {
- c, charwidth = utf8.DecodeRune(bytes[pos:end])
+ c, charwidth = utf8.DecodeRune(bytestr[pos:end])
}
}
pos += charwidth;
@@ -769,6 +830,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
}
}
}
+ // if match found, back up start of match by width of prefix.
+ if re.prefix != "" && len(final.match) > 0 {
+ final.match[0] -= len(re.prefix)
+ }
return final.match;
}