diff options
Diffstat (limited to 'src/pkg/testing/regexp.go')
-rw-r--r-- | src/pkg/testing/regexp.go | 288 |
1 files changed, 171 insertions, 117 deletions
diff --git a/src/pkg/testing/regexp.go b/src/pkg/testing/regexp.go index e5b5eac4f..7e6539e9e 100644 --- a/src/pkg/testing/regexp.go +++ b/src/pkg/testing/regexp.go @@ -29,28 +29,28 @@ import ( "utf8"; ) -var debug = false; +var debug = false // Error codes returned by failures to parse an expression. var ( - ErrInternal = "internal error"; - ErrUnmatchedLpar = "unmatched ''"; - ErrUnmatchedRpar = "unmatched ''"; - ErrUnmatchedLbkt = "unmatched '['"; - ErrUnmatchedRbkt = "unmatched ']'"; - ErrBadRange = "bad range in character class"; - ErrExtraneousBackslash = "extraneous backslash"; - ErrBadClosure = "repeated closure **, ++, etc."; - ErrBareClosure = "closure applies to nothing"; - ErrBadBackslash = "illegal backslash escape"; + ErrInternal = "internal error"; + ErrUnmatchedLpar = "unmatched ''"; + ErrUnmatchedRpar = "unmatched ''"; + ErrUnmatchedLbkt = "unmatched '['"; + ErrUnmatchedRbkt = "unmatched ']'"; + ErrBadRange = "bad range in character class"; + ErrExtraneousBackslash = "extraneous backslash"; + ErrBadClosure = "repeated closure **, ++, etc."; + ErrBareClosure = "closure applies to nothing"; + ErrBadBackslash = "illegal backslash escape"; ) // An instruction executed by the NFA type instr interface { - kind() int; // the type of this instruction: _CHAR, _ANY, etc. - next() instr; // the instruction to execute after this one + kind() int; // the type of this instruction: _CHAR, _ANY, etc. + next() instr; // the instruction to execute after this one setNext(i instr); - index() int; + index() int; setIndex(i int); print(); } @@ -61,69 +61,93 @@ type common struct { _index int; } -func (c *common) next() instr { return c._next } -func (c *common) setNext(i instr) { c._next = i } -func (c *common) index() int { return c._index } -func (c *common) setIndex(i int) { c._index = i } +func (c *common) next() instr { + return c._next; +} +func (c *common) setNext(i instr) { + c._next = i; +} +func (c *common) index() int { + return c._index; +} +func (c *common) setIndex(i int) { + c._index = i; +} // The representation of a compiled regular expression. // The public interface is entirely through methods. type Regexp struct { - expr string; // the original expression + expr string; // the original expression ch chan<- *Regexp; // reply channel when we're done - error string; // compile- or run-time error; nil if OK + error string; // compile- or run-time error; nil if OK inst []instr; start instr; nbra int; // number of brackets in expression, for subexpressions } const ( - _START // beginning of program - = iota; + _START = // beginning of program + iota; _END; // end of program: success _BOT; // '^' beginning of text _EOT; // '$' end of text - _CHAR; // 'a' regular character + _CHAR; // 'a' regular character _CHARCLASS; // [a-z] character class _ANY; // '.' any character including newline _NOTNL; // [^\n] special case: any character but newline _BRA; // '(' parenthesized expression - _EBRA; // ')'; end of '(' parenthesized expression + _EBRA; // ')'; end of '(' parenthesized expression _ALT; // '|' alternation _NOP; // do nothing; makes it easy to link without patching ) // --- START start of program type _Start struct { - common + common; } -func (start *_Start) kind() int { return _START } -func (start *_Start) print() { print("start") } +func (start *_Start) kind() int { + return _START; +} +func (start *_Start) print() { + print("start"); +} // --- END end of program type _End struct { - common + common; } -func (end *_End) kind() int { return _END } -func (end *_End) print() { print("end") } +func (end *_End) kind() int { + return _END; +} +func (end *_End) print() { + print("end"); +} // --- BOT beginning of text type _Bot struct { - common + common; } -func (bot *_Bot) kind() int { return _BOT } -func (bot *_Bot) print() { print("bot") } +func (bot *_Bot) kind() int { + return _BOT; +} +func (bot *_Bot) print() { + print("bot"); +} // --- EOT end of text type _Eot struct { - common + common; } -func (eot *_Eot) kind() int { return _EOT } -func (eot *_Eot) print() { print("eot") } +func (eot *_Eot) kind() int { + return _EOT; +} +func (eot *_Eot) print() { + print("eot"); +} // --- CHAR a regular character type _Char struct { @@ -131,8 +155,12 @@ type _Char struct { char int; } -func (char *_Char) kind() int { return _CHAR } -func (char *_Char) print() { print("char ", string(char.char)) } +func (char *_Char) kind() int { + return _CHAR; +} +func (char *_Char) print() { + print("char ", string(char.char)); +} func newChar(char int) *_Char { c := new(_Char); @@ -150,7 +178,9 @@ type _CharClass struct { ranges []int; } -func (cclass *_CharClass) kind() int { return _CHARCLASS } +func (cclass *_CharClass) kind() int { + return _CHARCLASS; +} func (cclass *_CharClass) print() { print("charclass"); @@ -174,11 +204,11 @@ func (cclass *_CharClass) addRange(a, b int) { if n >= cap(cclass.ranges) { nr := make([]int, n, 2*n); for i, j := range nr { - nr[i] = j + nr[i] = j; } cclass.ranges = nr; } - cclass.ranges = cclass.ranges[0:n+2]; + cclass.ranges = cclass.ranges[0 : n+2]; cclass.ranges[n] = a; n++; cclass.ranges[n] = b; @@ -190,10 +220,10 @@ func (cclass *_CharClass) matches(c int) bool { min := cclass.ranges[i]; max := cclass.ranges[i+1]; if min <= c && c <= max { - return !cclass.negate + return !cclass.negate; } } - return cclass.negate + return cclass.negate; } func newCharClass() *_CharClass { @@ -204,19 +234,27 @@ func newCharClass() *_CharClass { // --- ANY any character type _Any struct { - common + common; } -func (any *_Any) kind() int { return _ANY } -func (any *_Any) print() { print("any") } +func (any *_Any) kind() int { + return _ANY; +} +func (any *_Any) print() { + print("any"); +} // --- NOTNL any character but newline type _NotNl struct { - common + common; } -func (notnl *_NotNl) kind() int { return _NOTNL } -func (notnl *_NotNl) print() { print("notnl") } +func (notnl *_NotNl) kind() int { + return _NOTNL; +} +func (notnl *_NotNl) print() { + print("notnl"); +} // --- BRA parenthesized expression type _Bra struct { @@ -224,8 +262,12 @@ type _Bra struct { n int; // subexpression number } -func (bra *_Bra) kind() int { return _BRA } -func (bra *_Bra) print() { print("bra", bra.n); } +func (bra *_Bra) kind() int { + return _BRA; +} +func (bra *_Bra) print() { + print("bra", bra.n); +} // --- EBRA end of parenthesized expression type _Ebra struct { @@ -233,8 +275,12 @@ type _Ebra struct { n int; // subexpression number } -func (ebra *_Ebra) kind() int { return _EBRA } -func (ebra *_Ebra) print() { print("ebra ", ebra.n); } +func (ebra *_Ebra) kind() int { + return _EBRA; +} +func (ebra *_Ebra) print() { + print("ebra ", ebra.n); +} // --- ALT alternation type _Alt struct { @@ -242,16 +288,24 @@ type _Alt struct { left instr; // other branch } -func (alt *_Alt) kind() int { return _ALT } -func (alt *_Alt) print() { print("alt(", alt.left.index(), ")"); } +func (alt *_Alt) kind() int { + return _ALT; +} +func (alt *_Alt) print() { + print("alt(", alt.left.index(), ")"); +} // --- NOP no operation type _Nop struct { - common + common; } -func (nop *_Nop) kind() int { return _NOP } -func (nop *_Nop) print() { print("nop") } +func (nop *_Nop) kind() int { + return _NOP; +} +func (nop *_Nop) print() { + print("nop"); +} // report error and exit compiling/executing goroutine func (re *Regexp) setError(err string) { @@ -266,11 +320,11 @@ func (re *Regexp) add(i instr) instr { if n >= cap(re.inst) { ni := make([]instr, n, 2*n); for i, j := range re.inst { - ni[i] = j + ni[i] = j; } re.inst = ni; } - re.inst = re.inst[0:n+1]; + re.inst = re.inst[0 : n+1]; re.inst[n] = i; return i; } @@ -290,9 +344,9 @@ func (p *parser) c() int { func (p *parser) nextc() int { if p.pos >= len(p.re.expr) { - p.ch = endOfFile + p.ch = endOfFile; } else { - c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:len(p.re.expr)]); + c, w := utf8.DecodeRuneInString(p.re.expr[p.pos : len(p.re.expr)]); p.ch = c; p.pos += w; } @@ -312,20 +366,20 @@ func special(c int) bool { s := `\.+*?()|[]^$`; for i := 0; i < len(s); i++ { if c == int(s[i]) { - return true + return true; } } - return false + return false; } func specialcclass(c int) bool { s := `\-[]`; for i := 0; i < len(s); i++ { if c == int(s[i]) { - return true + return true; } } - return false + return false; } func (p *parser) charClass() instr { @@ -360,7 +414,7 @@ func (p *parser) charClass() instr { case c == 'n': c = '\n'; case specialcclass(c): - // c is as delivered + // c is as delivered default: p.re.setError(ErrBadBackslash); } @@ -383,7 +437,7 @@ func (p *parser) charClass() instr { } } } - return iNULL + return iNULL; } func (p *parser) term() (start, end instr) { @@ -438,9 +492,9 @@ func (p *parser) term() (start, end instr) { ebra.n = nbra; if start == iNULL { if end == iNULL { - p.re.setError(ErrInternal) + p.re.setError(ErrInternal); } - start = ebra + start = ebra; } else { end.setNext(ebra); } @@ -454,7 +508,7 @@ func (p *parser) term() (start, end instr) { case c == 'n': c = '\n'; case special(c): - // c is as delivered + // c is as delivered default: p.re.setError(ErrBadBackslash); } @@ -463,7 +517,7 @@ func (p *parser) term() (start, end instr) { p.nextc(); start = newChar(c); p.re.add(start); - return start, start + return start, start; } panic("unreachable"); } @@ -471,7 +525,7 @@ func (p *parser) term() (start, end instr) { func (p *parser) closure() (start, end instr) { start, end = p.term(); if start == iNULL { - return + return; } switch p.c() { case '*': @@ -480,7 +534,7 @@ func (p *parser) closure() (start, end instr) { p.re.add(alt); end.setNext(alt); // after end, do alt alt.left = start; // alternate brach: return to start - start = alt; // alt becomes new (start, end) + start = alt; // alt becomes new (start, end) end = alt; case '+': // (start,end)+: @@ -488,7 +542,7 @@ func (p *parser) closure() (start, end instr) { p.re.add(alt); end.setNext(alt); // after end, do alt alt.left = start; // alternate brach: return to start - end = alt; // start is unchanged; end is alt + end = alt; // start is unchanged; end is alt case '?': // (start,end)?: alt := new(_Alt); @@ -498,16 +552,16 @@ func (p *parser) closure() (start, end instr) { alt.left = start; // alternate branch is start alt.setNext(nop); // follow on to nop end.setNext(nop); // after end, go to nop - start = alt; // start is now alt - end = nop; // end is nop pointed to by both branches + start = alt; // start is now alt + end = nop; // end is nop pointed to by both branches default: - return + return; } switch p.nextc() { case '*', '+', '?': p.re.setError(ErrBadClosure); } - return + return; } func (p *parser) concatenation() (start, end instr) { @@ -556,16 +610,16 @@ func (p *parser) regexp() (start, end instr) { func unNop(i instr) instr { for i.kind() == _NOP { - i = i.next() + i = i.next(); } - return i + return i; } func (re *Regexp) eliminateNops() { for i := 0; i < len(re.inst); i++ { inst := re.inst[i]; if inst.kind() == _END { - continue + continue; } inst.setNext(unNop(inst.next())); if inst.kind() == _ALT { @@ -581,7 +635,7 @@ func (re *Regexp) dump() { print(inst.index(), ": "); inst.print(); if inst.kind() != _END { - print(" -> ", inst.next().index()) + print(" -> ", inst.next().index()); } print("\n"); } @@ -626,7 +680,7 @@ func CompileRegexp(str string) (regexp *Regexp, error string) { ch := make(chan *Regexp); go compiler(str, ch); re := <-ch; - return re, re.error + return re, re.error; } type state struct { @@ -643,10 +697,10 @@ func addState(s []state, inst instr, match []int) []state { // TODO: Once the state is a vector and we can do insert, have inputs always // go in order correctly and this "earlier" test is never necessary, for i := 0; i < l; i++ { - if s[i].inst.index() == index && // same instruction - s[i].match[0] < pos { // earlier match already going; lefmost wins - return s - } + if s[i].inst.index() == index && // same instruction + s[i].match[0] < pos { // earlier match already going; lefmost wins + return s; + } } if l == cap(s) { s1 := make([]state, 2*l)[0:l]; @@ -655,7 +709,7 @@ func addState(s []state, inst instr, match []int) []state { } s = s1; } - s = s[0:l+1]; + s = s[0 : l+1]; s[l].inst = inst; s[l].match = match; return s; @@ -672,16 +726,16 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { found := false; end := len(str); if bytes != nil { - end = len(bytes) + end = len(bytes); } for pos <= end { if !found { // prime the pump if we haven't seen a match yet - match := make([]int, 2*(re.nbra+1)); + match := make([]int, 2*(re.nbra + 1)); for i := 0; i < len(match); i++ { match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac" } - match[0] = pos; + match[0] = pos; s[out] = addState(s[out], re.start.next(), match); } in, out = out, in; // old out state is new in state @@ -704,27 +758,27 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { switch s[in][i].inst.kind() { case _BOT: if pos == 0 { - s[in] = addState(s[in], st.inst.next(), st.match) + s[in] = addState(s[in], st.inst.next(), st.match); } case _EOT: if pos == end { - s[in] = addState(s[in], st.inst.next(), st.match) + s[in] = addState(s[in], st.inst.next(), st.match); } case _CHAR: if c == st.inst.(*_Char).char { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = addState(s[out], st.inst.next(), st.match); } case _CHARCLASS: if st.inst.(*_CharClass).matches(c) { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = addState(s[out], st.inst.next(), st.match); } case _ANY: if c != endOfFile { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = addState(s[out], st.inst.next(), st.match); } case _NOTNL: if c != endOfFile && c != '\n' { - s[out] = addState(s[out], st.inst.next(), st.match) + s[out] = addState(s[out], st.inst.next(), st.match); } case _BRA: n := st.inst.(*_Bra).n; @@ -732,21 +786,21 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { s[in] = addState(s[in], st.inst.next(), st.match); case _EBRA: n := st.inst.(*_Ebra).n; - st.match[2*n+1] = pos; + st.match[2*n + 1] = pos; s[in] = addState(s[in], st.inst.next(), st.match); case _ALT: s[in] = addState(s[in], st.inst.(*_Alt).left, st.match); // give other branch a copy of this match vector - s1 := make([]int, 2*(re.nbra+1)); + s1 := make([]int, 2*(re.nbra + 1)); for i := 0; i < len(s1); i++ { - s1[i] = st.match[i] + s1[i] = st.match[i]; } s[in] = addState(s[in], st.inst.next(), s1); case _END: // choose leftmost longest if !found || // first - st.match[0] < final.match[0] || // leftmost - (st.match[0] == final.match[0] && pos > final.match[1]) { // longest + st.match[0] < final.match[0] || // leftmost + (st.match[0] == final.match[0] && pos > final.match[1]) { // longest final = st; final.match[1] = pos; } @@ -770,7 +824,7 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { // A negative value means the subexpression did not match any element of the string. // An empty array means "no match". func (re *Regexp) ExecuteString(s string) (a []int) { - return re.doExecute(s, nil, 0) + return re.doExecute(s, nil, 0); } @@ -782,21 +836,21 @@ func (re *Regexp) ExecuteString(s string) (a []int) { // A negative value means the subexpression did not match any element of the slice. // An empty array means "no match". func (re *Regexp) Execute(b []byte) (a []int) { - return re.doExecute("", b, 0) + return re.doExecute("", b, 0); } // MatchString returns whether the Regexp matches the string s. // The return value is a boolean: true for match, false for no match. func (re *Regexp) MatchString(s string) bool { - return len(re.doExecute(s, nil, 0)) > 0 + return len(re.doExecute(s, nil, 0)) > 0; } // Match returns whether the Regexp matches the byte slice b. // The return value is a boolean: true for match, false for no match. func (re *Regexp) Match(b []byte) bool { - return len(re.doExecute("", b, 0)) > 0 + return len(re.doExecute("", b, 0)) > 0; } @@ -808,15 +862,15 @@ func (re *Regexp) Match(b []byte) bool { func (re *Regexp) MatchStrings(s string) (a []string) { r := re.doExecute(s, nil, 0); if r == nil { - return nil + return nil; } a = make([]string, len(r)/2); for i := 0; i < len(r); i += 2 { if r[i] != -1 { // -1 means no match for this subexpression - a[i/2] = s[r[i] : r[i+1]] + a[i/2] = s[r[i]:r[i+1]]; } } - return + return; } // MatchSlices matches the Regexp against the byte slice b. @@ -827,15 +881,15 @@ func (re *Regexp) MatchStrings(s string) (a []string) { func (re *Regexp) MatchSlices(b []byte) (a [][]byte) { r := re.doExecute("", b, 0); if r == nil { - return nil + return nil; } a = make([][]byte, len(r)/2); for i := 0; i < len(r); i += 2 { if r[i] != -1 { // -1 means no match for this subexpression - a[i/2] = b[r[i] : r[i+1]] + a[i/2] = b[r[i]:r[i+1]]; } } - return + return; } // MatchString checks whether a textual regular expression @@ -844,9 +898,9 @@ func (re *Regexp) MatchSlices(b []byte) (a [][]byte) { func MatchString(pattern string, s string) (matched bool, error string) { re, err := CompileRegexp(pattern); if err != "" { - return false, err + return false, err; } - return re.MatchString(s), "" + return re.MatchString(s), ""; } // Match checks whether a textual regular expression @@ -855,7 +909,7 @@ func MatchString(pattern string, s string) (matched bool, error string) { func Match(pattern string, b []byte) (matched bool, error string) { re, err := CompileRegexp(pattern); if err != "" { - return false, err + return false, err; } - return re.Match(b), "" + return re.Match(b), ""; } |