diff options
Diffstat (limited to 'src/pkg/regexp/regexp.go')
| -rw-r--r-- | src/pkg/regexp/regexp.go | 296 | 
1 files changed, 223 insertions, 73 deletions
| diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index d274ccdf5..e3221ac9d 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -54,6 +54,16 @@  // text of the match/submatch.  If an index is negative, it means that  // subexpression did not match any string in the input.  // +// There is also a subset of the methods that can be applied to text read +// from a RuneReader: +// +//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex +// +// This set may grow.  Note that regular expression matches may need to +// examine text beyond the text returned by a match, so the methods that +// match text from a RuneReader may read arbitrarily far into the input +// before returning. +//  // (There are a few other methods that do not match this pattern.)  //  package regexp @@ -231,13 +241,13 @@ func (p *parser) error(err Error) {  	panic(err)  } -const endOfFile = -1 +const endOfText = -1  func (p *parser) c() int { return p.ch }  func (p *parser) nextc() int {  	if p.pos >= len(p.re.expr) { -		p.ch = endOfFile +		p.ch = endOfText  	} else {  		c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])  		p.ch = c @@ -288,7 +298,7 @@ func (p *parser) checkBackslash() int {  	if c == '\\' {  		c = p.nextc()  		switch { -		case c == endOfFile: +		case c == endOfText:  			p.error(ErrExtraneousBackslash)  		case ispunct(c):  			// c is as delivered @@ -311,7 +321,7 @@ func (p *parser) charClass() *instr {  	left := -1  	for {  		switch c := p.c(); c { -		case ']', endOfFile: +		case ']', endOfText:  			if left >= 0 {  				p.error(ErrBadRange)  			} @@ -356,7 +366,7 @@ func (p *parser) charClass() *instr {  func (p *parser) term() (start, end *instr) {  	switch c := p.c(); c { -	case '|', endOfFile: +	case '|', endOfText:  		return nil, nil  	case '*', '+', '?':  		p.error(ErrBareClosure) @@ -638,8 +648,11 @@ func (re *Regexp) NumSubexp() int { return re.nbra }  // match vectors away as we execute.  Matches are ref counted and returned  // to a free list when no longer active.  Increases a simple benchmark by 22X.  type matchArena struct { -	head *matchVec -	len  int // length of match vector +	head  *matchVec +	len   int // length of match vector +	pos   int +	atBOT bool // whether we're at beginning of text +	atEOT bool // whether we're at end of text  }  type matchVec struct { @@ -699,21 +712,21 @@ type state struct {  // Append new state to to-do list.  Leftmost-longest wins so avoid  // adding a state that's already active.  The matchVec will be inc-ref'ed  // if it is assigned to a state. -func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state { +func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {  	switch inst.kind {  	case iBOT: -		if pos == 0 { -			s = a.addState(s, inst.next, prefixed, match, pos, end) +		if a.atBOT { +			s = a.addState(s, inst.next, prefixed, match)  		}  		return s  	case iEOT: -		if pos == end { -			s = a.addState(s, inst.next, prefixed, match, pos, end) +		if a.atEOT { +			s = a.addState(s, inst.next, prefixed, match)  		}  		return s  	case iBra: -		match.m[inst.braNum] = pos -		s = a.addState(s, inst.next, prefixed, match, pos, end) +		match.m[inst.braNum] = a.pos +		s = a.addState(s, inst.next, prefixed, match)  		return s  	}  	l := len(s) @@ -727,62 +740,157 @@ func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matc  	s = append(s, state{inst, prefixed, match})  	match.ref++  	if inst.kind == iAlt { -		s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end) +		s = a.addState(s, inst.left, prefixed, a.copy(match))  		// give other branch a copy of this match vector -		s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end) +		s = a.addState(s, inst.next, prefixed, a.copy(match))  	}  	return s  } -// Accepts either string or bytes - the logic is identical either way. -// If bytes == nil, scan str. -func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { +// input abstracts different representations of the input text. It provides +// one-character lookahead. +type input interface { +	step(pos int) (rune int, width int) // advance one rune +	canCheckPrefix() bool               // can we look ahead without losing info? +	hasPrefix(re *Regexp) bool +	index(re *Regexp, pos int) int +} + +// inputString scans a string. +type inputString struct { +	str string +} + +func newInputString(str string) *inputString { +	return &inputString{str: str} +} + +func (i *inputString) step(pos int) (int, int) { +	if pos < len(i.str) { +		return utf8.DecodeRuneInString(i.str[pos:len(i.str)]) +	} +	return endOfText, 0 +} + +func (i *inputString) canCheckPrefix() bool { +	return true +} + +func (i *inputString) hasPrefix(re *Regexp) bool { +	return strings.HasPrefix(i.str, re.prefix) +} + +func (i *inputString) index(re *Regexp, pos int) int { +	return strings.Index(i.str[pos:], re.prefix) +} + +// inputBytes scans a byte slice. +type inputBytes struct { +	str []byte +} + +func newInputBytes(str []byte) *inputBytes { +	return &inputBytes{str: str} +} + +func (i *inputBytes) step(pos int) (int, int) { +	if pos < len(i.str) { +		return utf8.DecodeRune(i.str[pos:len(i.str)]) +	} +	return endOfText, 0 +} + +func (i *inputBytes) canCheckPrefix() bool { +	return true +} + +func (i *inputBytes) hasPrefix(re *Regexp) bool { +	return bytes.HasPrefix(i.str, re.prefixBytes) +} + +func (i *inputBytes) index(re *Regexp, pos int) int { +	return bytes.Index(i.str[pos:], re.prefixBytes) +} + +// inputReader scans a RuneReader. +type inputReader struct { +	r     io.RuneReader +	atEOT bool +	pos   int +} + +func newInputReader(r io.RuneReader) *inputReader { +	return &inputReader{r: r} +} + +func (i *inputReader) step(pos int) (int, int) { +	if !i.atEOT && pos != i.pos { +		return endOfText, 0 + +	} +	r, w, err := i.r.ReadRune() +	if err != nil { +		i.atEOT = true +		return endOfText, 0 +	} +	i.pos += w +	return r, w +} + +func (i *inputReader) canCheckPrefix() bool { +	return false +} + +func (i *inputReader) hasPrefix(re *Regexp) bool { +	return false +} + +func (i *inputReader) index(re *Regexp, pos int) int { +	return -1 +} + +// Search match starting from pos bytes into the input. +func (re *Regexp) doExecute(i input, pos int) []int {  	var s [2][]state  	s[0] = make([]state, 0, 10)  	s[1] = make([]state, 0, 10)  	in, out := 0, 1  	var final state  	found := false -	end := len(str) -	if bytestr != nil { -		end = len(bytestr) -	}  	anchored := re.inst[0].next.kind == iBOT  	if anchored && pos > 0 {  		return nil  	}  	// fast check for initial plain substring -	if re.prefix != "" { +	if i.canCheckPrefix() && re.prefix != "" {  		advance := 0  		if anchored { -			if bytestr == nil { -				if !strings.HasPrefix(str, re.prefix) { -					return nil -				} -			} else { -				if !bytes.HasPrefix(bytestr, re.prefixBytes) { -					return nil -				} +			if !i.hasPrefix(re) { +				return nil  			}  		} else { -			if bytestr == nil { -				advance = strings.Index(str[pos:], re.prefix) -			} else { -				advance = bytes.Index(bytestr[pos:], re.prefixBytes) +			advance = i.index(re, pos) +			if advance == -1 { +				return nil  			}  		} -		if advance == -1 { -			return nil -		}  		pos += advance  	} -	arena := &matchArena{nil, 2 * (re.nbra + 1)} -	for startPos := pos; pos <= end; { +	// We look one character ahead so we can match $, which checks whether +	// we are at EOT. +	nextChar, nextWidth := i.step(pos) +	arena := &matchArena{ +		len:   2 * (re.nbra + 1), +		pos:   pos, +		atBOT: pos == 0, +		atEOT: nextChar == endOfText, +	} +	for c, startPos := 0, pos; c != endOfText; {  		if !found && (pos == startPos || !anchored) {  			// prime the pump if we haven't seen a match yet  			match := arena.noMatch()  			match.m[0] = pos -			s[out] = arena.addState(s[out], re.start.next, false, match, pos, end) +			s[out] = arena.addState(s[out], re.start.next, false, match)  			arena.free(match) // if addState saved it, ref was incremented  		} else if len(s[out]) == 0 {  			// machine has completed @@ -795,35 +903,32 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  			arena.free(state.match)  		}  		s[out] = old[0:0] // truncate state vector -		charwidth := 1 -		c := endOfFile -		if pos < end { -			if bytestr == nil { -				c, charwidth = utf8.DecodeRuneInString(str[pos:end]) -			} else { -				c, charwidth = utf8.DecodeRune(bytestr[pos:end]) -			} -		} -		pos += charwidth +		c = nextChar +		thisPos := pos +		pos += nextWidth +		nextChar, nextWidth = i.step(pos) +		arena.atEOT = nextChar == endOfText +		arena.atBOT = false +		arena.pos = pos  		for _, st := range s[in] {  			switch st.inst.kind {  			case iBOT:  			case iEOT:  			case iChar:  				if c == st.inst.char { -					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)  				}  			case iCharClass:  				if st.inst.cclass.matches(c) { -					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)  				}  			case iAny: -				if c != endOfFile { -					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) +				if c != endOfText { +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)  				}  			case iNotNL: -				if c != endOfFile && c != '\n' { -					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) +				if c != endOfText && c != '\n' { +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)  				}  			case iBra:  			case iAlt: @@ -831,13 +936,13 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  				// choose leftmost longest  				if !found || // first  					st.match.m[0] < final.match.m[0] || // leftmost -					(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest +					(st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest  					if final.match != nil {  						arena.free(final.match)  					}  					final = st  					final.match.ref++ -					final.match.m[1] = pos - charwidth +					final.match.m[1] = thisPos  				}  				found = true  			default: @@ -874,14 +979,31 @@ func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {  	return string(c[:i]), true  } +// MatchReader returns whether the Regexp matches the text read by the +// RuneReader.  The return value is a boolean: true for match, false for no +// match. +func (re *Regexp) MatchReader(r io.RuneReader) bool { +	return len(re.doExecute(newInputReader(r), 0)) > 0 +} +  // MatchString returns whether the Regexp matches the string s.  // The return value is a boolean: true for match, false for no match. -func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } +func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }  // Match returns whether the Regexp matches the byte slice b.  // The return value is a boolean: true for match, false for no match. -func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 } +func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 } +// MatchReader checks whether a textual regular expression matches the text +// read by the RuneReader.  More complicated queries need to use Compile and +// the full Regexp interface. +func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) { +	re, err := Compile(pattern) +	if err != nil { +		return false, err +	} +	return re.MatchReader(r), nil +}  // MatchString checks whether a textual regular expression  // matches a string.  More complicated queries need @@ -921,7 +1043,7 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str  	searchPos := 0    // position where we next look for a match  	buf := new(bytes.Buffer)  	for searchPos <= len(src) { -		a := re.doExecute(src, nil, searchPos) +		a := re.doExecute(newInputString(src), searchPos)  		if len(a) == 0 {  			break // no more matches  		} @@ -973,7 +1095,7 @@ func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {  	searchPos := 0    // position where we next look for a match  	buf := new(bytes.Buffer)  	for searchPos <= len(src) { -		a := re.doExecute("", src, searchPos) +		a := re.doExecute(newInputBytes(src), searchPos)  		if len(a) == 0 {  			break // no more matches  		} @@ -1038,7 +1160,13 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {  	}  	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { -		matches := re.doExecute(s, b, pos) +		var in input +		if b == nil { +			in = newInputString(s) +		} else { +			in = newInputBytes(b) +		} +		matches := re.doExecute(in, pos)  		if len(matches) == 0 {  			break  		} @@ -1052,6 +1180,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {  				accept = false  			}  			var width int +			// TODO: use step()  			if b == nil {  				_, width = utf8.DecodeRuneInString(s[pos:end])  			} else { @@ -1077,7 +1206,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {  // Find returns a slice holding the text of the leftmost match in b of the regular expression.  // A return value of nil indicates no match.  func (re *Regexp) Find(b []byte) []byte { -	a := re.doExecute("", b, 0) +	a := re.doExecute(newInputBytes(b), 0)  	if a == nil {  		return nil  	} @@ -1089,7 +1218,7 @@ func (re *Regexp) Find(b []byte) []byte {  // b[loc[0]:loc[1]].  // A return value of nil indicates no match.  func (re *Regexp) FindIndex(b []byte) (loc []int) { -	a := re.doExecute("", b, 0) +	a := re.doExecute(newInputBytes(b), 0)  	if a == nil {  		return nil  	} @@ -1102,7 +1231,7 @@ func (re *Regexp) FindIndex(b []byte) (loc []int) {  // an empty string.  Use FindStringIndex or FindStringSubmatch if it is  // necessary to distinguish these cases.  func (re *Regexp) FindString(s string) string { -	a := re.doExecute(s, nil, 0) +	a := re.doExecute(newInputString(s), 0)  	if a == nil {  		return ""  	} @@ -1114,7 +1243,19 @@ func (re *Regexp) FindString(s string) string {  // itself is at s[loc[0]:loc[1]].  // A return value of nil indicates no match.  func (re *Regexp) FindStringIndex(s string) []int { -	a := re.doExecute(s, nil, 0) +	a := re.doExecute(newInputString(s), 0) +	if a == nil { +		return nil +	} +	return a[0:2] +} + +// FindReaderIndex returns a two-element slice of integers defining the +// location of the leftmost match of the regular expression in text read from +// the RuneReader.  The match itself is at s[loc[0]:loc[1]].  A return +// value of nil indicates no match. +func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { +	a := re.doExecute(newInputReader(r), 0)  	if a == nil {  		return nil  	} @@ -1127,7 +1268,7 @@ func (re *Regexp) FindStringIndex(s string) []int {  // comment.  // A return value of nil indicates no match.  func (re *Regexp) FindSubmatch(b []byte) [][]byte { -	a := re.doExecute("", b, 0) +	a := re.doExecute(newInputBytes(b), 0)  	if a == nil {  		return nil  	} @@ -1146,7 +1287,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {  // in the package comment.  // A return value of nil indicates no match.  func (re *Regexp) FindSubmatchIndex(b []byte) []int { -	return re.doExecute("", b, 0) +	return re.doExecute(newInputBytes(b), 0)  }  // FindStringSubmatch returns a slice of strings holding the text of the @@ -1155,7 +1296,7 @@ func (re *Regexp) FindSubmatchIndex(b []byte) []int {  // package comment.  // A return value of nil indicates no match.  func (re *Regexp) FindStringSubmatch(s string) []string { -	a := re.doExecute(s, nil, 0) +	a := re.doExecute(newInputString(s), 0)  	if a == nil {  		return nil  	} @@ -1174,7 +1315,16 @@ func (re *Regexp) FindStringSubmatch(s string) []string {  // 'Index' descriptions in the package comment.  // A return value of nil indicates no match.  func (re *Regexp) FindStringSubmatchIndex(s string) []int { -	return re.doExecute(s, nil, 0) +	return re.doExecute(newInputString(s), 0) +} + +// FindReaderSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression of text read by +// the RuneReader, and the matches, if any, of its subexpressions, as defined +// by the 'Submatch' and 'Index' descriptions in the package comment.  A +// return value of nil indicates no match. +func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { +	return re.doExecute(newInputReader(r), 0)  }  const startSize = 10 // The size at which to start a slice in the 'All' routines. | 
