diff options
Diffstat (limited to 'src/pkg/regexp/regexp.go')
| -rw-r--r-- | src/pkg/regexp/regexp.go | 950 | 
1 files changed, 530 insertions, 420 deletions
| diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 4dd430ea6..2e03b798a 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -16,14 +16,50 @@  //		'$'  //		'.'  //		character -//		'[' [ '^' ] character-ranges ']' +//		'[' [ '^' ] { character-range } ']'  //		'(' regexp ')' +//	character-range: +//		character [ '-' character ] +// +// All characters are UTF-8-encoded code points.  Backslashes escape special +// characters, including inside character classes.  The standard Go character +// escapes are also recognized: \a \b \f \n \r \t \v. +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text.  Their names are matched by this regular expression: +// +//	Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression.  Empty matches abutting a preceding +// match are ignored.  The return value is a slice containing the successive +// return values of the corresponding non-'All' routine.  These routines take +// an extra integer argument, n; if n >= 0, the function returns at most n +// matches/submatches. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression.  Submatches are matches of +// parenthesized subexpressions within the regular expression, numbered from +// left to right in order of opening parenthesis.  Submatch 0 is the match of +// the entire expression, submatch 1 the match of the first parenthesized +// subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+1] identifies the indexes of +// the nth submatch.  The pair for n==0 identifies the match of the entire +// expression.  If 'Index' is not present, the match is identified by the +// text of the match/submatch.  If an index is negative, it means that +// subexpression did not match any string in the input. +// +// (There are a few other methods that do not match this pattern.)  //  package regexp  import (  	"bytes" -	"container/vector"  	"io"  	"os"  	"strings" @@ -53,121 +89,90 @@ var (  	ErrBadBackslash        = Error("illegal backslash escape")  ) -// An instruction executed by the NFA -type instr interface { -	kind() int   // the type of this instruction: _CHAR, _ANY, etc. -	next() instr // the instruction to execute after this one -	setNext(i instr) -	index() int -	setIndex(i int) -	print() -} +const ( +	iStart     = iota // beginning of program +	iEnd              // end of program: success +	iBOT              // '^' beginning of text +	iEOT              // '$' end of text +	iChar             // 'a' regular character +	iCharClass        // [a-z] character class +	iAny              // '.' any character including newline +	iNotNL            // [^\n] special case: any character but newline +	iBra              // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right +	iAlt              // '|' alternation +	iNop              // do nothing; makes it easy to link without patching +) -// Fields and methods common to all instructions -type common struct { -	_next  instr -	_index int +// An instruction executed by the NFA +type instr struct { +	kind  int    // the type of this instruction: iChar, iAny, etc. +	index int    // used only in debugging; could be eliminated +	next  *instr // the instruction to execute after this one +	// Special fields valid only for some items. +	char   int        // iChar +	braNum int        // iBra, iEbra +	cclass *charClass // iCharClass +	left   *instr     // iAlt, other branch +} + +func (i *instr) print() { +	switch i.kind { +	case iStart: +		print("start") +	case iEnd: +		print("end") +	case iBOT: +		print("bot") +	case iEOT: +		print("eot") +	case iChar: +		print("char ", string(i.char)) +	case iCharClass: +		i.cclass.print() +	case iAny: +		print("any") +	case iNotNL: +		print("notnl") +	case iBra: +		if i.braNum&1 == 0 { +			print("bra", i.braNum/2) +		} else { +			print("ebra", i.braNum/2) +		} +	case iAlt: +		print("alt(", i.left.index, ")") +	case iNop: +		print("nop") +	}  } -func (c *common) next() instr     { return c._next } -func (c *common) setNext(i instr) { c._next = i } -func (c *common) index() int      { return c._index } -func (c *common) setIndex(i int)  { c._index = i } -  // Regexp is the representation of a compiled regular expression.  // The public interface is entirely through methods.  type Regexp struct {  	expr        string // the original expression  	prefix      string // initial plain text string  	prefixBytes []byte // initial plain text bytes -	inst        *vector.Vector -	start       instr // first instruction of machine -	prefixStart instr // where to start if there is a prefix -	nbra        int   // number of brackets in expression, for subexpressions -} - -const ( -	_START     = iota // beginning of program -	_END              // end of program: success -	_BOT              // '^' beginning of text -	_EOT              // '$' end of text -	_CHAR             // 'a' regular character -	_CHARCLASS        // [a-z] character class -	_ANY              // '.' any character including newline -	_NOTNL            // [^\n] special case: any character but newline -	_BRA              // '(' parenthesized expression -	_EBRA             // ')'; end of '(' parenthesized expression -	_ALT              // '|' alternation -	_NOP              // do nothing; makes it easy to link without patching -) - -// --- START start of program -type _Start struct { -	common +	inst        []*instr +	start       *instr // first instruction of machine +	prefixStart *instr // where to start if there is a prefix +	nbra        int    // number of brackets in expression, for subexpressions  } -func (start *_Start) kind() int { return _START } -func (start *_Start) print()    { print("start") } - -// --- END end of program -type _End struct { -	common -} - -func (end *_End) kind() int { return _END } -func (end *_End) print()    { print("end") } - -// --- BOT beginning of text -type _Bot struct { -	common -} - -func (bot *_Bot) kind() int { return _BOT } -func (bot *_Bot) print()    { print("bot") } - -// --- EOT end of text -type _Eot struct { -	common -} - -func (eot *_Eot) kind() int { return _EOT } -func (eot *_Eot) print()    { print("eot") } - -// --- CHAR a regular character -type _Char struct { -	common -	char int -} - -func (char *_Char) kind() int { return _CHAR } -func (char *_Char) print()    { print("char ", string(char.char)) } - -func newChar(char int) *_Char { -	c := new(_Char) -	c.char = char -	return c -} - -// --- CHARCLASS [a-z] - -type _CharClass struct { -	common +type charClass struct {  	negate bool // is character class negated? ([^a-z]) -	// vector of int, stored pairwise: [a-z] is (a,z); x is (x,x): -	ranges     *vector.IntVector +	// slice of int, stored pairwise: [a-z] is (a,z); x is (x,x): +	ranges     []int  	cmin, cmax int  } -func (cclass *_CharClass) kind() int { return _CHARCLASS } - -func (cclass *_CharClass) print() { +func (cclass *charClass) print() {  	print("charclass")  	if cclass.negate {  		print(" (negated)")  	} -	for i := 0; i < cclass.ranges.Len(); i += 2 { -		l := cclass.ranges.At(i) -		r := cclass.ranges.At(i + 1) +	for i := 0; i < len(cclass.ranges); i += 2 { +		l := cclass.ranges[i] +		r := cclass.ranges[i+1]  		if l == r {  			print(" [", string(l), "]")  		} else { @@ -176,10 +181,9 @@ func (cclass *_CharClass) print() {  	}  } -func (cclass *_CharClass) addRange(a, b int) { +func (cclass *charClass) addRange(a, b int) {  	// range is a through b inclusive -	cclass.ranges.Push(a) -	cclass.ranges.Push(b) +	cclass.ranges = append(cclass.ranges, a, b)  	if a < cclass.cmin {  		cclass.cmin = a  	} @@ -188,11 +192,11 @@ func (cclass *_CharClass) addRange(a, b int) {  	}  } -func (cclass *_CharClass) matches(c int) bool { +func (cclass *charClass) matches(c int) bool {  	if c < cclass.cmin || c > cclass.cmax {  		return cclass.negate  	} -	ranges := []int(*cclass.ranges) +	ranges := cclass.ranges  	for i := 0; i < len(ranges); i = i + 2 {  		if ranges[i] <= c && c <= ranges[i+1] {  			return !cclass.negate @@ -201,68 +205,18 @@ func (cclass *_CharClass) matches(c int) bool {  	return cclass.negate  } -func newCharClass() *_CharClass { -	c := new(_CharClass) -	c.ranges = new(vector.IntVector) -	c.cmin = 0x10FFFF + 1 // MaxRune + 1 -	c.cmax = -1 -	return c -} - -// --- ANY any character -type _Any struct { -	common -} - -func (any *_Any) kind() int { return _ANY } -func (any *_Any) print()    { print("any") } - -// --- NOTNL any character but newline -type _NotNl struct { -	common -} - -func (notnl *_NotNl) kind() int { return _NOTNL } -func (notnl *_NotNl) print()    { print("notnl") } - -// --- BRA parenthesized expression -type _Bra struct { -	common -	n int // subexpression number -} - -func (bra *_Bra) kind() int { return _BRA } -func (bra *_Bra) print()    { print("bra", bra.n) } - -// --- EBRA end of parenthesized expression -type _Ebra struct { -	common -	n int // subexpression number -} - -func (ebra *_Ebra) kind() int { return _EBRA } -func (ebra *_Ebra) print()    { print("ebra ", ebra.n) } - -// --- ALT alternation -type _Alt struct { -	common -	left instr // other branch -} - -func (alt *_Alt) kind() int { return _ALT } -func (alt *_Alt) print()    { print("alt(", alt.left.index(), ")") } - -// --- NOP no operation -type _Nop struct { -	common +func newCharClass() *instr { +	i := &instr{kind: iCharClass} +	i.cclass = new(charClass) +	i.cclass.ranges = make([]int, 0, 4) +	i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1 +	i.cclass.cmax = -1 +	return i  } -func (nop *_Nop) kind() int { return _NOP } -func (nop *_Nop) print()    { print("nop") } - -func (re *Regexp) add(i instr) instr { -	i.setIndex(re.inst.Len()) -	re.inst.Push(i) +func (re *Regexp) add(i *instr) *instr { +	i.index = len(re.inst) +	re.inst = append(re.inst, i)  	return i  } @@ -317,8 +271,21 @@ func ispunct(c int) bool {  	return false  } -func (p *parser) charClass() instr { -	cc := newCharClass() +var escapes = []byte("abfnrtv") +var escaped = []byte("\a\b\f\n\r\t\v") + +func escape(c int) int { +	for i, b := range escapes { +		if int(b) == c { +			return i +		} +	} +	return -1 +} + +func (p *parser) charClass() *instr { +	i := newCharClass() +	cc := i.cclass  	if p.c() == '^' {  		cc.negate = true  		p.nextc() @@ -331,20 +298,20 @@ func (p *parser) charClass() instr {  				p.error(ErrBadRange)  			}  			// Is it [^\n]? -			if cc.negate && cc.ranges.Len() == 2 && -				cc.ranges.At(0) == '\n' && cc.ranges.At(1) == '\n' { -				nl := new(_NotNl) +			if cc.negate && len(cc.ranges) == 2 && +				cc.ranges[0] == '\n' && cc.ranges[1] == '\n' { +				nl := &instr{kind: iNotNL}  				p.re.add(nl)  				return nl  			}  			// Special common case: "[a]" -> "a" -			if !cc.negate && cc.ranges.Len() == 2 && cc.ranges.At(0) == cc.ranges.At(1) { -				c := newChar(cc.ranges.At(0)) +			if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] { +				c := &instr{kind: iChar, char: cc.ranges[0]}  				p.re.add(c)  				return c  			} -			p.re.add(cc) -			return cc +			p.re.add(i) +			return i  		case '-': // do this before backslash processing  			p.error(ErrBadRange)  		case '\\': @@ -352,10 +319,10 @@ func (p *parser) charClass() instr {  			switch {  			case c == endOfFile:  				p.error(ErrExtraneousBackslash) -			case c == 'n': -				c = '\n'  			case ispunct(c):  				// c is as delivered +			case escape(c) >= 0: +				c = int(escaped[escape(c)])  			default:  				p.error(ErrBadBackslash)  			} @@ -381,7 +348,7 @@ func (p *parser) charClass() instr {  	return nil  } -func (p *parser) term() (start, end instr) { +func (p *parser) term() (start, end *instr) {  	switch c := p.c(); c {  	case '|', endOfFile:  		return nil, nil @@ -396,15 +363,15 @@ func (p *parser) term() (start, end instr) {  		p.error(ErrUnmatchedRbkt)  	case '^':  		p.nextc() -		start = p.re.add(new(_Bot)) +		start = p.re.add(&instr{kind: iBOT})  		return start, start  	case '$':  		p.nextc() -		start = p.re.add(new(_Eot)) +		start = p.re.add(&instr{kind: iEOT})  		return start, start  	case '.':  		p.nextc() -		start = p.re.add(new(_Any)) +		start = p.re.add(&instr{kind: iAny})  		return start, start  	case '[':  		p.nextc() @@ -425,12 +392,10 @@ func (p *parser) term() (start, end instr) {  		}  		p.nlpar--  		p.nextc() -		bra := new(_Bra) +		bra := &instr{kind: iBra, braNum: 2 * nbra}  		p.re.add(bra) -		ebra := new(_Ebra) +		ebra := &instr{kind: iBra, braNum: 2*nbra + 1}  		p.re.add(ebra) -		bra.n = nbra -		ebra.n = nbra  		if start == nil {  			if end == nil {  				p.error(ErrInternal) @@ -438,33 +403,33 @@ func (p *parser) term() (start, end instr) {  			}  			start = ebra  		} else { -			end.setNext(ebra) +			end.next = ebra  		} -		bra.setNext(start) +		bra.next = start  		return bra, ebra  	case '\\':  		c = p.nextc()  		switch {  		case c == endOfFile:  			p.error(ErrExtraneousBackslash) -		case c == 'n': -			c = '\n'  		case ispunct(c):  			// c is as delivered +		case escape(c) >= 0: +			c = int(escaped[escape(c)])  		default:  			p.error(ErrBadBackslash)  		}  		fallthrough  	default:  		p.nextc() -		start = newChar(c) +		start = &instr{kind: iChar, char: c}  		p.re.add(start)  		return start, start  	}  	panic("unreachable")  } -func (p *parser) closure() (start, end instr) { +func (p *parser) closure() (start, end *instr) {  	start, end = p.term()  	if start == nil {  		return @@ -472,28 +437,28 @@ func (p *parser) closure() (start, end instr) {  	switch p.c() {  	case '*':  		// (start,end)*: -		alt := new(_Alt) +		alt := &instr{kind: iAlt}  		p.re.add(alt) -		end.setNext(alt) // after end, do alt +		end.next = alt   // after end, do alt  		alt.left = start // alternate brach: return to start  		start = alt      // alt becomes new (start, end)  		end = alt  	case '+':  		// (start,end)+: -		alt := new(_Alt) +		alt := &instr{kind: iAlt}  		p.re.add(alt) -		end.setNext(alt) // after end, do alt +		end.next = alt   // after end, do alt  		alt.left = start // alternate brach: return to start  		end = alt        // start is unchanged; end is alt  	case '?':  		// (start,end)?: -		alt := new(_Alt) +		alt := &instr{kind: iAlt}  		p.re.add(alt) -		nop := new(_Nop) +		nop := &instr{kind: iNop}  		p.re.add(nop)  		alt.left = start // alternate branch is start -		alt.setNext(nop) // follow on to nop -		end.setNext(nop) // after end, go to nop +		alt.next = nop   // follow on to nop +		end.next = nop   // after end, go to nop  		start = alt      // start is now alt  		end = nop        // end is nop pointed to by both branches  	default: @@ -506,27 +471,27 @@ func (p *parser) closure() (start, end instr) {  	return  } -func (p *parser) concatenation() (start, end instr) { +func (p *parser) concatenation() (start, end *instr) {  	for {  		nstart, nend := p.closure()  		switch {  		case nstart == nil: // end of this concatenation  			if start == nil { // this is the empty string -				nop := p.re.add(new(_Nop)) +				nop := p.re.add(&instr{kind: iNop})  				return nop, nop  			}  			return  		case start == nil: // this is first element of concatenation  			start, end = nstart, nend  		default: -			end.setNext(nstart) +			end.next = nstart  			end = nend  		}  	}  	panic("unreachable")  } -func (p *parser) regexp() (start, end instr) { +func (p *parser) regexp() (start, end *instr) {  	start, end = p.concatenation()  	for {  		switch p.c() { @@ -535,49 +500,46 @@ func (p *parser) regexp() (start, end instr) {  		case '|':  			p.nextc()  			nstart, nend := p.concatenation() -			alt := new(_Alt) +			alt := &instr{kind: iAlt}  			p.re.add(alt)  			alt.left = start -			alt.setNext(nstart) -			nop := new(_Nop) +			alt.next = nstart +			nop := &instr{kind: iNop}  			p.re.add(nop) -			end.setNext(nop) -			nend.setNext(nop) +			end.next = nop +			nend.next = nop  			start, end = alt, nop  		}  	}  	panic("unreachable")  } -func unNop(i instr) instr { -	for i.kind() == _NOP { -		i = i.next() +func unNop(i *instr) *instr { +	for i.kind == iNop { +		i = i.next  	}  	return i  }  func (re *Regexp) eliminateNops() { -	for i := 0; i < re.inst.Len(); i++ { -		inst := re.inst.At(i).(instr) -		if inst.kind() == _END { +	for _, inst := range re.inst { +		if inst.kind == iEnd {  			continue  		} -		inst.setNext(unNop(inst.next())) -		if inst.kind() == _ALT { -			alt := inst.(*_Alt) -			alt.left = unNop(alt.left) +		inst.next = unNop(inst.next) +		if inst.kind == iAlt { +			inst.left = unNop(inst.left)  		}  	}  }  func (re *Regexp) dump() {  	print("prefix <", re.prefix, ">\n") -	for i := 0; i < re.inst.Len(); i++ { -		inst := re.inst.At(i).(instr) -		print(inst.index(), ": ") +	for _, inst := range re.inst { +		print(inst.index, ": ")  		inst.print() -		if inst.kind() != _END { -			print(" -> ", inst.next().index()) +		if inst.kind != iEnd { +			print(" -> ", inst.next.index)  		}  		print("\n")  	} @@ -585,12 +547,12 @@ func (re *Regexp) dump() {  func (re *Regexp) doParse() {  	p := newParser(re) -	start := new(_Start) +	start := &instr{kind: iStart}  	re.add(start)  	s, e := p.regexp() -	start.setNext(s) +	start.next = s  	re.start = start -	e.setNext(re.add(new(_End))) +	e.next = re.add(&instr{kind: iEnd})  	if debug {  		re.dump() @@ -609,36 +571,44 @@ func (re *Regexp) doParse() {  	}  } -// Extract regular text from the beginning of the pattern. +// Extract regular text from the beginning of the pattern, +// possibly after a leading iBOT.  // That text can be used by doExecute to speed up matching.  func (re *Regexp) setPrefix() {  	var b []byte  	var utf = make([]byte, utf8.UTFMax) -	// First instruction is start; skip that. -	i := re.inst.At(0).(instr).next().index() +	var inst *instr +	// First instruction is start; skip that.  Also skip any initial iBOT. +	inst = re.inst[0].next +	for inst.kind == iBOT { +		inst = inst.next +	}  Loop: -	for i < re.inst.Len() { -		inst := re.inst.At(i).(instr) +	for ; inst.kind != iEnd; inst = inst.next {  		// stop if this is not a char -		if inst.kind() != _CHAR { +		if inst.kind != iChar {  			break  		}  		// stop if this char can be followed by a match for an empty string,  		// which includes closures, ^, and $. -		switch re.inst.At(inst.next().index()).(instr).kind() { -		case _BOT, _EOT, _ALT: +		switch inst.next.kind { +		case iBOT, iEOT, iAlt:  			break Loop  		} -		n := utf8.EncodeRune(inst.(*_Char).char, utf) -		b = bytes.Add(b, utf[0:n]) -		i = inst.next().index() +		n := utf8.EncodeRune(utf, inst.char) +		b = append(b, utf[0:n]...)  	}  	// point prefixStart instruction to first non-CHAR after prefix -	re.prefixStart = re.inst.At(i).(instr) +	re.prefixStart = inst  	re.prefixBytes = b  	re.prefix = string(b)  } +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { +	return re.expr +} +  // Compile parses a regular expression and returns, if successful, a Regexp  // object that can be used to match against text.  func Compile(str string) (regexp *Regexp, error os.Error) { @@ -651,7 +621,7 @@ func Compile(str string) (regexp *Regexp, error os.Error) {  		}  	}()  	regexp.expr = str -	regexp.inst = new(vector.Vector) +	regexp.inst = make([]*instr, 0, 10)  	regexp.doParse()  	return  } @@ -727,60 +697,45 @@ func (a *matchArena) noMatch() *matchVec {  }  type state struct { -	inst     instr // next instruction to execute -	prefixed bool  // this match began with a fixed prefix +	inst     *instr // next instruction to execute +	prefixed bool   // this match began with a fixed prefix  	match    *matchVec  }  // Append new state to to-do list.  Leftmost-longest wins so avoid  // adding a state that's already active.  The matchVec will be inc-ref'ed  // if it is assigned to a state. -func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *matchVec, pos, end int) []state { -	switch inst.kind() { -	case _BOT: +func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state { +	switch inst.kind { +	case iBOT:  		if pos == 0 { -			s = a.addState(s, inst.next(), prefixed, match, pos, end) +			s = a.addState(s, inst.next, prefixed, match, pos, end)  		}  		return s -	case _EOT: +	case iEOT:  		if pos == end { -			s = a.addState(s, inst.next(), prefixed, match, pos, end) +			s = a.addState(s, inst.next, prefixed, match, pos, end)  		}  		return s -	case _BRA: -		n := inst.(*_Bra).n -		match.m[2*n] = pos -		s = a.addState(s, inst.next(), prefixed, match, pos, end) -		return s -	case _EBRA: -		n := inst.(*_Ebra).n -		match.m[2*n+1] = pos -		s = a.addState(s, inst.next(), prefixed, match, pos, end) +	case iBra: +		match.m[inst.braNum] = pos +		s = a.addState(s, inst.next, prefixed, match, pos, end)  		return s  	} -	index := inst.index()  	l := len(s)  	// States are inserted in order so it's sufficient to see if we have the same  	// instruction; no need to see if existing match is earlier (it is).  	for i := 0; i < l; i++ { -		if s[i].inst.index() == index { +		if s[i].inst == inst {  			return s  		}  	} -	if l == cap(s) { -		s1 := make([]state, 2*l)[0:l] -		copy(s1, s) -		s = s1 -	} -	s = s[0 : l+1] -	s[l].inst = inst -	s[l].prefixed = prefixed -	s[l].match = match +	s = append(s, state{inst, prefixed, match})  	match.ref++ -	if inst.kind() == _ALT { -		s = a.addState(s, inst.(*_Alt).left, prefixed, a.copy(match), pos, end) +	if inst.kind == iAlt { +		s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end)  		// give other branch a copy of this match vector -		s = a.addState(s, inst.next(), prefixed, a.copy(match), pos, end) +		s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end)  	}  	return s  } @@ -789,8 +744,8 @@ func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *match  // If bytes == nil, scan str.  func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  	var s [2][]state -	s[0] = make([]state, 10)[0:0] -	s[1] = make([]state, 10)[0:0] +	s[0] = make([]state, 0, 10) +	s[1] = make([]state, 0, 10)  	in, out := 0, 1  	var final state  	found := false @@ -798,34 +753,46 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  	if bytestr != nil {  		end = len(bytestr)  	} +	anchored := re.inst[0].next.kind == iBOT +	if anchored && pos > 0 { +		return nil +	}  	// fast check for initial plain substring -	prefixed := false // has this iteration begun by skipping a prefix?  	if re.prefix != "" { -		var advance int -		if bytestr == nil { -			advance = strings.Index(str[pos:], re.prefix) +		advance := 0 +		if anchored { +			if bytestr == nil { +				if !strings.HasPrefix(str, re.prefix) { +					return nil +				} +			} else { +				if !bytes.HasPrefix(bytestr, re.prefixBytes) { +					return nil +				} +			}  		} else { -			advance = bytes.Index(bytestr[pos:], re.prefixBytes) +			if bytestr == nil { +				advance = strings.Index(str[pos:], re.prefix) +			} else { +				advance = bytes.Index(bytestr[pos:], re.prefixBytes) +			}  		}  		if advance == -1 { -			return []int{} +			return nil  		} -		pos += advance + len(re.prefix) -		prefixed = true +		pos += advance  	}  	arena := &matchArena{nil, 2 * (re.nbra + 1)} -	for pos <= end { -		if !found { +	for startPos := pos; pos <= end; { +		if !found && (pos == startPos || !anchored) {  			// prime the pump if we haven't seen a match yet  			match := arena.noMatch()  			match.m[0] = pos -			if prefixed { -				s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end) -				prefixed = false // next iteration should start at beginning of machine. -			} else { -				s[out] = arena.addState(s[out], re.start.next(), false, match, pos, end) -			} +			s[out] = arena.addState(s[out], re.start.next, false, match, pos, end)  			arena.free(match) // if addState saved it, ref was incremented +		} else if len(s[out]) == 0 { +			// machine has completed +			break  		}  		in, out = out, in // old out state is new in state  		// clear out old state @@ -834,10 +801,6 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  			arena.free(state.match)  		}  		s[out] = old[0:0] // truncate state vector -		if found && len(s[in]) == 0 { -			// machine has completed -			break -		}  		charwidth := 1  		c := endOfFile  		if pos < end { @@ -849,29 +812,28 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  		}  		pos += charwidth  		for _, st := range s[in] { -			switch st.inst.kind() { -			case _BOT: -			case _EOT: -			case _CHAR: -				if c == st.inst.(*_Char).char { -					s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) +			switch st.inst.kind { +			case iBOT: +			case iEOT: +			case iChar: +				if c == st.inst.char { +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)  				} -			case _CHARCLASS: -				if st.inst.(*_CharClass).matches(c) { -					s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) +			case iCharClass: +				if st.inst.cclass.matches(c) { +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)  				} -			case _ANY: +			case iAny:  				if c != endOfFile { -					s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)  				} -			case _NOTNL: +			case iNotNL:  				if c != endOfFile && c != '\n' { -					s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) +					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)  				} -			case _BRA: -			case _EBRA: -			case _ALT: -			case _END: +			case iBra: +			case iAlt: +			case iEnd:  				// choose leftmost longest  				if !found || // first  					st.match.m[0] < final.match.m[0] || // leftmost @@ -900,77 +862,33 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {  	return final.match.m  } - -// ExecuteString matches the Regexp against the string s. -// The return value is an array of integers, in pairs, identifying the positions of -// substrings matched by the expression. -//    s[a[0]:a[1]] is the substring matched by the entire expression. -//    s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression. -// A negative value means the subexpression did not match any element of the string. -// An empty array means "no match". -func (re *Regexp) ExecuteString(s string) (a []int) { -	return re.doExecute(s, nil, 0) +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re.  It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { +	c := make([]int, len(re.inst)-2) // minus start and end. +	// First instruction is start; skip that. +	i := 0 +	for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next { +		// stop if this is not a char +		if inst.kind != iChar { +			return string(c[:i]), false +		} +		c[i] = inst.char +		i++ +	} +	return string(c[:i]), true  } - -// Execute matches the Regexp against the byte slice b. -// The return value is an array of integers, in pairs, identifying the positions of -// subslices matched by the expression. -//    b[a[0]:a[1]] is the subslice matched by the entire expression. -//    b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression. -// A negative value means the subexpression did not match any element of the slice. -// An empty array means "no match". -func (re *Regexp) Execute(b []byte) (a []int) { return re.doExecute("", b, 0) } - -  // MatchString returns whether the Regexp matches the string s.  // The return value is a boolean: true for match, false for no match.  func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } -  // Match returns whether the Regexp matches the byte slice b.  // The return value is a boolean: true for match, false for no match.  func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 } -// MatchStrings matches the Regexp against the string s. -// The return value is an array of strings matched by the expression. -//    a[0] is the substring matched by the entire expression. -//    a[i] for i > 0 is the substring matched by the ith parenthesized subexpression. -// An empty array means ``no match''. -func (re *Regexp) MatchStrings(s string) (a []string) { -	r := re.doExecute(s, nil, 0) -	if r == nil { -		return nil -	} -	a = make([]string, len(r)/2) -	for i := 0; i < len(r); i += 2 { -		if r[i] != -1 { // -1 means no match for this subexpression -			a[i/2] = s[r[i]:r[i+1]] -		} -	} -	return -} - -// MatchSlices matches the Regexp against the byte slice b. -// The return value is an array of subslices matched by the expression. -//    a[0] is the subslice matched by the entire expression. -//    a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression. -// An empty array means ``no match''. -func (re *Regexp) MatchSlices(b []byte) (a [][]byte) { -	r := re.doExecute("", b, 0) -	if r == nil { -		return nil -	} -	a = make([][]byte, len(r)/2) -	for i := 0; i < len(r); i += 2 { -		if r[i] != -1 { // -1 means no match for this subexpression -			a[i/2] = b[r[i]:r[i+1]] -		} -	} -	return -} -  // MatchString checks whether a textual regular expression  // matches a string.  More complicated queries need  // to use Compile and the full Regexp interface. @@ -1117,7 +1035,7 @@ func QuoteMeta(s string) string {  }  // Find matches in slice b if b is non-nil, otherwise find matches in string s. -func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int)) { +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {  	var end int  	if b == nil {  		end = len(s) @@ -1156,78 +1074,270 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int))  		prevMatchEnd = matches[1]  		if accept { -			deliver(matches[0], matches[1]) +			deliver(matches)  			i++  		}  	}  } -// AllMatches slices the byte slice b into substrings that are successive -// matches of the Regexp within b. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a slice -// containing the matching substrings. -func (re *Regexp) AllMatches(b []byte, n int) [][]byte { -	if n <= 0 { +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { +	a := re.doExecute("", b, 0) +	if a == nil { +		return nil +	} +	return b[a[0]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression.  The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { +	a := re.doExecute("", b, 0) +	if a == nil { +		return nil +	} +	return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression.  If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string.  Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { +	a := re.doExecute(s, nil, 0) +	if a == nil { +		return "" +	} +	return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression.  The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) []int { +	a := re.doExecute(s, nil, 0) +	if a == nil { +		return nil +	} +	return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { +	a := re.doExecute("", b, 0) +	if a == nil { +		return nil +	} +	ret := make([][]byte, len(a)/2) +	for i := range ret { +		if a[2*i] >= 0 { +			ret[i] = b[a[2*i]:a[2*i+1]] +		} +	} +	return ret +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { +	return re.doExecute("", b, 0) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { +	a := re.doExecute(s, nil, 0) +	if a == nil { +		return nil +	} +	ret := make([]string, len(a)/2) +	for i := range ret { +		if a[2*i] >= 0 { +			ret[i] = s[a[2*i]:a[2*i+1]] +		} +	} +	return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { +	return re.doExecute(s, nil, 0) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAll(b []byte, n int) [][]byte { +	if n < 0 {  		n = len(b) + 1  	} -	result := make([][]byte, n) -	i := 0 -	re.allMatches("", b, n, func(start, end int) { -		result[i] = b[start:end] -		i++ +	result := make([][]byte, 0, startSize) +	re.allMatches("", b, n, func(match []int) { +		result = append(result, b[match[0]:match[1]])  	}) -	return result[0:i] +	if len(result) == 0 { +		return nil +	} +	return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { +	if n < 0 { +		n = len(b) + 1 +	} +	result := make([][]int, 0, startSize) +	re.allMatches("", b, n, func(match []int) { +		result = append(result, match[0:2]) +	}) +	if len(result) == 0 { +		return nil +	} +	return result  } -// AllMatchesString slices the string s into substrings that are successive -// matches of the Regexp within s. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a slice -// containing the matching substrings. -func (re *Regexp) AllMatchesString(s string, n int) []string { -	if n <= 0 { +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { +	if n < 0 {  		n = len(s) + 1  	} -	result := make([]string, n) -	i := 0 -	re.allMatches(s, nil, n, func(start, end int) { -		result[i] = s[start:end] -		i++ +	result := make([]string, 0, startSize) +	re.allMatches(s, nil, n, func(match []int) { +		result = append(result, s[match[0]:match[1]])  	}) -	return result[0:i] +	if len(result) == 0 { +		return nil +	} +	return result  } -// AllMatchesIter slices the byte slice b into substrings that are successive -// matches of the Regexp within b. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a -// channel that iterates over the matching substrings. -func (re *Regexp) AllMatchesIter(b []byte, n int) <-chan []byte { -	if n <= 0 { +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { +	if n < 0 { +		n = len(s) + 1 +	} +	result := make([][]int, 0, startSize) +	re.allMatches(s, nil, n, func(match []int) { +		result = append(result, match[0:2]) +	}) +	if len(result) == 0 { +		return nil +	} +	return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { +	if n < 0 {  		n = len(b) + 1  	} -	c := make(chan []byte, 10) -	go func() { -		re.allMatches("", b, n, func(start, end int) { c <- b[start:end] }) -		close(c) -	}() -	return c +	result := make([][][]byte, 0, startSize) +	re.allMatches("", b, n, func(match []int) { +		slice := make([][]byte, len(match)/2) +		for j := range slice { +			if match[2*j] >= 0 { +				slice[j] = b[match[2*j]:match[2*j+1]] +			} +		} +		result = append(result, slice) +	}) +	if len(result) == 0 { +		return nil +	} +	return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { +	if n < 0 { +		n = len(b) + 1 +	} +	result := make([][]int, 0, startSize) +	re.allMatches("", b, n, func(match []int) { +		result = append(result, match) +	}) +	if len(result) == 0 { +		return nil +	} +	return result  } -// AllMatchesStringIter slices the string s into substrings that are successive -// matches of the Regexp within s. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a -// channel that iterates over the matching substrings. -func (re *Regexp) AllMatchesStringIter(s string, n int) <-chan string { -	if n <= 0 { +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { +	if n < 0 {  		n = len(s) + 1  	} -	c := make(chan string, 10) -	go func() { -		re.allMatches(s, nil, n, func(start, end int) { c <- s[start:end] }) -		close(c) -	}() -	return c +	result := make([][]string, 0, startSize) +	re.allMatches(s, nil, n, func(match []int) { +		slice := make([]string, len(match)/2) +		for j := range slice { +			if match[2*j] >= 0 { +				slice[j] = s[match[2*j]:match[2*j+1]] +			} +		} +		result = append(result, slice) +	}) +	if len(result) == 0 { +		return nil +	} +	return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { +	if n < 0 { +		n = len(s) + 1 +	} +	result := make([][]int, 0, startSize) +	re.allMatches(s, nil, n, func(match []int) { +		result = append(result, match) +	}) +	if len(result) == 0 { +		return nil +	} +	return result  } | 
