summaryrefslogtreecommitdiff
path: root/src/lib/regexp/regexp.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/regexp/regexp.go')
-rw-r--r--src/lib/regexp/regexp.go374
1 files changed, 187 insertions, 187 deletions
diff --git a/src/lib/regexp/regexp.go b/src/lib/regexp/regexp.go
index 49f971610..32fa113c7 100644
--- a/src/lib/regexp/regexp.go
+++ b/src/lib/regexp/regexp.go
@@ -11,7 +11,7 @@ import (
"array";
)
-export var debug = false;
+var debug = false;
export var ErrInternal = os.NewError("internal error");
@@ -26,110 +26,110 @@ export var ErrBareClosure = os.NewError("closure applies to nothing");
export var ErrBadBackslash = os.NewError("illegal backslash escape");
// An instruction executed by the NFA
-type Inst interface {
- Type() int; // the type of this instruction: CHAR, ANY, etc.
- Next() Inst; // the instruction to execute after this one
- SetNext(i Inst);
+type instr interface {
+ Type() int; // the type of this instruction: cCHAR, cANY, etc.
+ Next() instr; // the instruction to execute after this one
+ SetNext(i instr);
Index() int;
SetIndex(i int);
Print();
}
// Fields and methods common to all instructions
-type Common struct {
- next Inst;
+type iCommon struct {
+ next instr;
index int;
}
-func (c *Common) Next() Inst { return c.next }
-func (c *Common) SetNext(i Inst) { c.next = i }
-func (c *Common) Index() int { return c.index }
-func (c *Common) SetIndex(i int) { c.index = i }
+func (c *iCommon) Next() instr { return c.next }
+func (c *iCommon) SetNext(i instr) { c.next = i }
+func (c *iCommon) Index() int { return c.index }
+func (c *iCommon) SetIndex(i int) { c.index = i }
-type RE struct {
+type regExp struct {
expr string; // the original expression
- ch chan<- *RE; // reply channel when we're done
+ ch chan<- *regExp; // reply channel when we're done
error *os.Error; // compile- or run-time error; nil if OK
inst *array.Array;
- start Inst;
+ start instr;
nbra int; // number of brackets in expression, for subexpressions
}
const (
- START // beginning of program
+ cSTART // beginning of program
= iota;
- END; // end of program: success
- BOT; // '^' beginning of text
- EOT; // '$' end of text
- CHAR; // 'a' regular character
- CHARCLASS; // [a-z] character class
- ANY; // '.' any character
- BRA; // '(' parenthesized expression
- EBRA; // ')'; end of '(' parenthesized expression
- ALT; // '|' alternation
- NOP; // do nothing; makes it easy to link without patching
+ cEND; // end of program: success
+ cBOT; // '^' beginning of text
+ cEOT; // '$' end of text
+ cCHAR; // 'a' regular character
+ cCHARCLASS; // [a-z] character class
+ cANY; // '.' any character
+ cBRA; // '(' parenthesized expression
+ cEBRA; // ')'; end of '(' parenthesized expression
+ cALT; // '|' alternation
+ cNOP; // do nothing; makes it easy to link without patching
)
// --- START start of program
-type Start struct {
- Common
+type iStart struct {
+ iCommon
}
-func (start *Start) Type() int { return START }
-func (start *Start) Print() { print("start") }
+func (start *iStart) Type() int { return cSTART }
+func (start *iStart) Print() { print("start") }
// --- END end of program
-type End struct {
- Common
+type iEnd struct {
+ iCommon
}
-func (end *End) Type() int { return END }
-func (end *End) Print() { print("end") }
+func (end *iEnd) Type() int { return cEND }
+func (end *iEnd) Print() { print("end") }
// --- BOT beginning of text
-type Bot struct {
- Common
+type iBot struct {
+ iCommon
}
-func (bot *Bot) Type() int { return BOT }
-func (bot *Bot) Print() { print("bot") }
+func (bot *iBot) Type() int { return cBOT }
+func (bot *iBot) Print() { print("bot") }
// --- EOT end of text
-type Eot struct {
- Common
+type iEot struct {
+ iCommon
}
-func (eot *Eot) Type() int { return EOT }
-func (eot *Eot) Print() { print("eot") }
+func (eot *iEot) Type() int { return cEOT }
+func (eot *iEot) Print() { print("eot") }
// --- CHAR a regular character
-type Char struct {
- Common;
+type iChar struct {
+ iCommon;
char int;
}
-func (char *Char) Type() int { return CHAR }
-func (char *Char) Print() { print("char ", string(char.char)) }
+func (char *iChar) Type() int { return cCHAR }
+func (char *iChar) Print() { print("char ", string(char.char)) }
-func NewChar(char int) *Char {
- c := new(Char);
+func newChar(char int) *iChar {
+ c := new(iChar);
c.char = char;
return c;
}
// --- CHARCLASS [a-z]
-type CharClass struct {
- Common;
+type iCharClass struct {
+ iCommon;
char int;
negate bool; // is character class negated? ([^a-z])
// array of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges *array.IntArray;
}
-func (cclass *CharClass) Type() int { return CHARCLASS }
+func (cclass *iCharClass) Type() int { return cCHARCLASS }
-func (cclass *CharClass) Print() {
+func (cclass *iCharClass) Print() {
print("charclass");
if cclass.negate {
print(" (negated)");
@@ -145,13 +145,13 @@ func (cclass *CharClass) Print() {
}
}
-func (cclass *CharClass) AddRange(a, b int) {
+func (cclass *iCharClass) AddRange(a, b int) {
// range is a through b inclusive
cclass.ranges.Push(a);
cclass.ranges.Push(b);
}
-func (cclass *CharClass) Matches(c int) bool {
+func (cclass *iCharClass) Matches(c int) bool {
for i := 0; i < cclass.ranges.Len(); i = i+2 {
min := cclass.ranges.At(i);
max := cclass.ranges.At(i+1);
@@ -162,84 +162,84 @@ func (cclass *CharClass) Matches(c int) bool {
return cclass.negate
}
-func NewCharClass() *CharClass {
- c := new(CharClass);
+func newCharClass() *iCharClass {
+ c := new(iCharClass);
c.ranges = array.NewIntArray(0);
return c;
}
// --- ANY any character
-type Any struct {
- Common
+type iAny struct {
+ iCommon
}
-func (any *Any) Type() int { return ANY }
-func (any *Any) Print() { print("any") }
+func (any *iAny) Type() int { return cANY }
+func (any *iAny) Print() { print("any") }
// --- BRA parenthesized expression
-type Bra struct {
- Common;
+type iBra struct {
+ iCommon;
n int; // subexpression number
}
-func (bra *Bra) Type() int { return BRA }
-func (bra *Bra) Print() { print("bra", bra.n); }
+func (bra *iBra) Type() int { return cBRA }
+func (bra *iBra) Print() { print("bra", bra.n); }
// --- EBRA end of parenthesized expression
-type Ebra struct {
- Common;
+type iEbra struct {
+ iCommon;
n int; // subexpression number
}
-func (ebra *Ebra) Type() int { return EBRA }
-func (ebra *Ebra) Print() { print("ebra ", ebra.n); }
+func (ebra *iEbra) Type() int { return cEBRA }
+func (ebra *iEbra) Print() { print("ebra ", ebra.n); }
// --- ALT alternation
-type Alt struct {
- Common;
- left Inst; // other branch
+type iAlt struct {
+ iCommon;
+ left instr; // other branch
}
-func (alt *Alt) Type() int { return ALT }
-func (alt *Alt) Print() { print("alt(", alt.left.Index(), ")"); }
+func (alt *iAlt) Type() int { return cALT }
+func (alt *iAlt) Print() { print("alt(", alt.left.Index(), ")"); }
// --- NOP no operation
-type Nop struct {
- Common
+type iNop struct {
+ iCommon
}
-func (nop *Nop) Type() int { return NOP }
-func (nop *Nop) Print() { print("nop") }
+func (nop *iNop) Type() int { return cNOP }
+func (nop *iNop) Print() { print("nop") }
// report error and exit compiling/executing goroutine
-func (re *RE) Error(err *os.Error) {
+func (re *regExp) Error(err *os.Error) {
re.error = err;
re.ch <- re;
sys.goexit();
}
-func (re *RE) Add(i Inst) Inst {
+func (re *regExp) Add(i instr) instr {
i.SetIndex(re.inst.Len());
re.inst.Push(i);
return i;
}
-type Parser struct {
- re *RE;
+type parser struct {
+ re *regExp;
nlpar int; // number of unclosed lpars
pos int;
ch int;
}
-const EOF = -1
+const endOfFile = -1
-func (p *Parser) c() int {
+func (p *parser) c() int {
return p.ch;
}
-func (p *Parser) nextc() int {
+func (p *parser) nextc() int {
if p.pos >= len(p.re.expr) {
- p.ch = EOF
+ p.ch = endOfFile
} else {
c, w := sys.stringtorune(p.re.expr, p.pos);
p.ch = c;
@@ -248,11 +248,11 @@ func (p *Parser) nextc() int {
return p.ch;
}
-func NewParser(re *RE) *Parser {
- parser := new(Parser);
- parser.re = re;
- parser.nextc(); // load p.ch
- return parser;
+func newParser(re *regExp) *parser {
+ p := new(parser);
+ p.re = re;
+ p.nextc(); // load p.ch
+ return p;
}
/*
@@ -274,9 +274,9 @@ Grammar:
*/
-func (p *Parser) Regexp() (start, end Inst)
+func (p *parser) Regexp() (start, end instr)
-var NULL Inst
+var iNULL instr
func special(c int) bool {
s := `\.+*?()|[]`;
@@ -298,8 +298,8 @@ func specialcclass(c int) bool {
return false
}
-func (p *Parser) CharClass() Inst {
- cc := NewCharClass();
+func (p *parser) CharClass() instr {
+ cc := newCharClass();
p.re.Add(cc);
if p.c() == '^' {
cc.negate = true;
@@ -308,7 +308,7 @@ func (p *Parser) CharClass() Inst {
left := -1;
for {
switch c := p.c(); c {
- case ']', EOF:
+ case ']', endOfFile:
if left >= 0 {
p.re.Error(ErrBadRange);
}
@@ -318,7 +318,7 @@ func (p *Parser) CharClass() Inst {
case '\\':
c = p.nextc();
switch {
- case c == EOF:
+ case c == endOfFile:
p.re.Error(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
@@ -346,33 +346,33 @@ func (p *Parser) CharClass() Inst {
}
}
}
- return NULL
+ return iNULL
}
-func (p *Parser) Term() (start, end Inst) {
+func (p *parser) Term() (start, end instr) {
switch c := p.c(); c {
- case '|', EOF:
- return NULL, NULL;
+ case '|', endOfFile:
+ return iNULL, iNULL;
case '*', '+':
p.re.Error(ErrBareClosure);
case ')':
if p.nlpar == 0 {
p.re.Error(ErrUnmatchedRpar);
}
- return NULL, NULL;
+ return iNULL, iNULL;
case ']':
p.re.Error(ErrUnmatchedRbkt);
case '^':
p.nextc();
- start = p.re.Add(new(Bot));
+ start = p.re.Add(new(iBot));
return start, start;
case '$':
p.nextc();
- start = p.re.Add(new(Eot));
+ start = p.re.Add(new(iEot));
return start, start;
case '.':
p.nextc();
- start = p.re.Add(new(Any));
+ start = p.re.Add(new(iAny));
return start, start;
case '[':
p.nextc();
@@ -393,14 +393,14 @@ func (p *Parser) Term() (start, end Inst) {
}
p.nlpar--;
p.nextc();
- bra := new(Bra);
+ bra := new(iBra);
p.re.Add(bra);
- ebra := new(Ebra);
+ ebra := new(iEbra);
p.re.Add(ebra);
bra.n = nbra;
ebra.n = nbra;
- if start == NULL {
- if end == NULL { p.re.Error(ErrInternal) }
+ if start == iNULL {
+ if end == iNULL { p.re.Error(ErrInternal) }
start = ebra
} else {
end.SetNext(ebra);
@@ -410,7 +410,7 @@ func (p *Parser) Term() (start, end Inst) {
case '\\':
c = p.nextc();
switch {
- case c == EOF:
+ case c == endOfFile:
p.re.Error(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
@@ -422,22 +422,22 @@ func (p *Parser) Term() (start, end Inst) {
fallthrough;
default:
p.nextc();
- start = NewChar(c);
+ start = newChar(c);
p.re.Add(start);
return start, start
}
panic("unreachable");
}
-func (p *Parser) Closure() (start, end Inst) {
+func (p *parser) Closure() (start, end instr) {
start, end = p.Term();
- if start == NULL {
+ if start == iNULL {
return
}
switch p.c() {
case '*':
// (start,end)*:
- alt := new(Alt);
+ alt := new(iAlt);
p.re.Add(alt);
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
@@ -445,16 +445,16 @@ func (p *Parser) Closure() (start, end Inst) {
end = alt;
case '+':
// (start,end)+:
- alt := new(Alt);
+ alt := new(iAlt);
p.re.Add(alt);
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
end = alt; // start is unchanged; end is alt
case '?':
// (start,end)?:
- alt := new(Alt);
+ alt := new(iAlt);
p.re.Add(alt);
- nop := new(Nop);
+ nop := new(iNop);
p.re.Add(nop);
alt.left = start; // alternate branch is start
alt.next = nop; // follow on to nop
@@ -471,18 +471,18 @@ func (p *Parser) Closure() (start, end Inst) {
return
}
-func (p *Parser) Concatenation() (start, end Inst) {
- start, end = NULL, NULL;
+func (p *parser) Concatenation() (start, end instr) {
+ start, end = iNULL, iNULL;
for {
nstart, nend := p.Closure();
switch {
- case nstart == NULL: // end of this concatenation
- if start == NULL { // this is the empty string
- nop := p.re.Add(new(Nop));
+ case nstart == iNULL: // end of this concatenation
+ if start == iNULL { // this is the empty string
+ nop := p.re.Add(new(iNop));
return nop, nop;
}
return;
- case start == NULL: // this is first element of concatenation
+ case start == iNULL: // this is first element of concatenation
start, end = nstart, nend;
default:
end.SetNext(nstart);
@@ -492,7 +492,7 @@ func (p *Parser) Concatenation() (start, end Inst) {
panic("unreachable");
}
-func (p *Parser) Regexp() (start, end Inst) {
+func (p *parser) Regexp() (start, end instr) {
start, end = p.Concatenation();
for {
switch p.c() {
@@ -501,11 +501,11 @@ func (p *Parser) Regexp() (start, end Inst) {
case '|':
p.nextc();
nstart, nend := p.Concatenation();
- alt := new(Alt);
+ alt := new(iAlt);
p.re.Add(alt);
alt.left = start;
alt.next = nstart;
- nop := new(Nop);
+ nop := new(iNop);
p.re.Add(nop);
end.SetNext(nop);
nend.SetNext(nop);
@@ -515,47 +515,47 @@ func (p *Parser) Regexp() (start, end Inst) {
panic("unreachable");
}
-func UnNop(i Inst) Inst {
- for i.Type() == NOP {
+func UnNop(i instr) instr {
+ for i.Type() == cNOP {
i = i.Next()
}
return i
}
-func (re *RE) EliminateNops() {
+func (re *regExp) EliminateNops() {
for i := 0; i < re.inst.Len(); i++ {
- inst := re.inst.At(i).(Inst);
- if inst.Type() == END {
+ inst := re.inst.At(i).(instr);
+ if inst.Type() == cEND {
continue
}
inst.SetNext(UnNop(inst.Next()));
- if inst.Type() == ALT {
- alt := inst.(*Alt);
+ if inst.Type() == cALT {
+ alt := inst.(*iAlt);
alt.left = UnNop(alt.left);
}
}
}
-func (re *RE) Dump() {
+func (re *regExp) Dump() {
for i := 0; i < re.inst.Len(); i++ {
- inst := re.inst.At(i).(Inst);
+ inst := re.inst.At(i).(instr);
print(inst.Index(), ": ");
inst.Print();
- if inst.Type() != END {
+ if inst.Type() != cEND {
print(" -> ", inst.Next().Index())
}
print("\n");
}
}
-func (re *RE) DoParse() {
- parser := NewParser(re);
- start := new(Start);
+func (re *regExp) DoParse() {
+ p := newParser(re);
+ start := new(iStart);
re.Add(start);
- s, e := parser.Regexp();
+ s, e := p.Regexp();
start.next = s;
re.start = start;
- e.SetNext(re.Add(new(End)));
+ e.SetNext(re.Add(new(iEnd)));
if debug {
re.Dump();
@@ -571,8 +571,8 @@ func (re *RE) DoParse() {
}
-func Compiler(str string, ch chan *RE) {
- re := new(RE);
+func Compiler(str string, ch chan *regExp) {
+ re := new(regExp);
re.expr = str;
re.inst = array.New(0);
re.ch = ch;
@@ -589,20 +589,20 @@ export type Regexp interface {
// Compile in separate goroutine; wait for result
export func Compile(str string) (regexp Regexp, error *os.Error) {
- ch := make(chan *RE);
+ ch := make(chan *regExp);
go Compiler(str, ch);
re := <-ch;
return re, re.error
}
-type State struct {
- inst Inst; // next instruction to execute
+type state struct {
+ inst instr; // next instruction to execute
match []int; // pairs of bracketing submatches. 0th is start,end
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
-func AddState(s []State, inst Inst, match []int) []State {
+func addState(s []state, inst instr, match []int) []state {
index := inst.Index();
l := len(s);
pos := match[0];
@@ -615,7 +615,7 @@ func AddState(s []State, inst Inst, match []int) []State {
}
}
if l == cap(s) {
- s1 := make([]State, 2*l)[0:l];
+ s1 := make([]state, 2*l)[0:l];
for i := 0; i < l; i++ {
s1[i] = s[i];
}
@@ -627,12 +627,12 @@ func AddState(s []State, inst Inst, match []int) []State {
return s;
}
-func (re *RE) DoExecute(str string, pos int) []int {
- var s [2][]State; // TODO: use a vector when State values (not ptrs) can be vector elements
- s[0] = make([]State, 10)[0:0];
- s[1] = make([]State, 10)[0:0];
+func (re *regExp) DoExecute(str string, pos int) []int {
+ var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
+ s[0] = make([]state, 10)[0:0];
+ s[1] = make([]state, 10)[0:0];
in, out := 0, 1;
- var final State;
+ var final state;
found := false;
for pos <= len(str) {
if !found {
@@ -642,7 +642,7 @@ func (re *RE) DoExecute(str string, pos int) []int {
match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos;
- s[out] = AddState(s[out], re.start.Next(), match);
+ s[out] = addState(s[out], re.start.Next(), match);
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
@@ -651,60 +651,60 @@ func (re *RE) DoExecute(str string, pos int) []int {
break;
}
charwidth := 1;
- c := EOF;
+ c := endOfFile;
if pos < len(str) {
c, charwidth = sys.stringtorune(str, pos);
}
for i := 0; i < len(s[in]); i++ {
- state := s[in][i];
+ st := s[in][i];
switch s[in][i].inst.Type() {
- case BOT:
+ case cBOT:
if pos == 0 {
- s[in] = AddState(s[in], state.inst.Next(), state.match)
+ s[in] = addState(s[in], st.inst.Next(), st.match)
}
- case EOT:
+ case cEOT:
if pos == len(str) {
- s[in] = AddState(s[in], state.inst.Next(), state.match)
+ s[in] = addState(s[in], st.inst.Next(), st.match)
}
- case CHAR:
- if c == state.inst.(*Char).char {
- s[out] = AddState(s[out], state.inst.Next(), state.match)
+ case cCHAR:
+ if c == st.inst.(*iChar).char {
+ s[out] = addState(s[out], st.inst.Next(), st.match)
}
- case CHARCLASS:
- if state.inst.(*CharClass).Matches(c) {
- s[out] = AddState(s[out], state.inst.Next(), state.match)
+ case cCHARCLASS:
+ if st.inst.(*iCharClass).Matches(c) {
+ s[out] = addState(s[out], st.inst.Next(), st.match)
}
- case ANY:
- if c != EOF {
- s[out] = AddState(s[out], state.inst.Next(), state.match)
+ case cANY:
+ if c != endOfFile {
+ s[out] = addState(s[out], st.inst.Next(), st.match)
}
- case BRA:
- n := state.inst.(*Bra).n;
- state.match[2*n] = pos;
- s[in] = AddState(s[in], state.inst.Next(), state.match);
- case EBRA:
- n := state.inst.(*Ebra).n;
- state.match[2*n+1] = pos;
- s[in] = AddState(s[in], state.inst.Next(), state.match);
- case ALT:
- s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
+ case cBRA:
+ n := st.inst.(*iBra).n;
+ st.match[2*n] = pos;
+ s[in] = addState(s[in], st.inst.Next(), st.match);
+ case cEBRA:
+ n := st.inst.(*iEbra).n;
+ st.match[2*n+1] = pos;
+ s[in] = addState(s[in], st.inst.Next(), st.match);
+ case cALT:
+ s[in] = addState(s[in], st.inst.(*iAlt).left, st.match);
// give other branch a copy of this match vector
s1 := make([]int, 2*(re.nbra+1));
for i := 0; i < len(s1); i++ {
- s1[i] = state.match[i]
+ s1[i] = st.match[i]
}
- s[in] = AddState(s[in], state.inst.Next(), s1);
- case END:
+ s[in] = addState(s[in], st.inst.Next(), s1);
+ case cEND:
// choose leftmost longest
if !found || // first
- state.match[0] < final.match[0] || // leftmost
- (state.match[0] == final.match[0] && pos > final.match[1]) { // longest
- final = state;
+ st.match[0] < final.match[0] || // leftmost
+ (st.match[0] == final.match[0] && pos > final.match[1]) { // longest
+ final = st;
final.match[1] = pos;
}
found = true;
default:
- state.inst.Print();
+ st.inst.Print();
panic("unknown instruction in execute");
}
}
@@ -714,17 +714,17 @@ func (re *RE) DoExecute(str string, pos int) []int {
}
-func (re *RE) Execute(s string) []int {
+func (re *regExp) Execute(s string) []int {
return re.DoExecute(s, 0)
}
-func (re *RE) Match(s string) bool {
+func (re *regExp) Match(s string) bool {
return len(re.DoExecute(s, 0)) > 0
}
-func (re *RE) MatchStrings(s string) []string {
+func (re *regExp) MatchStrings(s string) []string {
r := re.DoExecute(s, 0);
if r == nil {
return nil