// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Regular expression library. package regexp import os "os" export var ErrUnimplemented = os.NewError("unimplemented"); export var ErrInternal = os.NewError("internal error"); export var ErrUnmatchedLpar = os.NewError("unmatched '('"); export var ErrUnmatchedRpar = os.NewError("unmatched ')'"); export var ErrExtraneousBackslash = os.NewError("extraneous backslash"); export var ErrEmpty = os.NewError("empty subexpression or alternation"); export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)"); export var ErrBareClosure = os.NewError("closure applies to nothing"); export var ErrBadBackslash = os.NewError("illegal backslash escape"); // An instruction executed by the NFA type Inst interface { Type() int; // the type of this instruction: CHAR, ANY, etc. This() int; // the index of this instruction Next() int; // the index of the instruction to execute after this one SetThis(i int); SetNext(i int); Print(ind string); } type RE struct { expr string; // the original expression ch *chan<- *RE; // reply channel when we're done error *os.Error; // compile- or run-time error; nil if OK ninst int; inst *[]Inst; start Inst; } const ( START // beginning of program: marker to start = iota; END; // end of program: success BOT; // '^' beginning of text EOT; // '$' end of text CHAR; // 'a' regular character ANY; // '.' any character BRA; // '(' parenthesized expression EBRA; // ')'; end of '(' parenthesized expression ALT; // '|' alternation NOP; // do nothing; makes it easy to link without patching ) // --- START start of program type Start struct { this int; next int; } func (start *Start) Type() int { return START } func (start *Start) This() int { return start.this } func (start *Start) Next() int { return start.next } func (start *Start) SetThis(i int) { start.this = i } func (start *Start) SetNext(i int) { start.next = i } func (start *Start) Print(ind string) { print(ind, "start") } // --- END end of program type End struct { this int; next int; } func (end *End) Type() int { return END } func (end *End) This() int { return end.this } func (end *End) Next() int { return end.next } func (end *End) SetThis(i int) { end.this = i } func (end *End) SetNext(i int) { end.next = i } func (end *End) Print(ind string) { print(ind, "end") } // --- BOT beginning of text type Bot struct { this int; next int; } func (bot *Bot) Type() int { return BOT } func (bot *Bot) This() int { return bot.this } func (bot *Bot) Next() int { return bot.next } func (bot *Bot) SetThis(i int) { bot.this = i } func (bot *Bot) SetNext(i int) { bot.next = i } func (bot *Bot) Print(ind string) { print(ind, "bot") } // --- EOT end of text type Eot struct { this int; next int; } func (eot *Eot) Type() int { return EOT } func (eot *Eot) This() int { return eot.this } func (eot *Eot) Next() int { return eot.next } func (eot *Eot) SetThis(i int) { eot.this = i } func (eot *Eot) SetNext(i int) { eot.next = i } func (eot *Eot) Print(ind string) { print(ind, "eot") } // --- CHAR a regular character type Char struct { this int; next int; char int; set bool; } func (char *Char) Type() int { return CHAR } func (char *Char) This() int { return char.this } func (char *Char) Next() int { return char.next } func (char *Char) SetThis(i int) { char.this = i } func (char *Char) SetNext(i int) { char.next = i } func (char *Char) Print(ind string) { print(ind, "char ", string(char.char)) } func NewChar(char int) *Char { c := new(Char); c.char = char; return c; } // --- ANY any character type Any struct { this int; next int; } func (any *Any) Type() int { return ANY } func (any *Any) This() int { return any.this } func (any *Any) Next() int { return any.next } func (any *Any) SetThis(i int) { any.this = i } func (any *Any) SetNext(i int) { any.next = i } func (any *Any) Print(ind string) { print(ind, "any") } // --- BRA parenthesized expression type Bra struct { this int; next int; } func (bra *Bra) Type() int { return BRA } func (bra *Bra) This() int { return bra.this } func (bra *Bra) Next() int { return bra.next } func (bra *Bra) SetThis(i int) { bra.this = i } func (bra *Bra) SetNext(i int) { bra.next = i } func (bra *Bra) Print(ind string) { print(ind , "bra"); } // --- EBRA end of parenthesized expression type Ebra struct { this int; next int; n int; // subexpression number } func (ebra *Ebra) Type() int { return BRA } func (ebra *Ebra) This() int { return ebra.this } func (ebra *Ebra) Next() int { return ebra.next } func (ebra *Ebra) SetThis(i int) { ebra.this = i } func (ebra *Ebra) SetNext(i int) { ebra.next = i } func (ebra *Ebra) Print(ind string) { print(ind , "ebra ", ebra.n); } // --- ALT alternation type Alt struct { this int; next int; left int; // other branch } func (alt *Alt) Type() int { return ALT } func (alt *Alt) This() int { return alt.this } func (alt *Alt) Next() int { return alt.next } func (alt *Alt) SetThis(i int) { alt.this = i } func (alt *Alt) SetNext(i int) { alt.next = i } func (alt *Alt) Print(ind string) { print(ind , "alt(", alt.left, ")"); } // --- NOP no operation type Nop struct { this int; next int; } func (nop *Nop) Type() int { return NOP } func (nop *Nop) This() int { return nop.this } func (nop *Nop) Next() int { return nop.next } func (nop *Nop) SetThis(i int) { nop.this = i } func (nop *Nop) SetNext(i int) { nop.next = i } func (nop *Nop) Print(ind string) { print(ind, "nop") } func (re *RE) AddInst(inst Inst) int { if re.ninst >= cap(re.inst) { panic("write the code to grow inst") } re.inst[re.ninst] = inst; i := re.ninst; inst.SetThis(re.ninst); re.ninst++; inst.SetNext(re.ninst); return i; } // report error and exit compiling/executing goroutine func (re *RE) Error(err *os.Error) { re.error = err; re.ch <- re; sys.goexit(); } type Parser struct { re *RE; nbra int; pos int; ch int; } const EOF = -1 func (p *Parser) c() int { return p.ch; } func (p *Parser) nextc() int { if p.pos >= len(p.re.expr) { p.ch = EOF } else { c, w := sys.stringtorune(p.re.expr, p.pos); // TODO: stringotorune shoudl take a string* p.ch = c; p.pos += w; } return p.ch; } func NewParser(re *RE) *Parser { parser := new(Parser); parser.re = re; parser.nextc(); // load p.ch return parser; } /* Grammar: regexp: concatenation { '|' concatenation } concatenation: { closure } closure: term { '*' | '+' | '?' } term: '.' character characterclass '(' regexp ')' */ func (p *Parser) Regexp() (start, end int) const NULL = -1 func special(c int) bool { s := `\.+*?()|[-]`; for i := 0; i < len(s); i++ { if c == int(s[i]) { return true } } return false } func (p *Parser) Term() (start, end int) { switch c := p.c(); c { case ')', '|', EOF: return NULL, NULL; case '*', '+', '|': p.re.Error(ErrBareClosure); case '.': p.nextc(); start = p.re.AddInst(new(Any)); return start, start; case '(': p.nextc(); start, end = p.Regexp(); if p.c() != ')' { p.re.Error(ErrUnmatchedLpar); } p.nextc(); p.nbra++; bra := new(Bra); brai := p.re.AddInst(bra); ebra := new(Ebra); ebrai := p.re.AddInst(ebra); ebra.n = p.nbra; if start == NULL { if end != NULL { p.re.Error(ErrInternal) } start = ebrai } else { p.re.inst[end].SetNext(ebrai); } bra.SetNext(start); return brai, ebrai; case '\\': c = p.nextc(); switch { case c == EOF: p.re.Error(ErrExtraneousBackslash); case c == 'n': c = '\n'; case special(c): // c is as delivered default: p.re.Error(ErrBadBackslash); } fallthrough; default: p.nextc(); start = p.re.AddInst(NewChar(c)); return start, start } panic("unreachable"); } func (p *Parser) Closure() (start, end int) { start, end = p.Term(); if start == NULL { return start, end } switch p.c() { case '*': // (start,end)*: alt := new(Alt); alti := p.re.AddInst(alt); p.re.inst[end].SetNext(alti); // after end, do alt alt.left = start; // alternate brach: return to start start = alti; // alt becomes new (start, end) end = alti; case '+': // (start,end)+: alt := new(Alt); alti := p.re.AddInst(alt); p.re.inst[end].SetNext(alti); // after end, do alt alt.left = start; // alternate brach: return to start end = alti; // start is unchanged; end is alt case '?': // (start,end)?: alt := new(Alt); alti := p.re.AddInst(alt); nop := new(Nop); nopi := p.re.AddInst(nop); alt.left = start; // alternate branch is start alt.next = nopi; // follow on to nop p.re.inst[end].SetNext(nopi); // after end, go to nop start = alti; // start is now alt end = nopi; // end is nop pointed to by both branches default: return start, end; } switch p.nextc() { case '*', '+', '?': p.re.Error(ErrBadClosure); } return start, end; } func (p *Parser) Concatenation() (start, end int) { start, end = NULL, NULL; for { nstart, nend := p.Closure(); switch { case nstart == NULL: // end of this concatenation return start, end; case start == NULL: // this is first element of concatenation start, end = nstart, nend; default: p.re.inst[end].SetNext(nstart); end = nend; } } panic("unreachable"); } func (p *Parser) Regexp() (start, end int) { start, end = p.Concatenation(); if start == NULL { return NULL, NULL } for { switch p.c() { default: return start, end; case '|': p.nextc(); nstart, nend := p.Concatenation(); // xyz|(nothing) is xyz or nop if nstart == NULL { nopi := p.re.AddInst(new(Nop)); nstart, nend = nopi, nopi; } alt := new(Alt); alti := p.re.AddInst(alt); alt.left = start; alt.next = nstart; nop := new(Nop); nopi := p.re.AddInst(nop); p.re.inst[end].SetNext(nopi); p.re.inst[nend].SetNext(nopi); start, end = alti, nopi; } } panic("unreachable"); } func (re *RE) UnNop(i int) int { for re.inst[i].Type() == NOP { i = re.inst[i].Next() } return i } func (re *RE) EliminateNops() { for i := 0; i < re.ninst - 1; i++ { // last one is END inst := re.inst[i]; inst.SetNext(re.UnNop(inst.Next())); if inst.Type() == ALT { alt := inst.(*Alt); alt.left = re.UnNop(alt.left) } } } func (re *RE) DoParse() { parser := NewParser(re); start := new(Start); starti := re.AddInst(start); s, e := parser.Regexp(); if s == NULL { if e != NULL { re.Error(ErrInternal) } e = starti; } start.next = s; re.inst[e].SetNext(re.AddInst(new(End))); for i := 0; i < re.ninst; i++ { inst := re.inst[i]; print(i, ":\t"); inst.Print("\t"); print(" -> ", inst.Next(), "\n"); } println(); re.EliminateNops(); for i := 0; i < re.ninst; i++ { inst := re.inst[i]; print(i, ":\t"); inst.Print("\t"); print(" -> ", inst.Next(), "\n"); } println(); re.Error(ErrUnimplemented); } func Compiler(str string, ch *chan *RE) { re := new(RE); re.inst = new([]Inst, 100); re.expr = str; re.ch = ch; re.DoParse(); ch <- re; } // Public interface has only execute functionality (not yet implemented) export type Regexp interface { // Execute() bool } // compile in separate goroutine; wait for result export func Compile(str string) (regexp Regexp, error *os.Error) { ch := new(chan *RE); go Compiler(str, ch); re := <-ch; return re, re.error }