diff options
author | Rob Pike <r@golang.org> | 2008-10-09 19:40:53 -0700 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2008-10-09 19:40:53 -0700 |
commit | 199fec7097cc1bf60226d827be5a0a59f4e65569 (patch) | |
tree | 1c94ae3c8d717ec7f849d37e00f3cd22f67c4700 /usr/r/regexp/regexp.go | |
parent | 63afa9b4c88ed94faba388d89b8eda7c2cbc2ed9 (diff) | |
download | golang-199fec7097cc1bf60226d827be5a0a59f4e65569.tar.gz |
beginnings of regular expression library.
will move elsewhere when more complete.
parses, does not execute.
no character classes yet.
R=rsc
DELTA=522 (522 added, 0 deleted, 0 changed)
OCL=16863
CL=16874
Diffstat (limited to 'usr/r/regexp/regexp.go')
-rw-r--r-- | usr/r/regexp/regexp.go | 489 |
1 files changed, 489 insertions, 0 deletions
diff --git a/usr/r/regexp/regexp.go b/usr/r/regexp/regexp.go new file mode 100644 index 000000000..0ee754e6f --- /dev/null +++ b/usr/r/regexp/regexp.go @@ -0,0 +1,489 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression library. + +package regexp + +import os "os" + +export var ErrUnimplemented = os.NewError("unimplemented"); +export var ErrInternal = os.NewError("internal error"); +export var ErrUnmatchedLpar = os.NewError("unmatched '('"); +export var ErrUnmatchedRpar = os.NewError("unmatched ')'"); +export var ErrExtraneousBackslash = os.NewError("extraneous backslash"); +export var ErrEmpty = os.NewError("empty subexpression or alternation"); +export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)"); +export var ErrBareClosure = os.NewError("closure applies to nothing"); +export var ErrBadBackslash = os.NewError("illegal backslash escape"); + +// An instruction executed by the NFA +type Inst interface { + Type() int; // the type of this instruction: CHAR, ANY, etc. + This() int; // the index of this instruction + Next() int; // the index of the instruction to execute after this one + SetThis(i int); + SetNext(i int); + Print(ind string); +} + +type RE struct { + expr string; // the original expression + ch *chan<- *RE; // reply channel when we're done + error *os.Error; // compile- or run-time error; nil if OK + ninst int; + inst *[]Inst; + start Inst; +} + +const ( + START // beginning of program: marker to start + = iota; + END; // end of program: success + BOT; // '^' beginning of text + EOT; // '$' end of text + CHAR; // 'a' regular character + ANY; // '.' any character + BRA; // '(' parenthesized expression + EBRA; // ')'; end of '(' parenthesized expression + ALT; // '|' alternation + NOP; // do nothing; makes it easy to link without patching +) + +// --- START start of program +type Start struct { + this int; + next int; +} + +func (start *Start) Type() int { return START } +func (start *Start) This() int { return start.this } +func (start *Start) Next() int { return start.next } +func (start *Start) SetThis(i int) { start.this = i } +func (start *Start) SetNext(i int) { start.next = i } +func (start *Start) Print(ind string) { print(ind, "start") } + +// --- END end of program +type End struct { + this int; + next int; +} + +func (end *End) Type() int { return END } +func (end *End) This() int { return end.this } +func (end *End) Next() int { return end.next } +func (end *End) SetThis(i int) { end.this = i } +func (end *End) SetNext(i int) { end.next = i } +func (end *End) Print(ind string) { print(ind, "end") } + +// --- BOT beginning of text +type Bot struct { + this int; + next int; +} + +func (bot *Bot) Type() int { return BOT } +func (bot *Bot) This() int { return bot.this } +func (bot *Bot) Next() int { return bot.next } +func (bot *Bot) SetThis(i int) { bot.this = i } +func (bot *Bot) SetNext(i int) { bot.next = i } +func (bot *Bot) Print(ind string) { print(ind, "bot") } + +// --- EOT end of text +type Eot struct { + this int; + next int; +} + +func (eot *Eot) Type() int { return EOT } +func (eot *Eot) This() int { return eot.this } +func (eot *Eot) Next() int { return eot.next } +func (eot *Eot) SetThis(i int) { eot.this = i } +func (eot *Eot) SetNext(i int) { eot.next = i } +func (eot *Eot) Print(ind string) { print(ind, "eot") } + +// --- CHAR a regular character +type Char struct { + this int; + next int; + char int; + set bool; +} + +func (char *Char) Type() int { return CHAR } +func (char *Char) This() int { return char.this } +func (char *Char) Next() int { return char.next } +func (char *Char) SetThis(i int) { char.this = i } +func (char *Char) SetNext(i int) { char.next = i } +func (char *Char) Print(ind string) { print(ind, "char ", string(char.char)) } + +func NewChar(char int) *Char { + c := new(Char); + c.char = char; + return c; +} + +// --- ANY any character +type Any struct { + this int; + next int; +} + +func (any *Any) Type() int { return ANY } +func (any *Any) This() int { return any.this } +func (any *Any) Next() int { return any.next } +func (any *Any) SetThis(i int) { any.this = i } +func (any *Any) SetNext(i int) { any.next = i } +func (any *Any) Print(ind string) { print(ind, "any") } + +// --- BRA parenthesized expression +type Bra struct { + this int; + next int; +} + +func (bra *Bra) Type() int { return BRA } +func (bra *Bra) This() int { return bra.this } +func (bra *Bra) Next() int { return bra.next } +func (bra *Bra) SetThis(i int) { bra.this = i } +func (bra *Bra) SetNext(i int) { bra.next = i } +func (bra *Bra) Print(ind string) { print(ind , "bra"); } + +// --- EBRA end of parenthesized expression +type Ebra struct { + this int; + next int; + n int; // subexpression number +} + +func (ebra *Ebra) Type() int { return BRA } +func (ebra *Ebra) This() int { return ebra.this } +func (ebra *Ebra) Next() int { return ebra.next } +func (ebra *Ebra) SetThis(i int) { ebra.this = i } +func (ebra *Ebra) SetNext(i int) { ebra.next = i } +func (ebra *Ebra) Print(ind string) { print(ind , "ebra ", ebra.n); } + +// --- ALT alternation +type Alt struct { + this int; + next int; + left int; // other branch +} + +func (alt *Alt) Type() int { return ALT } +func (alt *Alt) This() int { return alt.this } +func (alt *Alt) Next() int { return alt.next } +func (alt *Alt) SetThis(i int) { alt.this = i } +func (alt *Alt) SetNext(i int) { alt.next = i } +func (alt *Alt) Print(ind string) { print(ind , "alt(", alt.left, ")"); } + +// --- NOP no operation +type Nop struct { + this int; + next int; +} + +func (nop *Nop) Type() int { return NOP } +func (nop *Nop) This() int { return nop.this } +func (nop *Nop) Next() int { return nop.next } +func (nop *Nop) SetThis(i int) { nop.this = i } +func (nop *Nop) SetNext(i int) { nop.next = i } +func (nop *Nop) Print(ind string) { print(ind, "nop") } + + +func (re *RE) AddInst(inst Inst) int { + if re.ninst >= cap(re.inst) { + panic("write the code to grow inst") + } + re.inst[re.ninst] = inst; + i := re.ninst; + inst.SetThis(re.ninst); + re.ninst++; + inst.SetNext(re.ninst); + return i; +} + +// report error and exit compiling/executing goroutine +func (re *RE) Error(err *os.Error) { + re.error = err; + re.ch <- re; + sys.goexit(); +} + +type Parser struct { + re *RE; + nbra int; + pos int; + ch int; +} + +const EOF = -1 + +func (p *Parser) c() int { + return p.ch; +} + +func (p *Parser) nextc() int { + if p.pos >= len(p.re.expr) { + p.ch = EOF + } else { + c, w := sys.stringtorune(p.re.expr, p.pos); // TODO: stringotorune shoudl take a string* + p.ch = c; + p.pos += w; + } + return p.ch; +} + +func NewParser(re *RE) *Parser { + parser := new(Parser); + parser.re = re; + parser.nextc(); // load p.ch + return parser; +} + +/* + +Grammar: + regexp: + concatenation { '|' concatenation } + concatenation: + { closure } + closure: + term { '*' | '+' | '?' } + term: + '.' + character + characterclass + '(' regexp ')' + +*/ + +func (p *Parser) Regexp() (start, end int) + +const NULL = -1 + +func special(c int) bool { + s := `\.+*?()|[-]`; + for i := 0; i < len(s); i++ { + if c == int(s[i]) { + return true + } + } + return false +} + +func (p *Parser) Term() (start, end int) { + switch c := p.c(); c { + case ')', '|', EOF: + return NULL, NULL; + case '*', '+', '|': + p.re.Error(ErrBareClosure); + case '.': + p.nextc(); + start = p.re.AddInst(new(Any)); + return start, start; + case '(': + p.nextc(); + start, end = p.Regexp(); + if p.c() != ')' { + p.re.Error(ErrUnmatchedLpar); + } + p.nextc(); + p.nbra++; + bra := new(Bra); + brai := p.re.AddInst(bra); + ebra := new(Ebra); + ebrai := p.re.AddInst(ebra); + ebra.n = p.nbra; + if start == NULL { + if end != NULL { p.re.Error(ErrInternal) } + start = ebrai + } else { + p.re.inst[end].SetNext(ebrai); + } + bra.SetNext(start); + return brai, ebrai; + case '\\': + c = p.nextc(); + switch { + case c == EOF: + p.re.Error(ErrExtraneousBackslash); + case c == 'n': + c = '\n'; + case special(c): + // c is as delivered + default: + p.re.Error(ErrBadBackslash); + } + fallthrough; + default: + p.nextc(); + start = p.re.AddInst(NewChar(c)); + return start, start + } + panic("unreachable"); +} + +func (p *Parser) Closure() (start, end int) { + start, end = p.Term(); + if start == NULL { + return start, end + } + switch p.c() { + case '*': + // (start,end)*: + alt := new(Alt); + alti := p.re.AddInst(alt); + p.re.inst[end].SetNext(alti); // after end, do alt + alt.left = start; // alternate brach: return to start + start = alti; // alt becomes new (start, end) + end = alti; + case '+': + // (start,end)+: + alt := new(Alt); + alti := p.re.AddInst(alt); + p.re.inst[end].SetNext(alti); // after end, do alt + alt.left = start; // alternate brach: return to start + end = alti; // start is unchanged; end is alt + case '?': + // (start,end)?: + alt := new(Alt); + alti := p.re.AddInst(alt); + nop := new(Nop); + nopi := p.re.AddInst(nop); + alt.left = start; // alternate branch is start + alt.next = nopi; // follow on to nop + p.re.inst[end].SetNext(nopi); // after end, go to nop + start = alti; // start is now alt + end = nopi; // end is nop pointed to by both branches + default: + return start, end; + } + switch p.nextc() { + case '*', '+', '?': + p.re.Error(ErrBadClosure); + } + return start, end; +} + +func (p *Parser) Concatenation() (start, end int) { + start, end = NULL, NULL; + for { + nstart, nend := p.Closure(); + switch { + case nstart == NULL: // end of this concatenation + return start, end; + case start == NULL: // this is first element of concatenation + start, end = nstart, nend; + default: + p.re.inst[end].SetNext(nstart); + end = nend; + } + } + panic("unreachable"); +} + +func (p *Parser) Regexp() (start, end int) { + start, end = p.Concatenation(); + if start == NULL { + return NULL, NULL + } + for { + switch p.c() { + default: + return start, end; + case '|': + p.nextc(); + nstart, nend := p.Concatenation(); + // xyz|(nothing) is xyz or nop + if nstart == NULL { + nopi := p.re.AddInst(new(Nop)); + nstart, nend = nopi, nopi; + } + alt := new(Alt); + alti := p.re.AddInst(alt); + alt.left = start; + alt.next = nstart; + nop := new(Nop); + nopi := p.re.AddInst(nop); + p.re.inst[end].SetNext(nopi); + p.re.inst[nend].SetNext(nopi); + start, end = alti, nopi; + } + } + panic("unreachable"); +} + +func (re *RE) UnNop(i int) int { + for re.inst[i].Type() == NOP { + i = re.inst[i].Next() + } + return i +} + +func (re *RE) EliminateNops() { + for i := 0; i < re.ninst - 1; i++ { // last one is END + inst := re.inst[i]; + inst.SetNext(re.UnNop(inst.Next())); + if inst.Type() == ALT { + alt := inst.(*Alt); + alt.left = re.UnNop(alt.left) + } + } +} + +func (re *RE) DoParse() { + parser := NewParser(re); + start := new(Start); + starti := re.AddInst(start); + s, e := parser.Regexp(); + if s == NULL { + if e != NULL { re.Error(ErrInternal) } + e = starti; + } + start.next = s; + re.inst[e].SetNext(re.AddInst(new(End))); + + for i := 0; i < re.ninst; i++ { + inst := re.inst[i]; + print(i, ":\t"); + inst.Print("\t"); + print(" -> ", inst.Next(), "\n"); + } + println(); + + re.EliminateNops(); + + for i := 0; i < re.ninst; i++ { + inst := re.inst[i]; + print(i, ":\t"); + inst.Print("\t"); + print(" -> ", inst.Next(), "\n"); + } + println(); + + re.Error(ErrUnimplemented); +} + +func Compiler(str string, ch *chan *RE) { + re := new(RE); + re.inst = new([]Inst, 100); + re.expr = str; + re.ch = ch; + re.DoParse(); + ch <- re; +} + +// Public interface has only execute functionality (not yet implemented) +export type Regexp interface { + // Execute() bool +} + +// compile in separate goroutine; wait for result +export func Compile(str string) (regexp Regexp, error *os.Error) { + ch := new(chan *RE); + go Compiler(str, ch); + re := <-ch; + return re, re.error +} |