summaryrefslogtreecommitdiff
path: root/usr/gri/gosrc/scanner.go
diff options
context:
space:
mode:
Diffstat (limited to 'usr/gri/gosrc/scanner.go')
-rw-r--r--usr/gri/gosrc/scanner.go777
1 files changed, 777 insertions, 0 deletions
diff --git a/usr/gri/gosrc/scanner.go b/usr/gri/gosrc/scanner.go
new file mode 100644
index 000000000..94d8f1915
--- /dev/null
+++ b/usr/gri/gosrc/scanner.go
@@ -0,0 +1,777 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package Scanner
+
+// Token codes made visible to clients of this package. The range markers
+// KEYWORDS_BEG and KEYWORDS_END declared in the const block are not listed.
+export
+ ILLEGAL, EOF, IDENT, STRING, NUMBER,
+ COMMA, COLON, SEMICOLON, PERIOD,
+ LPAREN, RPAREN, LBRACK, RBRACK, LBRACE, RBRACE,
+ ASSIGN, DEFINE,
+ INC, DEC, NOT,
+ AND, OR, XOR,
+ ADD, SUB, MUL, QUO, REM,
+ EQL, NEQ, LSS, LEQ, GTR, GEQ,
+ SHL, SHR,
+ ADD_ASSIGN, SUB_ASSIGN, MUL_ASSIGN, QUO_ASSIGN, REM_ASSIGN,
+ AND_ASSIGN, OR_ASSIGN, XOR_ASSIGN, SHL_ASSIGN, SHR_ASSIGN,
+ LAND, LOR,
+ BREAK, CASE, CHAN, CONST, CONTINUE, DEFAULT, ELSE, EXPORT, FALLTHROUGH, FALSE,
+ FOR, FUNC, GO, GOTO, IF, IMPORT, INTERFACE, IOTA, MAP, NEW, NIL, PACKAGE, RANGE,
+ RETURN, SELECT, STRUCT, SWITCH, TRUE, TYPE, VAR
+
+
+// Token codes produced by the scanner. ILLEGAL is assigned iota; the
+// remaining names rely on the implicit repetition of iota and therefore
+// take consecutive values in declaration order.
+const (
+ // special tokens and literal classes
+ ILLEGAL = iota;
+ EOF;
+ IDENT;
+ STRING;
+ NUMBER;
+
+ // punctuation
+ COMMA;
+ COLON;
+ SEMICOLON;
+ PERIOD;
+
+ // brackets
+ LPAREN;
+ RPAREN;
+ LBRACK;
+ RBRACK;
+ LBRACE;
+ RBRACE;
+
+ // assignment and definition
+ ASSIGN;
+ DEFINE;
+
+ // increment, decrement, logical not
+ INC;
+ DEC;
+ NOT;
+
+ // bitwise operators
+ AND;
+ OR;
+ XOR;
+
+ // arithmetic operators
+ ADD;
+ SUB;
+ MUL;
+ QUO;
+ REM;
+
+ // comparison operators
+ EQL;
+ NEQ;
+ LSS;
+ LEQ;
+ GTR;
+ GEQ;
+
+ // shift operators
+ SHL;
+ SHR;
+
+ // arithmetic assignment operators
+ ADD_ASSIGN;
+ SUB_ASSIGN;
+ MUL_ASSIGN;
+ QUO_ASSIGN;
+ REM_ASSIGN;
+
+ // bitwise assignment operators
+ AND_ASSIGN;
+ OR_ASSIGN;
+ XOR_ASSIGN;
+
+ // shift assignment operators
+ SHL_ASSIGN;
+ SHR_ASSIGN;
+
+ // logical operators
+ LAND;
+ LOR;
+
+ // keywords; KEYWORDS_BEG and KEYWORDS_END are markers that delimit the
+ // range Init walks when it fills the Keywords table
+ KEYWORDS_BEG;
+ BREAK;
+ CASE;
+ CHAN;
+ CONST;
+ CONTINUE;
+ DEFAULT;
+ ELSE;
+ EXPORT;
+ FALLTHROUGH;
+ FALSE;
+ FOR;
+ FUNC;
+ GO;
+ GOTO;
+ IF;
+ IMPORT;
+ INTERFACE;
+ IOTA;
+ MAP;
+ NEW;
+ NIL;
+ PACKAGE;
+ RANGE;
+ RETURN;
+ SELECT;
+ STRUCT;
+ SWITCH;
+ TRUE;
+ TYPE;
+ VAR;
+ KEYWORDS_END;
+)
+
+
+// Keywords maps a keyword's spelling (as produced by TokenName) to its token
+// code. It stays nil until Init allocates and fills it; Open calls Init on
+// first use.
+var Keywords *map [string] int;
+
+
+export TokenName
+// TokenName returns a printable name for the token code tok: a descriptive
+// word for the token classes (illegal, eof, ident, string, number), the
+// source spelling for operators, delimiters and keywords, and "???" for any
+// code without an entry (including KEYWORDS_BEG and KEYWORDS_END).
+//
+// Init uses the keyword spellings returned here as the keys of the Keywords
+// table, so they must match the source text of each keyword exactly.
+func TokenName(tok int) string {
+	switch (tok) {
+	case ILLEGAL: return "illegal";
+	case EOF: return "eof";
+	case IDENT: return "ident";
+	case STRING: return "string";
+	case NUMBER: return "number";
+
+	case COMMA: return ",";
+	case COLON: return ":";
+	case SEMICOLON: return ";";
+	case PERIOD: return ".";
+
+	case LPAREN: return "(";
+	case RPAREN: return ")";
+	case LBRACK: return "[";
+	case RBRACK: return "]";
+	// NOTE(review): braces are reported by name rather than as "{" and "}";
+	// kept unchanged because the reason is not visible here - confirm
+	// before switching to the literal spelling.
+	case LBRACE: return "LBRACE";
+	case RBRACE: return "RBRACE";
+
+	case ASSIGN: return "=";
+	case DEFINE: return ":=";
+
+	case INC: return "++";
+	case DEC: return "--";
+	case NOT: return "!";
+
+	case AND: return "&";
+	case OR: return "|";
+	case XOR: return "^";
+
+	case ADD: return "+";
+	case SUB: return "-";
+	case MUL: return "*";
+	case QUO: return "/";
+	case REM: return "%";
+
+	case EQL: return "==";
+	case NEQ: return "!=";
+	case LSS: return "<";
+	case LEQ: return "<=";
+	case GTR: return ">";
+	case GEQ: return ">=";
+
+	case SHL: return "<<";
+	case SHR: return ">>";
+
+	case ADD_ASSIGN: return "+=";
+	case SUB_ASSIGN: return "-=";
+	case MUL_ASSIGN: return "*=";  // was "+=", a copy of the ADD_ASSIGN spelling
+	case QUO_ASSIGN: return "/=";
+	case REM_ASSIGN: return "%=";
+
+	case AND_ASSIGN: return "&=";
+	case OR_ASSIGN: return "|=";
+	case XOR_ASSIGN: return "^=";
+
+	case SHL_ASSIGN: return "<<=";
+	case SHR_ASSIGN: return ">>=";
+
+	case LAND: return "&&";
+	case LOR: return "||";
+
+	case BREAK: return "break";
+	case CASE: return "case";
+	case CHAN: return "chan";
+	case CONST: return "const";
+	case CONTINUE: return "continue";
+	case DEFAULT: return "default";
+	case ELSE: return "else";
+	case EXPORT: return "export";
+	case FALLTHROUGH: return "fallthrough";
+	case FALSE: return "false";
+	case FOR: return "for";
+	case FUNC: return "func";
+	case GO: return "go";
+	case GOTO: return "goto";
+	case IF: return "if";
+	case IMPORT: return "import";
+	case INTERFACE: return "interface";
+	case IOTA: return "iota";
+	case MAP: return "map";
+	case NEW: return "new";
+	case NIL: return "nil";
+	case PACKAGE: return "package";
+	case RANGE: return "range";
+	case RETURN: return "return";
+	case SELECT: return "select";
+	case STRUCT: return "struct";
+	case SWITCH: return "switch";
+	case TRUE: return "true";
+	case TYPE: return "type";
+	case VAR: return "var";
+	}
+
+	return "???";
+}
+
+
+// is_whitespace reports whether ch is a blank, tab, carriage return or
+// newline.
+func is_whitespace (ch int) bool {
+	switch ch {
+	case ' ', '\t', '\r', '\n':
+		return true;
+	}
+	return false;
+}
+
+
+// is_letter reports whether ch may appear in an identifier as a letter:
+// an ASCII letter, an underscore, or any character code of 128 and above.
+func is_letter (ch int) bool {
+	switch {
+	case 'a' <= ch && ch <= 'z':
+		return true;
+	case 'A' <= ch && ch <= 'Z':
+		return true;
+	case ch == '_':
+		return true;
+	}
+	return ch >= 128;
+}
+
+
+// digit_val returns the numeric value of the digit ch, accepting '0'-'9'
+// and the hexadecimal letters in either case. Any other character yields
+// 16, which is larger than every legal digit value, so callers can test
+// digit_val(ch) < base.
+func digit_val (ch int) int {
+	switch {
+	case '0' <= ch && ch <= '9':
+		return ch - '0';
+	case 'a' <= ch && ch <= 'f':
+		return ch - 'a' + 10;
+	case 'A' <= ch && ch <= 'F':
+		return ch - 'A' + 10;
+	}
+	return 16;
+}
+
+
+export Scanner
+// Scanner holds the state for tokenizing one source text: the text itself,
+// the read position with a one-character look-ahead, and the bookkeeping
+// used for error reporting. Open initializes all of it.
+type Scanner struct {
+ filename string; // error reporting only
+ nerrors int; // number of errors
+ errpos int; // last error position
+
+ src string; // source text being scanned
+ pos int; // current reading position
+ ch int; // one char look-ahead
+ chpos int; // position of ch
+}
+
+
+// Read the next Unicode char into S.ch.
+// S.ch < 0 means end-of-file.
+//
+// The char is decoded from the UTF-8 bytes of S.src starting at S.pos.
+// On success S.chpos is set to the position of the char's first byte and
+// S.pos is moved past its last byte. At end-of-file S.chpos is set to
+// len(S.src) and S.pos is left unchanged. A malformed or unsupported
+// sequence yields Bad (0xFFFD) and advances S.pos by a single byte.
+// Only 1-, 2- and 3-byte sequences are decoded; a lead byte of T4 or
+// above ends up at the bad label.
+//
+func (S *Scanner) Next () {
+ const (
+ Bit1 = 7;
+ Bitx = 6;
+ Bit2 = 5;
+ Bit3 = 4;
+ Bit4 = 3;
+
+ // TODO 6g constant evaluation incomplete
+ T1 = 0x00; // (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000
+ Tx = 0x80; // (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000
+ T2 = 0xC0; // (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000
+ T3 = 0xE0; // (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000
+ T4 = 0xF0; // (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000
+
+ Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111
+ Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111
+ Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111
+
+ Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111
+ Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000
+
+ Bad = 0xFFFD; // Runeerror
+ );
+
+ src := S.src; // TODO only needed because of 6g bug
+ lim := len(src);
+ pos := S.pos;
+
+ // 1-byte sequence
+ // 0000-007F => T1
+ if pos >= lim {
+ S.ch = -1; // end of file
+ S.chpos = lim;
+ return;
+ }
+ c0 := int(src[pos]);
+ pos++;
+ if c0 < Tx {
+ S.ch = c0;
+ S.chpos = S.pos;
+ S.pos = pos;
+ return;
+ }
+
+ // 2-byte sequence
+ // 0080-07FF => T2 Tx
+ if pos >= lim {
+ goto bad;
+ }
+ // xor with Tx clears the 10xx xxxx marker of a continuation byte; any
+ // bit left in Testx means the byte was not a continuation byte
+ c1 := int(src[pos]) ^ Tx;
+ pos++;
+ if c1 & Testx != 0 {
+ goto bad;
+ }
+ if c0 < T3 {
+ if c0 < T2 {
+ // a continuation byte cannot start a sequence
+ goto bad;
+ }
+ r := (c0 << Bitx | c1) & Rune2;
+ if r <= Rune1 {
+ // value fits in one byte: overlong encoding
+ goto bad;
+ }
+ S.ch = r;
+ S.chpos = S.pos;
+ S.pos = pos;
+ return;
+ }
+
+ // 3-byte sequence
+ // 0800-FFFF => T3 Tx Tx
+ if pos >= lim {
+ goto bad;
+ }
+ c2 := int(src[pos]) ^ Tx;
+ pos++;
+ if c2 & Testx != 0 {
+ goto bad;
+ }
+ if c0 < T4 {
+ r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3;
+ if r <= Rune2 {
+ // value fits in two bytes: overlong encoding
+ goto bad;
+ }
+ S.ch = r;
+ S.chpos = S.pos;
+ S.pos = pos;
+ return;
+ }
+
+ // bad encoding
+ // (the local pos is discarded: only one byte is skipped)
+bad:
+ S.ch = Bad;
+ S.chpos = S.pos;
+ S.pos += 1;
+ return;
+}
+
+
+// Init allocates the Keywords table and enters every keyword token, keyed
+// by the spelling that TokenName returns for it.
+func Init () {
+	Keywords = new(map [string] int);
+
+	// KEYWORDS_BEG and KEYWORDS_END only mark the ends of the keyword range.
+	// TokenName has no spelling for them and returns "???", so they are
+	// excluded to keep that bogus key out of the table.
+	for i := KEYWORDS_BEG + 1; i < KEYWORDS_END; i++ {
+		Keywords[TokenName(i)] = i;
+	}
+}
+
+
+// LineCol computes (line, column) information for a given source position.
+//
+// pos is a byte offset into S.src; values beyond the end of the source are
+// clamped to len(S.src). Lines and columns are both 1-based, and the column
+// is counted in bytes from the start of the line. The source is rescanned
+// from the beginning on every call.
+func (S *Scanner) LineCol(pos int) (line, col int) {
+	line = 1;
+	// lpos is the offset of the newline that ends the previous line. It
+	// starts at -1, as if a newline preceded the source, so that columns on
+	// the first line are 1-based like those on every other line.
+	lpos := -1;
+
+	src := S.src;
+	if pos > len(src) {
+		pos = len(src);
+	}
+
+	for i := 0; i < pos; i++ {
+		if src[i] == '\n' {
+			line++;
+			lpos = i;
+		}
+	}
+
+	return line, pos - lpos;
+}
+
+
+// Error prints msg, prefixed with the file name and the line and column of
+// pos, and records the error. Apart from the very first error, a report is
+// suppressed unless pos lies more than errdist beyond the position of the
+// last reported error.
+func (S *Scanner) Error(pos int, msg string) {
+	const errdist = 10;
+
+	// Too close to the previous report: stay silent and count nothing.
+	if S.nerrors != 0 && pos <= S.errpos + errdist {
+		return;
+	}
+
+	line, col := S.LineCol(pos);
+	print S.filename, ":", line, ":", col, ": ", msg, "\n";
+	S.errpos = pos;
+	S.nerrors++;
+}
+
+
+// Open prepares S for scanning src. filename is used in error messages
+// only. The error state is reset, the Keywords table is built if it does
+// not exist yet, and the first character is read into the look-ahead.
+func (S *Scanner) Open (filename, src string) {
+	// source and read position
+	S.src = src;
+	S.pos = 0;
+
+	// error reporting state
+	S.filename = filename;
+	S.errpos = 0;
+	S.nerrors = 0;
+
+	if Keywords == nil {
+		Init();
+	}
+
+	// prime S.ch and S.chpos
+	S.Next();
+}
+
+
+// TODO this needs to go elsewhere
+//
+// IntString returns the representation of x in the given base, using the
+// digits 0-9 and A-F, with a leading "-" for negative values. The digit
+// table limits base to the range 2 to 16. The most negative int cannot be
+// negated and causes a panic.
+func IntString(x, base int) string {
+	neg := false;
+	if x < 0 {
+		x = -x;
+		if x < 0 {
+			panic "smallest int not handled";
+		}
+		neg = true;
+	}
+
+	hex := "0123456789ABCDEF";
+	// 64 digits are enough for base 2, the longest representation, of an
+	// int of up to 64 bits; the former 16-byte buffer overflowed for small
+	// bases.
+	var buf [64] byte;
+	i := 0;
+	// Digits are produced least significant first; the i == 0 test makes
+	// sure that zero still yields one digit.
+	for x > 0 || i == 0 {
+		buf[i] = hex[x % base];
+		x /= base;
+		i++;
+	}
+
+	s := "";
+	if neg {
+		s = "-";
+	}
+	// Emit the collected digits in reverse, most significant first.
+	for i > 0 {
+		i--;
+		s = s + string(int(buf[i]));
+	}
+	return s;
+}
+
+
+func CharString(ch int) string {
+ s := string(ch);
+ switch ch {
+ case '\a': s = `\a`;
+ case '\b': s = `\b`;
+ case '\f': s = `\f`;
+ case '\n': s = `\n`;
+ case '\r': s = `\r`;
+ case '\t': s = `\t`;
+ case '\v': s = `\v`;
+ case '\\': s = `\\`;
+ case '\'': s = `\'`;
+ }
+ return "'" + s + "' (U+" + IntString(ch, 16) + ")";
+}
+
+
+// Expect checks that the look-ahead character is ch and reports an error
+// at its position if it is not. The look-ahead is consumed either way, so
+// the scanner always makes progress.
+func (S *Scanner) Expect (ch int) {
+	found := S.ch;
+	if found != ch {
+		msg := "expected " + CharString(ch) + ", found " + CharString(found);
+		S.Error(S.chpos, msg);
+	}
+	S.Next();
+}
+
+
+// SkipWhitespace advances the scanner until the look-ahead character is no
+// longer white space as defined by is_whitespace.
+func (S *Scanner) SkipWhitespace () {
+	ch := S.ch;
+	for is_whitespace(ch) {
+		S.Next();
+		ch = S.ch;
+	}
+}
+
+
+// SkipComment skips over a comment whose opening '/' has already been
+// consumed. A line comment extends up to, but does not include, the next
+// newline or the end of the source. A general comment extends through the
+// closing "*/"; if the source ends first, an error is reported at the
+// position of the opening '/'.
+func (S *Scanner) SkipComment () {
+	if S.ch == '/' {
+		// line comment: the terminating newline is left for the caller
+		S.Next();
+		for S.ch >= 0 && S.ch != '\n' {
+			S.Next();
+		}
+		return;
+	}
+
+	// general comment
+	start := S.chpos - 1;  // position of the '/' consumed by the caller
+	S.Expect('*');
+	for S.ch >= 0 {
+		prev := S.ch;
+		S.Next();
+		if prev == '*' {
+			if S.ch == '/' {
+				S.Next();
+				return;
+			}
+		}
+	}
+	S.Error(start, "comment not terminated");
+}
+
+
+// ScanIdentifier consumes a run of letters and decimal digits starting at
+// the look-ahead character and returns the token code of the keyword it
+// spells, or IDENT if it is not in the Keywords table.
+func (S *Scanner) ScanIdentifier () int {
+	// S.chpos is the byte offset of the look-ahead character. The former
+	// S.pos - 1 is only equal to it for a one-byte look-ahead: it is off
+	// for multi-byte characters, and at end of file, where S.pos stops
+	// advancing, it cut the last byte off the identifier.
+	beg := S.chpos;
+	for is_letter(S.ch) || digit_val(S.ch) < 10 {
+		S.Next();
+	}
+	end := S.chpos;
+
+	var tok int;
+	var present bool;
+	tok, present = Keywords[S.src[beg : end]];
+	if !present {
+		tok = IDENT;
+	}
+
+	return tok;
+}
+
+
+// ScanMantissa consumes consecutive characters that are digits in the
+// given base and stops at the first character that is not.
+func (S *Scanner) ScanMantissa (base int) {
+	for d := digit_val(S.ch); d < base; d = digit_val(S.ch) {
+		S.Next();
+	}
+}
+
+
+// ScanNumber consumes an integer or floating-point literal and returns
+// NUMBER in every case; no error is reported from here.
+//
+// seen_decimal_point is true when the caller has already consumed a '.'
+// that starts the literal (see the '.' case in Scan); only the fraction
+// digits and an optional exponent are scanned then. Otherwise the
+// look-ahead is the literal's first digit.
+func (S *Scanner) ScanNumber (seen_decimal_point bool) int {
+ if seen_decimal_point {
+ S.ScanMantissa(10);
+ goto exponent;
+ }
+
+ if S.ch == '0' {
+ // int or float
+ S.Next();
+ if S.ch == 'x' || S.ch == 'X' {
+ // hexadecimal int
+ // NOTE(review): "0x" without any hex digit is accepted silently
+ S.Next();
+ S.ScanMantissa(16);
+ } else {
+ // octal int or float
+ S.ScanMantissa(8);
+ // an 8 or 9, a '.', or an exponent after the octal digits means the
+ // literal is a float that merely starts with 0
+ if digit_val(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' {
+ // float
+ goto mantissa;
+ }
+ // octal int
+ }
+ return NUMBER;
+ }
+
+mantissa:
+ // decimal int or float
+ S.ScanMantissa(10);
+
+ if S.ch == '.' {
+ // float
+ S.Next();
+ S.ScanMantissa(10)
+ }
+
+exponent:
+ if S.ch == 'e' || S.ch == 'E' {
+ // float
+ S.Next();
+ // optional sign of the exponent
+ if S.ch == '-' || S.ch == '+' {
+ S.Next();
+ }
+ S.ScanMantissa(10);
+ }
+ return NUMBER;
+}
+
+
+// ScanDigits consumes up to n digits of the given base, as required by a
+// fixed-length character escape, and reports an error at the current
+// position if fewer than n digits are present.
+func (S *Scanner) ScanDigits(n int, base int) {
+	// The n > 0 test stops after exactly n digits. Without it every
+	// following digit was consumed, so a digit after the escape was
+	// treated as part of it.
+	for n > 0 && digit_val(S.ch) < base {
+		S.Next();
+		n--;
+	}
+	if n > 0 {
+		S.Error(S.chpos, "illegal char escape");
+	}
+}
+
+
+// ScanEscape consumes the remainder of an escape sequence whose backslash
+// has already been consumed; the look-ahead is the character that follows
+// the backslash. An unknown escape character is reported as an error.
+//
+// The result is provisional: for the single-character escapes it is the
+// escape letter itself, not the character it denotes, and for the numeric
+// escapes and for errors it is the empty string.
+func (S *Scanner) ScanEscape () string {
+	// TODO: fix this routine
+
+	ch := S.ch;
+	pos := S.chpos;
+	S.Next();
+	switch (ch) {
+	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
+		return string(ch);
+
+	case '0', '1', '2', '3', '4', '5', '6', '7':
+		S.ScanDigits(3 - 1, 8);  // 1 char already read
+		return "";  // TODO fix this
+
+	case 'x':
+		S.ScanDigits(2, 16);
+		return "";  // TODO fix this
+
+	case 'u':
+		S.ScanDigits(4, 16);
+		return "";  // TODO fix this
+
+	case 'U':
+		S.ScanDigits(8, 16);
+		return "";  // TODO fix this
+
+	default:
+		S.Error(pos, "illegal char escape");
+	}
+
+	// Reached after an illegal escape only; previously this path fell off
+	// the end of the function without returning a value.
+	return "";
+}
+
+
+// ScanChar consumes the remainder of a character literal whose opening
+// quote has already been consumed: one character or one escape sequence,
+// followed by the closing quote, which is checked with Expect. The token
+// code returned is always NUMBER.
+func (S *Scanner) ScanChar () int {
+	escaped := S.ch == '\\';
+	S.Next();
+	if escaped {
+		S.ScanEscape();
+	}
+
+	S.Expect('\'');
+	return NUMBER;
+}
+
+
+// ScanString consumes the remainder of a string literal whose opening '"'
+// has already been consumed, including the closing quote, and returns
+// STRING. A newline or the end of the source before the closing quote is
+// reported as an error at the position of the opening quote; scanning then
+// stops right after the newline (or at the end of the source).
+func (S *Scanner) ScanString () int {
+	// '"' already consumed
+
+	pos := S.chpos - 1;
+	for S.ch != '"' {
+		ch := S.ch;
+		S.Next();
+		if ch == '\n' || ch < 0 {
+			S.Error(pos, "string not terminated");
+			// Return here rather than break: there is no closing quote to
+			// consume, and the S.Next() after the loop used to swallow the
+			// first character following the newline.
+			return STRING;
+		}
+		if ch == '\\' {
+			S.ScanEscape();
+		}
+	}
+
+	S.Next();  // consume the closing quote
+	return STRING;
+}
+
+
+// ScanRawString consumes the remainder of a raw string literal whose
+// opening '`' has already been consumed, including the closing back quote,
+// and returns STRING. No escape sequences are interpreted. A newline or
+// the end of the source before the closing back quote is reported as an
+// error at the position of the opening back quote; scanning then stops
+// right after the newline (or at the end of the source).
+func (S *Scanner) ScanRawString () int {
+	// '`' already consumed
+
+	pos := S.chpos - 1;
+	for S.ch != '`' {
+		ch := S.ch;
+		S.Next();
+		if ch == '\n' || ch < 0 {
+			S.Error(pos, "string not terminated");
+			// Return here rather than break: there is no closing back quote
+			// to consume, and the S.Next() after the loop used to swallow
+			// the first character following the newline.
+			return STRING;
+		}
+	}
+
+	S.Next();  // consume the closing back quote
+	return STRING;
+}
+
+
+// Select2 chooses between an operator and its '=' form: if the look-ahead
+// is '=', it is consumed and tok1 is returned; otherwise nothing is
+// consumed and tok0 is returned.
+func (S *Scanner) Select2 (tok0, tok1 int) int {
+	if S.ch != '=' {
+		return tok0;
+	}
+	S.Next();
+	return tok1;
+}
+
+
+// Select3 chooses among three operators that share their first character:
+// a look-ahead of '=' is consumed and gives tok1, a look-ahead of ch2 is
+// consumed and gives tok2; otherwise nothing is consumed and tok0 is
+// returned. The '=' test takes precedence.
+func (S *Scanner) Select3 (tok0, tok1, ch2, tok2 int) int {
+	switch {
+	case S.ch == '=':
+		S.Next();
+		return tok1;
+	case S.ch == ch2:
+		S.Next();
+		return tok2;
+	}
+	return tok0;
+}
+
+
+// Select4 chooses among four operators that share their first character:
+// a look-ahead of '=' is consumed and gives tok1; a look-ahead of ch2 is
+// consumed and gives tok3 if an '=' follows (also consumed) and tok2 if
+// not; otherwise nothing is consumed and tok0 is returned.
+func (S *Scanner) Select4 (tok0, tok1, ch2, tok2, tok3 int) int {
+	switch {
+	case S.ch == '=':
+		S.Next();
+		return tok1;
+	case S.ch == ch2:
+		S.Next();
+		// what remains is the two-way choice on a trailing '='
+		return S.Select2(tok2, tok3);
+	}
+	return tok0;
+}
+
+
+// Scan skips white space and comments and returns the next token.
+//
+// tok is the token code, beg the position of the token's first character,
+// and end the position of the look-ahead character once the token has
+// been consumed. At the end of the source tok is EOF. A character that
+// starts no token is reported through Error and returned as ILLEGAL.
+func (S *Scanner) Scan () (tok, beg, end int) {
+ S.SkipWhitespace();
+
+ ch := S.ch;
+ tok = ILLEGAL;
+ beg = S.chpos;
+
+ switch {
+ // identifiers and numbers are scanned starting at their first character
+ case is_letter(ch): tok = S.ScanIdentifier();
+ case digit_val(ch) < 10: tok = S.ScanNumber(false);
+ default:
+ // every other token is dispatched on ch with ch already consumed
+ S.Next(); // always make progress
+ switch ch {
+ case -1: tok = EOF;
+ case '"': tok = S.ScanString();
+ case '\'': tok = S.ScanChar();
+ case '`': tok = S.ScanRawString();
+ case ':': tok = S.Select2(COLON, DEFINE);
+ case '.':
+ // a '.' directly followed by a digit starts a number
+ if digit_val(S.ch) < 10 {
+ tok = S.ScanNumber(true);
+ } else {
+ tok = PERIOD;
+ }
+ case ',': tok = COMMA;
+ case ';': tok = SEMICOLON;
+ case '(': tok = LPAREN;
+ case ')': tok = RPAREN;
+ case '[': tok = LBRACK;
+ case ']': tok = RBRACK;
+ case '{': tok = LBRACE;
+ case '}': tok = RBRACE;
+ case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC);
+ case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC);
+ case '*': tok = S.Select2(MUL, MUL_ASSIGN);
+ case '/':
+ if S.ch == '/' || S.ch == '*' {
+ // a comment is not a token: skip it and scan again
+ S.SkipComment();
+ // cannot simply return because of 6g bug
+ tok, beg, end = S.Scan();
+ return tok, beg, end;
+ }
+ tok = S.Select2(QUO, QUO_ASSIGN);
+ case '%': tok = S.Select2(REM, REM_ASSIGN);
+ case '^': tok = S.Select2(XOR, XOR_ASSIGN);
+ case '<': tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN);
+ case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN);
+ case '=': tok = S.Select2(ASSIGN, EQL);
+ case '!': tok = S.Select2(NOT, NEQ);
+ case '&': tok = S.Select3(AND, AND_ASSIGN, '&', LAND);
+ case '|': tok = S.Select3(OR, OR_ASSIGN, '|', LOR);
+ default:
+ S.Error(beg, "illegal character " + CharString(ch));
+ tok = ILLEGAL;
+ }
+ }
+
+ return tok, beg, S.chpos;
+}