diff options
Diffstat (limited to 'usr/gri/gosrc/scanner.go')
-rw-r--r-- | usr/gri/gosrc/scanner.go | 777 |
1 files changed, 777 insertions, 0 deletions
diff --git a/usr/gri/gosrc/scanner.go b/usr/gri/gosrc/scanner.go new file mode 100644 index 000000000..94d8f1915 --- /dev/null +++ b/usr/gri/gosrc/scanner.go @@ -0,0 +1,777 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package Scanner + +export + ILLEGAL, EOF, IDENT, STRING, NUMBER, + COMMA, COLON, SEMICOLON, PERIOD, + LPAREN, RPAREN, LBRACK, RBRACK, LBRACE, RBRACE, + ASSIGN, DEFINE, + INC, DEC, NOT, + AND, OR, XOR, + ADD, SUB, MUL, QUO, REM, + EQL, NEQ, LSS, LEQ, GTR, GEQ, + SHL, SHR, + ADD_ASSIGN, SUB_ASSIGN, MUL_ASSIGN, QUO_ASSIGN, REM_ASSIGN, + AND_ASSIGN, OR_ASSIGN, XOR_ASSIGN, SHL_ASSIGN, SHR_ASSIGN, + LAND, LOR, + BREAK, CASE, CHAN, CONST, CONTINUE, DEFAULT, ELSE, EXPORT, FALLTHROUGH, FALSE, + FOR, FUNC, GO, GOTO, IF, IMPORT, INTERFACE, IOTA, MAP, NEW, NIL, PACKAGE, RANGE, + RETURN, SELECT, STRUCT, SWITCH, TRUE, TYPE, VAR + + +const ( + ILLEGAL = iota; + EOF; + IDENT; + STRING; + NUMBER; + + COMMA; + COLON; + SEMICOLON; + PERIOD; + + LPAREN; + RPAREN; + LBRACK; + RBRACK; + LBRACE; + RBRACE; + + ASSIGN; + DEFINE; + + INC; + DEC; + NOT; + + AND; + OR; + XOR; + + ADD; + SUB; + MUL; + QUO; + REM; + + EQL; + NEQ; + LSS; + LEQ; + GTR; + GEQ; + + SHL; + SHR; + + ADD_ASSIGN; + SUB_ASSIGN; + MUL_ASSIGN; + QUO_ASSIGN; + REM_ASSIGN; + + AND_ASSIGN; + OR_ASSIGN; + XOR_ASSIGN; + + SHL_ASSIGN; + SHR_ASSIGN; + + LAND; + LOR; + + // keywords + KEYWORDS_BEG; + BREAK; + CASE; + CHAN; + CONST; + CONTINUE; + DEFAULT; + ELSE; + EXPORT; + FALLTHROUGH; + FALSE; + FOR; + FUNC; + GO; + GOTO; + IF; + IMPORT; + INTERFACE; + IOTA; + MAP; + NEW; + NIL; + PACKAGE; + RANGE; + RETURN; + SELECT; + STRUCT; + SWITCH; + TRUE; + TYPE; + VAR; + KEYWORDS_END; +) + + +var Keywords *map [string] int; + + +export TokenName +func TokenName(tok int) string { + switch (tok) { + case ILLEGAL: return "illegal"; + case EOF: return "eof"; + case IDENT: return "ident"; + case STRING: return "string"; + case NUMBER: return "number"; + + case COMMA: return ","; + case COLON: return ":"; + case SEMICOLON: return ";"; + case PERIOD: return "."; + + case LPAREN: return "("; + case RPAREN: return ")"; + case LBRACK: return "["; + case RBRACK: return "]"; + case LBRACE: return "LBRACE"; + case RBRACE: return "RBRACE"; + + case ASSIGN: return "="; + case DEFINE: return ":="; + + case INC: return "++"; + case DEC: return "--"; + case NOT: return "!"; + + case AND: return "&"; + case OR: return "|"; + case XOR: return "^"; + + case ADD: return "+"; + case SUB: return "-"; + case MUL: return "*"; + case QUO: return "/"; + case REM: return "%"; + + case EQL: return "=="; + case NEQ: return "!="; + case LSS: return "<"; + case LEQ: return "<="; + case GTR: return ">"; + case GEQ: return ">="; + + case SHL: return "<<"; + case SHR: return ">>"; + + case ADD_ASSIGN: return "+="; + case SUB_ASSIGN: return "-="; + case MUL_ASSIGN: return "+="; + case QUO_ASSIGN: return "/="; + case REM_ASSIGN: return "%="; + + case AND_ASSIGN: return "&="; + case OR_ASSIGN: return "|="; + case XOR_ASSIGN: return "^="; + + case SHL_ASSIGN: return "<<="; + case SHR_ASSIGN: return ">>="; + + case LAND: return "&&"; + case LOR: return "||"; + + case BREAK: return "break"; + case CASE: return "case"; + case CHAN: return "chan"; + case CONST: return "const"; + case CONTINUE: return "continue"; + case DEFAULT: return "default"; + case ELSE: return "else"; + case EXPORT: return "export"; + case FALLTHROUGH: return "fallthrough"; + case FALSE: return "false"; + case FOR: return "for"; + case FUNC: return "func"; + case GO: return "go"; + case GOTO: return "goto"; + case IF: return "if"; + case IMPORT: return "import"; + case INTERFACE: return "interface"; + case IOTA: return "iota"; + case MAP: return "map"; + case NEW: return "new"; + case NIL: return "nil"; + case PACKAGE: return "package"; + case RANGE: return "range"; + case RETURN: return "return"; + case SELECT: return "select"; + case STRUCT: return "struct"; + case SWITCH: return "switch"; + case TRUE: return "true"; + case TYPE: return "type"; + case VAR: return "var"; + } + + return "???"; +} + + +func is_whitespace (ch int) bool { + return ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'; +} + + +func is_letter (ch int) bool { + return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 128 ; +} + + +func digit_val (ch int) int { + if '0' <= ch && ch <= '9' { + return ch - '0'; + } + if 'a' <= ch && ch <= 'f' { + return ch - 'a' + 10; + } + if 'A' <= ch && ch <= 'F' { + return ch - 'A' + 10; + } + return 16; // larger than any legal digit val +} + + +export Scanner +type Scanner struct { + filename string; // error reporting only + nerrors int; // number of errors + errpos int; // last error position + + src string; + pos int; // current reading position + ch int; // one char look-ahead + chpos int; // position of ch +} + + +// Read the next Unicode char into S.ch. +// S.ch < 0 means end-of-file. +// +func (S *Scanner) Next () { + const ( + Bit1 = 7; + Bitx = 6; + Bit2 = 5; + Bit3 = 4; + Bit4 = 3; + + // TODO 6g constant evaluation incomplete + T1 = 0x00; // (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000 + Tx = 0x80; // (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000 + T2 = 0xC0; // (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000 + T3 = 0xE0; // (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000 + T4 = 0xF0; // (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000 + + Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111 + Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111 + Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111 + + Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111 + Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000 + + Bad = 0xFFFD; // Runeerror + ); + + src := S.src; // TODO only needed because of 6g bug + lim := len(src); + pos := S.pos; + + // 1-byte sequence + // 0000-007F => T1 + if pos >= lim { + S.ch = -1; // end of file + S.chpos = lim; + return; + } + c0 := int(src[pos]); + pos++; + if c0 < Tx { + S.ch = c0; + S.chpos = S.pos; + S.pos = pos; + return; + } + + // 2-byte sequence + // 0080-07FF => T2 Tx + if pos >= lim { + goto bad; + } + c1 := int(src[pos]) ^ Tx; + pos++; + if c1 & Testx != 0 { + goto bad; + } + if c0 < T3 { + if c0 < T2 { + goto bad; + } + r := (c0 << Bitx | c1) & Rune2; + if r <= Rune1 { + goto bad; + } + S.ch = r; + S.chpos = S.pos; + S.pos = pos; + return; + } + + // 3-byte sequence + // 0800-FFFF => T3 Tx Tx + if pos >= lim { + goto bad; + } + c2 := int(src[pos]) ^ Tx; + pos++; + if c2 & Testx != 0 { + goto bad; + } + if c0 < T4 { + r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3; + if r <= Rune2 { + goto bad; + } + S.ch = r; + S.chpos = S.pos; + S.pos = pos; + return; + } + + // bad encoding +bad: + S.ch = Bad; + S.chpos = S.pos; + S.pos += 1; + return; +} + + +func Init () { + Keywords = new(map [string] int); + + for i := KEYWORDS_BEG; i <= KEYWORDS_END; i++ { + Keywords[TokenName(i)] = i; + } +} + + +// Compute (line, column) information for a given source position. +func (S *Scanner) LineCol(pos int) (line, col int) { + line = 1; + lpos := 0; + + src := S.src; + if pos > len(src) { + pos = len(src); + } + + for i := 0; i < pos; i++ { + if src[i] == '\n' { + line++; + lpos = i; + } + } + + return line, pos - lpos; +} + + +func (S *Scanner) Error(pos int, msg string) { + const errdist = 10; + if pos > S.errpos + errdist || S.nerrors == 0 { + line, col := S.LineCol(pos); + print S.filename, ":", line, ":", col, ": ", msg, "\n"; + S.nerrors++; + S.errpos = pos; + } +} + + +func (S *Scanner) Open (filename, src string) { + if Keywords == nil { + Init(); + } + + S.filename = filename; + S.nerrors = 0; + S.errpos = 0; + + S.src = src; + S.pos = 0; + S.Next(); +} + + +// TODO this needs to go elsewhere +func IntString(x, base int) string { + neg := false; + if x < 0 { + x = -x; + if x < 0 { + panic "smallest int not handled"; + } + neg = true; + } + + hex := "0123456789ABCDEF"; + var buf [16] byte; + i := 0; + for x > 0 || i == 0 { + buf[i] = hex[x % base]; + x /= base; + i++; + } + + s := ""; + if neg { + s = "-"; + } + for i > 0 { + i--; + s = s + string(int(buf[i])); + } + return s; +} + + +func CharString(ch int) string { + s := string(ch); + switch ch { + case '\a': s = `\a`; + case '\b': s = `\b`; + case '\f': s = `\f`; + case '\n': s = `\n`; + case '\r': s = `\r`; + case '\t': s = `\t`; + case '\v': s = `\v`; + case '\\': s = `\\`; + case '\'': s = `\'`; + } + return "'" + s + "' (U+" + IntString(ch, 16) + ")"; +} + + +func (S *Scanner) Expect (ch int) { + if S.ch != ch { + S.Error(S.chpos, "expected " + CharString(ch) + ", found " + CharString(S.ch)); + } + S.Next(); // make always progress +} + + +func (S *Scanner) SkipWhitespace () { + for is_whitespace(S.ch) { + S.Next(); + } +} + + +func (S *Scanner) SkipComment () { + // '/' already consumed + if S.ch == '/' { + // comment + S.Next(); + for S.ch != '\n' && S.ch >= 0 { + S.Next(); + } + + } else { + /* comment */ + pos := S.chpos - 1; + S.Expect('*'); + for S.ch >= 0 { + ch := S.ch; + S.Next(); + if ch == '*' && S.ch == '/' { + S.Next(); + return; + } + } + S.Error(pos, "comment not terminated"); + } +} + + +func (S *Scanner) ScanIdentifier () int { + beg := S.pos - 1; + for is_letter(S.ch) || digit_val(S.ch) < 10 { + S.Next(); + } + end := S.pos - 1; + + var tok int; + var present bool; + tok, present = Keywords[S.src[beg : end]]; + if !present { + tok = IDENT; + } + + return tok; +} + + +func (S *Scanner) ScanMantissa (base int) { + for digit_val(S.ch) < base { + S.Next(); + } +} + + +func (S *Scanner) ScanNumber (seen_decimal_point bool) int { + if seen_decimal_point { + S.ScanMantissa(10); + goto exponent; + } + + if S.ch == '0' { + // int or float + S.Next(); + if S.ch == 'x' || S.ch == 'X' { + // hexadecimal int + S.Next(); + S.ScanMantissa(16); + } else { + // octal int or float + S.ScanMantissa(8); + if digit_val(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' { + // float + goto mantissa; + } + // octal int + } + return NUMBER; + } + +mantissa: + // decimal int or float + S.ScanMantissa(10); + + if S.ch == '.' { + // float + S.Next(); + S.ScanMantissa(10) + } + +exponent: + if S.ch == 'e' || S.ch == 'E' { + // float + S.Next(); + if S.ch == '-' || S.ch == '+' { + S.Next(); + } + S.ScanMantissa(10); + } + return NUMBER; +} + + +func (S *Scanner) ScanDigits(n int, base int) { + for digit_val(S.ch) < base { + S.Next(); + n--; + } + if n > 0 { + S.Error(S.chpos, "illegal char escape"); + } +} + + +func (S *Scanner) ScanEscape () string { + // TODO: fix this routine + + ch := S.ch; + pos := S.chpos; + S.Next(); + switch (ch) { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"': + return string(ch); + + case '0', '1', '2', '3', '4', '5', '6', '7': + S.ScanDigits(3 - 1, 8); // 1 char already read + return ""; // TODO fix this + + case 'x': + S.ScanDigits(2, 16); + return ""; // TODO fix this + + case 'u': + S.ScanDigits(4, 16); + return ""; // TODO fix this + + case 'U': + S.ScanDigits(8, 16); + return ""; // TODO fix this + + default: + S.Error(pos, "illegal char escape"); + } +} + + +func (S *Scanner) ScanChar () int { + // '\'' already consumed + + ch := S.ch; + S.Next(); + if ch == '\\' { + S.ScanEscape(); + } + + S.Expect('\''); + return NUMBER; +} + + +func (S *Scanner) ScanString () int { + // '"' already consumed + + pos := S.chpos - 1; + for S.ch != '"' { + ch := S.ch; + S.Next(); + if ch == '\n' || ch < 0 { + S.Error(pos, "string not terminated"); + break; + } + if ch == '\\' { + S.ScanEscape(); + } + } + + S.Next(); + return STRING; +} + + +func (S *Scanner) ScanRawString () int { + // '`' already consumed + + pos := S.chpos - 1; + for S.ch != '`' { + ch := S.ch; + S.Next(); + if ch == '\n' || ch < 0 { + S.Error(pos, "string not terminated"); + break; + } + } + + S.Next(); + return STRING; +} + + +func (S *Scanner) Select2 (tok0, tok1 int) int { + if S.ch == '=' { + S.Next(); + return tok1; + } + return tok0; +} + + +func (S *Scanner) Select3 (tok0, tok1, ch2, tok2 int) int { + if S.ch == '=' { + S.Next(); + return tok1; + } + if S.ch == ch2 { + S.Next(); + return tok2; + } + return tok0; +} + + +func (S *Scanner) Select4 (tok0, tok1, ch2, tok2, tok3 int) int { + if S.ch == '=' { + S.Next(); + return tok1; + } + if S.ch == ch2 { + S.Next(); + if S.ch == '=' { + S.Next(); + return tok3; + } + return tok2; + } + return tok0; +} + + +func (S *Scanner) Scan () (tok, beg, end int) { + S.SkipWhitespace(); + + ch := S.ch; + tok = ILLEGAL; + beg = S.chpos; + + switch { + case is_letter(ch): tok = S.ScanIdentifier(); + case digit_val(ch) < 10: tok = S.ScanNumber(false); + default: + S.Next(); // always make progress + switch ch { + case -1: tok = EOF; + case '"': tok = S.ScanString(); + case '\'': tok = S.ScanChar(); + case '`': tok = S.ScanRawString(); + case ':': tok = S.Select2(COLON, DEFINE); + case '.': + if digit_val(S.ch) < 10 { + tok = S.ScanNumber(true); + } else { + tok = PERIOD; + } + case ',': tok = COMMA; + case ';': tok = SEMICOLON; + case '(': tok = LPAREN; + case ')': tok = RPAREN; + case '[': tok = LBRACK; + case ']': tok = RBRACK; + case '{': tok = LBRACE; + case '}': tok = RBRACE; + case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC); + case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC); + case '*': tok = S.Select2(MUL, MUL_ASSIGN); + case '/': + if S.ch == '/' || S.ch == '*' { + S.SkipComment(); + // cannot simply return because of 6g bug + tok, beg, end = S.Scan(); + return tok, beg, end; + } + tok = S.Select2(QUO, QUO_ASSIGN); + case '%': tok = S.Select2(REM, REM_ASSIGN); + case '^': tok = S.Select2(XOR, XOR_ASSIGN); + case '<': tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN); + case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN); + case '=': tok = S.Select2(ASSIGN, EQL); + case '!': tok = S.Select2(NOT, NEQ); + case '&': tok = S.Select3(AND, AND_ASSIGN, '&', LAND); + case '|': tok = S.Select3(OR, OR_ASSIGN, '|', LOR); + default: + S.Error(beg, "illegal character " + CharString(ch)); + tok = ILLEGAL; + } + } + + return tok, beg, S.chpos; +} |