summaryrefslogtreecommitdiff
path: root/src/lib/go/scanner.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/go/scanner.go')
-rw-r--r--src/lib/go/scanner.go475
1 files changed, 475 insertions, 0 deletions
diff --git a/src/lib/go/scanner.go b/src/lib/go/scanner.go
new file mode 100644
index 000000000..ad7f80b5b
--- /dev/null
+++ b/src/lib/go/scanner.go
@@ -0,0 +1,475 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scanner
+
+// A Go scanner. Takes a []byte as source which can then be
+// tokenized through repeated calls to the Scan() function.
+//
+// Sample use:
+//
+// import "token"
+// import "scanner"
+//
+// func tokenize(src []byte) {
+// var s scanner.Scanner;
+// s.Init(src, nil /* no error handler */, false /* ignore comments */);
+// for {
+// pos, tok, lit := s.Scan();
+// if tok == token.EOF {
+// return;
+// }
+// println(pos, token.TokenString(tok), string(lit));
+// }
+// }
+
+import (
+ "utf8";
+ "unicode";
+ "strconv";
+ "token";
+)
+
+
+// ErrorHandler is the interface through which the Scanner reports
+// syntax errors. When the scanner detects an error, it calls Error()
+// with a position (a byte offset into the source) and a message
+// describing the problem.
+
+type ErrorHandler interface {
+ // Error is called by the scanner with the byte position pos the
+ // error refers to and the error message msg.
+ Error(pos int, msg string);
+}
+
+
+// Scanner holds the state for tokenizing a single source buffer.
+// Init() stores the source and loads the first look-ahead char;
+// Scan() then returns one token per call.
+type Scanner struct {
+ // immutable state
+ src []byte; // source
+ err ErrorHandler; // error reporting
+ scan_comments bool; // if set, comments are reported as tokens
+
+ // scanning state
+ pos int; // current reading position (byte offset of the next unread char)
+ ch int; // one char look-ahead; -1 at end of file
+ chpos int; // position of ch (byte offset in src)
+}
+
+
+// isLetter reports whether ch counts as a letter for identifiers:
+// an ASCII letter, an underscore, or a non-ASCII char that
+// unicode.IsLetter() accepts.
+func isLetter(ch int) bool {
+	if 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' {
+		return true;
+	}
+	// only consult the Unicode tables for non-ASCII chars
+	return ch >= 0x80 && unicode.IsLetter(ch);
+}
+
+
+// digitVal returns the numeric value of ch read as a digit
+// ('0'-'9', 'a'-'f' or 'A'-'F'). For any other char it returns 16,
+// which is larger than every legal digit value, so callers can simply
+// test digitVal(ch) < base.
+func digitVal(ch int) int {
+	if '0' <= ch && ch <= '9' {
+		return ch - '0';
+	}
+	if 'a' <= ch && ch <= 'f' {
+		return ch - 'a' + 10;
+	}
+	if 'A' <= ch && ch <= 'F' {
+		return ch - 'A' + 10;
+	}
+	return 16; // not a digit in any supported base
+}
+
+
+// next reads the next Unicode char into S.ch and records its byte
+// offset in S.chpos. At the end of the source, S.ch is set to -1
+// (end-of-file) and S.chpos to len(S.src).
+func (S *Scanner) next() {
+	if S.pos >= len(S.src) {
+		// eof
+		S.ch = -1;
+		S.chpos = len(S.src);
+		return;
+	}
+
+	S.chpos = S.pos;
+	c := int(S.src[S.pos]);
+	if c < 0x80 {
+		// ASCII: the byte is the char
+		S.ch = c;
+		S.pos++;
+		return;
+	}
+
+	// not ASCII: decode a UTF-8 sequence
+	r, w := utf8.DecodeRune(S.src[S.pos : len(S.src)]);
+	S.ch = r;
+	S.pos += w;
+}
+
+
+// Init prepares S to tokenize src, starting at its first byte.
+//
+// The error handler (err) is called when a syntax error is encountered.
+// If scan_comments is set to true, newline characters ('\n') and
+// comments are reported as token.COMMENT tokens; otherwise they are
+// treated as white space and ignored.
+//
+// Init may be called again to reuse S for another source: the reading
+// position is reset before the first look-ahead char is loaded.
+func (S *Scanner) Init(src []byte, err ErrorHandler, scan_comments bool) {
+	S.src = src;
+	S.err = err;
+	S.scan_comments = scan_comments;
+	S.pos = 0; // restart at the beginning, even if S was used before
+	S.next();
+}
+
+
+func charString(ch int) string {
+ s := string(ch);
+ switch ch {
+ case '\a': s = `\a`;
+ case '\b': s = `\b`;
+ case '\f': s = `\f`;
+ case '\n': s = `\n`;
+ case '\r': s = `\r`;
+ case '\t': s = `\t`;
+ case '\v': s = `\v`;
+ case '\\': s = `\\`;
+ case '\'': s = `\'`;
+ }
+ return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")";
+}
+
+
+// error reports msg at byte position pos to the error handler, if one
+// was installed with Init(). Without a handler the error is dropped:
+// the package's sample use passes nil for "no error handler", and a
+// syntax error in the source must not crash the scanner in that case.
+func (S *Scanner) error(pos int, msg string) {
+	if S.err != nil {
+		S.err.Error(pos, msg);
+	}
+}
+
+
+// expect checks that the look-ahead char is ch and reports an error at
+// the current char position if it is not. The look-ahead char is
+// consumed in either case.
+func (S *Scanner) expect(ch int) {
+	found := S.ch;
+	if found != ch {
+		S.error(S.chpos, "expected " + charString(ch) + ", found " + charString(found));
+	}
+	// consume the char even on a mismatch so that scanning always advances
+	S.next();
+}
+
+
+// skipWhitespace advances past tabs, carriage returns and blanks.
+// Newlines are skipped as well, unless comments are being scanned:
+// then a newline stops the skipping so that Scan() can report it.
+func (S *Scanner) skipWhitespace() {
+	for S.ch == '\t' || S.ch == '\r' || S.ch == ' ' || S.ch == '\n' && !S.scan_comments {
+		S.next();
+	}
+}
+
+
+// scanComment scans a comment and returns its source text, starting
+// at the first '/' (which the caller has already consumed). S.ch is
+// the second char of the comment opener. If the end of the source is
+// reached before the comment ends, "comment not terminated" is
+// reported and the text scanned so far is returned.
+func (S *Scanner) scanComment() []byte {
+	// first '/' already consumed
+	start := S.chpos - 1;
+
+	if S.ch != '/' {
+		/*-style comment */
+		S.expect('*');
+		for S.ch >= 0 {
+			prev := S.ch;
+			S.next();
+			if prev == '*' && S.ch == '/' {
+				S.next(); // consume the closing '/'
+				return S.src[start : S.chpos];
+			}
+		}
+	} else {
+		//-style comment
+		for S.ch >= 0 {
+			S.next();
+			if S.ch == '\n' {
+				// The '\n' ends the comment but is left unconsumed and
+				// is not part of the returned text, so that it is still
+				// seen by skipWhitespace().
+				return S.src[start : S.chpos];
+			}
+		}
+	}
+
+	// ran into end-of-file
+	S.error(start, "comment not terminated");
+	return S.src[start : S.chpos];
+}
+
+
+// scanIdentifier scans a run of letters and decimal digits starting at
+// the look-ahead char. It returns the text of the run and the token
+// that token.Lookup() yields for it.
+func (S *Scanner) scanIdentifier() (tok int, lit []byte) {
+	start := S.chpos;
+	for {
+		if !isLetter(S.ch) && digitVal(S.ch) >= 10 {
+			break;
+		}
+		S.next();
+	}
+	lit = S.src[start : S.chpos];
+	tok = token.Lookup(lit);
+	return tok, lit;
+}
+
+
+// scanMantissa consumes a (possibly empty) run of digits that are
+// valid in the given base.
+func (S *Scanner) scanMantissa(base int) {
+	for {
+		if digitVal(S.ch) >= base {
+			return;
+		}
+		S.next();
+	}
+}
+
+
+// scanNumber scans an integer or floating-point literal and returns
+// its token (token.INT or token.FLOAT) together with the literal's
+// source text. If seen_decimal_point is set, the caller has already
+// consumed a leading '.' and scanning continues with the fraction;
+// the '.' is included in the returned text.
+func (S *Scanner) scanNumber(seen_decimal_point bool) (tok int, lit []byte) {
+ pos := S.chpos;
+ tok = token.INT;
+
+ if seen_decimal_point {
+ tok = token.FLOAT;
+ pos--; // '.' is one byte
+ S.scanMantissa(10);
+ goto exponent;
+ }
+
+ if S.ch == '0' {
+ // int or float
+ S.next();
+ if S.ch == 'x' || S.ch == 'X' {
+ // hexadecimal int
+ S.next();
+ S.scanMantissa(16);
+ } else {
+ // octal int or float
+ S.scanMantissa(8);
+ // Anything that cannot continue an octal int turns the literal
+ // into a float. NOTE(review): a digit 8 or 9 at this point is
+ // enough, so a literal such as 09 is reported as token.FLOAT
+ // even without a '.' or an exponent.
+ if digitVal(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' {
+ // float
+ tok = token.FLOAT;
+ goto mantissa;
+ }
+ // octal int
+ }
+ goto exit;
+ }
+
+mantissa:
+ // decimal int or float
+ S.scanMantissa(10);
+
+ if S.ch == '.' {
+ // float
+ tok = token.FLOAT;
+ S.next();
+ S.scanMantissa(10)
+ }
+
+exponent:
+ if S.ch == 'e' || S.ch == 'E' {
+ // float
+ tok = token.FLOAT;
+ S.next();
+ if S.ch == '-' || S.ch == '+' {
+ S.next();
+ }
+ // scanMantissa accepts an empty run, so missing exponent digits
+ // are not reported here
+ S.scanMantissa(10);
+ }
+
+exit:
+ return tok, S.src[pos : S.chpos];
+}
+
+
+// scanDigits consumes up to n digits of the given base. It is used for
+// the fixed-width numeric escapes (\ooo, \xhh, \uhhhh, \Uhhhhhhhh), so
+// it must stop after n digits: any further digit chars belong to the
+// literal text that follows the escape. If fewer than n digits are
+// found, "illegal char escape" is reported at the offending char.
+func (S *Scanner) scanDigits(n int, base int) {
+	for n > 0 && digitVal(S.ch) < base {
+		S.next();
+		n--;
+	}
+	if n > 0 {
+		// ran out of digits before the escape was complete
+		S.error(S.chpos, "illegal char escape");
+	}
+}
+
+
+func (S *Scanner) scanEscape(quote int) {
+ ch := S.ch;
+ pos := S.chpos;
+ S.next();
+ switch ch {
+ case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
+ // nothing to do
+ case '0', '1', '2', '3', '4', '5', '6', '7':
+ S.scanDigits(3 - 1, 8); // 1 char read already
+ case 'x':
+ S.scanDigits(2, 16);
+ case 'u':
+ S.scanDigits(4, 16);
+ case 'U':
+ S.scanDigits(8, 16);
+ default:
+ S.error(pos, "illegal char escape");
+ }
+}
+
+
+// scanChar scans a character literal whose opening quote has already
+// been consumed and returns its source text, quotes included. A
+// missing closing quote is reported through expect().
+func (S *Scanner) scanChar() []byte {
+	// '\'' already consumed
+	start := S.chpos - 1;
+
+	escaped := S.ch == '\\';
+	S.next();
+	if escaped {
+		S.scanEscape('\'');
+	}
+	S.expect('\'');
+
+	return S.src[start : S.chpos];
+}
+
+
+// scanString scans an interpreted string literal whose opening '"' has
+// already been consumed and returns the scanned source text, starting
+// at that opening quote. A newline or the end of the source before the
+// closing '"' is reported as "string not terminated".
+func (S *Scanner) scanString() []byte {
+	// '"' already consumed
+	start := S.chpos - 1;
+
+	for S.ch != '"' {
+		c := S.ch;
+		S.next();
+		if c == '\\' {
+			S.scanEscape('"');
+		} else if c == '\n' || c < 0 {
+			S.error(start, "string not terminated");
+			break;
+		}
+	}
+	S.next(); // step over the closing '"' (or one more char after an error)
+
+	return S.src[start : S.chpos];
+}
+
+
+// scanRawString scans a raw string literal whose opening '`' has
+// already been consumed and returns the scanned source text, starting
+// at that opening quote. No escapes are interpreted. A newline or the
+// end of the source before the closing '`' is reported as
+// "string not terminated".
+func (S *Scanner) scanRawString() []byte {
+	// '`' already consumed
+	start := S.chpos - 1;
+
+	for {
+		c := S.ch;
+		if c == '`' {
+			break;
+		}
+		S.next();
+		if c < 0 || c == '\n' {
+			S.error(start, "string not terminated");
+			break;
+		}
+	}
+	S.next(); // step over the closing '`' (or one more char after an error)
+
+	return S.src[start : S.chpos];
+}
+
+
+// Helper functions for scanning multi-char operator tokens such as
+// >> += >>= . The first char of the operator has already been
+// consumed; each routine inspects the chars that follow and picks one
+// of its tok_i arguments. A following '=' selects tok1 (or tok3, after
+// a matched ch2). Otherwise the result is tok0 if no other char
+// matched, or tok2 if the following char was ch2.
+
+// switch2 returns tok1 and consumes the look-ahead char if it is '=';
+// otherwise it returns tok0 and consumes nothing.
+func (S *Scanner) switch2(tok0, tok1 int) int {
+	if S.ch != '=' {
+		return tok0;
+	}
+	S.next();
+	return tok1;
+}
+
+
+// switch3 selects among three tokens: tok1 if the look-ahead char is
+// '=', tok2 if it is ch2, and tok0 otherwise. A matching char is
+// consumed.
+func (S *Scanner) switch3(tok0, tok1, ch2, tok2 int) int {
+	switch S.ch {
+	case '=':
+		S.next();
+		return tok1;
+	case ch2:
+		S.next();
+		return tok2;
+	}
+	return tok0;
+}
+
+
+// switch4 selects among four tokens: tok1 if the look-ahead char is
+// '=', tok0 if it is neither '=' nor ch2, and, after a ch2, tok3 if
+// an '=' follows and tok2 otherwise. All matching chars are consumed.
+func (S *Scanner) switch4(tok0, tok1, ch2, tok2, tok3 int) int {
+	if S.ch == '=' {
+		S.next();
+		return tok1;
+	}
+	if S.ch != ch2 {
+		return tok0;
+	}
+	S.next();
+	// what remains is the two-way choice on a trailing '='
+	return S.switch2(tok2, tok3);
+}
+
+
+// Scan scans the next token. It returns the token's byte position in
+// the source, its token value, and the corresponding literal text if
+// the token is an identifier or basic type literal
+// (token.IsLiteral(tok) == true). Literal text is also returned for
+// token.COMMENT tokens, which are only produced if the scanner was
+// initialized with scan_comments set. At the end of the source,
+// token.EOF is returned. A char that starts no token is reported to
+// the error handler and yields token.ILLEGAL.
+
+func (S *Scanner) Scan() (pos, tok int, lit []byte) {
+scan_again:
+ S.skipWhitespace();
+
+ // tok stays token.ILLEGAL unless one of the cases below sets it
+ pos, tok = S.chpos, token.ILLEGAL;
+
+ switch ch := S.ch; {
+ case isLetter(ch):
+ tok, lit = S.scanIdentifier();
+ case digitVal(ch) < 10:
+ tok, lit = S.scanNumber(false);
+ default:
+ S.next(); // always make progress
+ // from here on ch is the consumed char and S.ch the char after it
+ switch ch {
+ case -1 : tok = token.EOF;
+ case '\n': tok, lit = token.COMMENT, []byte{'\n'};
+ case '"' : tok, lit = token.STRING, S.scanString();
+ case '\'': tok, lit = token.CHAR, S.scanChar();
+ case '`' : tok, lit = token.STRING, S.scanRawString();
+ case ':' : tok = S.switch2(token.COLON, token.DEFINE);
+ case '.' :
+ if digitVal(S.ch) < 10 {
+ tok, lit = S.scanNumber(true);
+ } else if S.ch == '.' {
+ // NOTE(review): if no third '.' follows, both dots stay
+ // consumed, tok remains token.ILLEGAL and no error is reported.
+ S.next();
+ if S.ch == '.' {
+ S.next();
+ tok = token.ELLIPSIS;
+ }
+ } else {
+ tok = token.PERIOD;
+ }
+ case ',': tok = token.COMMA;
+ case ';': tok = token.SEMICOLON;
+ case '(': tok = token.LPAREN;
+ case ')': tok = token.RPAREN;
+ case '[': tok = token.LBRACK;
+ case ']': tok = token.RBRACK;
+ case '{': tok = token.LBRACE;
+ case '}': tok = token.RBRACE;
+ case '+': tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC);
+ case '-': tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC);
+ case '*': tok = S.switch2(token.MUL, token.MUL_ASSIGN);
+ case '/':
+ if S.ch == '/' || S.ch == '*' {
+ tok, lit = token.COMMENT, S.scanComment();
+ if !S.scan_comments {
+ // comments are not reported: drop this one and scan on
+ goto scan_again;
+ }
+ } else {
+ tok = S.switch2(token.QUO, token.QUO_ASSIGN);
+ }
+ case '%': tok = S.switch2(token.REM, token.REM_ASSIGN);
+ case '^': tok = S.switch2(token.XOR, token.XOR_ASSIGN);
+ case '<':
+ if S.ch == '-' {
+ S.next();
+ tok = token.ARROW;
+ } else {
+ tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN);
+ }
+ case '>': tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN);
+ case '=': tok = S.switch2(token.ASSIGN, token.EQL);
+ case '!': tok = S.switch2(token.NOT, token.NEQ);
+ case '&': tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND);
+ case '|': tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR);
+ default: S.error(pos, "illegal character " + charString(ch));
+ }
+ }
+
+ return pos, tok, lit;
+}