diff options
Diffstat (limited to 'src/lib/go/scanner.go')
-rw-r--r-- | src/lib/go/scanner.go | 475 |
1 files changed, 475 insertions, 0 deletions
diff --git a/src/lib/go/scanner.go b/src/lib/go/scanner.go new file mode 100644 index 000000000..ad7f80b5b --- /dev/null +++ b/src/lib/go/scanner.go @@ -0,0 +1,475 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner + +// A Go scanner. Takes a []byte as source which can then be +// tokenized through repeated calls to the Scan() function. +// +// Sample use: +// +// import "token" +// import "scanner" +// +// func tokenize(src []byte) { +// var s scanner.Scanner; +// s.Init(src, nil /* no error handler */, false /* ignore comments */); +// for { +// pos, tok, lit := s.Scan(); +// if tok == Scanner.EOF { +// return; +// } +// println(pos, token.TokenString(tok), string(lit)); +// } +// } + +import ( + "utf8"; + "unicode"; + "strconv"; + "token"; +) + + +// An implementation of an ErrorHandler must be provided to the Scanner. +// If a syntax error is encountered, Error() is called with the exact +// token position (the byte position of the token in the source) and the +// error message. + +type ErrorHandler interface { + Error(pos int, msg string); +} + + +type Scanner struct { + // immutable state + src []byte; // source + err ErrorHandler; // error reporting + scan_comments bool; // if set, comments are reported as tokens + + // scanning state + pos int; // current reading position + ch int; // one char look-ahead + chpos int; // position of ch +} + + +func isLetter(ch int) bool { + return + 'a' <= ch && ch <= 'z' || + 'A' <= ch && ch <= 'Z' || + ch == '_' || + ch >= 0x80 && unicode.IsLetter(ch); +} + + +func digitVal(ch int) int { + switch { + case '0' <= ch && ch <= '9': return ch - '0'; + case 'a' <= ch && ch <= 'f': return ch - 'a' + 10; + case 'A' <= ch && ch <= 'F': return ch - 'A' + 10; + } + return 16; // larger than any legal digit val +} + + +// Read the next Unicode char into S.ch. +// S.ch < 0 means end-of-file. +func (S *Scanner) next() { + if S.pos < len(S.src) { + // assume ASCII + r, w := int(S.src[S.pos]), 1; + if r >= 0x80 { + // not ASCII + r, w = utf8.DecodeRune(S.src[S.pos : len(S.src)]); + } + S.ch = r; + S.chpos = S.pos; + S.pos += w; + } else { + S.ch = -1; // eof + S.chpos = len(S.src); + } +} + + +// Initialize the scanner. +// +// The error handler (err) is called when an illegal token is encountered. +// If scan_comments is set to true, newline characters ('\n') and comments +// are recognized as token.COMMENT, otherwise they are treated as white +// space and ignored. + +func (S *Scanner) Init(src []byte, err ErrorHandler, scan_comments bool) { + S.src = src; + S.err = err; + S.scan_comments = scan_comments; + S.next(); +} + + +func charString(ch int) string { + s := string(ch); + switch ch { + case '\a': s = `\a`; + case '\b': s = `\b`; + case '\f': s = `\f`; + case '\n': s = `\n`; + case '\r': s = `\r`; + case '\t': s = `\t`; + case '\v': s = `\v`; + case '\\': s = `\\`; + case '\'': s = `\'`; + } + return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"; +} + + +func (S *Scanner) error(pos int, msg string) { + S.err.Error(pos, msg); +} + + +func (S *Scanner) expect(ch int) { + if S.ch != ch { + S.error(S.chpos, "expected " + charString(ch) + ", found " + charString(S.ch)); + } + S.next(); // always make progress +} + + +func (S *Scanner) skipWhitespace() { + for { + switch S.ch { + case '\t', '\r', ' ': + // nothing to do + case '\n': + if S.scan_comments { + return; + } + default: + return; + } + S.next(); + } + panic("UNREACHABLE"); +} + + +func (S *Scanner) scanComment() []byte { + // first '/' already consumed + pos := S.chpos - 1; + + if S.ch == '/' { + //-style comment + for S.ch >= 0 { + S.next(); + if S.ch == '\n' { + // '\n' terminates comment but we do not include + // it in the comment (otherwise we don't see the + // start of a newline in skipWhitespace()). + return S.src[pos : S.chpos]; + } + } + + } else { + /*-style comment */ + S.expect('*'); + for S.ch >= 0 { + ch := S.ch; + S.next(); + if ch == '*' && S.ch == '/' { + S.next(); + return S.src[pos : S.chpos]; + } + } + } + + S.error(pos, "comment not terminated"); + return S.src[pos : S.chpos]; +} + + +func (S *Scanner) scanIdentifier() (tok int, lit []byte) { + pos := S.chpos; + for isLetter(S.ch) || digitVal(S.ch) < 10 { + S.next(); + } + lit = S.src[pos : S.chpos]; + return token.Lookup(lit), lit; +} + + +func (S *Scanner) scanMantissa(base int) { + for digitVal(S.ch) < base { + S.next(); + } +} + + +func (S *Scanner) scanNumber(seen_decimal_point bool) (tok int, lit []byte) { + pos := S.chpos; + tok = token.INT; + + if seen_decimal_point { + tok = token.FLOAT; + pos--; // '.' is one byte + S.scanMantissa(10); + goto exponent; + } + + if S.ch == '0' { + // int or float + S.next(); + if S.ch == 'x' || S.ch == 'X' { + // hexadecimal int + S.next(); + S.scanMantissa(16); + } else { + // octal int or float + S.scanMantissa(8); + if digitVal(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' { + // float + tok = token.FLOAT; + goto mantissa; + } + // octal int + } + goto exit; + } + +mantissa: + // decimal int or float + S.scanMantissa(10); + + if S.ch == '.' { + // float + tok = token.FLOAT; + S.next(); + S.scanMantissa(10) + } + +exponent: + if S.ch == 'e' || S.ch == 'E' { + // float + tok = token.FLOAT; + S.next(); + if S.ch == '-' || S.ch == '+' { + S.next(); + } + S.scanMantissa(10); + } + +exit: + return tok, S.src[pos : S.chpos]; +} + + +func (S *Scanner) scanDigits(n int, base int) { + for digitVal(S.ch) < base { + S.next(); + n--; + } + if n > 0 { + S.error(S.chpos, "illegal char escape"); + } +} + + +func (S *Scanner) scanEscape(quote int) { + ch := S.ch; + pos := S.chpos; + S.next(); + switch ch { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: + // nothing to do + case '0', '1', '2', '3', '4', '5', '6', '7': + S.scanDigits(3 - 1, 8); // 1 char read already + case 'x': + S.scanDigits(2, 16); + case 'u': + S.scanDigits(4, 16); + case 'U': + S.scanDigits(8, 16); + default: + S.error(pos, "illegal char escape"); + } +} + + +func (S *Scanner) scanChar() []byte { + // '\'' already consumed + + pos := S.chpos - 1; + ch := S.ch; + S.next(); + if ch == '\\' { + S.scanEscape('\''); + } + + S.expect('\''); + return S.src[pos : S.chpos]; +} + + +func (S *Scanner) scanString() []byte { + // '"' already consumed + + pos := S.chpos - 1; + for S.ch != '"' { + ch := S.ch; + S.next(); + if ch == '\n' || ch < 0 { + S.error(pos, "string not terminated"); + break; + } + if ch == '\\' { + S.scanEscape('"'); + } + } + + S.next(); + return S.src[pos : S.chpos]; +} + + +func (S *Scanner) scanRawString() []byte { + // '`' already consumed + + pos := S.chpos - 1; + for S.ch != '`' { + ch := S.ch; + S.next(); + if ch == '\n' || ch < 0 { + S.error(pos, "string not terminated"); + break; + } + } + + S.next(); + return S.src[pos : S.chpos]; +} + + +// Helper functions for scanning multi-byte tokens such as >> += >>= . +// Different routines recognize different length tok_i based on matches +// of ch_i. If a token ends in '=', the result is tok1 or tok3 +// respectively. Otherwise, the result is tok0 if there was no other +// matching character, or tok2 if the matching character was ch2. + +func (S *Scanner) switch2(tok0, tok1 int) int { + if S.ch == '=' { + S.next(); + return tok1; + } + return tok0; +} + + +func (S *Scanner) switch3(tok0, tok1, ch2, tok2 int) int { + if S.ch == '=' { + S.next(); + return tok1; + } + if S.ch == ch2 { + S.next(); + return tok2; + } + return tok0; +} + + +func (S *Scanner) switch4(tok0, tok1, ch2, tok2, tok3 int) int { + if S.ch == '=' { + S.next(); + return tok1; + } + if S.ch == ch2 { + S.next(); + if S.ch == '=' { + S.next(); + return tok3; + } + return tok2; + } + return tok0; +} + + +// Scans the next token. Returns the token byte position in the source, +// its token value, and the corresponding literal text if the token is +// an identifier or basic type literal (token.IsLiteral(tok) == true). + +func (S *Scanner) Scan() (pos, tok int, lit []byte) { +scan_again: + S.skipWhitespace(); + + pos, tok = S.chpos, token.ILLEGAL; + + switch ch := S.ch; { + case isLetter(ch): + tok, lit = S.scanIdentifier(); + case digitVal(ch) < 10: + tok, lit = S.scanNumber(false); + default: + S.next(); // always make progress + switch ch { + case -1 : tok = token.EOF; + case '\n': tok, lit = token.COMMENT, []byte{'\n'}; + case '"' : tok, lit = token.STRING, S.scanString(); + case '\'': tok, lit = token.CHAR, S.scanChar(); + case '`' : tok, lit = token.STRING, S.scanRawString(); + case ':' : tok = S.switch2(token.COLON, token.DEFINE); + case '.' : + if digitVal(S.ch) < 10 { + tok, lit = S.scanNumber(true); + } else if S.ch == '.' { + S.next(); + if S.ch == '.' { + S.next(); + tok = token.ELLIPSIS; + } + } else { + tok = token.PERIOD; + } + case ',': tok = token.COMMA; + case ';': tok = token.SEMICOLON; + case '(': tok = token.LPAREN; + case ')': tok = token.RPAREN; + case '[': tok = token.LBRACK; + case ']': tok = token.RBRACK; + case '{': tok = token.LBRACE; + case '}': tok = token.RBRACE; + case '+': tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC); + case '-': tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC); + case '*': tok = S.switch2(token.MUL, token.MUL_ASSIGN); + case '/': + if S.ch == '/' || S.ch == '*' { + tok, lit = token.COMMENT, S.scanComment(); + if !S.scan_comments { + goto scan_again; + } + } else { + tok = S.switch2(token.QUO, token.QUO_ASSIGN); + } + case '%': tok = S.switch2(token.REM, token.REM_ASSIGN); + case '^': tok = S.switch2(token.XOR, token.XOR_ASSIGN); + case '<': + if S.ch == '-' { + S.next(); + tok = token.ARROW; + } else { + tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN); + } + case '>': tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN); + case '=': tok = S.switch2(token.ASSIGN, token.EQL); + case '!': tok = S.switch2(token.NOT, token.NEQ); + case '&': tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND); + case '|': tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR); + default: S.error(pos, "illegal character " + charString(ch)); + } + } + + return pos, tok, lit; +} |