Diffstat (limited to 'src/pkg/go/scanner')

 src/pkg/go/scanner/Makefile        |   2
 src/pkg/go/scanner/scanner.go      | 336
 src/pkg/go/scanner/scanner_test.go | 496

 3 files changed, 455 insertions, 379 deletions
diff --git a/src/pkg/go/scanner/Makefile b/src/pkg/go/scanner/Makefile index 70d21a972..453faac00 100644 --- a/src/pkg/go/scanner/Makefile +++ b/src/pkg/go/scanner/Makefile @@ -2,7 +2,7 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -include ../../../Make.$(GOARCH) +include ../../../Make.inc TARG=go/scanner GOFILES=\ diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go index e5ac9d772..6ce846cd8 100644 --- a/src/pkg/go/scanner/scanner.go +++ b/src/pkg/go/scanner/scanner.go @@ -4,13 +4,25 @@ // A scanner for Go source text. Takes a []byte as source which can // then be tokenized through repeated calls to the Scan function. -// For a sample use of a scanner, see the implementation of Tokenize. +// Typical use: +// +// var s Scanner +// fset := token.NewFileSet() // position information is relative to fset +// s.Init(fset, filename, src, nil /* no error handler */, 0) +// for { +// pos, tok, lit := s.Scan() +// if tok == token.EOF { +// break +// } +// // do something here with pos, tok, and lit +// } // package scanner import ( "bytes" "go/token" + "path" "strconv" "unicode" "utf8" @@ -19,20 +31,22 @@ import ( // A Scanner holds the scanner's internal state while processing // a given text. It can be allocated as part of another data -// structure but must be initialized via Init before use. For -// a sample use, see the implementation of Tokenize. +// structure but must be initialized via Init before use. // type Scanner struct { // immutable state + file *token.File // source file handle + dir string // directory portion of file.Name() src []byte // source err ErrorHandler // error reporting; or nil mode uint // scanning mode // scanning state - pos token.Position // previous reading position (position before ch) - offset int // current reading offset (position after ch) - ch int // one char look-ahead - insertSemi bool // insert a semicolon before next newline + ch int // current character + offset int // character offset + rdOffset int // reading offset (position after current character) + lineOffset int // current line offset + insertSemi bool // insert a semicolon before next newline // public state - ok to modify ErrorCount int // number of errors encountered @@ -43,29 +57,31 @@ type Scanner struct { // S.ch < 0 means end-of-file. // func (S *Scanner) next() { - if S.offset < len(S.src) { - S.pos.Offset = S.offset - S.pos.Column++ + if S.rdOffset < len(S.src) { + S.offset = S.rdOffset if S.ch == '\n' { - // next character starts a new line - S.pos.Line++ - S.pos.Column = 1 + S.lineOffset = S.offset + S.file.AddLine(S.offset) } - r, w := int(S.src[S.offset]), 1 + r, w := int(S.src[S.rdOffset]), 1 switch { case r == 0: - S.error(S.pos, "illegal character NUL") + S.error(S.offset, "illegal character NUL") case r >= 0x80: // not ASCII - r, w = utf8.DecodeRune(S.src[S.offset:]) + r, w = utf8.DecodeRune(S.src[S.rdOffset:]) if r == utf8.RuneError && w == 1 { - S.error(S.pos, "illegal UTF-8 encoding") + S.error(S.offset, "illegal UTF-8 encoding") } } - S.offset += w + S.rdOffset += w S.ch = r } else { - S.pos.Offset = len(S.src) + S.offset = len(S.src) + if S.ch == '\n' { + S.lineOffset = S.offset + S.file.AddLine(S.offset) + } S.ch = -1 // eof } } @@ -80,24 +96,38 @@ const ( InsertSemis // automatically insert semicolons ) +// TODO(gri) Would it be better to simply provide *token.File to Init +// instead of fset, and filename, and then return the file? 
+// It could cause an error/panic if the provided file.Size() +// doesn't match len(src). -// Init prepares the scanner S to tokenize the text src. Calls to Scan -// will use the error handler err if they encounter a syntax error and -// err is not nil. Also, for each error encountered, the Scanner field -// ErrorCount is incremented by one. The filename parameter is used as -// filename in the token.Position returned by Scan for each token. The -// mode parameter determines how comments and illegal characters are -// handled. +// Init prepares the scanner S to tokenize the text src. It sets the +// scanner at the beginning of the source text, adds a new file with +// the given filename to the file set fset, and returns that file. +// +// Calls to Scan will use the error handler err if they encounter a +// syntax error and err is not nil. Also, for each error encountered, +// the Scanner field ErrorCount is incremented by one. The mode parameter +// determines how comments, illegal characters, and semicolons are handled. // -func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) { +func (S *Scanner) Init(fset *token.FileSet, filename string, src []byte, err ErrorHandler, mode uint) *token.File { // Explicitly initialize all fields since a scanner may be reused. + S.file = fset.AddFile(filename, fset.Base(), len(src)) + S.dir, _ = path.Split(filename) S.src = src S.err = err S.mode = mode - S.pos = token.Position{filename, 0, 1, 0} + + S.ch = ' ' S.offset = 0 + S.rdOffset = 0 + S.lineOffset = 0 + S.insertSemi = false S.ErrorCount = 0 + S.next() + + return S.file } @@ -131,111 +161,109 @@ func charString(ch int) string { } -func (S *Scanner) error(pos token.Position, msg string) { +func (S *Scanner) error(offs int, msg string) { if S.err != nil { - S.err.Error(pos, msg) + S.err.Error(S.file.Position(S.file.Pos(offs)), msg) } S.ErrorCount++ } -func (S *Scanner) expect(ch int) { - if S.ch != ch { - S.error(S.pos, "expected "+charString(ch)+", found "+charString(S.ch)) +var prefix = []byte("//line ") + +func (S *Scanner) interpretLineComment(text []byte) { + if bytes.HasPrefix(text, prefix) { + // get filename and line number, if any + if i := bytes.Index(text, []byte{':'}); i > 0 { + if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { + // valid //line filename:line comment; + filename := path.Clean(string(text[len(prefix):i])) + if filename[0] != '/' { + // make filename relative to current directory + filename = path.Join(S.dir, filename) + } + // update scanner position + S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line + } + } } - S.next() // always make progress } -var prefix = []byte("line ") - -func (S *Scanner) scanComment(pos token.Position) { - // first '/' already consumed +func (S *Scanner) scanComment() { + // initial '/' already consumed; S.ch == '/' || S.ch == '*' + offs := S.offset - 1 // position of initial '/' if S.ch == '/' { //-style comment - for S.ch >= 0 { + S.next() + for S.ch != '\n' && S.ch >= 0 { S.next() - if S.ch == '\n' { - // '\n' is not part of the comment for purposes of scanning - // (the comment ends on the same line where it started) - if pos.Column == 1 { - text := S.src[pos.Offset+2 : S.pos.Offset] - if bytes.HasPrefix(text, prefix) { - // comment starts at beginning of line with "//line "; - // get filename and line number, if any - i := bytes.Index(text, []byte{':'}) - if i >= 0 { - if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { - // 
valid //line filename:line comment; - // update scanner position - S.pos.Filename = string(text[len(prefix):i]) - S.pos.Line = line - 1 // -1 since the '\n' has not been consumed yet - } - } - } - } - return - } } + if offs == S.lineOffset { + // comment starts at the beginning of the current line + S.interpretLineComment(S.src[offs:S.offset]) + } + return + } - } else { - /*-style comment */ - S.expect('*') - for S.ch >= 0 { - ch := S.ch + /*-style comment */ + S.next() + for S.ch >= 0 { + ch := S.ch + S.next() + if ch == '*' && S.ch == '/' { S.next() - if ch == '*' && S.ch == '/' { - S.next() - return - } + return } } - S.error(pos, "comment not terminated") + S.error(offs, "comment not terminated") } -func (S *Scanner) findNewline(pos token.Position) bool { - // first '/' already consumed; assume S.ch == '/' || S.ch == '*' +func (S *Scanner) findLineEnd() bool { + // initial '/' already consumed + + defer func(offs int) { + // reset scanner state to where it was upon calling findLineEnd + S.ch = '/' + S.offset = offs + S.rdOffset = offs + 1 + S.next() // consume initial '/' again + }(S.offset - 1) - // read ahead until a newline or non-comment token is found - newline := false - for pos1 := pos; S.ch >= 0; { + // read ahead until a newline, EOF, or non-comment token is found + for S.ch == '/' || S.ch == '*' { if S.ch == '/' { //-style comment always contains a newline - newline = true - break + return true } - S.scanComment(pos1) - if pos1.Line < S.pos.Line { - /*-style comment contained a newline */ - newline = true - break + /*-style comment: look for newline */ + S.next() + for S.ch >= 0 { + ch := S.ch + if ch == '\n' { + return true + } + S.next() + if ch == '*' && S.ch == '/' { + S.next() + break + } } S.skipWhitespace() // S.insertSemi is set - if S.ch == '\n' { - newline = true - break + if S.ch < 0 || S.ch == '\n' { + return true } if S.ch != '/' { // non-comment token - break - } - pos1 = S.pos - S.next() - if S.ch != '/' && S.ch != '*' { - // non-comment token - break + return false } + S.next() // consume '/' } - // reset position to where it was upon calling findNewline - S.pos = pos - S.offset = pos.Offset + 1 - S.next() - - return newline + return false } @@ -250,11 +278,11 @@ func isDigit(ch int) bool { func (S *Scanner) scanIdentifier() token.Token { - pos := S.pos.Offset + offs := S.offset for isLetter(S.ch) || isDigit(S.ch) { S.next() } - return token.Lookup(S.src[pos:S.pos.Offset]) + return token.Lookup(S.src[offs:S.offset]) } @@ -278,7 +306,7 @@ func (S *Scanner) scanMantissa(base int) { } -func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.Token { +func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token { // digitVal(S.ch) < 10 tok := token.INT @@ -290,6 +318,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To if S.ch == '0' { // int or float + offs := S.offset S.next() if S.ch == 'x' || S.ch == 'X' { // hexadecimal int @@ -309,7 +338,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To } // octal int if seenDecimalDigit { - S.error(pos, "illegal octal number") + S.error(offs, "illegal octal number") } } goto exit @@ -346,7 +375,7 @@ exit: func (S *Scanner) scanEscape(quote int) { - pos := S.pos + offs := S.offset var i, base, max uint32 switch S.ch { @@ -366,28 +395,33 @@ func (S *Scanner) scanEscape(quote int) { i, base, max = 8, 16, unicode.MaxRune default: S.next() // always make progress - S.error(pos, "unknown escape sequence") + S.error(offs, "unknown escape 
sequence") return } var x uint32 - for ; i > 0; i-- { + for ; i > 0 && S.ch != quote && S.ch >= 0; i-- { d := uint32(digitVal(S.ch)) - if d > base { - S.error(S.pos, "illegal character in escape sequence") - return + if d >= base { + S.error(S.offset, "illegal character in escape sequence") + break } x = x*base + d S.next() } + // in case of an error, consume remaining chars + for ; i > 0 && S.ch != quote && S.ch >= 0; i-- { + S.next() + } if x > max || 0xd800 <= x && x < 0xe000 { - S.error(pos, "escape sequence is invalid Unicode code point") + S.error(offs, "escape sequence is invalid Unicode code point") } } -func (S *Scanner) scanChar(pos token.Position) { - // '\'' already consumed +func (S *Scanner) scanChar() { + // '\'' opening already consumed + offs := S.offset - 1 n := 0 for S.ch != '\'' { @@ -395,7 +429,7 @@ func (S *Scanner) scanChar(pos token.Position) { n++ S.next() if ch == '\n' || ch < 0 { - S.error(pos, "character literal not terminated") + S.error(offs, "character literal not terminated") n = 1 break } @@ -407,19 +441,20 @@ func (S *Scanner) scanChar(pos token.Position) { S.next() if n != 1 { - S.error(pos, "illegal character literal") + S.error(offs, "illegal character literal") } } -func (S *Scanner) scanString(pos token.Position) { - // '"' already consumed +func (S *Scanner) scanString() { + // '"' opening already consumed + offs := S.offset - 1 for S.ch != '"' { ch := S.ch S.next() if ch == '\n' || ch < 0 { - S.error(pos, "string not terminated") + S.error(offs, "string not terminated") break } if ch == '\\' { @@ -431,14 +466,15 @@ func (S *Scanner) scanString(pos token.Position) { } -func (S *Scanner) scanRawString(pos token.Position) { - // '`' already consumed +func (S *Scanner) scanRawString() { + // '`' opening already consumed + offs := S.offset - 1 for S.ch != '`' { ch := S.ch S.next() if ch < 0 { - S.error(pos, "string not terminated") + S.error(offs, "string not terminated") break } } @@ -499,12 +535,17 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke } -var semicolon = []byte{';'} +var newline = []byte{'\n'} // Scan scans the next token and returns the token position pos, // the token tok, and the literal text lit corresponding to the // token. The source end is indicated by token.EOF. // +// If the returned token is token.SEMICOLON, the corresponding +// literal value is ";" if the semicolon was present in the source, +// and "\n" if the semicolon was inserted because of a newline or +// at EOF. +// // For more tolerant parsing, Scan will return a valid token if // possible even if a syntax error was encountered. Thus, even // if the resulting token sequence contains no illegal tokens, @@ -512,13 +553,18 @@ var semicolon = []byte{';'} // must check the scanner's ErrorCount or the number of calls // of the error handler, if there was one installed. // -func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) { +// Scan adds line information to the file added to the file +// set with Init. Token positions are relative to that file +// and thus relative to the file set. 
+// +func (S *Scanner) Scan() (token.Pos, token.Token, []byte) { scanAgain: S.skipWhitespace() // current token start insertSemi := false - pos, tok = S.pos, token.ILLEGAL + offs := S.offset + tok := token.ILLEGAL // determine token value switch ch := S.ch; { @@ -530,36 +576,40 @@ scanAgain: } case digitVal(ch) < 10: insertSemi = true - tok = S.scanNumber(pos, false) + tok = S.scanNumber(false) default: S.next() // always make progress switch ch { case -1: + if S.insertSemi { + S.insertSemi = false // EOF consumed + return S.file.Pos(offs), token.SEMICOLON, newline + } tok = token.EOF case '\n': // we only reach here if S.insertSemi was // set in the first place and exited early // from S.skipWhitespace() S.insertSemi = false // newline consumed - return pos, token.SEMICOLON, semicolon + return S.file.Pos(offs), token.SEMICOLON, newline case '"': insertSemi = true tok = token.STRING - S.scanString(pos) + S.scanString() case '\'': insertSemi = true tok = token.CHAR - S.scanChar(pos) + S.scanChar() case '`': insertSemi = true tok = token.STRING - S.scanRawString(pos) + S.scanRawString() case ':': tok = S.switch2(token.COLON, token.DEFINE) case '.': if digitVal(S.ch) < 10 { insertSemi = true - tok = S.scanNumber(pos, true) + tok = S.scanNumber(true) } else if S.ch == '.' { S.next() if S.ch == '.' { @@ -603,15 +653,15 @@ scanAgain: case '/': if S.ch == '/' || S.ch == '*' { // comment - if S.insertSemi && S.findNewline(pos) { + if S.insertSemi && S.findLineEnd() { // reset position to the beginning of the comment - S.pos = pos - S.offset = pos.Offset + 1 S.ch = '/' + S.offset = offs + S.rdOffset = offs + 1 S.insertSemi = false // newline consumed - return pos, token.SEMICOLON, semicolon + return S.file.Pos(offs), token.SEMICOLON, newline } - S.scanComment(pos) + S.scanComment() if S.mode&ScanComments == 0 { // skip comment S.insertSemi = false // newline consumed @@ -649,7 +699,7 @@ scanAgain: tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) default: if S.mode&AllowIllegalChars == 0 { - S.error(pos, "illegal character "+charString(ch)) + S.error(offs, "illegal character "+charString(ch)) } insertSemi = S.insertSemi // preserve insertSemi info } @@ -658,21 +708,5 @@ scanAgain: if S.mode&InsertSemis != 0 { S.insertSemi = insertSemi } - return pos, tok, S.src[pos.Offset:S.pos.Offset] -} - - -// Tokenize calls a function f with the token position, token value, and token -// text for each token in the source src. The other parameters have the same -// meaning as for the Init function. Tokenize keeps scanning until f returns -// false (usually when the token value is token.EOF). The result is the number -// of errors encountered. 
-// -func Tokenize(filename string, src []byte, err ErrorHandler, mode uint, f func(pos token.Position, tok token.Token, lit []byte) bool) int { - var s Scanner - s.Init(filename, src, err, mode) - for f(s.Scan()) { - // action happens in f - } - return s.ErrorCount + return S.file.Pos(offs), tok, S.src[offs:S.offset] } diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go index 002a81dd9..b1004f89d 100644 --- a/src/pkg/go/scanner/scanner_test.go +++ b/src/pkg/go/scanner/scanner_test.go @@ -11,6 +11,9 @@ import ( ) +var fset = token.NewFileSet() + + const /* class */ ( special = iota literal @@ -41,136 +44,136 @@ type elt struct { var tokens = [...]elt{ // Special tokens - elt{token.COMMENT, "/* a comment */", special}, - elt{token.COMMENT, "// a comment \n", special}, + {token.COMMENT, "/* a comment */", special}, + {token.COMMENT, "// a comment \n", special}, // Identifiers and basic type literals - elt{token.IDENT, "foobar", literal}, - elt{token.IDENT, "a۰۱۸", literal}, - elt{token.IDENT, "foo६४", literal}, - elt{token.IDENT, "bar9876", literal}, - elt{token.INT, "0", literal}, - elt{token.INT, "1", literal}, - elt{token.INT, "123456789012345678890", literal}, - elt{token.INT, "01234567", literal}, - elt{token.INT, "0xcafebabe", literal}, - elt{token.FLOAT, "0.", literal}, - elt{token.FLOAT, ".0", literal}, - elt{token.FLOAT, "3.14159265", literal}, - elt{token.FLOAT, "1e0", literal}, - elt{token.FLOAT, "1e+100", literal}, - elt{token.FLOAT, "1e-100", literal}, - elt{token.FLOAT, "2.71828e-1000", literal}, - elt{token.IMAG, "0i", literal}, - elt{token.IMAG, "1i", literal}, - elt{token.IMAG, "012345678901234567889i", literal}, - elt{token.IMAG, "123456789012345678890i", literal}, - elt{token.IMAG, "0.i", literal}, - elt{token.IMAG, ".0i", literal}, - elt{token.IMAG, "3.14159265i", literal}, - elt{token.IMAG, "1e0i", literal}, - elt{token.IMAG, "1e+100i", literal}, - elt{token.IMAG, "1e-100i", literal}, - elt{token.IMAG, "2.71828e-1000i", literal}, - elt{token.CHAR, "'a'", literal}, - elt{token.CHAR, "'\\000'", literal}, - elt{token.CHAR, "'\\xFF'", literal}, - elt{token.CHAR, "'\\uff16'", literal}, - elt{token.CHAR, "'\\U0000ff16'", literal}, - elt{token.STRING, "`foobar`", literal}, - elt{token.STRING, "`" + `foo + {token.IDENT, "foobar", literal}, + {token.IDENT, "a۰۱۸", literal}, + {token.IDENT, "foo६४", literal}, + {token.IDENT, "bar9876", literal}, + {token.INT, "0", literal}, + {token.INT, "1", literal}, + {token.INT, "123456789012345678890", literal}, + {token.INT, "01234567", literal}, + {token.INT, "0xcafebabe", literal}, + {token.FLOAT, "0.", literal}, + {token.FLOAT, ".0", literal}, + {token.FLOAT, "3.14159265", literal}, + {token.FLOAT, "1e0", literal}, + {token.FLOAT, "1e+100", literal}, + {token.FLOAT, "1e-100", literal}, + {token.FLOAT, "2.71828e-1000", literal}, + {token.IMAG, "0i", literal}, + {token.IMAG, "1i", literal}, + {token.IMAG, "012345678901234567889i", literal}, + {token.IMAG, "123456789012345678890i", literal}, + {token.IMAG, "0.i", literal}, + {token.IMAG, ".0i", literal}, + {token.IMAG, "3.14159265i", literal}, + {token.IMAG, "1e0i", literal}, + {token.IMAG, "1e+100i", literal}, + {token.IMAG, "1e-100i", literal}, + {token.IMAG, "2.71828e-1000i", literal}, + {token.CHAR, "'a'", literal}, + {token.CHAR, "'\\000'", literal}, + {token.CHAR, "'\\xFF'", literal}, + {token.CHAR, "'\\uff16'", literal}, + {token.CHAR, "'\\U0000ff16'", literal}, + {token.STRING, "`foobar`", literal}, + {token.STRING, "`" + `foo bar` + "`", literal, }, 
// Operators and delimitors - elt{token.ADD, "+", operator}, - elt{token.SUB, "-", operator}, - elt{token.MUL, "*", operator}, - elt{token.QUO, "/", operator}, - elt{token.REM, "%", operator}, - - elt{token.AND, "&", operator}, - elt{token.OR, "|", operator}, - elt{token.XOR, "^", operator}, - elt{token.SHL, "<<", operator}, - elt{token.SHR, ">>", operator}, - elt{token.AND_NOT, "&^", operator}, - - elt{token.ADD_ASSIGN, "+=", operator}, - elt{token.SUB_ASSIGN, "-=", operator}, - elt{token.MUL_ASSIGN, "*=", operator}, - elt{token.QUO_ASSIGN, "/=", operator}, - elt{token.REM_ASSIGN, "%=", operator}, - - elt{token.AND_ASSIGN, "&=", operator}, - elt{token.OR_ASSIGN, "|=", operator}, - elt{token.XOR_ASSIGN, "^=", operator}, - elt{token.SHL_ASSIGN, "<<=", operator}, - elt{token.SHR_ASSIGN, ">>=", operator}, - elt{token.AND_NOT_ASSIGN, "&^=", operator}, - - elt{token.LAND, "&&", operator}, - elt{token.LOR, "||", operator}, - elt{token.ARROW, "<-", operator}, - elt{token.INC, "++", operator}, - elt{token.DEC, "--", operator}, - - elt{token.EQL, "==", operator}, - elt{token.LSS, "<", operator}, - elt{token.GTR, ">", operator}, - elt{token.ASSIGN, "=", operator}, - elt{token.NOT, "!", operator}, - - elt{token.NEQ, "!=", operator}, - elt{token.LEQ, "<=", operator}, - elt{token.GEQ, ">=", operator}, - elt{token.DEFINE, ":=", operator}, - elt{token.ELLIPSIS, "...", operator}, - - elt{token.LPAREN, "(", operator}, - elt{token.LBRACK, "[", operator}, - elt{token.LBRACE, "{", operator}, - elt{token.COMMA, ",", operator}, - elt{token.PERIOD, ".", operator}, - - elt{token.RPAREN, ")", operator}, - elt{token.RBRACK, "]", operator}, - elt{token.RBRACE, "}", operator}, - elt{token.SEMICOLON, ";", operator}, - elt{token.COLON, ":", operator}, + {token.ADD, "+", operator}, + {token.SUB, "-", operator}, + {token.MUL, "*", operator}, + {token.QUO, "/", operator}, + {token.REM, "%", operator}, + + {token.AND, "&", operator}, + {token.OR, "|", operator}, + {token.XOR, "^", operator}, + {token.SHL, "<<", operator}, + {token.SHR, ">>", operator}, + {token.AND_NOT, "&^", operator}, + + {token.ADD_ASSIGN, "+=", operator}, + {token.SUB_ASSIGN, "-=", operator}, + {token.MUL_ASSIGN, "*=", operator}, + {token.QUO_ASSIGN, "/=", operator}, + {token.REM_ASSIGN, "%=", operator}, + + {token.AND_ASSIGN, "&=", operator}, + {token.OR_ASSIGN, "|=", operator}, + {token.XOR_ASSIGN, "^=", operator}, + {token.SHL_ASSIGN, "<<=", operator}, + {token.SHR_ASSIGN, ">>=", operator}, + {token.AND_NOT_ASSIGN, "&^=", operator}, + + {token.LAND, "&&", operator}, + {token.LOR, "||", operator}, + {token.ARROW, "<-", operator}, + {token.INC, "++", operator}, + {token.DEC, "--", operator}, + + {token.EQL, "==", operator}, + {token.LSS, "<", operator}, + {token.GTR, ">", operator}, + {token.ASSIGN, "=", operator}, + {token.NOT, "!", operator}, + + {token.NEQ, "!=", operator}, + {token.LEQ, "<=", operator}, + {token.GEQ, ">=", operator}, + {token.DEFINE, ":=", operator}, + {token.ELLIPSIS, "...", operator}, + + {token.LPAREN, "(", operator}, + {token.LBRACK, "[", operator}, + {token.LBRACE, "{", operator}, + {token.COMMA, ",", operator}, + {token.PERIOD, ".", operator}, + + {token.RPAREN, ")", operator}, + {token.RBRACK, "]", operator}, + {token.RBRACE, "}", operator}, + {token.SEMICOLON, ";", operator}, + {token.COLON, ":", operator}, // Keywords - elt{token.BREAK, "break", keyword}, - elt{token.CASE, "case", keyword}, - elt{token.CHAN, "chan", keyword}, - elt{token.CONST, "const", keyword}, - elt{token.CONTINUE, "continue", keyword}, - - 
elt{token.DEFAULT, "default", keyword}, - elt{token.DEFER, "defer", keyword}, - elt{token.ELSE, "else", keyword}, - elt{token.FALLTHROUGH, "fallthrough", keyword}, - elt{token.FOR, "for", keyword}, - - elt{token.FUNC, "func", keyword}, - elt{token.GO, "go", keyword}, - elt{token.GOTO, "goto", keyword}, - elt{token.IF, "if", keyword}, - elt{token.IMPORT, "import", keyword}, - - elt{token.INTERFACE, "interface", keyword}, - elt{token.MAP, "map", keyword}, - elt{token.PACKAGE, "package", keyword}, - elt{token.RANGE, "range", keyword}, - elt{token.RETURN, "return", keyword}, - - elt{token.SELECT, "select", keyword}, - elt{token.STRUCT, "struct", keyword}, - elt{token.SWITCH, "switch", keyword}, - elt{token.TYPE, "type", keyword}, - elt{token.VAR, "var", keyword}, + {token.BREAK, "break", keyword}, + {token.CASE, "case", keyword}, + {token.CHAN, "chan", keyword}, + {token.CONST, "const", keyword}, + {token.CONTINUE, "continue", keyword}, + + {token.DEFAULT, "default", keyword}, + {token.DEFER, "defer", keyword}, + {token.ELSE, "else", keyword}, + {token.FALLTHROUGH, "fallthrough", keyword}, + {token.FOR, "for", keyword}, + + {token.FUNC, "func", keyword}, + {token.GO, "go", keyword}, + {token.GOTO, "goto", keyword}, + {token.IF, "if", keyword}, + {token.IMPORT, "import", keyword}, + + {token.INTERFACE, "interface", keyword}, + {token.MAP, "map", keyword}, + {token.PACKAGE, "package", keyword}, + {token.RANGE, "range", keyword}, + {token.RETURN, "return", keyword}, + + {token.SELECT, "select", keyword}, + {token.STRUCT, "struct", keyword}, + {token.SWITCH, "switch", keyword}, + {token.TYPE, "type", keyword}, + {token.VAR, "var", keyword}, } @@ -196,18 +199,19 @@ func newlineCount(s string) int { } -func checkPos(t *testing.T, lit string, pos, expected token.Position) { +func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) { + pos := fset.Position(p) if pos.Filename != expected.Filename { - t.Errorf("bad filename for %s: got %s, expected %s", lit, pos.Filename, expected.Filename) + t.Errorf("bad filename for %q: got %s, expected %s", lit, pos.Filename, expected.Filename) } if pos.Offset != expected.Offset { - t.Errorf("bad position for %s: got %d, expected %d", lit, pos.Offset, expected.Offset) + t.Errorf("bad position for %q: got %d, expected %d", lit, pos.Offset, expected.Offset) } if pos.Line != expected.Line { - t.Errorf("bad line for %s: got %d, expected %d", lit, pos.Line, expected.Line) + t.Errorf("bad line for %q: got %d, expected %d", lit, pos.Line, expected.Line) } if pos.Column != expected.Column { - t.Errorf("bad column for %s: got %d, expected %d", lit, pos.Column, expected.Column) + t.Errorf("bad column for %q: got %d, expected %d", lit, pos.Column, expected.Column) } } @@ -219,66 +223,76 @@ func TestScan(t *testing.T) { for _, e := range tokens { src += e.lit + whitespace } - src_linecount := newlineCount(src) + src_linecount := newlineCount(src) + 1 whitespace_linecount := newlineCount(whitespace) // verify scan + var s Scanner + s.Init(fset, "", []byte(src), &testErrorHandler{t}, ScanComments) index := 0 epos := token.Position{"", 0, 1, 1} // expected position - nerrors := Tokenize("", []byte(src), &testErrorHandler{t}, ScanComments, - func(pos token.Position, tok token.Token, litb []byte) bool { - e := elt{token.EOF, "", special} - if index < len(tokens) { - e = tokens[index] - } - lit := string(litb) - if tok == token.EOF { - lit = "<EOF>" - epos.Line = src_linecount - epos.Column = 1 - } - checkPos(t, lit, pos, epos) - if tok != e.tok { - t.Errorf("bad 
token for %q: got %s, expected %s", lit, tok.String(), e.tok.String()) - } - if e.tok.IsLiteral() && lit != e.lit { - t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit) - } - if tokenclass(tok) != e.class { - t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class) - } - epos.Offset += len(lit) + len(whitespace) - epos.Line += newlineCount(lit) + whitespace_linecount - if tok == token.COMMENT && litb[1] == '/' { - // correct for unaccounted '/n' in //-style comment - epos.Offset++ - epos.Line++ - } - index++ - return tok != token.EOF - }) - if nerrors != 0 { - t.Errorf("found %d errors", nerrors) + for { + pos, tok, litb := s.Scan() + e := elt{token.EOF, "", special} + if index < len(tokens) { + e = tokens[index] + } + lit := string(litb) + if tok == token.EOF { + lit = "<EOF>" + epos.Line = src_linecount + epos.Column = 1 + } + checkPos(t, lit, pos, epos) + if tok != e.tok { + t.Errorf("bad token for %q: got %s, expected %s", lit, tok.String(), e.tok.String()) + } + if e.tok.IsLiteral() && lit != e.lit { + t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit) + } + if tokenclass(tok) != e.class { + t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class) + } + epos.Offset += len(lit) + len(whitespace) + epos.Line += newlineCount(lit) + whitespace_linecount + if tok == token.COMMENT && litb[1] == '/' { + // correct for unaccounted '/n' in //-style comment + epos.Offset++ + epos.Line++ + } + index++ + if tok == token.EOF { + break + } + } + if s.ErrorCount != 0 { + t.Errorf("found %d errors", s.ErrorCount) } } func checkSemi(t *testing.T, line string, mode uint) { var S Scanner - S.Init("TestSemis", []byte(line), nil, mode) + file := S.Init(fset, "TestSemis", []byte(line), nil, mode) pos, tok, lit := S.Scan() for tok != token.EOF { if tok == token.ILLEGAL { + // the illegal token literal indicates what + // kind of semicolon literal to expect + semiLit := "\n" + if lit[0] == '#' { + semiLit = ";" + } // next token must be a semicolon - offs := pos.Offset + 1 + semiPos := file.Position(pos) + semiPos.Offset++ + semiPos.Column++ pos, tok, lit = S.Scan() if tok == token.SEMICOLON { - if pos.Offset != offs { - t.Errorf("bad offset for %q: got %d, expected %d", line, pos.Offset, offs) - } - if string(lit) != ";" { - t.Errorf(`bad literal for %q: got %q, expected ";"`, line, lit) + if string(lit) != semiLit { + t.Errorf(`bad literal for %q: got %q, expected %q`, line, lit, semiLit) } + checkPos(t, line, pos, semiPos) } else { t.Errorf("bad token for %q: got %s, expected ;", line, tok.String()) } @@ -291,9 +305,10 @@ func checkSemi(t *testing.T, line string, mode uint) { var lines = []string{ - // the $ character indicates where a semicolon is expected + // # indicates a semicolon present in the source + // $ indicates an automatically inserted semicolon "", - "$;", + "#;", "foo$\n", "123$\n", "1.2$\n", @@ -354,7 +369,7 @@ var lines = []string{ ")$\n", "]$\n", "}$\n", - "$;\n", + "#;\n", ":\n", "break$\n", @@ -388,57 +403,66 @@ var lines = []string{ "var\n", "foo$//comment\n", + "foo$//comment", "foo$/*comment*/\n", "foo$/*\n*/", "foo$/*comment*/ \n", "foo$/*\n*/ ", + "foo $// comment\n", + "foo $// comment", "foo $/*comment*/\n", "foo $/*\n*/", - - "foo $/*comment*/\n", + "foo $/* */ /* \n */ bar$/**/\n", "foo $/*0*/ /*1*/ /*2*/\n", + "foo $/*comment*/ \n", "foo $/*0*/ /*1*/ /*2*/ \n", - "foo $/**/ /*-------------*/ /*----\n*/bar $/* \n*/baa", + "foo $/**/ /*-------------*/ /*----\n*/bar $/* \n*/baa$\n", + "foo $/* 
an EOF terminates a line */", + "foo $/* an EOF terminates a line */ /*", + "foo $/* an EOF terminates a line */ //", "package main$\n\nfunc main() {\n\tif {\n\t\treturn /* */ }$\n}$\n", + "package main$", } func TestSemis(t *testing.T) { for _, line := range lines { checkSemi(t, line, AllowIllegalChars|InsertSemis) - } - for _, line := range lines { checkSemi(t, line, AllowIllegalChars|InsertSemis|ScanComments) + + // if the input ended in newlines, the input must tokenize the + // same with or without those newlines + for i := len(line) - 1; i >= 0 && line[i] == '\n'; i-- { + checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis) + checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis|ScanComments) + } } } -type seg struct { +var segments = []struct { srcline string // a line of source text filename string // filename for current token line int // line number for current token -} - - -var segments = []seg{ +}{ // exactly one token per line since the test consumes one token per segment - seg{" line1", "TestLineComments", 1}, - seg{"\nline2", "TestLineComments", 2}, - seg{"\nline3 //line File1.go:100", "TestLineComments", 3}, // bad line comment, ignored - seg{"\nline4", "TestLineComments", 4}, - seg{"\n//line File1.go:100\n line100", "File1.go", 100}, - seg{"\n//line File2.go:200\n line200", "File2.go", 200}, - seg{"\n//line :1\n line1", "", 1}, - seg{"\n//line foo:42\n line42", "foo", 42}, - seg{"\n //line foo:42\n line44", "foo", 44}, // bad line comment, ignored - seg{"\n//line foo 42\n line46", "foo", 46}, // bad line comment, ignored - seg{"\n//line foo:42 extra text\n line48", "foo", 48}, // bad line comment, ignored - seg{"\n//line foo:42\n line42", "foo", 42}, - seg{"\n//line foo:42\n line42", "foo", 42}, - seg{"\n//line File1.go:100\n line100", "File1.go", 100}, + {" line1", "dir/TestLineComments", 1}, + {"\nline2", "dir/TestLineComments", 2}, + {"\nline3 //line File1.go:100", "dir/TestLineComments", 3}, // bad line comment, ignored + {"\nline4", "dir/TestLineComments", 4}, + {"\n//line File1.go:100\n line100", "dir/File1.go", 100}, + {"\n//line File2.go:200\n line200", "dir/File2.go", 200}, + {"\n//line :1\n line1", "dir", 1}, + {"\n//line foo:42\n line42", "dir/foo", 42}, + {"\n //line foo:42\n line44", "dir/foo", 44}, // bad line comment, ignored + {"\n//line foo 42\n line46", "dir/foo", 46}, // bad line comment, ignored + {"\n//line foo:42 extra text\n line48", "dir/foo", 48}, // bad line comment, ignored + {"\n//line /bar:42\n line42", "/bar", 42}, + {"\n//line ./foo:42\n line42", "dir/foo", 42}, + {"\n//line a/b/c/File1.go:100\n line100", "dir/a/b/c/File1.go", 100}, } @@ -452,10 +476,11 @@ func TestLineComments(t *testing.T) { // verify scan var S Scanner - S.Init("TestLineComments", []byte(src), nil, 0) + file := S.Init(fset, "dir/TestLineComments", []byte(src), nil, 0) for _, s := range segments { - pos, _, lit := S.Scan() - checkPos(t, string(lit), pos, token.Position{s.filename, pos.Offset, s.line, pos.Column}) + p, _, lit := S.Scan() + pos := file.Position(p) + checkPos(t, string(lit), p, token.Position{s.filename, pos.Offset, s.line, pos.Column}) } if S.ErrorCount != 0 { @@ -469,7 +494,11 @@ func TestInit(t *testing.T) { var s Scanner // 1st init - s.Init("", []byte("if true { }"), nil, 0) + src1 := "if true { }" + f1 := s.Init(fset, "", []byte(src1), nil, 0) + if f1.Size() != len(src1) { + t.Errorf("bad file size: got %d, expected %d", f1.Size(), len(src1)) + } s.Scan() // if s.Scan() // true _, tok, _ := s.Scan() // { @@ -478,7 +507,11 @@ func TestInit(t *testing.T) 
{ } // 2nd init - s.Init("", []byte("go true { ]"), nil, 0) + src2 := "go true { ]" + f2 := s.Init(fset, "", []byte(src2), nil, 0) + if f2.Size() != len(src2) { + t.Errorf("bad file size: got %d, expected %d", f2.Size(), len(src2)) + } _, tok, _ = s.Scan() // go if tok != token.GO { t.Errorf("bad token: got %s, expected %s", tok.String(), token.GO) @@ -494,11 +527,11 @@ func TestIllegalChars(t *testing.T) { var s Scanner const src = "*?*$*@*" - s.Init("", []byte(src), &testErrorHandler{t}, AllowIllegalChars) + file := s.Init(fset, "", []byte(src), &testErrorHandler{t}, AllowIllegalChars) for offs, ch := range src { pos, tok, lit := s.Scan() - if pos.Offset != offs { - t.Errorf("bad position for %s: got %d, expected %d", string(lit), pos.Offset, offs) + if poffs := file.Offset(pos); poffs != offs { + t.Errorf("bad position for %s: got %d, expected %d", string(lit), poffs, offs) } if tok == token.ILLEGAL && string(lit) != string(ch) { t.Errorf("bad token: got %s, expected %s", string(lit), string(ch)) @@ -522,10 +555,13 @@ func TestStdErrorHander(t *testing.T) { "@ @ @" // original file, line 1 again v := new(ErrorVector) - nerrors := Tokenize("File1", []byte(src), v, 0, - func(pos token.Position, tok token.Token, litb []byte) bool { - return tok != token.EOF - }) + var s Scanner + s.Init(fset, "File1", []byte(src), v, 0) + for { + if _, tok, _ := s.Scan(); tok == token.EOF { + break + } + } list := v.GetErrorList(Raw) if len(list) != 9 { @@ -545,8 +581,8 @@ func TestStdErrorHander(t *testing.T) { PrintError(os.Stderr, list) } - if v.ErrorCount() != nerrors { - t.Errorf("found %d errors, expected %d", v.ErrorCount(), nerrors) + if v.ErrorCount() != s.ErrorCount { + t.Errorf("found %d errors, expected %d", v.ErrorCount(), s.ErrorCount) } } @@ -568,7 +604,7 @@ func (h *errorCollector) Error(pos token.Position, msg string) { func checkError(t *testing.T, src string, tok token.Token, pos int, err string) { var s Scanner var h errorCollector - s.Init("", []byte(src), &h, ScanComments) + s.Init(fset, "", []byte(src), &h, ScanComments) _, tok0, _ := s.Scan() _, tok1, _ := s.Scan() if tok0 != tok { @@ -593,28 +629,34 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string) } -type srcerr struct { +var errors = []struct { src string tok token.Token pos int err string -} - -var errors = []srcerr{ - srcerr{"\"\"", token.STRING, 0, ""}, - srcerr{"\"", token.STRING, 0, "string not terminated"}, - srcerr{"/**/", token.COMMENT, 0, ""}, - srcerr{"/*", token.COMMENT, 0, "comment not terminated"}, - srcerr{"//\n", token.COMMENT, 0, ""}, - srcerr{"//", token.COMMENT, 0, "comment not terminated"}, - srcerr{"077", token.INT, 0, ""}, - srcerr{"078.", token.FLOAT, 0, ""}, - srcerr{"07801234567.", token.FLOAT, 0, ""}, - srcerr{"078e0", token.FLOAT, 0, ""}, - srcerr{"078", token.INT, 0, "illegal octal number"}, - srcerr{"07800000009", token.INT, 0, "illegal octal number"}, - srcerr{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"}, - srcerr{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"}, +}{ + {`#`, token.ILLEGAL, 0, "illegal character '#' (U+23)"}, + {`' '`, token.CHAR, 0, ""}, + {`''`, token.CHAR, 0, "illegal character literal"}, + {`'\8'`, token.CHAR, 2, "unknown escape sequence"}, + {`'\08'`, token.CHAR, 3, "illegal character in escape sequence"}, + {`'\x0g'`, token.CHAR, 4, "illegal character in escape sequence"}, + {`'\Uffffffff'`, token.CHAR, 2, "escape sequence is invalid Unicode code point"}, + {`'`, token.CHAR, 0, "character literal not terminated"}, + 
{`""`, token.STRING, 0, ""}, + {`"`, token.STRING, 0, "string not terminated"}, + {"``", token.STRING, 0, ""}, + {"`", token.STRING, 0, "string not terminated"}, + {"/**/", token.COMMENT, 0, ""}, + {"/*", token.COMMENT, 0, "comment not terminated"}, + {"077", token.INT, 0, ""}, + {"078.", token.FLOAT, 0, ""}, + {"07801234567.", token.FLOAT, 0, ""}, + {"078e0", token.FLOAT, 0, ""}, + {"078", token.INT, 0, "illegal octal number"}, + {"07800000009", token.INT, 0, "illegal octal number"}, + {"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"}, + {"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"}, } |