Diffstat (limited to 'src/pkg/go/scanner')
-rw-r--r--  src/pkg/go/scanner/Makefile        |   2
-rw-r--r--  src/pkg/go/scanner/scanner.go      | 336
-rw-r--r--  src/pkg/go/scanner/scanner_test.go | 496
3 files changed, 455 insertions(+), 379 deletions(-)
diff --git a/src/pkg/go/scanner/Makefile b/src/pkg/go/scanner/Makefile
index 70d21a972..453faac00 100644
--- a/src/pkg/go/scanner/Makefile
+++ b/src/pkg/go/scanner/Makefile
@@ -2,7 +2,7 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
-include ../../../Make.$(GOARCH)
+include ../../../Make.inc
TARG=go/scanner
GOFILES=\
diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go
index e5ac9d772..6ce846cd8 100644
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -4,13 +4,25 @@
// A scanner for Go source text. Takes a []byte as source which can
// then be tokenized through repeated calls to the Scan function.
-// For a sample use of a scanner, see the implementation of Tokenize.
+// Typical use:
+//
+// var s Scanner
+// fset := token.NewFileSet() // position information is relative to fset
+// s.Init(fset, filename, src, nil /* no error handler */, 0)
+// for {
+// pos, tok, lit := s.Scan()
+// if tok == token.EOF {
+// break
+// }
+// // do something here with pos, tok, and lit
+// }
//
package scanner
import (
"bytes"
"go/token"
+ "path"
"strconv"
"unicode"
"utf8"
@@ -19,20 +31,22 @@ import (
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
-// structure but must be initialized via Init before use. For
-// a sample use, see the implementation of Tokenize.
+// structure but must be initialized via Init before use.
//
type Scanner struct {
// immutable state
+ file *token.File // source file handle
+ dir string // directory portion of file.Name()
src []byte // source
err ErrorHandler // error reporting; or nil
mode uint // scanning mode
// scanning state
- pos token.Position // previous reading position (position before ch)
- offset int // current reading offset (position after ch)
- ch int // one char look-ahead
- insertSemi bool // insert a semicolon before next newline
+ ch int // current character
+ offset int // character offset
+ rdOffset int // reading offset (position after current character)
+ lineOffset int // current line offset
+ insertSemi bool // insert a semicolon before next newline
// public state - ok to modify
ErrorCount int // number of errors encountered
@@ -43,29 +57,31 @@ type Scanner struct {
// S.ch < 0 means end-of-file.
//
func (S *Scanner) next() {
- if S.offset < len(S.src) {
- S.pos.Offset = S.offset
- S.pos.Column++
+ if S.rdOffset < len(S.src) {
+ S.offset = S.rdOffset
if S.ch == '\n' {
- // next character starts a new line
- S.pos.Line++
- S.pos.Column = 1
+ S.lineOffset = S.offset
+ S.file.AddLine(S.offset)
}
- r, w := int(S.src[S.offset]), 1
+ r, w := int(S.src[S.rdOffset]), 1
switch {
case r == 0:
- S.error(S.pos, "illegal character NUL")
+ S.error(S.offset, "illegal character NUL")
case r >= 0x80:
// not ASCII
- r, w = utf8.DecodeRune(S.src[S.offset:])
+ r, w = utf8.DecodeRune(S.src[S.rdOffset:])
if r == utf8.RuneError && w == 1 {
- S.error(S.pos, "illegal UTF-8 encoding")
+ S.error(S.offset, "illegal UTF-8 encoding")
}
}
- S.offset += w
+ S.rdOffset += w
S.ch = r
} else {
- S.pos.Offset = len(S.src)
+ S.offset = len(S.src)
+ if S.ch == '\n' {
+ S.lineOffset = S.offset
+ S.file.AddLine(S.offset)
+ }
S.ch = -1 // eof
}
}
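
The rewritten next() replaces the incrementally maintained token.Position with flat byte offsets: offset marks the start of the current character ch, rdOffset the next byte to be read, and every newline registers the start of the following line with the file via AddLine. Line and column numbers are then derived from offsets on demand. A minimal sketch of that mapping, with an invented file name and contents:

	package main

	import (
		"fmt"
		"go/token"
	)

	func main() {
		// source "ab\nc": byte offsets 0 1 2 3; next() sees the '\n'
		// at offset 2 and records that line 2 starts at offset 3.
		fset := token.NewFileSet()
		file := fset.AddFile("sketch.go", fset.Base(), 4)
		file.AddLine(3)
		fmt.Println(file.Position(file.Pos(3))) // sketch.go:2:1
	}
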
@@ -80,24 +96,38 @@ const (
InsertSemis // automatically insert semicolons
)
+// TODO(gri) Would it be better to simply provide *token.File to Init
+// instead of fset, and filename, and then return the file?
+// It could cause an error/panic if the provided file.Size()
+// doesn't match len(src).
-// Init prepares the scanner S to tokenize the text src. Calls to Scan
-// will use the error handler err if they encounter a syntax error and
-// err is not nil. Also, for each error encountered, the Scanner field
-// ErrorCount is incremented by one. The filename parameter is used as
-// filename in the token.Position returned by Scan for each token. The
-// mode parameter determines how comments and illegal characters are
-// handled.
+// Init prepares the scanner S to tokenize the text src. It sets the
+// scanner at the beginning of the source text, adds a new file with
+// the given filename to the file set fset, and returns that file.
+//
+// Calls to Scan will use the error handler err if they encounter a
+// syntax error and err is not nil. Also, for each error encountered,
+// the Scanner field ErrorCount is incremented by one. The mode parameter
+// determines how comments, illegal characters, and semicolons are handled.
//
-func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) {
+func (S *Scanner) Init(fset *token.FileSet, filename string, src []byte, err ErrorHandler, mode uint) *token.File {
// Explicitly initialize all fields since a scanner may be reused.
+ S.file = fset.AddFile(filename, fset.Base(), len(src))
+ S.dir, _ = path.Split(filename)
S.src = src
S.err = err
S.mode = mode
- S.pos = token.Position{filename, 0, 1, 0}
+
+ S.ch = ' '
S.offset = 0
+ S.rdOffset = 0
+ S.lineOffset = 0
+ S.insertSemi = false
S.ErrorCount = 0
+
S.next()
+
+ return S.file
}
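
Init now registers the source with the file set and returns the resulting *token.File; on each error the handler, if any, is called in addition to the ErrorCount increment. A hedged sketch of a handler satisfying the package's ErrorHandler interface (the shape matches the errorCollector in the tests below; the name and printing behavior are invented):

	type printingHandler struct {
		count int
	}

	func (h *printingHandler) Error(pos token.Position, msg string) {
		h.count++
		fmt.Printf("%s: %s\n", pos, msg)
	}

	// Usage sketch:
	//	var s scanner.Scanner
	//	h := new(printingHandler)
	//	file := s.Init(fset, "broken.go", src, h, 0)
	//	... Scan loop ...
	//	// afterwards s.ErrorCount == h.count when h is the only handler
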
@@ -131,111 +161,109 @@ func charString(ch int) string {
}
-func (S *Scanner) error(pos token.Position, msg string) {
+func (S *Scanner) error(offs int, msg string) {
if S.err != nil {
- S.err.Error(pos, msg)
+ S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
}
S.ErrorCount++
}
-func (S *Scanner) expect(ch int) {
- if S.ch != ch {
- S.error(S.pos, "expected "+charString(ch)+", found "+charString(S.ch))
+var prefix = []byte("//line ")
+
+func (S *Scanner) interpretLineComment(text []byte) {
+ if bytes.HasPrefix(text, prefix) {
+ // get filename and line number, if any
+ if i := bytes.Index(text, []byte{':'}); i > 0 {
+ if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
+ // valid //line filename:line comment;
+ filename := path.Clean(string(text[len(prefix):i]))
+ if filename[0] != '/' {
+ // make filename relative to current directory
+ filename = path.Join(S.dir, filename)
+ }
+ // update scanner position
+ S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
+ }
+ }
}
- S.next() // always make progress
}
-var prefix = []byte("line ")
-
-func (S *Scanner) scanComment(pos token.Position) {
- // first '/' already consumed
+func (S *Scanner) scanComment() {
+ // initial '/' already consumed; S.ch == '/' || S.ch == '*'
+ offs := S.offset - 1 // position of initial '/'
if S.ch == '/' {
//-style comment
- for S.ch >= 0 {
+ S.next()
+ for S.ch != '\n' && S.ch >= 0 {
S.next()
- if S.ch == '\n' {
- // '\n' is not part of the comment for purposes of scanning
- // (the comment ends on the same line where it started)
- if pos.Column == 1 {
- text := S.src[pos.Offset+2 : S.pos.Offset]
- if bytes.HasPrefix(text, prefix) {
- // comment starts at beginning of line with "//line ";
- // get filename and line number, if any
- i := bytes.Index(text, []byte{':'})
- if i >= 0 {
- if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
- // valid //line filename:line comment;
- // update scanner position
- S.pos.Filename = string(text[len(prefix):i])
- S.pos.Line = line - 1 // -1 since the '\n' has not been consumed yet
- }
- }
- }
- }
- return
- }
}
+ if offs == S.lineOffset {
+ // comment starts at the beginning of the current line
+ S.interpretLineComment(S.src[offs:S.offset])
+ }
+ return
+ }
- } else {
- /*-style comment */
- S.expect('*')
- for S.ch >= 0 {
- ch := S.ch
+ /*-style comment */
+ S.next()
+ for S.ch >= 0 {
+ ch := S.ch
+ S.next()
+ if ch == '*' && S.ch == '/' {
S.next()
- if ch == '*' && S.ch == '/' {
- S.next()
- return
- }
+ return
}
}
- S.error(pos, "comment not terminated")
+ S.error(offs, "comment not terminated")
}
-func (S *Scanner) findNewline(pos token.Position) bool {
- // first '/' already consumed; assume S.ch == '/' || S.ch == '*'
+func (S *Scanner) findLineEnd() bool {
+ // initial '/' already consumed
+
+ defer func(offs int) {
+ // reset scanner state to where it was upon calling findLineEnd
+ S.ch = '/'
+ S.offset = offs
+ S.rdOffset = offs + 1
+ S.next() // consume initial '/' again
+ }(S.offset - 1)
- // read ahead until a newline or non-comment token is found
- newline := false
- for pos1 := pos; S.ch >= 0; {
+ // read ahead until a newline, EOF, or non-comment token is found
+ for S.ch == '/' || S.ch == '*' {
if S.ch == '/' {
//-style comment always contains a newline
- newline = true
- break
+ return true
}
- S.scanComment(pos1)
- if pos1.Line < S.pos.Line {
- /*-style comment contained a newline */
- newline = true
- break
+ /*-style comment: look for newline */
+ S.next()
+ for S.ch >= 0 {
+ ch := S.ch
+ if ch == '\n' {
+ return true
+ }
+ S.next()
+ if ch == '*' && S.ch == '/' {
+ S.next()
+ break
+ }
}
S.skipWhitespace() // S.insertSemi is set
- if S.ch == '\n' {
- newline = true
- break
+ if S.ch < 0 || S.ch == '\n' {
+ return true
}
if S.ch != '/' {
// non-comment token
- break
- }
- pos1 = S.pos
- S.next()
- if S.ch != '/' && S.ch != '*' {
- // non-comment token
- break
+ return false
}
+ S.next() // consume '/'
}
- // reset position to where it was upon calling findNewline
- S.pos = pos
- S.offset = pos.Offset + 1
- S.next()
-
- return newline
+ return false
}
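
interpretLineComment is the new home of //line processing: a comment of the form //line filename:line beginning in column 1 rewrites the position information reported for subsequent lines via AddLineInfo, and a relative filename is now resolved against the directory of the scanned file (S.dir). A small sketch of the observable effect, with invented filenames; the expected results follow the TestLineComments table further below:

	package main

	import (
		"fmt"
		"go/scanner"
		"go/token"
	)

	func main() {
		src := []byte("//line main.go:42\nvar x int\n")

		var s scanner.Scanner
		fset := token.NewFileSet()
		s.Init(fset, "dir/generated.go", src, nil, 0) // mode 0: comments are skipped
		pos, _, _ := s.Scan()                         // scans "var"
		fmt.Println(fset.Position(pos))               // reported as dir/main.go:42
	}
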
@@ -250,11 +278,11 @@ func isDigit(ch int) bool {
func (S *Scanner) scanIdentifier() token.Token {
- pos := S.pos.Offset
+ offs := S.offset
for isLetter(S.ch) || isDigit(S.ch) {
S.next()
}
- return token.Lookup(S.src[pos:S.pos.Offset])
+ return token.Lookup(S.src[offs:S.offset])
}
@@ -278,7 +306,7 @@ func (S *Scanner) scanMantissa(base int) {
}
-func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.Token {
+func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
// digitVal(S.ch) < 10
tok := token.INT
@@ -290,6 +318,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To
if S.ch == '0' {
// int or float
+ offs := S.offset
S.next()
if S.ch == 'x' || S.ch == 'X' {
// hexadecimal int
@@ -309,7 +338,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To
}
// octal int
if seenDecimalDigit {
- S.error(pos, "illegal octal number")
+ S.error(offs, "illegal octal number")
}
}
goto exit
@@ -346,7 +375,7 @@ exit:
func (S *Scanner) scanEscape(quote int) {
- pos := S.pos
+ offs := S.offset
var i, base, max uint32
switch S.ch {
@@ -366,28 +395,33 @@ func (S *Scanner) scanEscape(quote int) {
i, base, max = 8, 16, unicode.MaxRune
default:
S.next() // always make progress
- S.error(pos, "unknown escape sequence")
+ S.error(offs, "unknown escape sequence")
return
}
var x uint32
- for ; i > 0; i-- {
+ for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
d := uint32(digitVal(S.ch))
- if d > base {
- S.error(S.pos, "illegal character in escape sequence")
- return
+ if d >= base {
+ S.error(S.offset, "illegal character in escape sequence")
+ break
}
x = x*base + d
S.next()
}
+ // in case of an error, consume remaining chars
+ for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
+ S.next()
+ }
if x > max || 0xd800 <= x && x < 0xe000 {
- S.error(pos, "escape sequence is invalid Unicode code point")
+ S.error(offs, "escape sequence is invalid Unicode code point")
}
}
-func (S *Scanner) scanChar(pos token.Position) {
- // '\'' already consumed
+func (S *Scanner) scanChar() {
+ // '\'' opening already consumed
+ offs := S.offset - 1
n := 0
for S.ch != '\'' {
@@ -395,7 +429,7 @@ func (S *Scanner) scanChar(pos token.Position) {
n++
S.next()
if ch == '\n' || ch < 0 {
- S.error(pos, "character literal not terminated")
+ S.error(offs, "character literal not terminated")
n = 1
break
}
@@ -407,19 +441,20 @@ func (S *Scanner) scanChar(pos token.Position) {
S.next()
if n != 1 {
- S.error(pos, "illegal character literal")
+ S.error(offs, "illegal character literal")
}
}
-func (S *Scanner) scanString(pos token.Position) {
- // '"' already consumed
+func (S *Scanner) scanString() {
+ // '"' opening already consumed
+ offs := S.offset - 1
for S.ch != '"' {
ch := S.ch
S.next()
if ch == '\n' || ch < 0 {
- S.error(pos, "string not terminated")
+ S.error(offs, "string not terminated")
break
}
if ch == '\\' {
@@ -431,14 +466,15 @@ func (S *Scanner) scanString(pos token.Position) {
}
-func (S *Scanner) scanRawString(pos token.Position) {
- // '`' already consumed
+func (S *Scanner) scanRawString() {
+ // '`' opening already consumed
+ offs := S.offset - 1
for S.ch != '`' {
ch := S.ch
S.next()
if ch < 0 {
- S.error(pos, "string not terminated")
+ S.error(offs, "string not terminated")
break
}
}
@@ -499,12 +535,17 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
}
-var semicolon = []byte{';'}
+var newline = []byte{'\n'}
// Scan scans the next token and returns the token position pos,
// the token tok, and the literal text lit corresponding to the
// token. The source end is indicated by token.EOF.
//
+// If the returned token is token.SEMICOLON, the corresponding
+// literal value is ";" if the semicolon was present in the source,
+// and "\n" if the semicolon was inserted because of a newline or
+// at EOF.
+//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
@@ -512,13 +553,18 @@ var semicolon = []byte{';'}
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
-func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) {
+// Scan adds line information to the file added to the file
+// set with Init. Token positions are relative to that file
+// and thus relative to the file set.
+//
+func (S *Scanner) Scan() (token.Pos, token.Token, []byte) {
scanAgain:
S.skipWhitespace()
// current token start
insertSemi := false
- pos, tok = S.pos, token.ILLEGAL
+ offs := S.offset
+ tok := token.ILLEGAL
// determine token value
switch ch := S.ch; {
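
As the expanded Scan documentation above notes, a semicolon's literal now records its origin: ";" when written in the source, "\n" when inserted at a newline or at EOF. A quick sketch under InsertSemis (a fragment with invented source text; assumes fmt, go/scanner, and go/token are imported):

	src := []byte("x = 1\ny = 2;")

	var s scanner.Scanner
	fset := token.NewFileSet()
	s.Init(fset, "", src, nil, scanner.InsertSemis)
	for {
		_, tok, lit := s.Scan()
		if tok == token.EOF {
			break
		}
		if tok == token.SEMICOLON {
			fmt.Printf("%q\n", lit) // "\n" for the inserted one, ";" for the written one
		}
	}
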
@@ -530,36 +576,40 @@ scanAgain:
}
case digitVal(ch) < 10:
insertSemi = true
- tok = S.scanNumber(pos, false)
+ tok = S.scanNumber(false)
default:
S.next() // always make progress
switch ch {
case -1:
+ if S.insertSemi {
+ S.insertSemi = false // EOF consumed
+ return S.file.Pos(offs), token.SEMICOLON, newline
+ }
tok = token.EOF
case '\n':
// we only reach here if S.insertSemi was
// set in the first place and exited early
// from S.skipWhitespace()
S.insertSemi = false // newline consumed
- return pos, token.SEMICOLON, semicolon
+ return S.file.Pos(offs), token.SEMICOLON, newline
case '"':
insertSemi = true
tok = token.STRING
- S.scanString(pos)
+ S.scanString()
case '\'':
insertSemi = true
tok = token.CHAR
- S.scanChar(pos)
+ S.scanChar()
case '`':
insertSemi = true
tok = token.STRING
- S.scanRawString(pos)
+ S.scanRawString()
case ':':
tok = S.switch2(token.COLON, token.DEFINE)
case '.':
if digitVal(S.ch) < 10 {
insertSemi = true
- tok = S.scanNumber(pos, true)
+ tok = S.scanNumber(true)
} else if S.ch == '.' {
S.next()
if S.ch == '.' {
@@ -603,15 +653,15 @@ scanAgain:
case '/':
if S.ch == '/' || S.ch == '*' {
// comment
- if S.insertSemi && S.findNewline(pos) {
+ if S.insertSemi && S.findLineEnd() {
// reset position to the beginning of the comment
- S.pos = pos
- S.offset = pos.Offset + 1
S.ch = '/'
+ S.offset = offs
+ S.rdOffset = offs + 1
S.insertSemi = false // newline consumed
- return pos, token.SEMICOLON, semicolon
+ return S.file.Pos(offs), token.SEMICOLON, newline
}
- S.scanComment(pos)
+ S.scanComment()
if S.mode&ScanComments == 0 {
// skip comment
S.insertSemi = false // newline consumed
@@ -649,7 +699,7 @@ scanAgain:
tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
default:
if S.mode&AllowIllegalChars == 0 {
- S.error(pos, "illegal character "+charString(ch))
+ S.error(offs, "illegal character "+charString(ch))
}
insertSemi = S.insertSemi // preserve insertSemi info
}
@@ -658,21 +708,5 @@ scanAgain:
if S.mode&InsertSemis != 0 {
S.insertSemi = insertSemi
}
- return pos, tok, S.src[pos.Offset:S.pos.Offset]
-}
-
-
-// Tokenize calls a function f with the token position, token value, and token
-// text for each token in the source src. The other parameters have the same
-// meaning as for the Init function. Tokenize keeps scanning until f returns
-// false (usually when the token value is token.EOF). The result is the number
-// of errors encountered.
-//
-func Tokenize(filename string, src []byte, err ErrorHandler, mode uint, f func(pos token.Position, tok token.Token, lit []byte) bool) int {
- var s Scanner
- s.Init(filename, src, err, mode)
- for f(s.Scan()) {
- // action happens in f
- }
- return s.ErrorCount
+ return S.file.Pos(offs), tok, S.src[offs:S.offset]
}
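
Tokenize is gone; callers now drive the loop themselves, as the test updates below show. For code that wants the old shape, a drop-in stand-in under the new API could look like the following sketch (note the callback now receives a token.Pos instead of a token.Position):

	func tokenize(fset *token.FileSet, filename string, src []byte, err scanner.ErrorHandler,
		mode uint, f func(pos token.Pos, tok token.Token, lit []byte) bool) int {
		var s scanner.Scanner
		s.Init(fset, filename, src, err, mode)
		for f(s.Scan()) {
			// action happens in f
		}
		return s.ErrorCount
	}
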
diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go
index 002a81dd9..b1004f89d 100644
--- a/src/pkg/go/scanner/scanner_test.go
+++ b/src/pkg/go/scanner/scanner_test.go
@@ -11,6 +11,9 @@ import (
)
+var fset = token.NewFileSet()
+
+
const /* class */ (
special = iota
literal
@@ -41,136 +44,136 @@ type elt struct {
var tokens = [...]elt{
// Special tokens
- elt{token.COMMENT, "/* a comment */", special},
- elt{token.COMMENT, "// a comment \n", special},
+ {token.COMMENT, "/* a comment */", special},
+ {token.COMMENT, "// a comment \n", special},
// Identifiers and basic type literals
- elt{token.IDENT, "foobar", literal},
- elt{token.IDENT, "a۰۱۸", literal},
- elt{token.IDENT, "foo६४", literal},
- elt{token.IDENT, "bar9876", literal},
- elt{token.INT, "0", literal},
- elt{token.INT, "1", literal},
- elt{token.INT, "123456789012345678890", literal},
- elt{token.INT, "01234567", literal},
- elt{token.INT, "0xcafebabe", literal},
- elt{token.FLOAT, "0.", literal},
- elt{token.FLOAT, ".0", literal},
- elt{token.FLOAT, "3.14159265", literal},
- elt{token.FLOAT, "1e0", literal},
- elt{token.FLOAT, "1e+100", literal},
- elt{token.FLOAT, "1e-100", literal},
- elt{token.FLOAT, "2.71828e-1000", literal},
- elt{token.IMAG, "0i", literal},
- elt{token.IMAG, "1i", literal},
- elt{token.IMAG, "012345678901234567889i", literal},
- elt{token.IMAG, "123456789012345678890i", literal},
- elt{token.IMAG, "0.i", literal},
- elt{token.IMAG, ".0i", literal},
- elt{token.IMAG, "3.14159265i", literal},
- elt{token.IMAG, "1e0i", literal},
- elt{token.IMAG, "1e+100i", literal},
- elt{token.IMAG, "1e-100i", literal},
- elt{token.IMAG, "2.71828e-1000i", literal},
- elt{token.CHAR, "'a'", literal},
- elt{token.CHAR, "'\\000'", literal},
- elt{token.CHAR, "'\\xFF'", literal},
- elt{token.CHAR, "'\\uff16'", literal},
- elt{token.CHAR, "'\\U0000ff16'", literal},
- elt{token.STRING, "`foobar`", literal},
- elt{token.STRING, "`" + `foo
+ {token.IDENT, "foobar", literal},
+ {token.IDENT, "a۰۱۸", literal},
+ {token.IDENT, "foo६४", literal},
+ {token.IDENT, "bar9876", literal},
+ {token.INT, "0", literal},
+ {token.INT, "1", literal},
+ {token.INT, "123456789012345678890", literal},
+ {token.INT, "01234567", literal},
+ {token.INT, "0xcafebabe", literal},
+ {token.FLOAT, "0.", literal},
+ {token.FLOAT, ".0", literal},
+ {token.FLOAT, "3.14159265", literal},
+ {token.FLOAT, "1e0", literal},
+ {token.FLOAT, "1e+100", literal},
+ {token.FLOAT, "1e-100", literal},
+ {token.FLOAT, "2.71828e-1000", literal},
+ {token.IMAG, "0i", literal},
+ {token.IMAG, "1i", literal},
+ {token.IMAG, "012345678901234567889i", literal},
+ {token.IMAG, "123456789012345678890i", literal},
+ {token.IMAG, "0.i", literal},
+ {token.IMAG, ".0i", literal},
+ {token.IMAG, "3.14159265i", literal},
+ {token.IMAG, "1e0i", literal},
+ {token.IMAG, "1e+100i", literal},
+ {token.IMAG, "1e-100i", literal},
+ {token.IMAG, "2.71828e-1000i", literal},
+ {token.CHAR, "'a'", literal},
+ {token.CHAR, "'\\000'", literal},
+ {token.CHAR, "'\\xFF'", literal},
+ {token.CHAR, "'\\uff16'", literal},
+ {token.CHAR, "'\\U0000ff16'", literal},
+ {token.STRING, "`foobar`", literal},
+ {token.STRING, "`" + `foo
bar` +
"`",
literal,
},
// Operators and delimiters
- elt{token.ADD, "+", operator},
- elt{token.SUB, "-", operator},
- elt{token.MUL, "*", operator},
- elt{token.QUO, "/", operator},
- elt{token.REM, "%", operator},
-
- elt{token.AND, "&", operator},
- elt{token.OR, "|", operator},
- elt{token.XOR, "^", operator},
- elt{token.SHL, "<<", operator},
- elt{token.SHR, ">>", operator},
- elt{token.AND_NOT, "&^", operator},
-
- elt{token.ADD_ASSIGN, "+=", operator},
- elt{token.SUB_ASSIGN, "-=", operator},
- elt{token.MUL_ASSIGN, "*=", operator},
- elt{token.QUO_ASSIGN, "/=", operator},
- elt{token.REM_ASSIGN, "%=", operator},
-
- elt{token.AND_ASSIGN, "&=", operator},
- elt{token.OR_ASSIGN, "|=", operator},
- elt{token.XOR_ASSIGN, "^=", operator},
- elt{token.SHL_ASSIGN, "<<=", operator},
- elt{token.SHR_ASSIGN, ">>=", operator},
- elt{token.AND_NOT_ASSIGN, "&^=", operator},
-
- elt{token.LAND, "&&", operator},
- elt{token.LOR, "||", operator},
- elt{token.ARROW, "<-", operator},
- elt{token.INC, "++", operator},
- elt{token.DEC, "--", operator},
-
- elt{token.EQL, "==", operator},
- elt{token.LSS, "<", operator},
- elt{token.GTR, ">", operator},
- elt{token.ASSIGN, "=", operator},
- elt{token.NOT, "!", operator},
-
- elt{token.NEQ, "!=", operator},
- elt{token.LEQ, "<=", operator},
- elt{token.GEQ, ">=", operator},
- elt{token.DEFINE, ":=", operator},
- elt{token.ELLIPSIS, "...", operator},
-
- elt{token.LPAREN, "(", operator},
- elt{token.LBRACK, "[", operator},
- elt{token.LBRACE, "{", operator},
- elt{token.COMMA, ",", operator},
- elt{token.PERIOD, ".", operator},
-
- elt{token.RPAREN, ")", operator},
- elt{token.RBRACK, "]", operator},
- elt{token.RBRACE, "}", operator},
- elt{token.SEMICOLON, ";", operator},
- elt{token.COLON, ":", operator},
+ {token.ADD, "+", operator},
+ {token.SUB, "-", operator},
+ {token.MUL, "*", operator},
+ {token.QUO, "/", operator},
+ {token.REM, "%", operator},
+
+ {token.AND, "&", operator},
+ {token.OR, "|", operator},
+ {token.XOR, "^", operator},
+ {token.SHL, "<<", operator},
+ {token.SHR, ">>", operator},
+ {token.AND_NOT, "&^", operator},
+
+ {token.ADD_ASSIGN, "+=", operator},
+ {token.SUB_ASSIGN, "-=", operator},
+ {token.MUL_ASSIGN, "*=", operator},
+ {token.QUO_ASSIGN, "/=", operator},
+ {token.REM_ASSIGN, "%=", operator},
+
+ {token.AND_ASSIGN, "&=", operator},
+ {token.OR_ASSIGN, "|=", operator},
+ {token.XOR_ASSIGN, "^=", operator},
+ {token.SHL_ASSIGN, "<<=", operator},
+ {token.SHR_ASSIGN, ">>=", operator},
+ {token.AND_NOT_ASSIGN, "&^=", operator},
+
+ {token.LAND, "&&", operator},
+ {token.LOR, "||", operator},
+ {token.ARROW, "<-", operator},
+ {token.INC, "++", operator},
+ {token.DEC, "--", operator},
+
+ {token.EQL, "==", operator},
+ {token.LSS, "<", operator},
+ {token.GTR, ">", operator},
+ {token.ASSIGN, "=", operator},
+ {token.NOT, "!", operator},
+
+ {token.NEQ, "!=", operator},
+ {token.LEQ, "<=", operator},
+ {token.GEQ, ">=", operator},
+ {token.DEFINE, ":=", operator},
+ {token.ELLIPSIS, "...", operator},
+
+ {token.LPAREN, "(", operator},
+ {token.LBRACK, "[", operator},
+ {token.LBRACE, "{", operator},
+ {token.COMMA, ",", operator},
+ {token.PERIOD, ".", operator},
+
+ {token.RPAREN, ")", operator},
+ {token.RBRACK, "]", operator},
+ {token.RBRACE, "}", operator},
+ {token.SEMICOLON, ";", operator},
+ {token.COLON, ":", operator},
// Keywords
- elt{token.BREAK, "break", keyword},
- elt{token.CASE, "case", keyword},
- elt{token.CHAN, "chan", keyword},
- elt{token.CONST, "const", keyword},
- elt{token.CONTINUE, "continue", keyword},
-
- elt{token.DEFAULT, "default", keyword},
- elt{token.DEFER, "defer", keyword},
- elt{token.ELSE, "else", keyword},
- elt{token.FALLTHROUGH, "fallthrough", keyword},
- elt{token.FOR, "for", keyword},
-
- elt{token.FUNC, "func", keyword},
- elt{token.GO, "go", keyword},
- elt{token.GOTO, "goto", keyword},
- elt{token.IF, "if", keyword},
- elt{token.IMPORT, "import", keyword},
-
- elt{token.INTERFACE, "interface", keyword},
- elt{token.MAP, "map", keyword},
- elt{token.PACKAGE, "package", keyword},
- elt{token.RANGE, "range", keyword},
- elt{token.RETURN, "return", keyword},
-
- elt{token.SELECT, "select", keyword},
- elt{token.STRUCT, "struct", keyword},
- elt{token.SWITCH, "switch", keyword},
- elt{token.TYPE, "type", keyword},
- elt{token.VAR, "var", keyword},
+ {token.BREAK, "break", keyword},
+ {token.CASE, "case", keyword},
+ {token.CHAN, "chan", keyword},
+ {token.CONST, "const", keyword},
+ {token.CONTINUE, "continue", keyword},
+
+ {token.DEFAULT, "default", keyword},
+ {token.DEFER, "defer", keyword},
+ {token.ELSE, "else", keyword},
+ {token.FALLTHROUGH, "fallthrough", keyword},
+ {token.FOR, "for", keyword},
+
+ {token.FUNC, "func", keyword},
+ {token.GO, "go", keyword},
+ {token.GOTO, "goto", keyword},
+ {token.IF, "if", keyword},
+ {token.IMPORT, "import", keyword},
+
+ {token.INTERFACE, "interface", keyword},
+ {token.MAP, "map", keyword},
+ {token.PACKAGE, "package", keyword},
+ {token.RANGE, "range", keyword},
+ {token.RETURN, "return", keyword},
+
+ {token.SELECT, "select", keyword},
+ {token.STRUCT, "struct", keyword},
+ {token.SWITCH, "switch", keyword},
+ {token.TYPE, "type", keyword},
+ {token.VAR, "var", keyword},
}
@@ -196,18 +199,19 @@ func newlineCount(s string) int {
}
-func checkPos(t *testing.T, lit string, pos, expected token.Position) {
+func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) {
+ pos := fset.Position(p)
if pos.Filename != expected.Filename {
- t.Errorf("bad filename for %s: got %s, expected %s", lit, pos.Filename, expected.Filename)
+ t.Errorf("bad filename for %q: got %s, expected %s", lit, pos.Filename, expected.Filename)
}
if pos.Offset != expected.Offset {
- t.Errorf("bad position for %s: got %d, expected %d", lit, pos.Offset, expected.Offset)
+ t.Errorf("bad position for %q: got %d, expected %d", lit, pos.Offset, expected.Offset)
}
if pos.Line != expected.Line {
- t.Errorf("bad line for %s: got %d, expected %d", lit, pos.Line, expected.Line)
+ t.Errorf("bad line for %q: got %d, expected %d", lit, pos.Line, expected.Line)
}
if pos.Column != expected.Column {
- t.Errorf("bad column for %s: got %d, expected %d", lit, pos.Column, expected.Column)
+ t.Errorf("bad column for %q: got %d, expected %d", lit, pos.Column, expected.Column)
}
}
@@ -219,66 +223,76 @@ func TestScan(t *testing.T) {
for _, e := range tokens {
src += e.lit + whitespace
}
- src_linecount := newlineCount(src)
+ src_linecount := newlineCount(src) + 1
whitespace_linecount := newlineCount(whitespace)
// verify scan
+ var s Scanner
+ s.Init(fset, "", []byte(src), &testErrorHandler{t}, ScanComments)
index := 0
epos := token.Position{"", 0, 1, 1} // expected position
- nerrors := Tokenize("", []byte(src), &testErrorHandler{t}, ScanComments,
- func(pos token.Position, tok token.Token, litb []byte) bool {
- e := elt{token.EOF, "", special}
- if index < len(tokens) {
- e = tokens[index]
- }
- lit := string(litb)
- if tok == token.EOF {
- lit = "<EOF>"
- epos.Line = src_linecount
- epos.Column = 1
- }
- checkPos(t, lit, pos, epos)
- if tok != e.tok {
- t.Errorf("bad token for %q: got %s, expected %s", lit, tok.String(), e.tok.String())
- }
- if e.tok.IsLiteral() && lit != e.lit {
- t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit)
- }
- if tokenclass(tok) != e.class {
- t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class)
- }
- epos.Offset += len(lit) + len(whitespace)
- epos.Line += newlineCount(lit) + whitespace_linecount
- if tok == token.COMMENT && litb[1] == '/' {
- // correct for unaccounted '\n' in //-style comment
- epos.Offset++
- epos.Line++
- }
- index++
- return tok != token.EOF
- })
- if nerrors != 0 {
- t.Errorf("found %d errors", nerrors)
+ for {
+ pos, tok, litb := s.Scan()
+ e := elt{token.EOF, "", special}
+ if index < len(tokens) {
+ e = tokens[index]
+ }
+ lit := string(litb)
+ if tok == token.EOF {
+ lit = "<EOF>"
+ epos.Line = src_linecount
+ epos.Column = 1
+ }
+ checkPos(t, lit, pos, epos)
+ if tok != e.tok {
+ t.Errorf("bad token for %q: got %s, expected %s", lit, tok.String(), e.tok.String())
+ }
+ if e.tok.IsLiteral() && lit != e.lit {
+ t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit)
+ }
+ if tokenclass(tok) != e.class {
+ t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class)
+ }
+ epos.Offset += len(lit) + len(whitespace)
+ epos.Line += newlineCount(lit) + whitespace_linecount
+ if tok == token.COMMENT && litb[1] == '/' {
+ // correct for unaccounted '\n' in //-style comment
+ epos.Offset++
+ epos.Line++
+ }
+ index++
+ if tok == token.EOF {
+ break
+ }
+ }
+ if s.ErrorCount != 0 {
+ t.Errorf("found %d errors", s.ErrorCount)
}
}
func checkSemi(t *testing.T, line string, mode uint) {
var S Scanner
- S.Init("TestSemis", []byte(line), nil, mode)
+ file := S.Init(fset, "TestSemis", []byte(line), nil, mode)
pos, tok, lit := S.Scan()
for tok != token.EOF {
if tok == token.ILLEGAL {
+ // the illegal token literal indicates what
+ // kind of semicolon literal to expect
+ semiLit := "\n"
+ if lit[0] == '#' {
+ semiLit = ";"
+ }
// next token must be a semicolon
- offs := pos.Offset + 1
+ semiPos := file.Position(pos)
+ semiPos.Offset++
+ semiPos.Column++
pos, tok, lit = S.Scan()
if tok == token.SEMICOLON {
- if pos.Offset != offs {
- t.Errorf("bad offset for %q: got %d, expected %d", line, pos.Offset, offs)
- }
- if string(lit) != ";" {
- t.Errorf(`bad literal for %q: got %q, expected ";"`, line, lit)
+ if string(lit) != semiLit {
+ t.Errorf(`bad literal for %q: got %q, expected %q`, line, lit, semiLit)
}
+ checkPos(t, line, pos, semiPos)
} else {
t.Errorf("bad token for %q: got %s, expected ;", line, tok.String())
}
@@ -291,9 +305,10 @@ func checkSemi(t *testing.T, line string, mode uint) {
var lines = []string{
- // the $ character indicates where a semicolon is expected
+ // # indicates a semicolon present in the source
+ // $ indicates an automatically inserted semicolon
"",
- "$;",
+ "#;",
"foo$\n",
"123$\n",
"1.2$\n",
@@ -354,7 +369,7 @@ var lines = []string{
")$\n",
"]$\n",
"}$\n",
- "$;\n",
+ "#;\n",
":\n",
"break$\n",
@@ -388,57 +403,66 @@ var lines = []string{
"var\n",
"foo$//comment\n",
+ "foo$//comment",
"foo$/*comment*/\n",
"foo$/*\n*/",
"foo$/*comment*/ \n",
"foo$/*\n*/ ",
+
"foo $// comment\n",
+ "foo $// comment",
"foo $/*comment*/\n",
"foo $/*\n*/",
-
- "foo $/*comment*/\n",
+ "foo $/* */ /* \n */ bar$/**/\n",
"foo $/*0*/ /*1*/ /*2*/\n",
+
"foo $/*comment*/ \n",
"foo $/*0*/ /*1*/ /*2*/ \n",
- "foo $/**/ /*-------------*/ /*----\n*/bar $/* \n*/baa",
+ "foo $/**/ /*-------------*/ /*----\n*/bar $/* \n*/baa$\n",
+ "foo $/* an EOF terminates a line */",
+ "foo $/* an EOF terminates a line */ /*",
+ "foo $/* an EOF terminates a line */ //",
"package main$\n\nfunc main() {\n\tif {\n\t\treturn /* */ }$\n}$\n",
+ "package main$",
}
func TestSemis(t *testing.T) {
for _, line := range lines {
checkSemi(t, line, AllowIllegalChars|InsertSemis)
- }
- for _, line := range lines {
checkSemi(t, line, AllowIllegalChars|InsertSemis|ScanComments)
+
+ // if the input ended in newlines, the input must tokenize the
+ // same with or without those newlines
+ for i := len(line) - 1; i >= 0 && line[i] == '\n'; i-- {
+ checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis)
+ checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis|ScanComments)
+ }
}
}
-type seg struct {
+var segments = []struct {
srcline string // a line of source text
filename string // filename for current token
line int // line number for current token
-}
-
-
-var segments = []seg{
+}{
// exactly one token per line since the test consumes one token per segment
- seg{" line1", "TestLineComments", 1},
- seg{"\nline2", "TestLineComments", 2},
- seg{"\nline3 //line File1.go:100", "TestLineComments", 3}, // bad line comment, ignored
- seg{"\nline4", "TestLineComments", 4},
- seg{"\n//line File1.go:100\n line100", "File1.go", 100},
- seg{"\n//line File2.go:200\n line200", "File2.go", 200},
- seg{"\n//line :1\n line1", "", 1},
- seg{"\n//line foo:42\n line42", "foo", 42},
- seg{"\n //line foo:42\n line44", "foo", 44}, // bad line comment, ignored
- seg{"\n//line foo 42\n line46", "foo", 46}, // bad line comment, ignored
- seg{"\n//line foo:42 extra text\n line48", "foo", 48}, // bad line comment, ignored
- seg{"\n//line foo:42\n line42", "foo", 42},
- seg{"\n//line foo:42\n line42", "foo", 42},
- seg{"\n//line File1.go:100\n line100", "File1.go", 100},
+ {" line1", "dir/TestLineComments", 1},
+ {"\nline2", "dir/TestLineComments", 2},
+ {"\nline3 //line File1.go:100", "dir/TestLineComments", 3}, // bad line comment, ignored
+ {"\nline4", "dir/TestLineComments", 4},
+ {"\n//line File1.go:100\n line100", "dir/File1.go", 100},
+ {"\n//line File2.go:200\n line200", "dir/File2.go", 200},
+ {"\n//line :1\n line1", "dir", 1},
+ {"\n//line foo:42\n line42", "dir/foo", 42},
+ {"\n //line foo:42\n line44", "dir/foo", 44}, // bad line comment, ignored
+ {"\n//line foo 42\n line46", "dir/foo", 46}, // bad line comment, ignored
+ {"\n//line foo:42 extra text\n line48", "dir/foo", 48}, // bad line comment, ignored
+ {"\n//line /bar:42\n line42", "/bar", 42},
+ {"\n//line ./foo:42\n line42", "dir/foo", 42},
+ {"\n//line a/b/c/File1.go:100\n line100", "dir/a/b/c/File1.go", 100},
}
@@ -452,10 +476,11 @@ func TestLineComments(t *testing.T) {
// verify scan
var S Scanner
- S.Init("TestLineComments", []byte(src), nil, 0)
+ file := S.Init(fset, "dir/TestLineComments", []byte(src), nil, 0)
for _, s := range segments {
- pos, _, lit := S.Scan()
- checkPos(t, string(lit), pos, token.Position{s.filename, pos.Offset, s.line, pos.Column})
+ p, _, lit := S.Scan()
+ pos := file.Position(p)
+ checkPos(t, string(lit), p, token.Position{s.filename, pos.Offset, s.line, pos.Column})
}
if S.ErrorCount != 0 {
@@ -469,7 +494,11 @@ func TestInit(t *testing.T) {
var s Scanner
// 1st init
- s.Init("", []byte("if true { }"), nil, 0)
+ src1 := "if true { }"
+ f1 := s.Init(fset, "", []byte(src1), nil, 0)
+ if f1.Size() != len(src1) {
+ t.Errorf("bad file size: got %d, expected %d", f1.Size(), len(src1))
+ }
s.Scan() // if
s.Scan() // true
_, tok, _ := s.Scan() // {
@@ -478,7 +507,11 @@ func TestInit(t *testing.T) {
}
// 2nd init
- s.Init("", []byte("go true { ]"), nil, 0)
+ src2 := "go true { ]"
+ f2 := s.Init(fset, "", []byte(src2), nil, 0)
+ if f2.Size() != len(src2) {
+ t.Errorf("bad file size: got %d, expected %d", f2.Size(), len(src2))
+ }
_, tok, _ = s.Scan() // go
if tok != token.GO {
t.Errorf("bad token: got %s, expected %s", tok.String(), token.GO)
@@ -494,11 +527,11 @@ func TestIllegalChars(t *testing.T) {
var s Scanner
const src = "*?*$*@*"
- s.Init("", []byte(src), &testErrorHandler{t}, AllowIllegalChars)
+ file := s.Init(fset, "", []byte(src), &testErrorHandler{t}, AllowIllegalChars)
for offs, ch := range src {
pos, tok, lit := s.Scan()
- if pos.Offset != offs {
- t.Errorf("bad position for %s: got %d, expected %d", string(lit), pos.Offset, offs)
+ if poffs := file.Offset(pos); poffs != offs {
+ t.Errorf("bad position for %s: got %d, expected %d", string(lit), poffs, offs)
}
if tok == token.ILLEGAL && string(lit) != string(ch) {
t.Errorf("bad token: got %s, expected %s", string(lit), string(ch))
@@ -522,10 +555,13 @@ func TestStdErrorHander(t *testing.T) {
"@ @ @" // original file, line 1 again
v := new(ErrorVector)
- nerrors := Tokenize("File1", []byte(src), v, 0,
- func(pos token.Position, tok token.Token, litb []byte) bool {
- return tok != token.EOF
- })
+ var s Scanner
+ s.Init(fset, "File1", []byte(src), v, 0)
+ for {
+ if _, tok, _ := s.Scan(); tok == token.EOF {
+ break
+ }
+ }
list := v.GetErrorList(Raw)
if len(list) != 9 {
@@ -545,8 +581,8 @@ func TestStdErrorHander(t *testing.T) {
PrintError(os.Stderr, list)
}
- if v.ErrorCount() != nerrors {
- t.Errorf("found %d errors, expected %d", v.ErrorCount(), nerrors)
+ if v.ErrorCount() != s.ErrorCount {
+ t.Errorf("found %d errors, expected %d", v.ErrorCount(), s.ErrorCount)
}
}
@@ -568,7 +604,7 @@ func (h *errorCollector) Error(pos token.Position, msg string) {
func checkError(t *testing.T, src string, tok token.Token, pos int, err string) {
var s Scanner
var h errorCollector
- s.Init("", []byte(src), &h, ScanComments)
+ s.Init(fset, "", []byte(src), &h, ScanComments)
_, tok0, _ := s.Scan()
_, tok1, _ := s.Scan()
if tok0 != tok {
@@ -593,28 +629,34 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string)
}
-type srcerr struct {
+var errors = []struct {
src string
tok token.Token
pos int
err string
-}
-
-var errors = []srcerr{
- srcerr{"\"\"", token.STRING, 0, ""},
- srcerr{"\"", token.STRING, 0, "string not terminated"},
- srcerr{"/**/", token.COMMENT, 0, ""},
- srcerr{"/*", token.COMMENT, 0, "comment not terminated"},
- srcerr{"//\n", token.COMMENT, 0, ""},
- srcerr{"//", token.COMMENT, 0, "comment not terminated"},
- srcerr{"077", token.INT, 0, ""},
- srcerr{"078.", token.FLOAT, 0, ""},
- srcerr{"07801234567.", token.FLOAT, 0, ""},
- srcerr{"078e0", token.FLOAT, 0, ""},
- srcerr{"078", token.INT, 0, "illegal octal number"},
- srcerr{"07800000009", token.INT, 0, "illegal octal number"},
- srcerr{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
- srcerr{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
+}{
+ {`#`, token.ILLEGAL, 0, "illegal character '#' (U+23)"},
+ {`' '`, token.CHAR, 0, ""},
+ {`''`, token.CHAR, 0, "illegal character literal"},
+ {`'\8'`, token.CHAR, 2, "unknown escape sequence"},
+ {`'\08'`, token.CHAR, 3, "illegal character in escape sequence"},
+ {`'\x0g'`, token.CHAR, 4, "illegal character in escape sequence"},
+ {`'\Uffffffff'`, token.CHAR, 2, "escape sequence is invalid Unicode code point"},
+ {`'`, token.CHAR, 0, "character literal not terminated"},
+ {`""`, token.STRING, 0, ""},
+ {`"`, token.STRING, 0, "string not terminated"},
+ {"``", token.STRING, 0, ""},
+ {"`", token.STRING, 0, "string not terminated"},
+ {"/**/", token.COMMENT, 0, ""},
+ {"/*", token.COMMENT, 0, "comment not terminated"},
+ {"077", token.INT, 0, ""},
+ {"078.", token.FLOAT, 0, ""},
+ {"07801234567.", token.FLOAT, 0, ""},
+ {"078e0", token.FLOAT, 0, ""},
+ {"078", token.INT, 0, "illegal octal number"},
+ {"07800000009", token.INT, 0, "illegal octal number"},
+ {"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
+ {"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
}