diff options
Diffstat (limited to 'src/pkg/go/scanner')
-rw-r--r-- | src/pkg/go/scanner/errors.go | 2 | ||||
-rw-r--r-- | src/pkg/go/scanner/scanner.go | 39 | ||||
-rw-r--r-- | src/pkg/go/scanner/scanner_test.go | 116 |
3 files changed, 115 insertions, 42 deletions
diff --git a/src/pkg/go/scanner/errors.go b/src/pkg/go/scanner/errors.go index 8a75a9650..22de69c3c 100644 --- a/src/pkg/go/scanner/errors.go +++ b/src/pkg/go/scanner/errors.go @@ -120,7 +120,7 @@ func PrintError(w io.Writer, err error) { for _, e := range list { fmt.Fprintf(w, "%s\n", e) } - } else { + } else if err != nil { fmt.Fprintf(w, "%s\n", err) } } diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go index da508747a..3322c58b3 100644 --- a/src/pkg/go/scanner/scanner.go +++ b/src/pkg/go/scanner/scanner.go @@ -81,7 +81,7 @@ func (s *Scanner) next() { } } -// A mode value is set of flags (or 0). +// A mode value is a set of flags (or 0). // They control scanner behavior. // type Mode uint @@ -125,6 +125,9 @@ func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode s.ErrorCount = 0 s.next() + if s.ch == '\uFEFF' { + s.next() // ignore BOM + } } func (s *Scanner) error(offs int, msg string) { @@ -157,11 +160,15 @@ func (s *Scanner) interpretLineComment(text []byte) { func (s *Scanner) scanComment() string { // initial '/' already consumed; s.ch == '/' || s.ch == '*' offs := s.offset - 1 // position of initial '/' + hasCR := false if s.ch == '/' { //-style comment s.next() for s.ch != '\n' && s.ch >= 0 { + if s.ch == '\r' { + hasCR = true + } s.next() } if offs == s.lineOffset { @@ -175,6 +182,9 @@ func (s *Scanner) scanComment() string { s.next() for s.ch >= 0 { ch := s.ch + if ch == '\r' { + hasCR = true + } s.next() if ch == '*' && s.ch == '/' { s.next() @@ -185,7 +195,12 @@ func (s *Scanner) scanComment() string { s.error(offs, "comment not terminated") exit: - return string(s.src[offs:s.offset]) + lit := s.src[offs:s.offset] + if hasCR { + lit = stripCR(lit) + } + + return string(lit) } func (s *Scanner) findLineEnd() bool { @@ -378,7 +393,7 @@ func (s *Scanner) scanEscape(quote rune) { for ; i > 0 && s.ch != quote && s.ch >= 0; i-- { s.next() } - if x > max || 0xd800 <= x && x < 0xe000 { + if x > max || 0xD800 <= x && x < 0xE000 { s.error(offs, "escape sequence is invalid Unicode code point") } } @@ -527,6 +542,8 @@ func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Tok // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string // has the corresponding value. // +// If the returned token is a keyword, the literal string is the keyword. +// // If the returned token is token.SEMICOLON, the corresponding // literal string is ";" if the semicolon was present in the source, // and "\n" if the semicolon was inserted because of a newline or @@ -560,12 +577,18 @@ scanAgain: switch ch := s.ch; { case isLetter(ch): lit = s.scanIdentifier() - tok = token.Lookup(lit) - switch tok { - case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: + if len(lit) > 1 { + // keywords are longer than one letter - avoid lookup otherwise + tok = token.Lookup(lit) + switch tok { + case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: + insertSemi = true + } + } else { insertSemi = true + tok = token.IDENT } - case digitVal(ch) < 10: + case '0' <= ch && ch <= '9': insertSemi = true tok, lit = s.scanNumber(false) default: @@ -598,7 +621,7 @@ scanAgain: case ':': tok = s.switch2(token.COLON, token.DEFINE) case '.': - if digitVal(s.ch) < 10 { + if '0' <= s.ch && s.ch <= '9' { insertSemi = true tok, lit = s.scanNumber(true) } else if s.ch == '.' { diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go index 06223e23b..1c19053e6 100644 --- a/src/pkg/go/scanner/scanner_test.go +++ b/src/pkg/go/scanner/scanner_test.go @@ -6,6 +6,7 @@ package scanner import ( "go/token" + "io/ioutil" "os" "path/filepath" "runtime" @@ -43,12 +44,16 @@ var tokens = [...]elt{ // Special tokens {token.COMMENT, "/* a comment */", special}, {token.COMMENT, "// a comment \n", special}, + {token.COMMENT, "/*\r*/", special}, + {token.COMMENT, "//\r\n", special}, // Identifiers and basic type literals {token.IDENT, "foobar", literal}, {token.IDENT, "a۰۱۸", literal}, {token.IDENT, "foo६४", literal}, {token.IDENT, "bar9876", literal}, + {token.IDENT, "ŝ", literal}, // was bug (issue 4000) + {token.IDENT, "ŝfoo", literal}, // was bug (issue 4000) {token.INT, "0", literal}, {token.INT, "1", literal}, {token.INT, "123456789012345678890", literal}, @@ -214,8 +219,6 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) { // Verify that calling Scan() provides the correct results. func TestScan(t *testing.T) { - // make source - src_linecount := newlineCount(string(source)) whitespace_linecount := newlineCount(whitespace) // error handler @@ -226,59 +229,81 @@ func TestScan(t *testing.T) { // verify scan var s Scanner s.Init(fset.AddFile("", fset.Base(), len(source)), source, eh, ScanComments|dontInsertSemis) - index := 0 - // epos is the expected position + + // set up expected position epos := token.Position{ Filename: "", Offset: 0, Line: 1, Column: 1, } + + index := 0 for { pos, tok, lit := s.Scan() - if lit == "" { - // no literal value for non-literal tokens - lit = tok.String() + + // check position + if tok == token.EOF { + // correction for EOF + epos.Line = newlineCount(string(source)) + epos.Column = 2 } + checkPos(t, lit, pos, epos) + + // check token e := elt{token.EOF, "", special} if index < len(tokens) { e = tokens[index] + index++ } - if tok == token.EOF { - lit = "<EOF>" - epos.Line = src_linecount - epos.Column = 2 - } - checkPos(t, lit, pos, epos) if tok != e.tok { t.Errorf("bad token for %q: got %s, expected %s", lit, tok, e.tok) } - if e.tok.IsLiteral() { - // no CRs in raw string literals - elit := e.lit - if elit[0] == '`' { - elit = string(stripCR([]byte(elit))) - epos.Offset += len(e.lit) - len(lit) // correct position - } - if lit != elit { - t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit) - } - } + + // check token class if tokenclass(tok) != e.class { t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class) } - epos.Offset += len(lit) + len(whitespace) - epos.Line += newlineCount(lit) + whitespace_linecount - if tok == token.COMMENT && lit[1] == '/' { - // correct for unaccounted '/n' in //-style comment - epos.Offset++ - epos.Line++ + + // check literal + elit := "" + switch e.tok { + case token.COMMENT: + // no CRs in comments + elit = string(stripCR([]byte(e.lit))) + //-style comment literal doesn't contain newline + if elit[1] == '/' { + elit = elit[0 : len(elit)-1] + } + case token.IDENT: + elit = e.lit + case token.SEMICOLON: + elit = ";" + default: + if e.tok.IsLiteral() { + // no CRs in raw string literals + elit = e.lit + if elit[0] == '`' { + elit = string(stripCR([]byte(elit))) + } + } else if e.tok.IsKeyword() { + elit = e.lit + } + } + if lit != elit { + t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit) } - index++ + if tok == token.EOF { break } + + // update position + epos.Offset += len(e.lit) + len(whitespace) + epos.Line += newlineCount(e.lit) + whitespace_linecount + } + if s.ErrorCount != 0 { t.Errorf("found %d errors", s.ErrorCount) } @@ -321,6 +346,7 @@ var lines = []string{ // # indicates a semicolon present in the source // $ indicates an automatically inserted semicolon "", + "\ufeff#;", // first BOM is ignored "#;", "foo$\n", "123$\n", @@ -521,7 +547,7 @@ func TestLineComments(t *testing.T) { } } -// Verify that initializing the same scanner more then once works correctly. +// Verify that initializing the same scanner more than once works correctly. func TestInit(t *testing.T) { var s Scanner @@ -669,6 +695,7 @@ var errors = []struct { {"0X", token.INT, 0, "illegal hexadecimal number"}, {"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"}, {"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"}, + {"\ufeff\ufeff", token.ILLEGAL, 3, "illegal character U+FEFF"}, // only first BOM is ignored } func TestScanErrors(t *testing.T) { @@ -683,7 +710,7 @@ func BenchmarkScan(b *testing.B) { file := fset.AddFile("", fset.Base(), len(source)) var s Scanner b.StartTimer() - for i := b.N - 1; i >= 0; i-- { + for i := 0; i < b.N; i++ { s.Init(file, source, nil, ScanComments) for { _, tok, _ := s.Scan() @@ -693,3 +720,26 @@ func BenchmarkScan(b *testing.B) { } } } + +func BenchmarkScanFile(b *testing.B) { + b.StopTimer() + const filename = "scanner.go" + src, err := ioutil.ReadFile(filename) + if err != nil { + panic(err) + } + fset := token.NewFileSet() + file := fset.AddFile(filename, fset.Base(), len(src)) + b.SetBytes(int64(len(src))) + var s Scanner + b.StartTimer() + for i := 0; i < b.N; i++ { + s.Init(file, src, nil, ScanComments) + for { + _, tok, _ := s.Scan() + if tok == token.EOF { + break + } + } + } +} |