3 files changed, 115 insertions, 42 deletions
diff --git a/src/pkg/go/scanner/errors.go b/src/pkg/go/scanner/errors.go
index 8a75a9650..22de69c3c 100644
--- a/src/pkg/go/scanner/errors.go
+++ b/src/pkg/go/scanner/errors.go
@@ -120,7 +120,7 @@ func PrintError(w io.Writer, err error) {
 		for _, e := range list {
 			fmt.Fprintf(w, "%s\n", e)
 		}
-	} else {
+	} else if err != nil {
 		fmt.Fprintf(w, "%s\n", err)
 	}
 }
diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go
index da508747a..3322c58b3 100644
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -81,7 +81,7 @@ func (s *Scanner) next() {
 	}
 }
 
-// A mode value is set of flags (or 0).
+// A mode value is a set of flags (or 0).
 // They control scanner behavior.
 //
 type Mode uint
@@ -125,6 +125,9 @@ func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode
 	s.ErrorCount = 0
 
 	s.next()
+	if s.ch == '\uFEFF' {
+		s.next() // ignore BOM
+	}
 }
 
 func (s *Scanner) error(offs int, msg string) {
@@ -157,11 +160,15 @@ func (s *Scanner) interpretLineComment(text []byte) {
 func (s *Scanner) scanComment() string {
 	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
 	offs := s.offset - 1 // position of initial '/'
+	hasCR := false
 
 	if s.ch == '/' {
 		//-style comment
 		s.next()
 		for s.ch != '\n' && s.ch >= 0 {
+			if s.ch == '\r' {
+				hasCR = true
+			}
 			s.next()
 		}
 		if offs == s.lineOffset {
@@ -175,6 +182,9 @@ func (s *Scanner) scanComment() string {
 	s.next()
 	for s.ch >= 0 {
 		ch := s.ch
+		if ch == '\r' {
+			hasCR = true
+		}
 		s.next()
 		if ch == '*' && s.ch == '/' {
 			s.next()
@@ -185,7 +195,12 @@ func (s *Scanner) scanComment() string {
 	s.error(offs, "comment not terminated")
 
 exit:
-	return string(s.src[offs:s.offset])
+	lit := s.src[offs:s.offset]
+	if hasCR {
+		lit = stripCR(lit)
+	}
+
+	return string(lit)
 }
 
 func (s *Scanner) findLineEnd() bool {
@@ -378,7 +393,7 @@ func (s *Scanner) scanEscape(quote rune) {
 	for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
 		s.next()
 	}
-	if x > max || 0xd800 <= x && x < 0xe000 {
+	if x > max || 0xD800 <= x && x < 0xE000 {
 		s.error(offs, "escape sequence is invalid Unicode code point")
 	}
 }
@@ -527,6 +542,8 @@ func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Tok
 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
 // has the corresponding value.
 //
+// If the returned token is a keyword, the literal string is the keyword.
+//
 // If the returned token is token.SEMICOLON, the corresponding
 // literal string is ";" if the semicolon was present in the source,
 // and "\n" if the semicolon was inserted because of a newline or
@@ -560,12 +577,18 @@ scanAgain:
 	switch ch := s.ch; {
 	case isLetter(ch):
 		lit = s.scanIdentifier()
-		tok = token.Lookup(lit)
-		switch tok {
-		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
+		if len(lit) > 1 {
+			// keywords are longer than one letter - avoid lookup otherwise
+			tok = token.Lookup(lit)
+			switch tok {
+			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
+				insertSemi = true
+			}
+		} else {
 			insertSemi = true
+			tok = token.IDENT
 		}
-	case digitVal(ch) < 10:
+	case '0' <= ch && ch <= '9':
 		insertSemi = true
 		tok, lit = s.scanNumber(false)
 	default:
@@ -598,7 +621,7 @@ scanAgain:
 		case ':':
 			tok = s.switch2(token.COLON, token.DEFINE)
 		case '.':
-			if digitVal(s.ch) < 10 {
+			if '0' <= s.ch && s.ch <= '9' {
 				insertSemi = true
 				tok, lit = s.scanNumber(true)
 			} else if s.ch == '.' {
diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go
index 06223e23b..1c19053e6 100644
--- a/src/pkg/go/scanner/scanner_test.go
+++ b/src/pkg/go/scanner/scanner_test.go
@@ -6,6 +6,7 @@ package scanner
 
 import (
 	"go/token"
+	"io/ioutil"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -43,12 +44,16 @@ var tokens = [...]elt{
 	// Special tokens
 	{token.COMMENT, "/* a comment */", special},
 	{token.COMMENT, "// a comment \n", special},
+	{token.COMMENT, "/*\r*/", special},
+	{token.COMMENT, "//\r\n", special},
 
 	// Identifiers and basic type literals
 	{token.IDENT, "foobar", literal},
 	{token.IDENT, "a۰۱۸", literal},
 	{token.IDENT, "foo६४", literal},
 	{token.IDENT, "bar９８７６", literal},
+	{token.IDENT, "ŝ", literal},    // was bug (issue 4000)
+	{token.IDENT, "ŝfoo", literal}, // was bug (issue 4000)
 	{token.INT, "0", literal},
 	{token.INT, "1", literal},
 	{token.INT, "123456789012345678890", literal},
@@ -214,8 +219,6 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) {
 
 // Verify that calling Scan() provides the correct results.
 func TestScan(t *testing.T) {
-	// make source
-	src_linecount := newlineCount(string(source))
 	whitespace_linecount := newlineCount(whitespace)
 
 	// error handler
@@ -226,59 +229,81 @@ func TestScan(t *testing.T) {
 	// verify scan
 	var s Scanner
 	s.Init(fset.AddFile("", fset.Base(), len(source)), source, eh, ScanComments|dontInsertSemis)
-	index := 0
-	// epos is the expected position
+
+	// set up expected position
 	epos := token.Position{
 		Filename: "",
 		Offset:   0,
 		Line:     1,
 		Column:   1,
 	}
+
+	index := 0
 	for {
 		pos, tok, lit := s.Scan()
-		if lit == "" {
-			// no literal value for non-literal tokens
-			lit = tok.String()
+
+		// check position
+		if tok == token.EOF {
+			// correction for EOF
+			epos.Line = newlineCount(string(source))
+			epos.Column = 2
 		}
+		checkPos(t, lit, pos, epos)
+
+		// check token
 		e := elt{token.EOF, "", special}
 		if index < len(tokens) {
 			e = tokens[index]
+			index++
 		}
-		if tok == token.EOF {
-			lit = "<EOF>"
-			epos.Line = src_linecount
-			epos.Column = 2
-		}
-		checkPos(t, lit, pos, epos)
 		if tok != e.tok {
 			t.Errorf("bad token for %q: got %s, expected %s", lit, tok, e.tok)
 		}
-		if e.tok.IsLiteral() {
-			// no CRs in raw string literals
-			elit := e.lit
-			if elit[0] == '`' {
-				elit = string(stripCR([]byte(elit)))
-				epos.Offset += len(e.lit) - len(lit) // correct position
-			}
-			if lit != elit {
-				t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit)
-			}
-		}
+
+		// check token class
 		if tokenclass(tok) != e.class {
 			t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class)
 		}
-		epos.Offset += len(lit) + len(whitespace)
-		epos.Line += newlineCount(lit) + whitespace_linecount
-		if tok == token.COMMENT && lit[1] == '/' {
-			// correct for unaccounted '/n' in //-style comment
-			epos.Offset++
-			epos.Line++
+
+		// check literal
+		elit := ""
+		switch e.tok {
+		case token.COMMENT:
+			// no CRs in comments
+			elit = string(stripCR([]byte(e.lit)))
+			//-style comment literal doesn't contain newline
+			if elit[1] == '/' {
+				elit = elit[0 : len(elit)-1]
+			}
+		case token.IDENT:
+			elit = e.lit
+		case token.SEMICOLON:
+			elit = ";"
+		default:
+			if e.tok.IsLiteral() {
+				// no CRs in raw string literals
+				elit = e.lit
+				if elit[0] == '`' {
+					elit = string(stripCR([]byte(elit)))
+				}
+			} else if e.tok.IsKeyword() {
+				elit = e.lit
+			}
+		}
+		if lit != elit {
+			t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit)
 		}
-		index++
+
 		if tok == token.EOF {
 			break
 		}
+
+		// update position
+		epos.Offset += len(e.lit) + len(whitespace)
+		epos.Line += newlineCount(e.lit) + whitespace_linecount
+
 	}
+
 	if s.ErrorCount != 0 {
 		t.Errorf("found %d errors", s.ErrorCount)
 	}
@@ -321,6 +346,7 @@ var lines = []string{
 	// # indicates a semicolon present in the source
 	// $ indicates an automatically inserted semicolon
 	"",
+	"\ufeff#;", // first BOM is ignored
 	"#;",
 	"foo$\n",
 	"123$\n",
@@ -521,7 +547,7 @@ func TestLineComments(t *testing.T) {
 	}
 }
 
-// Verify that initializing the same scanner more then once works correctly.
+// Verify that initializing the same scanner more than once works correctly.
 func TestInit(t *testing.T) {
 	var s Scanner
 
@@ -669,6 +695,7 @@ var errors = []struct {
 	{"0X", token.INT, 0, "illegal hexadecimal number"},
 	{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
 	{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
+	{"\ufeff\ufeff", token.ILLEGAL, 3, "illegal character U+FEFF"}, // only first BOM is ignored
 }
 
 func TestScanErrors(t *testing.T) {
@@ -683,7 +710,7 @@ func BenchmarkScan(b *testing.B) {
 	file := fset.AddFile("", fset.Base(), len(source))
 	var s Scanner
 	b.StartTimer()
-	for i := b.N - 1; i >= 0; i-- {
+	for i := 0; i < b.N; i++ {
 		s.Init(file, source, nil, ScanComments)
 		for {
 			_, tok, _ := s.Scan()
@@ -693,3 +720,26 @@ func BenchmarkScan(b *testing.B) {
 		}
 	}
 }
+
+func BenchmarkScanFile(b *testing.B) {
+	b.StopTimer() // keep file loading and file-set setup out of the measurement
+	const filename = "scanner.go"
+	src, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err) // fail this benchmark via the testing framework rather than panicking
+	}
+	fset := token.NewFileSet()
+	file := fset.AddFile(filename, fset.Base(), len(src))
+	b.SetBytes(int64(len(src))) // each iteration scans the whole file once
+	var s Scanner
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		s.Init(file, src, nil, ScanComments)
+		for {
+			_, tok, _ := s.Scan()
+			if tok == token.EOF {
+				break
+			}
+		}
+	}
+}