diff options
Diffstat (limited to 'src/pkg/go/scanner')
-rw-r--r-- | src/pkg/go/scanner/errors.go | 17 | ||||
-rw-r--r-- | src/pkg/go/scanner/scanner.go | 157 | ||||
-rw-r--r-- | src/pkg/go/scanner/scanner_test.go | 101 |
3 files changed, 164 insertions, 111 deletions
diff --git a/src/pkg/go/scanner/errors.go b/src/pkg/go/scanner/errors.go index a0927e416..cd9620b87 100644 --- a/src/pkg/go/scanner/errors.go +++ b/src/pkg/go/scanner/errors.go @@ -8,7 +8,6 @@ import ( "fmt" "go/token" "io" - "os" "sort" ) @@ -49,7 +48,7 @@ type Error struct { Msg string } -func (e *Error) String() string { +func (e *Error) Error() string { if e.Pos.Filename != "" || e.Pos.IsValid() { // don't print "<unknown position>" // TODO(gri) reconsider the semantics of Position.IsValid @@ -85,14 +84,14 @@ func (p ErrorList) Less(i, j int) bool { return false } -func (p ErrorList) String() string { +func (p ErrorList) Error() string { switch len(p) { case 0: return "unspecified error" case 1: - return p[0].String() + return p[0].Error() } - return fmt.Sprintf("%s (and %d more errors)", p[0].String(), len(p)-1) + return fmt.Sprintf("%s (and %d more errors)", p[0], len(p)-1) } // These constants control the construction of the ErrorList @@ -136,11 +135,11 @@ func (h *ErrorVector) GetErrorList(mode int) ErrorList { return list } -// GetError is like GetErrorList, but it returns an os.Error instead -// so that a nil result can be assigned to an os.Error variable and +// GetError is like GetErrorList, but it returns an error instead +// so that a nil result can be assigned to an error variable and // remains nil. // -func (h *ErrorVector) GetError(mode int) os.Error { +func (h *ErrorVector) GetError(mode int) error { if len(h.errors) == 0 { return nil } @@ -157,7 +156,7 @@ func (h *ErrorVector) Error(pos token.Position, msg string) { // one error per line, if the err parameter is an ErrorList. Otherwise // it prints the err string. 
// -func PrintError(w io.Writer, err os.Error) { +func PrintError(w io.Writer, err error) { if list, ok := err.(ErrorList); ok { for _, e := range list { fmt.Fprintf(w, "%s\n", e) diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go index 7f3dd2373..7c72c0a46 100644 --- a/src/pkg/go/scanner/scanner.go +++ b/src/pkg/go/scanner/scanner.go @@ -6,7 +6,7 @@ // source which can then be tokenized through repeated calls to the Scan // function. Typical use: // -// var s Scanner +// var s scanner.Scanner // fset := token.NewFileSet() // position information is relative to fset // file := fset.AddFile(filename, fset.Base(), len(src)) // register file // s.Init(file, src, nil /* no error handler */, 0) @@ -27,7 +27,7 @@ import ( "path/filepath" "strconv" "unicode" - "utf8" + "unicode/utf8" ) // A Scanner holds the scanner's internal state while processing @@ -40,10 +40,10 @@ type Scanner struct { dir string // directory portion of file.Name() src []byte // source err ErrorHandler // error reporting; or nil - mode uint // scanning mode + mode Mode // scanning mode // scanning state - ch int // current character + ch rune // current character offset int // character offset rdOffset int // reading offset (position after current character) lineOffset int // current line offset @@ -63,7 +63,7 @@ func (S *Scanner) next() { S.lineOffset = S.offset S.file.AddLine(S.offset) } - r, w := int(S.src[S.rdOffset]), 1 + r, w := rune(S.src[S.rdOffset]), 1 switch { case r == 0: S.error(S.offset, "illegal character NUL") @@ -86,13 +86,14 @@ func (S *Scanner) next() { } } -// The mode parameter to the Init function is a set of flags (or 0). +// A mode value is a set of flags (or 0). // They control scanner behavior. 
// +type Mode uint + const ( - ScanComments = 1 << iota // return comments as COMMENT tokens - AllowIllegalChars // do not report an error for illegal chars - InsertSemis // automatically insert semicolons + ScanComments Mode = 1 << iota // return comments as COMMENT tokens + dontInsertSemis // do not automatically insert semicolons - for testing only ) // Init prepares the scanner S to tokenize the text src by setting the @@ -105,12 +106,12 @@ const ( // Calls to Scan will use the error handler err if they encounter a // syntax error and err is not nil. Also, for each error encountered, // the Scanner field ErrorCount is incremented by one. The mode parameter -// determines how comments, illegal characters, and semicolons are handled. +// determines how comments are handled. // // Note that Init may call err if there is an error in the first character // of the file. // -func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) { +func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { // Explicitly initialize all fields since a scanner may be reused. 
if file.Size() != len(src) { panic("file size does not match src len") @@ -152,13 +153,13 @@ func (S *Scanner) interpretLineComment(text []byte) { filename = filepath.Join(S.dir, filename) } // update scanner position - S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line + S.file.AddLineInfo(S.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line } } } } -func (S *Scanner) scanComment() { +func (S *Scanner) scanComment() string { // initial '/' already consumed; S.ch == '/' || S.ch == '*' offs := S.offset - 1 // position of initial '/' @@ -172,7 +173,7 @@ func (S *Scanner) scanComment() { // comment starts at the beginning of the current line S.interpretLineComment(S.src[offs:S.offset]) } - return + goto exit } /*-style comment */ @@ -182,11 +183,14 @@ func (S *Scanner) scanComment() { S.next() if ch == '*' && S.ch == '/' { S.next() - return + goto exit } } S.error(offs, "comment not terminated") + +exit: + return string(S.src[offs:S.offset]) } func (S *Scanner) findLineEnd() bool { @@ -233,30 +237,30 @@ func (S *Scanner) findLineEnd() bool { return false } -func isLetter(ch int) bool { +func isLetter(ch rune) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch) } -func isDigit(ch int) bool { +func isDigit(ch rune) bool { return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch) } -func (S *Scanner) scanIdentifier() token.Token { +func (S *Scanner) scanIdentifier() string { offs := S.offset for isLetter(S.ch) || isDigit(S.ch) { S.next() } - return token.Lookup(S.src[offs:S.offset]) + return string(S.src[offs:S.offset]) } -func digitVal(ch int) int { +func digitVal(ch rune) int { switch { case '0' <= ch && ch <= '9': - return ch - '0' + return int(ch - '0') case 'a' <= ch && ch <= 'f': - return ch - 'a' + 10 + return int(ch - 'a' + 10) case 'A' <= ch && ch <= 'F': - return ch - 'A' + 10 + return int(ch - 'A' + 10) } 
return 16 // larger than any legal digit val } @@ -267,11 +271,13 @@ func (S *Scanner) scanMantissa(base int) { } } -func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token { +func (S *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { // digitVal(S.ch) < 10 + offs := S.offset tok := token.INT if seenDecimalPoint { + offs-- tok = token.FLOAT S.scanMantissa(10) goto exponent @@ -335,10 +341,10 @@ exponent: } exit: - return tok + return tok, string(S.src[offs:S.offset]) } -func (S *Scanner) scanEscape(quote int) { +func (S *Scanner) scanEscape(quote rune) { offs := S.offset var i, base, max uint32 @@ -382,7 +388,7 @@ func (S *Scanner) scanEscape(quote int) { } } -func (S *Scanner) scanChar() { +func (S *Scanner) scanChar() string { // '\'' opening already consumed offs := S.offset - 1 @@ -406,9 +412,11 @@ func (S *Scanner) scanChar() { if n != 1 { S.error(offs, "illegal character literal") } + + return string(S.src[offs:S.offset]) } -func (S *Scanner) scanString() { +func (S *Scanner) scanString() string { // '"' opening already consumed offs := S.offset - 1 @@ -425,15 +433,33 @@ func (S *Scanner) scanString() { } S.next() + + return string(S.src[offs:S.offset]) +} + +func stripCR(b []byte) []byte { + c := make([]byte, len(b)) + i := 0 + for _, ch := range b { + if ch != '\r' { + c[i] = ch + i++ + } + } + return c[:i] } -func (S *Scanner) scanRawString() { +func (S *Scanner) scanRawString() string { // '`' opening already consumed offs := S.offset - 1 + hasCR := false for S.ch != '`' { ch := S.ch S.next() + if ch == '\r' { + hasCR = true + } if ch < 0 { S.error(offs, "string not terminated") break @@ -441,6 +467,13 @@ func (S *Scanner) scanRawString() { } S.next() + + lit := S.src[offs:S.offset] + if hasCR { + lit = stripCR(lit) + } + + return string(lit) } func (S *Scanner) skipWhitespace() { @@ -463,7 +496,7 @@ func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token { return tok0 } -func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 
int, tok2 token.Token) token.Token { +func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { if S.ch == '=' { S.next() return tok1 @@ -475,7 +508,7 @@ func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) tok return tok0 } -func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token { +func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { if S.ch == '=' { S.next() return tok1 @@ -491,15 +524,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke return tok0 } -// Scan scans the next token and returns the token position, -// the token, and the literal string corresponding to the -// token. The source end is indicated by token.EOF. +// Scan scans the next token and returns the token position, the token, +// and its literal string if applicable. The source end is indicated by +// token.EOF. +// +// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, +// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string +// has the corresponding value. // // If the returned token is token.SEMICOLON, the corresponding // literal string is ";" if the semicolon was present in the source, // and "\n" if the semicolon was inserted because of a newline or // at EOF. // +// If the returned token is token.ILLEGAL, the literal string is the +// offending character. +// +// In all other cases, Scan returns an empty literal string. +// // For more tolerant parsing, Scan will return a valid token if // possible even if a syntax error was encountered. Thus, even // if the resulting token sequence contains no illegal tokens, @@ -511,33 +553,33 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke // set with Init. Token positions are relative to that file // and thus relative to the file set. 
// -func (S *Scanner) Scan() (token.Pos, token.Token, string) { +func (S *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { scanAgain: S.skipWhitespace() // current token start - insertSemi := false - offs := S.offset - tok := token.ILLEGAL + pos = S.file.Pos(S.offset) // determine token value + insertSemi := false switch ch := S.ch; { case isLetter(ch): - tok = S.scanIdentifier() + lit = S.scanIdentifier() + tok = token.Lookup(lit) switch tok { case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: insertSemi = true } case digitVal(ch) < 10: insertSemi = true - tok = S.scanNumber(false) + tok, lit = S.scanNumber(false) default: S.next() // always make progress switch ch { case -1: if S.insertSemi { S.insertSemi = false // EOF consumed - return S.file.Pos(offs), token.SEMICOLON, "\n" + return pos, token.SEMICOLON, "\n" } tok = token.EOF case '\n': @@ -545,25 +587,25 @@ scanAgain: // set in the first place and exited early // from S.skipWhitespace() S.insertSemi = false // newline consumed - return S.file.Pos(offs), token.SEMICOLON, "\n" + return pos, token.SEMICOLON, "\n" case '"': insertSemi = true tok = token.STRING - S.scanString() + lit = S.scanString() case '\'': insertSemi = true tok = token.CHAR - S.scanChar() + lit = S.scanChar() case '`': insertSemi = true tok = token.STRING - S.scanRawString() + lit = S.scanRawString() case ':': tok = S.switch2(token.COLON, token.DEFINE) case '.': if digitVal(S.ch) < 10 { insertSemi = true - tok = S.scanNumber(true) + tok, lit = S.scanNumber(true) } else if S.ch == '.' { S.next() if S.ch == '.' 
{ @@ -577,6 +619,7 @@ scanAgain: tok = token.COMMA case ';': tok = token.SEMICOLON + lit = ";" case '(': tok = token.LPAREN case ')': @@ -610,12 +653,12 @@ scanAgain: if S.insertSemi && S.findLineEnd() { // reset position to the beginning of the comment S.ch = '/' - S.offset = offs - S.rdOffset = offs + 1 + S.offset = S.file.Offset(pos) + S.rdOffset = S.offset + 1 S.insertSemi = false // newline consumed - return S.file.Pos(offs), token.SEMICOLON, "\n" + return pos, token.SEMICOLON, "\n" } - S.scanComment() + lit = S.scanComment() if S.mode&ScanComments == 0 { // skip comment S.insertSemi = false // newline consumed @@ -652,19 +695,15 @@ scanAgain: case '|': tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) default: - if S.mode&AllowIllegalChars == 0 { - S.error(offs, fmt.Sprintf("illegal character %#U", ch)) - } + S.error(S.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) insertSemi = S.insertSemi // preserve insertSemi info + tok = token.ILLEGAL + lit = string(ch) } } - - if S.mode&InsertSemis != 0 { + if S.mode&dontInsertSemis == 0 { S.insertSemi = insertSemi } - // TODO(gri): The scanner API should change such that the literal string - // is only valid if an actual literal was scanned. This will - // permit a more efficient implementation. - return S.file.Pos(offs), tok, string(S.src[offs:S.offset]) + return } diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go index eb9e1cb81..af45bc5b1 100644 --- a/src/pkg/go/scanner/scanner_test.go +++ b/src/pkg/go/scanner/scanner_test.go @@ -83,6 +83,8 @@ var tokens = [...]elt{ "`", literal, }, + {token.STRING, "`\r`", literal}, + {token.STRING, "`foo\r\nbar`", literal}, // Operators and delimiters {token.ADD, "+", operator}, @@ -175,6 +177,15 @@ var tokens = [...]elt{ const whitespace = " \t \n\n\n" // to separate tokens +var source = func() []byte { + var src []byte + for _, t := range tokens { + src = append(src, t.lit...) + src = append(src, whitespace...) 
+ } + return src +}() + type testErrorHandler struct { t *testing.T } @@ -212,20 +223,20 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) { // Verify that calling Scan() provides the correct results. func TestScan(t *testing.T) { // make source - var src string - for _, e := range tokens { - src += e.lit + whitespace - } - src_linecount := newlineCount(src) + src_linecount := newlineCount(string(source)) whitespace_linecount := newlineCount(whitespace) // verify scan var s Scanner - s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), &testErrorHandler{t}, ScanComments) + s.Init(fset.AddFile("", fset.Base(), len(source)), source, &testErrorHandler{t}, ScanComments|dontInsertSemis) index := 0 epos := token.Position{"", 0, 1, 1} // expected position for { pos, tok, lit := s.Scan() + if lit == "" { + // no literal value for non-literal tokens + lit = tok.String() + } e := elt{token.EOF, "", special} if index < len(tokens) { e = tokens[index] @@ -237,10 +248,18 @@ func TestScan(t *testing.T) { } checkPos(t, lit, pos, epos) if tok != e.tok { - t.Errorf("bad token for %q: got %s, expected %s", lit, tok.String(), e.tok.String()) + t.Errorf("bad token for %q: got %s, expected %s", lit, tok, e.tok) } - if e.tok.IsLiteral() && lit != e.lit { - t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit) + if e.tok.IsLiteral() { + // no CRs in raw string literals + elit := e.lit + if elit[0] == '`' { + elit = string(stripCR([]byte(elit))) + epos.Offset += len(e.lit) - len(lit) // correct position + } + if lit != elit { + t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, elit) + } } if tokenclass(tok) != e.class { t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class) @@ -262,7 +281,7 @@ func TestScan(t *testing.T) { } } -func checkSemi(t *testing.T, line string, mode uint) { +func checkSemi(t *testing.T, line string, mode Mode) { var S Scanner file := fset.AddFile("TestSemis", fset.Base(), 
len(line)) S.Init(file, []byte(line), nil, mode) @@ -286,7 +305,7 @@ func checkSemi(t *testing.T, line string, mode uint) { } checkPos(t, line, pos, semiPos) } else { - t.Errorf("bad token for %q: got %s, expected ;", line, tok.String()) + t.Errorf("bad token for %q: got %s, expected ;", line, tok) } } else if tok == token.SEMICOLON { t.Errorf("bad token for %q: got ;, expected no ;", line) @@ -420,14 +439,14 @@ var lines = []string{ func TestSemis(t *testing.T) { for _, line := range lines { - checkSemi(t, line, AllowIllegalChars|InsertSemis) - checkSemi(t, line, AllowIllegalChars|InsertSemis|ScanComments) + checkSemi(t, line, 0) + checkSemi(t, line, ScanComments) // if the input ended in newlines, the input must tokenize the // same with or without those newlines for i := len(line) - 1; i >= 0 && line[i] == '\n'; i-- { - checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis) - checkSemi(t, line[0:i], AllowIllegalChars|InsertSemis|ScanComments) + checkSemi(t, line[0:i], 0) + checkSemi(t, line[0:i], ScanComments) } } } @@ -482,7 +501,7 @@ func TestLineComments(t *testing.T) { // verify scan var S Scanner file := fset.AddFile(filepath.Join("dir", "TestLineComments"), fset.Base(), len(src)) - S.Init(file, []byte(src), nil, 0) + S.Init(file, []byte(src), nil, dontInsertSemis) for _, s := range segs { p, _, lit := S.Scan() pos := file.Position(p) @@ -501,7 +520,7 @@ func TestInit(t *testing.T) { // 1st init src1 := "if true { }" f1 := fset.AddFile("src1", fset.Base(), len(src1)) - s.Init(f1, []byte(src1), nil, 0) + s.Init(f1, []byte(src1), nil, dontInsertSemis) if f1.Size() != len(src1) { t.Errorf("bad file size: got %d, expected %d", f1.Size(), len(src1)) } @@ -509,40 +528,19 @@ func TestInit(t *testing.T) { s.Scan() // true _, tok, _ := s.Scan() // { if tok != token.LBRACE { - t.Errorf("bad token: got %s, expected %s", tok.String(), token.LBRACE) + t.Errorf("bad token: got %s, expected %s", tok, token.LBRACE) } // 2nd init src2 := "go true { ]" f2 := 
fset.AddFile("src2", fset.Base(), len(src2)) - s.Init(f2, []byte(src2), nil, 0) + s.Init(f2, []byte(src2), nil, dontInsertSemis) if f2.Size() != len(src2) { t.Errorf("bad file size: got %d, expected %d", f2.Size(), len(src2)) } _, tok, _ = s.Scan() // go if tok != token.GO { - t.Errorf("bad token: got %s, expected %s", tok.String(), token.GO) - } - - if s.ErrorCount != 0 { - t.Errorf("found %d errors", s.ErrorCount) - } -} - -func TestIllegalChars(t *testing.T) { - var s Scanner - - const src = "*?*$*@*" - file := fset.AddFile("", fset.Base(), len(src)) - s.Init(file, []byte(src), &testErrorHandler{t}, AllowIllegalChars) - for offs, ch := range src { - pos, tok, lit := s.Scan() - if poffs := file.Offset(pos); poffs != offs { - t.Errorf("bad position for %s: got %d, expected %d", lit, poffs, offs) - } - if tok == token.ILLEGAL && lit != string(ch) { - t.Errorf("bad token: got %s, expected %s", lit, string(ch)) - } + t.Errorf("bad token: got %s, expected %s", tok, token.GO) } if s.ErrorCount != 0 { @@ -562,7 +560,7 @@ func TestStdErrorHander(t *testing.T) { v := new(ErrorVector) var s Scanner - s.Init(fset.AddFile("File1", fset.Base(), len(src)), []byte(src), v, 0) + s.Init(fset.AddFile("File1", fset.Base(), len(src)), []byte(src), v, dontInsertSemis) for { if _, tok, _ := s.Scan(); tok == token.EOF { break @@ -607,7 +605,7 @@ func (h *errorCollector) Error(pos token.Position, msg string) { func checkError(t *testing.T, src string, tok token.Token, pos int, err string) { var s Scanner var h errorCollector - s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), &h, ScanComments) + s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), &h, ScanComments|dontInsertSemis) _, tok0, _ := s.Scan() _, tok1, _ := s.Scan() if tok0 != tok { @@ -670,3 +668,20 @@ func TestScanErrors(t *testing.T) { checkError(t, e.src, e.tok, e.pos, e.err) } } + +func BenchmarkScan(b *testing.B) { + b.StopTimer() + fset := token.NewFileSet() + file := fset.AddFile("", fset.Base(), 
len(source)) + var s Scanner + b.StartTimer() + for i := b.N - 1; i >= 0; i-- { + s.Init(file, source, nil, ScanComments) + for { + _, tok, _ := s.Scan() + if tok == token.EOF { + break + } + } + } +} |