Imported Upstream version 2012.01.27 upstream-weekly/2012.01.27

author: Ondřej Surý <ondrej@sury.org> 2012-01-30 15:38:19 +0100
committer: Ondřej Surý <ondrej@sury.org> 2012-01-30 15:38:19 +0100
commit: 4cecda6c347bd6902b960c6a35a967add7070b0d (patch)
tree: a462e224ff41ec9f3eb1a0b6e815806f9e8804ad /src/pkg/go/scanner/scanner.go
parent: 6c7ca6e4d4e26e4c8cbe0d183966011b3b088a0a (diff)
download: golang-4cecda6c347bd6902b960c6a35a967add7070b0d.tar.gz
1 files changed, 98 insertions, 59 deletions
diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go
index 7f3dd2373..7c72c0a46 100644
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -6,7 +6,7 @@
 // source which can then be tokenized through repeated calls to the Scan
 // function. Typical use:
 //
-//	var s Scanner
+//	var s scanner.Scanner
 //	fset := token.NewFileSet()  // position information is relative to fset
 //      file := fset.AddFile(filename, fset.Base(), len(src))  // register file
 //	s.Init(file, src, nil /* no error handler */, 0)
@@ -27,7 +27,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"unicode"
-	"utf8"
+	"unicode/utf8"
 )
 
 // A Scanner holds the scanner's internal state while processing
@@ -40,10 +40,10 @@ type Scanner struct {
 	dir  string       // directory portion of file.Name()
 	src  []byte       // source
 	err  ErrorHandler // error reporting; or nil
-	mode uint         // scanning mode
+	mode Mode         // scanning mode
 
 	// scanning state
-	ch         int  // current character
+	ch         rune // current character
 	offset     int  // character offset
 	rdOffset   int  // reading offset (position after current character)
 	lineOffset int  // current line offset
@@ -63,7 +63,7 @@ func (S *Scanner) next() {
 			S.lineOffset = S.offset
 			S.file.AddLine(S.offset)
 		}
-		r, w := int(S.src[S.rdOffset]), 1
+		r, w := rune(S.src[S.rdOffset]), 1
 		switch {
 		case r == 0:
 			S.error(S.offset, "illegal character NUL")
@@ -86,13 +86,14 @@ func (S *Scanner) next() {
 	}
 }
 
-// The mode parameter to the Init function is a set of flags (or 0).
+// A mode value is a set of flags (or 0).
 // They control scanner behavior.
 //
+type Mode uint
+
 const (
-	ScanComments      = 1 << iota // return comments as COMMENT tokens
-	AllowIllegalChars             // do not report an error for illegal chars
-	InsertSemis                   // automatically insert semicolons
+	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
+	dontInsertSemis                  // do not automatically insert semicolons - for testing only
 )
 
 // Init prepares the scanner S to tokenize the text src by setting the
@@ -105,12 +106,12 @@ const (
 // Calls to Scan will use the error handler err if they encounter a
 // syntax error and err is not nil. Also, for each error encountered,
 // the Scanner field ErrorCount is incremented by one. The mode parameter
-// determines how comments, illegal characters, and semicolons are handled.
+// determines how comments are handled.
 //
 // Note that Init may call err if there is an error in the first character
 // of the file.
 //
-func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
+func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
 	// Explicitly initialize all fields since a scanner may be reused.
 	if file.Size() != len(src) {
 		panic("file size does not match src len")
@@ -152,13 +153,13 @@ func (S *Scanner) interpretLineComment(text []byte) {
 					filename = filepath.Join(S.dir, filename)
 				}
 				// update scanner position
-				S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
+				S.file.AddLineInfo(S.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
 			}
 		}
 	}
 }
 
-func (S *Scanner) scanComment() {
+func (S *Scanner) scanComment() string {
 	// initial '/' already consumed; S.ch == '/' || S.ch == '*'
 	offs := S.offset - 1 // position of initial '/'
 
@@ -172,7 +173,7 @@ func (S *Scanner) scanComment() {
 			// comment starts at the beginning of the current line
 			S.interpretLineComment(S.src[offs:S.offset])
 		}
-		return
+		goto exit
 	}
 
 	/*-style comment */
@@ -182,11 +183,14 @@ func (S *Scanner) scanComment() {
 		S.next()
 		if ch == '*' && S.ch == '/' {
 			S.next()
-			return
+			goto exit
 		}
 	}
 
 	S.error(offs, "comment not terminated")
+
+exit:
+	return string(S.src[offs:S.offset])
 }
 
 func (S *Scanner) findLineEnd() bool {
@@ -233,30 +237,30 @@ func (S *Scanner) findLineEnd() bool {
 	return false
 }
 
-func isLetter(ch int) bool {
+func isLetter(ch rune) bool {
 	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 }
 
-func isDigit(ch int) bool {
+func isDigit(ch rune) bool {
 	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 }
 
-func (S *Scanner) scanIdentifier() token.Token {
+func (S *Scanner) scanIdentifier() string {
 	offs := S.offset
 	for isLetter(S.ch) || isDigit(S.ch) {
 		S.next()
 	}
-	return token.Lookup(S.src[offs:S.offset])
+	return string(S.src[offs:S.offset])
 }
 
-func digitVal(ch int) int {
+func digitVal(ch rune) int {
 	switch {
 	case '0' <= ch && ch <= '9':
-		return ch - '0'
+		return int(ch - '0')
 	case 'a' <= ch && ch <= 'f':
-		return ch - 'a' + 10
+		return int(ch - 'a' + 10)
 	case 'A' <= ch && ch <= 'F':
-		return ch - 'A' + 10
+		return int(ch - 'A' + 10)
 	}
 	return 16 // larger than any legal digit val
 }
@@ -267,11 +271,13 @@ func (S *Scanner) scanMantissa(base int) {
 	}
 }
 
-func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
+func (S *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
 	// digitVal(S.ch) < 10
+	offs := S.offset
 	tok := token.INT
 
 	if seenDecimalPoint {
+		offs--
 		tok = token.FLOAT
 		S.scanMantissa(10)
 		goto exponent
@@ -335,10 +341,10 @@ exponent:
 	}
 
 exit:
-	return tok
+	return tok, string(S.src[offs:S.offset])
 }
 
-func (S *Scanner) scanEscape(quote int) {
+func (S *Scanner) scanEscape(quote rune) {
 	offs := S.offset
 
 	var i, base, max uint32
@@ -382,7 +388,7 @@ func (S *Scanner) scanEscape(quote int) {
 	}
 }
 
-func (S *Scanner) scanChar() {
+func (S *Scanner) scanChar() string {
 	// '\'' opening already consumed
 	offs := S.offset - 1
 
@@ -406,9 +412,11 @@ func (S *Scanner) scanChar() {
 	if n != 1 {
 		S.error(offs, "illegal character literal")
 	}
+
+	return string(S.src[offs:S.offset])
 }
 
-func (S *Scanner) scanString() {
+func (S *Scanner) scanString() string {
 	// '"' opening already consumed
 	offs := S.offset - 1
 
@@ -425,15 +433,33 @@ func (S *Scanner) scanString() {
 	}
 
 	S.next()
+
+	return string(S.src[offs:S.offset])
+}
+
+func stripCR(b []byte) []byte {
+	c := make([]byte, len(b))
+	i := 0
+	for _, ch := range b {
+		if ch != '\r' {
+			c[i] = ch
+			i++
+		}
+	}
+	return c[:i]
 }
 
-func (S *Scanner) scanRawString() {
+func (S *Scanner) scanRawString() string {
 	// '`' opening already consumed
 	offs := S.offset - 1
 
+	hasCR := false
 	for S.ch != '`' {
 		ch := S.ch
 		S.next()
+		if ch == '\r' {
+			hasCR = true
+		}
 		if ch < 0 {
 			S.error(offs, "string not terminated")
 			break
@@ -441,6 +467,13 @@ func (S *Scanner) scanRawString() {
 	}
 
 	S.next()
+
+	lit := S.src[offs:S.offset]
+	if hasCR {
+		lit = stripCR(lit)
+	}
+
+	return string(lit)
 }
 
 func (S *Scanner) skipWhitespace() {
@@ -463,7 +496,7 @@ func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 	return tok0
 }
 
-func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
+func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
 	if S.ch == '=' {
 		S.next()
 		return tok1
@@ -475,7 +508,7 @@ func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) tok
 	return tok0
 }
 
-func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
+func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
 	if S.ch == '=' {
 		S.next()
 		return tok1
@@ -491,15 +524,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 	return tok0
 }
 
-// Scan scans the next token and returns the token position,
-// the token, and the literal string corresponding to the
-// token. The source end is indicated by token.EOF.
+// Scan scans the next token and returns the token position, the token,
+// and its literal string if applicable. The source end is indicated by
+// token.EOF.
+//
+// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
+// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
+// has the corresponding value.
 //
 // If the returned token is token.SEMICOLON, the corresponding
 // literal string is ";" if the semicolon was present in the source,
 // and "\n" if the semicolon was inserted because of a newline or
 // at EOF.
 //
+// If the returned token is token.ILLEGAL, the literal string is the
+// offending character.
+//
+// In all other cases, Scan returns an empty literal string.
+//
 // For more tolerant parsing, Scan will return a valid token if
 // possible even if a syntax error was encountered. Thus, even
 // if the resulting token sequence contains no illegal tokens,
@@ -511,33 +553,33 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 // set with Init. Token positions are relative to that file
 // and thus relative to the file set.
 //
-func (S *Scanner) Scan() (token.Pos, token.Token, string) {
+func (S *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
 scanAgain:
 	S.skipWhitespace()
 
 	// current token start
-	insertSemi := false
-	offs := S.offset
-	tok := token.ILLEGAL
+	pos = S.file.Pos(S.offset)
 
 	// determine token value
+	insertSemi := false
 	switch ch := S.ch; {
 	case isLetter(ch):
-		tok = S.scanIdentifier()
+		lit = S.scanIdentifier()
+		tok = token.Lookup(lit)
 		switch tok {
 		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
 			insertSemi = true
 		}
 	case digitVal(ch) < 10:
 		insertSemi = true
-		tok = S.scanNumber(false)
+		tok, lit = S.scanNumber(false)
 	default:
 		S.next() // always make progress
 		switch ch {
 		case -1:
 			if S.insertSemi {
 				S.insertSemi = false // EOF consumed
-				return S.file.Pos(offs), token.SEMICOLON, "\n"
+				return pos, token.SEMICOLON, "\n"
 			}
 			tok = token.EOF
 		case '\n':
@@ -545,25 +587,25 @@ scanAgain:
 			// set in the first place and exited early
 			// from S.skipWhitespace()
 			S.insertSemi = false // newline consumed
-			return S.file.Pos(offs), token.SEMICOLON, "\n"
+			return pos, token.SEMICOLON, "\n"
 		case '"':
 			insertSemi = true
 			tok = token.STRING
-			S.scanString()
+			lit = S.scanString()
 		case '\'':
 			insertSemi = true
 			tok = token.CHAR
-			S.scanChar()
+			lit = S.scanChar()
 		case '`':
 			insertSemi = true
 			tok = token.STRING
-			S.scanRawString()
+			lit = S.scanRawString()
 		case ':':
 			tok = S.switch2(token.COLON, token.DEFINE)
 		case '.':
 			if digitVal(S.ch) < 10 {
 				insertSemi = true
-				tok = S.scanNumber(true)
+				tok, lit = S.scanNumber(true)
 			} else if S.ch == '.' {
 				S.next()
 				if S.ch == '.' {
@@ -577,6 +619,7 @@ scanAgain:
 			tok = token.COMMA
 		case ';':
 			tok = token.SEMICOLON
+			lit = ";"
 		case '(':
 			tok = token.LPAREN
 		case ')':
@@ -610,12 +653,12 @@ scanAgain:
 				if S.insertSemi && S.findLineEnd() {
 					// reset position to the beginning of the comment
 					S.ch = '/'
-					S.offset = offs
-					S.rdOffset = offs + 1
+					S.offset = S.file.Offset(pos)
+					S.rdOffset = S.offset + 1
 					S.insertSemi = false // newline consumed
-					return S.file.Pos(offs), token.SEMICOLON, "\n"
+					return pos, token.SEMICOLON, "\n"
 				}
-				S.scanComment()
+				lit = S.scanComment()
 				if S.mode&ScanComments == 0 {
 					// skip comment
 					S.insertSemi = false // newline consumed
@@ -652,19 +695,15 @@ scanAgain:
 		case '|':
 			tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
 		default:
-			if S.mode&AllowIllegalChars == 0 {
-				S.error(offs, fmt.Sprintf("illegal character %#U", ch))
-			}
+			S.error(S.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
 			insertSemi = S.insertSemi // preserve insertSemi info
+			tok = token.ILLEGAL
+			lit = string(ch)
 		}
 	}
-
-	if S.mode&InsertSemis != 0 {
+	if S.mode&dontInsertSemis == 0 {
 		S.insertSemi = insertSemi
 	}
 
-	// TODO(gri): The scanner API should change such that the literal string
-	//            is only valid if an actual literal was scanned. This will
-	//            permit a more efficient implementation.
-	return S.file.Pos(offs), tok, string(S.src[offs:S.offset])
+	return
 }
author	Ondřej Surý <ondrej@sury.org>	2012-01-30 15:38:19 +0100
committer	Ondřej Surý <ondrej@sury.org>	2012-01-30 15:38:19 +0100
commit	4cecda6c347bd6902b960c6a35a967add7070b0d (patch)
tree	a462e224ff41ec9f3eb1a0b6e815806f9e8804ad /src/pkg/go/scanner/scanner.go
parent	6c7ca6e4d4e26e4c8cbe0d183966011b3b088a0a (diff)
download	golang-4cecda6c347bd6902b960c6a35a967add7070b0d.tar.gz