| | | |
|---|---|---|
| author | Ondřej Surý <ondrej@sury.org> | 2012-01-30 15:38:19 +0100 |
| committer | Ondřej Surý <ondrej@sury.org> | 2012-01-30 15:38:19 +0100 |
| commit | 4cecda6c347bd6902b960c6a35a967add7070b0d (patch) | |
| tree | a462e224ff41ec9f3eb1a0b6e815806f9e8804ad /src/pkg/go/scanner/scanner.go | |
| parent | 6c7ca6e4d4e26e4c8cbe0d183966011b3b088a0a (diff) | |
| download | golang-4cecda6c347bd6902b960c6a35a967add7070b0d.tar.gz | |
Imported Upstream version 2012.01.27 (upstream-weekly/2012.01.27)
Diffstat (limited to 'src/pkg/go/scanner/scanner.go')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | src/pkg/go/scanner/scanner.go | 157 |

1 file changed, 98 insertions, 59 deletions
```diff
diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go
index 7f3dd2373..7c72c0a46 100644
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -6,7 +6,7 @@
 // source which can then be tokenized through repeated calls to the Scan
 // function. Typical use:
 //
-//	var s Scanner
+//	var s scanner.Scanner
 //	fset := token.NewFileSet()  // position information is relative to fset
 //      file := fset.AddFile(filename, fset.Base(), len(src))  // register file
 //	s.Init(file, src, nil /* no error handler */, 0)
@@ -27,7 +27,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"unicode"
-	"utf8"
+	"unicode/utf8"
 )
 
 // A Scanner holds the scanner's internal state while processing
@@ -40,10 +40,10 @@ type Scanner struct {
 	dir  string       // directory portion of file.Name()
 	src  []byte       // source
 	err  ErrorHandler // error reporting; or nil
-	mode uint         // scanning mode
+	mode Mode         // scanning mode
 
 	// scanning state
-	ch         int  // current character
+	ch         rune // current character
 	offset     int  // character offset
 	rdOffset   int  // reading offset (position after current character)
 	lineOffset int  // current line offset
@@ -63,7 +63,7 @@ func (S *Scanner) next() {
 			S.lineOffset = S.offset
 			S.file.AddLine(S.offset)
 		}
-		r, w := int(S.src[S.rdOffset]), 1
+		r, w := rune(S.src[S.rdOffset]), 1
 		switch {
 		case r == 0:
 			S.error(S.offset, "illegal character NUL")
@@ -86,13 +86,14 @@ func (S *Scanner) next() {
 	}
 }
 
-// The mode parameter to the Init function is a set of flags (or 0).
+// A mode value is set of flags (or 0).
 // They control scanner behavior.
 //
+type Mode uint
+
 const (
-	ScanComments      = 1 << iota // return comments as COMMENT tokens
-	AllowIllegalChars             // do not report an error for illegal chars
-	InsertSemis                   // automatically insert semicolons
+	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
+	dontInsertSemis                  // do not automatically insert semicolons - for testing only
 )
 
 // Init prepares the scanner S to tokenize the text src by setting the
@@ -105,12 +106,12 @@ const (
 // Calls to Scan will use the error handler err if they encounter a
 // syntax error and err is not nil. Also, for each error encountered,
 // the Scanner field ErrorCount is incremented by one. The mode parameter
-// determines how comments, illegal characters, and semicolons are handled.
+// determines how comments are handled.
 //
 // Note that Init may call err if there is an error in the first character
 // of the file.
 //
-func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
+func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
 	// Explicitly initialize all fields since a scanner may be reused.
 	if file.Size() != len(src) {
 		panic("file size does not match src len")
@@ -152,13 +153,13 @@ func (S *Scanner) interpretLineComment(text []byte) {
 					filename = filepath.Join(S.dir, filename)
 				}
 				// update scanner position
-				S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
+				S.file.AddLineInfo(S.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
 			}
 		}
 	}
 }
 
-func (S *Scanner) scanComment() {
+func (S *Scanner) scanComment() string {
 	// initial '/' already consumed; S.ch == '/' || S.ch == '*'
 	offs := S.offset - 1 // position of initial '/'
 
@@ -172,7 +173,7 @@
 			// comment starts at the beginning of the current line
 			S.interpretLineComment(S.src[offs:S.offset])
 		}
-		return
+		goto exit
 	}
 
 	/*-style comment */
@@ -182,11 +183,14 @@
 		S.next()
 		if ch == '*' && S.ch == '/' {
 			S.next()
-			return
+			goto exit
 		}
 	}
 
 	S.error(offs, "comment not terminated")
+
+exit:
+	return string(S.src[offs:S.offset])
 }
 
 func (S *Scanner) findLineEnd() bool {
@@ -233,30 +237,30 @@ func (S *Scanner) findLineEnd() bool {
 	return false
 }
 
-func isLetter(ch int) bool {
+func isLetter(ch rune) bool {
 	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 }
 
-func isDigit(ch int) bool {
+func isDigit(ch rune) bool {
 	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 }
 
-func (S *Scanner) scanIdentifier() token.Token {
+func (S *Scanner) scanIdentifier() string {
 	offs := S.offset
 	for isLetter(S.ch) || isDigit(S.ch) {
 		S.next()
 	}
-	return token.Lookup(S.src[offs:S.offset])
+	return string(S.src[offs:S.offset])
}
 
-func digitVal(ch int) int {
+func digitVal(ch rune) int {
 	switch {
 	case '0' <= ch && ch <= '9':
-		return ch - '0'
+		return int(ch - '0')
 	case 'a' <= ch && ch <= 'f':
-		return ch - 'a' + 10
+		return int(ch - 'a' + 10)
 	case 'A' <= ch && ch <= 'F':
-		return ch - 'A' + 10
+		return int(ch - 'A' + 10)
 	}
 	return 16 // larger than any legal digit val
 }
@@ -267,11 +271,13 @@ func (S *Scanner) scanMantissa(base int) {
 	}
 }
 
-func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
+func (S *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
 	// digitVal(S.ch) < 10
+	offs := S.offset
 	tok := token.INT
 
 	if seenDecimalPoint {
+		offs--
 		tok = token.FLOAT
 		S.scanMantissa(10)
 		goto exponent
@@ -335,10 +341,10 @@ exponent:
 	}
 
 exit:
-	return tok
+	return tok, string(S.src[offs:S.offset])
 }
 
-func (S *Scanner) scanEscape(quote int) {
+func (S *Scanner) scanEscape(quote rune) {
 	offs := S.offset
 
 	var i, base, max uint32
@@ -382,7 +388,7 @@ func (S *Scanner) scanEscape(quote int) {
 	}
 }
 
-func (S *Scanner) scanChar() {
+func (S *Scanner) scanChar() string {
 	// '\'' opening already consumed
 	offs := S.offset - 1
 
@@ -406,9 +412,11 @@
 	if n != 1 {
 		S.error(offs, "illegal character literal")
 	}
+
+	return string(S.src[offs:S.offset])
 }
 
-func (S *Scanner) scanString() {
+func (S *Scanner) scanString() string {
 	// '"' opening already consumed
 	offs := S.offset - 1
 
@@ -425,15 +433,33 @@
 	}
 
 	S.next()
+
+	return string(S.src[offs:S.offset])
+}
+
+func stripCR(b []byte) []byte {
+	c := make([]byte, len(b))
+	i := 0
+	for _, ch := range b {
+		if ch != '\r' {
+			c[i] = ch
+			i++
+		}
+	}
+	return c[:i]
 }
 
-func (S *Scanner) scanRawString() {
+func (S *Scanner) scanRawString() string {
 	// '`' opening already consumed
 	offs := S.offset - 1
 
+	hasCR := false
 	for S.ch != '`' {
 		ch := S.ch
 		S.next()
+		if ch == '\r' {
+			hasCR = true
+		}
 		if ch < 0 {
 			S.error(offs, "string not terminated")
 			break
@@ -441,6 +467,13 @@ func (S *Scanner) scanRawString() {
 		}
 	}
 	S.next()
+
+	lit := S.src[offs:S.offset]
+	if hasCR {
+		lit = stripCR(lit)
+	}
+
+	return string(lit)
 }
 
 func (S *Scanner) skipWhitespace() {
@@ -463,7 +496,7 @@ func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 	return tok0
 }
 
-func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
+func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
 	if S.ch == '=' {
 		S.next()
 		return tok1
@@ -475,7 +508,7 @@ func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) tok
 	return tok0
 }
 
-func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
+func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
 	if S.ch == '=' {
 		S.next()
 		return tok1
@@ -491,15 +524,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 	return tok0
 }
 
-// Scan scans the next token and returns the token position,
-// the token, and the literal string corresponding to the
-// token. The source end is indicated by token.EOF.
+// Scan scans the next token and returns the token position, the token,
+// and its literal string if applicable. The source end is indicated by
+// token.EOF.
+//
+// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
+// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
+// has the corresponding value.
 //
 // If the returned token is token.SEMICOLON, the corresponding
 // literal string is ";" if the semicolon was present in the source,
 // and "\n" if the semicolon was inserted because of a newline or
 // at EOF.
 //
+// If the returned token is token.ILLEGAL, the literal string is the
+// offending character.
+//
+// In all other cases, Scan returns an empty literal string.
+//
 // For more tolerant parsing, Scan will return a valid token if
 // possible even if a syntax error was encountered. Thus, even
 // if the resulting token sequence contains no illegal tokens,
@@ -511,33 +553,33 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 // set with Init. Token positions are relative to that file
 // and thus relative to the file set.
 //
-func (S *Scanner) Scan() (token.Pos, token.Token, string) {
+func (S *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
 scanAgain:
 	S.skipWhitespace()
 
 	// current token start
-	insertSemi := false
-	offs := S.offset
-	tok := token.ILLEGAL
+	pos = S.file.Pos(S.offset)
 
 	// determine token value
+	insertSemi := false
 	switch ch := S.ch; {
 	case isLetter(ch):
-		tok = S.scanIdentifier()
+		lit = S.scanIdentifier()
+		tok = token.Lookup(lit)
 		switch tok {
 		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
 			insertSemi = true
 		}
 	case digitVal(ch) < 10:
 		insertSemi = true
-		tok = S.scanNumber(false)
+		tok, lit = S.scanNumber(false)
 	default:
 		S.next() // always make progress
 		switch ch {
 		case -1:
 			if S.insertSemi {
 				S.insertSemi = false // EOF consumed
-				return S.file.Pos(offs), token.SEMICOLON, "\n"
+				return pos, token.SEMICOLON, "\n"
 			}
 			tok = token.EOF
 		case '\n':
@@ -545,25 +587,25 @@ scanAgain:
 			// set in the first place and exited early
 			// from S.skipWhitespace()
 			S.insertSemi = false // newline consumed
-			return S.file.Pos(offs), token.SEMICOLON, "\n"
+			return pos, token.SEMICOLON, "\n"
 		case '"':
 			insertSemi = true
 			tok = token.STRING
-			S.scanString()
+			lit = S.scanString()
 		case '\'':
 			insertSemi = true
 			tok = token.CHAR
-			S.scanChar()
+			lit = S.scanChar()
 		case '`':
 			insertSemi = true
 			tok = token.STRING
-			S.scanRawString()
+			lit = S.scanRawString()
 		case ':':
 			tok = S.switch2(token.COLON, token.DEFINE)
 		case '.':
 			if digitVal(S.ch) < 10 {
 				insertSemi = true
-				tok = S.scanNumber(true)
+				tok, lit = S.scanNumber(true)
 			} else if S.ch == '.' {
 				S.next()
 				if S.ch == '.' {
@@ -577,6 +619,7 @@ scanAgain:
 			tok = token.COMMA
 		case ';':
 			tok = token.SEMICOLON
+			lit = ";"
 		case '(':
 			tok = token.LPAREN
 		case ')':
@@ -610,12 +653,12 @@ scanAgain:
 				if S.insertSemi && S.findLineEnd() {
 					// reset position to the beginning of the comment
 					S.ch = '/'
-					S.offset = offs
-					S.rdOffset = offs + 1
+					S.offset = S.file.Offset(pos)
+					S.rdOffset = S.offset + 1
 					S.insertSemi = false // newline consumed
-					return S.file.Pos(offs), token.SEMICOLON, "\n"
+					return pos, token.SEMICOLON, "\n"
 				}
-				S.scanComment()
+				lit = S.scanComment()
 				if S.mode&ScanComments == 0 {
 					// skip comment
 					S.insertSemi = false // newline consumed
@@ -652,19 +695,15 @@ scanAgain:
 		case '|':
 			tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
 		default:
-			if S.mode&AllowIllegalChars == 0 {
-				S.error(offs, fmt.Sprintf("illegal character %#U", ch))
-			}
+			S.error(S.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
 			insertSemi = S.insertSemi // preserve insertSemi info
+			tok = token.ILLEGAL
+			lit = string(ch)
 		}
 	}
-
-	if S.mode&InsertSemis != 0 {
+	if S.mode&dontInsertSemis == 0 {
 		S.insertSemi = insertSemi
 	}
 
-	// TODO(gri): The scanner API should change such that the literal string
-	//            is only valid if an actual literal was scanned. This will
-	//            permit a more efficient implementation.
-	return S.file.Pos(offs), tok, string(S.src[offs:S.offset])
+	return
 }
```
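For orientation, this import changes the go/scanner API rather than just its internals: Init now takes a scanner.Mode instead of a plain uint, the AllowIllegalChars and InsertSemis flags are gone, and Scan returns the token's literal string directly. The sketch below is a minimal illustration of driving the scanner through this newer API, assuming a toolchain with the post-import go/scanner package; the file name and source snippet are invented for the example and are not part of the patch.

```go
package main

import (
	"fmt"
	"go/scanner"
	"go/token"
)

func main() {
	// Arbitrary example input; any Go source fragment works here.
	src := []byte("x := 42 // answer\n")

	// Position information is relative to the file set, as in the
	// package comment shown in the diff above.
	fset := token.NewFileSet()
	file := fset.AddFile("example.go", fset.Base(), len(src))

	var s scanner.Scanner
	// nil error handler; scanner.ScanComments asks for COMMENT tokens.
	s.Init(file, src, nil, scanner.ScanComments)

	// Scan now yields the literal string directly instead of requiring
	// the caller to slice it out of src.
	for {
		pos, tok, lit := s.Scan()
		if tok == token.EOF {
			break
		}
		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
	}
}
```

Two behavioral details worth noting from the diff: an automatically inserted semicolon is reported as token.SEMICOLON with literal "\n" (an explicit one as ";"), and raw string literals come back with carriage returns stripped by the new stripCR helper, matching how the language defines raw string values.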
