Imported Upstream version 2011-02-01.1 (tag: upstream/2011-02-01.1)

author: Ondřej Surý <ondrej@sury.org> 2011-02-14 13:23:51 +0100
committer: Ondřej Surý <ondrej@sury.org> 2011-02-14 13:23:51 +0100
commit: 758ff64c69e34965f8af5b2d6ffd65e8d7ab2150 (patch)
tree: 6d6b34f8c678862fe9b56c945a7b63f68502c245 /src/pkg/scanner
parent: 3e45412327a2654a77944249962b3652e6142299 (diff)
download: golang-758ff64c69e34965f8af5b2d6ffd65e8d7ab2150.tar.gz
2 files changed, 173 insertions, 58 deletions
diff --git a/src/pkg/scanner/scanner.go b/src/pkg/scanner/scanner.go
index 11aa9f43f..2396cdd9a 100644
--- a/src/pkg/scanner/scanner.go
+++ b/src/pkg/scanner/scanner.go
@@ -34,13 +34,15 @@ import (
 )
 
 
+// TODO(gri): Consider changing this to use the new (token) Position package.
+
 // A source position is represented by a Position value.
 // A position is valid if Line > 0.
 type Position struct {
 	Filename string // filename, if any
 	Offset   int    // byte offset, starting at 0
 	Line     int    // line number, starting at 1
-	Column   int    // column number, starting at 0 (character count per line)
+	Column   int    // column number, starting at 1 (character count per line)
 }
 
 
@@ -136,15 +138,17 @@ type Scanner struct {
 
 	// Source position
 	srcBufOffset int // byte offset of srcBuf[0] in source
-	line         int // newline count + 1
-	column       int // character count on line
+	line         int // line count
+	column       int // character count
+	lastLineLen  int // length of last line in characters (for correct column reporting)
+	lastCharLen  int // length of last character in bytes
 
 	// Token text buffer
 	// Typically, token text is stored completely in srcBuf, but in general
 	// the token text's head may be buffered in tokBuf while the token text's
 	// tail is stored in srcBuf.
 	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
-	tokPos int          // token text tail position (srcBuf index)
+	tokPos int          // token text tail position (srcBuf index); valid if >= 0
 	tokEnd int          // token text tail end (srcBuf index)
 
 	// One character look-ahead
@@ -175,13 +179,14 @@ type Scanner struct {
 }
 
 
-// Init initializes a Scanner with a new source and returns itself.
+// Init initializes a Scanner with a new source and returns s.
 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
 // and Whitespace is set to GoWhitespace.
 func (s *Scanner) Init(src io.Reader) *Scanner {
 	s.src = src
 
 	// initialize source buffer
+	// (the first call to next() will fill it by calling src.Read)
 	s.srcBuf[0] = utf8.RuneSelf // sentinel
 	s.srcPos = 0
 	s.srcEnd = 0
@@ -190,12 +195,15 @@ func (s *Scanner) Init(src io.Reader) *Scanner {
 	s.srcBufOffset = 0
 	s.line = 1
 	s.column = 0
+	s.lastLineLen = 0
+	s.lastCharLen = 0
 
 	// initialize token text buffer
+	// (required for first call to next()).
 	s.tokPos = -1
 
 	// initialize one character look-ahead
-	s.ch = s.next()
+	s.ch = -1 // no char read yet
 
 	// initialize public fields
 	s.Error = nil
@@ -207,12 +215,17 @@ func (s *Scanner) Init(src io.Reader) *Scanner {
 }
 
 
+// TODO(gri): The code for next() and the internal scanner state could benefit
+//            from a rethink. While next() is optimized for the common ASCII
+//            case, the "corrections" needed for proper position tracking undo
+//            some of the attempts for fast-path optimization.
+
 // next reads and returns the next Unicode character. It is designed such
 // that only a minimal amount of work needs to be done in the common ASCII
 // case (one test to check for both ASCII and end-of-buffer, and one test
 // to check for newlines).
 func (s *Scanner) next() int {
-	ch := int(s.srcBuf[s.srcPos])
+	ch, width := int(s.srcBuf[s.srcPos]), 1
 
 	if ch >= utf8.RuneSelf {
 		// uncommon case: not ASCII or not enough bytes
@@ -222,47 +235,64 @@ func (s *Scanner) next() int {
 			if s.tokPos >= 0 {
 				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
 				s.tokPos = 0
+				// s.tokEnd is set by Scan()
 			}
 			// move unread bytes to beginning of buffer
 			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
 			s.srcBufOffset += s.srcPos
 			// read more bytes
+			// (an io.Reader must return os.EOF when it reaches
+			// the end of what it is reading - simply returning
+			// n == 0 will make this loop retry forever; but the
+			// error is in the reader implementation in that case)
 			i := s.srcEnd - s.srcPos
 			n, err := s.src.Read(s.srcBuf[i:bufLen])
-			s.srcEnd = i + n
 			s.srcPos = 0
+			s.srcEnd = i + n
 			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
 			if err != nil {
 				if s.srcEnd == 0 {
+					if s.lastCharLen > 0 {
+						// previous character was not EOF
+						s.column++
+					}
+					s.lastCharLen = 0
 					return EOF
 				}
 				if err != os.EOF {
 					s.error(err.String())
-					break
 				}
+				// If err == EOF, we won't be getting more
+				// bytes; break to avoid infinite loop. If
+				// err is something else, we don't know if
+				// we can get more bytes; thus also break.
+				break
 			}
 		}
 		// at least one byte
 		ch = int(s.srcBuf[s.srcPos])
 		if ch >= utf8.RuneSelf {
 			// uncommon case: not ASCII
-			var width int
 			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
 			if ch == utf8.RuneError && width == 1 {
 				s.error("illegal UTF-8 encoding")
 			}
-			s.srcPos += width - 1
 		}
 	}
 
-	s.srcPos++
+	// advance
+	s.srcPos += width
+	s.lastCharLen = width
 	s.column++
+
+	// special situations
 	switch ch {
 	case 0:
 		// implementation restriction for compatibility with other tools
 		s.error("illegal character NUL")
 	case '\n':
 		s.line++
+		s.lastLineLen = s.column
 		s.column = 0
 	}
 
@@ -272,13 +302,13 @@ func (s *Scanner) next() int {
 
 // Next reads and returns the next Unicode character.
 // It returns EOF at the end of the source. It reports
-// a read error by calling s.Error, if set, or else
-// prints an error message to os.Stderr. Next does not
+// a read error by calling s.Error, if not nil; otherwise
+// it prints an error message to os.Stderr. Next does not
 // update the Scanner's Position field; use Pos() to
 // get the current position.
 func (s *Scanner) Next() int {
 	s.tokPos = -1 // don't collect token text
-	ch := s.ch
+	ch := s.Peek()
 	s.ch = s.next()
 	return ch
 }
@@ -288,6 +318,9 @@ func (s *Scanner) Next() int {
 // the scanner. It returns EOF if the scanner's position is at the last
 // character of the source.
 func (s *Scanner) Peek() int {
+	if s.ch < 0 {
+		s.ch = s.next()
+	}
 	return s.ch
 }
 
@@ -511,10 +544,10 @@ func (s *Scanner) scanComment(ch int) {
 // Scan reads the next token or Unicode character from source and returns it.
 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
 // It returns EOF at the end of the source. It reports scanner errors (read and
-// token errors) by calling s.Error, if set; otherwise it prints an error message
-// to os.Stderr.
+// token errors) by calling s.Error, if not nil; otherwise it prints an error
+// message to os.Stderr.
 func (s *Scanner) Scan() int {
-	ch := s.ch
+	ch := s.Peek()
 
 	// reset token text position
 	s.tokPos = -1
@@ -527,12 +560,22 @@ redo:
 
 	// start collecting token text
 	s.tokBuf.Reset()
-	s.tokPos = s.srcPos - 1
+	s.tokPos = s.srcPos - s.lastCharLen
 
 	// set token position
+	// (this is a slightly optimized version of the code in Pos())
 	s.Offset = s.srcBufOffset + s.tokPos
-	s.Line = s.line
-	s.Column = s.column
+	if s.column > 0 {
+		// common case: last character was not a '\n'
+		s.Line = s.line
+		s.Column = s.column
+	} else {
+		// last character was a '\n'
+		// (we cannot be at the beginning of the source
+		// since we have called next() at least once)
+		s.Line = s.line - 1
+		s.Column = s.lastLineLen
+	}
 
 	// determine token value
 	tok := ch
@@ -596,25 +639,33 @@ redo:
 	}
 
 	// end of token text
-	s.tokEnd = s.srcPos - 1
+	s.tokEnd = s.srcPos - s.lastCharLen
 
 	s.ch = ch
 	return tok
 }
 
 
-// Position returns the current source position. If called before Next()
-// or Scan(), it returns the position of the next Unicode character or token
-// returned by these functions. If called afterwards, it returns the position
-// immediately after the last character of the most recent token or character
-// scanned.
-func (s *Scanner) Pos() Position {
-	return Position{
-		s.Filename,
-		s.srcBufOffset + s.srcPos - 1,
-		s.line,
-		s.column,
+// Pos returns the position of the character immediately after
+// the character or token returned by the last call to Next or Scan.
+func (s *Scanner) Pos() (pos Position) {
+	pos.Filename = s.Filename
+	pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
+	switch {
+	case s.column > 0:
+		// common case: last character was not a '\n'
+		pos.Line = s.line
+		pos.Column = s.column
+	case s.lastLineLen > 0:
+		// last character was a '\n'
+		pos.Line = s.line - 1
+		pos.Column = s.lastLineLen
+	default:
+		// at the beginning of the source
+		pos.Line = 1
+		pos.Column = 1
 	}
+	return
 }
 
 
diff --git a/src/pkg/scanner/scanner_test.go b/src/pkg/scanner/scanner_test.go
index 506f434fe..002252de8 100644
--- a/src/pkg/scanner/scanner_test.go
+++ b/src/pkg/scanner/scanner_test.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"strings"
 	"testing"
+	"utf8"
 )
 
 
@@ -408,7 +409,7 @@ func TestScanWhitespace(t *testing.T) {
 func testError(t *testing.T, src, msg string, tok int) {
 	s := new(Scanner).Init(bytes.NewBufferString(src))
 	errorCalled := false
-	s.Error = func(s *Scanner, m string) {
+	s.Error = func(_ *Scanner, m string) {
 		if !errorCalled {
 			// only look at first error
 			if m != msg {
@@ -431,6 +432,8 @@ func testError(t *testing.T, src, msg string, tok int) {
 
 
 func TestError(t *testing.T) {
+	testError(t, "\x00", "illegal character NUL", 0)
+	testError(t, "\xff", "illegal UTF-8 encoding", utf8.RuneError)
 	testError(t, `01238`, "illegal octal number", Int)
 	testError(t, `'\"'`, "illegal char escape", Char)
 	testError(t, `'aa'`, "illegal char literal", Char)
@@ -445,38 +448,99 @@ func TestError(t *testing.T) {
 }
 
 
-func checkPos(t *testing.T, s *Scanner, offset, line, column, char int) {
-	pos := s.Pos()
-	if pos.Offset != offset {
-		t.Errorf("offset = %d, want %d", pos.Offset, offset)
+func checkPos(t *testing.T, got, want Position) {
+	if got.Offset != want.Offset || got.Line != want.Line || got.Column != want.Column {
+		t.Errorf("got offset, line, column = %d, %d, %d; want %d, %d, %d",
+			got.Offset, got.Line, got.Column, want.Offset, want.Line, want.Column)
 	}
-	if pos.Line != line {
-		t.Errorf("line = %d, want %d", pos.Line, line)
-	}
-	if pos.Column != column {
-		t.Errorf("column = %d, want %d", pos.Column, column)
+}
+
+
+func checkNextPos(t *testing.T, s *Scanner, offset, line, column, char int) {
+	if ch := s.Next(); ch != char {
+		t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char))
 	}
-	ch := s.Scan()
-	if ch != char {
+	want := Position{Offset: offset, Line: line, Column: column}
+	checkPos(t, s.Pos(), want)
+}
+
+
+func checkScanPos(t *testing.T, s *Scanner, offset, line, column, char int) {
+	want := Position{Offset: offset, Line: line, Column: column}
+	checkPos(t, s.Pos(), want)
+	if ch := s.Scan(); ch != char {
 		t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char))
+		if string(ch) != s.TokenText() {
+			t.Errorf("tok = %q, want %q", s.TokenText(), string(ch))
+		}
 	}
+	checkPos(t, s.Position, want)
 }
 
 
 func TestPos(t *testing.T) {
-	s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx"))
+	// corner case: empty source
+	s := new(Scanner).Init(bytes.NewBufferString(""))
+	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
+	s.Peek() // peek doesn't affect the position
+	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
+
+	// corner case: source with only a newline
+	s = new(Scanner).Init(bytes.NewBufferString("\n"))
+	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
+	checkNextPos(t, s, 1, 2, 1, '\n')
+	// after EOF position doesn't change
+	for i := 10; i > 0; i-- {
+		checkScanPos(t, s, 1, 2, 1, EOF)
+	}
+
+	// corner case: source with only a single character
+	s = new(Scanner).Init(bytes.NewBufferString("本"))
+	checkPos(t, s.Pos(), Position{Offset: 0, Line: 1, Column: 1})
+	checkNextPos(t, s, 3, 1, 2, '本')
+	// after EOF position doesn't change
+	for i := 10; i > 0; i-- {
+		checkScanPos(t, s, 3, 1, 2, EOF)
+	}
+
+	// positions after calling Next
+	s = new(Scanner).Init(bytes.NewBufferString("  foo६४  \n\n本語\n"))
+	checkNextPos(t, s, 1, 1, 2, ' ')
+	s.Peek() // peek doesn't affect the position
+	checkNextPos(t, s, 2, 1, 3, ' ')
+	checkNextPos(t, s, 3, 1, 4, 'f')
+	checkNextPos(t, s, 4, 1, 5, 'o')
+	checkNextPos(t, s, 5, 1, 6, 'o')
+	checkNextPos(t, s, 8, 1, 7, '६')
+	checkNextPos(t, s, 11, 1, 8, '४')
+	checkNextPos(t, s, 12, 1, 9, ' ')
+	checkNextPos(t, s, 13, 1, 10, ' ')
+	checkNextPos(t, s, 14, 2, 1, '\n')
+	checkNextPos(t, s, 15, 3, 1, '\n')
+	checkNextPos(t, s, 18, 3, 2, '本')
+	checkNextPos(t, s, 21, 3, 3, '語')
+	checkNextPos(t, s, 22, 4, 1, '\n')
+	// after EOF position doesn't change
+	for i := 10; i > 0; i-- {
+		checkScanPos(t, s, 22, 4, 1, EOF)
+	}
+
+	// positions after calling Scan
+	s = new(Scanner).Init(bytes.NewBufferString("abc\n本語\n\nx"))
 	s.Mode = 0
 	s.Whitespace = 0
-	checkPos(t, s, 0, 1, 1, 'a')
-	checkPos(t, s, 1, 1, 2, 'b')
-	checkPos(t, s, 2, 1, 3, 'c')
-	checkPos(t, s, 3, 2, 0, '\n')
-	checkPos(t, s, 4, 2, 1, '0')
-	checkPos(t, s, 5, 2, 2, '1')
-	checkPos(t, s, 6, 2, 3, '2')
-	checkPos(t, s, 7, 3, 0, '\n')
-	checkPos(t, s, 8, 4, 0, '\n')
-	checkPos(t, s, 9, 4, 1, 'x')
-	checkPos(t, s, 9, 4, 1, EOF)
-	checkPos(t, s, 9, 4, 1, EOF) // after EOF, position doesn't change
+	checkScanPos(t, s, 0, 1, 1, 'a')
+	s.Peek() // peek doesn't affect the position
+	checkScanPos(t, s, 1, 1, 2, 'b')
+	checkScanPos(t, s, 2, 1, 3, 'c')
+	checkScanPos(t, s, 3, 1, 4, '\n')
+	checkScanPos(t, s, 4, 2, 1, '本')
+	checkScanPos(t, s, 7, 2, 2, '語')
+	checkScanPos(t, s, 10, 2, 3, '\n')
+	checkScanPos(t, s, 11, 3, 1, '\n')
+	checkScanPos(t, s, 12, 4, 1, 'x')
+	// after EOF position doesn't change
+	for i := 10; i > 0; i-- {
+		checkScanPos(t, s, 13, 4, 2, EOF)
+	}
 }
author	Ondřej Surý <ondrej@sury.org>	2011-02-14 13:23:51 +0100
committer	Ondřej Surý <ondrej@sury.org>	2011-02-14 13:23:51 +0100
commit	758ff64c69e34965f8af5b2d6ffd65e8d7ab2150 (patch)
tree	6d6b34f8c678862fe9b56c945a7b63f68502c245 /src/pkg/scanner
parent	3e45412327a2654a77944249962b3652e6142299 (diff)
download	golang-758ff64c69e34965f8af5b2d6ffd65e8d7ab2150.tar.gz